#####################################################
# Copyright (c) Xuanyi Dong [GitHub D-X-Y], 2021.06 #
#####################################################
import math
import warnings

from torch.optim.lr_scheduler import _LRScheduler


class CosineDecayWithWarmup(_LRScheduler):
    r"""Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr, :math:`T_{cur}`
    is the number of epochs since the last restart, and :math:`T_{i}` is the number
    of epochs between two warm restarts in SGDR:

    .. math::
        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 +
        \cos\left(\frac{T_{cur}}{T_{i}}\pi\right)\right)

    When :math:`T_{cur}=T_{i}`, set :math:`\eta_t = \eta_{min}`.
    When :math:`T_{cur}=0` after a restart, set :math:`\eta_t=\eta_{max}`.

    It has been proposed in
    `SGDR: Stochastic Gradient Descent with Warm Restarts`_.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        T_0 (int): Number of iterations for the first restart.
        T_mult (int, optional): A factor by which :math:`T_{i}` increases after a restart. Default: 1.
        eta_min (float, optional): Minimum learning rate. Default: 0.
        last_epoch (int, optional): The index of the last epoch. Default: -1.
        verbose (bool): If ``True``, prints a message to stdout for
            each update. Default: ``False``.

    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
        https://arxiv.org/abs/1608.03983
    """

    def __init__(
        self, optimizer, T_0, T_mult=1, eta_min=0, last_epoch=-1, verbose=False
    ):
        if T_0 <= 0 or not isinstance(T_0, int):
            raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
        if T_mult < 1 or not isinstance(T_mult, int):
            raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult))
        self.T_0 = T_0
        # T_i is the length (in epochs) of the current cycle; it starts at T_0
        # and is multiplied by T_mult after every restart.
        self.T_i = T_0
        self.T_mult = T_mult
        self.eta_min = eta_min

        super(CosineDecayWithWarmup, self).__init__(optimizer, last_epoch, verbose)

        # T_cur counts the epochs elapsed since the last restart.
        self.T_cur = self.last_epoch

    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn(
                "To get the last learning rate computed by the scheduler, "
                "please use `get_last_lr()`.",
                UserWarning,
            )

        # Cosine-annealed learning rate for each parameter group, following
        # the SGDR formula in the class docstring.
        return [
            self.eta_min
            + (base_lr - self.eta_min)
            * (1 + math.cos(math.pi * self.T_cur / self.T_i))
            / 2
            for base_lr in self.base_lrs
        ]

    def step(self, epoch=None):
        """Step could be called after every batch update.

        Example:
            >>> scheduler = CosineDecayWithWarmup(optimizer, T_0, T_mult)
            >>> iters = len(dataloader)
            >>> for epoch in range(20):
            >>>     for i, sample in enumerate(dataloader):
            >>>         inputs, labels = sample['inputs'], sample['labels']
            >>>         optimizer.zero_grad()
            >>>         outputs = net(inputs)
            >>>         loss = criterion(outputs, labels)
            >>>         loss.backward()
            >>>         optimizer.step()
            >>>         scheduler.step(epoch + i / iters)

        This function can be called in an interleaved way.

        Example:
            >>> scheduler = CosineDecayWithWarmup(optimizer, T_0, T_mult)
            >>> for epoch in range(20):
            >>>     scheduler.step()
            >>> scheduler.step(26)
            >>> scheduler.step()  # scheduler.step(27), instead of scheduler(20)
        """

        if epoch is None and self.last_epoch < 0:
            epoch = 0

        if epoch is None:
            # Implicit stepping: advance by one epoch and restart the cycle
            # whenever T_cur reaches the current cycle length T_i.
            epoch = self.last_epoch + 1
            self.T_cur = self.T_cur + 1
            if self.T_cur >= self.T_i:
                self.T_cur = self.T_cur - self.T_i
                self.T_i = self.T_i * self.T_mult
        else:
            if epoch < 0:
                raise ValueError(
                    "Expected non-negative epoch, but got {}".format(epoch)
                )
            if epoch >= self.T_0:
                if self.T_mult == 1:
                    # All cycles have the same length, so the position within
                    # the current cycle is simply epoch modulo T_0.
                    self.T_cur = epoch % self.T_0
                else:
                    # Find the index n of the cycle that `epoch` falls into; the
                    # total length of the first n cycles is the geometric sum
                    # T_0 * (T_mult ** n - 1) / (T_mult - 1).
                    n = int(
                        math.log(
                            (epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult
                        )
                    )
                    self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (
                        self.T_mult - 1
                    )
                    self.T_i = self.T_0 * self.T_mult ** (n)
            else:
                # Still inside the first cycle.
                self.T_i = self.T_0
                self.T_cur = epoch
        self.last_epoch = math.floor(epoch)

        # Temporarily mark that get_lr() is being called from within step(),
        # then write the freshly computed learning rates into each param group.
        class _enable_get_lr_call:
            def __init__(self, o):
                self.o = o

            def __enter__(self):
                self.o._get_lr_called_within_step = True
                return self

            def __exit__(self, type, value, traceback):
                self.o._get_lr_called_within_step = False
                return self

        with _enable_get_lr_call(self):
            for i, data in enumerate(zip(self.optimizer.param_groups, self.get_lr())):
                param_group, lr = data
                param_group["lr"] = lr
                self.print_lr(self.verbose, i, lr, epoch)

        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
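

# A minimal usage sketch (not part of the original module): it assumes `torch`
# is installed and uses a single dummy parameter with SGD purely to illustrate
# the warm-restart schedule. T_0=5, T_mult=2, and eta_min=0.001 are arbitrary
# illustrative values, not values prescribed by this file.
if __name__ == "__main__":
    import torch

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=0.1)
    scheduler = CosineDecayWithWarmup(optimizer, T_0=5, T_mult=2, eta_min=0.001)

    for epoch in range(15):
        optimizer.step()       # normally preceded by forward/backward on real data
        scheduler.step(epoch)  # explicit epoch; fractional values also work
        print(epoch, scheduler.get_last_lr())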