This is the decorator we will use for all of our scheduling functions, as it transforms a function taking `(start, end, pos)` into one taking `(start, end)` and returning a function that depends on `pos`.
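As a rough illustration (the names below are made up for this sketch and this is not the library's actual `annealer` implementation), such a decorator could look like:
from functools import partial

def annealer_sketch(f):
    "Illustrative sketch: turn `f(start, end, pos)` into `f(start, end)` returning a function of `pos`."
    def _inner(start, end): return partial(f, start, end)
    return _inner

@annealer_sketch
def sched_lin_sketch(start, end, pos):
    # linear interpolation between `start` and `end` as `pos` goes from 0 to 1
    return start + pos * (end - start)

test_eq(sched_lin_sketch(0, 2)(0.5), 1.)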
annealings = "NO LINEAR COS EXP".split()
p = torch.linspace(0.,1,100)
fns = [SchedNo, SchedLin, SchedCos, SchedExp]
for fn, t in zip(fns, annealings):
    plt.plot(p, [fn(2, 1e-2)(o) for o in p], label=t)
f = SchedPoly(2,1e-2,0.5)
plt.plot(p, [f(o) for o in p], label="POLY(0.5)")
plt.legend();
sched = SchedLin(0, 2)
test_eq(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0.5, 1., 1.5, 2.])
sched = SchedCos(0, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0.29289, 1., 1.70711, 2.])
sched = SchedNo(0, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0., 0., 0., 0.])
sched = SchedExp(1, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [1., 1.18921, 1.41421, 1.68179, 2.])
sched = SchedPoly(0, 2, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0.125, 0.5, 1.125, 2.])
p = torch.linspace(0.,1,100)
pows = [0.5,1.,2.]
for e in pows:
    f = SchedPoly(2, 0, e)
    plt.plot(p, [f(o) for o in p], label=f'power {e}')
plt.legend();
`pcts` must be a list of positive numbers that add up to 1 and has the same length as `scheds`. The generated function will use `scheds[0]` from 0 to `pcts[0]`, then `scheds[1]` from `pcts[0]` to `pcts[0]+pcts[1]`, and so forth.
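To make the position bookkeeping concrete, here is a minimal sketch of how such a combined schedule could rescale the global position into each phase (illustrative only, not the library implementation):
def combine_scheds_sketch(pcts, scheds):
    "Illustrative only: dispatch a global position in [0,1] to the right phase, rescaled to [0,1]."
    bounds = [0.]
    for p_ in pcts: bounds.append(bounds[-1] + p_)
    def _inner(pos):
        # pick the first phase whose upper bound is still above `pos` (the last phase handles pos=1.)
        i = next((j for j in range(len(scheds)) if pos < bounds[j+1]), len(scheds)-1)
        local = (pos - bounds[i]) / (bounds[i+1] - bounds[i])
        return scheds[i](local)
    return _inner

f_sketch = combine_scheds_sketch([0.3, 0.7], [SchedCos(0.3, 0.6), SchedCos(0.6, 0.2)])
test_close(f_sketch(0.3), 0.6)   # start of the second phase
test_close(f_sketch(0.65), 0.4)  # halfway through the cosine from 0.6 to 0.2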
p = torch.linspace(0.,1,100)
f = combine_scheds([0.3,0.7], [SchedCos(0.3,0.6), SchedCos(0.6,0.2)])
plt.plot(p, [f(o) for o in p]);
p = torch.linspace(0.,1,100)
f = combine_scheds([0.3,0.2,0.5], [SchedLin(0.,1.), SchedNo(1.,1.), SchedCos(1., 0.)])
plt.plot(p, [f(o) for o in p]);
This is a useful helper function for the 1cycle policy. `pct` is used for the `start` to `middle` part, and `1-pct` for the `middle` to `end` part. It handles floats or collections of floats. For example:
f = combined_cos(0.25,0.5,1.,0.)
plt.plot(p, [f(o) for o in p]);
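One way to see how `combined_cos` relates to `combine_scheds` is to build the same two-phase cosine shape by hand; the check below assumes that equivalence and is meant as an illustration, not a statement about the implementation:
# Assumed equivalent shape: SchedCos start→middle for `pct` of training, then SchedCos middle→end
g = combine_scheds([0.25, 0.75], [SchedCos(0.5, 1.), SchedCos(1., 0.)])
test_close([f(o) for o in p], [g(o) for o in p])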
`scheds` is a dictionary with one key for each hyper-parameter you want to schedule, with either a scheduler or a list of schedulers as values (in the second case, the list must have the same length as the number of parameter groups of the optimizer).
learn = synth_learner()
sched = {'lr': SchedLin(1e-3, 1e-2)}
learn.fit(1, cbs=ParamScheduler(sched))
n = len(learn.dls.train)
test_close(learn.recorder.hps['lr'], [1e-3 + (1e-2-1e-3) * i/n for i in range(n)])
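Several hyper-parameters can be scheduled at once by adding more keys; the sketch below assumes the optimizer exposes a `mom` hyper-parameter (as the momentum tests later in this page do), and the values are purely illustrative:
# Sketch: scheduling two hyper-parameters in one ParamScheduler
learn = synth_learner()
scheds = {'lr': SchedCos(1e-3, 1e-2), 'mom': SchedLin(0.95, 0.85)}
learn.fit(1, cbs=ParamScheduler(scheds))
n = len(learn.dls.train)
test_eq(len(learn.recorder.hps['lr']), n)
test_eq(len(learn.recorder.hps['mom']), n)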
The 1cycle policy was introduced by Leslie N. Smith et al. in Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates. It schedules the learning rate with a cosine annealing from `lr_max/div` to `lr_max`, then to `lr_max/div_final` (pass an array to `lr_max` if you want to use differential learning rates), and the momentum with cosine annealing according to the values in `moms`. The first phase takes `pct_start` of the training. You can optionally pass additional `cbs` and `reset_opt`.
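As a quick illustration of those arguments, a call overriding the defaults might look like the following (the values are illustrative, not recommendations):
# Sketch: an explicit fit_one_cycle call with non-default hyper-parameters
learn = synth_learner()
learn.fit_one_cycle(2, lr_max=1e-2, pct_start=0.3, div=10., div_final=1e4, moms=(0.95, 0.85, 0.95))
learn.recorder.plot_sched()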
learn = synth_learner(lr=1e-2)
xb,yb = learn.dls.one_batch()
init_loss = learn.loss_func(learn.model(xb), yb)
learn.fit_one_cycle(2)
xb,yb = learn.dls.one_batch()
final_loss = learn.loss_func(learn.model(xb), yb)
assert final_loss < init_loss
lrs,moms = learn.recorder.hps['lr'],learn.recorder.hps['mom']
test_close(lrs, [combined_cos(0.25,1e-2/25,1e-2,1e-7)(i/20) for i in range(20)])
test_close(moms, [combined_cos(0.25,0.95,0.85,0.95)(i/20) for i in range(20)])
learn = synth_learner()
learn.fit_one_cycle(2)
learn.recorder.plot_sched()
learn = synth_learner()
learn.fit_flat_cos(2)
learn.recorder.plot_sched()
This schedule was introduced by Ilya Loshchilov et al. in SGDR: Stochastic Gradient Descent with Warm Restarts. It consists of `n_cycles` that are cosine annealings from `lr_max` (defaults to the `Learner` lr) to 0, with a length of `cycle_len * cycle_mult**i` for the `i`-th cycle (the first one is `cycle_len`-long, then we multiply the length by `cycle_mult` at each cycle). You can optionally pass additional `cbs` and `reset_opt`.
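To make the cycle-length arithmetic concrete: with `cycle_len=1` and the default `cycle_mult=2`, three cycles last 1, 2 and 4 epochs, i.e. 7 epochs in total, which is what the test below checks.
# Quick check of the cycle-length formula described above
cycle_len, cycle_mult, n_cycles = 1, 2, 3
test_eq(sum(cycle_len * cycle_mult**i for i in range(n_cycles)), 7)  # 1 + 2 + 4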
learn = synth_learner()
with learn.no_logging(): learn.fit_sgdr(3, 1)
test_eq(learn.n_epoch, 7)
iters = [k * len(learn.dls.train) for k in [0,1,3,7]]
for i in range(3):
    n = iters[i+1]-iters[i]
    #The start of a cycle can be mixed up with the end (0) of the previous cycle due to rounding errors, so we test from +1
    test_close(learn.recorder.lrs[iters[i]+1:iters[i+1]], [SchedCos(learn.lr, 0)(k/n) for k in range(1,n)])
learn.recorder.plot_sched()
learn.fine_tune(1)
from fastai.vision.all import *
set_seed(99, True)
path = untar_data(URLs.PETS)/'images'
image_files = get_image_files(path)
if sys.platform == "win32" and IN_NOTEBOOK:
    image_files = random.choices(image_files, k=int(len(image_files)/8))
    print("Randomly select 1/8 files in NOTEBOOK on Windows to save time")

# pickle can't serialize lambda functions, so use a named function for the label
def _label_func(x):
    return x[0].isupper()
dls = ImageDataLoaders.from_name_func(
    path, image_files, valid_pct=0.2,
    label_func=_label_func, item_tfms=Resize(224))
learn = cnn_learner(dls, resnet18)
learn.fit(1)
learn.opt.state_dict()['state'][1]['grad_avg']
learn.lr_find()
learn.opt.state_dict()['state'][1]['grad_avg']
learn.lr_find()
learn.opt.state_dict()['state'][1]['grad_avg']
import tempfile
from fastcore.basics import range_of
from fastcore.xtras import Path
with tempfile.TemporaryDirectory() as d:
    learn = synth_learner(path=Path(d))
    init_a,init_b = learn.model.a,learn.model.b
    with learn.no_logging(): learn.fit(20, cbs=LRFinder(num_it=100))
    assert len(learn.recorder.lrs) <= 100
    test_eq(len(learn.recorder.lrs), len(learn.recorder.losses))
    #Check we stop if the loss diverges
    if len(learn.recorder.lrs) < 100: assert learn.recorder.losses[-1] > 4 * min(learn.recorder.losses)
    #Test the schedule
    test_eq(learn.recorder.lrs, [SchedExp(1e-7, 10)(i/100) for i in range_of(learn.recorder.lrs)])
    #No validation data was used
    test_eq([len(v) for v in learn.recorder.values], [1 for _ in range_of(learn.recorder.values)])
    #Model loaded back properly
    test_eq(learn.model.a, init_a)
    test_eq(learn.model.b, init_b)
    test_eq(learn.opt.state_dict()['state'], {})
First introduced by Leslie N. Smith in Cyclical Learning Rates for Training Neural Networks, the LR Finder trains the model with exponentially growing learning rates from `start_lr` to `end_lr` for `num_it` iterations and stops in case of divergence (unless `stop_div=False`), then plots the losses vs the learning rates with a log scale.
A good value for the learning rate is then either:
- one tenth of the minimum before the divergence
- the point where the slope is the steepest

Those two values are returned by default by the Learning Rate Finder.
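Treating the growth as a plain exponential, the learning rate at fraction `pos` of `num_it` is `start_lr * (end_lr/start_lr)**pos`; the quick check below assumes the default range of 1e-7 to 10 used in the test further up (an illustration, not a statement about the internals):
# SchedExp(start, end)(pos) is assumed here to equal start * (end/start)**pos
start_lr, end_lr = 1e-7, 10
test_close([SchedExp(start_lr, end_lr)(pos) for pos in (0., 0.5, 1.)],
           [start_lr * (end_lr/start_lr)**pos for pos in (0., 0.5, 1.)])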
with tempfile.TemporaryDirectory() as d:
    learn = synth_learner(path=Path(d))
    weights_pre_lr_find = L(learn.model.parameters())
    lr_min,lr_steep = learn.lr_find()
    weights_post_lr_find = L(learn.model.parameters())
    test_eq(weights_pre_lr_find, weights_post_lr_find)
print(f"Minimum/10: {lr_min:.2e}, steepest point: {lr_steep:.2e}")