# Adam optimizer configuration
# Hyperparameters for the Adam optimizer: L2 weight decay and step size.
weight_decay = 5e-4
adam_lr = 0.01
# Build the optimizer from the mx.optimizer module with the values above.
optimizer = mx.optimizer.Adam(
    wd=weight_decay,
    learning_rate=adam_lr,
)
# SGD optimizer configuration.
wd = 0.0005  # L2 weight decay coefficient
# NOTE(review): fixed — `optimizer` was rebound to an Adam *instance* above,
# so `optimizer.SGD` would raise AttributeError. Use the mx.optimizer module,
# consistent with how Adam is constructed.
opt = mx.optimizer.SGD(
    learning_rate=lr,           # `lr` defined outside this chunk — TODO confirm
    momentum=0.9,
    wd=wd,
    rescale_grad=1.0 / len(ctx),  # scale gradients by 1/num_devices before update
    clip_gradient=None,           # gradient clipping disabled
)