Commit 51dde1fb authored by fhoelzken

added gradient clipping and adaptive learning rate to Adam

parent 92bea08a
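The first change named in the commit message, gradient clipping, enters in updateParameters() below: each gradient component is clipped to a window of plus or minus 20% of the current parameter value before the Adam moment updates. A minimal stand-alone sketch of that rule, assuming a plain std::vector interface instead of the Pawian parameter classes (the function and argument names here are illustrative only, not part of the repository):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Sketch of the clipping rule from the diff below:
    // clip gradient i to [-relClip*|param_i|, +relClip*|param_i|],
    // mirroring clip_value = 0.2 * std::abs(pawianParams->Value(i)).
    // Note: a parameter sitting exactly at zero gets a zero-width window,
    // so its gradient is zeroed as well.
    void clipGradientsRelative(const std::vector<double>& params,
                               std::vector<double>& gradients,
                               double relClip = 0.2) {
      for (std::size_t i = 0; i < params.size() && i < gradients.size(); ++i) {
        const double clip = relClip * std::abs(params[i]);
        gradients[i] = std::min(clip, std::max(-clip, gradients[i]));
      }
    }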
@@ -26,81 +26,114 @@
 #include <string>
 #include <math.h>
 #include <iomanip>
 #include "MinFunctions/AdamMinimizer.hh"
 #include "PwaUtils/GlobalEnv.hh"
 #include "ErrLogger/ErrLogger.hh"
 #include "PwaUtils/GlobalEnv.hh"
 #include "ConfigParser/ParserBase.hh"
 
-AdamMinimizer::AdamMinimizer(std::shared_ptr<AbsFcn<FCNGradientBase>> theAbsFcnPtr, std::shared_ptr<AbsPawianParameters> upar) :
-  AbsPawianMinimizer<FCNGradientBase>(theAbsFcnPtr, upar)
-  ,_max_iterations(1000)
-  ,_s(upar->Params().size(),0.)
-  ,_v(upar->Params().size(),0.)
-  ,_learning_rate(0.2)
-  ,_iterations(0)
-  ,_currentPawianParams(std::shared_ptr<AbsPawianParameters>(upar->Clone()))
-  ,_bestLH(1.e20)
-  ,_bestPawianParams(std::shared_ptr<AbsPawianParameters>(upar->Clone()))
-  ,_noItersWoImprovement(0)
-{
-}
-
-AdamMinimizer::~AdamMinimizer()
-{
-}
-
-void AdamMinimizer::minimize(){
-  if(_iterations==0){
-    double currentLH=_absFcn->operator()(_bestPawianParams->Params());
-    if(currentLH<_bestLH) _bestLH=currentLH;
-    InfoMsg << "best LH: " << _bestLH << "\tnoIters: " << _iterations << "\tnoItersWoImprovement: " << _noItersWoImprovement << endmsg;
-  }
-  while(_iterations <= _max_iterations && _noItersWoImprovement<100){
-    if(_noItersWoImprovement>0 && _noItersWoImprovement%10 == 0){ //continue with parameters of the best fit
-      _currentPawianParams=std::shared_ptr<AbsPawianParameters>(_bestPawianParams->Clone());
-    }
-    std::vector<double> derivatives = _absFcn->Gradient(_currentPawianParams->Params());
-    updateParameters(_currentPawianParams, derivatives, _s, _v, _iterations);
-    double currentLH=_absFcn->operator()(_currentPawianParams->Params());
-    if(currentLH<_bestLH){
-      _bestLH=currentLH;
-      _bestPawianParams=std::shared_ptr<AbsPawianParameters>(_currentPawianParams->Clone());
-      _noItersWoImprovement=0;
-    }
-    else ++_noItersWoImprovement;
-    ++_iterations;
-    _learning_rate *= 0.999;
-    InfoMsg << "best LH: " << _bestLH << "\tnoIters: " << _iterations << "\tnoItersWoImprovement: " << _noItersWoImprovement << endmsg;
-  }
-}
-
-void AdamMinimizer::updateParameters(std::shared_ptr<AbsPawianParameters> pawianParams, std::vector<double>& gradients, std::vector<double>& s, std::vector<double>& v, int t){
-  double beta1=0.9;
-  double beta2=0.99;
-  double epsilon=1.*pow(10., -8);
-  for(unsigned int i = 0; i < pawianParams->Params().size(); ++i){
-    if (pawianParams->IsFixed(i)) continue;
-    s.at(i) = beta1 * s.at(i) + (1.0 - beta1) * gradients.at(i);
-    v.at(i) = beta2 * v.at(i) + (1.0 - beta2) * gradients.at(i) * gradients.at(i);
-
-    double s_hat = s.at(i) / (1.0 - pow(beta1 , (t + 1)));
-    double v_hat = v.at(i) / (1.0 - pow(beta2 , (t + 1)));
-    double newVal = pawianParams->Value(i) - _learning_rate * s_hat / (std::sqrt(v_hat) + epsilon);
-    if(pawianParams->HasLimits(i)){
-      if(newVal>pawianParams->UpperLimit(i)) newVal=pawianParams->UpperLimit(i);
-      else if(newVal<pawianParams->LowerLimit(i)) newVal=pawianParams->LowerLimit(i);
-    }
-    pawianParams->SetValue(i,newVal);
-  }
-}
+AdamMinimizer::AdamMinimizer(std::shared_ptr<AbsFcn<FCNGradientBase>> theAbsFcnPtr, std::shared_ptr<AbsPawianParameters> upar)
+  : AbsPawianMinimizer<FCNGradientBase>(theAbsFcnPtr, upar),
+    _max_iterations(5000),
+    _s(upar->Params().size(), 0.0),
+    _v(upar->Params().size(), 0.0),
+    _learning_rate(0.001),
+    _initial_lr(0.001),
+    _decay_rate(0.01),
+    _iterations(0),
+    _currentPawianParams(std::shared_ptr<AbsPawianParameters>(upar->Clone())),
+    _bestLH(1.e20),
+    _bestPawianParams(std::shared_ptr<AbsPawianParameters>(upar->Clone())),
+    _noItersWoImprovement(0),
+    _patience(10) {
+}
+
+AdamMinimizer::~AdamMinimizer() {}
+
+void AdamMinimizer::minimize() {
+  const double gradient_tolerance = 1e-8;
+  if (_iterations == 0) {
+    double currentLH = _absFcn->operator()(_bestPawianParams->Params());
+    if (currentLH < _bestLH) _bestLH = currentLH;
+    std::cout << "Initial best LH: " << _bestLH << "\tnoIters: " << _iterations << "\tnoItersWoImprovement: " << _noItersWoImprovement << std::endl;
+  }
+  while (_iterations <= _max_iterations && _noItersWoImprovement < 1000) {
+    if (_noItersWoImprovement > 0 && _noItersWoImprovement >= _patience) {
+      _currentPawianParams = std::shared_ptr<AbsPawianParameters>(_bestPawianParams->Clone());
+      _patience = std::min(_patience * 1.05, 30.0);
+      _noItersWoImprovement = 0;
+    }
+    std::vector<double> derivatives = _absFcn->Gradient(_currentPawianParams->Params());
+    updateParameters(_currentPawianParams, derivatives, _s, _v, _iterations);
+    double currentLH = _absFcn->operator()(_currentPawianParams->Params());
+
+    if (currentLH < _bestLH) {
+      _bestLH = currentLH;
+      _bestPawianParams = std::shared_ptr<AbsPawianParameters>(_currentPawianParams->Clone());
+      _noItersWoImprovement = 0;
+    } else {
+      ++_noItersWoImprovement;
+    }
+
+    double gradientNorm = 0.0;
+    for (const auto& grad : derivatives) {
+      gradientNorm += grad * grad;
+    }
+    gradientNorm = sqrt(gradientNorm);
+
+    if (gradientNorm < gradient_tolerance && std::abs(currentLH - _bestLH) / std::abs(_bestLH) < 1e-4) {
+      std::cout << "Stopping early: Gradient norm below tolerance and no significant likelihood improvement." << std::endl;
+      break;
+    }
+
+    ++_iterations;
+    _learning_rate *= 0.999;
+    std::cout << "best LH: " << _bestLH << "\tnoIters: " << _iterations << "\tnoItersWoImprovement: " << _noItersWoImprovement << std::endl;
+  }
+}
+
+void AdamMinimizer::updateParameters(std::shared_ptr<AbsPawianParameters> pawianParams, std::vector<double>& gradients, std::vector<double>& s, std::vector<double>& v, int t) {
+  double beta1 = 0.9;
+  double beta2 = 0.999;
+  double epsilon = 1e-8;
+  for (unsigned int i = 0; i < pawianParams->Params().size(); ++i) {
+    if (pawianParams->IsFixed(i)) continue;
+
+    double clip_value = 0.2 * std::abs(pawianParams->Value(i));
+    gradients[i] = std::min(clip_value, std::max(-clip_value, gradients[i]));
+
+    s.at(i) = beta1 * s.at(i) + (1.0 - beta1) * gradients.at(i);
+    v.at(i) = beta2 * v.at(i) + (1.0 - beta2) * gradients.at(i) * gradients.at(i);
+
+    double s_hat = s.at(i) / (1.0 - pow(beta1, (t + 1)));
+    double v_hat = v.at(i) / (1.0 - pow(beta2, (t + 1)));
+
+    //double learning_rate_t = _initial_lr / (1.0 + _decay_rate * t);
+    //double adaptive_lr = learning_rate_t / (std::abs(pawianParams->Value(i)) + epsilon);
+    //double gradient_update = adaptive_lr * s_hat / (std::sqrt(v_hat) + epsilon);
+    double newVal = pawianParams->Value(i) - _learning_rate * s_hat / (std::sqrt(v_hat) + epsilon);
+    if (pawianParams->HasLimits(i)) {
+      if (newVal > pawianParams->UpperLimit(i)) newVal = pawianParams->UpperLimit(i);
+      else if (newVal < pawianParams->LowerLimit(i)) newVal = pawianParams->LowerLimit(i);
+    }
+    pawianParams->SetValue(i, newVal);
+  }
+}
 
 void AdamMinimizer::printFitResult(double evtWeightSumData){
   InfoMsg << "\n\n********************** Final fit parameters *************************" << endmsg;
   _bestPawianParams->print(std::cout, true);
...
@@ -61,12 +61,14 @@ protected:
   std::vector<double> _s;
   std::vector<double> _v;
   double _learning_rate;
+  double _initial_lr;
+  double _decay_rate;
   unsigned int _iterations;
   std::shared_ptr<AbsPawianParameters> _currentPawianParams;
   double _bestLH;
   std::shared_ptr<AbsPawianParameters> _bestPawianParams;
   unsigned int _noItersWoImprovement;
+  double _patience;
   void updateParameters(std::shared_ptr<AbsPawianParameters> pawianParams, std::vector<double>& gradients, std::vector<double>& s, std::vector<double>& v, int t);
...
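The second change named in the commit message, the adaptive learning rate, shows up in two places in the diff: the active schedule in minimize() keeps multiplying _learning_rate (now starting at 0.001) by 0.999 each iteration, while an inverse-time schedule built from the new _initial_lr and _decay_rate members is present but commented out in updateParameters(). A small stand-alone comparison of the two schedules, reusing the constants from the diff (everything else here is illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double initial_lr = 0.001;  // _initial_lr / initial _learning_rate in the diff
      const double decay_rate = 0.01;   // _decay_rate in the diff
      for (int t = 0; t <= 1000; t += 250) {
        // schedule that is active in minimize(): lr *= 0.999 once per iteration
        const double lr_multiplicative = initial_lr * std::pow(0.999, t);
        // inverse-time schedule left commented out in updateParameters()
        const double lr_inverse_time = initial_lr / (1.0 + decay_rate * t);
        std::printf("t=%4d  multiplicative=%.6f  inverse-time=%.6f\n",
                    t, lr_multiplicative, lr_inverse_time);
      }
      return 0;
    }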