1 // MeCab: Yet Another Part-of-Speech and Morphological Analyzer
4 // lbfgs.c was ported from the FORTRAN code of lbfgs.m to C
7 // http://www.ece.northwestern.edu/~nocedal/lbfgs.html
9 // Software for Large-scale Unconstrained Optimization
10 // L-BFGS is a limited-memory quasi-Newton code for unconstrained
12 // The code has been developed at the Optimization Technology Center,
13 // a joint venture of Argonne National Laboratory and Northwestern University.
19 // - J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage(1980),
20 // Mathematics of Computation 35, pp. 773-782.
21 // - D.C. Liu and J. Nocedal. On the Limited Memory Method for
22 // Large Scale Optimization(1989),
23 // Mathematical Programming B, 45, 3, pp. 503-528.
31 static const double ftol = 1e-4;
32 static const double xtol = 1e-16;
33 static const double eps = 1e-7;
34 static const double lb3_1_gtol = 0.9;
35 static const double lb3_1_stpmin = 1e-20;
36 static const double lb3_1_stpmax = 1e20;
37 static const int lb3_1_mp = 6;
38 static const int lb3_1_lp = 6;
40 inline double sigma(double x) {
49 inline double pi(double x, double y) {
50 return sigma(x) == sigma(y) ? x : 0.0;
53 inline void daxpy_(int n, double da, const double *dx, double *dy) {
54 for (int i = 0; i < n; ++i) {
59 inline double ddot_(int size, const double *dx, const double *dy) {
60 return std::inner_product(dx, dx + size, dy, 0.0);
63 void mcstep(double *stx, double *fx, double *dx,
64 double *sty, double *fy, double *dy,
65 double *stp, double fp, double dp,
67 double stpmin, double stpmax,
70 double p, q, s, d1, d2, d3, r, gamma, theta, stpq, stpc, stpf;
73 if (*brackt && ((*stp <= std::min(*stx, *sty) ||
74 *stp >= std::max(*stx, *sty)) ||
75 *dx * (*stp - *stx) >= 0.0 || stpmax < stpmin)) {
79 double sgnd = dp * (*dx / std::abs(*dx));
84 theta =(*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
87 d1 = std::max(d1, d2);
91 gamma = s * std::sqrt(d1 * d1 - *dx / s *(dp / s));
95 p = gamma - *dx + theta;
96 q = gamma - *dx + gamma + dp;
98 stpc = *stx + r * (*stp - *stx);
99 stpq = *stx + *dx / ((*fx - fp) /
100 (*stp - *stx) + *dx) / 2 * (*stp - *stx);
101 if ((d1 = stpc - *stx, std::abs(d1)) < (d2 = stpq - *stx, std::abs(d2))) {
104 stpf = stpc + (stpq - stpc) / 2;
107 } else if (sgnd < 0.0) {
110 theta = (*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
111 d1 = std::abs(theta);
113 d1 = std::max(d1, d2);
115 s = std::max(d1, d2);
117 gamma = s * std::sqrt(d1 * d1 - *dx / s * (dp / s));
121 p = gamma - dp + theta;
122 q = gamma - dp + gamma + *dx;
124 stpc = *stp + r *(*stx - *stp);
125 stpq = *stp + dp /(dp - *dx) * (*stx - *stp);
126 if ((d1 = stpc - *stp, std::abs(d1)) > (d2 = stpq - *stp, std::abs(d2))) {
132 } else if (std::abs(dp) < std::abs(*dx)) {
135 theta = (*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
136 d1 = std::abs(theta);
138 d1 = std::max(d1, d2);
140 s = std::max(d1, d2);
143 d2 = d3 * d3 - *dx / s *(dp / s);
144 gamma = s * std::sqrt((std::max(d1, d2)));
148 p = gamma - dp + theta;
149 q = gamma + (*dx - dp) + gamma;
151 if (r < 0.0 && gamma != 0.0) {
152 stpc = *stp + r *(*stx - *stp);
153 } else if (*stp > *stx) {
158 stpq = *stp + dp /(dp - *dx) * (*stx - *stp);
160 if ((d1 = *stp - stpc, std::abs(d1)) <
161 (d2 = *stp - stpq, std::abs(d2))) {
167 if ((d1 = *stp - stpc, std::abs(d1)) >
168 (d2 = *stp - stpq, std::abs(d2))) {
178 theta =(fp - *fy) * 3 / (*sty - *stp) + *dy + dp;
179 d1 = std::abs(theta);
181 d1 = std::max(d1, d2);
183 s = std::max(d1, d2);
185 gamma = s * std::sqrt(d1 * d1 - *dy / s * (dp / s));
189 p = gamma - dp + theta;
190 q = gamma - dp + gamma + *dy;
192 stpc = *stp + r * (*sty - *stp);
194 } else if (*stp > *stx) {
216 stpf = std::min(stpmax, stpf);
217 stpf = std::max(stpmin, stpf);
219 if (*brackt && bound) {
221 d1 = *stx + (*sty - *stx) * 0.66;
222 *stp = std::min(d1, *stp);
224 d1 = *stx + (*sty - *stx) * 0.66;
225 *stp = std::max(d1, *stp);
235 class LBFGS::Mcsrch {
237 int infoc, stage1, brackt;
238 double finit, dginit, dgtest, width, width1;
239 double stx, fx, dgx, sty, fy, dgy, stmin, stmax;
246 finit(0.0), dginit(0.0), dgtest(0.0), width(0.0), width1(0.0),
247 stx(0.0), fx(0.0), dgx(0.0), sty(0.0), fy(0.0), dgy(0.0),
248 stmin(0.0), stmax(0.0) {}
250 void mcsrch(int size,
252 double f, const double *g, double *s,
254 int *info, int *nfev, double *wa, bool orthant, double C) {
255 const double p5 = 0.5;
256 const double p66 = 0.66;
257 const double xtrapf = 4.0;
258 const int maxfev = 20;
260 /* Parameter adjustments */
271 if (size <= 0 || *stp <= 0.0) {
275 dginit = ddot_(size, &g[1], &s[1]);
284 dgtest = ftol * dginit;
285 width = lb3_1_stpmax - lb3_1_stpmin;
287 for (int j = 1; j <= size; ++j) {
300 stmin = std::min(stx, sty);
301 stmax = std::max(stx, sty);
304 stmax = *stp + xtrapf * (*stp - stx);
307 *stp = std::max(*stp, lb3_1_stpmin);
308 *stp = std::min(*stp, lb3_1_stpmax);
310 if ((brackt && ((*stp <= stmin || *stp >= stmax) ||
311 *nfev >= maxfev - 1 || infoc == 0)) ||
312 (brackt && (stmax - stmin <= xtol * stmax))) {
317 for (int j = 1; j <= size; ++j) {
318 double grad_neg = 0.0;
319 double grad_pos = 0.0;
322 grad_neg = g[j] - 1.0 / C;
323 grad_pos = g[j] + 1.0 / C;
325 grad_pos = grad_neg = g[j] + 1.0 * sigma(wa[j]) / C;
327 if (grad_neg > 0.0) {
329 } else if (grad_pos < 0.0) {
334 const double p = pi(s[j], -grad);
335 const double xi = wa[j] == 0.0 ? sigma(-grad) : sigma(wa[j]);
336 x[j] = pi(wa[j] + *stp * p, xi);
339 for (int j = 1; j <= size; ++j) {
340 x[j] = wa[j] + *stp * s[j];
349 double dg = ddot_(size, &g[1], &s[1]);
350 double ftest1 = finit + *stp * dgtest;
352 if (brackt && ((*stp <= stmin || *stp >= stmax) || infoc == 0)) {
355 if (*stp == lb3_1_stpmax && f <= ftest1 && dg <= dgtest) {
358 if (*stp == lb3_1_stpmin && (f > ftest1 || dg >= dgtest)) {
361 if (*nfev >= maxfev) {
364 if (brackt && stmax - stmin <= xtol * stmax) {
367 if (f <= ftest1 && std::abs(dg) <= lb3_1_gtol * (-dginit)) {
375 if (stage1 && f <= ftest1 && dg >= std::min(ftol, lb3_1_gtol) * dginit) {
379 if (stage1 && f <= fx && f > ftest1) {
380 double fm = f - *stp * dgtest;
381 double fxm = fx - stx * dgtest;
382 double fym = fy - sty * dgtest;
383 double dgm = dg - dgtest;
384 double dgxm = dgx - dgtest;
385 double dgym = dgy - dgtest;
386 mcstep(&stx, &fxm, &dgxm, &sty, &fym, &dgym, stp, fm, dgm, &brackt,
387 stmin, stmax, &infoc);
388 fx = fxm + stx * dgtest;
389 fy = fym + sty * dgtest;
393 mcstep(&stx, &fx, &dgx, &sty, &fy, &dgy, stp, f, dg, &brackt,
394 stmin, stmax, &infoc);
399 if ((d1 = sty - stx, std::abs(d1)) >= p66 * width1) {
400 *stp = stx + p5 * (sty - stx);
403 width = (d1 = sty - stx, std::abs(d1));
411 void LBFGS::clear() {
412 iflag_ = iscn = nfev = iycn = point = npt =
413 iter = info = ispt = isyt = iypt = 0;
421 void LBFGS::lbfgs_optimize(int size,
442 mcsrch_ = new Mcsrch;
455 for (int i = 1; i <= size; ++i) {
458 ispt = size + (msize << 1);
459 iypt = ispt + size * msize;
460 for (int i = 1; i <= size; ++i) {
461 w[ispt + i] = -g[i] * diag[i];
463 stp1 = 1.0 / std::sqrt(ddot_(size, &g[1], &g[1]));
466 // MAIN ITERATION LOOP
470 if (iter == 1) goto L165;
471 if (iter > size) bound = size;
473 // COMPUTE -H*G USING THE FORMULA GIVEN IN: Nocedal, J. 1980,
474 // "Updating quasi-Newton matrices with limited storage",
475 // Mathematics of Computation, Vol.24, No.151, pp. 773-782.
476 ys = ddot_(size, &w[iypt + npt + 1], &w[ispt + npt + 1]);
477 yy = ddot_(size, &w[iypt + npt + 1], &w[iypt + npt + 1]);
478 for (int i = 1; i <= size; ++i) {
484 if (point == 0) cp = msize;
485 w[size + cp] = 1.0 / ys;
487 for (int i = 1; i <= size; ++i) {
491 bound = std::min(iter - 1, msize);
494 for (int i = 1; i <= bound; ++i) {
496 if (cp == -1) cp = msize - 1;
497 double sq = ddot_(size, &w[ispt + cp * size + 1], &w[1]);
498 int inmc = size + msize + cp + 1;
499 iycn = iypt + cp * size;
500 w[inmc] = w[size + cp + 1] * sq;
502 daxpy_(size, d, &w[iycn + 1], &w[1]);
505 for (int i = 1; i <= size; ++i) {
506 w[i] = diag[i] * w[i];
509 for (int i = 1; i <= bound; ++i) {
510 double yr = ddot_(size, &w[iypt + cp * size + 1], &w[1]);
511 double beta = w[size + cp + 1] * yr;
512 int inmc = size + msize + cp + 1;
513 beta = w[inmc] - beta;
514 iscn = ispt + cp * size;
515 daxpy_(size, beta, &w[iscn + 1], &w[1]);
522 // STORE THE NEW SEARCH DIRECTION
523 for (int i = 1; i <= size; ++i) {
524 w[ispt + point * size + i] = w[i];
528 // OBTAIN THE ONE-DIMENSIONAL MINIMIZER OF THE FUNCTION
529 // BY USING THE LINE SEARCH ROUTINE MCSRCH
535 for (int i = 1; i <= size; ++i) {
540 mcsrch_->mcsrch(size, &x[1], f, &g[1], &w[ispt + point * size + 1],
541 &stp, &info, &nfev, &diag[1], orthant, C);
543 *iflag = 1; // next value
547 std::cerr << "The line search routine mcsrch failed: error code:"
548 << info << std::endl;
553 // COMPUTE THE NEW STEP AND GRADIENT CHANGE
555 for (int i = 1; i <= size; ++i) {
556 w[ispt + npt + i] = stp * w[ispt + npt + i];
557 w[iypt + npt + i] = g[i] - w[i];
560 if (point == msize) {
564 double gnorm = std::sqrt(ddot_(size, &g[1], &g[1]));
565 double xnorm = std::max(1.0, std::sqrt(ddot_(size, &x[1], &x[1])));
566 if (gnorm / xnorm <= eps) {
567 *iflag = 0; // OK terminated