double dermin ( int maxits , // Iteration limit double critlim , // Quit if crit drops this low double tol , // Convergence tolerance double (*criter) (double * , int , double * , double * ) , // Criterion func int n , // Number of variables double *x , // In/out of independent variable double ystart , // Input of starting function value double *base , // Work vector n long double *direc , // Work vector n long double *g , // Work vector n long double *h , // Work vector n long double *deriv2 , // Work vector n long int progress // Print progress? ) { int i, iter, user_quit, convergence_counter, poor_cj_counter ; double fval, fbest, high, scale, t1, t2, t3, y1, y2, y3, dlen, dot1, dot2 ; double prev_best, toler, gam, improvement ; char msg[400] ; /* Initialize for the local univariate criterion which may be called by 'glob_min' and 'brentmin' to minimize along the search direction. */ local_x = x ; local_base = base ; local_direc = direc ; local_n = n ; local_criter = criter ; /* Initialize that the user has not pressed ESCape. Evaluate the function and, more importantly, its derivatives, at the starting point. This call to criter puts the gradient into direc, but we flip its sign to get the downhill search direction. Also initialize the CJ algorithm by putting that vector in g and h. */ user_quit = 0 ; fbest = criter ( x , 1 , direc , deriv2 ) ; prev_best = 1.e30 ; for (i=0 ; i<n ; i++) direc[i] = -direc[i] ; memcpy ( g , direc , n * sizeof(double) ) ; memcpy ( h , direc , n * sizeof(double) ) ; #if DEBUG printf ( "\nDERMIN starting error = %lf", fbest ) ; #endif if (fbest < 0.0) { // If user pressed ESCape during criter call fbest = ystart ; user_quit = 1 ; goto FINISH ; } /* Main loop. For safety we impose a limit on iterations. There are two counters that have somewhat similar purposes. The first, convergence_counter, counts how many times an iteration failed to reduce the function value to the user's tolerance level. We require failure several times in a row before termination. The second, poor_cj_counter, has a (generally) higher threshold. It keeps track of poor improvement, and imposes successively small limits on gamma, thus forcing the algorithm back to steepest descent if CJ is doing poorly. */ convergence_counter = 0 ; poor_cj_counter = 0 ; iter = 0 ; for (;;) { if ((maxits > 0) && (iter++ >= maxits)) break ; if (fbest < critlim) // Do we satisfy user yet? break ; /* Convergence check */ if (prev_best <= 1.0) // If the function is small toler = tol ; // Work on absolutes else // But if it is large toler = tol * prev_best ; // Keep things relative if ((prev_best - fbest) <= toler) { // If little improvement if (++convergence_counter >= 3) // Then count how many break ; // And quit if too many } else // But a good iteration convergence_counter = 0 ; // Resets this counter /* Does the user want to quit? */ if ((user_quit = user_pressed_escape ()) != 0) break ; /* Here we do a few quick things for housekeeping. We save the base for the linear search in 'base', which lets us parameterize from t=0. We find the greatest second derivative. This makes an excellent scaling factor for the search direction so that the initial global search for a trio containing the minimum is fast. Because this is so stable, we use it to bound the generally better but unstable Newton scale. We also compute the length of the search vector and its dot product with the gradient vector, as well as the directional second derivative. That lets us use a sort of Newton's method to help us scale the initial global search to be as fast as possible. In the ideal case, the 't' parameter will be exactly equal to 'scale', the center point of the call to glob_min. */ dot1 = dot2 = dlen = 0.0 ; // For finding directional derivs high = 1.e-4 ; // For scaling glob_min for (i=0 ; i<n ; i++) { base[i] = x[i] ; // We step out from here if (deriv2[i] > high) // Keep track of second derivatives high = deriv2[i] ; // For linear search via glob_min dot1 += direc[i] * g[i] ; // Directional first derivative (neg) dot2 += direc[i] * direc[i] * deriv2[i] ; // and second dlen += direc[i] * direc[i] ; // Length of search vector } dlen = sqrt ( dlen ) ; // Actual length #if DEBUG printf ( "\n(x d1 d2) d1=%lf d2=%lf len=%lf rat=%lf h=%lf:", dot1, dot2, dlen, dot1 / dot2, 1.5 / high ) ; #endif #if DEBUG > 1 for (i=0 ; i<n ; i++) printf ( "( %lf %lf %lf)", x[i], direc[i], deriv2[i] ) ; #endif /* The search direction is in 'direc' and the maximum second derivative is in 'high'. That stable value makes a good approximate scaling factor. The ideal Newton scaling factor is numerically unstable. So compute the Newton ideal, then bound it to be near the less ideal but far more stable maximum second derivative. Pass the first function value, corresponding to t=0, to the routine in *y2 and flag this by using a negative npts. */ scale = dot1 / dot2 ; // Newton's ideal but unstable scale high = 1.5 / high ; // Less ideal but more stable heuristic if (high < 1.e-4) // Subjectively keep it realistic high = 1.e-4 ; if (scale < 0.0) // This is truly pathological scale = high ; // So stick with old reliable else if (scale < 0.1 * high) // Bound the Newton scale scale = 0.1 * high ; // To be close to the stable scale else if (scale > 10.0 * high) // Bound it both above and below scale = 10.0 * high ; y2 = prev_best = fbest ; #if DEBUG printf ( "\nStarting GLOBAL " ) ; #endif user_quit = glob_min ( 0.0 , 2.0 * scale , -3 , 0 , critlim , univar_crit , &t1 , &y1 , &t2 , &y2 , &t3 , &y3 , progress) ; #if DEBUG printf ( "\nGLOBAL t=%lf f=%lf", t2 / scale , y2 ) ; #endif if (user_quit || (y2 < critlim)) { // ESCape or good enough already? if (y2 < fbest) { // If global caused improvement for (i=0 ; i<n ; i++) // Implement that improvement x[i] = base[i] + t2 * direc[i] ; fbest = y2 ; } else { // Else revert to starting point for (i=0 ; i<n ; i++) x[i] = base[i] ; } break ; } /* We just used a crude global strategy to find three points that bracket the minimum. Refine using Brent's method. If we are possibly near the end, as indicated by the convergence_counter being nonzero, then try extra hard. */ if (convergence_counter) fbest = brentmin ( 20 , critlim , tol , 1.e-7 , univar_crit , &t1 , &t2 , &t3 , y2 , progress ) ; else fbest = brentmin ( 10 , critlim , 10.0 * tol , 1.e-5 , univar_crit , &t1 , &t2 , &t3 , y2 , progress ) ; #if DEBUG printf ( "\nBRENT t=%lf f=%lf", t2 / scale , fbest ) ; #endif /* We just completed the global and refined search. Update the current point to reflect the minimum obtained. Then evaluate the error and its derivatives there. (The linear optimizers only evaluated the error, not its derivatives.) If the user pressed ESCape during dermin, fbest will be returned negative. */ for (i=0 ; i<n ; i++) x[i] = base[i] + t2 * direc[i] ; if (fbest < 0.0) { // If user pressed ESCape fbest = -fbest ; user_quit = 1 ; break ; } improvement = (prev_best - fbest) / prev_best ; #if DEBUG printf ( "\nDIREC improvement = %lf %%", 100. * improvement ) ; #endif #if DEBUG > 1 printf ( "\a..." ) ; getch () ; #endif if (fbest < critlim) // Do we satisfy user yet? break ; fval = criter ( x , 1 , direc , deriv2 ) ; // Need derivs now for (i=0 ; i<n ; i++) // Flip sign to get direc[i] = -direc[i] ; // negative gradient if (fval < 0.0) { // If user pressed ESCape user_quit = 1 ; break ; } sprintf ( msg , "scale=%lf f=%le dlen=%le improvement=%lf%%", t2 / scale , fval, dlen, 100.0 * improvement ) ; if (progress) write_progress ( msg ) ; else write_non_progress ( msg ) ; #if DEBUG printf ( "\nf=%lf at (", fval ) ; #endif #if DEBUG > 1 for (i=0 ; i<n ; i++) printf ( " %lf", x[i] ) ; printf ( ")...\a" ) ; getch () ; #endif gam = gamma ( n , g , direc ) ; #if DEBUG dlen = 0.0 ; for (i=0 ; i<n ; i++) dlen += direc[i] * direc[i] ; printf ( "\nGamma = %lf with grad len = %lf", gam, sqrt(dlen) ) ; #endif if (gam < 0.0) gam = 0.0 ; if (gam > 10.0) // limit gamma gam = 10.0 ; if (improvement < 0.001) // Count how many times we ++poor_cj_counter ; // got poor improvement else // in a row poor_cj_counter = 0 ; if (poor_cj_counter >= 2) { // If several times if (gam > 1.0) // limit gamma gam = 1.0 ; } if (poor_cj_counter >= 6) { // If too many times poor_cj_counter = 0 ; // set gamma to 0 gam = 0.0 ; // to use steepest descent (gradient) #if DEBUG printf ( "\nSetting Gamma=0" ) ; #endif } find_new_dir ( n , gam , g , h , direc ) ; // Compute search direction } // Main loop FINISH: if (user_quit) return -fbest ; else return fbest ; }
float LayerNet::conjgrad ( TrainingSet *tptr , // Training set to use int maxits , // Maximum iterations allowed float reltol , // Relative error change tolerance float errtol // Quit if error drops this low ) { int i, j, n, iter, pnum, key, retry, max_retry ; float gam, *g, *h, *outdelta, *hid2delta, *grad, *base ; float corr, error, *cptr, *gptr, *pptr, maxgrad ; float prev_err ; char msg[80]; max_retry = 5 ; /* Allocate work memory */ MEMTEXT ( "CONJGRAD work" ) ; if (nhid2) { hid2delta = (float *) MALLOC ( nhid2 * sizeof(float) ) ; if (hid2delta == NULL) return -2.0 ; } else hid2delta = NULL ; outdelta = (float *) MALLOC ( nout * sizeof(float) ) ; if (nhid1 == 0) // No hidden layer n = nout * (nin+1) ; else if (nhid2 == 0) // One hidden layer n = nhid1 * (nin+1) + nout * (nhid1+1) ; else // Two hidden layers n = nhid1 * (nin+1) + nhid2 * (nhid1+1) + nout * (nhid2+1) ; grad = (float *) MALLOC ( n * sizeof(float) ) ; base = (float *) MALLOC ( n * sizeof(float) ) ; g = (float *) MALLOC ( n * sizeof(float) ) ; h = (float *) MALLOC ( n * sizeof(float) ) ; if ((outdelta == NULL) || (grad == NULL) || (base == NULL) || (g == NULL) || (h == NULL)) { if (hid2delta != NULL) FREE ( hid2delta ) ; if (outdelta != NULL) FREE ( outdelta ) ; if (grad != NULL) FREE ( grad ) ; if (base != NULL) FREE ( base ) ; if (g != NULL) FREE ( g ) ; if (h != NULL) FREE ( h ) ; return -2.0 ; // Flags error } prev_err = 1.e30 ; error = find_grad ( tptr , hid2delta , outdelta , grad ) ; memcpy ( g , grad , n * sizeof(float) ) ; memcpy ( h , grad , n * sizeof(float) ) ; /* Main iteration loop is here */ for (iter=0 ; iter<maxits ; iter++) { // Each iter is an epoch /* Check current error against user's max. Abort if user pressed ESCape */ sprintf ( msg , "Gradient Finding...Iter Nø %d : Error = %lf %%", iter, 100.0 * error ) ; normal_message ( msg ) ; if (error <= errtol) // If our error is within user's limit break ; // then we are done! if (error <= reltol) // Generally not necessary: reltol<errtol in break ; // practice, but help silly users if (kbhit()) { // Was a key pressed? key = getch () ; // Read it if so while (kbhit()) // Flush key buffer in case function key getch () ; // or key was held down if (key == 27) { // ESCape error = -error ; // Flags user that ESCape was pressed break ; } } prev_err = error ; error = direcmin ( tptr , error , 10 , 1.e-10 , 0.5 , base , grad ) ; if (error < 0.0) // Indicates user pressed ESCape goto CGFINISH ; if ((2.0 * (prev_err - error)) <= // If this direc gave poor result (reltol * (prev_err + error + 1.e-10))) { // will use random direc prev_err = error ; // But first exhaust grad error = find_grad ( tptr , hid2delta , outdelta , grad ) ; error = direcmin ( tptr , error , 15 , 1.e-10 , 1.e-3 , base , grad ) ; for (retry=0 ; retry<max_retry ; retry++) { for (i=0 ; i<n ; i++) grad[i] = (float) (rand() - RANDMAX/2) / (RANDMAX * 10.0) ; error = direcmin ( tptr , error , 10 , 1.e-10 , 1.e-2 , base , grad ) ; if (error < 0.0) // Indicates user pressed ESCape goto CGFINISH ; if (retry < max_retry/2) continue ; if ((2.0 * (prev_err - error)) > (reltol * (prev_err + error + 1.e-10))) break ; // Get out of retry loop if we improved enough } // For retry if (retry == max_retry) // If we exhausted all tries break ; // probably hopeless memcpy ( g , grad , n * sizeof(float) ) ; memcpy ( h , grad , n * sizeof(float) ) ; } // If this dir gave poor result prev_err = error ; /* Setup for next iteration */ error = find_grad ( tptr , hid2delta , outdelta , grad ) ; gam = gamma ( g , grad ) ; if (gam < 0.0) gam = 0.0 ; if (gam > 1.0) gam = 1.0 ; find_new_dir ( gam , g , h , grad ) ; } // This is the end of the main iteration loop /* Free work memory */ CGFINISH: MEMTEXT ( "CONJGRAD work" ) ; if (hid2delta != NULL) FREE ( hid2delta ) ; FREE ( outdelta ) ; FREE ( grad ) ; FREE ( base ) ; FREE ( g ) ; FREE ( h ) ; return error ; }