void apply_momentum_correction(rbm_t *rbm, delta_w_t *dw) {
  for(int i=0;i<rbm[0].n_outputs;i++) {
    rbm[0].bias_outputs[i]+= rbm[0].learning_rate*dw[0].delta_output_bias[i]/(double)dw[0].batch_size;
    rbm[0].output_momentum[i]+= rbm[0].learning_rate*dw[0].delta_output_bias[i]/(double)dw[0].batch_size;
    for(int j=0;j<rbm[0].n_inputs;j++) {
      double step= get_matrix_value(dw[0].delta_w, i, j); // delta_w_i_j
      double previous_w_i_j= get_matrix_value(rbm[0].io_weights, i, j);

      // If using the L2 penalty (a.k.a. "weight decay"), apply it here.
      if(rbm[0].use_l2_penalty)
        step-= rbm[0].weight_cost*previous_w_i_j; // Do I apply this to the momentum term as well, or just the correction?!

      step*= rbm[0].learning_rate/(double)dw[0].batch_size; // For the momentum method ... do I still scale by the batch size?!

      // Update weights. \theta_t = \theta_t' - \epsilon_{t-1} \nabla f(\theta_{t-1} + \mu_{t-1}v_{t-1})  (eq. 7.10, 2nd half).
      // \theta_t' was applied before taking the step.
      set_matrix_value(rbm[0].io_weights, i, j, previous_w_i_j+step);

      // Update velocities. v_t = v_t' - \epsilon_{t-1} \nabla f(\theta_{t-1} + \mu_{t-1}v_{t-1})  (eq. 7.11, 2nd half).
      // v_t' was applied before taking the step.
      double previous_momentum_i_j= get_matrix_value(rbm[0].momentum, i, j);
      set_matrix_value(rbm[0].momentum, i, j, previous_momentum_i_j+step);

      if(i==0 && rbm[0].update_input_bias) { // Only update once... and if everything says to update.
        rbm[0].bias_inputs[j]+= rbm[0].learning_rate*dw[0].delta_input_bias[j]/(double)dw[0].batch_size;
        rbm[0].input_momentum[j]+= rbm[0].learning_rate*dw[0].delta_input_bias[j]/(double)dw[0].batch_size;
      }
    }
  }
}
void apply_momentum_correction(rbm_t *rbm, delta_w_t *dw) {
  double alpha= (rbm->learning_rate/(double)dw->batch_size);
  for(int i=0;i<rbm->n_outputs;i++) {
    rbm->bias_outputs[i]+= alpha*dw->delta_output_bias[i];
    rbm->output_momentum[i]+= alpha*dw->delta_output_bias[i];
    for(int j=0;j<rbm->n_inputs;j++) {
      double step= alpha*get_matrix_value(dw->delta_w, i, j); // delta_w_i_j
      double previous_w_i_j= get_matrix_value(rbm->io_weights, i, j);

      { // NOTE: Keeping this block above the L2 penalty application leaves the penalty out of the momentum term; moving it below would fold the penalty into the momentum as well.
        // Update velocities. v_t = v_t' - \epsilon_{t-1} \nabla f(\theta_{t-1} + \mu_{t-1}v_{t-1})  (eq. 7.11, 2nd half).
        // v_t' was applied before taking the step.
        double previous_momentum_i_j= get_matrix_value(rbm->momentum, i, j);
        set_matrix_value(rbm->momentum, i, j, previous_momentum_i_j+step);
      }

      // If using the L2 penalty (a.k.a. "weight decay"), apply it here:
      // learning_rate * (delta_w/batch_size - weight_cost*w_i_j), as in http://www.cs.toronto.edu/~hinton/code/rbm.m.
      if(rbm->use_l2_penalty)
        step-= rbm->weight_cost*previous_w_i_j*rbm->learning_rate;

      // Update weights. \theta_t = \theta_t' - \epsilon_{t-1} \nabla f(\theta_{t-1} + \mu_{t-1}v_{t-1})  (eq. 7.10, 2nd half).
      // \theta_t' was applied before taking the step.
      set_matrix_value(rbm->io_weights, i, j, previous_w_i_j+step);

      if(i==0 && rbm->update_input_bias) { // Only update once... and if everything says to update.
        rbm->bias_inputs[j]+= alpha*dw->delta_input_bias[j];
        rbm->input_momentum[j]+= alpha*dw->delta_input_bias[j];
      }
    }
  }
}
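/*
 * Hedged sketch (not part of the original source): the NOTE in apply_momentum_correction()
 * concerns whether the L2 penalty should also be folded into the velocity. The scalar
 * helper below shows the two orderings side by side; all names here are illustrative only.
 */
static void update_single_weight(double *w, double *v, double scaled_delta,
                                 double learning_rate, double weight_cost,
                                 int decay_into_velocity) {
  double step= scaled_delta;                      // alpha * delta_w_i_j.
  double decay= learning_rate*weight_cost*(*w);   // L2 ("weight decay") term.
  if(decay_into_velocity) {
    step-= decay;  // Penalty applied before the velocity update, so it accumulates in v.
    *v+= step;
    *w+= step;
  } else {
    *v+= step;     // Velocity sees only the gradient part (as in the code above).
    step-= decay;  // Penalty applied to the weight update only.
    *w+= step;
  }
}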
void initial_momentum_step(rbm_t *rbm) {
  for(int i=0;i<rbm[0].n_outputs;i++) {
    rbm[0].output_momentum[i]*= rbm[0].momentum_decay;
    rbm[0].bias_outputs[i]+= rbm[0].output_momentum[i];
    for(int j=0;j<rbm[0].n_inputs;j++) {
      // Computes (eq. 7.11, 1st half): v_t = \mu_{t-1}v_{t-1}.
      double momentum_i_j= rbm[0].momentum_decay*get_matrix_value(rbm[0].momentum, i, j);
      set_matrix_value(rbm[0].momentum, i, j, momentum_i_j); // Updates momentum matrix.

      // Computes (eq. 7.10, 1st half): \theta_t = \theta_{t-1} + \mu_{t-1}v_{t-1}.
      set_matrix_value(rbm[0].io_weights, i, j, get_matrix_value(rbm[0].io_weights, i, j)+momentum_i_j);

      if(i==0 && rbm[0].update_input_bias) {
        rbm[0].input_momentum[j]*= rbm[0].momentum_decay;
        rbm[0].bias_inputs[j]+= rbm[0].input_momentum[j];
      }
    }
  }
}
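/*
 * Hedged sketch (not in the original source): one way the two halves of the momentum
 * update might be combined for a single mini-batch, per eqs. 7.10/7.11. The helpers
 * compute_batch_delta_w() and free_delta_w() are hypothetical placeholders for whatever
 * routine produces and releases the contrastive-divergence gradient in this codebase.
 */
delta_w_t *compute_batch_delta_w(rbm_t *rbm, double *batch, int batch_size); // Hypothetical.
void free_delta_w(delta_w_t *dw);                                            // Hypothetical.

void momentum_training_step(rbm_t *rbm, double *batch, int batch_size) {
  // 1st half: look-ahead step, \theta' = \theta + \mu v and v' = \mu v.
  initial_momentum_step(rbm);

  // Gradient evaluated at the shifted parameters \theta + \mu v (hypothetical helper).
  delta_w_t *dw= compute_batch_delta_w(rbm, batch, batch_size);

  // 2nd half: add the scaled gradient to both the weights and the velocities.
  apply_momentum_correction(rbm, dw);

  free_delta_w(dw); // Hypothetical cleanup.
}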
/*
 * Add matrices io_weights and delta_w; the result is stored in io_weights.
 *
 * Also updates the output bias (and, optionally, the input bias), and applies
 * the L2 penalty if enabled.
 */
void apply_delta_w(rbm_t *rbm, delta_w_t *dw) {
  for(int i=0;i<rbm[0].n_outputs;i++) {
    rbm[0].bias_outputs[i] += rbm[0].learning_rate*dw[0].delta_output_bias[i]/(double)dw[0].batch_size;
    for(int j=0;j<rbm[0].n_inputs;j++) {
      double previous_w_i_j= get_matrix_value(rbm[0].io_weights, i, j);
      double delta_w_i_j= get_matrix_value(dw[0].delta_w, i, j);

      // If using the L2 penalty (a.k.a. "weight decay"), apply it here.
      if(rbm[0].use_l2_penalty)
        delta_w_i_j-= rbm[0].weight_cost*previous_w_i_j; // Is this the right sign!?

      double new_w_i_j= previous_w_i_j+rbm[0].learning_rate*delta_w_i_j/(double)dw[0].batch_size;
      set_matrix_value(rbm[0].io_weights, i, j, new_w_i_j);

      if(i==0 && rbm[0].update_input_bias) // Only update once... and if everything says to update.
        rbm[0].bias_inputs[j] += rbm[0].learning_rate*dw[0].delta_input_bias[j]/(double)dw[0].batch_size;
    }
  }
}
/*
 * Add matrices io_weights and delta_w; the result is stored in io_weights.
 *
 * Also updates the output bias (and, optionally, the input bias), and applies
 * the L2 penalty if enabled.
 */
void apply_delta_w(rbm_t *rbm, delta_w_t *dw) {
  double alpha= (rbm->learning_rate/(double)dw->batch_size);
  for(int i=0;i<rbm->n_outputs;i++) {
    rbm->bias_outputs[i] += alpha*dw->delta_output_bias[i];
    for(int j=0;j<rbm->n_inputs;j++) {
      double previous_w_i_j= get_matrix_value(rbm->io_weights, i, j);
      double delta_w_i_j= alpha*get_matrix_value(dw->delta_w, i, j); // delta_w_i_j / batch_size, scaled by the learning rate.

      // If using the L2 penalty (a.k.a. "weight decay"), apply it here:
      // learning_rate * (delta_w/batch_size - weight_cost*w_i_j), as in http://www.cs.toronto.edu/~hinton/code/rbm.m.
      if(rbm->use_l2_penalty)
        delta_w_i_j-= rbm->weight_cost*previous_w_i_j*rbm->learning_rate;

      double new_w_i_j= previous_w_i_j+delta_w_i_j;
      set_matrix_value(rbm->io_weights, i, j, new_w_i_j);

      if(i==0 && rbm->update_input_bias) // Only update once... and if everything says to update.
        rbm->bias_inputs[j] += alpha*dw->delta_input_bias[j];
    }
  }
}
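/*
 * Hedged sketch (not in the original source): apply_delta_w() is the momentum-free
 * update, so a plain mini-batch step reduces to "compute the gradient, add it".
 * compute_batch_delta_w() and free_delta_w() are the same hypothetical placeholders
 * used in the momentum sketch above.
 */
void plain_training_step(rbm_t *rbm, double *batch, int batch_size) {
  delta_w_t *dw= compute_batch_delta_w(rbm, batch, batch_size); // CD gradient for the batch (hypothetical helper).
  apply_delta_w(rbm, dw);  // w += (learning_rate/batch_size)*delta_w - learning_rate*weight_cost*w, if L2 is enabled.
  free_delta_w(dw);        // Hypothetical cleanup.
}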