/* INTERNAL FUNCTION
   The iRprop- algorithm
 */
void fann_update_weights_irpropm(struct fann *ann, unsigned int first_weight, unsigned int past_end)
{
    fann_type *train_slopes = ann->train_slopes;
    fann_type *weights = ann->weights;
    fann_type *prev_steps = ann->prev_steps;
    fann_type *prev_train_slopes = ann->prev_train_slopes;

    fann_type prev_step, slope, prev_slope, next_step, same_sign;

    float increase_factor = ann->rprop_increase_factor;    /* 1.2 */
    float decrease_factor = ann->rprop_decrease_factor;    /* 0.5 */
    float delta_min = ann->rprop_delta_min;    /* 0.0 */
    float delta_max = ann->rprop_delta_max;    /* 50.0 */

    unsigned int i = first_weight;
    unsigned int *connections_to_weights = ann->connections_to_weights;

    for(; i != past_end; i++)
    {
        /* prev_step may not be zero, because then the training will stop */
        prev_step = fann_max(prev_steps[i], (fann_type) 0.0001);
        slope = train_slopes[i];
        prev_slope = prev_train_slopes[i];
        same_sign = prev_slope * slope;

        if(same_sign >= 0.0)
            next_step = fann_min(prev_step * increase_factor, delta_max);
        else
        {
            next_step = fann_max(prev_step * decrease_factor, delta_min);
            slope = 0;
        }

        if(slope < 0)
        {
            weights[connections_to_weights[i]] -= next_step;
            if(weights[connections_to_weights[i]] < -1500)
                weights[connections_to_weights[i]] = -1500;
        }
        else
        {
            weights[connections_to_weights[i]] += next_step;
            if(weights[connections_to_weights[i]] > 1500)
                weights[connections_to_weights[i]] = 1500;
        }

        /*
        if(i == 2)
        {
            printf("weight=%f, slope=%f, next_step=%f, prev_step=%f\n",
                   weights[i], slope, next_step, prev_step);
        }
        */

        /* update global data arrays */
        prev_steps[i] = next_step;
        prev_train_slopes[i] = slope;
        train_slopes[i] = 0.0;
    }
}
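/* Usage sketch (disabled; not part of the library): in the public FANN API,
 * selecting FANN_TRAIN_RPROP runs this iRprop- update. A minimal training
 * program, assuming a training file "xor.data" exists, could look like this. */
#if 0
#include "fann.h"

int main(void)
{
    /* 3 layers: 2 inputs, 3 hidden neurons, 1 output */
    struct fann *ann = fann_create_standard(3, 2, 3, 1);
    struct fann_train_data *data = fann_read_train_from_file("xor.data");

    /* FANN_TRAIN_RPROP selects the iRprop- variant implemented above */
    fann_set_training_algorithm(ann, FANN_TRAIN_RPROP);
    fann_train_on_data(ann, data, 1000 /* max epochs */,
                       100 /* epochs between reports */, 0.001f /* desired error */);

    fann_destroy_train(data);
    fann_destroy(ann);
    return 0;
}
#endif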
/* INTERNAL FUNCTION
   Adjust the stepwise functions (if used)
 */
void fann_update_stepwise(struct fann *ann)
{
    unsigned int i = 0;

    /* Calculate the parameters for the stepwise linear sigmoid function
     * in fixed point, using a rewritten sigmoid function.
     * Target results: 0.005, 0.05, 0.25, 0.75, 0.95, 0.995
     */
    ann->sigmoid_results[0] = fann_max((fann_type) (ann->multiplier / 200.0 + 0.5), 1);
    ann->sigmoid_results[1] = fann_max((fann_type) (ann->multiplier / 20.0 + 0.5), 1);
    ann->sigmoid_results[2] = fann_max((fann_type) (ann->multiplier / 4.0 + 0.5), 1);
    ann->sigmoid_results[3] = fann_min(ann->multiplier - (fann_type) (ann->multiplier / 4.0 + 0.5), ann->multiplier - 1);
    ann->sigmoid_results[4] = fann_min(ann->multiplier - (fann_type) (ann->multiplier / 20.0 + 0.5), ann->multiplier - 1);
    ann->sigmoid_results[5] = fann_min(ann->multiplier - (fann_type) (ann->multiplier / 200.0 + 0.5), ann->multiplier - 1);

    ann->sigmoid_symmetric_results[0] = fann_max((fann_type) ((ann->multiplier / 100.0) - ann->multiplier - 0.5),
                                                 (fann_type) (1 - (fann_type) ann->multiplier));
    ann->sigmoid_symmetric_results[1] = fann_max((fann_type) ((ann->multiplier / 10.0) - ann->multiplier - 0.5),
                                                 (fann_type) (1 - (fann_type) ann->multiplier));
    ann->sigmoid_symmetric_results[2] = fann_max((fann_type) ((ann->multiplier / 2.0) - ann->multiplier - 0.5),
                                                 (fann_type) (1 - (fann_type) ann->multiplier));
    ann->sigmoid_symmetric_results[3] = fann_min(ann->multiplier - (fann_type) (ann->multiplier / 2.0 + 0.5),
                                                 ann->multiplier - 1);
    ann->sigmoid_symmetric_results[4] = fann_min(ann->multiplier - (fann_type) (ann->multiplier / 10.0 + 0.5),
                                                 ann->multiplier - 1);
    ann->sigmoid_symmetric_results[5] = fann_min(ann->multiplier - (fann_type) (ann->multiplier / 100.0 + 1.0),
                                                 ann->multiplier - 1);

    for(i = 0; i < 6; i++)
    {
        ann->sigmoid_values[i] =
            (fann_type) (((log(ann->multiplier / (float) ann->sigmoid_results[i] - 1) *
                           (float) ann->multiplier) / -2.0) * (float) ann->multiplier);
        ann->sigmoid_symmetric_values[i] =
            (fann_type) (((log((ann->multiplier - (float) ann->sigmoid_symmetric_results[i]) /
                               ((float) ann->sigmoid_symmetric_results[i] + ann->multiplier)) *
                           (float) ann->multiplier) / -2.0) * (float) ann->multiplier);
    }
}
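/* Worked example (disabled; a sketch of the math only): FANN's sigmoid is
 * y = 1/(1 + exp(-2*s*x)), so for steepness s = 1 the input producing a
 * target output y is x = -log(1/y - 1)/2. The fixed-point code above computes
 * these same breakpoints, scaled by ann->multiplier (the input breakpoints
 * pick up a second multiplier factor). */
#if 0
#include <math.h>
#include <stdio.h>

int main(void)
{
    const double targets[6] = { 0.005, 0.05, 0.25, 0.75, 0.95, 0.995 };
    int i;

    for(i = 0; i < 6; i++)
    {
        /* invert y = 1/(1 + exp(-2*x))  =>  x = -log(1/y - 1)/2 */
        double x = -log(1.0 / targets[i] - 1.0) / 2.0;
        printf("sigmoid(%+.4f) = %.3f\n", x, targets[i]);
    }
    return 0;
}
#endif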
float train_epoch_sarprop_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb,
                                   vector< vector<fann_type> >& predicted_outputs)
{
    if(ann->prev_train_slopes == NULL)
    {
        fann_clear_train_arrays(ann);
    }

    fann_reset_MSE(ann);
    predicted_outputs.resize(data->num_data, vector<fann_type>(data->num_output));
    vector<struct fann *> ann_vect(threadnumb);
    int i = 0, j = 0;

    // generate copies of the ann
    omp_set_dynamic(0);
    omp_set_num_threads(threadnumb);
    #pragma omp parallel private(j)
    {
        #pragma omp for schedule(static)
        for(i = 0; i < (int)threadnumb; i++)
        {
            ann_vect[i] = fann_copy(ann);
        }

        // parallel computing of the updates
        #pragma omp for schedule(static)
        for(i = 0; i < (int)data->num_data; i++)
        {
            j = omp_get_thread_num();

            fann_type* temp_predicted_output = fann_run(ann_vect[j], data->input[i]);
            for(unsigned int k = 0; k < data->num_output; ++k)
            {
                predicted_outputs[i][k] = temp_predicted_output[k];
            }

            fann_compute_MSE(ann_vect[j], data->output[i]);
            fann_backpropagate_MSE(ann_vect[j]);
            fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1);
        }
    }

    {
        fann_type *weights = ann->weights;
        fann_type *prev_steps = ann->prev_steps;
        fann_type *prev_train_slopes = ann->prev_train_slopes;

        const unsigned int first_weight = 0;
        const unsigned int past_end = ann->total_connections;
        const unsigned int epoch = ann->sarprop_epoch;

        fann_type next_step;

        /* These should be set from variables */
        const float increase_factor = ann->rprop_increase_factor;    /* 1.2 */
        const float decrease_factor = ann->rprop_decrease_factor;    /* 0.5 */
        /* TODO: why is delta_min 0.0 in iRprop? SARPROP uses 1x10^-6 (Braun and Riedmiller, 1993) */
        const float delta_min = 0.000001f;
        const float delta_max = ann->rprop_delta_max;    /* 50.0 */
        const float weight_decay_shift = ann->sarprop_weight_decay_shift;    /* ld 0.01 = -6.644 */
        const float step_error_threshold_factor = ann->sarprop_step_error_threshold_factor;    /* 0.1 */
        const float step_error_shift = ann->sarprop_step_error_shift;    /* ld 3 = 1.585 */
        const float T = ann->sarprop_temperature;

        // merge of MSEs
        for(i = 0; i < (int)threadnumb; ++i)
        {
            ann->MSE_value += ann_vect[i]->MSE_value;
            ann->num_MSE += ann_vect[i]->num_MSE;
        }

        const float MSE = fann_get_MSE(ann);
        const float RMSE = (float)sqrt(MSE);

        /* for all weights; TODO: are biases included? */
        omp_set_dynamic(0);
        omp_set_num_threads(threadnumb);
        #pragma omp parallel private(next_step)
        {
            #pragma omp for schedule(static)
            for(i = first_weight; i < (int)past_end; i++)
            {
                /* TODO: confirm whether 1x10^-6 == delta_min is really better */
                /* prev_step may not be zero, because then the training will stop */
                const fann_type prev_step = fann_max(prev_steps[i], (fann_type) 0.000001);

                /* calculate SARPROP slope; TODO: better as new error function? (see SARPROP paper) */
                fann_type temp_slopes = 0.0;
                unsigned int k;
                fann_type *train_slopes;
                for(k = 0; k < threadnumb; ++k)
                {
                    train_slopes = ann_vect[k]->train_slopes;
                    temp_slopes += train_slopes[i];
                    train_slopes[i] = 0.0;
                }
                temp_slopes = -temp_slopes - weights[i] * (fann_type)fann_exp2(-T * epoch + weight_decay_shift);

                next_step = 0.0;

                /* TODO: is prev_train_slopes[i] 0.0 in the beginning? */
                const fann_type prev_slope = prev_train_slopes[i];
                const fann_type same_sign = prev_slope * temp_slopes;

                if(same_sign > 0.0)
                {
                    next_step = fann_min(prev_step * increase_factor, delta_max);
                    /* TODO: are the signs inverted? see differences between SARPROP paper and iRprop */
                    if(temp_slopes < 0.0)
                        weights[i] += next_step;
                    else
                        weights[i] -= next_step;
                }
                else if(same_sign < 0.0)
                {
                    #ifndef RAND_MAX
                    #define RAND_MAX 0x7fffffff
                    #endif
                    if(prev_step < step_error_threshold_factor * MSE)
                        next_step = prev_step * decrease_factor +
                            (float)rand() / RAND_MAX * RMSE * (fann_type)fann_exp2(-T * epoch + step_error_shift);
                    else
                        next_step = fann_max(prev_step * decrease_factor, delta_min);

                    temp_slopes = 0.0;
                }
                else
                {
                    if(temp_slopes < 0.0)
                        weights[i] += prev_step;
                    else
                        weights[i] -= prev_step;
                }

                /* update global data arrays */
                prev_steps[i] = next_step;
                prev_train_slopes[i] = temp_slopes;
            }
        }
    }

    ++(ann->sarprop_epoch);

    // already computed before
    /*
    // merge of MSEs
    for(i = 0; i < threadnumb; ++i)
    {
        ann->MSE_value += ann_vect[i]->MSE_value;
        ann->num_MSE += ann_vect[i]->num_MSE;
    }
    */

    // destroy the copies of the ann
    for(i = 0; i < (int)threadnumb; i++)
    {
        fann_destroy(ann_vect[i]);
    }

    return fann_get_MSE(ann);
}
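// Usage sketch (disabled; not part of the library): a minimal driving loop
// for the parallel SARPROP epoch above. The thread count and error goal are
// assumptions; requires an OpenMP-enabled build.
#if 0
float train_sarprop_example(struct fann *ann, struct fann_train_data *data)
{
    vector< vector<fann_type> > predicted_outputs;    // refilled each epoch
    const unsigned int threads = 4;    // assumption: tune to the machine
    float mse = 0.0f;

    for(unsigned int epoch = 0; epoch < 1000; ++epoch)
    {
        mse = train_epoch_sarprop_parallel(ann, data, threads, predicted_outputs);
        if(mse < 0.001f)    // assumed error goal
            break;
    }
    return mse;
}
#endif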
float train_epoch_irpropm_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
    if(ann->prev_train_slopes == NULL)
    {
        fann_clear_train_arrays(ann);
    }

    //#define THREADNUM 1
    fann_reset_MSE(ann);

    vector<struct fann *> ann_vect(threadnumb);
    int i = 0, j = 0;

    // generate copies of the ann
    omp_set_dynamic(0);
    omp_set_num_threads(threadnumb);
    #pragma omp parallel private(j)
    {
        #pragma omp for schedule(static)
        for(i = 0; i < (int)threadnumb; i++)
        {
            ann_vect[i] = fann_copy(ann);
        }

        // parallel computing of the updates
        #pragma omp for schedule(static)
        for(i = 0; i < (int)data->num_data; i++)
        {
            j = omp_get_thread_num();
            fann_run(ann_vect[j], data->input[i]);
            fann_compute_MSE(ann_vect[j], data->output[i]);
            fann_backpropagate_MSE(ann_vect[j]);
            fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1);
        }
    }

    {
        fann_type *weights = ann->weights;
        fann_type *prev_steps = ann->prev_steps;
        fann_type *prev_train_slopes = ann->prev_train_slopes;

        fann_type next_step;

        const float increase_factor = ann->rprop_increase_factor;    // 1.2
        const float decrease_factor = ann->rprop_decrease_factor;    // 0.5
        const float delta_min = ann->rprop_delta_min;    // 0.0
        const float delta_max = ann->rprop_delta_max;    // 50.0
        const unsigned int first_weight = 0;
        const unsigned int past_end = ann->total_connections;

        omp_set_dynamic(0);
        omp_set_num_threads(threadnumb);
        #pragma omp parallel private(next_step)
        {
            #pragma omp for schedule(static)
            for(i = first_weight; i < (int)past_end; i++)
            {
                // prev_step may not be zero, because then the training will stop
                const fann_type prev_step = fann_max(prev_steps[i], (fann_type) 0.0001);
                fann_type temp_slopes = 0.0;

                unsigned int k;
                fann_type *train_slopes;
                for(k = 0; k < threadnumb; ++k)
                {
                    train_slopes = ann_vect[k]->train_slopes;
                    temp_slopes += train_slopes[i];
                    train_slopes[i] = 0.0;
                }

                const fann_type prev_slope = prev_train_slopes[i];
                const fann_type same_sign = prev_slope * temp_slopes;

                if(same_sign >= 0.0)
                    next_step = fann_min(prev_step * increase_factor, delta_max);
                else
                {
                    next_step = fann_max(prev_step * decrease_factor, delta_min);
                    temp_slopes = 0;
                }

                if(temp_slopes < 0)
                {
                    weights[i] -= next_step;
                    if(weights[i] < -1500)
                        weights[i] = -1500;
                }
                else
                {
                    weights[i] += next_step;
                    if(weights[i] > 1500)
                        weights[i] = 1500;
                }

                // update global data arrays
                prev_steps[i] = next_step;
                prev_train_slopes[i] = temp_slopes;
            }
        }
    }

    // merge of MSEs
    for(i = 0; i < (int)threadnumb; ++i)
    {
        ann->MSE_value += ann_vect[i]->MSE_value;
        ann->num_MSE += ann_vect[i]->num_MSE;
        fann_destroy(ann_vect[i]);
    }

    return fann_get_MSE(ann);
}
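// Usage sketch (disabled; not part of the library): the parallel iRprop-
// epoch above is driven the same way, but returns only the merged MSE.
// Thread count and stopping criterion are assumptions.
#if 0
float train_irpropm_example(struct fann *ann, struct fann_train_data *data,
                            unsigned int max_epochs, float desired_error)
{
    float mse = 0.0f;

    for(unsigned int epoch = 0; epoch < max_epochs; ++epoch)
    {
        mse = train_epoch_irpropm_parallel(ann, data, 4 /* threads */);
        if(mse <= desired_error)
            break;
    }
    return mse;
}
#endif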
/* INTERNAL FUNCTION
   The SARPROP algorithm
 */
void fann_update_weights_sarprop(struct fann *ann, unsigned int epoch, unsigned int first_weight, unsigned int past_end)
{
    fann_type *train_slopes = ann->train_slopes;
    fann_type *weights = ann->weights;
    fann_type *prev_steps = ann->prev_steps;
    fann_type *prev_train_slopes = ann->prev_train_slopes;

    fann_type prev_step, slope, prev_slope, next_step = 0, same_sign;

    /* These should be set from variables */
    float increase_factor = ann->rprop_increase_factor;    /* 1.2 */
    float decrease_factor = ann->rprop_decrease_factor;    /* 0.5 */
    /* TODO: why is delta_min 0.0 in iRprop? SARPROP uses 1x10^-6 (Braun and Riedmiller, 1993) */
    float delta_min = 0.000001f;
    float delta_max = ann->rprop_delta_max;    /* 50.0 */
    float weight_decay_shift = ann->sarprop_weight_decay_shift;    /* ld 0.01 = -6.644 */
    float step_error_threshold_factor = ann->sarprop_step_error_threshold_factor;    /* 0.1 */
    float step_error_shift = ann->sarprop_step_error_shift;    /* ld 3 = 1.585 */
    float T = ann->sarprop_temperature;
    float MSE = fann_get_MSE(ann);
    float RMSE = (float)sqrt(MSE);

    unsigned int i = first_weight;

    /* for all weights; TODO: are biases included? */
    for(; i != past_end; i++)
    {
        /* TODO: confirm whether 1x10^-6 == delta_min is really better */
        /* prev_step may not be zero, because then the training will stop */
        prev_step = fann_max(prev_steps[i], (fann_type) 0.000001);

        /* calculate SARPROP slope; TODO: better as new error function? (see SARPROP paper) */
        slope = -train_slopes[i] - weights[i] * (fann_type)fann_exp2(-T * epoch + weight_decay_shift);

        /* TODO: is prev_train_slopes[i] 0.0 in the beginning? */
        prev_slope = prev_train_slopes[i];
        same_sign = prev_slope * slope;

        if(same_sign > 0.0)
        {
            next_step = fann_min(prev_step * increase_factor, delta_max);
            /* TODO: are the signs inverted? see differences between SARPROP paper and iRprop */
            if(slope < 0.0)
                weights[i] += next_step;
            else
                weights[i] -= next_step;
        }
        else if(same_sign < 0.0)
        {
            if(prev_step < step_error_threshold_factor * MSE)
                next_step = prev_step * decrease_factor +
                    (float)rand() / RAND_MAX * RMSE * (fann_type)fann_exp2(-T * epoch + step_error_shift);
            else
                next_step = fann_max(prev_step * decrease_factor, delta_min);

            slope = 0.0;
        }
        else
        {
            if(slope < 0.0)
                weights[i] += prev_step;
            else
                weights[i] -= prev_step;
        }

        /*
        if(i == 2)
        {
            printf("weight=%f, slope=%f, next_step=%f, prev_step=%f\n",
                   weights[i], slope, next_step, prev_step);
        }
        */

        /* update global data arrays */
        prev_steps[i] = next_step;
        prev_train_slopes[i] = slope;
        train_slopes[i] = 0.0;
    }
}
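/* Usage sketch (disabled; not part of the library): FANN 2.2+ exposes this
 * algorithm as FANN_TRAIN_SARPROP, and the shared Rprop parameters read above
 * have public setters. The values shown are the documented defaults. */
#if 0
#include "fann.h"

void configure_sarprop(struct fann *ann)
{
    fann_set_training_algorithm(ann, FANN_TRAIN_SARPROP);
    fann_set_rprop_increase_factor(ann, 1.2f);
    fann_set_rprop_decrease_factor(ann, 0.5f);
    fann_set_rprop_delta_max(ann, 50.0f);
}
#endif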
/* INTERNAL FUNCTION
   The iRprop- algorithm
 */
void fann_sparse_neuron_irpropm_update(struct fann *ann, struct fann_neuron *neuron)
{
    struct fann_neuron_private_data_connected_any_any *priv =
        (struct fann_neuron_private_data_connected_any_any *) neuron->private_data;
    fann_type *weights = neuron->weights;
    fann_type *weights_deltas = neuron->weights_deltas;
    fann_type *prev_weights_deltas = priv->prev_weights_deltas;
    fann_type *prev_steps = priv->prev_steps;
    fann_type *mask = ((struct fann_sparse_neuron_private_data*) neuron->private_data)->mask;

    const unsigned int num_outputs = neuron->num_outputs;
    const unsigned int num_inputs = neuron->num_inputs;

    float increase_factor = ann->rprop_params->rprop_increase_factor;    /* 1.2 */
    float decrease_factor = ann->rprop_params->rprop_decrease_factor;    /* 0.5 */
    float delta_min = ann->rprop_params->rprop_delta_min;    /* 0.0 */
    float delta_max = ann->rprop_params->rprop_delta_max;    /* 50.0 */

    unsigned int o, i;
    fann_type prev_step, delta, prev_delta, next_step, same_sign;

    if(neuron->num_backprop_done == 0)
    {
        fann_error(NULL, FANN_E_CANT_USE_TRAIN_ALG);
        return;
    }

    for(o = 0; o < num_outputs; o++)
    {
        for(i = 0; i < num_inputs; i++)
        {
            /* don't update masked connections */
            if(!mask[i])
                continue;

            /* prev_step may not be zero, because then the training will stop;
             * does 0.0001 make sense? */
            prev_step = fann_max(prev_steps[i], (fann_type) 0.0001);
            delta = weights_deltas[i];
            prev_delta = prev_weights_deltas[i];
            same_sign = prev_delta * delta;

            if(same_sign >= 0.0)
                next_step = fann_min(prev_step * increase_factor, delta_max);
            else
            {
                next_step = fann_max(prev_step * decrease_factor, delta_min);
                delta = 0;
            }

            if(delta < 0)
            {
                weights[i] -= next_step;
                if(weights[i] < -1500)
                    weights[i] = -1500;
            }
            else
            {
                weights[i] += next_step;
                if(weights[i] > 1500)
                    weights[i] = 1500;
            }

            /* update data arrays */
            prev_steps[i] = next_step;
            prev_weights_deltas[i] = delta;
            weights_deltas[i] = 0.0;
        }
        weights += num_inputs;
        weights_deltas += num_inputs;
        prev_weights_deltas += num_inputs;
        prev_steps += num_inputs;
        mask += num_inputs;
    }
    neuron->num_backprop_done = 0;
}
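/* Illustrative sketch (disabled; not part of the library): the per-weight
 * iRprop- rule used above, stripped of the mask and array bookkeeping.
 * Constants mirror the library defaults; all names here are hypothetical. */
#if 0
#define IRPROP_INCREASE 1.2
#define IRPROP_DECREASE 0.5
#define IRPROP_DELTA_MIN 0.0
#define IRPROP_DELTA_MAX 50.0

/* One iRprop- update for a single weight: grow the step while the slope
 * keeps its sign, shrink it and discard the slope after a sign change,
 * then move the weight by the step, with the direction taken from the
 * sign of the slope exactly as in the code above. */
static void irpropm_single_weight(double *weight, double *step,
                                  double *prev_slope, double slope)
{
    if(slope * (*prev_slope) >= 0.0)
        *step = (*step * IRPROP_INCREASE < IRPROP_DELTA_MAX)
            ? *step * IRPROP_INCREASE : IRPROP_DELTA_MAX;
    else
    {
        *step = (*step * IRPROP_DECREASE > IRPROP_DELTA_MIN)
            ? *step * IRPROP_DECREASE : IRPROP_DELTA_MIN;
        slope = 0.0;    /* iRprop-: forget the slope after a sign change */
    }

    if(slope < 0.0)
        *weight -= *step;
    else
        *weight += *step;

    *prev_slope = slope;
}
#endif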