main () { thds = omp_get_max_threads (); if (thds == 1) { printf ("should be run this program on multi threads.\n"); exit (0); } omp_set_dynamic (0); prvt.i = MAGICNO; prvt.d = MAGICNO+1; #pragma omp parallel firstprivate (prvt) { int id = omp_get_thread_num (); if (prvt.i != MAGICNO) { #pragma omp critical errors += 1; } if (prvt.d != MAGICNO+1) { #pragma omp critical errors += 1; } prvt.i = id; prvt.d = id-1; #pragma omp barrier if (prvt.i != id) { #pragma omp critical errors += 1; } if (prvt.d != id-1) { #pragma omp critical errors += 1; } if (sizeof(prvt) != sizeof(struct x)) { #pragma omp critical errors += 1; } } prvt.i = MAGICNO*2; prvt.d = MAGICNO*2+1; #pragma omp parallel firstprivate (prvt) func1 (MAGICNO*2, &prvt); prvt.i = MAGICNO*3; prvt.d = MAGICNO*3+1; #pragma omp parallel firstprivate (prvt) func2 (MAGICNO*3); if (errors == 0) { printf ("firstprivate 013 : SUCCESS\n"); return 0; } else { printf ("firstprivate 013 : FAILED\n"); return 1; } }
/*
 * parallel for 008
 *
 * Runs `parallel for` loops with static, dynamic and guided schedules in which
 * every thread other than thread 0 is delayed, then inspects the global `buf`
 * after each loop for entries that were never finalized (-1 for the first two
 * loops, -2 for the guided loop).
 * NOTE(review): clear() presumably resets `buf` to -1; confirm against its
 * definition.
 *
 * Returns 0 on success and 1 on failure; exits with 0 when only one thread is
 * available and with 1 when `buf` cannot be allocated.
 */
int
main (void)
{
  int lp, finish;
  int errors = 0;

  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  buf = (int *) malloc (sizeof (int) * thds);
  if (buf == NULL) {
    printf ("can not allocate memory.\n");
    exit (1);
  }
  omp_set_dynamic (0);

  /* --- static schedule --- */
  finish = 0;
  clear ();
  #pragma omp parallel for schedule (static,1)
  for (lp=0; lp<thds; lp++) {
    int id = omp_get_thread_num ();

    /* make barrier, and thread 1-x delay any sec from thread 0 */
    if (id == 0) {
      finish = 1;
    } else {
      while (finish == 0) {
        #pragma omp flush
      }
      waittime (1);
    }
    buf[id] = omp_get_thread_num ();
    #pragma omp flush
  }
  for (lp=0; lp<thds; lp++) {
    if (buf[lp] == -1) {
      errors += 1;
    }
  }

  /* --- dynamic schedule --- */
  finish = 0;
  clear ();
  #pragma omp parallel for schedule (dynamic,1)
  for (lp=0; lp<thds; lp++) {
    int id = omp_get_thread_num ();

    barrier (thds);
    /* make barrier, and thread 1-x delay any sec from thread 0 */
    if (omp_get_thread_num () == 0) {
      finish = 1;
    } else {
      while (finish == 0) {
        #pragma omp flush
      }
      waittime (1);
    }
    buf[id] = omp_get_thread_num ();
    #pragma omp flush
  }
  for (lp=0; lp<thds; lp++) {
    if (buf[lp] == -1) {
      errors += 1;
    }
  }

  /* --- guided schedule --- */
  clear ();
  #pragma omp parallel for schedule (guided,1)
  for (lp=0; lp<thds*4; lp++) {
    int id = omp_get_thread_num ();

    /* make barrier, and thread 1-x delay any sec from thread 0 */
    buf[id] = -2;
    #pragma omp flush
    if (id != 0) {
      waittime (1);
    }
    buf[id] = id;
    #pragma omp flush
  }
  for (lp=0; lp<thds; lp++) {
    if (buf[lp] == -2) {
      errors += 1;
    }
  }

  if (errors == 0) {
    printf ("parallel for 008 : SUCCESS\n");
    return 0;
  } else {
    printf ("parallel for 008 : FAILED\n");
    return 1;
  }
}
/**
 * Builds the symmetric, weighted L1 distance matrix between the rows of C.
 *
 * Steps:
 *   1. copy C and rescale every column to [0, 1] (a constant column is only
 *      shifted, its range being treated as 1);
 *   2. fill d(i, j) = sum_k |row_i[k] - row_j[k]| * W[k] for every row pair;
 *   3. store the smallest and largest matrix entries in m_dmin / m_dmax and
 *      replace m_eps by (m_dmax - m_dmin) * m_eps + m_dmin.
 *
 * Side effects: sets the OpenMP thread count to m_num_threads; updates
 * m_dmin, m_dmax and m_eps (m_eps is rescaled on every call).
 *
 * @param C  data matrix, one sample per row and one feature per column
 * @param W  per-feature weights, indexed by column
 * @return   rows x rows distance matrix; an empty matrix when C has no rows,
 *           in which case m_dmin, m_dmax and m_eps are left untouched
 */
const DBSCAN::DistanceMatrix DBSCAN::calc_dist_matrix( const DBSCAN::ClusterData& C, const DBSCAN::FeaturesWeights& W )
{
    // Nothing to measure; also keeps the min/max searches below off empty ranges.
    if ( C.size1() == 0 )
    {
        return DBSCAN::DistanceMatrix( 0, 0 );
    }

    DBSCAN::ClusterData cl_d = C;

    omp_set_dynamic( 0 );
    omp_set_num_threads( m_num_threads );
#pragma omp parallel for
    for ( size_t i = 0; i < cl_d.size2(); ++i )
    {
        ublas::matrix_column< DBSCAN::ClusterData > col( cl_d, i );

        const auto r = minmax_element( col.begin(), col.end() );

        double data_min = *r.first;
        double data_range = *r.second - *r.first;

        // constant column: avoid dividing by zero
        if ( data_range == 0.0 )
        {
            data_range = 1.0;
        }

        const double scale = 1 / data_range;
        const double min = -1.0 * data_min * scale;

        col *= scale;
        col.plus_assign( ublas::scalar_vector< typename ublas::matrix_column< DBSCAN::ClusterData >::value_type >( col.size(), min ) );
    }

    // rows x rows
    DBSCAN::DistanceMatrix d_m( cl_d.size1(), cl_d.size1() );
    ublas::vector< double > d_max( cl_d.size1() );
    ublas::vector< double > d_min( cl_d.size1() );

    omp_set_dynamic( 0 );
    omp_set_num_threads( m_num_threads );

    // Pass 1: iteration i writes (i, j) and (j, i) for j >= i, so every
    // element is written by exactly one iteration.
#pragma omp parallel for
    for ( size_t i = 0; i < cl_d.size1(); ++i )
    {
        for ( size_t j = i; j < cl_d.size1(); ++j )
        {
            d_m( i, j ) = 0.0;

            if ( i != j )
            {
                ublas::matrix_row< DBSCAN::ClusterData > U( cl_d, i );
                ublas::matrix_row< DBSCAN::ClusterData > V( cl_d, j );

                int k = 0;
                for ( const auto e : ( U - V ) )
                {
                    d_m( i, j ) += fabs( e ) * W[k++];
                }

                d_m( j, i ) = d_m( i, j );
            }
        }
    }

    // Pass 2: row extrema. This must run after pass 1 has finished, because
    // the lower part of row i is produced by the iterations for rows < i.
#pragma omp parallel for
    for ( size_t i = 0; i < cl_d.size1(); ++i )
    {
        const auto cur_row = ublas::matrix_row< DBSCAN::DistanceMatrix >( d_m, i );
        const auto mm = minmax_element( cur_row.begin(), cur_row.end() );

        d_max( i ) = *mm.second;
        d_min( i ) = *mm.first;
    }

    m_dmin = *( min_element( d_min.begin(), d_min.end() ) );
    m_dmax = *( max_element( d_max.begin(), d_max.end() ) );

    m_eps = ( m_dmax - m_dmin ) * m_eps + m_dmin;

    return d_m;
}
/*
 * Trains one epoch with batch gradient descent, using `threadnumb` OpenMP
 * threads.
 *
 * Each thread works on its own copy of `ann` (made with fann_copy) and
 * accumulates slopes for its share of the training patterns. The slopes of all
 * copies are then summed per connection and applied to ann->weights with a
 * step of learning_rate / num_data. Finally the MSE counters of the copies are
 * merged into `ann` and the copies are destroyed.
 *
 * Parameters:
 *   ann        network to train; its weights and MSE counters are updated
 *   data       training patterns (input[i] / output[i], num_data entries)
 *   threadnumb number of threads and of network copies
 *
 * Returns fann_get_MSE(ann) after the epoch. When threadnumb is 0, the data
 * set is empty, or the copy table cannot be allocated, no training happens and
 * the MSE of the freshly reset network is returned.
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_batch_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
	struct fann **ann_vect;
	int i = 0, j = 0;

	fann_reset_MSE(ann);

	/* No threads or no patterns: an empty data set would make the step size
	 * below a division by zero and poison every weight. */
	if(threadnumb == 0 || data->num_data == 0)
	{
		return fann_get_MSE(ann);
	}

	ann_vect = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
	if(ann_vect == NULL)
	{
		return fann_get_MSE(ann);
	}

	/* generate copies of the ann */
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel private(j)
	{
		#pragma omp for schedule(static)
		for(i = 0; i < (int)threadnumb; i++)
		{
			ann_vect[i] = fann_copy(ann);
		}

		/* parallel computing of the updates; thread j only touches copy j */
		#pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			j = omp_get_thread_num();
			if(ann->do_dropout)
			{
				fann_run_dropout(ann_vect[j], data->input[i]);
			}
			else
			{
				fann_run(ann_vect[j], data->input[i]);
			}
			fann_compute_MSE(ann_vect[j], data->output[i]);
			fann_backpropagate_MSE(ann_vect[j]);
			fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1);
		}
	}

	/* parallel update of the weights */
	{
		const unsigned int num_data = data->num_data;
		const unsigned int first_weight = 0;
		const unsigned int past_end = ann->total_connections;
		fann_type *weights = ann->weights;
		const fann_type epsilon = ann->learning_rate / num_data;

		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel
		{
			#pragma omp for schedule(static)
			for(i = first_weight; i < (int)past_end; i++)
			{
				fann_type temp_slopes = 0.0;
				unsigned int k;
				fann_type *train_slopes;

				/* sum the slope of connection i over all copies, clearing it */
				for(k = 0; k < threadnumb; ++k)
				{
					train_slopes = ann_vect[k]->train_slopes;
					temp_slopes += train_slopes[i];
					train_slopes[i] = 0.0;
				}
				weights[i] += temp_slopes * epsilon;
			}
		}
	}

	/* merge of MSEs */
	for(i = 0; i < (int)threadnumb; ++i)
	{
		ann->MSE_value += ann_vect[i]->MSE_value;
		ann->num_MSE += ann_vect[i]->num_MSE;
		fann_destroy(ann_vect[i]);
	}
	free(ann_vect);
	return fann_get_MSE(ann);
}
main () { int lp; int false = 0; double dfalse = 0.0; thds = omp_get_max_threads (); if (thds == 1) { printf ("should be run this program on multi threads.\n"); exit (0); } buf = (int *) malloc (sizeof (int) * (thds + 1)); if (buf == NULL) { printf ("can not allocate memory.\n"); exit (1); } omp_set_dynamic (0); clear (); #pragma omp parallel for schedule(static,1) if (0) for (lp=0; lp<thds; lp++) { buf[lp] = omp_get_thread_num (); check_parallel (0); } errors += check_result (); clear (); #pragma omp parallel for schedule(static,1) if (dfalse) for (lp=0; lp<thds; lp++) { buf[lp] = omp_get_thread_num (); check_parallel (0); } errors += check_result (); clear (); #pragma omp parallel for schedule(static,1) if (false == 1) for (lp=0; lp<thds; lp++) { buf[lp] = omp_get_thread_num (); check_parallel (0); } errors += check_result (); clear (); #pragma omp parallel for schedule(static,1) if (sameas(false)) for (lp=0; lp<thds; lp++) { buf[lp] = omp_get_thread_num (); check_parallel (0); } errors += check_result (); if (errors == 0) { printf ("parallel for 012 : SUCCESS\n"); return 0; } else { printf ("parallel for 012 : FAILED\n"); return 1; } }
int main(int argc, char *argv[]) { struct pngquant_options options = { .floyd = 1.f, // floyd-steinberg dithering }; options.liq = liq_attr_create(); if (!options.liq) { fputs("SSE-capable CPU is required for this build.\n", stderr); return WRONG_ARCHITECTURE; } unsigned int error_count=0, skipped_count=0, file_count=0; pngquant_error latest_error=SUCCESS; const char *newext = NULL, *output_file_path = NULL; fix_obsolete_options(argc, argv); int opt; do { opt = getopt_long(argc, argv, "Vvqfhs:Q:o:", long_options, NULL); switch (opt) { case 'v': options.verbose = true; break; case 'q': options.verbose = false; break; case arg_floyd: options.floyd = optarg ? atof(optarg) : 1.0; if (options.floyd < 0 || options.floyd > 1.f) { fputs("--floyd argument must be in 0..1 range\n", stderr); return INVALID_ARGUMENT; } break; case arg_ordered: options.floyd = 0; break; case 'f': options.force = true; break; case arg_no_force: options.force = false; break; case arg_ext: newext = optarg; break; case 'o': if (output_file_path) { fputs("--output option can be used only once\n", stderr); return INVALID_ARGUMENT; } output_file_path = optarg; break; case arg_iebug: // opacities above 238 will be rounded up to 255, because IE6 truncates <255 to 0. 
liq_set_min_opacity(options.liq, 238); options.ie_mode = true; break; case arg_transbug: liq_set_last_index_transparent(options.liq, true); break; case arg_skip_larger: options.skip_if_larger = true; break; case 's': { int speed = atoi(optarg); if (speed >= 10) { options.fast_compression = true; } if (speed == 11) { options.floyd = 0; speed = 10; } if (LIQ_OK != liq_set_speed(options.liq, speed)) { fputs("Speed should be between 1 (slow) and 11 (fast).\n", stderr); return INVALID_ARGUMENT; } } break; case 'Q': if (!parse_quality(optarg, options.liq, &options.min_quality_limit)) { fputs("Quality should be in format min-max where min and max are numbers in range 0-100.\n", stderr); return INVALID_ARGUMENT; } break; case arg_posterize: if (LIQ_OK != liq_set_min_posterization(options.liq, atoi(optarg))) { fputs("Posterization should be number of bits in range 0-4.\n", stderr); return INVALID_ARGUMENT; } break; case arg_map: { png24_image tmp = {}; if (SUCCESS != read_image(options.liq, optarg, false, &tmp, &options.fixed_palette_image, false, false)) { fprintf(stderr, " error: Unable to load %s", optarg); return INVALID_ARGUMENT; } } break; case 'h': print_full_version(stdout); print_usage(stdout); return SUCCESS; case 'V': puts(PNGQUANT_VERSION); return SUCCESS; case -1: break; default: return INVALID_ARGUMENT; } } while (opt != -1); int argn = optind; if (argn >= argc) { if (argn > 1) { fputs("No input files specified. 
See -h for help.\n", stderr); } else { print_full_version(stderr); print_usage(stderr); } return MISSING_ARGUMENT; } if (options.verbose) { liq_set_log_callback(options.liq, log_callback, NULL); options.log_callback = log_callback; } char *colors_end; unsigned long colors = strtoul(argv[argn], &colors_end, 10); if (colors_end != argv[argn] && '\0' == colors_end[0]) { if (LIQ_OK != liq_set_max_colors(options.liq, colors)) { fputs("Number of colors must be between 2 and 256.\n", stderr); return INVALID_ARGUMENT; } argn++; } if (newext && output_file_path) { fputs("--ext and --output options can't be used at the same time\n", stderr); return INVALID_ARGUMENT; } // new filename extension depends on options used. Typically basename-fs8.png if (newext == NULL) { newext = options.floyd > 0 ? "-ie-fs8.png" : "-ie-or8.png"; if (!options.ie_mode) { newext += 3; /* skip "-ie" */ } } if (argn == argc || (argn == argc-1 && 0==strcmp(argv[argn],"-"))) { options.using_stdin = true; argn = argc-1; } if (options.using_stdin && output_file_path) { fputs("--output can't be mixed with stdin\n", stderr); return INVALID_ARGUMENT; } const int num_files = argc-argn; if (output_file_path && num_files != 1) { fputs("Only one input file is allowed when --output is used\n", stderr); return INVALID_ARGUMENT; } #ifdef _OPENMP // if there's a lot of files, coarse parallelism can be used if (num_files > 2*omp_get_max_threads()) { omp_set_nested(0); omp_set_dynamic(1); } else { omp_set_nested(1); } #endif #pragma omp parallel for \ schedule(static, 1) reduction(+:skipped_count) reduction(+:error_count) reduction(+:file_count) shared(latest_error) for(int i=0; i < num_files; i++) { struct pngquant_options opts = options; opts.liq = liq_attr_copy(options.liq); const char *filename = opts.using_stdin ? 
"stdin" : argv[argn+i]; #ifdef _OPENMP struct buffered_log buf = {}; if (opts.log_callback && omp_get_num_threads() > 1 && num_files > 1) { liq_set_log_callback(opts.liq, log_callback_buferred, &buf); liq_set_log_flush_callback(opts.liq, log_callback_buferred_flush, &buf); options.log_callback = log_callback_buferred; options.log_callback_user_info = &buf; } #endif pngquant_error retval = SUCCESS; const char *outname = output_file_path; char *outname_free = NULL; if (!options.using_stdin) { if (!outname) { outname = outname_free = add_filename_extension(filename, newext); } if (!options.force && file_exists(outname)) { fprintf(stderr, " error: %s exists; not overwriting\n", outname); retval = NOT_OVERWRITING_ERROR; } } if (!retval) { retval = pngquant_file(filename, outname, &opts); } free(outname_free); liq_attr_destroy(opts.liq); if (retval) { #pragma omp critical { latest_error = retval; } if (retval == TOO_LOW_QUALITY || retval == TOO_LARGE_FILE) { skipped_count++; } else { error_count++; } } ++file_count; } if (error_count) { verbose_printf(&options, "There were errors quantizing %d file%s out of a total of %d file%s.", error_count, (error_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s"); } if (skipped_count) { verbose_printf(&options, "Skipped %d file%s out of a total of %d file%s.", skipped_count, (skipped_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s"); } if (!skipped_count && !error_count) { verbose_printf(&options, "No errors detected while quantizing %d image%s.", file_count, (file_count == 1)? 
"" : "s"); } liq_image_destroy(options.fixed_palette_image); liq_attr_destroy(options.liq); return latest_error; } #endif pngquant_error pngquant_file(const char *filename, const char *outname, struct pngquant_options *options) { pngquant_error retval = SUCCESS; verbose_printf(options, "%s:", filename); liq_image *input_image = NULL; png24_image input_image_rwpng = {}; bool keep_input_pixels = options->skip_if_larger || (options->using_stdin && options->min_quality_limit); // original may need to be output to stdout if (!retval) { retval = read_image(options->liq, filename, options->using_stdin, &input_image_rwpng, &input_image, keep_input_pixels, options->verbose); } int quality_percent = 90; // quality on 0-100 scale, updated upon successful remap png8_image output_image = {}; if (!retval) { verbose_printf(options, " read %luKB file", (input_image_rwpng.file_size+1023UL)/1024UL); #if USE_LCMS if (input_image_rwpng.lcms_status == ICCP) { verbose_printf(options, " used embedded ICC profile to transform image to sRGB colorspace"); } else if (input_image_rwpng.lcms_status == GAMA_CHRM) { verbose_printf(options, " used gAMA and cHRM chunks to transform image to sRGB colorspace"); } else if (input_image_rwpng.lcms_status == ICCP_WARN_GRAY) { verbose_printf(options, " warning: ignored ICC profile in GRAY colorspace"); } #endif if (input_image_rwpng.gamma != 0.45455) { verbose_printf(options, " corrected image from gamma %2.1f to sRGB gamma", 1.0/input_image_rwpng.gamma); } // when using image as source of a fixed palette the palette is extracted using regular quantization liq_result *remap = liq_quantize_image(options->liq, options->fixed_palette_image ? options->fixed_palette_image : input_image); if (remap) { liq_set_output_gamma(remap, 0.45455); // fixed gamma ~2.2 for the web. 
PNG can't store exact 1/2.2 liq_set_dithering_level(remap, options->floyd); retval = prepare_output_image(remap, input_image, &output_image); if (!retval) { if (LIQ_OK != liq_write_remapped_image_rows(remap, input_image, output_image.row_pointers)) { retval = OUT_OF_MEMORY_ERROR; } set_palette(remap, &output_image); double palette_error = liq_get_quantization_error(remap); if (palette_error >= 0) { quality_percent = liq_get_quantization_quality(remap); verbose_printf(options, " mapped image to new colors...MSE=%.3f (Q=%d)", palette_error, quality_percent); } } liq_result_destroy(remap); } else { retval = TOO_LOW_QUALITY; } } if (!retval) { if (options->skip_if_larger) { // this is very rough approximation, but generally avoid losing more quality than is gained in file size. // Quality is squared, because even greater savings are needed to justify big quality loss. double quality = quality_percent/100.0; output_image.maximum_file_size = (input_image_rwpng.file_size-1) * quality*quality; } output_image.fast_compression = options->fast_compression; output_image.chunks = input_image_rwpng.chunks; input_image_rwpng.chunks = NULL; retval = write_image(&output_image, NULL, outname, options); if (TOO_LARGE_FILE == retval) { verbose_printf(options, " file exceeded expected size of %luKB", (unsigned long)output_image.maximum_file_size/1024UL); } } if (options->using_stdin && keep_input_pixels && (TOO_LARGE_FILE == retval || TOO_LOW_QUALITY == retval)) { // when outputting to stdout it'd be nasty to create 0-byte file // so if quality is too low, output 24-bit original pngquant_error write_retval = write_image(NULL, &input_image_rwpng, outname, options); if (write_retval) { retval = write_retval; } } liq_image_destroy(input_image); rwpng_free_image24(&input_image_rwpng); rwpng_free_image8(&output_image); return retval; }
/*
 * Trains one epoch with the iRPROP- update rule, using `threadnumb` OpenMP
 * threads.
 *
 * Each thread works on its own copy of `ann` (made with fann_copy) and
 * accumulates slopes for its share of the training patterns. The slopes of all
 * copies are then summed per connection and used, with the slope of the
 * previous epoch, to adapt the per-connection step size and move the weight
 * (clamped to [-1500, 1500]). Finally the MSE counters of the copies are
 * merged into `ann` and the copies are destroyed.
 *
 * Parameters:
 *   ann        network to train; weights, prev_steps, prev_train_slopes and
 *              the MSE counters are updated
 *   data       training patterns (input[i] / output[i], num_data entries)
 *   threadnumb number of threads and of network copies
 *
 * Returns fann_get_MSE(ann) after the epoch. When threadnumb is 0 or the copy
 * table cannot be allocated, no training happens and the MSE of the freshly
 * reset network is returned.
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_irpropm_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
	struct fann **ann_vect;
	int i = 0, j = 0;

	if(ann->prev_train_slopes == NULL)
	{
		fann_clear_train_arrays(ann);
	}

	fann_reset_MSE(ann);

	if(threadnumb == 0)
	{
		return fann_get_MSE(ann);
	}

	ann_vect = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
	if(ann_vect == NULL)
	{
		return fann_get_MSE(ann);
	}

	/* generate copies of the ann */
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel private(j)
	{
		#pragma omp for schedule(static)
		for(i = 0; i < (int)threadnumb; i++)
		{
			ann_vect[i] = fann_copy(ann);
		}

		/* parallel computing of the updates; thread j only touches copy j */
		#pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			j = omp_get_thread_num();
			if(ann->do_dropout)
			{
				fann_run_dropout(ann_vect[j], data->input[i]);
			}
			else
			{
				fann_run(ann_vect[j], data->input[i]);
			}
			fann_compute_MSE(ann_vect[j], data->output[i]);
			fann_backpropagate_MSE(ann_vect[j]);
			fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1);
		}
	}

	/* parallel iRPROP- update of the weights */
	{
		fann_type *weights = ann->weights;
		fann_type *prev_steps = ann->prev_steps;
		fann_type *prev_train_slopes = ann->prev_train_slopes;

		fann_type next_step;

		const float increase_factor = ann->rprop_increase_factor;	/* 1.2 */
		const float decrease_factor = ann->rprop_decrease_factor;	/* 0.5 */
		const float delta_min = ann->rprop_delta_min;	/* 0.0 */
		const float delta_max = ann->rprop_delta_max;	/* 50.0 */
		const unsigned int first_weight = 0;
		const unsigned int past_end = ann->total_connections;

		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel private(next_step)
		{
			#pragma omp for schedule(static)
			for(i = first_weight; i < (int)past_end; i++)
			{
				fann_type prev_slope, same_sign;
				/* prev_step may not be zero because then the training will stop */
				const fann_type prev_step = fann_max(prev_steps[i], (fann_type) 0.0001);

				fann_type temp_slopes = 0.0;
				unsigned int k;
				fann_type *train_slopes;

				/* sum the slope of connection i over all copies, clearing it */
				for(k = 0; k < threadnumb; ++k)
				{
					train_slopes = ann_vect[k]->train_slopes;
					temp_slopes += train_slopes[i];
					train_slopes[i] = 0.0;
				}

				prev_slope = prev_train_slopes[i];

				same_sign = prev_slope * temp_slopes;

				if(same_sign >= 0.0)
				{
					next_step = fann_min(prev_step * increase_factor, delta_max);
				}
				else
				{
					/* sign change: shrink the step and forget this slope */
					next_step = fann_max(prev_step * decrease_factor, delta_min);
					temp_slopes = 0;
				}

				if(temp_slopes < 0)
				{
					weights[i] -= next_step;
					if(weights[i] < -1500)
						weights[i] = -1500;
				}
				else
				{
					weights[i] += next_step;
					if(weights[i] > 1500)
						weights[i] = 1500;
				}

				/* update global data arrays */
				prev_steps[i] = next_step;
				prev_train_slopes[i] = temp_slopes;
			}
		}
	}

	/* merge of MSEs */
	for(i = 0; i < (int)threadnumb; ++i)
	{
		ann->MSE_value += ann_vect[i]->MSE_value;
		ann->num_MSE += ann_vect[i]->num_MSE;
		fann_destroy(ann_vect[i]);
	}
	free(ann_vect);
	return fann_get_MSE(ann);
}
main () { int i; thds = omp_get_max_threads (); if (thds == 1) { printf ("should be run this program on multi threads.\n"); exit (0); } if (4 < thds) { thds = 4; omp_set_num_threads (4); } omp_set_dynamic (0); clear (); #pragma omp parallel { #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic atom_incr ++; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic ++ atom_incr2; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic atom_decr --; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic -- atom_decr2; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic atom_plus += sameas(2) - 1; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic atom_minus -= sameas(2) - 1; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic atom_mul *= sameas(3) - 1; } #pragma omp for for (i=0; i<LOOPNUM; i++) { #pragma omp atomic atom_div /= 4 + sameas(-2); } } errors += check (); if (errors == 0) { printf ("atomic 009 : SUCCESS\n"); return 0; } else { printf ("atomic 009 : FAILED\n"); return 1; } }
/*
 * Dedisperses one chunk of data, folds it into the ring buffer and copies the
 * current chunk of every output channel from the ring buffer into `outdata`.
 *
 * Steps:
 *   1. zero-pad each of the `nfreq` input rows from `chunk_size` to
 *      `chunk_size + ndm` samples, ndm = get_nchan_from_depth(depth);
 *   2. wrap the padded data in a burst structure, remap it and run
 *      dedisperse_lagged() into a scratch matrix;
 *   3. merge the scratch matrix into the ring buffer with
 *      update_ring_buffer();
 *   4. for channel i, copy `chunk_size` samples starting at ring position
 *      (ringt0 - i) mod ring_length into row i of `outdata` (row stride
 *      chunk_size + nchan), following the wrap at the end of the ring.
 *
 * Notes:
 *   - the copies use sizeof(float), so DTYPE is assumed to be float
 *     (NOTE(review): confirm);
 *   - `ringt0` is passed by value; any update made by update_ring_buffer() is
 *     not visible to the caller;
 *   - the copy does not stop if data is incomplete and does not prevent
 *     overlap; the ring buffer must be long enough;
 *   - the thread count is fixed to 8;
 *   - returns without touching `outdata` when the padding buffer cannot be
 *     allocated;
 *   - NOTE(review): the row-pointer table `ring_buffer` is never freed. The
 *     original had the free commented out; left unchanged pending confirmation
 *     that make_rect_mat() only stores row pointers in it.
 */
void add_to_ring(DTYPE* indata, DTYPE* outdata, CM_DTYPE* chan_map, DTYPE* ring_buffer_data, int ringt0, int chunk_size, int ring_length, float delta_t, size_t nfreq, float freq0, float delta_f, int depth)
{
	omp_set_dynamic(0);
	omp_set_num_threads(8);

	//zero-pad data
	int ndm = get_nchan_from_depth(depth);
	float *indata_pad = (float*)malloc(sizeof(float)*nfreq*(chunk_size + ndm));
	if (indata_pad == NULL) {
		return;
	}

	for (size_t i = 0; i < nfreq; i++) {
		memcpy(indata_pad + i*(chunk_size + ndm), indata + i*chunk_size, sizeof(float)*chunk_size);
		memset(indata_pad + i*(chunk_size + ndm) + chunk_size, 0, sizeof(float)*(ndm));
	}

	Data *dat = put_data_into_burst_struct(indata_pad, chunk_size + ndm, nfreq, chan_map, depth);
	remap_data(dat);

	int nchan = dat->nchan;

	float** ring_buffer = (float**)malloc(sizeof(float*)*nchan);
	make_rect_mat(ring_buffer, ring_buffer_data, nchan, ring_length);

	//scratch matrix for the dedispersed output
	float** tmp_mat = matrix(nchan, chunk_size + nchan);

	dedisperse_lagged(dat->data, tmp_mat, nchan, chunk_size);
	update_ring_buffer(tmp_mat, ring_buffer, nchan, chunk_size, ring_length, &ringt0);

	//probably not the most efficient way to use the output array
	//does not stop copying if data is incomplete
	//does not prevent overlap
	//ring buffer must be long enough
	//because of the search padding requirement...
	for (int i = 0; i < nchan; i++) {
		int src0 = (ring_length + ringt0 - i) % ring_length;
		int src1 = (ring_length + ringt0 + chunk_size - i) % ring_length;

		if (src1 < src0) {
			//the chunk wraps: tail of the ring first, then continue from its start
			int first_cpy = (ring_length - src0);
			int second_cpy = chunk_size - first_cpy;
			memcpy(outdata + i*(chunk_size + nchan), ring_buffer[i] + src0, first_cpy*sizeof(float));
			memcpy(outdata + i*(chunk_size + nchan) + first_cpy, ring_buffer[i], second_cpy*sizeof(float));
		}
		else {
			memcpy(outdata + i*(chunk_size + nchan), ring_buffer[i] + src0, (chunk_size)*sizeof(float));
		}
	}

	free(dat->data[0]);
	free(dat->data);
	free(dat->raw_data[0]);
	free(dat->raw_data);
	free(dat);
	free(indata_pad);
	free(tmp_mat[0]);
	free(tmp_mat);
	//free(ring_buffer);
}
/**************************************************************** * * Function: main * Input: int argc number of command line arguements * char **arg pointer to those arguements * * Output: int 0 for success and 1 for error * * Description: Runs the simple merge algorithm multiple * times averaging the results and printing them to terminal. * *****************************************************************/ int main(int argc, char **argv) { struct timeval startt, endt, result; char name[8] = "omp/"; int status=0; int n; int* S; int* R; int RUNS; //Check if app was given enough input if(argc < 6){ printf("Missing Arguement Parameters\n"); printf("Format ./seq path_input input_size ans_Path RUNS MAX_THREADS\n"); return 1; } //Save args to memory and allocate memory for arrays n = atoi(argv[2])+1; RUNS = atoi(argv[4]); MAX_THREADS = atoi(argv[5]); S = malloc(n*sizeof(int)); R = malloc(n*sizeof(int)); if(n<50){ chunk = 4; /*For Small N*/ } omp_set_dynamic(0); //Makes sure the number of threads available is fixed omp_set_num_threads(MAX_THREADS); //Set thread number if(S==NULL){ printf("Failed to Allocate Memory for Input Array S"); } if(R==NULL){ printf("Failed to Allocate Memory for Input Array R"); } //Read the input array from file and save to memory status = read_input(S, n, argv[1]); if(status){ #ifdef DEBUG printf("Failed to Read Input S\n"); #endif return 1; } int *P_temp = malloc(n*sizeof(int)); int *R_temp = malloc(n*sizeof(int)); int *P = malloc(n*sizeof(int)); //Start of testing of the algorithm int j; double average; for(j=0; j<RUNS; j++){ memset(R, 0, n*sizeof(int)); /*Start Timer*/ result.tv_sec=0; result.tv_usec=0; gettimeofday (&startt, NULL); /*Start Algorithm*/ nodeLength(S, R, n, P_temp, R_temp, P); /*Stop Timer*/ gettimeofday (&endt, NULL); result.tv_usec = (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); average += result.tv_usec; } average = average/RUNS; //Average the execution times //print results to terminal 
printf("%d %f us \n",n-1,average); if(atoi(argv[3])!=1) { status = outputCheck(R, argv[3], n); if(status){ printf("Incorrect Answer\n"); } else{ printf("Correct Answer\n"); } } /*Save the Results if the output is less than 50 elements*/ if(n<=50){ status = write_output(S, R, n, name); } if(status){ printf("Failed to Write Output \n"); return 1; } free(S); free(R); free(P_temp); free(R_temp); free(P); return 0; }
/*
 * Computes the partition function for a fold compound and returns the
 * ensemble free energy in kcal/mol.
 *
 * The fold compound is prepared for partition function computations, the
 * matrices are filled (single sequence or comparative; with the extra step for
 * circular molecules when md->circ is set) and, if md->compute_bpp is set, the
 * base pairing probabilities are computed as well. The optional status
 * callback is invoked before and after the recursions.
 *
 * vc        fold compound; may be NULL
 * structure passed on to vrna_pairing_probs() and, for circular comparative
 *           folding, to wrap_alipf_circ()
 *
 * Returns INF/100 when vc is NULL, when preparation fails or when the fold
 * compound type is not recognized.
 */
PUBLIC float
vrna_pf(vrna_fold_compound_t *vc, char *structure)
{
  int               length;
  FLT_OR_DBL        Q;
  double            energy;
  vrna_md_t         *model;
  vrna_exp_param_t  *pf_params;
  vrna_mx_pf_t      *mx;

  energy = (float)(INF/100.);

  if(!vc)
    return energy;

  /* make sure, everything is set up properly to start partition function computations */
  if(!vrna_fold_compound_prepare(vc, VRNA_OPTION_PF)){
    vrna_message_warning("vrna_pf@part_func.c: Failed to prepare vrna_fold_compound");
    return energy;
  }

  length    = vc->length;
  pf_params = vc->exp_params;
  mx        = vc->exp_matrices;
  model     = &(pf_params->model_details);

#ifdef _OPENMP
  /* Explicitly turn off dynamic threads */
  omp_set_dynamic(0);
#endif

#ifdef SUN4
  nonstandard_arithmetic();
#else
#ifdef HP9
  fpsetfastmode(1);
#endif
#endif

  /* notify the user-defined status callback that the recursions start */
  if(vc->stat_cb)
    vc->stat_cb(VRNA_STATUS_PF_PRE, vc->auxdata);

  if(vc->type == VRNA_FC_TYPE_SINGLE){
    /* linear fold first, then the post processing step for circular RNAs */
    pf_linear(vc);
    if(model->circ)
      pf_circ(vc);
  } else if(vc->type == VRNA_FC_TYPE_COMPARATIVE){
    /* same scheme for alignments */
    alipf_linear(vc);
    if(model->circ)
      wrap_alipf_circ(vc, structure);
  } else {
    vrna_message_warning("vrna_pf@part_func.c: Unrecognized fold compound type");
    return energy;
  }

  /* notify the user-defined status callback that the recursions are done */
  if(vc->stat_cb)
    vc->stat_cb(VRNA_STATUS_PF_POST, vc->auxdata);

  /* calculate base pairing probability matrix (bppm) */
  if(model->compute_bpp){
    vrna_pairing_probs(vc, structure);

#ifdef VRNA_BACKWARD_COMPAT
    /*
     * Backward compatibility: keep the global "pr" pointing at the
     * probability matrix for deprecated functions that still rely on it.
     */
    pr = mx->probs;
#endif
  }

  /* pick the partition function that matches the backtracking type */
  switch(model->backtrack_type){
    case 'C':
      Q = mx->qb[vc->iindx[1]-length];
      break;
    case 'M':
      Q = mx->qm[vc->iindx[1]-length];
      break;
    default:
      Q = (model->circ) ? mx->qo : mx->q[vc->iindx[1]-length];
      break;
  }

  /* ensemble free energy in Kcal/mol */
  if(Q<=FLT_MIN)
    vrna_message_warning("pf_scale too large");

  if(vc->type == VRNA_FC_TYPE_COMPARATIVE)
    energy = (-log(Q)-length*log(pf_params->pf_scale))*pf_params->kT/(1000.0 * vc->n_seq);
  else
    energy = (-log(Q)-length*log(pf_params->pf_scale))*pf_params->kT/1000.0;

#ifdef SUN4
  standard_arithmetic();
#else
#ifdef HP9
  fpsetfastmode(0);
#endif
#endif

  return energy;
}
void SimpleSpeechRec::frameSegmentation() { time++; if (time == 0) { timeZeroInit(); return; } SRecToken** newTokenBuffer = new SRecToken*[cbNum]; for (int i = 0; i < cbNum; i++) { newTokenBuffer[i] = NULL; } int* cbTypeLookup = new int[cbNum]; for (int i = 0; i < cbNum; i++) { cbTypeLookup[i] = dict->getCbType(i); } omp_set_dynamic(true); #pragma omp parallel for for (int i = 0; i < cbNum; i++) { STokenBin* bin = binSet[i]; SRecToken* candToken = bin->getPreviousBest(); if (!candToken) continue; int cbType = (i == cbNum-1) ? DI_TAIL_NOISE: cbTypeLookup[i]; bool isCrossWord = isCrossWordCb(cbType); SRecToken* candWord = NULL; SRecToken* newToken = factory.getInstance(); //newToken->copyFrom(candToken); if (isCrossWord) { candWord = factory.getInstance(); candWord->copyFrom(candToken); if (candToken->prev) InterlockedIncrement(&candToken->prev->refcnt); candWord->endTime = time; } else { candWord = candToken->prev; newToken->CId = candToken->CId; newToken->VId = candToken->VId; newToken->wordId = candToken->wordId; if (dict->triPhone) { if (cbType == INITIAL1) { newToken->CId = dict->getCVIdFromCbId(i); } else if (cbType == FINAL0) { newToken->VId = dict->getCVIdFromCbId(i); newToken->wordId = dict->getWordIdFromCVLink(candToken->currentCbId, i); } } else{ if (cbType == DI_INITIAL1) { newToken->CId = dict->getCVIdFromCbId(i); } else if (cbType == DI_FINAL0) { newToken->VId = dict->getCVIdFromCbId(i); newToken->wordId = dict->getWordIdFromCVLink(candToken->currentCbId, i); } } } // newToken->currentCbId = i; newToken->dur = 1; double durLh = useSegmentModel ? 
bc->getDurLh(i, 1) : 0; double stateLh = bc->getStateLh(i, time); newToken->lh = candToken->lh + durLh + stateLh; newToken->prev = candWord; InterlockedIncrement(&candWord->refcnt); newTokenBuffer[i] = newToken; } //状态驻留 for (int i = 0; i < cbNum; i++) { STokenBin* bin = binSet.at(i); int k =i; if(m_bHeadNOise&&i == cbNum-1) k= dict->noiseId; double stateLh = bc->getStateLh(k, time); for (auto j = bin->content.begin(); j != bin->content.end(); j++) { SRecToken* t = *j; t->dur += 1; int dur = t->dur; double deltaDurLh = useSegmentModel ? bc->getDurLhDelta(k, dur) : 0; t->lh += deltaDurLh + stateLh; } } //完成状态跳转 for (int i = 0; i < cbNum; i++) { STokenBin* bin = binSet.at(i); if (newTokenBuffer[i] != NULL) bin->addToken(newTokenBuffer[i]); prune(bin); } delete [] cbTypeLookup; delete [] newTokenBuffer; }
/*
 * Zero the elements a[first], a[first+step], ... below `limit`, then add each
 * element's own index to it `nr_runs` times, and print the wall-clock time the
 * update loop took together with the calling thread's OpenMP thread number.
 */
static void time_strided_update(double *a, int first, int limit, int step,
                                int nr_runs)
{
    struct timeval tv1, tv2;
    int i, run_nr;
    int thread_nr = omp_get_thread_num();

    for (i = first; i < limit; i += step)
        a[i] = 0.0;
    gettimeofday(&tv1, NULL);
    for (run_nr = 0; run_nr < nr_runs; run_nr++)
        for (i = first; i < limit; i += step)
            a[i] += i;
    gettimeofday(&tv2, NULL);
    printf("thread %d: %.6f\n", thread_nr,
           1.0e-6*(tv2.tv_usec - tv1.tv_usec) + (tv2.tv_sec - tv1.tv_sec));
}

/*
 * False-sharing demonstration with two OpenMP sections.
 *
 * Round 1: each section updates its own contiguous half of a[].
 * Round 2: the sections update interleaved (even / odd) elements of a[].
 * After each round the array sum is printed.
 *
 * Returns EXIT_SUCCESS.
 */
int main()
{
    const int nr_threads = 2;
    const int n = N;
    const int nr_runs = 20000000;
    double a[n], sum = 0.0;
    int j;

    omp_set_dynamic(0);
    omp_set_num_threads(nr_threads);

    /* n and nr_runs are const locals. With default(none), compilers that
     * follow OpenMP 4.0+ no longer treat them as predetermined shared, so
     * they must be listed; firstprivate is accepted by old and new compilers. */
#pragma omp parallel default(none) shared(a) firstprivate(n, nr_runs)
    {
#pragma omp sections
        {
#pragma omp section
            {
                time_strided_update(a, 0, n/2, 1, nr_runs);
            }
#pragma omp section
            {
                time_strided_update(a, n/2, n, 1, nr_runs);
            }
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("no false sharing: %.1lf\n", sum);

#pragma omp parallel default(none) shared(a) firstprivate(n, nr_runs)
    {
#pragma omp sections
        {
#pragma omp section
            {
                time_strided_update(a, 0, n, 2, nr_runs);
            }
#pragma omp section
            {
                time_strided_update(a, 1, n, 2, nr_runs);
            }
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("false sharing: %.1lf\n", sum);

    return EXIT_SUCCESS;
}
int main(int argc, char *argv[]) { FILE *bytemaskfile; float **dataavg = NULL, **datastd = NULL, **datapow = NULL; float *chandata = NULL, powavg, powstd, powmax; float inttime, norm, fracterror = RFI_FRACTERROR; float *rawdata = NULL; unsigned char **bytemask = NULL; short *srawdata = NULL; char *outfilenm, *statsfilenm, *maskfilenm; char *bytemaskfilenm, *rfifilenm; int numchan = 0, numint = 0, newper = 0, oldper = 0; int blocksperint, ptsperint = 0, ptsperblock = 0, padding = 0; int numcands, candnum, numrfi = 0, numrfivect = NUM_RFI_VECT; int ii, jj, kk, slen, insubs = 0; int harmsum = RFI_NUMHARMSUM, lobin = RFI_LOBIN, numbetween = RFI_NUMBETWEEN; double davg, dvar, freq; struct spectra_info s; presto_interptype interptype; rfi *rfivect = NULL; mask oldmask, newmask; fftcand *cands; infodata idata; Cmdline *cmd; /* Call usage() if we have no command line arguments */ if (argc == 1) { Program = argv[0]; printf("\n"); usage(); exit(0); } /* Parse the command line using the excellent program Clig */ cmd = parseCmdline(argc, argv); spectra_info_set_defaults(&s); s.filenames = cmd->argv; s.num_files = cmd->argc; // If we are zeroDMing, make sure that clipping is off. if (cmd->zerodmP) cmd->noclipP = 1; s.clip_sigma = cmd->clip; // -1 causes the data to determine if we use weights, scales, & // offsets for PSRFITS or flip the band for any data type where // we can figure that out with the data s.apply_flipband = (cmd->invertP) ? 1 : -1; s.apply_weight = (cmd->noweightsP) ? 0 : -1; s.apply_scale = (cmd->noscalesP) ? 0 : -1; s.apply_offset = (cmd->nooffsetsP) ? 0 : -1; s.remove_zerodm = (cmd->zerodmP) ? 1 : 0; if (cmd->noclipP) { cmd->clip = 0.0; s.clip_sigma = 0.0; } if (cmd->ifsP) { // 0 = default or summed, 1-4 are possible also s.use_poln = cmd->ifs + 1; } slen = strlen(cmd->outfile) + 20; if (cmd->ncpus > 1) { #ifdef _OPENMP int maxcpus = omp_get_num_procs(); int openmp_numthreads = (cmd->ncpus <= maxcpus) ? 
cmd->ncpus : maxcpus; // Make sure we are not dynamically setting the number of threads omp_set_dynamic(0); omp_set_num_threads(openmp_numthreads); printf("Using %d threads with OpenMP\n\n", openmp_numthreads); #endif } else { #ifdef _OPENMP omp_set_num_threads(1); // Explicitly turn off OpenMP #endif } #ifdef DEBUG showOptionValues(); #endif printf("\n\n"); printf(" Pulsar Data RFI Finder\n"); printf(" by Scott M. Ransom\n\n"); /* The following is the root of all the output files */ outfilenm = (char *) calloc(slen, sizeof(char)); sprintf(outfilenm, "%s_rfifind", cmd->outfile); /* And here are the output file names */ maskfilenm = (char *) calloc(slen, sizeof(char)); sprintf(maskfilenm, "%s.mask", outfilenm); bytemaskfilenm = (char *) calloc(slen, sizeof(char)); sprintf(bytemaskfilenm, "%s.bytemask", outfilenm); rfifilenm = (char *) calloc(slen, sizeof(char)); sprintf(rfifilenm, "%s.rfi", outfilenm); statsfilenm = (char *) calloc(slen, sizeof(char)); sprintf(statsfilenm, "%s.stats", outfilenm); sprintf(idata.name, "%s", outfilenm); if (RAWDATA) { if (cmd->filterbankP) s.datatype = SIGPROCFB; else if (cmd->psrfitsP) s.datatype = PSRFITS; else if (cmd->pkmbP) s.datatype = SCAMP; else if (cmd->bcpmP) s.datatype = BPP; else if (cmd->wappP) s.datatype = WAPP; else if (cmd->spigotP) s.datatype = SPIGOT; } else { // Attempt to auto-identify the data identify_psrdatatype(&s, 1); if (s.datatype==SIGPROCFB) cmd->filterbankP = 1; else if (s.datatype==PSRFITS) cmd->psrfitsP = 1; else if (s.datatype==SCAMP) cmd->pkmbP = 1; else if (s.datatype==BPP) cmd->bcpmP = 1; else if (s.datatype==WAPP) cmd->wappP = 1; else if (s.datatype==SPIGOT) cmd->spigotP = 1; else if (s.datatype==SUBBAND) insubs = 1; else { printf("Error: Unable to identify input data files. 
Please specify type.\n\n"); exit(1); } } /* Read an input mask if wanted */ if (cmd->maskfileP) { read_mask(cmd->maskfile, &oldmask); printf("Read old mask information from '%s'\n\n", cmd->maskfile); } else { oldmask.numchan = oldmask.numint = 0; } if (!cmd->nocomputeP) { if (RAWDATA || insubs) { char description[40]; psrdatatype_description(description, s.datatype); if (s.num_files > 1) printf("Reading %s data from %d files:\n", description, s.num_files); else printf("Reading %s data from 1 file:\n", description); if (insubs) s.files = (FILE **)malloc(sizeof(FILE *) * s.num_files); for (ii = 0; ii < s.num_files; ii++) { printf(" '%s'\n", cmd->argv[ii]); if (insubs) s.files[ii] = chkfopen(cmd->argv[ii], "rb"); } printf("\n"); } if (RAWDATA) { read_rawdata_files(&s); print_spectra_info_summary(&s); spectra_info_to_inf(&s, &idata); ptsperblock = s.spectra_per_subint; numchan = s.num_channels; idata.dm = 0.0; writeinf(&idata); } if (insubs) { /* Set-up values if we are using subbands */ char *tmpname, *root, *suffix; if (split_root_suffix(s.filenames[0], &root, &suffix) == 0) { printf("Error: The input filename (%s) must have a suffix!\n\n", s.filenames[0]); exit(1); } if (strncmp(suffix, "sub", 3) == 0) { tmpname = calloc(strlen(root) + 6, 1); sprintf(tmpname, "%s.sub", root); readinf(&idata, tmpname); free(tmpname); } else { printf("\nThe input files (%s) must be subbands! (i.e. 
*.sub##)\n\n", s.filenames[0]); exit(1); } free(root); free(suffix); ptsperblock = 1; /* Compensate for the fact that we have subbands and not channels */ idata.freq = idata.freq - 0.5 * idata.chan_wid + 0.5 * idata.chan_wid * (idata.num_chan / s.num_files); idata.chan_wid = idata.num_chan / s.num_files * idata.chan_wid; idata.num_chan = numchan = s.num_files; idata.dm = 0.0; sprintf(idata.name, "%s", outfilenm); writeinf(&idata); s.padvals = gen_fvect(s.num_files); for (ii = 0 ; ii < s.num_files ; ii++) s.padvals[ii] = 0.0; } if (cmd->maskfileP) determine_padvals(cmd->maskfile, &oldmask, s.padvals); /* The number of data points and blocks to work with at a time */ if (cmd->blocksP) { blocksperint = cmd->blocks; cmd->time = blocksperint * ptsperblock * idata.dt; } else { blocksperint = (int) (cmd->time / (ptsperblock * idata.dt) + 0.5); // Must process at least 1 block at a time if (blocksperint==0) blocksperint = 1; } ptsperint = blocksperint * ptsperblock; numint = (long long) idata.N / ptsperint; if ((long long) idata.N % ptsperint) numint++; inttime = ptsperint * idata.dt; printf("Analyzing data sections of length %d points (%.6g sec).\n", ptsperint, inttime); { int *factors, numfactors; factors = get_prime_factors(ptsperint, &numfactors); printf(" Prime factors are: "); for (ii = 0; ii < numfactors; ii++) printf("%d ", factors[ii]); printf("\n"); if (factors[numfactors - 1] > 13) { printf(" WARNING: The largest prime factor is pretty big! This will\n" " cause the FFTs to take a long time to compute. 
I\n" " recommend choosing a different -time value.\n"); } printf("\n"); free(factors); } /* Allocate our workarrays */ if (RAWDATA) rawdata = gen_fvect(idata.num_chan * ptsperblock * blocksperint); else if (insubs) srawdata = gen_svect(idata.num_chan * ptsperblock * blocksperint); dataavg = gen_fmatrix(numint, numchan); datastd = gen_fmatrix(numint, numchan); datapow = gen_fmatrix(numint, numchan); chandata = gen_fvect(ptsperint); bytemask = gen_bmatrix(numint, numchan); for (ii = 0; ii < numint; ii++) for (jj = 0; jj < numchan; jj++) bytemask[ii][jj] = GOODDATA; rfivect = rfi_vector(rfivect, numchan, numint, 0, numrfivect); if (numbetween == 2) interptype = INTERBIN; else interptype = INTERPOLATE; /* Main loop */ printf("Writing mask data to '%s'.\n", maskfilenm); printf("Writing RFI data to '%s'.\n", rfifilenm); printf("Writing statistics to '%s'.\n\n", statsfilenm); printf("Massaging the data ...\n\n"); printf("Amount Complete = %3d%%", oldper); fflush(stdout); for (ii = 0; ii < numint; ii++) { /* Loop over the intervals */ newper = (int) ((float) ii / numint * 100.0 + 0.5); if (newper > oldper) { printf("\rAmount Complete = %3d%%", newper); fflush(stdout); oldper = newper; } /* Read a chunk of data */ if (RAWDATA) { read_rawblocks(rawdata, blocksperint, &s, &padding); // Clip nasty RFI if requested (we are not masking) if (s.clip_sigma > 0.0) clip_times(rawdata, ptsperint, s.num_channels, s.clip_sigma, s.padvals); } else if (insubs) { read_subband_rawblocks(s.files, s.num_files, srawdata, blocksperint, &padding); // TODO: should implement clipping for subbands } if (padding) for (jj = 0; jj < numchan; jj++) bytemask[ii][jj] |= PADDING; for (jj = 0; jj < numchan; jj++) { /* Loop over the channels */ if (RAWDATA) get_channel(chandata, jj, blocksperint, rawdata, &s); else if (insubs) get_subband(jj, chandata, srawdata, blocksperint); /* Calculate the averages and standard deviations */ /* for each point in time. 
*/ if (padding) { dataavg[ii][jj] = 0.0; datastd[ii][jj] = 0.0; datapow[ii][jj] = 1.0; } else { avg_var(chandata, ptsperint, &davg, &dvar); dataavg[ii][jj] = davg; datastd[ii][jj] = sqrt(dvar); realfft(chandata, ptsperint, -1); numcands = 0; norm = datastd[ii][jj] * datastd[ii][jj] * ptsperint; if (norm == 0.0) norm = (chandata[0] == 0.0) ? 1.0 : chandata[0]; cands = search_fft((fcomplex *) chandata, ptsperint / 2, lobin, ptsperint / 2, harmsum, numbetween, interptype, norm, cmd->freqsigma, &numcands, &powavg, &powstd, &powmax); datapow[ii][jj] = powmax; /* Record the birdies */ if (numcands) { for (kk = 0; kk < numcands; kk++) { freq = cands[kk].r / inttime; candnum = find_rfi(rfivect, numrfi, freq, RFI_FRACTERROR); if (candnum >= 0) { update_rfi(rfivect + candnum, freq, cands[kk].sig, jj, ii); } else { update_rfi(rfivect + numrfi, freq, cands[kk].sig, jj, ii); numrfi++; if (numrfi == numrfivect) { numrfivect *= 2; rfivect = rfi_vector(rfivect, numchan, numint, numrfivect / 2, numrfivect); } } } free(cands); } } } } printf("\rAmount Complete = 100%%\n"); /* Write the data to the output files */ write_rfifile(rfifilenm, rfivect, numrfi, numchan, numint, ptsperint, lobin, numbetween, harmsum, fracterror, cmd->freqsigma); write_statsfile(statsfilenm, datapow[0], dataavg[0], datastd[0], numchan, numint, ptsperint, lobin, numbetween); } else { /* If "-nocompute" */ float freqsigma; /* Read the data from the output files */ printf("Reading RFI data from '%s'.\n", rfifilenm); printf("Reading statistics from '%s'.\n", statsfilenm); readinf(&idata, outfilenm); read_rfifile(rfifilenm, &rfivect, &numrfi, &numchan, &numint, &ptsperint, &lobin, &numbetween, &harmsum, &fracterror, &freqsigma); numrfivect = numrfi; read_statsfile(statsfilenm, &datapow, &dataavg, &datastd, &numchan, &numint, &ptsperint, &lobin, &numbetween); bytemask = gen_bmatrix(numint, numchan); printf("Reading bytemask from '%s'.\n\n", bytemaskfilenm); bytemaskfile = chkfopen(bytemaskfilenm, "rb"); 
chkfread(bytemask[0], numint * numchan, 1, bytemaskfile); fclose(bytemaskfile); for (ii = 0; ii < numint; ii++) for (jj = 0; jj < numchan; jj++) bytemask[ii][jj] &= PADDING; /* Clear all but the PADDING bits */ inttime = ptsperint * idata.dt; } /* Make the plots and set the mask */ { int *zapints, *zapchan; int numzapints = 0, numzapchan = 0; if (cmd->zapintsstrP) { zapints = ranges_to_ivect(cmd->zapintsstr, 0, numint - 1, &numzapints); zapints = (int *) realloc(zapints, (size_t) (sizeof(int) * numint)); } else { zapints = gen_ivect(numint); } if (cmd->zapchanstrP) { zapchan = ranges_to_ivect(cmd->zapchanstr, 0, numchan - 1, &numzapchan); zapchan = (int *) realloc(zapchan, (size_t) (sizeof(int) * numchan)); } else { zapchan = gen_ivect(numchan); } rfifind_plot(numchan, numint, ptsperint, cmd->timesigma, cmd->freqsigma, cmd->inttrigfrac, cmd->chantrigfrac, dataavg, datastd, datapow, zapchan, numzapchan, zapints, numzapints, &idata, bytemask, &oldmask, &newmask, rfivect, numrfi, cmd->rfixwinP, cmd->rfipsP, cmd->xwinP); vect_free(zapints); vect_free(zapchan); } /* Write the new mask and bytemask to the file */ write_mask(maskfilenm, &newmask); bytemaskfile = chkfopen(bytemaskfilenm, "wb"); chkfwrite(bytemask[0], numint * numchan, 1, bytemaskfile); fclose(bytemaskfile); /* Determine the percent of good and bad data */ { int numpad = 0, numbad = 0, numgood = 0; for (ii = 0; ii < numint; ii++) { for (jj = 0; jj < numchan; jj++) { if (bytemask[ii][jj] == GOODDATA) { numgood++; } else { if (bytemask[ii][jj] & PADDING) numpad++; else numbad++; } } } printf("\nTotal number of intervals in the data: %d\n\n", numint * numchan); printf(" Number of padded intervals: %7d (%6.3f%%)\n", numpad, (float) numpad / (float) (numint * numchan) * 100.0); printf(" Number of good intervals: %7d (%6.3f%%)\n", numgood, (float) numgood / (float) (numint * numchan) * 100.0); printf(" Number of bad intervals: %7d (%6.3f%%)\n\n", numbad, (float) numbad / (float) (numint * numchan) * 100.0); 
qsort(rfivect, numrfi, sizeof(rfi), compare_rfi_sigma); printf(" Ten most significant birdies:\n"); printf("# Sigma Period(ms) Freq(Hz) Number \n"); printf("----------------------------------------------------\n"); for (ii = 0; ii < 10; ii++) { double pperr; char temp1[40], temp2[40]; if (rfivect[ii].freq_var == 0.0) { pperr = 0.0; sprintf(temp1, " %-14g", rfivect[ii].freq_avg); sprintf(temp2, " %-14g", 1000.0 / rfivect[ii].freq_avg); } else { pperr = 1000.0 * sqrt(rfivect[ii].freq_var) / (rfivect[ii].freq_avg * rfivect[ii].freq_avg); nice_output_2(temp1, rfivect[ii].freq_avg, sqrt(rfivect[ii].freq_var), -15); nice_output_2(temp2, 1000.0 / rfivect[ii].freq_avg, pperr, -15); } printf("%-2d %-8.2f %13s %13s %-8d\n", ii + 1, rfivect[ii].sigma_avg, temp2, temp1, rfivect[ii].numobs); } qsort(rfivect, numrfi, sizeof(rfi), compare_rfi_numobs); printf("\n Ten most numerous birdies:\n"); printf("# Number Period(ms) Freq(Hz) Sigma \n"); printf("----------------------------------------------------\n"); for (ii = 0; ii < 10; ii++) { double pperr; char temp1[40], temp2[40]; if (rfivect[ii].freq_var == 0.0) { pperr = 0.0; sprintf(temp1, " %-14g", rfivect[ii].freq_avg); sprintf(temp2, " %-14g", 1000.0 / rfivect[ii].freq_avg); } else { pperr = 1000.0 * sqrt(rfivect[ii].freq_var) / (rfivect[ii].freq_avg * rfivect[ii].freq_avg); nice_output_2(temp1, rfivect[ii].freq_avg, sqrt(rfivect[ii].freq_var), -15); nice_output_2(temp2, 1000.0 / rfivect[ii].freq_avg, pperr, -15); } printf("%-2d %-8d %13s %13s %-8.2f\n", ii + 1, rfivect[ii].numobs, temp2, temp1, rfivect[ii].sigma_avg); } printf("\nDone.\n\n"); } /* Close the files and cleanup */ free_rfi_vector(rfivect, numrfivect); free_mask(newmask); if (cmd->maskfileP) free_mask(oldmask); free(outfilenm); free(statsfilenm); free(bytemaskfilenm); free(maskfilenm); free(rfifilenm); vect_free(dataavg[0]); vect_free(dataavg); vect_free(datastd[0]); vect_free(datastd); vect_free(datapow[0]); vect_free(datapow); vect_free(bytemask[0]); 
vect_free(bytemask); if (!cmd->nocomputeP) { // Close all the raw files and free their vectors close_rawfiles(&s); vect_free(chandata); if (insubs) vect_free(srawdata); else vect_free(rawdata); } return (0); }
int /* O [nbr] Thread number */ nco_openmp_ini /* [fnc] Initialize OpenMP threading environment */ (const int thr_nbr) /* I [nbr] User-requested thread number */ { /* Purpose: Initialize OpenMP multi-threading environment Honor user-requested thread number, balance against known code efficiency, print diagnostics Returns thr_nbr=1 in three situations: 1. UP codes (not threaded) 2. SMP codes compiled with compilers which lack OpenMP support 3. SMP codes where single thread requested/advised Otherwise returns system-dependent thr_nbr */ /* Using naked stdin/stdout/stderr in parallel region generates warning Copy appropriate filehandle to variable scoped shared in parallel clause */ char *nvr_OMP_NUM_THREADS; /* [sng] Environment variable OMP_NUM_THREADS */ char *sng_cnv_rcd=NULL_CEWI; /* [sng] strtol()/strtoul() return code */ FILE * const fp_stderr=stderr; /* [fl] stderr filehandle CEWI */ nco_bool USR_SPC_THR_RQS=False; int dyn_thr=1; /* [flg] Allow system to dynamically set number of threads */ int ntg_OMP_NUM_THREADS=int_CEWI; // [nbr] OMP_NUM_THREADS environment variable int prc_nbr_max; /* [nbr] Maximum number of processors available */ int thr_nbr_act; /* O [nbr] Number of threads NCO uses */ int thr_nbr_max_fsh=4; /* [nbr] Maximum number of threads program can use efficiently */ int thr_nbr_max=int_CEWI; /* [nbr] Maximum number of threads system allows */ int thr_nbr_rqs=int_CEWI; /* [nbr] Number of threads to request */ #ifndef _OPENMP if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO Build compiler lacked (or user turned off) OpenMP support. Code will execute with single thread in Uni-Processor (UP) mode.\n",nco_prg_nm_get()); return (int)1; #endif /* !_OPENMP */ /* Strategy: 0. Determine maximum number of threads system will allocate (thr_nbr_max) 1. Command-line thread request, if any, overrides automatic algorithm 2. If no command-line request then system allocates OMP_NUM_THREADS if possible 3. 
Reduce maximum number of threads available to system to thr_nbr_max_fsh Many operators cannot use more than thr_nbr_max_fsh ~ 2--4 threads efficiently Play nice: Set dynamic threading so that system can make efficiency decisions When dynamic threads are set, system never allocates more than thr_nbr_max_fsh */ if(thr_nbr < 0){ (void)fprintf(fp_stderr,"%s: ERROR User-requested thread number = %d is less than zero\n",nco_prg_nm_get(),thr_nbr); nco_exit(EXIT_FAILURE); } /* endif err */ if(thr_nbr == 0) if(nco_dbg_lvl_get() >= nco_dbg_scl && nco_dbg_lvl_get() != nco_dbg_dev ) (void)fprintf(fp_stderr,"%s: INFO User did not specify thread request > 0 on command line. NCO will automatically assign threads based on OMP_NUM_THREADS environment and machine capabilities.\nHINT: Not specifiying any --thr_nbr (or specifying --thr_nbr=0) causes NCO to try to pick the optimal thread number. Specifying --thr_nbr=1 tells NCO to execute in Uni-Processor (UP) (i.e., single-threaded) mode.\n",nco_prg_nm_get()); if(thr_nbr > 0) USR_SPC_THR_RQS=True; prc_nbr_max=omp_get_num_procs(); /* [nbr] Maximum number of processors available */ if(omp_in_parallel()){ (void)fprintf(fp_stderr,"%s: ERROR Attempted to get maximum thread number from within parallel region\n",nco_prg_nm_get()); nco_exit(EXIT_FAILURE); }else{ thr_nbr_max=omp_get_max_threads(); /* [nbr] Maximum number of threads system allows */ } /* end error */ if(nco_dbg_lvl_get() >= nco_dbg_scl && nco_dbg_lvl_get() != nco_dbg_dev){ if((nvr_OMP_NUM_THREADS=getenv("OMP_NUM_THREADS"))) ntg_OMP_NUM_THREADS=(int)strtol(nvr_OMP_NUM_THREADS,&sng_cnv_rcd,NCO_SNG_CNV_BASE10); /* [sng] Environment variable OMP_NUM_THREADS */ if(nvr_OMP_NUM_THREADS && *sng_cnv_rcd) nco_sng_cnv_err(nvr_OMP_NUM_THREADS,"strtol",sng_cnv_rcd); (void)fprintf(fp_stderr,"%s: INFO Environment variable OMP_NUM_THREADS ",nco_prg_nm_get()); if(ntg_OMP_NUM_THREADS > 0) (void)fprintf(fp_stderr,"= %d\n",ntg_OMP_NUM_THREADS); else (void)fprintf(fp_stderr,"does not exist\n"); 
(void)fprintf(fp_stderr,"%s: INFO omp_get_num_procs() reports number of processors available is %d\n",nco_prg_nm_get(),prc_nbr_max); (void)fprintf(fp_stderr,"%s: INFO omp_get_max_threads() reports maximum number of threads system allows is %d\n",nco_prg_nm_get(),thr_nbr_max); } /* endif dbg */ if(USR_SPC_THR_RQS){ /* Try to honor user-specified thread request... */ thr_nbr_rqs=thr_nbr; /* [nbr] Number of threads to request */ /* ...if possible... */ if(nco_dbg_lvl_get() >= nco_dbg_scl) (void)fprintf(fp_stderr,"%s: INFO Command-line requests %d thread%s\n",nco_prg_nm_get(),thr_nbr,(thr_nbr > 1) ? "s" : ""); if(thr_nbr > thr_nbr_max){ (void)fprintf(fp_stderr,"%s: WARNING Reducing user-requested thread number = %d to maximum thread number allowed = %d\n",nco_prg_nm_get(),thr_nbr,thr_nbr_max); thr_nbr_rqs=thr_nbr_max; /* [nbr] Number of threads to request */ } /* endif */ }else{ /* !USR_SPC_THR_RQS */ /* Otherwise use automatic thread allocation algorithm */ /* Request maximum number of threads permitted */ thr_nbr_rqs=thr_nbr_max; /* [nbr] Number of threads to request */ /* Restrict threading on per-program basis to play nicely with others */ switch(nco_prg_id_get()){ /* Operators with pre-set thread limit NB: All operators currently have default restrictions 2007: Only ncwa and ncap2 have a chance to scale on non-parallel filesystems ncap2 may, one day, see a big performance boost from threading However, as of 20090327, ncap2 threading may be buggy due to ANTLR Moreover, we want to prevent hogging processes on 32-way nodes until/unless clear benefits of threading are demonstrated. 
2015: Threads improve ncks regridding performance by 2-3x on ACME ~1-20 GB netCDF3 files */ case ncap: /* 20090327: Restrict ncap2 to one thread until ANTLR threading resolved */ thr_nbr_max_fsh=1; break; case ncecat: case ncrcat: /* ncecat and ncrcat are extremely I/O intensive Maximum efficiency when one thread reads from input file while other writes to output file */ // 20140219: Turn-off OpenMP until thoroughly tested // thr_nbr_max_fsh=2; thr_nbr_max_fsh=1; break; case ncks: // 20150529: Turn-on OpenMP for regridder thr_nbr_max_fsh=16; break; case ncwa: // 20150530: Turn-on OpenMP for debugging // 20150610: Eight threads with ncwa seemed to work for a little while, then it got flaky. Turned-off for 4.5.0 release // 20150622: Allowing eight threads again for debugging with -D 3 // 20150701: Firmly established that netCDF4 involvement hoses threading because HDF5 is not threadsafe by default // 20150710: Turned-off for 4.5.1 release // Symptoms of bugs, if any, show up with // cd ~/nco/bm;nco_bm.pl --regress ncwa;cd - thr_nbr_max_fsh=1; if(nco_dbg_lvl_get() >= nco_dbg_scl) thr_nbr_max_fsh=1; break; /* Operators with higher maximum pre-set thread limit (NB: not all of these are threaded!) */ case ncra: thr_nbr_max_fsh=1; if(nco_dbg_lvl_get() >= nco_dbg_scl) thr_nbr_max_fsh=1; break; case ncbo: case ncatted: case ncfe: case ncflint: case ncpdq: case ncrename: case ncge: // 20140219: Turn-off OpenMP until thoroughly tested // thr_nbr_max_fsh=4; thr_nbr_max_fsh=1; break; default: nco_dfl_case_prg_id_err(); break; } /* end case */ /* Automatic algorithm tries to play nice with others */ (void)omp_set_dynamic(dyn_thr); /* [flg] Allow system to dynamically set number of threads */ if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO omp_set_dynamic() used to %s OS to dynamically set threads\n",nco_prg_nm_get(),(dyn_thr ? 
"ALLOW" : "DISALLOW")); dyn_thr=omp_get_dynamic(); /* [flg] Allow system to dynamically set number of threads */ if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO omp_get_dynamic() reports system will%s utilize dynamic threading\n",nco_prg_nm_get(),(dyn_thr ? "" : " NOT")); /* Apply program/system limitations */ if(thr_nbr_max > thr_nbr_max_fsh){ if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO Reducing default thread number from %d to %d, an operator-dependent \"play-nice\" number set in nco_openmp_ini()\n",nco_prg_nm_get(),thr_nbr_max,thr_nbr_max_fsh); thr_nbr_rqs=thr_nbr_max_fsh; /* [nbr] Number of threads to request */ } /* endif */ } /* !USR_SPC_THR_RQS */ #ifdef ENABLE_NETCDF4 if(nco_prg_id_get() != ncks && nco_prg_id_get() != ncwa && nco_prg_id_get() != ncra && thr_nbr_rqs > 1){ if(USR_SPC_THR_RQS && nco_dbg_lvl_get() >= nco_dbg_fl) (void)fprintf(stdout,"%s: WARNING This is TODO nco939. Requested threading with netCDF4 (HDF5) support. The NCO thread request algorithm considers user-input, environment variables, and software and hardware limitations in determining the number of threads to request, thr_nbr_rqs. At this point NCO would request result %d threads from a netCDF3-based library. However, this NCO was built with netCDF4, which relies on HDF5. netCDF4 is not thread-safe unless HDF5 is configured with the (non-default) --enable-threadsafe option. NCO currently has no way to know whether HDF5 was built thread-safe. Hence, all netCDF4-based operators are currently restricted to a single thread. The program will now automatically set thr_nbr_rqs = 1.\nThis unfortunate limitation is necessary to keep the NCO developers sane. 
If you want/need threading in netCDF4-based NCO, please politely yet firmly request of the Unidata netCDF developers that better thread support be built into netCDF4, and request of the HDF5 developers that they make the --enable-threadsafe option compatible with all HDF5 libraries and APIs, including Fortran (which, as of HDF5 1.8.0 in 2008, is incompatible with --enable-threadsafe).\n",nco_prg_nm_get(),thr_nbr_rqs); thr_nbr_rqs=1; } /* endif */ #endif /* !ENABLE_NETCDF4 */ /* Set thread number */ if(omp_in_parallel()){ (void)fprintf(fp_stderr,"%s: ERROR Attempted to set thread number from within parallel region\n",nco_prg_nm_get()); nco_exit(EXIT_FAILURE); }else{ (void)omp_set_num_threads(thr_nbr_rqs); if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO omp_set_num_threads() used to set execution environment to spawn teams of %d thread(s)\n",nco_prg_nm_get(),thr_nbr_rqs); } /* end error */ thr_nbr_act=omp_get_max_threads(); if(nco_dbg_lvl_get() >= nco_dbg_scl) (void)fprintf(fp_stderr,"%s: INFO After using omp_set_num_threads() to adjust for any user requests/NCO optimizations, omp_get_max_threads() reports that a parallel construct here/now would spawn %d thread(s)\n",nco_prg_nm_get(),thr_nbr_act); #ifdef _OPENMP if(nco_dbg_lvl_get() >= nco_dbg_scl){ # pragma omp parallel default(none) shared(thr_nbr_act) { /* begin OpenMP parallel */ # pragma omp single nowait { /* begin OpenMP single */ thr_nbr_act=omp_get_num_threads(); /* [nbr] Number of threads NCO uses */ if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO Small parallel test region spawned team of %d thread(s)\n",nco_prg_nm_get(),thr_nbr_act); } /* end OpenMP single */ } /* end OpenMP parallel */ } /* end dbg */ #endif /* !_OPENMP */ /* Issue any warnings about OpenMP credibility during debugging phase */ if(True) if((nco_prg_id_get() == ncwa || nco_prg_id_get() == ncra) && thr_nbr_act > 1) if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: WARNING 
OpenMP threading active with %d threads but not guaranteed to work on this operator. If strange behavior (e.g., NaN results) ensues, manually turn-off multi-threading by specifying \"-t 1\" option.\n",nco_prg_nm_get(),thr_nbr_act); return thr_nbr_act; /* O [nbr] Number of threads NCO uses */ } /* end nco_openmp_ini() */
/*
 * lastprivate 004: checks `lastprivate` on a work-sharing `for` when the
 * variables are split across two lastprivate clauses.
 *
 * Each iteration writes its index to prvt1..prvt3 and verifies the values
 * survive the call to barrier(); after the loop every thread expects the
 * three variables to hold the value from the last iteration (thds - 1).
 * Mismatches are counted in `errors`.
 *
 * Returns 0 and prints SUCCESS when no error was counted, otherwise returns 1
 * and prints FAILED. Exits with status 0 early if only one thread is
 * available.
 */
main ()
{
  int i;

  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* fix the team size so the loop below has exactly thds iterations/threads */
  omp_set_dynamic (0);

  #pragma omp parallel
  {
    /* chunk size 1: one iteration per thread */
    #pragma omp for schedule(static,1) lastprivate (prvt1,prvt2) lastprivate (prvt3)
    for (i=0; i<thds; i++) {
      prvt1 = i;
      prvt2 = i;
      prvt3 = i;
      /* barrier() is defined elsewhere in the file; presumably it makes all
       * thds iterations meet here before the values are re-checked */
      barrier (thds);
      if (prvt1 != i) {
        #pragma omp critical
        errors += 1;
      }
      if (prvt2 != i) {
        #pragma omp critical
        errors += 1;
      }
      if (prvt3 != i) {
        #pragma omp critical
        errors += 1;
      }
      /* iteration 0 is delayed via waittime() (defined elsewhere) so it is
       * presumably not the last one to finish */
      if (i==0) {
        waittime (1);
      }
      prvt1 = i;
      prvt2 = i;
      prvt3 = i;
    }

    /* after the loop the originals must hold the last iteration's values */
    if (prvt1 != thds - 1) {
      #pragma omp critical
      errors += 1;
    }
    if (prvt2 != thds - 1) {
      #pragma omp critical
      errors += 1;
    }
    if (prvt3 != thds - 1) {
      #pragma omp critical
      errors += 1;
    }
  }

  /* func() is defined elsewhere; it is called from a parallel region with
   * thds and once serially with 1 */
  #pragma omp parallel
  func (thds);

  func (1);

  if (errors == 0) {
    printf ("lastprivate 004 : SUCCESS\n");
    return 0;
  } else {
    printf ("lastprivate 004 : FAILED\n");
    return 1;
  }
}
int main() { double AllTime=PortableGetTime(); double x0=0.0, y0=0.0, z0=0.0; double xn=10.0, yn=10.0, zn=10.0; int Sx=300, Sy=300, Sz=300, St=100; double * masprev; double * masnext; masprev=new double[Sx*Sy*Sz]; masnext=new double[Sx*Sy*Sz]; double dx=(xn-x0)/Sx, dy=(yn-y0)/Sy, dz=(zn-z0)/Sz; FILE* filex=fopen("filex.txt","w"); FILE* filey=fopen("filey.txt","w"); FILE* filez=fopen("filez.txt","w"); double dt=0.00001; //выбираем dt omp_set_dynamic(0); // запретить библиотеке openmp менять число потоков во время исполнения omp_set_num_threads(4); // установить число потоков в 10 memset(masprev, 0, Sx*Sy*Sz*sizeof(double)); memset(masnext, 0, Sx*Sy*Sz*sizeof(double)); for (int x=1; x<Sx-1; x++) for(int y=1; y<Sy-1; y++) for(int z=1; z<Sz-1; z++) masprev[x+y*Sx+z*Sx*Sy]=u(x0+dx*x, y0+dy*y, z0+dz*z); fprintf(filex,"%e\n", dx); fprintf(filey,"%e\n", dy); fprintf(filez,"%e\n", dz); fprintf(filex,"%i\n", Sx); fprintf(filey,"%i\n", Sy); fprintf(filez,"%i\n", Sz); for(int x=0; x<Sx; x++) fprintf(filex,"%lf ", masprev[x+49*Sx+49*Sx*Sy]); for(int y=0; y<Sy; y++) fprintf(filey,"%lf ", masprev[49+y*Sx+49*Sx*Sy]); for(int z=0; z<Sz; z++) fprintf(filez,"%lf ", masprev[49+49*Sx+z*Sx*Sy]); fprintf(filex,"\n"); fprintf(filey,"\n"); fprintf(filez,"\n"); double Time=PortableGetTime(); for (int t=1; t<St; t++) { #pragma omp parallel for for (int z=1; z<Sz-1; z++) { for(int y=1; y<Sy-1; y++) { for(int x=1; x<Sx-1; x++) { masnext[x+y*Sx+z*Sx*Sy]= dt*((masprev[(x+1)+y*Sx+z*Sx*Sy]-2*masprev[x+y*Sx+z*Sx*Sy]+masprev[(x-1)+y*Sx+z*Sx*Sy])/(dx*dx) +(masprev[x+(y+1)*Sx+z*Sx*Sy]-2*masprev[x+y*Sx+z*Sx*Sy]+masprev[x+(y-1)*Sx+z*Sx*Sy])/(dy*dy) +(masprev[x+y*Sx+(z+1)*Sx*Sy]-2*masprev[x+y*Sx+z*Sx*Sy]+masprev[x+y*Sx+(z-1)*Sx*Sy])/(dz*dz) +f(x0+dx*x, y0+dy*y, z0+dz*z)-masprev[x+y*Sx+z*Sx*Sy])+masprev[x+y*Sx+z*Sx*Sy]; } } } double* tmp=masprev; masprev=masnext; masnext=tmp; } Time=PortableGetTime()-Time; fprintf(filex,"%i\n", Sx); fprintf(filey,"%i\n", Sy); fprintf(filez,"%i\n", Sz); for(int x=0; x<Sx; 
x++) fprintf(filex,"%lf ", masprev[x+49*Sx+49*Sx*Sy]); for(int y=0; y<Sy; y++) fprintf(filey,"%lf ", masprev[49+y*Sx+49*Sx*Sy]); for(int z=0; z<Sz; z++) fprintf(filez,"%lf ", masprev[49+49*Sx+z*Sx*Sy]); fprintf(filex,"\n"); fprintf(filey,"\n"); fprintf(filez,"\n"); AllTime=PortableGetTime()-AllTime; printf(" %lf \n %lf \n",Time, AllTime); fclose(filex); fclose(filey); fclose(filez); delete[] masprev; delete[] masnext; return 0; }
/*
 * reduction 016: checks the `reduction` clause on `parallel for` for every
 * operator (+, -, *, &, |, ^, &&, ||) and for the different statement forms
 * that may update a reduction variable (x++, ++x, x op= e, x = x op e,
 * x = e op x).
 *
 * The rdct_* variables are updated inside the parallel loop; the rst_*
 * variables get the same initial values and the same updates in a serial
 * loop. Any difference between an rdct_* value and its rst_* twin, or any
 * reduction variable whose size inside the loop is not sizeof(long long),
 * counts as an error.
 *
 * Returns 0 and prints SUCCESS when no error was counted, otherwise returns 1
 * and prints FAILED. Exits with status 0 early if only one thread is
 * available.
 */
main ()
{
  int i;

  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  omp_set_dynamic (0);

  /* initial values of the variables reduced in parallel */
  rdct_inc = rdct_inc2 = rdct_pls = rdct_pls2 = rdct_pls3 = 1;
  rdct_dec = rdct_dec2 = rdct_mns = rdct_mns2 = 2;
  rdct_mul = rdct_mul2 = rdct_mul3 = 3;
  rdct_land = rdct_land2 = rdct_land3 = -2;
  rdct_lor = rdct_lor2 = rdct_lor3 = 1;
  rdct_xor = rdct_xor2 = rdct_xor3 = 2;
  rdct_and = rdct_and2 = 1;
  rdct_or = rdct_or2 = 0;

  /* one reduction clause per operator; note the *_land / *_lor names are used
   * with the bitwise operators and *_and / *_or with the logical ones */
  #pragma omp parallel for reduction(+:rdct_inc,rdct_inc2,rdct_pls,rdct_pls2,rdct_pls3) \
                           reduction(-:rdct_dec,rdct_dec2,rdct_mns,rdct_mns2) \
                           reduction(*:rdct_mul,rdct_mul2,rdct_mul3) \
                           reduction(&:rdct_land,rdct_land2,rdct_land3) \
                           reduction (|:rdct_lor,rdct_lor2,rdct_lor3) \
                           reduction (^:rdct_xor,rdct_xor2,rdct_xor3) \
                           reduction (&&:rdct_and,rdct_and2) \
                           reduction (||:rdct_or,rdct_or2)
  for (i=0; i<LOOPNUM; i++) {
    /* + */
    rdct_inc ++;
    ++ rdct_inc2;
    rdct_pls += i;
    rdct_pls2 = rdct_pls2 + i;
    rdct_pls3 = i + rdct_pls3;

    /* - */
    rdct_dec --;
    -- rdct_dec2;
    rdct_mns -= i;
    rdct_mns2 = rdct_mns2 - i;

    /* * */
    rdct_mul *= i;
    rdct_mul2 = rdct_mul2 * i;
    rdct_mul3 = i * rdct_mul3;

    /* & (the shift 1<<i relies on LOOPNUM, defined elsewhere, being smaller
     * than the bit width of int - TODO confirm) */
    rdct_land &= 1<<i;
    rdct_land2 = rdct_land2 & (1<<i);
    rdct_land3 = (1<<i) & rdct_land3;

    /* | */
    rdct_lor |= 1<<i;
    rdct_lor2 = rdct_lor2 | (1<<i);
    rdct_lor3 = (1<<i) | rdct_lor3;

    /* ^ */
    rdct_xor ^= 1<<i;
    rdct_xor2 = rdct_xor2 ^ (1<<i);
    rdct_xor3 = (1<<i) ^ rdct_xor3;

    /* && */
    rdct_and = rdct_and && i;
    rdct_and2 = (i+1) && rdct_and2;

    /* || */
    rdct_or = rdct_or || i;
    rdct_or2 = 0 || rdct_or2;

    /* the private copies must keep the size of long long */
    if (sizeof(rdct_inc) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_inc2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_pls) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_pls2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_pls3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_dec) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_dec2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mns) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mns2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mul) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mul2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mul3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_land) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_land2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_land3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_lor) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_lor2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_lor3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_xor) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_xor2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_xor3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_and) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_and2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_or) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_or2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
  }

  /* serial reference: same initial values, same updates, no OpenMP */
  rst_inc = rst_inc2 = rst_pls = rst_pls2 = rst_pls3 = 1;
  rst_dec = rst_dec2 = rst_mns = rst_mns2 = 2;
  rst_mul = rst_mul2 = rst_mul3 = 3;
  rst_land = rst_land2 = rst_land3 = -2;
  rst_lor = rst_lor2 = rst_lor3 = 1;
  rst_xor = rst_xor2 = rst_xor3 = 2;
  rst_and = rst_and2 = 1;
  rst_or = rst_or2 = 0;

  for (i=0; i<LOOPNUM; i++) {
    rst_inc ++;
    ++ rst_inc2;
    rst_pls += i;
    rst_pls2 = rst_pls2 + i;
    rst_pls3 = i + rst_pls3;

    rst_dec --;
    -- rst_dec2;
    rst_mns -= i;
    rst_mns2 = rst_mns2 - i;

    rst_mul *= i;
    rst_mul2 = rst_mul2 * i;
    rst_mul3 = i * rst_mul3;

    rst_land &= 1<<i;
    rst_land2 = rst_land2 & (1<<i);
    rst_land3 = (1<<i) & rst_land3;

    rst_lor |= 1<<i;
    rst_lor2 = rst_lor2 | (1<<i);
    rst_lor3 = (1<<i) | rst_lor3;

    rst_xor ^= 1<<i;
    rst_xor2 = rst_xor2 ^ (1<<i);
    rst_xor3 = (1<<i) ^ rst_xor3;

    rst_and = rst_and && i;
    rst_and2 = (i+1) && rst_and2;

    rst_or = rst_or || i;
    rst_or2 = 0 || rst_or2;
  }

  /* the parallel reduction must reproduce the serial result exactly */
  if (rst_inc != rdct_inc) {
    errors += 1;
  }
  if (rst_inc2 != rdct_inc2) {
    errors += 1;
  }
  if (rst_pls != rdct_pls) {
    errors += 1;
  }
  if (rst_pls2 != rdct_pls2) {
    errors += 1;
  }
  if (rst_pls3 != rdct_pls3) {
    errors += 1;
  }
  if (rst_dec != rdct_dec) {
    errors += 1;
  }
  if (rst_dec2 != rdct_dec2) {
    errors += 1;
  }
  if (rst_mns != rdct_mns) {
    errors += 1;
  }
  if (rst_mns2 != rdct_mns2) {
    errors += 1;
  }
  if (rst_mul != rdct_mul) {
    errors += 1;
  }
  if (rst_mul2 != rdct_mul2) {
    errors += 1;
  }
  if (rst_mul3 != rdct_mul3) {
    errors += 1;
  }
  if (rst_land != rdct_land) {
    errors += 1;
  }
  if (rst_land2 != rdct_land2) {
    errors += 1;
  }
  if (rst_land3 != rdct_land3) {
    errors += 1;
  }
  if (rst_lor != rdct_lor) {
    errors += 1;
  }
  if (rst_lor2 != rdct_lor2) {
    errors += 1;
  }
  if (rst_lor3 != rdct_lor3) {
    errors += 1;
  }
  if (rst_xor != rdct_xor) {
    errors += 1;
  }
  if (rst_xor2 != rdct_xor2) {
    errors += 1;
  }
  if (rst_xor3 != rdct_xor3) {
    errors += 1;
  }
  if (rst_and != rdct_and) {
    errors += 1;
  }
  if (rst_and2 != rdct_and2) {
    errors += 1;
  }
  if (rst_or != rdct_or) {
    errors += 1;
  }
  if (rst_or2 != rdct_or2) {
    errors += 1;
  }

  if (errors == 0) {
    printf ("reduction 016 : SUCCESS\n");
    return 0;
  } else {
    printf ("reduction 016 : FAILED\n");
    return 1;
  }
}
void test_product_large() { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_1( test_aliasing<float>() ); } #if defined EIGEN_TEST_PART_6 { // test a specific issue in DiagonalProduct int N = 1000000; VectorXf v = VectorXf::Ones(N); MatrixXf m = MatrixXf::Ones(N,3); m = (v+v).asDiagonal() * m; VERIFY_IS_APPROX(m, MatrixXf::Constant(N,3,2)); } { // test deferred resizing in Matrix::operator= MatrixXf a = MatrixXf::Random(10,4), b = MatrixXf::Random(4,10), c = a; VERIFY_IS_APPROX((a = a * b), (c * b).eval()); } { // check the functions to setup blocking sizes compile and do not segfault // FIXME check they do what they are supposed to do !! 
std::ptrdiff_t l1 = internal::random<int>(10000,20000); std::ptrdiff_t l2 = internal::random<int>(100000,200000); std::ptrdiff_t l3 = internal::random<int>(1000000,2000000); setCpuCacheSizes(l1,l2,l3); VERIFY(l1==l1CacheSize()); VERIFY(l2==l2CacheSize()); std::ptrdiff_t k1 = internal::random<int>(10,100)*16; std::ptrdiff_t m1 = internal::random<int>(10,100)*16; std::ptrdiff_t n1 = internal::random<int>(10,100)*16; // only makes sure it compiles fine internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1); } { // test regression in row-vector by matrix (bad Map type) MatrixXf mat1(10,32); mat1.setRandom(); MatrixXf mat2(32,32); mat2.setRandom(); MatrixXf r1 = mat1.row(2)*mat2.transpose(); VERIFY_IS_APPROX(r1, (mat1.row(2)*mat2.transpose()).eval()); MatrixXf r2 = mat1.row(2)*mat2; VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval()); } { Eigen::MatrixXd A(10,10), B, C; A.setRandom(); C = A; for(int k=0; k<79; ++k) C = C * A; B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))) * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))); VERIFY_IS_APPROX(B,C); } #endif // Regression test for bug 714: #if defined EIGEN_HAS_OPENMP omp_set_dynamic(1); for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) ); } #endif }
double run(int threads_num, double b) { int i, j, step,k; double* y = calloc(N + 1, sizeof(double)); /* сеточное решение */ double* dy = calloc(N + 1, sizeof(double)); /* разность y^n-y^n+1 двух соседних приближений по итерациям метода Ньютона */ double *A[R], *B[R], *C[R], *G[R]; /* коэффициенты трёхдиагональной системы для каждого шага редукции */ double begin, end; omp_set_dynamic(0); /* нельзя динамически изменять количество нитей */ omp_set_num_threads(threads_num); /* 4 нити */ for(i = 0; i < R; i++) { A[i] = calloc(N + 1, sizeof(double)); B[i] = calloc(N + 1, sizeof(double)); C[i] = calloc(N + 1, sizeof(double)); G[i] = calloc(N + 1, sizeof(double)); } begin = omp_get_wtime(); /* начальная точка отсчёта времени */ for( k = 0; k < REPEATS; k++){ #pragma omp parallel private(i, j) { #pragma omp for for(i = 0; i <= N; i++) y[i] = 1.0 + (b - 1.0) * i / N; /* нулевое приближение */ #pragma omp single { dy[0] = dy[N] = 0.0; for(j = 0; j < R; j++) B[j][0] = B[j][N] = 1.0; /* при редукции крайние значения матрицы одни и те же во всех итерациях метода Ньютона */ } while(1) /* итерации метода Ньютона в цикле */ { #pragma omp for for(i = 1; i < N; i++) /* изначальные значения коэффициентов */ { B[0][i] = (-2.0 / (h * h) - 5 * exp(y[i]) / 6); A[0][i] = (1.0 / (h * h) - exp(y[i - 1]) / 12); C[0][i] = (1.0 / (h * h) - exp(y[i + 1]) / 12); G[0][i] = my_func(y, b, i); } for(j = 1; j < R; j++) /* значения коэффициентов после редукции */ { step = pow(2, j); /* шаг прогонки при редукции */ #pragma omp for for(i = step; i < N; i += step) { B[j][i] = B[j - 1][i] - A[j - 1][i] * C[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * A[j - 1][i + step / 2] / B[j - 1][i + step / 2]; A[j][i] = - A[j - 1][i] * A[j - 1][i - step / 2] / B[j - 1][i - step / 2]; C[j][i] = - C[j - 1][i] * C[j - 1][i + step / 2] / B[j - 1][i + step / 2]; G[j][i] = G[j - 1][i] - A[j - 1][i] * G[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * G[j - 1][i + step / 2] / B[j - 1][i + step 
/ 2]; } } /* редукция прогонки завершена */ #pragma omp single { dy[N / 2] = G[R - 1][N / 2] / B[R - 1][N / 2]; /* первый обратный шаг редукции */ dy[N / 4] = (G[R - 2][N / 4] - C[R - 2][N / 4] * dy[N / 2]) / B[R - 2][N / 4]; dy[N * 3 / 4] = (G[R - 2][N * 3 / 4] - A[R - 2][N * 3 / 4] * dy[N / 2] ) / B[R - 2][N * 3 / 4]; /* второй обратный шаг редукции */ } for(j = R - 3; j >= 0; j--) { step = pow(2, j); #pragma omp for for(i = step; i < N; i += 2 * step) dy[i] = (G[j][i] - C[j][i] * dy[i + step] - A[j][i] * dy[i - step]) / B[j][i]; } /* оставшиеся обратные шаги редукции */ #pragma omp for for(i = 0; i <= N; i++) y[i] -= dy[i]; /* одна итерация метода Ньютона */ if (norm(dy) < epsilon) break; /* условие останова метода Ньютона */ } } } end = omp_get_wtime(); /* конечная точка отсчёта времени */ for(i = 0; i < R; i++) { free(A[i]); free(B[i]); free(C[i]); free(G[i]); } if( threads_num == 1){ char str_dest[50]; sprintf( str_dest, "prog_1_b_%f_results.txt",b); FILE* fp = fopen(str_dest, "w"); /* вывод полученной функции в файл */ fprintf(fp, "X\tY\r\n"); for(i = 0; i <= N; i++) fprintf(fp, "%e\t%e\r\n", ((double) i / N), y[i]); fclose(fp); } free(y); free(dy); return (end - begin)/REPEATS; }
/*
 * Runs one quickprop training epoch over `data`, sharing the per-sample work
 * among `threadnumb` OpenMP threads.
 *
 * Phase 1: ann_vect[] is filled with fann_copy() copies of `ann`. Each thread
 *          runs the forward pass (with dropout when ann->do_dropout is set),
 *          the MSE computation, backpropagation and slope accumulation for
 *          its share of the samples on the copy indexed by its thread number.
 * Phase 2: for every weight the slopes of all copies are summed (and reset to
 *          zero), the quickprop step is derived from the previous step and
 *          slope, and the weight of `ann` is updated, limited to
 *          [-1500, 1500].
 * Phase 3: MSE_value and num_MSE of the copies are added to `ann` and the
 *          copies are destroyed.
 *
 * Returns fann_get_MSE(ann).
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_quickprop_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
    struct fann **ann_vect = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
    int i = 0;

    if(ann->prev_train_slopes == NULL)
    {
        fann_clear_train_arrays(ann);
    }

    fann_reset_MSE(ann);

    omp_set_dynamic(0);
    omp_set_num_threads(threadnumb);
    #pragma omp parallel
    {
        /* one copy of the network per slot of ann_vect */
        #pragma omp for schedule(static)
        for(i = 0; i < (int)threadnumb; i++)
        {
            ann_vect[i] = fann_copy(ann);
        }

        /* every thread accumulates slopes into the copy matching its thread number */
        #pragma omp for schedule(static)
        for(i = 0; i < (int)data->num_data; i++)
        {
            struct fann *local_ann = ann_vect[omp_get_thread_num()];

            if(ann->do_dropout)
            {
                fann_run_dropout(local_ann, data->input[i]);
            }
            else
            {
                fann_run(local_ann, data->input[i]);
            }
            fann_compute_MSE(local_ann, data->output[i]);
            fann_backpropagate_MSE(local_ann);
            fann_update_slopes_batch(local_ann, local_ann->first_layer + 1, local_ann->last_layer - 1);
        }
    }

    {
        fann_type *weights = ann->weights;
        fann_type *prev_steps = ann->prev_steps;
        fann_type *prev_train_slopes = ann->prev_train_slopes;

        const unsigned int first_weight = 0;
        const unsigned int past_end = ann->total_connections;

        const float epsilon = ann->learning_rate / data->num_data;
        const float decay = ann->quickprop_decay;
        const float mu = ann->quickprop_mu;
        const float shrink_factor = (float) (mu / (1.0 + mu));

        omp_set_dynamic(0);
        omp_set_num_threads(threadnumb);
        #pragma omp parallel
        {
            #pragma omp for schedule(static)
            for(i = first_weight; i < (int)past_end; i++)
            {
                fann_type slope_sum = 0.0;
                fann_type last_step, last_slope;
                fann_type w, next_step;
                unsigned int copy;

                w = weights[i];

                /* gather this weight's slope from all copies and clear it there */
                for(copy = 0; copy < threadnumb; ++copy)
                {
                    slope_sum += ann_vect[copy]->train_slopes[i];
                    ann_vect[copy]->train_slopes[i] = 0.0;
                }
                slope_sum += decay * w;

                last_step = prev_steps[i];
                last_slope = prev_train_slopes[i];

                next_step = 0.0;

                if(last_step > 0.001)
                {
                    /* previous step was positive */
                    if(slope_sum > 0.0)
                    {
                        /* slope still positive: include the linear term */
                        next_step += epsilon * slope_sum;
                    }

                    if(slope_sum > (shrink_factor * last_slope))
                    {
                        /* slope close to or above the previous one: largest allowed step */
                        next_step += mu * last_step;
                    }
                    else
                    {
                        /* otherwise the quadratic estimate */
                        next_step += last_step * slope_sum / (last_slope - slope_sum);
                    }
                }
                else if(last_step < -0.001)
                {
                    /* previous step was negative */
                    if(slope_sum < 0.0)
                    {
                        /* slope still negative: include the linear term */
                        next_step += epsilon * slope_sum;
                    }

                    if(slope_sum < (shrink_factor * last_slope))
                    {
                        /* slope close to or below the previous one: largest allowed step */
                        next_step += mu * last_step;
                    }
                    else
                    {
                        /* otherwise the quadratic estimate */
                        next_step += last_step * slope_sum / (last_slope - slope_sum);
                    }
                }
                else
                {
                    /* previous step was (almost) zero: linear term only */
                    next_step += epsilon * slope_sum;
                }

                /* store the state used by the next epoch */
                prev_steps[i] = next_step;
                prev_train_slopes[i] = slope_sum;

                w += next_step;

                /* keep the weight inside [-1500, 1500] */
                weights[i] = (w > 1500) ? 1500 : ((w < -1500) ? -1500 : w);
            }
        }
    }

    /* fold the error statistics of the copies into ann, then drop the copies */
    for(i = 0; i < (int)threadnumb; ++i)
    {
        ann->MSE_value += ann_vect[i]->MSE_value;
        ann->num_MSE += ann_vect[i]->num_MSE;
        fann_destroy(ann_vect[i]);
    }
    free(ann_vect);

    return fann_get_MSE(ann);
}
float train_epoch_batch_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb,vector< vector<fann_type> >& predicted_outputs) { fann_reset_MSE(ann); predicted_outputs.resize(data->num_data,vector<fann_type> (data->num_output)); vector<struct fann *> ann_vect(threadnumb); int i=0,j=0; //generate copies of the ann omp_set_dynamic(0); omp_set_num_threads(threadnumb); #pragma omp parallel private(j) { #pragma omp for schedule(static) for(i=0; i<(int)threadnumb; i++) { ann_vect[i]=fann_copy(ann); } //parallel computing of the updates #pragma omp for schedule(static) for(i = 0; i < (int)data->num_data; i++) { j=omp_get_thread_num(); fann_type* temp_predicted_output=fann_run(ann_vect[j], data->input[i]); for(unsigned int k=0;k<data->num_output;++k) { predicted_outputs[i][k]=temp_predicted_output[k]; } fann_compute_MSE(ann_vect[j], data->output[i]); fann_backpropagate_MSE(ann_vect[j]); fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1); } } //parallel update of the weights { const unsigned int num_data=data->num_data; const unsigned int first_weight=0; const unsigned int past_end=ann->total_connections; fann_type *weights = ann->weights; const fann_type epsilon = ann->learning_rate / num_data; omp_set_dynamic(0); omp_set_num_threads(threadnumb); #pragma omp parallel { #pragma omp for schedule(static) for(i=first_weight; i < (int)past_end; i++) { fann_type temp_slopes=0.0; unsigned int k; fann_type *train_slopes; for(k=0;k<threadnumb;++k) { train_slopes=ann_vect[k]->train_slopes; temp_slopes+= train_slopes[i]; train_slopes[i]=0.0; } weights[i] += temp_slopes*epsilon; } } } //merge of MSEs for(i=0;i<(int)threadnumb;++i) { ann->MSE_value+= ann_vect[i]->MSE_value; ann->num_MSE+=ann_vect[i]->num_MSE; fann_destroy(ann_vect[i]); } return fann_get_MSE(ann); }
/*
 * Runs one SARPROP training epoch over `data`, sharing the per-sample work
 * among `threadnumb` OpenMP threads.
 *
 * Phase 1: ann_vect[] is filled with fann_copy() copies of `ann`. Each thread
 *          runs the forward pass (with dropout when ann->do_dropout is set),
 *          the MSE computation, backpropagation and slope accumulation for
 *          its share of the samples on the copy indexed by its thread number.
 * Phase 2: MSE_value and num_MSE of the copies are added to `ann`, because
 *          the weight update needs the epoch's MSE and its square root.
 * Phase 3: for every weight the slopes of all copies are summed (and reset to
 *          zero), the SARPROP slope and step are computed and the weight,
 *          prev_steps[] and prev_train_slopes[] of `ann` are updated.
 * Phase 4: ann->sarprop_epoch is incremented and the copies are destroyed.
 *
 * Returns fann_get_MSE(ann).
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_sarprop_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
    struct fann **ann_vect = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
    int i = 0;

    if(ann->prev_train_slopes == NULL)
    {
        fann_clear_train_arrays(ann);
    }

    fann_reset_MSE(ann);

    omp_set_dynamic(0);
    omp_set_num_threads(threadnumb);
    #pragma omp parallel
    {
        /* one copy of the network per slot of ann_vect */
        #pragma omp for schedule(static)
        for(i = 0; i < (int)threadnumb; i++)
        {
            ann_vect[i] = fann_copy(ann);
        }

        /* every thread accumulates slopes into the copy matching its thread number */
        #pragma omp for schedule(static)
        for(i = 0; i < (int)data->num_data; i++)
        {
            struct fann *local_ann = ann_vect[omp_get_thread_num()];

            if(ann->do_dropout)
            {
                fann_run_dropout(local_ann, data->input[i]);
            }
            else
            {
                fann_run(local_ann, data->input[i]);
            }
            fann_compute_MSE(local_ann, data->output[i]);
            fann_backpropagate_MSE(local_ann);
            fann_update_slopes_batch(local_ann, local_ann->first_layer + 1, local_ann->last_layer - 1);
        }
    }

    {
        fann_type *weights = ann->weights;
        fann_type *prev_steps = ann->prev_steps;
        fann_type *prev_train_slopes = ann->prev_train_slopes;

        const unsigned int first_weight = 0;
        const unsigned int past_end = ann->total_connections;
        const unsigned int epoch = ann->sarprop_epoch;

        /* parameters taken from the network */
        const float increase_factor = ann->rprop_increase_factor;
        const float decrease_factor = ann->rprop_decrease_factor;
        /* TODO: why is delta_min 0.0 in iRprop? SARPROP uses 1x10^-6 (Braun and Riedmiller, 1993) */
        const float delta_min = 0.000001f;
        const float delta_max = ann->rprop_delta_max;
        const float weight_decay_shift = ann->sarprop_weight_decay_shift;
        const float step_error_threshold_factor = ann->sarprop_step_error_threshold_factor;
        const float step_error_shift = ann->sarprop_step_error_shift;
        const float T = ann->sarprop_temperature;
        float MSE, RMSE;

        /* the error statistics have to be merged before the update, which uses them */
        for(i = 0; i < (int)threadnumb; ++i)
        {
            ann->MSE_value += ann_vect[i]->MSE_value;
            ann->num_MSE += ann_vect[i]->num_MSE;
        }

        MSE = fann_get_MSE(ann);
        RMSE = sqrtf(MSE);

        /* for all weights; TODO: are biases included? */
        omp_set_dynamic(0);
        omp_set_num_threads(threadnumb);
        #pragma omp parallel
        {
            #pragma omp for schedule(static)
            for(i = first_weight; i < (int)past_end; i++)
            {
                /* TODO: confirm whether 1x10^-6 == delta_min is really better */
                /* the previous step must not be zero, otherwise training would stop */
                const fann_type prev_step = fann_max(prev_steps[i], (fann_type) 0.000001);
                fann_type prev_slope, same_sign;
                fann_type slope_sum = 0.0;
                fann_type next_step;
                unsigned int copy;

                /* gather this weight's slope from all copies and clear it there */
                for(copy = 0; copy < threadnumb; ++copy)
                {
                    slope_sum += ann_vect[copy]->train_slopes[i];
                    ann_vect[copy]->train_slopes[i] = 0.0;
                }

                /* SARPROP slope; TODO: better as new error function? (see SARPROP paper) */
                slope_sum = -slope_sum - weights[i] * (fann_type)fann_exp2(-T * epoch + weight_decay_shift);

                next_step = 0.0;

                /* TODO: is prev_train_slopes[i] 0.0 in the beginning? */
                prev_slope = prev_train_slopes[i];

                same_sign = prev_slope * slope_sum;

                if(same_sign > 0.0)
                {
                    /* slope kept its sign: enlarge the step, capped at delta_max */
                    next_step = fann_min(prev_step * increase_factor, delta_max);
                    /* TODO: are the signs inverted? see differences between SARPROP paper and iRprop */
                    if(slope_sum < 0.0)
                    {
                        weights[i] += next_step;
                    }
                    else
                    {
                        weights[i] -= next_step;
                    }
                }
                else if(same_sign < 0.0)
                {
                    /* slope changed sign: shrink the step and leave the weight untouched */
#ifndef RAND_MAX
#define RAND_MAX 0x7fffffff
#endif
                    if(prev_step < step_error_threshold_factor * MSE)
                    {
                        /* small step: add a random term scaled by the RMSE */
                        next_step = prev_step * decrease_factor + (float)rand() / RAND_MAX * RMSE * (fann_type)fann_exp2(-T * epoch + step_error_shift);
                    }
                    else
                    {
                        next_step = fann_max(prev_step * decrease_factor, delta_min);
                    }

                    slope_sum = 0.0;
                }
                else
                {
                    /* product of the slopes is zero: reuse the previous step */
                    if(slope_sum < 0.0)
                    {
                        weights[i] += prev_step;
                    }
                    else
                    {
                        weights[i] -= prev_step;
                    }
                }

                /* store the state used by the next epoch */
                prev_steps[i] = next_step;
                prev_train_slopes[i] = slope_sum;
            }
        }
    }

    ++(ann->sarprop_epoch);

    /* the MSE statistics were merged above; only the copies remain to be released */
    for(i = 0; i < (int)threadnumb; i++)
    {
        fann_destroy(ann_vect[i]);
    }
    free(ann_vect);

    return fann_get_MSE(ann);
}
/* By-reference wrapper around omp_set_dynamic() for a 32-bit flag: reads the
   value behind SET and forwards it unchanged.
   NOTE(review): the trailing underscore and pointer argument suggest this is
   the Fortran-callable entry point; confirm against the build setup.  */
void
omp_set_dynamic_ (const int32_t *set)
{
  const int32_t requested = *set;

  omp_set_dynamic (requested);
}
int main(int argc, char *argv[]) { struct pngquant_options options = { .floyd = 1.f, // floyd-steinberg dithering }; options.liq = liq_attr_create(); #if USE_SSE if (!options.liq) { print_full_version(stderr); fputs("SSE2-capable CPU is required for this build.\n", stderr); return WRONG_ARCHITECTURE; } #endif unsigned int error_count=0, skipped_count=0, file_count=0; pngquant_error latest_error=SUCCESS; const char *newext = NULL, *output_file_path = NULL; fix_obsolete_options(argc, argv); int opt; do { opt = getopt_long(argc, argv, "Vvqfhs:Q:o:", long_options, NULL); switch (opt) { case 'v': liq_set_log_callback(options.liq, log_callback, NULL); options.log_callback = log_callback; break; case 'q': liq_set_log_callback(options.liq, NULL, NULL); options.log_callback = NULL; break; case arg_floyd: options.floyd = optarg ? atof(optarg) : 1.0; if (options.floyd < 0 || options.floyd > 1.0) { fputs("--floyd argument must be in 0..1 range\n", stderr); return INVALID_ARGUMENT; } break; case arg_ordered: options.floyd = 0; break; case 'f': options.force = true; break; case arg_no_force: options.force = false; break; case arg_ext: newext = optarg; break; case 'o': if (output_file_path) { fputs("--output option can be used only once\n", stderr); return INVALID_ARGUMENT; } output_file_path = optarg; break; case arg_iebug: // opacities above 238 will be rounded up to 255, because IE6 truncates <255 to 0. 
liq_set_min_opacity(options.liq, 238); options.ie_mode = true; break; case arg_transbug: liq_set_last_index_transparent(options.liq, true); break; case 's': { int speed = atoi(optarg); if (speed >= 10) { options.fast_compression = true; } if (speed == 11) { options.floyd = 0; speed = 10; } if (LIQ_OK != liq_set_speed(options.liq, speed)) { fputs("Speed should be between 1 (slow) and 11 (fast).\n", stderr); return INVALID_ARGUMENT; } } break; case 'Q': if (!parse_quality(optarg, options.liq, &options.min_quality_limit)) { fputs("Quality should be in format min-max where min and max are numbers in range 0-100.\n", stderr); return INVALID_ARGUMENT; } break; case arg_posterize: if (LIQ_OK != liq_set_min_posterization(options.liq, atoi(optarg))) { fputs("Posterization should be number of bits in range 0-4.\n", stderr); return INVALID_ARGUMENT; } break; case arg_map: { png24_image tmp = {}; if (SUCCESS != read_image(options.liq, optarg, false, &tmp, &options.fixed_palette_image, false)) { fprintf(stderr, " error: Unable to load %s", optarg); return INVALID_ARGUMENT; } } break; case 'h': print_full_version(stdout); print_usage(stdout); return SUCCESS; case 'V': puts(PNGQUANT_VERSION); return SUCCESS; case -1: break; default: return INVALID_ARGUMENT; } } while (opt != -1); int argn = optind; if (argn >= argc) { if (argn > 1) { fputs("No input files specified. See -h for help.\n", stderr); } else { print_full_version(stderr); print_usage(stderr); } return MISSING_ARGUMENT; } char *colors_end; unsigned long colors = strtoul(argv[argn], &colors_end, 10); if (colors_end != argv[argn] && '\0' == colors_end[0]) { if (LIQ_OK != liq_set_max_colors(options.liq, colors)) { fputs("Number of colors must be between 2 and 256.\n", stderr); return INVALID_ARGUMENT; } argn++; } if (newext && output_file_path) { fputs("--ext and --output options can't be used at the same time\n", stderr); return INVALID_ARGUMENT; } // new filename extension depends on options used. 
Typically basename-fs8.png if (newext == NULL) { newext = options.floyd > 0 ? "-ie-fs8.png" : "-ie-or8.png"; if (!options.ie_mode) newext += 3; /* skip "-ie" */ } if (argn == argc || (argn == argc-1 && 0==strcmp(argv[argn],"-"))) { options.using_stdin = true; argn = argc-1; } if (options.using_stdin && output_file_path) { fputs("--output can't be mixed with stdin\n", stderr); return INVALID_ARGUMENT; } const int num_files = argc-argn; if (output_file_path && num_files != 1) { fputs("Only one input file is allowed when --output is used\n", stderr); return INVALID_ARGUMENT; } #ifdef _OPENMP // if there's a lot of files, coarse parallelism can be used if (num_files > 2*omp_get_max_threads()) { omp_set_nested(0); omp_set_dynamic(1); } else { omp_set_nested(1); } #endif #pragma omp parallel for \ schedule(dynamic) reduction(+:skipped_count) reduction(+:error_count) reduction(+:file_count) shared(latest_error) for(int i=0; i < num_files; i++) { struct pngquant_options opts = options; opts.liq = liq_attr_copy(options.liq); const char *filename = opts.using_stdin ? 
"stdin" : argv[argn+i]; #ifdef _OPENMP struct buffered_log buf = {}; if (opts.log_callback && omp_get_num_threads() > 1 && num_files > 1) { liq_set_log_callback(opts.liq, log_callback_buferred, &buf); liq_set_log_flush_callback(opts.liq, log_callback_buferred_flush, &buf); options.log_callback = log_callback_buferred; options.log_callback_user_info = &buf; } #endif pngquant_error retval = SUCCESS; const char *outname = output_file_path; char *outname_free = NULL; if (!options.using_stdin) { if (!outname) { outname = outname_free = add_filename_extension(filename, newext); } if (!options.force && file_exists(outname)) { fprintf(stderr, " error: %s exists; not overwriting\n", outname); retval = NOT_OVERWRITING_ERROR; } } if (!retval) { retval = pngquant_file(filename, outname, &opts); } free(outname_free); liq_attr_destroy(opts.liq); if (retval) { #pragma omp critical { latest_error = retval; } if (retval == TOO_LOW_QUALITY) { skipped_count++; } else { error_count++; } } ++file_count; } if (error_count) { verbose_printf(&options, "There were errors quantizing %d file%s out of a total of %d file%s.", error_count, (error_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s"); } if (skipped_count) { verbose_printf(&options, "Skipped %d file%s out of a total of %d file%s.", skipped_count, (skipped_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s"); } if (!skipped_count && !error_count) { verbose_printf(&options, "No errors detected while quantizing %d image%s.", file_count, (file_count == 1)? "" : "s"); } liq_image_destroy(options.fixed_palette_image); liq_attr_destroy(options.liq); return latest_error; } static void pngquant_output_image_free(png8_image *output_image) { free(output_image->indexed_data); output_image->indexed_data = NULL; free(output_image->row_pointers); output_image->row_pointers = NULL; }
/* By-reference wrapper around omp_set_dynamic() for a 64-bit flag. The value
   is reduced to 0 or 1 before the call, so a non-zero flag whose low 32 bits
   are all zero is not lost when it is narrowed to int.
   NOTE(review): the trailing underscore and pointer argument suggest this is
   the Fortran-callable entry point; confirm against the build setup.  */
void
omp_set_dynamic_8_ (const int64_t *set)
{
  omp_set_dynamic (*set != 0);
}
int main(int argc, char *argv[]) { /* Any variable that begins with 't' means topocentric */ /* Any variable that begins with 'b' means barycentric */ FILE **outfiles; float **outdata = NULL; short **subsdata = NULL; double dtmp, *dms = NULL, avgdm = 0.0, maxdm, dsdt = 0; double tlotoa = 0.0, blotoa = 0.0, BW_ddelay = 0.0; double max = -9.9E30, min = 9.9E30, var = 0.0, avg = 0.0; double *btoa = NULL, *ttoa = NULL, avgvoverc = 0.0; char obs[3], ephem[10], rastring[50], decstring[50]; long totnumtowrite, totwrote = 0, padwrote = 0, datawrote = 0; int **offsets; int ii, jj, numadded = 0, numremoved = 0, padding = 0; int numbarypts = 0, blocksperread = 0, worklen = 0; int numread = 0, numtowrite = 0; int padtowrite = 0, statnum = 0, good_padvals = 0; int numdiffbins = 0, *diffbins = NULL, *diffbinptr = NULL; int *idispdt; char *datafilenm; int dmprecision = 2; struct spectra_info s; infodata idata; mask obsmask; /* Call usage() if we have no command line arguments */ if (argc == 1) { Program = argv[0]; printf("\n"); usage(); exit(0); } /* Parse the command line using the excellent program Clig */ cmd = parseCmdline(argc, argv); spectra_info_set_defaults(&s); dmprecision = cmd->dmprec; s.filenames = cmd->argv; s.num_files = cmd->argc; // If we are zeroDMing, make sure that clipping is off. if (cmd->zerodmP) cmd->noclipP = 1; s.clip_sigma = cmd->clip; // -1 causes the data to determine if we use weights, scales, & // offsets for PSRFITS or flip the band for any data type where // we can figure that out with the data s.apply_flipband = (cmd->invertP) ? 1 : -1; s.apply_weight = (cmd->noweightsP) ? 0 : -1; s.apply_scale = (cmd->noscalesP) ? 0 : -1; s.apply_offset = (cmd->nooffsetsP) ? 0 : -1; s.remove_zerodm = (cmd->zerodmP) ? 
1 : 0; if (cmd->noclipP) { cmd->clip = 0.0; s.clip_sigma = 0.0; } if (cmd->ifsP) { // 0 = default or summed, 1-4 are possible also s.use_poln = cmd->ifs + 1; } if (!cmd->numoutP) cmd->numout = LONG_MAX; if (cmd->ncpus > 1) { #ifdef _OPENMP int maxcpus = omp_get_num_procs(); int openmp_numthreads = (cmd->ncpus <= maxcpus) ? cmd->ncpus : maxcpus; // Make sure we are not dynamically setting the number of threads omp_set_dynamic(0); omp_set_num_threads(openmp_numthreads); printf("Using %d threads with OpenMP\n\n", openmp_numthreads); #endif } else { #ifdef _OPENMP omp_set_num_threads(1); // Explicitly turn off OpenMP #endif } #ifdef DEBUG showOptionValues(); #endif printf("\n\n"); printf(" Pulsar Subband De-dispersion Routine\n"); printf(" by Scott M. Ransom\n\n"); if (RAWDATA) { if (cmd->filterbankP) s.datatype = SIGPROCFB; else if (cmd->psrfitsP) s.datatype = PSRFITS; else if (cmd->pkmbP) s.datatype = SCAMP; else if (cmd->bcpmP) s.datatype = BPP; else if (cmd->wappP) s.datatype = WAPP; else if (cmd->spigotP) s.datatype = SPIGOT; } else { // Attempt to auto-identify the data identify_psrdatatype(&s, 1); if (s.datatype == SIGPROCFB) cmd->filterbankP = 1; else if (s.datatype == PSRFITS) cmd->psrfitsP = 1; else if (s.datatype == SCAMP) cmd->pkmbP = 1; else if (s.datatype == BPP) cmd->bcpmP = 1; else if (s.datatype == WAPP) cmd->wappP = 1; else if (s.datatype == SPIGOT) cmd->spigotP = 1; else if (s.datatype == SUBBAND) insubs = 1; else { printf ("Error: Unable to identify input data files. 
Please specify type.\n\n"); exit(1); } } if (!RAWDATA) s.files = (FILE **) malloc(sizeof(FILE *) * s.num_files); if (RAWDATA || insubs) { char description[40]; psrdatatype_description(description, s.datatype); if (s.num_files > 1) printf("Reading %s data from %d files:\n", description, s.num_files); else printf("Reading %s data from 1 file:\n", description); for (ii = 0; ii < s.num_files; ii++) { printf(" '%s'\n", cmd->argv[ii]); if (insubs) s.files[ii] = chkfopen(s.filenames[ii], "rb"); } printf("\n"); if (RAWDATA) { read_rawdata_files(&s); print_spectra_info_summary(&s); spectra_info_to_inf(&s, &idata); } else { // insubs cmd->nsub = s.num_files; s.N = chkfilelen(s.files[0], sizeof(short)); s.padvals = gen_fvect(s.num_files); for (ii = 0; ii < s.num_files; ii++) s.padvals[ii] = 0.0; s.start_MJD = (long double *) malloc(sizeof(long double)); s.start_spec = (long long *) malloc(sizeof(long long)); s.num_spec = (long long *) malloc(sizeof(long long)); s.num_pad = (long long *) malloc(sizeof(long long)); s.start_spec[0] = 0L; s.num_spec[0] = s.N; s.num_pad[0] = 0L; } /* Read an input mask if wanted */ if (cmd->maskfileP) { read_mask(cmd->maskfile, &obsmask); printf("Read mask information from '%s'\n\n", cmd->maskfile); good_padvals = determine_padvals(cmd->maskfile, &obsmask, s.padvals); } else { obsmask.numchan = obsmask.numint = 0; } } if (insubs) { char *root, *suffix; if (split_root_suffix(s.filenames[0], &root, &suffix) == 0) { printf("Error: The input filename (%s) must have a suffix!\n\n", s.filenames[0]); exit(1); } if (strncmp(suffix, "sub", 3) == 0) { char *tmpname; tmpname = calloc(strlen(root) + 10, 1); sprintf(tmpname, "%s.sub", root); readinf(&idata, tmpname); free(tmpname); s.num_channels = idata.num_chan; s.start_MJD[0] = idata.mjd_i + idata.mjd_f; s.dt = idata.dt; s.T = s.N * s.dt; s.lo_freq = idata.freq; s.df = idata.chan_wid; s.hi_freq = s.lo_freq + (s.num_channels - 1.0) * s.df; s.BW = s.num_channels * s.df; s.fctr = s.lo_freq - 0.5 * s.df + 0.5 * 
s.BW; s.spectra_per_subint = SUBSBLOCKLEN; print_spectra_info_summary(&s); } else { printf("\nThe input files (%s) must be subbands! (i.e. *.sub##)\n\n", cmd->argv[0]); exit(1); } free(root); free(suffix); } /* Determine the output file names and open them */ datafilenm = (char *) calloc(strlen(cmd->outfile) + 20, 1); if (!cmd->subP) { printf("Writing output data to:\n"); outfiles = (FILE **) malloc(cmd->numdms * sizeof(FILE *)); dms = gen_dvect(cmd->numdms); for (ii = 0; ii < cmd->numdms; ii++) { dms[ii] = cmd->lodm + ii * cmd->dmstep; avgdm += dms[ii]; sprintf(datafilenm, "%s_DM%.*f.dat", cmd->outfile, dmprecision, dms[ii]); outfiles[ii] = chkfopen(datafilenm, "wb"); printf(" '%s'\n", datafilenm); } avgdm /= cmd->numdms; maxdm = dms[cmd->numdms - 1]; } else { char format_str[30]; int num_places; if (!cmd->nobaryP) { printf("\nWarning: You cannot (currently) barycenter subbands.\n" " Setting the '-nobary' flag automatically.\n"); cmd->nobaryP = 1; } printf("Writing subbands to:\n"); cmd->numdms = 1; dms = gen_dvect(cmd->numdms); dms[0] = cmd->subdm; cmd->lodm = cmd->subdm; avgdm = cmd->subdm; maxdm = cmd->subdm; outfiles = (FILE **) malloc(cmd->nsub * sizeof(FILE *)); num_places = (int) ceil(log10(cmd->nsub)); sprintf(format_str, "%%s_DM%%.*f.sub%%0%dd", num_places); for (ii = 0; ii < cmd->nsub; ii++) { sprintf(datafilenm, format_str, cmd->outfile, dmprecision, avgdm, ii); outfiles[ii] = chkfopen(datafilenm, "wb"); printf(" '%s'\n", datafilenm); } } /* Set a few other key values */ if (insubs) avgdm = idata.dm; if (RAWDATA) idata.dm = avgdm; dsdt = cmd->downsamp * idata.dt; BW_ddelay = delay_from_dm(maxdm, idata.freq) - delay_from_dm(maxdm, idata.freq + (idata.num_chan - 1) * idata.chan_wid); blocksperread = ((int) (BW_ddelay / idata.dt) / s.spectra_per_subint + 1); worklen = s.spectra_per_subint * blocksperread; /* The number of topo to bary time points to generate with TEMPO */ numbarypts = (int) (s.T * 1.1 / TDT + 5.5) + 1; // Identify the TEMPO observatory 
code { char *outscope = (char *) calloc(40, sizeof(char)); telescope_to_tempocode(idata.telescope, outscope, obs); free(outscope); } if (cmd->nsub > s.num_channels) { printf ("Warning: The number of requested subbands (%d) is larger than the number of channels (%d).\n", cmd->nsub, s.num_channels); printf(" Re-setting the number of subbands to %d.\n\n", s.num_channels); cmd->nsub = s.num_channels; } if (s.spectra_per_subint % cmd->downsamp) { printf ("Error: The downsample factor (%d) must be a factor of the\n", cmd->downsamp); printf(" blocklength (%d). Exiting.\n\n", s.spectra_per_subint); exit(1); } tlotoa = idata.mjd_i + idata.mjd_f; /* Topocentric epoch */ if (cmd->numoutP) totnumtowrite = cmd->numout; else totnumtowrite = (long) idata.N / cmd->downsamp; if (cmd->nobaryP) { /* Main loop if we are not barycentering... */ double *dispdt; /* Dispersion delays (in bins). The high freq gets no delay */ /* All other delays are positive fractions of bin length (dt) */ dispdt = subband_search_delays(s.num_channels, cmd->nsub, avgdm, idata.freq, idata.chan_wid, 0.0); idispdt = gen_ivect(s.num_channels); for (ii = 0; ii < s.num_channels; ii++) idispdt[ii] = NEAREST_LONG(dispdt[ii] / idata.dt); vect_free(dispdt); /* The subband dispersion delays (see note above) */ offsets = gen_imatrix(cmd->numdms, cmd->nsub); for (ii = 0; ii < cmd->numdms; ii++) { double *subdispdt; subdispdt = subband_delays(s.num_channels, cmd->nsub, dms[ii], idata.freq, idata.chan_wid, 0.0); dtmp = subdispdt[cmd->nsub - 1]; for (jj = 0; jj < cmd->nsub; jj++) offsets[ii][jj] = NEAREST_LONG((subdispdt[jj] - dtmp) / dsdt); vect_free(subdispdt); } /* Allocate our data array and start getting data */ printf("\nDe-dispersing using:\n"); printf(" Subbands = %d\n", cmd->nsub); printf(" Average DM = %.7g\n", avgdm); if (cmd->downsamp > 1) { printf(" Downsample = %d\n", cmd->downsamp); printf(" New sample dt = %.10g\n", dsdt); } printf("\n"); if (cmd->subP) subsdata = gen_smatrix(cmd->nsub, worklen / 
cmd->downsamp); else outdata = gen_fmatrix(cmd->numdms, worklen / cmd->downsamp); numread = get_data(outdata, blocksperread, &s, &obsmask, idispdt, offsets, &padding, subsdata); while (numread == worklen) { numread /= cmd->downsamp; print_percent_complete(totwrote, totnumtowrite); /* Write the latest chunk of data, but don't */ /* write more than cmd->numout points. */ numtowrite = numread; if (cmd->numoutP && (totwrote + numtowrite) > cmd->numout) numtowrite = cmd->numout - totwrote; if (cmd->subP) write_subs(outfiles, cmd->nsub, subsdata, 0, numtowrite); else write_data(outfiles, cmd->numdms, outdata, 0, numtowrite); totwrote += numtowrite; /* Update the statistics */ if (!padding && !cmd->subP) { for (ii = 0; ii < numtowrite; ii++) update_stats(statnum + ii, outdata[0][ii], &min, &max, &avg, &var); statnum += numtowrite; } /* Stop if we have written out all the data we need to */ if (cmd->numoutP && (totwrote == cmd->numout)) break; numread = get_data(outdata, blocksperread, &s, &obsmask, idispdt, offsets, &padding, subsdata); } datawrote = totwrote; } else { /* Main loop if we are barycentering... */ double maxvoverc = -1.0, minvoverc = 1.0, *voverc = NULL; double *dispdt; /* What ephemeris will we use? 
(Default is DE405) */ strcpy(ephem, "DE405"); /* Define the RA and DEC of the observation */ ra_dec_to_string(rastring, idata.ra_h, idata.ra_m, idata.ra_s); ra_dec_to_string(decstring, idata.dec_d, idata.dec_m, idata.dec_s); /* Allocate some arrays */ btoa = gen_dvect(numbarypts); ttoa = gen_dvect(numbarypts); voverc = gen_dvect(numbarypts); for (ii = 0; ii < numbarypts; ii++) ttoa[ii] = tlotoa + TDT * ii / SECPERDAY; /* Call TEMPO for the barycentering */ printf("\nGenerating barycentric corrections...\n"); barycenter(ttoa, btoa, voverc, numbarypts, rastring, decstring, obs, ephem); for (ii = 0; ii < numbarypts; ii++) { if (voverc[ii] > maxvoverc) maxvoverc = voverc[ii]; if (voverc[ii] < minvoverc) minvoverc = voverc[ii]; avgvoverc += voverc[ii]; } avgvoverc /= numbarypts; vect_free(voverc); blotoa = btoa[0]; printf(" Average topocentric velocity (c) = %.7g\n", avgvoverc); printf(" Maximum topocentric velocity (c) = %.7g\n", maxvoverc); printf(" Minimum topocentric velocity (c) = %.7g\n\n", minvoverc); printf("De-dispersing and barycentering using:\n"); printf(" Subbands = %d\n", cmd->nsub); printf(" Average DM = %.7g\n", avgdm); if (cmd->downsamp > 1) { printf(" Downsample = %d\n", cmd->downsamp); printf(" New sample dt = %.10g\n", dsdt); } printf("\n"); /* Dispersion delays (in bins). 
The high freq gets no delay */ /* All other delays are positive fractions of bin length (dt) */ dispdt = subband_search_delays(s.num_channels, cmd->nsub, avgdm, idata.freq, idata.chan_wid, avgvoverc); idispdt = gen_ivect(s.num_channels); for (ii = 0; ii < s.num_channels; ii++) idispdt[ii] = NEAREST_LONG(dispdt[ii] / idata.dt); vect_free(dispdt); /* The subband dispersion delays (see note above) */ offsets = gen_imatrix(cmd->numdms, cmd->nsub); for (ii = 0; ii < cmd->numdms; ii++) { double *subdispdt; subdispdt = subband_delays(s.num_channels, cmd->nsub, dms[ii], idata.freq, idata.chan_wid, avgvoverc); dtmp = subdispdt[cmd->nsub - 1]; for (jj = 0; jj < cmd->nsub; jj++) offsets[ii][jj] = NEAREST_LONG((subdispdt[jj] - dtmp) / dsdt); vect_free(subdispdt); } /* Convert the bary TOAs to differences from the topo TOAs in */ /* units of bin length (dt) rounded to the nearest integer. */ dtmp = (btoa[0] - ttoa[0]); for (ii = 0; ii < numbarypts; ii++) btoa[ii] = ((btoa[ii] - ttoa[ii]) - dtmp) * SECPERDAY / dsdt; { /* Find the points where we need to add or remove bins */ int oldbin = 0, currentbin; double lobin, hibin, calcpt; numdiffbins = abs(NEAREST_LONG(btoa[numbarypts - 1])) + 1; diffbins = gen_ivect(numdiffbins); diffbinptr = diffbins; for (ii = 1; ii < numbarypts; ii++) { currentbin = NEAREST_LONG(btoa[ii]); if (currentbin != oldbin) { if (currentbin > 0) { calcpt = oldbin + 0.5; lobin = (ii - 1) * TDT / dsdt; hibin = ii * TDT / dsdt; } else { calcpt = oldbin - 0.5; lobin = -((ii - 1) * TDT / dsdt); hibin = -(ii * TDT / dsdt); } while (fabs(calcpt) < fabs(btoa[ii])) { /* Negative bin number means remove that bin */ /* Positive bin number means add a bin there */ *diffbinptr = NEAREST_LONG(LININTERP(calcpt, btoa[ii - 1], btoa[ii], lobin, hibin)); diffbinptr++; calcpt = (currentbin > 0) ? 
calcpt + 1.0 : calcpt - 1.0; } oldbin = currentbin; } } *diffbinptr = cmd->numout; /* Used as a marker */ } diffbinptr = diffbins; /* Now perform the barycentering */ if (cmd->subP) subsdata = gen_smatrix(cmd->nsub, worklen / cmd->downsamp); else outdata = gen_fmatrix(cmd->numdms, worklen / cmd->downsamp); numread = get_data(outdata, blocksperread, &s, &obsmask, idispdt, offsets, &padding, subsdata); while (numread == worklen) { /* Loop to read and write the data */ int numwritten = 0; double block_avg, block_var; numread /= cmd->downsamp; /* Determine the approximate local average */ avg_var(outdata[0], numread, &block_avg, &block_var); print_percent_complete(totwrote, totnumtowrite); /* Simply write the data if we don't have to add or */ /* remove any bins from this batch. */ /* OR write the amount of data up to cmd->numout or */ /* the next bin that will be added or removed. */ numtowrite = abs(*diffbinptr) - datawrote; if (cmd->numoutP && (totwrote + numtowrite) > cmd->numout) numtowrite = cmd->numout - totwrote; if (numtowrite > numread) numtowrite = numread; if (cmd->subP) write_subs(outfiles, cmd->nsub, subsdata, 0, numtowrite); else write_data(outfiles, cmd->numdms, outdata, 0, numtowrite); datawrote += numtowrite; totwrote += numtowrite; numwritten += numtowrite; /* Update the statistics */ if (!padding && !cmd->subP) { for (ii = 0; ii < numtowrite; ii++) update_stats(statnum + ii, outdata[0][ii], &min, &max, &avg, &var); statnum += numtowrite; } if ((datawrote == abs(*diffbinptr)) && (numwritten != numread) && (totwrote < cmd->numout)) { /* Add/remove a bin */ int skip, nextdiffbin; skip = numtowrite; do { /* Write the rest of the data after adding/removing a bin */ if (*diffbinptr > 0) { /* Add a bin */ write_padding(outfiles, cmd->numdms, block_avg, 1); numadded++; totwrote++; } else { /* Remove a bin */ numremoved++; datawrote++; numwritten++; skip++; } diffbinptr++; /* Write the part after the diffbin */ numtowrite = numread - numwritten; if 
(cmd->numoutP && (totwrote + numtowrite) > cmd->numout) numtowrite = cmd->numout - totwrote; nextdiffbin = abs(*diffbinptr) - datawrote; if (numtowrite > nextdiffbin) numtowrite = nextdiffbin; if (cmd->subP) write_subs(outfiles, cmd->nsub, subsdata, skip, numtowrite); else write_data(outfiles, cmd->numdms, outdata, skip, numtowrite); numwritten += numtowrite; datawrote += numtowrite; totwrote += numtowrite; /* Update the statistics and counters */ if (!padding && !cmd->subP) { for (ii = 0; ii < numtowrite; ii++) update_stats(statnum + ii, outdata[0][skip + ii], &min, &max, &avg, &var); statnum += numtowrite; } skip += numtowrite; /* Stop if we have written out all the data we need to */ if (cmd->numoutP && (totwrote == cmd->numout)) break; } while (numwritten < numread); } /* Stop if we have written out all the data we need to */ if (cmd->numoutP && (totwrote == cmd->numout)) break; numread = get_data(outdata, blocksperread, &s, &obsmask, idispdt, offsets, &padding, subsdata); } } /* Calculate the amount of padding we need */ if (cmd->numoutP && (cmd->numout > totwrote)) padwrote = padtowrite = cmd->numout - totwrote; /* Write the new info file for the output data */ idata.dt = dsdt; update_infodata(&idata, totwrote, padtowrite, diffbins, numdiffbins, cmd->downsamp); for (ii = 0; ii < cmd->numdms; ii++) { idata.dm = dms[ii]; if (!cmd->nobaryP) { double baryepoch, barydispdt, baryhifreq; baryhifreq = idata.freq + (s.num_channels - 1) * idata.chan_wid; barydispdt = delay_from_dm(dms[ii], doppler(baryhifreq, avgvoverc)); baryepoch = blotoa - (barydispdt / SECPERDAY); idata.bary = 1; idata.mjd_i = (int) floor(baryepoch); idata.mjd_f = baryepoch - idata.mjd_i; } if (cmd->subP) sprintf(idata.name, "%s_DM%.*f.sub", cmd->outfile, dmprecision, dms[ii]); else sprintf(idata.name, "%s_DM%.*f", cmd->outfile, dmprecision, dms[ii]); writeinf(&idata); } /* Set the padded points equal to the average data point */ if (idata.numonoff >= 1) { int index, startpad, endpad; for (ii = 0; 
ii < cmd->numdms; ii++) { fclose(outfiles[ii]); sprintf(datafilenm, "%s_DM%.*f.dat", cmd->outfile, dmprecision, dms[ii]); outfiles[ii] = chkfopen(datafilenm, "rb+"); } for (ii = 0; ii < idata.numonoff; ii++) { index = 2 * ii; startpad = idata.onoff[index + 1]; if (ii == idata.numonoff - 1) endpad = idata.N - 1; else endpad = idata.onoff[index + 2]; for (jj = 0; jj < cmd->numdms; jj++) chkfseek(outfiles[jj], (startpad + 1) * sizeof(float), SEEK_SET); padtowrite = endpad - startpad; write_padding(outfiles, cmd->numdms, avg, padtowrite); } } /* Print simple stats and results */ if (!cmd->subP) { var /= (datawrote - 1); print_percent_complete(1, 1); printf("\n\nDone.\n\nSimple statistics of the output data:\n"); printf(" Data points written: %ld\n", totwrote); if (padwrote) printf(" Padding points written: %ld\n", padwrote); if (!cmd->nobaryP) { if (numadded) printf(" Bins added for barycentering: %d\n", numadded); if (numremoved) printf(" Bins removed for barycentering: %d\n", numremoved); } printf(" Maximum value of data: %.2f\n", max); printf(" Minimum value of data: %.2f\n", min); printf(" Data average value: %.2f\n", avg); printf(" Data standard deviation: %.2f\n", sqrt(var)); printf("\n"); } else { printf("\n\nDone.\n"); printf(" Data points written: %ld\n", totwrote); if (padwrote) printf(" Padding points written: %ld\n", padwrote); if (!cmd->nobaryP) { if (numadded) printf(" Bins added for barycentering: %d\n", numadded); if (numremoved) printf(" Bins removed for barycentering: %d\n", numremoved); } printf("\n"); } /* Close the files and cleanup */ if (cmd->maskfileP) { free_mask(obsmask); } // Close all the raw files and free their vectors close_rawfiles(&s); for (ii = 0; ii < cmd->numdms; ii++) fclose(outfiles[ii]); if (cmd->subP) { vect_free(subsdata[0]); vect_free(subsdata); } else { vect_free(outdata[0]); vect_free(outdata); } free(outfiles); vect_free(dms); vect_free(idispdt); vect_free(offsets[0]); vect_free(offsets); free(datafilenm); if (!cmd->nobaryP) 
{ vect_free(btoa); vect_free(ttoa); vect_free(diffbins); } return (0); }
// Coordinate descent for logistic models RcppExport SEXP cdfit_binomial_hsr(SEXP X_, SEXP y_, SEXP row_idx_, SEXP lambda_, SEXP nlambda_, SEXP lam_scale_, SEXP lambda_min_, SEXP alpha_, SEXP user_, SEXP eps_, SEXP max_iter_, SEXP multiplier_, SEXP dfmax_, SEXP ncore_, SEXP warn_, SEXP verbose_) { XPtr<BigMatrix> xMat(X_); double *y = REAL(y_); int *row_idx = INTEGER(row_idx_); double lambda_min = REAL(lambda_min_)[0]; double alpha = REAL(alpha_)[0]; int n = Rf_length(row_idx_); // number of observations used for fitting model int p = xMat->ncol(); int L = INTEGER(nlambda_)[0]; int lam_scale = INTEGER(lam_scale_)[0]; double eps = REAL(eps_)[0]; int max_iter = INTEGER(max_iter_)[0]; double *m = REAL(multiplier_); int dfmax = INTEGER(dfmax_)[0]; int warn = INTEGER(warn_)[0]; int user = INTEGER(user_)[0]; int verbose = INTEGER(verbose_)[0]; NumericVector lambda(L); NumericVector Dev(L); IntegerVector iter(L); IntegerVector n_reject(L); NumericVector beta0(L); NumericVector center(p); NumericVector scale(p); int p_keep = 0; // keep columns whose scale > 1e-6 int *p_keep_ptr = &p_keep; vector<int> col_idx; vector<double> z; double lambda_max = 0.0; double *lambda_max_ptr = &lambda_max; int xmax_idx = 0; int *xmax_ptr = &xmax_idx; // set up omp int useCores = INTEGER(ncore_)[0]; #ifdef BIGLASSO_OMP_H_ int haveCores = omp_get_num_procs(); if(useCores < 1) { useCores = haveCores; } omp_set_dynamic(0); omp_set_num_threads(useCores); #endif if (verbose) { char buff1[100]; time_t now1 = time (0); strftime (buff1, 100, "%Y-%m-%d %H:%M:%S.000", localtime (&now1)); Rprintf("\nPreprocessing start: %s\n", buff1); } // standardize: get center, scale; get p_keep_ptr, col_idx; get z, lambda_max, xmax_idx; standardize_and_get_residual(center, scale, p_keep_ptr, col_idx, z, lambda_max_ptr, xmax_ptr, xMat, y, row_idx, lambda_min, alpha, n, p); p = p_keep; // set p = p_keep, only loop over columns whose scale > 1e-6 if (verbose) { char buff1[100]; time_t now1 = time (0); strftime (buff1, 
100, "%Y-%m-%d %H:%M:%S.000", localtime (&now1)); Rprintf("Preprocessing end: %s\n", buff1); Rprintf("\n-----------------------------------------------\n"); } arma::sp_mat beta = arma::sp_mat(p, L); //beta double *a = Calloc(p, double); //Beta from previous iteration double a0 = 0.0; //beta0 from previousiteration double *w = Calloc(n, double); double *s = Calloc(n, double); //y_i - pi_i double *eta = Calloc(n, double); int *e1 = Calloc(p, int); //ever-active set int *e2 = Calloc(p, int); //strong set double xwr, xwx, pi, u, v, cutoff, l1, l2, shift, si; double max_update, update, thresh; // for convergence check int i, j, jj, l, violations, lstart; double ybar = sum(y, n) / n; a0 = beta0[0] = log(ybar / (1-ybar)); double nullDev = 0; double *r = Calloc(n, double); for (i = 0; i < n; i++) { r[i] = y[i]; nullDev = nullDev - y[i]*log(ybar) - (1-y[i])*log(1-ybar); s[i] = y[i] - ybar; eta[i] = a0; } thresh = eps * nullDev / n; double sumS = sum(s, n); // temp result sum of s double sumWResid = 0.0; // temp result: sum of w * r // set up lambda if (user == 0) { if (lam_scale) { // set up lambda, equally spaced on log scale double log_lambda_max = log(lambda_max); double log_lambda_min = log(lambda_min*lambda_max); double delta = (log_lambda_max - log_lambda_min) / (L-1); for (l = 0; l < L; l++) { lambda[l] = exp(log_lambda_max - l * delta); } } else { // equally spaced on linear scale double delta = (lambda_max - lambda_min*lambda_max) / (L-1); for (l = 0; l < L; l++) { lambda[l] = lambda_max - l * delta; } } Dev[0] = nullDev; lstart = 1; n_reject[0] = p; } else { lstart = 0; lambda = Rcpp::as<NumericVector>(lambda_); } for (l = lstart; l < L; l++) { if(verbose) { // output time char buff[100]; time_t now = time (0); strftime (buff, 100, "%Y-%m-%d %H:%M:%S.000", localtime (&now)); Rprintf("Lambda %d. 
Now time: %s\n", l, buff); } if (l != 0) { // Check dfmax int nv = 0; for (j = 0; j < p; j++) { if (a[j] != 0) { nv++; } } if (nv > dfmax) { for (int ll=l; ll<L; ll++) iter[ll] = NA_INTEGER; Free_memo_bin_hsr(s, w, a, r, e1, e2, eta); return List::create(beta0, beta, center, scale, lambda, Dev, iter, n_reject, Rcpp::wrap(col_idx)); } // strong set cutoff = 2*lambda[l] - lambda[l-1]; for (j = 0; j < p; j++) { if (fabs(z[j]) > (cutoff * alpha * m[col_idx[j]])) { e2[j] = 1; } else { e2[j] = 0; } } } else { // strong set cutoff = 2*lambda[l] - lambda_max; for (j = 0; j < p; j++) { if (fabs(z[j]) > (cutoff * alpha * m[col_idx[j]])) { e2[j] = 1; } else { e2[j] = 0; } } } n_reject[l] = p - sum(e2, p); while (iter[l] < max_iter) { while (iter[l] < max_iter) { while (iter[l] < max_iter) { iter[l]++; Dev[l] = 0.0; for (i = 0; i < n; i++) { if (eta[i] > 10) { pi = 1; w[i] = .0001; } else if (eta[i] < -10) { pi = 0; w[i] = .0001; } else { pi = exp(eta[i]) / (1 + exp(eta[i])); w[i] = pi * (1 - pi); } s[i] = y[i] - pi; r[i] = s[i] / w[i]; if (y[i] == 1) { Dev[l] = Dev[l] - log(pi); } else { Dev[l] = Dev[l] - log(1-pi); } } if (Dev[l] / nullDev < .01) { if (warn) warning("Model saturated; exiting..."); for (int ll=l; ll<L; ll++) iter[ll] = NA_INTEGER; Free_memo_bin_hsr(s, w, a, r, e1, e2, eta); return List::create(beta0, beta, center, scale, lambda, Dev, iter, n_reject, Rcpp::wrap(col_idx)); } // Intercept xwr = crossprod(w, r, n, 0); xwx = sum(w, n); beta0[l] = xwr / xwx + a0; si = beta0[l] - a0; if (si != 0) { a0 = beta0[l]; for (i = 0; i < n; i++) { r[i] -= si; //update r eta[i] += si; //update eta } } sumWResid = wsum(r, w, n); // update temp result: sum of w * r, used for computing xwr; max_update = 0.0; for (j = 0; j < p; j++) { if (e1[j]) { jj = col_idx[j]; xwr = wcrossprod_resid(xMat, r, sumWResid, row_idx, center[jj], scale[jj], w, n, jj); v = wsqsum_bm(xMat, w, row_idx, center[jj], scale[jj], n, jj) / n; u = xwr/n + v * a[j]; l1 = lambda[l] * m[jj] * alpha; l2 = 
lambda[l] * m[jj] * (1-alpha); beta(j, l) = lasso(u, l1, l2, v); shift = beta(j, l) - a[j]; if (shift !=0) { // update change of objective function // update = - u * shift + (0.5 * v + 0.5 * l2) * (pow(beta(j, l), 2) - pow(a[j], 2)) + l1 * (fabs(beta(j, l)) - fabs(a[j])); update = pow(beta(j, l) - a[j], 2) * v; if (update > max_update) max_update = update; update_resid_eta(r, eta, xMat, shift, row_idx, center[jj], scale[jj], n, jj); // update r sumWResid = wsum(r, w, n); // update temp result w * r, used for computing xwr; a[j] = beta(j, l); // update a } } } // Check for convergence if (max_update < thresh) break; } // Scan for violations in strong set sumS = sum(s, n); violations = check_strong_set_bin(e1, e2, z, xMat, row_idx, col_idx, center, scale, a, lambda[l], sumS, alpha, s, m, n, p); if (violations==0) break; } // Scan for violations in rest violations = check_rest_set_bin(e1, e2, z, xMat, row_idx, col_idx, center, scale, a, lambda[l], sumS, alpha, s, m, n, p); if (violations==0) break; } } Free_memo_bin_hsr(s, w, a, r, e1, e2, eta); return List::create(beta0, beta, center, scale, lambda, Dev, iter, n_reject, Rcpp::wrap(col_idx)); }
/* Test driver for OpenMP thread affinity (proc_bind / places).

   Reads OMP_PROC_BIND and OMP_PLACES from the environment, then runs a
   fixed sequence of nested parallel regions.  In every region:
     - `verify' aborts if omp_get_proc_bind () does not report the value
       expected for the OMP_PROC_BIND setting under test;
     - each thread prints its position in the nesting and passes the place
       it is expected to be bound to to print_affinity () (defined
       elsewhere in this file).

   Only the exact OMP_PROC_BIND strings "false", "true" and
   "spread,master,close" select a test mode; any other value disables the
   verify checks and the expected-place computation.

   Returns 0 when the sequence completes; a failed check calls abort ().  */
int main () {
  char *env_proc_bind = getenv ("OMP_PROC_BIND");
  /* Exactly one of these is set when OMP_PROC_BIND names a tested mode.  */
  int test_false = env_proc_bind && strcmp (env_proc_bind, "false") == 0;
  int test_true = env_proc_bind && strcmp (env_proc_bind, "true") == 0;
  int test_spread_master_close = env_proc_bind && strcmp (env_proc_bind, "spread,master,close") == 0;
  char *env_places = getenv ("OMP_PLACES");
  /* Index into places_array of the entry matching OMP_PLACES; stays 0 when
     there is no match or place checking is not applicable.  */
  int test_places = 0;
#ifdef DO_FORK
  /* When started with OMP_PROC_BIND=false, no OMP_PLACES, no GOMP_AFFINITY
     and at least 8 contiguous CPUs, re-execute this binary once for every
     combination of the two bound modes and the places_array entries
     (indices from the last one down to 1; index 0 is not used here).  */
  if (env_places == NULL && contig_cpucount >= 8 && test_false && getenv ("GOMP_AFFINITY") == NULL) {
    int i, j, status;
    pid_t pid;
    for (j = 0; j < 2; j++) {
      if (setenv ("OMP_PROC_BIND", j ? "spread,master,close" : "true", 1) < 0) break;
      for (i = sizeof (places_array) / sizeof (places_array[0]) - 1; i; --i) {
        if (setenv ("OMP_PLACES", places_array[i].name, 1) < 0) break;
        pid = fork ();
        if (pid == -1) break;
        if (pid == 0) {
          /* Child: re-run ourselves with the new environment.  */
          execl ("/proc/self/exe", "affinity-1.exe", NULL);
          _exit (1);
        }
        if (waitpid (pid, &status, 0) < 0) break;
        /* A child killed by SIGABRT means one of its checks failed:
           propagate the failure.  Any other abnormal exit just stops the
           sweep.  */
        if (WIFSIGNALED (status) && WTERMSIG (status) == SIGABRT) abort ();
        else if (!WIFEXITED (status) || WEXITSTATUS (status) != 0) break;
      }
      /* i != 0 means the inner loop was left early; stop the outer one too.  */
      if (i) break;
    }
  }
#endif
  /* Echo the environment under test on one line.  */
  int first = 1;
  if (env_proc_bind) {
    printf ("OMP_PROC_BIND='%s'", env_proc_bind);
    first = 0;
  }
  if (env_places) printf ("%sOMP_PLACES='%s'", first ? "" : " ", env_places);
  printf ("\n");
  /* Look up OMP_PLACES in places_array, searching from the last entry down
     to index 1; test_places ends at 0 if nothing matches.  */
  if (env_places && contig_cpucount >= 8 && (test_true || test_spread_master_close)) {
    for (test_places = sizeof (places_array) / sizeof (places_array[0]) - 1; test_places; --test_places)
      if (strcmp (env_places, places_array[test_places].name) == 0) break;
  }
/* verify (if_true, if_s_m_c): abort unless omp_get_proc_bind () returns
   omp_proc_bind_false under OMP_PROC_BIND=false, IF_TRUE under
   OMP_PROC_BIND=true, or IF_S_M_C under OMP_PROC_BIND=spread,master,close.
   Expands to three consecutive if statements, so it must be used as a
   statement on its own.  */
#define verify(if_true, if_s_m_c) \
  if (test_false && omp_get_proc_bind () != omp_proc_bind_false) \
    abort (); \
  if (test_true && omp_get_proc_bind () != if_true) \
    abort (); \
  if (test_spread_master_close && omp_get_proc_bind () != if_s_m_c) \
    abort ();
  verify (omp_proc_bind_true, omp_proc_bind_spread);
  printf ("Initial thread");
  print_affinity (places_array[test_places].places[0]);
  printf ("\n");
  /* Nested regions must really fork, and team sizes must be exactly the
     requested num_threads, for the expectations below to hold.  */
  omp_set_nested (1);
  omp_set_dynamic (0);
  /* The if (0) regions below only check the proc-bind value reported at
     each nesting level; no places are printed.  */
  #pragma omp parallel if (0)
  {
    verify (omp_proc_bind_true, omp_proc_bind_master);
    #pragma omp parallel if (0)
    {
      verify (omp_proc_bind_true, omp_proc_bind_close);
      #pragma omp parallel if (0)
      {
        verify (omp_proc_bind_true, omp_proc_bind_close);
      }
      #pragma omp parallel if (0) proc_bind (spread)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_spread);
      }
    }
    #pragma omp parallel if (0) proc_bind (master)
    {
      verify (omp_proc_bind_master, omp_proc_bind_close);
      #pragma omp parallel if (0)
      {
        verify (omp_proc_bind_master, omp_proc_bind_close);
      }
      #pragma omp parallel if (0) proc_bind (spread)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_spread);
      }
    }
  }
  /* In the sections below, T is the team size and P the number of places
     available to the team.  `p' starts as places_array[0].places[0] and is
     replaced by the expected place only when the team has the requested
     size and the test mode allows a prediction.  Output is serialized with
     a critical section.  */
  /* True/spread */
  #pragma omp parallel num_threads (4)
  {
    verify (omp_proc_bind_true, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#1 thread %d", thr);
      if (omp_get_num_threads () == 4 && test_spread_master_close)
        switch (places_array[test_places].count) {
        case 8:
          /* T = 4, P = 8, each subpartition has 2 places. */
        case 7:
          /* T = 4, P = 7, each subpartition has 2 places, but last partition, which has just one place. */
          p = places_array[test_places].places[2 * thr];
          break;
        case 5:
          /* T = 4, P = 5, first subpartition has 2 places, the rest just one. */
          p = places_array[test_places].places[thr ? 1 + thr : 0];
          break;
        case 3:
          /* T = 4, P = 3, unit sized subpartitions, first gets thr0 and thr3, second thr1, third thr2. */
          p = places_array[test_places].places[thr == 3 ? 0 : thr];
          break;
        case 2:
          /* T = 4, P = 2, unit sized subpartitions, each with 2 threads. */
          p = places_array[test_places].places[thr / 2];
          break;
        }
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    /* Only thread 3 of the outer team starts the nested regions.  */
    if (omp_get_thread_num () == 3) {
      /* True/spread, true/master. */
      #pragma omp parallel num_threads (3)
      {
        verify (omp_proc_bind_true, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#1,#1 thread 3,%d", thr);
          if (omp_get_num_threads () == 3 && test_spread_master_close)
            /* Outer is spread, inner master, so just bind to the place of the master thread, which is thr 3 above. */
            switch (places_array[test_places].count) {
            case 8: case 7: p = places_array[test_places].places[6]; break;
            case 5: p = places_array[test_places].places[4]; break;
            case 3: p = places_array[test_places].places[0]; break;
            case 2: p = places_array[test_places].places[1]; break;
            }
          print_affinity (p);
          printf ("\n");
        }
      }
      /* True/spread, spread. */
      #pragma omp parallel num_threads (5) proc_bind (spread)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#1,#2 thread 3,%d", thr);
          if (omp_get_num_threads () == 5 && test_spread_master_close)
            /* Outer is spread, inner spread. */
            switch (places_array[test_places].count) {
            case 8:
              /* T = 5, P = 2, unit sized subpartitions. */
              p = places_array[test_places].places[thr == 4 ? 6 : 6 + thr / 2];
              break;
            /* The rest are T = 5, P = 1. */
            case 7: p = places_array[test_places].places[6]; break;
            case 5: p = places_array[test_places].places[4]; break;
            case 3: p = places_array[test_places].places[0]; break;
            case 2: p = places_array[test_places].places[1]; break;
            }
          print_affinity (p);
          printf ("\n");
        }
        #pragma omp barrier
        if (omp_get_thread_num () == 3) {
          /* True/spread, spread, close. */
          #pragma omp parallel num_threads (5) proc_bind (close)
          {
            verify (omp_proc_bind_close, omp_proc_bind_close);
            #pragma omp critical
            {
              struct place p = places_array[0].places[0];
              int thr = omp_get_thread_num ();
              printf ("#1,#2,#1 thread 3,3,%d", thr);
              if (omp_get_num_threads () == 5 && test_spread_master_close)
                /* Outer is spread, inner spread, innermost close. */
                switch (places_array[test_places].count) {
                /* All are T = 5, P = 1. */
                case 8: p = places_array[test_places].places[7]; break;
                case 7: p = places_array[test_places].places[6]; break;
                case 5: p = places_array[test_places].places[4]; break;
                case 3: p = places_array[test_places].places[0]; break;
                case 2: p = places_array[test_places].places[1]; break;
                }
              print_affinity (p);
              printf ("\n");
            }
          }
        }
      }
      /* True/spread, master. */
      #pragma omp parallel num_threads (4) proc_bind(master)
      {
        verify (omp_proc_bind_master, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#1,#3 thread 3,%d", thr);
          if (omp_get_num_threads () == 4 && test_spread_master_close)
            /* Outer is spread, inner master, so just bind to the place of the master thread, which is thr 3 above. */
            switch (places_array[test_places].count) {
            case 8: case 7: p = places_array[test_places].places[6]; break;
            case 5: p = places_array[test_places].places[4]; break;
            case 3: p = places_array[test_places].places[0]; break;
            case 2: p = places_array[test_places].places[1]; break;
            }
          print_affinity (p);
          printf ("\n");
        }
      }
      /* True/spread, close. */
      #pragma omp parallel num_threads (6) proc_bind (close)
      {
        verify (omp_proc_bind_close, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#1,#4 thread 3,%d", thr);
          if (omp_get_num_threads () == 6 && test_spread_master_close)
            /* Outer is spread, inner close. */
            switch (places_array[test_places].count) {
            case 8:
              /* T = 6, P = 2, unit sized subpartitions. */
              p = places_array[test_places].places[6 + thr / 3];
              break;
            /* The rest are T = 6, P = 1. */
            case 7: p = places_array[test_places].places[6]; break;
            case 5: p = places_array[test_places].places[4]; break;
            case 3: p = places_array[test_places].places[0]; break;
            case 2: p = places_array[test_places].places[1]; break;
            }
          print_affinity (p);
          printf ("\n");
        }
      }
    }
  }
  /* Spread. */
  #pragma omp parallel num_threads (5) proc_bind(spread)
  {
    verify (omp_proc_bind_spread, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#2 thread %d", thr);
      if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
        switch (places_array[test_places].count) {
        case 8:
          /* T = 5, P = 8, first 3 subpartitions have 2 places, last 2 one place. */
          p = places_array[test_places].places[thr < 3 ? 2 * thr : 3 + thr];
          break;
        case 7:
          /* T = 5, P = 7, first 2 subpartitions have 2 places, last 3 one place. */
          p = places_array[test_places].places[thr < 2 ? 2 * thr : 2 + thr];
          break;
        case 5:
          /* T = 5, P = 5, unit sized subpartitions, each one with one thread. */
          p = places_array[test_places].places[thr];
          break;
        case 3:
          /* T = 5, P = 3, unit sized subpartitions, first gets thr0 and thr3, second thr1 and thr4, third thr2. */
          p = places_array[test_places].places[thr >= 3 ? thr - 3 : thr];
          break;
        case 2:
          /* T = 5, P = 2, unit sized subpartitions, first with thr{0,1,4} and second with thr{2,3}. */
          p = places_array[test_places].places[thr == 4 ? 0 : thr / 2];
          break;
        }
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 3) {
      /* pp: index of the place expected for thread 3 of the team above,
         per place count; it stays 0 for counts not listed (e.g. 3).  */
      int pp = 0;
      switch (places_array[test_places].count) {
      case 8: pp = 6; break;
      case 7: pp = 5; break;
      case 5: pp = 3; break;
      case 2: pp = 1; break;
      }
      /* Spread, spread/master. */
      #pragma omp parallel num_threads (3) firstprivate (pp)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#2,#1 thread 3,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is spread, inner spread resp. master, but we have just unit sized partitions. */
            p = places_array[test_places].places[pp];
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Spread, spread. */
      #pragma omp parallel num_threads (5) proc_bind (spread) \
        firstprivate (pp)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#2,#2 thread 3,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is spread, inner spread, but we have just unit sized partitions. */
            p = places_array[test_places].places[pp];
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Spread, master. */
      #pragma omp parallel num_threads (4) proc_bind(master) \
        firstprivate(pp)
      {
        verify (omp_proc_bind_master, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#2,#3 thread 3,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is spread, inner master, but we have just unit sized partitions. */
            p = places_array[test_places].places[pp];
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Spread, close. */
      #pragma omp parallel num_threads (6) proc_bind (close) \
        firstprivate (pp)
      {
        verify (omp_proc_bind_close, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#2,#4 thread 3,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is spread, inner close, but we have just unit sized partitions. */
            p = places_array[test_places].places[pp];
          print_affinity (p);
          printf ("\n");
        }
      }
    }
  }
  /* Master. */
  #pragma omp parallel num_threads (3) proc_bind(master)
  {
    verify (omp_proc_bind_master, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#3 thread %d", thr);
      if (test_spread_master_close || test_true)
        p = places_array[test_places].places[0];
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 2) {
      /* Master, master. */
      #pragma omp parallel num_threads (4)
      {
        verify (omp_proc_bind_master, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#3,#1 thread 2,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is master, inner is master. */
            p = places_array[test_places].places[0];
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Master, spread. */
      #pragma omp parallel num_threads (4) proc_bind (spread)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#3,#2 thread 2,%d", thr);
          if (omp_get_num_threads () == 4 && (test_spread_master_close || test_true))
            /* Outer is master, inner is spread. */
            switch (places_array[test_places].count) {
            case 8:
              /* T = 4, P = 8, each subpartition has 2 places. */
            case 7:
              /* T = 4, P = 7, each subpartition has 2 places, but last partition, which has just one place. */
              p = places_array[test_places].places[2 * thr];
              break;
            case 5:
              /* T = 4, P = 5, first subpartition has 2 places, the rest just one. */
              p = places_array[test_places].places[thr ? 1 + thr : 0];
              break;
            case 3:
              /* T = 4, P = 3, unit sized subpartitions, first gets thr0 and thr3, second thr1, third thr2. */
              p = places_array[test_places].places[thr == 3 ? 0 : thr];
              break;
            case 2:
              /* T = 4, P = 2, unit sized subpartitions, each with 2 threads. */
              p = places_array[test_places].places[thr / 2];
              break;
            }
          print_affinity (p);
          printf ("\n");
        }
        #pragma omp barrier
        if (omp_get_thread_num () == 0) {
          /* Master, spread, close. */
          #pragma omp parallel num_threads (5) proc_bind (close)
          {
            verify (omp_proc_bind_close, omp_proc_bind_close);
            #pragma omp critical
            {
              struct place p = places_array[0].places[0];
              int thr = omp_get_thread_num ();
              printf ("#3,#2,#1 thread 2,0,%d", thr);
              if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
                /* Outer is master, inner spread, innermost close. */
                switch (places_array[test_places].count) {
                /* First 3 are T = 5, P = 2. */
                case 8: case 7: case 5:
                  /* (thr & 2) / 2 maps thr{0,1,4} to 0 and thr{2,3} to 1. */
                  p = places_array[test_places].places[(thr & 2) / 2];
                  break;
                /* All the rest are T = 5, P = 1. */
                case 3: case 2: p = places_array[test_places].places[0]; break;
                }
              print_affinity (p);
              printf ("\n");
            }
          }
        }
        #pragma omp barrier
        if (omp_get_thread_num () == 3) {
          /* Master, spread, close. */
          #pragma omp parallel num_threads (5) proc_bind (close)
          {
            verify (omp_proc_bind_close, omp_proc_bind_close);
            #pragma omp critical
            {
              struct place p = places_array[0].places[0];
              int thr = omp_get_thread_num ();
              printf ("#3,#2,#2 thread 2,3,%d", thr);
              if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
                /* Outer is master, inner spread, innermost close. */
                switch (places_array[test_places].count) {
                case 8:
                  /* T = 5, P = 2. */
                  p = places_array[test_places].places[6 + (thr & 2) / 2];
                  break;
                /* All the rest are T = 5, P = 1. */
                case 7: p = places_array[test_places].places[6]; break;
                case 5: p = places_array[test_places].places[4]; break;
                case 3: p = places_array[test_places].places[0]; break;
                case 2: p = places_array[test_places].places[1]; break;
                }
              print_affinity (p);
              printf ("\n");
            }
          }
        }
      }
      /* Master, master. */
      #pragma omp parallel num_threads (4) proc_bind(master)
      {
        verify (omp_proc_bind_master, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#3,#3 thread 2,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is master, inner master. */
            p = places_array[test_places].places[0];
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Master, close. */
      #pragma omp parallel num_threads (6) proc_bind (close)
      {
        verify (omp_proc_bind_close, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#3,#4 thread 2,%d", thr);
          if (omp_get_num_threads () == 6 && (test_spread_master_close || test_true))
            switch (places_array[test_places].count) {
            case 8:
              /* T = 6, P = 8. */
            case 7:
              /* T = 6, P = 7. */
              p = places_array[test_places].places[thr];
              break;
            case 5:
              /* T = 6, P = 5. thr{0,5} go into the first place. */
              p = places_array[test_places].places[thr == 5 ? 0 : thr];
              break;
            case 3:
              /* T = 6, P = 3, two threads into each place. */
              p = places_array[test_places].places[thr / 2];
              break;
            case 2:
              /* T = 6, P = 2, 3 threads into each place. */
              p = places_array[test_places].places[thr / 3];
              break;
            }
          print_affinity (p);
          printf ("\n");
        }
      }
    }
  }
  /* Close. */
  #pragma omp parallel num_threads (5) proc_bind(close)
  {
    verify (omp_proc_bind_close, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#4 thread %d", thr);
      if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
        switch (places_array[test_places].count) {
        case 8:
          /* T = 5, P = 8. */
        case 7:
          /* T = 5, P = 7. */
        case 5:
          /* T = 5, P = 5. */
          p = places_array[test_places].places[thr];
          break;
        case 3:
          /* T = 5, P = 3, thr{0,3} in first place, thr{1,4} in second, thr2 in third. */
          p = places_array[test_places].places[thr >= 3 ? thr - 3 : thr];
          break;
        case 2:
          /* T = 5, P = 2, thr{0,1,4} in first place, thr{2,3} in second. */
          p = places_array[test_places].places[thr == 4 ? 0 : thr / 2];
          break;
        }
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 2) {
      /* pp: index of the place expected for thread 2 of the team above,
         per place count.  */
      int pp = 0;
      switch (places_array[test_places].count) {
      case 8: case 7: case 5: case 3: pp = 2; break;
      case 2: pp = 1; break;
      }
      /* Close, close/master. */
      #pragma omp parallel num_threads (4) firstprivate (pp)
      {
        verify (omp_proc_bind_close, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#4,#1 thread 2,%d", thr);
          if (test_spread_master_close)
            /* Outer is close, inner is master. */
            p = places_array[test_places].places[pp];
          else if (omp_get_num_threads () == 4 && test_true)
            /* Outer is close, inner is close. */
            switch (places_array[test_places].count) {
            case 8:
              /* T = 4, P = 8. */
            case 7:
              /* T = 4, P = 7. */
              p = places_array[test_places].places[2 + thr];
              break;
            case 5:
              /* T = 4, P = 5. There is wrap-around for thr3. */
              p = places_array[test_places].places[thr == 3 ? 0 : 2 + thr];
              break;
            case 3:
              /* T = 4, P = 3, thr{0,3} go into p2, thr1 into p0, thr2 into p1. */
              p = places_array[test_places].places[(2 + thr) % 3];
              break;
            case 2:
              /* T = 4, P = 2, 2 threads into each place. */
              p = places_array[test_places].places[1 - thr / 2];
              break;
            }
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Close, spread. */
      #pragma omp parallel num_threads (4) proc_bind (spread)
      {
        verify (omp_proc_bind_spread, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#4,#2 thread 2,%d", thr);
          if (omp_get_num_threads () == 4 && (test_spread_master_close || test_true))
            /* Outer is close, inner is spread. */
            switch (places_array[test_places].count) {
            case 8:
              /* T = 4, P = 8, each subpartition has 2 places. */
            case 7:
              /* T = 4, P = 7, each subpartition has 2 places, but last partition, which has just one place. */
              p = places_array[test_places].places[thr == 3 ? 0 : 2 + 2 * thr];
              break;
            case 5:
              /* T = 4, P = 5, first subpartition has 2 places, the rest just one. */
              p = places_array[test_places].places[thr == 3 ? 0 : 2 + thr];
              break;
            case 3:
              /* T = 4, P = 3, unit sized subpartitions, third gets thr0 and thr3, first thr1, second thr2. */
              p = places_array[test_places].places[thr == 0 ? 2 : thr - 1];
              break;
            case 2:
              /* T = 4, P = 2, unit sized subpartitions, each with 2 threads. */
              p = places_array[test_places].places[1 - thr / 2];
              break;
            }
          print_affinity (p);
          printf ("\n");
        }
        #pragma omp barrier
        if (omp_get_thread_num () == 0) {
          /* Close, spread, close. */
          #pragma omp parallel num_threads (5) proc_bind (close)
          {
            verify (omp_proc_bind_close, omp_proc_bind_close);
            #pragma omp critical
            {
              struct place p = places_array[0].places[0];
              int thr = omp_get_thread_num ();
              printf ("#4,#2,#1 thread 2,0,%d", thr);
              if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
                /* Outer is close, inner spread, innermost close. */
                switch (places_array[test_places].count) {
                case 8: case 7:
                  /* T = 5, P = 2. */
                  p = places_array[test_places].places[2 + (thr & 2) / 2];
                  break;
                /* All the rest are T = 5, P = 1. */
                case 5: case 3: p = places_array[test_places].places[2]; break;
                case 2: p = places_array[test_places].places[1]; break;
                }
              print_affinity (p);
              printf ("\n");
            }
          }
        }
        #pragma omp barrier
        if (omp_get_thread_num () == 2) {
          /* Close, spread, close. */
          #pragma omp parallel num_threads (5) proc_bind (close)
          {
            verify (omp_proc_bind_close, omp_proc_bind_close);
            #pragma omp critical
            {
              struct place p = places_array[0].places[0];
              int thr = omp_get_thread_num ();
              printf ("#4,#2,#2 thread 2,2,%d", thr);
              if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
                /* Outer is close, inner spread, innermost close. */
                switch (places_array[test_places].count) {
                case 8:
                  /* T = 5, P = 2. */
                  p = places_array[test_places].places[6 + (thr & 2) / 2];
                  break;
                /* All the rest are T = 5, P = 1. */
                case 7: p = places_array[test_places].places[6]; break;
                case 5: p = places_array[test_places].places[4]; break;
                case 3: p = places_array[test_places].places[1]; break;
                case 2: p = places_array[test_places].places[0]; break;
                }
              print_affinity (p);
              printf ("\n");
            }
          }
        }
        #pragma omp barrier
        if (omp_get_thread_num () == 3) {
          /* Close, spread, close. */
          #pragma omp parallel num_threads (5) proc_bind (close)
          {
            verify (omp_proc_bind_close, omp_proc_bind_close);
            #pragma omp critical
            {
              struct place p = places_array[0].places[0];
              int thr = omp_get_thread_num ();
              printf ("#4,#2,#3 thread 2,3,%d", thr);
              if (omp_get_num_threads () == 5 && (test_spread_master_close || test_true))
                /* Outer is close, inner spread, innermost close. */
                switch (places_array[test_places].count) {
                case 8: case 7: case 5:
                  /* T = 5, P = 2. */
                  p = places_array[test_places].places[(thr & 2) / 2];
                  break;
                /* All the rest are T = 5, P = 1. */
                case 3: p = places_array[test_places].places[2]; break;
                case 2: p = places_array[test_places].places[0]; break;
                }
              print_affinity (p);
              printf ("\n");
            }
          }
        }
      }
      /* Close, master. */
      #pragma omp parallel num_threads (4) proc_bind(master) \
        firstprivate (pp)
      {
        verify (omp_proc_bind_master, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#4,#3 thread 2,%d", thr);
          if (test_spread_master_close || test_true)
            /* Outer is close, inner master. */
            p = places_array[test_places].places[pp];
          print_affinity (p);
          printf ("\n");
        }
      }
      /* Close, close. */
      #pragma omp parallel num_threads (6) proc_bind (close)
      {
        verify (omp_proc_bind_close, omp_proc_bind_close);
        #pragma omp critical
        {
          struct place p = places_array[0].places[0];
          int thr = omp_get_thread_num ();
          printf ("#4,#4 thread 2,%d", thr);
          if (omp_get_num_threads () == 6 && (test_spread_master_close || test_true))
            switch (places_array[test_places].count) {
            case 8:
              /* T = 6, P = 8. */
              p = places_array[test_places].places[2 + thr];
              break;
            case 7:
              /* T = 6, P = 7. */
              p = places_array[test_places].places[thr == 5 ? 0 : 2 + thr];
              break;
            case 5:
              /* T = 6, P = 5. thr{0,5} go into the third place. */
              p = places_array[test_places].places[thr >= 3 ? thr - 3 : 2 + thr];
              break;
            case 3:
              /* T = 6, P = 3, two threads into each place. */
              p = places_array[test_places].places[thr < 2 ? 2 : thr / 2 - 1];
              break;
            case 2:
              /* T = 6, P = 2, 3 threads into each place. */
              p = places_array[test_places].places[1 - thr / 3];
              break;
            }
          print_affinity (p);
          printf ("\n");
        }
      }
    }
  }
  return 0;
}
int main(int argc, char** argv){ extern int nthreads; // Time counting variables struct timeval startwtime, endwtime; if (argc != 7) { // Check if the command line arguments are correct printf("Usage: %s N dist pop rep P\n" "where\n" "N : number of points\n" "dist : distribution code (0-cube, 1-sphere)\n" "pop : population threshold\n" "rep : repetitions\n" "L : maximum tree height.\n", argv[0]); return (1); } // Input command line arguments int N = atoi(argv[1]); // Number of points int dist = atoi(argv[2]); // Distribution identifier int population_threshold = atoi(argv[3]); // populatiton threshold int repeat = atoi(argv[4]); // number of independent runs int maxlev = atoi(argv[5]); // maximum tree height nthreads = atoi(argv[6]); // maximum tree height omp_set_dynamic(0); omp_set_num_threads(nthreads); printf("Running for %d particles with maximum height: %d\n", N, maxlev); float *X = (float *) malloc(N*DIM*sizeof(float)); float *Y = (float *) malloc(N*DIM*sizeof(float)); unsigned int *hash_codes = (unsigned int *) malloc(DIM*N*sizeof(unsigned int)); unsigned long int *morton_codes = (unsigned long int *) malloc(N*sizeof(unsigned long int)); unsigned long int *sorted_morton_codes = (unsigned long int *) malloc(N*sizeof(unsigned long int)); unsigned int *permutation_vector = (unsigned int *) malloc(N*sizeof(unsigned int)); unsigned int *index = (unsigned int *) malloc(N*sizeof(unsigned int)); unsigned int *level_record = (unsigned int *) calloc(N,sizeof(unsigned int)); // record of the leaf of the tree and their level // initialize the index for(int i=0; i<N; i++){ index[i] = i; } /* Generate a 3-dimensional data distribution */ create_dataset(X, N, dist); /* Find the boundaries of the space */ float max[DIM], min[DIM]; find_max(max, X, N); find_min(min, X, N); int nbins = (1 << maxlev); // maximum number of boxes at the leaf level // Independent runs for(int it = 0; it<repeat; it++){ gettimeofday (&startwtime, NULL); compute_hash_codes(hash_codes, X, N, nbins, 
min, max); // compute the hash codes gettimeofday (&endwtime, NULL); double hash_time = (double)((endwtime.tv_usec - startwtime.tv_usec) /1.0e6 + endwtime.tv_sec - startwtime.tv_sec); printf("Time to compute the hash codes : %fs\n", hash_time); gettimeofday (&startwtime, NULL); morton_encoding(morton_codes, hash_codes, N, maxlev); // computes the Morton codes of the particles gettimeofday (&endwtime, NULL); double morton_encoding_time = (double)((endwtime.tv_usec - startwtime.tv_usec) /1.0e6 + endwtime.tv_sec - startwtime.tv_sec); printf("Time to compute the morton encoding : %fs\n", morton_encoding_time); gettimeofday (&startwtime, NULL); // Truncated msd radix sort truncated_radix_sort(morton_codes, sorted_morton_codes, permutation_vector, index, level_record, N, population_threshold, 3*(maxlev-1), 0); gettimeofday (&endwtime, NULL); double sort_time = (double)((endwtime.tv_usec - startwtime.tv_usec) /1.0e6 + endwtime.tv_sec - startwtime.tv_sec); printf("Time for the truncated radix sort : %fs\n", sort_time); gettimeofday (&startwtime, NULL); // Data rearrangement data_rearrangement(Y, X, permutation_vector, N); gettimeofday (&endwtime, NULL); double rearrange_time = (double)((endwtime.tv_usec - startwtime.tv_usec) /1.0e6 + endwtime.tv_sec - startwtime.tv_sec); printf("Time to rearrange the particles in memory : %fs\n", rearrange_time); /* The following code is for verification */ // Check if every point is assigned to one leaf of the tree int pass = check_index(permutation_vector, N); if(pass){ printf("Index test PASS\n"); } else{ printf("Index test FAIL\n"); } // Check is all particles that are in the same box have the same encoding. pass = check_codes(Y, sorted_morton_codes, level_record, N, maxlev); if(pass){ printf("Encoding test PASS\n"); } else{ printf("Encoding test FAIL\n"); } } /* clear memory */ free(X); free(Y); free(hash_codes); free(morton_codes); free(sorted_morton_codes); free(permutation_vector); free(index); free(level_record); }