/*
 * OpenMP test driver "firstprivate 013".
 *
 * Exercises the firstprivate clause on the structure variable prvt in three
 * parallel regions: one with the checks written inline, one that hands the
 * address of the private copy to func1, and one that calls func2. Failures
 * are counted in errors; the verdict is printed and returned as the exit
 * status (0 on success, 1 on failure).
 *
 * thds, errors, prvt, MAGICNO, struct x, func1 and func2 are declared outside
 * this block.
 */
main ()
{
  thds = omp_get_max_threads ();
  if (thds == 1) {
    /* The test is meaningless with a single thread; bail out with status 0. */
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* Disable dynamic adjustment of the number of threads. */
  omp_set_dynamic (0);


  /* Case 1: checks written directly in the parallel region. */
  prvt.i = MAGICNO;
  prvt.d = MAGICNO+1;
  #pragma omp parallel firstprivate (prvt)
  {
    int	id = omp_get_thread_num ();

    /* Each private copy must start with the values assigned before the region. */
    if (prvt.i != MAGICNO) {
      #pragma omp critical
      errors += 1;
    }
    if (prvt.d != MAGICNO+1) {
      #pragma omp critical
      errors += 1;
    }

    /* Overwrite the private copy with thread-specific values. */
    prvt.i = id;
    prvt.d = id-1;

    /* After every thread has written, each copy must still hold its own values. */
    #pragma omp barrier
    if (prvt.i != id) {
      #pragma omp critical
      errors += 1;
    }
    if (prvt.d != id-1) {
      #pragma omp critical
      errors += 1;
    }
    /* The private copy must have the same size as the structure type. */
    if (sizeof(prvt) != sizeof(struct x)) {
      #pragma omp critical
      errors += 1;
    }
  }


  /* Case 2: the expected value and the address of the private copy go to func1. */
  prvt.i = MAGICNO*2;
  prvt.d = MAGICNO*2+1;
  #pragma omp parallel firstprivate (prvt)
  func1 (MAGICNO*2, &prvt);


  /* Case 3: only the expected value is passed to func2.
   * NOTE(review): how func2 reaches prvt is not visible here - confirm
   * against its definition. */
  prvt.i = MAGICNO*3;
  prvt.d = MAGICNO*3+1;
  #pragma omp parallel firstprivate (prvt)
  func2 (MAGICNO*3);


  /* Report the verdict; the exit status mirrors it. */
  if (errors == 0) {
    printf ("firstprivate 013 : SUCCESS\n");
    return 0;
  } else {
    printf ("firstprivate 013 : FAILED\n");
    return 1;
  }
}
Example #2
0
/*
 * OpenMP test driver "parallel for 008".
 *
 * Runs a combined "parallel for" with static, dynamic and guided schedules
 * and, after each loop, inspects buf on the master thread to verify that all
 * writes made inside the loop are visible once the construct has ended.
 * Prints the verdict and returns 0 on success, 1 on failure.
 *
 * thds, buf, clear, waittime and barrier are declared outside this block.
 * NOTE(review): clear() presumably resets buf to -1 (the value the first two
 * checks look for) - confirm against its definition.
 */
main ()
{
  int	lp, finish;

  int	errors = 0;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    /* The test needs more than one thread; exit with status 0. */
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* One slot per thread. */
  buf = (int *) malloc (sizeof (int) * thds);
  if (buf == NULL) {
    printf ("can not allocate memory.\n");
    exit (1);
  }
  /* Disable dynamic adjustment of the number of threads. */
  omp_set_dynamic (0);

  /* Case 1: schedule(static,1) with thds iterations. */
  finish = 0;
  clear ();
  #pragma omp parallel for schedule (static,1)
  for (lp=0; lp<thds; lp++) {
    int	id = omp_get_thread_num ();

    /* make barrier, and thread 1-x delay any sec from thread 0 */
    if (id == 0) {
      finish = 1;
    } else {
      /* Spin until thread 0 has set the shared flag, then lag behind it. */
      while (finish == 0) {
        #pragma omp flush
      }
      waittime (1);
    }
    buf[id] = omp_get_thread_num ();
    #pragma omp flush
  }

  /* Every slot must have been overwritten by the time the loop has ended. */
  for (lp=0; lp<thds; lp++) {
    if (buf[lp] == -1) {
      errors += 1;
    }
  }


  /* Case 2: schedule(dynamic,1); barrier(thds) is called at the top of each
   * iteration before the same flag handshake as in case 1. */
  finish = 0;
  clear ();
  #pragma omp parallel for schedule (dynamic,1)
  for (lp=0; lp<thds; lp++) {
    int	id = omp_get_thread_num ();

    barrier (thds);

    /* make barrier, and thread 1-x delay any sec from thread 0 */
    if (omp_get_thread_num () == 0) {
      finish = 1;
    } else {
      while (finish == 0) {
        #pragma omp flush
      }
      waittime (1);
    }
    buf[id] = omp_get_thread_num ();
    #pragma omp flush
  }

  for (lp=0; lp<thds; lp++) {
    if (buf[lp] == -1) {
      errors += 1;
    }
  }



  /* Case 3: schedule(guided,1) with thds*4 iterations. Each iteration marks
   * its thread's slot as "in progress" (-2), waits on every thread except
   * thread 0, then stores the thread id. */
  clear ();
  #pragma omp parallel for schedule (guided,1)
  for (lp=0; lp<thds*4; lp++) {
    int	id = omp_get_thread_num ();

    /* make barrier, and thread 1-x delay any sec from thread 0 */

    buf[id] = -2;
    #pragma omp flush

    if (id != 0) {
      waittime (1);
    }

    buf[id] = id;
    #pragma omp flush
  }

  /* A slot still holding -2 means an iteration had not finished its final
   * store when the loop was left. */
  for (lp=0; lp<thds; lp++) {
    if (buf[lp] == -2) {
      errors += 1;
    }
  }


  /* Report the verdict; the exit status mirrors it. */
  if (errors == 0) {
    printf ("parallel for 008 : SUCCESS\n");
    return 0;
  } else {
    printf ("parallel for 008 : FAILED\n");
    return 1;
  }
}
Example #3
0
/**
 * Normalises the feature matrix and builds the pairwise distance matrix.
 *
 * Steps:
 *  1. Copy C and min-max scale every column of the copy into [0, 1]
 *     (constant columns use a range of 1 to avoid division by zero).
 *  2. Fill a rows x rows symmetric matrix with the weighted L1 distance
 *     sum_k |u_k - v_k| * W[k] between every pair of rows.
 *  3. Record the global minimum/maximum distance in m_dmin / m_dmax and
 *     rescale m_eps from a relative value to an absolute distance:
 *     m_eps = (m_dmax - m_dmin) * m_eps + m_dmin.
 *
 * Note that step 3 rewrites m_eps on every call.
 *
 * @param C  feature matrix, one sample per row and one feature per column.
 * @param W  per-feature weights, indexed by column.
 * @return   the symmetric distance matrix (empty when C has no rows).
 */
const DBSCAN::DistanceMatrix DBSCAN::calc_dist_matrix( const DBSCAN::ClusterData& C, const DBSCAN::FeaturesWeights& W )
{
    // Work on a copy: the scaling below is done in place.
    DBSCAN::ClusterData cl_d = C;

    const size_t n_rows = cl_d.size1();

    // With no rows every min/max search below would run on an empty range
    // and dereference its end iterator, so return an empty matrix instead.
    if ( n_rows == 0 ) {
        DBSCAN::DistanceMatrix empty( n_rows, n_rows );
        return empty;
    }

    // The thread configuration applies to every parallel region below.
    omp_set_dynamic( 0 );
    omp_set_num_threads( m_num_threads );

    // Step 1: each iteration touches exactly one column, so the columns can
    // be scaled independently.
#pragma omp parallel for
    for ( size_t i = 0; i < cl_d.size2(); ++i ) {
        ublas::matrix_column< DBSCAN::ClusterData > col( cl_d, i );

        const auto r = minmax_element( col.begin(), col.end() );

        double data_min = *r.first;
        double data_range = *r.second - *r.first;

        if ( data_range == 0.0 ) {
            data_range = 1.0;
        }

        const double scale = 1 / data_range;
        const double min = -1.0 * data_min * scale;

        col *= scale;
        col.plus_assign( ublas::scalar_vector< typename ublas::matrix_column< DBSCAN::ClusterData >::value_type >( col.size(), min ) );
    }

    // rows x rows
    DBSCAN::DistanceMatrix d_m( n_rows, n_rows );
    ublas::vector< double > d_max( n_rows );
    ublas::vector< double > d_min( n_rows );

    // Step 2a: fill the matrix. Iteration i writes the upper-triangle
    // entries (i, j>=i) and their mirrors (j, i); no two iterations write
    // the same element.
#pragma omp parallel for
    for ( size_t i = 0; i < n_rows; ++i ) {
        for ( size_t j = i; j < n_rows; ++j ) {
            d_m( i, j ) = 0.0;

            if ( i != j ) {
                ublas::matrix_row< DBSCAN::ClusterData > U( cl_d, i );
                ublas::matrix_row< DBSCAN::ClusterData > V( cl_d, j );

                int k = 0;
                for ( const auto e : ( U - V ) ) {
                    d_m( i, j ) += fabs( e ) * W[k++];
                }

                d_m( j, i ) = d_m( i, j );
            }
        }
    }

    // Step 2b: per-row extrema. This must be a separate loop: row i contains
    // mirror entries written by iterations k < i of the fill loop, so
    // scanning it inside that loop would race with those writes. The
    // implicit barrier at the end of the previous loop guarantees the
    // matrix is complete here.
#pragma omp parallel for
    for ( size_t i = 0; i < n_rows; ++i ) {
        const auto cur_row = ublas::matrix_row< DBSCAN::DistanceMatrix >( d_m, i );
        const auto mm = minmax_element( cur_row.begin(), cur_row.end() );

        d_max( i ) = *mm.second;
        d_min( i ) = *mm.first;
    }

    // Step 3: global extrema and absolute epsilon.
    m_dmin = *( min_element( d_min.begin(), d_min.end() ) );
    m_dmax = *( max_element( d_max.begin(), d_max.end() ) );

    m_eps = ( m_dmax - m_dmin ) * m_eps + m_dmin;

    return d_m;
}
Example #4
0
/*
 * Trains ann for one epoch of batch back-propagation using threadnumb
 * OpenMP threads.
 *
 * Every thread works on its own copy of the network: the copies process
 * disjoint parts of the training data and accumulate slopes locally. The
 * slopes of all copies are then summed per connection and applied to the
 * weights of ann, scaled by learning_rate / num_data. Finally the MSE
 * counters of the copies are added to ann and the copies are destroyed.
 *
 * Returns the mean square error reported by fann_get_MSE(ann).
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_batch_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
	struct fann **replicas = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
	int i = 0;

	fann_reset_MSE(ann);

	/* phase 1: one private network per thread, then the forward/backward passes */
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel
	{
		#pragma omp for schedule(static)
		for(i = 0; i < (int)threadnumb; i++)
		{
			replicas[i] = fann_copy(ann);
		}

		/* the implicit barrier above guarantees every replica exists here */
		#pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			struct fann *local = replicas[omp_get_thread_num()];

			if (ann->do_dropout) {
				fann_run_dropout(local, data->input[i]);
			}
			else {
				fann_run(local, data->input[i]);
			}
			fann_compute_MSE(local, data->output[i]);
			fann_backpropagate_MSE(local);
			fann_update_slopes_batch(local, local->first_layer + 1, local->last_layer - 1);
		}
	}

	/* phase 2: sum the slopes of all replicas and update the shared weights */
	{
		const fann_type epsilon = ann->learning_rate / data->num_data;
		const int connection_count = (int)ann->total_connections;
		fann_type *weights = ann->weights;

		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel for schedule(static)
		for(i = 0; i < connection_count; i++)
		{
			fann_type slope_sum = 0.0;
			unsigned int t;

			for(t = 0; t < threadnumb; ++t)
			{
				fann_type *slopes = replicas[t]->train_slopes;

				slope_sum += slopes[i];
				slopes[i] = 0.0;
			}
			weights[i] += slope_sum * epsilon;
		}
	}

	/* phase 3: merge the MSE statistics and release the replicas */
	for(i = 0; i < (int)threadnumb; ++i)
	{
		ann->MSE_value += replicas[i]->MSE_value;
		ann->num_MSE += replicas[i]->num_MSE;
		fann_destroy(replicas[i]);
	}
	free(replicas);

	return fann_get_MSE(ann);
}
Example #5
0
/*
 * OpenMP test driver "parallel for 012".
 *
 * Runs a "parallel for" four times with an if clause whose expression
 * evaluates to false, written in four different forms: an integer constant,
 * a double variable, a comparison and a function call. After each loop the
 * result of check_result() is added to errors. Prints the verdict and
 * returns 0 on success, 1 on failure.
 *
 * thds, buf, errors, clear, check_parallel, check_result and sameas are
 * declared outside this block.
 * NOTE(review): check_parallel(0) presumably verifies that the caller is not
 * inside an active parallel region, and sameas(x) presumably returns x -
 * confirm against their definitions.
 */
main ()
{
  int	lp;

  /* Run-time "false" values, so the if expressions are not all constants. */
  int	 false = 0;
  double dfalse = 0.0;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    /* The test needs more than one thread; exit with status 0. */
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* thds + 1 slots are allocated although the loops below write only thds. */
  buf = (int *) malloc (sizeof (int) * (thds + 1));
  if (buf == NULL) {
    printf ("can not allocate memory.\n");
    exit (1);
  }
  /* Disable dynamic adjustment of the number of threads. */
  omp_set_dynamic (0);


  /* Form 1: integer constant 0. */
  clear ();
  #pragma omp parallel for schedule(static,1) if (0)
  for (lp=0; lp<thds; lp++) {
    buf[lp] = omp_get_thread_num ();
    check_parallel (0);
  }
  errors += check_result ();

  /* Form 2: floating-point variable holding 0.0. */
  clear ();
  #pragma omp parallel for schedule(static,1) if (dfalse)
  for (lp=0; lp<thds; lp++) {
    buf[lp] = omp_get_thread_num ();
    check_parallel (0);
  }
  errors += check_result ();

  /* Form 3: comparison expression that evaluates to 0. */
  clear ();
  #pragma omp parallel for schedule(static,1) if (false == 1)
  for (lp=0; lp<thds; lp++) {
    buf[lp] = omp_get_thread_num ();
    check_parallel (0);
  }
  errors += check_result ();

  /* Form 4: function call result. */
  clear ();
  #pragma omp parallel for schedule(static,1) if (sameas(false))
  for (lp=0; lp<thds; lp++) {
    buf[lp] = omp_get_thread_num ();
    check_parallel (0);
  }
  errors += check_result ();


  /* Report the verdict; the exit status mirrors it. */
  if (errors == 0) {
    printf ("parallel for 012 : SUCCESS\n");
    return 0;
  } else {
    printf ("parallel for 012 : FAILED\n");
    return 1;
  }
}
/**
 * pngquant command-line entry point.
 *
 * Parses the options, validates their combinations, then quantizes every
 * input file (or stdin) with pngquant_file(). Files are processed in an
 * OpenMP parallel loop; each iteration works on its own copy of the options
 * and of the liq attributes, so the shared `options` is only read inside
 * the loop.
 *
 * @return SUCCESS when every file was processed, otherwise the error code
 *         of one of the failed files, or an argument error code when the
 *         command line is invalid.
 */
int main(int argc, char *argv[])
{
    struct pngquant_options options = {
        .floyd = 1.f, // floyd-steinberg dithering
    };
    options.liq = liq_attr_create();

    if (!options.liq) {
        fputs("SSE-capable CPU is required for this build.\n", stderr);
        return WRONG_ARCHITECTURE;
    }

    unsigned int error_count=0, skipped_count=0, file_count=0;
    pngquant_error latest_error=SUCCESS;
    const char *newext = NULL, *output_file_path = NULL;

    fix_obsolete_options(argc, argv);

    int opt;
    do {
        opt = getopt_long(argc, argv, "Vvqfhs:Q:o:", long_options, NULL);
        switch (opt) {
            case 'v':
                options.verbose = true;
                break;
            case 'q':
                options.verbose = false;
                break;

            case arg_floyd:
                options.floyd = optarg ? atof(optarg) : 1.0;
                if (options.floyd < 0 || options.floyd > 1.f) {
                    fputs("--floyd argument must be in 0..1 range\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;
            case arg_ordered: options.floyd = 0; break;

            case 'f': options.force = true; break;
            case arg_no_force: options.force = false; break;

            case arg_ext: newext = optarg; break;
            case 'o':
                if (output_file_path) {
                    fputs("--output option can be used only once\n", stderr);
                    return INVALID_ARGUMENT;
                }
                output_file_path = optarg; break;

            case arg_iebug:
                // opacities above 238 will be rounded up to 255, because IE6 truncates <255 to 0.
                liq_set_min_opacity(options.liq, 238);
                options.ie_mode = true;
                break;

            case arg_transbug:
                liq_set_last_index_transparent(options.liq, true);
                break;

            case arg_skip_larger:
                options.skip_if_larger = true;
                break;

            case 's':
                {
                    int speed = atoi(optarg);
                    if (speed >= 10) {
                        options.fast_compression = true;
                    }
                    // speed 11 is speed 10 with dithering turned off
                    if (speed == 11) {
                        options.floyd = 0;
                        speed = 10;
                    }
                    if (LIQ_OK != liq_set_speed(options.liq, speed)) {
                        fputs("Speed should be between 1 (slow) and 11 (fast).\n", stderr);
                        return INVALID_ARGUMENT;
                    }
                }
                break;

            case 'Q':
                if (!parse_quality(optarg, options.liq, &options.min_quality_limit)) {
                    fputs("Quality should be in format min-max where min and max are numbers in range 0-100.\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;

            case arg_posterize:
                if (LIQ_OK != liq_set_min_posterization(options.liq, atoi(optarg))) {
                    fputs("Posterization should be number of bits in range 0-4.\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;

            case arg_map:
                {
                    // image whose palette will be used for every output file
                    png24_image tmp = {};
                    if (SUCCESS != read_image(options.liq, optarg, false, &tmp, &options.fixed_palette_image, false, false)) {
                        fprintf(stderr, "  error: Unable to load %s", optarg);
                        return INVALID_ARGUMENT;
                    }
                }
                break;

            case 'h':
                print_full_version(stdout);
                print_usage(stdout);
                return SUCCESS;

            case 'V':
                puts(PNGQUANT_VERSION);
                return SUCCESS;

            case -1: break;

            default:
                return INVALID_ARGUMENT;
        }
    } while (opt != -1);

    int argn = optind;

    if (argn >= argc) {
        if (argn > 1) {
            fputs("No input files specified. See -h for help.\n", stderr);
        } else {
            print_full_version(stderr);
            print_usage(stderr);
        }
        return MISSING_ARGUMENT;
    }

    if (options.verbose) {
        liq_set_log_callback(options.liq, log_callback, NULL);
        options.log_callback = log_callback;
    }

    // a first positional argument that is entirely numeric is the color count
    char *colors_end;
    unsigned long colors = strtoul(argv[argn], &colors_end, 10);
    if (colors_end != argv[argn] && '\0' == colors_end[0]) {
        if (LIQ_OK != liq_set_max_colors(options.liq, colors)) {
            fputs("Number of colors must be between 2 and 256.\n", stderr);
            return INVALID_ARGUMENT;
        }
        argn++;
    }

    if (newext && output_file_path) {
        fputs("--ext and --output options can't be used at the same time\n", stderr);
        return INVALID_ARGUMENT;
    }

    // new filename extension depends on options used. Typically basename-fs8.png
    if (newext == NULL) {
        newext = options.floyd > 0 ? "-ie-fs8.png" : "-ie-or8.png";
        if (!options.ie_mode) {
            newext += 3;    /* skip "-ie" */
        }
    }

    // no file arguments left, or a single "-", means read from stdin
    if (argn == argc || (argn == argc-1 && 0==strcmp(argv[argn],"-"))) {
        options.using_stdin = true;
        argn = argc-1;
    }

    if (options.using_stdin && output_file_path) {
        fputs("--output can't be mixed with stdin\n", stderr);
        return INVALID_ARGUMENT;
    }

    const int num_files = argc-argn;

    if (output_file_path && num_files != 1) {
        fputs("Only one input file is allowed when --output is used\n", stderr);
        return INVALID_ARGUMENT;
    }

#ifdef _OPENMP
    // if there's a lot of files, coarse parallelism can be used
    if (num_files > 2*omp_get_max_threads()) {
        omp_set_nested(0);
        omp_set_dynamic(1);
    } else {
        omp_set_nested(1);
    }
#endif

    #pragma omp parallel for \
        schedule(static, 1) reduction(+:skipped_count) reduction(+:error_count) reduction(+:file_count) shared(latest_error)
    for(int i=0; i < num_files; i++) {
        // per-iteration copy: everything file-specific must go into opts,
        // never into the shared options
        struct pngquant_options opts = options;
        opts.liq = liq_attr_copy(options.liq);

        const char *filename = opts.using_stdin ? "stdin" : argv[argn+i];

        #ifdef _OPENMP
        struct buffered_log buf = {};
        if (opts.log_callback && omp_get_num_threads() > 1 && num_files > 1) {
            liq_set_log_callback(opts.liq, log_callback_buferred, &buf);
            liq_set_log_flush_callback(opts.liq, log_callback_buferred_flush, &buf);
            // buf lives only for this iteration, so its address must be
            // stored in the iteration-local opts. Storing it in the shared
            // options raced between threads and left a dangling pointer
            // behind for the summary messages printed after the loop.
            opts.log_callback = log_callback_buferred;
            opts.log_callback_user_info = &buf;
        }
        #endif


        pngquant_error retval = SUCCESS;

        const char *outname = output_file_path;
        char *outname_free = NULL;
        if (!opts.using_stdin) {
            if (!outname) {
                outname = outname_free = add_filename_extension(filename, newext);
            }
            if (!opts.force && file_exists(outname)) {
                fprintf(stderr, "  error:  %s exists; not overwriting\n", outname);
                retval = NOT_OVERWRITING_ERROR;
            }
        }

        if (!retval) {
            retval = pngquant_file(filename, outname, &opts);
        }

        free(outname_free);

        liq_attr_destroy(opts.liq);

        if (retval) {
            #pragma omp critical
            {
                latest_error = retval;
            }
            if (retval == TOO_LOW_QUALITY || retval == TOO_LARGE_FILE) {
                skipped_count++;
            } else {
                error_count++;
            }
        }
        ++file_count;
    }

    if (error_count) {
        verbose_printf(&options, "There were errors quantizing %d file%s out of a total of %d file%s.",
                       error_count, (error_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s");
    }
    if (skipped_count) {
        verbose_printf(&options, "Skipped %d file%s out of a total of %d file%s.",
                       skipped_count, (skipped_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s");
    }
    if (!skipped_count && !error_count) {
        verbose_printf(&options, "No errors detected while quantizing %d image%s.",
                       file_count, (file_count == 1)? "" : "s");
    }

    liq_image_destroy(options.fixed_palette_image);
    liq_attr_destroy(options.liq);

    return latest_error;
}
#endif

pngquant_error pngquant_file(const char *filename, const char *outname, struct pngquant_options *options)
{
    pngquant_error retval = SUCCESS;

    verbose_printf(options, "%s:", filename);

    liq_image *input_image = NULL;
    png24_image input_image_rwpng = {};
    bool keep_input_pixels = options->skip_if_larger || (options->using_stdin && options->min_quality_limit); // original may need to be output to stdout
    if (!retval) {
        retval = read_image(options->liq, filename, options->using_stdin, &input_image_rwpng, &input_image, keep_input_pixels, options->verbose);
    }

    int quality_percent = 90; // quality on 0-100 scale, updated upon successful remap
    png8_image output_image = {};
    if (!retval) {
        verbose_printf(options, "  read %luKB file", (input_image_rwpng.file_size+1023UL)/1024UL);

#if USE_LCMS
        if (input_image_rwpng.lcms_status == ICCP) {
            verbose_printf(options, "  used embedded ICC profile to transform image to sRGB colorspace");
        } else if (input_image_rwpng.lcms_status == GAMA_CHRM) {
            verbose_printf(options, "  used gAMA and cHRM chunks to transform image to sRGB colorspace");
        } else if (input_image_rwpng.lcms_status == ICCP_WARN_GRAY) {
            verbose_printf(options, "  warning: ignored ICC profile in GRAY colorspace");
        }
#endif

        if (input_image_rwpng.gamma != 0.45455) {
            verbose_printf(options, "  corrected image from gamma %2.1f to sRGB gamma",
                           1.0/input_image_rwpng.gamma);
        }

        // when using image as source of a fixed palette the palette is extracted using regular quantization
        liq_result *remap = liq_quantize_image(options->liq, options->fixed_palette_image ? options->fixed_palette_image : input_image);

        if (remap) {
            liq_set_output_gamma(remap, 0.45455); // fixed gamma ~2.2 for the web. PNG can't store exact 1/2.2
            liq_set_dithering_level(remap, options->floyd);

            retval = prepare_output_image(remap, input_image, &output_image);
            if (!retval) {
                if (LIQ_OK != liq_write_remapped_image_rows(remap, input_image, output_image.row_pointers)) {
                    retval = OUT_OF_MEMORY_ERROR;
                }

                set_palette(remap, &output_image);

                double palette_error = liq_get_quantization_error(remap);
                if (palette_error >= 0) {
                    quality_percent = liq_get_quantization_quality(remap);
                    verbose_printf(options, "  mapped image to new colors...MSE=%.3f (Q=%d)", palette_error, quality_percent);
                }
            }
            liq_result_destroy(remap);
        } else {
            retval = TOO_LOW_QUALITY;
        }
    }

    if (!retval) {

        if (options->skip_if_larger) {
            // this is very rough approximation, but generally avoid losing more quality than is gained in file size.
            // Quality is squared, because even greater savings are needed to justify big quality loss.
            double quality = quality_percent/100.0;
            output_image.maximum_file_size = (input_image_rwpng.file_size-1) * quality*quality;
        }

        output_image.fast_compression = options->fast_compression;
        output_image.chunks = input_image_rwpng.chunks; input_image_rwpng.chunks = NULL;
        retval = write_image(&output_image, NULL, outname, options);

        if (TOO_LARGE_FILE == retval) {
            verbose_printf(options, "  file exceeded expected size of %luKB", (unsigned long)output_image.maximum_file_size/1024UL);
        }
    }

    if (options->using_stdin && keep_input_pixels && (TOO_LARGE_FILE == retval || TOO_LOW_QUALITY == retval)) {
        // when outputting to stdout it'd be nasty to create 0-byte file
        // so if quality is too low, output 24-bit original
        pngquant_error write_retval = write_image(NULL, &input_image_rwpng, outname, options);
        if (write_retval) {
            retval = write_retval;
        }
    }

    liq_image_destroy(input_image);
    rwpng_free_image24(&input_image_rwpng);
    rwpng_free_image8(&output_image);

    return retval;
}
Example #7
0
/*
 * Trains ann for one epoch with the iRPROP- update rule using threadnumb
 * OpenMP threads.
 *
 * Every thread works on its own copy of the network: the copies process
 * disjoint parts of the training data and accumulate slopes locally. For
 * each connection the slopes of all copies are summed, the step size is
 * grown or shrunk depending on whether the summed slope kept its sign since
 * the previous epoch, and the weight is moved by that step (clamped to
 * [-1500, 1500]). Finally the MSE counters of the copies are added to ann
 * and the copies are destroyed.
 *
 * Returns the mean square error reported by fann_get_MSE(ann).
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_irpropm_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
	struct fann **replicas = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
	int i = 0;

	/* the training arrays are cleared when no previous slopes exist yet */
	if(ann->prev_train_slopes == NULL)
	{
		fann_clear_train_arrays(ann);
	}

	fann_reset_MSE(ann);

	/* phase 1: one private network per thread, then the forward/backward passes */
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel
	{
		#pragma omp for schedule(static)
		for(i = 0; i < (int)threadnumb; i++)
		{
			replicas[i] = fann_copy(ann);
		}

		/* the implicit barrier above guarantees every replica exists here */
		#pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			struct fann *local = replicas[omp_get_thread_num()];

			if (ann->do_dropout) {
				fann_run_dropout(local, data->input[i]);
			}
			else {
				fann_run(local, data->input[i]);
			}
			fann_compute_MSE(local, data->output[i]);
			fann_backpropagate_MSE(local);
			fann_update_slopes_batch(local, local->first_layer + 1, local->last_layer - 1);
		}
	}

	/* phase 2: iRPROP- update of every connection weight */
	{
		fann_type *weights = ann->weights;
		fann_type *prev_steps = ann->prev_steps;
		fann_type *prev_train_slopes = ann->prev_train_slopes;

		const float increase_factor = ann->rprop_increase_factor;	/* 1.2 */
		const float decrease_factor = ann->rprop_decrease_factor;	/* 0.5 */
		const float delta_min = ann->rprop_delta_min;	/* 0.0 */
		const float delta_max = ann->rprop_delta_max;	/* 50.0 */
		const int connection_count = (int)ann->total_connections;

		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel for schedule(static)
		for(i = 0; i < connection_count; i++)
		{
			/* prev_step may not be zero because then the training will stop */
			const fann_type prev_step = fann_max(prev_steps[i], (fann_type) 0.0001);
			fann_type slope_sum = 0.0;
			fann_type prev_slope, same_sign, next_step;
			unsigned int t;

			/* gather and reset the slopes accumulated by every replica */
			for(t = 0; t < threadnumb; ++t)
			{
				fann_type *slopes = replicas[t]->train_slopes;

				slope_sum += slopes[i];
				slopes[i] = 0.0;
			}

			prev_slope = prev_train_slopes[i];
			same_sign = prev_slope * slope_sum;

			if(same_sign >= 0.0)
			{
				next_step = fann_min(prev_step * increase_factor, delta_max);
			}
			else
			{
				/* sign flipped: shrink the step and forget the slope */
				next_step = fann_max(prev_step * decrease_factor, delta_min);
				slope_sum = 0;
			}

			if(slope_sum < 0)
			{
				weights[i] -= next_step;
				if(weights[i] < -1500)
					weights[i] = -1500;
			}
			else
			{
				weights[i] += next_step;
				if(weights[i] > 1500)
					weights[i] = 1500;
			}

			/* update global data arrays */
			prev_steps[i] = next_step;
			prev_train_slopes[i] = slope_sum;
		}
	}

	/* phase 3: merge the MSE statistics and release the replicas */
	for(i = 0; i < (int)threadnumb; ++i)
	{
		ann->MSE_value += replicas[i]->MSE_value;
		ann->num_MSE += replicas[i]->num_MSE;
		fann_destroy(replicas[i]);
	}
	free(replicas);

	return fann_get_MSE(ann);
}
Example #8
0
/*
 * OpenMP test driver "atomic 009".
 *
 * Inside one parallel region, eight work-shared loops each apply a different
 * form of atomic update (x++, ++x, x--, --x, +=, -=, *=, /=) LOOPNUM times to
 * a shared variable. check() is called afterwards and its result is added to
 * errors. Prints the verdict and returns 0 on success, 1 on failure.
 *
 * thds, errors, LOOPNUM, the atom_* variables, clear, check and sameas are
 * declared outside this block.
 * NOTE(review): sameas(x) presumably returns x, which would keep the
 * right-hand sides from being compile-time constants - confirm against its
 * definition.
 */
main ()
{
  int	i;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    /* The test needs more than one thread; exit with status 0. */
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  /* Cap the team at four threads. */
  if (4 < thds) {
    thds = 4;
    omp_set_num_threads (4);
  }

  /* Disable dynamic adjustment of the number of threads. */
  omp_set_dynamic (0);


  clear ();
  #pragma omp parallel
  {
    /* Postfix increment. */
    #pragma omp for 
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic
      atom_incr ++;
    }

    /* Prefix increment. */
    #pragma omp for 
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic
      ++ atom_incr2;
    }

    /* Postfix decrement. */
    #pragma omp for 
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic
      atom_decr --;
    }

    /* Prefix decrement. */
    #pragma omp for
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic
      -- atom_decr2;
    }

    /* Compound addition with a function call on the right-hand side. */
    #pragma omp for
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic	
      atom_plus += sameas(2) - 1;
    }

    /* Compound subtraction. */
    #pragma omp for
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic	
      atom_minus -= sameas(2) - 1;
    }

    /* Compound multiplication. */
    #pragma omp for
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic	
      atom_mul *= sameas(3) - 1;
    }

    /* Compound division. */
    #pragma omp for
    for (i=0; i<LOOPNUM; i++) {
      #pragma omp atomic	
      atom_div /= 4 + sameas(-2);
    }
  }
  /* check() contributes the number of detected failures. */
  errors += check ();

  if (errors == 0) {
    printf ("atomic 009 : SUCCESS\n");
    return 0;
  } else {
    printf ("atomic 009 : FAILED\n");
    return 1;
  }
}
/*
 * Dedisperses one chunk of input data, folds it into the ring buffer and
 * copies the resulting chunk_size samples of every output channel from the
 * ring buffer into outdata.
 *
 * indata            nfreq rows of chunk_size samples
 * outdata           output, one row of (chunk_size + nchan) samples per
 *                   output channel; only the first chunk_size are written
 * chan_map          channel map handed to put_data_into_burst_struct
 * ring_buffer_data  storage of the ring buffer, viewed as nchan rows of
 *                   ring_length samples
 * ringt0            ring position of the current chunk; update_ring_buffer
 *                   receives a pointer to the local copy
 * depth             tree depth; determines the zero padding and nchan
 * delta_t, freq0, delta_f are not used by this function.
 *
 * Caveats kept from the original implementation: the copy does not stop if
 * the data is incomplete, does not prevent overlap, and requires the ring
 * buffer to be long enough.
 *
 * NOTE(review): all copies are sized with sizeof(float), so DTYPE is assumed
 * to be float - confirm. The number of OpenMP threads is fixed at 8.
 */
void add_to_ring(DTYPE* indata, DTYPE* outdata, CM_DTYPE* chan_map, DTYPE* ring_buffer_data, int ringt0, int chunk_size, int ring_length, float delta_t, size_t nfreq, float freq0, float delta_f, int depth)
{
    omp_set_dynamic(0);
    omp_set_num_threads(8);

    /* zero-pad every frequency row by ndm samples */
    int ndm = get_nchan_from_depth(depth);
    int padded_len = chunk_size + ndm;
    float *indata_pad = (float*)malloc(sizeof(float)*nfreq*padded_len);
    for (size_t f = 0; f < nfreq; f++) {
        memcpy(indata_pad + f*padded_len, indata + f*chunk_size, sizeof(float)*chunk_size);
        memset(indata_pad + f*padded_len + chunk_size, 0, sizeof(float)*ndm);
    }

    Data *dat = put_data_into_burst_struct(indata_pad, padded_len, nfreq, chan_map, depth);
    remap_data(dat);
    int nchan = dat->nchan;

    /* row-pointer view onto the caller's ring buffer storage */
    float **ring_buffer = (float**)malloc(sizeof(float*)*nchan);
    make_rect_mat(ring_buffer, ring_buffer_data, nchan, ring_length);

    /* scratch matrix for the dedispersed chunk */
    float **tmp_mat = matrix(nchan, chunk_size + nchan);

    dedisperse_lagged(dat->data, tmp_mat, nchan, chunk_size);
    update_ring_buffer(tmp_mat, ring_buffer, nchan, chunk_size, ring_length, &ringt0);

    /* because of the search padding requirement, channel i is read with a
     * lag of i samples */
    for (int i = 0; i < nchan; i++) {
        int src0 = (ring_length + ringt0 - i) % ring_length;
        int src1 = (ring_length + ringt0 + chunk_size - i) % ring_length;
        float *dst = outdata + i*(chunk_size + nchan);

        if (src1 < src0) {
            /* the chunk wraps: copy the tail of the ring, then continue
             * from the START of the same row (the previous code continued
             * at ring_buffer[i] + ring_length, i.e. past the row's end) */
            int first_cpy = ring_length - src0;
            int second_cpy = chunk_size - first_cpy;
            memcpy(dst, ring_buffer[i] + src0, first_cpy*sizeof(float));
            memcpy(dst + first_cpy, ring_buffer[i], second_cpy*sizeof(float));
        }
        else {
            memcpy(dst, ring_buffer[i] + src0, chunk_size*sizeof(float));
        }
    }

    free(dat->data[0]);
    free(dat->data);
    free(dat->raw_data[0]);
    free(dat->raw_data);
    free(dat);
    free(indata_pad);
    free(tmp_mat[0]);
    free(tmp_mat);
    /* only the row-pointer array is owned here; the samples belong to the
     * caller's ring_buffer_data */
    free(ring_buffer);
}
Example #10
0
/****************************************************************
*
*	Function: main
*	Input:	int argc 	number of command line arguments
*		char **argv	pointer to those arguments
*
*	Output: int 0 for success and 1 for error
*
*	Description: Runs the simple merge algorithm multiple
*	times, averaging the results and printing them to the terminal.
*
*****************************************************************/
/*
 * Timing driver.
 *
 * Usage: ./seq path_input input_size ans_Path RUNS MAX_THREADS
 *
 * Reads input_size + 1 integers from path_input, runs nodeLength() RUNS
 * times with MAX_THREADS OpenMP threads, and prints the average wall-clock
 * time of one run in microseconds. Unless ans_Path parses as the integer 1,
 * the result is compared against the file ans_Path. Inputs of at most 50
 * elements are also written out under "omp/".
 *
 * Returns 0 on success and 1 on any error: bad arguments, allocation or
 * read failure, incorrect answer, or write failure.
 */
int main(int argc, char **argv)
{
	struct timeval startt, endt;

	char name[8] = "omp/";

	int exit_code = 0;
	int status = 0;
	int n;
	int RUNS;
	int j;
	double average = 0.0;	/* accumulated below, so it must start at zero */

	int *S = NULL;
	int *R = NULL;
	int *P_temp = NULL;
	int *R_temp = NULL;
	int *P = NULL;

	//Check if app was given enough input
	if(argc < 6){
		printf("Missing Arguement Parameters\n");
		printf("Format ./seq path_input input_size ans_Path RUNS MAX_THREADS\n");
		return 1;
	}

	//Save args to memory
	n = atoi(argv[2])+1;
	RUNS = atoi(argv[4]);
	MAX_THREADS = atoi(argv[5]);

	//Reject values that would give a negative allocation size, a division
	//by zero when averaging, or an invalid thread count
	if(n < 1 || RUNS < 1 || MAX_THREADS < 1){
		printf("Invalid Argument Parameters\n");
		printf("Format ./seq path_input input_size ans_Path RUNS MAX_THREADS\n");
		return 1;
	}

	if(n<50){
		chunk = 4; /*For Small N*/
	}

	omp_set_dynamic(0); //Makes sure the number of threads available is fixed
	omp_set_num_threads(MAX_THREADS); //Set thread number

	//Allocate memory for the arrays; stop on failure instead of running on
	S = malloc(n*sizeof(int));
	R = malloc(n*sizeof(int));
	P_temp = malloc(n*sizeof(int));
	R_temp = malloc(n*sizeof(int));
	P = malloc(n*sizeof(int));

	if(S==NULL){
		printf("Failed to Allocate Memory for Input Array S\n");
	}
	if(R==NULL){
		printf("Failed to Allocate Memory for Input Array R\n");
	}
	if(P_temp==NULL || R_temp==NULL || P==NULL){
		printf("Failed to Allocate Memory for Work Arrays\n");
	}
	if(S==NULL || R==NULL || P_temp==NULL || R_temp==NULL || P==NULL){
		exit_code = 1;
		goto cleanup;
	}

	//Read the input array from file and save to memory
	status = read_input(S, n, argv[1]);

	if(status){
		#ifdef DEBUG
		printf("Failed to Read Input S\n");
		#endif
		exit_code = 1;
		goto cleanup;
	}

	//Start of testing of the algorithm
	for(j=0; j<RUNS; j++){
		memset(R, 0, n*sizeof(int));

		/*Start Timer*/
		gettimeofday (&startt, NULL);

		/*Start Algorithm*/
		nodeLength(S, R, n, P_temp, R_temp, P);

		/*Stop Timer*/
		gettimeofday (&endt, NULL);

		/*Subtract before scaling so the intermediate values stay small*/
		average += (double)(endt.tv_sec - startt.tv_sec) * 1000000.0
		         + (double)(endt.tv_usec - startt.tv_usec);
	}
	average = average/RUNS; //Average the execution times

	//print results to terminal
	printf("%d 	%f	us \n",n-1,average);

	//Verify the result unless ans_Path parses as the integer 1
	if(atoi(argv[3])!=1)
	{
		status = outputCheck(R, argv[3], n);
		if(status){
			printf("Incorrect Answer\n");
			exit_code = 1;
		}
		else{
			printf("Correct Answer\n");
		}
	}

	/*Save the Results if the output is less than 50 elements*/
	if(n<=50){
		/*Report a write failure only when a write was attempted*/
		status = write_output(S, R, n, name);
		if(status){
			printf("Failed to Write Output \n");
			exit_code = 1;
		}
	}

cleanup:
	free(S);
	free(R);
	free(P_temp);
	free(R_temp);
	free(P);

	return exit_code;
}
Example #11
0
/*
 *  Run the partition function computation for a fold compound.
 *
 *  vc         fold compound to operate on; when NULL nothing is computed
 *  structure  buffer handed through to vrna_pairing_probs() and
 *             wrap_alipf_circ()
 *
 *  Returns the ensemble free energy derived from the partition function,
 *  or (float)(INF/100.) when vc is NULL, when the fold compound cannot be
 *  prepared, or when its type is not recognized.
 */
PUBLIC float
vrna_pf(vrna_fold_compound_t  *vc,
        char                  *structure){

  int               length;
  FLT_OR_DBL        Z;
  double            ensemble_energy = (float)(INF/100.);
  vrna_md_t         *model;
  vrna_exp_param_t  *pf_params;
  vrna_mx_pf_t      *mx;

  /* without a fold compound there is nothing to compute */
  if(!vc)
    return ensemble_energy;

  /* everything must be set up properly before the recursions start */
  if(!vrna_fold_compound_prepare(vc, VRNA_OPTION_PF)){
    vrna_message_warning("vrna_pf@part_func.c: Failed to prepare vrna_fold_compound");
    return ensemble_energy;
  }

  length    = vc->length;
  pf_params = vc->exp_params;
  mx        = vc->exp_matrices;
  model     = &(pf_params->model_details);

#ifdef _OPENMP
  /* dynamic thread adjustment is explicitly switched off */
  omp_set_dynamic(0);
#endif

#ifdef SUN4
  nonstandard_arithmetic();
#else
#ifdef HP9
  fpsetfastmode(1);
#endif
#endif

  /* notify the user-defined status callback before the recursions */
  if(vc->stat_cb)
    vc->stat_cb(VRNA_STATUS_PF_PRE, vc->auxdata);

  if(vc->type == VRNA_FC_TYPE_SINGLE){
    /* linear fill of all matrices, followed by the circular post-processing if requested */
    pf_linear(vc);
    if(model->circ)
      pf_circ(vc);
  } else if(vc->type == VRNA_FC_TYPE_COMPARATIVE){
    /* same two steps for the comparative (alignment) case */
    alipf_linear(vc);
    if(model->circ)
      wrap_alipf_circ(vc, structure);
  } else {
    vrna_message_warning("vrna_pf@part_func.c: Unrecognized fold compound type");
    return ensemble_energy;
  }

  /* notify the user-defined status callback after the recursions */
  if(vc->stat_cb)
    vc->stat_cb(VRNA_STATUS_PF_POST, vc->auxdata);

  /* base pairing probability matrix, only on request */
  if(model->compute_bpp){
    vrna_pairing_probs(vc, structure);

#ifdef  VRNA_BACKWARD_COMPAT
    /*
     *  Backward compatibility: deprecated functions still read the global
     *  variable "pr". This assignment may go once they have vanished.
     */
    pr = mx->probs;
#endif
  }

  /* pick the partition function entry that matches the backtrack type */
  switch(model->backtrack_type){
    case 'C':
      Z = mx->qb[vc->iindx[1]-length];
      break;

    case 'M':
      Z = mx->qm[vc->iindx[1]-length];
      break;

    default:
      Z = (model->circ) ? mx->qo : mx->q[vc->iindx[1]-length];
      break;
  }

  if (Z<=FLT_MIN)
    vrna_message_warning("pf_scale too large");

  /* ensemble free energy in Kcal/mol; the comparative case additionally divides by n_seq */
  if(vc->type == VRNA_FC_TYPE_COMPARATIVE)
    ensemble_energy = (-log(Z)-length*log(pf_params->pf_scale))*pf_params->kT/(1000.0 * vc->n_seq);
  else
    ensemble_energy = (-log(Z)-length*log(pf_params->pf_scale))*pf_params->kT/1000.0;

#ifdef SUN4
  standard_arithmetic();
#else
#ifdef HP9
  fpsetfastmode(0);
#endif
#endif

  return ensemble_energy;
}
Example #12
0
// Advances the recognizer by one frame.
//
// After incrementing `time`, the method works in three passes over the
// cbNum token bins:
//   1. (parallel) for every bin whose getPreviousBest() is non-null, build
//      one new token entering codebook i and park it in newTokenBuffer[i];
//   2. extend every token already stored in a bin by one frame
//      (duration + likelihood update);
//   3. add the parked tokens to their bins and prune each bin.
// When `time` becomes 0 only timeZeroInit() is run.
void SimpleSpeechRec::frameSegmentation() {
	time++;
	// First frame: delegate to the dedicated initialisation and stop here.
	// NOTE(review): this implies `time` starts at -1 — confirm in the constructor/reset.
	if (time == 0) {
		timeZeroInit();
		return;
	}

	// One slot per codebook; NULL means "no new token for this codebook".
	SRecToken** newTokenBuffer = new SRecToken*[cbNum];
	for (int i = 0; i < cbNum; i++) {
		newTokenBuffer[i] = NULL;
	}

	// Cache the codebook types up front so the parallel loop only reads this array.
	int* cbTypeLookup = new int[cbNum];
	for (int i = 0; i < cbNum; i++) {
		cbTypeLookup[i] = dict->getCbType(i);
	}

	omp_set_dynamic(true);
	// Each iteration writes only newTokenBuffer[i]; reference counters shared
	// between iterations are updated with InterlockedIncrement.
	// NOTE(review): factory.getInstance() is called concurrently here —
	// presumably thread-safe; confirm.
#pragma omp parallel for 
	for (int i = 0; i < cbNum; i++) {
		STokenBin* bin = binSet[i];
		SRecToken* candToken = bin->getPreviousBest();
		if (!candToken)
			continue;
		// The last codebook index is always treated as DI_TAIL_NOISE.
		int cbType = (i == cbNum-1) ? DI_TAIL_NOISE: cbTypeLookup[i];
		bool isCrossWord = isCrossWordCb(cbType);



		SRecToken* candWord = NULL;
		SRecToken* newToken = factory.getInstance();
		//newToken->copyFrom(candToken);
		if (isCrossWord) {
			// Cross-word entry: snapshot the candidate as a finished word
			// ending at this frame; the copy shares candToken->prev, hence
			// the extra reference on it.
			candWord = factory.getInstance();
			candWord->copyFrom(candToken);
			if (candToken->prev)
				InterlockedIncrement(&candToken->prev->refcnt);
			candWord->endTime = time;
		} else {
			// Word-internal entry: keep the candidate's word history and ids,
			// overriding CId / VId / wordId depending on the codebook type.
			candWord = candToken->prev;
			newToken->CId = candToken->CId;
			newToken->VId = candToken->VId;
			newToken->wordId = candToken->wordId;
			if (dict->triPhone) {	
				if (cbType == INITIAL1) {   
					newToken->CId = dict->getCVIdFromCbId(i);
				} else if (cbType == FINAL0) {
					newToken->VId = dict->getCVIdFromCbId(i);
					newToken->wordId = dict->getWordIdFromCVLink(candToken->currentCbId, i);
				}
			}
			else{
				if (cbType == DI_INITIAL1) {
					newToken->CId = dict->getCVIdFromCbId(i);
				} else if (cbType == DI_FINAL0) {
					newToken->VId = dict->getCVIdFromCbId(i);
					newToken->wordId = dict->getWordIdFromCVLink(candToken->currentCbId, i);
				}
			}


		}
		// The new token has just entered codebook i, so its duration is one frame.
		newToken->currentCbId = i;
		newToken->dur = 1;



		double durLh = useSegmentModel ? bc->getDurLh(i, 1) : 0;
		double stateLh = bc->getStateLh(i, time);
		newToken->lh = candToken->lh + durLh + stateLh;
		newToken->prev = candWord;
		// NOTE(review): in the non-cross-word branch candWord is candToken->prev,
		// which the cross-word branch treats as possibly NULL; if it can be NULL
		// here as well this dereferences a null pointer — confirm.
		InterlockedIncrement(&candWord->refcnt);

		newTokenBuffer[i] = newToken;
	}

	// State dwell: every token already in a bin stays for one more frame.
	for (int i = 0; i < cbNum; i++) {
		STokenBin* bin = binSet.at(i);

		// With m_bHeadNOise set, the last bin is scored with dict->noiseId
		// instead of its own index.
		int k =i;
		if(m_bHeadNOise&&i == cbNum-1)
			k=  dict->noiseId;

		double stateLh = bc->getStateLh(k, time);

		for (auto j = bin->content.begin(); j != bin->content.end(); j++) {
			SRecToken* t = *j;

			t->dur += 1;
			int dur = t->dur;
			double deltaDurLh = useSegmentModel ? bc->getDurLhDelta(k, dur) : 0;

			t->lh += deltaDurLh + stateLh;
		}
	}

	// Complete the state transitions: insert the tokens built above, then prune.
	for (int i = 0; i < cbNum; i++) {
		STokenBin* bin = binSet.at(i);
		if (newTokenBuffer[i] != NULL)
			bin->addToken(newTokenBuffer[i]);
		prune(bin);
	}

	// Only the scratch arrays are released here; the tokens themselves are not deleted.
	delete [] cbTypeLookup;
	delete [] newTokenBuffer;
}
/*
 * Zero the elements a[first], a[first+stride], ... (index < last), then add
 * the element's own index to each of them nr_runs times, and print the
 * wall-clock time taken by the update passes together with the number of the
 * calling OpenMP thread.
 */
static void time_strided_updates(double *a, int first, int last, int stride,
                                 int nr_runs) {
    struct timeval tv1, tv2;
    int i, run_nr;
    int thread_nr = omp_get_thread_num();
    for (i = first; i < last; i += stride)
        a[i] = 0.0;
    gettimeofday(&tv1, NULL);
    for (run_nr = 0; run_nr < nr_runs; run_nr++)
        for (i = first; i < last; i += stride)
            a[i] += i;
    gettimeofday(&tv2, NULL);
    /* the usec difference may be negative; adding the sec difference fixes it */
    printf("thread %d: %.6f\n", thread_nr,
           1.0e-6*(tv2.tv_usec - tv1.tv_usec) +
           (tv2.tv_sec - tv1.tv_sec));
}

/*
 * False-sharing demonstration with two OpenMP sections per experiment.
 *
 * Experiment 1 gives each section one contiguous half of the array;
 * experiment 2 interleaves the sections' elements (even vs. odd indices).
 * Each section prints its own timing, and after each experiment the sum of
 * the array is printed.
 *
 * Returns EXIT_SUCCESS.
 */
int main() {
    const int nr_threads = 2;
    const int n = N;
    const int nr_runs = 20000000;
    double a[n], sum = 0.0;
    int j;
    omp_set_dynamic(0);
    omp_set_num_threads(nr_threads);

    /*
     * n and nr_runs are const, but since OpenMP 4.0 const variables are no
     * longer predetermined shared, so default(none) requires them to be
     * listed.  firstprivate is accepted by both older and newer compilers.
     */
#pragma omp parallel default(none) shared(a) firstprivate(n, nr_runs)
    {
#pragma omp sections
        {
#pragma omp section
            time_strided_updates(a, 0, n/2, 1, nr_runs);
#pragma omp section
            time_strided_updates(a, n/2, n, 1, nr_runs);
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("no false sharing: %.1lf\n", sum);

#pragma omp parallel default(none) shared(a) firstprivate(n, nr_runs)
    {
#pragma omp sections
        {
#pragma omp section
            time_strided_updates(a, 0, n, 2, nr_runs);
#pragma omp section
            time_strided_updates(a, 1, n, 2, nr_runs);
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("false sharing: %.1lf\n", sum);

    return EXIT_SUCCESS;
}
Example #14
0
/*
 * Entry point of the RFI finder.
 *
 * Parses the command line, then either
 *   - computes per-interval / per-channel averages, standard deviations and
 *     maximum powers plus the list of periodic RFI candidates ("birdies")
 *     from the input data, writing them to the .rfi and .stats files, or
 *   - with "-nocompute", reloads those results and the bytemask from the
 *     files of a previous run.
 * Afterwards it calls rfifind_plot() to build the new mask, writes the mask
 * and bytemask files and prints a summary of good / bad / padded intervals
 * and of the most significant and most numerous birdies.
 *
 * Returns 0 on completion; exits early via exit() on usage or
 * identification errors.
 */
int main(int argc, char *argv[])
{
   FILE *bytemaskfile;
   float **dataavg = NULL, **datastd = NULL, **datapow = NULL;
   float *chandata = NULL, powavg, powstd, powmax;
   float inttime, norm, fracterror = RFI_FRACTERROR;
   float *rawdata = NULL;
   unsigned char **bytemask = NULL;
   short *srawdata = NULL;
   char *outfilenm, *statsfilenm, *maskfilenm;
   char *bytemaskfilenm, *rfifilenm;
   int numchan = 0, numint = 0, newper = 0, oldper = 0;
   int blocksperint, ptsperint = 0, ptsperblock = 0, padding = 0;
   int numcands, candnum, numrfi = 0, numrfivect = NUM_RFI_VECT;
   int ii, jj, kk, slen, insubs = 0;
   int harmsum = RFI_NUMHARMSUM, lobin = RFI_LOBIN, numbetween = RFI_NUMBETWEEN;
   double davg, dvar, freq;
   struct spectra_info s;
   presto_interptype interptype;
   rfi *rfivect = NULL;
   mask oldmask, newmask;
   fftcand *cands;
   infodata idata;
   Cmdline *cmd;

   /* Call usage() if we have no command line arguments */

   if (argc == 1) {
      Program = argv[0];
      printf("\n");
      usage();
      exit(0);
   }

   /* Parse the command line using the excellent program Clig */

   cmd = parseCmdline(argc, argv);
   spectra_info_set_defaults(&s);
   s.filenames = cmd->argv;
   s.num_files = cmd->argc;
   // If we are zeroDMing, make sure that clipping is off.
   if (cmd->zerodmP) cmd->noclipP = 1;
   s.clip_sigma = cmd->clip;
   // -1 causes the data to determine if we use weights, scales, &
   // offsets for PSRFITS or flip the band for any data type where
   // we can figure that out with the data
   s.apply_flipband = (cmd->invertP) ? 1 : -1;
   s.apply_weight = (cmd->noweightsP) ? 0 : -1;
   s.apply_scale  = (cmd->noscalesP) ? 0 : -1;
   s.apply_offset = (cmd->nooffsetsP) ? 0 : -1;
   s.remove_zerodm = (cmd->zerodmP) ? 1 : 0;
   if (cmd->noclipP) {
       cmd->clip = 0.0;
       s.clip_sigma = 0.0;
   }
   if (cmd->ifsP) {
       // 0 = default or summed, 1-4 are possible also
       s.use_poln = cmd->ifs + 1;
   }
   /* Room for the root name plus the longest suffix appended below
      ("_rfifind" followed by ".bytemask") and the terminator */
   slen = strlen(cmd->outfile) + 20;

   if (cmd->ncpus > 1) {
#ifdef _OPENMP
      int maxcpus = omp_get_num_procs();
      int openmp_numthreads = (cmd->ncpus <= maxcpus) ? cmd->ncpus : maxcpus;
      // Make sure we are not dynamically setting the number of threads
      omp_set_dynamic(0);
      omp_set_num_threads(openmp_numthreads);
      printf("Using %d threads with OpenMP\n\n", openmp_numthreads);
#endif
   } else {
#ifdef _OPENMP
      omp_set_num_threads(1); // Explicitly turn off OpenMP
#endif
   }

#ifdef DEBUG
   showOptionValues();
#endif

   printf("\n\n");
   printf("               Pulsar Data RFI Finder\n");
   printf("                 by Scott M. Ransom\n\n");

   /* The following is the root of all the output files */

   outfilenm = (char *) calloc(slen, sizeof(char));
   sprintf(outfilenm, "%s_rfifind", cmd->outfile);

   /* And here are the output file names */

   maskfilenm = (char *) calloc(slen, sizeof(char));
   sprintf(maskfilenm, "%s.mask", outfilenm);
   bytemaskfilenm = (char *) calloc(slen, sizeof(char));
   sprintf(bytemaskfilenm, "%s.bytemask", outfilenm);
   rfifilenm = (char *) calloc(slen, sizeof(char));
   sprintf(rfifilenm, "%s.rfi", outfilenm);
   statsfilenm = (char *) calloc(slen, sizeof(char));
   sprintf(statsfilenm, "%s.stats", outfilenm);
   sprintf(idata.name, "%s", outfilenm);

   /* Either take the data type from the command line flags or identify it
      from the files and set the matching flag */
   if (RAWDATA) {
       if (cmd->filterbankP) s.datatype = SIGPROCFB;
       else if (cmd->psrfitsP) s.datatype = PSRFITS;
       else if (cmd->pkmbP) s.datatype = SCAMP;
       else if (cmd->bcpmP) s.datatype = BPP;
       else if (cmd->wappP) s.datatype = WAPP;
       else if (cmd->spigotP) s.datatype = SPIGOT;
   } else {  // Attempt to auto-identify the data
       identify_psrdatatype(&s, 1);
       if (s.datatype==SIGPROCFB) cmd->filterbankP = 1;
       else if (s.datatype==PSRFITS) cmd->psrfitsP = 1;
       else if (s.datatype==SCAMP) cmd->pkmbP = 1;
       else if (s.datatype==BPP) cmd->bcpmP = 1;
       else if (s.datatype==WAPP) cmd->wappP = 1;
       else if (s.datatype==SPIGOT) cmd->spigotP = 1;
       else if (s.datatype==SUBBAND) insubs = 1;
       else {
           printf("Error:  Unable to identify input data files.  Please specify type.\n\n");
           exit(1);
       }
   }

   /* Read an input mask if wanted */
   if (cmd->maskfileP) {
       read_mask(cmd->maskfile, &oldmask);
       printf("Read old mask information from '%s'\n\n", cmd->maskfile);
   } else {
       oldmask.numchan = oldmask.numint = 0;
   }

   if (!cmd->nocomputeP) {

       if (RAWDATA || insubs) {
           char description[40];
           psrdatatype_description(description, s.datatype);
           if (s.num_files > 1)
               printf("Reading %s data from %d files:\n", description, s.num_files);
           else
               printf("Reading %s data from 1 file:\n", description);
           if (insubs) s.files = (FILE **)malloc(sizeof(FILE *) * s.num_files);
           for (ii = 0; ii < s.num_files; ii++) {
               printf("  '%s'\n", cmd->argv[ii]);
               if (insubs) s.files[ii] = chkfopen(cmd->argv[ii], "rb");
           }
           printf("\n");
       }

       if (RAWDATA) {
           read_rawdata_files(&s);
           print_spectra_info_summary(&s);
           spectra_info_to_inf(&s, &idata);
           ptsperblock = s.spectra_per_subint;
           numchan = s.num_channels;
           idata.dm = 0.0;
           writeinf(&idata);
       }

       if (insubs) {
           /* Set-up values if we are using subbands */
           char *tmpname, *root, *suffix;
           if (split_root_suffix(s.filenames[0], &root, &suffix) == 0) {
               printf("Error:  The input filename (%s) must have a suffix!\n\n", s.filenames[0]);
               exit(1);
           }
           if (strncmp(suffix, "sub", 3) == 0) {
               tmpname = calloc(strlen(root) + 6, 1);
               sprintf(tmpname, "%s.sub", root);
               readinf(&idata, tmpname);
               free(tmpname);
           } else {
               printf("\nThe input files (%s) must be subbands!  (i.e. *.sub##)\n\n",
                      s.filenames[0]);
               exit(1);
           }
           free(root);
           free(suffix);
           ptsperblock = 1;
           /* Compensate for the fact that we have subbands and not channels */
           idata.freq = idata.freq - 0.5 * idata.chan_wid +
               0.5 * idata.chan_wid * (idata.num_chan / s.num_files);
           idata.chan_wid = idata.num_chan / s.num_files * idata.chan_wid;
           idata.num_chan = numchan = s.num_files;
           idata.dm = 0.0;
           sprintf(idata.name, "%s", outfilenm);
           writeinf(&idata);
           s.padvals = gen_fvect(s.num_files);
           for (ii = 0 ; ii < s.num_files ; ii++)
               s.padvals[ii] = 0.0;
       }

       if (cmd->maskfileP)
           determine_padvals(cmd->maskfile, &oldmask, s.padvals);

      /* The number of data points and blocks to work with at a time */

      if (cmd->blocksP) {
         blocksperint = cmd->blocks;
         cmd->time = blocksperint * ptsperblock * idata.dt;
      } else {
         blocksperint = (int) (cmd->time / (ptsperblock * idata.dt) + 0.5);
         // Must process at least 1 block at a time
         if (blocksperint==0) blocksperint = 1;
      }
      ptsperint = blocksperint * ptsperblock;
      /* Number of intervals, rounded up so a partial last interval counts */
      numint = (long long) idata.N / ptsperint;
      if ((long long) idata.N % ptsperint)
         numint++;
      inttime = ptsperint * idata.dt;
      printf("Analyzing data sections of length %d points (%.6g sec).\n",
             ptsperint, inttime);
      {
         int *factors, numfactors;

         factors = get_prime_factors(ptsperint, &numfactors);
         printf("  Prime factors are:  ");
         for (ii = 0; ii < numfactors; ii++)
            printf("%d ", factors[ii]);
         printf("\n");
         if (factors[numfactors - 1] > 13) {
            printf("  WARNING:  The largest prime factor is pretty big!  This will\n"
                   "            cause the FFTs to take a long time to compute.  I\n"
                   "            recommend choosing a different -time value.\n");
         }
         printf("\n");
         free(factors);
      }

      /* Allocate our workarrays */

      if (RAWDATA)
          rawdata = gen_fvect(idata.num_chan * ptsperblock * blocksperint);
      else if (insubs)
          srawdata = gen_svect(idata.num_chan * ptsperblock * blocksperint);
      dataavg = gen_fmatrix(numint, numchan);
      datastd = gen_fmatrix(numint, numchan);
      datapow = gen_fmatrix(numint, numchan);
      chandata = gen_fvect(ptsperint);
      bytemask = gen_bmatrix(numint, numchan);
      for (ii = 0; ii < numint; ii++)
         for (jj = 0; jj < numchan; jj++)
            bytemask[ii][jj] = GOODDATA;
      rfivect = rfi_vector(rfivect, numchan, numint, 0, numrfivect);
      if (numbetween == 2)
         interptype = INTERBIN;
      else
         interptype = INTERPOLATE;

      /* Main loop */

      printf("Writing mask data  to '%s'.\n", maskfilenm);
      printf("Writing  RFI data  to '%s'.\n", rfifilenm);
      printf("Writing statistics to '%s'.\n\n", statsfilenm);
      printf("Massaging the data ...\n\n");
      printf("Amount Complete = %3d%%", oldper);
      fflush(stdout);

      for (ii = 0; ii < numint; ii++) { /* Loop over the intervals */
         newper = (int) ((float) ii / numint * 100.0 + 0.5);
         if (newper > oldper) {
            printf("\rAmount Complete = %3d%%", newper);
            fflush(stdout);
            oldper = newper;
         }

         /* Read a chunk of data */

         if (RAWDATA) {
             read_rawblocks(rawdata, blocksperint, &s, &padding);
             // Clip nasty RFI if requested (we are not masking)
             if (s.clip_sigma > 0.0)
                 clip_times(rawdata, ptsperint, s.num_channels, s.clip_sigma, s.padvals);
         } else if (insubs) {
             read_subband_rawblocks(s.files, s.num_files,
                                    srawdata, blocksperint, &padding);
             // TODO: should implement clipping for subbands
         }

         /* A padded read flags every channel of this interval */
         if (padding)
            for (jj = 0; jj < numchan; jj++)
               bytemask[ii][jj] |= PADDING;

         for (jj = 0; jj < numchan; jj++) {     /* Loop over the channels */

             if (RAWDATA)
                 get_channel(chandata, jj, blocksperint, rawdata, &s);
             else if (insubs)
                 get_subband(jj, chandata, srawdata, blocksperint);

            /* Calculate the averages and standard deviations */
            /* for each point in time.                        */

            if (padding) {
                dataavg[ii][jj] = 0.0;
                datastd[ii][jj] = 0.0;
                datapow[ii][jj] = 1.0;
            } else {
               avg_var(chandata, ptsperint, &davg, &dvar);
               dataavg[ii][jj] = davg;
               datastd[ii][jj] = sqrt(dvar);
               realfft(chandata, ptsperint, -1);
               numcands = 0;
               norm = datastd[ii][jj] * datastd[ii][jj] * ptsperint;
               /* Avoid a zero normalization for constant data */
               if (norm == 0.0)
                  norm = (chandata[0] == 0.0) ? 1.0 : chandata[0];
               cands = search_fft((fcomplex *) chandata, ptsperint / 2,
                                  lobin, ptsperint / 2, harmsum,
                                  numbetween, interptype, norm, cmd->freqsigma,
                                  &numcands, &powavg, &powstd, &powmax);
               datapow[ii][jj] = powmax;

               /* Record the birdies */

               if (numcands) {
                  for (kk = 0; kk < numcands; kk++) {
                     freq = cands[kk].r / inttime;
                     candnum = find_rfi(rfivect, numrfi, freq, RFI_FRACTERROR);
                     if (candnum >= 0) {
                        update_rfi(rfivect + candnum, freq, cands[kk].sig, jj, ii);
                     } else {
                        /* New birdie: append it and double the vector when full */
                        update_rfi(rfivect + numrfi, freq, cands[kk].sig, jj, ii);
                        numrfi++;
                        if (numrfi == numrfivect) {
                           numrfivect *= 2;
                           rfivect = rfi_vector(rfivect, numchan, numint,
                                                numrfivect / 2, numrfivect);
                        }
                     }
                  }
                  free(cands);
               }
            }
         }
      }
      printf("\rAmount Complete = 100%%\n");

      /* Write the data to the output files */

      write_rfifile(rfifilenm, rfivect, numrfi, numchan, numint,
                    ptsperint, lobin, numbetween, harmsum,
                    fracterror, cmd->freqsigma);
      write_statsfile(statsfilenm, datapow[0], dataavg[0], datastd[0],
                      numchan, numint, ptsperint, lobin, numbetween);

   } else {                     /* If "-nocompute" */
      float freqsigma;

      /* Read the data from the output files */

      printf("Reading  RFI data  from '%s'.\n", rfifilenm);
      printf("Reading statistics from '%s'.\n", statsfilenm);
      readinf(&idata, outfilenm);
      read_rfifile(rfifilenm, &rfivect, &numrfi, &numchan, &numint,
                   &ptsperint, &lobin, &numbetween, &harmsum,
                   &fracterror, &freqsigma);
      numrfivect = numrfi;
      read_statsfile(statsfilenm, &datapow, &dataavg, &datastd,
                     &numchan, &numint, &ptsperint, &lobin, &numbetween);
      bytemask = gen_bmatrix(numint, numchan);
      printf("Reading  bytemask  from '%s'.\n\n", bytemaskfilenm);
      bytemaskfile = chkfopen(bytemaskfilenm, "rb");
      chkfread(bytemask[0], numint * numchan, 1, bytemaskfile);
      fclose(bytemaskfile);
      for (ii = 0; ii < numint; ii++)
         for (jj = 0; jj < numchan; jj++)
            bytemask[ii][jj] &= PADDING;        /* Clear all but the PADDING bits */
      inttime = ptsperint * idata.dt;
   }

   /* Make the plots and set the mask */

   {
      int *zapints, *zapchan;
      int numzapints = 0, numzapchan = 0;

      /* The zap lists are sized to hold every interval / channel */
      if (cmd->zapintsstrP) {
         zapints = ranges_to_ivect(cmd->zapintsstr, 0, numint - 1, &numzapints);
         zapints = (int *) realloc(zapints, (size_t) (sizeof(int) * numint));
      } else {
         zapints = gen_ivect(numint);
      }
      if (cmd->zapchanstrP) {
         zapchan = ranges_to_ivect(cmd->zapchanstr, 0, numchan - 1, &numzapchan);
         zapchan = (int *) realloc(zapchan, (size_t) (sizeof(int) * numchan));
      } else {
         zapchan = gen_ivect(numchan);
      }
      rfifind_plot(numchan, numint, ptsperint, cmd->timesigma, cmd->freqsigma,
                   cmd->inttrigfrac, cmd->chantrigfrac,
                   dataavg, datastd, datapow, zapchan, numzapchan,
                   zapints, numzapints, &idata, bytemask,
                   &oldmask, &newmask, rfivect, numrfi,
                   cmd->rfixwinP, cmd->rfipsP, cmd->xwinP);

      vect_free(zapints);
      vect_free(zapchan);
   }

   /* Write the new mask and bytemask to the file */

   write_mask(maskfilenm, &newmask);
   bytemaskfile = chkfopen(bytemaskfilenm, "wb");
   chkfwrite(bytemask[0], numint * numchan, 1, bytemaskfile);
   fclose(bytemaskfile);

   /* Determine the percent of good and bad data */

   {
      int numpad = 0, numbad = 0, numgood = 0;
      /* Only the first numrfi entries of rfivect hold recorded birdies (and
         only those get sorted below), so never list more than that.  In
         "-nocompute" mode the vector has exactly numrfi entries, so reading
         ten unconditionally could run past its end. */
      int numtoshow = (numrfi < 10) ? numrfi : 10;

      for (ii = 0; ii < numint; ii++) {
         for (jj = 0; jj < numchan; jj++) {
            if (bytemask[ii][jj] == GOODDATA) {
               numgood++;
            } else {
               if (bytemask[ii][jj] & PADDING)
                  numpad++;
               else
                  numbad++;
            }
         }
      }
      printf("\nTotal number of intervals in the data:  %d\n\n", numint * numchan);
      printf("  Number of padded intervals:  %7d  (%6.3f%%)\n",
             numpad, (float) numpad / (float) (numint * numchan) * 100.0);
      printf("  Number of  good  intervals:  %7d  (%6.3f%%)\n",
             numgood, (float) numgood / (float) (numint * numchan) * 100.0);
      printf("  Number of  bad   intervals:  %7d  (%6.3f%%)\n\n",
             numbad, (float) numbad / (float) (numint * numchan) * 100.0);
      qsort(rfivect, numrfi, sizeof(rfi), compare_rfi_sigma);
      printf("  Ten most significant birdies:\n");
      printf("#  Sigma     Period(ms)      Freq(Hz)       Number \n");
      printf("----------------------------------------------------\n");
      for (ii = 0; ii < numtoshow; ii++) {
         double pperr;
         char temp1[40], temp2[40];

         if (rfivect[ii].freq_var == 0.0) {
            pperr = 0.0;
            sprintf(temp1, " %-14g", rfivect[ii].freq_avg);
            sprintf(temp2, " %-14g", 1000.0 / rfivect[ii].freq_avg);
         } else {
            /* Error on the period (ms) from the frequency error */
            pperr = 1000.0 * sqrt(rfivect[ii].freq_var) /
                (rfivect[ii].freq_avg * rfivect[ii].freq_avg);
            nice_output_2(temp1, rfivect[ii].freq_avg, sqrt(rfivect[ii].freq_var),
                          -15);
            nice_output_2(temp2, 1000.0 / rfivect[ii].freq_avg, pperr, -15);
         }
         printf("%-2d %-8.2f %13s %13s %-8d\n", ii + 1, rfivect[ii].sigma_avg,
                temp2, temp1, rfivect[ii].numobs);
      }
      qsort(rfivect, numrfi, sizeof(rfi), compare_rfi_numobs);
      printf("\n  Ten most numerous birdies:\n");
      printf("#  Number    Period(ms)      Freq(Hz)       Sigma \n");
      printf("----------------------------------------------------\n");
      for (ii = 0; ii < numtoshow; ii++) {
         double pperr;
         char temp1[40], temp2[40];

         if (rfivect[ii].freq_var == 0.0) {
            pperr = 0.0;
            sprintf(temp1, " %-14g", rfivect[ii].freq_avg);
            sprintf(temp2, " %-14g", 1000.0 / rfivect[ii].freq_avg);
         } else {
            pperr = 1000.0 * sqrt(rfivect[ii].freq_var) /
                (rfivect[ii].freq_avg * rfivect[ii].freq_avg);
            nice_output_2(temp1, rfivect[ii].freq_avg, sqrt(rfivect[ii].freq_var),
                          -15);
            nice_output_2(temp2, 1000.0 / rfivect[ii].freq_avg, pperr, -15);
         }
         printf("%-2d %-8d %13s %13s %-8.2f\n", ii + 1, rfivect[ii].numobs,
                temp2, temp1, rfivect[ii].sigma_avg);
      }
      printf("\nDone.\n\n");
   }

   /* Close the files and cleanup */

   free_rfi_vector(rfivect, numrfivect);
   free_mask(newmask);
   if (cmd->maskfileP)
      free_mask(oldmask);
   free(outfilenm);
   free(statsfilenm);
   free(bytemaskfilenm);
   free(maskfilenm);
   free(rfifilenm);
   vect_free(dataavg[0]);
   vect_free(dataavg);
   vect_free(datastd[0]);
   vect_free(datastd);
   vect_free(datapow[0]);
   vect_free(datapow);
   vect_free(bytemask[0]);
   vect_free(bytemask);
   if (!cmd->nocomputeP) {
       //  Close all the raw files and free their vectors
       close_rawfiles(&s);
       vect_free(chandata);
       if (insubs)
           vect_free(srawdata);
       else
           vect_free(rawdata);
   }
   return (0);
}
Example #15
0
File: nco_omp.c Project: hdfeos/nco
int /* O [nbr] Thread number */
nco_openmp_ini /* [fnc] Initialize OpenMP threading environment */
(const int thr_nbr) /* I [nbr] User-requested thread number */
{
  /* Purpose: Initialize OpenMP multi-threading environment
     Honor user-requested thread number, balance against known code efficiency,
     print diagnostics
     Returns thr_nbr=1 in three situations:
     1. UP codes (not threaded)
     2. SMP codes compiled with compilers which lack OpenMP support
     3. SMP codes where single thread requested/advised
     Otherwise returns system-dependent thr_nbr
     Calls nco_exit(EXIT_FAILURE) when thr_nbr is negative or when
     this routine is invoked from within a parallel region */

  /* Using naked stdin/stdout/stderr in parallel region generates warning
     Copy appropriate filehandle to variable scoped shared in parallel clause */

  char *nvr_OMP_NUM_THREADS; /* [sng] Environment variable OMP_NUM_THREADS */
  char *sng_cnv_rcd=NULL_CEWI; /* [sng] strtol()/strtoul() return code */
  FILE * const fp_stderr=stderr; /* [fl] stderr filehandle CEWI */

  nco_bool USR_SPC_THR_RQS=False; /* [flg] User specified positive thread number on command line */

  int dyn_thr=1; /* [flg] Allow system to dynamically set number of threads */

  int ntg_OMP_NUM_THREADS=int_CEWI; // [nbr] OMP_NUM_THREADS environment variable
  int prc_nbr_max; /* [nbr] Maximum number of processors available */
  int thr_nbr_act; /* O [nbr] Number of threads NCO uses */
  int thr_nbr_max_fsh=4; /* [nbr] Maximum number of threads program can use efficiently */
  int thr_nbr_max=int_CEWI; /* [nbr] Maximum number of threads system allows */
  int thr_nbr_rqs=int_CEWI; /* [nbr] Number of threads to request */

#ifndef _OPENMP
  /* Built without OpenMP: report this at nco_dbg_std or higher and return one thread immediately
     Nothing below this block executes in such builds */
  if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO Build compiler lacked (or user turned off) OpenMP support. Code will execute with single thread in Uni-Processor (UP) mode.\n",nco_prg_nm_get());
  return (int)1;
#endif /* !_OPENMP */

  /* Strategy: 
     0. Determine maximum number of threads system will allocate (thr_nbr_max)
     1. Command-line thread request, if any, overrides automatic algorithm
     2. If no command-line request then system allocates OMP_NUM_THREADS if possible
     3. Reduce maximum number of threads available to system to thr_nbr_max_fsh
     Many operators cannot use more than thr_nbr_max_fsh ~ 2--4 threads efficiently
     Play nice: Set dynamic threading so that system can make efficiency decisions
     When dynamic threads are set, system never allocates more than thr_nbr_max_fsh */
  if(thr_nbr < 0){
    (void)fprintf(fp_stderr,"%s: ERROR User-requested thread number = %d is less than zero\n",nco_prg_nm_get(),thr_nbr);
    nco_exit(EXIT_FAILURE);
  } /* endif err */

  /* thr_nbr == 0 is treated as "no explicit request": only an informational hint is printed */
  if(thr_nbr == 0)
    if(nco_dbg_lvl_get() >= nco_dbg_scl && nco_dbg_lvl_get() != nco_dbg_dev )
      (void)fprintf(fp_stderr,"%s: INFO User did not specify thread request > 0 on command line. NCO will automatically assign threads based on OMP_NUM_THREADS environment and machine capabilities.\nHINT: Not specifiying any --thr_nbr (or specifying --thr_nbr=0) causes NCO to try to pick the optimal thread number. Specifying --thr_nbr=1 tells NCO to execute in Uni-Processor (UP) (i.e., single-threaded) mode.\n",nco_prg_nm_get());

  if(thr_nbr > 0) USR_SPC_THR_RQS=True;

  prc_nbr_max=omp_get_num_procs(); /* [nbr] Maximum number of processors available */
  if(omp_in_parallel()){
    (void)fprintf(fp_stderr,"%s: ERROR Attempted to get maximum thread number from within parallel region\n",nco_prg_nm_get());
    nco_exit(EXIT_FAILURE);
  }else{
    thr_nbr_max=omp_get_max_threads(); /* [nbr] Maximum number of threads system allows */
  } /* end error */

  /* Diagnostics only: OMP_NUM_THREADS is read and parsed here solely to be printed
     Its parsed value (ntg_OMP_NUM_THREADS) is not used in thread request computed below */
  if(nco_dbg_lvl_get() >= nco_dbg_scl && nco_dbg_lvl_get() != nco_dbg_dev){
    if((nvr_OMP_NUM_THREADS=getenv("OMP_NUM_THREADS"))) ntg_OMP_NUM_THREADS=(int)strtol(nvr_OMP_NUM_THREADS,&sng_cnv_rcd,NCO_SNG_CNV_BASE10); /* [sng] Environment variable OMP_NUM_THREADS */
    if(nvr_OMP_NUM_THREADS && *sng_cnv_rcd) nco_sng_cnv_err(nvr_OMP_NUM_THREADS,"strtol",sng_cnv_rcd);
    (void)fprintf(fp_stderr,"%s: INFO Environment variable OMP_NUM_THREADS ",nco_prg_nm_get());
    if(ntg_OMP_NUM_THREADS > 0) (void)fprintf(fp_stderr,"= %d\n",ntg_OMP_NUM_THREADS); else (void)fprintf(fp_stderr,"does not exist\n");
    (void)fprintf(fp_stderr,"%s: INFO omp_get_num_procs() reports number of processors available is %d\n",nco_prg_nm_get(),prc_nbr_max);
    (void)fprintf(fp_stderr,"%s: INFO omp_get_max_threads() reports maximum number of threads system allows is %d\n",nco_prg_nm_get(),thr_nbr_max);
  } /* endif dbg */

  if(USR_SPC_THR_RQS){
    /* Try to honor user-specified thread request... */
    thr_nbr_rqs=thr_nbr; /* [nbr] Number of threads to request */
    /* ...if possible... */
    if(nco_dbg_lvl_get() >= nco_dbg_scl) (void)fprintf(fp_stderr,"%s: INFO Command-line requests %d thread%s\n",nco_prg_nm_get(),thr_nbr,(thr_nbr > 1) ? "s" : "");
    /* User requests are capped by thr_nbr_max only, per-operator limit thr_nbr_max_fsh is not applied in this branch */
    if(thr_nbr > thr_nbr_max){
      (void)fprintf(fp_stderr,"%s: WARNING Reducing user-requested thread number = %d to maximum thread number allowed = %d\n",nco_prg_nm_get(),thr_nbr,thr_nbr_max);
      thr_nbr_rqs=thr_nbr_max; /* [nbr] Number of threads to request */
    } /* endif */
  }else{ /* !USR_SPC_THR_RQS */
    /* Otherwise use automatic thread allocation algorithm */

    /* Request maximum number of threads permitted */
    thr_nbr_rqs=thr_nbr_max; /* [nbr] Number of threads to request */

    /* Restrict threading on per-program basis to play nicely with others */
    switch(nco_prg_id_get()){
      /* Operators with pre-set thread limit
	 NB: All operators currently have default restrictions
	 2007: Only ncwa and ncap2 have a chance to scale on non-parallel filesystems
	 ncap2 may, one day, see a big performance boost from threading
	 However, as of 20090327, ncap2 threading may be buggy due to ANTLR
	 Moreover, we want to prevent hogging processes on 32-way nodes
	 until/unless clear benefits of threading are demonstrated.
	 2015: Threads improve ncks regridding performance by 2-3x on ACME ~1-20 GB netCDF3 files */
    case ncap: 
      /* 20090327: Restrict ncap2 to one thread until ANTLR threading resolved */
      thr_nbr_max_fsh=1;
      break;
    case ncecat: 
    case ncrcat: 
      /* ncecat and ncrcat are extremely I/O intensive 
	 Maximum efficiency when one thread reads from input file while other writes to output file */
      // 20140219: Turn-off OpenMP until thoroughly tested
      // thr_nbr_max_fsh=2;
      thr_nbr_max_fsh=1;
      break;
    case ncks: 
      // 20150529: Turn-on OpenMP for regridder
      thr_nbr_max_fsh=16;
      break;
    case ncwa: 
      // 20150530: Turn-on OpenMP for debugging
      // 20150610: Eight threads with ncwa seemed to work for a little while, then it got flaky. Turned-off for 4.5.0 release
      // 20150622: Allowing eight threads again for debugging with -D 3
      // 20150701: Firmly established that netCDF4 involvement hoses threading because HDF5 is not threadsafe by default
      // 20150710: Turned-off for 4.5.1 release
      // Symptoms of bugs, if any, show up with
      // cd ~/nco/bm;nco_bm.pl --regress ncwa;cd -
      thr_nbr_max_fsh=1;
      /* NB: Debug-level override currently assigns same value (1) so it has no effect */
      if(nco_dbg_lvl_get() >= nco_dbg_scl) thr_nbr_max_fsh=1;
      break;
      /* Operators with higher maximum pre-set thread limit (NB: not all of these are threaded!) */
    case ncra:
      thr_nbr_max_fsh=1;
      /* NB: Debug-level override currently assigns same value (1) so it has no effect */
      if(nco_dbg_lvl_get() >= nco_dbg_scl) thr_nbr_max_fsh=1;
      break;
    case ncbo: 
    case ncatted: 
    case ncfe:
    case ncflint: 
    case ncpdq: 
    case ncrename: 
    case ncge:
      // 20140219: Turn-off OpenMP until thoroughly tested
      // thr_nbr_max_fsh=4;
      thr_nbr_max_fsh=1;
      break;
    default: nco_dfl_case_prg_id_err(); break;
    } /* end case */
    
    /* Automatic algorithm tries to play nice with others */
    (void)omp_set_dynamic(dyn_thr); /* [flg] Allow system to dynamically set number of threads */
    if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO omp_set_dynamic() used to %s OS to dynamically set threads\n",nco_prg_nm_get(),(dyn_thr ? "ALLOW" : "DISALLOW"));
    /* Read setting back so diagnostic reports what runtime actually accepted */
    dyn_thr=omp_get_dynamic(); /* [flg] Allow system to dynamically set number of threads */
    if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO omp_get_dynamic() reports system will%s utilize dynamic threading\n",nco_prg_nm_get(),(dyn_thr ? "" : " NOT"));

    /* Apply program/system limitations */
    if(thr_nbr_max > thr_nbr_max_fsh){
      if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO Reducing default thread number from %d to %d, an operator-dependent \"play-nice\" number set in nco_openmp_ini()\n",nco_prg_nm_get(),thr_nbr_max,thr_nbr_max_fsh);
      thr_nbr_rqs=thr_nbr_max_fsh; /* [nbr] Number of threads to request */
    } /* endif */      
  } /* !USR_SPC_THR_RQS */

  /* In netCDF4-enabled builds every operator except ncks, ncwa, and ncra is forced to one thread
     Explanatory warning is printed (to stdout) only for explicit user requests at nco_dbg_fl or higher */
#ifdef ENABLE_NETCDF4
  if(nco_prg_id_get() != ncks && nco_prg_id_get() != ncwa && nco_prg_id_get() != ncra && thr_nbr_rqs > 1){
    if(USR_SPC_THR_RQS && nco_dbg_lvl_get() >= nco_dbg_fl) (void)fprintf(stdout,"%s: WARNING This is TODO nco939. Requested threading with netCDF4 (HDF5) support. The NCO thread request algorithm considers user-input, environment variables, and software and hardware limitations in determining the number of threads to request, thr_nbr_rqs. At this point NCO would request result %d threads from a netCDF3-based library. However, this NCO was built with netCDF4, which relies on HDF5. netCDF4 is not thread-safe unless HDF5 is configured with the (non-default) --enable-threadsafe option. NCO currently has no way to know whether HDF5 was built thread-safe. Hence, all netCDF4-based operators are currently restricted to a single thread. The program will now automatically set thr_nbr_rqs = 1.\nThis unfortunate limitation is necessary to keep the NCO developers sane. If you want/need threading in netCDF4-based NCO, please politely yet firmly request of the Unidata netCDF developers that better thread support be built into netCDF4, and request of the HDF5 developers that they make the --enable-threadsafe option compatible with all HDF5 libraries and APIs, including Fortran (which, as of HDF5 1.8.0 in 2008, is incompatible with --enable-threadsafe).\n",nco_prg_nm_get(),thr_nbr_rqs);
    thr_nbr_rqs=1;
  } /* endif */
#endif /* !ENABLE_NETCDF4 */

  /* Set thread number */
  if(omp_in_parallel()){
    (void)fprintf(fp_stderr,"%s: ERROR Attempted to set thread number from within parallel region\n",nco_prg_nm_get());
    nco_exit(EXIT_FAILURE);
  }else{
    (void)omp_set_num_threads(thr_nbr_rqs); 
    if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO omp_set_num_threads() used to set execution environment to spawn teams of %d thread(s)\n",nco_prg_nm_get(),thr_nbr_rqs);
  } /* end error */

  /* Default return value is what runtime reports after request has been applied */
  thr_nbr_act=omp_get_max_threads();
  if(nco_dbg_lvl_get() >= nco_dbg_scl) (void)fprintf(fp_stderr,"%s: INFO After using omp_set_num_threads() to adjust for any user requests/NCO optimizations, omp_get_max_threads() reports that a parallel construct here/now would spawn %d thread(s)\n",nco_prg_nm_get(),thr_nbr_act);
#ifdef _OPENMP
  /* At nco_dbg_scl or higher, spawn small test team and overwrite thr_nbr_act with its actual size
     Only one thread (single construct) performs assignment and print */
  if(nco_dbg_lvl_get() >= nco_dbg_scl){
# pragma omp parallel default(none) shared(thr_nbr_act)
    { /* begin OpenMP parallel */
# pragma omp single nowait
      { /* begin OpenMP single */
	thr_nbr_act=omp_get_num_threads(); /* [nbr] Number of threads NCO uses */
	if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: INFO Small parallel test region spawned team of %d thread(s)\n",nco_prg_nm_get(),thr_nbr_act);
      } /* end OpenMP single */
    } /* end OpenMP parallel */
  } /* end dbg */
#endif /* !_OPENMP */
  
  /* Issue any warnings about OpenMP credibility during debugging phase */
  /* Outer if(True) is always taken, warning itself applies only to ncwa and ncra running with more than one thread */
  if(True)
     if((nco_prg_id_get() == ncwa || nco_prg_id_get() == ncra) && thr_nbr_act > 1)
      if(nco_dbg_lvl_get() >= nco_dbg_std) (void)fprintf(fp_stderr,"%s: WARNING OpenMP threading active with %d threads but not guaranteed to work on this operator. If strange behavior (e.g., NaN results) ensues, manually turn-off multi-threading by specifying \"-t 1\" option.\n",nco_prg_nm_get(),thr_nbr_act);

  return thr_nbr_act; /* O [nbr] Number of threads NCO uses */
} /* end nco_openmp_ini() */
Example #16
0
/*
 * OpenMP test program "lastprivate 004".
 *
 * Runs a statically scheduled loop (chunk size 1) of thds iterations with
 * prvt1, prvt2 and prvt3 listed in two lastprivate clauses, and checks that
 *   - inside the loop each iteration sees only its own value of the three
 *     variables, and
 *   - after the loop the three variables hold the value written by the
 *     sequentially last iteration (thds - 1).
 * The same checks are then delegated to func(), once from a parallel region
 * and once from serial code.
 *
 * thds, errors, prvt1..prvt3, barrier(), waittime() and func() are defined
 * elsewhere in the file.
 *
 * Returns 0 and prints SUCCESS when no error was counted, 1 and FAILED
 * otherwise.  Exits with status 0 without testing when only one thread is
 * available.
 */
int main (void)
{
    int	i;


    thds = omp_get_max_threads ();
    if (thds == 1) {
        printf ("should be run this program on multi threads.\n");
        exit (0);
    }
    /* keep the team size fixed so that the loop below gets exactly thds threads */
    omp_set_dynamic (0);


    #pragma omp parallel
    {
        #pragma omp for schedule(static,1) lastprivate (prvt1,prvt2) lastprivate (prvt3)
        for (i=0; i<thds; i++) {
            prvt1 = i;
            prvt2 = i;
            prvt3 = i;
            /* presumably waits until thds threads have arrived, so every
               thread has written its private copies before any of them is
               checked -- TODO confirm against barrier() */
            barrier (thds);
            if (prvt1 != i) {
                #pragma omp critical
                errors += 1;
            }
            if (prvt2 != i) {
                #pragma omp critical
                errors += 1;
            }
            if (prvt3 != i) {
                #pragma omp critical
                errors += 1;
            }
            /* presumably delays iteration 0 so that it is not the last one
               to finish in time -- TODO confirm against waittime() */
            if (i==0) {
                waittime (1);
            }
            prvt1 = i;
            prvt2 = i;
            prvt3 = i;
        }

        /* lastprivate must have copied out the values of iteration thds-1 */
        if (prvt1 != thds - 1) {
            #pragma omp critical
            errors += 1;
        }
        if (prvt2 != thds - 1) {
            #pragma omp critical
            errors += 1;
        }
        if (prvt3 != thds - 1) {
            #pragma omp critical
            errors += 1;
        }
    }


    #pragma omp parallel
    func (thds);


    func (1);


    if (errors == 0) {
        printf ("lastprivate 004 : SUCCESS\n");
        return 0;
    } else {
        printf ("lastprivate 004 : FAILED\n");
        return 1;
    }
}
Example #17
0
/*
 * Explicit time stepping of a 3-D grid problem, parallelised with OpenMP.
 *
 * The grid has Sx*Sy*Sz points on the box [x0,xn] x [y0,yn] x [z0,zn].
 * Interior points start from u(x,y,z); every time step computes
 *     next = prev + dt*(second differences in x, y and z + f(x,y,z) - prev)
 * for the interior points.  Boundary points are never written, so they stay
 * at the zero they were initialised with.
 *
 * One-dimensional profiles through grid index 49 are written to filex.txt,
 * filey.txt and filez.txt before and after the time loop, and the time spent
 * in the loop and in the whole program is printed to stdout.
 *
 * Returns 0 on success and 1 if any of the output files cannot be opened.
 */
int main()
{
    double AllTime=PortableGetTime();
    double x0=0.0, y0=0.0, z0=0.0;
    double xn=10.0, yn=10.0, zn=10.0;
    int Sx=300, Sy=300, Sz=300, St=100;
    const int plane=Sx*Sy;   // distance in the arrays between neighbouring z-layers
    const int probe=49;      // grid index the three output profiles pass through
    double dx=(xn-x0)/Sx, dy=(yn-y0)/Sy, dz=(zn-z0)/Sz;

    FILE* filex=fopen("filex.txt","w");
    FILE* filey=fopen("filey.txt","w");
    FILE* filez=fopen("filez.txt","w");
    if (!filex || !filey || !filez)
    {
        // Writing through a null FILE* would crash, so stop here instead.
        fprintf(stderr, "cannot open filex.txt, filey.txt or filez.txt for writing\n");
        if (filex) fclose(filex);
        if (filey) fclose(filey);
        if (filez) fclose(filez);
        return 1;
    }

    double * masprev=new double[Sx*Sy*Sz];
    double * masnext=new double[Sx*Sy*Sz];

    double dt=0.00001;       // time step
    omp_set_dynamic(0);      // do not let the OpenMP runtime change the thread count while running
    omp_set_num_threads(4);  // use 4 threads
    memset(masprev, 0, Sx*Sy*Sz*sizeof(double));
    memset(masnext, 0, Sx*Sy*Sz*sizeof(double));

    // Initial condition on the interior points only.
    for (int x=1; x<Sx-1; x++)
        for(int y=1; y<Sy-1; y++)
            for(int z=1; z<Sz-1; z++)
                masprev[x+y*Sx+z*plane]=u(x0+dx*x, y0+dy*y, z0+dz*z);

    // File header (grid spacing, point count) followed by the initial profile.
    fprintf(filex,"%e\n", dx);
    fprintf(filey,"%e\n", dy);
    fprintf(filez,"%e\n", dz);
    fprintf(filex,"%i\n", Sx);
    fprintf(filey,"%i\n", Sy);
    fprintf(filez,"%i\n", Sz);
    for(int x=0; x<Sx; x++)
        fprintf(filex,"%lf ", masprev[x+probe*Sx+probe*plane]);
    for(int y=0; y<Sy; y++)
        fprintf(filey,"%lf ", masprev[probe+y*Sx+probe*plane]);
    for(int z=0; z<Sz; z++)
        fprintf(filez,"%lf ", masprev[probe+probe*Sx+z*plane]);
    fprintf(filex,"\n");
    fprintf(filey,"\n");
    fprintf(filez,"\n");

    double Time=PortableGetTime();

    for (int t=1; t<St; t++)
    {
        // z-layers are independent within one time step: masnext is only
        // written, masprev is only read.
        #pragma omp parallel for
        for (int z=1; z<Sz-1; z++)
        {
            for(int y=1; y<Sy-1; y++)
            {
                for(int x=1; x<Sx-1; x++)
                {
                    const int c=x+y*Sx+z*plane;   // index of the centre point
                    masnext[c]=
                        dt*((masprev[c+1]-2*masprev[c]+masprev[c-1])/(dx*dx)
                       +(masprev[c+Sx]-2*masprev[c]+masprev[c-Sx])/(dy*dy)
                       +(masprev[c+plane]-2*masprev[c]+masprev[c-plane])/(dz*dz)
                       +f(x0+dx*x, y0+dy*y, z0+dz*z)-masprev[c])+masprev[c];
                }
            }
        }

        // The freshly computed layer becomes the input of the next step.
        double* tmp=masprev;
        masprev=masnext;
        masnext=tmp;
    }

    Time=PortableGetTime()-Time;

    // Final profile, in the same layout as the initial one.
    fprintf(filex,"%i\n", Sx);
    fprintf(filey,"%i\n", Sy);
    fprintf(filez,"%i\n", Sz);
    for(int x=0; x<Sx; x++)
        fprintf(filex,"%lf ", masprev[x+probe*Sx+probe*plane]);
    for(int y=0; y<Sy; y++)
        fprintf(filey,"%lf ", masprev[probe+y*Sx+probe*plane]);
    for(int z=0; z<Sz; z++)
        fprintf(filez,"%lf ", masprev[probe+probe*Sx+z*plane]);
    fprintf(filex,"\n");
    fprintf(filey,"\n");
    fprintf(filez,"\n");

    AllTime=PortableGetTime()-AllTime;
    printf(" %lf \n %lf \n",Time, AllTime);
    fclose(filex);
    fclose(filey);
    fclose(filez);

    delete[] masprev;
    delete[] masnext;

    return 0;
}
Example #18
0
main ()
{
  int	i;


  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  omp_set_dynamic (0);


  rdct_inc = rdct_inc2 = rdct_pls = rdct_pls2 = rdct_pls3 = 1;
  rdct_dec = rdct_dec2 = rdct_mns = rdct_mns2 = 2;
  rdct_mul = rdct_mul2 = rdct_mul3 = 3;
  rdct_land = rdct_land2 = rdct_land3 = -2;
  rdct_lor = rdct_lor2 = rdct_lor3 = 1;
  rdct_xor = rdct_xor2 = rdct_xor3 = 2;
  rdct_and = rdct_and2 = 1;
  rdct_or = rdct_or2 = 0;

  #pragma omp parallel for reduction(+:rdct_inc,rdct_inc2,rdct_pls,rdct_pls2,rdct_pls3) \
			   reduction(-:rdct_dec,rdct_dec2,rdct_mns,rdct_mns2) \
			   reduction(*:rdct_mul,rdct_mul2,rdct_mul3) \
			   reduction(&:rdct_land,rdct_land2,rdct_land3) \
			   reduction (|:rdct_lor,rdct_lor2,rdct_lor3) \
			   reduction (^:rdct_xor,rdct_xor2,rdct_xor3) \
			   reduction (&&:rdct_and,rdct_and2) \
			   reduction (||:rdct_or,rdct_or2)
  for (i=0; i<LOOPNUM; i++) {

    rdct_inc ++;
    ++ rdct_inc2;
    rdct_pls += i;
    rdct_pls2 = rdct_pls2 + i;
    rdct_pls3 = i + rdct_pls3;

    rdct_dec --;
    -- rdct_dec2;
    rdct_mns -= i;
    rdct_mns2 = rdct_mns2 - i;

    rdct_mul *= i;
    rdct_mul2 = rdct_mul2 * i;
    rdct_mul3 = i * rdct_mul3;

    rdct_land &= 1<<i;
    rdct_land2 = rdct_land2 & (1<<i);
    rdct_land3 = (1<<i) & rdct_land3;

    rdct_lor |= 1<<i;
    rdct_lor2 = rdct_lor2 | (1<<i);
    rdct_lor3 = (1<<i) | rdct_lor3;

    rdct_xor ^= 1<<i;
    rdct_xor2 = rdct_xor2 ^ (1<<i);
    rdct_xor3 = (1<<i) ^ rdct_xor3;

    rdct_and = rdct_and && i;
    rdct_and2 = (i+1) && rdct_and2;

    rdct_or = rdct_or || i;
    rdct_or2 = 0 || rdct_or2;

    if (sizeof(rdct_inc) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_inc2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_pls) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_pls2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_pls3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_dec) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_dec2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mns) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mns2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mul) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mul2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_mul3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_land) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_land2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_land3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_lor) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_lor2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_lor3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_xor) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_xor2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_xor3) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_and) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_and2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_or) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(rdct_or2) != sizeof(long long)) {
      #pragma omp critical
      errors += 1;
    }
  }


  rst_inc = rst_inc2 = rst_pls = rst_pls2 = rst_pls3 = 1;
  rst_dec = rst_dec2 = rst_mns = rst_mns2 = 2;
  rst_mul = rst_mul2 = rst_mul3 = 3;
  rst_land = rst_land2 = rst_land3 = -2;
  rst_lor = rst_lor2 = rst_lor3 = 1;
  rst_xor = rst_xor2 = rst_xor3 = 2;
  rst_and = rst_and2 = 1;
  rst_or = rst_or2 = 0;

  for (i=0; i<LOOPNUM; i++) {

    rst_inc ++;
    ++ rst_inc2;
    rst_pls += i;
    rst_pls2 = rst_pls2 + i;
    rst_pls3 = i + rst_pls3;

    rst_dec --;
    -- rst_dec2;
    rst_mns -= i;
    rst_mns2 = rst_mns2 - i;

    rst_mul *= i;
    rst_mul2 = rst_mul2 * i;
    rst_mul3 = i * rst_mul3;

    rst_land &= 1<<i;
    rst_land2 = rst_land2 & (1<<i);
    rst_land3 = (1<<i) & rst_land3;

    rst_lor |= 1<<i;
    rst_lor2 = rst_lor2 | (1<<i);
    rst_lor3 = (1<<i) | rst_lor3;

    rst_xor ^= 1<<i;
    rst_xor2 = rst_xor2 ^ (1<<i);
    rst_xor3 = (1<<i) ^ rst_xor3;

    rst_and = rst_and && i;
    rst_and2 = (i+1) && rst_and2;

    rst_or = rst_or || i;
    rst_or2 = 0 || rst_or2;
  }

  if (rst_inc != rdct_inc) {
    errors += 1;
  }
  if (rst_inc2 != rdct_inc2) {
    errors += 1;
  }
  if (rst_pls != rdct_pls) {
    errors += 1;
  }
  if (rst_pls2 != rdct_pls2) {
    errors += 1;
  }
  if (rst_pls3 != rdct_pls3) {
    errors += 1;
  }
  if (rst_dec != rdct_dec) {
    errors += 1;
  }
  if (rst_dec2 != rdct_dec2) {
    errors += 1;
  }
  if (rst_mns != rdct_mns) {
    errors += 1;
  }
  if (rst_mns2 != rdct_mns2) {
    errors += 1;
  }
  if (rst_mul != rdct_mul) {
    errors += 1;
  }
  if (rst_mul2 != rdct_mul2) {
    errors += 1;
  }
  if (rst_mul3 != rdct_mul3) {
    errors += 1;
  }
  if (rst_land != rdct_land) {
    errors += 1;
  }
  if (rst_land2 != rdct_land2) {
    errors += 1;
  }
  if (rst_land3 != rdct_land3) {
    errors += 1;
  }
  if (rst_lor != rdct_lor) {
    errors += 1;
  }
  if (rst_lor2 != rdct_lor2) {
    errors += 1;
  }
  if (rst_lor3 != rdct_lor3) {
    errors += 1;
  }
  if (rst_xor != rdct_xor) {
    errors += 1;
  }
  if (rst_xor2 != rdct_xor2) {
    errors += 1;
  }
  if (rst_xor3 != rdct_xor3) {
    errors += 1;
  }
  if (rst_and != rdct_and) {
    errors += 1;
  }
  if (rst_and2 != rdct_and2) {
    errors += 1;
  }
  if (rst_or != rdct_or) {
    errors += 1;
  }
  if (rst_or2 != rdct_or2) {
    errors += 1;
  }


  if (errors == 0) {
    printf ("reduction 016 : SUCCESS\n");
    return 0;
  } else {
    printf ("reduction 016 : FAILED\n");
    return 1;
  }
}
// Test entry point for products of large, dynamically sized matrices.
// Runs product() g_repeat times on randomly sized matrices of several scalar
// types and storage orders, then (in part 6 of the split test) a set of
// targeted regression checks, and finally, when OpenMP is available, repeats
// the float product test with dynamic thread adjustment enabled.
void test_product_large()
{
  for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_1( product(MatrixXf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
    CALL_SUBTEST_2( product(MatrixXd(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
    CALL_SUBTEST_3( product(MatrixXi(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
    // complex matrices are drawn from half the size range
    CALL_SUBTEST_4( product(MatrixXcf(internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2), internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2))) );
    CALL_SUBTEST_5( product(Matrix<float,Dynamic,Dynamic,RowMajor>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );

    CALL_SUBTEST_1( test_aliasing<float>() );
  }

#if defined EIGEN_TEST_PART_6
  {
    // test a specific issue in DiagonalProduct
    // (diagonal built from an expression, times a tall 1000000 x 3 matrix,
    // assigned back to that same matrix)
    int N = 1000000;
    VectorXf v = VectorXf::Ones(N);
    MatrixXf m = MatrixXf::Ones(N,3);
    m = (v+v).asDiagonal() * m;
    VERIFY_IS_APPROX(m, MatrixXf::Constant(N,3,2));
  }

  {
    // test deferred resizing in Matrix::operator=
    // (a is 10x4 on the right-hand side but must become 10x10 as the result)
    MatrixXf a = MatrixXf::Random(10,4), b = MatrixXf::Random(4,10), c = a;
    VERIFY_IS_APPROX((a = a * b), (c * b).eval());
  }

  {
    // check the functions to setup blocking sizes compile and do not segfault
    // FIXME check they do what they are supposed to do !!
    std::ptrdiff_t l1 = internal::random<int>(10000,20000);
    std::ptrdiff_t l2 = internal::random<int>(100000,200000);
    std::ptrdiff_t l3 = internal::random<int>(1000000,2000000);
    setCpuCacheSizes(l1,l2,l3);
    // only the l1 and l2 values are read back; l3 is set but not verified
    VERIFY(l1==l1CacheSize());
    VERIFY(l2==l2CacheSize());
    std::ptrdiff_t k1 = internal::random<int>(10,100)*16;
    std::ptrdiff_t m1 = internal::random<int>(10,100)*16;
    std::ptrdiff_t n1 = internal::random<int>(10,100)*16;
    // only makes sure it compiles fine
    internal::computeProductBlockingSizes<float,float,std::ptrdiff_t>(k1,m1,n1,1);
  }

  {
    // test regression in row-vector by matrix (bad Map type)
    MatrixXf mat1(10,32); mat1.setRandom();
    MatrixXf mat2(32,32); mat2.setRandom();
    MatrixXf r1 = mat1.row(2)*mat2.transpose();
    VERIFY_IS_APPROX(r1, (mat1.row(2)*mat2.transpose()).eval());

    MatrixXf r2 = mat1.row(2)*mat2;
    VERIFY_IS_APPROX(r2, (mat1.row(2)*mat2).eval());
  }

  {
    // A^80 computed two ways: C by 79 successive multiplications starting
    // from A, B as one deeply nested expression (two factors of ten
    // ((A*A)*(A*A)) groups each) evaluated in a single noalias assignment.
    Eigen::MatrixXd A(10,10), B, C;
    A.setRandom();
    C = A;
    for(int k=0; k<79; ++k)
      C = C * A;
    B.noalias() = (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)))
                * (((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)) * ((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A))*((A*A)*(A*A)));
    VERIFY_IS_APPROX(B,C);
  }
#endif

  // Regression test for bug 714:
  // the float product test is repeated with OpenMP dynamic thread
  // adjustment switched on
#if defined EIGEN_HAS_OPENMP
  omp_set_dynamic(1);
  for(int i = 0; i < g_repeat; i++) {
    CALL_SUBTEST_6( product(Matrix<float,Dynamic,Dynamic>(internal::random<int>(1,EIGEN_TEST_MAX_SIZE), internal::random<int>(1,EIGEN_TEST_MAX_SIZE))) );
  }
#endif
}
/*
 * Solves the discretised problem REPEATS times with Newton's method and
 * returns the average wall-clock time of one solve.
 *
 * Each Newton iteration builds a tridiagonal system for the correction dy
 * (coefficients A, B, C, right-hand side G), eliminates unknowns in R levels
 * with stride 2^j, solves the remaining centre unknowns and substitutes back
 * level by level, then updates y -= dy.  Iteration stops when norm(dy) drops
 * below epsilon.  The loops over grid points are shared between OpenMP
 * threads.
 *
 * threads_num  number of OpenMP threads to use
 * b            right boundary value; the initial guess is the straight line
 *              from 1.0 at i = 0 to b at i = N
 *
 * When threads_num == 1 the solution is also written to
 * "prog_1_b_<b>_results.txt"; if that file cannot be named or opened a
 * message is printed to stderr and the timing is still returned.
 *
 * N, R, REPEATS, h, epsilon, my_func() and norm() are defined elsewhere in
 * the file.
 * NOTE(review): the calloc results are used without a NULL check.
 */
double run(int threads_num, double b)
{
    int i, j, step, k;
    double* y = calloc(N + 1, sizeof(double)); /* grid solution */
    double* dy = calloc(N + 1, sizeof(double)); /* difference y^n - y^(n+1) between two successive Newton iterates */
    double *A[R], *B[R], *C[R], *G[R]; /* tridiagonal-system coefficients for every reduction level */
    double begin, end;
    omp_set_dynamic(0); /* the runtime must not change the number of threads */
    omp_set_num_threads(threads_num); /* use the number of threads requested by the caller */
    for(i = 0; i < R; i++)
    {
        A[i] = calloc(N + 1, sizeof(double));
        B[i] = calloc(N + 1, sizeof(double));
        C[i] = calloc(N + 1, sizeof(double));
        G[i] = calloc(N + 1, sizeof(double));
    }
    begin = omp_get_wtime(); /* start of the timed section */
    for(k = 0; k < REPEATS; k++)
    {
        /* step is assigned by every thread inside the region, so it has to be
           private just like the loop counters; sharing it is a data race */
        #pragma omp parallel private(i, j, step)
        {
            #pragma omp for
            for(i = 0; i <= N; i++) y[i] = 1.0 + (b - 1.0) * i / N; /* initial guess */
            #pragma omp single
            {
                dy[0] = dy[N] = 0.0;
                for(j = 0; j < R; j++) B[j][0] = B[j][N] = 1.0; /* the boundary entries of the matrix are the same in every Newton iteration */
            }
            while(1) /* Newton iterations */
            {
                #pragma omp for
                for(i = 1; i < N; i++) /* coefficients before any reduction */
                {
                    B[0][i] = (-2.0 / (h * h) - 5 * exp(y[i]) / 6);
                    A[0][i] = (1.0 / (h * h) - exp(y[i - 1]) / 12);
                    C[0][i] = (1.0 / (h * h) - exp(y[i + 1]) / 12);
                    G[0][i] = my_func(y, b, i);
                }
                for(j = 1; j < R; j++) /* coefficients after each reduction level */
                {
                    step = 1 << j; /* stride between the unknowns kept at this level */
                    #pragma omp for
                    for(i = step; i < N; i += step)
                    {
                        B[j][i] = B[j - 1][i] - A[j - 1][i] * C[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * A[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                        A[j][i] = - A[j - 1][i] * A[j - 1][i - step / 2] / B[j - 1][i - step / 2];
                        C[j][i] = - C[j - 1][i] * C[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                        G[j][i] = G[j - 1][i] - A[j - 1][i] * G[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * G[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                    }
                } /* reduction finished */
                #pragma omp single
                {
                    dy[N / 2] = G[R - 1][N / 2] / B[R - 1][N / 2]; /* first back-substitution step */
                    dy[N / 4] = (G[R - 2][N / 4] - C[R - 2][N / 4] * dy[N / 2]) / B[R - 2][N / 4];
                    dy[N * 3 / 4] = (G[R - 2][N * 3 / 4] - A[R - 2][N * 3 / 4] * dy[N / 2] ) / B[R - 2][N * 3 / 4]; /* second back-substitution step */
                }
                for(j = R - 3; j >= 0; j--)
                {
                    step = 1 << j;
                    #pragma omp for
                    for(i = step; i < N; i += 2 * step) dy[i] = (G[j][i] - C[j][i] * dy[i + step] - A[j][i] * dy[i - step]) / B[j][i];
                } /* remaining back-substitution steps */
                #pragma omp for
                for(i = 0; i <= N; i++) y[i] -= dy[i]; /* one Newton update */
                /* every thread evaluates the same test on the same dy, so
                   all of them leave the loop in the same iteration */
                if (norm(dy) < epsilon) break; /* Newton stopping criterion */
            }
        }
    }
    end = omp_get_wtime(); /* end of the timed section */
    for(i = 0; i < R; i++)
    {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        free(G[i]);
    }
    if(threads_num == 1)
    {
        char str_dest[50];
        /* %f can expand to hundreds of digits for a large b, so the length
           is bounded and checked instead of being written blindly */
        int len = snprintf(str_dest, sizeof str_dest, "prog_1_b_%f_results.txt", b);
        if(len < 0 || (size_t) len >= sizeof str_dest)
        {
            fprintf(stderr, "run: result file name for b=%e does not fit, results not written\n", b);
        }
        else
        {
            FILE* fp = fopen(str_dest, "w"); /* write the computed function to a file */
            if(fp == NULL)
            {
                fprintf(stderr, "run: cannot open %s for writing\n", str_dest);
            }
            else
            {
                fprintf(fp, "X\tY\r\n");
                for(i = 0; i <= N; i++) fprintf(fp, "%e\t%e\r\n", ((double) i / N), y[i]);
                fclose(fp);
            }
        }
    }
    free(y);
    free(dy);
    return (end - begin)/REPEATS;
}
Example #21
0
/*
 * One quickprop training epoch computed with threadnumb OpenMP threads.
 *
 * Phase 1: the network is copied once per thread; the training patterns are
 *          divided between the threads and each thread accumulates slopes
 *          and MSE in the copy indexed by its thread number.
 * Phase 2: for every connection the slopes of all copies are summed (and
 *          reset), the quickprop step is derived from the summed slope and
 *          the previous step/slope, and the weight of ann is updated and
 *          limited to [-1500, 1500].
 * Phase 3: the MSE counters of the copies are added to ann and the copies
 *          are destroyed.
 *
 * Returns the value of fann_get_MSE(ann) after the merge.
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_quickprop_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
	struct fann **replicas = (struct fann **) malloc(threadnumb * sizeof(struct fann *));
	int i = 0, tid = 0;

	/* the step/slope history must exist before it is read in phase 2 */
	if(ann->prev_train_slopes == NULL)
		fann_clear_train_arrays(ann);

	fann_reset_MSE(ann);

	/* ---- phase 1: per-thread copies and slope accumulation ---- */
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel private(tid)
	{
		/* the thread number does not change inside the region */
		tid = omp_get_thread_num();

		#pragma omp for schedule(static)
		for(i = 0; i < (int)threadnumb; i++)
			replicas[i] = fann_copy(ann);

		#pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			struct fann *net = replicas[tid];

			if(ann->do_dropout)
				fann_run_dropout(net, data->input[i]);
			else
				fann_run(net, data->input[i]);

			fann_compute_MSE(net, data->output[i]);
			fann_backpropagate_MSE(net);
			fann_update_slopes_batch(net, net->first_layer + 1, net->last_layer - 1);
		}
	}

	/* ---- phase 2: quickprop update of the weights of ann ---- */
	{
		fann_type *weights = ann->weights;
		fann_type *prev_steps = ann->prev_steps;
		fann_type *prev_train_slopes = ann->prev_train_slopes;
		const unsigned int n_connections = ann->total_connections;

		const float epsilon = ann->learning_rate / data->num_data;
		const float decay = ann->quickprop_decay;	/*-0.0001;*/
		const float mu = ann->quickprop_mu;	/*1.75; */
		const float shrink_factor = (float) (mu / (1.0 + mu));

		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel for schedule(static)
		for(i = 0; i < (int)n_connections; i++)
		{
			fann_type slope_sum = 0.0;
			fann_type w, next_step;
			fann_type prev_step, prev_slope;
			unsigned int t;
			int was_positive, was_negative;

			w = weights[i];

			/* gather and reset the slopes accumulated by every copy */
			for(t = 0; t < threadnumb; ++t)
			{
				fann_type *copy_slopes = replicas[t]->train_slopes;
				slope_sum += copy_slopes[i];
				copy_slopes[i] = 0.0;
			}
			slope_sum += decay * w;

			prev_step = prev_steps[i];
			prev_slope = prev_train_slopes[i];

			next_step = 0.0;

			/* The step must always be in direction opposite to the slope. */
			was_positive = prev_step > 0.001;
			was_negative = prev_step < -0.001;

			if(!was_positive && !was_negative)
			{
				/* Last step was zero, so use only the linear term. */
				next_step += epsilon * slope_sum;
			}
			else
			{
				/* Is the current slope still on the side of the last step? */
				const int same_side = was_positive ? (slope_sum > 0.0) : (slope_sum < 0.0);
				/* Is the current slope close to or beyond the previous slope? */
				const int near_prev = was_positive
					? (slope_sum > (shrink_factor * prev_slope))
					: (slope_sum < (shrink_factor * prev_slope));

				if(same_side)
					next_step += epsilon * slope_sum;	/* linear term */

				if(near_prev)
					next_step += mu * prev_step;	/* maximum size step */
				else
					next_step += prev_step * slope_sum / (prev_slope - slope_sum);	/* quadratic estimate */
			}

			/* remember step and slope for the next epoch */
			prev_steps[i] = next_step;
			prev_train_slopes[i] = slope_sum;

			w += next_step;

			/* keep the weight inside [-1500, 1500] */
			weights[i] = (w > 1500) ? 1500 : ((w < -1500) ? -1500 : w);
		}
	}

	/* ---- phase 3: merge the MSE counters and release the copies ---- */
	for(i = 0; i < (int)threadnumb; ++i)
	{
		struct fann *net = replicas[i];

		ann->MSE_value += net->MSE_value;
		ann->num_MSE += net->num_MSE;
		fann_destroy(net);
	}
	free(replicas);
	return fann_get_MSE(ann);
}
Example #22
0
/*
 * Run one epoch of batch training with the samples spread over OpenMP threads.
 *
 * Every thread works on its own fann_copy() of `ann`: it runs a sample, stores the
 * network output in predicted_outputs[sample] and accumulates the MSE and the slopes
 * in its copy.  The per-copy slopes are then summed into ann->weights (scaled by
 * learning_rate / num_data), the per-copy MSE counters are added to `ann`, and the
 * copies are destroyed.
 *
 * ann               network to train; its weights and MSE counters are updated in place
 * data              training set (input[], output[], num_data and num_output are read)
 * threadnumb        number of OpenMP threads requested and number of network copies made
 * predicted_outputs receives one row per sample with the outputs computed for it
 *
 * Returns fann_get_MSE(ann) after the per-copy MSE counters have been merged.
 */
float train_epoch_batch_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb,vector< vector<fann_type> >& predicted_outputs)
{
	fann_reset_MSE(ann);

	// resize(n, row) only shapes the rows it creates; rows already present in a reused
	// vector keep their old length.  Grow any row that is too short so the writes in the
	// sample loop below can never run past the end of a row.
	predicted_outputs.resize(data->num_data,vector<fann_type> (data->num_output));
	for(unsigned int row=0; row<data->num_data; ++row)
	{
		if(predicted_outputs[row].size() < data->num_output)
			predicted_outputs[row].resize(data->num_output);
	}

	vector<struct fann *> ann_vect(threadnumb);
	int i=0,j=0;

	//generate copies of the ann
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel private(j)
	{

		// one private network per requested thread
		#pragma omp for schedule(static)
		for(i=0; i<(int)threadnumb; i++)
		{
			ann_vect[i]=fann_copy(ann);
		}

		//parallel computing of the updates

		#pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			// each thread only ever touches the copy indexed by its own thread number
			j=omp_get_thread_num();

			fann_type* temp_predicted_output=fann_run(ann_vect[j], data->input[i]);
			for(unsigned int k=0;k<data->num_output;++k)
			{
				predicted_outputs[i][k]=temp_predicted_output[k];
			}

			fann_compute_MSE(ann_vect[j], data->output[i]);
			fann_backpropagate_MSE(ann_vect[j]);
			fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1);
		}
	}

	//parallel update of the weights
	// With an empty training set learning_rate / num_data is a division by zero and
	// 0 * inf would turn every weight into NaN; there is nothing to apply, so skip it.
	const unsigned int num_data=data->num_data;
	if(num_data > 0)
	{
		const unsigned int first_weight=0;
		const unsigned int past_end=ann->total_connections;
		fann_type *weights = ann->weights;
		const fann_type epsilon = ann->learning_rate / num_data;
		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel
		{
			#pragma omp for schedule(static)
				for(i=first_weight; i < (int)past_end; i++)
				{
					// sum the slope of connection i over all copies, clearing as we go
					fann_type temp_slopes=0.0;
					unsigned int k;
					fann_type *train_slopes;
					for(k=0;k<threadnumb;++k)
					{
						train_slopes=ann_vect[k]->train_slopes;
						// NOTE(review): a copy whose thread received no sample may never have
						// had its slope array allocated (presumably done lazily by
						// fann_update_slopes_batch - confirm); it contributes nothing.
						if(!train_slopes)
							continue;
						temp_slopes+= train_slopes[i];
						train_slopes[i]=0.0;
					}
					weights[i] += temp_slopes*epsilon;
				}
			}
	}
	//merge of MSEs
	for(i=0;i<(int)threadnumb;++i)
	{
		ann->MSE_value+= ann_vect[i]->MSE_value;
		ann->num_MSE+=ann_vect[i]->num_MSE;
		fann_destroy(ann_vect[i]);
	}
	return fann_get_MSE(ann);
}
Example #23
0
/*
 * Run one SARPROP training epoch with the samples spread over OpenMP threads.
 *
 * Every thread accumulates MSE and slopes in its own fann_copy() of `ann`.  The
 * per-copy MSE counters are merged into `ann` first (the step rule needs MSE and
 * RMSE), then every weight is updated in parallel from the summed slopes, and
 * finally ann->sarprop_epoch is incremented and the copies are destroyed.
 *
 * ann         network to train; weights, prev_steps, prev_train_slopes, the MSE
 *             counters and sarprop_epoch are updated in place
 * data        training set (input[], output[] and num_data are read)
 * threadnumb  number of OpenMP threads requested and number of network copies made
 *
 * Returns fann_get_MSE(ann) for the merged counters.
 */
FANN_EXTERNAL float FANN_API fann_train_epoch_sarprop_parallel(struct fann *ann, struct fann_train_data *data, const unsigned int threadnumb)
{
	/* NOTE(review): the allocation result is not checked; a failed malloc is
	 * dereferenced in the parallel region below.  The error policy of the
	 * surrounding library is not visible here, so it is only flagged. */
	struct fann** ann_vect= (struct fann**) malloc(threadnumb * sizeof(struct fann*));
	int i=0,j=0;

	if(ann->prev_train_slopes == NULL)
	{
		fann_clear_train_arrays(ann);
	}

	//#define THREADNUM 1
	fann_reset_MSE(ann);

	/*vector<struct fann *> ann_vect(threadnumb);*/

	//generate copies of the ann
	omp_set_dynamic(0);
	omp_set_num_threads(threadnumb);
	#pragma omp parallel private(j)
	{

		/* one private network per requested thread */
		#pragma omp for schedule(static)
		for(i=0; i<(int)threadnumb; i++)
		{
			ann_vect[i]=fann_copy(ann);
		}

    //parallel computing of the updates

        #pragma omp for schedule(static)
		for(i = 0; i < (int)data->num_data; i++)
		{
			/* each thread only ever touches the copy indexed by its own thread number */
			j=omp_get_thread_num();
			if (ann->do_dropout) {
				fann_run_dropout(ann_vect[j], data->input[i]);
			}
			else {
				fann_run(ann_vect[j], data->input[i]);
			}
			fann_compute_MSE(ann_vect[j], data->output[i]);
			fann_backpropagate_MSE(ann_vect[j]);
			fann_update_slopes_batch(ann_vect[j], ann_vect[j]->first_layer + 1, ann_vect[j]->last_layer - 1);
		}
	}

    {
    	fann_type *weights = ann->weights;
    	fann_type *prev_steps = ann->prev_steps;
    	fann_type *prev_train_slopes = ann->prev_train_slopes;
		const unsigned int first_weight=0;
		const unsigned int past_end=ann->total_connections;
		const unsigned int epoch=ann->sarprop_epoch;

    	fann_type next_step;

    	/* These should be set from variables */
    	const float increase_factor = ann->rprop_increase_factor;	/*1.2; */
    	const float decrease_factor = ann->rprop_decrease_factor;	/*0.5; */
    	/* TODO: why is delta_min 0.0 in iRprop? SARPROP uses 1x10^-6 (Braun and Riedmiller, 1993) */
    	const float delta_min = 0.000001f;
    	const float delta_max = ann->rprop_delta_max;	/*50.0; */
    	const float weight_decay_shift = ann->sarprop_weight_decay_shift; /* ld 0.01 = -6.644 */
    	const float step_error_threshold_factor = ann->sarprop_step_error_threshold_factor; /* 0.1 */
    	const float step_error_shift = ann->sarprop_step_error_shift; /* ld 3 = 1.585 */
    	const float T = ann->sarprop_temperature;
		float MSE, RMSE;


    	//merge of MSEs
    	/* done before the weight update because the step rule below reads MSE and RMSE */
    	for(i=0;i<(int)threadnumb;++i)
    	{
    		ann->MSE_value+= ann_vect[i]->MSE_value;
    		ann->num_MSE+=ann_vect[i]->num_MSE;
    	}

    	MSE = fann_get_MSE(ann);
    	RMSE = sqrtf(MSE);

    	/* for all weights; TODO: are biases included? */
		omp_set_dynamic(0);
		omp_set_num_threads(threadnumb);
		#pragma omp parallel private(next_step)
		{
			#pragma omp for schedule(static)
				for(i=first_weight; i < (int)past_end; i++)
				{
					/* TODO: confirm whether 1x10^-6 == delta_min is really better */
					const fann_type prev_step  = fann_max(prev_steps[i], (fann_type) 0.000001);	/* prev_step may not be zero because then the training will stop */

					/* calculate SARPROP slope; TODO: better as new error function? (see SARPROP paper)*/
					fann_type prev_slope, same_sign;
					fann_type temp_slopes=0.0;
					unsigned int k;
					fann_type *train_slopes;
					/* sum the slope of connection i over all copies, clearing as we go */
					for(k=0;k<threadnumb;++k)
					{
						train_slopes=ann_vect[k]->train_slopes;
						temp_slopes+= train_slopes[i];
						train_slopes[i]=0.0;
					}
					temp_slopes= -temp_slopes - weights[i] * (fann_type)fann_exp2(-T * epoch + weight_decay_shift);

					next_step=0.0;

					/* TODO: is prev_train_slopes[i] 0.0 in the beginning? */
					prev_slope = prev_train_slopes[i];

					same_sign = prev_slope * temp_slopes;

					if(same_sign > 0.0)
					{
						next_step = fann_min(prev_step * increase_factor, delta_max);
						/* TODO: are the signs inverted? see differences between SARPROP paper and iRprop */
						if (temp_slopes < 0.0)
							weights[i] += next_step;
						else
							weights[i] -= next_step;
					}
					else if(same_sign < 0.0)
					{
						#ifndef RAND_MAX
						#define	RAND_MAX	0x7fffffff
						#endif
						if(prev_step < step_error_threshold_factor * MSE)
						{
							int noise;

							/* rand() keeps hidden global state and is not required to be
							 * safe against concurrent calls, so draw the sample under a
							 * named critical section instead of racing between threads. */
							#pragma omp critical (fann_sarprop_rand)
							noise = rand();

							next_step = prev_step * decrease_factor + (float)noise / RAND_MAX * RMSE * (fann_type)fann_exp2(-T * epoch + step_error_shift);
						}
						else
							next_step = fann_max(prev_step * decrease_factor, delta_min);

						temp_slopes = 0.0;
					}
					else
					{
						if(temp_slopes < 0.0)
							weights[i] += prev_step;
						else
							weights[i] -= prev_step;
					}

					/* update global data arrays */
					prev_steps[i] = next_step;
					prev_train_slopes[i] = temp_slopes;

				}
		}
    }

	++(ann->sarprop_epoch);

	//already computed before
	/*//merge of MSEs
	for(i=0;i<threadnumb;++i)
	{
		ann->MSE_value+= ann_vect[i]->MSE_value;
		ann->num_MSE+=ann_vect[i]->num_MSE;
	}*/
	//destroy the copies of the ann
	for(i=0; i<(int)threadnumb; i++)
	{
		fann_destroy(ann_vect[i]);
	}
	free(ann_vect);
	return fann_get_MSE(ann);
}
Example #24
0
/* By-reference wrapper around omp_set_dynamic for a 32-bit flag: reads the
   value SET points to and forwards it unchanged.  (Presumably the
   Fortran-callable entry point, judging by the trailing underscore.)  */
void
omp_set_dynamic_ (const int32_t *set)
{
  const int32_t requested = *set;

  omp_set_dynamic (requested);
}
Example #25
0
/*
 * Command-line entry point.
 *
 * Parses the options into `options`, optionally consumes a leading numeric
 * argument as the maximum number of colors, then quantizes every remaining
 * input file (or stdin) with pngquant_file().  Under OpenMP the files are
 * processed by a parallel loop; each iteration works on its own copy of the
 * options and its own copy of the liq attributes.
 *
 * Returns SUCCESS, the error code of an option/argument problem, or the code
 * of the most recently recorded per-file failure.
 */
int main(int argc, char *argv[])
{
    struct pngquant_options options = {
        .floyd = 1.f, // floyd-steinberg dithering
    };
    options.liq = liq_attr_create();

#if USE_SSE
    if (!options.liq) {
        print_full_version(stderr);
        fputs("SSE2-capable CPU is required for this build.\n", stderr);
        return WRONG_ARCHITECTURE;
    }
#endif

    unsigned int error_count=0, skipped_count=0, file_count=0;
    pngquant_error latest_error=SUCCESS;
    const char *newext = NULL, *output_file_path = NULL;

    fix_obsolete_options(argc, argv);

    int opt;
    do {
        opt = getopt_long(argc, argv, "Vvqfhs:Q:o:", long_options, NULL);
        switch (opt) {
            case 'v':
                liq_set_log_callback(options.liq, log_callback, NULL);
                options.log_callback = log_callback;
                break;
            case 'q':
                liq_set_log_callback(options.liq, NULL, NULL);
                options.log_callback = NULL;
                break;

            case arg_floyd:
                options.floyd = optarg ? atof(optarg) : 1.0;
                if (options.floyd < 0 || options.floyd > 1.0) {
                    fputs("--floyd argument must be in 0..1 range\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;
            case arg_ordered: options.floyd = 0; break;
            case 'f': options.force = true; break;
            case arg_no_force: options.force = false; break;

            case arg_ext: newext = optarg; break;
            case 'o':
                if (output_file_path) {
                    fputs("--output option can be used only once\n", stderr);
                    return INVALID_ARGUMENT;
                }
                output_file_path = optarg; break;

            case arg_iebug:
                // opacities above 238 will be rounded up to 255, because IE6 truncates <255 to 0.
                liq_set_min_opacity(options.liq, 238);
                options.ie_mode = true;
                break;

            case arg_transbug:
                liq_set_last_index_transparent(options.liq, true);
                break;

            case 's':
                {
                    int speed = atoi(optarg);
                    if (speed >= 10) {
                        options.fast_compression = true;
                    }
                    // speed 11 is handled here: it disables dithering and is passed on as 10
                    if (speed == 11) {
                        options.floyd = 0;
                        speed = 10;
                    }
                    if (LIQ_OK != liq_set_speed(options.liq, speed)) {
                        fputs("Speed should be between 1 (slow) and 11 (fast).\n", stderr);
                        return INVALID_ARGUMENT;
                    }
                }
                break;

            case 'Q':
                if (!parse_quality(optarg, options.liq, &options.min_quality_limit)) {
                    fputs("Quality should be in format min-max where min and max are numbers in range 0-100.\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;

            case arg_posterize:
                if (LIQ_OK != liq_set_min_posterization(options.liq, atoi(optarg))) {
                    fputs("Posterization should be number of bits in range 0-4.\n", stderr);
                    return INVALID_ARGUMENT;
                }
                break;

            case arg_map:
                {
                    png24_image tmp = {};
                    if (SUCCESS != read_image(options.liq, optarg, false, &tmp, &options.fixed_palette_image, false)) {
                        // terminate the line so the message does not run into the shell prompt
                        fprintf(stderr, "  error: Unable to load %s\n", optarg);
                        return INVALID_ARGUMENT;
                    }
                }
                break;

            case 'h':
                print_full_version(stdout);
                print_usage(stdout);
                return SUCCESS;

            case 'V':
                puts(PNGQUANT_VERSION);
                return SUCCESS;

            case -1: break;

            default:
                return INVALID_ARGUMENT;
        }
    } while (opt != -1);

    int argn = optind;

    if (argn >= argc) {
        if (argn > 1) {
            fputs("No input files specified. See -h for help.\n", stderr);
        } else {
            print_full_version(stderr);
            print_usage(stderr);
        }
        return MISSING_ARGUMENT;
    }

    // a first argument that is entirely a number is the maximum number of colors, not a file
    char *colors_end;
    unsigned long colors = strtoul(argv[argn], &colors_end, 10);
    if (colors_end != argv[argn] && '\0' == colors_end[0]) {
        if (LIQ_OK != liq_set_max_colors(options.liq, colors)) {
            fputs("Number of colors must be between 2 and 256.\n", stderr);
            return INVALID_ARGUMENT;
        }
        argn++;
    }

    if (newext && output_file_path) {
        fputs("--ext and --output options can't be used at the same time\n", stderr);
        return INVALID_ARGUMENT;
    }

    // new filename extension depends on options used. Typically basename-fs8.png
    if (newext == NULL) {
        newext = options.floyd > 0 ? "-ie-fs8.png" : "-ie-or8.png";
        if (!options.ie_mode) newext += 3; /* skip "-ie" */
    }

    // no file left after the color count, or a lone "-", means: read from stdin
    if (argn == argc || (argn == argc-1 && 0==strcmp(argv[argn],"-"))) {
        options.using_stdin = true;
        argn = argc-1;
    }

    if (options.using_stdin && output_file_path) {
        fputs("--output can't be mixed with stdin\n", stderr);
        return INVALID_ARGUMENT;
    }

    const int num_files = argc-argn;

    if (output_file_path && num_files != 1) {
        fputs("Only one input file is allowed when --output is used\n", stderr);
        return INVALID_ARGUMENT;
    }

#ifdef _OPENMP
    // if there's a lot of files, coarse parallelism can be used
    if (num_files > 2*omp_get_max_threads()) {
        omp_set_nested(0);
        omp_set_dynamic(1);
    } else {
        omp_set_nested(1);
    }
#endif

    #pragma omp parallel for \
        schedule(dynamic) reduction(+:skipped_count) reduction(+:error_count) reduction(+:file_count) shared(latest_error)
    for(int i=0; i < num_files; i++) {
        // per-iteration copy: everything below must modify `opts`, never the shared `options`
        struct pngquant_options opts = options;
        opts.liq = liq_attr_copy(options.liq);

        const char *filename = opts.using_stdin ? "stdin" : argv[argn+i];

        #ifdef _OPENMP
        struct buffered_log buf = {};
        if (opts.log_callback && omp_get_num_threads() > 1 && num_files > 1) {
            liq_set_log_callback(opts.liq, log_callback_buferred, &buf);
            liq_set_log_flush_callback(opts.liq, log_callback_buferred_flush, &buf);
            // `buf` lives only for this iteration, so it may only be referenced from the
            // iteration's own copy; storing it in the shared `options` raced between threads
            // and left a dangling pointer for the summary messages printed after the loop.
            opts.log_callback = log_callback_buferred;
            opts.log_callback_user_info = &buf;
        }
        #endif


        pngquant_error retval = SUCCESS;

        const char *outname = output_file_path;
        char *outname_free = NULL;
        if (!opts.using_stdin) {
            if (!outname) {
                outname = outname_free = add_filename_extension(filename, newext);
            }
            if (!opts.force && file_exists(outname)) {
                fprintf(stderr, "  error:  %s exists; not overwriting\n", outname);
                retval = NOT_OVERWRITING_ERROR;
            }
        }

        if (!retval) {
            retval = pngquant_file(filename, outname, &opts);
        }

        free(outname_free);

        liq_attr_destroy(opts.liq);

        if (retval) {
            // latest_error is shared between threads; the counters are reduction variables
            #pragma omp critical
            {
                latest_error = retval;
            }
            if (retval == TOO_LOW_QUALITY) {
                skipped_count++;
            } else {
                error_count++;
            }
        }
        ++file_count;
    }

    if (error_count) {
        verbose_printf(&options, "There were errors quantizing %d file%s out of a total of %d file%s.",
                       error_count, (error_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s");
    }
    if (skipped_count) {
        verbose_printf(&options, "Skipped %d file%s out of a total of %d file%s.",
                       skipped_count, (skipped_count == 1)? "" : "s", file_count, (file_count == 1)? "" : "s");
    }
    if (!skipped_count && !error_count) {
        verbose_printf(&options, "No errors detected while quantizing %d image%s.",
                       file_count, (file_count == 1)? "" : "s");
    }

    liq_image_destroy(options.fixed_palette_image);
    liq_attr_destroy(options.liq);

    return latest_error;
}


/* Frees the pixel buffer and the row-pointer table owned by output_image and
 * clears both fields so the structure holds no stale pointers afterwards.
 * The structure itself is not freed. */
static void pngquant_output_image_free(png8_image *output_image)
{
    /* release both buffers first (pixels, then rows) ... */
    free(output_image->indexed_data);
    free(output_image->row_pointers);

    /* ... then forget them */
    output_image->indexed_data = NULL;
    output_image->row_pointers = NULL;
}
Example #26
0
/* By-reference wrapper around omp_set_dynamic for a 64-bit flag.  The value
   is collapsed to 0 or 1 before the call, so a non-zero 64-bit value can
   never be narrowed to zero when passed as an int.  (Presumably the
   Fortran-callable entry point for 8-byte arguments, judging by the name.)  */
void
omp_set_dynamic_8_ (const int64_t *set)
{
  const int enable = (*set != 0);

  omp_set_dynamic (enable);
}
Example #27
0
int main(int argc, char *argv[])
{
    /* Any variable that begins with 't' means topocentric */
    /* Any variable that begins with 'b' means barycentric */
    FILE **outfiles;
    float **outdata = NULL;
    short **subsdata = NULL;
    double dtmp, *dms = NULL, avgdm = 0.0, maxdm, dsdt = 0;
    double tlotoa = 0.0, blotoa = 0.0, BW_ddelay = 0.0;
    double max = -9.9E30, min = 9.9E30, var = 0.0, avg = 0.0;
    double *btoa = NULL, *ttoa = NULL, avgvoverc = 0.0;
    char obs[3], ephem[10], rastring[50], decstring[50];
    long totnumtowrite, totwrote = 0, padwrote = 0, datawrote = 0;
    int **offsets;
    int ii, jj, numadded = 0, numremoved = 0, padding = 0;
    int numbarypts = 0, blocksperread = 0, worklen = 0;
    int numread = 0, numtowrite = 0;
    int padtowrite = 0, statnum = 0, good_padvals = 0;
    int numdiffbins = 0, *diffbins = NULL, *diffbinptr = NULL;
    int *idispdt;
    char *datafilenm;
    int dmprecision = 2;
    struct spectra_info s;
    infodata idata;
    mask obsmask;

    /* Call usage() if we have no command line arguments */

    if (argc == 1) {
        Program = argv[0];
        printf("\n");
        usage();
        exit(0);
    }

    /* Parse the command line using the excellent program Clig */

    cmd = parseCmdline(argc, argv);
    spectra_info_set_defaults(&s);
    dmprecision = cmd->dmprec;
    s.filenames = cmd->argv;
    s.num_files = cmd->argc;
    // If we are zeroDMing, make sure that clipping is off.
    if (cmd->zerodmP)
        cmd->noclipP = 1;
    s.clip_sigma = cmd->clip;
    // -1 causes the data to determine if we use weights, scales, &
    // offsets for PSRFITS or flip the band for any data type where
    // we can figure that out with the data
    s.apply_flipband = (cmd->invertP) ? 1 : -1;
    s.apply_weight = (cmd->noweightsP) ? 0 : -1;
    s.apply_scale = (cmd->noscalesP) ? 0 : -1;
    s.apply_offset = (cmd->nooffsetsP) ? 0 : -1;
    s.remove_zerodm = (cmd->zerodmP) ? 1 : 0;
    if (cmd->noclipP) {
        cmd->clip = 0.0;
        s.clip_sigma = 0.0;
    }
    if (cmd->ifsP) {
        // 0 = default or summed, 1-4 are possible also
        s.use_poln = cmd->ifs + 1;
    }
    if (!cmd->numoutP)
        cmd->numout = LONG_MAX;

    if (cmd->ncpus > 1) {
#ifdef _OPENMP
        int maxcpus = omp_get_num_procs();
        int openmp_numthreads = (cmd->ncpus <= maxcpus) ? cmd->ncpus : maxcpus;
        // Make sure we are not dynamically setting the number of threads
        omp_set_dynamic(0);
        omp_set_num_threads(openmp_numthreads);
        printf("Using %d threads with OpenMP\n\n", openmp_numthreads);
#endif
    } else {
#ifdef _OPENMP
        omp_set_num_threads(1); // Explicitly turn off OpenMP
#endif
    }

#ifdef DEBUG
    showOptionValues();
#endif

    printf("\n\n");
    printf("          Pulsar Subband De-dispersion Routine\n");
    printf("                 by Scott M. Ransom\n\n");

    if (RAWDATA) {
        if (cmd->filterbankP)
            s.datatype = SIGPROCFB;
        else if (cmd->psrfitsP)
            s.datatype = PSRFITS;
        else if (cmd->pkmbP)
            s.datatype = SCAMP;
        else if (cmd->bcpmP)
            s.datatype = BPP;
        else if (cmd->wappP)
            s.datatype = WAPP;
        else if (cmd->spigotP)
            s.datatype = SPIGOT;
    } else {                    // Attempt to auto-identify the data
        identify_psrdatatype(&s, 1);
        if (s.datatype == SIGPROCFB)
            cmd->filterbankP = 1;
        else if (s.datatype == PSRFITS)
            cmd->psrfitsP = 1;
        else if (s.datatype == SCAMP)
            cmd->pkmbP = 1;
        else if (s.datatype == BPP)
            cmd->bcpmP = 1;
        else if (s.datatype == WAPP)
            cmd->wappP = 1;
        else if (s.datatype == SPIGOT)
            cmd->spigotP = 1;
        else if (s.datatype == SUBBAND)
            insubs = 1;
        else {
            printf
                ("Error:  Unable to identify input data files.  Please specify type.\n\n");
            exit(1);
        }
    }

    if (!RAWDATA)
        s.files = (FILE **) malloc(sizeof(FILE *) * s.num_files);
    if (RAWDATA || insubs) {
        char description[40];
        psrdatatype_description(description, s.datatype);
        if (s.num_files > 1)
            printf("Reading %s data from %d files:\n", description, s.num_files);
        else
            printf("Reading %s data from 1 file:\n", description);
        for (ii = 0; ii < s.num_files; ii++) {
            printf("  '%s'\n", cmd->argv[ii]);
            if (insubs)
                s.files[ii] = chkfopen(s.filenames[ii], "rb");
        }
        printf("\n");
        if (RAWDATA) {
            read_rawdata_files(&s);
            print_spectra_info_summary(&s);
            spectra_info_to_inf(&s, &idata);
        } else {                // insubs
            cmd->nsub = s.num_files;
            s.N = chkfilelen(s.files[0], sizeof(short));
            s.padvals = gen_fvect(s.num_files);
            for (ii = 0; ii < s.num_files; ii++)
                s.padvals[ii] = 0.0;
            s.start_MJD = (long double *) malloc(sizeof(long double));
            s.start_spec = (long long *) malloc(sizeof(long long));
            s.num_spec = (long long *) malloc(sizeof(long long));
            s.num_pad = (long long *) malloc(sizeof(long long));
            s.start_spec[0] = 0L;
            s.num_spec[0] = s.N;
            s.num_pad[0] = 0L;
        }
        /* Read an input mask if wanted */
        if (cmd->maskfileP) {
            read_mask(cmd->maskfile, &obsmask);
            printf("Read mask information from '%s'\n\n", cmd->maskfile);
            good_padvals = determine_padvals(cmd->maskfile, &obsmask, s.padvals);
        } else {
            obsmask.numchan = obsmask.numint = 0;
        }
    }

    if (insubs) {
        char *root, *suffix;
        if (split_root_suffix(s.filenames[0], &root, &suffix) == 0) {
            printf("Error:  The input filename (%s) must have a suffix!\n\n",
                   s.filenames[0]);
            exit(1);
        }
        if (strncmp(suffix, "sub", 3) == 0) {
            char *tmpname;
            tmpname = calloc(strlen(root) + 10, 1);
            sprintf(tmpname, "%s.sub", root);
            readinf(&idata, tmpname);
            free(tmpname);
            s.num_channels = idata.num_chan;
            s.start_MJD[0] = idata.mjd_i + idata.mjd_f;
            s.dt = idata.dt;
            s.T = s.N * s.dt;
            s.lo_freq = idata.freq;
            s.df = idata.chan_wid;
            s.hi_freq = s.lo_freq + (s.num_channels - 1.0) * s.df;
            s.BW = s.num_channels * s.df;
            s.fctr = s.lo_freq - 0.5 * s.df + 0.5 * s.BW;
            s.spectra_per_subint = SUBSBLOCKLEN;
            print_spectra_info_summary(&s);
        } else {
            printf("\nThe input files (%s) must be subbands!  (i.e. *.sub##)\n\n",
                   cmd->argv[0]);
            exit(1);
        }
        free(root);
        free(suffix);
    }

    /* Determine the output file names and open them */

    datafilenm = (char *) calloc(strlen(cmd->outfile) + 20, 1);
    if (!cmd->subP) {
        printf("Writing output data to:\n");
        outfiles = (FILE **) malloc(cmd->numdms * sizeof(FILE *));
        dms = gen_dvect(cmd->numdms);
        for (ii = 0; ii < cmd->numdms; ii++) {
            dms[ii] = cmd->lodm + ii * cmd->dmstep;
            avgdm += dms[ii];
            sprintf(datafilenm, "%s_DM%.*f.dat", cmd->outfile, dmprecision, dms[ii]);
            outfiles[ii] = chkfopen(datafilenm, "wb");
            printf("   '%s'\n", datafilenm);
        }
        avgdm /= cmd->numdms;
        maxdm = dms[cmd->numdms - 1];
    } else {
        char format_str[30];
        int num_places;

        if (!cmd->nobaryP) {
            printf("\nWarning:  You cannot (currently) barycenter subbands.\n"
                   "          Setting the '-nobary' flag automatically.\n");
            cmd->nobaryP = 1;
        }
        printf("Writing subbands to:\n");
        cmd->numdms = 1;
        dms = gen_dvect(cmd->numdms);
        dms[0] = cmd->subdm;
        cmd->lodm = cmd->subdm;
        avgdm = cmd->subdm;
        maxdm = cmd->subdm;
        outfiles = (FILE **) malloc(cmd->nsub * sizeof(FILE *));
        num_places = (int) ceil(log10(cmd->nsub));
        sprintf(format_str, "%%s_DM%%.*f.sub%%0%dd", num_places);
        for (ii = 0; ii < cmd->nsub; ii++) {
            sprintf(datafilenm, format_str, cmd->outfile, dmprecision, avgdm, ii);
            outfiles[ii] = chkfopen(datafilenm, "wb");
            printf("   '%s'\n", datafilenm);
        }
    }

    /* Set a few other key values */
    if (insubs)
        avgdm = idata.dm;
    if (RAWDATA)
        idata.dm = avgdm;
    dsdt = cmd->downsamp * idata.dt;
    BW_ddelay = delay_from_dm(maxdm, idata.freq) -
        delay_from_dm(maxdm, idata.freq + (idata.num_chan - 1) * idata.chan_wid);
    blocksperread = ((int) (BW_ddelay / idata.dt) / s.spectra_per_subint + 1);
    worklen = s.spectra_per_subint * blocksperread;
    /* The number of topo to bary time points to generate with TEMPO */
    numbarypts = (int) (s.T * 1.1 / TDT + 5.5) + 1;

    // Identify the TEMPO observatory code
    {
        char *outscope = (char *) calloc(40, sizeof(char));
        telescope_to_tempocode(idata.telescope, outscope, obs);
        free(outscope);
    }

    if (cmd->nsub > s.num_channels) {
        printf
            ("Warning:  The number of requested subbands (%d) is larger than the number of channels (%d).\n",
             cmd->nsub, s.num_channels);
        printf("          Re-setting the number of subbands to %d.\n\n",
               s.num_channels);
        cmd->nsub = s.num_channels;
    }

    if (s.spectra_per_subint % cmd->downsamp) {
        printf
            ("Error:  The downsample factor (%d) must be a factor of the\n",
             cmd->downsamp);
        printf("        blocklength (%d).  Exiting.\n\n", s.spectra_per_subint);
        exit(1);
    }

    tlotoa = idata.mjd_i + idata.mjd_f; /* Topocentric epoch */

    if (cmd->numoutP)
        totnumtowrite = cmd->numout;
    else
        totnumtowrite = (long) idata.N / cmd->downsamp;

    if (cmd->nobaryP) {         /* Main loop if we are not barycentering... */
        double *dispdt;

        /* Dispersion delays (in bins).  The high freq gets no delay   */
        /* All other delays are positive fractions of bin length (dt)  */

        dispdt = subband_search_delays(s.num_channels, cmd->nsub, avgdm,
                                       idata.freq, idata.chan_wid, 0.0);
        idispdt = gen_ivect(s.num_channels);
        for (ii = 0; ii < s.num_channels; ii++)
            idispdt[ii] = NEAREST_LONG(dispdt[ii] / idata.dt);
        vect_free(dispdt);

        /* The subband dispersion delays (see note above) */

        offsets = gen_imatrix(cmd->numdms, cmd->nsub);
        for (ii = 0; ii < cmd->numdms; ii++) {
            double *subdispdt;

            subdispdt = subband_delays(s.num_channels, cmd->nsub, dms[ii],
                                       idata.freq, idata.chan_wid, 0.0);
            dtmp = subdispdt[cmd->nsub - 1];
            for (jj = 0; jj < cmd->nsub; jj++)
                offsets[ii][jj] = NEAREST_LONG((subdispdt[jj] - dtmp) / dsdt);
            vect_free(subdispdt);
        }

        /* Allocate our data array and start getting data */

        printf("\nDe-dispersing using:\n");
        printf("       Subbands = %d\n", cmd->nsub);
        printf("     Average DM = %.7g\n", avgdm);
        if (cmd->downsamp > 1) {
            printf("     Downsample = %d\n", cmd->downsamp);
            printf("  New sample dt = %.10g\n", dsdt);
        }
        printf("\n");

        if (cmd->subP)
            subsdata = gen_smatrix(cmd->nsub, worklen / cmd->downsamp);
        else
            outdata = gen_fmatrix(cmd->numdms, worklen / cmd->downsamp);
        numread = get_data(outdata, blocksperread, &s,
                           &obsmask, idispdt, offsets, &padding, subsdata);

        while (numread == worklen) {

            numread /= cmd->downsamp;
            print_percent_complete(totwrote, totnumtowrite);

            /* Write the latest chunk of data, but don't   */
            /* write more than cmd->numout points.         */

            numtowrite = numread;
            if (cmd->numoutP && (totwrote + numtowrite) > cmd->numout)
                numtowrite = cmd->numout - totwrote;
            if (cmd->subP)
                write_subs(outfiles, cmd->nsub, subsdata, 0, numtowrite);
            else
                write_data(outfiles, cmd->numdms, outdata, 0, numtowrite);
            totwrote += numtowrite;

            /* Update the statistics */

            if (!padding && !cmd->subP) {
                for (ii = 0; ii < numtowrite; ii++)
                    update_stats(statnum + ii, outdata[0][ii], &min, &max, &avg,
                                 &var);
                statnum += numtowrite;
            }

            /* Stop if we have written out all the data we need to */

            if (cmd->numoutP && (totwrote == cmd->numout))
                break;

            numread = get_data(outdata, blocksperread, &s,
                               &obsmask, idispdt, offsets, &padding, subsdata);
        }
        datawrote = totwrote;

    } else {                    /* Main loop if we are barycentering... */
        double maxvoverc = -1.0, minvoverc = 1.0, *voverc = NULL;
        double *dispdt;

        /* What ephemeris will we use?  (Default is DE405) */
        strcpy(ephem, "DE405");

        /* Define the RA and DEC of the observation */

        ra_dec_to_string(rastring, idata.ra_h, idata.ra_m, idata.ra_s);
        ra_dec_to_string(decstring, idata.dec_d, idata.dec_m, idata.dec_s);

        /* Allocate some arrays */

        btoa = gen_dvect(numbarypts);
        ttoa = gen_dvect(numbarypts);
        voverc = gen_dvect(numbarypts);
        for (ii = 0; ii < numbarypts; ii++)
            ttoa[ii] = tlotoa + TDT * ii / SECPERDAY;

        /* Call TEMPO for the barycentering */

        printf("\nGenerating barycentric corrections...\n");
        barycenter(ttoa, btoa, voverc, numbarypts, rastring, decstring, obs, ephem);
        for (ii = 0; ii < numbarypts; ii++) {
            if (voverc[ii] > maxvoverc)
                maxvoverc = voverc[ii];
            if (voverc[ii] < minvoverc)
                minvoverc = voverc[ii];
            avgvoverc += voverc[ii];
        }
        avgvoverc /= numbarypts;
        vect_free(voverc);
        blotoa = btoa[0];

        printf("   Average topocentric velocity (c) = %.7g\n", avgvoverc);
        printf("   Maximum topocentric velocity (c) = %.7g\n", maxvoverc);
        printf("   Minimum topocentric velocity (c) = %.7g\n\n", minvoverc);
        printf("De-dispersing and barycentering using:\n");
        printf("       Subbands = %d\n", cmd->nsub);
        printf("     Average DM = %.7g\n", avgdm);
        if (cmd->downsamp > 1) {
            printf("     Downsample = %d\n", cmd->downsamp);
            printf("  New sample dt = %.10g\n", dsdt);
        }
        printf("\n");

        /* Dispersion delays (in bins).  The high freq gets no delay   */
        /* All other delays are positive fractions of bin length (dt)  */

        dispdt = subband_search_delays(s.num_channels, cmd->nsub, avgdm,
                                       idata.freq, idata.chan_wid, avgvoverc);
        idispdt = gen_ivect(s.num_channels);
        for (ii = 0; ii < s.num_channels; ii++)
            idispdt[ii] = NEAREST_LONG(dispdt[ii] / idata.dt);
        vect_free(dispdt);

        /* The subband dispersion delays (see note above) */

        offsets = gen_imatrix(cmd->numdms, cmd->nsub);
        for (ii = 0; ii < cmd->numdms; ii++) {
            double *subdispdt;

            subdispdt = subband_delays(s.num_channels, cmd->nsub, dms[ii],
                                       idata.freq, idata.chan_wid, avgvoverc);
            dtmp = subdispdt[cmd->nsub - 1];
            for (jj = 0; jj < cmd->nsub; jj++)
                offsets[ii][jj] = NEAREST_LONG((subdispdt[jj] - dtmp) / dsdt);
            vect_free(subdispdt);
        }

        /* Convert the bary TOAs to differences from the topo TOAs in */
        /* units of bin length (dt) rounded to the nearest integer.   */

        dtmp = (btoa[0] - ttoa[0]);
        for (ii = 0; ii < numbarypts; ii++)
            btoa[ii] = ((btoa[ii] - ttoa[ii]) - dtmp) * SECPERDAY / dsdt;

        {                       /* Find the points where we need to add or remove bins */

            int oldbin = 0, currentbin;
            double lobin, hibin, calcpt;

            numdiffbins = abs(NEAREST_LONG(btoa[numbarypts - 1])) + 1;
            diffbins = gen_ivect(numdiffbins);
            diffbinptr = diffbins;
            for (ii = 1; ii < numbarypts; ii++) {
                currentbin = NEAREST_LONG(btoa[ii]);
                if (currentbin != oldbin) {
                    if (currentbin > 0) {
                        calcpt = oldbin + 0.5;
                        lobin = (ii - 1) * TDT / dsdt;
                        hibin = ii * TDT / dsdt;
                    } else {
                        calcpt = oldbin - 0.5;
                        lobin = -((ii - 1) * TDT / dsdt);
                        hibin = -(ii * TDT / dsdt);
                    }
                    while (fabs(calcpt) < fabs(btoa[ii])) {
                        /* Negative bin number means remove that bin */
                        /* Positive bin number means add a bin there */
                        *diffbinptr = NEAREST_LONG(LININTERP(calcpt, btoa[ii - 1],
                                                             btoa[ii], lobin,
                                                             hibin));
                        diffbinptr++;
                        calcpt = (currentbin > 0) ? calcpt + 1.0 : calcpt - 1.0;
                    }
                    oldbin = currentbin;
                }
            }
            *diffbinptr = cmd->numout;  /* Used as a marker */
        }
        diffbinptr = diffbins;

        /* Now perform the barycentering */

        if (cmd->subP)
            subsdata = gen_smatrix(cmd->nsub, worklen / cmd->downsamp);
        else
            outdata = gen_fmatrix(cmd->numdms, worklen / cmd->downsamp);
        numread = get_data(outdata, blocksperread, &s,
                           &obsmask, idispdt, offsets, &padding, subsdata);

        while (numread == worklen) {    /* Loop to read and write the data */
            int numwritten = 0;
            double block_avg, block_var;

            numread /= cmd->downsamp;
            /* Determine the approximate local average */
            avg_var(outdata[0], numread, &block_avg, &block_var);
            print_percent_complete(totwrote, totnumtowrite);

            /* Simply write the data if we don't have to add or */
            /* remove any bins from this batch.                 */
            /* OR write the amount of data up to cmd->numout or */
            /* the next bin that will be added or removed.      */

            numtowrite = abs(*diffbinptr) - datawrote;
            if (cmd->numoutP && (totwrote + numtowrite) > cmd->numout)
                numtowrite = cmd->numout - totwrote;
            if (numtowrite > numread)
                numtowrite = numread;
            if (cmd->subP)
                write_subs(outfiles, cmd->nsub, subsdata, 0, numtowrite);
            else
                write_data(outfiles, cmd->numdms, outdata, 0, numtowrite);
            datawrote += numtowrite;
            totwrote += numtowrite;
            numwritten += numtowrite;

            /* Update the statistics */

            if (!padding && !cmd->subP) {
                for (ii = 0; ii < numtowrite; ii++)
                    update_stats(statnum + ii, outdata[0][ii], &min, &max, &avg,
                                 &var);
                statnum += numtowrite;
            }

            if ((datawrote == abs(*diffbinptr)) && (numwritten != numread) && (totwrote < cmd->numout)) {       /* Add/remove a bin */
                int skip, nextdiffbin;

                skip = numtowrite;

                do {            /* Write the rest of the data after adding/removing a bin  */

                    if (*diffbinptr > 0) {
                        /* Add a bin */
                        write_padding(outfiles, cmd->numdms, block_avg, 1);
                        numadded++;
                        totwrote++;
                    } else {
                        /* Remove a bin */
                        numremoved++;
                        datawrote++;
                        numwritten++;
                        skip++;
                    }
                    diffbinptr++;

                    /* Write the part after the diffbin */

                    numtowrite = numread - numwritten;
                    if (cmd->numoutP && (totwrote + numtowrite) > cmd->numout)
                        numtowrite = cmd->numout - totwrote;
                    nextdiffbin = abs(*diffbinptr) - datawrote;
                    if (numtowrite > nextdiffbin)
                        numtowrite = nextdiffbin;
                    if (cmd->subP)
                        write_subs(outfiles, cmd->nsub, subsdata, skip, numtowrite);
                    else
                        write_data(outfiles, cmd->numdms, outdata, skip, numtowrite);
                    numwritten += numtowrite;
                    datawrote += numtowrite;
                    totwrote += numtowrite;

                    /* Update the statistics and counters */

                    if (!padding && !cmd->subP) {
                        for (ii = 0; ii < numtowrite; ii++)
                            update_stats(statnum + ii, outdata[0][skip + ii],
                                         &min, &max, &avg, &var);
                        statnum += numtowrite;
                    }
                    skip += numtowrite;

                    /* Stop if we have written out all the data we need to */

                    if (cmd->numoutP && (totwrote == cmd->numout))
                        break;
                } while (numwritten < numread);
            }
            /* Stop if we have written out all the data we need to */

            if (cmd->numoutP && (totwrote == cmd->numout))
                break;

            numread = get_data(outdata, blocksperread, &s,
                               &obsmask, idispdt, offsets, &padding, subsdata);
        }
    }

    /* Calculate the amount of padding we need  */

    if (cmd->numoutP && (cmd->numout > totwrote))
        padwrote = padtowrite = cmd->numout - totwrote;

    /* Write the new info file for the output data */

    idata.dt = dsdt;
    update_infodata(&idata, totwrote, padtowrite, diffbins,
                    numdiffbins, cmd->downsamp);
    for (ii = 0; ii < cmd->numdms; ii++) {
        idata.dm = dms[ii];
        if (!cmd->nobaryP) {
            double baryepoch, barydispdt, baryhifreq;

            baryhifreq = idata.freq + (s.num_channels - 1) * idata.chan_wid;
            barydispdt = delay_from_dm(dms[ii], doppler(baryhifreq, avgvoverc));
            baryepoch = blotoa - (barydispdt / SECPERDAY);
            idata.bary = 1;
            idata.mjd_i = (int) floor(baryepoch);
            idata.mjd_f = baryepoch - idata.mjd_i;
        }
        if (cmd->subP)
            sprintf(idata.name, "%s_DM%.*f.sub", cmd->outfile, dmprecision, dms[ii]);
        else
            sprintf(idata.name, "%s_DM%.*f", cmd->outfile, dmprecision, dms[ii]);
        writeinf(&idata);
    }

    /* Set the padded points equal to the average data point */

    if (idata.numonoff >= 1) {
        int index, startpad, endpad;

        for (ii = 0; ii < cmd->numdms; ii++) {
            fclose(outfiles[ii]);
            sprintf(datafilenm, "%s_DM%.*f.dat", cmd->outfile, dmprecision, dms[ii]);
            outfiles[ii] = chkfopen(datafilenm, "rb+");
        }
        for (ii = 0; ii < idata.numonoff; ii++) {
            index = 2 * ii;
            startpad = idata.onoff[index + 1];
            if (ii == idata.numonoff - 1)
                endpad = idata.N - 1;
            else
                endpad = idata.onoff[index + 2];
            for (jj = 0; jj < cmd->numdms; jj++)
                chkfseek(outfiles[jj], (startpad + 1) * sizeof(float), SEEK_SET);
            padtowrite = endpad - startpad;
            write_padding(outfiles, cmd->numdms, avg, padtowrite);
        }
    }

    /* Print simple stats and results */

    if (!cmd->subP) {
        var /= (datawrote - 1);
        print_percent_complete(1, 1);
        printf("\n\nDone.\n\nSimple statistics of the output data:\n");
        printf("             Data points written:  %ld\n", totwrote);
        if (padwrote)
            printf("          Padding points written:  %ld\n", padwrote);
        if (!cmd->nobaryP) {
            if (numadded)
                printf("    Bins added for barycentering:  %d\n", numadded);
            if (numremoved)
                printf("  Bins removed for barycentering:  %d\n", numremoved);
        }
        printf("           Maximum value of data:  %.2f\n", max);
        printf("           Minimum value of data:  %.2f\n", min);
        printf("              Data average value:  %.2f\n", avg);
        printf("         Data standard deviation:  %.2f\n", sqrt(var));
        printf("\n");
    } else {
        printf("\n\nDone.\n");
        printf("             Data points written:  %ld\n", totwrote);
        if (padwrote)
            printf("          Padding points written:  %ld\n", padwrote);
        if (!cmd->nobaryP) {
            if (numadded)
                printf("    Bins added for barycentering:  %d\n", numadded);
            if (numremoved)
                printf("  Bins removed for barycentering:  %d\n", numremoved);
        }
        printf("\n");
    }

    /* Close the files and cleanup */

    if (cmd->maskfileP) {
        free_mask(obsmask);
    }
    //  Close all the raw files and free their vectors
    close_rawfiles(&s);
    for (ii = 0; ii < cmd->numdms; ii++)
        fclose(outfiles[ii]);
    if (cmd->subP) {
        vect_free(subsdata[0]);
        vect_free(subsdata);
    } else {
        vect_free(outdata[0]);
        vect_free(outdata);
    }
    free(outfiles);
    vect_free(dms);
    vect_free(idispdt);
    vect_free(offsets[0]);
    vect_free(offsets);
    free(datafilenm);
    if (!cmd->nobaryP) {
        vect_free(btoa);
        vect_free(ttoa);
        vect_free(diffbins);
    }
    return (0);
}
Example #28
0
// Coordinate descent for logistic models
//
// Fits a path of L solutions, one per lambda value, using the rows listed in
// row_idx_ of the BigMatrix behind X_. For every lambda the routine iterates
// three nested loops: the innermost one updates the intercept and every
// coefficient in the ever-active set e1 until the largest update falls below
// the convergence threshold; the middle one calls check_strong_set_bin() and
// repeats while it reports violations; the outer one does the same with
// check_rest_set_bin().
//
// Arguments (all SEXP, read once at the top of the function):
//   X_          external pointer to a BigMatrix
//   y_          double vector of responses (compared against 1 below, so
//               presumably coded 0/1 -- confirm against the R caller)
//   row_idx_    integer vector of row indices; its length is used as n
//   lambda_     lambda sequence, only read when user_ is non-zero
//   nlambda_    number of lambda values, L
//   lam_scale_  non-zero: generated grid is equally spaced on the log scale;
//               zero: equally spaced on the linear scale
//   lambda_min_ multiplied by lambda_max to give the smallest generated lambda
//   alpha_      splits the penalty: l1 = lambda*m*alpha, l2 = lambda*m*(1-alpha)
//   user_       non-zero: take the lambda sequence from lambda_
//   eps_        convergence tolerance; thresh = eps * nullDev / n
//   max_iter_   upper bound on iter[l] for each lambda
//   multiplier_ per-column penalty multipliers, indexed by original column index
//   dfmax_      the path stops once more than dfmax coefficients are non-zero
//   ncore_      thread count handed to OpenMP when BIGLASSO_OMP_H_ is defined;
//               values < 1 select omp_get_num_procs()
//   warn_       non-zero: emit a warning when the model saturates
//   verbose_    non-zero: print timestamps via Rprintf
//
// Returns an unnamed List holding, in order: beta0, beta (sparse p x L),
// center, scale, lambda, Dev, iter, n_reject and col_idx. On an early exit
// (dfmax exceeded or saturated model) iter[l..L-1] is set to NA_INTEGER.
RcppExport SEXP cdfit_binomial_hsr(SEXP X_, SEXP y_, SEXP row_idx_, 
                                   SEXP lambda_, SEXP nlambda_, SEXP lam_scale_,
                                   SEXP lambda_min_, SEXP alpha_, SEXP user_, SEXP eps_, 
                                   SEXP max_iter_, SEXP multiplier_, SEXP dfmax_, 
                                   SEXP ncore_, SEXP warn_, SEXP verbose_) {
  XPtr<BigMatrix> xMat(X_);
  double *y = REAL(y_);
  int *row_idx = INTEGER(row_idx_);
  double lambda_min = REAL(lambda_min_)[0];
  double alpha = REAL(alpha_)[0];
  int n = Rf_length(row_idx_); // number of observations used for fitting model
  int p = xMat->ncol();
  int L = INTEGER(nlambda_)[0];
  int lam_scale = INTEGER(lam_scale_)[0];
  double eps = REAL(eps_)[0];
  int max_iter = INTEGER(max_iter_)[0];
  double *m = REAL(multiplier_);
  int dfmax = INTEGER(dfmax_)[0];
  int warn = INTEGER(warn_)[0];
  int user = INTEGER(user_)[0];
  int verbose = INTEGER(verbose_)[0];

  // Per-lambda outputs (length L) and per-column standardization info (length p).
  NumericVector lambda(L);
  NumericVector Dev(L);
  IntegerVector iter(L);
  IntegerVector n_reject(L);
  NumericVector beta0(L);
  NumericVector center(p);
  NumericVector scale(p);
  int p_keep = 0; // keep columns whose scale > 1e-6
  int *p_keep_ptr = &p_keep;
  vector<int> col_idx;
  vector<double> z;
  double lambda_max = 0.0;
  double *lambda_max_ptr = &lambda_max;
  int xmax_idx = 0;
  int *xmax_ptr = &xmax_idx;
  
  // set up omp
  int useCores = INTEGER(ncore_)[0];
#ifdef BIGLASSO_OMP_H_
  int haveCores = omp_get_num_procs();
  if(useCores < 1) {
    useCores = haveCores;
  }
  omp_set_dynamic(0);
  omp_set_num_threads(useCores);
#endif
  
  if (verbose) {
    char buff1[100];
    time_t now1 = time (0);
    strftime (buff1, 100, "%Y-%m-%d %H:%M:%S.000", localtime (&now1));
    Rprintf("\nPreprocessing start: %s\n", buff1);
  }
  
  // standardize: get center, scale; get p_keep_ptr, col_idx; get z, lambda_max, xmax_idx;
  standardize_and_get_residual(center, scale, p_keep_ptr, col_idx, z, lambda_max_ptr, xmax_ptr, xMat, 
                               y, row_idx, lambda_min, alpha, n, p);
  p = p_keep; // set p = p_keep, only loop over columns whose scale > 1e-6

  if (verbose) {
    char buff1[100];
    time_t now1 = time (0);
    strftime (buff1, 100, "%Y-%m-%d %H:%M:%S.000", localtime (&now1));
    Rprintf("Preprocessing end: %s\n", buff1);
    Rprintf("\n-----------------------------------------------\n");
  }

  // Work arrays; all of them are released through Free_memo_bin_hsr() on
  // every return path below.
  arma::sp_mat beta = arma::sp_mat(p, L); //beta
  double *a = Calloc(p, double); //Beta from previous iteration
  double a0 = 0.0; //beta0 from previousiteration
  double *w = Calloc(n, double);
  double *s = Calloc(n, double); //y_i - pi_i
  double *eta = Calloc(n, double);
  int *e1 = Calloc(p, int); //ever-active set
  int *e2 = Calloc(p, int); //strong set
  double xwr, xwx, pi, u, v, cutoff, l1, l2, shift, si;
  double max_update, update, thresh; // for convergence check
  int i, j, jj, l, violations, lstart;
  
  // Intercept-only start: beta0[0] is the log-odds of the mean response, and
  // nullDev accumulates -y*log(ybar) - (1-y)*log(1-ybar) over the observations.
  double ybar = sum(y, n) / n;
  a0 = beta0[0] = log(ybar / (1-ybar));
  double nullDev = 0;
  double *r = Calloc(n, double);
  for (i = 0; i < n; i++) {
    r[i] = y[i];
    nullDev = nullDev - y[i]*log(ybar) - (1-y[i])*log(1-ybar);
    s[i] = y[i] - ybar;
    eta[i] = a0;
  }
  thresh = eps * nullDev / n;
  
  double sumS = sum(s, n); // temp result sum of s
  double sumWResid = 0.0; // temp result: sum of w * r

  // set up lambda
  if (user == 0) {
    if (lam_scale) { // set up lambda, equally spaced on log scale
      double log_lambda_max = log(lambda_max);
      double log_lambda_min = log(lambda_min*lambda_max);
      
      // With a single grid point there is no spacing to compute. Dividing by
      // L-1 == 0 would give inf (or NaN), and 0 * inf is NaN, so lambda[0]
      // would come out as NaN instead of lambda_max.
      double delta = (L > 1) ? (log_lambda_max - log_lambda_min) / (L-1) : 0.0;
      for (l = 0; l < L; l++) {
        lambda[l] = exp(log_lambda_max - l * delta);
      }
    } else { // equally spaced on linear scale
      // Same single-point guard as on the log-scale branch.
      double delta = (L > 1) ? (lambda_max - lambda_min*lambda_max) / (L-1) : 0.0;
      for (l = 0; l < L; l++) {
        lambda[l] = lambda_max - l * delta;
      }
    }
    // The first generated lambda is lambda_max; its entry is filled in here
    // (null deviance, all p columns rejected) and the path starts at l = 1.
    Dev[0] = nullDev;
    lstart = 1;
    n_reject[0] = p;
  } else {
    lstart = 0;
    lambda = Rcpp::as<NumericVector>(lambda_);
  }
  
  for (l = lstart; l < L; l++) {
    if(verbose) {
      // output time
      char buff[100];
      time_t now = time (0);
      strftime (buff, 100, "%Y-%m-%d %H:%M:%S.000", localtime (&now));
      Rprintf("Lambda %d. Now time: %s\n", l, buff);
    }
    
    if (l != 0) {
      // Check dfmax
      int nv = 0;
      for (j = 0; j < p; j++) {
        if (a[j] != 0) {
          nv++;
        }
      }
      if (nv > dfmax) {
        for (int ll=l; ll<L; ll++) iter[ll] = NA_INTEGER;
        Free_memo_bin_hsr(s, w, a, r, e1, e2, eta);
        return List::create(beta0, beta, center, scale, lambda, Dev, 
                            iter, n_reject, Rcpp::wrap(col_idx));
      }
   
      // strong set
      cutoff = 2*lambda[l] - lambda[l-1];
      for (j = 0; j < p; j++) {
        if (fabs(z[j]) > (cutoff * alpha * m[col_idx[j]])) {
          e2[j] = 1;
        } else {
          e2[j] = 0;
        }
      }
      
    } else {
      // strong set
      // l == 0 only happens with a user-supplied sequence; lambda_max stands
      // in for the (non-existent) previous lambda.
      cutoff = 2*lambda[l] - lambda_max;
      for (j = 0; j < p; j++) {
        if (fabs(z[j]) > (cutoff * alpha * m[col_idx[j]])) {
          e2[j] = 1;
        } else {
          e2[j] = 0;
        }
      }
    }
   
    n_reject[l] = p - sum(e2, p);
    // Outer loop: repeat until check_rest_set_bin() reports no violations.
    while (iter[l] < max_iter) {
      // Middle loop: repeat until check_strong_set_bin() reports no violations.
      while (iter[l] < max_iter) {
        // Inner loop: coordinate updates over e1 until converged.
        while (iter[l] < max_iter) {
          iter[l]++;
          Dev[l] = 0.0;
          
          // Recompute pi, w, s, r from the current eta and accumulate the
          // deviance. For |eta| > 10, pi is set to exactly 0 or 1 and the
          // weight to the fixed value .0001.
          for (i = 0; i < n; i++) {
            if (eta[i] > 10) {
              pi = 1;
              w[i] = .0001;
            } else if (eta[i] < -10) {
              pi = 0;
              w[i] = .0001;
            } else {
              pi = exp(eta[i]) / (1 + exp(eta[i]));
              w[i] = pi * (1 - pi);
            }
            s[i] = y[i] - pi;
            r[i] = s[i] / w[i];
            if (y[i] == 1) {
              Dev[l] = Dev[l] - log(pi);
            } else {
              Dev[l] = Dev[l] - log(1-pi);
            }
          }
          
          // Deviance below 1% of the null deviance: treat as saturated and
          // return what has been computed so far.
          if (Dev[l] / nullDev < .01) {
            if (warn) warning("Model saturated; exiting...");
            for (int ll=l; ll<L; ll++) iter[ll] = NA_INTEGER;
            Free_memo_bin_hsr(s, w, a, r, e1, e2, eta);
            return List::create(beta0, beta, center, scale, lambda, Dev,
                                iter, n_reject, Rcpp::wrap(col_idx));
          }
          
          // Intercept
          xwr = crossprod(w, r, n, 0);
          xwx = sum(w, n);
          beta0[l] = xwr / xwx + a0;
          si = beta0[l] - a0;
          if (si != 0) {
            a0 = beta0[l];
            for (i = 0; i < n; i++) {
              r[i] -= si; //update r
              eta[i] += si; //update eta
            }
          }
          sumWResid = wsum(r, w, n); // update temp result: sum of w * r, used for computing xwr;

          max_update = 0.0;
          for (j = 0; j < p; j++) {
            if (e1[j]) {
              jj = col_idx[j]; // original column index of kept column j
              xwr = wcrossprod_resid(xMat, r, sumWResid, row_idx, center[jj], scale[jj], w, n, jj);
              v = wsqsum_bm(xMat, w, row_idx, center[jj], scale[jj], n, jj) / n;
              u = xwr/n + v * a[j];
              l1 = lambda[l] * m[jj] * alpha;
              l2 = lambda[l] * m[jj] * (1-alpha);
              beta(j, l) = lasso(u, l1, l2, v);

              shift = beta(j, l) - a[j];
              if (shift !=0) {
                // update change of objective function
                // update = - u * shift + (0.5 * v + 0.5 * l2) * (pow(beta(j, l), 2) - pow(a[j], 2)) + l1 * (fabs(beta(j, l)) - fabs(a[j]));
                
                update = pow(beta(j, l) - a[j], 2) * v;
                if (update > max_update) max_update = update;
                update_resid_eta(r, eta, xMat, shift, row_idx, center[jj], scale[jj], n, jj); // update r
                sumWResid = wsum(r, w, n); // update temp result w * r, used for computing xwr;
                a[j] = beta(j, l); // update a
              }
            }
          }
          // Check for convergence
          if (max_update < thresh)  break;
        }
        // Scan for violations in strong set
        sumS = sum(s, n);
        violations = check_strong_set_bin(e1, e2, z, xMat, row_idx, col_idx, center, scale, a, lambda[l], sumS, alpha, s, m, n, p);
        if (violations==0) break;
      }
      // Scan for violations in rest
      violations = check_rest_set_bin(e1, e2, z, xMat, row_idx, col_idx, center, scale, a, lambda[l], sumS, alpha, s, m, n, p);
      if (violations==0) break;
    }
  }
  Free_memo_bin_hsr(s, w, a, r, e1, e2, eta);
  return List::create(beta0, beta, center, scale, lambda, Dev, iter, n_reject, Rcpp::wrap(col_idx));
  
}
Example #29
0
int
main ()
{
  char *env_proc_bind = getenv ("OMP_PROC_BIND");
  int test_false = env_proc_bind && strcmp (env_proc_bind, "false") == 0;
  int test_true = env_proc_bind && strcmp (env_proc_bind, "true") == 0;
  int test_spread_master_close
    = env_proc_bind && strcmp (env_proc_bind, "spread,master,close") == 0;
  char *env_places = getenv ("OMP_PLACES");
  int test_places = 0;

#ifdef DO_FORK
  if (env_places == NULL && contig_cpucount >= 8 && test_false
      && getenv ("GOMP_AFFINITY") == NULL)
    {
      int i, j, status;
      pid_t pid;
      for (j = 0; j < 2; j++)
	{
	  if (setenv ("OMP_PROC_BIND", j ? "spread,master,close" : "true", 1)
	      < 0)
	    break;
	  for (i = sizeof (places_array) / sizeof (places_array[0]) - 1;
	       i; --i)
	    {
	      if (setenv ("OMP_PLACES", places_array[i].name, 1) < 0)
		break;
	      pid = fork ();
	      if (pid == -1)
		break;
	      if (pid == 0)
		{
		  execl ("/proc/self/exe", "affinity-1.exe", NULL);
		  _exit (1);
		}
	      if (waitpid (pid, &status, 0) < 0)
		break;
	      if (WIFSIGNALED (status) && WTERMSIG (status) == SIGABRT)
		abort ();
	      else if (!WIFEXITED (status) || WEXITSTATUS (status) != 0)
		break;
	    }
	  if (i)
	    break;
	}
    }
#endif

  int first = 1;
  if (env_proc_bind)
    {
      printf ("OMP_PROC_BIND='%s'", env_proc_bind);
      first = 0;
    }
  if (env_places)
    printf ("%sOMP_PLACES='%s'", first ? "" : " ", env_places);
  printf ("\n");

  if (env_places && contig_cpucount >= 8
      && (test_true || test_spread_master_close))
    {
      for (test_places = sizeof (places_array) / sizeof (places_array[0]) - 1;
	   test_places; --test_places)
	if (strcmp (env_places, places_array[test_places].name) == 0)
	  break;
    }

#define verify(if_true, if_s_m_c) \
  if (test_false && omp_get_proc_bind () != omp_proc_bind_false)	\
    abort ();								\
  if (test_true && omp_get_proc_bind () != if_true)			\
    abort ();								\
  if (test_spread_master_close && omp_get_proc_bind () != if_s_m_c)	\
    abort ();

  verify (omp_proc_bind_true, omp_proc_bind_spread);

  printf ("Initial thread");
  print_affinity (places_array[test_places].places[0]);
  printf ("\n");
  omp_set_nested (1);
  omp_set_dynamic (0);

  #pragma omp parallel if (0)
  {
    verify (omp_proc_bind_true, omp_proc_bind_master);
    #pragma omp parallel if (0)
    {
      verify (omp_proc_bind_true, omp_proc_bind_close);
      #pragma omp parallel if (0)
      {
	verify (omp_proc_bind_true, omp_proc_bind_close);
      }
      #pragma omp parallel if (0) proc_bind (spread)
      {
	verify (omp_proc_bind_spread, omp_proc_bind_spread);
      }
    }
    #pragma omp parallel if (0) proc_bind (master)
    {
      verify (omp_proc_bind_master, omp_proc_bind_close);
      #pragma omp parallel if (0)
      {
	verify (omp_proc_bind_master, omp_proc_bind_close);
      }
      #pragma omp parallel if (0) proc_bind (spread)
      {
	verify (omp_proc_bind_spread, omp_proc_bind_spread);
      }
    }
  }

  /* True/spread */
  #pragma omp parallel num_threads (4)
  {
    verify (omp_proc_bind_true, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#1 thread %d", thr);
      if (omp_get_num_threads () == 4 && test_spread_master_close)
	switch (places_array[test_places].count)
	  {
	  case 8:
	    /* T = 4, P = 8, each subpartition has 2 places.  */
	  case 7:
	    /* T = 4, P = 7, each subpartition has 2 places, but
	       last partition, which has just one place.  */
	    p = places_array[test_places].places[2 * thr];
	    break;
	  case 5:
	    /* T = 4, P = 5, first subpartition has 2 places, the
	       rest just one.  */
	    p = places_array[test_places].places[thr ? 1 + thr : 0];
	    break;
	  case 3:
	    /* T = 4, P = 3, unit sized subpartitions, first gets
	       thr0 and thr3, second thr1, third thr2.  */
	    p = places_array[test_places].places[thr == 3 ? 0 : thr];
	    break;
	  case 2:
	    /* T = 4, P = 2, unit sized subpartitions, each with
	       2 threads.  */
	    p = places_array[test_places].places[thr / 2];
	    break;
	  }
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 3)
      {
	/* True/spread, true/master.  */
	#pragma omp parallel num_threads (3)
	{
	  verify (omp_proc_bind_true, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#1,#1 thread 3,%d", thr);
	    if (omp_get_num_threads () == 3 && test_spread_master_close)
	      /* Outer is spread, inner master, so just bind to the
		 place or the master thread, which is thr 3 above.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		case 7:
		  p = places_array[test_places].places[6];
		  break;
		case 5:
		  p = places_array[test_places].places[4];
		  break;
		case 3:
		  p = places_array[test_places].places[0];
		  break;
		case 2:
		  p = places_array[test_places].places[1];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* True/spread, spread.  */
	#pragma omp parallel num_threads (5) proc_bind (spread)
	{
	  verify (omp_proc_bind_spread, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#1,#2 thread 3,%d", thr);
	    if (omp_get_num_threads () == 5 && test_spread_master_close)
	      /* Outer is spread, inner spread.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 5, P = 2, unit sized subpartitions.  */
		  p = places_array[test_places].places[thr == 4 ? 6
						       : 6 + thr / 2];
		  break;
		/* The rest are T = 5, P = 1.  */
		case 7:
		  p = places_array[test_places].places[6];
		  break;
		case 5:
		  p = places_array[test_places].places[4];
		  break;
		case 3:
		  p = places_array[test_places].places[0];
		  break;
		case 2:
		  p = places_array[test_places].places[1];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	  #pragma omp barrier
	  if (omp_get_thread_num () == 3)
	    {
	      /* True/spread, spread, close.  */
	      #pragma omp parallel num_threads (5) proc_bind (close)
	      {
		verify (omp_proc_bind_close, omp_proc_bind_close);
		#pragma omp critical
		{
		  struct place p = places_array[0].places[0];
		  int thr = omp_get_thread_num ();
		  printf ("#1,#2,#1 thread 3,3,%d", thr);
		  if (omp_get_num_threads () == 5 && test_spread_master_close)
		    /* Outer is spread, inner spread, innermost close.  */
		    switch (places_array[test_places].count)
		      {
		      /* All are T = 5, P = 1.  */
		      case 8:
			p = places_array[test_places].places[7];
			break;
		      case 7:
			p = places_array[test_places].places[6];
			break;
		      case 5:
			p = places_array[test_places].places[4];
			break;
		      case 3:
			p = places_array[test_places].places[0];
			break;
		      case 2:
			p = places_array[test_places].places[1];
			break;
		      }
		  print_affinity (p);
		  printf ("\n");
		}
	      }
	    }
	}
	/* True/spread, master.  */
	#pragma omp parallel num_threads (4) proc_bind(master)
	{
	  verify (omp_proc_bind_master, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#1,#3 thread 3,%d", thr);
	    if (omp_get_num_threads () == 4 && test_spread_master_close)
	      /* Outer is spread, inner master, so just bind to the
		 place or the master thread, which is thr 3 above.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		case 7:
		  p = places_array[test_places].places[6];
		  break;
		case 5:
		  p = places_array[test_places].places[4];
		  break;
		case 3:
		  p = places_array[test_places].places[0];
		  break;
		case 2:
		  p = places_array[test_places].places[1];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* True/spread, close.  */
	#pragma omp parallel num_threads (6) proc_bind (close)
	{
	  verify (omp_proc_bind_close, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#1,#4 thread 3,%d", thr);
	    if (omp_get_num_threads () == 6 && test_spread_master_close)
	      /* Outer is spread, inner close.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 6, P = 2, unit sized subpartitions.  */
		  p = places_array[test_places].places[6 + thr / 3];
		  break;
		/* The rest are T = 6, P = 1.  */
		case 7:
		  p = places_array[test_places].places[6];
		  break;
		case 5:
		  p = places_array[test_places].places[4];
		  break;
		case 3:
		  p = places_array[test_places].places[0];
		  break;
		case 2:
		  p = places_array[test_places].places[1];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	}
      }
  }

  /* Spread.  */
  #pragma omp parallel num_threads (5) proc_bind(spread)
  {
    verify (omp_proc_bind_spread, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#2 thread %d", thr);
      if (omp_get_num_threads () == 5
	  && (test_spread_master_close || test_true))
	switch (places_array[test_places].count)
	  {
	  case 8:
	    /* T = 5, P = 8, first 3 subpartitions have 2 places, last
	       2 one place.  */
	    p = places_array[test_places].places[thr < 3 ? 2 * thr : 3 + thr];
	    break;
	  case 7:
	    /* T = 5, P = 7, first 2 subpartitions have 2 places, last
	       3 one place.  */
	    p = places_array[test_places].places[thr < 2 ? 2 * thr : 2 + thr];
	    break;
	  case 5:
	    /* T = 5, P = 5, unit sized subpartitions, each one with one
	       thread.  */
	    p = places_array[test_places].places[thr];
	    break;
	  case 3:
	    /* T = 5, P = 3, unit sized subpartitions, first gets
	       thr0 and thr3, second thr1 and thr4, third thr2.  */
	    p = places_array[test_places].places[thr >= 3 ? thr - 3 : thr];
	    break;
	  case 2:
	    /* T = 5, P = 2, unit sized subpartitions, first with
	       thr{0,1,4} and second with thr{2,3}.  */
	    p = places_array[test_places].places[thr == 4 ? 0 : thr / 2];
	    break;
	  }
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 3)
      {
	int pp = 0;
	switch (places_array[test_places].count)
	  {
	  case 8: pp = 6; break;
	  case 7: pp = 5; break;
	  case 5: pp = 3; break;
	  case 2: pp = 1; break;
	  }
	/* Spread, spread/master.  */
	#pragma omp parallel num_threads (3) firstprivate (pp)
	{
	  verify (omp_proc_bind_spread, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#2,#1 thread 3,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is spread, inner spread resp. master, bit we have
		 just unit sized partitions.  */
	      p = places_array[test_places].places[pp];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Spread, spread.  */
	#pragma omp parallel num_threads (5) proc_bind (spread) \
			     firstprivate (pp)
	{
	  verify (omp_proc_bind_spread, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#2,#2 thread 3,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is spread, inner spread, bit we have
		 just unit sized partitions.  */
	      p = places_array[test_places].places[pp];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Spread, master.  */
	#pragma omp parallel num_threads (4) proc_bind(master) \
			     firstprivate(pp)
	{
	  verify (omp_proc_bind_master, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#2,#3 thread 3,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is spread, inner master, bit we have
		 just unit sized partitions.  */
	      p = places_array[test_places].places[pp];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Spread, close.  */
	#pragma omp parallel num_threads (6) proc_bind (close) \
			     firstprivate (pp)
	{
	  verify (omp_proc_bind_close, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#2,#4 thread 3,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is spread, inner close, bit we have
		 just unit sized partitions.  */
	      p = places_array[test_places].places[pp];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
      }
  }

  /* Master.  */
  #pragma omp parallel num_threads (3) proc_bind(master)
  {
    verify (omp_proc_bind_master, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#3 thread %d", thr);
      if (test_spread_master_close || test_true)
	p = places_array[test_places].places[0];
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 2)
      {
	/* Master, master.  */
	#pragma omp parallel num_threads (4)
	{
	  verify (omp_proc_bind_master, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#3,#1 thread 2,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is master, inner is master.  */
	      p = places_array[test_places].places[0];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Master, spread.  */
	#pragma omp parallel num_threads (4) proc_bind (spread)
	{
	  verify (omp_proc_bind_spread, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#3,#2 thread 2,%d", thr);
	    if (omp_get_num_threads () == 4
		&& (test_spread_master_close || test_true))
	      /* Outer is master, inner is spread.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 4, P = 8, each subpartition has 2 places.  */
		case 7:
		  /* T = 4, P = 7, each subpartition has 2 places, but
		     last partition, which has just one place.  */
		  p = places_array[test_places].places[2 * thr];
		  break;
		case 5:
		  /* T = 4, P = 5, first subpartition has 2 places, the
		     rest just one.  */
		  p = places_array[test_places].places[thr ? 1 + thr : 0];
		  break;
		case 3:
		  /* T = 4, P = 3, unit sized subpartitions, first gets
		     thr0 and thr3, second thr1, third thr2.  */
		  p = places_array[test_places].places[thr == 3 ? 0 : thr];
		  break;
		case 2:
		  /* T = 4, P = 2, unit sized subpartitions, each with
		     2 threads.  */
		  p = places_array[test_places].places[thr / 2];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	  #pragma omp barrier
	  if (omp_get_thread_num () == 0)
	    {
	      /* Master, spread, close.  */
	      #pragma omp parallel num_threads (5) proc_bind (close)
	      {
		verify (omp_proc_bind_close, omp_proc_bind_close);
		#pragma omp critical
		{
		  struct place p = places_array[0].places[0];
		  int thr = omp_get_thread_num ();
		  printf ("#3,#2,#1 thread 2,0,%d", thr);
		  if (omp_get_num_threads () == 5
		      && (test_spread_master_close || test_true))
		    /* Outer is master, inner spread, innermost close.  */
		    switch (places_array[test_places].count)
		      {
		      /* First 3 are T = 5, P = 2.  */
		      case 8:
		      case 7:
		      case 5:
			p = places_array[test_places].places[(thr & 2) / 2];
			break;
		      /* All the rest are T = 5, P = 1.  */
		      case 3:
		      case 2:
			p = places_array[test_places].places[0];
			break;
		      }
		  print_affinity (p);
		  printf ("\n");
		}
	      }
	    }
	  #pragma omp barrier
	  if (omp_get_thread_num () == 3)
	    {
	      /* Master, spread, close.  */
	      #pragma omp parallel num_threads (5) proc_bind (close)
	      {
		verify (omp_proc_bind_close, omp_proc_bind_close);
		#pragma omp critical
		{
		  struct place p = places_array[0].places[0];
		  int thr = omp_get_thread_num ();
		  printf ("#3,#2,#2 thread 2,3,%d", thr);
		  if (omp_get_num_threads () == 5
		      && (test_spread_master_close || test_true))
		    /* Outer is master, inner spread, innermost close.  */
		    switch (places_array[test_places].count)
		      {
		      case 8:
			/* T = 5, P = 2.  */
			p = places_array[test_places].places[6
							     + (thr & 2) / 2];
			break;
		      /* All the rest are T = 5, P = 1.  */
		      case 7:
			p = places_array[test_places].places[6];
			break;
		      case 5:
			p = places_array[test_places].places[4];
			break;
		      case 3:
			p = places_array[test_places].places[0];
			break;
		      case 2:
			p = places_array[test_places].places[1];
			break;
		      }
		  print_affinity (p);
		  printf ("\n");
		}
	      }
	    }
	}
	/* Master, master.  */
	#pragma omp parallel num_threads (4) proc_bind(master)
	{
	  verify (omp_proc_bind_master, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#3,#3 thread 2,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is master, inner master.  */
	      p = places_array[test_places].places[0];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Master, close.  */
	#pragma omp parallel num_threads (6) proc_bind (close)
	{
	  verify (omp_proc_bind_close, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#3,#4 thread 2,%d", thr);
	    if (omp_get_num_threads () == 6
		&& (test_spread_master_close || test_true))
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 6, P = 8.  */
		case 7:
		  /* T = 6, P = 7.  */
		  p = places_array[test_places].places[thr];
		  break;
		case 5:
		  /* T = 6, P = 5.  thr{0,5} go into the first place.  */
		  p = places_array[test_places].places[thr == 5 ? 0 : thr];
		  break;
		case 3:
		  /* T = 6, P = 3, two threads into each place.  */
		  p = places_array[test_places].places[thr / 2];
		  break;
		case 2:
		  /* T = 6, P = 2, 3 threads into each place.  */
		  p = places_array[test_places].places[thr / 3];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	}
      }
  }

  #pragma omp parallel num_threads (5) proc_bind(close)
  {
    verify (omp_proc_bind_close, omp_proc_bind_master);
    #pragma omp critical
    {
      struct place p = places_array[0].places[0];
      int thr = omp_get_thread_num ();
      printf ("#4 thread %d", thr);
      if (omp_get_num_threads () == 5
	  && (test_spread_master_close || test_true))
	switch (places_array[test_places].count)
	  {
	  case 8:
	    /* T = 5, P = 8.  */
	  case 7:
	    /* T = 5, P = 7.  */
	  case 5:
	    /* T = 5, P = 5.  */
	    p = places_array[test_places].places[thr];
	    break;
	  case 3:
	    /* T = 5, P = 3, thr{0,3} in first place, thr{1,4} in second,
	       thr2 in third.  */
	    p = places_array[test_places].places[thr >= 3 ? thr - 3 : thr];
	    break;
	  case 2:
	    /* T = 5, P = 2, thr{0,1,4} in first place, thr{2,3} in second.  */
	    p = places_array[test_places].places[thr == 4 ? 0 : thr / 2];
	    break;
	  }
      print_affinity (p);
      printf ("\n");
    }
    #pragma omp barrier
    if (omp_get_thread_num () == 2)
      {
	int pp = 0;
	switch (places_array[test_places].count)
	  {
	  case 8:
	  case 7:
	  case 5:
	  case 3:
	    pp = 2;
	    break;
	  case 2:
	    pp = 1;
	    break;
	  }
	/* Close, close/master.  */
	#pragma omp parallel num_threads (4) firstprivate (pp)
	{
	  verify (omp_proc_bind_close, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#4,#1 thread 2,%d", thr);
	    if (test_spread_master_close)
	      /* Outer is close, inner is master.  */
	      p = places_array[test_places].places[pp];
	    else if (omp_get_num_threads () == 4 && test_true)
	      /* Outer is close, inner is close.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 4, P = 8.  */
		case 7:
		  /* T = 4, P = 7.  */
		  p = places_array[test_places].places[2 + thr];
		  break;
		case 5:
		  /* T = 4, P = 5.  There is wrap-around for thr3.  */
		  p = places_array[test_places].places[thr == 3 ? 0 : 2 + thr];
		  break;
		case 3:
		  /* T = 4, P = 3, thr{0,3} go into p2, thr1 into p0, thr2
		     into p1.  */
		  p = places_array[test_places].places[(2 + thr) % 3];
		  break;
		case 2:
		  /* T = 4, P = 2, 2 threads into each place.  */
		  p = places_array[test_places].places[1 - thr / 2];
		  break;
		}

	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Close, spread.  */
	#pragma omp parallel num_threads (4) proc_bind (spread)
	{
	  verify (omp_proc_bind_spread, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#4,#2 thread 2,%d", thr);
	    if (omp_get_num_threads () == 4
		&& (test_spread_master_close || test_true))
	      /* Outer is close, inner is spread.  */
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 4, P = 8, each subpartition has 2 places.  */
		case 7:
		  /* T = 4, P = 7, each subpartition has 2 places, but
		     last partition, which has just one place.  */
		  p = places_array[test_places].places[thr == 3 ? 0
						       : 2 + 2 * thr];
		  break;
		case 5:
		  /* T = 4, P = 5, first subpartition has 2 places, the
		     rest just one.  */
		  p = places_array[test_places].places[thr == 3 ? 0
						       : 2 + thr];
		  break;
		case 3:
		  /* T = 4, P = 3, unit sized subpartitions, third gets
		     thr0 and thr3, first thr1, second thr2.  */
		  p = places_array[test_places].places[thr == 0 ? 2 : thr - 1];
		  break;
		case 2:
		  /* T = 4, P = 2, unit sized subpartitions, each with
		     2 threads.  */
		  p = places_array[test_places].places[1 - thr / 2];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	  #pragma omp barrier
	  if (omp_get_thread_num () == 0)
	    {
	      /* Close, spread, close.  */
	      #pragma omp parallel num_threads (5) proc_bind (close)
	      {
		verify (omp_proc_bind_close, omp_proc_bind_close);
		#pragma omp critical
		{
		  struct place p = places_array[0].places[0];
		  int thr = omp_get_thread_num ();
		  printf ("#4,#2,#1 thread 2,0,%d", thr);
		  if (omp_get_num_threads () == 5
		      && (test_spread_master_close || test_true))
		    /* Outer is close, inner spread, innermost close.  */
		    switch (places_array[test_places].count)
		      {
		      case 8:
		      case 7:
			/* T = 5, P = 2.  */
			p = places_array[test_places].places[2
							     + (thr & 2) / 2];
			break;
		      /* All the rest are T = 5, P = 1.  */
		      case 5:
		      case 3:
			p = places_array[test_places].places[2];
			break;
		      case 2:
			p = places_array[test_places].places[1];
			break;
		      }
		  print_affinity (p);
		  printf ("\n");
		}
	      }
	    }
	  #pragma omp barrier
	  if (omp_get_thread_num () == 2)
	    {
	      /* Close, spread, close.  */
	      #pragma omp parallel num_threads (5) proc_bind (close)
	      {
		verify (omp_proc_bind_close, omp_proc_bind_close);
		#pragma omp critical
		{
		  struct place p = places_array[0].places[0];
		  int thr = omp_get_thread_num ();
		  printf ("#4,#2,#2 thread 2,2,%d", thr);
		  if (omp_get_num_threads () == 5
		      && (test_spread_master_close || test_true))
		    /* Outer is close, inner spread, innermost close.  */
		    switch (places_array[test_places].count)
		      {
		      case 8:
			/* T = 5, P = 2.  */
			p = places_array[test_places].places[6
							     + (thr & 2) / 2];
			break;
		      /* All the rest are T = 5, P = 1.  */
		      case 7:
			p = places_array[test_places].places[6];
			break;
		      case 5:
			p = places_array[test_places].places[4];
			break;
		      case 3:
			p = places_array[test_places].places[1];
			break;
		      case 2:
			p = places_array[test_places].places[0];
			break;
		      }
		  print_affinity (p);
		  printf ("\n");
		}
	      }
	    }
	  #pragma omp barrier
	  if (omp_get_thread_num () == 3)
	    {
	      /* Close, spread, close.  */
	      #pragma omp parallel num_threads (5) proc_bind (close)
	      {
		verify (omp_proc_bind_close, omp_proc_bind_close);
		#pragma omp critical
		{
		  struct place p = places_array[0].places[0];
		  int thr = omp_get_thread_num ();
		  printf ("#4,#2,#3 thread 2,3,%d", thr);
		  if (omp_get_num_threads () == 5
		      && (test_spread_master_close || test_true))
		    /* Outer is close, inner spread, innermost close.  */
		    switch (places_array[test_places].count)
		      {
		      case 8:
		      case 7:
		      case 5:
			/* T = 5, P = 2.  */
			p = places_array[test_places].places[(thr & 2) / 2];
			break;
		      /* All the rest are T = 5, P = 1.  */
		      case 3:
			p = places_array[test_places].places[2];
			break;
		      case 2:
			p = places_array[test_places].places[0];
			break;
		      }
		  print_affinity (p);
		  printf ("\n");
		}
	      }
	    }
	}
	/* Close, master.  */
	#pragma omp parallel num_threads (4) proc_bind(master) \
			     firstprivate (pp)
	{
	  verify (omp_proc_bind_master, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#4,#3 thread 2,%d", thr);
	    if (test_spread_master_close || test_true)
	      /* Outer is close, inner master.  */
	      p = places_array[test_places].places[pp];
	    print_affinity (p);
	    printf ("\n");
	  }
	}
	/* Close, close.  */
	#pragma omp parallel num_threads (6) proc_bind (close)
	{
	  verify (omp_proc_bind_close, omp_proc_bind_close);
	  #pragma omp critical
	  {
	    struct place p = places_array[0].places[0];
	    int thr = omp_get_thread_num ();
	    printf ("#4,#4 thread 2,%d", thr);
	    if (omp_get_num_threads () == 6
		&& (test_spread_master_close || test_true))
	      switch (places_array[test_places].count)
		{
		case 8:
		  /* T = 6, P = 8.  */
		  p = places_array[test_places].places[2 + thr];
		  break;
		case 7:
		  /* T = 6, P = 7.  */
		  p = places_array[test_places].places[thr == 5 ? 0 : 2 + thr];
		  break;
		case 5:
		  /* T = 6, P = 5.  thr{0,5} go into the third place.  */
		  p = places_array[test_places].places[thr >= 3 ? thr - 3
						       : 2 + thr];
		  break;
		case 3:
		  /* T = 6, P = 3, two threads into each place.  */
		  p = places_array[test_places].places[thr < 2 ? 2
						       : thr / 2 - 1];
		  break;
		case 2:
		  /* T = 6, P = 2, 3 threads into each place.  */
		  p = places_array[test_places].places[1 - thr / 3];
		  break;
		}
	    print_affinity (p);
	    printf ("\n");
	  }
	}
      }
  }

  return 0;
}
Example #30
0
/* Wall-clock seconds elapsed between two gettimeofday() samples. */
static double wall_seconds_between(const struct timeval *start,
                                   const struct timeval *end)
{
  return (double)((end->tv_usec - start->tv_usec)
                  /1.0e6 + end->tv_sec - start->tv_sec);
}

/*
 * Benchmark driver.
 *
 * Parses six command line arguments (N dist pop rep L P), generates a
 * point set, and for each repetition times four stages: hash code
 * computation, Morton encoding, truncated radix sort and data
 * rearrangement.  After each repetition the result is verified with
 * check_index() and check_codes() and PASS/FAIL is printed.
 *
 * Returns 0 when the run completes, 1 on a usage error, an invalid
 * argument or an allocation failure.  Verification failures are only
 * reported on stdout; they do not change the exit status.
 */
int main(int argc, char** argv){
  extern int nthreads;
  // Time counting variables
  struct timeval startwtime, endwtime;

  if (argc != 7) { // Check if the command line arguments are correct
    printf("Usage: %s N dist pop rep L P\n"
           "where\n"
           "N    : number of points\n"
           "dist : distribution code (0-cube, 1-sphere)\n"
           "pop  : population threshold\n"
           "rep  : repetitions\n"
           "L    : maximum tree height\n"
           "P    : number of threads.\n", argv[0]);
    return (1);
  }

  // Input command line arguments
  int N = atoi(argv[1]); // Number of points
  int dist = atoi(argv[2]); // Distribution identifier
  int population_threshold = atoi(argv[3]); // population threshold
  int repeat = atoi(argv[4]); // number of independent runs
  int maxlev = atoi(argv[5]); // maximum tree height
  nthreads = atoi(argv[6]); // number of OpenMP threads

  // Reject values that would lead to undefined behaviour further down:
  //  - N sizes every allocation and bounds the index loop;
  //  - maxlev feeds `1 << maxlev` and the shift amount 3*(maxlev-1);
  //  - omp_set_num_threads() requires a positive thread count.
  if (N <= 0) {
    fprintf(stderr, "N must be a positive integer.\n");
    return (1);
  }
  if (maxlev < 1 || maxlev >= (int)(8 * sizeof(int)) - 1) {
    fprintf(stderr, "L must be between 1 and %d.\n",
            (int)(8 * sizeof(int)) - 2);
    return (1);
  }
  if (nthreads <= 0) {
    fprintf(stderr, "P must be a positive integer.\n");
    return (1);
  }

  omp_set_dynamic(0);
  omp_set_num_threads(nthreads);

  printf("Running for %d particles with maximum height: %d\n", N, maxlev);

  // Do the size arithmetic in size_t so N*DIM cannot overflow int.
  size_t npoints = (size_t)N;

  float *X = malloc(npoints*DIM*sizeof *X);
  float *Y = malloc(npoints*DIM*sizeof *Y);

  unsigned int *hash_codes = malloc(npoints*DIM*sizeof *hash_codes);
  unsigned long int *morton_codes = malloc(npoints*sizeof *morton_codes);
  unsigned long int *sorted_morton_codes = malloc(npoints*sizeof *sorted_morton_codes);
  unsigned int *permutation_vector = malloc(npoints*sizeof *permutation_vector);
  unsigned int *index = malloc(npoints*sizeof *index);
  // record of the leaf of the tree and their level (zero-initialised)
  unsigned int *level_record = calloc(npoints, sizeof *level_record);

  if (X == NULL || Y == NULL || hash_codes == NULL || morton_codes == NULL
      || sorted_morton_codes == NULL || permutation_vector == NULL
      || index == NULL || level_record == NULL) {
    fprintf(stderr, "can not allocate memory.\n");
    // free(NULL) is a no-op, so release whatever did get allocated.
    free(X);
    free(Y);
    free(hash_codes);
    free(morton_codes);
    free(sorted_morton_codes);
    free(permutation_vector);
    free(index);
    free(level_record);
    return (1);
  }

  // initialize the index with the identity mapping
  for(int i=0; i<N; i++){
    index[i] = (unsigned int)i;
  }

  /* Generate a 3-dimensional data distribution */
  create_dataset(X, N, dist);

  /* Find the boundaries of the space */
  float max[DIM], min[DIM];
  find_max(max, X, N);
  find_min(min, X, N);

  int nbins = (1 << maxlev); // maximum number of boxes at the leaf level

  // Independent runs
  for(int it = 0; it<repeat; it++){

    gettimeofday (&startwtime, NULL);

    compute_hash_codes(hash_codes, X, N, nbins, min, max); // compute the hash codes

    gettimeofday (&endwtime, NULL);

    double hash_time = wall_seconds_between(&startwtime, &endwtime);

    printf("Time to compute the hash codes            : %fs\n", hash_time);


    gettimeofday (&startwtime, NULL);

    morton_encoding(morton_codes, hash_codes, N, maxlev); // computes the Morton codes of the particles

    gettimeofday (&endwtime, NULL);

    double morton_encoding_time = wall_seconds_between(&startwtime, &endwtime);

    printf("Time to compute the morton encoding       : %fs\n", morton_encoding_time);


    gettimeofday (&startwtime, NULL);

    // Truncated msd radix sort
    truncated_radix_sort(morton_codes, sorted_morton_codes,
                         permutation_vector,
                         index, level_record, N,
                         population_threshold, 3*(maxlev-1), 0);

    gettimeofday (&endwtime, NULL);

    double sort_time = wall_seconds_between(&startwtime, &endwtime);

    printf("Time for the truncated radix sort         : %fs\n", sort_time);

    gettimeofday (&startwtime, NULL);

    // Data rearrangement
    data_rearrangement(Y, X, permutation_vector, N);

    gettimeofday (&endwtime, NULL);

    double rearrange_time = wall_seconds_between(&startwtime, &endwtime);

    printf("Time to rearrange the particles in memory : %fs\n", rearrange_time);

    /* The following code is for verification */
    // Check if every point is assigned to one leaf of the tree
    int pass = check_index(permutation_vector, N);

    if(pass){
      printf("Index test PASS\n");
    }
    else{
      printf("Index test FAIL\n");
    }

    // Check if all particles that are in the same box have the same encoding.
    pass = check_codes(Y, sorted_morton_codes,
                       level_record, N, maxlev);

    if(pass){
      printf("Encoding test PASS\n");
    }
    else{
      printf("Encoding test FAIL\n");
    }

  }

  /* clear memory */
  free(X);
  free(Y);
  free(hash_codes);
  free(morton_codes);
  free(sorted_morton_codes);
  free(permutation_vector);
  free(index);
  free(level_record);

  return 0;
}