Exemplo n.º 1
0
int main (int argc, char **argv) {
    /////////////////////
    // Parse Arguments //
    /////////////////////
    params *pars = new params;
    init_pars(pars);
    parse_cmd_args(argc, argv, pars);
    if( pars->version ) {
        printf("ngsF v%s\nCompiled on %s @ %s", version, __DATE__, __TIME__);
#ifdef _USE_BGZF
        printf(" (BGZF library)\n");
#else
        printf(" (STD library)\n");
#endif

        exit(0);
    }
    if( pars->verbose >= 1 ) {
        printf("==> Input Arguments:\n");
        printf("\tglf file: %s\n\tinit_values: %s\n\tfreq_fixed: %s\n\tout file: %s\n\tn_ind: %d\n\tn_sites: %lu\n\tchunk_size: %lu\n\tfast_lkl: %s\n\tapprox_EM: %s\n\tcall_geno: %s\n\tmax_iters: %d\n\tmin_epsilon: %.10f\n\tn_threads: %d\n\tseed: %lu\n\tquick: %s\n\tversion: %s\n\tverbose: %d\n\n",
               pars->in_glf, pars->init_values, pars->freq_fixed ? "true":"false", pars->out_file, pars->n_ind, pars->n_sites, pars->max_chunk_size, pars->fast_lkl ? "true":"false", pars->approx_EM ? "true":"false", pars->call_geno ? "true":"false", pars->max_iters, pars->min_epsilon, pars->n_threads, pars->seed, pars->quick ? "true":"false", version, pars->verbose);
    }
    if( pars->verbose > 4 ) printf("==> Verbose values greater than 4 for debugging purpose only. Expect large amounts of info on screen\n");



    /////////////////////
    // Check Arguments //
    /////////////////////
    if(pars->in_glf == NULL)
        error(__FUNCTION__,"GL input file (-glf) missing!");
    else if( strcmp(pars->in_glf, "-") == 0 ) {
        pars->in_glf_type = new char[6];
        pars->in_glf_type = strcat(pars->in_glf_type, "STDIN");
    } else {
        pars->in_glf_type = strrchr(pars->in_glf, '.');
        if(pars->in_glf_type == NULL)
            error(__FUNCTION__,"invalid file type!");
    }
    if(pars->out_file == NULL)
        error(__FUNCTION__,"output file (-out) missing!");
    if(pars->n_ind == 0)
        error(__FUNCTION__,"number of individuals (-n_ind) missing!");
    if(pars->n_sites == 0)
        error(__FUNCTION__,"number of sites (-n_sites) missing!");



    ///////////////////////
    // Check input files //
    ///////////////////////
    // Get file total size
    struct stat st;
    stat(pars->in_glf, &st);
    if( strcmp(pars->in_glf_type, "STDIN") != 0 ) {
        if( pars->n_sites == st.st_size/sizeof(double)/pars->n_ind/3 && strcmp(pars->in_glf_type, ".glf") == 0 ) {
            if(pars->verbose >= 1)
                printf("==> UNCOMP input file (\"%s\"): number of sites (%lu) match expected file size\n", pars->in_glf_type, pars->n_sites);
        } else if( strcmp(pars->in_glf_type, ".glf") != 0 ) {
            if( pars->verbose >= 1)
                printf("==> COMPRESSED input file (\"%s\"): number of sites (%lu) do NOT match expected file size\n", pars->in_glf_type, pars->n_sites);
        } else
            error(__FUNCTION__,"wrong number of sites or invalid/corrupt file!");
    }


    // Adjust max_chunk_size in case of fewer sites
    if(pars->max_chunk_size > pars->n_sites) {
        if( pars->verbose >= 1 ) printf("==> Fewer sites (%lu) than chunk_size (%lu). Reducing chunk size to match number of sites\n", pars->n_sites, pars->max_chunk_size);
        pars->max_chunk_size = pars->n_sites;
    }
    // Calculate total number of chunks
    pars->n_chunks = ceil( (double) pars->n_sites/ (double) pars->max_chunk_size );
    if( pars->verbose >= 1 ) printf("==> Analysis will be run in %ld chunk(s)\n", pars->n_chunks);
    // Alocate memory for the chunk index
    pars->chunks_voffset = new int64_t[pars->n_chunks];
    memset(pars->chunks_voffset, 0, pars->n_chunks*sizeof(int64_t));
    // Adjust thread number to chunks
    if(pars->n_chunks < pars->n_threads) {
        if( pars->verbose >= 1 ) printf("==> Fewer chunks (%ld) than threads (%d). Reducing the number of threads to match number of chunks\n", pars->n_chunks, pars->n_threads);
        pars->n_threads = pars->n_chunks;
    }


    // Open input file
#ifdef _USE_BGZF
    if( pars->verbose >= 1 ) printf("==> Using BGZF I/O library\n");
    // Open BGZIP file
    if( strcmp(pars->in_glf_type, ".bgz") == 0 ) {
        if( (pars->in_glf_fh = bgzf_open(pars->in_glf, "rb")) < 0 )
            error(__FUNCTION__,"Cannot open BGZIP file!");
    } else
        error(__FUNCTION__,"BGZF library only supports BGZIP files!");

    bgzf_set_cache_size(pars->in_glf_fh, CACHE_SIZE * 1024uL * 1024uL * 1024uL);
#else

    if( pars->verbose >= 1 ) printf("==> Using native I/O library\n");
    // Open GLF file
    if( strcmp(pars->in_glf_type, "STDIN") == 0 )
        pars->in_glf_fh = stdin;
    else if( strcmp(pars->in_glf_type, ".glf") == 0 ) {
        if( (pars->in_glf_fh = fopen(pars->in_glf, "rb")) == NULL )
            error(__FUNCTION__,"Cannot open GLF file!");
    } else
        error(__FUNCTION__,"Standard library only supports UNCOMPRESSED GLF files!");

    // Allocate memory and read from the file
    pars->data = new double* [pars->n_sites];
    for(uint64_t s = 0; s < pars->n_sites; s++) {
        pars->data[s] = new double[pars->n_ind * 3];
        if( fread (pars->data[s], sizeof(double), pars->n_ind * 3, pars->in_glf_fh) != pars->n_ind * 3)
            error(__FUNCTION__,"cannot read GLF file!");
        if(pars->call_geno)
            call_geno(pars->data[s], pars->n_ind, 3);
    }
#endif
    if( pars->in_glf_fh == NULL )
        error(__FUNCTION__,"cannot open GLF file!");



    ///////////////////////////////////
    // Declare variables for results //
    ///////////////////////////////////
    out_data *output = new out_data;
    output->site_freq = new double[pars->n_sites];
    output->site_freq_num = new double[pars->n_sites];
    output->site_freq_den = new double[pars->n_sites];
    output->site_prob_var = new double[pars->n_sites];
    output->site_tmpprob_var = new double[pars->n_sites];
    output->indF = new double[pars->n_ind];
    output->indF_num = new double[pars->n_ind];
    output->indF_den = new double[pars->n_ind];
    output->ind_lkl = new double[pars->n_ind];
    // Initialize output
    init_output(pars, output);



    //////////////////
    // Analyze Data //
    //////////////////
    if( pars->verbose >= 1 && !pars->fast_lkl && strcmp("e", pars->init_values) != 0 ) {
        printf("==> Initial LogLkl: %.15f\n", full_HWE_like(pars, output->site_freq, output->indF, 0, pars->n_ind));
        fflush(stdout);
    }
    do_EM(pars, output);
    if( pars->verbose >= 1 ) printf("\nFinal logLkl: %f\n", output->global_lkl);



    //////////////////
    // Print Output //
    //////////////////
    FILE *out_file;
    if( pars->verbose >= 1 ) printf("Printing Output...\n");

    out_file = fopen(pars->out_file, "w");
    if(out_file == NULL)
        error(__FUNCTION__,"Cannot open OUTPUT file!");
    for(uint16_t i = 0; i < pars->n_ind; i++)
        fprintf(out_file,"%f\n", output->indF[i]);
    fclose(out_file);



    //////////////////////
    // Close Input File //
    //////////////////////
    if( pars->verbose >= 1 ) printf("Exiting...\n");
#ifdef _USE_BGZF
    bgzf_close(pars->in_glf_fh);
#else
    for(uint64_t s = 0; s < pars->n_sites; s++)
        delete [] pars->data[s];
    delete [] pars->data;
    fclose(pars->in_glf_fh);
#endif



    /////////////////
    // Free Memory //
    /////////////////
    delete [] output->site_freq;
    delete [] output->site_freq_num;
    delete [] output->site_freq_den;
    delete [] output->site_prob_var;
    delete [] output->indF;
    delete [] output->indF_num;
    delete [] output->indF_den;
    delete [] output->ind_lkl;
    delete output;
    //if( strcmp("e", pars->init_values) == 0 )
    //delete [] pars->init_values;
    delete [] pars->chunks_voffset;
    delete pars;

    return 0;
}
Exemplo n.º 2
0
Arquivo: EM.cpp Projeto: fgvieira/ngsF
/*
 * Run the EM iterations.
 *
 * Each iteration launches one detached thread per chunk (run_chunk), with the
 * number of simultaneously launched threads bounded by
 * pars->_launch_thread_semaph, waits for all of them, then updates the
 * per-individual values (output->indF) and per-site values
 * (output->site_freq), recomputes the likelihood and dumps the current
 * estimates to "<out_file>.pars".
 *
 * The loop continues while either epsilon is above pars->min_epsilon or
 * iter <= pars->min_iters, and stops once iter exceeds pars->max_iters,
 * SIG_COND becomes false, or pars->quick is set.
 *
 * Parameters:
 *   pars   - run parameters; its semaphores and mutex are initialized here.
 *   output - result arrays, updated in place.
 *
 * Returns 0 in all cases; failures are reported through error().
 */
int do_EM (params *pars, out_data *output) {
	SIG_COND = true;
	catch_SIG();

	// Iteration numbering starts at 0 when init_values is "e", otherwise at 1
	int iter = ( strcmp("e", pars->init_values) == 0 ? 0 : 1 );
	double lkl_epsilon = 0;
	double est_epsilon = 0;

	// Launch semaphore: number of thread slots available
	if( sem_init(&pars->_launch_thread_semaph, 0, pars->n_threads) != 0 )
		error(__FUNCTION__, "sem_init() failed!");
	// Running semaphore starts at 0. It is initialized exactly once:
	// re-initializing a semaphore is undefined, and a negative initial count
	// is not representable in sem_init()'s unsigned value argument.
	if( sem_init(&pars->_running_thread_semaph, 0, 0) != 0 )
		error(__FUNCTION__, "sem_init() failed!");
	if( pthread_mutex_init(&pars->_F_lock, NULL) != 0 )
		error(__FUNCTION__, "pthread_mutex_init() failed!");

	while( (est_epsilon > pars->min_epsilon || lkl_epsilon > pars->min_epsilon || iter <= pars->min_iters) && iter <= pars->max_iters && SIG_COND ) {
		// Print of initial Lkl
		if( iter == 1 && pars->verbose >= 1 ) {
			output->global_lkl = full_HWE_like(pars, output->site_freq, output->indF, 0, pars->n_ind);
			printf("==> Initial LogLkl: %.15f\n", output->global_lkl);
			fflush(stdout);
		}

		// Next Iteration...
		time_t iter_start = time(NULL);
		if( iter > 0 && pars->verbose >= 1 )
			printf("\nIteration %d:\n", iter);



		////////////////////////////////
		// Loop through all chunks... //
		////////////////////////////////
		for(uint64_t c = 0; c < pars->n_chunks; c++) {
			// Wait for room to launch more threads (retry if sem_wait fails, e.g. on a signal)
			while( sem_wait(&pars->_launch_thread_semaph) );

			if( pars->verbose >= 5 ) printf("\tChunk %lu of %lu\n", c+1, pars->n_chunks);

			// Declare structure
			// NOTE(review): pth_struct and chunk_data are not released in this
			// function; presumably run_chunk frees them — confirm.
			pth_params *pth_struct = new pth_params;
			// Reserve memory for chunk data
			pth_struct->chunk_data = new double* [pars->max_chunk_size];
#ifdef _USE_BGZF
			for(uint64_t s = 0; s < pars->max_chunk_size; s++)
				pth_struct->chunk_data[s] = new double[pars->n_ind * 3];
#endif
			// Fill in PThread structure
			pth_struct->pars = pars;
			pth_struct->chunk_size = read_chunk(pth_struct->chunk_data, pth_struct->pars, c);
			pth_struct->chunk_abs_start_pos = c * pars->max_chunk_size;
			pth_struct->iter = iter;
			pth_struct->output = output;

			// Initialize and set thread detached attribute
			pthread_t thread_id;
			pthread_attr_t pt_attr;
			pthread_attr_init(&pt_attr);
			pthread_attr_setdetachstate(&pt_attr, PTHREAD_CREATE_DETACHED);

			// Launch thread
			int rc = pthread_create(&thread_id, &pt_attr, run_chunk, (void*) pth_struct);
			// The attribute object is no longer needed once the thread is
			// created; destroy it so it is not leaked on every chunk.
			pthread_attr_destroy(&pt_attr);
			if(rc) error(__FUNCTION__,"pthread_create() failed!");

			if( pars->verbose >= 6 ) {
				int n_free_threads = 0;
				sem_getvalue(&pars->_launch_thread_semaph, &n_free_threads);
				printf("Thread launched! Available slots: %d\n", n_free_threads);
			}
			fflush(stdout);
		}



		////////////////////////////////////
		// Wait for all threads to finish //
		////////////////////////////////////
		// Done when every launch slot is free again
		int n_free_threads = 0;
		do {
			while( sem_wait(&pars->_running_thread_semaph) );
			sem_getvalue(&pars->_launch_thread_semaph, &n_free_threads);
			if( pars->verbose >= 6 ) printf("Waiting for all threads to finish: %d\n", pars->n_threads - n_free_threads);
		}while(n_free_threads < (int) pars->n_threads);



		est_epsilon = 0;
		/////////////////////////////////////
		// Indiv post-iteration processing //
		/////////////////////////////////////
		if( pars->verbose >= 2 ) printf("\tInd F:\t");
		for(uint16_t i = 0; i < pars->n_ind; i++) {
			// Get new indF and check for interval...
			double new_indF = check_interv(output->indF_num[i] / output->indF_den[i], false);
			// If LRT, do not estimate indF (it is fixed)
			if(pars->calc_LRT)
				new_indF = output->indF[i];
			// Accumulate squared change for the parameter epsilon
			est_epsilon += pow(new_indF - output->indF[i], 2);
			// Store new indF, keeping it strictly below 1
			new_indF = ( new_indF == 1 ? 0.9999 : new_indF );
			output->indF[i] = new_indF;

			// Reset accumulators for the next iteration
			output->indF_num[i] = 0;
			output->indF_den[i] = 0;

			// Debug
			if( pars->verbose >= 2 ) printf("\t%.9f", output->indF[i]);
		}
		if( pars->verbose >= 2 ) printf("\n");



		////////////////////////////////////
		// Site post-iteration processing //
		////////////////////////////////////
		if( pars->verbose >= 4 ) printf("\tFreq:\t");
		for(uint64_t s = 0; s < pars->n_sites; s++) {
			// Sites with frequency exactly 0 are skipped entirely
			if(output->site_freq[s] == 0) continue;
			if(!pars->freq_fixed){
				double new_site_freq = check_interv(output->site_freq_num[s] / output->site_freq_den[s], true);
				est_epsilon += pow(new_site_freq - output->site_freq[s], 2);
				// Stored frequency is capped at 0.99
				output->site_freq[s] = (new_site_freq > 0.99 ? 0.99 : new_site_freq);
			}

			// Reset accumulators for the next iteration
			output->site_freq_num[s] = 0;
			output->site_freq_den[s] = 0;
			output->site_prob_var[s] = output->site_tmpprob_var[s];
			output->site_tmpprob_var[s] = 0;

			// Debug
			if( pars->verbose >= 4 ) printf("\t%.9f", output->site_freq[s]);
		}
		if( pars->verbose >= 4 ) printf("\n");



		///////////////////
		// Calculate Lkl //
		///////////////////
		double new_global_lkl = 0;
		for(uint16_t i = 0; i < pars->n_ind; i++) {
			output->ind_lkl[i] = full_HWE_like(pars, output->site_freq, output->indF, i, 1);
			new_global_lkl += output->ind_lkl[i];
		}
		// Parameter epsilon
		est_epsilon = sqrt(est_epsilon)/(pars->n_ind + pars->n_sites);
		// Lkl epsilon: relative change against the previous global_lkl.
		// NOTE(review): when iter starts at 0, global_lkl has not been set in
		// this function before this division; confirm its initial value.
		lkl_epsilon = (new_global_lkl - output->global_lkl)/fabs(output->global_lkl);
		output->global_lkl = new_global_lkl;

		// Print iteration info
		if( iter > 0 && pars->verbose >= 1 ) {
			time_t iter_end = time(NULL);
			printf("\tLogLkl: %.15f\t epsilon: %.15f %.15f\ttime: %.0f (s)\n", output->global_lkl, lkl_epsilon, est_epsilon, difftime(iter_end, iter_start) );
		}
		iter++;
		fflush(stdout);



		///////////////////////////////
		// Dump iteration parameters //
		///////////////////////////////
		// File name is "<out_file>.pars": 5 extra characters plus the terminator
		size_t pars_file_len = strlen(pars->out_file) + 5 + 1;
		char* pars_file = (char*) malloc( pars_file_len*sizeof(char) );
		if(pars_file == NULL)
			error(__FUNCTION__, "Cannot allocate memory for PARS file name!");
		strcpy(pars_file, pars->out_file);
		strcat(pars_file, ".pars");
		// Write the last iteration to disk (overwritten on every iteration)
		FILE* last_est_pars = fopen(pars_file, "w");
		if(last_est_pars == NULL)
			error(__FUNCTION__, "Cannot open PARS file!");
		// Layout: global_lkl, ind_lkl[n_ind], indF[n_ind], site_freq[n_sites]
		if( fwrite(&output->global_lkl, sizeof(double), 1, last_est_pars) != 1 ||
		    fwrite(output->ind_lkl, sizeof(double), pars->n_ind, last_est_pars) != (size_t) pars->n_ind ||
		    fwrite(output->indF, sizeof(double), pars->n_ind, last_est_pars) != (size_t) pars->n_ind ||
		    fwrite(output->site_freq, sizeof(double), pars->n_sites, last_est_pars) != (size_t) pars->n_sites )
			error(__FUNCTION__, "Cannot write PARS file!");
		if( fclose(last_est_pars) != 0 )
			error(__FUNCTION__, "Cannot write PARS file!");
		free(pars_file);



		///////////////
		// For debug //
		///////////////
		if( pars->quick ) break;
	}


	if( iter > pars->max_iters )
		printf("WARN: Maximum number of iterations reached! Check if analysis converged... \n");

	return 0;
}