int main (int argc, char **argv) { ///////////////////// // Parse Arguments // ///////////////////// params *pars = new params; init_pars(pars); parse_cmd_args(argc, argv, pars); if( pars->version ) { printf("ngsF v%s\nCompiled on %s @ %s", version, __DATE__, __TIME__); #ifdef _USE_BGZF printf(" (BGZF library)\n"); #else printf(" (STD library)\n"); #endif exit(0); } if( pars->verbose >= 1 ) { printf("==> Input Arguments:\n"); printf("\tglf file: %s\n\tinit_values: %s\n\tfreq_fixed: %s\n\tout file: %s\n\tn_ind: %d\n\tn_sites: %lu\n\tchunk_size: %lu\n\tfast_lkl: %s\n\tapprox_EM: %s\n\tcall_geno: %s\n\tmax_iters: %d\n\tmin_epsilon: %.10f\n\tn_threads: %d\n\tseed: %lu\n\tquick: %s\n\tversion: %s\n\tverbose: %d\n\n", pars->in_glf, pars->init_values, pars->freq_fixed ? "true":"false", pars->out_file, pars->n_ind, pars->n_sites, pars->max_chunk_size, pars->fast_lkl ? "true":"false", pars->approx_EM ? "true":"false", pars->call_geno ? "true":"false", pars->max_iters, pars->min_epsilon, pars->n_threads, pars->seed, pars->quick ? "true":"false", version, pars->verbose); } if( pars->verbose > 4 ) printf("==> Verbose values greater than 4 for debugging purpose only. Expect large amounts of info on screen\n"); ///////////////////// // Check Arguments // ///////////////////// if(pars->in_glf == NULL) error(__FUNCTION__,"GL input file (-glf) missing!"); else if( strcmp(pars->in_glf, "-") == 0 ) { pars->in_glf_type = new char[6]; pars->in_glf_type = strcat(pars->in_glf_type, "STDIN"); } else { pars->in_glf_type = strrchr(pars->in_glf, '.'); if(pars->in_glf_type == NULL) error(__FUNCTION__,"invalid file type!"); } if(pars->out_file == NULL) error(__FUNCTION__,"output file (-out) missing!"); if(pars->n_ind == 0) error(__FUNCTION__,"number of individuals (-n_ind) missing!"); if(pars->n_sites == 0) error(__FUNCTION__,"number of sites (-n_sites) missing!"); /////////////////////// // Check input files // /////////////////////// // Get file total size struct stat st; stat(pars->in_glf, &st); if( strcmp(pars->in_glf_type, "STDIN") != 0 ) { if( pars->n_sites == st.st_size/sizeof(double)/pars->n_ind/3 && strcmp(pars->in_glf_type, ".glf") == 0 ) { if(pars->verbose >= 1) printf("==> UNCOMP input file (\"%s\"): number of sites (%lu) match expected file size\n", pars->in_glf_type, pars->n_sites); } else if( strcmp(pars->in_glf_type, ".glf") != 0 ) { if( pars->verbose >= 1) printf("==> COMPRESSED input file (\"%s\"): number of sites (%lu) do NOT match expected file size\n", pars->in_glf_type, pars->n_sites); } else error(__FUNCTION__,"wrong number of sites or invalid/corrupt file!"); } // Adjust max_chunk_size in case of fewer sites if(pars->max_chunk_size > pars->n_sites) { if( pars->verbose >= 1 ) printf("==> Fewer sites (%lu) than chunk_size (%lu). Reducing chunk size to match number of sites\n", pars->n_sites, pars->max_chunk_size); pars->max_chunk_size = pars->n_sites; } // Calculate total number of chunks pars->n_chunks = ceil( (double) pars->n_sites/ (double) pars->max_chunk_size ); if( pars->verbose >= 1 ) printf("==> Analysis will be run in %ld chunk(s)\n", pars->n_chunks); // Alocate memory for the chunk index pars->chunks_voffset = new int64_t[pars->n_chunks]; memset(pars->chunks_voffset, 0, pars->n_chunks*sizeof(int64_t)); // Adjust thread number to chunks if(pars->n_chunks < pars->n_threads) { if( pars->verbose >= 1 ) printf("==> Fewer chunks (%ld) than threads (%d). Reducing the number of threads to match number of chunks\n", pars->n_chunks, pars->n_threads); pars->n_threads = pars->n_chunks; } // Open input file #ifdef _USE_BGZF if( pars->verbose >= 1 ) printf("==> Using BGZF I/O library\n"); // Open BGZIP file if( strcmp(pars->in_glf_type, ".bgz") == 0 ) { if( (pars->in_glf_fh = bgzf_open(pars->in_glf, "rb")) < 0 ) error(__FUNCTION__,"Cannot open BGZIP file!"); } else error(__FUNCTION__,"BGZF library only supports BGZIP files!"); bgzf_set_cache_size(pars->in_glf_fh, CACHE_SIZE * 1024uL * 1024uL * 1024uL); #else if( pars->verbose >= 1 ) printf("==> Using native I/O library\n"); // Open GLF file if( strcmp(pars->in_glf_type, "STDIN") == 0 ) pars->in_glf_fh = stdin; else if( strcmp(pars->in_glf_type, ".glf") == 0 ) { if( (pars->in_glf_fh = fopen(pars->in_glf, "rb")) == NULL ) error(__FUNCTION__,"Cannot open GLF file!"); } else error(__FUNCTION__,"Standard library only supports UNCOMPRESSED GLF files!"); // Allocate memory and read from the file pars->data = new double* [pars->n_sites]; for(uint64_t s = 0; s < pars->n_sites; s++) { pars->data[s] = new double[pars->n_ind * 3]; if( fread (pars->data[s], sizeof(double), pars->n_ind * 3, pars->in_glf_fh) != pars->n_ind * 3) error(__FUNCTION__,"cannot read GLF file!"); if(pars->call_geno) call_geno(pars->data[s], pars->n_ind, 3); } #endif if( pars->in_glf_fh == NULL ) error(__FUNCTION__,"cannot open GLF file!"); /////////////////////////////////// // Declare variables for results // /////////////////////////////////// out_data *output = new out_data; output->site_freq = new double[pars->n_sites]; output->site_freq_num = new double[pars->n_sites]; output->site_freq_den = new double[pars->n_sites]; output->site_prob_var = new double[pars->n_sites]; output->site_tmpprob_var = new double[pars->n_sites]; output->indF = new double[pars->n_ind]; output->indF_num = new double[pars->n_ind]; output->indF_den = new double[pars->n_ind]; output->ind_lkl = new double[pars->n_ind]; // Initialize output init_output(pars, output); ////////////////// // Analyze Data // ////////////////// if( pars->verbose >= 1 && !pars->fast_lkl && strcmp("e", pars->init_values) != 0 ) { printf("==> Initial LogLkl: %.15f\n", full_HWE_like(pars, output->site_freq, output->indF, 0, pars->n_ind)); fflush(stdout); } do_EM(pars, output); if( pars->verbose >= 1 ) printf("\nFinal logLkl: %f\n", output->global_lkl); ////////////////// // Print Output // ////////////////// FILE *out_file; if( pars->verbose >= 1 ) printf("Printing Output...\n"); out_file = fopen(pars->out_file, "w"); if(out_file == NULL) error(__FUNCTION__,"Cannot open OUTPUT file!"); for(uint16_t i = 0; i < pars->n_ind; i++) fprintf(out_file,"%f\n", output->indF[i]); fclose(out_file); ////////////////////// // Close Input File // ////////////////////// if( pars->verbose >= 1 ) printf("Exiting...\n"); #ifdef _USE_BGZF bgzf_close(pars->in_glf_fh); #else for(uint64_t s = 0; s < pars->n_sites; s++) delete [] pars->data[s]; delete [] pars->data; fclose(pars->in_glf_fh); #endif ///////////////// // Free Memory // ///////////////// delete [] output->site_freq; delete [] output->site_freq_num; delete [] output->site_freq_den; delete [] output->site_prob_var; delete [] output->indF; delete [] output->indF_num; delete [] output->indF_den; delete [] output->ind_lkl; delete output; //if( strcmp("e", pars->init_values) == 0 ) //delete [] pars->init_values; delete [] pars->chunks_voffset; delete pars; return 0; }
int do_EM (params *pars, out_data *output) { SIG_COND = true; catch_SIG(); int iter = ( strcmp("e", pars->init_values) == 0 ? 0 : 1 ); double lkl_epsilon = 0; double est_epsilon = 0; sem_init(&pars->_launch_thread_semaph, 0, pars->n_threads); sem_init(&pars->_running_thread_semaph, 0, 0); // To avoid warnings from Valgrind sem_init(&pars->_running_thread_semaph, 0, -pars->n_chunks); pthread_mutex_init(&pars->_F_lock, NULL); while( (est_epsilon > pars->min_epsilon || lkl_epsilon > pars->min_epsilon || iter <= pars->min_iters) && iter <= pars->max_iters && SIG_COND ) { // Print of initial Lkl if( iter == 1 && pars->verbose >= 1 ) { output->global_lkl = full_HWE_like(pars, output->site_freq, output->indF, 0, pars->n_ind); printf("==> Initial LogLkl: %.15f\n", output->global_lkl); fflush(stdout); } // Next Iteration... time_t iter_start = time(NULL); if( iter > 0 && pars->verbose >= 1 ) printf("\nIteration %d:\n", iter); //////////////////////////////// // Loop through all chunks... // //////////////////////////////// for(uint64_t c = 0; c < pars->n_chunks; c++) { // Wait for room to launch more threads while( sem_wait(&pars->_launch_thread_semaph) ); if( pars->verbose >= 5 ) printf("\tChunk %lu of %lu\n", c+1, pars->n_chunks); // Declare structure pth_params *pth_struct = new pth_params; // Reserve memory for chunk data pth_struct->chunk_data = new double* [pars->max_chunk_size]; #ifdef _USE_BGZF for(uint64_t s = 0; s < pars->max_chunk_size; s++) pth_struct->chunk_data[s] = new double[pars->n_ind * 3]; #endif // Fill in PThread structure pth_struct->pars = pars; pth_struct->chunk_size = read_chunk(pth_struct->chunk_data, pth_struct->pars, c); pth_struct->chunk_abs_start_pos = c * pars->max_chunk_size; pth_struct->iter = iter; pth_struct->output = output; // Initialize and set thread detached attribute pthread_t thread_id; pthread_attr_t pt_attr; pthread_attr_init(&pt_attr); pthread_attr_setdetachstate(&pt_attr, PTHREAD_CREATE_DETACHED); // Launch thread int rc = pthread_create(&thread_id, &pt_attr, run_chunk, (void*) pth_struct); if(rc) error(__FUNCTION__,"pthread_create() failed!"); if( pars->verbose >= 6 ) { int n_free_threads = 0; sem_getvalue(&pars->_launch_thread_semaph, &n_free_threads); printf("Thread launched! Available slots: %d\n", n_free_threads); } fflush(stdout); } //////////////////////////////////// // Wait for all threads to finish // //////////////////////////////////// int n_free_threads = 0; do { while( sem_wait(&pars->_running_thread_semaph) ); sem_getvalue(&pars->_launch_thread_semaph, &n_free_threads); if( pars->verbose >= 6 ) printf("Waiting for all threads to finish: %d\n", pars->n_threads - n_free_threads); }while(n_free_threads < (int) pars->n_threads); est_epsilon = 0; ///////////////////////////////////// // Indiv post-iteration processing // ///////////////////////////////////// if( pars->verbose >= 2 ) printf("\tInd F:\t"); for(uint16_t i = 0; i < pars->n_ind; i++) { // Get new indF and check for interval... double new_indF = check_interv(output->indF_num[i] / output->indF_den[i], false); // If LRT, do not estimate indF (it is fixed) if(pars->calc_LRT) new_indF = output->indF[i]; // Calculate iter epsilon est_epsilon += pow(new_indF - output->indF[i], 2); // Store new indF new_indF = ( new_indF == 1 ? 0.9999 : new_indF ); output->indF[i] = new_indF; // Reset variables... output->indF_num[i] = 0; output->indF_den[i] = 0; // Debug if( pars->verbose >= 2 ) printf("\t%.9f", output->indF[i]); } if( pars->verbose >= 2 ) printf("\n"); //////////////////////////////////// // Site post-iteration processing // //////////////////////////////////// if( pars->verbose >= 4 ) printf("\tFreq:\t"); for(uint64_t s = 0; s < pars->n_sites; s++) { if(output->site_freq[s] == 0) continue; if(!pars->freq_fixed){ double new_site_freq = check_interv(output->site_freq_num[s] / output->site_freq_den[s], true); est_epsilon += pow(new_site_freq - output->site_freq[s], 2); output->site_freq[s] = (new_site_freq > 0.99 ? 0.99 : new_site_freq); } // Reset variables... output->site_freq_num[s] = 0; output->site_freq_den[s] = 0; output->site_prob_var[s] = output->site_tmpprob_var[s]; output->site_tmpprob_var[s] = 0; // Debug if( pars->verbose >= 4 ) printf("\t%.9f", output->site_freq[s]); } if( pars->verbose >= 4 ) printf("\n"); /////////////////// // Calculate Lkl // /////////////////// double new_global_lkl = 0; for(uint16_t i = 0; i < pars->n_ind; i++) { output->ind_lkl[i] = full_HWE_like(pars, output->site_freq, output->indF, i, 1); new_global_lkl += output->ind_lkl[i]; } // Parameter epsilon est_epsilon = sqrt(est_epsilon)/(pars->n_ind + pars->n_sites); // Lkl epsilon calculation - On first iteration, since there is no global_lkl, calculate Lkl epsilon based on current lkl lkl_epsilon = (new_global_lkl - output->global_lkl)/fabs(output->global_lkl); output->global_lkl = new_global_lkl; // Print iteration info if( iter > 0 && pars->verbose >= 1 ) { time_t iter_end = time(NULL); printf("\tLogLkl: %.15f\t epsilon: %.15f %.15f\ttime: %.0f (s)\n", output->global_lkl, lkl_epsilon, est_epsilon, difftime(iter_end, iter_start) ); } iter++; fflush(stdout); /////////////////////////////// // Dump iteration parameters // /////////////////////////////// char* pars_file = (char*) malloc( (strlen(pars->out_file)+5+1)*sizeof(char) ); memset(pars_file, '\0', (strlen(pars->out_file)+5+1)*sizeof(char)); strcat(pars_file, pars->out_file); strcat(pars_file, ".pars"); // Write the last iteration to disk FILE* last_est_pars = fopen(pars_file, "w"); if(last_est_pars == NULL) error(__FUNCTION__, "Cannot open PARS file!"); fwrite(&output->global_lkl, sizeof(double), 1, last_est_pars); fwrite(output->ind_lkl, sizeof(double), pars->n_ind, last_est_pars); fwrite(output->indF, sizeof(double), pars->n_ind, last_est_pars); fwrite(output->site_freq, sizeof(double), pars->n_sites, last_est_pars); fclose(last_est_pars); free(pars_file); /////////////// // For debug // /////////////// if( pars->quick ) break; } if( iter > pars->max_iters ) printf("WARN: Maximum number of iterations reached! Check if analysis converged... \n"); return 0; }