void *mmap_file(size_t *len, const char *filename) {
    int fd = open(filename, O_RDONLY);
    if (fd < 0) {
        LOG_FATAL_F("Error opening file: %s\n", filename);
    }

    struct stat st[1];
    if (fstat(fd, st)) {
        LOG_FATAL_F("Error while getting file information: %s\n", filename);
    }
    *len = (size_t) st->st_size;

    if (!*len) {
        close(fd);
        return NULL;
    }

    void *map = mmap(NULL, *len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (MAP_FAILED == map) {
        LOG_FATAL_F("mmap failed for %s\n", filename);
    }
    close(fd);
    return map;
}
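/*
 * A minimal usage sketch for mmap_file (assuming LOG_FATAL_F aborts on error,
 * so a NULL return only signals an empty file). The caller owns the mapping
 * and must release it with munmap using the length reported through *len.
 * example_mmap_usage is a hypothetical caller, not part of the library.
 */
#include <sys/mman.h>

void example_mmap_usage(const char *path) {
    size_t len = 0;
    char *data = mmap_file(&len, path);
    if (data) {
        // Read-only access: the region was mapped with PROT_READ
        size_t newlines = 0;
        for (size_t i = 0; i < len; i++) {
            if (data[i] == '\n') { newlines++; }
        }
        munmap(data, len);
    }
}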
static void prepare_region_table_statements(region_table_t *table) {
    sqlite3 *db = table->storage;

    // Insert regions
    char *sql_insert = "INSERT INTO regions VALUES (?1, ?2, ?3, ?4, ?5)";
    if (sqlite3_prepare_v2(table->storage, sql_insert, strlen(sql_insert),
                           &(table->insert_region_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    // Find exact regions
    char *sql_find_exact = "SELECT COUNT(*) FROM regions WHERE chromosome = ?1 AND start = ?2 AND end = ?3";
    if (sqlite3_prepare_v2(table->storage, sql_find_exact, strlen(sql_find_exact),
                           &(table->find_exact_region_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    char *sql_find_exact_type = "SELECT COUNT(*) FROM regions WHERE chromosome = ?1 AND start = ?2 AND end = ?3 AND type = ?4";
    if (sqlite3_prepare_v2(table->storage, sql_find_exact_type, strlen(sql_find_exact_type),
                           &(table->find_exact_region_type_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    // Find regions
    char *sql_find = "SELECT COUNT(*) FROM regions WHERE chromosome = ?1 AND start <= ?3 AND end >= ?2";
    if (sqlite3_prepare_v2(table->storage, sql_find, strlen(sql_find),
                           &(table->find_region_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    char *sql_find_type = "SELECT COUNT(*) FROM regions WHERE chromosome = ?1 AND start <= ?3 AND end >= ?2 AND type = ?4";
    if (sqlite3_prepare_v2(table->storage, sql_find_type, strlen(sql_find_type),
                           &(table->find_region_type_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    // Remove regions
    char *sql_remove_exact = "DELETE FROM regions WHERE chromosome = ?1 AND start = ?2 AND end = ?3";
    if (sqlite3_prepare_v2(table->storage, sql_remove_exact, strlen(sql_remove_exact),
                           &(table->remove_exact_region_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    char *sql_remove = "DELETE FROM regions WHERE chromosome = ?1 AND start <= ?3 AND end >= ?2";
    if (sqlite3_prepare_v2(table->storage, sql_remove, strlen(sql_remove),
                           &(table->remove_region_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    // Query chromosomes
    char *sql_get_chromosome = "SELECT * FROM regions WHERE chromosome = ?1";
    if (sqlite3_prepare_v2(table->storage, sql_get_chromosome, strlen(sql_get_chromosome),
                           &(table->get_chromosome_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }

    char *sql_count_in_chromosome = "SELECT COUNT(*) FROM regions WHERE chromosome = ?1";
    if (sqlite3_prepare_v2(table->storage, sql_count_in_chromosome, strlen(sql_count_in_chromosome),
                           &(table->count_in_chromosome_stmt), NULL) != SQLITE_OK) {
        LOG_FATAL_F("Could not prepare statement for regions database: %s (%d)\n", sqlite3_errmsg(db), sqlite3_errcode(db));
    }
}
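/*
 * A hedged sketch of how the prepared insert statement above might be reused.
 * The column order (chromosome, start, end, type, plus a fifth column that
 * this snippet does not reveal) is an assumption inferred from the SELECT
 * queries; insert_region_sketch itself is hypothetical, not library code.
 */
static int insert_region_sketch(region_table_t *table, const char *chromosome,
                                sqlite3_int64 start, sqlite3_int64 end, const char *type) {
    sqlite3_stmt *stmt = table->insert_region_stmt;
    sqlite3_bind_text(stmt, 1, chromosome, -1, SQLITE_STATIC);
    sqlite3_bind_int64(stmt, 2, start);
    sqlite3_bind_int64(stmt, 3, end);
    sqlite3_bind_text(stmt, 4, type, -1, SQLITE_STATIC);
    sqlite3_bind_null(stmt, 5);   // fifth column not identifiable from this snippet
    int rc = sqlite3_step(stmt);  // SQLITE_DONE signals a completed INSERT
    sqlite3_reset(stmt);          // make the statement reusable
    sqlite3_clear_bindings(stmt);
    return (rc == SQLITE_DONE) ? 0 : rc;
}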
fastq_gzfile_t *fastq_gzopen(char *filename) {
    FILE *fd = fopen(filename, "r");
    if (fd == NULL) {
        LOG_FATAL_F("Error opening file: %s\n", filename);
        return NULL;
    }

    fastq_gzfile_t *fq_gzfile = (fastq_gzfile_t*) malloc(sizeof(fastq_gzfile_t));
    fq_gzfile->filename = filename;
    fq_gzfile->fd = fd;
    fq_gzfile->strm.zalloc = Z_NULL;
    fq_gzfile->strm.zfree = Z_NULL;
    fq_gzfile->strm.opaque = Z_NULL;
    fq_gzfile->strm.avail_in = 0;
    fq_gzfile->strm.next_in = Z_NULL;
    // windowBits = 15 + 32 makes inflateInit2 auto-detect zlib or GZIP headers
    fq_gzfile->ret = inflateInit2(&fq_gzfile->strm, 15 + 32);
    fq_gzfile->data = NULL;
    fq_gzfile->data_size = 0;
    return fq_gzfile;
}
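/*
 * A minimal sketch of opening a gzipped FASTQ and verifying that zlib was
 * initialized correctly: fastq_gzopen stores the inflateInit2 return code in
 * fq_gzfile->ret but does not check it itself. fastq_gzclose is assumed to be
 * the matching cleanup routine of this API (calling inflateEnd and fclose).
 */
void example_gzopen(char *path) {
    fastq_gzfile_t *fq = fastq_gzopen(path);
    if (fq->ret != Z_OK) {
        LOG_FATAL_F("Could not initialize zlib stream for %s (code %d)\n", path, fq->ret);
    }
    // ... read batches of reads here ...
    fastq_gzclose(fq);  // assumed counterpart of fastq_gzopen
}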
filter_t *region_exact_filter_new(char *region_descriptor, int use_region_file, char *type,
                                  const char *url, const char *species, const char *version) {
    assert(region_descriptor);
    assert(url);
    assert(species);
    assert(version);

    filter_t *filter = (filter_t*) malloc(sizeof(filter_t));
    filter->type = REGION;
    filter->filter_func = region_filter;
    filter->free_func = region_filter_free;
    filter->priority = 2;

    region_filter_args *filter_args = (region_filter_args*) malloc(sizeof(region_filter_args));
    if (use_region_file) {
        snprintf(filter->name, 11, "RegionFile");
        snprintf(filter->description, 64, "Regions read from '%s'", region_descriptor);
        if (ends_with(region_descriptor, ".gff")) {
            filter_args->regions = parse_regions_from_gff_file(region_descriptor, url, species, version);
        } else if (ends_with(region_descriptor, ".bed")) {
            filter_args->regions = parse_regions_from_bed_file(region_descriptor, url, species, version);
        } else {
            LOG_FATAL_F("Region file %s format not supported! Please use BED or GFF formats\n", region_descriptor);
        }
    } else {
        snprintf(filter->name, 11, "RegionList");
        snprintf(filter->description, 64, "Regions (could be more) %s", region_descriptor);
        filter_args->regions = parse_regions(region_descriptor, 1, url, species, version);
    }
    filter_args->type = type;
    filter->args = filter_args;
    return filter;
}
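/*
 * A brief usage sketch for the constructor above, mirroring the teardown
 * pattern used elsewhere in this code (filter->free_func(filter)). The URL,
 * species and version values are placeholders, and example_region_filter
 * is hypothetical, not part of the library.
 */
void example_region_filter(void) {
    filter_t *filter = region_exact_filter_new("regions.bed", 1, NULL,
                                               "example-ws-url", "hsapiens", "latest");
    // ... apply the filter through the usual filter chain machinery ...
    filter->free_func(filter);
}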
/**
 * Add an additional context to execute in the framework.
 */
int bfwork_add_context(bam_fwork_t *fwork, bfwork_context_t *context, uint8_t flags) {
    assert(fwork);
    assert(context);
    assert(flags != 0);

    // Which kind of execution queue does this context use?
    if (flags & FWORK_CONTEXT_PARALLEL) {
        // Not supported yet
        LOG_WARN("FWORK_CONTEXT_PARALLEL is not supported yet, changing to FWORK_CONTEXT_SEQUENTIAL\n");
        flags = FWORK_CONTEXT_SEQUENTIAL;
    }

    if (flags & FWORK_CONTEXT_SEQUENTIAL) {
        // Add to sequential execution queue
        fwork->v_context[fwork->v_context_l] = context;
        fwork->v_context_l++;
    } else {
        LOG_FATAL_F("Trying to add a context with unknown flags: %x\n", flags);
    }

    return NO_ERROR;
}
region_table_t *parse_regions_from_gff_file(char *filename, const char *url, const char *species, const char *version) {
    gff_file_t *file = gff_open(filename);
    if (file == NULL) {
        return NULL;
    }

    region_table_t *regions_table = create_table(url, species, version);

    int ret_code = 0;
    size_t max_batches = 20;
    size_t batch_size = 2000;
    list_t *read_list = (list_t*) malloc(sizeof(list_t));
    list_init("batches", 1, max_batches, read_list);

    #pragma omp parallel sections
    {
        // The producer reads the GFF file
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num());
            ret_code = gff_read_batches(read_list, batch_size, file);
            list_decr_writers(read_list);
            if (ret_code) {
                LOG_FATAL_F("Error while reading GFF file %s (%d)\n", filename, ret_code);
            }
        }

        // The consumer inserts regions in the structure
        #pragma omp section
        {
            list_item_t *item = NULL, *batch_item = NULL;
            gff_batch_t *batch;
            gff_record_t *record;
            while ((item = list_remove_item(read_list)) != NULL) {
                batch = item->data_p;
                // For each record in the batch, generate a new region
                for (batch_item = batch->first_p; batch_item != NULL; batch_item = batch_item->next_p) {
                    record = batch_item->data_p;

                    region_t *region = (region_t*) malloc(sizeof(region_t));
                    region->chromosome = (char*) calloc(strlen(record->sequence) + 1, sizeof(char));
                    strncat(region->chromosome, record->sequence, strlen(record->sequence));
                    region->start_position = record->start;
                    region->end_position = record->end;
                    LOG_DEBUG_F("region '%s:%u-%u'\n", region->chromosome, region->start_position, region->end_position);

                    insert_region(region, regions_table);
                }
                gff_batch_free(item->data_p);
                list_item_free(item);
            }
        }
    }

    gff_close(file, 0);
    return regions_table;
}
void write_mapped_read(array_list_t *array_list, bam_file_t *bam_file) {
    size_t num_items = array_list_size(array_list);
    alignment_t *alig;
    bam1_t *bam1;
    for (size_t j = 0; j < num_items; j++) {
        alig = (alignment_t *) array_list_get(j, array_list);
        LOG_DEBUG("writing bam...\n");
        if (alig != NULL) {
            bam1 = convert_to_bam(alig, 33);
            bam_fwrite(bam1, bam_file);
            bam_destroy1(bam1);
            alignment_free(alig);
        } else {
            LOG_FATAL_F("alig is NULL, num_items = %lu\n", num_items);
        }
    }
    if (array_list) {
        array_list_free(array_list, NULL);
    }
}
cp_hashtable* associate_samples_and_positions(vcf_file_t* file) {
    LOG_DEBUG_F("** %zu sample names read\n", file->samples_names->size);
    array_list_t *sample_names = file->samples_names;
    // Note: cp_hash_string is case-sensitive while strcasecmp is not, so
    // duplicate detection assumes sample names use consistent casing
    cp_hashtable *sample_ids = cp_hashtable_create(sample_names->size * 2,
                                                   cp_hash_string,
                                                   (cp_compare_fn) strcasecmp);

    int *index;
    char *name;
    for (size_t i = 0; i < sample_names->size; i++) {
        name = sample_names->items[i];
        index = (int*) malloc(sizeof(int));
        *index = i;

        if (cp_hashtable_get(sample_ids, name)) {
            LOG_FATAL_F("Sample %s appears more than once. File cannot be analyzed.\n", name);
        }
        cp_hashtable_put(sample_ids, name, index);
    }

    return sample_ids;
}
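/*
 * A small sketch of consuming the table returned above: look up the column
 * index of a sample by name. The cast mirrors how the indices were stored;
 * cp_hashtable_destroy is assumed to be the matching cprops cleanup call.
 */
void example_sample_lookup(vcf_file_t *file, const char *sample_name) {
    cp_hashtable *sample_ids = associate_samples_and_positions(file);
    int *position = (int*) cp_hashtable_get(sample_ids, (char*) sample_name);
    if (position) {
        LOG_DEBUG_F("Sample %s is at column %d\n", sample_name, *position);
    }
    cp_hashtable_destroy(sample_ids);
}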
fastq_file_t *fastq_fopen_mode(char *filename, char *mode) {
    FILE *fd = fopen(filename, mode);
    if (fd == NULL) {
        LOG_FATAL_F("Error opening file: %s, mode (%s)\n", filename, mode);
        exit(-1);
    }

    fastq_file_t *fq_file = (fastq_file_t*) malloc(sizeof(fastq_file_t));
    fq_file->filename = filename;
    fq_file->mode = mode;
    fq_file->fd = fd;
    return fq_file;
}
int create_vcf_query_fields(sqlite3 *db) {
    // Create record_query_fields table for VCF files
    int rc;
    char *error_msg;
    char sql[1000];
    sprintf(sql, "CREATE TABLE record_query_fields ("
                 "chromosome TEXT, position INT64, allele_ref TEXT, "
                 "allele_maf TEXT, genotype_maf TEXT, "
                 "allele_maf_freq DOUBLE, genotype_maf_freq DOUBLE, "
                 "miss_allele INT, miss_gt INT, "
                 "mendel_err INT, is_indel INT, "
                 "cases_percent_dominant DOUBLE, "
                 "controls_percent_dominant DOUBLE, "
                 "cases_percent_recessive DOUBLE, "
                 "controls_percent_recessive DOUBLE)");

    if ((rc = sqlite3_exec(db, sql, NULL, NULL, &error_msg)) != SQLITE_OK) {
        LOG_FATAL_F("Stats database failed: %s\n", error_msg);
    }
    return 0;
}
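/*
 * A hedged sketch of the sqlite3_exec error-handling pattern used above, for
 * code paths that recover instead of aborting: the message buffer handed back
 * through the fifth argument must be released with sqlite3_free. The function
 * name exec_sql_checked is hypothetical.
 */
int exec_sql_checked(sqlite3 *db, const char *sql) {
    char *error_msg = NULL;
    int rc = sqlite3_exec(db, sql, NULL, NULL, &error_msg);
    if (rc != SQLITE_OK) {
        LOG_ERROR_F("SQL failed (%d): %s\n", rc, error_msg ? error_msg : "unknown error");
        sqlite3_free(error_msg);  // sqlite3_exec allocates the message with sqlite3_malloc
    }
    return rc;
}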
int create_stats_db(const char *db_name, int chunksize, int (*create_custom_fields)(sqlite3 *), sqlite3 **db) {
    // Create sqlite DB
    if (sqlite3_open(db_name, db)) {
        LOG_FATAL_F("Could not open stats database (%s): %s\n", db_name, sqlite3_errmsg(*db));
    }

    int rc;
    char sql[128];
    sprintf(sql, "BEGIN TRANSACTION");
    rc = exec_sql(sql, *db);

    // Create global stats table and index, and insert the chunksize
    sprintf(sql, "CREATE TABLE global_stats (name TEXT PRIMARY KEY, title TEXT, value TEXT)");
    rc = exec_sql(sql, *db);
    sprintf(sql, "%i", chunksize);
    rc = insert_global_stats("CHUNK_SIZE", "Chunk size", sql, *db);
    rc = insert_global_stats("CHR_PREFIX", "Chromosome prefix", "", *db);

    // Create chunks table
    sprintf(sql, "CREATE TABLE chunk (chromosome TEXT, chunk_id INT, start INT, end INT, features_count INT)");
    rc = exec_sql(sql, *db);

    // Create record_query_fields table for BAM, VCF... files
    if (create_custom_fields) {
        rc = create_custom_fields(*db);
    }

    sprintf(sql, "END TRANSACTION");
    rc = exec_sql(sql, *db);
    return rc;
}
region_table_t *parse_regions_from_gff_file(char *filename, const char *url, const char *species, const char *version) {
    gff_file_t *file = gff_open(filename);
    if (file == NULL) {
        return NULL;
    }

    region_table_t *regions_table = new_region_table_from_ws(url, species, version);

    int ret_code = 0;
    size_t max_batches = 20, batch_size = 2000;
    list_t *read_list = (list_t*) malloc(sizeof(list_t));
    list_init("batches", 1, max_batches, read_list);

    #pragma omp parallel sections
    {
        // The producer reads the GFF file
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num());
            ret_code = gff_read_batches(read_list, batch_size, file);
            list_decr_writers(read_list);
            if (ret_code) {
                LOG_FATAL_F("Error while reading GFF file %s (%d)\n", filename, ret_code);
            }
        }

        // The consumer inserts regions in the structure
        #pragma omp section
        {
            list_item_t *item = NULL;
            gff_batch_t *batch;
            gff_record_t *record;
            region_t *regions_batch[REGIONS_CHUNKSIZE];
            int avail_regions = 0;
            while ((item = list_remove_item(read_list)) != NULL) {
                batch = item->data_p;
                // For each record in the batch, generate a new region
                for (int i = 0; i < batch->records->size; i++) {
                    record = batch->records->items[i];

                    region_t *region = region_new(strndup(record->sequence, record->sequence_len),
                                                  record->start, record->end,
                                                  record->strand ? strndup(&record->strand, 1) : NULL,
                                                  record->feature ? strndup(record->feature, record->feature_len) : NULL);
                    LOG_DEBUG_F("region '%s:%u-%u'\n", region->chromosome, region->start_position, region->end_position);

                    regions_batch[avail_regions++] = region;

                    // Save when the recommended size is reached
                    if (avail_regions == REGIONS_CHUNKSIZE) {
                        insert_regions(regions_batch, avail_regions, regions_table);
                        for (int r = 0; r < avail_regions; r++) {
                            free(regions_batch[r]);
                        }
                        avail_regions = 0;
                    }
                }
                gff_batch_free(batch);
                list_item_free(item);
            }

            // Save the remaining regions that did not fill a batch
            if (avail_regions > 0) {
                insert_regions(regions_batch, avail_regions, regions_table);
                for (int r = 0; r < avail_regions; r++) {
                    free(regions_batch[r]);
                }
                avail_regions = 0;
            }
        }
    }

    finish_region_table_loading(regions_table);
    list_free_deep(read_list, NULL);
    gff_close(file, 1);
    return regions_table;
}
static int bfwork_run_sequential(bam_fwork_t *fwork) {
    int i, err;
    size_t reads, reads_to_write;
    double times;
    bam_region_t *region;

    // Context
    bfwork_context_t *context;
    size_t pf_l;

    err = WANDER_REGION_CHANGED;
    reads = 0;
    context = fwork->context;
    pf_l = context->processing_f_l;
    while (err) {
        // Create new current region
        region = (bam_region_t *) malloc(sizeof(bam_region_t));
        breg_init(region);

        // Fill region
#ifdef D_TIME_DEBUG
        times = omp_get_wtime();
#endif
        err = bfwork_obtain_region(fwork, region);
#ifdef D_TIME_DEBUG
        times = omp_get_wtime() - times;
        if (region->size != 0)
            if (context->time_stats)
                time_add_time_slot(D_FWORK_READ, context->time_stats, times / (double) region->size);
#endif
        if (err) {
            if (err == WANDER_REGION_CHANGED || err == WANDER_READ_EOF) {
                // Add region to framework regions
                bfwork_region_insert(fwork, region);

#ifdef D_TIME_DEBUG
                times = omp_get_wtime();
#endif
                // Process region
                for (i = 0; i < pf_l; i++) {
                    context->processing_f[i](fwork, region);
                }
#ifdef D_TIME_DEBUG
                times = omp_get_wtime() - times;
                if (context->time_stats)
                    if (region->size != 0) {
                        time_add_time_slot(D_FWORK_PROC, context->time_stats, times / (double) region->size);
                        time_add_time_slot(D_FWORK_PROC_FUNC, context->time_stats, times / (double) region->size);
                    }
                times = omp_get_wtime();
#endif

                reads_to_write = region->size;
                reads += reads_to_write;
                printf("Reads processed: %lu\r", reads);

                // Write region
                breg_write_n(region, reads_to_write, fwork->output_file);

                // Remove region from list
                linked_list_remove(region, fwork->regions_list);

                // Free region
                breg_destroy(region, 1);
                free(region);

#ifdef D_TIME_DEBUG
                times = omp_get_wtime() - times;
                if (context->time_stats)
                    if (reads_to_write != 0)
                        time_add_time_slot(D_FWORK_WRITE, context->time_stats, times / (double) reads_to_write);
#endif

                // End readings
                if (err == WANDER_READ_EOF)
                    break;
            } else {
                if (err == WANDER_READ_TRUNCATED) {
                    LOG_WARN("Truncated read found\n");
                } else {
                    LOG_FATAL_F("Failed to read next region, error code: %d\n", err);
                }
                break;
            }
        } else {
            // No more regions, end loop; release the region that was never filled
            breg_destroy(region, 1);
            free(region);
            LOG_INFO("No more regions to read");
        }
    }

    printf("\n");
    return err;
}
static int bfwork_run_threaded(bam_fwork_t *fwork) {
    int err;
    bam_region_t *region;
    linked_list_t *regions;
    double times;

    omp_lock_t end_condition_lock;
    int end_condition;

    omp_lock_t reads_lock;
    size_t reads;
    size_t reads_to_write;

    // Init locks
    omp_init_lock(&end_condition_lock);
    omp_init_lock(&reads_lock);

    //#pragma omp parallel private(err, region, regions, times, reads_to_write)
    {
        //#pragma omp single
        {
            printf("Running in multithreading mode with %d threads\n", omp_get_max_threads());
            end_condition = 1;
            reads = 0;
        }

        #pragma omp parallel sections private(err, region, regions, times, reads_to_write)
        {
            // Region read
            #pragma omp section
            {
                regions = fwork->regions_list;
                while (1) {
                    // Create new current region
                    region = (bam_region_t *) malloc(sizeof(bam_region_t));
                    breg_init(region);

                    // Fill region
#ifdef D_TIME_DEBUG
                    times = omp_get_wtime();
#endif
                    err = bfwork_obtain_region(fwork, region);
#ifdef D_TIME_DEBUG
                    times = omp_get_wtime() - times;
                    omp_set_lock(&region->lock);
                    if (fwork->context->time_stats)
                        if (region->size != 0)
                            time_add_time_slot(D_FWORK_READ, fwork->context->time_stats, times / (double) region->size);
                    omp_unset_lock(&region->lock);
#endif
                    if (err) {
                        if (err == WANDER_REGION_CHANGED || err == WANDER_READ_EOF) {
                            // Until processed, this region can't be written
                            omp_test_lock(&region->write_lock);

                            // Add region to framework regions
                            bfwork_region_insert(fwork, region);

                            #pragma omp task untied firstprivate(region) private(err)
                            {
                                int i;
                                size_t pf_l;
                                double aux_time;

                                // Process region
                                omp_set_lock(&region->lock);
#ifdef D_TIME_DEBUG
                                times = omp_get_wtime();
#endif
                                pf_l = fwork->context->processing_f_l;
                                for (i = 0; i < pf_l; i++) {
                                    fwork->context->processing_f[i](fwork, region);
                                }
#ifdef D_TIME_DEBUG
                                times = omp_get_wtime() - times;
                                if (fwork->context->time_stats)
                                    if (region->size != 0)
                                        time_add_time_slot(D_FWORK_PROC_FUNC, fwork->context->time_stats, times / (double) region->size);
                                aux_time = omp_get_wtime();
#endif
                                omp_unset_lock(&region->lock);

                                omp_set_lock(&reads_lock);
                                reads += region->size;
                                printf("Reads processed: %lu\r", reads);
                                omp_unset_lock(&reads_lock);

#ifdef D_TIME_DEBUG
                                aux_time = omp_get_wtime() - aux_time;
                                omp_set_lock(&region->lock);
                                if (fwork->context->time_stats)
                                    if (region->size != 0)
                                        time_add_time_slot(D_FWORK_PROC, fwork->context->time_stats, (times + aux_time) / (double) region->size);
                                omp_unset_lock(&region->lock);
#endif

                                // Set this region as writable
                                omp_unset_lock(&region->write_lock);
                            }

                            // End readings
                            if (err == WANDER_READ_EOF)
                                break;
                        } else {
                            if (err == WANDER_READ_TRUNCATED) {
                                LOG_WARN("Truncated read found\n");
                            } else {
                                LOG_FATAL_F("Failed to read next region, error code: %d\n", err);
                            }
                            break;
                        }
                    } else {
                        // No more regions, end loop
                        LOG_INFO("No more regions to read");
                        break;
                    }
                }

                omp_set_lock(&end_condition_lock);
                end_condition = 0;
                omp_unset_lock(&end_condition_lock);
                //LOG_WARN("Read thread exit\n");
            } // End read section

            // Write section
            #pragma omp section
            {
                regions = fwork->regions_list;
                omp_set_lock(&end_condition_lock);
                while (end_condition || linked_list_size(regions) > 0) {
                    omp_unset_lock(&end_condition_lock);

#ifdef D_TIME_DEBUG
                    times = omp_get_wtime();
#endif
                    // Get next region
                    omp_set_lock(&fwork->regions_lock);
                    region = linked_list_get_first(regions);
                    omp_unset_lock(&fwork->regions_lock);
                    if (region == NULL) {
                        omp_set_lock(&end_condition_lock);
                        continue;
                    }

                    // Wait for the region to be writable
                    omp_set_lock(&region->write_lock);

                    // Write region
                    omp_set_lock(&fwork->output_file_lock);
                    reads_to_write = region->size;
                    breg_write_n(region, reads_to_write, fwork->output_file);
                    omp_unset_lock(&fwork->output_file_lock);

                    // Remove from list
                    omp_set_lock(&fwork->regions_lock);
                    if (linked_list_size(regions) == 1) // Possible bug?
                        linked_list_clear(regions, NULL);
                    else
                        linked_list_remove_first(regions);

                    // Signal read section if regions list is full
                    if (linked_list_size(regions) < (FWORK_REGIONS_MAX / 2))
                        omp_unset_lock(&fwork->free_slots);
                    omp_unset_lock(&fwork->regions_lock);

#ifdef D_TIME_DEBUG
                    times = omp_get_wtime() - times;
                    omp_set_lock(&region->lock);
                    if (fwork->context->time_stats)
                        if (reads_to_write != 0)
                            time_add_time_slot(D_FWORK_WRITE, fwork->context->time_stats, times / (double) reads_to_write);
                    omp_unset_lock(&region->lock);
#endif

                    // Free region
                    breg_destroy(region, 1);
                    free(region);

                    omp_set_lock(&end_condition_lock);
                }
                omp_unset_lock(&end_condition_lock);
                //LOG_WARN("Write thread exit\n");
            } // End write section
        } // End sections
    } // End parallel

    // Line skip
    printf("\n");

    // Free locks
    omp_destroy_lock(&end_condition_lock);
    omp_destroy_lock(&reads_lock);

    return NO_ERROR;
}
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;

    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }

    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);

    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }

    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }

    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }

    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);

    // Create job.status file
    char job_status_filename[output_directory_len + 10];
    sprintf(job_status_filename, "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }

    #pragma omp parallel sections private(start, stop, total)
    {
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());

            start = omp_get_wtime();
            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);
            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);

            notify_end_parsing(vcf_file);
        }

        #pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());

            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);

            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;

            // Filename structure: outdir/vcfname.errors
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename) + 1, sizeof(char)); // +1 for the NUL terminator
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            char *non_processed_filename = malloc((strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9) * sizeof(char));
            sprintf(non_processed_filename, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            free(prefix_filename);

            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY,
                        ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);

            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;

            start = omp_get_wtime();

            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }

                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }

                    LOG_DEBUG("VCF header written\n");

                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }

                LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                           i, omp_get_thread_num(), batch->records->size, batch->records->capacity);

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file ? get_num_variables(ped_file) : 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables,
                                                              batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread,
                                                      &num_chunks, &chunk_sizes);

                    do {
                        // OpenMP: Launch a thread for each range
                        #pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]),
                                                            chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list,
                                                      summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc(max_line_size[tid], sizeof(char));
                            }

                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]),
                                                                       chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc(snp_max_line_size[tid], sizeof(char));
                                }

                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]),
                                                                            chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc(mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }

                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);

                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) { LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0)); }
                            if (ret_ws_1) { LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1)); }
                            if (ret_ws_2) { LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2)); }

                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors occurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        } else {
                            free(chunk_starts);
                            free(chunk_sizes);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));
                }

                // If the maximum number of reconnections was reached still with errors,
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
                    #pragma omp critical
                    {
                        write_vcf_batch(batch, non_processed_file);
                    }
                }

                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);

                // Free batch and its contents
                vcf_batch_free(batch);

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }

            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);

            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }

        #pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t *item = NULL;
            FILE *fd = NULL;
            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");

            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;

                // Type greater than 0: consequence type identified by its SO code
                // Type equal to -1: SNP phenotype
                // Type equal to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }

                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }

                free(line);
                list_item_free(item);
            }
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);

    vcf_close(vcf_file);

    update_job_status_file(100, job_status);
    close_job_status_file(job_status);

    return ret_code;
}
int run_association_test(shared_options_data_t *shared_options_data, assoc_options_data_t *options_data) {
    list_t *output_list = (list_t*) malloc(sizeof(list_t));
    list_init("output", shared_options_data->num_threads, INT_MAX, output_list);

    int ret_code = 0;
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ped_file_t *ped_file = ped_open(shared_options_data->ped_filename);
    if (!ped_file) {
        LOG_FATAL("PED file does not exist!\n");
    }
    LOG_INFO("About to read PED file...\n");
    // Read PED file before doing any processing
    ret_code = ped_read(ped_file);
    if (ret_code != 0) {
        LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
    }

    // Try to create the directory where the output files will be stored
    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }

    LOG_INFO("About to perform basic association test...\n");

    #pragma omp parallel sections private(ret_code)
    {
        #pragma omp section
        {
            LOG_DEBUG_F("Level %d: number of threads in the team - %d\n", 0, omp_get_num_threads());

            double start = omp_get_wtime();
            ret_code = vcf_read(vcf_file, 0,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);
            double stop = omp_get_wtime();
            double total = stop - start;

            if (ret_code) {
                LOG_FATAL_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);

            notify_end_reading(vcf_file);
        }

        #pragma omp section
        {
            LOG_DEBUG_F("Level %d: number of threads in the team - %d\n", 10, omp_get_num_threads());
            // Enable nested parallelism
            omp_set_nested(1);

            volatile int initialization_done = 0;
            // Pedigree information
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;

            // Create chain of filters for the VCF file
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);

            double start = omp_get_wtime();

            double *factorial_logarithms = NULL;

            int i = 0;
            #pragma omp parallel num_threads(shared_options_data->num_threads) shared(initialization_done, factorial_logarithms, filters, individuals)
            {
                LOG_DEBUG_F("Level %d: number of threads in the team - %d\n", 11, omp_get_num_threads());

                char *text_begin, *text_end;
                vcf_reader_status *status;
                while ((text_begin = fetch_vcf_text_batch(vcf_file)) != NULL) {
                    text_end = text_begin + strlen(text_begin);
                    if (text_begin == text_end) { // EOF
                        free(text_begin);
                        break;
                    }

                    #pragma omp critical
                    {
                        status = vcf_reader_status_new(shared_options_data->batch_lines, i);
                        i++;
                    }

                    if (shared_options_data->batch_bytes > 0) {
                        ret_code = run_vcf_parser(text_begin, text_end, 0, vcf_file, status);
                    } else if (shared_options_data->batch_lines > 0) {
                        ret_code = run_vcf_parser(text_begin, text_end, shared_options_data->batch_lines, vcf_file, status);
                    }

                    // Initialize structures needed for association tests and write headers of output files
                    if (!initialization_done && vcf_file->samples_names->size > 0) {
                        #pragma omp critical
                        {
                            // Guarantee that just one thread performs this operation
                            if (!initialization_done) {
                                // Create map to associate the position of individuals in the list of samples defined in the VCF file
                                sample_ids = associate_samples_and_positions(vcf_file);
                                // Sort individuals in PED as defined in the VCF file
                                individuals = sort_individuals(vcf_file, ped_file);

                                // Add headers associated to the defined filters
                                vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                                for (int j = 0; j < num_filters; j++) {
                                    add_vcf_header_entry(filter_headers[j], vcf_file);
                                }

                                // Write file format, header entries and delimiter
                                if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                                if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }

                                LOG_DEBUG("VCF header written\n");

                                if (options_data->task == FISHER) {
                                    factorial_logarithms = init_logarithm_array(get_num_vcf_samples(vcf_file) * 10);
                                }

                                initialization_done = 1;
                            }
                        }
                    }

                    // If it has not been initialized, the header has not been fully read yet
                    if (!initialization_done) {
                        continue;
                    }

                    vcf_batch_t *batch = fetch_vcf_batch(vcf_file);

                    if (i % 100 == 0) {
                        LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                                   i, omp_get_thread_num(), batch->records->size, batch->records->capacity);
                    }

                    assert(batch);

                    // Launch association test over records that passed the filters
                    array_list_t *failed_records = NULL;
                    int num_variables = ped_file ? get_num_variables(ped_file) : 0;
                    array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables,
                                                                  batch->records, &failed_records);
                    if (passed_records->size > 0) {
                        assoc_test(options_data->task, (vcf_record_t**) passed_records->items, passed_records->size,
                                   individuals, get_num_vcf_samples(vcf_file), factorial_logarithms, output_list);
                    }

                    // Write records that passed and failed filters to separate files, and free them
                    write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                    free_filtered_records(passed_records, failed_records, batch->records);

                    // Free batch and its contents
                    vcf_reader_status_free(status);
                    vcf_batch_free(batch);
                }

                notify_end_parsing(vcf_file);
            }

            double stop = omp_get_wtime();
            double total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);

            // Free resources
            for (int i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);

            if (sample_ids) { kh_destroy(ids, sample_ids); }
            if (individuals) { free(individuals); }

            // Decrease list writers count
            for (int i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }

        #pragma omp section
        {
            // Thread that writes the results to the output file
            LOG_DEBUG_F("Level %d: number of threads in the team - %d\n", 20, omp_get_num_threads());

            double start = omp_get_wtime();

            // Get the file descriptor
            char *path;
            FILE *fd = get_assoc_output_file(options_data->task, shared_options_data, &path);
            LOG_INFO_F("Association test output filename = %s\n", path);

            // Write data: header + one line per variant
            write_output_header(options_data->task, fd);
            write_output_body(options_data->task, output_list, fd);

            fclose(fd);

            // Sort resulting file
            char *cmd = calloc(40 + strlen(path) * 4, sizeof(char));
            sprintf(cmd, "sort -k1,1h -k2,2n %s > %s.tmp && mv %s.tmp %s", path, path, path, path);
            int sort_ret = system(cmd);
            free(cmd);
            if (sort_ret) {
                LOG_WARN("Association results could not be sorted by chromosome and position, will be shown unsorted\n");
            }

            double stop = omp_get_wtime();
            double total = stop - start;

            LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);
        }
    }

    free(output_list);
    vcf_close(vcf_file);
    ped_close(ped_file, 1, 1);

    return ret_code;
}
int run_filter(shared_options_data_t *shared_options_data, filter_options_data_t *options_data) {
    int ret_code;
    double start, stop, total;

    vcf_file_t *file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }

    #pragma omp parallel sections private(start, stop, total)
    {
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();
            if (shared_options_data->batch_bytes > 0) {
                ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, file);
            } else if (shared_options_data->batch_lines > 0) {
                ret_code = vcf_parse_batches(shared_options_data->batch_lines, file);
            }
            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);

            notify_end_parsing(file);
        }

        #pragma omp section
        {
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            if (!options_data->save_rejected && failed_file) {
                fclose(failed_file);
            }
            LOG_DEBUG("File streams created\n");

            start = omp_get_wtime();

            int i = 0;
            vcf_batch_t *batch = NULL;
            while ((batch = fetch_vcf_batch(file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], file);
                    }

                    // Write file format, header entries and delimiter
                    write_vcf_header(file, passed_file);
                    if (options_data->save_rejected) {
                        write_vcf_header(file, failed_file);
                    }

                    LOG_DEBUG("VCF header written\n");
                }

                array_list_t *input_records = batch->records;
                array_list_t *passed_records = NULL, *failed_records = NULL;

                if (i % 100 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                               i, omp_get_thread_num(), batch->records->size, batch->records->capacity);
                }

                if (filters == NULL) {
                    passed_records = input_records;
                } else {
                    failed_records = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
                    passed_records = run_filter_chain(input_records, failed_records, filters, num_filters);
                }

                // Write records that passed and failed to two new separate files
                if (passed_records != NULL && passed_records->size > 0) {
                    LOG_DEBUG_F("[batch %d] %zu passed records\n", i, passed_records->size);
                    #pragma omp critical
                    {
                        for (int r = 0; r < passed_records->size; r++) {
                            write_vcf_record(passed_records->items[r], passed_file);
                        }
                    }
                }

                if (options_data->save_rejected && failed_records != NULL && failed_records->size > 0) {
                    LOG_DEBUG_F("[batch %d] %zu failed records\n", i, failed_records->size);
                    #pragma omp critical
                    {
                        for (int r = 0; r < failed_records->size; r++) {
                            write_vcf_record(failed_records->items[r], failed_file);
                        }
                    }
                }

                // Free batch and its contents
                vcf_batch_free(batch);
                // Free items in both lists (not their internal data)
                if (passed_records != input_records) {
                    array_list_free(passed_records, NULL);
                }
                if (failed_records) {
                    array_list_free(failed_records, NULL);
                }

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total * 1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (options_data->save_rejected && failed_file) { fclose(failed_file); }
            free_filters(filters, num_filters);
        }
    }

    vcf_close(file);

    return 0;
}
void fill_end_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg, genome_t *genome, int min_H, int min_distance) {
    int sw_count = 0;

    fastq_read_t *fq_read;
    array_list_t *fq_batch = mapping_batch->fq_batch;

    size_t read_index, read_len;

    cal_t *cal;
    array_list_t *cal_list = NULL;
    size_t num_cals, num_targets = mapping_batch->num_targets;

    char *seq, *revcomp_seq = NULL;

    seed_region_t *s;
    cigar_op_t *cigar_op;
    cigar_code_t *cigar_code;

    size_t start, end;
    size_t gap_read_start, gap_read_end, gap_read_len;
    size_t gap_genome_start, gap_genome_end, gap_genome_len;

    int first, last, mode, distance;
    sw_prepare_t *sw_prepare;
    char *ref;

    // Initialize query and reference sequences for Smith-Waterman
    for (size_t i = 0; i < num_targets; i++) {
        read_index = mapping_batch->targets[i];
        fq_read = (fastq_read_t *) array_list_get(read_index, fq_batch);

        cal_list = mapping_batch->mapping_lists[read_index];
        num_cals = array_list_size(cal_list);
        if (num_cals <= 0) continue;

        read_len = fq_read->length;
        revcomp_seq = NULL;

        // Process each CAL from this read
        for (size_t j = 0; j < num_cals; j++) {
            // Get CAL and read index
            cal = array_list_get(j, cal_list);
            if (cal->sr_list->size == 0) continue;

            sw_prepare = NULL;
            s = (seed_region_t *) linked_list_get_first(cal->sr_list);
            cigar_code = (cigar_code_t *) s->info;
            LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, cigar = %s (distance = %i)\n",
                        j, num_cals, cal->strand, cal->sr_list->size,
                        new_cigar_code_string(cigar_code), cigar_code->distance);

            for (int k = 0; k < 2; k++) {
                mode = NONE_POS;
                if (k == 0) {
                    if ((cigar_op = cigar_code_get_op(0, cigar_code)) &&
                        cigar_op->name == 'H' && cigar_op->number > min_H) {
                        LOG_DEBUG_F("%i%c\n", cigar_op->number, cigar_op->name);
                        mode = BEGIN_POS;
                        gap_read_start = 0;
                        gap_read_end = cigar_op->number - 1;
                        gap_genome_start = s->genome_start;
                        gap_genome_end = gap_genome_start + cigar_op->number - 1;
                    }
                } else {
                    if ((cigar_op = cigar_code_get_last_op(cigar_code)) &&
                        cigar_op->name == 'H' && cigar_op->number > min_H) {
                        LOG_DEBUG_F("%i%c\n", cigar_op->number, cigar_op->name);
                        mode = END_POS;
                        gap_read_start = read_len - cigar_op->number;
                        gap_read_end = read_len - 1;
                        gap_genome_end = s->genome_end;
                        gap_genome_start = gap_genome_end - cigar_op->number + 1;
                    }
                }
                if (mode == NONE_POS) continue;

                // Get query sequence, revcomp if necessary
                if (cal->strand) {
                    if (revcomp_seq == NULL) {
                        revcomp_seq = strdup(fq_read->sequence);
                        seq_reverse_complementary(revcomp_seq, read_len);
                    }
                    seq = revcomp_seq;
                } else {
                    seq = fq_read->sequence;
                }
                gap_read_len = gap_read_end - gap_read_start + 1;
                /*
                char *query = (char *) malloc((gap_read_len + 1) * sizeof(char));
                memcpy(query, seq, gap_len);
                query[gap_read_len] = '\0';
                */

                // Get reference sequence
                start = gap_genome_start;
                end = gap_genome_end;
                gap_genome_len = end - start + 1;
                ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));
                genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &start, &end, genome);
                ref[gap_genome_len] = '\0';

                first = -1;
                last = -1;
                distance = 0;
                // c indexes the reference, c1 the corresponding read position
                for (int c = 0, c1 = gap_read_start; c < gap_read_len; c++, c1++) {
                    if (seq[c1] != ref[c]) {
                        distance++;
                        if (first == -1) first = c;
                        last = c;
                    }
                }

                if (distance < min_distance) {
                    cigar_op->name = 'M';
                    cigar_code->distance += distance;
                    free(ref);
                    continue;
                } else {
                    LOG_FATAL_F("here we must run SW: distance = %i: first = %i, last = %i, gaps (read, genome) = (%i, %i)\n",
                                distance, first, last, gap_read_len, gap_genome_len);
                }

                // We must run the SW algorithm
                // sw_prepare = sw_prepare_new(0, 0, 0, 0);
                // sw_prepare_sequences(cal, genome, sw_prepare);
                // array_list_insert(sw_prepare, sw_prepare_list);
                // sw_count++;
            }
        }
    }
    LOG_DEBUG_F("sw_count = %i\n", sw_count);

    // Debugging...
    for (size_t i = 0; i < num_targets; i++) {
        read_index = mapping_batch->targets[i];
        fq_read = (fastq_read_t *) array_list_get(read_index, fq_batch);
        LOG_DEBUG_F("Read %s\n", fq_read->id);

        cal_list = mapping_batch->mapping_lists[read_index];
        num_cals = array_list_size(cal_list);
        if (num_cals <= 0) continue;

        for (size_t j = 0; j < num_cals; j++) {
            // Get CAL and read index
            cal = array_list_get(j, cal_list);
            if (cal->sr_list->size == 0) continue;

            sw_prepare = NULL;
            s = (seed_region_t *) linked_list_get_first(cal->sr_list);
            cigar_code = (cigar_code_t *) s->info;
            LOG_DEBUG_F("\tCAL #%i of %i (strand %i), sr_list size = %i, cigar = %s (distance = %i)\n",
                        j, num_cals, cal->strand, cal->sr_list->size,
                        new_cigar_code_string(cigar_code), cigar_code->distance);
        }
    }
}
int run_stats(shared_options_data_t *shared_options_data, stats_options_data_t *options_data) { file_stats_t *file_stats = file_stats_new(); sample_stats_t **sample_stats; // List that stores the batches of records filtered by each thread list_t *output_list[shared_options_data->num_threads]; // List that stores which thread filtered the next batch to save list_t *next_token_list = malloc(sizeof(list_t)); int ret_code; double start, stop, total; vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches); if (!vcf_file) { LOG_FATAL("VCF file does not exist!\n"); } ped_file_t *ped_file = NULL; if (shared_options_data->ped_filename) { ped_file = ped_open(shared_options_data->ped_filename); if (!ped_file) { LOG_FATAL("PED file does not exist!\n"); } if(options_data->variable) { set_variable_field(options_data->variable, 0, ped_file); } else { set_variable_field("PHENO", 6, ped_file); } if(options_data->variable_groups) { int n, m; char *variable_groups = strdup(options_data->variable_groups); char **groups; char **phenos_in_group; groups = split(variable_groups, ":", &n); for(int i = 0; i < n; i++){ phenos_in_group = split(groups[i], ",", &m); if(set_phenotype_group(phenos_in_group, m, ped_file) < 0) { LOG_ERROR("Variable can't appear in two groups\n"); return DUPLICATED_VARIABLE; } free(phenos_in_group); } ped_file->accept_new_values = 0; free(variable_groups); free(groups); } else { ped_file->accept_new_values = 1; } if(options_data->phenotype) { int n; char* phenotypes = strdup(options_data->phenotype); char** pheno_values = split(phenotypes, ",", &n); if(n != 2) { LOG_ERROR("To handle case-control test, only two phenotypes are supported\n"); return MORE_THAN_TWO_PHENOTYPES; } else { set_unaffected_phenotype(pheno_values[0],ped_file); set_affected_phenotype(pheno_values[1],ped_file); } } else { set_unaffected_phenotype("1", ped_file); set_affected_phenotype("2", ped_file); } LOG_INFO("About to read PED file...\n"); // Read PED file before doing any processing ret_code = ped_read(ped_file); if (ret_code != 0) { LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename); } if(!ped_file->num_field) { LOG_ERROR_F("Can't find the specified field \"%s\" in file: %s \n", options_data->variable, ped_file->filename); return VARIABLE_FIELD_NOT_FOUND; } } ret_code = create_directory(shared_options_data->output_directory); if (ret_code != 0 && errno != EEXIST) { LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory); } // Initialize variables related to the different threads for (int i = 0; i < shared_options_data->num_threads; i++) { output_list[i] = (list_t*) malloc(sizeof(list_t)); list_init("input", 1, shared_options_data->num_threads * shared_options_data->batch_lines, output_list[i]); } list_init("next_token", shared_options_data->num_threads, INT_MAX, next_token_list); LOG_INFO("About to retrieve statistics from VCF file...\n"); #pragma omp parallel sections private(start, stop, total) { #pragma omp section { LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num()); // Reading start = omp_get_wtime(); if (shared_options_data->batch_bytes > 0) { ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, vcf_file); } else if (shared_options_data->batch_lines > 0) { ret_code = vcf_parse_batches(shared_options_data->batch_lines, vcf_file); } stop = omp_get_wtime(); total = stop - start; if (ret_code) { LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code); } LOG_INFO_F("[%dR] Time 
elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); notify_end_parsing(vcf_file); } #pragma omp section { // Enable nested parallelism and set the number of threads the user has chosen omp_set_nested(1); LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num()); individual_t **individuals = NULL; khash_t(ids) *sample_ids = NULL; khash_t(str) *phenotype_ids = NULL; int num_phenotypes; start = omp_get_wtime(); int i = 0; vcf_batch_t *batch = NULL; while ((batch = fetch_vcf_batch(vcf_file)) != NULL) { if (i == 0) { sample_stats = malloc (get_num_vcf_samples(vcf_file) * sizeof(sample_stats_t*)); for (int j = 0; j < get_num_vcf_samples(vcf_file); j++) { sample_stats[j] = sample_stats_new(array_list_get(j, vcf_file->samples_names)); } if (ped_file) { // Create map to associate the position of individuals in the list of samples defined in the VCF file sample_ids = associate_samples_and_positions(vcf_file); // Sort individuals in PED as defined in the VCF file individuals = sort_individuals(vcf_file, ped_file); // Get the khash of the phenotypes in PED file phenotype_ids = get_phenotypes(ped_file); num_phenotypes = get_num_variables(ped_file); } } if (i % 50 == 0) { LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", i, omp_get_thread_num(), batch->records->size, batch->records->capacity); } // Divide the list of passed records in ranges of size defined in config file int num_chunks; int *chunk_sizes = NULL; array_list_t *input_records = batch->records; int *chunk_starts = create_chunks(input_records->size, ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads), &num_chunks, &chunk_sizes); // OpenMP: Launch a thread for each range #pragma omp parallel for num_threads(shared_options_data->num_threads) for (int j = 0; j < num_chunks; j++) { LOG_DEBUG_F("[%d] Stats invocation\n", omp_get_thread_num()); // Invoke variant stats and/or sample stats when applies if (options_data->variant_stats) { int index = omp_get_thread_num() % shared_options_data->num_threads; ret_code = get_variants_stats((vcf_record_t**) (input_records->items + chunk_starts[j]), chunk_sizes[j], individuals, sample_ids,num_phenotypes, output_list[index], file_stats); } if (options_data->sample_stats) { ret_code |= get_sample_stats((vcf_record_t**) (input_records->items + chunk_starts[j]), chunk_sizes[j], individuals, sample_ids, sample_stats, file_stats); } } if (options_data->variant_stats) { // Insert as many tokens as elements correspond to each thread for (int t = 0; t < num_chunks; t++) { for (int s = 0; s < chunk_sizes[t]; s++) { list_item_t *token_item = list_item_new(t, 0, NULL); list_insert_item(token_item, next_token_list); } } } free(chunk_starts); free(chunk_sizes); vcf_batch_free(batch); i++; } stop = omp_get_wtime(); total = stop - start; LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); // Decrease list writers count for (i = 0; i < shared_options_data->num_threads; i++) { list_decr_writers(next_token_list); list_decr_writers(output_list[i]); } if (sample_ids) { kh_destroy(ids, sample_ids); } if (individuals) { free(individuals); } } #pragma omp section { LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num()); char *stats_prefix = get_vcf_stats_filename_prefix(shared_options_data->vcf_filename, shared_options_data->output_filename, shared_options_data->output_directory); // File names and 
descriptors for output to plain text files char *stats_filename, *summary_filename, *phenotype_filename; FILE *stats_fd, *summary_fd, **phenotype_fd; char *stats_db_name; sqlite3 *db = NULL; khash_t(stats_chunks) *hash; khash_t(str) *phenotype_ids; int num_phenotypes; if(ped_file){ phenotype_ids = get_phenotypes(ped_file); num_phenotypes = get_num_variables(ped_file); } if (options_data->save_db) { delete_files_by_extension(shared_options_data->output_directory, "db"); stats_db_name = calloc(strlen(stats_prefix) + strlen(".db") + 2, sizeof(char)); sprintf(stats_db_name, "%s.db", stats_prefix); create_stats_db(stats_db_name, VCF_CHUNKSIZE, create_vcf_query_fields, &db); hash = kh_init(stats_chunks); } // Write variant (and global) statistics if (options_data->variant_stats) { stats_filename = get_variant_stats_output_filename(stats_prefix); if (!(stats_fd = fopen(stats_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of variants: %s\n", stats_filename); } //Open one file for each phenotype if(ped_file){ phenotype_fd = malloc(sizeof(FILE*)*num_phenotypes); if(options_data->variable_groups){ int n; char *variable_groups = strdup(options_data->variable_groups); char ** names = split(variable_groups, ":", &n); for(int i = 0; i < n; i++) { phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, names[i]); if(!(phenotype_fd[i] = fopen(phenotype_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename); } free(phenotype_filename); } free(names); free(variable_groups); } else { for (khint_t i = kh_begin(phenotype_ids); i != kh_end(phenotype_ids); ++i) { if (!kh_exist(phenotype_ids,i)) continue; phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, kh_key(phenotype_ids,i)); if(!(phenotype_fd[kh_val(phenotype_ids,i)] = fopen(phenotype_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename); } free(phenotype_filename); } } } // Write header report_vcf_variant_stats_header(stats_fd); if(ped_file){ for(int i = 0; i < num_phenotypes; i++) report_vcf_variant_phenotype_stats_header(phenotype_fd[i]); } // For each variant, generate a new line int avail_stats = 0; variant_stats_t *var_stats_batch[VCF_CHUNKSIZE]; list_item_t *token_item = NULL, *output_item = NULL; while ( token_item = list_remove_item(next_token_list) ) { output_item = list_remove_item(output_list[token_item->id]); assert(output_item); var_stats_batch[avail_stats] = output_item->data_p; avail_stats++; // Run only when certain amount of stats is available if (avail_stats >= VCF_CHUNKSIZE) { report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch); if(ped_file) for(int i = 0; i < num_phenotypes; i++) report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i); // Free all stats from the "batch" for (int i = 0; i < avail_stats; i++) { variant_stats_free(var_stats_batch[i]); } avail_stats = 0; } // Free resources list_item_free(output_item); list_item_free(token_item); } if (avail_stats > 0) { report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch); if(ped_file) for(int i = 0; i < num_phenotypes; i++) report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i); // Free all stats from the "batch" for (int i = 0; i < avail_stats; i++) { variant_stats_free(var_stats_batch[i]); } avail_stats = 0; } // Write whole file stats (data only got when launching variant stats) 
            summary_filename = get_vcf_file_stats_output_filename(stats_prefix);
            if (!(summary_fd = fopen(summary_filename, "w"))) {
                LOG_FATAL_F("Can't open file for writing statistics summary: %s\n", summary_filename);
            }
            report_vcf_summary_stats(summary_fd, db, file_stats);

            free(stats_filename);
            free(summary_filename);

            // Close variant stats files
            if (stats_fd) { fclose(stats_fd); }
            if (summary_fd) { fclose(summary_fd); }
            if (ped_file) {
                for (int i = 0; i < num_phenotypes; i++) {
                    if (phenotype_fd[i]) { fclose(phenotype_fd[i]); }
                }
                free(phenotype_fd);
            }
        }

        // Write sample statistics
        if (options_data->sample_stats) {
            stats_filename = get_sample_stats_output_filename(stats_prefix);
            if (!(stats_fd = fopen(stats_filename, "w"))) {
                LOG_FATAL_F("Can't open file for writing statistics of samples: %s\n", stats_filename);
            }

            report_vcf_sample_stats_header(stats_fd);
            report_vcf_sample_stats(stats_fd, NULL, vcf_file->samples_names->size, sample_stats);

            // Close sample stats file
            free(stats_filename);
            if (stats_fd) { fclose(stats_fd); }
        }

        free(stats_prefix);

        if (db) {
            insert_chunk_hash(VCF_CHUNKSIZE, hash, db);
            create_stats_index(create_vcf_index, db);
            close_stats_db(db, hash);
        }
    }
    }

    for (int i = 0; i < get_num_vcf_samples(vcf_file); i++) {
        sample_stats_free(sample_stats[i]);
    }
    free(sample_stats);
    free(file_stats);

    free(next_token_list);
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        free(output_list[i]);
    }

    vcf_close(vcf_file);
    if (ped_file) {
        ped_close(ped_file, 1, 1);
    }

    return 0;
}
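/*
 * A minimal sketch of the chunk-partitioner contract assumed by the stats loop
 * above. create_chunks() itself lives elsewhere in the library and its real
 * implementation may differ; this only illustrates the interface the loop
 * relies on: split `total` records into chunks of at most `max_size` (> 0),
 * return the start offsets, and report the chunk count and per-chunk sizes
 * through out-parameters so each OpenMP thread processes one contiguous range.
 * Assumes <stdlib.h> is included.
 */
static int *create_chunks_sketch(size_t total, int max_size, int *num_chunks, int **chunk_sizes) {
    *num_chunks = (int) ((total + max_size - 1) / max_size);    // ceiling division
    int *starts = malloc(*num_chunks * sizeof(int));
    *chunk_sizes = malloc(*num_chunks * sizeof(int));
    for (int i = 0; i < *num_chunks; i++) {
        starts[i] = i * max_size;
        int remaining = (int) total - starts[i];
        (*chunk_sizes)[i] = remaining < max_size ? remaining : max_size;    // last chunk may be shorter
    }
    return starts;
}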
int main(int argc, char *argv[]) {
    size_t max_batches = 20;
    size_t batch_size = 2000;
    list_t *read_list = (list_t*) malloc(sizeof(list_t));
    list_init("batches", 1, max_batches, read_list);

    int ret_code;
    double start, stop, total;

    if (argc < 2) {
        printf("Usage: %s file.gff [output.gff]\n", argv[0]);
        return 1;
    }

    // strdup instead of malloc + strncat: strncat on an uninitialized buffer is undefined behavior
    char *filename = strdup(argv[1]);
    gff_file_t *file;

    init_log_custom(LOG_LEVEL_DEBUG, 1, NULL, "w");

    #pragma omp parallel sections private(start, stop, total) lastprivate(file)
    {
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();
            file = gff_open(filename);
            ret_code = gff_read_batches(read_list, batch_size, file);
            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code);
            }
            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Writing to a new file
            if (argc == 3) {
                start = omp_get_wtime();
                ret_code = gff_write(file, argv[2]);
                stop = omp_get_wtime();
                total = stop - start;

                if (ret_code) {
                    LOG_ERROR_F("[%dW] Error code = %d\n", omp_get_thread_num(), ret_code);
                }
                LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total);
                LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            }

            list_decr_writers(read_list);
            gff_close(file, 0);
        }

        #pragma omp section
        {
            LOG_DEBUG_F("OMP num threads = %d\n", omp_get_num_threads());
            LOG_DEBUG_F("Thread %d prints info\n", omp_get_thread_num());

            start = omp_get_wtime();

            int i = 0;
            list_item_t *item = NULL;
            FILE *out = fopen("result.gff", "w");
            while ((item = list_remove_item(read_list)) != NULL) {
                if (i % 200 == 0) {
                    LOG_DEBUG_F("Batch %d reached by thread %d - %zu/%zu records\n", i, omp_get_thread_num(),
                                ((gff_batch_t*) item->data_p)->length, ((gff_batch_t*) item->data_p)->max_length);
                }

                // gff_write_to_file(file, out);
                // gff_batch_print(stdout, item->data_p);
                write_gff_batch(item->data_p, out);

                gff_batch_free(item->data_p);
                list_item_free(item);
                i++;
            }
            fclose(out);

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
        }
    }

    free(filename);
    free(read_list);

    return 0;
}
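/*
 * The two sections above communicate through the blocking list API; the
 * signatures below are inferred from how this file uses them, not from
 * library documentation. list_remove_item() blocks until either an item is
 * available or every registered writer has called list_decr_writers(); that
 * final call is what lets the consumer loop fall out with NULL at
 * end-of-stream. A minimal single-writer sketch of the handshake:
 */
void list_handshake_sketch(void) {
    list_t queue;
    list_init("demo", 1, 10, &queue);                   // 1 writer, capacity 10

    int *payload = malloc(sizeof(int));
    *payload = 42;
    list_insert_item(list_item_new(0, 0, payload), &queue);
    list_decr_writers(&queue);                          // writer announces end-of-stream

    list_item_t *item;
    while ((item = list_remove_item(&queue)) != NULL) { // returns NULL once drained
        free(item->data_p);
        list_item_free(item);
    }
}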
int run_merge(shared_options_data_t *shared_options_data, merge_options_data_t *options_data) {
    if (options_data->num_files == 1) {
        LOG_INFO("Just one VCF file specified, no need to merge");
        return 0;
    }

    list_t *read_list[options_data->num_files];
    memset(read_list, 0, options_data->num_files * sizeof(list_t*));
    list_t *output_header_list = (list_t*) malloc(sizeof(list_t));
    list_init("headers", shared_options_data->num_threads, INT_MAX, output_header_list);
    list_t *output_list = (list_t*) malloc(sizeof(list_t));
    list_init("output", shared_options_data->num_threads,
              shared_options_data->max_batches * shared_options_data->batch_lines, output_list);
    list_t *merge_tokens = (list_t*) malloc(sizeof(list_t));
    list_init("tokens", 1, INT_MAX, merge_tokens);

    int ret_code = 0;
    double start, stop, total;
    vcf_file_t *files[options_data->num_files];
    memset(files, 0, options_data->num_files * sizeof(vcf_file_t*));

    // Initialize variables related to the different files
    for (int i = 0; i < options_data->num_files; i++) {
        files[i] = vcf_open(options_data->input_files[i], shared_options_data->max_batches);
        if (!files[i]) {
            LOG_FATAL_F("VCF file %s does not exist!\n", options_data->input_files[i]);
        }

        read_list[i] = (list_t*) malloc(sizeof(list_t));
        list_init("text", 1, shared_options_data->max_batches, read_list[i]);
    }

    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }

    chromosome_order = get_chromosome_order(shared_options_data->host_url, shared_options_data->species,
                                            shared_options_data->version, &num_chromosomes);

    printf("Number of threads = %d\n", shared_options_data->num_threads);

    #pragma omp parallel sections private(start, stop, total)
    {
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();
            ret_code = vcf_multiread_batches(read_list, shared_options_data->batch_lines, files, options_data->num_files);
            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading VCF files\n", ret_code);
            }
            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
        }

        #pragma omp section
        {
            // Enable nested parallelism
            omp_set_nested(1);
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());

            int num_eof_found = 0;
            int eof_found[options_data->num_files];
            memset(eof_found, 0, options_data->num_files * sizeof(int));

            list_item_t *items[options_data->num_files];
            memset(items, 0, options_data->num_files * sizeof(list_item_t*));
            char *texts[options_data->num_files];
            memset(texts, 0, options_data->num_files * sizeof(char*));

            khash_t(pos) *positions_read = kh_init(pos);

            long max_position_merged = LONG_MAX;
            char *max_chromosome_merged = NULL;
            int header_merged = 0;
            int token = 0;

            double start_parsing, start_insertion, total_parsing = 0, total_insertion = 0;

            start = omp_get_wtime();

            while (num_eof_found < options_data->num_files) {
                /* Process:
                 * - N threads getting batches of VCF records and inserting them in a data structure. The common minimum
                 *   position of each group of batches will also be stored.
                 * - If the data structure reaches a certain size or the end of a chromosome, merge positions prior to the
                 *   last minimum registered.
                 */
                // Getting text elements in a critical region guarantees that each thread gets variants in positions in the same range
                for (int i = 0; i < options_data->num_files; i++) {
                    if (eof_found[i]) { continue; }

                    items[i] = list_remove_item(read_list[i]);
                    if (items[i] == NULL || !strcmp(items[i]->data_p, "")) {
                        LOG_INFO_F("[%d] EOF found in file %s\n", omp_get_thread_num(), options_data->input_files[i]);
                        eof_found[i] = 1;
                        num_eof_found++;

                        if (items[i] != NULL && !strcmp(items[i]->data_p, "")) {
                            free(items[i]->data_p);
                            list_item_free(items[i]);
                            LOG_DEBUG_F("[%d] Text batch freed\n", omp_get_thread_num());
                        } else {
                            LOG_DEBUG_F("[%d] No need to free text batch\n", omp_get_thread_num());
                        }
                        continue;
                    }

                    assert(items[i]->data_p != NULL);
                    texts[i] = items[i]->data_p;
                    // printf("[%d] text batch from file %d\tcontents = '%s'\n", omp_get_thread_num(), i, texts[i]);
                }

                for (int i = 0; i < options_data->num_files; i++) {
                    if (eof_found[i]) { continue; }

                    start_parsing = omp_get_wtime();
                    char *text_begin = texts[i];
                    char *text_end = text_begin + strlen(text_begin);
                    assert(text_end != NULL);
                    // printf("batch = '%.*s'\n", text_end - text_begin, text_begin);

                    // Get VCF batches from text batches
                    vcf_reader_status *status = vcf_reader_status_new(shared_options_data->batch_lines, 0);
                    ret_code = run_vcf_parser(text_begin, text_end, shared_options_data->batch_lines, files[i], status);
                    if (ret_code) {
                        // TODO stop?
                        LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, files[i]->filename);
                        continue;
                    }
                    // printf("batches = %d\n", files[i]->record_batches->length);

                    vcf_batch_t *batch = fetch_vcf_batch_non_blocking(files[i]);
                    if (!batch) {
                        continue;
                    }
                    total_parsing += omp_get_wtime() - start_parsing;

                    start_insertion = omp_get_wtime();
                    // Insert records into the hashtable
                    for (int j = 0; j < batch->records->size; j++) {
                        vcf_record_t *record = vcf_record_copy(array_list_get(j, batch->records));
                        vcf_record_file_link *link = vcf_record_file_link_new(record, files[i]);
                        char key[64];
                        compose_key_value(record->chromosome, record->position, key);
                        int ret = insert_position_read(key, link, positions_read);
                        assert(ret);
                    }
                    total_insertion += omp_get_wtime() - start_insertion;

                    // Update the maximum position merged so far, taking the last record of this batch
                    vcf_record_t *current_record = (vcf_record_t*) array_list_get(batch->records->size - 1, batch->records);
                    calculate_merge_interval(current_record, &max_chromosome_merged, &max_position_merged,
                                             chromosome_order, num_chromosomes);

                    // Free the batch and its contents
                    vcf_reader_status_free(status);
                    vcf_batch_free(batch);
                    list_item_free(items[i]);
                }

                if (num_eof_found == options_data->num_files) {
                    max_chromosome_merged = chromosome_order[num_chromosomes-1];
                    max_position_merged = LONG_MAX;
                }

                // Merge headers, if not previously done
                if (!header_merged) {
                    merge_vcf_headers(files, options_data->num_files, options_data, output_header_list);
                    header_merged = 1;

                    // Decrease list writers count
                    for (int i = 0; i < shared_options_data->num_threads; i++) {
                        list_decr_writers(output_header_list);
                    }
                }

                // If the data structure reaches a certain size or the end of a chromosome,
                // merge positions prior to the last minimum registered
                if (num_eof_found < options_data->num_files && kh_size(positions_read) > TREE_LIMIT) {
                    LOG_INFO_F("Merging until position %s:%ld\n", max_chromosome_merged, max_position_merged);
                    token = merge_interval(positions_read, max_chromosome_merged, max_position_merged,
                                           chromosome_order, num_chromosomes, files,
                                           shared_options_data, options_data, output_list);
                }
                // When reaching EOF for all files, merge the remaining entries
                else if (num_eof_found == options_data->num_files && kh_size(positions_read) > 0) {
                    LOG_INFO_F("Merging remaining positions (last = %s:%ld)\n",
                               chromosome_order[num_chromosomes - 1], LONG_MAX);
                    token = merge_remaining_interval(positions_read, files, shared_options_data, options_data, output_list);
                }

                if (token) {
                    int *token_ptr = malloc(sizeof(int));
                    *token_ptr = token;
                    list_item_t *item = list_item_new(1, 0, token_ptr);
                    list_insert_item(item, merge_tokens);
                }

                // Set variables ready for next iteration of the algorithm
                if (max_chromosome_merged) {
                    free(max_chromosome_merged);
                }
                token = 0;
                max_chromosome_merged = NULL;
                max_position_merged = LONG_MAX;
            }

            kh_destroy(pos, positions_read);

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            LOG_DEBUG_F("** Time in parsing = %f s\n", total_parsing);
            LOG_DEBUG_F("** Time in insertion = %f s\n", total_insertion);
            // for (int i = 0; i < shared_options_data->num_threads; i++) {
            //     printf("[%d] Time in searching = %f s\n", i, total_search[i]);
            //     printf("[%d] Time in merging = %f s\n", i, total_merge[i]);
            // }

            // Decrease list writers count
            for (int i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
            list_decr_writers(merge_tokens);
        }

        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num());

            start = omp_get_wtime();

            // Create file streams for results
            char aux_filename[32];
            memset(aux_filename, 0, 32 * sizeof(char));
            sprintf(aux_filename, "merge_from_%d_files.vcf", options_data->num_files);
            char *merge_filename;
            FILE *merge_fd = get_output_file(shared_options_data, aux_filename, &merge_filename);
            LOG_INFO_F("Output filename = %s\n", merge_filename);
            free(merge_filename);

            list_item_t *item1 = NULL, *item2 = NULL;
            vcf_header_entry_t *entry;
            vcf_record_t *record;
            int *num_records;

            // Write headers
            while ((item1 = list_remove_item(output_header_list)) != NULL) {
                entry = item1->data_p;
                write_vcf_header_entry(entry, merge_fd);
            }

            // Write delimiter
            array_list_t *sample_names = merge_vcf_sample_names(files, options_data->num_files);
            write_vcf_delimiter_from_samples((char**) sample_names->items, sample_names->size, merge_fd);

            // Write records
            // When a token is present, it means a set of batches has been merged. The token contains the number of records merged.
            // In this case, the records must be sorted by chromosome and position, and written afterwards.
            while ((item1 = list_remove_item(merge_tokens)) != NULL) {
                num_records = item1->data_p;
                vcf_record_t *records[*num_records];
                for (int i = 0; i < *num_records; i++) {
                    item2 = list_remove_item(output_list);
                    if (!item2) {
                        break;
                    }
                    records[i] = item2->data_p;
                    list_item_free(item2);
                }

                // Sort records
                qsort(records, *num_records, sizeof(vcf_record_t*), record_cmp);

                // Write and free sorted records
                for (int i = 0; i < *num_records; i++) {
                    record = records[i];
                    write_vcf_record(record, merge_fd);
                    vcf_record_free_deep(record);
                }

                free(num_records);
                list_item_free(item1);
            }

            // Close file
            if (merge_fd != NULL) {
                fclose(merge_fd);
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
        }
    }

    // Free variables related to the different files
    for (int i = 0; i < options_data->num_files; i++) {
        if (files[i]) { vcf_close(files[i]); }
        if (read_list[i]) { free(read_list[i]); }
    }
    // Also release the shared lists themselves (previously leaked)
    free(output_header_list);
    free(merge_tokens);
    free(output_list);

    return ret_code;
}
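/*
 * record_cmp() is not shown in this file; one plausible shape for it, given
 * how qsort is invoked above on an array of vcf_record_t pointers, is
 * sketched below. It orders records by the index of their chromosome in the
 * global chromosome_order array, then by coordinate. This sketch assumes
 * NUL-terminated chromosome names and access to the chromosome_order /
 * num_chromosomes globals used elsewhere in run_merge; the library's actual
 * comparator may differ.
 */
static int record_cmp_sketch(const void *a, const void *b) {
    const vcf_record_t *r1 = *(vcf_record_t* const *) a;
    const vcf_record_t *r2 = *(vcf_record_t* const *) b;

    // Rank each chromosome by its position in the expected ordering
    int c1 = -1, c2 = -1;
    for (int i = 0; i < num_chromosomes; i++) {
        if (c1 < 0 && !strcmp(r1->chromosome, chromosome_order[i])) { c1 = i; }
        if (c2 < 0 && !strcmp(r2->chromosome, chromosome_order[i])) { c2 = i; }
    }
    if (c1 != c2) {
        return c1 - c2;
    }
    // Same chromosome: order by coordinate, avoiding overflow from plain subtraction
    return (r1->position > r2->position) - (r1->position < r2->position);
}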