/**
 * Runs a case-control association test over a group of variants and pushes one
 * result per variant onto the output list.
 *
 * @param test_type    CHI_SQUARE or FISHER; selects the statistic to compute
 * @param variants     variants to analyze
 * @param num_variants number of entries in 'variants'
 * @param samples      individuals whose genotypes are counted
 * @param num_samples  number of entries in 'samples' (one sample column per individual)
 * @param opt_input    extra test input; for FISHER it is read as a double* (log-factorial table,
 *                     presumably — confirm against caller)
 * @param output_list  list where assoc_basic_result_t / assoc_fisher_result_t are inserted,
 *                     tagged with the calling thread id (ownership transfers to the consumer)
 */
void assoc_test(enum ASSOC_task test_type, vcf_record_t **variants, int num_variants,
                individual_t **samples, int num_samples, const void *opt_input, list_t *output_list) {
    int tid = omp_get_thread_num();

    vcf_record_t *record;
    individual_t *individual;
    char *sample_data;
    int gt_position;
    int allele1, allele2;

    // Affection counts: alleles 1/2 observed in affected (A) and unaffected (U) individuals
    int A1 = 0, A2 = 0, U1 = 0, U2 = 0;

    // Perform analysis for each variant
    for (int i = 0; i < num_variants; i++) {
        record = variants[i];
        A1 = 0; A2 = 0; U1 = 0; U2 = 0;

        // FIX: the duplicated format string was previously leaked; free it once the
        // GT position has been obtained
        char *format_dup = strndup(record->format, record->format_len);
        gt_position = get_field_position_in_format("GT", format_dup);
        free(format_dup);

        // Count alleles over all individuals
        for (int j = 0; j < num_samples; j++) {
            individual = samples[j];
            sample_data = strdup(array_list_get(j, record->samples));
            if (!get_alleles(sample_data, gt_position, &allele1, &allele2)) {
                assoc_count_individual(individual, record, allele1, allele2, &A1, &A2, &U1, &U2);
            }
            free(sample_data);
        }

        // Finished counting: now compute the statistics
        if (test_type == CHI_SQUARE) {
            double assoc_basic_chisq = assoc_basic_test(A1, U1, A2, U2);
            assoc_basic_result_t *result = assoc_basic_result_new(record->chromosome, record->chromosome_len,
                    record->position, record->reference, record->reference_len,
                    record->alternate, record->alternate_len,
                    A1, A2, U1, U2, assoc_basic_chisq);
            list_item_t *output_item = list_item_new(tid, 0, result);
            list_insert_item(output_item, output_list);
        } else if (test_type == FISHER) {
            double p_value = assoc_fisher_test(A1, A2, U1, U2, (double*) opt_input);
            assoc_fisher_result_t *result = assoc_fisher_result_new(record->chromosome, record->chromosome_len,
                    record->position, record->reference, record->reference_len,
                    record->alternate, record->alternate_len,
                    A1, A2, U1, U2, p_value);
            list_item_t *output_item = list_item_new(tid, 0, result);
            list_insert_item(output_item, output_list);
        }
    } // next variant
}
void process_and_free_chromosome_avls(allocate_splice_elements_t *chromosome_avls, list_t* write_list_p, unsigned int write_size) { int c; allocate_buffers_t *allocate_batches = (allocate_buffers_t *)malloc(sizeof(allocate_buffers_t)); write_batch_t *exact_splice_write_p; write_batch_t *extend_splice_write_p; for(c = 0; c < CHROMOSOME_NUMBER; c++){ if(chromosome_avls[c].avl_splice->root != NULL) { allocate_batches->write_exact_sp = write_batch_new(write_size, SPLICE_EXACT_FLAG); allocate_batches->write_extend_sp = write_batch_new(write_size, SPLICE_EXTEND_FLAG); //allocate_batches->write_extend_sp = write_batch_new(1000, SPLICE_EXTEND_FLAG); allocate_batches = process_avlnode_in_order(chromosome_avls[c].avl_splice->root, c, write_list_p, write_size, allocate_batches); exact_splice_write_p = allocate_batches->write_exact_sp; extend_splice_write_p = allocate_batches->write_extend_sp; if(exact_splice_write_p != NULL) { list_item_t* item_p = NULL; if(exact_splice_write_p->size > 0) { item_p = list_item_new(0, WRITE_ITEM, exact_splice_write_p); list_insert_item(item_p, write_list_p); } else { write_batch_free(exact_splice_write_p); } } if(extend_splice_write_p != NULL) { list_item_t* item_p = NULL; if(extend_splice_write_p->size > 0) { item_p = list_item_new(0, WRITE_ITEM, extend_splice_write_p); list_insert_item(item_p, write_list_p); } else { write_batch_free(extend_splice_write_p); } } }//end IF chromosome splice not NULL cp_avltree_destroy(chromosome_avls[c].avl_splice); } free(allocate_batches); if (statistics_on) { statistics_set(TOTAL_ST, 3, total_splice, statistics_p); } list_decr_writers(write_list_p); }
/**
 * Merges every position still stored in the hashtable (used once all input
 * files have reached EOF). Each bucket holds the list of records sharing one
 * chromosome:position across files; merge_position() combines them into a
 * single record that is pushed to output_list tagged MERGED_RECORD.
 * Returns the number of merged records inserted.
 *
 * NOTE(review): kh_del() mutates the shared khash flags from inside an OpenMP
 * parallel for; khash is not documented as safe for concurrent modification —
 * verify this is benign (same pattern as merge_interval).
 */
int merge_remaining_interval(kh_pos_t* positions_read, vcf_file_t **files, shared_options_data_t *shared_options_data, merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;
#pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);
            // Launch merge
            int err_code = 0;
            vcf_record_t *merged = merge_position((vcf_record_file_link **) records_in_position->items, records_in_position->size, files, options_data->num_files, options_data, &err_code);
            if (!err_code) {
                list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                list_insert_item(item, output_list);
                num_entries += 1;
            }
            // Free empty nodes (lists of records in the same position)
            array_list_free(records_in_position, vcf_record_file_link_free);
            kh_del(pos, positions_read, k);
        }
    }
    return num_entries;
}
/**
 * Packs one exact and one extended splice-junction record for every end stored
 * in the given AVL node, appending them to the current exact/extended write
 * batches. Full batches are handed to the writer list and replaced by fresh
 * ones. Updates the globals junction_id and total_splice.
 *
 * Fixes vs previous version: removed the no-effect statement
 * 'allocate_batches->write_exact_sp;' and the unused 'strand' local.
 *
 * @param node             AVL node whose allocate_ends are dumped
 * @param chromosome       chromosome index the junctions belong to
 * @param write_list_p     writer list receiving full batches
 * @param write_size       capacity of each write batch, in bytes
 * @param allocate_batches holder of the current exact/extended batches
 * @return the same holder, with possibly-renewed batches inside
 */
allocate_buffers_t* process_avlnode_ends_in_order(node_element_splice_t *node, unsigned int chromosome, list_t* write_list_p, unsigned int write_size, allocate_buffers_t *allocate_batches) {
    int i;
    list_item_t* item_p = NULL;
    unsigned int bytes_exact, bytes_extend;

    for (i = 0; i < node->number_allocate_ends; i++) {
        // If a batch cannot hold another record (~100 bytes assumed — see pack_junction),
        // enqueue it on the writer list and start a fresh one
        if ((allocate_batches->write_exact_sp->size + 100) > write_size) {
            item_p = list_item_new(0, WRITE_ITEM, allocate_batches->write_exact_sp);
            list_insert_item(item_p, write_list_p);
            allocate_batches->write_exact_sp = write_batch_new(write_size, SPLICE_EXACT_FLAG);
        }
        if ((allocate_batches->write_extend_sp->size + 100) > write_size) {
            item_p = list_item_new(0, WRITE_ITEM, allocate_batches->write_extend_sp);
            list_insert_item(item_p, write_list_p);
            allocate_batches->write_extend_sp = write_batch_new(write_size, SPLICE_EXTEND_FLAG);
        }

        // Serialize the junction at the tail of each batch buffer
        bytes_exact = pack_junction(chromosome, node->allocate_ends[i]->strand, node->splice_start,
                                    node->allocate_ends[i]->end, junction_id, node->allocate_ends[i]->reads_number,
                                    &(((char *)allocate_batches->write_exact_sp->buffer_p)[allocate_batches->write_exact_sp->size]));
        bytes_extend = pack_junction(chromosome, node->allocate_ends[i]->strand, node->splice_start_extend,
                                     node->allocate_ends[i]->splice_end_extend, junction_id, node->allocate_ends[i]->reads_number,
                                     &(((char *)allocate_batches->write_extend_sp->buffer_p)[allocate_batches->write_extend_sp->size]));

        allocate_batches->write_exact_sp->size += bytes_exact;
        allocate_batches->write_extend_sp->size += bytes_extend;

        total_splice += node->allocate_ends[i]->reads_number;
        junction_id++;
    }
    return allocate_batches;
}
/**
 * Allocates a new rectangle with the given bounds and inserts it at index i
 * of the region's rect list.
 *
 * @return 0 on success, 1 when the rectangle could not be allocated.
 *
 * NOTE(review): the rect pointer is stored in the list as a long — verify this
 * is safe on LLP64 targets where long is narrower than a pointer.
 */
int APP_CC xrdp_region_insert_rect(struct xrdp_region* self, int i, int left, int top, int right, int bottom) {
    struct xrdp_rect* r;

    r = (struct xrdp_rect*)g_malloc(sizeof(struct xrdp_rect), 1);
    if (r == 0) {
        // FIX: previously a failed allocation was dereferenced unconditionally
        return 1;
    }
    r->left = left;
    r->top = top;
    r->right = right;
    r->bottom = bottom;
    list_insert_item(self->rects, i, (long)r);
    return 0;
}
/**
 * Merges all hashtable positions located at or before the limit
 * max_chromosome_merged:max_position_merged (chromosome order given by
 * chromosome_order). Each merged record is pushed to output_list tagged
 * MERGED_RECORD and its bucket is removed from the table; positions past the
 * limit are left untouched for a later call.
 * Returns the number of merged records inserted.
 *
 * NOTE(review): kh_del() mutates the shared khash flags from inside an OpenMP
 * parallel for; khash is not documented as safe for concurrent modification —
 * verify this is benign (same pattern as merge_remaining_interval).
 */
int merge_interval(kh_pos_t* positions_read, char *max_chromosome_merged, unsigned long max_position_merged, char **chromosome_order, int num_chromosomes, vcf_file_t **files, shared_options_data_t *shared_options_data, merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;
#pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);
            vcf_record_t *record = ((vcf_record_file_link*) array_list_get(0, records_in_position))->record;
            vcf_record_file_link **links = NULL;
            int num_links = 0;
            // Remove positions prior to the last chromosome:position to merge
            int cmp_chrom = compare_chromosomes(record->chromosome, max_chromosome_merged, chromosome_order, num_chromosomes);
            if (cmp_chrom < 0 || (cmp_chrom == 0 && compare_positions(record->position, max_position_merged) <= 0)) {
                links = records_in_position->items;
                num_links = records_in_position->size;
            }
            // Launch merge
            if (num_links > 0) {
                int err_code = 0;
                vcf_record_t *merged = merge_position(links, num_links, files, options_data->num_files, options_data, &err_code);
                if (!err_code) {
                    list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                    list_insert_item(item, output_list);
                    num_entries += 1;
                }
                // Free empty nodes (lists of records in the same position)
                array_list_free(records_in_position, vcf_record_file_link_free);
                kh_del(pos, positions_read, k);
            }
        } // End kh_exist
    }
    return num_entries;
}
/*
 * run_merge: entry point of the VCF merge tool. Opens every input file, builds
 * one text read list per file plus an output list and a token list, and then
 * runs three concurrent OpenMP sections:
 *   1) reader  - vcf_multiread_batches() fills the per-file text lists;
 *   2) merger  - parses text batches into VCF records, inserts them into a
 *      khash keyed by chromosome:position, and calls merge_interval() when the
 *      table exceeds TREE_LIMIT (or merge_remaining_interval() once every file
 *      hits EOF); each merge posts a token with the number of merged records;
 *   3) writer  - writes the merged headers and sample delimiter, then for each
 *      token pulls that many records, sorts them with record_cmp and writes them.
 * Returns the last error code observed (0 on success).
 *
 * NOTE(review): after all files reach EOF, max_chromosome_merged is set to
 * chromosome_order[num_chromosomes-1] and later passed to free(); that frees
 * memory owned by chromosome_order — verify ownership/double-free.
 * (Code below kept byte-identical; original formatting was collapsed.)
 */
int run_merge(shared_options_data_t *shared_options_data, merge_options_data_t *options_data) { if (options_data->num_files == 1) { LOG_INFO("Just one VCF file specified, no need to merge"); return 0; } list_t *read_list[options_data->num_files]; memset(read_list, 0, options_data->num_files * sizeof(list_t*)); list_t *output_header_list = (list_t*) malloc (sizeof(list_t)); list_init("headers", shared_options_data->num_threads, INT_MAX, output_header_list); list_t *output_list = (list_t*) malloc (sizeof(list_t)); list_init("output", shared_options_data->num_threads, shared_options_data->max_batches * shared_options_data->batch_lines, output_list); list_t *merge_tokens = (list_t*) malloc (sizeof(list_t)); list_init("tokens", 1, INT_MAX, merge_tokens); int ret_code = 0; double start, stop, total; vcf_file_t *files[options_data->num_files]; memset(files, 0, options_data->num_files * sizeof(vcf_file_t*)); // Initialize variables related to the different files for (int i = 0; i < options_data->num_files; i++) { files[i] = vcf_open(options_data->input_files[i], shared_options_data->max_batches); if (!files[i]) { LOG_FATAL_F("VCF file %s does not exist!\n", options_data->input_files[i]); } read_list[i] = (list_t*) malloc(sizeof(list_t)); list_init("text", 1, shared_options_data->max_batches, read_list[i]); } ret_code = create_directory(shared_options_data->output_directory); if (ret_code != 0 && errno != EEXIST) { LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory); } chromosome_order = get_chromosome_order(shared_options_data->host_url, shared_options_data->species, shared_options_data->version, &num_chromosomes); printf("Number of threads = %d\n", shared_options_data->num_threads); #pragma omp parallel sections private(start, stop, total) { #pragma omp section { LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num()); // Reading start = omp_get_wtime(); ret_code = vcf_multiread_batches(read_list, 
shared_options_data->batch_lines, files, options_data->num_files); stop = omp_get_wtime(); total = stop - start; if (ret_code) { LOG_ERROR_F("Error %d while reading VCF files\n", ret_code); } LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); } #pragma omp section { // Enable nested parallelism omp_set_nested(1); LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num()); int num_eof_found = 0; int eof_found[options_data->num_files]; memset(eof_found, 0, options_data->num_files * sizeof(int)); list_item_t *items[options_data->num_files]; memset(items, 0, options_data->num_files * sizeof(list_item_t*)); char *texts[options_data->num_files]; memset(texts, 0, options_data->num_files * sizeof(char*)); khash_t(pos) *positions_read = kh_init(pos); long max_position_merged = LONG_MAX; char *max_chromosome_merged = NULL; int header_merged = 0; int token = 0; double start_parsing, start_insertion, total_parsing = 0, total_insertion = 0; start = omp_get_wtime(); while (num_eof_found < options_data->num_files) { /* Process: * - N threads getting batches of VCF records and inserting them in a data structure. The common minimum * position of each group of batches will also be stored. * - If the data structure reaches certain size or the end of a chromosome, merge positions prior to the * last minimum registered. 
*/ // Getting text elements in a critical region guarantees that each thread gets variants in positions in the same range for (int i = 0; i < options_data->num_files; i++) { if (eof_found[i]) { continue; } items[i] = list_remove_item(read_list[i]); if (items[i] == NULL || !strcmp(items[i]->data_p, "")) { LOG_INFO_F("[%d] EOF found in file %s\n", omp_get_thread_num(), options_data->input_files[i]); eof_found[i] = 1; num_eof_found++; if(items[i] != NULL && !strcmp(items[i]->data_p, "")) { free(items[i]->data_p); list_item_free(items[i]); LOG_DEBUG_F("[%d] Text batch freed\n", omp_get_thread_num()); } else { LOG_DEBUG_F("[%d] No need to free text batch\n", omp_get_thread_num()); } continue; } assert(items[i]->data_p != NULL); texts[i] = items[i]->data_p; // printf("[%d] text batch from file %d\tcontents = '%s'\n", omp_get_thread_num(), i, texts[i]); } for (int i = 0; i < options_data->num_files; i++) { if (eof_found[i]) { continue; } start_parsing = omp_get_wtime(); char *text_begin = texts[i]; char *text_end = text_begin + strlen(text_begin); assert(text_end != NULL); // printf("batch = '%.*s'\n", text_end - text_begin, text_begin); // Get VCF batches from text batches vcf_reader_status *status = vcf_reader_status_new(shared_options_data->batch_lines, 0); ret_code = run_vcf_parser(text_begin, text_end, shared_options_data->batch_lines, files[i], status); if (ret_code) { // TODO stop? 
LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, files[i]->filename); continue; } // printf("batches = %d\n", files[i]->record_batches->length); vcf_batch_t *batch = fetch_vcf_batch_non_blocking(files[i]); if (!batch) { continue; } total_parsing += omp_get_wtime() - start_parsing; start_insertion = omp_get_wtime(); // Insert records into hashtable for (int j = 0; j < batch->records->size; j++) { vcf_record_t *record = vcf_record_copy(array_list_get(j, batch->records)); vcf_record_file_link *link = vcf_record_file_link_new(record, files[i]); char key[64]; compose_key_value(record->chromosome, record->position, key); int ret = insert_position_read(key, link, positions_read); assert(ret); } total_insertion += omp_get_wtime() - start_insertion; // Update minimum position being a maximum of these batches vcf_record_t *current_record = (vcf_record_t*) array_list_get(batch->records->size - 1, batch->records); calculate_merge_interval(current_record, &max_chromosome_merged, &max_position_merged, chromosome_order, num_chromosomes); // Free batch and its contents vcf_reader_status_free(status); vcf_batch_free(batch); list_item_free(items[i]); } if (num_eof_found == options_data->num_files) { max_chromosome_merged = chromosome_order[num_chromosomes-1]; max_position_merged = LONG_MAX; } // Merge headers, if not previously done if (!header_merged) { merge_vcf_headers(files, options_data->num_files, options_data, output_header_list); header_merged = 1; // Decrease list writers count for (int i = 0; i < shared_options_data->num_threads; i++) { list_decr_writers(output_header_list); } } // If the data structure reaches certain size or the end of a chromosome, // merge positions prior to the last minimum registered if (num_eof_found < options_data->num_files && kh_size(positions_read) > TREE_LIMIT) { LOG_INFO_F("Merging until position %s:%ld\n", max_chromosome_merged, max_position_merged); token = merge_interval(positions_read, max_chromosome_merged, 
max_position_merged, chromosome_order, num_chromosomes, files, shared_options_data, options_data, output_list); } // When reaching EOF for all files, merge the remaining entries else if (num_eof_found == options_data->num_files && kh_size(positions_read) > 0) { LOG_INFO_F("Merging remaining positions (last = %s:%ld)\n", chromosome_order[num_chromosomes - 1], LONG_MAX); token = merge_remaining_interval(positions_read, files, shared_options_data, options_data, output_list); } if (token) { int *token_ptr = malloc (sizeof(int)); *token_ptr = token; list_item_t *item = list_item_new(1, 0, token_ptr); list_insert_item(item, merge_tokens); } // Set variables ready for next iteration of the algorithm if (max_chromosome_merged) { free(max_chromosome_merged); } token = 0; max_chromosome_merged = NULL; max_position_merged = LONG_MAX; } kh_destroy(pos, positions_read); stop = omp_get_wtime(); total = stop - start; LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); LOG_DEBUG_F("** Time in parsing = %f s\n", total_parsing); LOG_DEBUG_F("** Time in insertion = %f s\n", total_insertion); // for (int i = 0; i < shared_options_data->num_threads; i++) { // printf("[%d] Time in searching = %f s\n", i, total_search[i]); // printf("[%d] Time in merging = %f s\n", i, total_merge[i]); // } // Decrease list writers count for (int i = 0; i < shared_options_data->num_threads; i++) { list_decr_writers(output_list); } list_decr_writers(merge_tokens); } #pragma omp section { LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num()); start = omp_get_wtime(); // Create file streams for results char aux_filename[32]; memset(aux_filename, 0, 32 * sizeof(char)); sprintf(aux_filename, "merge_from_%d_files.vcf", options_data->num_files); char *merge_filename; FILE *merge_fd = get_output_file(shared_options_data, aux_filename, &merge_filename); LOG_INFO_F("Output filename = %s\n", merge_filename); 
free(merge_filename); list_item_t *item1 = NULL, *item2 = NULL; vcf_header_entry_t *entry; vcf_record_t *record; int *num_records; // Write headers while ((item1 = list_remove_item(output_header_list)) != NULL) { entry = item1->data_p; write_vcf_header_entry(entry, merge_fd); } // Write delimiter array_list_t *sample_names = merge_vcf_sample_names(files, options_data->num_files); write_vcf_delimiter_from_samples((char**) sample_names->items, sample_names->size, merge_fd); // Write records // When a token is present, it means a set of batches has been merged. The token contains the number of records merged. // In this case, the records must be sorted by chromosome and position, and written afterwards. while ((item1 = list_remove_item(merge_tokens)) != NULL) { num_records = item1->data_p; vcf_record_t *records[*num_records]; for (int i = 0; i < *num_records; i++) { item2 = list_remove_item(output_list); if (!item2) { break; } records[i] = item2->data_p; list_item_free(item2); } // Sort records qsort(records, *num_records, sizeof(vcf_record_t*), record_cmp); // Write and free sorted records for (int i = 0; i < *num_records; i++) { record = records[i]; write_vcf_record(record, merge_fd); vcf_record_free_deep(record); } free(num_records); list_item_free(item1); } // Close file if (merge_fd != NULL) { fclose(merge_fd); } stop = omp_get_wtime(); total = stop - start; LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); } } // Free variables related to the different files for (int i = 0; i < options_data->num_files; i++) { if(files[i]) { vcf_close(files[i]); } if(read_list[i]) { free(read_list[i]); } } free(output_list); return ret_code; }
/*
 * ped_ragel_read: Ragel-generated scanner (source grammar: ped.ragel) that
 * parses the in-memory contents of a PED file (file->data / file->data_len)
 * into ped_record_t entries, grouped into batches of batch_size and pushed
 * onto batches_list. Per-line fields handled by the machine: family,
 * individual, father, mother, sex, phenotype, plus optional custom fields
 * (custom_field_count is matched against file->num_field). The trailing,
 * possibly-partial batch is flushed after the machine runs.
 * Returns nonzero (cs < 21, i.e. a non-accepting final state) on parse failure,
 * 0 on success.
 *
 * MACHINE-GENERATED CODE: do not edit by hand — regenerate from ped.ragel.
 * (Code below kept byte-identical; original formatting was collapsed.)
 */
int ped_ragel_read(list_t *batches_list, size_t batch_size, ped_file_t *file) { int cs; char *p = file->data; char *pe = p + file->data_len; char *eof = pe; char *ts; int custom_field_count = 0; current_batch = ped_batch_new(batch_size); #line 41 "ped_reader.c" { cs = ped_start; } #line 46 "ped_reader.c" { if ( p == pe ) goto _test_eof; switch ( cs ) { case 21: switch( (*p) ) { case 10: goto st22; case 35: goto st16; } if ( 33 <= (*p) && (*p) <= 126 ) goto tr36; goto tr0; tr0: #line 53 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'family' field\n", lines + 1, file->filename); } goto st0; tr3: #line 65 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'individual' field\n", lines + 1, file->filename); } goto st0; tr7: #line 77 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'father' field\n", lines + 1, file->filename); } goto st0; tr11: #line 89 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'mother' field\n", lines + 1, file->filename); } goto st0; tr15: #line 109 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'sex' field\n", lines + 1, file->filename); } goto st0; tr19: #line 124 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'phenotype' field\n", lines + 1, file->filename); } goto st0; tr26: #line 141 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'header' field\n", lines + 1, file->filename); } goto st0; tr44: #line 161 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in '%s' field\n", lines + 1, file->filename, current_record->custom_field); } goto st0; #line 108 "ped_reader.c" st0: cs = 0; goto _out; st22: if ( ++p == pe ) goto _test_eof22; case 22: if ( (*p) == 10 ) goto st22; goto st0; tr36: #line 22 "ped.ragel" { current_record = create_ped_record(); genotype = 0; } #line 45 "ped.ragel" { ts = p; } goto st1; st1: if ( ++p == pe ) goto _test_eof1; case 1: #line 134 "ped_reader.c" if ( (*p) == 9 ) goto tr1; if ( 33 <= (*p) && (*p) <= 126 ) goto st1; goto tr0; tr1: #line 49 "ped.ragel" { set_ped_record_family_id(strndup(ts, p-ts), 
current_record); } goto st2; st2: if ( ++p == pe ) goto _test_eof2; case 2: #line 150 "ped_reader.c" if ( (*p) == 95 ) goto tr4; if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr4; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr4; } else goto tr4; goto tr3; tr4: #line 57 "ped.ragel" { ts = p; } goto st3; st3: if ( ++p == pe ) goto _test_eof3; case 3: #line 172 "ped_reader.c" switch( (*p) ) { case 9: goto tr5; case 95: goto st3; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto st3; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto st3; } else goto st3; goto tr3; tr5: #line 61 "ped.ragel" { set_ped_record_individual_id(strndup(ts, p-ts), current_record); } goto st4; st4: if ( ++p == pe ) goto _test_eof4; case 4: #line 196 "ped_reader.c" switch( (*p) ) { case 46: goto tr8; case 95: goto tr9; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr9; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr9; } else goto tr9; goto tr7; tr8: #line 69 "ped.ragel" { ts = p; } goto st5; st5: if ( ++p == pe ) goto _test_eof5; case 5: #line 220 "ped_reader.c" if ( (*p) == 9 ) goto tr10; goto tr7; tr10: #line 73 "ped.ragel" { set_ped_record_father_id(strndup(ts, p-ts), current_record); } goto st6; st6: if ( ++p == pe ) goto _test_eof6; case 6: #line 234 "ped_reader.c" switch( (*p) ) { case 46: goto tr12; case 95: goto tr13; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr13; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr13; } else goto tr13; goto tr11; tr12: #line 81 "ped.ragel" { ts = p; } goto st7; st7: if ( ++p == pe ) goto _test_eof7; case 7: #line 258 "ped_reader.c" if ( (*p) == 9 ) goto tr14; goto tr11; tr14: #line 85 "ped.ragel" { set_ped_record_mother_id(strndup(ts, p-ts), current_record); } goto st8; st8: if ( ++p == pe ) goto _test_eof8; case 8: #line 272 "ped_reader.c" if ( (*p) == 46 ) goto tr16; if ( 48 <= (*p) && (*p) <= 57 ) goto tr17; goto tr15; tr16: #line 93 
"ped.ragel" { ts = p; } goto st9; st9: if ( ++p == pe ) goto _test_eof9; case 9: #line 288 "ped_reader.c" if ( (*p) == 9 ) goto tr18; goto tr15; tr18: #line 97 "ped.ragel" { char *field = strndup(ts, p-ts); enum Sex sex = UNKNOWN_SEX; if (atoi(field) == 1) { sex = MALE; } else if (atoi(field) == 2) { sex = FEMALE; } set_ped_record_sex(sex, current_record); free(field); // Not set as ped_record_t variable -> not freed later } goto st10; st10: if ( ++p == pe ) goto _test_eof10; case 10: #line 310 "ped_reader.c" switch( (*p) ) { case 32: goto tr20; case 95: goto tr20; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr20; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr20; } else goto tr20; goto tr19; tr20: #line 113 "ped.ragel" { ts = p; } goto st23; tr42: #line 117 "ped.ragel" { if (strncmp(".", ts, 1)) { char *field = strndup(ts, p-ts); set_ped_record_phenotype(field, current_record, file); } } #line 145 "ped.ragel" { custom_field_count = 6; } goto st23; st23: if ( ++p == pe ) goto _test_eof23; case 23: #line 347 "ped_reader.c" switch( (*p) ) { case 9: goto tr39; case 10: goto tr40; case 32: goto tr42; case 95: goto st23; } if ( (*p) < 48 ) { if ( 11 <= (*p) && (*p) <= 13 ) goto tr41; } else if ( (*p) > 57 ) { if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto st23; } else if ( (*p) >= 65 ) goto st23; } else goto st23; goto tr19; tr39: #line 117 "ped.ragel" { if (strncmp(".", ts, 1)) { char *field = strndup(ts, p-ts); set_ped_record_phenotype(field, current_record, file); } } #line 145 "ped.ragel" { custom_field_count = 6; } goto st24; tr49: #line 153 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (custom_field_count == file->num_field) { set_ped_record_custom_field(field_name, current_record, file); } } goto st24; st24: if ( ++p == pe ) goto _test_eof24; case 24: #line 393 "ped_reader.c" switch( (*p) ) { case 9: goto st24; case 10: goto tr46; } if ( (*p) > 13 ) { if ( 32 <= (*p) && (*p) <= 126 ) goto 
tr48; } else if ( (*p) >= 11 ) goto st26; goto tr44; tr40: #line 117 "ped.ragel" { if (strncmp(".", ts, 1)) { char *field = strndup(ts, p-ts); set_ped_record_phenotype(field, current_record, file); } } #line 145 "ped.ragel" { custom_field_count = 6; } #line 27 "ped.ragel" { // If batch is full, add to the list of batches and create a new, empty one if (ped_batch_is_full(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length); current_batch = ped_batch_new(batch_size); } // Add current record to current batch if (current_record) { add_record_to_ped_batch(current_record, current_batch); num_records++; } current_record = NULL; } #line 18 "ped.ragel" { lines++; } goto st25; tr46: #line 27 "ped.ragel" { // If batch is full, add to the list of batches and create a new, empty one if (ped_batch_is_full(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length); current_batch = ped_batch_new(batch_size); } // Add current record to current batch if (current_record) { add_record_to_ped_batch(current_record, current_batch); num_records++; } current_record = NULL; } #line 18 "ped.ragel" { lines++; } goto st25; tr50: #line 153 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (custom_field_count == file->num_field) { set_ped_record_custom_field(field_name, current_record, file); } } #line 27 "ped.ragel" { // If batch is full, add to the list of batches and create a new, empty one if (ped_batch_is_full(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length); current_batch = ped_batch_new(batch_size); } // Add current record to current batch if (current_record) { 
add_record_to_ped_batch(current_record, current_batch); num_records++; } current_record = NULL; } #line 18 "ped.ragel" { lines++; } goto st25; st25: if ( ++p == pe ) goto _test_eof25; case 25: #line 499 "ped_reader.c" switch( (*p) ) { case 10: goto tr46; case 32: goto st26; } if ( (*p) < 33 ) { if ( 9 <= (*p) && (*p) <= 13 ) goto st26; } else if ( (*p) > 34 ) { if ( 36 <= (*p) && (*p) <= 126 ) goto tr36; } else goto tr36; goto tr0; tr41: #line 117 "ped.ragel" { if (strncmp(".", ts, 1)) { char *field = strndup(ts, p-ts); set_ped_record_phenotype(field, current_record, file); } } #line 145 "ped.ragel" { custom_field_count = 6; } goto st26; tr51: #line 153 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (custom_field_count == file->num_field) { set_ped_record_custom_field(field_name, current_record, file); } } goto st26; st26: if ( ++p == pe ) goto _test_eof26; case 26: #line 540 "ped_reader.c" switch( (*p) ) { case 10: goto tr46; case 32: goto st26; } if ( 9 <= (*p) && (*p) <= 13 ) goto st26; goto st0; tr48: #line 149 "ped.ragel" { ts = p; } goto st27; tr52: #line 153 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (custom_field_count == file->num_field) { set_ped_record_custom_field(field_name, current_record, file); } } goto st27; st27: if ( ++p == pe ) goto _test_eof27; case 27: #line 568 "ped_reader.c" switch( (*p) ) { case 9: goto tr49; case 10: goto tr50; case 32: goto tr52; } if ( (*p) > 13 ) { if ( 33 <= (*p) && (*p) <= 126 ) goto st27; } else if ( (*p) >= 11 ) goto tr51; goto tr44; tr17: #line 93 "ped.ragel" { ts = p; } goto st11; st11: if ( ++p == pe ) goto _test_eof11; case 11: #line 590 "ped_reader.c" switch( (*p) ) { case 9: goto tr18; case 46: goto st12; } if ( 48 <= (*p) && (*p) <= 57 ) goto st11; goto tr15; st12: if ( ++p == pe ) goto _test_eof12; case 12: if ( 48 <= (*p) && (*p) <= 57 ) goto st13; goto tr15; st13: if ( ++p == pe ) goto _test_eof13; case 13: if ( (*p) == 9 ) goto tr18; if ( 48 
<= (*p) && (*p) <= 57 ) goto st13; goto tr15; tr13: #line 81 "ped.ragel" { ts = p; } goto st14; st14: if ( ++p == pe ) goto _test_eof14; case 14: #line 624 "ped_reader.c" switch( (*p) ) { case 9: goto tr14; case 95: goto st14; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto st14; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto st14; } else goto st14; goto tr11; tr9: #line 69 "ped.ragel" { ts = p; } goto st15; st15: if ( ++p == pe ) goto _test_eof15; case 15: #line 648 "ped_reader.c" switch( (*p) ) { case 9: goto tr10; case 95: goto st15; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto st15; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto st15; } else goto st15; goto tr7; st16: if ( ++p == pe ) goto _test_eof16; case 16: switch( (*p) ) { case 9: goto st17; case 32: goto tr28; case 95: goto tr29; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr29; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr29; } else goto tr29; goto tr26; st17: if ( ++p == pe ) goto _test_eof17; case 17: switch( (*p) ) { case 32: goto tr29; case 95: goto tr29; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr29; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr29; } else goto tr29; goto tr26; tr29: #line 128 "ped.ragel" { ts = p; } goto st18; st18: if ( ++p == pe ) goto _test_eof18; case 18: #line 707 "ped_reader.c" switch( (*p) ) { case 9: goto tr30; case 10: goto tr31; case 32: goto st18; case 95: goto st18; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto st18; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto st18; } else goto st18; goto tr26; tr30: #line 132 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (file->variable_field && !strcmp(field_name, file->variable_field)) { file->num_field = custom_field_count; } free(field_name); } goto st19; st19: if ( ++p == pe ) goto _test_eof19; case 19: #line 738 "ped_reader.c" switch( (*p) ) 
{ case 9: goto st19; case 10: goto st28; case 32: goto tr29; case 95: goto tr29; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr29; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr29; } else goto tr29; goto tr26; tr31: #line 132 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (file->variable_field && !strcmp(field_name, file->variable_field)) { file->num_field = custom_field_count; } free(field_name); } goto st28; st28: if ( ++p == pe ) goto _test_eof28; case 28: #line 769 "ped_reader.c" if ( (*p) == 10 ) goto st22; if ( (*p) > 34 ) { if ( 36 <= (*p) && (*p) <= 126 ) goto tr36; } else if ( (*p) >= 33 ) goto tr36; goto tr0; tr28: #line 128 "ped.ragel" { ts = p; } goto st20; st20: if ( ++p == pe ) goto _test_eof20; case 20: #line 788 "ped_reader.c" switch( (*p) ) { case 9: goto tr30; case 10: goto tr31; case 32: goto tr29; case 95: goto tr29; } if ( (*p) < 65 ) { if ( 48 <= (*p) && (*p) <= 57 ) goto tr29; } else if ( (*p) > 90 ) { if ( 97 <= (*p) && (*p) <= 122 ) goto tr29; } else goto tr29; goto tr26; } _test_eof22: cs = 22; goto _test_eof; _test_eof1: cs = 1; goto _test_eof; _test_eof2: cs = 2; goto _test_eof; _test_eof3: cs = 3; goto _test_eof; _test_eof4: cs = 4; goto _test_eof; _test_eof5: cs = 5; goto _test_eof; _test_eof6: cs = 6; goto _test_eof; _test_eof7: cs = 7; goto _test_eof; _test_eof8: cs = 8; goto _test_eof; _test_eof9: cs = 9; goto _test_eof; _test_eof10: cs = 10; goto _test_eof; _test_eof23: cs = 23; goto _test_eof; _test_eof24: cs = 24; goto _test_eof; _test_eof25: cs = 25; goto _test_eof; _test_eof26: cs = 26; goto _test_eof; _test_eof27: cs = 27; goto _test_eof; _test_eof11: cs = 11; goto _test_eof; _test_eof12: cs = 12; goto _test_eof; _test_eof13: cs = 13; goto _test_eof; _test_eof14: cs = 14; goto _test_eof; _test_eof15: cs = 15; goto _test_eof; _test_eof16: cs = 16; goto _test_eof; _test_eof17: cs = 17; goto _test_eof; _test_eof18: cs = 18; goto _test_eof; _test_eof19: cs = 19; goto 
_test_eof; _test_eof28: cs = 28; goto _test_eof; _test_eof20: cs = 20; goto _test_eof; _test_eof: {} if ( p == eof ) { switch ( cs ) { case 24: case 25: case 26: #line 27 "ped.ragel" { // If batch is full, add to the list of batches and create a new, empty one if (ped_batch_is_full(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length); current_batch = ped_batch_new(batch_size); } // Add current record to current batch if (current_record) { add_record_to_ped_batch(current_record, current_batch); num_records++; } current_record = NULL; } break; case 1: #line 53 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'family' field\n", lines + 1, file->filename); } break; case 2: case 3: #line 65 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'individual' field\n", lines + 1, file->filename); } break; case 4: case 5: case 15: #line 77 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'father' field\n", lines + 1, file->filename); } break; case 6: case 7: case 14: #line 89 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'mother' field\n", lines + 1, file->filename); } break; case 8: case 9: case 11: case 12: case 13: #line 109 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'sex' field\n", lines + 1, file->filename); } break; case 10: #line 124 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'phenotype' field\n", lines + 1, file->filename); } break; case 16: case 17: case 18: case 19: case 20: #line 141 "ped.ragel" { LOG_ERROR_F("Line %zu (%s): Error in 'header' field\n", lines + 1, file->filename); } break; case 27: #line 153 "ped.ragel" { char* field_name = strndup(ts, p-ts); custom_field_count++; if (custom_field_count == file->num_field) { set_ped_record_custom_field(field_name, current_record, file); } } #line 27 "ped.ragel" { // If batch is full, add to the list of batches and create a new, empty one if 
(ped_batch_is_full(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length); current_batch = ped_batch_new(batch_size); } // Add current record to current batch if (current_record) { add_record_to_ped_batch(current_record, current_batch); num_records++; } current_record = NULL; } break; case 23: #line 117 "ped.ragel" { if (strncmp(".", ts, 1)) { char *field = strndup(ts, p-ts); set_ped_record_phenotype(field, current_record, file); } } #line 145 "ped.ragel" { custom_field_count = 6; } #line 27 "ped.ragel" { // If batch is full, add to the list of batches and create a new, empty one if (ped_batch_is_full(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length); current_batch = ped_batch_new(batch_size); } // Add current record to current batch if (current_record) { add_record_to_ped_batch(current_record, current_batch); num_records++; } current_record = NULL; } break; #line 973 "ped_reader.c" } } _out: {} } #line 221 "ped.ragel" // Insert the last batch if (!ped_batch_is_empty(current_batch)) { list_item_t *item = list_item_new(num_records, 1, current_batch); list_insert_item(item, batches_list); LOG_DEBUG_F("Batch added - %zu records (last)\n", current_batch->length); } if ( cs < #line 992 "ped_reader.c" 21 #line 231 "ped.ragel" ) { LOG_ERROR("The file was not successfully read\n"); LOG_INFO_F("Last state is %d, but %d was expected\n", cs, #line 1000 "ped_reader.c" 21 #line 235 "ped.ragel" ); } LOG_INFO_F("PED records read = %zu\n", num_records); return cs < #line 1009 "ped_reader.c" 21 #line 240 "ped.ragel" ; }
/**
 * Appends a batch of VCF records to the file's queue of record batches.
 *
 * @param batch Batch to insert (must be non-NULL; ownership passes to the list item)
 * @param file  VCF file whose record_batches list receives the batch (must be non-NULL)
 * @return 0 on success
 *
 * Fix: the function is declared 'int' but previously ended without a return
 * statement, which is undefined behavior if any caller reads the result.
 */
int add_vcf_batch(vcf_batch_t *batch, vcf_file_t *file) {
    assert(batch);
    assert(file);
    // The item id (rand() % 1000) is only informational; batches are consumed in insertion order
    list_item_t *item = list_item_new(rand() % 1000, 1, batch);
    list_insert_item(item, file->record_batches);
    return 0;
}
/* Publishes this thread's mutation-phenotype WS response buffer: duplicates it,
 * trims surrounding whitespace, and queues it on the output list tagged with
 * MUTATION_PHENOTYPE. The list item takes ownership of the duplicated string. */
static void parse_mutation_phenotype_response(int tid, list_t *output_list) {
    char *response_copy = strdup(mutation_line[tid]);
    list_insert_item(list_item_new(tid, MUTATION_PHENOTYPE, trim(response_copy)), output_list);
}
/* Publishes this thread's SNP-phenotype WS response buffer: duplicates it,
 * trims surrounding whitespace, and queues it on the output list tagged with
 * SNP_PHENOTYPE. The list item takes ownership of the duplicated string. */
static void parse_snp_phenotype_response(int tid, list_t *output_list) {
    char *response_copy = strdup(snp_line[tid]);
    list_insert_item(list_item_new(tid, SNP_PHENOTYPE, trim(response_copy)), output_list);
}
/**
 * Parses this thread's batch of effect WS response lines (effect_line[tid],
 * newline-separated) and, for every valid 25-column line:
 *  - registers the gene name (column 17) in gene_list,
 *  - extracts the SO accession (column 18, "SO:NNNNN") and consequence type name (column 19),
 *  - lazily opens one "<output_directory>/<consequence_type>.txt" file per SO code
 *    (descriptor stored in output_files, keyed by a heap-allocated SO code),
 *  - increments the per-consequence-type counter in summary_count,
 *  - queues a copy of the line on output_list tagged with the SO code.
 *
 * Lines with a different column count are skipped (a trailing empty line is
 * silently ignored); SO code 0 is treated as invalid and skipped.
 *
 * Fix: SO_found was malloc'd but never freed (the hashtable stores a separate
 * copy, SO_stored, so freeing the scratch key is safe).
 */
static void parse_effect_response(int tid, char *output_directory, size_t output_directory_len, cp_hashtable *output_files, 
                                  list_t *output_list, cp_hashtable *summary_count, cp_hashtable *gene_list) {
    int *SO_found = (int*) malloc (sizeof(int));    // Scratch key: SO code of the current line
    int *count;
    char tmp_consequence_type[128];
    int num_lines;
    char **split_batch = split(effect_line[tid], "\n", &num_lines);

    for (int i = 0; i < num_lines; i++) {
        int num_columns;
        char *copy_buf = strdup(split_batch[i]);    // split() mutates its input, so work on a copy
        char **split_result = split(copy_buf, "\t", &num_columns);
        free(copy_buf);

        // Find consequence type name (always after SO field)
        *SO_found = 0;
        if (num_columns == 25) {
            if (!cp_hashtable_contains(gene_list, split_result[17])) {
                cp_hashtable_put(gene_list, strdup(split_result[17]), NULL);
            }
            *SO_found = atoi(split_result[18] + 3);     // Skip the "SO:" prefix
            memset(tmp_consequence_type, 0, 128 * sizeof(char));
            strncat(tmp_consequence_type, split_result[19], strlen(split_result[19]));
        } else {
            // Last line in batch could be only a newline; anything else is a malformed line
            if (strlen(split_batch[i]) > 0) {
                LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_columns, split_batch[i]);
            }
            for (int s = 0; s < num_columns; s++) {
                free(split_result[s]);
            }
            free(split_result);
            continue;
        }

        for (int s = 0; s < num_columns; s++) {
            free(split_result[s]);
        }
        free(split_result);

        if (!*SO_found) { // SO:000000 is not valid
            LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid);
            continue;
        }

        size_t consequence_type_len = strlen(tmp_consequence_type);

        // If file does not exist, create its descriptor and summary counter
        FILE *aux_file = cp_hashtable_get(output_files, SO_found);
        if (!aux_file) {
#pragma omp critical
            {
                // Double-checked under the critical section: avoids 2 threads inserting the same CT
                aux_file = cp_hashtable_get(output_files, SO_found);
                if (!aux_file) {
                    char filename[output_directory_len + consequence_type_len + 6];
                    memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char));
                    strncat(filename, output_directory, output_directory_len);
                    strncat(filename, "/", 1);
                    strncat(filename, tmp_consequence_type, consequence_type_len);
                    strncat(filename, ".txt", 4);
                    aux_file = fopen(filename, "a");

                    // Add to hashtables (file descriptors and summary counters);
                    // the key must outlive this call, hence the dedicated allocation
                    int *SO_stored = (int*) malloc (sizeof(int));
                    *SO_stored = *SO_found;
                    cp_hashtable_put(output_files, SO_stored, aux_file);
                    LOG_INFO_F("[%d] New consequence type found = %s\n", tid, tmp_consequence_type);
                }
            }
        }

        // Queue the line for the writer thread, tagged with its consequence type
        if (aux_file) {
#pragma omp critical
            {
                // TODO move critical one level below?
                count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type);
                if (count == NULL) {
                    char *consequence_type = (char*) calloc (consequence_type_len+1, sizeof(char));
                    strncat(consequence_type, tmp_consequence_type, consequence_type_len);
                    assert(!strcmp(consequence_type, tmp_consequence_type));
                    count = (int*) malloc (sizeof(int));
                    *count = 0;
                    cp_hashtable_put(summary_count, consequence_type, count);
                }
                // Increment counter for summary
                (*count)++;
            }
            list_item_t *output_item = list_item_new(tid, *SO_found, strdup(split_batch[i]));
            list_insert_item(output_item, output_list);
        }
    }

    for (int i = 0; i < num_lines; i++) {
        free(split_batch[i]);
    }
    free(split_batch);
    free(SO_found);     // Fix: previously leaked on every invocation
}
/**
 * Transmission Disequilibrium Test (TDT) over a set of variants and nuclear families.
 *
 * For each variant, counts transmitted vs. untransmitted alleles from
 * heterozygous parents to affected children (skipping families with missing
 * parents/genotypes and Mendelian errors), then computes the basic TDT
 * chi-square (t1-t2)^2 / (t1+t2) and queues one tdt_result_t per variant on
 * output_list.
 *
 * @param variants     Variants to analyse
 * @param num_variants Number of entries in variants
 * @param families     Nuclear families (father, mother, children)
 * @param num_families Number of entries in families
 * @param sample_ids   Maps individual id -> position in the VCF sample columns
 * @param output_list  Receives one tdt_result_t item per variant (ownership passes to the list)
 * @return ret_code, which is only ever 0 here
 */
int tdt_test(vcf_record_t **variants, int num_variants, family_t **families, int num_families, cp_hashtable *sample_ids, list_t *output_list) {
    double start = omp_get_wtime();

    int ret_code = 0;
    int tid = omp_get_thread_num();

    int num_samples = cp_hashtable_count(sample_ids);
    tdt_result_t *result;

    char **sample_data;
    int gt_position;
    int father_allele1, father_allele2;
    int mother_allele1, mother_allele2;
    int child_allele1, child_allele2;

    ///////////////////////////////////
    // Perform analysis for each variant

    vcf_record_t *record;
    for (int i = 0; i < num_variants; i++) {
        record = variants[i];
        LOG_DEBUG_F("[%d] Checking variant %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);

        sample_data = (char**) record->samples->items;
        // NOTE(review): the strndup'd format string is handed to
        // get_field_position_in_format and never freed here — possible leak
        // unless the callee takes ownership; TODO confirm against its contract.
        gt_position = get_field_position_in_format("GT", strndup(record->format, record->format_len));

        // Transmission counts
        int t1 = 0;
        int t2 = 0;

        // Count over families
        family_t *family;
        for (int f = 0; f < num_families; f++) {
            family = families[f];
            individual_t *father = family->father;
            individual_t *mother = family->mother;
            cp_list *children = family->children;

            // Families without both parents cannot be tested
            if (father == NULL || mother == NULL) {
                continue;
            }

            // Both parents must be present among the VCF samples
            int *father_pos = cp_hashtable_get(sample_ids, father->id);
            if (father_pos != NULL) {
                // LOG_DEBUG_F("[%d] Father %s is in position %d\n", tid, father->id, *father_pos);
            } else {
                // LOG_DEBUG_F("[%d] Father %s is not positioned\n", tid, father->id);
                continue;
            }

            int *mother_pos = cp_hashtable_get(sample_ids, mother->id);
            if (mother_pos != NULL) {
                // LOG_DEBUG_F("[%d] Mother %s is in position %d\n", tid, mother->id, *mother_pos);
            } else {
                // LOG_DEBUG_F("[%d] Mother %s is not positioned\n", tid, mother->id);
                continue;
            }

            char *father_sample = strdup(sample_data[*father_pos]);
            char *mother_sample = strdup(sample_data[*mother_pos]);

            // If any parent's alleles can't be read or is missing, go to next family
            if (get_alleles(father_sample, gt_position, &father_allele1, &father_allele2) ||
                get_alleles(mother_sample, gt_position, &mother_allele1, &mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }

            // We need two genotyped parents, with at least one het
            if (father_allele1 == father_allele2 && mother_allele1 == mother_allele2) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            if ((father_allele1 && !father_allele2) || (mother_allele1 && !mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }

            int trA = 0;  // transmitted allele from first het parent
            int unA = 0;  // untransmitted allele from first het parent
            int trB = 0;  // transmitted allele from second het parent
            int unB = 0;  // untransmitted allele from second het parent

            // Consider all offspring in nuclear family
            cp_list_iterator *children_iterator = cp_list_create_iterator(family->children, COLLECTION_LOCK_READ);
            individual_t *child = NULL;
            while ((child = cp_list_iterator_next(children_iterator)) != NULL) {
                // Only consider affected children
                if (child->condition != AFFECTED) {
                    continue;
                }

                int *child_pos = cp_hashtable_get(sample_ids, child->id);
                if (child_pos != NULL) {
                    // LOG_DEBUG_F("[%d] Child %s is in position %d\n", tid, child->id, *child_pos);
                } else {
                    // LOG_DEBUG_F("[%d] Child %s is not positioned\n", tid, child->id);
                    continue;
                }

                char *child_sample = strdup(sample_data[*child_pos]);

                // Skip if offspring has missing genotype
                if (get_alleles(child_sample, gt_position, &child_allele1, &child_allele2)) {
                    free(child_sample);
                    continue;
                }

                // Exclude mendelian errors (chromosome needed for X/Y handling)
                char *aux_chromosome = strndup(record->chromosome, record->chromosome_len);
                if (check_mendel(aux_chromosome, father_allele1, father_allele2, mother_allele1, mother_allele2,
                                 child_allele1, child_allele2, child->sex)) {
                    free(child_sample);
                    free(aux_chromosome);
                    continue;
                }
                free(aux_chromosome);

                // We've now established: no missing genotypes
                // and at least one heterozygous parent

                // Kid is 00
                if (!child_allele1 && !child_allele2) {
                    if ( ( (!father_allele1) && father_allele2 ) &&
                         ( (!mother_allele1) && mother_allele2 ) ) {
                        // Both parents het: each transmits a 1 (=ref coded as 1) and withholds a 2
                        trA=1; unA=2;
                        trB=1; unB=2;
                    } else {
                        trA=1; unA=2;
                    }
                } else if ( (!child_allele1) && child_allele2 )  // Kid is 01
                {
                    // het dad
                    if (father_allele1 != father_allele2 ) {
                        // het mum
                        if ( mother_allele1 != mother_allele2 ) {
                            trA=1; trB=2; unA=2; unB=1;
                        } else if ( !mother_allele1 ) {
                            trA=2; unA=1;
                        } else {
                            trA=1; unA=2;
                        }
                    } else if ( !father_allele1 ) {
                        trA=2; unA=1;
                    } else {
                        trA=1; unA=2;
                    }
                } else  // kid is 1/1
                {
                    if ( ( (!father_allele1) && father_allele2 ) &&
                         ( (!mother_allele1) && mother_allele2 ) ) {
                        trA=2; unA=1;
                        trB=2; unB=1;
                    } else {
                        trA=2; unA=1;
                    }
                }

                // We have now populated trA (first transmission) and possibly trB also.
                // (A permutation-based 50:50 flip of tr/un was once implemented here
                // and has been left out; see VCS history if it needs restoring.)

                // Increment transmission counts
                if (trA==1) { t1++; } else if (trA==2) { t2++; }
                if (trB==1) { t1++; } else if (trB==2) { t2++; }

                free(child_sample);
            }  // next offspring in family

            cp_list_iterator_destroy(children_iterator);
            free(father_sample);
            free(mother_sample);
        }  // next nuclear family

        /////////////////////////////
        // Finished counting: now compute the statistics

        double tdt_chisq = -1;  // -1 marks "not computable" (no informative transmissions)

        // Basic TDT test
        if (t1+t2 > 0) {
            tdt_chisq = ((double) ((t1-t2) * (t1-t2))) / (t1+t2);
        }

        result = tdt_result_new(record->chromosome, record->chromosome_len, record->position,
                                record->reference, record->reference_len,
                                record->alternate, record->alternate_len,
                                t1, t2, tdt_chisq);
        list_item_t *output_item = list_item_new(tid, 0, result);
        list_insert_item(output_item, output_list);
    }  // next variant

    double end = omp_get_wtime();

    return ret_code;
}
static size_t write_mutation_phenotype_ws_results(char *contents, size_t size, size_t nmemb, void *userdata) { int tid = omp_get_thread_num(); int i = 0; int data_read_len = 0, next_line_len = 0; // Whether the buffer was consumed with a line read just partially int premature_end = 0; size_t realsize = size * nmemb; char *data = contents; char *output_text; // LOG_DEBUG_F("Mutation phenotype WS invoked, response size = %zu bytes -> %s\n", realsize, data); while (data_read_len < realsize) { assert((mutation_line + tid) != NULL); assert((mutation_max_line_size + tid) != NULL); // LOG_DEBUG_F("[%d] loop iteration #%d\n", tid, i); // Get length of data to copy next_line_len = strcspn(data, "\n"); // If the mutation_line[tid] is too long for the current buffers, reallocate a little more than the needed memory if (strlen(mutation_line[tid]) + next_line_len + 1 > mutation_max_line_size[tid]) { // LOG_DEBUG_F("Line too long (%d elements, but %zu needed) in batch #%d\n", // mutation_max_line_size[tid], strlen(mutation_line[tid]) + next_line_len, batch_num); // char *out_buf = (char*) calloc (next_line_len+1, sizeof(char)); // snprintf(out_buf, next_line_len, "%s", data); // LOG_INFO_F("[%d] too big data is: '%s'\n", tid, out_buf); char *aux_1 = (char*) realloc (mutation_line[tid], (mutation_max_line_size[tid] + next_line_len + 1) * sizeof(char)); char *aux_2 = (char*) realloc (mutation_output_line[tid], (mutation_max_line_size[tid] + next_line_len + 1) * sizeof(char)); if (!aux_1 || !aux_2) { LOG_ERROR("Can't resize buffers\n"); // Can't resize buffers -> can't keep reading the file if (!aux_1) { free(mutation_line[tid]); } if (!aux_2) { free(mutation_output_line[tid]); } return data_read_len; } mutation_line[tid] = aux_1; mutation_output_line[tid] = aux_2; mutation_max_line_size[tid] += next_line_len + 1; // LOG_DEBUG_F("[%d] buffers realloc'd (%d)\n", tid, mutation_max_line_size[tid]); } // LOG_DEBUG_F("[%d] position = %d, read = %d, max_size = %zu\n", i, next_line_len, 
data_read_len, realsize); if (data_read_len + next_line_len >= realsize) { // Save current state (mutation_line[tid] partially read) strncat(mutation_line[tid], data, next_line_len); chomp(mutation_line[tid]); mutation_line[tid][strlen(mutation_line[tid])] = '\0'; premature_end = 1; // LOG_DEBUG_F("widow mutation_line[tid] = '%s'\n", mutation_line[tid]); data_read_len = realsize; break; } strncat(mutation_line[tid], data, next_line_len); strncat(mutation_output_line[tid], mutation_line[tid], strlen(mutation_line[tid])); // LOG_DEBUG_F("[%d] copy to buffer (%zu)\n", tid, strlen(mutation_line[tid])); // LOG_DEBUG_F("[%d] before writing mutation phenotype\n", tid); output_text = strdup(mutation_output_line[tid]); list_item_t *output_item = list_item_new(tid, MUTATION_PHENOTYPE, output_text); list_insert_item(output_item, output_list); // LOG_DEBUG_F("[%d] after writing mutation phenotype\n", tid); data += next_line_len+1; data_read_len += next_line_len+1; memset(mutation_line[tid], 0, strlen(mutation_line[tid])); memset(mutation_output_line[tid], 0, strlen(mutation_output_line[tid])); i++; } // Empty buffer for next callback invocation if (!premature_end) { memset(mutation_line[tid], 0, strlen(mutation_line[tid])); memset(mutation_output_line[tid], 0, strlen(mutation_line[tid])); } return data_read_len; }
static size_t write_effect_ws_results(char *contents, size_t size, size_t nmemb, void *userdata) { int tid = omp_get_thread_num(); int i = 0; int data_read_len = 0, next_line_len = 0; // Whether the SO code field (previous to the consequence type name) has been found int *SO_found = (int*) malloc (sizeof(int)); // Whether the buffer was consumed with a line read just partially int premature_end = 0; size_t realsize = size * nmemb; int *count; char *data = contents; char tmp_consequence_type[128]; char *aux_buffer; char *output_text; LOG_DEBUG_F("Effect WS invoked, response size = %zu bytes\n", realsize); while (data_read_len < realsize) { assert((line + tid) != NULL); assert((max_line_size + tid) != NULL); LOG_DEBUG_F("[%d] loop iteration #%d\n", tid, i); // Get length of data to copy next_line_len = strcspn(data, "\n"); // If the line[tid] is too long for the current buffers, reallocate a little more than the needed memory if (strlen(line[tid]) + next_line_len + 1 > max_line_size[tid]) { // LOG_DEBUG_F("Line too long (%d elements, but %zu needed) in batch #%d\n", // max_line_size[tid], strlen(line[tid]) + next_line_len, batch_num); // char *out_buf = (char*) calloc (next_line_len+1, sizeof(char)); // snprintf(out_buf, next_line_len, "%s", data); // LOG_INFO_F("[%d] too big data is: '%s'\n", tid, out_buf); char *aux_1 = (char*) realloc (line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char)); char *aux_2 = (char*) realloc (output_line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char)); if (!aux_1 || !aux_2) { LOG_ERROR("Can't resize buffers\n"); // Can't resize buffers -> can't keep reading the file if (!aux_1) { free(line[tid]); } if (!aux_2) { free(output_line[tid]); } return data_read_len; } line[tid] = aux_1; output_line[tid] = aux_2; max_line_size[tid] += next_line_len + 1; // LOG_DEBUG_F("[%d] buffers realloc'd (%d)\n", tid, max_line_size[tid]); } // LOG_DEBUG_F("[%d] position = %d, read = %d, max_size = %zu\n", i, next_line_len, 
data_read_len, realsize); if (data_read_len + next_line_len >= realsize) { // Save current state (line[tid] partially read) strncat(line[tid], data, next_line_len); chomp(line[tid]); line[tid][strlen(line[tid])] = '\0'; premature_end = 1; // LOG_DEBUG_F("widow line[tid] = '%s'\n", line[tid]); data_read_len = realsize; break; } strncat(line[tid], data, next_line_len); strncat(output_line[tid], line[tid], strlen(line[tid])); // LOG_DEBUG_F("[%d] copy to buffer (%zu)\n", tid, strlen(line[tid])); int num_substrings; char *copy_buf = strdup(line[tid]); // char *copy_buf = strdup(trim(line[tid])); char **split_result = split(copy_buf, "\t", &num_substrings); free(copy_buf); // Find consequence type name (always after SO field) *SO_found = 0; if (num_substrings == 25) { // LOG_DEBUG_F("gene = %s\tSO = %d\tCT = %s\n", split_result[17], atoi(split_result[18] + 3), split_result[19]); if (!cp_hashtable_contains(gene_list, split_result[17])) { cp_hashtable_put(gene_list, strdup(split_result[17]), NULL); } *SO_found = atoi(split_result[18] + 3); memset(tmp_consequence_type, 0, 128 * sizeof(char)); strncat(tmp_consequence_type, split_result[19], strlen(split_result[19])); } else { LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]); memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(output_line[tid])); // #pragma omp critical // { // printf("********\n"); // LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]); for (int s = 0; s < num_substrings; s++) { // printf("%s^", split_result[s]); free(split_result[s]); } // printf("********\n\n"); free(split_result); // } continue; } for (int s = 0; s < num_substrings; s++) { free(split_result[s]); } free(split_result); if (!*SO_found) { // SO:000000 is not valid LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid); memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(output_line[tid])); continue; } // 
LOG_DEBUG_F("[%d] SO found = %d\n", tid, *SO_found); size_t consequence_type_len = strlen(tmp_consequence_type); // If file does not exist, create its descriptor and summary counter FILE *aux_file = cp_hashtable_get(output_files, SO_found); if (!aux_file) { #pragma omp critical { // This construction avoids 2 threads trying to insert the same CT aux_file = cp_hashtable_get(output_files, SO_found); if (!aux_file) { char filename[output_directory_len + consequence_type_len + 6]; memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char)); strncat(filename, output_directory, output_directory_len); strncat(filename, "/", 1); strncat(filename, tmp_consequence_type, consequence_type_len); strncat(filename, ".txt", 4); aux_file = fopen(filename, "a"); // Add to hashtables (file descriptors and summary counters) int *SO_stored = (int*) malloc (sizeof(int)); *SO_stored = *SO_found; cp_hashtable_put(output_files, SO_stored, aux_file); LOG_INFO_F("[%d] new CT = %s\n", tid, tmp_consequence_type); } } } // Write line[tid] to file corresponding to its consequence type if (aux_file) { #pragma omp critical { // TODO move critical one level below? 
count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type); if (count == NULL) { char *consequence_type = (char*) calloc (consequence_type_len+1, sizeof(char)); strncat(consequence_type, tmp_consequence_type, consequence_type_len); assert(!strcmp(consequence_type, tmp_consequence_type)); count = (int*) malloc (sizeof(int)); *count = 0; cp_hashtable_put(summary_count, consequence_type, count); // LOG_DEBUG_F("[%d] Initialized summary count for %s\n", tmp_consequence_type); } // Increment counter for summary (*count)++; } // LOG_DEBUG_F("[%d] before writing %s\n", tid, tmp_consequence_type); output_text = strdup(output_line[tid]); list_item_t *output_item = list_item_new(tid, *SO_found, output_text); list_insert_item(output_item, output_list); // LOG_DEBUG_F("[%d] after writing %s\n", tid, tmp_consequence_type); } data += next_line_len+1; data_read_len += next_line_len+1; memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(output_line[tid])); i++; } // Empty buffer for next callback invocation if (!premature_end) { memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(line[tid])); } free(SO_found); return data_read_len; }
int run_stats(shared_options_data_t *shared_options_data, stats_options_data_t *options_data) { file_stats_t *file_stats = file_stats_new(); sample_stats_t **sample_stats; // List that stores the batches of records filtered by each thread list_t *output_list[shared_options_data->num_threads]; // List that stores which thread filtered the next batch to save list_t *next_token_list = malloc(sizeof(list_t)); int ret_code; double start, stop, total; vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches); if (!vcf_file) { LOG_FATAL("VCF file does not exist!\n"); } ped_file_t *ped_file = NULL; if (shared_options_data->ped_filename) { ped_file = ped_open(shared_options_data->ped_filename); if (!ped_file) { LOG_FATAL("PED file does not exist!\n"); } if(options_data->variable) { set_variable_field(options_data->variable, 0, ped_file); } else { set_variable_field("PHENO", 6, ped_file); } if(options_data->variable_groups) { int n, m; char *variable_groups = strdup(options_data->variable_groups); char **groups; char **phenos_in_group; groups = split(variable_groups, ":", &n); for(int i = 0; i < n; i++){ phenos_in_group = split(groups[i], ",", &m); if(set_phenotype_group(phenos_in_group, m, ped_file) < 0) { LOG_ERROR("Variable can't appear in two groups\n"); return DUPLICATED_VARIABLE; } free(phenos_in_group); } ped_file->accept_new_values = 0; free(variable_groups); free(groups); } else { ped_file->accept_new_values = 1; } if(options_data->phenotype) { int n; char* phenotypes = strdup(options_data->phenotype); char** pheno_values = split(phenotypes, ",", &n); if(n != 2) { LOG_ERROR("To handle case-control test, only two phenotypes are supported\n"); return MORE_THAN_TWO_PHENOTYPES; } else { set_unaffected_phenotype(pheno_values[0],ped_file); set_affected_phenotype(pheno_values[1],ped_file); } } else { set_unaffected_phenotype("1", ped_file); set_affected_phenotype("2", ped_file); } LOG_INFO("About to read PED file...\n"); // Read 
PED file before doing any processing ret_code = ped_read(ped_file); if (ret_code != 0) { LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename); } if(!ped_file->num_field) { LOG_ERROR_F("Can't find the specified field \"%s\" in file: %s \n", options_data->variable, ped_file->filename); return VARIABLE_FIELD_NOT_FOUND; } } ret_code = create_directory(shared_options_data->output_directory); if (ret_code != 0 && errno != EEXIST) { LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory); } // Initialize variables related to the different threads for (int i = 0; i < shared_options_data->num_threads; i++) { output_list[i] = (list_t*) malloc(sizeof(list_t)); list_init("input", 1, shared_options_data->num_threads * shared_options_data->batch_lines, output_list[i]); } list_init("next_token", shared_options_data->num_threads, INT_MAX, next_token_list); LOG_INFO("About to retrieve statistics from VCF file...\n"); #pragma omp parallel sections private(start, stop, total) { #pragma omp section { LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num()); // Reading start = omp_get_wtime(); if (shared_options_data->batch_bytes > 0) { ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, vcf_file); } else if (shared_options_data->batch_lines > 0) { ret_code = vcf_parse_batches(shared_options_data->batch_lines, vcf_file); } stop = omp_get_wtime(); total = stop - start; if (ret_code) { LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code); } LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); notify_end_parsing(vcf_file); } #pragma omp section { // Enable nested parallelism and set the number of threads the user has chosen omp_set_nested(1); LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num()); individual_t **individuals = NULL; khash_t(ids) *sample_ids = NULL; khash_t(str) *phenotype_ids = 
NULL; int num_phenotypes; start = omp_get_wtime(); int i = 0; vcf_batch_t *batch = NULL; while ((batch = fetch_vcf_batch(vcf_file)) != NULL) { if (i == 0) { sample_stats = malloc (get_num_vcf_samples(vcf_file) * sizeof(sample_stats_t*)); for (int j = 0; j < get_num_vcf_samples(vcf_file); j++) { sample_stats[j] = sample_stats_new(array_list_get(j, vcf_file->samples_names)); } if (ped_file) { // Create map to associate the position of individuals in the list of samples defined in the VCF file sample_ids = associate_samples_and_positions(vcf_file); // Sort individuals in PED as defined in the VCF file individuals = sort_individuals(vcf_file, ped_file); // Get the khash of the phenotypes in PED file phenotype_ids = get_phenotypes(ped_file); num_phenotypes = get_num_variables(ped_file); } } if (i % 50 == 0) { LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", i, omp_get_thread_num(), batch->records->size, batch->records->capacity); } // Divide the list of passed records in ranges of size defined in config file int num_chunks; int *chunk_sizes = NULL; array_list_t *input_records = batch->records; int *chunk_starts = create_chunks(input_records->size, ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads), &num_chunks, &chunk_sizes); // OpenMP: Launch a thread for each range #pragma omp parallel for num_threads(shared_options_data->num_threads) for (int j = 0; j < num_chunks; j++) { LOG_DEBUG_F("[%d] Stats invocation\n", omp_get_thread_num()); // Invoke variant stats and/or sample stats when applies if (options_data->variant_stats) { int index = omp_get_thread_num() % shared_options_data->num_threads; ret_code = get_variants_stats((vcf_record_t**) (input_records->items + chunk_starts[j]), chunk_sizes[j], individuals, sample_ids,num_phenotypes, output_list[index], file_stats); } if (options_data->sample_stats) { ret_code |= get_sample_stats((vcf_record_t**) (input_records->items + chunk_starts[j]), chunk_sizes[j], individuals, 
sample_ids, sample_stats, file_stats); } } if (options_data->variant_stats) { // Insert as many tokens as elements correspond to each thread for (int t = 0; t < num_chunks; t++) { for (int s = 0; s < chunk_sizes[t]; s++) { list_item_t *token_item = list_item_new(t, 0, NULL); list_insert_item(token_item, next_token_list); } } } free(chunk_starts); free(chunk_sizes); vcf_batch_free(batch); i++; } stop = omp_get_wtime(); total = stop - start; LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); // Decrease list writers count for (i = 0; i < shared_options_data->num_threads; i++) { list_decr_writers(next_token_list); list_decr_writers(output_list[i]); } if (sample_ids) { kh_destroy(ids, sample_ids); } if (individuals) { free(individuals); } } #pragma omp section { LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num()); char *stats_prefix = get_vcf_stats_filename_prefix(shared_options_data->vcf_filename, shared_options_data->output_filename, shared_options_data->output_directory); // File names and descriptors for output to plain text files char *stats_filename, *summary_filename, *phenotype_filename; FILE *stats_fd, *summary_fd, **phenotype_fd; char *stats_db_name; sqlite3 *db = NULL; khash_t(stats_chunks) *hash; khash_t(str) *phenotype_ids; int num_phenotypes; if(ped_file){ phenotype_ids = get_phenotypes(ped_file); num_phenotypes = get_num_variables(ped_file); } if (options_data->save_db) { delete_files_by_extension(shared_options_data->output_directory, "db"); stats_db_name = calloc(strlen(stats_prefix) + strlen(".db") + 2, sizeof(char)); sprintf(stats_db_name, "%s.db", stats_prefix); create_stats_db(stats_db_name, VCF_CHUNKSIZE, create_vcf_query_fields, &db); hash = kh_init(stats_chunks); } // Write variant (and global) statistics if (options_data->variant_stats) { stats_filename = get_variant_stats_output_filename(stats_prefix); if (!(stats_fd = 
fopen(stats_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of variants: %s\n", stats_filename); } //Open one file for each phenotype if(ped_file){ phenotype_fd = malloc(sizeof(FILE*)*num_phenotypes); if(options_data->variable_groups){ int n; char *variable_groups = strdup(options_data->variable_groups); char ** names = split(variable_groups, ":", &n); for(int i = 0; i < n; i++) { phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, names[i]); if(!(phenotype_fd[i] = fopen(phenotype_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename); } free(phenotype_filename); } free(names); free(variable_groups); } else { for (khint_t i = kh_begin(phenotype_ids); i != kh_end(phenotype_ids); ++i) { if (!kh_exist(phenotype_ids,i)) continue; phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, kh_key(phenotype_ids,i)); if(!(phenotype_fd[kh_val(phenotype_ids,i)] = fopen(phenotype_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename); } free(phenotype_filename); } } } // Write header report_vcf_variant_stats_header(stats_fd); if(ped_file){ for(int i = 0; i < num_phenotypes; i++) report_vcf_variant_phenotype_stats_header(phenotype_fd[i]); } // For each variant, generate a new line int avail_stats = 0; variant_stats_t *var_stats_batch[VCF_CHUNKSIZE]; list_item_t *token_item = NULL, *output_item = NULL; while ( token_item = list_remove_item(next_token_list) ) { output_item = list_remove_item(output_list[token_item->id]); assert(output_item); var_stats_batch[avail_stats] = output_item->data_p; avail_stats++; // Run only when certain amount of stats is available if (avail_stats >= VCF_CHUNKSIZE) { report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch); if(ped_file) for(int i = 0; i < num_phenotypes; i++) report_vcf_variant_phenotype_stats(phenotype_fd[i], 
avail_stats, var_stats_batch, i); // Free all stats from the "batch" for (int i = 0; i < avail_stats; i++) { variant_stats_free(var_stats_batch[i]); } avail_stats = 0; } // Free resources list_item_free(output_item); list_item_free(token_item); } if (avail_stats > 0) { report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch); if(ped_file) for(int i = 0; i < num_phenotypes; i++) report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i); // Free all stats from the "batch" for (int i = 0; i < avail_stats; i++) { variant_stats_free(var_stats_batch[i]); } avail_stats = 0; } // Write whole file stats (data only got when launching variant stats) summary_filename = get_vcf_file_stats_output_filename(stats_prefix); if (!(summary_fd = fopen(summary_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics summary: %s\n", summary_filename); } report_vcf_summary_stats(summary_fd, db, file_stats); free(stats_filename); free(summary_filename); // Close variant stats file if (stats_fd) { fclose(stats_fd); } if (summary_fd) { fclose(summary_fd); } if(ped_file){ for(int i = 0; i < num_phenotypes; i++) if(phenotype_fd[i]) fclose(phenotype_fd[i]); free(phenotype_fd); } } // Write sample statistics if (options_data->sample_stats) { stats_filename = get_sample_stats_output_filename(stats_prefix); if (!(stats_fd = fopen(stats_filename, "w"))) { LOG_FATAL_F("Can't open file for writing statistics of samples: %s\n", stats_filename); } report_vcf_sample_stats_header(stats_fd); report_vcf_sample_stats(stats_fd, NULL, vcf_file->samples_names->size, sample_stats); // Close sample stats file free(stats_filename); if (stats_fd) { fclose(stats_fd); } } free(stats_prefix); if (db) { insert_chunk_hash(VCF_CHUNKSIZE, hash, db); create_stats_index(create_vcf_index, db); close_stats_db(db, hash); } } } for (int i = 0; i < get_num_vcf_samples(vcf_file); i++) { sample_stats_free(sample_stats[i]); } free(sample_stats); free(file_stats); 
free(next_token_list); for (int i = 0; i < shared_options_data->num_threads; i++) { free(output_list[i]); } vcf_close(vcf_file); if (ped_file) { ped_close(ped_file, 1,1); } return 0; }
/**
 * Worker-thread body of the alignment pipeline.
 *
 * Consumes FASTQ batches from input->read_list and, for each one, runs the
 * stages in order: BWT exact mapping, seeding, CAL construction, optional
 * pair handling, Smith-Waterman, and alignment preparation.  The processed
 * batch is then pushed onto input->write_list.  Per-thread wall-clock time
 * for each stage is accumulated (in microseconds) into the file-global
 * arrays bwt_time, seeding_time, cal_time and sw_time, indexed by the
 * OpenMP thread id.
 *
 * @param input  stage configuration plus the read/write queues; not freed here.
 *               Ownership of each consumed list item is taken (the item is
 *               freed; its fastq batch is handed to the new aligner batch).
 */
void batch_aligner(batch_aligner_input_t *input) {
    list_t *in_queue = input->read_list;
    list_t *out_queue = input->write_list;
    unsigned int tid = omp_get_thread_num();
    size_t batches_done = 0;
    struct timeval begin, end;
    list_item_t *item;

    while ((item = list_remove_item(in_queue)) != NULL) {
        aligner_batch_t *batch = aligner_batch_new((fastq_batch_t *) item->data_p);
        thr_batches[tid]++;

        // Burrows-Wheeler exact mapping (timed per thread, microseconds).
        gettimeofday(&begin, NULL);
        apply_bwt(input->bwt_input, batch);
        gettimeofday(&end, NULL);
        bwt_time[tid] += ((end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec));

        if (batch->num_targets > 0) {
            // Seed the reads that BWT could not map exactly.
            gettimeofday(&begin, NULL);
            apply_seeding(input->region_input, batch);
            gettimeofday(&end, NULL);
            seeding_time[tid] += ((end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec));
            thr_seeding_items[tid] += batch->num_targets;

            // Build candidate alignment locations (CALs) from the seeds.
            gettimeofday(&begin, NULL);
            apply_caling(input->cal_input, batch);
            gettimeofday(&end, NULL);
            cal_time[tid] += ((end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec));
            thr_cal_items[tid] += batch->num_targets;
        }

        // Paired-end management runs only when pair mode is configured.
        if (input->pair_input != NULL) {
            apply_pair(input->pair_input, batch);
        }

        if (batch->num_targets > 0) {
            // Smith-Waterman refinement of the surviving targets (timed).
            gettimeofday(&begin, NULL);
            apply_sw(input->sw_input, batch);
            gettimeofday(&end, NULL);
            sw_time[tid] += ((end.tv_sec - begin.tv_sec) * 1e6 + (end.tv_usec - begin.tv_usec));
        }

        if (batch->num_targets > 0) {
            // Convert SW output into alignments and resolve mate pairs.
            prepare_alignments(input->pair_input, batch);
        }

        // Hand the processed batch to the writer; the item id carries the
        // sequential batch number.
        list_insert_item(list_item_new(batches_done, 0, batch), out_queue);
        list_item_free(item);
        batches_done++;
    }

    // This producer is done: let writers blocked on the output list drain.
    if (out_queue != NULL) {
        list_decr_writers(out_queue);
    }
}
/**
 * Region-seeker stage: consumes batches of unmapped reads and produces CAL
 * batches for the next pipeline stage.
 *
 * For every batch pulled from input_p->unmapped_read_list_p it allocates one
 * array_list of candidate regions per read, fills them either on the GPU
 * (when input_p->gpu_enable and compiled with HPG_GPU) or on the CPU with an
 * OpenMP parallel loop over the reads, wraps the result in a cal_batch and
 * pushes it onto input_p->region_list_p.  Decrements the writer count of the
 * region list on exit so downstream consumers can terminate.
 *
 * Ownership: the per-read array lists are handed to cal_batch_new together
 * with the fastq batch; the consumed list item is freed here.
 * NOTE(review): allocate_mapping_p itself (the outer malloc'd array) also
 * appears to be owned by the cal_batch afterwards — confirm cal_batch_free
 * releases it.
 *
 * @param input_p  stage configuration, input/output lists, BWT index/options.
 */
void region_seeker_server(region_seeker_input_t *input_p){
    printf("region_seeker_server(%d): START\n", omp_get_thread_num());
    list_item_t *item_p = NULL;
    list_item_t *cal_item_p = NULL;
    fastq_batch_t *unmapped_batch_p;
    size_t num_reads;
    array_list_t **allocate_mapping_p;      // one candidate-region list per read
    cal_batch_t *cal_batch_p;
    size_t num_mappings, total_mappings = 0, num_batches = 0;
    size_t num_threads = input_p->region_threads;
    size_t chunk;
    size_t total_reads = 0;
    omp_set_num_threads(num_threads);

    // Blocks until the upstream producer closes the list (returns NULL).
    while ( (item_p = list_remove_item(input_p->unmapped_read_list_p)) != NULL ) {
        //printf("Region Seeker Processing batch...\n");
        num_batches++;
        if (time_on) { timing_start(REGION_SEEKER, 0, timing_p); }

        unmapped_batch_p = (fastq_batch_t *)item_p->data_p;
        num_reads = unmapped_batch_p->num_reads;
        total_reads += num_reads;
        allocate_mapping_p = (array_list_t **)malloc(sizeof(array_list_t *)*num_reads);

        if (input_p->gpu_enable) {
            //******************************* GPU PROCESS *********************************//
            for (size_t i = 0; i < num_reads; i++) {
                allocate_mapping_p[i] = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
            }
#ifdef HPG_GPU
            // Whole-batch exact seed mapping on the GPU; fills all region lists at once.
            num_mappings = bwt_map_exact_seed_batch_gpu(unmapped_batch_p, input_p->bwt_optarg_p, input_p->cal_optarg_p, input_p->bwt_index_p, input_p->gpu_context, allocate_mapping_p);
#endif
            //****************************************************************************//
        } else {
            //******************************* CPU PROCESS *********************************//
            //printf("Region Seeker :: Process Batch with %d reads\n", num_reads);
            // Dynamic schedule with ~10 chunks per thread to balance uneven
            // per-read seeding cost; chunk is at least 1.
            chunk = MAX(1, num_reads/(num_threads*10));
            //printf("Region Seeker :: Process Batch with %d reads\n", num_reads);
#pragma omp parallel for private(num_mappings) reduction(+:total_mappings) schedule(dynamic, chunk)
            //#pragma omp parallel for private(num_mappings) reduction(+:total_mappings) schedule(static)
            for (size_t i = 0; i < num_reads; i++) {
                //printf("Threads region zone: %d\n", omp_get_num_threads());
                allocate_mapping_p[i] = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
                // Map the exact seeds of read i; its sequence starts at
                // data_indices[i] inside the batch's flat seq buffer.
                num_mappings = bwt_map_exact_seeds_seq(&(unmapped_batch_p->seq[unmapped_batch_p->data_indices[i]]), input_p->cal_optarg_p->seed_size, input_p->cal_optarg_p->min_seed_size, input_p->bwt_optarg_p, input_p->bwt_index_p, allocate_mapping_p[i]);
                total_mappings += num_mappings;
                //printf("----------------->>>>>>>>>>>Regions found %d\n", num_mappings);
            }
            //****************************************************************************//
        }

        // Package the region lists with their fastq batch and forward them.
        cal_batch_p = cal_batch_new(allocate_mapping_p, unmapped_batch_p);
        list_item_free(item_p);
        cal_item_p = list_item_new(0, 0, cal_batch_p);
        //region_batch_free(region_batch_p);
        if (time_on) { timing_stop(REGION_SEEKER, 0, timing_p); }
        list_insert_item(cal_item_p, input_p->region_list_p);
        //printf("Region Seeker Processing batch finish!\n");
    } //End of while

    // No more batches will be produced by this thread.
    list_decr_writers(input_p->region_list_p);

    if (statistics_on) {
        statistics_set(REGION_SEEKER_ST, 0, num_batches, statistics_p);
        statistics_set(REGION_SEEKER_ST, 1, total_reads, statistics_p);
    }
    printf("region_seeker_server: END\n");
}
/**
 * Subscribes a connection to a topic by appending it to the topic's
 * connection list.
 *
 * Fix: every other list_insert_item() call in this file uses the signature
 * (list_item_t *item, list_t *list) and wraps the payload with
 * list_item_new() first.  The original call here passed (t->conns, con) —
 * list first and the raw connection unwrapped — which is both argument-
 * swapped and type-mismatched relative to that API.
 * NOTE(review): if topic_t uses a different list API with a
 * (list, element) signature, revert this — confirm against list.h.
 *
 * @param t    topic to register the subscriber on; t->conns must be initialized.
 * @param con  connection to add; the topic does not take ownership here.
 */
void topic_add_connection(topic_t* t, connection_t* con) {
    list_item_t *item = list_item_new(0, 0, con);
    list_insert_item(item, t->conns);
}