/**
 * Builds a hashtable mapping each sample name in the VCF file to its
 * 0-based column position. Aborts (LOG_FATAL_F) on duplicate sample names.
 *
 * Ownership: returns a newly created cp_hashtable; values are malloc'd ints.
 * The caller is responsible for destroying it.
 */
cp_hashtable* associate_samples_and_positions(vcf_file_t* file) {
    LOG_DEBUG_F("** %zu sample names read\n", file->samples_names->size);
    array_list_t *sample_names = file->samples_names;
    // FIX: the compare function is case-insensitive (strcasecmp), so the hash
    // function must be case-insensitive as well (cp_hash_istring). The original
    // used cp_hash_string, so keys equal under strcasecmp could hash to different
    // buckets, silently defeating the duplicate-sample check below.
    cp_hashtable *sample_ids = cp_hashtable_create(sample_names->size * 2,
                                                   cp_hash_istring,
                                                   (cp_compare_fn) strcasecmp );
    int *index;
    char *name;
    for (int i = 0; i < sample_names->size; i++) {
        name = sample_names->items[i];
        // Check for duplicates BEFORE allocating, so the index is not leaked
        // if LOG_FATAL_F's abort path ever changes.
        if (cp_hashtable_get(sample_ids, name)) {
            LOG_FATAL_F("Sample %s appears more than once. File can not be analyzed.\n", name);
        }
        index = (int*) malloc (sizeof(int));
        *index = i;
        cp_hashtable_put(sample_ids, name, index);
    }

    return sample_ids;
}
/**
 * Returns a small serial number for the given thread id `tno`, assigning a
 * fresh one (++thread_count) on first sight and caching it in `thread_id`.
 *
 * Returns 0 when the mapping table has not been initialized, and -1L on
 * allocation failure.
 *
 * NOTE(review): ++thread_count is not synchronized here — presumably callers
 * hold a lock or this runs before threads start; confirm at call sites.
 */
static long get_thread_serial(long tno) {
    long *num;

    if (thread_id == NULL) return 0;

    num = cp_hashtable_get(thread_id, &tno);
    if (num == NULL) {
        long *key;
        num = malloc(sizeof(long));
        if (num == NULL) {
            cp_error(CP_MEMORY_ALLOCATION_FAILURE, "can\'t allocate thread mapping number");
            return -1L;
        }
        key = malloc(sizeof(long));
        if (key == NULL) {
            free(num); /* FIX: num was leaked on this failure path */
            cp_error(CP_MEMORY_ALLOCATION_FAILURE, "can\'t allocate thread mapping key");
            return -1L;
        }
        *num = ++thread_count;
        *key = tno;
        cp_hashtable_put(thread_id, key, num);
    }

    return *num;
}
void cp_error(int code, char *fmt, ...) #endif { va_list argp; char errcode[256]; char errmsg[224]; char *msg; if (loglevel > LOG_LEVEL_ERROR) return; // if (fmt == NULL) return; msg = cp_hashtable_get(error_message_lookup, &code); if (msg) #ifdef CP_HAS_SNPRINTF snprintf(errmsg, 224, " - %s", msg); #else sprintf(errmsg, " - %s", msg); #endif /* CP_HAS_SNPRINTF */ else
/**
 * Parses the buffered effect web-service response for thread `tid`
 * (effect_line[tid], newline-separated records) and dispatches each valid
 * record: registers its gene, opens/creates the per-consequence-type output
 * file, bumps the summary counter, and queues the line on `output_list`.
 *
 * A valid record has exactly 25 tab-separated fields; field 17 is the gene,
 * field 18 the SO code ("SO:NNNNNNN"), field 19 the consequence type name.
 */
static void parse_effect_response(int tid, char *output_directory, size_t output_directory_len, cp_hashtable *output_files,
                                  list_t *output_list, cp_hashtable *summary_count, cp_hashtable *gene_list) {
    int *SO_found = (int*) malloc (sizeof(int));    // Whether the SO code field has been found
    int *count;
    char tmp_consequence_type[128];

    int num_lines;
    char **split_batch = split(effect_line[tid], "\n", &num_lines);

    for (int i = 0; i < num_lines; i++) {
        int num_columns;
        char *copy_buf = strdup(split_batch[i]);
        char **split_result = split(copy_buf, "\t", &num_columns);
        free(copy_buf);

        // Find consequence type name (always after SO field)
        *SO_found = 0;
        if (num_columns == 25) {
            if (!cp_hashtable_contains(gene_list, split_result[17])) {
                cp_hashtable_put(gene_list, strdup(split_result[17]), NULL);
            }
            *SO_found = atoi(split_result[18] + 3);     // skip the "SO:" prefix
            memset(tmp_consequence_type, 0, 128 * sizeof(char));
            strncat(tmp_consequence_type, split_result[19], strlen(split_result[19]));
        } else {
            if (strlen(split_batch[i]) == 0) {  // Last line in batch could be only a newline
                for (int s = 0; s < num_columns; s++) { free(split_result[s]); }
                free(split_result);
                continue;
            }
            LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_columns, split_batch[i]);
            for (int s = 0; s < num_columns; s++) { free(split_result[s]); }
            free(split_result);
            continue;
        }

        for (int s = 0; s < num_columns; s++) { free(split_result[s]); }
        free(split_result);

        if (!*SO_found) {   // SO:000000 is not valid
            LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid);
            continue;
        }

        size_t consequence_type_len = strlen(tmp_consequence_type);

        // If file does not exist, create its descriptor and summary counter
        FILE *aux_file = cp_hashtable_get(output_files, SO_found);
        if (!aux_file) {
#pragma omp critical
            {
                // This construction avoids 2 threads trying to insert the same CT
                aux_file = cp_hashtable_get(output_files, SO_found);
                if (!aux_file) {
                    char filename[output_directory_len + consequence_type_len + 6];
                    memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char));
                    strncat(filename, output_directory, output_directory_len);
                    strncat(filename, "/", 1);
                    strncat(filename, tmp_consequence_type, consequence_type_len);
                    strncat(filename, ".txt", 4);
                    aux_file = fopen(filename, "a");

                    // Add to hashtables (file descriptors and summary counters)
                    int *SO_stored = (int*) malloc (sizeof(int));
                    *SO_stored = *SO_found;
                    cp_hashtable_put(output_files, SO_stored, aux_file);
                    LOG_INFO_F("[%d] New consequence type found = %s\n", tid, tmp_consequence_type);
                }
            }
        }

        // Write line[tid] to file corresponding to its consequence type
        if (aux_file) {
#pragma omp critical
            {
                // TODO move critical one level below?
                count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type);
                if (count == NULL) {
                    char *consequence_type = (char*) calloc (consequence_type_len+1, sizeof(char));
                    strncat(consequence_type, tmp_consequence_type, consequence_type_len);
                    assert(!strcmp(consequence_type, tmp_consequence_type));
                    count = (int*) malloc (sizeof(int));
                    *count = 0;
                    cp_hashtable_put(summary_count, consequence_type, count);
                }
                // Increment counter for summary
                (*count)++;
            }
            list_item_t *output_item = list_item_new(tid, *SO_found, strdup(split_batch[i]));
            list_insert_item(output_item, output_list);
        }
    }

    for (int i = 0; i < num_lines; i++) { free(split_batch[i]); }
    free(split_batch);
    free(SO_found);     // FIX: was leaked on every call
}
/**
 * Entry point of the "effect" tool: reads a VCF file, filters its records,
 * queries three web services (effect, SNP phenotype, mutation phenotype) per
 * batch, and writes per-consequence-type files plus summary/result reports.
 *
 * Runs as three concurrent OpenMP sections:
 *   1. VCF reader (producer of batches)
 *   2. batch processor / web-service client (consumer, spawns nested threads)
 *   3. writer draining output_list into the output files
 *
 * Returns 0 on success, or the first non-zero error code encountered during
 * setup or reading.
 */
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }
    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);
    ret_code = create_directory(output_directory);
    // EEXIST is fine: the directory may already be there from a previous run
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }
    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }
    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }
    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;
    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);
    // Create job.status file
    char job_status_filename[output_directory_len + 10];
    sprintf(job_status_filename, "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            // Producer section: streams the VCF file into batches
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            start = omp_get_wtime();
            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);
            stop = omp_get_wtime();
            total = stop - start;
            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }
            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            notify_end_parsing(vcf_file);
        }

#pragma omp section
        {
            // Consumer section: filters batches and queries the web services
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            // Filename structure outdir/vcfname.errors
            // NOTE(review): calloc(strlen(path), 1) leaves no room for the NUL
            // terminator when the path has no directory component — off-by-one
            // to confirm against get_filename_from_path's contract.
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename), sizeof(char));
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            char *non_processed_filename = malloc((strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9) * sizeof(char));
            sprintf(non_processed_filename, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY, ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);

            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;
            start = omp_get_wtime();
            while (batch = fetch_vcf_batch(vcf_file)) {
                if (i == 0) {
                    // First batch only: emit headers and set up pedigree mapping
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }
                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }
                    LOG_DEBUG("VCF header written\n");
                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }
                LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", i, omp_get_thread_num(), batch->records->size, batch->records->capacity);
                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?
                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file? get_num_variables(ped_file): 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);
                    do {
                        // OpenMP: Launch a thread for each range
#pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            // On a reconnection, only re-run the services that failed
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }
                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }
                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }
                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);
                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }
                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        } else {
                            // NOTE(review): chunk_starts/chunk_sizes are only freed on
                            // the success path; they leak when all retries fail.
                            free(chunk_starts);
                            free(chunk_sizes);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));
                }
                // If the maximum number of reconnections was reached still with errors,
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
#pragma omp critical
                    {
                        write_vcf_batch(batch, non_processed_file);
                    }
                }
                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);
                // Free batch and its contents
                vcf_batch_free(batch);
                i++;
            }
            stop = omp_get_wtime();
            total = stop - start;
            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }
            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);
            // Decrease list writers count so the writer section can terminate
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }

#pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;
            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");
            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;
                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    int ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }
                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }
                free(line);
                list_item_free(item);
            }
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);
    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);
    update_job_status_file(100, job_status);
    close_job_status_file(job_status);
    return ret_code;
}
/**
 * Transmission Disequilibrium Test (TDT) over a set of variants and nuclear
 * families: for each variant, counts transmitted vs untransmitted alleles
 * from heterozygous parents to affected children, computes the TDT chi-square
 * statistic ((t1-t2)^2 / (t1+t2)) and queues a tdt_result_t on `output_list`.
 *
 * sample_ids maps sample name -> int* column position in the VCF sample data.
 * Returns 0 (ret_code is never modified in the current implementation).
 */
int tdt_test(vcf_record_t **variants, int num_variants, family_t **families, int num_families, cp_hashtable *sample_ids, list_t *output_list) {
    double start = omp_get_wtime();
    int ret_code = 0;
    int tid = omp_get_thread_num();
    int num_samples = cp_hashtable_count(sample_ids);
    tdt_result_t *result;
    char **sample_data;
    int gt_position;
    int father_allele1, father_allele2;
    int mother_allele1, mother_allele2;
    int child_allele1, child_allele2;

    ///////////////////////////////////
    // Perform analysis for each variant
    vcf_record_t *record;
    for (int i = 0; i < num_variants; i++) {
        record = variants[i];
        LOG_DEBUG_F("[%d] Checking variant %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        sample_data = (char**) record->samples->items;
        // NOTE(review): the strndup'd format string passed here is never freed
        // (leaks once per variant) — confirm get_field_position_in_format does
        // not take ownership.
        gt_position = get_field_position_in_format("GT", strndup(record->format, record->format_len));

        // Transmission counts
        int t1 = 0;
        int t2 = 0;

        // Count over families
        family_t *family;
        for (int f = 0; f < num_families; f++) {
            family = families[f];
            individual_t *father = family->father;
            individual_t *mother = family->mother;
            cp_list *children = family->children;
            // LOG_DEBUG_F("[%d] Checking suitability of family %s\n", tid, family->id);
            // Both parents must be present in the pedigree
            if (father == NULL || mother == NULL) {
                continue;
            }
            // Both parents must have a known column in the VCF sample data
            int *father_pos = cp_hashtable_get(sample_ids, father->id);
            if (father_pos != NULL) {
                // LOG_DEBUG_F("[%d] Father %s is in position %d\n", tid, father->id, *father_pos);
            } else {
                // LOG_DEBUG_F("[%d] Father %s is not positioned\n", tid, father->id);
                continue;
            }
            int *mother_pos = cp_hashtable_get(sample_ids, mother->id);
            if (mother_pos != NULL) {
                // LOG_DEBUG_F("[%d] Mother %s is in position %d\n", tid, mother->id, *mother_pos);
            } else {
                // LOG_DEBUG_F("[%d] Mother %s is not positioned\n", tid, mother->id);
                continue;
            }
            char *father_sample = strdup(sample_data[*father_pos]);
            char *mother_sample = strdup(sample_data[*mother_pos]);
            // LOG_DEBUG_F("[%d] Samples: Father = %s\tMother = %s\n", tid, father_sample, mother_sample);

            // If any parent's alleles can't be read or is missing, go to next family
            if (get_alleles(father_sample, gt_position, &father_allele1, &father_allele2) ||
                get_alleles(mother_sample, gt_position, &mother_allele1, &mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            // LOG_DEBUG_F("[%d] Alleles: Father = %d/%d\tMother = %d/%d\n", tid, father_allele1, father_allele2, mother_allele1, mother_allele2);

            // We need two genotyped parents, with at least one het
            if (father_allele1 == father_allele2 && mother_allele1 == mother_allele2) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            if ((father_allele1 && !father_allele2) || (mother_allele1 && !mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            // LOG_DEBUG_F("[%d] Proceeding to analyse family %s...\n", tid, family->id);

            int trA = 0;  // transmitted allele from first het parent
            int unA = 0;  // untransmitted allele from first het parent
            int trB = 0;  // transmitted allele from second het parent
            int unB = 0;  // untransmitted allele from second het parent

            // Consider all offspring in nuclear family
            cp_list_iterator *children_iterator = cp_list_create_iterator(family->children, COLLECTION_LOCK_READ);
            individual_t *child = NULL;
            while ((child = cp_list_iterator_next(children_iterator)) != NULL) {
                // Only consider affected children
                if (child->condition != AFFECTED) {
                    continue;
                }
                int *child_pos = cp_hashtable_get(sample_ids, child->id);
                if (child_pos != NULL) {
                    // LOG_DEBUG_F("[%d] Child %s is in position %d\n", tid, child->id, *child_pos);
                } else {
                    // LOG_DEBUG_F("[%d] Child %s is not positioned\n", tid, child->id);
                    continue;
                }
                char *child_sample = strdup(sample_data[*child_pos]);
                // LOG_DEBUG_F("[%d] Samples: Child = %s\n", tid, child_sample);

                // Skip if offspring has missing genotype
                if (get_alleles(child_sample, gt_position, &child_allele1, &child_allele2)) {
                    free(child_sample);
                    continue;
                }

                // Exclude mendelian errors
                char *aux_chromosome = strndup(record->chromosome, record->chromosome_len);
                if (check_mendel(aux_chromosome, father_allele1, father_allele2, mother_allele1,
                                 mother_allele2, child_allele1, child_allele2, child->sex)) {
                    free(child_sample);
                    free(aux_chromosome);
                    continue;
                }
                free(aux_chromosome);

                // We've now established: no missing genotypes
                // and at least one heterozygous parent

                // Kid is 00
                if (!child_allele1 && !child_allele2) {
                    if ( ( (!father_allele1) && father_allele2 ) &&
                         ( (!mother_allele1) && mother_allele2 ) ) {
                        // both parents het: both transmitted the 0 allele
                        trA=1; unA=2; trB=1; unB=2;
                    } else {
                        trA=1; unA=2;
                    }
                } else if ( (!child_allele1) && child_allele2 ) { // Kid is 01
                    // het dad
                    if (father_allele1 != father_allele2 ) {
                        // het mum
                        if ( mother_allele1 != mother_allele2 ) {
                            trA=1; trB=2; unA=2; unB=1;
                        } else if ( !mother_allele1 ) {
                            trA=2; unA=1;
                        } else {
                            trA=1; unA=2;
                        }
                    } else if ( !father_allele1 ) {
                        trA=2; unA=1;
                    } else {
                        trA=1; unA=2;
                    }
                } else { // kid is 1/1
                    if ( ( (!father_allele1) && father_allele2 ) &&
                         ( (!mother_allele1) && mother_allele2 ) ) {
                        trA=2; unA=1; trB=2; unB=1;
                    } else {
                        trA=2; unA=1;
                    }
                }

                // We have now populated trA (first transmission)
                // and possibly trB also

                ////////////////////////////////////////
                // Permutation? 50:50 flip (precomputed)
                // if (permute) {
                //     if (flipA[f])
                //     {
                //         int t = trA; trA = unA; unA = t;
                //         t = trB; trB = unB; unB = t;
                //     }
                // }

                // Increment transmission counts
                if (trA==1) { t1++; }
                else if (trA==2) { t2++; }
                if (trB==1) { t1++; }
                else if (trB==2) { t2++; }

                // LOG_DEBUG_F("TDT\t%.*s %s : %d %d - %d %d - %d %d - F %d/%d - M %d/%d - C %d/%d\n",
                //     record->id_len, record->id, family->id, trA, unA, trB, unB, t1, t2,
                //     father_allele1, father_allele2, mother_allele1, mother_allele2, child_allele1, child_allele2);

                free(child_sample);
            } // next offspring in family
            cp_list_iterator_destroy(children_iterator);

            free(father_sample);
            free(mother_sample);
        }  // next nuclear family

        /////////////////////////////
        // Finished counting: now compute
        // the statistics

        double tdt_chisq = -1;

        // Basic TDT test
        if (t1+t2 > 0) {
            tdt_chisq = ((double) ((t1-t2) * (t1-t2))) / (t1+t2);
        }

        // LOG_DEBUG_F("[%d] before adding %s:%ld\n", tid, record->chromosome, record->position);
        result = tdt_result_new(record->chromosome, record->chromosome_len, record->position,
                                record->reference, record->reference_len,
                                record->alternate, record->alternate_len, t1, t2, tdt_chisq);
        list_item_t *output_item = list_item_new(tid, 0, result);
        list_insert_item(output_item, output_list);
        // LOG_DEBUG_F("[%d] after adding %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
    } // next variant

    double end = omp_get_wtime();

    return ret_code;
}
static size_t write_effect_ws_results(char *contents, size_t size, size_t nmemb, void *userdata) { int tid = omp_get_thread_num(); int i = 0; int data_read_len = 0, next_line_len = 0; // Whether the SO code field (previous to the consequence type name) has been found int *SO_found = (int*) malloc (sizeof(int)); // Whether the buffer was consumed with a line read just partially int premature_end = 0; size_t realsize = size * nmemb; int *count; char *data = contents; char tmp_consequence_type[128]; char *aux_buffer; char *output_text; LOG_DEBUG_F("Effect WS invoked, response size = %zu bytes\n", realsize); while (data_read_len < realsize) { assert((line + tid) != NULL); assert((max_line_size + tid) != NULL); LOG_DEBUG_F("[%d] loop iteration #%d\n", tid, i); // Get length of data to copy next_line_len = strcspn(data, "\n"); // If the line[tid] is too long for the current buffers, reallocate a little more than the needed memory if (strlen(line[tid]) + next_line_len + 1 > max_line_size[tid]) { // LOG_DEBUG_F("Line too long (%d elements, but %zu needed) in batch #%d\n", // max_line_size[tid], strlen(line[tid]) + next_line_len, batch_num); // char *out_buf = (char*) calloc (next_line_len+1, sizeof(char)); // snprintf(out_buf, next_line_len, "%s", data); // LOG_INFO_F("[%d] too big data is: '%s'\n", tid, out_buf); char *aux_1 = (char*) realloc (line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char)); char *aux_2 = (char*) realloc (output_line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char)); if (!aux_1 || !aux_2) { LOG_ERROR("Can't resize buffers\n"); // Can't resize buffers -> can't keep reading the file if (!aux_1) { free(line[tid]); } if (!aux_2) { free(output_line[tid]); } return data_read_len; } line[tid] = aux_1; output_line[tid] = aux_2; max_line_size[tid] += next_line_len + 1; // LOG_DEBUG_F("[%d] buffers realloc'd (%d)\n", tid, max_line_size[tid]); } // LOG_DEBUG_F("[%d] position = %d, read = %d, max_size = %zu\n", i, next_line_len, 
data_read_len, realsize); if (data_read_len + next_line_len >= realsize) { // Save current state (line[tid] partially read) strncat(line[tid], data, next_line_len); chomp(line[tid]); line[tid][strlen(line[tid])] = '\0'; premature_end = 1; // LOG_DEBUG_F("widow line[tid] = '%s'\n", line[tid]); data_read_len = realsize; break; } strncat(line[tid], data, next_line_len); strncat(output_line[tid], line[tid], strlen(line[tid])); // LOG_DEBUG_F("[%d] copy to buffer (%zu)\n", tid, strlen(line[tid])); int num_substrings; char *copy_buf = strdup(line[tid]); // char *copy_buf = strdup(trim(line[tid])); char **split_result = split(copy_buf, "\t", &num_substrings); free(copy_buf); // Find consequence type name (always after SO field) *SO_found = 0; if (num_substrings == 25) { // LOG_DEBUG_F("gene = %s\tSO = %d\tCT = %s\n", split_result[17], atoi(split_result[18] + 3), split_result[19]); if (!cp_hashtable_contains(gene_list, split_result[17])) { cp_hashtable_put(gene_list, strdup(split_result[17]), NULL); } *SO_found = atoi(split_result[18] + 3); memset(tmp_consequence_type, 0, 128 * sizeof(char)); strncat(tmp_consequence_type, split_result[19], strlen(split_result[19])); } else { LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]); memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(output_line[tid])); // #pragma omp critical // { // printf("********\n"); // LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]); for (int s = 0; s < num_substrings; s++) { // printf("%s^", split_result[s]); free(split_result[s]); } // printf("********\n\n"); free(split_result); // } continue; } for (int s = 0; s < num_substrings; s++) { free(split_result[s]); } free(split_result); if (!*SO_found) { // SO:000000 is not valid LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid); memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(output_line[tid])); continue; } // 
LOG_DEBUG_F("[%d] SO found = %d\n", tid, *SO_found); size_t consequence_type_len = strlen(tmp_consequence_type); // If file does not exist, create its descriptor and summary counter FILE *aux_file = cp_hashtable_get(output_files, SO_found); if (!aux_file) { #pragma omp critical { // This construction avoids 2 threads trying to insert the same CT aux_file = cp_hashtable_get(output_files, SO_found); if (!aux_file) { char filename[output_directory_len + consequence_type_len + 6]; memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char)); strncat(filename, output_directory, output_directory_len); strncat(filename, "/", 1); strncat(filename, tmp_consequence_type, consequence_type_len); strncat(filename, ".txt", 4); aux_file = fopen(filename, "a"); // Add to hashtables (file descriptors and summary counters) int *SO_stored = (int*) malloc (sizeof(int)); *SO_stored = *SO_found; cp_hashtable_put(output_files, SO_stored, aux_file); LOG_INFO_F("[%d] new CT = %s\n", tid, tmp_consequence_type); } } } // Write line[tid] to file corresponding to its consequence type if (aux_file) { #pragma omp critical { // TODO move critical one level below? 
count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type); if (count == NULL) { char *consequence_type = (char*) calloc (consequence_type_len+1, sizeof(char)); strncat(consequence_type, tmp_consequence_type, consequence_type_len); assert(!strcmp(consequence_type, tmp_consequence_type)); count = (int*) malloc (sizeof(int)); *count = 0; cp_hashtable_put(summary_count, consequence_type, count); // LOG_DEBUG_F("[%d] Initialized summary count for %s\n", tmp_consequence_type); } // Increment counter for summary (*count)++; } // LOG_DEBUG_F("[%d] before writing %s\n", tid, tmp_consequence_type); output_text = strdup(output_line[tid]); list_item_t *output_item = list_item_new(tid, *SO_found, output_text); list_insert_item(output_item, output_list); // LOG_DEBUG_F("[%d] after writing %s\n", tid, tmp_consequence_type); } data += next_line_len+1; data_read_len += next_line_len+1; memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(output_line[tid])); i++; } // Empty buffer for next callback invocation if (!premature_end) { memset(line[tid], 0, strlen(line[tid])); memset(output_line[tid], 0, strlen(line[tid])); } free(SO_found); return data_read_len; }
/**
 * Static-file HTTP handler: maps the request URI under document_root,
 * serves index.html for directory requests, and returns the file contents
 * with a MIME type inferred from the extension. Responds 404 when the URI
 * would overflow the path buffer or the file cannot be opened.
 */
int file_service(cp_http_request *request, cp_http_response *response) {
    char path[PATHLEN];
    char buf[FBUFSIZE];
    cp_string *body = NULL;
    FILE *fp;
    char *dot;
    int uri_len;
    int rc = 0;

#ifdef DEBUG
    cp_http_request_dump(request);
#endif

    /* Infer the MIME type from the file extension, if there is one */
    dot = strrchr(request->uri, '.');
    if (dot)
        cp_http_response_set_content_type_string(response, cp_hashtable_get(mimemap, ++dot));

    /* check len, avoid buffer overrun */
    uri_len = strlen(request->uri);
    if (uri_len + strlen(document_root) >= PATHLEN) {
        cp_http_response_set_content_type(response, HTML);
        cp_http_response_set_status(response, HTTP_404_NOT_FOUND);
        response->body = strdup(HTTP404_PAGE);
        return HTTP_CONNECTION_POLICY_CLOSE;
    }

#ifdef CP_HAS_SNPRINTF
    snprintf(path, PATHLEN, "%s%s", document_root, request->uri);
#else
    sprintf(path, "%s%s", document_root, request->uri);
#endif /* CP_HAS_SNPRINTF */

    /* Directory request: fall back to its index page */
    if (path[strlen(path) - 1] == '/') {
        strlcat(path, "index.html", PATHLEN);
        response->content_type = HTML;
    }

    fp = fopen(path, "rb");
    if (fp == NULL) {
        cp_http_response_set_content_type(response, HTML);
        cp_http_response_set_status(response, HTTP_404_NOT_FOUND);
#ifdef CP_HAS_SNPRINTF
        snprintf(buf, FBUFSIZE, HTTP404_PAGE_uri, request->uri);
#else
        sprintf(buf, HTTP404_PAGE_uri, request->uri);
#endif /* CP_HAS_SNPRINTF */
        response->body = strdup(buf);
        return HTTP_CONNECTION_POLICY_CLOSE;
    }

#ifdef __TRACE__
    DEBUGMSG("retrieving [%s]", path);
#endif

    /* Stream the file into a cp_string, chunk by chunk */
    for (;;) {
        rc = fread(buf, 1, FBUFSIZE, fp);
        if (rc <= 0) break;
        if (body == NULL)
            body = cp_string_create(buf, rc);
        else
            cp_string_cat_bin(body, buf, rc);
    }
    fclose(fp);

    cp_http_response_set_status(response, HTTP_200_OK);
    response->content = body;
    return HTTP_CONNECTION_POLICY_KEEP_ALIVE;
}
/**
 * Streams a DNA FASTA-style text file and writes a compact binary encoding:
 * every 3 consecutive nucleotides (uppercased) are looked up in hashtable `t`
 * (3-mer string -> unsigned char* code) and the resulting byte is appended to
 * the binary output. Header lines (starting with '>') are echoed to stdout.
 *
 * chunk_size: line-buffer size in bytes; 0 selects the 100MB default.
 */
void code_binary_file_generator(size_t chunk_size, char *dna_filename, char *dna_binary_filename, cp_hashtable *t){
    // chunk_size is unsigned, so only 0 can request the default
    if (chunk_size == 0) {
        chunk_size = 100000000; //100MB
    }
    FILE *binary_fd, *fd;
    fd = fopen(dna_filename, "r");
    if (fd == NULL) {
        printf("Error opening file %s\n", dna_filename);
        exit(-1);
    }
    binary_fd = fopen (dna_binary_filename, "wb");
    if (binary_fd == NULL) {
        printf("Error opening file %s\n", dna_binary_filename);
        exit(-1);
    }

    char *dna_chunk = (char *)malloc(sizeof(char)*chunk_size);
    size_t codes_allocate = chunk_size;
    unsigned char *code_values = (unsigned char *)malloc(sizeof(unsigned char)*codes_allocate);
    if (dna_chunk == NULL || code_values == NULL) {
        printf("Error allocating memory\n");
        exit(-1);
    }
    size_t code_pos = 0;
    size_t dna_len;
    char key[4];
    unsigned char max_chunk = 3;
    unsigned char actual_nt = 0;
    unsigned char value;
    unsigned char *value_ptr;

    printf("Process DNA File\n");
    // FIX: loop on fgets() itself. The original `while (!feof(fd))` did not check
    // the fgets() result, so after the final read failed the previous (stale)
    // buffer contents were processed a second time.
    while (fgets(dna_chunk, chunk_size, fd) != NULL) {
        if (dna_chunk[0] == '>') {
            printf("Process: %s", &dna_chunk[1]);
            continue;
        }
        dna_len = strlen(dna_chunk);
        for (size_t c = 0; c < dna_len; c++) {
            if (dna_chunk[c] == '\n') {
                continue;
            }
            // Uppercase the recognized lowercase nucleotides
            if (dna_chunk[c] == 'a' || dna_chunk[c] == 'c' || dna_chunk[c] == 'g' ||
                dna_chunk[c] == 't' || dna_chunk[c] == 'n') {
                dna_chunk[c] = dna_chunk[c] - 32;
            }
            key[actual_nt++] = dna_chunk[c];
            if (actual_nt == max_chunk) {
                key[actual_nt] = '\0';
                // NOTE(review): assumes every 3-mer key exists in `t`; a missing
                // key would make this dereference NULL — confirm table coverage.
                value_ptr = (unsigned char *)cp_hashtable_get(t, key);
                value = *value_ptr;
                code_values[code_pos++] = value;
                // Flush the code buffer when full
                if (code_pos >= codes_allocate) {
                    fwrite(code_values, sizeof(unsigned char), code_pos, binary_fd);
                    code_pos = 0;
                }
                actual_nt = 0;
            }
        }
    }
    // Encode any trailing partial key (fewer than 3 nucleotides)
    if (actual_nt > 0) {
        key[actual_nt] = '\0';
        value_ptr = (unsigned char *)cp_hashtable_get(t, key);
        value = *value_ptr;
        code_values[code_pos++] = value;
    }
    // FIX: was `code_pos >= 0`, which is always true for size_t
    if (code_pos > 0) {
        fwrite(code_values, sizeof(unsigned char), code_pos, binary_fd);
        code_pos = 0;
    }
    free(dna_chunk);        // FIX: buffers were never freed
    free(code_values);
    fclose(fd);
    fclose(binary_fd);
}