Exemple #1
0
/**
 * Builds a lookup table that maps every sample name of a VCF file to its
 * position (0-based) in the file's list of samples.
 *
 * Keys are the name pointers stored in file->samples_names (they are not
 * copied); values are heap-allocated ints holding the position.
 *
 * @param file VCF file whose sample names have already been read
 * @return Hashtable associating sample name -> pointer to its position
 */
cp_hashtable* associate_samples_and_positions(vcf_file_t* file) {
    array_list_t *sample_names = file->samples_names;
    LOG_DEBUG_F("** %zu sample names read\n", sample_names->size);

    // NOTE(review): keys are hashed with cp_hash_string but compared with
    // strcasecmp, so names differing only in case may end up in different
    // buckets -- confirm whether a case-insensitive hash is intended.
    cp_hashtable *sample_ids = cp_hashtable_create(sample_names->size * 2,
                                                   cp_hash_string,
                                                   (cp_compare_fn) strcasecmp
                                                  );
    
    for (size_t i = 0; i < sample_names->size; i++) {
        char *name = sample_names->items[i];
        
        // Check for duplicates before allocating, so nothing is leaked
        // when the duplicate is reported
        if (cp_hashtable_get(sample_ids, name)) {
            LOG_FATAL_F("Sample %s appears more than once. File can not be analyzed.\n", name);
        }
        
        int *index = malloc(sizeof *index);
        if (index == NULL) {
            LOG_FATAL_F("Can't allocate memory for the position of sample %s\n", name);
        }
        *index = (int) i;
        
        cp_hashtable_put(sample_ids, name, index);
    }
    
    return sample_ids;
}
Exemple #2
0
/**
 * Returns the serial number associated with thread number `tno`, assigning
 * the next value of `thread_count` the first time a thread is seen.
 *
 * @param tno thread number used as lookup key in the `thread_id` table
 * @return the serial number; 0 if the `thread_id` table does not exist;
 *         -1 if memory for a new mapping could not be allocated
 */
static long get_thread_serial(long tno)
{
	long *num;
	
	if (thread_id == NULL) return 0;

	num = cp_hashtable_get(thread_id, &tno);

	if (num == NULL)
	{
		long *key;
		num = malloc(sizeof *num);
		if (num == NULL)
		{
			cp_error(CP_MEMORY_ALLOCATION_FAILURE, "can\'t allocate thread mapping number");
			return -1L;
		}
		key = malloc(sizeof *key);
		if (key == NULL)
		{
			/* release the value allocated above, it was never stored */
			free(num);
			cp_error(CP_MEMORY_ALLOCATION_FAILURE, "can\'t allocate thread mapping key");
			return -1L;
		}

		/* the table keeps both key and value, so they must outlive this call */
		*num = ++thread_count;
		*key = tno;
		cp_hashtable_put(thread_id, key, num);
	}

	return *num;
}
Exemple #3
0
void cp_error(int code, char *fmt, ...)
#endif
{
	va_list argp;
	char errcode[256];
	char errmsg[224];
	char *msg;

	if (loglevel > LOG_LEVEL_ERROR) return;
//	if (fmt == NULL) return;

	msg = cp_hashtable_get(error_message_lookup, &code);
	if (msg)
#ifdef CP_HAS_SNPRINTF
		snprintf(errmsg, 224, " - %s", msg);
#else
		sprintf(errmsg, " - %s", msg);
#endif /* CP_HAS_SNPRINTF */
	else
Exemple #4
0
/**
 * Parses the effect web service response stored in effect_line[tid].
 *
 * The response is split in lines and each line in tab-separated fields. Only
 * lines with exactly 25 fields are processed: field 17 is registered in
 * gene_list, field 18 provides the SO code (the number after its first 3
 * characters) and field 19 the consequence type name. For each valid line:
 *  - an output file named after the consequence type is opened (in append
 *    mode) and registered in output_files under its SO code, if not yet present
 *  - the counter of the consequence type in summary_count is incremented
 *  - a copy of the line is queued in output_list, tagged with the SO code
 *
 * @param tid identifier of the calling thread, indexes effect_line
 * @param output_directory directory where the consequence type files are created
 * @param output_directory_len length of output_directory
 * @param output_files table SO code (int*) -> FILE*
 * @param output_list list where the lines to write are queued
 * @param summary_count table consequence type name -> occurrences (int*)
 * @param gene_list table whose keys are the gene names found
 */
static void parse_effect_response(int tid, char *output_directory, size_t output_directory_len, cp_hashtable *output_files, 
                                  list_t *output_list, cp_hashtable *summary_count, cp_hashtable *gene_list) {
    int SO_found = 0; // SO code of the current line (0 means not found / not valid)
    int *count;
    char tmp_consequence_type[128];
    
    int num_lines;
    char **split_batch = split(effect_line[tid], "\n", &num_lines);
    
    for (int i = 0; i < num_lines; i++) {
        int num_columns;
        char *copy_buf = strdup(split_batch[i]);
        char **split_result = split(copy_buf, "\t", &num_columns);
        free(copy_buf);
        
        // Find consequence type name (always after SO field)
        int valid_line = (num_columns == 25);
        SO_found = 0;
        if (valid_line) {
            if (!cp_hashtable_contains(gene_list, split_result[17])) {
                cp_hashtable_put(gene_list, strdup(split_result[17]), NULL);
            }
            // Skip the 3-character prefix only if the field is long enough
            if (strlen(split_result[18]) > 3) {
                SO_found = atoi(split_result[18] + 3);
            }
            // Bounded copy: a consequence type longer than the buffer is truncated
            // instead of overflowing it
            snprintf(tmp_consequence_type, sizeof tmp_consequence_type, "%s", split_result[19]);
        } else if (strlen(split_batch[i]) > 0) { // Last line in batch could be only a newline
            LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_columns, split_batch[i]);
        }
        
        for (int s = 0; s < num_columns; s++) {
            free(split_result[s]);
        }
        free(split_result);
        
        if (!valid_line) {
            continue;
        }
        
        if (!SO_found) { // SO:000000 is not valid
            LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid);
            continue;
        }

        size_t consequence_type_len = strlen(tmp_consequence_type);
     
        // If file does not exist, create its descriptor and summary counter
        FILE *aux_file = cp_hashtable_get(output_files, &SO_found);
        if (!aux_file) {
#pragma omp critical
            {
                // This construction avoids 2 threads trying to insert the same CT
                aux_file = cp_hashtable_get(output_files, &SO_found);
                if (!aux_file) {
                    // directory + '/' + consequence type + ".txt" + '\0'
                    char filename[output_directory_len + consequence_type_len + 6];
                    snprintf(filename, sizeof filename, "%s/%s.txt", output_directory, tmp_consequence_type);
                    aux_file = fopen(filename, "a");
                    
                    // Add to hashtable of file descriptors, only if the file could be opened
                    int *SO_stored = aux_file ? malloc(sizeof *SO_stored) : NULL;
                    if (SO_stored) {
                        *SO_stored = SO_found;
                        cp_hashtable_put(output_files, SO_stored, aux_file);
                        LOG_INFO_F("[%d] New consequence type found = %s\n", tid, tmp_consequence_type);
                    } else {
                        LOG_INFO_F("[%d] Can't register output file for consequence type %s\n", tid, tmp_consequence_type);
                        if (aux_file) {
                            fclose(aux_file);
                            aux_file = NULL;
                        }
                    }
                }
            }
        }
        
        // Write line[tid] to file corresponding to its consequence type
        if (aux_file) { 
#pragma omp critical
            {
                // TODO move critical one level below?
                count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type);
                if (count == NULL) {
                    // The table stores the key pointer, so it needs its own copy of the name
                    char *consequence_type = strdup(tmp_consequence_type);
                    count = malloc(sizeof *count);
                    if (consequence_type && count) {
                        *count = 0;
                        cp_hashtable_put(summary_count, consequence_type, count);
                    } else {
                        free(consequence_type);
                        free(count);
                        count = NULL;
                    }
                }
                // Increment counter for summary
                if (count) {
                    (*count)++;
                }
            }
            
            list_item_t *output_item = list_item_new(tid, SO_found, strdup(split_batch[i]));
            list_insert_item(output_item, output_list);
        }
    }
    
    for (int i = 0; i < num_lines; i++) {
        free(split_batch[i]);
    }
    free(split_batch);
}
Exemple #5
0
/**
 * Runs the 'effect' tool: reads a VCF file, filters its records, queries the
 * web services with the records that passed the filters and writes the
 * responses to the output directory.
 *
 * The work is divided in 3 OpenMP sections:
 *  - one reads the VCF file in batches
 *  - one filters each batch and invokes the web services (urls[0] effect,
 *    urls[1] SNP phenotype, urls[2] mutation phenotype), retrying up to 3
 *    times when some of them reports an error
 *  - one takes the lines queued in the output list and writes them to the
 *    corresponding files
 *
 * @param urls URLs of the web services to invoke
 * @param shared_options_data options shared among tools (input files, output
 *        directory, batch sizes, number of threads, filter chain...)
 * @param options_data options specific to this tool
 * @return 0 on success; otherwise the error code of the step that failed
 *         (removing old files, HTTP initialization, output files
 *         initialization or VCF reading)
 */
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;
    
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    
    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }
    
    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);
    
    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }
    
    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }
    
    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }
    
    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);
    
    // Create job.status file
    // "/job.status" takes 11 characters, plus the string terminator
    char job_status_filename[output_directory_len + 12];
    snprintf(job_status_filename, sizeof job_status_filename, "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }
    
 
#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            
            start = omp_get_wtime();
            
            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }
        
#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            
            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            
            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            
            // Filename structure outdir/vcfname.errors
            // The prefix can be as long as the whole path, plus the string terminator
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename) + 1, sizeof(char));
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            // outdir + '/' + prefix + ".errors" + '\0'
            size_t non_processed_filename_len = strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9;
            char *non_processed_filename = malloc(non_processed_filename_len * sizeof(char));
            snprintf(non_processed_filename, non_processed_filename_len, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            free(prefix_filename);
            
            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY, 
                            ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);
    
            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;
            
            start = omp_get_wtime();

            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }
                        
                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }
                    
                    LOG_DEBUG("VCF header written\n");
                    
                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }
                
                LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", 
                        i, omp_get_thread_num(),
                        batch->records->size, batch->records->capacity);

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file? get_num_variables(ped_file): 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);
                    
                    do {
                        // NOTE(review): ret_ws_0/1/2 are shared and written by every thread
                        // of the loop below without synchronization -- confirm this is intended
                        // OpenMP: Launch a thread for each range
                        #pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), 
                                                            chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }
                            
                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }
                                 
                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }
                        
                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);
                        
                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }
                            
                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));
                    
                    // Chunks are needed by every retry, so they are released once the
                    // loop is over, whether the invocations succeeded or not
                    free(chunk_starts);
                    free(chunk_sizes);
                }
                
                // If the maximum number of reconnections was reached still with errors, 
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2) && non_processed_file != NULL) {
                #pragma omp critical
                    {
                        write_vcf_batch(batch, non_processed_file);
                    }
                }
                
                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);
                
                // Free batch and its contents
                vcf_batch_free(batch);
                
                i++;
            }

            stop = omp_get_wtime();

            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }
            
            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);
            
            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }
        
#pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;
            
            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");
            
            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;
                
                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    if (fd == NULL) {
                        LOG_ERROR_F("No output file registered for SO code %d: '%s'\n", item->type, line);
                    } else {
                        ret = fprintf(fd, "%s\n", line);
                        if (ret < 0) {
                            LOG_ERROR_F("Error writing to file: '%s'\n", line);
                        }
                    }
                    
                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                    
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                    
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }
                
                free(line);
                list_item_free(item);
            }
            
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);
    
    update_job_status_file(100, job_status);
    close_job_status_file(job_status);
    
    return ret_code;
}
Exemple #6
0
int tdt_test(vcf_record_t **variants, int num_variants, family_t **families, int num_families, cp_hashtable *sample_ids, list_t *output_list) {
    double start = omp_get_wtime();
    
    int ret_code = 0;
    int tid = omp_get_thread_num();
    int num_samples = cp_hashtable_count(sample_ids);
    
    tdt_result_t *result;
    char **sample_data;
    
    int gt_position;
    int father_allele1, father_allele2;
    int mother_allele1, mother_allele2;
    int child_allele1, child_allele2;

    ///////////////////////////////////
    // Perform analysis for each variant

    vcf_record_t *record;
    for (int i = 0; i < num_variants; i++) {
        record = variants[i];
        LOG_DEBUG_F("[%d] Checking variant %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        
        sample_data = (char**) record->samples->items;
        gt_position = get_field_position_in_format("GT", strndup(record->format, record->format_len));
    
        // Transmission counts
        int t1 = 0;
        int t2 = 0;
        
        
        // Count over families
        family_t *family;
        for (int f = 0; f < num_families; f++) {
            family = families[f];
            individual_t *father = family->father;
            individual_t *mother = family->mother;
            cp_list *children = family->children;

//           LOG_DEBUG_F("[%d] Checking suitability of family %s\n", tid, family->id);
            
            if (father == NULL || mother == NULL) {
                continue;
            }

            int *father_pos = cp_hashtable_get(sample_ids, father->id);
            if (father_pos != NULL) {
    //           LOG_DEBUG_F("[%d] Father %s is in position %d\n", tid, father->id, *father_pos);
            } else {
    //           LOG_DEBUG_F("[%d] Father %s is not positioned\n", tid, father->id);
                continue;
            }
            
            int *mother_pos = cp_hashtable_get(sample_ids, mother->id);
            if (mother_pos != NULL) {
    //           LOG_DEBUG_F("[%d] Mother %s is in position %d\n", tid, mother->id, *mother_pos);
            } else {
    //           LOG_DEBUG_F("[%d] Mother %s is not positioned\n", tid, mother->id);
                continue;
            }
            
            char *father_sample = strdup(sample_data[*father_pos]);
            char *mother_sample = strdup(sample_data[*mother_pos]);
            
//           LOG_DEBUG_F("[%d] Samples: Father = %s\tMother = %s\n", tid, father_sample, mother_sample);
            
            // If any parent's alleles can't be read or is missing, go to next family
            if (get_alleles(father_sample, gt_position, &father_allele1, &father_allele2) ||
                get_alleles(mother_sample, gt_position, &mother_allele1, &mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            
//           LOG_DEBUG_F("[%d] Alleles: Father = %d/%d\tMother = %d/%d\n", tid, father_allele1, father_allele2, mother_allele1, mother_allele2);
            
            // We need two genotyped parents, with at least one het
            if (father_allele1 == father_allele2 && mother_allele1 == mother_allele2) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            
            if ((father_allele1 && !father_allele2) || (mother_allele1 && !mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }

//           LOG_DEBUG_F("[%d] Proceeding to analyse family %s...\n", tid, family->id);

            
            int trA = 0;  // transmitted allele from first het parent
            int unA = 0;  // untransmitted allele from first het parent
            
            int trB = 0;  // transmitted allele from second het parent
            int unB = 0;  // untransmitted allele from second het parent
            
            // Consider all offspring in nuclear family
            cp_list_iterator *children_iterator = cp_list_create_iterator(family->children, COLLECTION_LOCK_READ);
            individual_t *child = NULL;
            while ((child = cp_list_iterator_next(children_iterator)) != NULL) {
                // Only consider affected children
                if (child->condition != AFFECTED) { continue; }
                
                int *child_pos = cp_hashtable_get(sample_ids, child->id);
                if (child_pos != NULL) {
        //           LOG_DEBUG_F("[%d] Child %s is in position %d\n", tid, child->id, *child_pos);
                } else {
        //           LOG_DEBUG_F("[%d] Child %s is not positioned\n", tid, child->id);
                    continue;
                }
                
                char *child_sample = strdup(sample_data[*child_pos]);
    //           LOG_DEBUG_F("[%d] Samples: Child = %s\n", tid, child_sample);
                
                // Skip if offspring has missing genotype
                if (get_alleles(child_sample, gt_position, &child_allele1, &child_allele2)) {
                    free(child_sample);
                    continue;
                }
                
                // Exclude mendelian errors
                char *aux_chromosome = strndup(record->chromosome, record->chromosome_len);
                if (check_mendel(aux_chromosome, father_allele1, father_allele2, mother_allele1, mother_allele2, 
                    child_allele1, child_allele2, child->sex)) {
                    free(child_sample);
                    free(aux_chromosome);
                    continue;
                }
                free(aux_chromosome);
                
                
                // We've now established: no missing genotypes
                // and at least one heterozygous parent

                // Kid is 00

                if (!child_allele1 && !child_allele2) {
                    if ( ( (!father_allele1) && father_allele2 ) && 
                        ( (!mother_allele1) && mother_allele2 ) )
                    { trA=1; unA=2; trB=1; unB=2; }
                    else 
                    { trA=1; unA=2; } 
                }
                else if ( (!child_allele1) && child_allele2 )  // Kid is 01
                {
                    // het dad
                    if (father_allele1 != father_allele2 )
                    {
                        // het mum
                        if ( mother_allele1 != mother_allele2 )
                    { trA=1; trB=2; unA=2; unB=1; }
                        else if ( !mother_allele1 ) 
                    { trA=2; unA=1; }
                        else { trA=1; unA=2; }
                    }
                    else if ( !father_allele1 ) 
                    {
                        trA=2; unA=1; 
                    }           
                    else
                    {
                        trA=1; unA=2;
                    }
                }
                else // kid is 1/1
                {
                    
                    if ( ( (!father_allele1) && father_allele2 ) && 
                        ( (!mother_allele1) && mother_allele2 ) )
                    { trA=2; unA=1; trB=2; unB=1; }
                    else 
                    { 
                        trA=2; unA=1;
                    }
                }
                
                // We have now populated trA (first transmission) 
                // and possibly trB also 
                
                ////////////////////////////////////////
                // Permutation? 50:50 flip (precomputed)
                
//                 if (permute) {
//                     if (flipA[f])
//                     {
//                     int t = trA;
//                     trA = unA;
//                     unA = t;
//                     
//                     t = trB;
//                     trB = unB;
//                     unB = t;
//                     }
//                 }
                
                // Increment transmission counts
                if (trA==1) { t1++; }
                else if (trA==2) { t2++; }
                
                if (trB==1) { t1++; }
                else if (trB==2) { t2++; }
                
//     //           LOG_DEBUG_F("TDT\t%.*s %s : %d %d - %d %d - %d %d - F %d/%d - M %d/%d - C %d/%d\n", 
//                             record->id_len, record->id, family->id, trA, unA, trB, unB, t1, t2, 
//                             father_allele1, father_allele2, mother_allele1, mother_allele2, child_allele1, child_allele2);
                free(child_sample);
                
            } // next offspring in family
            
            cp_list_iterator_destroy(children_iterator);
            free(father_sample);
            free(mother_sample);
        }  // next nuclear family

        /////////////////////////////
        // Finished counting: now compute
        // the statistics
        
        double tdt_chisq = -1;
        
        // Basic TDT test
        if (t1+t2 > 0) {
            tdt_chisq = ((double) ((t1-t2) * (t1-t2))) / (t1+t2);
        }
        
//         LOG_DEBUG_F("[%d] before adding %s:%ld\n", tid, record->chromosome, record->position);
        result = tdt_result_new(record->chromosome, record->chromosome_len, 
                                record->position, 
                                record->reference, record->reference_len, 
                                record->alternate, record->alternate_len,
                                t1, t2, tdt_chisq);
        list_item_t *output_item = list_item_new(tid, 0, result);
        list_insert_item(output_item, output_list);
//         LOG_DEBUG_F("[%d] after adding %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        
    } // next variant

    double end = omp_get_wtime();
    
    return ret_code;
}
Exemple #7
0
/**
 * Registers the line currently held in output_line[tid] under its SO code.
 *
 * Makes sure an output file exists for SO_code (creating "<output_directory>/
 * <consequence_type_name>.txt" the first time the code is seen), increments the
 * summary counter of the consequence type and queues a copy of output_line[tid]
 * in output_list.
 *
 * @param tid                   index of the calling thread in the per-thread buffers
 * @param SO_code               numeric SO code parsed from the line (non-zero)
 * @param consequence_type_name NUL-terminated consequence type name of the line
 */
static void store_effect_line(int tid, int SO_code, char *consequence_type_name) {
    size_t consequence_type_len = strlen(consequence_type_name);

    // If file does not exist, create its descriptor and summary counter
    FILE *aux_file = cp_hashtable_get(output_files, &SO_code);
    if (!aux_file) {
#pragma omp critical
        {
            // Looking up again inside the critical section avoids 2 threads
            // trying to insert the same consequence type
            aux_file = cp_hashtable_get(output_files, &SO_code);
            if (!aux_file) {
                // +6 = '/' + ".txt" + terminating NUL
                char filename[output_directory_len + consequence_type_len + 6];
                memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char));
                strncat(filename, output_directory, output_directory_len);
                strncat(filename, "/", 1);
                strncat(filename, consequence_type_name, consequence_type_len);
                strncat(filename, ".txt", 4);
                aux_file = fopen(filename, "a");

                // Add to hashtable of file descriptors; the key must outlive this call
                int *SO_stored = (int*) malloc (sizeof(int));
                *SO_stored = SO_code;
                cp_hashtable_put(output_files, SO_stored, aux_file);

                LOG_INFO_F("[%d] new CT = %s\n", tid, consequence_type_name);
            }
        }
    }

    // Nothing is counted nor queued when the output file could not be opened
    if (!aux_file) {
        return;
    }

#pragma omp critical
    {
        // TODO move critical one level below?
        int *count = (int*) cp_hashtable_get(summary_count, consequence_type_name);
        if (count == NULL) {
            // The table keeps the key, so it needs its own copy of the name
            char *consequence_type = strdup(consequence_type_name);
            count = (int*) malloc (sizeof(int));
            *count = 0;
            cp_hashtable_put(summary_count, consequence_type, count);
        }
        // Increment counter for summary
        (*count)++;
    }

    char *output_text = strdup(output_line[tid]);
    list_item_t *output_item = list_item_new(tid, SO_code, output_text);
    list_insert_item(output_item, output_list);
}

/**
 * Write callback that consumes one chunk of the effect web service response.
 *
 * The signature follows the libcurl write-callback convention (presumably it is
 * registered with CURLOPT_WRITEFUNCTION -- confirm at the call site), so the
 * chunk is 'size * nmemb' bytes long and is NOT assumed to be NUL-terminated.
 *
 * The chunk is processed line by line. A line that is cut by the end of the
 * chunk is kept in line[tid] and completed by the next invocation. Complete
 * lines are split by tabs; lines with 25 fields and a non-zero SO code are
 * handed to store_effect_line(), any other line is logged and discarded.
 *
 * @param contents  chunk of data received
 * @param size      size of each element of the chunk
 * @param nmemb     number of elements of the chunk
 * @param userdata  unused
 * @return number of bytes consumed: size * nmemb on success, less than that
 *         when the per-thread buffers could not be resized
 */
static size_t write_effect_ws_results(char *contents, size_t size, size_t nmemb, void *userdata) {
    int tid = omp_get_thread_num();

    int i = 0;
    size_t data_read_len = 0, next_line_len = 0;
    // SO code of the current line (field previous to the consequence type name); 0 = not found
    int SO_found = 0;
    // Whether the buffer was consumed with a line read just partially
    int premature_end = 0;

    size_t realsize = size * nmemb;

    char *data = contents;
    char tmp_consequence_type[128];

    LOG_DEBUG_F("Effect WS invoked, response size = %zu bytes\n", realsize);

    while (data_read_len < realsize) {
        assert((line + tid) != NULL);
        assert((max_line_size + tid) != NULL);

        LOG_DEBUG_F("[%d] loop iteration #%d\n", tid, i);

        // Get length of data to copy. The search is bounded by the bytes left
        // in the chunk because the chunk carries no terminating NUL.
        size_t remaining = realsize - data_read_len;
        const char *newline = memchr(data, '\n', remaining);
        next_line_len = newline ? (size_t) (newline - data) : remaining;

        // If the line is too long for the current buffers, reallocate a little more than the needed memory
        if (strlen(line[tid]) + next_line_len + 1 > max_line_size[tid]) {
            char *aux_1 = (char*) realloc (line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char));
            char *aux_2 = (char*) realloc (output_line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char));

            // A successful realloc may have moved the buffer, so its result must
            // always be stored; a failed one leaves the old buffer valid.
            if (aux_1) { line[tid] = aux_1; }
            if (aux_2) { output_line[tid] = aux_2; }

            if (!aux_1 || !aux_2) {
                // Can't resize buffers -> can't keep reading the file
                LOG_ERROR("Can't resize buffers\n");
                return data_read_len;
            }

            max_line_size[tid] += next_line_len + 1;
        }

        if (newline == NULL) {
            // The chunk ends in the middle of a line: save the partial line
            // so the next invocation can complete it
            strncat(line[tid], data, next_line_len);
            chomp(line[tid]);
            premature_end = 1;
            data_read_len = realsize;
            break;
        }

        strncat(line[tid], data, next_line_len);
        strncat(output_line[tid], line[tid], strlen(line[tid]));

        int num_substrings;
        char *copy_buf = strdup(line[tid]);
        char **split_result = split(copy_buf, "\t", &num_substrings);
        free(copy_buf);

        // Find consequence type name (always after SO field)
        SO_found = 0;
        int valid_line = (num_substrings == 25);
        if (valid_line) {
            // Fields 17, 18 and 19: gene, SO code and consequence type name
            if (!cp_hashtable_contains(gene_list, split_result[17])) {
                cp_hashtable_put(gene_list, strdup(split_result[17]), NULL);
            }
            // The number starts after a 3-character prefix; shorter fields have no code
            if (strlen(split_result[18]) > 3) {
                SO_found = atoi(split_result[18] + 3);
            }
            // Bounded copy: names longer than the buffer are truncated instead of overflowing it
            memset(tmp_consequence_type, 0, sizeof(tmp_consequence_type));
            strncat(tmp_consequence_type, split_result[19], sizeof(tmp_consequence_type) - 1);
        } else {
            LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]);
        }

        for (int s = 0; s < num_substrings; s++) {
            free(split_result[s]);
        }
        free(split_result);

        if (valid_line) {
            if (SO_found) {
                store_effect_line(tid, SO_found, tmp_consequence_type);
            } else { // SO:000000 is not valid
                LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid);
            }
        }

        // Move past this line and its '\n' whether it was stored or discarded;
        // otherwise a discarded line would be parsed again forever
        data += next_line_len + 1;
        data_read_len += next_line_len + 1;

        memset(line[tid], 0, strlen(line[tid]));
        memset(output_line[tid], 0, strlen(output_line[tid]));

        i++;
    }

    // Empty buffer for next callback invocation
    if (!premature_end) {
        memset(line[tid], 0, strlen(line[tid]));
        memset(output_line[tid], 0, strlen(output_line[tid]));
    }

    return data_read_len;
}
Exemple #8
0
/* Returns non-zero when any '/'-separated segment of uri is exactly "..". */
static int file_service_uri_has_parent_ref(const char *uri)
{
    const char *segment = uri;

    while (*segment)
    {
        size_t len = strcspn(segment, "/");
        if (len == 2 && segment[0] == '.' && segment[1] == '.')
            return 1;
        segment += len;
        if (*segment == '/')
            segment++;
    }

    return 0;
}

/**
 * Serves the file that request->uri designates below document_root.
 *
 * The content type is looked up in mimemap by the text following the last '.'
 * of the URI. A path ending in '/' is served as its "index.html". The whole
 * file is read into memory and attached to the response.
 *
 * A 404 response is produced when the resulting path does not fit in PATHLEN,
 * when the URI contains a ".." segment (it could address files outside
 * document_root) or when the file cannot be opened.
 *
 * @param request   request being served; only its uri is read here
 * @param response  response to fill in
 * @return HTTP_CONNECTION_POLICY_KEEP_ALIVE when the file was served,
 *         HTTP_CONNECTION_POLICY_CLOSE after a 404
 */
int file_service(cp_http_request *request, cp_http_response *response)
{
    int rc = 0;
    char *ext;
    char *mime_type;
    char path[PATHLEN];
    size_t uri_len;
    size_t path_len;
    char buf[FBUFSIZE];
    FILE *fp;
    cp_string *body = NULL;

#ifdef DEBUG
    cp_http_request_dump(request);
#endif

    ext = strrchr(request->uri, '.');
    if (ext) 
    {
        /* unknown extensions have no entry: leave the content type untouched */
        mime_type = cp_hashtable_get(mimemap, ++ext);
        if (mime_type)
            cp_http_response_set_content_type_string(response, mime_type);
    }

    /* check len, avoid buffer overrun; refuse to climb out of document_root */
    uri_len = strlen(request->uri);
    if (uri_len + strlen(document_root) >= PATHLEN ||
        file_service_uri_has_parent_ref(request->uri))
    {
        cp_http_response_set_content_type(response, HTML);
        cp_http_response_set_status(response, HTTP_404_NOT_FOUND);
        response->body = strdup(HTTP404_PAGE);
        return HTTP_CONNECTION_POLICY_CLOSE;
    }
        
#ifdef CP_HAS_SNPRINTF
    snprintf(path, PATHLEN, "%s%s", document_root, request->uri);
#else
    sprintf(path, "%s%s", document_root, request->uri);
#endif /* CP_HAS_SNPRINTF */
    path_len = strlen(path);
    if (path_len > 0 && path[path_len - 1] == '/') 
    {
        /* strlcat returns the length it tried to build: >= PATHLEN means the
         * name was truncated and would designate a different file */
        if (strlcat(path, "index.html", PATHLEN) >= PATHLEN)
        {
            cp_http_response_set_content_type(response, HTML);
            cp_http_response_set_status(response, HTTP_404_NOT_FOUND);
            response->body = strdup(HTTP404_PAGE);
            return HTTP_CONNECTION_POLICY_CLOSE;
        }
        response->content_type = HTML;
    }

    fp = fopen(path, "rb");
    if (fp == NULL)
    {
        cp_http_response_set_content_type(response, HTML);
        cp_http_response_set_status(response, HTTP_404_NOT_FOUND);
#ifdef CP_HAS_SNPRINTF
        snprintf(buf, FBUFSIZE, HTTP404_PAGE_uri, request->uri);
#else
        sprintf(buf, HTTP404_PAGE_uri, request->uri);
#endif /* CP_HAS_SNPRINTF */
        response->body = strdup(buf);
        return HTTP_CONNECTION_POLICY_CLOSE;
    }

#ifdef __TRACE__
    DEBUGMSG("retrieving [%s]", path);
#endif
    /* accumulate the file contents block by block; body stays NULL for an empty file */
    while ((rc = fread(buf, 1, FBUFSIZE, fp)) > 0)
    {
        if (body == NULL)
            body = cp_string_create(buf, rc);
        else
            cp_string_cat_bin(body, buf, rc);
    }
    fclose(fp);
    
    cp_http_response_set_status(response, HTTP_200_OK);

    response->content = body;

    return HTTP_CONNECTION_POLICY_KEEP_ALIVE;
}
Exemple #9
0
/* Returns the code stored in t for key; reports the key and exits when it has none. */
static unsigned char dna_code_for_key(cp_hashtable *t, char *key) {
  unsigned char *value_ptr = (unsigned char *)cp_hashtable_get(t, key);
  if (value_ptr == NULL) {
    printf("Error: no code found for sequence '%s'\n", key);
    exit(-1);
  }
  return *value_ptr;
}

/* Writes the first num_codes codes to binary_fd; exits when the write is incomplete. */
static void write_dna_codes(const unsigned char *code_values, size_t num_codes,
                            FILE *binary_fd, const char *dna_binary_filename) {
  if (fwrite(code_values, sizeof(unsigned char), num_codes, binary_fd) != num_codes) {
    printf("Error writing file %s\n", dna_binary_filename);
    exit(-1);
  }
}

/**
 * Encodes a DNA text file as a binary file with one code per group of
 * nucleotides.
 *
 * Lines starting with '>' are printed and skipped. In the remaining lines,
 * newlines are ignored, lowercase a/c/g/t/n are converted to uppercase and
 * every 3 consecutive characters (groups may span several lines) are looked
 * up in t; the byte found is appended to the output. A final group of 1 or 2
 * characters is looked up as it is.
 *
 * The process exits with -1 when a file cannot be opened or written, when
 * memory cannot be allocated or when a group has no entry in t.
 *
 * @param chunk_size          size of the read buffer and of the buffer of
 *                            codes pending to be written; values below 2
 *                            select the default (100MB). It is handed to
 *                            fgets, so it must fit in an int.
 * @param dna_filename        path of the input text file
 * @param dna_binary_filename path of the binary file to create
 * @param t                   table mapping sequences (strings) to their code
 *                            (pointer to unsigned char)
 */
void code_binary_file_generator(size_t chunk_size, char *dna_filename, char *dna_binary_filename,  cp_hashtable *t){

  // A buffer of 1 byte can only hold the terminator: fgets would never advance
  if (chunk_size < 2) {
    chunk_size = 100000000; //100MB
  }

  FILE *binary_fd, *fd;
  fd = fopen(dna_filename, "r");
  if (fd == NULL) {  printf("Error opening file %s\n", dna_filename); exit(-1); }

  binary_fd = fopen (dna_binary_filename, "wb");
  if (binary_fd == NULL) { printf("Error opening file %s\n", dna_binary_filename); exit(-1); }

  char *dna_chunk = (char *)malloc(sizeof(char)*chunk_size);

  size_t codes_allocate = chunk_size;
  unsigned char *code_values = (unsigned char *)malloc(sizeof(unsigned char)*codes_allocate);
  if (dna_chunk == NULL || code_values == NULL) {
    printf("Error allocating memory to process file %s\n", dna_filename);
    exit(-1);
  }
  size_t code_pos = 0;

  size_t dna_len;
  char key[4];
  unsigned char max_chunk = 3;   // characters per code
  unsigned char actual_nt = 0;   // characters gathered for the current key

  printf("Process DNA File\n");

  // Looping on the result of fgets (instead of feof) avoids processing the
  // last line twice: at end of file fgets fails and leaves the buffer untouched
  while (fgets(dna_chunk, (int) chunk_size, fd) != NULL) {
    if (dna_chunk[0] == '>') {
      printf("Process: %s", &dna_chunk[1]);
      continue;
    }

    dna_len = strlen(dna_chunk);
    for (size_t c = 0; c < dna_len; c++) {
      char base = dna_chunk[c];
      if (base == '\n') {
        continue;
      }

      // Lowercase nucleotides are stored as uppercase
      if (base == 'a' || base == 'c' || base == 'g' || base == 't' || base == 'n') {
        base = base - 32;
      }

      key[actual_nt++] = base;
      if (actual_nt == max_chunk) {
        key[actual_nt] = '\0';
        code_values[code_pos++] = dna_code_for_key(t, key);

        // Flush the codes when the buffer is full
        if (code_pos >= codes_allocate) {
          write_dna_codes(code_values, code_pos, binary_fd, dna_binary_filename);
          code_pos = 0;
        }
        actual_nt = 0;
      }
    }
  }

  // Trailing group shorter than max_chunk
  if (actual_nt > 0) {
    key[actual_nt] = '\0';
    code_values[code_pos++] = dna_code_for_key(t, key);
  }

  if (code_pos > 0) {
    write_dna_codes(code_values, code_pos, binary_fd, dna_binary_filename);
    code_pos = 0;
  }

  free(dna_chunk);
  free(code_values);

  fclose(fd);
  fclose(binary_fd);
}