Code example #1
File: assoc.c  Project: CharoL/hpg-variant
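/*
 * Runs a chi-square or Fisher association test (selected through test_type) on each variant:
 * case/control allele counts are accumulated over all individuals, and one result item per
 * variant is inserted into output_list.
 */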
void assoc_test(enum ASSOC_task test_type, vcf_record_t **variants, int num_variants, individual_t **samples, int num_samples,
                const void *opt_input, list_t *output_list) {
    int tid = omp_get_thread_num();

    vcf_record_t *record;
    individual_t *individual;
    char *sample_data;
    
    int gt_position;
    int allele1, allele2;

    // Affection counts
    int A1 = 0, A2 = 0, U1 = 0, U2 = 0;
    
    // Perform analysis for each variant
    for (int i = 0; i < num_variants; i++) {
        record = variants[i];
//         LOG_DEBUG_F("[%d] Checking variant %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        
        A1 = 0; A2 = 0;
        U1 = 0; U2 = 0;

        // Duplicate the format field to locate the GT sub-field, then release the copy (avoids leaking it)
        char *format_dup = strndup(record->format, record->format_len);
        gt_position = get_field_position_in_format("GT", format_dup);
        free(format_dup);
    
        // Count over individuals
        for (int j = 0; j < num_samples; j++) {
            individual = samples[j];
            sample_data = strdup(array_list_get(j, record->samples));
            if (!get_alleles(sample_data, gt_position, &allele1, &allele2)) {
                assoc_count_individual(individual, record, allele1, allele2, &A1, &A2, &U1, &U2);
            }
            free(sample_data);
        }
        
        // Finished counting: now compute the statistics
        if (test_type == CHI_SQUARE) {
            double assoc_basic_chisq = assoc_basic_test(A1, U1, A2, U2);
            assoc_basic_result_t *result = assoc_basic_result_new(record->chromosome, record->chromosome_len, 
                                                                  record->position, 
                                                                  record->reference, record->reference_len,
                                                                  record->alternate, record->alternate_len,
                                                                  A1, A2, U1, U2, assoc_basic_chisq);
            list_item_t *output_item = list_item_new(tid, 0, result);
            list_insert_item(output_item, output_list);
        } else if (test_type == FISHER) {
            double p_value = assoc_fisher_test(A1, A2, U1, U2, (double*) opt_input);
            assoc_fisher_result_t *result = assoc_fisher_result_new(record->chromosome, record->chromosome_len, 
                                                                    record->position, 
                                                                    record->reference, record->reference_len,
                                                                    record->alternate, record->alternate_len,
                                                                    A1, A2, U1, U2, p_value);
            list_item_t *output_item = list_item_new(tid, 0, result);
            list_insert_item(output_item, output_list);
        }
        
//         LOG_DEBUG_F("[%d] after adding %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        
    } // next variant

}
Code example #2
File: rna_splice.c  Project: daviddeunaa/hpg-aligner
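/*
 * Traverses the splice AVL tree of every chromosome, packing its junctions into exact and
 * extended write batches; non-empty batches are queued on write_list_p, empty ones are freed,
 * and each tree is destroyed once processed.
 */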
void process_and_free_chromosome_avls(allocate_splice_elements_t *chromosome_avls, 
				      list_t* write_list_p, unsigned int write_size) {
  int c;
  allocate_buffers_t *allocate_batches = (allocate_buffers_t *)malloc(sizeof(allocate_buffers_t));
  write_batch_t *exact_splice_write_p;
  write_batch_t *extend_splice_write_p;

  for(c = 0; c < CHROMOSOME_NUMBER; c++){
    if(chromosome_avls[c].avl_splice->root != NULL) {
      allocate_batches->write_exact_sp  = write_batch_new(write_size, SPLICE_EXACT_FLAG);
      allocate_batches->write_extend_sp  = write_batch_new(write_size, SPLICE_EXTEND_FLAG);
      //allocate_batches->write_extend_sp  = write_batch_new(1000, SPLICE_EXTEND_FLAG);

      allocate_batches = process_avlnode_in_order(chromosome_avls[c].avl_splice->root, c, write_list_p, write_size, allocate_batches);
      
      exact_splice_write_p = allocate_batches->write_exact_sp;
      extend_splice_write_p = allocate_batches->write_extend_sp;
      
      if(exact_splice_write_p != NULL) {
        list_item_t* item_p = NULL;
        if(exact_splice_write_p->size > 0) {
          item_p = list_item_new(0, WRITE_ITEM, exact_splice_write_p);
          list_insert_item(item_p, write_list_p);
        } else {
          write_batch_free(exact_splice_write_p);
        }
      }

      if(extend_splice_write_p != NULL) {
        list_item_t* item_p = NULL;
        if(extend_splice_write_p->size > 0) {
          item_p = list_item_new(0, WRITE_ITEM, extend_splice_write_p);
          list_insert_item(item_p, write_list_p);
        } else {
          write_batch_free(extend_splice_write_p);
        }
      }
      
    }//end IF chromosome splice not NULL
    cp_avltree_destroy(chromosome_avls[c].avl_splice);
  }
  
  free(allocate_batches);

  if (statistics_on) { 
    statistics_set(TOTAL_ST, 3, total_splice, statistics_p);
  }
  
  list_decr_writers(write_list_p);
}
Code example #3
File: merge_runner.c  Project: CharoL/hpg-variant
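/*
 * Merges every position left in the hashtable (used once all input files have reached EOF),
 * queues the merged records on output_list and returns how many were produced.
 */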
int merge_remaining_interval(kh_pos_t* positions_read, vcf_file_t **files, shared_options_data_t *shared_options_data,
                              merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;

    #pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);
            
            // Launch merge
            int err_code = 0;
            vcf_record_t *merged = merge_position((vcf_record_file_link **) records_in_position->items, records_in_position->size, 
                                                  files, options_data->num_files, options_data, &err_code);
            
            if (!err_code) {
                list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                list_insert_item(item, output_list);
                num_entries += 1;
            }
            
            // Free empty nodes (lists of records in the same position)
            array_list_free(records_in_position, vcf_record_file_link_free);
            kh_del(pos, positions_read, k);
        }
    }

    return num_entries;
}
Code example #4
File: rna_splice.c  Project: daviddeunaa/hpg-aligner
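/*
 * For each splice end stored in the node, packs one exact and one extended junction record
 * into the current write batches; when a batch gets close to write_size it is queued on
 * write_list_p and a fresh one is started. Returns the (possibly replaced) batch holder.
 */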
allocate_buffers_t* process_avlnode_ends_in_order(node_element_splice_t *node, unsigned int chromosome,
					     list_t* write_list_p, unsigned int write_size, allocate_buffers_t *allocate_batches) {
  int i;
  char strand[2] = {'+', '-'};
  list_item_t* item_p = NULL;
  unsigned int bytes_exact, bytes_extend;
  // allocate_batches->write_exact_sp and ->write_extend_sp are provided pre-allocated by the caller
  //  write_batch_t* extend_splice_write_p = write_batch_new(write_size, SPLICE_EXTEND_FLAG);

  for(i = 0; i < node->number_allocate_ends; i++){
    if(( allocate_batches->write_exact_sp->size + 100) > write_size) {
      item_p = list_item_new(0, WRITE_ITEM,  allocate_batches->write_exact_sp);
      list_insert_item(item_p, write_list_p);
      allocate_batches->write_exact_sp = write_batch_new(write_size, SPLICE_EXACT_FLAG);
    } 
    if(( allocate_batches->write_extend_sp->size + 100) > write_size) {
      item_p = list_item_new(0, WRITE_ITEM,  allocate_batches->write_extend_sp);
      list_insert_item(item_p, write_list_p);
      allocate_batches->write_extend_sp = write_batch_new(write_size, SPLICE_EXTEND_FLAG);
    } 

    bytes_exact = pack_junction(chromosome, node->allocate_ends[i]->strand, 
				node->splice_start, node->allocate_ends[i]->end, 
				junction_id, node->allocate_ends[i]->reads_number, 
				&(((char *)allocate_batches->write_exact_sp->buffer_p)[allocate_batches->write_exact_sp->size]));
    
    bytes_extend = pack_junction(chromosome, node->allocate_ends[i]->strand, node->splice_start_extend, 
				 node->allocate_ends[i]->splice_end_extend, junction_id, node->allocate_ends[i]->reads_number, 
				 &(((char *)allocate_batches->write_extend_sp->buffer_p)[allocate_batches->write_extend_sp->size])); 
    
    allocate_batches->write_exact_sp->size += bytes_exact;
    allocate_batches->write_extend_sp->size += bytes_extend;
    
    total_splice += node->allocate_ends[i]->reads_number;
    junction_id++;
  }
  return allocate_batches;
  //return exact_splice_write_p;
}
Code example #5
File: xrdp_region.c  Project: harpyham/openulteo
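/*
 * Allocates a rectangle with the given bounds and inserts it at index i of the region's
 * rect list; g_malloc's second argument asks xrdp's allocator for zero-initialized memory.
 */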
int APP_CC
xrdp_region_insert_rect(struct xrdp_region* self, int i, int left,
                        int top, int right, int bottom)
{
  struct xrdp_rect* r;

  r = (struct xrdp_rect*)g_malloc(sizeof(struct xrdp_rect), 1);
  r->left = left;
  r->top = top;
  r->right = right;
  r->bottom = bottom;
  list_insert_item(self->rects, i, (long)r);
  return 0;
}
Code example #6
File: merge_runner.c  Project: CharoL/hpg-variant
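/*
 * Merges, in parallel, all hashtable positions located at or before the
 * max_chromosome_merged:max_position_merged bound, queues the merged records on output_list,
 * removes those positions from the hashtable and returns the number of records produced.
 */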
int merge_interval(kh_pos_t* positions_read, char *max_chromosome_merged, unsigned long max_position_merged,
                    char **chromosome_order, int num_chromosomes, vcf_file_t **files, 
                    shared_options_data_t *shared_options_data, merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;

    #pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);
            
            vcf_record_t *record = ((vcf_record_file_link*) array_list_get(0, records_in_position))->record;
            vcf_record_file_link **links = NULL;
            int num_links = 0;
            
            // Remove positions prior to the last chromosome:position to merge
            int cmp_chrom = compare_chromosomes(record->chromosome, max_chromosome_merged, chromosome_order, num_chromosomes);
            if (cmp_chrom < 0 || (cmp_chrom == 0 && compare_positions(record->position, max_position_merged) <= 0)) {
                links = records_in_position->items;
                num_links = records_in_position->size;
            }
            
            // Launch merge
            if (num_links > 0) {
//                 printf("links[0] = %s:%ld in file %s\n", links[0]->record->chromosome, links[0]->record->position, links[0]->file->filename);
                int err_code = 0;
                vcf_record_t *merged = merge_position(links, num_links, files, options_data->num_files, options_data, &err_code);
                
                if (!err_code) {
                    list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                    list_insert_item(item, output_list);
                    num_entries += 1;
                }
                
                // Free empty nodes (lists of records in the same position)
                array_list_free(records_in_position, vcf_record_file_link_free);
                kh_del(pos, positions_read, k);
            }
        } // End kh_exist
    }

    return num_entries;
}
Code example #7
File: merge_runner.c  Project: CharoL/hpg-variant
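/*
 * Top-level merge workflow: three OpenMP sections run concurrently (one reads text batches
 * from every input VCF, one parses and merges them through a shared hashtable, one sorts and
 * writes the merged records), connected by producer/consumer lists.
 */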
int run_merge(shared_options_data_t *shared_options_data, merge_options_data_t *options_data) {
    if (options_data->num_files == 1) {
        LOG_INFO("Just one VCF file specified, no need to merge");
        return 0;
    }
    
    list_t *read_list[options_data->num_files];
    memset(read_list, 0, options_data->num_files * sizeof(list_t*));
    list_t *output_header_list = (list_t*) malloc (sizeof(list_t));
    list_init("headers", shared_options_data->num_threads, INT_MAX, output_header_list);
    list_t *output_list = (list_t*) malloc (sizeof(list_t));
    list_init("output", shared_options_data->num_threads, shared_options_data->max_batches * shared_options_data->batch_lines, output_list);
    list_t *merge_tokens = (list_t*) malloc (sizeof(list_t));
    list_init("tokens", 1, INT_MAX, merge_tokens);
    
    int ret_code = 0;
    double start, stop, total;
    vcf_file_t *files[options_data->num_files];
    memset(files, 0, options_data->num_files * sizeof(vcf_file_t*));
    
    // Initialize variables related to the different files
    for (int i = 0; i < options_data->num_files; i++) {
        files[i] = vcf_open(options_data->input_files[i], shared_options_data->max_batches);
        if (!files[i]) {
            LOG_FATAL_F("VCF file %s does not exist!\n", options_data->input_files[i]);
        }
        
        read_list[i] = (list_t*) malloc(sizeof(list_t));
        list_init("text", 1, shared_options_data->max_batches, read_list[i]);
    }
    
    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }

    chromosome_order = get_chromosome_order(shared_options_data->host_url, shared_options_data->species,
                                            shared_options_data->version, &num_chromosomes);
    
    printf("Number of threads = %d\n", shared_options_data->num_threads);
    
#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            ret_code = vcf_multiread_batches(read_list, shared_options_data->batch_lines, files, options_data->num_files);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading VCF files\n", ret_code);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
        }
        
#pragma omp section
        {
            // Enable nested parallelism
            omp_set_nested(1);
            
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            
            int num_eof_found = 0;
            int eof_found[options_data->num_files];
            memset(eof_found, 0, options_data->num_files * sizeof(int));
            
            list_item_t *items[options_data->num_files];
            memset(items, 0, options_data->num_files * sizeof(list_item_t*));
            char *texts[options_data->num_files];
            memset(texts, 0, options_data->num_files * sizeof(char*));
            
            khash_t(pos) *positions_read = kh_init(pos);
            
            long max_position_merged = LONG_MAX;
            char *max_chromosome_merged = NULL;
            int header_merged = 0;
            int token = 0;
            
            double start_parsing, start_insertion, total_parsing = 0, total_insertion = 0;
            
            start = omp_get_wtime();

            while (num_eof_found < options_data->num_files) {
                /* Process:
                 * - N threads getting batches of VCF records and inserting them in a data structure. The common minimum 
                 * position of each group of batches will also be stored.
                 * - If the data structure reaches certain size or the end of a chromosome, merge positions prior to the 
                 * last minimum registered.
                 */
                
                // Getting text elements in a critical region guarantees that each thread gets variants in positions in the same range
                for (int i = 0; i < options_data->num_files; i++) {
                    if (eof_found[i]) {
                        continue;
                    }
                    
                    items[i] = list_remove_item(read_list[i]);
                    if (items[i] == NULL || !strcmp(items[i]->data_p, "")) {
                        LOG_INFO_F("[%d] EOF found in file %s\n", omp_get_thread_num(), options_data->input_files[i]);
                        eof_found[i] = 1;
                        num_eof_found++;
                        
                        if(items[i] != NULL && !strcmp(items[i]->data_p, "")) {
                            free(items[i]->data_p);
                            list_item_free(items[i]);
                            LOG_DEBUG_F("[%d] Text batch freed\n", omp_get_thread_num());
                        } else {
                            LOG_DEBUG_F("[%d] No need to free text batch\n", omp_get_thread_num());
                        }
                        
                        continue;
                    }
                    
                    assert(items[i]->data_p != NULL);
                    texts[i] = items[i]->data_p;
                    
//                     printf("[%d] text batch from file %d\tcontents = '%s'\n", omp_get_thread_num(), i, texts[i]);
                }
                
                for (int i = 0; i < options_data->num_files; i++) {
                    if (eof_found[i]) {
                        continue;
                    }
                    
                    start_parsing = omp_get_wtime();
                    
                    char *text_begin = texts[i];
                    char *text_end = text_begin + strlen(text_begin);
                    assert(text_end != NULL);
                    
//                     printf("batch = '%.*s'\n", text_end - text_begin, text_begin);
                    
                    // Get VCF batches from text batches
                    vcf_reader_status *status = vcf_reader_status_new(shared_options_data->batch_lines, 0);
                    ret_code = run_vcf_parser(text_begin, text_end, shared_options_data->batch_lines, files[i], status);
                    
                    if (ret_code) {
                        // TODO stop?
                        LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, files[i]->filename);
                        continue;
                    }

//                     printf("batches = %d\n", files[i]->record_batches->length);
                    vcf_batch_t *batch = fetch_vcf_batch_non_blocking(files[i]);
                    if (!batch) {
                        continue;
                    }
                    
                    total_parsing += omp_get_wtime() - start_parsing;
                    start_insertion = omp_get_wtime();
                    
                    // Insert records into hashtable
                    for (int j = 0; j < batch->records->size; j++) {
                        vcf_record_t *record = vcf_record_copy(array_list_get(j, batch->records));
                        vcf_record_file_link *link = vcf_record_file_link_new(record, files[i]);
                        char key[64];
                        compose_key_value(record->chromosome, record->position, key);
                        int ret = insert_position_read(key, link, positions_read);
                        assert(ret);
                    }
                    
                    total_insertion += omp_get_wtime() - start_insertion;
                    
                    // Update minimum position being a maximum of these batches
                    vcf_record_t *current_record = (vcf_record_t*) array_list_get(batch->records->size - 1, batch->records);
                    calculate_merge_interval(current_record, &max_chromosome_merged, &max_position_merged, chromosome_order, num_chromosomes);
                    
                    // Free batch and its contents
                    vcf_reader_status_free(status);
                    vcf_batch_free(batch);
                    list_item_free(items[i]);
                }
                
                if (num_eof_found == options_data->num_files) {
                    max_chromosome_merged = chromosome_order[num_chromosomes-1];
                    max_position_merged = LONG_MAX;
                }
                
                // Merge headers, if not previously done
                if (!header_merged) {
                    merge_vcf_headers(files, options_data->num_files, options_data, output_header_list);
                    header_merged = 1;
                    
                    // Decrease list writers count
                    for (int i = 0; i < shared_options_data->num_threads; i++) {
                        list_decr_writers(output_header_list);
                    }
                }
                
                // If the data structure reaches certain size or the end of a chromosome, 
                // merge positions prior to the last minimum registered
                if (num_eof_found < options_data->num_files && kh_size(positions_read) > TREE_LIMIT) {
                    LOG_INFO_F("Merging until position %s:%ld\n", max_chromosome_merged, max_position_merged);
                    token = merge_interval(positions_read, max_chromosome_merged, max_position_merged, chromosome_order, num_chromosomes,
                                   	   	   files, shared_options_data, options_data, output_list);
                }
                // When reaching EOF for all files, merge the remaining entries
                else if (num_eof_found == options_data->num_files && kh_size(positions_read) > 0) {
                    LOG_INFO_F("Merging remaining positions (last = %s:%ld)\n", chromosome_order[num_chromosomes - 1], LONG_MAX);
                    token = merge_remaining_interval(positions_read, files, shared_options_data, options_data, output_list);
                }
                
                if (token) {
                    int *token_ptr = malloc (sizeof(int));
                    *token_ptr = token;
                    list_item_t *item = list_item_new(1, 0, token_ptr);
                    list_insert_item(item, merge_tokens);
                }

                // Set variables ready for next iteration of the algorithm
                if (max_chromosome_merged) {
                    free(max_chromosome_merged);
                }
                token = 0;
                max_chromosome_merged = NULL;
                max_position_merged = LONG_MAX;
            }
            
            kh_destroy(pos, positions_read);
            
            stop = omp_get_wtime();

            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            LOG_DEBUG_F("** Time in parsing = %f s\n", total_parsing);
            LOG_DEBUG_F("** Time in insertion = %f s\n", total_insertion);
//             for (int i = 0; i < shared_options_data->num_threads; i++) {
//                 printf("[%d] Time in searching = %f s\n", i, total_search[i]);
//                 printf("[%d] Time in merging = %f s\n", i, total_merge[i]);
//             }
            
            // Decrease list writers count
            for (int i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
            list_decr_writers(merge_tokens);
        }
        
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num());
    
            start = omp_get_wtime();

            // Create file streams for results
            char aux_filename[32]; memset(aux_filename, 0, 32 * sizeof(char));
            sprintf(aux_filename, "merge_from_%d_files.vcf", options_data->num_files);
            
            char *merge_filename;
            FILE *merge_fd = get_output_file(shared_options_data, aux_filename, &merge_filename);
            LOG_INFO_F("Output filename = %s\n", merge_filename);
            free(merge_filename);
            
            list_item_t *item1 = NULL, *item2 = NULL;
            vcf_header_entry_t *entry;
            vcf_record_t *record;
            int *num_records;
            
            // Write headers
            while ((item1 = list_remove_item(output_header_list)) != NULL) {
                entry = item1->data_p;
                write_vcf_header_entry(entry, merge_fd);
            }
            
            // Write delimiter
            array_list_t *sample_names = merge_vcf_sample_names(files, options_data->num_files);
            write_vcf_delimiter_from_samples((char**) sample_names->items, sample_names->size, merge_fd);
            
            // Write records
            // When a token is present, it means a set of batches has been merged. The token contains the number of records merged.
            // In this case, the records must be sorted by chromosome and position, and written afterwards.
            while ((item1 = list_remove_item(merge_tokens)) != NULL) {
                num_records = item1->data_p;
                vcf_record_t *records[*num_records];
                for (int i = 0; i < *num_records; i++) {
                    item2 = list_remove_item(output_list);
                    if (!item2) {
                        break;
                    }

                    records[i] = item2->data_p;
                    list_item_free(item2);
                }

                // Sort records
                qsort(records, *num_records, sizeof(vcf_record_t*), record_cmp);

                // Write and free sorted records
                for (int i = 0; i < *num_records; i++) {
                    record = records[i];
                    write_vcf_record(record, merge_fd);
                    vcf_record_free_deep(record);
                }

                free(num_records);
                list_item_free(item1);
            }
            
            // Close file
            if (merge_fd != NULL) { fclose(merge_fd); }
            
            stop = omp_get_wtime();

            total = stop - start;

            LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
        }
    }

    // Free variables related to the different files
    for (int i = 0; i < options_data->num_files; i++) {
        if(files[i]) { vcf_close(files[i]); }
        if(read_list[i]) { free(read_list[i]); }
    }
    free(output_list);
    free(output_header_list);
    free(merge_tokens);
    
    return ret_code;
}
Code example #8
File: ped_reader.c  Project: mrG7/hpg-libs
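/*
 * Ragel-generated PED parser: the state-machine code below (bracketed by the #line directives)
 * is emitted by Ragel from ped.ragel. Its embedded actions fill ped_record_t fields and push
 * completed batches onto batches_list.
 */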
int ped_ragel_read(list_t *batches_list, size_t batch_size, ped_file_t *file)
{
    int cs;
    char *p = file->data;
    char *pe = p + file->data_len;
    char *eof = pe;
    char *ts;
    int custom_field_count = 0;

    current_batch = ped_batch_new(batch_size);

    
#line 41 "ped_reader.c"
	{
	cs = ped_start;
	}

#line 46 "ped_reader.c"
	{
	if ( p == pe )
		goto _test_eof;
	switch ( cs )
	{
case 21:
	switch( (*p) ) {
		case 10: goto st22;
		case 35: goto st16;
	}
	if ( 33 <= (*p) && (*p) <= 126 )
		goto tr36;
	goto tr0;
tr0:
#line 53 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'family' field\n", lines + 1, file->filename);
    }
	goto st0;
tr3:
#line 65 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'individual' field\n", lines + 1, file->filename);
    }
	goto st0;
tr7:
#line 77 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'father' field\n", lines + 1, file->filename);
    }
	goto st0;
tr11:
#line 89 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'mother' field\n", lines + 1, file->filename);
    }
	goto st0;
tr15:
#line 109 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'sex' field\n", lines + 1, file->filename);
    }
	goto st0;
tr19:
#line 124 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'phenotype' field\n", lines + 1, file->filename);
    }
	goto st0;
tr26:
#line 141 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'header' field\n", lines + 1, file->filename);
    }
	goto st0;
tr44:
#line 161 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in '%s' field\n", lines + 1, file->filename, current_record->custom_field);
    }
	goto st0;
#line 108 "ped_reader.c"
st0:
cs = 0;
	goto _out;
st22:
	if ( ++p == pe )
		goto _test_eof22;
case 22:
	if ( (*p) == 10 )
		goto st22;
	goto st0;
tr36:
#line 22 "ped.ragel"
	{
        current_record = create_ped_record();
        genotype = 0;
    }
#line 45 "ped.ragel"
	{
        ts = p;
    }
	goto st1;
st1:
	if ( ++p == pe )
		goto _test_eof1;
case 1:
#line 134 "ped_reader.c"
	if ( (*p) == 9 )
		goto tr1;
	if ( 33 <= (*p) && (*p) <= 126 )
		goto st1;
	goto tr0;
tr1:
#line 49 "ped.ragel"
	{
        set_ped_record_family_id(strndup(ts, p-ts), current_record);
    }
	goto st2;
st2:
	if ( ++p == pe )
		goto _test_eof2;
case 2:
#line 150 "ped_reader.c"
	if ( (*p) == 95 )
		goto tr4;
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr4;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr4;
	} else
		goto tr4;
	goto tr3;
tr4:
#line 57 "ped.ragel"
	{
        ts = p;
    }
	goto st3;
st3:
	if ( ++p == pe )
		goto _test_eof3;
case 3:
#line 172 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr5;
		case 95: goto st3;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto st3;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto st3;
	} else
		goto st3;
	goto tr3;
tr5:
#line 61 "ped.ragel"
	{
        set_ped_record_individual_id(strndup(ts, p-ts), current_record);
    }
	goto st4;
st4:
	if ( ++p == pe )
		goto _test_eof4;
case 4:
#line 196 "ped_reader.c"
	switch( (*p) ) {
		case 46: goto tr8;
		case 95: goto tr9;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr9;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr9;
	} else
		goto tr9;
	goto tr7;
tr8:
#line 69 "ped.ragel"
	{
        ts = p;
    }
	goto st5;
st5:
	if ( ++p == pe )
		goto _test_eof5;
case 5:
#line 220 "ped_reader.c"
	if ( (*p) == 9 )
		goto tr10;
	goto tr7;
tr10:
#line 73 "ped.ragel"
	{
        set_ped_record_father_id(strndup(ts, p-ts), current_record);
    }
	goto st6;
st6:
	if ( ++p == pe )
		goto _test_eof6;
case 6:
#line 234 "ped_reader.c"
	switch( (*p) ) {
		case 46: goto tr12;
		case 95: goto tr13;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr13;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr13;
	} else
		goto tr13;
	goto tr11;
tr12:
#line 81 "ped.ragel"
	{
        ts = p;
    }
	goto st7;
st7:
	if ( ++p == pe )
		goto _test_eof7;
case 7:
#line 258 "ped_reader.c"
	if ( (*p) == 9 )
		goto tr14;
	goto tr11;
tr14:
#line 85 "ped.ragel"
	{
        set_ped_record_mother_id(strndup(ts, p-ts), current_record);
    }
	goto st8;
st8:
	if ( ++p == pe )
		goto _test_eof8;
case 8:
#line 272 "ped_reader.c"
	if ( (*p) == 46 )
		goto tr16;
	if ( 48 <= (*p) && (*p) <= 57 )
		goto tr17;
	goto tr15;
tr16:
#line 93 "ped.ragel"
	{
        ts = p;
    }
	goto st9;
st9:
	if ( ++p == pe )
		goto _test_eof9;
case 9:
#line 288 "ped_reader.c"
	if ( (*p) == 9 )
		goto tr18;
	goto tr15;
tr18:
#line 97 "ped.ragel"
	{
        char *field = strndup(ts, p-ts);
        enum Sex sex = UNKNOWN_SEX;
        if (atoi(field) == 1) {
            sex = MALE;
        } else if (atoi(field) == 2) {
            sex = FEMALE;
        }
        set_ped_record_sex(sex, current_record);
        free(field);    // Not set as ped_record_t variable -> not freed later
    }
	goto st10;
st10:
	if ( ++p == pe )
		goto _test_eof10;
case 10:
#line 310 "ped_reader.c"
	switch( (*p) ) {
		case 32: goto tr20;
		case 95: goto tr20;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr20;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr20;
	} else
		goto tr20;
	goto tr19;
tr20:
#line 113 "ped.ragel"
	{
        ts = p;
    }
	goto st23;
tr42:
#line 117 "ped.ragel"
	{
        if (strncmp(".", ts, 1)) {
            char *field = strndup(ts, p-ts);
            set_ped_record_phenotype(field, current_record, file);
        }
    }
#line 145 "ped.ragel"
	{
        custom_field_count = 6;
    }
	goto st23;
st23:
	if ( ++p == pe )
		goto _test_eof23;
case 23:
#line 347 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr39;
		case 10: goto tr40;
		case 32: goto tr42;
		case 95: goto st23;
	}
	if ( (*p) < 48 ) {
		if ( 11 <= (*p) && (*p) <= 13 )
			goto tr41;
	} else if ( (*p) > 57 ) {
		if ( (*p) > 90 ) {
			if ( 97 <= (*p) && (*p) <= 122 )
				goto st23;
		} else if ( (*p) >= 65 )
			goto st23;
	} else
		goto st23;
	goto tr19;
tr39:
#line 117 "ped.ragel"
	{
        if (strncmp(".", ts, 1)) {
            char *field = strndup(ts, p-ts);
            set_ped_record_phenotype(field, current_record, file);
        }
    }
#line 145 "ped.ragel"
	{
        custom_field_count = 6;
    }
	goto st24;
tr49:
#line 153 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (custom_field_count == file->num_field) {
            set_ped_record_custom_field(field_name, current_record, file);
        }
    }
	goto st24;
st24:
	if ( ++p == pe )
		goto _test_eof24;
case 24:
#line 393 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto st24;
		case 10: goto tr46;
	}
	if ( (*p) > 13 ) {
		if ( 32 <= (*p) && (*p) <= 126 )
			goto tr48;
	} else if ( (*p) >= 11 )
		goto st26;
	goto tr44;
tr40:
#line 117 "ped.ragel"
	{
        if (strncmp(".", ts, 1)) {
            char *field = strndup(ts, p-ts);
            set_ped_record_phenotype(field, current_record, file);
        }
    }
#line 145 "ped.ragel"
	{
        custom_field_count = 6;
    }
#line 27 "ped.ragel"
	{
        // If batch is full, add to the list of batches and create a new, empty one
        if (ped_batch_is_full(current_batch))
        {
            list_item_t *item = list_item_new(num_records, 1, current_batch);
            list_insert_item(item, batches_list);
            LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length);
            current_batch = ped_batch_new(batch_size);
        }

        // Add current record to current batch
        if (current_record) {
            add_record_to_ped_batch(current_record, current_batch);
            num_records++;
        }
        current_record = NULL;
    }
#line 18 "ped.ragel"
	{
        lines++;
    }
	goto st25;
tr46:
#line 27 "ped.ragel"
	{
        // If batch is full, add to the list of batches and create a new, empty one
        if (ped_batch_is_full(current_batch))
        {
            list_item_t *item = list_item_new(num_records, 1, current_batch);
            list_insert_item(item, batches_list);
            LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length);
            current_batch = ped_batch_new(batch_size);
        }

        // Add current record to current batch
        if (current_record) {
            add_record_to_ped_batch(current_record, current_batch);
            num_records++;
        }
        current_record = NULL;
    }
#line 18 "ped.ragel"
	{
        lines++;
    }
	goto st25;
tr50:
#line 153 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (custom_field_count == file->num_field) {
            set_ped_record_custom_field(field_name, current_record, file);
        }
    }
#line 27 "ped.ragel"
	{
        // If batch is full, add to the list of batches and create a new, empty one
        if (ped_batch_is_full(current_batch))
        {
            list_item_t *item = list_item_new(num_records, 1, current_batch);
            list_insert_item(item, batches_list);
            LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length);
            current_batch = ped_batch_new(batch_size);
        }

        // Add current record to current batch
        if (current_record) {
            add_record_to_ped_batch(current_record, current_batch);
            num_records++;
        }
        current_record = NULL;
    }
#line 18 "ped.ragel"
	{
        lines++;
    }
	goto st25;
st25:
	if ( ++p == pe )
		goto _test_eof25;
case 25:
#line 499 "ped_reader.c"
	switch( (*p) ) {
		case 10: goto tr46;
		case 32: goto st26;
	}
	if ( (*p) < 33 ) {
		if ( 9 <= (*p) && (*p) <= 13 )
			goto st26;
	} else if ( (*p) > 34 ) {
		if ( 36 <= (*p) && (*p) <= 126 )
			goto tr36;
	} else
		goto tr36;
	goto tr0;
tr41:
#line 117 "ped.ragel"
	{
        if (strncmp(".", ts, 1)) {
            char *field = strndup(ts, p-ts);
            set_ped_record_phenotype(field, current_record, file);
        }
    }
#line 145 "ped.ragel"
	{
        custom_field_count = 6;
    }
	goto st26;
tr51:
#line 153 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (custom_field_count == file->num_field) {
            set_ped_record_custom_field(field_name, current_record, file);
        }
    }
	goto st26;
st26:
	if ( ++p == pe )
		goto _test_eof26;
case 26:
#line 540 "ped_reader.c"
	switch( (*p) ) {
		case 10: goto tr46;
		case 32: goto st26;
	}
	if ( 9 <= (*p) && (*p) <= 13 )
		goto st26;
	goto st0;
tr48:
#line 149 "ped.ragel"
	{
        ts = p;
    }
	goto st27;
tr52:
#line 153 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (custom_field_count == file->num_field) {
            set_ped_record_custom_field(field_name, current_record, file);
        }
    }
	goto st27;
st27:
	if ( ++p == pe )
		goto _test_eof27;
case 27:
#line 568 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr49;
		case 10: goto tr50;
		case 32: goto tr52;
	}
	if ( (*p) > 13 ) {
		if ( 33 <= (*p) && (*p) <= 126 )
			goto st27;
	} else if ( (*p) >= 11 )
		goto tr51;
	goto tr44;
tr17:
#line 93 "ped.ragel"
	{
        ts = p;
    }
	goto st11;
st11:
	if ( ++p == pe )
		goto _test_eof11;
case 11:
#line 590 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr18;
		case 46: goto st12;
	}
	if ( 48 <= (*p) && (*p) <= 57 )
		goto st11;
	goto tr15;
st12:
	if ( ++p == pe )
		goto _test_eof12;
case 12:
	if ( 48 <= (*p) && (*p) <= 57 )
		goto st13;
	goto tr15;
st13:
	if ( ++p == pe )
		goto _test_eof13;
case 13:
	if ( (*p) == 9 )
		goto tr18;
	if ( 48 <= (*p) && (*p) <= 57 )
		goto st13;
	goto tr15;
tr13:
#line 81 "ped.ragel"
	{
        ts = p;
    }
	goto st14;
st14:
	if ( ++p == pe )
		goto _test_eof14;
case 14:
#line 624 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr14;
		case 95: goto st14;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto st14;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto st14;
	} else
		goto st14;
	goto tr11;
tr9:
#line 69 "ped.ragel"
	{
        ts = p;
    }
	goto st15;
st15:
	if ( ++p == pe )
		goto _test_eof15;
case 15:
#line 648 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr10;
		case 95: goto st15;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto st15;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto st15;
	} else
		goto st15;
	goto tr7;
st16:
	if ( ++p == pe )
		goto _test_eof16;
case 16:
	switch( (*p) ) {
		case 9: goto st17;
		case 32: goto tr28;
		case 95: goto tr29;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr29;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr29;
	} else
		goto tr29;
	goto tr26;
st17:
	if ( ++p == pe )
		goto _test_eof17;
case 17:
	switch( (*p) ) {
		case 32: goto tr29;
		case 95: goto tr29;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr29;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr29;
	} else
		goto tr29;
	goto tr26;
tr29:
#line 128 "ped.ragel"
	{
        ts = p;
    }
	goto st18;
st18:
	if ( ++p == pe )
		goto _test_eof18;
case 18:
#line 707 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr30;
		case 10: goto tr31;
		case 32: goto st18;
		case 95: goto st18;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto st18;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto st18;
	} else
		goto st18;
	goto tr26;
tr30:
#line 132 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (file->variable_field && !strcmp(field_name, file->variable_field)) {
            file->num_field = custom_field_count;
        }
        free(field_name);
    }
	goto st19;
st19:
	if ( ++p == pe )
		goto _test_eof19;
case 19:
#line 738 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto st19;
		case 10: goto st28;
		case 32: goto tr29;
		case 95: goto tr29;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr29;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr29;
	} else
		goto tr29;
	goto tr26;
tr31:
#line 132 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (file->variable_field && !strcmp(field_name, file->variable_field)) {
            file->num_field = custom_field_count;
        }
        free(field_name);
    }
	goto st28;
st28:
	if ( ++p == pe )
		goto _test_eof28;
case 28:
#line 769 "ped_reader.c"
	if ( (*p) == 10 )
		goto st22;
	if ( (*p) > 34 ) {
		if ( 36 <= (*p) && (*p) <= 126 )
			goto tr36;
	} else if ( (*p) >= 33 )
		goto tr36;
	goto tr0;
tr28:
#line 128 "ped.ragel"
	{
        ts = p;
    }
	goto st20;
st20:
	if ( ++p == pe )
		goto _test_eof20;
case 20:
#line 788 "ped_reader.c"
	switch( (*p) ) {
		case 9: goto tr30;
		case 10: goto tr31;
		case 32: goto tr29;
		case 95: goto tr29;
	}
	if ( (*p) < 65 ) {
		if ( 48 <= (*p) && (*p) <= 57 )
			goto tr29;
	} else if ( (*p) > 90 ) {
		if ( 97 <= (*p) && (*p) <= 122 )
			goto tr29;
	} else
		goto tr29;
	goto tr26;
	}
	_test_eof22: cs = 22; goto _test_eof; 
	_test_eof1: cs = 1; goto _test_eof; 
	_test_eof2: cs = 2; goto _test_eof; 
	_test_eof3: cs = 3; goto _test_eof; 
	_test_eof4: cs = 4; goto _test_eof; 
	_test_eof5: cs = 5; goto _test_eof; 
	_test_eof6: cs = 6; goto _test_eof; 
	_test_eof7: cs = 7; goto _test_eof; 
	_test_eof8: cs = 8; goto _test_eof; 
	_test_eof9: cs = 9; goto _test_eof; 
	_test_eof10: cs = 10; goto _test_eof; 
	_test_eof23: cs = 23; goto _test_eof; 
	_test_eof24: cs = 24; goto _test_eof; 
	_test_eof25: cs = 25; goto _test_eof; 
	_test_eof26: cs = 26; goto _test_eof; 
	_test_eof27: cs = 27; goto _test_eof; 
	_test_eof11: cs = 11; goto _test_eof; 
	_test_eof12: cs = 12; goto _test_eof; 
	_test_eof13: cs = 13; goto _test_eof; 
	_test_eof14: cs = 14; goto _test_eof; 
	_test_eof15: cs = 15; goto _test_eof; 
	_test_eof16: cs = 16; goto _test_eof; 
	_test_eof17: cs = 17; goto _test_eof; 
	_test_eof18: cs = 18; goto _test_eof; 
	_test_eof19: cs = 19; goto _test_eof; 
	_test_eof28: cs = 28; goto _test_eof; 
	_test_eof20: cs = 20; goto _test_eof; 

	_test_eof: {}
	if ( p == eof )
	{
	switch ( cs ) {
	case 24: 
	case 25: 
	case 26: 
#line 27 "ped.ragel"
	{
        // If batch is full, add to the list of batches and create a new, empty one
        if (ped_batch_is_full(current_batch))
        {
            list_item_t *item = list_item_new(num_records, 1, current_batch);
            list_insert_item(item, batches_list);
            LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length);
            current_batch = ped_batch_new(batch_size);
        }

        // Add current record to current batch
        if (current_record) {
            add_record_to_ped_batch(current_record, current_batch);
            num_records++;
        }
        current_record = NULL;
    }
	break;
	case 1: 
#line 53 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'family' field\n", lines + 1, file->filename);
    }
	break;
	case 2: 
	case 3: 
#line 65 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'individual' field\n", lines + 1, file->filename);
    }
	break;
	case 4: 
	case 5: 
	case 15: 
#line 77 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'father' field\n", lines + 1, file->filename);
    }
	break;
	case 6: 
	case 7: 
	case 14: 
#line 89 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'mother' field\n", lines + 1, file->filename);
    }
	break;
	case 8: 
	case 9: 
	case 11: 
	case 12: 
	case 13: 
#line 109 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'sex' field\n", lines + 1, file->filename);
    }
	break;
	case 10: 
#line 124 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'phenotype' field\n", lines + 1, file->filename);
    }
	break;
	case 16: 
	case 17: 
	case 18: 
	case 19: 
	case 20: 
#line 141 "ped.ragel"
	{
        LOG_ERROR_F("Line %zu (%s): Error in 'header' field\n", lines + 1, file->filename);
    }
	break;
	case 27: 
#line 153 "ped.ragel"
	{
        char* field_name = strndup(ts, p-ts);
        custom_field_count++;
        if (custom_field_count == file->num_field) {
            set_ped_record_custom_field(field_name, current_record, file);
        }
    }
#line 27 "ped.ragel"
	{
        // If batch is full, add to the list of batches and create a new, empty one
        if (ped_batch_is_full(current_batch))
        {
            list_item_t *item = list_item_new(num_records, 1, current_batch);
            list_insert_item(item, batches_list);
            LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length);
            current_batch = ped_batch_new(batch_size);
        }

        // Add current record to current batch
        if (current_record) {
            add_record_to_ped_batch(current_record, current_batch);
            num_records++;
        }
        current_record = NULL;
    }
	break;
	case 23: 
#line 117 "ped.ragel"
	{
        if (strncmp(".", ts, 1)) {
            char *field = strndup(ts, p-ts);
            set_ped_record_phenotype(field, current_record, file);
        }
    }
#line 145 "ped.ragel"
	{
        custom_field_count = 6;
    }
#line 27 "ped.ragel"
	{
        // If batch is full, add to the list of batches and create a new, empty one
        if (ped_batch_is_full(current_batch))
        {
            list_item_t *item = list_item_new(num_records, 1, current_batch);
            list_insert_item(item, batches_list);
            LOG_DEBUG_F("Batch added - %zu records\n", current_batch->length);
            current_batch = ped_batch_new(batch_size);
        }

        // Add current record to current batch
        if (current_record) {
            add_record_to_ped_batch(current_record, current_batch);
            num_records++;
        }
        current_record = NULL;
    }
	break;
#line 973 "ped_reader.c"
	}
	}

	_out: {}
	}

#line 221 "ped.ragel"
 

    // Insert the last batch
    if (!ped_batch_is_empty(current_batch))
    {
        list_item_t *item = list_item_new(num_records, 1, current_batch); 
        list_insert_item(item, batches_list);
        LOG_DEBUG_F("Batch added - %zu records (last)\n", current_batch->length);
    }

    if ( cs < 
#line 992 "ped_reader.c"
21
#line 231 "ped.ragel"
 ) 
    {
        LOG_ERROR("The file was not successfully read\n");
        LOG_INFO_F("Last state is %d, but %d was expected\n", 
                cs, 
#line 1000 "ped_reader.c"
21
#line 235 "ped.ragel"
);
    } 

    LOG_INFO_F("PED records read = %zu\n", num_records);

    return cs < 
#line 1009 "ped_reader.c"
21
#line 240 "ped.ragel"
;
}
Code example #9
int add_vcf_batch(vcf_batch_t *batch, vcf_file_t *file) {
    assert(batch);
    assert(file);
    list_item_t *item = list_item_new(rand() % 1000, 1, batch); 
    list_insert_item(item, file->record_batches);
    return 0;   // assumption: 0 indicates success (the function is declared int)
}
Code example #10
File: effect_runner.c  Project: abushoy/hpg-variant
static void parse_mutation_phenotype_response(int tid, list_t *output_list) {
    list_item_t *output_item = list_item_new(tid, MUTATION_PHENOTYPE, trim(strdup(mutation_line[tid])));
    list_insert_item(output_item, output_list);
}
Code example #11
File: effect_runner.c  Project: abushoy/hpg-variant
static void parse_snp_phenotype_response(int tid, list_t *output_list) {
    list_item_t *output_item = list_item_new(tid, SNP_PHENOTYPE, trim(strdup(snp_line[tid])));
    list_insert_item(output_item, output_list);
}
Code example #12
File: effect_runner.c  Project: abushoy/hpg-variant
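/*
 * Parses one batch of the effect web-service response: each tab-separated line is checked for
 * its SO consequence-type code, an output file per consequence type is opened lazily inside a
 * critical section, summary counters are updated and the line is queued on output_list.
 */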
static void parse_effect_response(int tid, char *output_directory, size_t output_directory_len, cp_hashtable *output_files, 
                                  list_t *output_list, cp_hashtable *summary_count, cp_hashtable *gene_list) {
    int *SO_found = (int*) malloc (sizeof(int)); // SO code found on the current line (0 when absent or invalid)
    int *count;
    char tmp_consequence_type[128];
    
    int num_lines;
    char **split_batch = split(effect_line[tid], "\n", &num_lines);
    
    for (int i = 0; i < num_lines; i++) {
        int num_columns;
        char *copy_buf = strdup(split_batch[i]);
        char **split_result = split(copy_buf, "\t", &num_columns);
        free(copy_buf);
        
        // Find consequence type name (always after SO field)
        *SO_found = 0;
        if (num_columns == 25) {
//             LOG_DEBUG_F("gene = %s\tSO = %d\tCT = %s\n", split_result[17], atoi(split_result[18] + 3), split_result[19]);
            if (!cp_hashtable_contains(gene_list, split_result[17])) {
                cp_hashtable_put(gene_list, strdup(split_result[17]), NULL);
            }
            *SO_found = atoi(split_result[18] + 3);
            memset(tmp_consequence_type, 0, 128 * sizeof(char));
            strncat(tmp_consequence_type, split_result[19], strlen(split_result[19]));
        } else {
            if (strlen(split_batch[i]) == 0) { // Last line in batch could be only a newline
                for (int s = 0; s < num_columns; s++) {
                    free(split_result[s]);
                }
                free(split_result);
                continue;
            }
            
            LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_columns, split_batch[i]);
            
            for (int s = 0; s < num_columns; s++) {
                free(split_result[s]);
            }
            free(split_result);
            continue;
        }
        
        for (int s = 0; s < num_columns; s++) {
            free(split_result[s]);
        }
        free(split_result);
        
        if (!*SO_found) { // SO:000000 is not valid
            LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid);
            continue;
        }

//         LOG_DEBUG_F("[%d] SO found = %d\n", tid, *SO_found);
        size_t consequence_type_len = strlen(tmp_consequence_type);
     
        // If file does not exist, create its descriptor and summary counter
        FILE *aux_file = cp_hashtable_get(output_files, SO_found);
        if (!aux_file) {
#pragma omp critical
            {
                // This construction avoids 2 threads trying to insert the same CT
                aux_file = cp_hashtable_get(output_files, SO_found);
                if (!aux_file) {
                    char filename[output_directory_len + consequence_type_len + 6];
                    memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char));
                    strncat(filename, output_directory, output_directory_len);
                    strncat(filename, "/", 1);
                    strncat(filename, tmp_consequence_type, consequence_type_len);
                    strncat(filename, ".txt", 4);
                    aux_file = fopen(filename, "a");
                    
                    // Add to hashtables (file descriptors and summary counters)
                    int *SO_stored = (int*) malloc (sizeof(int));
                    *SO_stored = *SO_found;
                    cp_hashtable_put(output_files, SO_stored, aux_file);

                    LOG_INFO_F("[%d] New consequence type found = %s\n", tid, tmp_consequence_type);
                }
            }
        }
        
        // Write line[tid] to file corresponding to its consequence type
        if (aux_file) { 
#pragma omp critical
            {
                // TODO move critical one level below?
                count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type);
                if (count == NULL) {
                    char *consequence_type = (char*) calloc (consequence_type_len+1, sizeof(char));
                    strncat(consequence_type, tmp_consequence_type, consequence_type_len);
                    assert(!strcmp(consequence_type, tmp_consequence_type));
                    count = (int*) malloc (sizeof(int));
                    *count = 0;
                    cp_hashtable_put(summary_count, consequence_type, count);
//                     LOG_DEBUG_F("[%d] Initialized summary count for %s\n", tmp_consequence_type);
                }
                // Increment counter for summary
                (*count)++;
            }
            
//             LOG_DEBUG_F("[%d] before writing %s\n", tid, tmp_consequence_type);
            list_item_t *output_item = list_item_new(tid, *SO_found, strdup(split_batch[i]));
            list_insert_item(output_item, output_list);
//             LOG_DEBUG_F("[%d] after writing %s\n", tid, tmp_consequence_type);
        }
    }
    
    for (int i = 0; i < num_lines; i++) {
        free(split_batch[i]);
    }
    free(split_batch);
    free(SO_found);    // release the scratch SO code holder (only separate copies of it are stored in the hashtable)
}
Code example #13
File: tdt.c  Project: CharoL/hpg-variant
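/*
 * Transmission disequilibrium test: for every variant, counts alleles transmitted and
 * untransmitted from heterozygous parents to affected children across all nuclear families,
 * skipping missing genotypes and Mendelian errors.
 */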
int tdt_test(vcf_record_t **variants, int num_variants, family_t **families, int num_families, cp_hashtable *sample_ids, list_t *output_list) {
    double start = omp_get_wtime();
    
    int ret_code = 0;
    int tid = omp_get_thread_num();
    int num_samples = cp_hashtable_count(sample_ids);
    
    tdt_result_t *result;
    char **sample_data;
    
    int gt_position;
    int father_allele1, father_allele2;
    int mother_allele1, mother_allele2;
    int child_allele1, child_allele2;

    ///////////////////////////////////
    // Perform analysis for each variant

    vcf_record_t *record;
    for (int i = 0; i < num_variants; i++) {
        record = variants[i];
        LOG_DEBUG_F("[%d] Checking variant %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        
        sample_data = (char**) record->samples->items;
        // Duplicate the format field to locate the GT sub-field, then release the copy (avoids leaking it)
        char *format_dup = strndup(record->format, record->format_len);
        gt_position = get_field_position_in_format("GT", format_dup);
        free(format_dup);
    
        // Transmission counts
        int t1 = 0;
        int t2 = 0;
        
        
        // Count over families
        family_t *family;
        for (int f = 0; f < num_families; f++) {
            family = families[f];
            individual_t *father = family->father;
            individual_t *mother = family->mother;
            cp_list *children = family->children;

//           LOG_DEBUG_F("[%d] Checking suitability of family %s\n", tid, family->id);
            
            if (father == NULL || mother == NULL) {
                continue;
            }

            int *father_pos = cp_hashtable_get(sample_ids, father->id);
            if (father_pos != NULL) {
    //           LOG_DEBUG_F("[%d] Father %s is in position %d\n", tid, father->id, *father_pos);
            } else {
    //           LOG_DEBUG_F("[%d] Father %s is not positioned\n", tid, father->id);
                continue;
            }
            
            int *mother_pos = cp_hashtable_get(sample_ids, mother->id);
            if (mother_pos != NULL) {
    //           LOG_DEBUG_F("[%d] Mother %s is in position %d\n", tid, mother->id, *mother_pos);
            } else {
    //           LOG_DEBUG_F("[%d] Mother %s is not positioned\n", tid, mother->id);
                continue;
            }
            
            char *father_sample = strdup(sample_data[*father_pos]);
            char *mother_sample = strdup(sample_data[*mother_pos]);
            
//           LOG_DEBUG_F("[%d] Samples: Father = %s\tMother = %s\n", tid, father_sample, mother_sample);
            
            // If any parent's alleles can't be read or is missing, go to next family
            if (get_alleles(father_sample, gt_position, &father_allele1, &father_allele2) ||
                get_alleles(mother_sample, gt_position, &mother_allele1, &mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            
//           LOG_DEBUG_F("[%d] Alleles: Father = %d/%d\tMother = %d/%d\n", tid, father_allele1, father_allele2, mother_allele1, mother_allele2);
            
            // We need two genotyped parents, with at least one het
            if (father_allele1 == father_allele2 && mother_allele1 == mother_allele2) {
                free(father_sample);
                free(mother_sample);
                continue;
            }
            
            if ((father_allele1 && !father_allele2) || (mother_allele1 && !mother_allele2)) {
                free(father_sample);
                free(mother_sample);
                continue;
            }

//           LOG_DEBUG_F("[%d] Proceeding to analyse family %s...\n", tid, family->id);

            
            int trA = 0;  // transmitted allele from first het parent
            int unA = 0;  // untransmitted allele from first het parent
            
            int trB = 0;  // transmitted allele from second het parent
            int unB = 0;  // untransmitted allele from second het parent
            
            // Consider all offspring in nuclear family
            cp_list_iterator *children_iterator = cp_list_create_iterator(family->children, COLLECTION_LOCK_READ);
            individual_t *child = NULL;
            while ((child = cp_list_iterator_next(children_iterator)) != NULL) {
                // Only consider affected children
                if (child->condition != AFFECTED) { continue; }
                
                int *child_pos = cp_hashtable_get(sample_ids, child->id);
                if (child_pos != NULL) {
        //           LOG_DEBUG_F("[%d] Child %s is in position %d\n", tid, child->id, *child_pos);
                } else {
        //           LOG_DEBUG_F("[%d] Child %s is not positioned\n", tid, child->id);
                    continue;
                }
                
                char *child_sample = strdup(sample_data[*child_pos]);
    //           LOG_DEBUG_F("[%d] Samples: Child = %s\n", tid, child_sample);
                
                // Skip if offspring has missing genotype
                if (get_alleles(child_sample, gt_position, &child_allele1, &child_allele2)) {
                    free(child_sample);
                    continue;
                }
                
                // Exclude mendelian errors
                char *aux_chromosome = strndup(record->chromosome, record->chromosome_len);
                if (check_mendel(aux_chromosome, father_allele1, father_allele2, mother_allele1, mother_allele2, 
                    child_allele1, child_allele2, child->sex)) {
                    free(child_sample);
                    free(aux_chromosome);
                    continue;
                }
                free(aux_chromosome);
                
                
                // We've now established: no missing genotypes
                // and at least one heterozygous parent

                if (!child_allele1 && !child_allele2) {
                    // Kid is 0/0
                    if (((!father_allele1) && father_allele2) &&
                        ((!mother_allele1) && mother_allele2)) {
                        trA = 1; unA = 2; trB = 1; unB = 2;
                    } else {
                        trA = 1; unA = 2;
                    }
                } else if ((!child_allele1) && child_allele2) {
                    // Kid is 0/1
                    if (father_allele1 != father_allele2) {
                        // Heterozygous father
                        if (mother_allele1 != mother_allele2) {
                            // Heterozygous mother
                            trA = 1; trB = 2; unA = 2; unB = 1;
                        } else if (!mother_allele1) {
                            trA = 2; unA = 1;
                        } else {
                            trA = 1; unA = 2;
                        }
                    } else if (!father_allele1) {
                        trA = 2; unA = 1;
                    } else {
                        trA = 1; unA = 2;
                    }
                } else {
                    // Kid is 1/1
                    if (((!father_allele1) && father_allele2) &&
                        ((!mother_allele1) && mother_allele2)) {
                        trA = 2; unA = 1; trB = 2; unB = 1;
                    } else {
                        trA = 2; unA = 1;
                    }
                }
                
                // We have now populated trA (first transmission) 
                // and possibly trB also 
                
                ////////////////////////////////////////
                // Permutation? 50:50 flip (precomputed)
                
//                 if (permute) {
//                     if (flipA[f])
//                     {
//                     int t = trA;
//                     trA = unA;
//                     unA = t;
//                     
//                     t = trB;
//                     trB = unB;
//                     unB = t;
//                     }
//                 }
                
                // Increment transmission counts
                if (trA==1) { t1++; }
                else if (trA==2) { t2++; }
                
                if (trB==1) { t1++; }
                else if (trB==2) { t2++; }
                
//     //           LOG_DEBUG_F("TDT\t%.*s %s : %d %d - %d %d - %d %d - F %d/%d - M %d/%d - C %d/%d\n", 
//                             record->id_len, record->id, family->id, trA, unA, trB, unB, t1, t2, 
//                             father_allele1, father_allele2, mother_allele1, mother_allele2, child_allele1, child_allele2);
                free(child_sample);
                
            } // next offspring in family
            
            cp_list_iterator_destroy(children_iterator);
            free(father_sample);
            free(mother_sample);
        }  // next nuclear family

        /////////////////////////////
        // Finished counting: now compute
        // the statistics
        
        double tdt_chisq = -1;
        
        // Basic TDT test
        if (t1+t2 > 0) {
            tdt_chisq = ((double) ((t1-t2) * (t1-t2))) / (t1+t2);
        }
        
//         LOG_DEBUG_F("[%d] before adding %s:%ld\n", tid, record->chromosome, record->position);
        result = tdt_result_new(record->chromosome, record->chromosome_len, 
                                record->position, 
                                record->reference, record->reference_len, 
                                record->alternate, record->alternate_len,
                                t1, t2, tdt_chisq);
        list_item_t *output_item = list_item_new(tid, 0, result);
        list_insert_item(output_item, output_list);
//         LOG_DEBUG_F("[%d] after adding %.*s:%ld\n", tid, record->chromosome_len, record->chromosome, record->position);
        
    } // next variant

    double end = omp_get_wtime();
    
    return ret_code;
}
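The loop above implements the classic TDT statistic: t1 and t2 count the reference and alternate alleles transmitted from heterozygous parents to affected children, and the chi-square value is (t1 - t2)^2 / (t1 + t2) with one degree of freedom. A minimal stand-alone sketch of just that calculation (the helper name is hypothetical, not part of hpg-variant):

/* Hypothetical helper: basic TDT chi-square from transmission counts.
 * Returns -1 when there were no informative transmissions, mirroring
 * the sentinel value used in the loop above. */
static double tdt_chi_square(int t1, int t2) {
    if (t1 + t2 == 0) {
        return -1.0;
    }
    double diff = (double) (t1 - t2);
    return (diff * diff) / (t1 + t2);
}

/* With 1 degree of freedom, a p-value could then be obtained from any
 * chi-square survival function, e.g. gsl_cdf_chisq_Q(chisq, 1) if GSL
 * is available. */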
Code example #14
0
File: effect_runner.c Project: CharoL/hpg-variant
static size_t write_mutation_phenotype_ws_results(char *contents, size_t size, size_t nmemb, void *userdata) {
    int tid = omp_get_thread_num();
    
    int i = 0;
    int data_read_len = 0, next_line_len = 0;
    // Whether the buffer ended in the middle of a line (only partially read)
    int premature_end = 0;
    
    size_t realsize = size * nmemb;
    
    char *data = contents;
    char *output_text;
    
    
//     LOG_DEBUG_F("Mutation phenotype WS invoked, response size = %zu bytes -> %s\n", realsize, data);
    
    while (data_read_len < realsize) {
        assert((mutation_line + tid) != NULL);
        assert((mutation_max_line_size + tid) != NULL);
        
//         LOG_DEBUG_F("[%d] loop iteration #%d\n", tid, i);
        // Get length of data to copy
        next_line_len = strcspn(data, "\n");
        
        // If the mutation_line[tid] is too long for the current buffers, reallocate a little more than the needed memory
        if (strlen(mutation_line[tid]) + next_line_len + 1 > mutation_max_line_size[tid]) {
//             LOG_DEBUG_F("Line too long (%d elements, but %zu needed) in batch #%d\n", 
//                         mutation_max_line_size[tid], strlen(mutation_line[tid]) + next_line_len, batch_num);
//             char *out_buf = (char*) calloc (next_line_len+1, sizeof(char));
//             snprintf(out_buf, next_line_len, "%s", data);
//             LOG_INFO_F("[%d] too big data is: '%s'\n", tid, out_buf);
            char *aux_1 = (char*) realloc (mutation_line[tid], (mutation_max_line_size[tid] + next_line_len + 1) * sizeof(char));
            char *aux_2 = (char*) realloc (mutation_output_line[tid], (mutation_max_line_size[tid] + next_line_len + 1) * sizeof(char));
            
            if (!aux_1 || !aux_2) {
                LOG_ERROR("Can't resize buffers\n");
                // Can't resize buffers -> can't keep reading the file
                if (!aux_1) { free(mutation_line[tid]); }
                if (!aux_2) { free(mutation_output_line[tid]); }
                return data_read_len;
            }
            
            mutation_line[tid] = aux_1;
            mutation_output_line[tid] = aux_2;
            mutation_max_line_size[tid] += next_line_len + 1;
//             LOG_DEBUG_F("[%d] buffers realloc'd (%d)\n", tid, mutation_max_line_size[tid]);
        }
        
//         LOG_DEBUG_F("[%d] position = %d, read = %d, max_size = %zu\n", i, next_line_len, data_read_len, realsize);
        
        if (data_read_len + next_line_len >= realsize) {
            // Save current state (mutation_line[tid] partially read)
            strncat(mutation_line[tid], data, next_line_len);
            chomp(mutation_line[tid]);
            mutation_line[tid][strlen(mutation_line[tid])] = '\0';
            premature_end = 1;
//             LOG_DEBUG_F("widow mutation_line[tid] = '%s'\n", mutation_line[tid]);
            data_read_len = realsize;
            break;
        }
        
        strncat(mutation_line[tid], data, next_line_len);
        strncat(mutation_output_line[tid], mutation_line[tid], strlen(mutation_line[tid]));
     
//         LOG_DEBUG_F("[%d] copy to buffer (%zu)\n", tid, strlen(mutation_line[tid]));
    
//         LOG_DEBUG_F("[%d] before writing mutation phenotype\n", tid);
        output_text = strdup(mutation_output_line[tid]);
        list_item_t *output_item = list_item_new(tid, MUTATION_PHENOTYPE, output_text);
        list_insert_item(output_item, output_list);
//         LOG_DEBUG_F("[%d] after writing mutation phenotype\n", tid);
            
        data += next_line_len+1;
        data_read_len += next_line_len+1;
        
        memset(mutation_line[tid], 0, strlen(mutation_line[tid]));
        memset(mutation_output_line[tid], 0, strlen(mutation_output_line[tid]));
        
        i++;
    }
 
    // Empty buffer for next callback invocation
    if (!premature_end) {
        memset(mutation_line[tid], 0, strlen(mutation_line[tid]));
        memset(mutation_output_line[tid], 0, strlen(mutation_output_line[tid]));
    }

    return data_read_len;
}
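The (char *contents, size_t size, size_t nmemb, void *userdata) signature and the size * nmemb / bytes-consumed return convention are those of libcurl write callbacks. Assuming the effect runner drives the web service through libcurl (the request code is not shown in this excerpt), wiring the callback up would look roughly like the sketch below; the wrapper function and URL argument are placeholders:

#include <curl/curl.h>

/* Sketch only: registering a callback with the signature used above.
 * Assumes write_mutation_phenotype_ws_results is visible in this
 * translation unit; returning fewer bytes than size * nmemb from the
 * callback (as its error paths do) makes libcurl abort the transfer. */
static int fetch_ws_response(const char *url) {
    CURL *curl = curl_easy_init();
    if (!curl) { return 1; }

    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_mutation_phenotype_ws_results);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, NULL);

    CURLcode res = curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    return (res == CURLE_OK) ? 0 : 1;
}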
Code example #15
0
File: effect_runner.c Project: CharoL/hpg-variant
static size_t write_effect_ws_results(char *contents, size_t size, size_t nmemb, void *userdata) {
    int tid = omp_get_thread_num();
    
    int i = 0;
    int data_read_len = 0, next_line_len = 0;
    // SO code parsed from the field that precedes the consequence type name (0 if not found)
    int *SO_found = (int*) malloc (sizeof(int));
    // Whether the buffer ended in the middle of a line (only partially read)
    int premature_end = 0;
    
    size_t realsize = size * nmemb;
    
    int *count;
    
    char *data = contents;
    char tmp_consequence_type[128];
    char *aux_buffer;
    char *output_text;
    
    
    LOG_DEBUG_F("Effect WS invoked, response size = %zu bytes\n", realsize);
    
    while (data_read_len < realsize) {
        assert((line + tid) != NULL);
        assert((max_line_size + tid) != NULL);
        
        LOG_DEBUG_F("[%d] loop iteration #%d\n", tid, i);
        // Get length of data to copy
        next_line_len = strcspn(data, "\n");
        
        // If the line[tid] is too long for the current buffers, reallocate a little more than the needed memory
        if (strlen(line[tid]) + next_line_len + 1 > max_line_size[tid]) {
//             LOG_DEBUG_F("Line too long (%d elements, but %zu needed) in batch #%d\n", 
//                         max_line_size[tid], strlen(line[tid]) + next_line_len, batch_num);
//             char *out_buf = (char*) calloc (next_line_len+1, sizeof(char));
//             snprintf(out_buf, next_line_len, "%s", data);
//             LOG_INFO_F("[%d] too big data is: '%s'\n", tid, out_buf);
            char *aux_1 = (char*) realloc (line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char));
            char *aux_2 = (char*) realloc (output_line[tid], (max_line_size[tid] + next_line_len + 1) * sizeof(char));
            
            if (!aux_1 || !aux_2) {
                LOG_ERROR("Can't resize buffers\n");
                // Can't resize buffers -> can't keep reading the file
                if (!aux_1) { free(line[tid]); }
                if (!aux_2) { free(output_line[tid]); }
                return data_read_len;
            }
            
            line[tid] = aux_1;
            output_line[tid] = aux_2;
            max_line_size[tid] += next_line_len + 1;
//             LOG_DEBUG_F("[%d] buffers realloc'd (%d)\n", tid, max_line_size[tid]);
        }
        
//         LOG_DEBUG_F("[%d] position = %d, read = %d, max_size = %zu\n", i, next_line_len, data_read_len, realsize);
        
        if (data_read_len + next_line_len >= realsize) {
            // Save current state (line[tid] partially read)
            strncat(line[tid], data, next_line_len);
            chomp(line[tid]);
            line[tid][strlen(line[tid])] = '\0';
            premature_end = 1;
//             LOG_DEBUG_F("widow line[tid] = '%s'\n", line[tid]);
            data_read_len = realsize;
            break;
        }
        
        strncat(line[tid], data, next_line_len);
        strncat(output_line[tid], line[tid], strlen(line[tid]));
     
//         LOG_DEBUG_F("[%d] copy to buffer (%zu)\n", tid, strlen(line[tid]));
    
        int num_substrings;
        char *copy_buf = strdup(line[tid]);
//         char *copy_buf = strdup(trim(line[tid]));
        char **split_result = split(copy_buf, "\t", &num_substrings);
        free(copy_buf);
        
        // Find consequence type name (always after SO field)
        *SO_found = 0;
        if (num_substrings == 25) {
//             LOG_DEBUG_F("gene = %s\tSO = %d\tCT = %s\n", split_result[17], atoi(split_result[18] + 3), split_result[19]);
            if (!cp_hashtable_contains(gene_list, split_result[17])) {
                cp_hashtable_put(gene_list, strdup(split_result[17]), NULL);
            }
            *SO_found = atoi(split_result[18] + 3);
           memset(tmp_consequence_type, 0, 128 * sizeof(char));
           strncat(tmp_consequence_type, split_result[19], strlen(split_result[19]));
        } else {
            LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]);
            memset(line[tid], 0, strlen(line[tid]));
            memset(output_line[tid], 0, strlen(output_line[tid]));
            
// #pragma omp critical
//             {
//             printf("********\n");
//             LOG_INFO_F("[%d] Non-valid line found (%d fields): '%s'\n", tid, num_substrings, line[tid]);
            for (int s = 0; s < num_substrings; s++) {
//                 printf("%s^", split_result[s]);
                free(split_result[s]);
            }
//             printf("********\n\n");
            free(split_result);
//             }
            continue;
        }
        
        for (int s = 0; s < num_substrings; s++) {
            free(split_result[s]);
        }
        free(split_result);
        
        if (!*SO_found) { // SO:000000 is not valid
            LOG_INFO_F("[%d] Non-valid SO found (0)\n", tid);
            memset(line[tid], 0, strlen(line[tid]));
            memset(output_line[tid], 0, strlen(output_line[tid]));
            continue;
        }

//         LOG_DEBUG_F("[%d] SO found = %d\n", tid, *SO_found);
        size_t consequence_type_len = strlen(tmp_consequence_type);
     
        // If file does not exist, create its descriptor and summary counter
        FILE *aux_file = cp_hashtable_get(output_files, SO_found);
        if (!aux_file) {
#pragma omp critical
            {
                // This construction avoids 2 threads trying to insert the same CT
                aux_file = cp_hashtable_get(output_files, SO_found);
                if (!aux_file) {
                    char filename[output_directory_len + consequence_type_len + 6];
                    memset(filename, 0, (output_directory_len + consequence_type_len + 6) * sizeof(char));
                    strncat(filename, output_directory, output_directory_len);
                    strncat(filename, "/", 1);
                    strncat(filename, tmp_consequence_type, consequence_type_len);
                    strncat(filename, ".txt", 4);
                    aux_file = fopen(filename, "a");
                    
                    // Add to hashtables (file descriptors and summary counters)
                    int *SO_stored = (int*) malloc (sizeof(int));
                    *SO_stored = *SO_found;
                    cp_hashtable_put(output_files, SO_stored, aux_file);

                    LOG_INFO_F("[%d] new CT = %s\n", tid, tmp_consequence_type);
                }
            }
        }
        
        // Write line[tid] to file corresponding to its consequence type
        if (aux_file) { 
#pragma omp critical
            {
                // TODO move critical one level below?
                count = (int*) cp_hashtable_get(summary_count, tmp_consequence_type);
                if (count == NULL) {
                    char *consequence_type = (char*) calloc (consequence_type_len+1, sizeof(char));
                    strncat(consequence_type, tmp_consequence_type, consequence_type_len);
                    assert(!strcmp(consequence_type, tmp_consequence_type));
                    count = (int*) malloc (sizeof(int));
                    *count = 0;
                    cp_hashtable_put(summary_count, consequence_type, count);
//                     LOG_DEBUG_F("[%d] Initialized summary count for %s\n", tmp_consequence_type);
                }
                // Increment counter for summary
                (*count)++;
            }
            
//             LOG_DEBUG_F("[%d] before writing %s\n", tid, tmp_consequence_type);
            output_text = strdup(output_line[tid]);
            list_item_t *output_item = list_item_new(tid, *SO_found, output_text);
            list_insert_item(output_item, output_list);
//             LOG_DEBUG_F("[%d] after writing %s\n", tid, tmp_consequence_type);
        }
        
        data += next_line_len+1;
        data_read_len += next_line_len+1;
        
        memset(line[tid], 0, strlen(line[tid]));
        memset(output_line[tid], 0, strlen(output_line[tid]));
        
        i++;
    }
 
    // Empty buffer for next callback invocation
    if (!premature_end) {
        memset(line[tid], 0, strlen(line[tid]));
        memset(output_line[tid], 0, strlen(output_line[tid]));
    }
    free(SO_found);

    return data_read_len;
}
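A detail worth noting above is the double-checked lookup around per-consequence-type file creation: the hashtable is probed once without locking and, only on a miss, probed again inside #pragma omp critical before the file is opened and inserted, so two threads cannot create the same file twice. A reduced sketch of that pattern with illustrative names (an array stands in for the hashtable; like the original, the first probe is unlocked, which a strictly race-free version would also protect):

#include <omp.h>
#include <stdio.h>

#define MAX_TYPES 1024
static FILE *files_by_type[MAX_TYPES];   /* shared, indexed by type id */

static FILE *get_or_create_file(int type, const char *filename) {
    FILE *f = files_by_type[type];       /* first, unlocked probe */
    if (!f) {
#pragma omp critical
        {
            f = files_by_type[type];     /* re-check under the lock */
            if (!f) {
                f = fopen(filename, "a");
                files_by_type[type] = f;
            }
        }
    }
    return f;
}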
Code example #16
0
File: stats_runner.c Project: abushoy/hpg-variant
int run_stats(shared_options_data_t *shared_options_data, stats_options_data_t *options_data) {
    file_stats_t *file_stats = file_stats_new();
    sample_stats_t **sample_stats;
    
    // List that stores the batches of records filtered by each thread
    list_t *output_list[shared_options_data->num_threads];
    // List that stores which thread filtered the next batch to save
    list_t *next_token_list = malloc(sizeof(list_t));

    int ret_code;
    double start, stop, total;
    
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    
    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        if(options_data->variable) {
            set_variable_field(options_data->variable, 0, ped_file);
        } else {
            set_variable_field("PHENO", 6, ped_file);
        }
        
        if(options_data->variable_groups) {
            int n, m;
            char *variable_groups = strdup(options_data->variable_groups);
            char **groups;
            char **phenos_in_group;
            groups = split(variable_groups, ":", &n);
            for(int i = 0; i < n; i++){
                phenos_in_group = split(groups[i], ",", &m);
                if(set_phenotype_group(phenos_in_group, m, ped_file) < 0) {
                    LOG_ERROR("Variable can't appear in two groups\n");
                    return DUPLICATED_VARIABLE;
                }
                free(phenos_in_group);
            }
            ped_file->accept_new_values = 0;
            
            free(variable_groups);
            free(groups);
        } else {
            ped_file->accept_new_values = 1;
        }
        if(options_data->phenotype) {
            int n;
            char* phenotypes = strdup(options_data->phenotype);
            char** pheno_values = split(phenotypes, ",", &n);
            if(n != 2) {
                LOG_ERROR("To handle case-control test, only two phenotypes are supported\n");
                return MORE_THAN_TWO_PHENOTYPES;
            } else {
                set_unaffected_phenotype(pheno_values[0],ped_file);
                set_affected_phenotype(pheno_values[1],ped_file);
            }
        } else {
            set_unaffected_phenotype("1", ped_file);
            set_affected_phenotype("2", ped_file);
        }
        
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
        if(!ped_file->num_field) {
            LOG_ERROR_F("Can't find the specified field \"%s\" in file: %s \n", options_data->variable, ped_file->filename);
            return VARIABLE_FIELD_NOT_FOUND;
        }
    }
    
    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }
    
    // Initialize variables related to the different threads
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        output_list[i] = (list_t*) malloc(sizeof(list_t));
        list_init("input", 1, shared_options_data->num_threads * shared_options_data->batch_lines, output_list[i]);
    }
    list_init("next_token", shared_options_data->num_threads, INT_MAX, next_token_list);
    
    LOG_INFO("About to retrieve statistics from VCF file...\n");

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            if (shared_options_data->batch_bytes > 0) {
                ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, vcf_file);
            } else if (shared_options_data->batch_lines > 0) {
                ret_code = vcf_parse_batches(shared_options_data->batch_lines, vcf_file);
            }

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) { LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code); }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }
        
#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            khash_t(str) *phenotype_ids = NULL;
            int num_phenotypes;
            
            start = omp_get_wtime();
            
            int i = 0;
            vcf_batch_t *batch = NULL;
            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    sample_stats = malloc (get_num_vcf_samples(vcf_file) * sizeof(sample_stats_t*));
                    for (int j = 0; j < get_num_vcf_samples(vcf_file); j++) {
                        sample_stats[j] = sample_stats_new(array_list_get(j, vcf_file->samples_names));
                    }
                    
                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                        // Get the khash of the phenotypes in PED file
                        phenotype_ids = get_phenotypes(ped_file);
                        num_phenotypes = get_num_variables(ped_file);
                    }
                }
                
                if (i % 50 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", 
                                i, omp_get_thread_num(),
                                batch->records->size, batch->records->capacity);
                }

                // Divide the list of passed records into ranges of the size defined in the config file
                int num_chunks;
                int *chunk_sizes = NULL;
                array_list_t *input_records = batch->records;
                int *chunk_starts = create_chunks(input_records->size, 
                                                  ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads), 
                                                  &num_chunks, &chunk_sizes);
                
                // OpenMP: Launch a thread for each range
                #pragma omp parallel for num_threads(shared_options_data->num_threads)
                for (int j = 0; j < num_chunks; j++) {
                    LOG_DEBUG_F("[%d] Stats invocation\n", omp_get_thread_num());
                    // Invoke variant stats and/or sample stats when applies
                    if (options_data->variant_stats) {
                        int index = omp_get_thread_num() % shared_options_data->num_threads;
                        ret_code = get_variants_stats((vcf_record_t**) (input_records->items + chunk_starts[j]),
                                                      chunk_sizes[j], individuals, sample_ids,num_phenotypes, output_list[index], file_stats); 
                    }
                    
                    if (options_data->sample_stats) {
                        ret_code |= get_sample_stats((vcf_record_t**) (input_records->items + chunk_starts[j]), 
                                                      chunk_sizes[j], individuals, sample_ids, sample_stats, file_stats);
                    }
                }
                
                if (options_data->variant_stats) {
                    // Insert as many tokens as elements correspond to each thread
                    for (int t = 0; t < num_chunks; t++) {
                        for (int s = 0; s < chunk_sizes[t]; s++) {
                            list_item_t *token_item = list_item_new(t, 0, NULL);
                            list_insert_item(token_item, next_token_list);
                        }
                    }
                }
                
                free(chunk_starts);
                free(chunk_sizes);
                vcf_batch_free(batch);
                
                i++;
            }
            
            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            
            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(next_token_list);
                list_decr_writers(output_list[i]);
            }
            
            if (sample_ids) { kh_destroy(ids, sample_ids); }
            if (individuals) { free(individuals); }
        }
        
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num());
            
            char *stats_prefix = get_vcf_stats_filename_prefix(shared_options_data->vcf_filename, 
                                                               shared_options_data->output_filename, 
                                                               shared_options_data->output_directory);
            
            // File names and descriptors for output to plain text files
            char *stats_filename, *summary_filename, *phenotype_filename;
            FILE *stats_fd, *summary_fd, **phenotype_fd;
            
            char *stats_db_name;
            sqlite3 *db = NULL;
            khash_t(stats_chunks) *hash;
            
            khash_t(str) *phenotype_ids;
            int num_phenotypes;
            if(ped_file){
                phenotype_ids = get_phenotypes(ped_file);
                num_phenotypes = get_num_variables(ped_file);
            }
            
            if (options_data->save_db) {
                delete_files_by_extension(shared_options_data->output_directory, "db");
                stats_db_name = calloc(strlen(stats_prefix) + strlen(".db") + 2, sizeof(char));
                sprintf(stats_db_name, "%s.db", stats_prefix);
                create_stats_db(stats_db_name, VCF_CHUNKSIZE, create_vcf_query_fields, &db);
                hash = kh_init(stats_chunks);
            }
            
            // Write variant (and global) statistics
            if (options_data->variant_stats) {
                stats_filename = get_variant_stats_output_filename(stats_prefix);
                if (!(stats_fd = fopen(stats_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics of variants: %s\n", stats_filename);
                }
                
                // Open one file for each phenotype
                if(ped_file){
                    phenotype_fd = malloc(sizeof(FILE*)*num_phenotypes);
                    if(options_data->variable_groups){
                        int n;
                        char *variable_groups = strdup(options_data->variable_groups);
                        char ** names = split(variable_groups, ":", &n);
                        for(int i = 0; i < n; i++) {
                            phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, names[i]);
                            if(!(phenotype_fd[i] = fopen(phenotype_filename, "w"))) {
                                LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename);
                            }
                            free(phenotype_filename);
                        }
                        free(names);
                        free(variable_groups);
                    } else {
                 
                        for (khint_t i = kh_begin(phenotype_ids); i != kh_end(phenotype_ids); ++i) {
                            if (!kh_exist(phenotype_ids,i)) continue;
                            
                            phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, kh_key(phenotype_ids,i));
                            if(!(phenotype_fd[kh_val(phenotype_ids,i)] = fopen(phenotype_filename, "w"))) {
                                LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename);
                            }
                            free(phenotype_filename);
                        }
                    }
                }
                // Write header
                report_vcf_variant_stats_header(stats_fd);
                if(ped_file){
                    for(int i = 0; i < num_phenotypes; i++)
                        report_vcf_variant_phenotype_stats_header(phenotype_fd[i]);
                }
                
                // For each variant, generate a new line
                int avail_stats = 0;
                variant_stats_t *var_stats_batch[VCF_CHUNKSIZE];
                list_item_t *token_item = NULL, *output_item = NULL;
                while ((token_item = list_remove_item(next_token_list)) != NULL) {
                    output_item = list_remove_item(output_list[token_item->id]);
                    assert(output_item);
                    var_stats_batch[avail_stats] = output_item->data_p;
                    avail_stats++;
                    
                    // Run only when certain amount of stats is available
                    if (avail_stats >= VCF_CHUNKSIZE) {
                        report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch);
                        
                        if(ped_file)
                            for(int i = 0; i < num_phenotypes; i++)
                                report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i);

                        // Free all stats from the "batch"
                        for (int i = 0; i < avail_stats; i++) {
                            variant_stats_free(var_stats_batch[i]);
                        }
                        avail_stats = 0;
                    }
                    
                    // Free resources
                    list_item_free(output_item);
                    list_item_free(token_item);
                }
                
                if (avail_stats > 0) {
                    report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch);
                    
                    if(ped_file)
                        for(int i = 0; i < num_phenotypes; i++)
                            report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i);

                    // Free all stats from the "batch"
                    for (int i = 0; i < avail_stats; i++) {
                        variant_stats_free(var_stats_batch[i]);
                    }
                    avail_stats = 0;
                }
                
                // Write whole-file stats (these data are only gathered when variant stats are computed)
                summary_filename = get_vcf_file_stats_output_filename(stats_prefix);
                if (!(summary_fd = fopen(summary_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics summary: %s\n", summary_filename);
                }
                report_vcf_summary_stats(summary_fd, db, file_stats);
                
                free(stats_filename);
                free(summary_filename);
                
                // Close variant stats file
                if (stats_fd) { fclose(stats_fd); }
                if (summary_fd) { fclose(summary_fd); }
                if (ped_file) {
                    for (int i = 0; i < num_phenotypes; i++) {
                        if (phenotype_fd[i]) { fclose(phenotype_fd[i]); }
                    }
                    free(phenotype_fd);
                }
            }
            
            // Write sample statistics
            if (options_data->sample_stats) {
                stats_filename = get_sample_stats_output_filename(stats_prefix);
                if (!(stats_fd = fopen(stats_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics of samples: %s\n", stats_filename);
                }
                
                report_vcf_sample_stats_header(stats_fd);
                report_vcf_sample_stats(stats_fd, NULL, vcf_file->samples_names->size, sample_stats);
                
                // Close sample stats file
                free(stats_filename);
                if (stats_fd) { fclose(stats_fd); }
            }
            
            free(stats_prefix);
            
            if (db) {
                insert_chunk_hash(VCF_CHUNKSIZE, hash, db);
                create_stats_index(create_vcf_index, db);
                close_stats_db(db, hash);
            }
            
        }
    }
    
    for (int i = 0; i < get_num_vcf_samples(vcf_file); i++) {
        sample_stats_free(sample_stats[i]);
    }
    free(sample_stats);
    free(file_stats);
    
    free(next_token_list);
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        free(output_list[i]);
    }
    
    vcf_close(vcf_file);
    if (ped_file) { ped_close(ped_file, 1,1); }
    
    return 0;
}
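Within the processing section above, each batch is split into chunks with create_chunks before the nested parallel for; its implementation is not shown in this excerpt. A self-contained sketch of a splitter with the semantics suggested by the call site (it returns the chunk start offsets and fills the chunk count and sizes through out-parameters; the name is hypothetical):

#include <stdlib.h>

/* Hypothetical chunk splitter: divide `total` items into pieces of at
 * most `max_chunk_size`, returning start offsets and filling *num_chunks
 * and *chunk_sizes, which the caller frees afterwards. */
static int *split_into_chunks(int total, int max_chunk_size, int *num_chunks, int **chunk_sizes) {
    if (max_chunk_size < 1) { max_chunk_size = 1; }
    *num_chunks = (total + max_chunk_size - 1) / max_chunk_size;
    if (*num_chunks == 0) { *num_chunks = 1; }

    int *starts = malloc(*num_chunks * sizeof(int));
    *chunk_sizes = malloc(*num_chunks * sizeof(int));

    for (int i = 0; i < *num_chunks; i++) {
        starts[i] = i * max_chunk_size;
        int remaining = total - starts[i];
        (*chunk_sizes)[i] = remaining < max_chunk_size ? remaining : max_chunk_size;
        if ((*chunk_sizes)[i] < 0) { (*chunk_sizes)[i] = 0; }
    }
    return starts;
}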
Code example #17
0
void batch_aligner(batch_aligner_input_t *input) {

  //  printf("START: batch_aligner\n", omp_get_thread_num());

  size_t total_batches = 0;
		
  list_t *read_list = input->read_list;
  list_t *write_list = input->write_list;

  write_batch_t* write_batch = NULL;
  aligner_batch_t *aligner_batch = NULL;

  list_item_t *read_item = NULL, *write_item = NULL;

  unsigned int tid = omp_get_thread_num();

  struct timeval t1, t2;

  array_list_t *list1, *list2;

  // main loop
  while ( (read_item = list_remove_item(read_list)) != NULL ) {

    aligner_batch = aligner_batch_new((fastq_batch_t *) read_item->data_p);

    thr_batches[tid]++;

    //printf("********************** BATCH %d (batch aligner %d)\n", total_batches, omp_get_thread_num());

    // Burrows-Wheeler transform
    gettimeofday(&t1, NULL);
    apply_bwt(input->bwt_input, aligner_batch);
    gettimeofday(&t2, NULL);
    bwt_time[tid] += ((t2.tv_sec - t1.tv_sec) * 1e6 + (t2.tv_usec - t1.tv_usec));
    //printf("---> %d, bwt, num targets = %d\n", tid, aligner_batch->num_targets);


    if (aligner_batch->num_targets > 0) {
      // seeding
      gettimeofday(&t1, NULL);
      apply_seeding(input->region_input, aligner_batch);
      gettimeofday(&t2, NULL);
      seeding_time[tid] += ((t2.tv_sec - t1.tv_sec) * 1e6 + (t2.tv_usec - t1.tv_usec));
      thr_seeding_items[tid] += aligner_batch->num_targets;
      //printf("---> %d, seeding, num targets = %d\n", tid, aligner_batch->num_targets);

      // seeking CALs
      gettimeofday(&t1, NULL);
      apply_caling(input->cal_input, aligner_batch);
      gettimeofday(&t2, NULL);
      cal_time[tid] += ((t2.tv_sec - t1.tv_sec) * 1e6 + (t2.tv_usec - t1.tv_usec));
      thr_cal_items[tid] += aligner_batch->num_targets;
      //printf("---> %d, cal, num targets = %d\n", tid, aligner_batch->num_targets);
    }

    // pair-mode handling
    if (input->pair_input != NULL) {      
      apply_pair(input->pair_input, aligner_batch);
      //      printf("---> %d, pair, num targets = %d\n", tid, aligner_batch->num_targets);
    }

    if (aligner_batch->num_targets > 0) {
      // Smith-Waterman
      gettimeofday(&t1, NULL);
      apply_sw(input->sw_input, aligner_batch);
      gettimeofday(&t2, NULL);
      sw_time[tid] += ((t2.tv_sec - t1.tv_sec) * 1e6 + (t2.tv_usec - t1.tv_usec));
      //thr_sw_items[tid] += aligner_batch->num_targets;
      //printf("---> %d, sw, num targets = %d\n", tid, aligner_batch->num_targets);
    }

    if (aligner_batch->num_targets > 0) {
      // prepare alignments (converts sw-output to alignment, searches pairs...)
      prepare_alignments(input->pair_input, aligner_batch);
    }

    write_item = list_item_new(total_batches, 0, aligner_batch);
    list_insert_item(write_item, write_list);

    list_item_free(read_item);
    total_batches++;
  } // main loop

  /*
  printf("Thread %d: BWT time     = %0.4f s\n", tid, bwt_time / 1e6);
  printf("Thread %d: Seeding time = %0.4f s\n", tid, seeding_time / 1e6);
  printf("Thread %d: CAL time     = %0.4f s\n", tid, cal_time / 1e6);
  printf("Thread %d: SW time      = %0.4f s\n", tid, sw_time / 1e6);
  */

  // decreasing writers
  if (write_list != NULL) list_decr_writers(write_list);

  //  printf("END: batch_aligner (%d), (total batches %d): END\n", omp_get_thread_num(), total_batches);
}
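Per-stage timings in batch_aligner are accumulated as microseconds from pairs of gettimeofday samples. A tiny helper expressing the same delta calculation (the name is hypothetical, not part of the aligner):

#include <sys/time.h>

/* Microseconds elapsed between two gettimeofday() samples, matching the
 * arithmetic used for bwt_time, seeding_time, cal_time and sw_time above. */
static double elapsed_usec(const struct timeval *t1, const struct timeval *t2) {
    return (t2->tv_sec - t1->tv_sec) * 1e6 + (t2->tv_usec - t1->tv_usec);
}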
Code example #18
0
void region_seeker_server(region_seeker_input_t *input_p){
  
  printf("region_seeker_server(%d): START\n", omp_get_thread_num());  
  list_item_t *item_p = NULL;
  list_item_t *cal_item_p = NULL;
  fastq_batch_t *unmapped_batch_p;
  size_t num_reads;
  array_list_t **allocate_mapping_p;
  cal_batch_t *cal_batch_p;
  size_t num_mappings, total_mappings = 0, num_batches = 0;
  size_t num_threads = input_p->region_threads;
  size_t chunk;
  size_t total_reads = 0;

  omp_set_num_threads(num_threads);
  
  while ( (item_p = list_remove_item(input_p->unmapped_read_list_p)) != NULL ) {

    //printf("Region Seeker Processing batch...\n");
    num_batches++;
    if (time_on) { timing_start(REGION_SEEKER, 0, timing_p); }
    
    unmapped_batch_p = (fastq_batch_t *)item_p->data_p;
    num_reads = unmapped_batch_p->num_reads;
    total_reads += num_reads;
    allocate_mapping_p = (array_list_t **)malloc(sizeof(array_list_t *)*num_reads);
    
    if (input_p->gpu_enable) {
      //******************************* GPU PROCESS *********************************//
      for (size_t i = 0; i < num_reads; i++) {
	allocate_mapping_p[i] = array_list_new(1000, 
					       1.25f, 
					       COLLECTION_MODE_ASYNCHRONIZED);
      }
      #ifdef HPG_GPU
      num_mappings = bwt_map_exact_seed_batch_gpu(unmapped_batch_p,
						  input_p->bwt_optarg_p, 
						  input_p->cal_optarg_p,
						  input_p->bwt_index_p,
						  input_p->gpu_context,
						  allocate_mapping_p);
      #endif
      //****************************************************************************//
    } else {

      //******************************* CPU PROCESS *********************************//
      //printf("Region Seeker :: Process Batch with %d reads\n", num_reads); 
      chunk = MAX(1, num_reads/(num_threads*10));
      
      //printf("Region Seeker :: Process Batch with %d reads\n", num_reads);
      #pragma omp parallel for private(num_mappings) reduction(+:total_mappings) schedule(dynamic, chunk)
      //#pragma omp parallel for private(num_mappings) reduction(+:total_mappings) schedule(static)
      for (size_t i = 0; i < num_reads; i++) {
	//printf("Threads region zone: %d\n", omp_get_num_threads());
	
	allocate_mapping_p[i] = array_list_new(1000, 
					       1.25f, 
					       COLLECTION_MODE_ASYNCHRONIZED);
	
	num_mappings = bwt_map_exact_seeds_seq(&(unmapped_batch_p->seq[unmapped_batch_p->data_indices[i]]), 
					       input_p->cal_optarg_p->seed_size,
					       input_p->cal_optarg_p->min_seed_size,
					       input_p->bwt_optarg_p, input_p->bwt_index_p, allocate_mapping_p[i]);
	
	total_mappings += num_mappings;
	//printf("----------------->>>>>>>>>>>Regions found %d\n", num_mappings);      
      }
      //****************************************************************************//
    
    }

    cal_batch_p = cal_batch_new(allocate_mapping_p, unmapped_batch_p);
      
    list_item_free(item_p);
    cal_item_p = list_item_new(0, 0, cal_batch_p);
    //region_batch_free(region_batch_p);    

    if (time_on) { timing_stop(REGION_SEEKER, 0, timing_p); }
    
    list_insert_item(cal_item_p, input_p->region_list_p);
    //printf("Region Seeker Processing batch finish!\n");

  } //End of while
  
  list_decr_writers(input_p->region_list_p);
 
  if (statistics_on) { 
    statistics_set(REGION_SEEKER_ST, 0, num_batches, statistics_p); 
    statistics_set(REGION_SEEKER_ST, 1, total_reads, statistics_p); 
  }
 
  printf("region_seeker_server: END\n");
  
}
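In the CPU path above, reads are mapped with a parallel for that combines schedule(dynamic, chunk) and reduction(+:total_mappings), so uneven per-read work is balanced across threads while the mapping count is accumulated without a critical section. A minimal self-contained illustration of that combination (the workload function is a placeholder):

#include <omp.h>
#include <stdio.h>

/* Placeholder for per-read work of varying cost. */
static int map_read_stub(int read_index) {
    return read_index % 3;   /* pretend this is the number of mappings found */
}

int main(void) {
    const int num_reads = 100000;
    long total_mappings = 0;

    /* Same shape as the loop above: dynamic chunks balance uneven work,
     * and the reduction merges per-thread partial sums at the end. */
    int chunk = num_reads / (omp_get_max_threads() * 10);
    if (chunk < 1) { chunk = 1; }

#pragma omp parallel for reduction(+:total_mappings) schedule(dynamic, chunk)
    for (int i = 0; i < num_reads; i++) {
        total_mappings += map_read_stub(i);
    }

    printf("total mappings: %ld\n", total_mappings);
    return 0;
}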
Code example #19
0
File: topic.c Project: RoboticLtd/pico-ros
void topic_add_connection(topic_t* t, connection_t* con)
{
	list_insert_item(t->conns, con);
}