/**
 * BWT stage for DNA reads.
 *
 * Every read of the batch is mapped with bwt_map_inexact_read() and the flag
 * stored on its mapping list is then used to classify the read. According to
 * the notes kept with this code the list flag means:
 *   0 -> no BWT anchors found      1 -> one BWT anchor found
 *   2 -> pair of BWT anchors found 3 -> alignments found
 *   4 -> alignments exceeded
 *
 * Reads that still need work are recorded in mapping_batch->targets and
 * counted in mapping_batch->num_targets.
 *
 * @param input  BWT options and index handed to bwt_map_inexact_read().
 * @param batch  Batch whose mapping_batch is processed and updated in place.
 * @return CAL_STAGE when at least one read was recorded as a target,
 *         DNA_POST_PAIR_STAGE otherwise.
 */
int apply_bwt(bwt_server_input_t* input, batch_t *batch) {
  mapping_batch_t *mb = batch->mapping_batch;
  const size_t total_reads = array_list_size(mb->fq_batch);
  size_t *pending = mb->targets;
  size_t num_pending = 0;

  for (size_t r = 0; r < total_reads; r++) {
    fastq_read_t *read = array_list_get(r, mb->fq_batch);
    array_list_t *hits = mb->mapping_lists[r];

    array_list_set_flag(0, hits);
    const size_t num_hits = bwt_map_inexact_read(read, input->bwt_optarg_p,
                                                 input->bwt_index_p, hits);

    switch (array_list_get_flag(hits)) {
    case 2:
      /* The mapper reported that the read exceeded the maximum number of mappings. */
      array_list_set_flag(ALIGNMENTS_EXCEEDED, hits);
      break;

    case 1:
      /* Only anchors were produced: try to turn them into CALs. The read is
       * queued for the next stage whether or not that succeeds. */
      if (num_hits == 0 || bwt_search_pair_anchors(hits, read->length) == 0) {
        array_list_set_flag(NOT_ANCHORS, hits);
      }
      pending[num_pending++] = r;
      break;

    default:
      /* Any other flag: the read is only queued when nothing was mapped. */
      if (num_hits == 0) {
        array_list_set_flag(0, hits);
        pending[num_pending++] = r;
      }
      break;
    }
  }

  mb->num_targets = num_pending;

  return (mb->num_targets > 0) ? CAL_STAGE : DNA_POST_PAIR_STAGE;
}
/**
 * Smith-Waterman stage for bisulfite reads.
 *
 * The function delegates to apply_sw_bs_4nt() whenever the local switch
 * `sw_3_nucleotides` is 0. That switch is a local initialised to 0 and never
 * modified, so the three-nucleotide implementation kept in the `else`
 * branch is not executed as the code stands.
 *
 * The `else` branch works in three steps:
 *   1. For every CAL of every read in `targets` (queries taken from
 *      GA_fq_batch / GA_rev_fq_batch) and then of every read in `targets2`
 *      (queries taken from CT_fq_batch / CT_rev_fq_batch) it builds a
 *      query/reference pair, stored consecutively in q[] / r[].
 *   2. It runs smith_waterman_mqmr() over all pairs at once.
 *   3. It keeps the alignments whose normalised score reaches
 *      input->min_score, appending them to mapping_lists / mapping_lists2 and
 *      rebuilding targets / targets2, num_targets / num_targets2 and
 *      num_to_do / num_to_do2.
 *
 * @param input  Smith-Waterman configuration (genomes, flank length, scores).
 * @param batch  Batch whose mapping_batch is consumed and updated in place.
 * @return Always BS_POST_PAIR_STAGE.
 */
int apply_sw_bs(sw_server_input_t* input, batch_t *batch) {

  // Constant selector: 0 -> four-nucleotide implementation (apply_sw_bs_4nt).
  int sw_3_nucleotides = 0;

  if (sw_3_nucleotides == 0) {
    apply_sw_bs_4nt(input, batch);
  } else {
    // NOTE(review): tid, fq_batch, start2, end2, score, g and max_ref_len are
    // declared below but never read in this function.
    int tid = omp_get_thread_num();
    mapping_batch_t *mapping_batch = batch->mapping_batch;
    cal_t *cal = NULL;
    array_list_t *cal_list = NULL, *mapping_list = NULL;
    array_list_t *fq_batch = mapping_batch->fq_batch;
    fastq_read_t *fq_read;

    // Read lists used by the bisulfite (three-nucleotide) search.
    array_list_t *CT_fq_batch = mapping_batch->CT_fq_batch;
    array_list_t *GA_fq_batch = mapping_batch->GA_fq_batch;
    array_list_t *CT_rev_fq_batch = mapping_batch->CT_rev_fq_batch;
    array_list_t *GA_rev_fq_batch = mapping_batch->GA_rev_fq_batch;
    fastq_read_t *fq_read2;

    size_t start, end;
    size_t start2, end2;

    // Two genomes are used; which one serves as reference depends on the pass
    // and on the CAL strand (see the two loops below).
    genome_t *genome1 = input->genome1_p;
    genome_t *genome2 = input->genome2_p;

    size_t flank_length = input->flank_length;

    float score, min_score = input->min_score;

    sw_output_t *sw_output;
    size_t read_index, num_cals;

    size_t num_targets = mapping_batch->num_targets;
    size_t new_num_targets = 0;
    size_t num_targets2 = mapping_batch->num_targets2;
    size_t new_num_targets2 = 0;

    // Number of Smith-Waterman runs announced by the previous stage for each
    // target set; their sum sizes every per-alignment array below.
    size_t sw_total1 = mapping_batch->num_to_do;
    size_t sw_total2 = mapping_batch->num_to_do2;
    size_t sw_total = sw_total1 + sw_total2;

    // Reset the counters; they are rebuilt while filtering the results.
    mapping_batch->num_to_do = 0;
    mapping_batch->num_to_do2 = 0;

    // NOTE(review): all arrays below are variable-length arrays of sw_total
    // elements, indexed by sw_count. Nothing here checks that the number of
    // CALs actually found stays within sw_total, nor that sw_total is
    // non-zero - confirm the previous stage guarantees both.
    int g[sw_total];

    sw_optarg_t *sw_optarg = &input->sw_optarg;

    sw_multi_output_t *output = sw_multi_output_new(sw_total);
    char *q[sw_total], *r[sw_total];
    uint8_t strands[sw_total], chromosomes[sw_total];
    size_t starts[sw_total];
    size_t sw_count = 0, read_indices[sw_total], sw_count2 = 0;
    int read_len, ref_len, max_ref_len;

    // Pass 1: build query/reference pairs for the reads listed in `targets`.
    for (size_t i = 0; i < num_targets; i++) {
      read_index = mapping_batch->targets[i];

      // Queries for this pass come from the GA read lists.
      fq_read = (fastq_read_t *) array_list_get(read_index, GA_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch);

      cal_list = mapping_batch->mapping_lists[read_index];
      num_cals = array_list_size(cal_list);

      read_len = fq_read->length;

      // Process each CAL of this read.
      for(size_t j = 0; j < num_cals; j++) {

        cal = array_list_get(j, cal_list);
        read_indices[sw_count] = read_index;

        // Extend the CAL by flank_length on both sides, clamping the start at
        // 0 and the end at the last position of the chromosome.
        if (flank_length >= cal->start) {
          start = 0;
        } else {
          start = cal->start - flank_length;
        }

        end = cal->end + flank_length;
        if (end >= genome1->chr_size[cal->chromosome_id - 1]) {
          end = genome1->chr_size[cal->chromosome_id - 1] - 1;
        }

        // ref_len is computed but not used afterwards.
        ref_len = end - start + 2;

        // Query: zero-initialised copy of the read, so it is NUL-terminated.
        // Strand 0 uses fq_read, any other strand uses fq_read2.
        q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));

        if (cal->strand == 0) {
          memcpy(q[sw_count], fq_read->sequence, read_len);
        } else {
          memcpy(q[sw_count], fq_read2->sequence, read_len);
        }

        // Reference: strand 0 is read from genome1, any other strand from
        // genome2. start/end are passed by address, so the callee may adjust
        // them (not visible from here).
        r[sw_count] = calloc(1, end - start + 2);

        if (cal->strand == 0) {
          genome_read_sequence_by_chr_index(r[sw_count], 0,
                                            cal->chromosome_id - 1, &start, &end, genome1);
        } else {
          genome_read_sequence_by_chr_index(r[sw_count], 0,
                                            cal->chromosome_id - 1, &start, &end, genome2);
        }

        // Remember what is needed to build the sw_output after the alignment.
        strands[sw_count] = cal->strand;
        chromosomes[sw_count] = cal->chromosome_id;
        starts[sw_count] = start;

        sw_count++;
      }

      // The CALs are no longer needed; the (now empty) list is reused below
      // to hold the Smith-Waterman outputs.
      array_list_clear(cal_list, (void *) cal_free);
    }

    // Entries [0, sw_count2) belong to pass 1, [sw_count2, sw_count) to pass 2.
    sw_count2 = sw_count;

    // Pass 2: same construction for the reads listed in `targets2`.
    for (size_t i = 0; i < num_targets2; i++) {
      read_index = mapping_batch->targets2[i];

      // Queries for this pass come from the CT read lists.
      fq_read = (fastq_read_t *) array_list_get(read_index, CT_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch);

      cal_list = mapping_batch->mapping_lists2[read_index];
      num_cals = array_list_size(cal_list);

      read_len = fq_read->length;

      // Process each CAL of this read.
      for(size_t j = 0; j < num_cals; j++) {

        cal = array_list_get(j, cal_list);
        read_indices[sw_count] = read_index;

        if (flank_length >= cal->start) {
          start = 0;
        } else {
          start = cal->start - flank_length;
        }

        // The end is clamped against genome1's chromosome sizes in this pass
        // as well.
        end = cal->end + flank_length;
        if (end >= genome1->chr_size[cal->chromosome_id - 1]) {
          end = genome1->chr_size[cal->chromosome_id - 1] - 1;
        }

        ref_len = end - start + 2;

        q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));

        if (cal->strand == 0) {
          memcpy(q[sw_count], fq_read->sequence, read_len);
        } else {
          memcpy(q[sw_count], fq_read2->sequence, read_len);
        }

        // Reference: the genomes are swapped with respect to pass 1
        // (strand 0 -> genome2, otherwise genome1).
        r[sw_count] = calloc(1, end - start + 2);

        if (cal->strand == 0) {
          genome_read_sequence_by_chr_index(r[sw_count], 0,
                                            cal->chromosome_id - 1, &start, &end, genome2);
        } else {
          genome_read_sequence_by_chr_index(r[sw_count], 0,
                                            cal->chromosome_id - 1, &start, &end, genome1);
        }

        strands[sw_count] = cal->strand;
        chromosomes[sw_count] = cal->chromosome_id;
        starts[sw_count] = start;

        sw_count++;
      }

      array_list_clear(cal_list, (void *) cal_free);
    }

    // Align every query against its reference in a single call.
    smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

    for (size_t i = 0; i < sw_count; i++) {
      LOG_DEBUG_F("cal: start = %lu, strand = %i\n", starts[i], strands[i]);
      LOG_DEBUG_F("\tquery : %s\n", q[i]);
      LOG_DEBUG_F("\tref. : %s\n", r[i]);
      LOG_DEBUG_F("\tquery map: %s (start: %i)\n", output->query_map_p[i], output->query_start_p[i]);
      LOG_DEBUG_F("\tref. map : %s (start: %i)\n", output->ref_map_p[i], output->ref_start_p[i]);
      LOG_DEBUG("\n");
    }

    double norm_score;

    // Filter pass-1 alignments by min_score.
    for (size_t i = 0; i < sw_count2; i++) {

      read_index = read_indices[i];
      fq_read = (fastq_read_t *) array_list_get(read_index, GA_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch);

      read_len = fq_read->length;
      norm_score = NORM_SCORE(output->score_p[i], read_len, input->match);

      if (norm_score >= min_score) {
        // Valid mapping: store it for further processing.
        mapping_list = mapping_batch->mapping_lists[read_index];
        array_list_set_flag(0, mapping_list);

        // The list was emptied above, so size 0 means this is the first valid
        // alignment of the read: register the read as a target exactly once.
        if (array_list_size(mapping_list) == 0) {
          mapping_batch->targets[new_num_targets++] = read_index;
        }

        sw_output = sw_output_new(strands[i],
                                  chromosomes[i],
                                  starts[i],
                                  strlen(r[i]),
                                  strlen(output->query_map_p[i]),
                                  output->query_start_p[i],
                                  output->ref_start_p[i],
                                  output->score_p[i],
                                  norm_score,
                                  output->query_map_p[i],
                                  output->ref_map_p[i]);
        array_list_insert(sw_output, mapping_list);

        mapping_batch->num_to_do++;
      }

      // Query and reference buffers are released whether or not the
      // alignment was kept.
      free(q[i]);
      free(r[i]);
    }
    mapping_batch->num_targets = new_num_targets;

    // Filter pass-2 alignments by min_score (same logic on the second set).
    for (size_t i = sw_count2; i < sw_count; i++) {

      read_index = read_indices[i];
      fq_read = (fastq_read_t *) array_list_get(read_index, CT_fq_batch);
      fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch);

      read_len = fq_read->length;
      norm_score = NORM_SCORE(output->score_p[i], read_len, input->match);

      if (norm_score >= min_score) {
        mapping_list = mapping_batch->mapping_lists2[read_index];
        array_list_set_flag(0, mapping_list);

        if (array_list_size(mapping_list) == 0) {
          mapping_batch->targets2[new_num_targets2++] = read_index;
        }

        sw_output = sw_output_new(strands[i],
                                  chromosomes[i],
                                  starts[i],
                                  strlen(r[i]),
                                  strlen(output->query_map_p[i]),
                                  output->query_start_p[i],
                                  output->ref_start_p[i],
                                  output->score_p[i],
                                  norm_score,
                                  output->query_map_p[i],
                                  output->ref_map_p[i]);
        array_list_insert(sw_output, mapping_list);

        mapping_batch->num_to_do2++;
      }

      free(q[i]);
      free(r[i]);
    }
    mapping_batch->num_targets2 = new_num_targets2;

    sw_multi_output_free(output);
  }

  // Both implementations hand the batch over to the same next stage.
  return BS_POST_PAIR_STAGE;
}
/**
 * Smith-Waterman stage.
 *
 * For every CAL of every target read a query (the read, reverse-complemented
 * when the CAL strand is 1) and a reference (the CAL region extended by
 * input->flank_length on each side) are built, all pairs are aligned with a
 * single smith_waterman_mqmr() call, and the alignments whose score reaches
 * input->min_score are stored as sw_output_t items in a fresh mapping list
 * of the corresponding read.
 *
 * On return:
 *   - batch->num_done holds the previous batch->num_to_do,
 *   - batch->num_to_do holds the number of alignments kept,
 *   - batch->targets / batch->num_targets list the reads with at least one
 *     kept alignment,
 *   - the CAL lists of all former targets have been freed and replaced
 *     (mapping_lists[index] is NULL for reads without kept alignments).
 *
 * Fixes with respect to the previous implementation:
 *   - the normalised score passed to sw_output_new() was read from an
 *     uninitialised variable; it is now computed,
 *   - the work arrays are sized from the CAL lists themselves and the
 *     alignment/filter steps run over the pairs actually built, instead of
 *     trusting the externally maintained num_to_do counter,
 *   - the flanked start no longer wraps around when the flank is larger than
 *     the CAL start (same clamp as apply_sw_bs),
 *   - the mapping list is always looked up from the batch rather than relying
 *     on a variable left over from a previous iteration.
 *
 * @param input  Smith-Waterman configuration (genome, flank, scores).
 * @param batch  Batch to process; updated in place as described above.
 */
void apply_sw(sw_server_input_t* input, aligner_batch_t *batch) {

  fastq_batch_t *fq_batch = batch->fq_batch;
  genome_t *genome = input->genome_p;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  const size_t flank_length = input->flank_length;
  const float min_score = input->min_score;
  const size_t num_seqs = batch->num_targets;

  // Hand the counters over: what was pending becomes done, and the pending
  // counter is rebuilt while filtering.
  batch->num_done = batch->num_to_do;
  batch->num_to_do = 0;

  // Count the alignments to run from the CAL lists themselves, so the arrays
  // below are exactly as large as the data written into them.
  size_t sw_total = 0;
  for (size_t i = 0; i < num_seqs; i++) {
    sw_total += array_list_size(batch->mapping_lists[batch->targets[i]]);
  }

  // A variable-length array must have at least one element.
  const size_t capacity = (sw_total > 0) ? sw_total : 1;

  sw_multi_output_t *output = sw_multi_output_new(sw_total);

  char *q[capacity], *r[capacity];
  uint8_t strands[capacity], chromosomes[capacity];
  size_t starts[capacity], read_indices[capacity];
  size_t sw_count = 0;

  // Build the query and reference sequences.
  for (size_t i = 0; i < num_seqs; i++) {
    const size_t index = batch->targets[i];
    array_list_t *cal_list = batch->mapping_lists[index];
    const size_t num_cals = array_list_size(cal_list);

    for (size_t j = 0; j < num_cals; j++) {
      cal_t *cal = array_list_get(j, cal_list);
      read_indices[sw_count] = index;

      // Query: NUL-terminated copy of the read, reverse-complemented for
      // CALs on strand 1.
      int read_len = fq_batch->data_indices[index + 1] - fq_batch->data_indices[index];
      q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
      memcpy(q[sw_count], &(fq_batch->seq[fq_batch->data_indices[index]]), read_len);
      if (cal->strand == 1) {
        seq_reverse_complementary(q[sw_count], read_len);
      }

      // Reference: CAL region plus flanks. The start is clamped at 0 so the
      // unsigned subtraction cannot wrap around.
      size_t start = (flank_length >= cal->start) ? 0 : cal->start - flank_length;
      size_t end = cal->end + flank_length;

      r[sw_count] = calloc(1, end - start + 2);
      genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
                                        cal->chromosome_id - 1, &start, &end, genome);

      // Remember what is needed to build the sw_output after the alignment.
      strands[sw_count] = cal->strand;
      chromosomes[sw_count] = cal->chromosome_id;
      starts[sw_count] = start;

      sw_count++;
    }

    // The CALs are consumed; a NULL slot marks "no alignment kept yet" for
    // the filtering step below.
    array_list_free(cal_list, (void *)cal_free);
    batch->mapping_lists[index] = NULL;
  }

  // Run Smith-Waterman over the pairs that were actually built.
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

  // Filter alignments by min_score.
  size_t num_targets = 0;
  for (size_t i = 0; i < sw_count; i++) {

    if (output->score_p[i] >= min_score) {
      const size_t index = read_indices[i];

      // First kept alignment of this read: create its list and register the
      // read as a target for the next stage.
      array_list_t *mapping_list = batch->mapping_lists[index];
      if (mapping_list == NULL) {
        mapping_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
        array_list_set_flag(0, mapping_list);

        batch->mapping_lists[index] = mapping_list;
        batch->targets[num_targets++] = index;
      }

      // Normalised score: raw score over the best possible score for the
      // aligned length. This is the formula that was left commented out in
      // the previous version of this function.
      // NOTE(review): confirm this is the normalisation consumers expect.
      const size_t map_len = strlen(output->query_map_p[i]);
      const float best_score = (float) map_len * input->match;
      const float norm_score = (best_score > 0.0f) ? output->score_p[i] / best_score : 0.0f;

      sw_output_t *sw_output = sw_output_new(strands[i],
                                             chromosomes[i],
                                             starts[i],
                                             strlen(r[i]),
                                             map_len,
                                             output->query_start_p[i],
                                             output->ref_start_p[i],
                                             output->score_p[i],
                                             norm_score,
                                             output->query_map_p[i],
                                             output->ref_map_p[i]);
      array_list_insert(sw_output, mapping_list);

      batch->num_to_do++;
    }

    // Query and reference buffers are released whether or not the alignment
    // was kept.
    free(q[i]);
    free(r[i]);
  }
  batch->num_targets = num_targets;

  // Per-thread statistics.
  thr_sw_items[omp_get_thread_num()] += sw_count;

  sw_multi_output_free(output);
}
/**
 * Replaces the BWT anchors stored in `list` by CALs built from them.
 *
 * Steps:
 *   1. Every anchor is distributed into one of four lists according to its
 *      strand (1 / other) and type (FORWARD_ANCHOR / other). The longest
 *      anchor above MIN_SINGLE_ANCHOR is remembered (strand and type only),
 *      and anchors leaving fewer than 16 read positions uncovered are also
 *      collected in big_anchor_list. `list` is then emptied without freeing
 *      its items.
 *   2. If any "big" anchor exists, each one becomes a CAL, the list flag is
 *      set to SINGLE_ANCHORS and the function finishes.
 *   3. Otherwise, per strand, each forward anchor is paired with the first
 *      unused backward anchor on the same chromosome that lies after it,
 *      within MAX_BWT_ANCHOR_DISTANCE, both being at least MIN_ANCHOR long.
 *      Each pair adds two CALs and sets the flag to DOUBLE_ANCHORS. Pairing
 *      stops as soon as the list holds more than 5 CALs.
 *   4. If no pair was found but a long single anchor was, every anchor of the
 *      remembered strand/type list becomes a CAL and the flag is set to
 *      SINGLE_ANCHORS.
 *
 * The four strand/type lists and big_anchor_list are released at the end
 * through array_list_free(..., bwt_anchor_free).
 *
 * @param list         In: anchors (bwt_anchor_t). Out: CALs (cal_t).
 * @param read_length  Length of the read the anchors belong to.
 * @return Number of items in `list` on exit (0 when no CAL was produced).
 */
size_t bwt_search_pair_anchors(array_list_t *list, unsigned int read_length) {
  bwt_anchor_t *bwt_anchor;
  int max_anchor_length = 0;

  bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw;
  int anchor_length_tmp, anchor_back, anchor_forw;

  // Strand and type of the longest single anchor seen in step 1.
  int strand = 0, type = 0;
  int found_anchor = 0, found_double_anchor = 0;

  const int MIN_ANCHOR = 25;                    // minimum length of each anchor of a pair
  const int MIN_SINGLE_ANCHOR = 40;             // an anchor must be longer than this to stand alone
  const int MAX_BWT_REGIONS = 50;               // initial capacity of the working lists
  const int MAX_BWT_ANCHOR_DISTANCE = 500000;   // maximum genomic distance between paired anchors

  array_list_t *anchor_list_tmp, *forward_anchor_list, *backward_anchor_list;
  cal_t *cal;
  int seed_size, gap_read, gap_genome;

  // Working lists; the _0 / _1 suffix follows the anchor strand
  // (_1 receives anchors whose strand == 1).
  array_list_t *backward_anchor_list_0 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *forward_anchor_list_0 = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *backward_anchor_list_1 = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  array_list_t *forward_anchor_list_1 = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);

  array_list_t *big_anchor_list = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);

  // Step 1: classify the anchors.
  for (int i = 0; i < array_list_size(list); i++) {
    bwt_anchor = array_list_get(i, list);
    if (bwt_anchor->strand == 1) {
      if (bwt_anchor->type == FORWARD_ANCHOR) {
        array_list_insert(bwt_anchor, forward_anchor_list_1);
      } else {
        array_list_insert(bwt_anchor, backward_anchor_list_1);
      }
    } else {
      if (bwt_anchor->type == FORWARD_ANCHOR) {
        array_list_insert(bwt_anchor, forward_anchor_list_0);
      } else {
        array_list_insert(bwt_anchor, backward_anchor_list_0);
      }
    }

    anchor_length_tmp = bwt_anchor->end - bwt_anchor->start + 1;
    if (anchor_length_tmp > MIN_SINGLE_ANCHOR && anchor_length_tmp > max_anchor_length) {
      max_anchor_length = anchor_length_tmp;
      found_anchor = 1;
      strand = bwt_anchor->strand;
      type = bwt_anchor->type;
    }

    // NOTE(review): read_length is unsigned, so this subtraction is done in
    // unsigned arithmetic; an anchor longer than the read would wrap to a
    // large value and not be treated as "big" - confirm that is intended.
    if (read_length - anchor_length_tmp < 16) {
      // The same anchor pointer is now referenced from two lists; it is
      // removed from big_anchor_list again in step 2.
      array_list_insert(bwt_anchor, big_anchor_list);
    }
  }

  // The anchors now live in the working lists; empty the output list without
  // freeing them.
  array_list_clear(list, NULL);

  // Step 2: anchors that nearly span the read are converted directly.
  if (array_list_size(big_anchor_list) > 0) {
    for (int i = array_list_size(big_anchor_list) - 1; i >= 0; i--) {
      // Removing every item leaves big_anchor_list empty at `exit`.
      bwt_anchor = array_list_remove_at(i, big_anchor_list);

      // This size_t seed_size shadows the int declared at function scope.
      size_t seed_size = bwt_anchor->end - bwt_anchor->start;

      // Forward anchors are placed at the beginning of the read, the others
      // at its end.
      if (bwt_anchor->type == FORWARD_ANCHOR) {
        cal = convert_bwt_anchor_to_CAL(bwt_anchor, 0, seed_size);
      } else {
        cal = convert_bwt_anchor_to_CAL(bwt_anchor, read_length - seed_size - 1, read_length - 1);
      }

      array_list_insert(cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);

    goto exit;
  }

  // Step 3: look for forward/backward pairs, one strand at a time.
  // NOTE(review): this loop variable shadows the function-scope `type`; here
  // it only selects the strand lists (1 -> the _0 lists, 0 -> the _1 lists).
  for (int type = 1; type >= 0; type--) {
    if (!type) {
      forward_anchor_list = forward_anchor_list_1;
      backward_anchor_list = backward_anchor_list_1;
    } else {
      forward_anchor_list = forward_anchor_list_0;
      backward_anchor_list = backward_anchor_list_0;
    }

    // Per-anchor "already paired" markers, zero-initialised.
    int *set_forward = (int *)calloc(array_list_size(forward_anchor_list), sizeof(int));
    int *set_backward = (int *)calloc(array_list_size(backward_anchor_list), sizeof(int));

    // Associate forward and backward anchors.
    for (int i = 0; i < array_list_size(forward_anchor_list); i++) {
      if (set_forward[i]) { continue; }
      bwt_anchor_forw = array_list_get(i, forward_anchor_list);
      for (int j = 0; j < array_list_size(backward_anchor_list); j++) {
        if (set_backward[j]) { continue; }
        bwt_anchor_back = array_list_get(j, backward_anchor_list);
        anchor_forw = (bwt_anchor_forw->end - bwt_anchor_forw->start + 1);
        anchor_back = (bwt_anchor_back->end - bwt_anchor_back->start + 1);

        anchor_length_tmp = anchor_forw + anchor_back;

        // NOTE(review): the argument of abs() is computed in the type of the
        // anchor coordinates before being converted to int; the declaration
        // of bwt_anchor_t is not visible here - confirm no truncation occurs.
        if (bwt_anchor_forw->chromosome == bwt_anchor_back->chromosome &&
            abs(bwt_anchor_back->start - bwt_anchor_forw->end) <= MAX_BWT_ANCHOR_DISTANCE &&
            anchor_forw >= MIN_ANCHOR && anchor_back >= MIN_ANCHOR) {

          // The backward anchor must not start before the forward one ends.
          if (bwt_anchor_back->start < bwt_anchor_forw->end) { continue; }

          // Read positions and genome positions left between both anchors.
          gap_read = read_length - (anchor_forw + anchor_back);
          gap_genome = bwt_anchor_back->start - bwt_anchor_forw->end;

          // apply_flank is set below but never read.
          int apply_flank = 0;
          if (gap_read < 2 || gap_genome < 2) {
            // The anchors touch or overlap: trim both of them in place so a
            // gap is left between the two CALs.
            int gap;
            if (gap_read < 0 && gap_genome < 0) {
              gap = abs(gap_read) > abs(gap_genome) ? abs(gap_read) : abs(gap_genome);
            } else if (gap_read < 0) {
              gap = abs(gap_read);
            } else if (gap_genome < 0) {
              gap = abs(gap_genome);
            } else {
              gap = 2;
            }

            int flank = 5;
            apply_flank = 1;

            if (abs(gap) >= flank*2) {
              // Large overlap: trim about half of it from each anchor.
              flank = abs(gap)/2 + flank/2;
            }

            // Never trim more than half of an anchor.
            if (flank >= anchor_forw) {
              bwt_anchor_forw->end -= anchor_forw/2;
            } else {
              bwt_anchor_forw->end -= flank;
            }

            if (flank >= anchor_back) {
              bwt_anchor_back->start += anchor_back/2;
            } else {
              bwt_anchor_back->start += flank;
            }
          }

          // CAL for the (possibly trimmed) forward anchor, at the read start.
          cal = convert_bwt_anchor_to_CAL(bwt_anchor_forw, 0, bwt_anchor_forw->end - bwt_anchor_forw->start);
          array_list_insert(cal, list);

          // CAL for the (possibly trimmed) backward anchor, at the read end.
          seed_size = bwt_anchor_back->end - bwt_anchor_back->start + 1;
          cal = convert_bwt_anchor_to_CAL(bwt_anchor_back, read_length - seed_size, read_length - 1);
          array_list_insert(cal, list);

          // Stop once the list holds more than 5 CALs. The list flag keeps
          // whatever value earlier pairs gave it.
          if (array_list_size(list) > 5) {
            free(set_backward);
            free(set_forward);
            goto exit;
          }

          array_list_set_flag(DOUBLE_ANCHORS, list);
          found_double_anchor = 1;
          set_forward[i] = 1;
          set_backward[j] = 1;
          break;
        }
      }
    }
    free(set_backward);
    free(set_forward);
  }

  // Step 4: no pair found, but a sufficiently long single anchor was.
  if (!found_double_anchor && found_anchor) {
    // `strand` and `type` are the values recorded in step 1 for the longest
    // anchor; they select the list whose anchors are all converted.
    if (strand == 1) {
      if (type == FORWARD_ANCHOR) {
        anchor_list_tmp = forward_anchor_list_1;
      } else {
        anchor_list_tmp = backward_anchor_list_1;
      }
    } else {
      if (type == FORWARD_ANCHOR) {
        anchor_list_tmp = forward_anchor_list_0;
      } else {
        anchor_list_tmp = backward_anchor_list_0;
      }
    }

    for (int i = 0; i < array_list_size(anchor_list_tmp); i++) {
      bwt_anchor = array_list_get(i, anchor_list_tmp);
      // This size_t seed_size shadows the int declared at function scope.
      size_t seed_size = bwt_anchor->end - bwt_anchor->start;
      if (bwt_anchor->type == FORWARD_ANCHOR) {
        cal = convert_bwt_anchor_to_CAL(bwt_anchor, 0, seed_size);
      } else {
        cal = convert_bwt_anchor_to_CAL(bwt_anchor, read_length - seed_size - 1, read_length - 1);
      }
      array_list_insert(cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);
  }

 exit:
  // Release the working lists, passing bwt_anchor_free for their items.
  array_list_free(forward_anchor_list_1, (void *)bwt_anchor_free);
  array_list_free(backward_anchor_list_1, (void *)bwt_anchor_free);
  array_list_free(forward_anchor_list_0, (void *)bwt_anchor_free);
  array_list_free(backward_anchor_list_0, (void *)bwt_anchor_free);
  array_list_free(big_anchor_list, (void *)bwt_anchor_free);

  return array_list_size(list);
}
/**
 * BWT stage for RNA reads.
 *
 * Each read of the batch is reverse-complemented with fastq_read_revcomp()
 * and mapped with bwt_map_inexact_read(). The flag found on the read's
 * mapping list afterwards decides what happens next:
 *   - 2: the list is flagged ALIGNMENTS_EXCEEDED;
 *   - 1: anchors are converted to CALs with bwt_search_pair_anchors(); if
 *        there were no mappings or no CAL came out, the list is flagged
 *        NOT_ANCHORS and the read is queued as a target;
 *   - otherwise: without mappings the read is queued as a target; with
 *        mappings the list is flagged ALIGNMENTS_FOUND, the shared
 *        st_bwt.map_bwt counter is incremented under mutex_sp and every
 *        alignment is inserted into the metaexon structure.
 * Finally, CALs of lists flagged DOUBLE_ANCHORS are inserted into the
 * metaexon structure unconditionally, and CALs of lists flagged
 * SINGLE_ANCHORS only when metaexon_search() returns non-zero for them.
 *
 * @param input  BWT options, index and metaexon structure.
 * @param batch  Batch whose mapping_batch is processed and updated in place
 *               (targets / num_targets receive the queued reads).
 * @return RNA_CAL_STAGE when at least one read was queued, RNA_STAGE otherwise.
 */
int apply_bwt_rna(bwt_server_input_t* input, batch_t *batch) {
  extern pthread_mutex_t mutex_sp;
  extern st_bwt_t st_bwt;

  LOG_DEBUG("========= APPLY BWT RNA =========\n");

  metaexons_t *metaexons = input->metaexons;
  mapping_batch_t *mb = batch->mapping_batch;
  const size_t total_reads = array_list_size(mb->fq_batch);
  size_t *pending = mb->targets;
  size_t num_pending = 0;

  for (size_t r = 0; r < total_reads; r++) {
    fastq_read_t *read = array_list_get(r, mb->fq_batch);

    // Reverse-complement the read before mapping.
    fastq_read_revcomp(read);

    array_list_t *hits = mb->mapping_lists[r];
    array_list_set_flag(1, hits);

    const size_t num_hits = bwt_map_inexact_read(read, input->bwt_optarg_p,
                                                 input->bwt_index_p, hits);

    switch (array_list_get_flag(hits)) {
    case 2:
      /* The mapper reported that the read exceeded the maximum number of mappings. */
      array_list_set_flag(ALIGNMENTS_EXCEEDED, hits);
      break;

    case 1:
      /* Only anchors: the read is queued when no CAL can be built from them. */
      if (num_hits == 0 || bwt_search_pair_anchors(hits, read->length) == 0) {
        array_list_set_flag(NOT_ANCHORS, hits);
        pending[num_pending++] = r;
      }
      break;

    default:
      if (num_hits == 0) {
        array_list_set_flag(0, hits);
        pending[num_pending++] = r;
      } else {
        /* The read mapped: count it and update the metaexons. */
        array_list_set_flag(ALIGNMENTS_FOUND, hits);

        pthread_mutex_lock(&mutex_sp);
        st_bwt.map_bwt++;
        pthread_mutex_unlock(&mutex_sp);

        for (size_t m = 0; m < num_hits; m++) {
          alignment_t *alignment = array_list_get(m, hits);
          metaexon_insert(0, alignment->chromosome,
                          alignment->position,
                          alignment->position + read->length,
                          40, METAEXON_NORMAL, NULL, metaexons);
        }
      }
      break;
    }

    /* Lists that now hold CALs built from anchors also feed the metaexons. */
    const int is_pair = (array_list_get_flag(hits) == DOUBLE_ANCHORS);
    if (is_pair || array_list_get_flag(hits) == SINGLE_ANCHORS) {
      for (int c = 0; c < array_list_size(hits); c++) {
        cal_t *cal = array_list_get(c, hits);

        if (!is_pair) {
          /* Single anchors are only inserted when the search succeeds. */
          metaexon_t *metaexon;
          if (!metaexon_search(0, cal->chromosome_id - 1,
                               cal->start, cal->end, &metaexon, metaexons)) {
            continue;
          }
        }

        metaexon_insert(0, cal->chromosome_id - 1,
                        cal->start, cal->end,
                        40, METAEXON_NORMAL, NULL, metaexons);
      }
    }
  }

  mb->num_targets = num_pending;

  LOG_DEBUG("========= APPLY BWT RNA END =========\n");

  return (mb->num_targets > 0) ? RNA_CAL_STAGE : RNA_STAGE;
}
/**
 * Seeding stage (legacy aligner_batch_t interface).
 *
 * Runs bwt_map_exact_seeds_seq_by_num() on every target read of the batch.
 * Reads that produce at least one seed mapping get list flag 2 and are kept
 * as targets; the others are dropped. The target array of the batch is
 * replaced by a freshly allocated one of batch->num_targets slots (the old
 * array is freed), and num_done / num_to_do are recomputed from scratch.
 * The per-thread counter thr_seeding_items is increased by num_done.
 *
 * Aborts the process if the new target array cannot be allocated.
 *
 * @param input  seeding configuration (seed sizes/counts, BWT options, index).
 * @param batch  batch updated in place.
 */
void apply_seeding(region_seeker_input_t* input, aligner_batch_t *batch) {
  fastq_batch_t *fq_batch = batch->fq_batch;

  size_t min_num_seeds = input->cal_optarg_p->min_num_seeds;
  size_t max_num_seeds = input->cal_optarg_p->max_num_seeds;
  size_t seed_size = input->cal_optarg_p->seed_size;
  size_t min_seed_size = input->cal_optarg_p->min_seed_size;

  size_t num_seqs = batch->num_targets;
  size_t num_outputs = 0;

  // calloc(0, ...) may legitimately return NULL, so only a failure with a
  // non-empty request is treated as fatal.
  size_t *outputs = calloc(num_seqs, sizeof *outputs);
  if (outputs == NULL && num_seqs > 0) {
    fprintf(stderr, "apply_seeding: cannot allocate %zu target slots\n", num_seqs);
    abort();
  }

  // set to zero
  batch->num_done = 0;
  batch->num_to_do = 0;

  for (size_t i = 0; i < num_seqs; i++) {
    size_t index = batch->targets[i];
    array_list_t *list = batch->mapping_lists[index];
    char *seq = &(fq_batch->seq[fq_batch->data_indices[index]]);

    size_t num_mappings = bwt_map_exact_seeds_seq_by_num(seq,
                                                         min_num_seeds, max_num_seeds,
                                                         seed_size, min_seed_size,
                                                         input->bwt_optarg_p,
                                                         input->bwt_index_p,
                                                         list);

    if (num_mappings > 0) {
      array_list_set_flag(2, list);
      outputs[num_outputs++] = index;
      // NOTE(review): num_to_do also receives num_mappings unconditionally
      // below, so seeded reads are counted twice; confirm this is intended.
      batch->num_to_do += num_mappings;
    }

    batch->num_done += 2 * (strlen(seq) / seed_size);
    batch->num_to_do += num_mappings;
  }

  // update batch: the new target array replaces the old one
  batch->num_allocated_targets = num_seqs;
  batch->num_targets = num_outputs;
  free(batch->targets);
  batch->targets = outputs;

  // update counter
  thr_seeding_items[omp_get_thread_num()] += batch->num_done;
}
//==================================================================================== // apply_caling //==================================================================================== int apply_caling(cal_seeker_input_t* input, batch_t *batch) { mapping_batch_t *mapping_batch = batch->mapping_batch; array_list_t *list = NULL; size_t read_index, num_cals; int min_seeds, max_seeds; cal_t *cal; array_list_t *cal_list; fastq_read_t *read; size_t num_chromosomes = input->genome->num_chromosomes + 1; size_t num_targets = mapping_batch->num_targets; size_t *targets = mapping_batch->targets; size_t new_num_targets = 0; array_list_t *region_list; bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw; linked_list_t *linked_list; int anchor_nt, gap_nt; seed_region_t *seed_region_start, *seed_region_end; //max_seeds = input->cal_optarg->num_seeds; // size_t *new_targets = (size_t *) calloc(num_targets, sizeof(size_t)); // set to zero mapping_batch->num_to_do = 0; for (size_t i = 0; i < num_targets; i++) { read_index = targets[i]; read = array_list_get(read_index, mapping_batch->fq_batch); region_list = mapping_batch->mapping_lists[read_index]; // for debugging // LOG_DEBUG_F("%s\n", ((fastq_read_t *) array_list_get(read_index, mapping_batch->fq_batch))->id); if (!list) { list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); } if (array_list_get_flag(region_list) == 0 || array_list_get_flag(region_list) == 2) { //We have normal and extend seeds (anchors) max_seeds = (read->length / 15)*2 + 10; num_cals = bwt_generate_cal_list_linked_list(region_list, input->cal_optarg, &min_seeds, &max_seeds, num_chromosomes, list, read->length, input->cal_optarg->min_cal_size, 0); } else { //We have double anchors with smaller distance between they //printf("Easy case... 
Two anchors and same distance between read gap and genome distance\n"); num_cals = 0; for (int a = array_list_size(region_list) - 1; a >= 0; a -= 2) { max_seeds = 2; min_seeds = 2; bwt_anchor_back = array_list_remove_at(a, region_list); bwt_anchor_forw = array_list_remove_at(a - 1, region_list); linked_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED); //Seed for the first anchor anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start; //printf("\t seed0[%i-%i][%lu-%lu]\n", 0, anchor_nt - 1, // bwt_anchor_forw->start, bwt_anchor_forw->end); seed_region_start = seed_region_new(0, anchor_nt - 1, bwt_anchor_forw->start, bwt_anchor_forw->end, 0, 0, 0); //Seed for the first anchor gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start)); //printf("\t gap_nt = %i, anchor_nt = %i\n", gap_nt, anchor_nt); //printf("\t seed1[%i-%i][%lu-%lu]\n", anchor_nt + gap_nt, read->length - 1, // bwt_anchor_back->start + 1, bwt_anchor_back->end); seed_region_end = seed_region_new(anchor_nt + gap_nt, read->length - 1, bwt_anchor_back->start + 1, bwt_anchor_back->end, 1, 0, 0); //The reference distance is 0 and the read distance not //The read distance is 0 and the reference distance not //if (seed_region_start->genome_end > seed_region_end->genome_start || // seed_region_start->read_end > seed_region_end->read_start) { //array_list_clear(region_list, NULL); //continue; if (seed_region_end->genome_start - seed_region_start->genome_end < 5 || seed_region_end->read_start - seed_region_start->read_end < 5) { seed_region_start->genome_end -= 5; seed_region_start->read_end -= 5; seed_region_end->genome_start += 5; seed_region_end->read_start += 5; } linked_list_insert(seed_region_start, linked_list); linked_list_insert_last(seed_region_end, linked_list); cal = cal_new(bwt_anchor_forw->chromosome + 1, bwt_anchor_forw->strand, bwt_anchor_forw->start, bwt_anchor_back->end + 1, 2, linked_list, linked_list_new(COLLECTION_MODE_ASYNCHRONIZED)); 
array_list_insert(cal, list); num_cals++; } } // for debugging LOG_DEBUG_F("read %s : num. cals = %i, min. seeds = %i, max. seeds = %i\n", read->id, num_cals, min_seeds, max_seeds); /* if (num_cals == 0) { int seed_size = 24; //First, Delete old regions array_list_clear(mapping_batch->mapping_lists[read_index], region_bwt_free); //Second, Create new regions with seed_size 24 and 1 Mismatch bwt_map_inexact_seeds_seq(read->sequence, seed_size, seed_size/2, bwt_optarg, bwt_index, mapping_batch->mapping_lists[read_index]); num_cals = bwt_generate_cal_list_linked_list(mapping_batch->mapping_lists[mapping_batch->targets[i]], input->cal_optarg, &min_seeds, &max_seeds, num_chromosomes, list, read->length); }*/ /* for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); LOG_DEBUG_F("\tchr: %i, strand: %i, start: %lu, end: %lu, num_seeds = %i, num. regions = %lu\n", cal->chromosome_id, cal->strand, cal->start, cal->end, cal->num_seeds, cal->sr_list->size); } */ // printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", // min_seeds, max_seeds, min_limit, array_list_size(list)); // filter incoherent CALs int founds[num_cals], found = 0; for (size_t j = 0; j < num_cals; j++) { founds[j] = 0; cal = array_list_get(j, list); LOG_DEBUG_F("\tcal %i of %i: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n", j, num_cals, cal->sr_list->size, cal->num_seeds, cal->chromosome_id, cal->start, cal->end); if (cal->sr_list->size > 0) { int start = 0; for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) { seed_region_t *s = list_item->item; LOG_DEBUG_F("\t\t:: star %lu > %lu s->read_start\n", start, s->read_start); if (start > s->read_start) { LOG_DEBUG("\t\t\t:: remove\n"); found++; founds[j] = 1; } start = s->read_end + 1; } } else { found++; founds[j] = 1; } } if (found) { min_seeds = 100000; max_seeds = 0; cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < 
num_cals; j++) { if (!founds[j]) { cal = array_list_get(j, list); cal->num_seeds = cal->sr_list->size; if (cal->num_seeds > max_seeds) max_seeds = cal->num_seeds; if (cal->num_seeds < min_seeds) min_seeds = cal->num_seeds; array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_free(list, (void *) cal_free); num_cals = array_list_size(cal_list); list = cal_list; } // LOG_FATAL_F("num. cals = %i, min. seeds = %i, max. seeds = %i\n", num_cals, min_seeds, max_seeds); // filter CALs by the number of seeds cal_list = list; list = NULL; /* int min_limit = input->cal_optarg->min_num_seeds_in_cal; if (min_limit < 0) min_limit = max_seeds; // min_limit -= 3; if (min_seeds == max_seeds || min_limit <= min_seeds) { cal_list = list; list = NULL; } else { cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); if (cal->num_seeds >= min_limit) { array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_clear(list, (void *) cal_free); num_cals = array_list_size(cal_list); } */ if (num_cals > MAX_CALS) { for (size_t j = num_cals - 1; j >= MAX_CALS; j--) { cal = (cal_t *) array_list_remove_at(j, cal_list); cal_free(cal); } num_cals = array_list_size(cal_list); } // LOG_DEBUG_F("num. 
cals = %i, MAX_CALS = %i\n", num_cals, MAX_CALS); if (num_cals > 0 && num_cals <= MAX_CALS) { array_list_set_flag(2, cal_list); targets[new_num_targets++] = read_index; /* int count1 = 0, count2 = 0; // count number of sw to do // method #1 // printf("method #1\n"); seed_region_t *s, *prev_s; linked_list_iterator_t* itr; for (size_t j = 0; j < num_cals; j++) { prev_s = NULL; cal = array_list_get(j, cal_list); itr = linked_list_iterator_new(cal->sr_list); s = (seed_region_t *) linked_list_iterator_curr(itr); while (s != NULL) { if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) { // printf("\t\t\tcase 1\n"); count1++; } prev_s = s; linked_list_iterator_next(itr); s = linked_list_iterator_curr(itr); } if (prev_s != NULL && prev_s->read_end < read->length - 1) { count1++; // printf("\t\t\tcase 2 (%i < %i)\n", prev_s->read_end, read->length - 1); } linked_list_iterator_free(itr); } // method #2 printf("method #2\n"); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, cal_list); printf("\t: %i\n", j); if (cal->sr_list->size > 0) { int start = 0; for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) { seed_region_t *s = list_item->item; printf("\t\t[%i|%i - %i|%i]\n", s->genome_start, s->read_start, s->read_end, s->genome_end); if (s->read_start != start) { count2++; } start = s->read_end + 1; } if (start < read->length) { count2++; } } } printf("count #1 = %i, count #2 = %i\n", count1, count2); assert(count1 == count2); mapping_batch->num_to_do += count1; */ // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; } else { array_list_set_flag(0, mapping_batch->mapping_lists[read_index]); // we have to free the region list array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); if (cal_list) array_list_free(cal_list, (void *) cal_free); if (list) 
array_list_clear(list, (void *) cal_free); } /* cal_list = list; list = NULL; array_list_set_flag(2, cal_list); // mapping_batch->num_to_do += num_cals; targets[new_num_targets++] = read_index; // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; */ /* // filter CALs by the number of seeds int min_limit = input->cal_optarg->min_num_seeds_in_cal; if (min_limit < 0) min_limit = max_seeds; printf("min_seeds = %i, max_seeds = %i, min_limit = %i, num_cals = %i\n", min_seeds, max_seeds, min_limit, array_list_size(list)); if (min_seeds == max_seeds || min_limit <= min_seeds) { cal_list = list; list = NULL; } else { cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); for (size_t j = 0; j < num_cals; j++) { cal = array_list_get(j, list); if (cal->num_seeds >= min_limit) { array_list_insert(cal, cal_list); array_list_set(j, NULL, list); } } array_list_clear(list, (void *) cal_free); num_cals = array_list_size(cal_list); printf("************, num_cals = %i\n", num_cals); } if (num_cals > MAX_CALS) { for (size_t j = num_cals - 1; j >= MAX_CALS; j--) { cal = (cal_t *) array_list_remove_at(j, cal_list); cal_free(cal); } num_cals = array_list_size(cal_list); } if (num_cals > 0 && num_cals <= MAX_CALS) { array_list_set_flag(2, cal_list); mapping_batch->num_to_do += num_cals; targets[new_num_targets++] = read_index; // we have to free the region list array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); mapping_batch->mapping_lists[read_index] = cal_list; } else { array_list_set_flag(0, mapping_batch->mapping_lists[read_index]); // we have to free the region list array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free); if (cal_list) array_list_free(cal_list, (void *) cal_free); if (list) array_list_clear(list, (void *) cal_free); } */ } // end for 0 ... 
num_targets // update batch mapping_batch->num_targets = new_num_targets; // LOG_DEBUG_F("num. SW to do: %i\n", mapping_batch->num_to_do); // exit(-1); // free memory if (list) array_list_free(list, NULL); if (batch->mapping_mode == RNA_MODE) { return RNA_STAGE; } if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) { return PRE_PAIR_STAGE; } else if (batch->mapping_batch->num_targets > 0) { return SW_STAGE; } return DNA_POST_PAIR_STAGE; }
int apply_seeding(region_seeker_input_t* input, batch_t *batch) { //printf("APPLY SEEDING...\n"); //if (time_on) { start_timer(start); } mapping_batch_t *mapping_batch = batch->mapping_batch; size_t num_mappings; int seed_size = input->cal_optarg_p->seed_size; size_t min_seed_size = input->cal_optarg_p->min_seed_size; size_t num_targets = mapping_batch->num_targets; size_t *targets = mapping_batch->targets; size_t new_num_targets = 0; fastq_read_t *read; int min_intron_size = 40; int target; bwt_anchor_t *bwt_anchor = NULL; region_t *region; int gap_nt; int start_search; int end_search; // set to zero mapping_batch->num_to_do = 0; //TODO: omp parallel for !! /*if (batch->mapping_mode == 1000) { for (size_t i = 0; i < num_targets; i++) { //printf("Seq (i=%i)(target=%i): %s\n", i, targets[i], read->sequence); read = array_list_get(targets[i], mapping_batch->fq_batch); num_mappings = bwt_map_exact_seeds_seq(padding_left, padding_right, read->sequence, seed_size, min_seed_size, input->bwt_optarg_p, input->bwt_index_p, mapping_batch->mapping_lists[targets[i]], mapping_batch->extra_stage_id[targets[i]]); //printf("Num mappings %i\n", num_mappings); if (num_mappings > 0) { array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]); targets[new_num_targets++] = targets[i]; mapping_batch->num_to_do += num_mappings; } } } else {*/ //size_t new_num_targets = 0; //size_t *new_targets = (size_t *)malloc(array_list_size(fq_batch)*sizeof(size_t)); array_list_t *array_list_aux = array_list_new(256, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); //Flag 0: The read has simple anchor or any, and need seeds and normal Cal_Seeker //Flag 1: The read has double anchor and the gap is smaller than MIN_INTRON_SIZE. Cal_Seeker will be make one CAL //Flag 2: The read has double anchor but the gap is bigger than MIN_INTRON_SIZE. 
for (size_t i = 0; i < num_targets; i++) { read = array_list_get(targets[i], mapping_batch->fq_batch); //printf("Read Region %s: \n", read->id); /* if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 || array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) { array_list_clear(mapping_batch->mapping_lists[targets[i]], bwt_anchor_free); continue; } */ if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 || array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) { //Flag 0 Case, Not anchors found, Make normal seeds // printf("***** Normal Case 0. Not anchors found!\n"); for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) { bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]); array_list_insert(bwt_anchor, array_list_aux); } num_mappings = 0; num_mappings = bwt_map_exact_seeds_seq(0, 0, read->sequence, seed_size, min_seed_size, input->bwt_optarg_p, input->bwt_index_p, mapping_batch->mapping_lists[targets[i]], 0); if (num_mappings > 0) { array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]); targets[new_num_targets++] = targets[i]; //mapping_batch->num_to_do += num_mappings; } } else if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) { //Flag 1 Case, One anchor found, Make displacements seeds printf("***** Case 1. 
One anchor found!\n"); for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) { bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]); array_list_insert(bwt_anchor, array_list_aux); } int anchor_nt = bwt_anchor->end - bwt_anchor->start; int seed_id = 0; int seed_start, seed_end; int extra_seed; if ((bwt_anchor->type == FORWARD_ANCHOR && bwt_anchor->strand == 0) || (bwt_anchor->type == BACKWARD_ANCHOR && bwt_anchor->strand == 1 )) { start_search = anchor_nt + 1; end_search = read->length - 1; extra_seed = EXTRA_SEED_END; } else { start_search = 0; end_search = read->length - anchor_nt - 2; extra_seed = EXTRA_SEED_START; } printf("end_start %i - start_search %i = %i >= seed_size %i\n", end_search, start_search, end_search - start_search, seed_size); if (end_search - start_search >= seed_size) { printf("00 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search); /* num_mappings = bwt_map_exact_seeds_between_coords(start_search, end_search, read->sequence, seed_size, min_seed_size, input->bwt_optarg_p, input->bwt_index_p, mapping_batch->mapping_lists[targets[i]], extra_seed, &seed_id); */ } if (bwt_anchor->type == FORWARD_ANCHOR) { seed_id = 0; seed_start = 0; seed_end = anchor_nt; } else { seed_id += 1; seed_start = read->length - anchor_nt - 1; seed_end = read->length - 1; } for (int j = 0; j < array_list_size(array_list_aux); j++) { bwt_anchor_t *bwt_anchor = array_list_get(j, array_list_aux); // printf("\tCreate seed Anchor [%i:%lu|%i-%i|%lu]\n", bwt_anchor->chromosome + 1, bwt_anchor->start, // seed_start,seed_end,bwt_anchor->end); region = region_bwt_new(bwt_anchor->chromosome + 1, bwt_anchor->strand, bwt_anchor->start, bwt_anchor->end, seed_start, seed_end, read->length, seed_id); array_list_insert(region, mapping_batch->mapping_lists[targets[i]]); } array_list_clear(array_list_aux, (void *)bwt_anchor_free); array_list_set_flag(0, 
mapping_batch->mapping_lists[targets[i]]); targets[new_num_targets++] = targets[i]; } else { //Flag 2 Case, Pair of anchors found printf("***** Case 2. Double anchor found!\n"); bwt_anchor_t *bwt_anchor; bwt_anchor_t *bwt_anchor_forw, *bwt_anchor_back; int read_nt, genome_nt; int distance; int found = 0; region_t *region; int seed_id = 0; //if (array_list_size(mapping_batch->mapping_lists[targets[i]]) > 2) { int *anchors_targets = (int *)calloc(array_list_size(mapping_batch->mapping_lists[targets[i]]), sizeof(int)); int num = 0; //min_intron_size = 0; //Search if one anchor is at the same distance from the reference and the read for (int b = 0; b < array_list_size(mapping_batch->mapping_lists[targets[i]]); b += 2) { bwt_anchor_forw = array_list_get(b, mapping_batch->mapping_lists[targets[i]]); bwt_anchor_back = array_list_get(b + 1, mapping_batch->mapping_lists[targets[i]]); //printf("FORW=%i:%lu-%lu BACK=%i:%lu-%lu\n", bwt_anchor_forw->chromosome, bwt_anchor_forw->start, bwt_anchor_forw->end, // bwt_anchor_back->chromosome, bwt_anchor_back->start, bwt_anchor_back->end); read_nt = read->length - ((bwt_anchor_forw->end - bwt_anchor_forw->start) + (bwt_anchor_back->end - bwt_anchor_back->start)); genome_nt = bwt_anchor_back->start - bwt_anchor_forw->end; distance = abs(genome_nt - read_nt); //printf("\t%i:Distance %i\n", b, distance); if (distance < min_intron_size) { found = 1; } else { anchors_targets[num++] = b; } } if (found) { //printf("\tFound Exact Case... 
Delete other anchors\n"); for (int t = num - 1; t >= 0; t--) { target = anchors_targets[t]; //printf("\tDelete %i, %i-->\n", target, target + 1); bwt_anchor = array_list_remove_at(target + 1, mapping_batch->mapping_lists[targets[i]]); bwt_anchor_free(bwt_anchor); bwt_anchor = array_list_remove_at(target, mapping_batch->mapping_lists[targets[i]]); bwt_anchor_free(bwt_anchor); } array_list_set_flag(1, mapping_batch->mapping_lists[targets[i]]); } else { //Seeding between anchors //printf("\tFound gap between anchors \n"); array_list_t *anchors_forward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]), 1.25f, COLLECTION_MODE_ASYNCHRONIZED); array_list_t *anchors_backward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]), 1.25f, COLLECTION_MODE_ASYNCHRONIZED); int big_gap = 0; int final_anchor_nt = 0; int anchor_nt; int anchor_type; int anchor_strand; for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j -= 2) { bwt_anchor_back = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]); array_list_insert(bwt_anchor_back, anchors_backward); bwt_anchor_forw = array_list_remove_at(j - 1, mapping_batch->mapping_lists[targets[i]]); array_list_insert(bwt_anchor_forw, anchors_forward); if (bwt_anchor_forw->strand == 0) { anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start; gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start)); } else { anchor_nt = bwt_anchor_back->end - bwt_anchor_back->start; gap_nt = read->length - (anchor_nt + (bwt_anchor_forw->end - bwt_anchor_forw->start)); } if (gap_nt < 0) { gap_nt = 0; } //printf("Gap nt (%i - %i): %i\n", anchor_nt, bwt_anchor_back->end - bwt_anchor_back->start, gap_nt); if (gap_nt > big_gap) { big_gap = gap_nt; final_anchor_nt = anchor_nt; anchor_type = bwt_anchor_back->type; anchor_strand = bwt_anchor_back->strand; } } printf("%i, %i\n", big_gap - 2, seed_size); if (big_gap - 2 > seed_size) { //if 
(anchor_type == FORWARD_ANCHOR && anchor_strand == 0 || // anchor_type == BACKWARD_ANCHOR && anchor_strand == 1 ) { start_search = final_anchor_nt + 1; end_search = final_anchor_nt + big_gap - 1; //} else { // start_search = final_anchor_nt + big_gap - 1; //end_search = final_anchor_nt + 1; //} //printf("Seeding between anchors... gap=%i\n", big_gap); printf("11 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search); /* num_mappings = bwt_map_exact_seeds_between_coords(start_search, end_search, read->sequence, seed_size, min_seed_size, input->bwt_optarg_p, input->bwt_index_p, mapping_batch->mapping_lists[targets[i]], EXTRA_SEED_NONE, &seed_id); */ } //printf("Making seeds anchors...\n"); for (int a = 0; a < array_list_size(anchors_forward); a++) { //Insert the last anchor. (Create new seed) bwt_anchor_forw = array_list_get(a, anchors_forward); bwt_anchor_back = array_list_get(a, anchors_backward); anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start; gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start)); //printf("\t --> Big Seed: %i, gap_nt: %i, anchor_nt = %i\n", a, gap_nt, anchor_nt); if (gap_nt < 0) { //gap_nt = 0; bwt_anchor_forw->end += gap_nt; bwt_anchor_back->start -= gap_nt; anchor_nt += gap_nt; gap_nt = 0; } else if (gap_nt == 0) { bwt_anchor_forw->end -= 1; bwt_anchor_back->start += 1; anchor_nt -= 1; gap_nt = 1; } region = region_bwt_new(bwt_anchor_forw->chromosome + 1, bwt_anchor_forw->strand, bwt_anchor_forw->start, bwt_anchor_forw->end, 0, anchor_nt, read->length, 0); //printf("Region: %i-%i\n", region->seq_start, region->seq_end); array_list_insert(region, mapping_batch->mapping_lists[targets[i]]); region = region_bwt_new(bwt_anchor_back->chromosome + 1, bwt_anchor_back->strand, bwt_anchor_back->start, bwt_anchor_back->end, anchor_nt + gap_nt, read->length - 1, read->length, seed_id + 1); //printf("Region: %i-%i\n", region->seq_start, region->seq_end); 
array_list_insert(region, mapping_batch->mapping_lists[targets[i]]); //printf("\tMaking seeds anchors end, %i seeds\n", array_list_size(mapping_batch->mapping_lists[targets[i]])); bwt_anchor_free(bwt_anchor_back); bwt_anchor_free(bwt_anchor_forw); } array_list_free(anchors_forward, NULL); array_list_free(anchors_backward, NULL); //printf("Making seeds anchors end, %i seeds\n", array_list_size(mapping_batch->mapping_lists[targets[i]])); array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]); } free(anchors_targets); targets[new_num_targets++] = targets[i]; } } mapping_batch->num_targets = new_num_targets; array_list_free(array_list_aux, NULL); //if (time_on) { stop_timer(start, end, time); timing_add(time, REGION_SEEKER, timing); } //printf("APPLY SEEDING DONE!\n"); return CAL_STAGE; }
void *file_reader_2(void *input) { wf_input_file_t *wf_input = (wf_input_file_t *) input; FILE *fd = wf_input->file; batch_t *batch = wf_input->batch; int pair_mode = batch->pair_input->pair_mng->pair_mode; const int MAX_READS = 100; int num_reads = 0; batch_t *new_batch = NULL; size_t sizes_to_read[3], head_len, seq_len, num_items; size_t tot_size; char *buffer, *id, *sequence, *quality; size_t bytes; unsigned char type; array_list_t *reads = array_list_new(MAX_READS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); mapping_batch_t *mapping_batch = mapping_batch_new_2(MAX_READS, reads, batch->pair_input->pair_mng); while (1) { //[size head][size seq][num items] bytes = fread(&type, sizeof(unsigned char), 1, fd); if (!bytes) { break; } //fastq_read_t *fq_read = file_fastq_read_new(&num_items, fd); fastq_read_t *fq_read = file_read_fastq_reads(&num_items, fd); if (fq_read == NULL) { /*printf("fq NULL\n");*/ break; } //printf("(num items %i)\nID : %s\nSEQ: %s\nQUA: %s\n", num_items, fq_read->id, fq_read->sequence, fq_read->quality); array_list_insert(fq_read, reads); mapping_batch->mapping_lists[num_reads] = array_list_new(50, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); if (type == CAL_TYPE) { //exit(-1); //printf("\tCal Report\n"); file_read_cals(num_items, mapping_batch->mapping_lists[num_reads], fq_read, fd); } else if (type == META_ALIGNMENT_TYPE) { //printf("\tMeta Alignments Report\n"); file_read_meta_alignments(num_items, mapping_batch->mapping_lists[num_reads], fq_read, fd); array_list_set_flag(BITEM_META_ALIGNMENTS, mapping_batch->mapping_lists[num_reads]); } else { //exit(-1); //printf("\tAlignments Report\n"); file_read_alignments(num_items, mapping_batch->mapping_lists[num_reads], fq_read, fd); } //printf("W3 file read %i\n", array_list_size(mapping_batch->mapping_lists[num_reads])); num_reads++; if (num_reads >= MAX_READS) { break; } } tot_reads2 += num_reads; //printf("W3 Reads: %i | %i\n", tot_reads2, num_reads); //w3_r += num_reads; //printf("W3 Reads: %i\n", w3_r); 
if (num_reads) { mapping_batch->num_allocated_targets = num_reads; new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, batch->pair_input, batch->preprocess_rna, batch->sw_input, batch->writer_input, batch->mapping_mode, mapping_batch); } else { mapping_batch_free(mapping_batch); } extern size_t reads_w3; reads_w3 += num_reads; return new_batch; }
/* fastq_read_t *file_fastq_read_new(size_t *num_items, FILE *fd) { size_t sizes_to_read[3], head_len, seq_len; head_len = sizes_to_read[0]; seq_len = sizes_to_read[1]; *num_items = sizes_to_read[2]; int bytes = fread(sizes_to_read, sizeof(size_t), 3, fd); if (!bytes) { return NULL; } int tot_size = head_len + 2*seq_len; buffer = (char *)calloc(tot_size + 1, sizeof(char)); bytes = fread(buffer, sizeof(char), tot_size, fd); if (!bytes) { free(buffer); return NULL; } char *id = (char *)calloc(head_len + 1, sizeof(char)); memcpy(id, buffer, head_len); //printf("ID : %s\n", id); char *sequence = (char *)calloc(seq_len + 1, sizeof(char)); memcpy(sequence, &buffer[head_len], seq_len); //printf("SEQ: %s\n", sequence); char *quality = (char *)calloc(seq_len + 1, sizeof(char)); memcpy(quality, &buffer[head_len + seq_len], seq_len); //printf("QUA: %s\n", quality); fastq_read_t *fq_read = fastq_read_new(id, sequence, quality); free(buffer); free(id); free(sequence); free(quality); return fq_read; } int file_cal_fill(size_t num_items, array_list_t *list, FILE *fd) { if (!num_items) { return 0; } bwt_anchor_t bwt_anchors[num_items]; bytes = fread(bwt_anchors, sizeof(bwt_anchor_t), num_items, fd); if (!bytes) { LOG_FATAL("Corrupt file\n"); } for (int i = 0; i < num_items; i++) { //printf("[%i:%lu-%lu]\n", bwt_anchors[i].chromosome, bwt_anchors[i].start, bwt_anchors[i].end); size_t seed_size = bwt_anchors[i].end - bwt_anchors[i].start; cal_t *cal; if (bwt_anchors[i].type == FORWARD_ANCHOR) { cal = convert_bwt_anchor_to_CAL(&bwt_anchors[i], 0, seed_size); } else { cal = convert_bwt_anchor_to_CAL(&bwt_anchors[i], fq_read->length - seed_size - 1, fq_read->length - 1); } array_list_insert(cal, list); } return 0; } int file_meta_alignment_fill(size_t num_items, array_list_t *list, FILE *fd) { if (!num_items) { return 0; } simple_alignment_t simple_alignment[num_items]; simple_alignment_t *simple_a; bytes = fread(simple_alignment, sizeof(simple_alignment_t), num_items, fd); if (!bytes) 
{ LOG_FATAL("Corrupt file\n"); } size_t cigar_tot_len = 0; for (int i = 0; i < num_items; i++) { simple_a = &simple_alignment[i]; //printf("ITEM %i: (%i)[%i:%lu] [%i-%i]\n", i, simple_a->map_strand, simple_a->map_chromosome, // simple_a->map_start, simple_a->gap_start, simple_a->gap_end); cigar_tot_len += simple_a->cigar_len; } char cigar_buffer[cigar_tot_len]; bytes = fread(cigar_buffer, sizeof(char), cigar_tot_len, fd); if (!bytes) { LOG_FATAL("Corrupt file\n"); } char cigars_test[num_items][1024]; size_t actual_read = 0; for (int i = 0; i < num_items; i++) { simple_a = &simple_alignment[i]; memcpy(&cigars_test[i], &cigar_buffer[actual_read], simple_a->cigar_len); cigars_test[i][simple_a->cigar_len] = '\0'; actual_read += simple_a->cigar_len; //printf("CIGAR %i: %s\n", i, cigars_test[i]); size_t map_len = fq_read->length - simple_a->gap_start - simple_a->gap_end; //printf("SEED := len_read:%i - gap_read:%i - gap_end:%i = %i, SEED-END = %i\n", fq_read->length, // simple_a->gap_start, // simple_a->gap_end, // map_len, simple_a->gap_start + map_len); seed_region_t *s_region = seed_region_new(simple_a->gap_start, simple_a->gap_start + map_len - 1, simple_a->map_start, simple_a->map_start + map_len, 0); //printf("Exit with seed [%i:%i]\n", s_region->read_start, s_region->read_end); linked_list_t *sr_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED); //s_region->info = cigar_code_new_by_string(cigars_test[i]); linked_list_insert(s_region, sr_list); cal_t *cal = cal_new(simple_a->map_chromosome, simple_a->map_strand, simple_a->map_start, simple_a->map_start + map_len, 1, sr_list, linked_list_new(COLLECTION_MODE_ASYNCHRONIZED)); cal->info = cigar_code_new_by_string(cigars_test[i]); meta_alignment_t *meta_alignment = meta_alignment_new(); array_list_insert(cal, meta_alignment->cals_list); array_list_insert(meta_alignment, list); } return 0; } int file_alignment_fill(size_t num_items, array_list_t *list, fastq_read_t *fq_read, FILE *fd) { if (!num_items) { return 0; } 
alignment_aux_t alignments_aux[num_items]; alignment_aux_t *alignment_a; bytes = fread(alignments_aux, sizeof(alignment_aux_t), num_items, fd); if (!bytes) { LOG_FATAL("Corrupt file\n"); } size_t cigar_tot_len = 0; for (int i = 0; i < num_items; i++) { alignment_a = &simple_alignment[i]; //printf("ITEM %i: (%i)[%i:%lu] [%i-%i]\n", i, simple_a->map_strand, simple_a->map_chromosome, // simple_a->map_start, simple_a->gap_start, simple_a->gap_end); cigar_tot_len += alignmment_a->cigar_len + alignment_a->optional_field_length; } char cigars_test[num_items][1024]; char optional_fields[num_items][1024]; size_t actual_read = 0; for (int i = 0; i < num_items; i++) { alignment_a = &alignments_aux[i]; memcpy(&cigars_test[i], &cigar_buffer[actual_read], alignment_a->cigar_len); cigars_test[i][alignment_a->cigar_len] = '\0'; actual_read += simple_a->cigar_len; char op; char op_value[1024]; int c = 0; int hc_start = 0, hc_end; for (int j = 0; j < alignment_a->cigar_len; j++) { op = cigars_test[j]; if (op < 58) { op_value[c++] = op; } else { op_value[c] = '\0'; if (op == 'H') { hc_start = atoi(op_value); } break; } } if (cigars_test[alignment_a->cigar_len - 1] == 'H') { for (int j = alignment_a->cigar_len - 2; j >= 0; j--) { op = cigars_test[j]; if (op < 58) { op_value[c++] = op; } else { op_value[c] = '\0'; int len = strlen(op_value); char op_val_aux[len]; int pos = len - 1; for (int j = 0; j < len; j++) { op_val_aux[j] = op_value[pos - j]; } hc_end = atoi(op_val_aux); break; } } } memcpy(&optional_fields[i], &cigar_buffer[actual_read], alignment_a->optional_fields_length); optional_fields[i][alignment_a->optional_fields_length] = '0'; actual_read += alignment_a->optional_fields_length; int header_len = strlen(fq_read->id); char header_id[header_len + 1]; get_to_first_blank(fq_read->id, header_len, header_id); //char *header_match = (char *)malloc(sizeof(char)*header_len); //memcpy(header_match, header_id, header_len); int len_read = fq_read->length - (hc_start + hc_end); char 
*quality = (char *) calloc (len_read + 1, sizeof(char)); strncpy(quality, fq_read->quality + hc_start, len_read); char *query = (char *) calloc (len_read + 1, sizeof(char)); strncpy(query, fq_read->query + hc_start, len_read); //Revisar rna_Server get_to_first_blank header copy alignment_t *alignment = alignment_new(); alignment_init_single_end(strdup(header_id), query, quality, alignment_a->seq_strand, alignment_a->chromosome, alignment_a->position, strdup(cigars_test[i]), alignment_a->num_cigar_operations, alignment_a->map_quality, 1, num_items < 1, alignment_a->optional_fields_length, strdup(optional_fields[i]), alignment); array_list_insert(alignment, list); } return 0; } */ void *file_reader(void *input) { wf_input_file_t *wf_input = (wf_input_file_t *) input; FILE *fd = wf_input->file; batch_t *batch = wf_input->batch; int pair_mode = batch->pair_input->pair_mng->pair_mode; const int MAX_READS = 100; int num_reads = 0; batch_t *new_batch = NULL; size_t tot_size; size_t num_items; char *buffer, *id, *sequence, *quality; size_t bytes; unsigned char type; array_list_t *reads = array_list_new(MAX_READS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); mapping_batch_t *mapping_batch = mapping_batch_new_2(MAX_READS, reads, batch->pair_input->pair_mng); while (1) { //[type][size head][size seq][num items] bytes = fread(&type, sizeof(unsigned char), 1, fd); if (!bytes) { break; } fastq_read_t *fq_read = file_read_fastq_reads(&num_items, fd); if (fq_read == NULL) { break; } mapping_batch->mapping_lists[num_reads] = array_list_new(50, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); //printf("(num items %i)\nID : %s\nSEQ: %s\nQUA: %s\n", num_items, fq_read->id, fq_read->sequence, fq_read->quality); array_list_insert(fq_read, reads); if (type == CAL_TYPE) { //printf("\tCal Report\n"); file_read_cals(num_items, mapping_batch->mapping_lists[num_reads], fq_read, fd); array_list_set_flag(BITEM_SINGLE_ANCHORS, mapping_batch->mapping_lists[num_reads]); } else if (type == META_ALIGNMENT_TYPE) { 
//printf("\tMeta Alignments Report\n"); array_list_set_flag(BITEM_META_ALIGNMENTS, mapping_batch->mapping_lists[num_reads]); file_read_meta_alignments(num_items, mapping_batch->mapping_lists[num_reads], fq_read, fd); } else { //printf("\tAlignments Report\n"); file_read_alignments(num_items, mapping_batch->mapping_lists[num_reads], fq_read, fd); } /*if (strcmp("@ENST00000496771@ENSG00000000003@processed_transcript@X@99887538@99891686@-1@KNOWN_518_447_1_0_0_0_4:0:0_3:0:0_3/1", fq_read->id) == 0) { exit(-1); }*/ num_reads++; if (num_reads >= MAX_READS) { break; } } //w2_r += num_reads; //printf("W2 Reads: %i\n", w2_r); if (num_reads) { mapping_batch->num_allocated_targets = num_reads; new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, batch->pair_input, batch->preprocess_rna, batch->sw_input, batch->writer_input, batch->mapping_mode, mapping_batch); } else { //array_list_free(reads, NULL); mapping_batch_free(mapping_batch); } extern size_t reads_w2; reads_w2 += num_reads; return new_batch; }