/**
 * Runs Smith-Waterman for every CAL of every target read in the batch and
 * keeps the alignments whose raw score reaches input->min_score.
 *
 * On entry, batch->targets[0 .. num_targets) index the reads to process and
 * batch->mapping_lists[read] holds the CALs of each of them. Those CAL lists
 * are freed here. On exit, batch->targets / batch->num_targets describe the
 * reads that got at least one valid alignment, batch->mapping_lists[read]
 * holds their sw_output_t items (NULL for reads without valid alignments),
 * and batch->num_to_do counts the inserted alignments.
 *
 * @param input  server configuration (genome, flank length, SW parameters,
 *               minimum score)
 * @param batch  batch to process; updated in place
 */
void apply_sw(sw_server_input_t* input, aligner_batch_t *batch) {
    int tid = omp_get_thread_num();

    fastq_batch_t *fq_batch = batch->fq_batch;
    genome_t *genome = input->genome_p;
    size_t flank_length = input->flank_length;
    float min_score = input->min_score;
    sw_optarg_t *sw_optarg = &input->sw_optarg;
    size_t num_seqs = batch->num_targets;

    batch->num_done = batch->num_to_do;
    batch->num_to_do = 0;

    // Size the work buffers from the CALs that are really queued instead of
    // trusting a counter maintained elsewhere: a mismatch would either
    // overflow the arrays or hand uninitialized pointers to Smith-Waterman.
    size_t sw_total = 0;
    for (size_t i = 0; i < num_seqs; i++) {
        sw_total += array_list_size(batch->mapping_lists[batch->targets[i]]);
    }

    // A variable-length array must have at least one element.
    size_t capacity = (sw_total > 0) ? sw_total : 1;

    sw_multi_output_t *output = sw_multi_output_new(sw_total);

    char *q[capacity], *r[capacity];
    uint8_t strands[capacity], chromosomes[capacity];
    size_t starts[capacity], read_indices[capacity];
    size_t sw_count = 0;

    // Build one (query, reference) pair per CAL.
    for (size_t i = 0; i < num_seqs; i++) {
        size_t index = batch->targets[i];
        array_list_t *cal_list = batch->mapping_lists[index];
        size_t num_cals = array_list_size(cal_list);

        for (size_t j = 0; j < num_cals; j++) {
            cal_t *cal = array_list_get(j, cal_list);
            read_indices[sw_count] = index;

            // Query: private copy of the read, reverse-complemented for
            // CALs on strand 1.
            int read_len = fq_batch->data_indices[index + 1] - fq_batch->data_indices[index];
            q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
            memcpy(q[sw_count], &(fq_batch->seq[fq_batch->data_indices[index]]), read_len);
            if (cal->strand == 1) {
                seq_reverse_complementary(q[sw_count], read_len);
            }

            // Reference: the CAL region widened by the flank on both sides,
            // clamped to the chromosome (same clamping as apply_sw_bs) so
            // that the unsigned subtraction cannot wrap around.
            size_t start = (flank_length >= cal->start) ? 0 : cal->start - flank_length;
            size_t end = cal->end + flank_length;
            if (end >= genome->chr_size[cal->chromosome_id - 1]) {
                end = genome->chr_size[cal->chromosome_id - 1] - 1;
            }

            r[sw_count] = calloc(1, end - start + 2);
            genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
                                              cal->chromosome_id - 1, &start, &end, genome);

            // Remember where this reference came from for the output record.
            strands[sw_count] = cal->strand;
            chromosomes[sw_count] = cal->chromosome_id;
            starts[sw_count] = start;

            sw_count++;
        }

        // The CALs are no longer needed; the slot is reused below for the
        // list of valid alignments.
        array_list_free(cal_list, (void *)cal_free);
        batch->mapping_lists[index] = NULL;
    }

    // Run Smith-Waterman on exactly the pairs that were prepared.
    smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

    // Keep the alignments that reach the minimum score.
    size_t num_targets = 0;
    for (size_t i = 0; i < sw_count; i++) {
        if (output->score_p[i] >= min_score) {
            size_t index = read_indices[i];

            // Always look the list up: relying on the value left over from a
            // previous iteration is only correct by accident.
            array_list_t *mapping_list = batch->mapping_lists[index];
            if (mapping_list == NULL) {
                mapping_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
                array_list_set_flag(0, mapping_list);
                batch->mapping_lists[index] = mapping_list;
                batch->targets[num_targets++] = index;
            }

            // NOTE(review): the original passed an uninitialized float in
            // this position. The value below follows the formula that was
            // left disabled in the original code (score relative to the best
            // possible score of the mapped query) - confirm it matches what
            // sw_output_new expects.
            size_t mquery_len = strlen(output->query_map_p[i]);
            float max_score = (float) mquery_len * input->match;
            float score = (max_score != 0.0f) ? output->score_p[i] / max_score : 0.0f;

            sw_output_t *sw_output = sw_output_new(strands[i], chromosomes[i], starts[i],
                                                   strlen(r[i]), mquery_len,
                                                   output->query_start_p[i], output->ref_start_p[i],
                                                   output->score_p[i], score,
                                                   output->query_map_p[i], output->ref_map_p[i]);
            array_list_insert(sw_output, mapping_list);

            batch->num_to_do++;
        }

        // The query and reference copies belong to this function.
        free(q[i]);
        free(r[i]);
    }
    batch->num_targets = num_targets;

    // Per-thread statistics.
    thr_sw_items[tid] += sw_count;

    sw_multi_output_free(output);
}
int apply_sw_bs(sw_server_input_t* input, batch_t *batch) { int sw_3_nucleotides = 0; /* sw_optarg_t *sw_optarg2 = &input->sw_optarg; printf("Matrix Table\n\tA\tC\tG\tT\tN\nA\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nC\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nG\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nT\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\nN\t%+.0f\t%+.0f\t%+.0f\t%+.0f\t%+.0f\n\n", sw_optarg2->subst_matrix['A']['A'], sw_optarg2->subst_matrix['C']['A'], sw_optarg2->subst_matrix['G']['A'], sw_optarg2->subst_matrix['T']['A'], sw_optarg2->subst_matrix['N']['A'], sw_optarg2->subst_matrix['A']['C'], sw_optarg2->subst_matrix['C']['C'], sw_optarg2->subst_matrix['G']['C'], sw_optarg2->subst_matrix['T']['C'], sw_optarg2->subst_matrix['N']['C'], sw_optarg2->subst_matrix['A']['G'], sw_optarg2->subst_matrix['C']['G'], sw_optarg2->subst_matrix['G']['G'], sw_optarg2->subst_matrix['T']['G'], sw_optarg2->subst_matrix['N']['G'], sw_optarg2->subst_matrix['A']['T'], sw_optarg2->subst_matrix['C']['T'], sw_optarg2->subst_matrix['G']['T'], sw_optarg2->subst_matrix['T']['T'], sw_optarg2->subst_matrix['N']['T'], sw_optarg2->subst_matrix['A']['N'], sw_optarg2->subst_matrix['C']['N'], sw_optarg2->subst_matrix['G']['N'], sw_optarg2->subst_matrix['T']['N'], sw_optarg2->subst_matrix['N']['N'] ); */ if (sw_3_nucleotides == 0) { apply_sw_bs_4nt(input, batch); } else { //printf("START: apply_sw\n"); int tid = omp_get_thread_num(); mapping_batch_t *mapping_batch = batch->mapping_batch; cal_t *cal = NULL; array_list_t *cal_list = NULL, *mapping_list = NULL; array_list_t *fq_batch = mapping_batch->fq_batch; fastq_read_t *fq_read; // added by PP for bisulfite array_list_t *CT_fq_batch = mapping_batch->CT_fq_batch; array_list_t *GA_fq_batch = mapping_batch->GA_fq_batch; array_list_t *CT_rev_fq_batch = mapping_batch->CT_rev_fq_batch; array_list_t *GA_rev_fq_batch = mapping_batch->GA_rev_fq_batch; fastq_read_t *fq_read2; // end added by PP for bisulfite size_t start, end; size_t start2, end2; /* genome_t *genome = 
input->genome_p; */ // added by PP for bisulfite genome_t *genome1 = input->genome1_p; genome_t *genome2 = input->genome2_p; // end added by PP for bisulfite size_t flank_length = input->flank_length; // SIMD support for Smith-Waterman float score, min_score = input->min_score; sw_output_t *sw_output; size_t read_index, num_cals; size_t num_targets = mapping_batch->num_targets; size_t new_num_targets = 0; // added by PP for bisulfite size_t num_targets2 = mapping_batch->num_targets2; size_t new_num_targets2 = 0; // added by PP for bisulfite // added by PP for bisulfite size_t sw_total1 = mapping_batch->num_to_do; size_t sw_total2 = mapping_batch->num_to_do2; size_t sw_total = sw_total1 + sw_total2; // end added by PP for bisulfite // set to zero mapping_batch->num_to_do = 0; // added by PP for bisulfite mapping_batch->num_to_do2 = 0; int g[sw_total]; // end added by PP for bisulfite sw_optarg_t *sw_optarg = &input->sw_optarg; sw_multi_output_t *output = sw_multi_output_new(sw_total); char *q[sw_total], *r[sw_total]; uint8_t strands[sw_total], chromosomes[sw_total]; size_t starts[sw_total]; size_t sw_count = 0, read_indices[sw_total], sw_count2 = 0; int read_len, ref_len, max_ref_len; //printf("num of sw to do: %i\n", sw_total); // initialize query and reference sequences to Smith-Waterman for (size_t i = 0; i < num_targets; i++) { // printf("sw_server: target #%i of %i\n", i, num_seqs); read_index = mapping_batch->targets[i]; // to use with the three nucleotides searches fq_read = (fastq_read_t *) array_list_get(read_index, GA_fq_batch); fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch); //printf("read %lu = %s\n", read_index, fq_read->sequence); //printf("read %lu = %s\n", read_index, fq_read2->sequence); // printf("sw_server: read #%i\n", read_index); cal_list = mapping_batch->mapping_lists[read_index]; num_cals = array_list_size(cal_list); read_len = fq_read->length; // max_ref_len = read_len + (read_len / 2); //printf("sw_server: num_cals 
= %i cals\n", num_cals); // processing each CAL from this read for(size_t j = 0; j < num_cals; j++) { // get cal and read index cal = array_list_get(j, cal_list); read_indices[sw_count] = read_index; if (flank_length >= cal->start) { start = 0; } else { start = cal->start - flank_length; } end = cal->end + flank_length; if (end >= genome1->chr_size[cal->chromosome_id - 1]) { end = genome1->chr_size[cal->chromosome_id - 1] - 1; } ref_len = end - start + 2; // if (ref_len < max_ref_len) { // query sequence, revcomp if necessary q[sw_count] = (char *) calloc((read_len + 1), sizeof(char)); // to use with the three nucleotides searches if (cal->strand == 0) { memcpy(q[sw_count], fq_read->sequence, read_len); //seq_reverse_complementary(q[sw_count], read_len); } else { memcpy(q[sw_count], fq_read2->sequence, read_len); } //q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]); // reference sequence //printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end); r[sw_count] = calloc(1, end - start + 2); // to use with the three nucleotides searches if (cal->strand == 0) { genome_read_sequence_by_chr_index(r[sw_count], 0, cal->chromosome_id - 1, &start, &end, genome1); } else { genome_read_sequence_by_chr_index(r[sw_count], 0, cal->chromosome_id - 1, &start, &end, genome2); /* start2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - end; end2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - start; genome_read_sequence_by_chr_index(r[sw_count], 0, cal->chromosome_id - 1, &start2, &end2, genome2); */ } /* genome_read_sequence_by_chr_index(r[sw_count], cal->strand, cal->chromosome_id - 1, &start, &end, genome1); */ // save some stuff, we'll use them after... 
strands[sw_count] = cal->strand; chromosomes[sw_count] = cal->chromosome_id; starts[sw_count] = start; /* printf("st = %lu\tend = %lu\n", cal->start, cal->end); printf("1\nseq %s\ngen %s\nstrand %2lu chromo %lu start %lu end %lu\n", q[sw_count], r[sw_count], cal->strand, cal->chromosome_id, start, end); */ // increase counter sw_count++; } // free cal_list array_list_clear(cal_list, (void *) cal_free); // batch->mapping_lists[index] = NULL; } //////////////// sw_count2 = sw_count; for (size_t i = 0; i < num_targets2; i++) { // printf("sw_server: target #%i of %i\n", i, num_seqs); read_index = mapping_batch->targets2[i]; // to use with the three nucleotides searches fq_read = (fastq_read_t *) array_list_get(read_index, CT_fq_batch); fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch); //printf("read %lu = %s\n", read_index, fq_read->sequence); //printf("read %lu = %s\n", read_index, fq_read2->sequence); // printf("sw_server: read #%i\n", read_index); cal_list = mapping_batch->mapping_lists2[read_index]; num_cals = array_list_size(cal_list); read_len = fq_read->length; // max_ref_len = read_len + (read_len / 2); //printf("sw_server: num_cals = %i cals\n", num_cals); // processing each CAL from this read for(size_t j = 0; j < num_cals; j++) { // get cal and read index cal = array_list_get(j, cal_list); read_indices[sw_count] = read_index; if (flank_length >= cal->start) { start = 0; } else { start = cal->start - flank_length; } end = cal->end + flank_length; if (end >= genome1->chr_size[cal->chromosome_id - 1]) { end = genome1->chr_size[cal->chromosome_id - 1] - 1; } ref_len = end - start + 2; // if (ref_len < max_ref_len) { // query sequence, revcomp if necessary q[sw_count] = (char *) calloc((read_len + 1), sizeof(char)); // to use with the three nucleotides searches if (cal->strand == 0) { memcpy(q[sw_count], fq_read->sequence, read_len); //seq_reverse_complementary(q[sw_count], read_len); } else { memcpy(q[sw_count], fq_read2->sequence, 
read_len); } //q[sw_count] = &(fq_batch->seq[fq_batch->data_indices[index]]); // reference sequence //printf("\tSW: %d.[chromosome:%d]-[strand:%d]-[start:%d, end:%d]\n", j, cal->chromosome_id, cal->strand, cal->start, cal->end); r[sw_count] = calloc(1, end - start + 2); // to use with the three nucleotides searches if (cal->strand == 0) { genome_read_sequence_by_chr_index(r[sw_count], 0, cal->chromosome_id - 1, &start, &end, genome2); } else { genome_read_sequence_by_chr_index(r[sw_count], 0, cal->chromosome_id - 1, &start, &end, genome1); /* start2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - end; end2 = genome1->chr_size[cal->chromosome_id - 1] - 1 - start; genome_read_sequence_by_chr_index(r[sw_count], 0, cal->chromosome_id - 1, &start2, &end2, genome1); */ } /* genome_read_sequence_by_chr_index(r[sw_count], cal->strand, cal->chromosome_id - 1, &start, &end, genome2); */ // save some stuff, we'll use them after... strands[sw_count] = cal->strand; chromosomes[sw_count] = cal->chromosome_id; starts[sw_count] = start; //printf("2\nseq %s\ngen %s\nstrand %2lu chromo %lu start %lu end %lu\n", // q[sw_count], r[sw_count], cal->strand, cal->chromosome_id, start, end); // increase counter sw_count++; } // free cal_list array_list_clear(cal_list, (void *) cal_free); // batch->mapping_lists[index] = NULL; } //printf("before smith_waterman: sw_total = %i, sw_count = %i, sw_count2 = %i\n", sw_total, sw_count, sw_count2); // run Smith-Waterman // printf("before smith_waterman: sw_total = %i, sw_count = %i\n", sw_total, sw_count); smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output); // printf("after smith_waterman\n"); for (size_t i = 0; i < sw_count; i++) { LOG_DEBUG_F("cal: start = %lu, strand = %i\n", starts[i], strands[i]); LOG_DEBUG_F("\tquery : %s\n", q[i]); LOG_DEBUG_F("\tref. : %s\n", r[i]); LOG_DEBUG_F("\tquery map: %s (start: %i)\n", output->query_map_p[i], output->query_start_p[i]); LOG_DEBUG_F("\tref. 
map : %s (start: %i)\n", output->ref_map_p[i], output->ref_start_p[i]); LOG_DEBUG("\n"); } //size_t mapp = 0, mapp2 = 0; double norm_score; // filter alignments by min_score for (size_t i = 0; i < sw_count2; i++) { read_index = read_indices[i]; fq_read = (fastq_read_t *) array_list_get(read_index, GA_fq_batch); fq_read2 = (fastq_read_t *) array_list_get(read_index, GA_rev_fq_batch); read_len = fq_read->length; norm_score = NORM_SCORE(output->score_p[i], read_len, input->match); if (norm_score >= min_score) { // valid mappings, //insert in the list for further processing mapping_list = mapping_batch->mapping_lists[read_index]; array_list_set_flag(0, mapping_list); if (array_list_size(mapping_list) == 0) { mapping_batch->targets[new_num_targets++] = read_index; //mapp++; } sw_output = sw_output_new(strands[i], chromosomes[i], starts[i], strlen(r[i]), strlen(output->query_map_p[i]), output->query_start_p[i], output->ref_start_p[i], output->score_p[i], norm_score, output->query_map_p[i], output->ref_map_p[i]); array_list_insert(sw_output, mapping_list); mapping_batch->num_to_do++; } // free query and reference free(q[i]); free(r[i]); } mapping_batch->num_targets = new_num_targets; for (size_t i = sw_count2; i < sw_count; i++) { read_index = read_indices[i]; fq_read = (fastq_read_t *) array_list_get(read_index, CT_fq_batch); fq_read2 = (fastq_read_t *) array_list_get(read_index, CT_rev_fq_batch); read_len = fq_read->length; norm_score = NORM_SCORE(output->score_p[i], read_len, input->match); if (norm_score >= min_score) { // valid mappings, //insert in the list for further processing mapping_list = mapping_batch->mapping_lists2[read_index]; array_list_set_flag(0, mapping_list); if (array_list_size(mapping_list) == 0) { mapping_batch->targets2[new_num_targets2++] = read_index; //mapp2++; } sw_output = sw_output_new(strands[i], chromosomes[i], starts[i], strlen(r[i]), strlen(output->query_map_p[i]), output->query_start_p[i], output->ref_start_p[i], output->score_p[i], 
norm_score, output->query_map_p[i], output->ref_map_p[i]); array_list_insert(sw_output, mapping_list); mapping_batch->num_to_do2++; } // free query and reference free(q[i]); free(r[i]); } mapping_batch->num_targets2 = new_num_targets2; // update counter // thr_sw_items[tid] += sw_count; // free sw_multi_output_free(output); // go to the next stage /* printf("3 SW1 \t%3lu\tmapp \t%3lu\tno map (discard) \t%3lu\n", num_targets, mapp, num_targets - mapp); printf("3 SW2 \t%3lu\tmapp \t%3lu\tno map (discard) \t%3lu\n", num_targets2, mapp2, num_targets2 - mapp2); */ //printf("END: apply_sw, (%d Smith-Waterman)\n", sw_total); } //return CONSUMER_STAGE; return BS_POST_PAIR_STAGE; // printf("END: apply_sw, (%d Smith-Waterman, %d valids)\n", total, valids); }
void fill_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg, genome_t *genome, int min_gap, int min_distance) { int sw_count = 0; fastq_read_t *read; array_list_t *fq_batch = mapping_batch->fq_batch; size_t read_index, read_len; cal_t *cal; array_list_t *cal_list = NULL; size_t num_cals, num_targets = mapping_batch->num_targets; char *revcomp_seq = NULL; seed_region_t *s, *prev_s, *new_s; linked_list_iterator_t* itr; cigar_code_t *cigar_code; size_t start, end; size_t gap_read_start, gap_read_end, gap_read_len; size_t gap_genome_start, gap_genome_end, gap_genome_len; int left_flank, right_flank; sw_prepare_t *sw_prepare; array_list_t *sw_prepare_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED); char *query, *ref; int distance, first = 0, last = 0; // LOG_DEBUG("\n\n P R E - P R O C E S S\n"); // initialize query and reference sequences to Smith-Waterman for (size_t i = 0; i < num_targets; i++) { read_index = mapping_batch->targets[i]; read = (fastq_read_t *) array_list_get(read_index, fq_batch); cal_list = mapping_batch->mapping_lists[read_index]; num_cals = array_list_size(cal_list); if (num_cals <= 0) continue; read_len = read->length; min_distance = read_len*0.2; LOG_DEBUG_F(">>>>> read %s\n", read->id); // printf(">>>>> read %s\n", read->id); // processing each CAL from this read for(size_t j = 0; j < num_cals; j++) { // get cal and read index cal = array_list_get(j, cal_list); LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, sr_duplicate_list size = %i\n", j, num_cals, cal->strand, cal->sr_list->size, cal->sr_duplicate_list->size); prev_s = NULL; itr = linked_list_iterator_new(cal->sr_list); s = (seed_region_t *) linked_list_iterator_curr(itr); while (s != NULL) { { // for debugging size_t start = s->genome_start;// + 1; size_t end = s->genome_end;// + 1; size_t len = end - start + 1; // printf(":::::::::: %lu - %lu = %i ::::::::::::\n", end, start, len ); char *ref = (char *) malloc((len + 1) * sizeof(char)); 
genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &start, &end, genome); ref[len] = '\0'; // LOG_DEBUG_F("\tseed: [%i|%i - %i|%i] %s (len = %i)\n", s->genome_start, s->read_start, s->read_end, s->genome_end, ref, len); free(ref); } // set the cigar for the current region gap_read_len = s->read_end - s->read_start + 1; cigar_code = cigar_code_new(); cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code); s->info = (void *) cigar_code; cigar_code = NULL; sw_prepare = NULL; if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) { distance = 0; mapping_batch->num_gaps++; if (prev_s == NULL) { // gap at the first position gap_read_start = 0; gap_read_end = s->read_start - 1; gap_genome_start = s->genome_start - s->read_start; gap_genome_end = s->genome_start - 1; gap_read_len = gap_read_end - gap_read_start + 1; gap_genome_len = gap_genome_end - gap_genome_start + 1; cal->start = gap_genome_start; assert(gap_read_len != 0); assert(gap_genome_len != 0); if (gap_read_len > min_gap) { // the gap is too big, may be there's another CAL to cover it cigar_code = cigar_code_new(); cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code); } else { left_flank = 0; right_flank = DOUBLE_FLANK; } } else { assert(prev_s->read_end < s->read_start); // gap in a middle position gap_read_start = prev_s->read_end + 1; gap_read_end = s->read_start - 1; gap_genome_start = prev_s->genome_end + 1; gap_genome_end = s->genome_start - 1; gap_read_len = gap_read_end - gap_read_start + 1; gap_genome_len = gap_genome_end - gap_genome_start + 1; LOG_DEBUG_F("gap (read, genome) = (%i, %i)\n", gap_read_len, gap_genome_len); if (gap_genome_len == 0) { printf("#@#: %s\n", read->id); } assert(gap_genome_len != 0); if (gap_read_len == 0) { // there's a deletion just between two consecutives seeds cigar_code = (cigar_code_t *)prev_s->info; cigar_code_append_op(cigar_op_new(gap_genome_len, 'D'), cigar_code); cigar_code->distance += gap_genome_len; 
cigar_code_append_op(cigar_op_new(s->read_end - s->read_start + 1, 'M'), cigar_code); cigar_code->distance += ((cigar_code_t *)s->info)->distance; prev_s->read_end = s->read_end; prev_s->genome_end = s->genome_end; LOG_DEBUG_F("prev cigar = %s\n", new_cigar_code_string((cigar_code_t *)prev_s->info)); // continue loop... linked_list_iterator_remove(itr); s = linked_list_iterator_curr(itr); continue; } left_flank = SINGLE_FLANK; right_flank = SINGLE_FLANK; } if (!cigar_code) { // we have to try to fill this gap and get a cigar if (gap_read_len == gap_genome_len) { // 1) first, for from begin -> end, and begin <- end start = gap_genome_start;// + 1; end = gap_genome_end;// + 1; first = -1; last = -1; ref = (char *) malloc((gap_genome_len + 5) * sizeof(char)); genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &start, &end, genome); // handle strand - if (cal->strand) { if (revcomp_seq == NULL) { revcomp_seq = strdup(read->sequence); seq_reverse_complementary(revcomp_seq, read_len); } query = &revcomp_seq[gap_read_start]; } else { query = &read->sequence[gap_read_start]; } for (int k = 0; k < gap_read_len; k++) { if (query[k] != ref[k]) { distance++; if (first == -1) first = k; last = k; } } if (distance < min_distance) { cigar_code = cigar_code_new(); cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code); cigar_code_inc_distance(distance, cigar_code); } } if (!cigar_code) { // 2) second, prepare SW to run // get query sequence, revcomp if necessary size_t read_start = gap_read_start - left_flank; size_t read_end = gap_read_end + right_flank; int gap_read_len_ex = read_end - read_start + 1; query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char)); // handle strand - if (cal->strand) { if (revcomp_seq == NULL) { revcomp_seq = strdup(read->sequence); seq_reverse_complementary(revcomp_seq, read_len); } memcpy(query, &revcomp_seq[read_start], gap_read_len_ex); } else { memcpy(query, &read->sequence[read_start], gap_read_len_ex); } 
query[gap_read_len_ex] = '\0'; // get ref. sequence size_t genome_start = gap_genome_start - left_flank;// + 1; size_t genome_end = gap_genome_end + right_flank;// + 1; int gap_genome_len_ex = genome_end - genome_start + 1; ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));; genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &genome_start, &genome_end, genome); ref[gap_genome_len_ex] = '\0'; if (prev_s == NULL) { sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, FIRST_SW); } else { sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, MIDDLE_SW); } array_list_insert(sw_prepare, sw_prepare_list); // increase counter sw_count++; LOG_DEBUG_F("query: %s\n", query); LOG_DEBUG_F("ref : %s\n", ref); LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", distance, min_distance, gap_read_len, first, last); LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, read->id); } } // insert gap in the list new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0); new_s->info = (void *) cigar_code; linked_list_iterator_insert(new_s, itr); if (sw_prepare) { sw_prepare->seed_region = new_s; sw_prepare->cal = cal; sw_prepare->read = read; } } // continue loop... 
prev_s = s; linked_list_iterator_next(itr); s = linked_list_iterator_curr(itr); } // check for a gap at the last position sw_prepare = NULL; if (prev_s != NULL && prev_s->read_end < read_len - 1) { cigar_code = NULL; mapping_batch->num_gaps++; // mapping_batch->num_sws++; // mapping_batch->num_ext_sws++; // gap at the last position gap_read_start = prev_s->read_end + 1; gap_read_end = read_len - 1; gap_read_len = gap_read_end - gap_read_start + 1; assert(gap_read_len != 0); gap_genome_len = gap_read_len; gap_genome_start = prev_s->genome_end + 1; gap_genome_end = gap_genome_start + gap_genome_len - 1; cal->end = gap_genome_end; assert(gap_genome_len != 0); // LOG_DEBUG_F("\t\tgap_read_len = %i, gap_genome_len = %i\n", gap_read_len, gap_genome_len); // LOG_DEBUG_F("\t\t%i : [%lu|%lu - %lu|%lu]\n", // sw_count, gap_genome_start, gap_read_start, gap_read_end, gap_genome_end); if (gap_read_len > min_gap) { // the gap is too big, may be there's another CAL to cover it cigar_code = cigar_code_new(); cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code); } else { // we have to try to fill this gap and get a cigar // 1) first, for from begin -> end, and begin <- end start = gap_genome_start;// + 1; end = gap_genome_end;// + 1; first = -1; last = -1; ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));; genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &start, &end, genome); // handle strand - if (cal->strand) { if (revcomp_seq == NULL) { revcomp_seq = strdup(read->sequence); seq_reverse_complementary(revcomp_seq, read_len); } query = &revcomp_seq[gap_read_start]; } else { query = &read->sequence[gap_read_start]; } distance = 0; for (int k = 0; k < gap_read_len; k++) { if (query[k] != ref[k]) { distance++; if (first == -1) first = k; last = k; } } if (distance < min_distance) { cigar_code = cigar_code_new(); cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code); cigar_code_inc_distance(distance, cigar_code); } else { // 2) 
second, prepare SW to run left_flank = DOUBLE_FLANK; right_flank = 0; // get query sequence, revcomp if necessary size_t read_start = gap_read_start - left_flank; size_t read_end = gap_read_end + right_flank; int gap_read_len_ex = read_end - read_start + 1; query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char)); // handle strand - if (cal->strand) { if (revcomp_seq == NULL) { revcomp_seq = strdup(read->sequence); seq_reverse_complementary(revcomp_seq, read_len); } memcpy(query, &revcomp_seq[read_start], gap_read_len_ex); } else { memcpy(query, &read->sequence[read_start], gap_read_len_ex); } query[gap_read_len_ex] = '\0'; // get ref. sequence size_t genome_start = gap_genome_start - left_flank;// + 1; size_t genome_end = gap_genome_end + right_flank;// + 1; int gap_genome_len_ex = genome_end - genome_start + 1; ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));; genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, &genome_start, &genome_end, genome); query[gap_genome_len_ex] = '\0'; sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, LAST_SW); array_list_insert(sw_prepare, sw_prepare_list); // increase counter sw_count++; LOG_DEBUG_F("query: %s\n", query); LOG_DEBUG_F("ref : %s\n", ref); LOG_DEBUG_F("dist.: %i (min. 
%i) of %i (first = %i, last = %i)\n", distance, min_distance, gap_read_len, first, last); LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, read->id); } } // insert gap in the list new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0); new_s->info = (void *) cigar_code; linked_list_insert_last(new_s, cal->sr_list); if (sw_prepare) { sw_prepare->seed_region = new_s; sw_prepare->cal = cal; sw_prepare->read = read; } } linked_list_iterator_free(itr); } // free memory if (revcomp_seq) { free(revcomp_seq); revcomp_seq = NULL; } } // display_sr_lists("ATER pre-process in fill_gaps", mapping_batch); LOG_DEBUG_F("\nR U N S W (sw_count = %i, sw_prepare_list size = %i)\n", sw_count, array_list_size(sw_prepare_list)); assert(sw_count == array_list_size(sw_prepare_list)); char *q[sw_count], *r[sw_count]; for (int i = 0; i < sw_count; i++) { sw_prepare = array_list_get(i, sw_prepare_list); q[i] = sw_prepare->query; r[i] = sw_prepare->ref; } sw_multi_output_t *output = sw_multi_output_new(sw_count); // run Smith-Waterman smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output); LOG_DEBUG("P O S T - P R O C E S S\n"); cigar_op_t* cigar_op; for (int i = 0; i < sw_count; i++) { sw_prepare = array_list_get(i, sw_prepare_list); s = sw_prepare->seed_region; int read_gap_len = s->read_end - s->read_start + 1; int genome_gap_len = s->genome_end - s->genome_start + 1; int read_gap_len_ex = read_gap_len_ex + sw_prepare->left_flank + sw_prepare->right_flank; int genome_gap_len_ex = genome_gap_len_ex + sw_prepare->left_flank + sw_prepare->right_flank; LOG_DEBUG_F("\tgap (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", s->read_start, s->read_end, s->genome_start, s->genome_end, read_gap_len, genome_gap_len, sw_prepare->read->id); LOG_DEBUG_F("\tflanks (left, right) = (%i, %i)\n", 
sw_prepare->left_flank, sw_prepare->right_flank); LOG_DEBUG_F("\tquery : %s\n", sw_prepare->query); LOG_DEBUG_F("\tref : %s\n", sw_prepare->ref); LOG_DEBUG_F("\tmquery: %s (start %i)\n", output->query_map_p[i], output->query_start_p[i]); LOG_DEBUG_F("\tmref : %s (start %i)\n", output->ref_map_p[i], output->ref_start_p[i]); cigar_code_t *cigar_c = generate_cigar_code(output->query_map_p[i], output->ref_map_p[i], strlen(output->query_map_p[i]), output->query_start_p[i], output->ref_start_p[i], read_gap_len, genome_gap_len, &distance, sw_prepare->ref_type); LOG_DEBUG_F("\tscore : %0.2f, cigar: %s (distance = %i)\n", output->score_p[i], new_cigar_code_string(cigar_c), distance); /* if (output->query_start_p[i] > 0 && output->ref_start_p[i] > 0 && output->query_start_p[i] != output->ref_start_p[i]) { LOG_DEBUG("both map start points > 0 and are different lengths"); exit(-1); } */ // assert(output->query_start_p[i] == 0); // assert(output->ref_start_p[i] == 0); cigar_op = cigar_code_get_op(0, cigar_c); if (cigar_op) { if (cigar_op->name == 'H') { if (output->ref_start_p[i] == 0) { cigar_op->name = 'I'; } else { cigar_op->name = 'M'; } } else if (cigar_op->name == '=') cigar_op->name = 'M'; } cigar_op = cigar_code_get_last_op(cigar_c); if (cigar_op && cigar_op->name == 'H') cigar_op->name = 'I'; LOG_DEBUG_F("gap_read_len = %i, cigar_code_length (%s) = %i\n", read_gap_len, new_cigar_code_string(cigar_c), cigar_code_nt_length(cigar_c)); assert(read_gap_len == cigar_code_nt_length(cigar_c)); /* if (cigar_code_get_num_ops(cigar_c) > 2) { if (sw_prepare->left_flank > 0) { cigar_op = cigar_code_get_op(0, cigar_c); assert(cigar_op->number >= sw_prepare->left_flank && cigar_op->name == 'M'); cigar_op->number -= sw_prepare->left_flank; } if (sw_prepare->right_flank > 0) { cigar_op = cigar_code_get_last_op(cigar_c); assert(cigar_op->number >= sw_prepare->right_flank && cigar_op->name == 'M'); cigar_op->number -= sw_prepare->right_flank; } init_cigar_string(cigar_c); 
LOG_DEBUG_F("\tnew cigar: %s\n", new_cigar_code_string(cigar_c)); } else { assert(cigar_code_get_num_ops(cigar_c) == 1); if (sw_prepare->right_flank > 0) { cigar_op = cigar_code_get_last_op(cigar_c); assert(cigar_op->number >= sw_prepare->right_flank && cigar_op->name == 'M'); cigar_op->number -= (sw_prepare->left_flank + sw_prepare->right_flank); if (cigar_op->number > read_gap_len) { cigar_code_append_op(cigar_op_new(cigar_op->number - read_gap_len, 'D'), cigar_c); } else if (cigar_op->number < read_gap_len) { cigar_code_append_op(cigar_op_new(read_gap_len - cigar_op->number, 'I'), cigar_c); } else{ init_cigar_string(cigar_c); } // LOG_DEBUG_F("\tnew cigar: %s\n", new_cigar_code_string(cigar_c)); } } */ // and now set the cigar for this gap s->info = (void *) cigar_c; // free sw_prepare_free(sw_prepare); } display_sr_lists("END of fill_gaps", mapping_batch); // free memory sw_multi_output_free(output); array_list_free(sw_prepare_list, (void *) NULL); }