/**
 * Gap-filling / alignment-building stage for the bisulfite pipeline.
 *
 * First runs fill_gaps_bs(), merge_seed_regions_bs() and fill_end_gaps_bs()
 * twice over the mapping batch (trailing argument 1, then 0; presumably this
 * selects which of the two mapping lists is processed -- confirm against
 * those helpers). Note that the genome1/genome2 argument order is swapped
 * between the calls.
 *
 * Then, for both sets of targets (mapping_lists/targets and
 * mapping_lists2/targets2), every CAL whose normalized cigar score is
 * strictly greater than input->min_score is converted into an alignment_t.
 * The CAL list of each processed read is freed with cal_free and replaced,
 * in the same mapping_lists slot, by the list of alignments.
 *
 * @param input  stage input; genome1_p, genome2_p, sw_optarg and min_score
 *               are read from it
 * @param batch  batch whose mapping_batch is processed in place
 * @return BS_POST_PAIR_STAGE, the next stage identifier
 */
int apply_sw_bs_4nt(sw_server_input_t* input, batch_t *batch) {

  mapping_batch_t *mapping_batch = batch->mapping_batch;
  genome_t *genome1 = input->genome1_p;
  genome_t *genome2 = input->genome2_p;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  // fill gaps between seeds
  fill_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 5, 1);
  merge_seed_regions_bs(mapping_batch, 1);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 400, 1);

  fill_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 5, 0);
  merge_seed_regions_bs(mapping_batch, 0);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 400, 0);

  // now we can create the alignments
  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  char *match_seq, *match_qual;
  size_t read_index, read_len, match_len, match_start;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals;

  seed_region_t *s;
  cigar_code_t *cigar_code;
  cigar_op_t *first_op;

  float score, norm_score, min_score = input->min_score;

  alignment_t *alignment;
  array_list_t *alignment_list;

  char *p, *optional_fields;
  int optional_fields_length, AS, NH;

  array_list_t **mapping_lists;
  size_t num_targets;
  size_t *targets;

  for (int bs_id = 0; bs_id < 2; bs_id++) {

    // select the set of targets / mapping lists for this pass
    if (bs_id == 0) {
      mapping_lists = mapping_batch->mapping_lists;
      num_targets = mapping_batch->num_targets;
      targets = mapping_batch->targets;
    } else {
      mapping_lists = mapping_batch->mapping_lists2;
      num_targets = mapping_batch->num_targets2;
      targets = mapping_batch->targets2;
    }

    for (size_t i = 0; i < num_targets; i++) {
      read_index = targets[i];
      read = (fastq_read_t *) array_list_get(read_index, fq_batch);

      cal_list = mapping_lists[read_index];
      num_cals = array_list_size(cal_list);

      if (num_cals <= 0) continue;

      read_len = read->length;
      alignment_list = array_list_new(num_cals, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

      // processing each CAL from this read
      for (size_t j = 0; j < num_cals; j++) {

        // get cal and read index
        cal = array_list_get(j, cal_list);
        if (cal->sr_list->size == 0) continue;

        // the cigar of the CAL is stored in the info field of its first seed region
        s = (seed_region_t *) linked_list_get_first(cal->sr_list);
        cigar_code = (cigar_code_t *) s->info;

        norm_score = cigar_code_get_score(read_len, cigar_code);
        score = norm_score * 100;
        LOG_DEBUG_F("score = %0.2f\n", norm_score);

        // filter by SW score
        if (norm_score > min_score) {

          // update cigar and sequence and quality strings
          cigar_code_update(cigar_code);
          LOG_DEBUG_F("\tcigar code = %s\n", new_cigar_code_string(cigar_code));

          match_len = cigar_code_nt_length(cigar_code);
          first_op = cigar_code_get_first_op(cigar_code);
          // a leading 'H' operation shifts the start of the matched read segment
          match_start = (first_op && first_op->name == 'H' ? first_op->number : 0);

          match_seq = (char *) malloc((match_len + 1) * sizeof(char));
          memcpy(match_seq, &read->sequence[match_start], match_len);
          match_seq[match_len] = 0;

          match_qual = (char *) malloc((match_len + 1) * sizeof(char));
          memcpy(match_qual, &read->quality[match_start], match_len);
          match_qual[match_len] = 0;

          // set optional fields: each one is a 3-byte tag+type ("XXi")
          // followed by the raw bytes of an int
          optional_fields_length = 100;
          optional_fields = (char *) calloc(optional_fields_length, sizeof(char));

          p = optional_fields;

          // scale first, truncate afterwards: casting norm_score alone
          // would drop its fractional part before the multiplication
          AS = (int) score;

          memcpy(p, "ASi", 3);
          p += 3;
          memcpy(p, &AS, sizeof(int));
          p += sizeof(int);

          // num_cals is a size_t; go through an int so the bytes written
          // are the value itself regardless of byte order
          NH = (int) num_cals;

          memcpy(p, "NHi", 3);
          p += 3;
          memcpy(p, &NH, sizeof(int));
          p += sizeof(int);

          memcpy(p, "NMi", 3);
          p += 3;
          memcpy(p, &cigar_code->distance, sizeof(int));
          p += sizeof(int);

          assert(read->length == cigar_code_nt_length(cigar_code));

          // create an alignment and insert it into the list
          alignment = alignment_new();

          size_t header_len = strlen(read->id);
          char *head_id = (char *) malloc(header_len + 1);

          get_to_first_blank(read->id, header_len, head_id);

          alignment_init_single_end(head_id, match_seq, match_qual,
                                    cal->strand, cal->chromosome_id - 1, cal->start - 1,
                                    new_cigar_code_string(cigar_code),
                                    cigar_code_get_num_ops(cigar_code),
                                    norm_score * 254, 1, (num_cals > 1),
                                    optional_fields_length, optional_fields, alignment);

          array_list_insert(alignment, alignment_list);

          LOG_DEBUG_F("creating alignment (bs_id = %i)...\n", bs_id);
        }
      }

      // free the cal list, and update the mapping list with the alignment list
      array_list_free(cal_list, (void *) cal_free);
      mapping_lists[read_index] = alignment_list;
    }
  }

  // go to the next stage
  return BS_POST_PAIR_STAGE;
}
/**
 * Fills the gaps left between (and around) the seed regions of every CAL of
 * every target read in the batch, so that each seed region ends up carrying
 * a cigar in its info field.
 *
 * Pre-process: for each seed region a plain 'M' cigar is created. For each
 * gap (before the first seed, between two seeds, after the last seed) a new
 * seed region is inserted into the CAL's sr_list:
 *   - a first/last gap longer than min_gap gets an 'H' cigar;
 *   - two consecutive seeds with an empty read gap are merged into the
 *     previous seed with a 'D' operation;
 *   - a gap with equal read and genome length whose mismatch count is below
 *     the distance threshold gets an 'M' cigar;
 *   - otherwise query/reference (with flanks) are queued for Smith-Waterman.
 * cal->start and cal->end are updated when a first / last gap is found and
 * mapping_batch->num_gaps is incremented for every gap considered.
 *
 * Run + post-process: all queued pairs are aligned with
 * smith_waterman_mqmr() and the resulting cigar is attached to the
 * corresponding gap seed region.
 *
 * @param mapping_batch  batch to process (targets, mapping_lists, fq_batch)
 * @param sw_optarg      options forwarded to smith_waterman_mqmr()
 * @param genome         genome the reference sequences are read from
 * @param min_gap        first/last gaps longer than this are marked 'H'
 * @param min_distance   NOTE: the value passed in is ignored; the threshold
 *                       is recomputed per read as read_len * 0.2
 */
void fill_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg,
               genome_t *genome, int min_gap, int min_distance) {

  int sw_count = 0;

  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  size_t read_index, read_len;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals, num_targets = mapping_batch->num_targets;

  // reverse-complement of the current read, built lazily and freed per read
  char *revcomp_seq = NULL;

  seed_region_t *s, *prev_s, *new_s;
  linked_list_iterator_t* itr;

  cigar_code_t *cigar_code;

  size_t start, end;
  size_t gap_read_start, gap_read_end, gap_read_len;
  size_t gap_genome_start, gap_genome_end, gap_genome_len;

  int left_flank, right_flank;
  sw_prepare_t *sw_prepare;
  array_list_t *sw_prepare_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  char *query, *ref;
  int distance, first = 0, last = 0;

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_targets; i++) {

    read_index = mapping_batch->targets[i];
    read = (fastq_read_t *) array_list_get(read_index, fq_batch);

    cal_list = mapping_batch->mapping_lists[read_index];
    num_cals = array_list_size(cal_list);

    if (num_cals <= 0) continue;

    read_len = read->length;
    // the threshold is derived from the read length, overriding the argument
    min_distance = read_len*0.2;

    LOG_DEBUG_F(">>>>> read %s\n", read->id);

    // processing each CAL from this read
    for (size_t j = 0; j < num_cals; j++) {

      // get cal and read index
      cal = array_list_get(j, cal_list);
      LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, sr_duplicate_list size = %i\n",
                  j, num_cals, cal->strand, cal->sr_list->size, cal->sr_duplicate_list->size);

      prev_s = NULL;
      itr = linked_list_iterator_new(cal->sr_list);
      s = (seed_region_t *) linked_list_iterator_curr(itr);
      while (s != NULL) {

        // set the cigar for the current region
        gap_read_len = s->read_end - s->read_start + 1;
        cigar_code = cigar_code_new();
        cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
        s->info = (void *) cigar_code;

        cigar_code = NULL;
        sw_prepare = NULL;

        if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) {
          distance = 0;
          mapping_batch->num_gaps++;
          if (prev_s == NULL) {
            // gap at the first position
            gap_read_start = 0;
            gap_read_end = s->read_start - 1;

            gap_genome_start = s->genome_start - s->read_start;
            gap_genome_end = s->genome_start - 1;

            gap_read_len = gap_read_end - gap_read_start + 1;
            gap_genome_len = gap_genome_end - gap_genome_start + 1;

            cal->start = gap_genome_start;

            assert(gap_read_len != 0);
            assert(gap_genome_len != 0);

            if (gap_read_len > min_gap) {
              // the gap is too big, may be there's another CAL to cover it
              cigar_code = cigar_code_new();
              cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);
            } else {
              left_flank = 0;
              right_flank = DOUBLE_FLANK;
            }
          } else {
            assert(prev_s->read_end < s->read_start);

            // gap in a middle position
            gap_read_start = prev_s->read_end + 1;
            gap_read_end = s->read_start - 1;

            gap_genome_start = prev_s->genome_end + 1;
            gap_genome_end = s->genome_start - 1;

            gap_read_len = gap_read_end - gap_read_start + 1;
            gap_genome_len = gap_genome_end - gap_genome_start + 1;

            LOG_DEBUG_F("gap (read, genome) = (%i, %i)\n", gap_read_len, gap_genome_len);

            if (gap_genome_len == 0) { printf("#@#: %s\n", read->id); }
            assert(gap_genome_len != 0);

            if (gap_read_len == 0) {
              // there's a deletion just between two consecutives seeds:
              // fold the current seed into the previous one
              cigar_code = (cigar_code_t *)prev_s->info;

              cigar_code_append_op(cigar_op_new(gap_genome_len, 'D'), cigar_code);
              cigar_code->distance += gap_genome_len;

              cigar_code_append_op(cigar_op_new(s->read_end - s->read_start + 1, 'M'), cigar_code);
              cigar_code->distance += ((cigar_code_t *)s->info)->distance;

              prev_s->read_end = s->read_end;
              prev_s->genome_end = s->genome_end;

              LOG_DEBUG_F("prev cigar = %s\n", new_cigar_code_string((cigar_code_t *)prev_s->info));

              // continue loop... (prev_s is intentionally left unchanged)
              linked_list_iterator_remove(itr);
              s = linked_list_iterator_curr(itr);
              continue;
            }

            left_flank = SINGLE_FLANK;
            right_flank = SINGLE_FLANK;
          }

          if (!cigar_code) {
            // we have to try to fill this gap and get a cigar
            if (gap_read_len == gap_genome_len) {
              // 1) first, for from begin -> end, and begin <- end
              start = gap_genome_start;
              end = gap_genome_end;
              first = -1;
              last = -1;
              ref = (char *) malloc((gap_genome_len + 5) * sizeof(char));
              genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1,
                                                &start, &end, genome);
              // handle strand -
              if (cal->strand) {
                if (revcomp_seq == NULL) {
                  revcomp_seq = strdup(read->sequence);
                  seq_reverse_complementary(revcomp_seq, read_len);
                }
                query = &revcomp_seq[gap_read_start];
              } else {
                query = &read->sequence[gap_read_start];
              }

              for (int k = 0; k < gap_read_len; k++) {
                if (query[k] != ref[k]) {
                  distance++;
                  if (first == -1) first = k;
                  last = k;
                }
              }

              // this reference copy was only needed for the comparison above;
              // the SW path below allocates its own (flanked) reference
              free(ref);
              ref = NULL;

              if (distance < min_distance) {
                cigar_code = cigar_code_new();
                cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
                cigar_code_inc_distance(distance, cigar_code);
              }
            }

            if (!cigar_code) {
              // 2) second, prepare SW to run

              // get query sequence, revcomp if necessary
              size_t read_start = gap_read_start - left_flank;
              size_t read_end = gap_read_end + right_flank;
              int gap_read_len_ex = read_end - read_start + 1;
              query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));
              // handle strand -
              if (cal->strand) {
                if (revcomp_seq == NULL) {
                  revcomp_seq = strdup(read->sequence);
                  seq_reverse_complementary(revcomp_seq, read_len);
                }
                memcpy(query, &revcomp_seq[read_start], gap_read_len_ex);
              } else {
                memcpy(query, &read->sequence[read_start], gap_read_len_ex);
              }
              query[gap_read_len_ex] = '\0';

              // get ref. sequence
              size_t genome_start = gap_genome_start - left_flank;
              size_t genome_end = gap_genome_end + right_flank;
              int gap_genome_len_ex = genome_end - genome_start + 1;
              ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));
              genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1,
                                                &genome_start, &genome_end, genome);
              ref[gap_genome_len_ex] = '\0';

              // query and ref are handed over to the sw_prepare object
              if (prev_s == NULL) {
                sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, FIRST_SW);
              } else {
                sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, MIDDLE_SW);
              }

              array_list_insert(sw_prepare, sw_prepare_list);

              // increase counter
              sw_count++;

              LOG_DEBUG_F("query: %s\n", query);
              LOG_DEBUG_F("ref : %s\n", ref);
              LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n",
                          distance, min_distance, gap_read_len, first, last);
              LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n",
                          gap_read_start, gap_read_end, gap_genome_start, gap_genome_end,
                          gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1,
                          read->id);
            }
          }

          // insert gap in the list
          new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
          new_s->info = (void *) cigar_code;
          linked_list_iterator_insert(new_s, itr);

          if (sw_prepare) {
            sw_prepare->seed_region = new_s;
            sw_prepare->cal = cal;
            sw_prepare->read = read;
          }
        }

        // continue loop...
        prev_s = s;
        linked_list_iterator_next(itr);
        s = linked_list_iterator_curr(itr);
      }

      // check for a gap at the last position
      sw_prepare = NULL;
      if (prev_s != NULL && prev_s->read_end < read_len - 1) {
        cigar_code = NULL;
        mapping_batch->num_gaps++;

        // gap at the last position
        gap_read_start = prev_s->read_end + 1;
        gap_read_end = read_len - 1;
        gap_read_len = gap_read_end - gap_read_start + 1;

        assert(gap_read_len != 0);

        // the genome span of a trailing gap is taken equal to the read span
        gap_genome_len = gap_read_len;
        gap_genome_start = prev_s->genome_end + 1;
        gap_genome_end = gap_genome_start + gap_genome_len - 1;

        cal->end = gap_genome_end;

        assert(gap_genome_len != 0);

        if (gap_read_len > min_gap) {
          // the gap is too big, may be there's another CAL to cover it
          cigar_code = cigar_code_new();
          cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);
        } else {
          // we have to try to fill this gap and get a cigar

          // 1) first, for from begin -> end, and begin <- end
          start = gap_genome_start;
          end = gap_genome_end;
          first = -1;
          last = -1;
          ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));
          genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1,
                                            &start, &end, genome);
          // handle strand -
          if (cal->strand) {
            if (revcomp_seq == NULL) {
              revcomp_seq = strdup(read->sequence);
              seq_reverse_complementary(revcomp_seq, read_len);
            }
            query = &revcomp_seq[gap_read_start];
          } else {
            query = &read->sequence[gap_read_start];
          }

          distance = 0;
          for (int k = 0; k < gap_read_len; k++) {
            if (query[k] != ref[k]) {
              distance++;
              if (first == -1) first = k;
              last = k;
            }
          }

          // comparison-only buffer; the SW path allocates its own reference
          free(ref);
          ref = NULL;

          if (distance < min_distance) {
            cigar_code = cigar_code_new();
            cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
            cigar_code_inc_distance(distance, cigar_code);
          } else {
            // 2) second, prepare SW to run
            left_flank = DOUBLE_FLANK;
            right_flank = 0;

            // get query sequence, revcomp if necessary
            size_t read_start = gap_read_start - left_flank;
            size_t read_end = gap_read_end + right_flank;
            int gap_read_len_ex = read_end - read_start + 1;
            query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));
            // handle strand -
            if (cal->strand) {
              if (revcomp_seq == NULL) {
                revcomp_seq = strdup(read->sequence);
                seq_reverse_complementary(revcomp_seq, read_len);
              }
              memcpy(query, &revcomp_seq[read_start], gap_read_len_ex);
            } else {
              memcpy(query, &read->sequence[read_start], gap_read_len_ex);
            }
            query[gap_read_len_ex] = '\0';

            // get ref. sequence
            size_t genome_start = gap_genome_start - left_flank;
            size_t genome_end = gap_genome_end + right_flank;
            int gap_genome_len_ex = genome_end - genome_start + 1;
            ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));
            genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1,
                                              &genome_start, &genome_end, genome);
            // terminate the reference buffer (not the query) at its own length
            ref[gap_genome_len_ex] = '\0';

            sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, LAST_SW);
            array_list_insert(sw_prepare, sw_prepare_list);

            // increase counter
            sw_count++;

            LOG_DEBUG_F("query: %s\n", query);
            LOG_DEBUG_F("ref : %s\n", ref);
            LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n",
                        distance, min_distance, gap_read_len, first, last);
            LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n",
                        gap_read_start, gap_read_end, gap_genome_start, gap_genome_end,
                        gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1,
                        read->id);
          }
        }

        // insert gap in the list
        new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
        new_s->info = (void *) cigar_code;
        linked_list_insert_last(new_s, cal->sr_list);

        if (sw_prepare) {
          sw_prepare->seed_region = new_s;
          sw_prepare->cal = cal;
          sw_prepare->read = read;
        }
      }
      linked_list_iterator_free(itr);
    }

    // free memory
    if (revcomp_seq) {
      free(revcomp_seq);
      revcomp_seq = NULL;
    }
  }

  LOG_DEBUG_F("\nR U N S W (sw_count = %i, sw_prepare_list size = %i)\n", sw_count, array_list_size(sw_prepare_list));
  assert(sw_count == array_list_size(sw_prepare_list));

  // pointer tables handed to Smith-Waterman; kept on the heap because
  // sw_count is unbounded and may be zero
  char **q = NULL, **r = NULL;
  sw_multi_output_t *output = NULL;

  if (sw_count > 0) {
    q = (char **) malloc((size_t) sw_count * sizeof(char *));
    r = (char **) malloc((size_t) sw_count * sizeof(char *));
    if (q == NULL || r == NULL) {
      fprintf(stderr, "fill_gaps: unable to allocate memory for %i Smith-Waterman pairs\n", sw_count);
      exit(EXIT_FAILURE);
    }

    for (int i = 0; i < sw_count; i++) {
      sw_prepare = array_list_get(i, sw_prepare_list);
      q[i] = sw_prepare->query;
      r[i] = sw_prepare->ref;
    }

    output = sw_multi_output_new(sw_count);

    // run Smith-Waterman
    smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);
  }

  LOG_DEBUG("P O S T - P R O C E S S\n");
  cigar_op_t* cigar_op;
  for (int i = 0; i < sw_count; i++) {

    sw_prepare = array_list_get(i, sw_prepare_list);
    s = sw_prepare->seed_region;

    int read_gap_len = s->read_end - s->read_start + 1;
    int genome_gap_len = s->genome_end - s->genome_start + 1;

    LOG_DEBUG_F("\tgap (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n",
                s->read_start, s->read_end, s->genome_start, s->genome_end,
                read_gap_len, genome_gap_len, sw_prepare->read->id);
    LOG_DEBUG_F("\tflanks (left, right) = (%i, %i)\n", sw_prepare->left_flank, sw_prepare->right_flank);
    LOG_DEBUG_F("\tquery : %s\n", sw_prepare->query);
    LOG_DEBUG_F("\tref : %s\n", sw_prepare->ref);
    LOG_DEBUG_F("\tmquery: %s (start %i)\n", output->query_map_p[i], output->query_start_p[i]);
    LOG_DEBUG_F("\tmref : %s (start %i)\n", output->ref_map_p[i], output->ref_start_p[i]);

    cigar_code_t *cigar_c = generate_cigar_code(output->query_map_p[i], output->ref_map_p[i],
                                                strlen(output->query_map_p[i]),
                                                output->query_start_p[i], output->ref_start_p[i],
                                                read_gap_len, genome_gap_len,
                                                &distance, sw_prepare->ref_type);
    LOG_DEBUG_F("\tscore : %0.2f, cigar: %s (distance = %i)\n",
                output->score_p[i], new_cigar_code_string(cigar_c), distance);

    // normalize the boundary operations: a leading 'H' becomes 'I' when the
    // reference mapping starts at 0 and 'M' otherwise; a leading '=' becomes 'M'
    cigar_op = cigar_code_get_op(0, cigar_c);
    if (cigar_op) {
      if (cigar_op->name == 'H') {
        if (output->ref_start_p[i] == 0) {
          cigar_op->name = 'I';
        } else {
          cigar_op->name = 'M';
        }
      } else if (cigar_op->name == '=') cigar_op->name = 'M';
    }

    // a trailing 'H' becomes 'I'
    cigar_op = cigar_code_get_last_op(cigar_c);
    if (cigar_op && cigar_op->name == 'H') cigar_op->name = 'I';

    LOG_DEBUG_F("gap_read_len = %i, cigar_code_length (%s) = %i\n",
                read_gap_len, new_cigar_code_string(cigar_c), cigar_code_nt_length(cigar_c));
    assert(read_gap_len == cigar_code_nt_length(cigar_c));

    // and now set the cigar for this gap
    s->info = (void *) cigar_c;

    // free
    sw_prepare_free(sw_prepare);
  }

  display_sr_lists("END of fill_gaps", mapping_batch);

  // free memory
  if (output != NULL) {
    sw_multi_output_free(output);
  }
  free(q);
  free(r);
  // the sw_prepare items were already released one by one above
  array_list_free(sw_prepare_list, (void *) NULL);
}