int sa_sam_writer(void *data) { sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data; sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch; if (mapping_batch == NULL) { printf("bam_writer1: error, NULL mapping batch\n"); return 0; } /* for (int i = 0; i < NUM_COUNTERS; i++) { counters[i] += mapping_batch->counters[i]; } */ #ifdef _TIMING for (int i = 0; i < NUM_TIMING; i++) { func_times[i] += mapping_batch->func_times[i]; } #endif int num_mismatches, num_cigar_ops; size_t flag, pnext = 0, tlen = 0; char *cigar_string, *cigar_M_string, *rnext = "*"; fastq_read_t *read; array_list_t *read_list = mapping_batch->fq_reads; array_list_t *mapping_list, *mate_list; FILE *out_file = (FILE *) wf_batch->writer_input->bam_file; sa_genome3_t *genome = wf_batch->sa_index->genome; size_t num_reads, num_mappings, num_mate_mappings; num_reads = mapping_batch->num_reads; if (mapping_batch->options->pair_mode != SINGLE_END_MODE) { // PAIR MODE int len; char *sequence, *quality; char *seq, *opt_fields; alignment_t *alig; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); // seq = read->sequence; /* if (i % 2 == 0) { mate_list = mapping_batch->mapping_lists[i+1]; num_mate_mappings = array_list_size(mate_list); } else { mate_list = mapping_list; num_mate_mappings = num_mappings; } */ mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { alig = (alignment_t *) array_list_get(j, mapping_list); /* // update alignment alig->secondary_alignment = 0; if (num_mate_mappings != 1) { alig->is_mate_mapped = 0; alig->is_paired_end_mapped = 0; alig->mate_strand = 0; } */ if (alig->optional_fields) { opt_fields = (char *) 
calloc(strlen(alig->optional_fields) + 100, sizeof(char)); sprintf(opt_fields, "NH:i:%i\t%s", num_mappings, alig->optional_fields); // sprintf(opt_fields, "NH:i:%i\t%s\tXU:i:%i", num_mappings, alig->optional_fields, mapping_batch->status[i]); } else { opt_fields = (char *) calloc(100, sizeof(char)); sprintf(opt_fields, "NH:i:%i", num_mappings); // sprintf(opt_fields, "NH:i:%i\tXU:i:%i", num_mappings, mapping_batch->status[i]); } /* // update alignment alig->secondary_alignment = 0; if (num_mate_mappings != 1) { alig->is_mate_mapped = 0; alig->is_paired_end_mapped = 0; alig->mate_strand = 0; } */ flag = 0; if (alig->is_paired_end) flag += BAM_FPAIRED; if (alig->is_paired_end_mapped) flag += BAM_FPROPER_PAIR; if (!alig->is_seq_mapped) flag += BAM_FUNMAP; if ((!alig->is_mate_mapped) && (alig->is_paired_end)) flag += BAM_FMUNMAP; if (alig->mate_strand) flag += BAM_FMREVERSE; if (alig->pair_num == 1) flag += BAM_FREAD1; if (alig->pair_num == 2) flag += BAM_FREAD2; if (alig->secondary_alignment) flag += BAM_FSECONDARY; if (alig->fails_quality_check) flag += BAM_FQCFAIL; if (alig->pc_optical_duplicate) flag += BAM_FDUP; if (alig->seq_strand) flag += BAM_FREVERSE; fprintf(out_file, "%s\t%lu\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n", read->id, flag, genome->chrom_names[alig->chromosome], alig->position + 1, (num_mappings > 1 ? 0 : alig->mapq), //60, //(alig->map_quality > 3 ? 0 : alig->map_quality), alig->cigar, (alig->chromosome == alig->mate_chromosome ? 
"=" : genome->chrom_names[alig->mate_chromosome]), alig->mate_position + 1, alig->template_length, alig->sequence, alig->quality, opt_fields ); // free memory free(opt_fields); alignment_free(alig); } // end for num_mappings } else { num_unmapped_reads++; opt_fields = (char *) calloc(100, sizeof(char)); sprintf(opt_fields, "XM:i:%i XU:i:%i", num_mappings, mapping_batch->status[i]); if (read->adapter) { len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { sequence = read->sequence; quality = read->quality; } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\t%s\n", read->id, sequence, quality, opt_fields ); free(opt_fields); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } } else { // SINGLE MODE int len, mapq; char *seq; seed_cal_t *cal; cigar_t *cigar; char *sequence, *revcomp, *quality; for (size_t i = 0; i < num_reads; i++) { read = (fastq_read_t *) array_list_get(i, read_list); mapping_list = mapping_batch->mapping_lists[i]; num_mappings = array_list_size(mapping_list); num_total_mappings += num_mappings; #ifdef _VERBOSE if (num_mappings > 1) { num_dup_reads++; num_total_dup_reads += num_mappings; } #endif if (num_mappings > 0) { num_mapped_reads++; if (num_mappings > 1) { num_multihit_reads++; } for (size_t j = 0; j < num_mappings; j++) { cal = (seed_cal_t *) array_list_get(j, mapping_list); if (read->adapter) { // sequences and cigar len = 
read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); revcomp = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); cigar = cigar_new_empty(); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ( (cal->strand == 1 && ((read->adapter_strand == 0 && read->adapter_length > 0) || (read->adapter_strand == 1 && read->adapter_length < 0))) || (cal->strand == 0 && ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0))) ) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); strcpy(revcomp, read->adapter_revcomp); strcat(revcomp, read->revcomp); cigar_append_op(abs(read->adapter_length), 'S', cigar); cigar_concat(&cal->cigar, cigar); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); strcpy(revcomp, read->revcomp); strcat(revcomp, read->adapter_revcomp); cigar_concat(&cal->cigar, cigar); cigar_append_op(read->adapter_length, 'S', cigar); } sequence[len] = 0; revcomp[len] = 0; quality[len] = 0; } else { // sequences and cigar sequence = read->sequence; revcomp = read->revcomp; quality = read->quality; cigar = &cal->cigar; } if (cal->strand) { flag = 16; seq = revcomp; } else { flag = 0; seq = sequence; } /* if (i == 0) { flag += BAM_FSECONDARY; } */ cigar_string = cigar_to_string(cigar); cigar_M_string = cigar_to_M_string(&num_mismatches, &num_cigar_ops, cigar); if (num_mappings > 1) { cal->mapq = 0; } fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%lu\t%i\t%s\t%s\tNH:i:%i\tNM:i:%i\n", read->id, flag, genome->chrom_names[cal->chromosome_id], cal->start + 1, (num_mappings == 1 ? 
cal->mapq : 0), cigar_M_string, rnext, pnext, tlen, seq, quality, num_mappings, num_mismatches ); // free memory free(cigar_M_string); free(cigar_string); seed_cal_free(cal); if (read->adapter) { free(sequence); free(revcomp); free(quality); cigar_free(cigar); } } } else { num_unmapped_reads++; if (read->adapter) { // sequences and cigar len = read->length + abs(read->adapter_length); sequence = (char *) malloc(len + 1); quality = (char *) malloc(len + 1); if (read->adapter_length < 0) { strcpy(quality, read->adapter_quality); strcat(quality, read->quality); } else { strcpy(quality, read->quality); strcat(quality, read->adapter_quality); } if ((read->adapter_strand == 0 && read->adapter_length < 0) || (read->adapter_strand == 1 && read->adapter_length > 0)) { strcpy(sequence, read->adapter); strcat(sequence, read->sequence); } else { strcpy(sequence, read->sequence); strcat(sequence, read->adapter); } sequence[len] = 0; quality[len] = 0; } else { // sequences sequence = read->sequence; quality = read->quality; } fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", read->id, sequence, quality ); if (read->adapter) { free(sequence); free(quality); } } array_list_free(mapping_list, (void *) NULL); } // end for num_reads } // free memory sa_mapping_batch_free(mapping_batch); if (wf_batch) sa_wf_batch_free(wf_batch); return 0; }
/**
 * Turns the seeds stored in a suffix manager into candidate alignment
 * locations (CALs) for `read` and appends the accepted ones to `cal_list`.
 *
 * Steps, as performed below:
 *   1. every seed in p->suffix_lists is copied into a fragment container;
 *   2. the fragments are sorted with cmp_slmatch_qsort;
 *   3. each run of fragments sharing the same `subject` is clustered with
 *      bl_slClusterSop or bl_slClusterLin, depending on info.chainmode;
 *   4. every chain that meets info.minscore and info.minfrag becomes a
 *      seed_cal_t, is extended with extend_seeds() and is kept only when
 *      its read_area is at least `min_area`;
 *   5. the chaining state is destroyed and the suffix manager is cleared.
 *
 * Returns immediately, without clearing `p`, when `p` is NULL, has no
 * suffix lists, or has no seeds.
 *
 * @param read      read the CALs are created for (stored in each CAL)
 * @param min_area  minimum cal->read_area for a CAL to be kept
 * @param strand    strand value stored in every seed and CAL created here
 * @param sa_index  index handed to extend_seeds()
 * @param cal_list  output list receiving the accepted CALs
 * @param p         suffix manager providing the seeds; cleared on exit
 */
void suffix_mng_create_cals(fastq_read_t *read, int min_area, int strand,
                            sa_index3_t *sa_index, array_list_t *cal_list,
                            suffix_mng_t *p)
{
    if (!p || !p->suffix_lists || p->num_seeds <= 0) {
        return;
    }

    claspinfo_t info;
    bl_claspinfoInit(&info);

    // fragment container sized for the seeds currently held by the manager
    info.fragments = (Container *) malloc(sizeof(Container));
    bl_containerInit(info.fragments, p->num_seeds, sizeof(slmatch_t));
    info.subject = p->subject;

    // step 1: one fragment per stored seed
    slmatch_t frag;
    for (unsigned int c = 0; c < p->num_chroms; c++) {
        linked_list_t *bucket = p->suffix_lists[c];
        if (!bucket) {
            continue;
        }

        for (linked_list_item_t *node = bucket->first; node != NULL; node = node->next) {
            seed_t *src = node->item;

            bl_slmatchInit(&frag, 0);
            frag.i = src->read_start;
            frag.j = src->read_end - src->read_start + 1;
            frag.p = src->genome_start;
            frag.q = src->genome_end - src->genome_start + 1;
            frag.scr = src->genome_end - src->genome_start + 1;
            frag.subject = src->chromosome_id;
            bl_containerAdd(info.fragments, &frag);
        }
    }

    // step 2: sort fragments
    qsort(info.fragments->contspace, bl_containerSize(info.fragments),
          sizeof(slmatch_t), cmp_slmatch_qsort);

    // steps 3 and 4: process each run fragment[begin] .. fragment[i - 1]
    int begin = 0;
    for (int i = 1; i <= bl_containerSize(info.fragments); i++) {
        // a run ends at the end of the list or when the subject changes
        int run_ends = (i == bl_containerSize(info.fragments));
        if (!run_ends) {
            slmatch_t *run_head = (slmatch_t *) bl_containerGet(info.fragments, begin);
            slmatch_t *current = (slmatch_t *) bl_containerGet(info.fragments, i);
            run_ends = (run_head->subject != current->subject);
        }
        if (!run_ends) {
            continue;
        }

        slmatch_t *run = (slmatch_t *) info.fragments->contspace + begin;
        if (info.chainmode == SOP) {
            bl_slClusterSop(run, i - begin, info.epsilon, info.lambda, info.maxgap);
        } else {
            bl_slClusterLin(run, i - begin, info.epsilon, info.lambda, info.maxgap);
        }

        for (int j = begin; j < i; j++) {
            slmatch_t *match = (slmatch_t *) bl_containerGet(info.fragments, j);
            if (!match->chain) {
                continue;
            }

            slchain_t *chain = (slchain_t *) match->chain;

            if (chain->scr >= info.minscore &&
                bl_containerSize(chain->matches) >= info.minfrag) {
                int chrom = atoi(*(char **) bl_containerGet(info.subject, chain->subject));
                linked_list_t *seeds = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);

                // rebuild one seed per chained fragment
                for (int k = 0; k < bl_containerSize(chain->matches); k++) {
                    slmatch_t *link = *(slmatch_t **) bl_containerGet(chain->matches, k);
                    seed_t *piece = seed_new(link->i, link->i + link->j - 1,
                                             link->p, link->p + link->q - 1);

                    piece->chromosome_id = chrom;
                    piece->strand = strand;
                    cigar_append_op(link->j, '=', &piece->cigar);
                    linked_list_insert_last(piece, seeds);
                }

                // extend seeds, then keep the CAL only if it covers enough of the read
                seed_cal_t *cal = seed_cal_new(chrom, strand, chain->p,
                                               chain->p + chain->q - 1, seeds);
                cal->read = read;
                extend_seeds(cal, sa_index);
                seed_cal_update_info(cal);

                if (cal->read_area >= min_area) {
                    array_list_insert(cal, cal_list);
                } else {
                    seed_cal_free(cal);
                }
            }

            // the chain is released whether or not it produced a CAL
            bl_slchainDestruct(chain);
            free(chain);
            match->chain = NULL;
        }

        begin = i;
    }

    // step 5: info.subject is borrowed from the manager, so it is detached
    // before the chaining state is destroyed
    info.subject = NULL;
    bl_claspinfoDestruct(&info);

    // finally, clear suffix manager
    suffix_mng_clear(p);
}