/*
 * Allocate an output handle and open the format-specific writer behind it.
 *
 *   writer, writer_data  forwarded unchanged to the format-specific opener
 *   fmt                  output format; stored in out->fmt and used to pick
 *                        which member of out->x is initialised
 *   opts                 option flags; QUIP_OPT_SAM_BAM is OR-ed in for
 *                        QUIP_FMT_BAM before the SAM opener is called
 *   aux                  forwarded to the SAM/BAM and quip openers
 *   ref                  forwarded to the quip opener only
 *
 * Returns the handle obtained from malloc_or_die.  QUIP_FMT_UNDEFINED and
 * any value that is not a known format are reported through quip_error, so
 * a handle with an uninitialised out->x is not handed back silently (this
 * matches quip_write, which also rejects unknown formats).
 */
quip_out_t* quip_out_open(
        quip_writer_t     writer,
        void*             writer_data,
        quip_fmt_t        fmt,
        quip_opt_t        opts,
        const quip_aux_t* aux,
        const seqmap_t*   ref)
{
    quip_out_t* out = malloc_or_die(sizeof *out);
    out->fmt = fmt;

    switch (fmt) {
        case QUIP_FMT_FASTQ:
            out->x.fastq = quip_fastq_out_open(writer, writer_data, opts);
            break;

        case QUIP_FMT_BAM:
            /* BAM shares the SAM writer; the option flag selects BAM output. */
            opts |= QUIP_OPT_SAM_BAM;
            /* fallthrough */

        case QUIP_FMT_SAM:
            out->x.sam = quip_sam_out_open(writer, writer_data, opts, aux);
            break;

        case QUIP_FMT_QUIP:
            out->x.quip = quip_quip_out_open(writer, writer_data, opts, aux, ref);
            break;

        case QUIP_FMT_NULL:
            out->x.null = NULL;
            break;

        case QUIP_FMT_UNDEFINED:
        default:
            /* NOTE(review): quip_error is presumed not to return -- confirm. */
            quip_error("Undefined format given.");
    }

    return out;
}
/*
 * Hand one read to the writer that matches the handle's format.
 *
 *   out  output handle; out->fmt selects which member of out->x is used
 *   sr   the read to write
 *
 * SAM and BAM share one writer, QUIP_FMT_NULL discards the read, and any
 * other format is reported through quip_error.
 */
void quip_write(quip_out_t* out, short_read_t* sr)
{
    if (out->fmt == QUIP_FMT_FASTQ) {
        quip_fastq_write(out->x.fastq, sr);
    }
    else if (out->fmt == QUIP_FMT_BAM || out->fmt == QUIP_FMT_SAM) {
        quip_sam_write(out->x.sam, sr);
    }
    else if (out->fmt == QUIP_FMT_QUIP) {
        quip_quip_write(out->x.quip, sr);
    }
    else if (out->fmt != QUIP_FMT_NULL) {
        quip_error("Write called on an unsupported format.");
    }
    /* QUIP_FMT_NULL: nothing to do. */
}
/*
 * Decode the nucleotides of a read that was encoded against a reference.
 *
 *   E       decoder state (arithmetic coder, models, reference map)
 *   r       read being rebuilt; r->seqname, r->pos, r->strand and r->cigar
 *           are read, r->seq is overwritten
 *   seqlen  number of nucleotides to produce
 *
 * Steps, in stream order:
 *   1. decode the per-position N mask,
 *   2. walk the cigar, taking each non-N base either from the reference
 *      (on a decoded match) or from an explicitly decoded nucleotide,
 *   3. reverse-complement the result when r->strand is set.
 *
 * The cigar is checked against seqlen BEFORE anything is written to
 * r->seq.s.  Previously the check ran after the walk, so a cigar that
 * consumed more than seqlen bases had already written past the buffer by
 * the time it was rejected.
 *
 * NOTE(review): quip_error is presumed not to return (refseq is used after
 * the NULL check below) -- confirm.
 */
static void seqenc_decode_reference_alignment(
        seqenc_t* E, short_read_t* r, size_t seqlen)
{
    const twobit_t* refseq = seqmap_get(E->ref, (const char*) r->seqname.s);
    if (refseq == NULL) {
        quip_error(
            "A read was aligned to sequence %s, which was not found in the reference.",
            r->seqname.s);
    }

    size_t i; /* cigar operation */
    size_t j; /* position within the cigar op */

    /* Validate up front: the operations that consume read bases must add up
     * to exactly seqlen.  The comparison is written so that the running sum
     * can never exceed seqlen, hence never overflow. */
    size_t query_len = 0;
    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                if (r->cigar.lens[i] > seqlen - query_len) {
                    quip_error("Cigar operations do not account for full read length.");
                }
                query_len += r->cigar.lens[i];
                break;

            default:
                break;
        }
    }

    if (query_len != seqlen) {
        quip_error("Cigar operations do not account for full read length.");
    }

    str_reserve(&r->seq, seqlen + 1);
    r->seq.n = 0;

    /* decode N mask: positions left as '\0' are filled in by the cigar walk */
    reserve_nmask(E, seqlen);
    memset(r->seq.s, '\0', seqlen + 1);
    for (i = 0; i < seqlen; ++i) {
        if (dist2_decode(E->ac, &E->d_nmask[i])) r->seq.s[i] = 'N';
    }

    uint32_t ref_pos  = r->pos;
    size_t   read_pos = 0;
    kmer_t   y; /* reference nucleotide */

    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos, ++ref_pos) {
                    /* N positions carry no further symbols in the stream */
                    if (r->seq.s[read_pos] == 'N') continue;

                    if (dist2_decode(E->ac, &E->d_ref_match) == SEQENC_REF_MATCH) {
                        y = twobit_get(refseq, ref_pos);
                        r->seq.s[read_pos] = kmertochar[y];
                    }
                    else {
                        r->seq.s[read_pos] =
                            kmertochar[dist4_decode(E->ac, &E->d_ref_ins_nuc)];
                    }
                }
                break;

            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                /* read bases with no reference counterpart: always explicit */
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos) {
                    if (r->seq.s[read_pos] == 'N') continue;
                    r->seq.s[read_pos] =
                        kmertochar[dist4_decode(E->ac, &E->d_ref_ins_nuc)];
                }
                break;

            case BAM_CDEL:
            case BAM_CREF_SKIP:
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CHARD_CLIP:
                /* NOTE(review): the SAM spec lists H as consuming no
                 * reference bases, yet ref_pos is advanced here.
                 * seqenc_encode_reference_alignment does the same, and the
                 * two must stay in step for streams to decode, so this is
                 * deliberately left unchanged. */
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CPAD:
                quip_error("Unsupported cigar operation.");
                break;

            default:
                break;
        }
    }

    r->seq.s[seqlen] = '\0';
    r->seq.n = seqlen;

    if (r->strand) str_revcomp(r->seq.s, r->seq.n);
}
/*
 * Encode the nucleotides of an aligned read relative to its reference.
 *
 *   E  encoder state (arithmetic coder, models, reference map, scratch
 *      buffer E->tmpseq)
 *   r  the read; r->seqname, r->pos, r->strand, r->seq and r->cigar are read
 *
 * Steps, in stream order:
 *   1. copy the sequence to E->tmpseq, reverse-complementing it when
 *      r->strand is set,
 *   2. encode the per-position N mask,
 *   3. walk the cigar: aligned non-N bases are coded as match / mismatch
 *      against the reference (a mismatch is followed by the nucleotide),
 *      inserted and soft-clipped non-N bases are coded explicitly.
 *
 * The cigar is checked against the read length BEFORE any base is looked
 * at.  Previously the check ran after the walk, so a cigar that consumed
 * more bases than the read holds had already read past the end of
 * E->tmpseq by the time it was rejected.
 *
 * NOTE(review): quip_error is presumed not to return (refseq is used after
 * the NULL check below) -- confirm.
 */
void seqenc_encode_reference_alignment(
        seqenc_t* E, const short_read_t* r)
{
    const twobit_t* refseq = seqmap_get(E->ref, (const char*) r->seqname.s);
    if (refseq == NULL) {
        quip_error(
            "A read was aligned to sequence %s, which was not found in the reference.\n",
            r->seqname.s);
    }

    size_t i; /* cigar operation */
    size_t j; /* position within the cigar op */

    /* Validate up front: the operations that consume read bases must add up
     * to exactly the read length.  The comparison is written so that the
     * running sum can never exceed seqlen, hence never overflow. */
    const size_t seqlen = r->seq.n;
    size_t query_len = 0;
    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                if (r->cigar.lens[i] > seqlen - query_len) {
                    quip_error("Cigar operations do not account for full read length.");
                }
                query_len += r->cigar.lens[i];
                break;

            default:
                break;
        }
    }

    if (query_len != seqlen) {
        quip_error("Cigar operations do not account for full read length.");
    }

    str_copy(&E->tmpseq, &r->seq);
    if (r->strand) {
        str_revcomp(E->tmpseq.s, E->tmpseq.n);
    }

    /* encode N mask */
    reserve_nmask(E, r->seq.n);
    for (i = 0; i < r->seq.n; ++i) {
        dist2_encode(E->ac, &E->d_nmask[i], E->tmpseq.s[i] == 'N' ? 1 : 0);
    }

    uint32_t ref_pos  = r->pos;
    size_t   read_pos = 0;
    kmer_t   x; /* read nucleotide */
    kmer_t   y; /* reference nucleotide */

    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos, ++ref_pos) {
                    /* N positions are fully described by the mask */
                    if (E->tmpseq.s[read_pos] == 'N') continue;

                    x = chartokmer[E->tmpseq.s[read_pos]];
                    y = twobit_get(refseq, ref_pos);

                    if (x == y) {
                        dist2_encode(E->ac, &E->d_ref_match, SEQENC_REF_MATCH);
                    }
                    else {
                        dist2_encode(E->ac, &E->d_ref_match, SEQENC_REF_MISMATCH);
                        dist4_encode(E->ac, &E->d_ref_ins_nuc, x);
                    }
                }
                break;

            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                /* read bases with no reference counterpart: always explicit */
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos) {
                    if (E->tmpseq.s[read_pos] == 'N') continue;
                    dist4_encode(E->ac, &E->d_ref_ins_nuc,
                                 chartokmer[E->tmpseq.s[read_pos]]);
                }
                break;

            case BAM_CDEL:
            case BAM_CREF_SKIP:
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CHARD_CLIP:
                /* NOTE(review): the SAM spec lists H as consuming no
                 * reference bases, yet ref_pos is advanced here.
                 * seqenc_decode_reference_alignment does the same, and the
                 * two must stay in step for streams to decode, so this is
                 * deliberately left unchanged. */
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CPAD:
                quip_error("Cigar PAD operation is unsupported.");
                break;

            default:
                break;
        }
    }
}
/*
 * Decode the alignment fields of a read: flags, mapping quality, template
 * length, and -- depending on the flags -- reference name, position, cigar,
 * mate reference name and mate position.
 *
 *   E             decoder state
 *   x             read being filled in
 *   seqlen        length of the read's sequence; the decoded cigar must
 *                 account for exactly this many bases
 *   quip_version  stream format version; selects how the mate's reference
 *                 name was stored
 *
 * x->cigar.n is always reset.  x->seqname, x->pos and the cigar contents
 * are written only when BAM_FUNMAP is clear; x->mate_seqname and
 * x->mate_pos only when BAM_FMUNMAP is clear.
 *
 * The order of the *_decode calls is the stream layout and must mirror
 * seqenc_encode_extras exactly.
 *
 * The cigar length is accumulated in 64 bits.  With the former uint32_t
 * accumulator, a corrupt stream whose lengths wrapped modulo 2^32 back onto
 * seqlen would have passed the consistency check.
 */
void seqenc_decode_extras(seqenc_t* E, short_read_t* x, size_t seqlen,
                          uint8_t quip_version)
{
    x->flags    = uint32_enc_decode(E->ac, &E->d_ext_flags);
    x->strand   = (x->flags & BAM_FREVERSE) ? 1 : 0;
    x->map_qual = dist256_decode(E->ac, &E->d_ext_map_qual);
    x->tlen     = uint32_enc_decode(E->ac, &E->d_ext_tlen);
    x->cigar.n  = 0;

    /* index of the reference sequence whose position model is used */
    uint32_t seqidx = 0;

    if ((x->flags & BAM_FUNMAP) == 0) {
        decode_seqname(E, &x->seqname);
        seqidx = get_seq_idx(E, &x->seqname);

        /* position: either a one-byte offset from the previous read's
         * position, or an absolute value modelled per reference sequence */
        if (dist2_decode(E->ac, &E->d_ext_pos_off_flag)) {
            x->pos = E->last_ref_pos + dist256_decode(E->ac, &E->d_ext_pos_off);
        }
        else {
            x->pos = uint32_enc_decode(E->ac, &E->d_ext_pos[seqidx]);
        }
        E->last_ref_pos = x->pos;

        /* 9 is the conditioning context for the first operation
         * (presumably chosen because it is not a cigar op code -- confirm) */
        uint8_t  last_op  = 9;
        size_t   i        = 0;
        uint64_t cigarlen = 0; /* read bases consumed by the cigar */

        x->cigar.n = uint32_enc_decode(E->ac, &E->d_ext_cigar_n);
        cigar_reserve(&x->cigar, x->cigar.n);

        for (i = 0; i < x->cigar.n; ++i) {
            /* each op is conditioned on the previous one; each length is
             * modelled separately per op
             * NOTE(review): the op is used directly as an index into
             * d_ext_cigar_len -- confirm that array covers every value
             * cond_dist16_decode can return. */
            x->cigar.ops[i]  = cond_dist16_decode(E->ac, &E->d_ext_cigar_op, last_op);
            x->cigar.lens[i] = uint32_enc_decode(E->ac, &E->d_ext_cigar_len[x->cigar.ops[i]]);

            if (x->cigar.ops[i] != BAM_CDEL &&
                x->cigar.ops[i] != BAM_CREF_SKIP &&
                x->cigar.ops[i] != BAM_CHARD_CLIP)
            {
                cigarlen += x->cigar.lens[i];
            }

            last_op = x->cigar.ops[i];
        }

        if (cigarlen != seqlen) {
            quip_error("Cigar operations do not account for full read length.");
        }
    }

    if ((x->flags & BAM_FMUNMAP) == 0) {
        if (quip_version >= 4) {
            /* the "same sequence as the read" flag is present only when the
             * read itself is mapped; otherwise the name is always explicit */
            if ((x->flags & BAM_FUNMAP) == 0) {
                if (dist2_decode(E->ac, &E->d_ext_mate_sameseq)) {
                    str_copy(&x->mate_seqname, &x->seqname);
                }
                else {
                    decode_seqname(E, &x->mate_seqname);
                }
            }
            else {
                decode_seqname(E, &x->mate_seqname);
            }

            seqidx = get_seq_idx(E, &x->mate_seqname);
        }
        else {
            /* layout of streams older than version 4: the flag is always
             * present, and seqidx is refreshed only when a name is decoded */
            if (dist2_decode(E->ac, &E->d_ext_mate_sameseq)) {
                str_copy(&x->mate_seqname, &x->seqname);
            }
            else {
                decode_seqname(E, &x->mate_seqname);
                seqidx = get_seq_idx(E, &x->mate_seqname);
            }
        }

        x->mate_pos = uint32_enc_decode(E->ac, &E->d_ext_pos[seqidx]);
    }
}
/*
 * Encode the alignment fields of a read: flags, mapping quality, template
 * length, and -- depending on the flags -- reference name, position, cigar,
 * mate reference name and mate position.
 *
 *   E             encoder state
 *   x             the read to encode
 *   quip_version  stream format version; selects how the mate's reference
 *                 name is stored
 *
 * Reference name, position and cigar are written only when BAM_FUNMAP is
 * clear; mate name and mate position only when BAM_FMUNMAP is clear.
 *
 * The order of the *_encode calls is the stream layout and must mirror
 * seqenc_decode_extras exactly.
 *
 * The cigar length is accumulated in 64 bits.  With the former uint32_t
 * accumulator, a malformed cigar whose lengths wrapped modulo 2^32 back
 * onto the read length would have passed the consistency check.
 */
void seqenc_encode_extras(seqenc_t* E, const short_read_t* x,
                          uint8_t quip_version)
{
    uint32_enc_encode(E->ac, &E->d_ext_flags, x->flags);
    dist256_encode(E->ac, &E->d_ext_map_qual, x->map_qual);
    uint32_enc_encode(E->ac, &E->d_ext_tlen, x->tlen);

    /* index of the reference sequence whose position model is used */
    uint32_t seqidx = 0;

    if ((x->flags & BAM_FUNMAP) == 0) {
        encode_seqname(E, &x->seqname);
        seqidx = get_seq_idx(E, &x->seqname);

        /* position: a one-byte forward offset from the previous read when
         * it fits, otherwise the absolute value */
        if (x->pos < E->last_ref_pos || x->pos - E->last_ref_pos >= 256) {
            dist2_encode(E->ac, &E->d_ext_pos_off_flag, 0);
            uint32_enc_encode(E->ac, &E->d_ext_pos[seqidx], x->pos);
        }
        else {
            dist2_encode(E->ac, &E->d_ext_pos_off_flag, 1);
            dist256_encode(E->ac, &E->d_ext_pos_off, x->pos - E->last_ref_pos);
        }
        E->last_ref_pos = x->pos;

        uint64_t cigarlen = 0; /* read bases consumed by the cigar */

        /* 9 is the conditioning context for the first operation
         * (presumably chosen because it is not a cigar op code -- confirm) */
        uint8_t last_op = 9;
        size_t  i;

        uint32_enc_encode(E->ac, &E->d_ext_cigar_n, x->cigar.n);
        for (i = 0; i < x->cigar.n; ++i) {
            /* each op is conditioned on the previous one; each length is
             * modelled separately per op
             * NOTE(review): the op is used directly as an index into
             * d_ext_cigar_len -- confirm that array covers every op value
             * the input can carry. */
            cond_dist16_encode(E->ac, &E->d_ext_cigar_op, last_op, x->cigar.ops[i]);
            uint32_enc_encode(E->ac, &E->d_ext_cigar_len[x->cigar.ops[i]], x->cigar.lens[i]);
            last_op = x->cigar.ops[i];

            if (x->cigar.ops[i] != BAM_CDEL &&
                x->cigar.ops[i] != BAM_CREF_SKIP &&
                x->cigar.ops[i] != BAM_CHARD_CLIP)
            {
                cigarlen += x->cigar.lens[i];
            }
        }

        if (cigarlen != x->seq.n) {
            quip_error("Cigar operations do not account for full read length.");
        }
    }

    if ((x->flags & BAM_FMUNMAP) == 0) {
        if (quip_version >= 4) {
            /* the "same sequence as the read" flag is written only when the
             * read itself is mapped; otherwise the name is always explicit */
            if ((x->flags & BAM_FUNMAP) == 0) {
                if (strcmp((char*) x->seqname.s, (char*) x->mate_seqname.s) == 0) {
                    dist2_encode(E->ac, &E->d_ext_mate_sameseq, 1);
                }
                else {
                    dist2_encode(E->ac, &E->d_ext_mate_sameseq, 0);
                    encode_seqname(E, &x->mate_seqname);
                }
            }
            else {
                encode_seqname(E, &x->mate_seqname);
            }

            seqidx = get_seq_idx(E, &x->mate_seqname);
        }
        else {
            /* layout of streams older than version 4: the flag is always
             * written; a mate name of "=" counts as "same sequence", and
             * seqidx is refreshed only when a name is written */
            if ((x->mate_seqname.n == 1 && x->mate_seqname.s[0] == '=') ||
                ((x->flags & BAM_FUNMAP) == 0 &&
                 strcmp((char*) x->seqname.s, (char*) x->mate_seqname.s) == 0))
            {
                dist2_encode(E->ac, &E->d_ext_mate_sameseq, 1);
            }
            else {
                dist2_encode(E->ac, &E->d_ext_mate_sameseq, 0);
                encode_seqname(E, &x->mate_seqname);
                seqidx = get_seq_idx(E, &x->mate_seqname);
            }
        }

        uint32_enc_encode(E->ac, &E->d_ext_pos[seqidx], x->mate_pos);
    }
}