Example #1
0
/*
 * Allocate an output handle and open the backend that matches `fmt`.
 *
 *   writer, writer_data  forwarded unchanged to the backend's open function
 *   fmt                  output format; stored in the handle as out->fmt
 *   opts                 forwarded to the backend; QUIP_OPT_SAM_BAM is or'ed
 *                        in when fmt is QUIP_FMT_BAM
 *   aux                  forwarded to the SAM/BAM and quip backends only
 *   ref                  forwarded to the quip backend only
 *
 * Returns the newly allocated handle. QUIP_FMT_UNDEFINED and any value not
 * listed below are reported through quip_error.
 * NOTE(review): quip_error is presumed not to return; if it does, the handle
 * is returned with its backend union left unset -- confirm.
 */
quip_out_t* quip_out_open(
              quip_writer_t     writer,
              void*             writer_data,
              quip_fmt_t        fmt,
              quip_opt_t        opts,
              const quip_aux_t* aux,
              const seqmap_t*   ref)
{
    quip_out_t* out = malloc_or_die(sizeof *out);
    out->fmt = fmt;

    switch (fmt) {
        case QUIP_FMT_FASTQ:
            out->x.fastq = quip_fastq_out_open(writer, writer_data, opts);
            break;

        case QUIP_FMT_BAM:
            /* BAM shares the SAM backend; the option bit selects BAM. */
            opts |= QUIP_OPT_SAM_BAM;
            /* fallthrough */

        case QUIP_FMT_SAM:
            out->x.sam = quip_sam_out_open(writer, writer_data, opts, aux);
            break;

        case QUIP_FMT_QUIP:
            out->x.quip = quip_quip_out_open(writer, writer_data, opts, aux, ref);
            break;

        case QUIP_FMT_NULL:
            out->x.null = NULL;
            break;

        case QUIP_FMT_UNDEFINED:
        default:
            /* Never hand back a handle whose backend was not opened. */
            quip_error("Undefined format given.");
            break;
    }

    return out;
}
Example #2
0
/*
 * Write one read to an open output handle.
 *
 * Dispatches on the format stored in out->fmt: FASTQ, SAM/BAM (which share
 * one writer) and quip each forward `sr` to their backend; the null format
 * discards the read; anything else is reported through quip_error.
 */
void quip_write(quip_out_t* out, short_read_t* sr)
{
    if (out->fmt == QUIP_FMT_FASTQ) {
        quip_fastq_write(out->x.fastq, sr);
    }
    else if (out->fmt == QUIP_FMT_BAM || out->fmt == QUIP_FMT_SAM) {
        quip_sam_write(out->x.sam, sr);
    }
    else if (out->fmt == QUIP_FMT_QUIP) {
        quip_quip_write(out->x.quip, sr);
    }
    else if (out->fmt != QUIP_FMT_NULL) {
        /* QUIP_FMT_NULL is a deliberate no-op; everything else is an error. */
        quip_error("Write called on an unsupported format.");
    }
}
Example #3
0
/*
 * Decode the nucleotides of a reference-aligned read into r->seq.
 *
 * Reads r->seqname, r->pos, r->cigar and r->strand, which the caller must
 * already have filled in. The reference sequence named by r->seqname is
 * looked up in E->ref; the N mask is decoded first, then the cigar is walked
 * to recover every non-N base, either from the reference or from an
 * explicitly coded nucleotide. On return r->seq holds `seqlen` bases plus a
 * terminating NUL, reverse-complemented when r->strand is set.
 *
 * The cigar is validated against `seqlen` BEFORE any base is written: the
 * walk below stores to r->seq.s[read_pos], and only seqlen + 1 bytes are
 * reserved, so a cigar that consumes more read bases than `seqlen` would
 * otherwise write out of bounds before the mismatch was noticed.
 *
 * NOTE(review): quip_error is presumed not to return -- confirm.
 */
static void seqenc_decode_reference_alignment(seqenc_t* E, short_read_t* r, size_t seqlen)
{
    const twobit_t* refseq = seqmap_get(E->ref, (const char*) r->seqname.s);

    if (refseq == NULL) {
        quip_error(
            "A read was aligned to sequence %s, which was not found in the reference.",
            r->seqname.s);
    }

    size_t i; /* sequence index, then cigar operation index */
    size_t j; /* position within the cigar op */

    /* Sum the lengths of the operations that advance read_pos in the walk
     * below. A 64-bit accumulator is used so the sum cannot wrap. No decoder
     * state is touched here. */
    uint64_t consumed = 0;
    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                consumed += r->cigar.lens[i];
                break;

            default:
                break;
        }
    }

    if (consumed != seqlen) {
        quip_error("Cigar operations do not account for full read length.");
    }

    str_reserve(&r->seq, seqlen + 1);
    r->seq.n = 0;

    /* decode N mask: masked positions become 'N', all others stay NUL until
     * the cigar walk fills them in */
    reserve_nmask(E, seqlen);
    memset(r->seq.s, '\0', seqlen + 1);
    for (i = 0; i < seqlen; ++i) {
        if (dist2_decode(E->ac, &E->d_nmask[i])) r->seq.s[i] = 'N';
    }

    uint32_t ref_pos   = r->pos;
    uint32_t read_pos  = 0;

    kmer_t y; /* reference nucleotide */

    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
                /* Aligned bases: one match/mismatch flag per non-N base,
                 * followed by an explicit nucleotide on mismatch. */
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos, ++ref_pos) {
                    if (r->seq.s[read_pos] == 'N') continue;

                    if (dist2_decode(E->ac, &E->d_ref_match) == SEQENC_REF_MATCH) {
                        y = twobit_get(refseq, ref_pos);
                        r->seq.s[read_pos] = kmertochar[y];
                    }
                    else {
                        r->seq.s[read_pos] = kmertochar[dist4_decode(E->ac, &E->d_ref_ins_nuc)];
                    }
                }
                break;

            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                /* Bases with no reference counterpart are coded explicitly. */
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos) {
                    if (r->seq.s[read_pos] == 'N') continue;
                    r->seq.s[read_pos] = kmertochar[dist4_decode(E->ac, &E->d_ref_ins_nuc)];
                }
                break;

            case BAM_CDEL:
            case BAM_CREF_SKIP:
            case BAM_CHARD_CLIP:
                /* Only the reference position advances for these ops. */
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CPAD:
                quip_error("Unsupported cigar operation.");
                break;
        }
    }
    r->seq.s[seqlen] = '\0';
    r->seq.n = seqlen;

    if (read_pos != seqlen) {
        quip_error("Cigar operations do not account for full read length.");
    }

    if (r->strand) str_revcomp(r->seq.s, r->seq.n);
}
Example #4
0
/*
 * Encode the nucleotides of a reference-aligned read.
 *
 * The reference sequence named by r->seqname is looked up in E->ref. The read
 * is copied into E->tmpseq (reverse-complemented when r->strand is set), its
 * N mask is encoded, and the cigar is then walked: each non-N aligned base is
 * coded as a match/mismatch flag against the reference (plus the nucleotide
 * on mismatch), and each non-N inserted or soft-clipped base is coded
 * explicitly.
 *
 * The cigar is validated against r->seq.n BEFORE the walk: the walk indexes
 * E->tmpseq.s[read_pos], so a cigar that consumes more read bases than the
 * read holds would otherwise read past the end of the copy before the
 * mismatch was noticed.
 *
 * NOTE(review): quip_error is presumed not to return -- confirm.
 */
void seqenc_encode_reference_alignment(
        seqenc_t* E, const short_read_t* r)
{
    const twobit_t* refseq = seqmap_get(E->ref, (const char*) r->seqname.s);

    if (refseq == NULL) {
        quip_error(
            "A read was aligned to sequence %s, which was not found in the reference.\n",
            r->seqname.s);
    }

    size_t i; /* sequence index, then cigar operation index */
    size_t j; /* position within the cigar op */

    /* Sum the lengths of the operations that advance read_pos in the walk
     * below. A 64-bit accumulator is used so the sum cannot wrap. Nothing is
     * emitted to the encoder here. */
    uint64_t consumed = 0;
    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                consumed += r->cigar.lens[i];
                break;

            default:
                break;
        }
    }

    if (consumed != r->seq.n) {
        quip_error("Cigar operations do not account for full read length.");
    }

    str_copy(&E->tmpseq, &r->seq);
    if (r->strand) {
        str_revcomp(E->tmpseq.s, E->tmpseq.n);
    }

    /* encode N mask */
    reserve_nmask(E, r->seq.n);
    for (i = 0; i < r->seq.n; ++i) {
        dist2_encode(E->ac, &E->d_nmask[i],
            E->tmpseq.s[i] == 'N' ? 1 : 0);
    }

    uint32_t ref_pos   = r->pos;
    uint32_t read_pos  = 0;

    kmer_t x; /* read nucleotide */
    kmer_t y; /* reference nucleotide */

    for (i = 0; i < r->cigar.n; ++i) {
        switch (r->cigar.ops[i]) {
            case BAM_CEQUAL:
            case BAM_CDIFF:
            case BAM_CMATCH:
                /* Aligned bases: a match/mismatch flag per non-N base, and
                 * the read nucleotide itself on mismatch. */
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos, ++ref_pos) {
                    if (E->tmpseq.s[read_pos] == 'N') continue;

                    x = chartokmer[E->tmpseq.s[read_pos]];
                    y = twobit_get(refseq, ref_pos);

                    if (x == y) {
                        dist2_encode(E->ac, &E->d_ref_match, SEQENC_REF_MATCH);
                    }
                    else {
                        dist2_encode(E->ac, &E->d_ref_match, SEQENC_REF_MISMATCH);
                        dist4_encode(E->ac, &E->d_ref_ins_nuc, chartokmer[E->tmpseq.s[read_pos]]);
                    }
                }
                break;

            case BAM_CINS:
            case BAM_CSOFT_CLIP:
                /* Bases with no reference counterpart are coded explicitly. */
                for (j = 0; j < r->cigar.lens[i]; ++j, ++read_pos) {
                    if (E->tmpseq.s[read_pos] == 'N') continue;
                    dist4_encode(E->ac, &E->d_ref_ins_nuc, chartokmer[E->tmpseq.s[read_pos]]);
                }
                break;

            case BAM_CDEL:
            case BAM_CREF_SKIP:
            case BAM_CHARD_CLIP:
                /* Only the reference position advances for these ops. */
                ref_pos += r->cigar.lens[i];
                break;

            case BAM_CPAD:
                quip_error("Cigar PAD operation is unsupported.");
                break;
        }
    }

    if (read_pos != r->seq.n) {
        quip_error("Cigar operations do not account for full read length.");
    }
}
Example #5
0
/*
 * Decode the alignment metadata of one read into `x`.
 *
 * Always decoded: flags (x->strand is derived from BAM_FREVERSE), mapping
 * quality and template length.
 *
 * If the read is mapped (BAM_FUNMAP clear): the sequence name, the position
 * (either a small offset from E->last_ref_pos or an absolute value coded per
 * sequence index) and the cigar. The summed length of all cigar ops other
 * than DEL, REF_SKIP and HARD_CLIP must equal `seqlen`, otherwise quip_error
 * is called. If the read is unmapped, x->cigar.n is set to 0 and the name and
 * position are left untouched.
 *
 * If the mate is mapped (BAM_FMUNMAP clear): the mate's sequence name and
 * position. `quip_version` selects the layout: from version 4 on, the
 * "same sequence as the read" flag is only present when the read itself is
 * mapped; earlier versions always carry it.
 *
 * The cigar length is accumulated in 64 bits: the individual lengths are
 * 32-bit values, and a 32-bit sum could wrap around and falsely match
 * `seqlen`.
 */
void seqenc_decode_extras(seqenc_t* E, short_read_t* x, size_t seqlen,
                          uint8_t quip_version)
{
    x->flags    = uint32_enc_decode(E->ac, &E->d_ext_flags);
    x->strand   = (x->flags & BAM_FREVERSE) ? 1 : 0;
    x->map_qual = dist256_decode(E->ac, &E->d_ext_map_qual);
    x->tlen     = uint32_enc_decode(E->ac, &E->d_ext_tlen);

    x->cigar.n = 0;
    uint32_t seqidx = 0;
    if ((x->flags & BAM_FUNMAP) == 0) {
        decode_seqname(E, &x->seqname);
        seqidx = get_seq_idx(E, &x->seqname);

        if (dist2_decode(E->ac, &E->d_ext_pos_off_flag)) {
            /* position coded as an offset from the previous mapped read */
            x->pos =
                E->last_ref_pos + dist256_decode(E->ac, &E->d_ext_pos_off);
        }
        else {
            x->pos = uint32_enc_decode(E->ac, &E->d_ext_pos[seqidx]);
        }

        E->last_ref_pos = x->pos;

        /* NOTE(review): 9 looks like a "no previous op" context value lying
         * outside the BAM op codes -- confirm. */
        uint8_t last_op = 9;
        size_t i = 0;
        uint64_t cigarlen = 0;
        x->cigar.n = uint32_enc_decode(E->ac, &E->d_ext_cigar_n);
        cigar_reserve(&x->cigar, x->cigar.n);

        for (i = 0; i < x->cigar.n; ++i) {
            /* each op is decoded conditioned on the op before it */
            x->cigar.ops[i] = cond_dist16_decode(E->ac, &E->d_ext_cigar_op, last_op);
            x->cigar.lens[i] = uint32_enc_decode(E->ac, &E->d_ext_cigar_len[x->cigar.ops[i]]);

            if (x->cigar.ops[i] != BAM_CDEL &&
                x->cigar.ops[i] != BAM_CREF_SKIP &&
                x->cigar.ops[i] != BAM_CHARD_CLIP)
            {
                cigarlen += x->cigar.lens[i];
            }

            last_op = x->cigar.ops[i];
        }

        if (cigarlen != seqlen) {
            quip_error("Cigar operations do not account for full read length.");
        }
    }

    if ((x->flags & BAM_FMUNMAP) == 0) {
        if (quip_version >= 4) {
            if ((x->flags & BAM_FUNMAP) == 0) {
                if (dist2_decode(E->ac, &E->d_ext_mate_sameseq)) {
                    str_copy(&x->mate_seqname, &x->seqname);
                }
                else {
                    decode_seqname(E, &x->mate_seqname);
                }
            }
            else {
                decode_seqname(E, &x->mate_seqname);
            }

            seqidx = get_seq_idx(E, &x->mate_seqname);
        }
        else {
            if (dist2_decode(E->ac, &E->d_ext_mate_sameseq)) {
                /* seqidx keeps the value set above (0 if the read is
                 * unmapped) */
                str_copy(&x->mate_seqname, &x->seqname);
            }
            else {
                decode_seqname(E, &x->mate_seqname);
                seqidx = get_seq_idx(E, &x->mate_seqname);
            }
        }
        x->mate_pos = uint32_enc_decode(E->ac, &E->d_ext_pos[seqidx]);
    }
}
Example #6
0
/*
 * Encode the alignment metadata of one read.
 *
 * Always encoded: flags, mapping quality and template length.
 *
 * If the read is mapped (BAM_FUNMAP clear): the sequence name, the position
 * (as an offset from E->last_ref_pos when that offset is in [0, 256),
 * otherwise as an absolute value coded per sequence index) and the cigar.
 * The summed length of all cigar ops other than DEL, REF_SKIP and HARD_CLIP
 * must equal x->seq.n, otherwise quip_error is called.
 *
 * If the mate is mapped (BAM_FMUNMAP clear): the mate's sequence name and
 * position. `quip_version` selects the layout: from version 4 on, the
 * "same sequence as the read" flag is only written when the read itself is
 * mapped; earlier versions always write it and also treat a mate name of
 * "=" as "same sequence".
 *
 * The cigar length is accumulated in 64 bits: the individual lengths are
 * 32-bit values, and a 32-bit sum could wrap around and falsely match the
 * read length.
 */
void seqenc_encode_extras(seqenc_t* E, const short_read_t* x, uint8_t quip_version)
{
    uint32_enc_encode(E->ac, &E->d_ext_flags, x->flags);
    dist256_encode(E->ac, &E->d_ext_map_qual, x->map_qual);
    uint32_enc_encode(E->ac, &E->d_ext_tlen, x->tlen);

    uint32_t seqidx = 0;
    if ((x->flags & BAM_FUNMAP) == 0) {
        encode_seqname(E, &x->seqname);
        seqidx = get_seq_idx(E, &x->seqname);

        if (x->pos < E->last_ref_pos || x->pos - E->last_ref_pos >= 256) {
            /* offset does not fit the 256-symbol model: code absolutely */
            dist2_encode(E->ac, &E->d_ext_pos_off_flag, 0);
            uint32_enc_encode(E->ac, &E->d_ext_pos[seqidx], x->pos);
        }
        else {
            dist2_encode(E->ac, &E->d_ext_pos_off_flag, 1);
            dist256_encode(E->ac, &E->d_ext_pos_off, x->pos - E->last_ref_pos);
        }

        E->last_ref_pos = x->pos;

        uint64_t cigarlen = 0;
        /* NOTE(review): 9 looks like a "no previous op" context value lying
         * outside the BAM op codes -- confirm. */
        uint8_t last_op = 9;
        size_t i;

        uint32_enc_encode(E->ac, &E->d_ext_cigar_n, x->cigar.n);
        for (i = 0; i < x->cigar.n; ++i) {
            /* each op is encoded conditioned on the op before it */
            cond_dist16_encode(E->ac, &E->d_ext_cigar_op, last_op, x->cigar.ops[i]);
            uint32_enc_encode(E->ac, &E->d_ext_cigar_len[x->cigar.ops[i]], x->cigar.lens[i]);
            last_op = x->cigar.ops[i];

            if (x->cigar.ops[i] != BAM_CDEL &&
                x->cigar.ops[i] != BAM_CREF_SKIP &&
                x->cigar.ops[i] != BAM_CHARD_CLIP)
            {
                cigarlen += x->cigar.lens[i];
            }
        }

        if (cigarlen != x->seq.n) {
            quip_error("Cigar operations do not account for full read length.");
        }
    }

    if ((x->flags & BAM_FMUNMAP) == 0) {
        if (quip_version >= 4) {
            if ((x->flags & BAM_FUNMAP) == 0) {
                if (strcmp((char*) x->seqname.s, (char*) x->mate_seqname.s) == 0) {
                    dist2_encode(E->ac, &E->d_ext_mate_sameseq, 1);
                }
                else {
                    dist2_encode(E->ac, &E->d_ext_mate_sameseq, 0);
                    encode_seqname(E, &x->mate_seqname);
                }
            }
            else {
                encode_seqname(E, &x->mate_seqname);
            }
            seqidx = get_seq_idx(E, &x->mate_seqname);
        }
        else {
            if ((x->mate_seqname.n == 1 && x->mate_seqname.s[0] == '=') ||
                ((x->flags & BAM_FUNMAP) == 0 && strcmp((char*) x->seqname.s, (char*) x->mate_seqname.s) == 0))
            {
                /* seqidx keeps the value set above (0 if the read is
                 * unmapped) */
                dist2_encode(E->ac, &E->d_ext_mate_sameseq, 1);
            }
            else {
                dist2_encode(E->ac, &E->d_ext_mate_sameseq, 0);
                encode_seqname(E, &x->mate_seqname);
                seqidx = get_seq_idx(E, &x->mate_seqname);
            }
        }

        uint32_enc_encode(E->ac, &E->d_ext_pos[seqidx], x->mate_pos);
    }
}