Beispiel #1
0
/*
 * Builds the "ct" (template CIGAR) tag for two records that are assumed to
 * come from the same template and attaches it to whichever record has the
 * smaller mapping position.
 *
 * Tag text, as assembled below:
 *   <seg><strand><CIGAR of leftmost><gap>T<seg><strand><CIGAR of rightmost>
 * <seg> is '1' when BAM_FREAD1 is set and '2' otherwise, <strand> is 'R' when
 * BAM_FREVERSE is set and 'F' otherwise, and <gap> is the rightmost record's
 * position minus bam_endpos() of the leftmost record.
 *
 * str is scratch space: its length is reset on entry and it holds the tag
 * text on return.  The function returns without touching either record when
 * the pair is on different references, has a negative tid or pos, or has an
 * unmapped member.  Otherwise any "ct" tag already present on either record
 * is deleted before the new one is appended.
 */
static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    str->l = 0;

    // coordinateless, unmapped or not on the same chr; skip
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return;
    if (b1->core.pos < 0 || b2->core.pos < 0) return;
    if ((b1->core.flag & BAM_FUNMAP) || (b2->core.flag & BAM_FUNMAP)) return;

    // seg[0] is the record with the smaller coordinate
    bam1_t *seg[2] = { b1, b2 };
    if (b1->core.pos > b2->core.pos) {
        seg[0] = b2;
        seg[1] = b1;
    }

    for (int s = 0; s < 2; ++s) {
        bam1_t *rec = seg[s];
        uint32_t *ops = bam_get_cigar(rec);

        if (s == 1) {
            // gap between the end of the first segment and the start of the second
            int first_end = bam_endpos(seg[0]);
            kputw(rec->core.pos - first_end, str);
            kputc('T', str);
        }

        kputc((rec->core.flag & BAM_FREAD1) ? '1' : '2', str);    // segment index
        kputc((rec->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);  // strand
        for (uint32_t k = 0; k < rec->core.n_cigar; ++k) {
            kputw(bam_cigar_oplen(ops[k]), str);
            kputc(bam_cigar_opchr(ops[k]), str);
        }
    }

    // remove stale tags from both records before attaching the fresh one
    for (int s = 0; s < 2; ++s) {
        uint8_t *old_tag = bam_aux_get(seg[s], "ct");
        if (old_tag != NULL) bam_aux_del(seg[s], old_tag);
    }

    bam_aux_append(seg[0], "ct", 'Z', str->l+1, (uint8_t*)str->s);
}
Beispiel #2
0
/*
 * Builds the "CT" template-CIGAR tag for two records and appends it to the
 * one with the smaller position.  Does nothing (other than resetting str)
 * when the records are on different references or have a negative tid.
 *
 * str is scratch space; on return it holds the tag text:
 *   <seg><strand><CIGAR><gap>T<seg><strand><CIGAR>
 * where <gap> is the second record's position minus bam_calend() of the first.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
	bam1_t *lo = b1, *hi = b2;
	uint32_t *ops;
	int k, lo_end;

	str->l = 0;
	if (b1->core.tid < 0 || b1->core.tid != b2->core.tid) return; // coordinateless or not on the same chr; skip
	if (b2->core.pos < b1->core.pos) { lo = b2; hi = b1; } // lo always has the smaller coordinate

	/* leftmost segment */
	kputc((lo->core.flag & BAM_FREAD1) ? '1' : '2', str);   // segment index
	kputc((lo->core.flag & BAM_FREVERSE) ? 'R' : 'F', str); // strand
	ops = bam1_cigar(lo);
	for (k = 0; k < lo->core.n_cigar; ++k) {
		kputw(bam_cigar_oplen(ops[k]), str);
		kputc(bam_cigar_opchr(ops[k]), str);
	}

	/* gap between the two segments */
	lo_end = bam_calend(&lo->core, ops);
	kputw(hi->core.pos - lo_end, str);
	kputc('T', str);

	/* rightmost segment */
	kputc((hi->core.flag & BAM_FREAD1) ? '1' : '2', str);   // segment index
	kputc((hi->core.flag & BAM_FREVERSE) ? 'R' : 'F', str); // strand
	ops = bam1_cigar(hi);
	for (k = 0; k < hi->core.n_cigar; ++k) {
		kputw(bam_cigar_oplen(ops[k]), str);
		kputc(bam_cigar_opchr(ops[k]), str);
	}

	bam_aux_append(lo, "CT", 'Z', str->l+1, (uint8_t*)str->s);
}
Beispiel #3
0
/*
 * Sums the lengths of all CIGAR operations that either consume query bases
 * (bit 0 of bam_cigar_type()) or are hard clips.
 *
 * n_cigar  number of entries in cigar
 * cigar    packed BAM CIGAR operations
 *
 * Returns the accumulated length.
 */
int bam_cigar2ulen(int n_cigar, const uint32_t *cigar)
{
    int idx, total = 0;

    for (idx = 0; idx < n_cigar; ++idx) {
        const int op = bam_cigar_op(cigar[idx]);

        if ((bam_cigar_type(op) & 1) || op == BAM_CHARD_CLIP)
            total += bam_cigar_oplen(cigar[idx]);
    }
    return total;
}
Beispiel #4
0
/*
 * Sums the lengths of the CIGAR operations whose bam_cigar_type() is 3,
 * i.e. operations that consume both query and reference.
 *
 * n_cigar  number of entries in cigar
 * cigar    packed BAM CIGAR operations
 */
int bam_cigar2matches(int n_cigar, const uint32_t *cigar)
{
    int matched = 0;
    int idx;

    for (idx = 0; idx < n_cigar; ++idx) {
        const uint32_t entry = cigar[idx];

        if (bam_cigar_type(bam_cigar_op(entry)) != 3)
            continue;
        matched += bam_cigar_oplen(entry);
    }
    return matched;
}
Beispiel #5
0
// Constructs a parser over a packed BAM CIGAR array.
//
// cigar  pointer to the first packed CIGAR operation; it is dereferenced
//        unconditionally below, so it must point to at least one entry
// len    number of entries in cigar
//
// Read and reference positions start at 0 and the current operation is
// initialised from cigar[0].
CigarParser::CigarParser(uint32_t const* cigar, int len)
    : cigar_(cigar)
    , len_(len)
    , readPos_(0)
    , refPos_(0)
    , currentOpIdx_(0)
    , currentOp_(bam_cigar_op(*cigar))        // op code of the first entry
    , currentOpLen_(bam_cigar_oplen(*cigar))  // length of the first entry
    , started_(false)
{
}
Beispiel #6
0
/*
 * Derives alignment offsets from a packed CIGAR and stores them in *l.
 *
 * cigar      packed BAM CIGAR operations
 * n_cigar    number of entries in cigar
 * sa_pos     alignment position, copied to l->rapos
 * sa_strand  '+' selects the forward-strand formulas; anything else the
 *            reverse-strand ones
 * l          output record; raLen, qaLen, sclip, eclip, SQO, EQO, rapos and
 *            pos are overwritten
 *
 * raLen accumulates M/=/X/D/N lengths, qaLen accumulates M/=/X/I lengths.
 * S/H lengths go to sclip until the first M/=/X has been seen and to eclip
 * afterwards.
 */
void calcAlnOffsets(uint32_t *cigar,
                    uint32_t n_cigar,
                    uint32_t sa_pos,
                    char sa_strand,
                    struct line *l)
{
    bool seen_aligned = false;
    uint32_t idx;

    l->raLen = 0;
    l->qaLen = 0;
    l->sclip = 0;
    l->eclip = 0;
    l->SQO = 0;
    l->EQO = 0;

    for (idx = 0; idx < n_cigar; ++idx)
    {
        const uint32_t len = bam_cigar_oplen(cigar[idx]);

        switch (bam_cigar_opchr(cigar[idx]))
        {
        case 'M':
        case '=':
        case 'X':
            l->raLen += len;
            l->qaLen += len;
            seen_aligned = true;
            break;
        case 'S':
        case 'H':
            /* clips before the first aligned op are "start" clips */
            if (seen_aligned) l->eclip += len;
            else              l->sclip += len;
            break;
        case 'D':
        case 'N':
            l->raLen += len;
            break;
        case 'I':
            l->qaLen += len;
            break;
        default:
            break;
        }
    }

    l->rapos = sa_pos;
    if (sa_strand == '+')
    {
        l->pos = l->rapos - l->sclip;
        l->SQO = l->sclip;
        l->EQO = l->sclip + l->qaLen - 1;
    }
    else
    {
        l->pos = l->rapos + l->raLen + l->eclip - 1;
        l->SQO = l->eclip;
        l->EQO = l->eclip + l->qaLen - 1;
    }
}
Beispiel #7
0
// Adapted from samtools/bam.c
// Returns the total length of all soft-clip ('S') operations in the CIGAR
// of record b.
int32_t b2g_bam_clippedlength(bam1_t *b) {
  const bam1_core_t core = b->core;
  const uint32_t *ops = bam1_cigar(b);
  int32_t soft_clipped = 0;
  uint32_t idx = 0;

  while (idx < core.n_cigar) {
    if (bam_cigar_opchr(bam_cigar_op(ops[idx])) == 'S')
      soft_clipped += bam_cigar_oplen(ops[idx]);
    ++idx;
  }
  return soft_clipped;
}
Beispiel #8
0
// Return sum of the lengths of CIGAR operations of type:
// * hard masked (H)
// * soft masked (S)
// * inserted bases relative to ref (I)
// scanning from the last CIGAR entry down to index 1. The entry at index 0 is
// never examined, and the scan does not stop at other operations.
// NOTE(review): the original comment describes this as the padding "on right
// of alignment"; confirm that counting interior insertions is intended.
static inline uint32_t bam_get_end_padding(int n_cigar, const uint32_t *cigar)
{
  ctx_assert(n_cigar > 0);

  // bit set with one bit per op code that counts as padding
  const uint32_t pad_ops = (1<<BAM_CINS)|(1<<BAM_CSOFT_CLIP)|(1<<BAM_CHARD_CLIP);
  uint32_t total = 0;
  uint32_t idx = n_cigar-1;

  while(idx > 0) {
    if((pad_ops >> bam_cigar_op(cigar[idx])) & 1)
      total += bam_cigar_oplen(cigar[idx]);
    idx--;
  }

  return total;
}
/* Make a node containing an InDel
 *
 * b     The input read
 * cigar_op_num The operation number of the Insertion/Deletion
 *
 * The node's start is the read position advanced by every reference-consuming
 * operation before cigar_op_num; its end is start plus the longest operation
 * in the run of consecutive I/D operations beginning at cigar_op_num (or
 * start itself when that run is empty).
 *
 * returns a node, that must be either inserted into the linked list or
 * destroyed with destroyNode(). Returns NULL if memory allocation fails.
 */
InDel *makeNode(bam1_t *b, int cigar_op_num) {
    int i;
    int32_t start = b->core.pos-1;
    int32_t end;
    uint32_t *cigar = bam_get_cigar(b);
    InDel *node;

    //Advance over everything before the InDel that consumes the reference
    for(i=0; i<cigar_op_num; i++) {
        if(bam_cigar_type(bam_cigar_op(cigar[i])) & 2) start += bam_cigar_oplen(cigar[i]);
    }

    end = ++start;
    //Extend the end over the run of adjacent I/D operations
    for(i=cigar_op_num; i<b->core.n_cigar; i++) {
        int op = bam_cigar_op(cigar[i]);
        int32_t oplen = bam_cigar_oplen(cigar[i]);

        if(op != BAM_CINS && op != BAM_CDEL) break;
        if(start+oplen > end) end = start+oplen;
    }

    //Make the node
    node = calloc(1, sizeof(*node));
    if(node == NULL) return NULL;
    node->tid = b->core.tid;
    node->start = start;
    node->end = end;
    node->count = 1;

    return node;
}
Beispiel #10
0
/**
 * Writes the textual CIGAR of a BAM record into cigar_string.
 *
 * The string length is reset first; when the record has no CIGAR
 * operations the string is left empty.
 */
void bam_get_cigar_string(bam1_t *s, kstring_t *cigar_string)
{
    cigar_string->l=0;

    const int32_t n_ops = bam_get_n_cigar_op(s);
    if (n_ops == 0) return;

    const uint32_t *ops = bam_get_cigar(s);
    int32_t k = 0;
    while (k < n_ops)
    {
        kputw(bam_cigar_oplen(ops[k]), cigar_string);
        kputc(bam_cigar_opchr(ops[k]), cigar_string);
        ++k;
    }
}
Beispiel #11
0
// Steps the parser to the next CIGAR operation.
//
// The read and reference positions are first moved past the current
// operation according to what it consumes; then the next entry is loaded.
// Asserts that a next entry exists.
void CigarParser::advance() {
    int const consumes = bam_cigar_type(currentOp_);

    if ((consumes & BAM_CONSUME_QUERY) != 0) {
        readPos_ += currentOpLen_;
    }
    if ((consumes & BAM_CONSUME_REFERENCE) != 0) {
        refPos_ += currentOpLen_;
    }

    ++currentOpIdx_;
    assert(currentOpIdx_ < len_);

    uint32_t const next = cigar_[currentOpIdx_];
    currentOp_ = bam_cigar_op(next);
    currentOpLen_ = bam_cigar_oplen(next);
}
Beispiel #12
0
/*
 * Computes the query and reference lengths spanned by a packed CIGAR.
 *
 * cigar_bin     packed BAM CIGAR operations
 * cigar_bin_sz  number of entries in cigar_bin
 * q_len         out: sum of lengths of operations that consume the query
 * r_len         out: sum of lengths of operations that consume the reference
 *
 * Returns the sum of the lengths of all operations.
 */
unsigned seq_lens_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, unsigned* q_len, unsigned* r_len)
{
    unsigned oplen, constype;
    uint32_t *sent;
    unsigned allen = 0;
    *q_len = *r_len = 0;
    for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin)
    {
        oplen = bam_cigar_oplen (*cigar_bin);
        // bam_cigar_type() expects the op code, not the packed length|op word;
        // passing the packed word shifts by an out-of-range amount.
        constype = bam_cigar_type (bam_cigar_op (*cigar_bin));
        if (constype & CONSUME_QRY) *q_len += oplen;
        if (constype & CONSUME_REF) *r_len += oplen;
        allen += oplen;
    }
    return allen;
}
Beispiel #13
0
// Appends the textual CIGAR of record b to str; a record without CIGAR
// operations is rendered as "*".
// Returns 0 on success, -1 on failure.
static int bam_format_cigar(const bam1_t* b, kstring_t* str)
{
    const uint32_t n_ops = b->core.n_cigar;

    // An empty cigar is a special case return "*" rather than ""
    if (n_ops == 0) {
        if (kputc('*', str) == EOF) return -1;
        return 0;
    }

    const uint32_t *op = bam_get_cigar(b);
    const uint32_t *const stop = op + n_ops;

    for (; op != stop; ++op) {
        if (kputw(bam_cigar_oplen(*op), str) == EOF ||
            kputc(bam_cigar_opchr(*op), str) == EOF)
            return -1;
    }

    return 0;
}
Beispiel #14
0
/*
 * Returns the record's position with the leading run of soft/hard clips
 * subtracted, plus one (b->core.pos - clipped + 1).
 */
static int32_t unclipped_start(bam1_t *b) {
    const uint32_t *ops = bam_get_cigar(b);
    const uint32_t n_ops = b->core.n_cigar;
    int32_t lead_clip = 0;
    uint32_t k = 0;

    // accumulate clips until the first non-clip operation
    while (k < n_ops) {
        const char code = bam_cigar_opchr(ops[k]);

        if (code != 'S' && code != 'H') break;
        lead_clip += bam_cigar_oplen(ops[k]);
        ++k;
    }

    return b->core.pos - lead_clip + 1;
}
Beispiel #15
0
// Applies the view filters in `settings` to record b, and strips the aux tags
// listed in settings->remove_aux from records that pass.
// Returns 0 to indicate read should be output 1 otherwise
static int process_aln(const bam_hdr_t *h, bam1_t *b, samview_settings_t* settings)
{
    if (settings->remove_B) bam_remove_B(b);

    // minimum query length (query-consuming ops plus hard clips)
    if (settings->min_qlen > 0) {
        const uint32_t *ops = bam_get_cigar(b);
        int idx, query_len = 0;
        for (idx = 0; idx < b->core.n_cigar; ++idx) {
            const int op = bam_cigar_op(ops[idx]);
            if ((bam_cigar_type(op)&1) || op == BAM_CHARD_CLIP)
                query_len += bam_cigar_oplen(ops[idx]);
        }
        if (query_len < settings->min_qlen) return 1;
    }

    // mapping quality and required / forbidden flag bits
    if (b->core.qual < settings->min_mapQ) return 1;
    if ((b->core.flag & settings->flag_on) != settings->flag_on) return 1;
    if (b->core.flag & settings->flag_off) return 1;

    // BED region overlap
    if (settings->bed) {
        if (b->core.tid < 0) return 1;
        if (!bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b)))
            return 1;
    }

    // subsampling keyed on a hash of the read name
    if (settings->subsam_frac > 0.) {
        uint32_t hashed = __ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(b)) ^ settings->subsam_seed);
        if ((double)(hashed&0xffffff) / 0x1000000 >= settings->subsam_frac) return 1;
    }

    // read-group whitelist; records without an RG tag are not rejected here
    if (settings->rghash) {
        uint8_t *rg_tag = bam_aux_get(b, "RG");
        if (rg_tag) {
            khint_t slot = kh_get(rg, settings->rghash, (char*)(rg_tag + 1));
            if (slot == kh_end(settings->rghash)) return 1;
        }
    }

    // library name must match exactly
    if (settings->library) {
        const char *lib = bam_get_library((bam_hdr_t*)h, b);
        if (!lib || strcmp(lib, settings->library) != 0) return 1;
    }

    // strip the requested aux tags
    if (settings->remove_aux_len) {
        size_t t;
        for (t = 0; t < settings->remove_aux_len; ++t) {
            uint8_t *tag = bam_aux_get(b, settings->remove_aux[t]);
            if (tag) bam_aux_del(b, tag);
        }
    }
    return 0;
}
Beispiel #16
0
/*
 * Expands the aligned part of record b into s, one 4-bit base code per
 * reference-consuming position:
 *   M  copies the query bases,
 *   D  writes zeros (pads),
 *   S  skips the query bases.
 * Any other CIGAR operation trips the assert.
 *
 * s->l is reset and, on return, equals the number of codes written.
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
	int k, j, i;
	uint32_t *cigar = bam1_cigar(b);
	uint8_t *seq = bam1_seq(b);
	ks_resize(s, b->core.l_qseq);
	for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
		int op, ol;
		op = bam_cigar_op(cigar[k]);
		ol = bam_cigar_oplen(cigar[k]);
		assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
		if (op == BAM_CMATCH) {
			/* output can exceed l_qseq once deletions are included, so grow per op */
			ks_resize(s, s->l + ol);
			/* j must advance with every base copied, not once per operation */
			for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j);
		} else if (op == BAM_CSOFT_CLIP) {
			j += ol;
		} else {
			ks_resize(s, s->l + ol);
			for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
		}
	}
}
Beispiel #17
0
/*
 * Expands the aligned part of record b into s, one 4-bit base code per
 * reference position covered by the CIGAR (zeros for D and N operations).
 *
 * Returns 0 on success and -1 when an unexpected CIGAR operation is found.
 * It returns non-zero (1) if the number of codes written differs from
 * bam_cigar2rlen().
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq is the SEQ length including soft clips; what is needed
    // here is the padded aligned length from the CIGAR (no soft clips, but
    // with the pads contributed by D operations).
    const int length = bam_cigar2rlen(b->core.n_cigar, cigar);
    ks_resize(s, length);

    s->l = 0;
    j = 0;
    for (k = 0; k < b->core.n_cigar; ++k) {
        const int op = bam_cigar_op(cigar[k]);
        const int ol = bam_cigar_oplen(cigar[k]);

        switch (op) {
        case BAM_CMATCH:
        case BAM_CEQUAL:
        case BAM_CDIFF:
            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
            break;
        case BAM_CSOFT_CLIP:
            j += ol;
            break;
        case BAM_CHARD_CLIP:
            /* do nothing */
            break;
        case BAM_CDEL:
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            break;
        case BAM_CREF_SKIP:
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
            break;
        default:
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    return length != s->l;
}
Beispiel #18
0
/*
 * Computes the query and reference bounds of the aligned zone of a packed
 * CIGAR.
 *
 * cigar_bin     packed BAM CIGAR operations
 * cigar_bin_sz  number of entries in cigar_bin
 * forward       when zero, the query bounds are mirrored within qry_len
 * qry_len       query length, used only for the mirroring above
 * q_beg, q_end  out: query bounds
 * r_beg, r_end  out: reference bounds
 *
 * Leading clips advance the begin offsets; scanning stops at the first clip
 * that follows a non-clip operation.
 * Returns the sum of the lengths of all operations scanned.
 */
unsigned alignment_bounds_from_bin_cigar (uint32_t* cigar_bin, unsigned cigar_bin_sz, uint8_t forward, unsigned qry_len, unsigned* q_beg, unsigned* q_end, unsigned* r_beg, unsigned* r_end)
{
    unsigned oplen, op, constype;
    uint32_t *sent;
    *q_beg = *q_end = *r_beg = *r_end = 0;
    unsigned allen = 0;
    uint32_t tail = 0;
    for (sent = cigar_bin + cigar_bin_sz; cigar_bin != sent; ++cigar_bin)
    {
        oplen = bam_cigar_oplen (*cigar_bin);
        op = bam_cigar_op (*cigar_bin);
        // bam_cigar_type() expects the op code, not the packed length|op word;
        // passing the packed word shifts by an out-of-range amount.
        constype = bam_cigar_type (op);

        if (tail && (op == BAM_CHARD_CLIP || op == BAM_CSOFT_CLIP)) // the aligned zone ended, clip started. Note that tail indels are not valid, so we do not assume they are possible..
            break;

        if (op != BAM_CHARD_CLIP && op != BAM_CSOFT_CLIP)
            tail = 1;

        if (constype & CONSUME_QRY)
        {
            if (!tail) *q_beg += oplen;
            *q_end += oplen;
        }
        if (constype & CONSUME_REF)
        {
            if (!tail) *r_beg += oplen;
            *r_end += oplen;
        }
        allen += oplen;
    }
    if (!forward)
    {
        // mirror the query interval for reverse-strand alignments
        unsigned tmp = qry_len - *q_beg;
        *q_beg = qry_len - *q_end;
        *q_end = tmp;
    }
    return allen;
}
Beispiel #19
0
/*
 * Returns bam_endpos(b) plus the total length of the trailing run of
 * soft/hard clips in the CIGAR.
 *
 * If the whole CIGAR consists of clips the result is meaningless.
 */
static int32_t unclipped_end(bam1_t *b) {
    const uint32_t *ops = bam_get_cigar(b);
    const int32_t end_pos = bam_endpos(b);
    int32_t tail_clip = 0;
    int32_t k = b->core.n_cigar - 1;

    // walk backwards over the clipped end bases (if any)
    while (k >= 0) {
        const char code = bam_cigar_opchr(ops[k]);

        if (code != 'S' && code != 'H') break;
        tail_clip += bam_cigar_oplen(ops[k]);
        --k;
    }

    return end_pos + tail_clip;
}
Beispiel #20
0
// Adds the coverage of one BAM record to wiggle.read_depth.
//
// Every reference position covered by an M operation is incremented by the
// record's weight: 1.0 when no_fractional_weight is set, otherwise the value
// of the "ZW" aux tag (records without that tag are ignored). Other
// reference-consuming operations only advance the position.
void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) {
    double weight = 1.0;

    if (!no_fractional_weight) {
        uint8_t *zw = bam_aux_get(b, "ZW");
        if (zw == NULL) return;
        weight = bam_aux2f(zw);
    }

    const uint32_t *ops = bam_get_cigar(b);
    const int n_ops = (int)b->core.n_cigar;
    int ref_pos = b->core.pos;

    for (int k = 0; k < n_ops; ++k) {
        const char op = bam_cigar_op(ops[k]);
        const int len = bam_cigar_oplen(ops[k]);

        if (op == BAM_CMATCH) {
            for (int done = 0; done < len; ++done, ++ref_pos)
                wiggle.read_depth[ref_pos] += weight;
        } else if (bam_cigar_type(op) & 2) {
            ref_pos += len;
        }
    }
}
Beispiel #21
0
void cigar_print (FILE* f, uint32_t* cigar, unsigned cigar_sz)
{
    uint32_t* sent;
    for (sent = cigar+cigar_sz; cigar != sent; ++cigar)
    {
        uint32_t curop = bam_cigar_op (*cigar);
        uint32_t count = bam_cigar_oplen (*cigar);
        char schar;
        switch (curop)
        {
        case BAM_CHARD_CLIP:
            schar = 'H';
            break;
        case BAM_CSOFT_CLIP: // skip
            schar = 'S';
            break;
        case BAM_CMATCH:
            schar = 'M';
            break;
        case BAM_CEQUAL:
            schar = '=';
            break;
        case BAM_CDIFF:
            schar = '#';
            break;
        case BAM_CINS:
            schar = 'I';
            break;
        case BAM_CDEL:
            schar = 'I';
            break;
        default:
            schar = '?';
        }
        fprintf (f, "%d%c", count, schar);
    }
}
Beispiel #22
0
// Returns the CIGAR of record b as text (<length><op> repeated), or an empty
// string when the record has no CIGAR operations.
string bam_cigarString (bam1_t *b) {//output CIGAR string
    string text;

    const uint32_t nOps = b->core.n_cigar;
    if (nOps == 0) return text;

    const uint32_t *ops = bam_get_cigar(b);
    for (uint32_t k = 0; k < nOps; ++k) {
        text += to_string((uint)bam_cigar_oplen(ops[k]));
        text += bam_cigar_opchr(ops[k]);
    }
    return text;
}
Beispiel #23
0
// prepare internal structures for clipping and alignment
// returns true if realignment was performed
//
// Inputs:
//   q_seq, q_len      query sequence and its length
//   r_seq, r_len      reference sequence and its length (r_len is not read in
//                     this function body)
//   r_pos             current reference position of the read
//   forward           strand flag, forwarded to SetClipping and SetSequences
//   cigar, cigar_sz   packed BAM CIGAR of the existing alignment; cigar_sz
//                     must be non-zero (asserted)
// Outputs:
//   cigar_dest, cigar_dest_sz  newly allocated packed CIGAR (tmap_malloc) and
//                     its size; written only when true is returned
//   new_pos           updated reference position; written only when true is
//                     returned
//   already_perfect   set when ClipAnchors reports nothing to realign
//   clip_failed       passed through to ClipAnchors; NOTE(review): it is not
//                     reset here, presumably ClipAnchors always assigns it - confirm
//   alignment_failed  set when computeSWalignment fails
//   unclip_failed     set when addClippedBasesToTags fails
bool RealignImp::compute_alignment (
    const char* q_seq,
    unsigned q_len,
    const char* r_seq, 
    unsigned r_len,
    int r_pos, 
    bool forward, 
    const uint32_t* cigar, 
    unsigned cigar_sz, 
    uint32_t*& cigar_dest, 
    unsigned& cigar_dest_sz, 
    int& new_pos,
    bool& already_perfect,
    bool& clip_failed,
    bool& alignment_failed,
    bool& unclip_failed)
{
    already_perfect = false;
    alignment_failed = false;
    unclip_failed = false;
    unsigned oplen;

    // "clipped" views: the same query/CIGAR with leading and trailing soft
    // clips removed by moving the start pointer and shrinking the size
    const char* q_seq_clipped = q_seq;
    const uint32_t* cigar_clipped = cigar;
    unsigned cigar_sz_clipped = cigar_sz;

    unsigned sclip_q_len, sclip_r_len, sclip_al_len;

    assert (cigar_sz);
    // reset realigner
    Reset ();

    // set clipping 
    SetClipping ((int) cliptype_, forward);

    // clip out the hard and soft clipping zones from 5" and 3"
    // The 'cut out' of the q_seq is done by switching to downstream pointer.
    // NOTE(review): only BAM_CSOFT_CLIP is tested below; hard clips are not
    // stripped by this code.
    if (bam_cigar_op (*cigar) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (*cigar);
        ClipStart (oplen);
        q_seq_clipped += oplen;
        ++cigar_clipped;
        --cigar_sz_clipped;
    }

    // trailing soft clip; cigar_sz > 1 keeps a lone clip from being counted twice
    if (cigar_sz > 1 && bam_cigar_op (cigar [cigar_sz - 1]) == BAM_CSOFT_CLIP)
    {
        oplen = bam_cigar_oplen (cigar [cigar_sz - 1]);
        ClipEnd (oplen);
        --cigar_sz_clipped;
    }

    // cigar defines q_seq and t_seq lengths
    sclip_al_len = seq_lens_from_bin_cigar (cigar_clipped, cigar_sz_clipped, &sclip_q_len, &sclip_r_len);

    const std::string query (q_seq_clipped, sclip_q_len);
    const std::string target (r_seq, sclip_r_len);
    std::string pretty_al; pretty_al.reserve (sclip_al_len);

    pretty_al_from_bin_cigar (cigar_clipped, cigar_sz_clipped, q_seq_clipped, r_seq, pretty_al);

    // Realigner requires strings of proper size to be passed to SetSequences
    SetSequences (query, target, pretty_al, forward);

    if (!ClipAnchors (clip_failed))
    {
        already_perfect = true;
        return false; // alignment already good, no imperfect zone to realign found
    }

    // TODO avoid automatic vectors to prevent unneeded heap usage
    vector<MDelement> new_md_vec; 
    vector<CigarOp> new_cigar_vec;
    unsigned int start_pos_shift;

    if (!computeSWalignment(new_cigar_vec, new_md_vec, start_pos_shift))
    {
        alignment_failed = true;
        return false;
    }

    if (!addClippedBasesToTags(new_cigar_vec, new_md_vec, q_len))
    {
        unclip_failed = true;
        return false; // error adding back clipped out zones
    }

    // the position is recomputed only when the left anchor was not clipped
    // and the realignment shifted the start
    if (!LeftAnchorClipped () && start_pos_shift != 0) 
    {
        // build cigar data only if it is needed
        // TODO avoid automatic vectors to prevent unneeded heap usage
        std::vector <CigarOp> cigar_vec;
        cigar_vector_from_bin (cigar, cigar_sz, cigar_vec);
        new_pos = updateReadPosition (cigar_vec, start_pos_shift, r_pos);
    }
    else
        new_pos = r_pos;

    // free (cigar_dest);
    // TODO: switch to better alignment memory management, avoid heap operations
    // cigar_dest is allocated with tmap_malloc on every successful call and
    // any previous value is overwritten without being freed.
    cigar_dest = (uint32_t*) tmap_malloc (sizeof (uint32_t) * new_cigar_vec.size (), "cigar_dest");
    cigar_dest_sz = new_cigar_vec.size ();
    cigar_vector_to_bin (new_cigar_vec, cigar_dest);

    return true;
}
Beispiel #24
0
/*
 * Aligns one numerically encoded read against a single target.
 *
 * target     target sequence; encoded through conf->table
 * read_len   length of read_num
 * read_num   encoded read
 * qry        query profile handle passed through to ksw_align()
 * conf       scoring matrix, gap penalties, band width and encoding table
 * min_score  alignments scoring below this are returned without a CIGAR
 *
 * Returns an aln_t whose loc and target_name are always set. When the score
 * is below min_score, cigar is NULL and n_cigar and nm are 0. Otherwise
 * cigar/n_cigar come from ksw_global() and nm is the mismatch count plus the
 * total length of all operations that do not consume both sequences.
 *
 * Known issue (from the original author's notes): ksw_global() can emit
 * CIGARs whose query length differs from the aligned read span for very
 * poor matches; that case is not detected here.
 */
static aln_t align_read_against_one(kseq_t *target, const int read_len,
                                    uint8_t *read_num, kswq_t **qry,
                                    const align_config_t *conf,
                                    const int min_score) {
  uint8_t *ref_num = calloc(target->seq.l, sizeof(uint8_t));
  for (size_t k = 0; k < target->seq.l; ++k)
    ref_num[k] = conf->table[(int)target->seq.s[k]];

  aln_t aln;
  aln.cigar = NULL;
  /* give the early-return path defined values for these fields */
  aln.n_cigar = 0;
  aln.nm = 0;
  aln.loc = ksw_align(read_len, read_num, target->seq.l, ref_num, conf->m,
                      conf->mat, conf->gap_o, conf->gap_e, KSW_XSTART, qry);

  aln.target_name = target->name.s;

  if (aln.loc.score < min_score) {
    free(ref_num);
    return aln;
  }

  ksw_global(aln.loc.qe - aln.loc.qb + 1, &read_num[aln.loc.qb],
             aln.loc.te - aln.loc.tb + 1, &ref_num[aln.loc.tb], conf->m,
             conf->mat, conf->gap_o, conf->gap_e, conf->bandwidth, &aln.n_cigar,
             &aln.cigar);

  size_t qi = aln.loc.qb, ri = aln.loc.tb;
  for (int k = 0; k < aln.n_cigar; k++) {
    /* bam_cigar_type() takes the op code, not the packed length|op word */
    const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                  optype = bam_cigar_type(bam_cigar_op(aln.cigar[k]));

    if (optype == 3) { // consumes both - check for mismatches
      for (int j = 0; j < oplen; j++) {
        if (UNLIKELY(read_num[qi + j] != ref_num[ri + j]))
          aln.nm++;
      }
    } else {
      /* insertion or deletion: every base counts towards the edit distance */
      aln.nm += oplen;
    }
    if (optype & 1)
      qi += oplen;
    if (optype & 2)
      ri += oplen;
  }

  free(ref_num);

  return aln;
}
Beispiel #25
0
/*
 * Aligns one read against every target and returns the alignments sorted by
 * ks_introsort(dec_score, ...).
 *
 * read     read to align; encoded through conf->table
 * targets  vector of target sequences
 * conf     scoring matrix, gap penalties and encoding table
 *
 * Each returned aln_t has target_idx, loc, cigar/n_cigar (from ksw_global())
 * and nm, the mismatch count plus the total length of all operations that do
 * not consume both sequences.
 */
static aln_v align_read(const kseq_t *read,
                        const kseq_v targets,
                        const align_config_t *conf)
{
    kseq_t *r;
    const int32_t read_len = read->seq.l;

    aln_v result;
    kv_init(result);
    kv_resize(aln_t, result, kv_size(targets));

    uint8_t *read_num = calloc(read_len, sizeof(uint8_t));

    for(size_t k = 0; k < read_len; ++k)
        read_num[k] = conf->table[(int)read->seq.s[k]];

    // Align to each target
    kswq_t *qry = NULL;
    for(size_t j = 0; j < kv_size(targets); j++) {
        // Encode target
        r = &kv_A(targets, j);
        uint8_t *ref_num = calloc(r->seq.l, sizeof(uint8_t));
        for(size_t k = 0; k < r->seq.l; ++k)
            ref_num[k] = conf->table[(int)r->seq.s[k]];

        aln_t aln;
        aln.target_idx = j;
        aln.loc = ksw_align(read_len, read_num,
                            r->seq.l, ref_num,
                            conf->m,
                            conf->mat,
                            conf->gap_o,
                            conf->gap_e,
                            KSW_XSTART,
                            &qry);
        ksw_global(aln.loc.qe - aln.loc.qb + 1,
                   &read_num[aln.loc.qb],
                   aln.loc.te - aln.loc.tb + 1,
                   &ref_num[aln.loc.tb],
                   conf->m,
                   conf->mat,
                   conf->gap_o,
                   conf->gap_e,
                   50, /* TODO: Magic number - band width */
                   &aln.n_cigar,
                   &aln.cigar);

        aln.nm = 0;
        size_t qi = aln.loc.qb, ri = aln.loc.tb;
        for(size_t k = 0; k < aln.n_cigar; k++) {
            /* bam_cigar_type() takes the op code, not the packed length|op word */
            const int32_t oplen = bam_cigar_oplen(aln.cigar[k]),
                          optype = bam_cigar_type(bam_cigar_op(aln.cigar[k]));

            if(optype == 3) { // consumes both - check for mismatches
                for(int32_t off = 0; off < oplen; off++) {
                    if(UNLIKELY(read_num[qi + off] != ref_num[ri + off]))
                        aln.nm++;
                }
            } else {
                /* insertion or deletion: every base counts towards the edit distance */
                aln.nm += oplen;
            }
            if(optype & 1) qi += oplen;
            if(optype & 2) ri += oplen;
        }

        kv_push(aln_t, result, aln);
        free(ref_num);
    }
    free(qry);
    free(read_num);
    ks_introsort(dec_score, kv_size(result), result.a);

    return result;
}
Beispiel #26
0
/*
 * Converts padded alignments read from `in` into unpadded coordinates and
 * writes them to `out`.
 *
 * in, out  input and output SAM/BAM handles
 * h        header shared by both
 * fai      optional FASTA index used when a reference is not embedded in the
 *          input (may be NULL)
 *
 * For every mapped record the CIGAR is rebuilt against the (padded)
 * reference, and POS / MPOS are translated through a position map; unmapped
 * records are written unchanged.
 *
 * Returns 0 on success, 1 if the input ends with a read error
 * (sam_read1() < -1) and -1 on any other error. All buffers owned by this
 * function are released on every exit path.
 */
int bam_pad2unpad(samFile *in, samFile *out,  bam_hdr_t *h, faidx_t *fai)
{
    bam1_t *b = 0;
    kstring_t r, q;
    int r_tid = -1;
    uint32_t *cigar2 = 0;
    int ret = 0, n2 = 0, m2 = 0, *posmap = 0;

    b = bam_init1();
    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
    int read_ret;
    while ((read_ret = sam_read1(in, h, b)) >= 0) { // read one alignment from `in'
        // Cannot depad unmapped CRAM data
        if (b->core.flag & BAM_FUNMAP)
            goto next_seq;

        uint32_t *cigar = bam_get_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam_get_qname(b), h->target_name[b->core.tid]) == 0) {
            // Record is an embedded reference: named like its target and starting at 0
            r_tid = b->core.tid;
            if (0!=unpad_seq(b, &r)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in reference %s\n", bam_get_qname(b));
                ret = -1;
                goto cleanup;
            }
            if (h->target_len[r_tid] != r.l) {
                fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %u in BAM header, but %llu in embedded reference\n", bam_get_qname(b), h->target_len[r_tid], (unsigned long long)(r.l));
                ret = -1;
                goto cleanup;
            }
            if (fai) {
                // Check the embedded reference matches the FASTA file
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto cleanup;
                }
                assert(r.l == q.l);
                int i;
                for (i = 0; i < r.l; ++i) {
                    if (r.s[i] != q.s[i]) {
                        // Show gaps as ASCII 45
                        fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n",
                            h->target_name[b->core.tid], i+1,
                            r.s[i] ? seq_nt16_str[(int)r.s[i]] : 45,
                            q.s[i] ? seq_nt16_str[(int)q.s[i]] : 45);
                        ret = -1;
                        goto cleanup;
                    }
                }
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = update_posmap(posmap, r);
        } else if (b->core.n_cigar > 0) {
            int i, k, op;
            if (b->core.tid < 0) {
                fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam_get_qname(b));
                ret = -1;
                goto cleanup;
            } else if (b->core.tid == r_tid) {
                ; // good case, reference available
            } else if (fai) {
                if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                    fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                    ret = -1;
                    goto cleanup;
                }
                posmap = update_posmap(posmap, r);
                r_tid = b->core.tid;
            } else {
                fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]);
                ret = -1;
                goto cleanup;
            }
            if (0!=unpad_seq(b, &q)) {
                fprintf(stderr, "[depad] ERROR: Problem parsing SEQ and/or CIGAR in read %s\n", bam_get_qname(b));
                ret = -1;
                goto cleanup;
            }
            /* Carry over leading clips (H, or H followed by S, or S) unchanged */
            if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
            } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[0]);
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[1]);
                }
            }
            /* Determine CIGAR operator for each base in the aligned read */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
            /* Include any pads if starts with an insert */
            if (q.s[0] == BAM_CINS) {
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
                if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD));
                k = 0;
            } else if (q.s[0] == BAM_CPAD) {
                // Join 'k' CPAD to our first cigar op CPAD too.
                for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k);
            } else {
                k = 0;
            }
            /* Count consecutive CIGAR operators to turn into a CIGAR string */
            for (i = 1, k++, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i]; k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            /* Carry over trailing clips (S, or S followed by H, or H) unchanged */
            if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) {
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) {
                if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) {
                    write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]);
                }
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            }
            /* Remove redundant P operators between M/X/=/D operators, e.g. 5M2P10M -> 15M */
            int pre_op, post_op;
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) {
                    pre_op = bam_cigar_op(cigar2[i-2]);
                    post_op = bam_cigar_op(cigar2[i]);
                    /* Note don't need to check for X/= as code above will use M only */
                    if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) {
                        /* This is a redundant P operator */
                        cigar2[i-1] = 0; // i.e. 0M
                        /* If had same operator either side, combine them in post_op */
                        if (pre_op == post_op) {
                            /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/
                            cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op);
                            cigar2[i-2] = 0; // i.e. 0M
                        }
                    }
                }
            /* Remove the zero'd operators (0M) */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
        }
        /* Even unmapped reads can have a POS value, e.g. if their mate was mapped */
        if (b->core.pos != -1) b->core.pos = posmap[b->core.pos];
        if (b->core.mtid < 0 || b->core.mpos < 0) {
            /* Nice case, no mate to worry about*/
            /* TODO - Warning if FLAG says mate should be mapped? */
            /* Clean up funny input where mate position is given but mate reference is missing: */
            b->core.mtid = -1;
            b->core.mpos = -1;
        } else if (b->core.mtid == b->core.tid) {
            /* Nice case, same reference */
            b->core.mpos = posmap[b->core.mpos];
        } else {
            /* Nasty case, Must load alternative posmap */
            if (!fai) {
                fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto cleanup;
            }
            /* Temporarily load the other reference sequence */
            if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]);
                ret = -1;
                goto cleanup;
            }
            posmap = update_posmap(posmap, r);
            b->core.mpos = posmap[b->core.mpos];
            /* Restore the reference and posmap*/
            if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) {
                fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]);
                ret = -1;
                goto cleanup;
            }
            posmap = update_posmap(posmap, r);
        }
        /* Most reads will have been moved so safest to always recalculate the BIN value */
        b->core.bin = bam_reg2bin(b->core.pos, bam_endpos(b));

    next_seq:
        /* A failed write would otherwise silently drop records */
        if (sam_write1(out, h, b) < 0) {
            fprintf(stderr, "[depad] ERROR: Failed to write record to output\n");
            ret = -1;
            goto cleanup;
        }
    }
    if (read_ret < -1) {
        fprintf(stderr, "[depad] truncated file.\n");
        ret = 1;
    }

cleanup:
    /* Single exit: release everything, including the rebuilt-CIGAR buffer */
    free(r.s); free(q.s); free(posmap); free(cigar2);
    bam_destroy1(b);
    return ret;
}
Beispiel #27
0
/*
 * Per-read callback that checks (or infers) the bisulfite strand tag "ZS".
 *
 * The read is walked along its CIGAR and two counters are collected over the
 * aligned (M) columns: nC2T (reference C read as T) and nG2A (reference G
 * read as A).
 *
 * If the read already carries a ZS tag, its first character may be rewritten:
 *   '?'  when both conversion types are present (nG2A > 1, nC2T > 1) and
 *        similarity() exceeds 0.5          -> d->n_fail is incremented
 *   '-'  when it was '+' but G>A dominates  -> d->n_corr is incremented
 *   '+'  when it was '-' but C>T dominates  -> d->n_corr is incremented
 * Whenever the tag is rewritten the previous first character is preserved in
 * a new "OS" tag of type 'A'.
 *
 * If the read has no ZS tag and conf->infer_bsstrand is set, a fresh ZS tag
 * is appended based on the same counters and the read's strand flag.
 *
 * Parameters:
 *   b    - record to inspect; may be modified (aux tags) in place
 *   in   - input handle, used for the header's target names
 *   out  - optional output handle; when non-NULL the record is written to it
 *   data - bsstrand_data_t* carrying configuration, reference cache, counters
 *
 * Returns 0. Aborts the process on a CIGAR operation other than M/I/D/S.
 */
int bsstrand_func(bam1_t *b, const samfile_t *in, samfile_t *out, void *data) {

	bsstrand_data_t *d = (bsstrand_data_t*)data;
	bsstrand_conf_t *conf = d->conf;
	const bam1_core_t *c = &b->core;

	/* Unmapped reads are passed through untouched and only counted. */
	if (c->flag & BAM_FUNMAP){
		if (out) samwrite(out, b);
		d->n_unmapped++;
		return 0;
	}

	fetch_refseq(d->rs, in->header->target_name[c->tid], c->pos, c->pos+1);
	uint32_t rpos=c->pos+1, qpos=0;
	int i, nC2T = 0, nG2A = 0;
	uint32_t j;
	char rbase, qbase;

	for (i=0; i<c->n_cigar; ++i) {
		uint32_t op = bam_cigar_op(bam1_cigar(b)[i]);
		uint32_t oplen = bam_cigar_oplen(bam1_cigar(b)[i]);
		switch(op) {
		case BAM_CMATCH:
			for(j=0; j<oplen; ++j) {
				rbase = toupper(getbase_refseq(d->rs, rpos+j));
				qbase = bscall(bam1_seq(b), qpos+j);
				if (rbase == 'C' && qbase == 'T') nC2T += 1;
				if (rbase == 'G' && qbase == 'A') nG2A += 1;
			}
			rpos += oplen;
			qpos += oplen;
			break;
		case BAM_CINS:
		case BAM_CSOFT_CLIP:
			/* Both consume query bases only. */
			qpos += oplen;
			break;
		case BAM_CDEL:
			rpos += oplen;
			break;
		default:
			fprintf(stderr, "Unknown cigar, %u\n", op);
			abort();
		}
	}

	char key[2] = {'Z','S'};
	unsigned char *bsstrand = bam_aux_get(b, key);
	if (bsstrand) {
		bsstrand++;	/* step over the type byte to the tag's string value */
		double s = similarity(nG2A, nC2T);

		const char *verdict = NULL;	/* label printed in column 1 */
		char corrected = 0;		/* replacement for bsstrand[0]; 0 = keep */

		if (nG2A > 1 && nC2T > 1 && s > 0.5) {
			verdict = "F";
			corrected = '?';
			d->n_fail++;
		} else if (*bsstrand == '+' && nG2A > nC2T + 2) {
			verdict = "W2C";
			corrected = '-';
			d->n_corr++;
		} else if (*bsstrand == '-' && nC2T > nG2A + 2) {
			verdict = "C2W";
			corrected = '+';
			d->n_corr++;
		}

		if (corrected) {
			if (conf->output_read || conf->output_all_read)
				printf("%s\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", verdict, in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);

			/*
			 * bam_aux_append() may reallocate b->data, which would leave
			 * bsstrand dangling both as the copy source and for the
			 * write that follows. Copy the byte out first, remember
			 * where the tag lives as an offset, and write through the
			 * (possibly new) b->data afterwards.
			 */
			uint8_t original = bsstrand[0];
			size_t offset = (size_t)(bsstrand - b->data);
			bam_aux_append(b, "OS", 'A', 1, &original);
			b->data[offset] = (uint8_t)corrected;
		} else if (conf->output_all_read) {
			printf("N\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
		}
	} else if (conf->infer_bsstrand) {
		/* No ZS tag yet (the read is known to be mapped here): infer one. */
		char bss[3];
		if (similarity(nG2A, nC2T) < 0.5) {
			strcpy(bss, "??");
		} else if (nC2T > nG2A) {
			strcpy(bss, c->flag & BAM_FREVERSE ? "+-" : "++");
		} else {
			strcpy(bss, c->flag & BAM_FREVERSE ? "-+" : "--");
		}
		bam_aux_append(b, "ZS", 'Z', 3, (uint8_t*) bss);
	}

	if (out) samwrite(out, b);
	d->n_mapped++;

	return 0;
}
Beispiel #28
0
/**
 * Gets the base in the read that is mapped to a genomic position.
 * Extracts the read sequence and qualities too.
 *
 * The CIGAR is walked from the alignment start (srec->core.pos):
 *   - M      consumes reference and read
 *   - D      consumes reference only
 *   - S, I   consume read only
 *   - any other operation is ignored (consumes neither)
 *
 * @param srec      alignment record
 * @param pos       reference position to look up (same coordinate system as
 *                  srec->core.pos)
 * @param base      out: base at pos, or 'N' if pos is not covered by a read base
 * @param qual      out: quality character at pos, or 0 if not covered
 * @param rpos      out: index into the read of the base aligned to pos;
 *                  BAM_READ_INDEX_NA if pos falls in a deletion or is not
 *                  covered by the alignment; 0 if the record has no CIGAR
 * @param readseq   out: filled via bam_get_seq_string() when pos is covered
 * @param readqual  out: filled via bam_get_qual_string() when pos is covered
 */
void bam_get_base_and_qual_and_read_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual)
{
    bam1_core_t *c = &srec->core;
    int32_t rlen = c->l_qseq;
    uint32_t cpos = c->pos; //reference coordinates of the first mapped base
    rpos = 0; //read coordinates

    base = 'N';
    qual = 0;

    if (c->n_cigar)
    {
        uint32_t *cigar = bam_get_cigar(srec);
        for (uint32_t i = 0; i < c->n_cigar; ++i)
        {
            char op = bam_cigar_opchr(cigar[i]);
            // Take the length straight from the packed CIGAR word; no
            // temporary string (and no allocation to leak) is needed.
            uint32_t len = bam_cigar_oplen(cigar[i]);

            // True when pos lies inside [cpos, cpos+len). Written as a
            // difference so that it cannot wrap around for len == 0.
            bool covers = (pos >= cpos) && (pos - cpos < len);

            if (op=='M')
            {
                if (covers)
                {
                    rpos += pos-cpos;
                    break;
                }

                cpos += len;
                rpos += len;
            }
            else if (op=='D')
            {
                if (covers)
                {
                    rpos = -1; // pos is deleted in this read
                    break;
                }

                cpos += len;
            }
            else if (op=='S' || op=='I')
            {
                rpos += len;
            }
        }

        // Valid read indices are 0..rlen-1. If the loop ran to completion
        // without finding pos, rpos has accumulated every read-consuming
        // length and is therefore not a valid index.
        if (rpos>=0 && rpos<rlen)
        {
            //sequence
            bam_get_seq_string(srec, readseq);
            base = readseq->s[rpos];

            //qual
            bam_get_qual_string(srec, readqual);
            qual = readqual->s[rpos];
        }
        else
        {
            rpos = BAM_READ_INDEX_NA;
        }
    }
}
Beispiel #29
0
/*
 * Trims leading and trailing N base calls from a record in place.
 *
 * The record's data buffer is rebuilt inside op->data (qname, adjusted CIGAR,
 * repacked sequence, trimmed qualities) and then swapped with b->data. The
 * number of trimmed bases is recorded in the integer tags "NS" (start) and
 * "NE" (end), and the per-base uint32 array tags "PV" and "FA", if present,
 * are sliced down to the retained bases.
 *
 * b     record to trim; modified in place
 * data  opts_t* providing the scratch buffer (data/resize) and the
 *       skip_all_ns / min_trimmed_len settings
 *
 * Returns a flag word: op->skip_all_ns is OR-ed in when the leading-N test
 * below fires, and 1 is OR-ed in when the trimmed read is shorter than
 * op->min_trimmed_len. What the caller does with a non-zero value is not
 * visible here.
 */
static int trim_ns(bam1_t *b, void *data) {
    int ret = 0;
    opts_t *op((opts_t *)data);
    // Snapshot of the aux block; it is written back after the buffers are swapped.
    std::vector<uint8_t> aux(bam_get_aux(b), bam_get_aux(b) + bam_get_l_aux(b));
    int tmp;
    uint8_t *const seq(bam_get_seq(b));
    uint32_t *const cigar(bam_get_cigar(b));
    //op->n_cigar = b->core.n_cigar;
    op->resize(b->l_data); // Make sure it's big enough to hold everything.
    // The query name is carried over unchanged.
    memcpy(op->data, b->data, b->core.l_qname);

    // Get #Ns at the beginning
    // NOTE(review): this scan has no upper bound on tmp; for a read made up
    // entirely of Ns it keeps reading past the packed sequence. Confirm that
    // callers never pass such reads.
    for(tmp = 0; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; ++tmp);
    const int n_start(tmp);

    // NOTE(review): the comparison is against l_qseq - 1, i.e. it fires when
    // the first non-N base is the last base, which does not obviously match
    // the "all bases are N" wording below. Confirm the intended condition.
    if(tmp == b->core.l_qseq - 1) // all bases are N -- garbage read
         ret |= op->skip_all_ns;

    // Get #Ns at the end
    // NOTE(review): no lower bound on tmp either; see the note on the leading scan.
    for(tmp = b->core.l_qseq - 1; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; --tmp);
    const int n_end(b->core.l_qseq - 1 - tmp);

    // Get new length for read
    int final_len(b->core.l_qseq - n_end - n_start);
    if(final_len < 0) final_len = 0;
    if(final_len < op->min_trimmed_len) // Too short.
        ret |= 1;
    // Copy in qual and all of aux.

    // Shorten (or drop) the last CIGAR operation by the number of trailing Ns.
    // NOTE(review): presumably relies on the trailing Ns lying inside a final
    // soft clip at least n_end long; the remainder is re-encoded as a soft
    // clip regardless of the original operation. Confirm against callers.
    if(n_end) {
        if((tmp = bam_cigar_oplen(cigar[b->core.n_cigar - 1]) - n_end) == 0) {
            LOG_DEBUG("Entire cigar operation is the softclip. Decrease the number of new cigar operations.\n");
            --b->core.n_cigar;
        } else {
            LOG_DEBUG("Updating second cigar operation in-place.\n");
            cigar[b->core.n_cigar - 1] = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        }
    }

    // Get new n_cigar.
    // Same treatment for the first operation: drop it when it is exactly
    // n_start long, otherwise (if anything was trimmed) rewrite its length.
    if((tmp = bam_cigar_oplen(*cigar) - n_start) == 0) {
        memcpy(op->data + b->core.l_qname, cigar + 1, (--b->core.n_cigar) << 2); // << 2: each cigar op is a 4-byte uint32_t
    } else {
        if(n_start) *cigar = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        memcpy(op->data + b->core.l_qname, cigar, b->core.n_cigar << 2);
    }
    uint8_t *opseq(op->data + b->core.l_qname + (b->core.n_cigar << 2)); // Pointer to the seq region of new data field.
    // Repack the retained bases two per byte (high nibble first), starting at
    // base n_start of the original sequence.
    for(tmp = 0; tmp < final_len >> 1; ++tmp)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4) | (bam_seqi(seq, (tmp << 1) + n_start + 1));
    // An odd final length leaves one base for the high nibble of the last byte.
    if(final_len & 1)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4);

    tmp = bam_get_l_aux(b);
    // Copies final_len + l_aux bytes starting at the first retained quality
    // value. The aux portion is overwritten from the snapshot after the swap.
    memcpy(opseq + ((final_len + 1) >> 1), bam_get_qual(b) + n_start, final_len + tmp);
    // Switch data strings
    std::swap(op->data, b->data);
    b->core.l_qseq = final_len;
    // With l_qseq updated, bam_get_aux() now points at the new aux location.
    memcpy(bam_get_aux(b), aux.data(), aux.size());
    b->l_data = (bam_get_aux(b) - b->data) + aux.size();
    // Record how many bases were removed from each end.
    if(n_end) bam_aux_append(b, "NE", 'i', sizeof(int), (uint8_t *)&n_end);
    if(n_start) bam_aux_append(b, "NS", 'i', sizeof(int), (uint8_t *)&n_start);
    const uint32_t *pvar((uint32_t *)dlib::array_tag(b, "PV"));
    // Offset of the first retained element within the per-base arrays: for
    // reverse-strand reads the trailing trim count is used instead of the
    // leading one.
    tmp = b->core.flag & BAM_FREVERSE ? n_end: n_start;
    if(pvar) {
        // Copy the retained slice before deleting the tag it points into.
        std::vector<uint32_t>pvals(pvar + tmp, pvar + final_len + tmp);
        // NOTE(review): the -6 presumably steps back from the array payload
        // to the pointer bam_aux_del() expects (type byte, subtype byte and
        // 4-byte count); confirm against dlib::array_tag.
        bam_aux_del(b, (uint8_t *)(pvar) - 6);
        dlib::bam_aux_array_append(b, "PV", 'I', sizeof(uint32_t), final_len, (uint8_t *)pvals.data());
    }
    // Looked up only after PV has been rewritten, so the pointer reflects the
    // current aux layout.
    const uint32_t *fvar((uint32_t *)dlib::array_tag(b, "FA"));
    if(fvar) {
        std::vector<uint32_t>fvals(fvar + tmp, fvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(fvar) - 6);
        dlib::bam_aux_array_append(b, "FA", 'I', sizeof(uint32_t), final_len, (uint8_t *)fvals.data());
    }
    return ret;
}
Beispiel #30
0
int do_grep() {
#ifdef DEBUGa
	printf("[!]do_grep\n");
#endif
	BamInfo_t *pbam;
	kh_cstr_t BamID;
	khiter_t ki, bami;
	kstring_t ks1 = { 0, 0, NULL };
	kstring_t ks2 = { 0, 0, NULL };
	kstring_t ks3 = { 0, 0, NULL };

	samFile *in;
	bam_hdr_t *h;
	hts_idx_t *idx;
	bam1_t *b, *d, *d2, *bR1, *bR2, *bR3;
	bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1();
	//htsFile *out;
	//hts_opt *in_opts = NULL, *out_opts = NULL;
	int r = 0, exit_code = 0;

	kvec_t(bam1_t) R1, R2, RV;
	pierCluster_t *pierCluster;
	//samdat_t tmp_samdat;
	FILE *fs = fopen("./test.txt","w");

	for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) {
		//printf(">[%d]:\n",bami);
		if (kh_exist(bamNFOp, bami)) {
			kv_init(R1); kv_init(R2); kv_init(RV);
			//tmp_samdat = (const samdat_t){ 0 };
			//memset(&tmp_samdat,0,sizeof(samdat_t));
			//printf("-[%d]:\n",bami);
			BamID = kh_key(bamNFOp, bami);
			pbam = &kh_value(bamNFOp, bami);
			fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD);

			in = sam_open(pbam->fileName, "r");
			if (in == NULL) {
				fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			}
			h = sam_hdr_read(in);
/*			out = hts_open("-", "w");
			if (out == NULL) {
				fprintf(stderr, "[x]Error opening standard output\n");
				return EXIT_FAILURE;
			}
			if (sam_hdr_write(out, h) < 0) {
				fprintf(stderr, "[!]Error writing output header.\n");
				exit_code = 1;
			}
*/
			int8_t *ChrIsHum;
			if (h == NULL) {
				fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			} else {
				ChrIsHum = malloc(h->n_targets * sizeof(int8_t));
				for (int32_t i=0; i < h->n_targets; ++i) {
					//ChrIsHum[i] = -1;
					ki = kh_get(chrNFO, chrNFOp, h->target_name[i]);
					if (ki == kh_end(chrNFOp)) {
						errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]);
					} else {
						ChrInfo_t * tmp = &kh_value(chrNFOp, ki);
						ChrIsHum[i] = tmp->isHum;
						//printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]);
					}
				}
			}
			h->ignore_sam_err = 0;
			b = bam_init1();
			d = bam_init1();
			d2 = bam_init1();
			if ((idx = sam_index_load(in, pbam->fileName)) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
				return 1;
			}
			pierCluster = sam_plp_init();
			while ((r = sam_read1(in, h, b)) >= 0) {
				int8_t flag = false;
				const bam1_core_t *c = &b->core;
				if (c->flag & BAM_FSECONDARY) continue;
				if (c->n_cigar) {
					uint32_t *cigar = bam_get_cigar(b);
					for (int i = 0; i < c->n_cigar; ++i) {
						if (bam_cigar_opchr(cigar[i])=='S') {	// soft clipping
							if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) {
								flag = true;
							}
						}
					}
				}
				if (flag && ChrIsHum[c->tid]) {	// Now, skip Virus items.
					//bam_copy1(bR1, b);
					flag = 0;	// recycle
					//int enoughMapQ = 0;
					//kstring_t ks = { 0, 0, NULL };
					/*if (sam_format1(h, b, &ks1) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					} else*/ if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) {	// Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况.
						//printf(">[%s]\n",ks_str(&ks1));
						flag |= 1;
						//tmp_samdat.b = bam_dup1(b);
						//kv_push(samdat_t,R1,tmp_samdat);
						/*if (checkMapQ(ChrIsHum, b, true)) {
							++enoughMapQ;
						}*/
					}
					if (getPairedSam(in, idx, b, d) != 0) {
						flag &= ~1;
						continue;
					} else {
						flag |= 2;
						/*if (checkMapQ(ChrIsHum, d, false)) {
							++enoughMapQ;
						}*/
						/*if (c->flag & BAM_FSECONDARY) {
							if (getPairedSam(in, idx, d, d2) == 0) {
								//sam_format1(h, d2, &ks3);
								flag |= 4;
								if (checkMapQ(ChrIsHum, d2, false)) {
									++enoughMapQ;
								}
							}
						}*/
					}
/*
对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。
>[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	353	chr2	13996555	0	50S40M	chr18	48245109	0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:40	AS:i:40	XS:i:40	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0;	YC:Z:CT	YD:Z:f]
-[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	177	chr18	48245109	9	40S50M	gi|59585|emb|X04615.1|2000	0	GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:50	AS:i:50	XS:i:46	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0;	YC:Z:GA	YD:Z:f]
+[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	113	gi|59585|emb|X04615.1|	2000	60	40S46M4S	chr18	48245109	0	TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:46	AS:i:46	XS:i:27	RG:Z:Fsimout_mB	SA:Z:fchr2,13996555,+,50S40M,0,0;	YC:Z:CT	YD:Z:r]
*/
					/*if (sam_format1(h, d, &ks2) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					}*/
					if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) {
						/*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1));
						printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2));
						if (flag & 4) {
							printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3));
						}
						printf("<--%d\n",enoughMapQ);*/
						if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) {
							//printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) sam_plp_push(ChrIsHum, pierCluster, d);
							//if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) sam_plp_push(ChrIsHum, pierCluster, d2);
						} else {
							//print
							fprintf(fs,"[%s]\nHumRange=%s:%d-%d\n", BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos);
							for (size_t i=0; i<kv_size(pierCluster->Reads);++i) {
								bam1_t *bi = kv_A(pierCluster->Reads, i);
								if (sam_format1(h, bi, &ks1) < 0) {
									fprintf(stderr, "Error writing output.\n");
									exit_code = 1;
									break;
								} else {
									fprintf(fs,"%s\n",ks1.s);
								}
							}
							fprintf(fs,"\n");
							//printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							//fflush(fs);
							sam_plp_dectroy(pierCluster);
							pierCluster = sam_plp_init();
						}
					}
				}
				/*char *qname = bam_get_qname(b);
				if (sam_write1(out, h, b) < 0) {
					fprintf(stderr, "[x]Error writing output.\n");
					exit_code = 1;
					break;
				}*/
			}
/*			r = sam_close(out);   // stdout can only be closed once
			if (r < 0) {
				fprintf(stderr, "Error closing output.\n");
				exit_code = 1;
			}
*/
			hts_idx_destroy(idx);
			bam_destroy1(b);
			bam_destroy1(d);
			bam_destroy1(d2);
			bam_hdr_destroy(h);
			r = sam_close(in);
			free(ChrIsHum);
#ifdef DEBUGa
			fflush(NULL);
			//pressAnyKey();
#endif
			sam_plp_dectroy(pierCluster);
			//printf("<[%d]:\n",bami);
		}
	}
	fclose(fs);
	getPairedSam(NULL, NULL, NULL, NULL);	// sam_close(fp2);
	//printf("---[%d]---\n",exit_code);
	bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3);
	ks_release(&ks1);
	ks_release(&ks2);
	ks_release(&ks3);
	return exit_code;
}