Beispiel #1
0
extern "C" void bwa_seed2genome_pos(uint64_t sa_pos, uint64_t *contig_id, uint64_t *contig_pos, bwa_seq_t *seq)
{
	bwa_seq_t *p=seq ;

	p->sa = sa_pos ;
	p->c1 = 1 ;
	p->type=BWA_TYPE_UNIQUE ;
	p->cigar=NULL ;
	p->strand=0 ;
	
	mybwa_cal_pac_pos_core(bwt_bwt[0], bwt_bwt[1], p, 0, 0); 
				
	uint64_t len = pos_end(p) - p->pos; 
	int seq_id=-1 ;
	
	bns_coor_pac2real(bwt_bns, p->pos, len, &seq_id) ;
	uint64_t pos = (int)(p->pos - bwt_bns->anns[seq_id].offset) ;
	
	if (false && sa_pos==461542)
	{
		fprintf(stdout, "seq_id=%i, pos=%lu, n_aln=%i, multi=%i, strand=%i\n", seq_id, pos, p->n_aln, p->n_multi, p->strand) ; 

		p->sa = 461542;//461970 ;
		p->c1 = 1 ;
		p->type=BWA_TYPE_UNIQUE ;
		p->cigar=NULL ;
		p->strand=1 ;
		
		mybwa_cal_pac_pos_core(bwt_bwt[0], bwt_bwt[1], p, 0, 0); 
		
		uint64_t len = pos_end(p) - p->pos; 
		int seq_id=-1 ;
		
		bns_coor_pac2real(bwt_bns, p->pos, len, &seq_id) ;
		uint64_t pos = (int)(p->pos - bwt_bns->anns[seq_id].offset) ;

		fprintf(stdout, "+++ seq_id=%i, pos=%lu, n_aln=%i, multi=%i, strand=%i\n", seq_id, pos, p->n_aln, p->n_multi, p->strand) ;
		//fprintf(stdout, "bwt->seq_len=%lld", (long long int)bwt_bwt[0]->seq_len) ;
		//fprintf(stdout, "reverse_bwt->seq_len=%lld", (long long int)bwt_bwt[1]->seq_len) ;
		
		//bwa_seq_t *a=NULL ;
		//fprintf(stdout, "error%lld", (long long int)a->sa) ;
	}

	*contig_id=seq_id ;
	*contig_pos=pos ;
}
Beispiel #2
0
// Finalize a read's coordinates and copy the result into an Alignment.
//
// Runs bwa_cal_pac_pos_core() and bwa_refine_gapped() on `sequence`, then
// copies its edit counts, hit counts, type, contig/position, strand, mapping
// quality, CIGAR and MD string into a freshly built Alignment.
//
// Ownership: alignment.cigar is allocated here with new[] and alignment.md
// with strdup(); both are handed to the caller. sequence->md is released and
// reset to NULL before returning.
Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) {
  // Calculate the local coordinate and local alignment.
  bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr);
  bwa_refine_gapped(bns, 1, sequence, reference, NULL);

  // Copy the local alignment data into the alignment object.
  Alignment alignment;

  // Populate basic path info
  alignment.edit_distance = sequence->nm;
  alignment.num_mismatches = sequence->n_mm;
  alignment.num_gap_opens = sequence->n_gapo;
  alignment.num_gap_extensions = sequence->n_gape;
  alignment.num_best = sequence->c1;
  alignment.num_second_best = sequence->c2;

  // Final alignment position: contig index plus a position relative to the
  // contig's offset, shifted by one.
  alignment.type = sequence->type;
  bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig);
  alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1;
  alignment.negative_strand = sequence->strand;
  alignment.mapping_quality = sequence->mapQ;

  // Cigar step: deep-copy when present so the Alignment owns its own buffer.
  alignment.cigar = NULL;
  if(sequence->cigar) {
    alignment.cigar = new uint16_t[sequence->n_cigar];
    memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t));
  }
  alignment.n_cigar = sequence->n_cigar;

  // MD tag with a better breakdown of differences in the cigar.
  // strdup() must not be handed a null pointer, so an absent MD string is
  // propagated as NULL instead.
  alignment.md = sequence->md ? strdup(sequence->md) : NULL;
  // NOTE(review): delete[] is only correct if sequence->md was allocated with
  // new[]; if it comes from malloc()/strdup() this should be free() - confirm.
  delete[] sequence->md;
  sequence->md = NULL;

  return alignment;
}
Beispiel #3
0
/*
 * Trim an alignment whose reference span runs past the end of the reference
 * sequence it starts in.
 *
 * The CIGAR is walked once to find the end coordinate on the reference. If
 * that end exceeds the sequence length, a scratch CIGAR holding two candidate
 * alignments is built (one on each side of the boundary, each soft-clipping
 * the query bases that belong to the other side). The candidate with more
 * matched bases is copied back into `cigar`; p->len is updated, and p->k as
 * well when the second candidate is kept.
 *
 * qname    not referenced in the visible part of the function
 * bns      reference annotations; per-sequence offset and len are read
 * p        hit to fix; k and len are read and may be rewritten
 * n_cigar  number of entries in cigar
 * cigar    packed operations (low 4 bits = op, remaining bits = length),
 *          overwritten in place when a fix is applied
 *
 * NOTE(review): this excerpt ends before the function's return statement and
 * closing brace, so the return value is not documented here. The local
 * n_cigar is updated to the new entry count, presumably for that purpose.
 */
static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
{
	// FIXME: this routine does not work if the query bridges three reference sequences
	int32_t coor, refl, lq;
	int x, y, i, seqid;
	bns_coor_pac2real(bns, p->k, p->len, &seqid);
	// coor: start of the hit relative to its reference sequence; refl: that sequence's length
	coor = p->k - bns->anns[seqid].offset;
	refl = bns->anns[seqid].len;
	// x tracks the reference coordinate, y the number of query bases consumed
	x = coor; y = 0;
	// test if the alignment goes beyond the boundary
	for (i = 0; i < n_cigar; ++i) {
		int op = cigar[i]&0xf, ln = cigar[i]>>4;
		// ops 1, 4, 5 advance the query only; op 2 the reference only; anything else both
		if (op == 1 || op == 4 || op == 5) y += ln;
		else if (op == 2) x += ln;
		else x += ln, y += ln;
	}
	lq = y; // length of the query sequence
	if (x > refl) { // then fix it
		// nc: index in cn where the second candidate starts (0 = no split made yet)
		// mq[]: matched bases per candidate; nlen[]: reference length per candidate
		int j, nc, mq[2], nlen[2];
		// cn: scratch CIGAR; kk: start coordinate recorded for the second candidate
		uint32_t *cn, kk = 0;
		nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0;
		// A split match writes four entries in place of one, hence the three spare slots.
		// NOTE(review): the calloc() result is used without a NULL check.
		cn = calloc(n_cigar + 3, 4);
		x = coor; y = 0;
		for (i = j = 0; i < n_cigar; ++i) {
			int op = cigar[i]&0xf, ln = cigar[i]>>4;
			if (op == 4 || op == 5 || op == 1) { // ins or clipping
				y += ln;
				cn[j++] = cigar[i];
			} else if (op == 2) { // del
				if (x + ln >= refl && nc == 0) {
					// The deletion reaches the boundary: drop it, close the first
					// candidate by clipping the rest of the query, and open the
					// second by clipping the query bases consumed so far.
					cn[j++] = (uint32_t)(lq - y)<<4 | 4;
					nc = j;
					cn[j++] = (uint32_t)y<<4 | 4;
					kk = p->k + (x + ln - refl);
					nlen[0] = x - coor;
					nlen[1] = p->len - nlen[0] - ln;
				} else cn[j++] = cigar[i];
				x += ln;
			} else if (op == 0) { // match
				if (x + ln >= refl && nc == 0) {
					// The match reaches the boundary: split it into the part before
					// refl (first candidate) and the part after (second candidate).
					// FIXME: not consider a special case where a split right between M and I
					cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M
					cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S
					nc = j;
					mq[0] += refl - x;
					cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4;
					// Emit the trailing match only when it is non-empty
					if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0;
					mq[1] += x + ln - refl;
					kk = bns->anns[seqid].offset + refl;
					nlen[0] = refl - coor;
					nlen[1] = p->len - nlen[0];
				} else {
					cn[j++] = cigar[i];
					// Credit the match to whichever candidate is currently being built
					mq[nc?1:0] += ln;
				}
				x += ln; y += ln;
			}
		}
		if (mq[0] > mq[1]) { // then take the first alignment
			n_cigar = nc;
			memcpy(cigar, cn, 4 * nc);
			p->len = nlen[0];
		} else {
			// Otherwise keep the second candidate, which also moves the hit's start
			p->k = kk; p->len = nlen[1];
			n_cigar = j - nc;
			memcpy(cigar, cn + nc, 4 * (j - nc));
		}
		free(cn);
	}
Beispiel #4
0
/*
 * Print one read as a single tab-separated, SAM-style line on stdout.
 *
 * bns        reference annotations; sequence names, offsets and lengths are read
 * p          read to print. When p is unmatched but its mate is matched, p->pos
 *            and p->strand are overwritten with the mate's values. For
 *            reverse-strand reads p->qual is handed to seq_reverse() before
 *            printing (presumably reversed in place - confirm against the helper).
 * mate       the paired read, or NULL
 * mode       only the BWA_MODE_COMPREAD bit is consulted; it selects whether
 *            the p->nm value is labelled "NM" or "CM"
 * max_top2   the X1 tag is printed only when p->c1 <= max_top2
 * bwa_rg_id  value for the RG:Z tag, or NULL to omit the tag
 */
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2, const char *bwa_rg_id)
{
	int j;
	//if (strcmp (p->name, "HWUSI-EAS1600:WT2_250_read_1:11_30_09:3:1:83:1066#0") == 0)
	//{
	//	fprintf (stderr, "found %s\n", p->name);
	//}
	// A positioned record is written when this read or its mate has a match.
	if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
		int seqid, nn, am = 0, flag = p->extra_flag;
		char XT;

		if (p->type == BWA_TYPE_NO_MATCH) {
			// Unmatched read with a matched mate: borrow the mate's position
			// and strand, and flag this read as unmapped.
			p->pos = mate->pos;
			p->strand = mate->strand;
			flag |= SAM_FSU;
			j = 1;
		} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

		// get seqid
		nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
		if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
			flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences

		// update flag and print it
		if (p->strand) flag |= SAM_FSR;
		if (mate) {
			if (mate->type != BWA_TYPE_NO_MATCH) {
				if (mate->strand) flag |= SAM_FMR;
			} else flag |= SAM_FMU;
		}
		// name, flag, reference name, then position (offset-relative, plus one) and mapQ
		printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
		printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);

		// print CIGAR
		if (p->cigar) {
			for (j = 0; j != p->n_cigar; ++j)
				printf("%d%c", __cigar_len(p->cigar[j]), "MIDSN"[__cigar_op(p->cigar[j])]);
		} else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
		else printf("%dM", p->len); // no explicit CIGAR: a single match of p->len

		// print mate coordinate
		if (mate && mate->type != BWA_TYPE_NO_MATCH) {
			int m_seqid, m_is_N;
			long long isize;
			am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
			// redundant calculation here, but should not matter too much
			// (m_is_N receives the return value but is not read afterwards)
			m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
			printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
			// insert size is reported only when both reads lie on the same
			// reference sequence and this read is itself matched
			isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
			if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
			printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
		} else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
		else printf("\t*\t0\t0\t");

		// print sequence and quality
		// forward strand: bases in stored order; otherwise walk the read
		// backwards using the complemented lookup table
		if (p->strand == 0)
			for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
		else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");

		// optional tags
		if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		if (p->type != BWA_TYPE_NO_MATCH) {
			int i;
			// calculate XT tag
			XT = "NURM"[p->type];
			// nn is the value returned by bns_coor_pac2real() above; above 10 it forces XT to 'N'
			if (nn > 10) XT = 'N';
			// print tags
			printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
			// print XS tag, to be compatible with Cufflinks
			if (p->sense_strand != 2 ) printf("\tXS:A:%c", p->sense_strand ? '-':'+' );
			else printf("\tXS:A:.");
			if (nn) printf("\tXN:i:%d", nn);
			if (mate) printf("\tSM:i:%lu\tAM:i:%d", p->seQ, am);
			if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
				printf("\tX0:i:%lu", p->c1);
				if (p->c1 <= max_top2) printf("\tX1:i:%lu", p->c2);
			}
			printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo_t + p->n_gapo_q, p->n_gapo_t+p->n_gape_t+p->n_gapo_q+p->n_gape_q);
			if (p->md) printf("\tMD:Z:%s", p->md);
			// print multiple hits
			// (this loop reuses j, nn and seqid, overwriting the primary hit's values)
			if (p->n_multi) {
				// the "XA:Z:" prefix is written lazily so it is omitted when every hit is skipped
				bool header_printed = 0;
				for (i = 0; i < p->n_multi; ++i) {
					bwt_multi1_t *q = p->multi + i;
					j = pos_end_multi(q, p->len) - q->pos;
					nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
					if(pos_end_multi(q, p->len) - bns->anns[seqid].offset > bns->anns[seqid].len) continue; //the alignment bridges adjacent sequences (chroms)
//TODO: need to avoid this at the first place in the junction discovery step, but this should be rare for mm or human
					if (! header_printed) {
						header_printed = 1;
						printf("\tXA:Z:");
					}
					int k;
					// each entry: name,strand+position,CIGAR,nm,sense-strand;
					printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
						   (int)(q->pos - bns->anns[seqid].offset + 1));
					if (q->cigar) {
						for (k = 0; k < q->n_cigar; ++k)
							printf("%d%c", __cigar_len(q->cigar[k]), "MIDSN"[__cigar_op(q->cigar[k])]);
					} else printf("%dM", p->len);
					printf(",%d", q->nm); //q->gap_t + q->gap_q + q->mm);
					if (q->sense_strand != 2) printf(",%c;", q->sense_strand? '-' : '+' );
					else printf(",.;");
				}
			}
		}
		putchar('\n');
	} else { // this read has no match
		// neither this read nor its mate is matched: write a record with
		// placeholder reference, position and CIGAR fields
		ubyte_t *s = p->strand? p->rseq : p->seq;
		int flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
		printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
		for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
		putchar('\t');
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
			printf("%s", p->qual);
		} else printf("*");
		if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
		putchar('\n');
	}
}