/* assemoutWriteBamAlignment
**
** Packs one assembly read into the supplied BAM record and writes the
** record with ajSeqBamWrite.
**
** The record's data buffer is filled in this order: query name (with its
** terminating NUL), CIGAR operations (one 32-bit word each), the sequence
** packed as two 4-bit codes per byte, one quality byte per base, and
** finally the optional tags, which are appended through ajSeqBamAuxAppend.
**
** gzfile: output handle handed unchanged to ajSeqBamWrite
** r:      read to be written (name, flag, positions, CIGAR, sequence,
**         qualities and tag list are read from it)
** bam:    record that is overwritten: core fields, data buffer (grown
**         when too small), data_len and l_aux
**
** Returns ajTrue in all cases; the result of ajSeqBamWrite is not inspected.
*/
static AjBool assemoutWriteBamAlignment(AjPSeqBamBgzf gzfile,
                                        const AjPAssemRead r,
                                        AjPSeqBam bam)
{
    AjPSeqBamCore c;
    AjPAssemTag tag;
    unsigned char *dpos;
    const char *s;
    ajuint ilen;
    ajuint slen;
    ajuint i;
    AjIList l = NULL;

    /* optional fields */
    ajint tagvalsize = 0;
    const unsigned char* tagval = NULL;
    ajint intval = 0;

    /* processing cigar strings */
    char *t;
    int op;
    long x;

    /* character -> 4-bit base code; read-only, so built once */
    static const unsigned char bam_nt16_table[256] =
    {
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
         1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
        15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
        15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
        15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
        15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
        15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
    };

    /* bam_write1 for each alignment */

    c = &bam->core;
    ilen = ajStrGetLen(r->Seq);

    c->tid = (int) r->Reference;

    /* BAM format is zero based;
       -1 is translated to 0, meaning unmapped */
    if(r->Flag & BAM_FREVERSE)
        c->pos = r->y1-1;
    else
        c->pos = r->x1-1;

    c->bin = 0;
    c->qual = r->MapQ;
    c->l_qname = 1 + ajStrGetLen(r->Name);
    c->flag = r->Flag;
    c->n_cigar = 0;
    c->l_qseq = ilen;
    c->mtid = (int) r->Rnext;
    c->mpos = (int) r->Pnext-1;
    c->isize = r->Tlen;

    /* count the cigar operations; '*' means unavailable */
    s = ajStrGetPtr(r->Cigar);

    if(strcmp(s,"*"))
    {
        for(; *s; ++s)
        {
            /* <ctype.h> functions need an unsigned char value */
            if((isalpha((unsigned char)*s)) || (*s=='='))
                ++c->n_cigar;
            else if(!isdigit((unsigned char)*s))
                ajWarn("invalid CIGAR character: %c\n", *s);
        }
    }

    /* allocation for optional tags are made as they are appended */
    bam->data_len = c->n_cigar*4 + c->l_qname + (ilen + 1)/2 + ilen;

    if(bam->data_len > bam->m_data)
    {
        AJCRESIZE0(bam->data,bam->m_data, bam->data_len);
        bam->m_data = bam->data_len;
    }

    dpos = bam->data;

    /* copy query name (including its NUL terminator) to bam->data */
    memcpy(dpos, ajStrGetPtr(r->Name), c->l_qname);
    dpos += c->l_qname;

    /* copy cigar string to bam->data */
    s = ajStrGetPtr(r->Cigar);

    for(i = 0; i != c->n_cigar; ++i)
    {
        x = strtol(s, &t, 10);
        op = toupper((unsigned char)*t);

        switch(op)
        {
            case 'M': op = BAM_CMATCH;     break;
            case 'I': op = BAM_CINS;       break;
            case 'D': op = BAM_CDEL;       break;
            case 'N': op = BAM_CREF_SKIP;  break;
            case 'S': op = BAM_CSOFT_CLIP; break;
            case 'H': op = BAM_CHARD_CLIP; break;
            case 'P': op = BAM_CPAD;       break;
            case '=': op = BAM_CEQUAL;     break;
            case 'X': op = BAM_CDIFF;      break;
            default:
                /* op keeps the raw character value in this case */
                ajWarn("invalid CIGAR operation: %c",op);
                break;
        }

        s = t + 1;

        /* shift in unsigned arithmetic: left-shifting a negative long
           is undefined; the stored 32-bit word is otherwise unchanged */
        ((ajuint*)dpos)[i] = ((ajuint)x << BAM_CIGAR_SHIFT) | (ajuint)op;
    }

    if(*s && c->n_cigar)
        ajWarn("unmatched CIGAR operation: %c", *s);

    c->bin = ajSeqBamReg2bin(c->pos, ajSeqBamCalend(c, MAJSEQBAMCIGAR(bam)));

    dpos += c->n_cigar*4;

    /* copy sequence string to bam->data, two bases per byte,
       first base of each pair in the high nibble */
    s = ajStrGetPtr(r->Seq);
    slen = (ilen+1)/2;

    memset(dpos, 0, slen);

    for(i = 0; i < ilen; ++i)
    {
        /* index through unsigned char: a negative char converted to
           ajuint would point far outside the 256-entry table */
        dpos[i/2] |= bam_nt16_table[(unsigned char)s[i]] << 4*(1-i%2);
    }

    dpos += slen;

    /* copy quality values to bam->data; 0xff marks missing qualities */
    s = NULL;

    if(r->SeqQ && !ajStrMatchC(r->SeqQ, "*"))
    {
        /* never read beyond the end of a too-short quality string */
        if(ajStrGetLen(r->SeqQ) >= ilen)
            s = ajStrGetPtr(r->SeqQ);
        else
            ajWarn("quality string of read '%S' is shorter than its "
                   "sequence; qualities written as missing", r->Name);
    }

    if(s)
    {
        for(i=0;i<ilen;i++)
            dpos[i] = (unsigned char)(s[i]-33);
    }
    else
    {
        for(i=0;i<ilen;i++)
            dpos[i] = 0xff;
    }

    /* optional tags */
    l = ajListIterNewread(r->Tags);
    bam->l_aux=0;

    while(!ajListIterDone(l))
    {
        tag = ajListIterGet(l);

        /* TODO: array type 'B' and other types */

        /* NOTE(review): for the 2- and 1-byte integer types the first
           bytes of intval are passed on, which presumably assumes a
           little-endian host - confirm */
        if(tag->type == 'i' || tag->type == 'I')
        {
            tagvalsize = 4;
            ajStrToInt(tag->Comment, &intval);
            tagval = (unsigned char*)&intval;
        }
        else if(tag->type =='s' || tag->type =='S')
        {
            tagvalsize = 2;
            ajStrToInt(tag->Comment, &intval);
            tagval = (unsigned char*)&intval;
        }
        else if(tag->type =='c' || tag->type =='C')
        {
            tagvalsize = 1;
            ajStrToInt(tag->Comment, &intval);
            tagval = (unsigned char*)&intval;
        }
        else if(tag->type =='A')
        {
            tagvalsize = 1;
            tagval = (const unsigned char*)ajStrGetPtr(tag->Comment);
        }
        else if(tag->type =='Z')
        {
            /* +1 so the terminating NUL is stored as well */
            tagvalsize = ajStrGetLen(tag->Comment)+1;
            tagval = (const unsigned char*)ajStrGetPtr(tag->Comment);
        }
        else
        {
            ajWarn("tag type '%c' not yet supported",tag->type);
            continue;
        }

        ajSeqBamAuxAppend(bam, ajStrGetPtr(tag->Name),
                          tag->type, tagvalsize, tagval);
    }

    ajListIterDel(&l);

    ajSeqBamWrite(gzfile, bam);

    return ajTrue;
}
int main(int argc, char **argv) { AjPSeqall queryseqs; AjPSeqset targetseqs; AjPSeq queryseq; const AjPSeq targetseq; AjPStr queryaln = 0; AjPStr targetaln = 0; AjPFile errorf; AjBool show = ajFalse; const char *queryseqc; const char *targetseqc; AjPMatrixf matrix; AjPSeqCvt cvt = 0; float **sub; ajint *compass = NULL; float *path = NULL; float gapopen; float gapextend; float score; float minscore; ajuint j, k; ajint querystart = 0; ajint targetstart = 0; ajint queryend = 0; ajint targetend = 0; ajint width = 0; AjPTable kmers = 0; ajint wordlen = 6; ajint oldmax = 0; ajint newmax = 0; ajuint ntargetseqs; ajuint nkmers; AjPAlign align = NULL; EmbPWordMatch maxmatch; /* match with maximum score */ /* Cursors for the current sequence being scanned, ** i.e., until which location it was scanned. ** Separate cursor/location entries for each sequence in the seqset. */ ajuint* lastlocation; EmbPWordRK* wordsw = NULL; AjPList* matchlist = NULL; embInit("supermatcher", argc, argv); matrix = ajAcdGetMatrixf("datafile"); queryseqs = ajAcdGetSeqall("asequence"); targetseqs= ajAcdGetSeqset("bsequence"); gapopen = ajAcdGetFloat("gapopen"); gapextend = ajAcdGetFloat("gapextend"); wordlen = ajAcdGetInt("wordlen"); align = ajAcdGetAlign("outfile"); errorf = ajAcdGetOutfile("errorfile"); width = ajAcdGetInt("width"); /* width for banded Smith-Waterman */ minscore = ajAcdGetFloat("minscore"); gapopen = ajRoundFloat(gapopen, 8); gapextend = ajRoundFloat(gapextend, 8); sub = ajMatrixfGetMatrix(matrix); cvt = ajMatrixfGetCvt(matrix); embWordLength(wordlen); /* seqset sequence is the reference sequence for SAM format */ ajAlignSetRefSeqIndx(align, 1); ajSeqsetTrim(targetseqs); ntargetseqs = ajSeqsetGetSize(targetseqs); AJCNEW0(matchlist, ntargetseqs); /* get tables of words */ for(k=0;k<ntargetseqs;k++) { targetseq = ajSeqsetGetseqSeq(targetseqs, k); embWordGetTable(&kmers, targetseq); ajDebug("Number of distinct kmers found so far: %d\n", ajTableGetLength(kmers)); } 
AJCNEW0(lastlocation, ntargetseqs); if(ajTableGetLength(kmers)<1) ajErr("no kmers found"); nkmers = embWordRabinKarpInit(kmers, &wordsw, wordlen, targetseqs); while(ajSeqallNext(queryseqs,&queryseq)) { ajSeqTrim(queryseq); queryaln = ajStrNewRes(1+ajSeqGetLen(queryseq)); ajDebug("Read '%S'\n", ajSeqGetNameS(queryseq)); for(k=0;k<ntargetseqs;k++) { lastlocation[k]=0; matchlist[k] = ajListstrNew(); } embWordRabinKarpSearch(ajSeqGetSeqS(queryseq), targetseqs, (const EmbPWordRK*)wordsw, wordlen, nkmers, matchlist, lastlocation, ajFalse); for(k=0;k<ajSeqsetGetSize(targetseqs);k++) { targetseq = ajSeqsetGetseqSeq(targetseqs, k); ajDebug("Processing '%S'\n", ajSeqGetNameS(targetseq)); if(ajListGetLength(matchlist[k])==0) { ajFmtPrintF(errorf, "No wordmatch start points for " "%s vs %s. No alignment\n", ajSeqGetNameC(queryseq),ajSeqGetNameC(targetseq)); embWordMatchListDelete(&matchlist[k]); continue; } /* only the maximum match is used as seed * (if there is more than one location with the maximum match * only the first one is used) * TODO: we should add a new option to make above limit optional */ maxmatch = embWordMatchFirstMax(matchlist[k]); supermatcher_findendpoints(maxmatch,targetseq, queryseq, &targetstart, &querystart, &targetend, &queryend); targetaln=ajStrNewRes(1+ajSeqGetLen(targetseq)); queryseqc = ajSeqGetSeqC(queryseq); targetseqc = ajSeqGetSeqC(targetseq); ajStrAssignC(&queryaln,""); ajStrAssignC(&targetaln,""); ajDebug("++ %S v %S start:%d %d end:%d %d\n", ajSeqGetNameS(targetseq), ajSeqGetNameS(queryseq), targetstart, querystart, targetend, queryend); newmax = (targetend-targetstart+2)*width; if(newmax > oldmax) { AJCRESIZE0(path,oldmax,newmax); AJCRESIZE0(compass,oldmax,newmax); oldmax=newmax; ajDebug("++ memory re/allocation for path/compass arrays" " to size: %d\n", newmax); } else { AJCSET0(path,newmax); AJCSET0(compass,newmax); } ajDebug("Calling embAlignPathCalcSWFast " "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n", querystart, queryend, (queryend - 
querystart + 1), ajSeqGetLen(queryseq), targetstart, targetend, (targetend - targetstart + 1), ajSeqGetLen(targetseq), width); score = embAlignPathCalcSWFast(&targetseqc[targetstart], &queryseqc[querystart], targetend-targetstart+1, queryend-querystart+1, 0,width, gapopen,gapextend, path,sub,cvt, compass,show); if(score>minscore) { embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend, targetseq,queryseq, &targetaln,&queryaln, targetend-targetstart+1, queryend-querystart+1, 0,width, &targetstart,&querystart); if(!ajAlignFormatShowsSequences(align)) { ajAlignDefineCC(align, ajStrGetPtr(targetaln), ajStrGetPtr(queryaln), ajSeqGetNameC(targetseq), ajSeqGetNameC(queryseq)); ajAlignSetScoreR(align, score); } else { ajDebug(" queryaln:%S \ntargetaln:%S\n", queryaln,targetaln); embAlignReportLocal(align, queryseq, targetseq, queryaln, targetaln, querystart, targetstart, gapopen, gapextend, score, matrix, 1 + ajSeqGetOffset(queryseq), 1 + ajSeqGetOffset(targetseq) ); } ajAlignWrite(align); ajAlignReset(align); } ajStrDel(&targetaln); embWordMatchListDelete(&matchlist[k]); } ajStrDel(&queryaln); } for(k=0;k<nkmers;k++) { AJFREE(wordsw[k]->seqindxs); AJFREE(wordsw[k]->nSeqMatches); for(j=0;j<wordsw[k]->nseqs;j++) AJFREE(wordsw[k]->locs[j]); AJFREE(wordsw[k]->nnseqlocs); AJFREE(wordsw[k]->locs); AJFREE(wordsw[k]); } embWordFreeTable(&kmers); if(!ajAlignFormatShowsSequences(align)) ajMatrixfDel(&matrix); AJFREE(path); AJFREE(compass); AJFREE(kmers); AJFREE(wordsw); AJFREE(matchlist); AJFREE(lastlocation); ajAlignClose(align); ajAlignDel(&align); ajSeqallDel(&queryseqs); ajSeqDel(&queryseq); ajSeqsetDel(&targetseqs); ajFileClose(&errorf); embExit(); return 0; }