Beispiel #1
0
/*
** Encode one assembly read as a BAM alignment record and write it out.
**
** The core fields of `bam` are filled from the read, then the
** variable-length part of the record is laid out in bam->data in this
** order: query name (with its terminating NUL), binary CIGAR operations,
** 4-bit packed sequence, base qualities.  The read's tags are appended
** afterwards through ajSeqBamAuxAppend and the finished record is passed
** to ajSeqBamWrite.
**
** gzfile  output handle handed to ajSeqBamWrite
** r       read to encode (not modified)
** bam     record buffer; core, data, data_len, m_data and l_aux are
**         overwritten
**
** Returns ajTrue unconditionally; the result of ajSeqBamWrite is not
** inspected.
*/
static AjBool assemoutWriteBamAlignment(AjPSeqBamBgzf gzfile,
					const AjPAssemRead r,
					AjPSeqBam bam)
{
    AjPSeqBamCore c;
    AjPAssemTag tag;
    unsigned char *dpos;
    const char *s;
    ajuint ilen;
    ajuint slen;
    ajuint i;
    AjIList l = NULL;
    AjBool hasqual;

    /* optional fields */
    ajint tagvalsize = 0;
    const unsigned char* tagval = 0;
    ajint intval =0;

    /* processing cigar strings*/
    char *t;
    int op;
    long x;

    /*
    ** Maps a sequence character to its 4-bit code; anything not listed
    ** maps to 15.  Read-only, so it is built once instead of being
    ** re-initialised on the stack for every record.
    */
    static const unsigned char bam_nt16_table[256] =
    {
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
     15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
     15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
     15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
    };

    /* bam_write1 for each alignment */

    c = &bam->core;

    ilen = ajStrGetLen(r->Seq);

    c->tid = (int) r->Reference;

    /* BAM positions are zero based, hence the -1; reverse-strand reads
    ** take their position from y1 instead of x1 */
    if(r->Flag & BAM_FREVERSE)
	c->pos = r->y1-1;
    else
	c->pos = r->x1-1;

    c->bin = 0;			/* real value is computed once the CIGAR is set */
    c->qual = r->MapQ;
    c->l_qname = 1 + ajStrGetLen(r->Name);	/* +1 for the terminating NUL */
    c->flag = r->Flag;
    c->n_cigar = 0;
    c->l_qseq = ilen;
    c->mtid = (int) r->Rnext;
    c->mpos = (int) r->Pnext-1;
    c->isize = r->Tlen;


    /* count the CIGAR operations: one per letter or '=' */
    s = ajStrGetPtr(r->Cigar);
    if (strcmp(s,"*")) /* '*' means unavailable  */
    {
	for (; *s; ++s)
	{
	    /* <ctype.h> functions require an unsigned char value */
	    if ((isalpha((unsigned char)*s)) || (*s=='='))
		++c->n_cigar;
	    else if (!isdigit((unsigned char)*s))
		ajWarn("invalid CIGAR character: %c\n", *s);
	}
    }


    /* 4 bytes per CIGAR op, the name, two bases per byte, one quality
    ** byte per base */
    bam->data_len = c->n_cigar*4 + c->l_qname +
        (ilen + 1)/2 + ilen;

    /* allocation for optional tags are made as they are appended */

    if(bam->data_len > bam->m_data)
    {
        AJCRESIZE0(bam->data,bam->m_data, bam->data_len);
        bam->m_data = bam->data_len;
    }

    dpos = bam->data;

    /* copy query name to bam->data */
    memcpy(dpos, ajStrGetPtr(r->Name), c->l_qname);
    dpos += c->l_qname;

    /* copy cigar string to bam->data */
    s = ajStrGetPtr(r->Cigar);
    for (i = 0; i != c->n_cigar; ++i)
    {
	x = strtol(s, &t, 10);
	op = toupper((unsigned char)*t);
	if (op == 'M') op = BAM_CMATCH;
	else if (op == 'I') op = BAM_CINS;
	else if (op == 'D') op = BAM_CDEL;
	else if (op == 'N') op = BAM_CREF_SKIP;
	else if (op == 'S') op = BAM_CSOFT_CLIP;
	else if (op == 'H') op = BAM_CHARD_CLIP;
	else if (op == 'P') op = BAM_CPAD;
	else if (op == '=') op = BAM_CEQUAL;
	else if (op == 'X') op = BAM_CDIFF;
	else ajWarn("invalid CIGAR operation: %c",op);
	s = t + 1;
	/* length in the high bits, operation code in the low bits */
	((ajuint*)dpos)[i] = x << BAM_CIGAR_SHIFT | op;
    }

    if (*s && c->n_cigar)
	ajWarn("unmatched CIGAR operation: %c", *s);

    c->bin = ajSeqBamReg2bin(c->pos, ajSeqBamCalend(c, MAJSEQBAMCIGAR(bam)));

    dpos += c->n_cigar*4;


    /* copy sequence string to bam->data, two bases per byte with the
    ** even-indexed base in the high nibble */
    s = ajStrGetPtr(r->Seq);
    slen = (ilen+1)/2;
    for (i = 0; i < slen; ++i)
        dpos[i] = 0;
    for (i = 0; i < ilen; ++i)
    {
	/* index through unsigned char: a plain char may be negative and
	** would otherwise address memory far outside the 256-entry table */
        dpos[i/2] |= bam_nt16_table[(unsigned char)s[i]] << 4*(1-i%2);
    }
    dpos += slen;

    /* copy quality values to bam->data */
    hasqual = (r->SeqQ && !ajStrMatchC(r->SeqQ, "*"));

    if(hasqual && ajStrGetLen(r->SeqQ) < ilen)
    {
	/* copying ilen bytes would read past the end of the quality
	** string, so treat the qualities as unavailable instead */
	ajWarn("quality string is shorter than the sequence; "
	       "qualities written as missing");
	hasqual = ajFalse;
    }

    if(hasqual)
    {
	s = ajStrGetPtr(r->SeqQ);

	/* stored qualities are the characters minus the offset 33 */
	for(i=0;i<ilen;i++)
	    dpos[i]= s[i]-33;
    }
    else
    {
	/* no usable qualities: every byte is set to 0xff */
	for(i=0;i<ilen;i++)
	    dpos[i]= 0xff;
    }


    /* append the optional tags after the fixed part of the record */
    l = ajListIterNewread(r->Tags);
    bam->l_aux=0;
    while (!ajListIterDone(l))
    {

	tag = ajListIterGet(l);

	/* TODO: array type 'B' and other types */

	/*
	** Integer tags: the value is parsed into intval and the first
	** tagvalsize bytes of intval are handed on.
	** NOTE(review): those are the low-order bytes only on
	** little-endian hosts, and the result of ajStrToInt is not
	** checked -- confirm whether either matters here.
	*/
	if(tag->type == 'i' || tag->type == 'I')
	{
	    tagvalsize = 4;
	    ajStrToInt(tag->Comment, &intval);
	    tagval = (unsigned char*)&intval;
	}
	else if(tag->type =='s' || tag->type =='S')
	{
	    tagvalsize = 2;
	    ajStrToInt(tag->Comment, &intval);
	    tagval = (unsigned char*)&intval;
	}
	else if(tag->type =='c' || tag->type =='C')
	{
	    tagvalsize = 1;
	    ajStrToInt(tag->Comment, &intval);
	    tagval = (unsigned char*)&intval;
	}
	else if(tag->type =='A')
	{
	    /* single character: first byte of the tag text */
	    tagvalsize = 1;
	    tagval = (const unsigned char*)ajStrGetPtr(tag->Comment);
	}
	else if(tag->type =='Z')
	{
	    /* string: the text including its terminating NUL */
	    tagvalsize = ajStrGetLen(tag->Comment)+1;
	    tagval = (const unsigned char*)ajStrGetPtr(tag->Comment);
	}
	else
	{
	    ajWarn("tag type '%c' not yet supported",tag->type);
	    continue;
	}

	ajSeqBamAuxAppend(bam,
		ajStrGetPtr(tag->Name),
		tag->type,
		tagvalsize,
		tagval);
    }
    ajListIterDel(&l);



    ajSeqBamWrite(gzfile, bam);

    return ajTrue;
}
Beispiel #2
0
/*
** supermatcher: for every query sequence, look for word matches against
** each sequence of the target set, take the best word match as a seed,
** run a banded Smith-Waterman around it and report alignments whose
** score exceeds "minscore".  Query/target pairs without any word match
** are listed in the error file.
*/
int main(int argc, char **argv)
{
    /* input sequences and the alignment strings built for each pair */
    AjPSeqall queryseqs;
    AjPSeqset targetseqs;
    AjPSeq queryseq;
    const AjPSeq targetseq;
    AjPStr queryaln = 0;
    AjPStr targetaln = 0;
    const char   *queryseqc;
    const char   *targetseqc;

    /* output */
    AjPAlign align = NULL;
    AjPFile errorf;
    AjBool show = ajFalse;

    /* scoring */
    AjPMatrixf matrix;
    AjPSeqCvt cvt = 0;
    float **submatrix;
    float gapopen;
    float gapextend;
    float minscore;
    float score;

    /* path/compass work arrays; pathcap is their current capacity */
    float *path = NULL;
    ajint *compass = NULL;
    ajint pathcap = 0;
    ajint pathneed = 0;
    ajint width  = 0;

    /* end points of the region to align, and the region lengths */
    ajint querystart = 0;
    ajint targetstart = 0;
    ajint queryend   = 0;
    ajint targetend   = 0;
    ajint queryspan;
    ajint targetspan;

    /* word matching */
    AjPTable kmers = 0;
    ajint wordlen = 6;
    ajuint ntargetseqs;
    ajuint nkmers;
    EmbPWordRK* wordsw = NULL;
    AjPList* matchlist = NULL;
    EmbPWordMatch maxmatch; /* match with maximum score */

    /* One scan cursor per target sequence: how far that sequence has
    ** been scanned so far. */
    ajuint* lastlocation;

    ajuint tidx;		/* index into the target set / word array */
    ajuint locidx;		/* index into one word's location lists */

    embInit("supermatcher", argc, argv);

    matrix    = ajAcdGetMatrixf("datafile");
    queryseqs = ajAcdGetSeqall("asequence");
    targetseqs= ajAcdGetSeqset("bsequence");
    gapopen   = ajAcdGetFloat("gapopen");
    gapextend = ajAcdGetFloat("gapextend");
    wordlen   = ajAcdGetInt("wordlen");
    align     = ajAcdGetAlign("outfile");
    errorf    = ajAcdGetOutfile("errorfile");
    width     = ajAcdGetInt("width");	/* band width for Smith-Waterman */
    minscore  = ajAcdGetFloat("minscore");

    gapopen   = ajRoundFloat(gapopen, 8);
    gapextend = ajRoundFloat(gapextend, 8);

    submatrix = ajMatrixfGetMatrix(matrix);
    cvt       = ajMatrixfGetCvt(matrix);

    embWordLength(wordlen);

    /* for SAM output the seqset sequence acts as the reference */
    ajAlignSetRefSeqIndx(align, 1);

    ajSeqsetTrim(targetseqs);

    ntargetseqs = ajSeqsetGetSize(targetseqs);

    AJCNEW0(matchlist, ntargetseqs);

    /* collect the words of every target sequence into one table */
    for(tidx = 0; tidx < ntargetseqs; tidx++)
    {
	targetseq = ajSeqsetGetseqSeq(targetseqs, tidx);
	embWordGetTable(&kmers, targetseq);
	ajDebug("Number of distinct kmers found so far: %d\n",
		ajTableGetLength(kmers));
    }

    AJCNEW0(lastlocation, ntargetseqs);

    if(ajTableGetLength(kmers)<1)
	ajErr("no kmers found");

    nkmers = embWordRabinKarpInit(kmers, &wordsw, wordlen, targetseqs);

    while(ajSeqallNext(queryseqs,&queryseq))
    {
	ajSeqTrim(queryseq);

	queryaln = ajStrNewRes(1+ajSeqGetLen(queryseq));

	ajDebug("Read '%S'\n", ajSeqGetNameS(queryseq));

	/* fresh cursor and match list per target for this query */
	for(tidx = 0; tidx < ntargetseqs; tidx++)
	{
	    lastlocation[tidx] = 0;
	    matchlist[tidx] = ajListstrNew();
	}

	embWordRabinKarpSearch(ajSeqGetSeqS(queryseq), targetseqs,
		(const EmbPWordRK*)wordsw, wordlen, nkmers,
		matchlist, lastlocation, ajFalse);


	for(tidx = 0; tidx < ajSeqsetGetSize(targetseqs); tidx++)
	{
	    targetseq = ajSeqsetGetseqSeq(targetseqs, tidx);

	    ajDebug("Processing '%S'\n", ajSeqGetNameS(targetseq));

	    /* nothing to seed an alignment with: log it and move on */
	    if(ajListGetLength(matchlist[tidx])==0)
	    {
		ajFmtPrintF(errorf,
		            "No wordmatch start points for "
		            "%s vs %s. No alignment\n",
		            ajSeqGetNameC(queryseq),ajSeqGetNameC(targetseq));
		embWordMatchListDelete(&matchlist[tidx]);
		continue;
	    }


	    /* Only the maximum match serves as seed; when several
	     * locations share the maximum, the first one is taken.
	     * TODO: we should add a new option to make above limit optional
	     */
	    maxmatch = embWordMatchFirstMax(matchlist[tidx]);

	    supermatcher_findendpoints(maxmatch,targetseq, queryseq,
		    &targetstart, &querystart,
		    &targetend, &queryend);

	    /* lengths of the two regions handed to the aligner */
	    targetspan = targetend - targetstart + 1;
	    queryspan  = queryend - querystart + 1;

	    targetaln  = ajStrNewRes(1+ajSeqGetLen(targetseq));
	    queryseqc  = ajSeqGetSeqC(queryseq);
	    targetseqc = ajSeqGetSeqC(targetseq);

	    ajStrAssignC(&queryaln,"");
	    ajStrAssignC(&targetaln,"");

	    ajDebug("++ %S v %S start:%d %d end:%d %d\n",
		    ajSeqGetNameS(targetseq), ajSeqGetNameS(queryseq),
		    targetstart, querystart, targetend, queryend);

	    pathneed = (targetend-targetstart+2)*width;

	    if(pathneed <= pathcap)
	    {
		/* arrays are large enough: just clear the part in use */
		AJCSET0(path,pathneed);
		AJCSET0(compass,pathneed);
	    }
	    else
	    {
		AJCRESIZE0(path,pathcap,pathneed);
		AJCRESIZE0(compass,pathcap,pathneed);
		pathcap = pathneed;
		ajDebug("++ memory re/allocation for path/compass arrays"
			" to size: %d\n", pathneed);
	    }

	    ajDebug("Calling embAlignPathCalcSWFast "
		    "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n",
		    querystart, queryend, queryspan,
		    ajSeqGetLen(queryseq),
		    targetstart, targetend, targetspan,
		    ajSeqGetLen(targetseq),
		    width);

	    score = embAlignPathCalcSWFast(&targetseqc[targetstart],
	                                   &queryseqc[querystart],
	                                   targetspan,
	                                   queryspan,
	                                   0,width,
	                                   gapopen,gapextend,
	                                   path,submatrix,cvt,
	                                   compass,show);

	    if(score>minscore)
	    {
		embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend,
		                         targetseq,queryseq,
		                         &targetaln,&queryaln,
		                         targetspan,
		                         queryspan,
		                         0,width,
		                         &targetstart,&querystart);

		if(ajAlignFormatShowsSequences(align))
		{
		    ajDebug(" queryaln:%S \ntargetaln:%S\n",
		            queryaln,targetaln);
		    embAlignReportLocal(align,
			    queryseq, targetseq,
			    queryaln, targetaln,
			    querystart, targetstart,
			    gapopen, gapextend,
			    score, matrix,
			    1 + ajSeqGetOffset(queryseq),
			    1 + ajSeqGetOffset(targetseq)
		    );
		}
		else
		{
		    ajAlignDefineCC(align, ajStrGetPtr(targetaln),
		                    ajStrGetPtr(queryaln),
		                    ajSeqGetNameC(targetseq),
		                    ajSeqGetNameC(queryseq));
		    ajAlignSetScoreR(align, score);
		}

		ajAlignWrite(align);
		ajAlignReset(align);
	    }

	    ajStrDel(&targetaln);

	    embWordMatchListDelete(&matchlist[tidx]);
	}

	ajStrDel(&queryaln);
    }


    /* release the word array built by embWordRabinKarpInit */
    for(tidx = 0; tidx < nkmers; tidx++)
    {
	AJFREE(wordsw[tidx]->seqindxs);
	AJFREE(wordsw[tidx]->nSeqMatches);

	for(locidx = 0; locidx < wordsw[tidx]->nseqs; locidx++)
	    AJFREE(wordsw[tidx]->locs[locidx]);

	AJFREE(wordsw[tidx]->nnseqlocs);
	AJFREE(wordsw[tidx]->locs);
	AJFREE(wordsw[tidx]);
    }

    embWordFreeTable(&kmers);

    /* NOTE(review): the matrix is deleted here only for formats that do
    ** not show sequences -- presumably the align object takes ownership
    ** otherwise; confirm against embAlignReportLocal. */
    if(!ajAlignFormatShowsSequences(align))
	ajMatrixfDel(&matrix);

    AJFREE(path);
    AJFREE(compass);
    AJFREE(kmers);
    AJFREE(wordsw);

    AJFREE(matchlist);
    AJFREE(lastlocation);

    ajAlignClose(align);
    ajAlignDel(&align);
    ajSeqallDel(&queryseqs);
    ajSeqDel(&queryseq);
    ajSeqsetDel(&targetseqs);
    ajFileClose(&errorf);

    embExit();

    return 0;
}