Exemple #1
0
int main(int argc, char **argv)
{

    AjPSeqset seqset;
    const AjPSeq seq1;
    const AjPSeq seq2;
    ajint wordlen;
    AjPTable seq1MatchTable = NULL;
    AjPList matchlist ;
    AjPGraph graph = 0;
    ajuint i;
    ajuint j;
    float total=0;
    ajuint acceptableticks[]=
    {
	1,10,50,100,200,500,1000,1500,10000,50000,
	100000,500000,1000000,5000000
    };
    ajint numbofticks = 10;
    ajint gap,tickgap;
    AjBool boxit    = AJTRUE;
    AjBool dumpfeat = AJFALSE;
    float xmargin;
    float ymargin;
    float k;
    char ptr[10];
    float ticklen;
    float onefifth;
    AjPFeattable *tabptr = NULL;
    AjPFeattabOut seq1out = NULL;
    AjPStr sajb = NULL;
    float flen1;
    float flen2;
    ajuint tui;
    
    embInit("polydot", argc, argv);

    wordlen  = ajAcdGetInt("wordsize");
    seqset   = ajAcdGetSeqset("sequences");
    graph    = ajAcdGetGraph("graph");
    gap      = ajAcdGetInt("gap");
    boxit    = ajAcdGetBoolean("boxit");
    seq1out  = ajAcdGetFeatout("outfeat");
    dumpfeat = ajAcdGetToggle("dumpfeat");

    sajb = ajStrNew();
    embWordLength(wordlen);

    AJCNEW(lines,ajSeqsetGetSize(seqset));
    AJCNEW(pts,ajSeqsetGetSize(seqset));
    AJCNEW(tabptr,ajSeqsetGetSize(seqset));

    for(i=0;i<ajSeqsetGetSize(seqset);i++)
    {
	seq1 = ajSeqsetGetseqSeq(seqset, i);
	total += ajSeqGetLen(seq1);

    }
    
    total +=(float)(gap*(ajSeqsetGetSize(seqset)-1));
    
    xmargin = total*(float)0.15;
    ymargin = total*(float)0.15;
    
    ticklen = xmargin*(float)0.1;
    onefifth  = xmargin*(float)0.2;
    
    i = 0;
    while(acceptableticks[i]*numbofticks < ajSeqsetGetLen(seqset))
	i++;
    
    if(i<=13)
	tickgap = acceptableticks[i];
    else
	tickgap = acceptableticks[13];
    
    ajGraphAppendTitleS(graph, ajSeqsetGetUsa(seqset));

    ajGraphOpenWin(graph, (float)0.0-xmargin,(total+xmargin)*(float)1.35,
		   (float)0.0-ymargin,
		   total+ymargin);
    ajGraphicsSetCharscale((float)0.3);
    
    
    for(i=0;i<ajSeqsetGetSize(seqset);i++)
    {
	which = i;
	seq1 = ajSeqsetGetseqSeq(seqset, i);
	tui = ajSeqGetLen(seq1);
	flen1 = (float) tui;

	if(embWordGetTable(&seq1MatchTable, seq1)){ /* get table of words */
	    for(j=0;j<ajSeqsetGetSize(seqset);j++)
	    {
		seq2 = ajSeqsetGetseqSeq(seqset, j);
		tui  = ajSeqGetLen(seq2);
		flen2 = (float) tui;

		if(boxit)
		    ajGraphicsDrawposRect(xstart,ystart,
                                          xstart+flen1,
                                          ystart+flen2);

		matchlist = embWordBuildMatchTable(seq1MatchTable, seq2,
						   ajTrue);
		if(matchlist)
		    polydot_plotMatches(matchlist);

		if(i<j && dumpfeat)
		    embWordMatchListConvToFeat(matchlist,&tabptr[i],
					       &tabptr[j],seq1,
					       seq2);

		if(matchlist)	       /* free the match structures */
		    embWordMatchListDelete(&matchlist);

		if(j==0)
		{
		    for(k=0.0;k<ajSeqGetLen(seq1);k+=tickgap)
		    {
			ajGraphicsDrawposLine(xstart+k,ystart,xstart+k,
				    ystart-ticklen);

			sprintf(ptr,"%d",(ajint)k);
			ajGraphicsDrawposTextAtmid(xstart+k,
                                                   ystart-(onefifth),
                                                   ptr);
		    }
		    ajGraphicsDrawposTextAtmid(
                        xstart+(flen1/(float)2.0),
                        ystart-(3*onefifth),
                        ajStrGetPtr(ajSeqsetGetseqNameS(seqset, i)));
		}

		if(i==0)
		{
		    for(k=0.0;k<ajSeqGetLen(seq2);k+=tickgap)
		    {
			ajGraphicsDrawposLine(xstart,ystart+k,xstart-ticklen,
				    ystart+k);

			sprintf(ptr,"%d",(ajint)k);
			ajGraphicsDrawposTextAtend(xstart-(onefifth),
                                                   ystart+k,
                                                   ptr);
		    }
		    ajGraphicsDrawposTextAtlineJustify(
                        xstart-(3*onefifth),
                        ystart+(flen2/(float)2.0),
                        xstart-(3*onefifth),ystart+flen2,
                        ajStrGetPtr(ajSeqsetGetseqNameS(seqset, j)),0.5);
		}
		ystart += flen2+(float)gap;
	    }
	}
	embWordFreeTable(&seq1MatchTable);
	seq1MatchTable = NULL;
	xstart += flen1+(float)gap;
	ystart = 0.0;
    }
    
    ajGraphicsDrawposTextAtstart(total+onefifth,total-(onefifth),
		     "No. Length  Lines  Points Sequence");
    
    for(i=0;i<ajSeqsetGetSize(seqset);i++)
    {
	seq1 = ajSeqsetGetseqSeq(seqset, i);
	ajFmtPrintS(&sajb,"%3u %6d %5d %6d %s",i+1,
		    ajSeqGetLen(seq1),lines[i],
		    pts[i],ajSeqGetNameC(seq1));

	ajGraphicsDrawposTextAtstart(total+onefifth,total-(onefifth*(i+2)),
                                     ajStrGetPtr(sajb));
    }
    
    if(dumpfeat && seq1out)
    {
	for(i=0;i<ajSeqsetGetSize(seqset);i++)
	{
	    ajFeattableWrite(seq1out, tabptr[i]);
	    ajFeattableDel(&tabptr[i]);
	}
    }
    
    ajGraphicsClose();
    ajGraphxyDel(&graph);

    ajStrDel(&sajb);
    AJFREE(lines);
    AJFREE(pts);
    AJFREE(tabptr);

    ajSeqsetDel(&seqset);
    ajFeattabOutDel(&seq1out);;

    embExit();

    return 0;
}
int main(int argc, char **argv)
{
    AjPSeqset seqset;
    AjPSeqall seqall;
    AjPSeq queryseq;
    const AjPSeq targetseq;
    ajint wordlen;
    AjPTable wordsTable = NULL;
    AjPList* matchlist = NULL;
    AjPFile logfile;
    AjPFeattable* seqsetftables = NULL;
    AjPFeattable seqallseqftable = NULL;
    AjPFeattabOut ftoutforseqsetseq = NULL;
    AjPFeattabOut ftoutforseqallseq = NULL;
    AjPAlign align = NULL;
    AjIList iter = NULL;
    ajint targetstart;
    ajint querystart;
    ajint len;
    ajuint i, j;
    ajulong nAllMatches = 0;
    ajulong sumAllScore = 0;
    AjBool dumpAlign = ajTrue;
    AjBool dumpFeature = ajTrue;
    AjBool checkmode = ajFalse;
    EmbPWordRK* wordsw = NULL;
    ajuint npatterns = 0;
    ajuint seqsetsize;
    ajuint nmatches;
    ajuint* nmatchesseqset;
    ajuint* lastlocation; /* Cursors for Rabin-Karp search. */
                          /* Shows until what point the query sequence was
                           *  scanned for a pattern sequences in the seqset.
                          */
    char* paddedheader = NULL;
    const char* header;
    AjPStr padding;

    header = "Pattern %S  #pat-sequences  #all-matches  avg-match-length\n";
    padding = ajStrNew();

    embInit("wordmatch", argc, argv);

    wordlen = ajAcdGetInt("wordsize");
    seqset  = ajAcdGetSeqset("asequence");
    seqall  = ajAcdGetSeqall("bsequence");
    logfile = ajAcdGetOutfile("logfile");
    dumpAlign = ajAcdGetToggle("dumpalign");
    dumpFeature = ajAcdGetToggle("dumpfeat");

    if(dumpAlign)
    {
        align = ajAcdGetAlign("outfile");
        ajAlignSetExternal(align, ajTrue);
    }

    seqsetsize = ajSeqsetGetSize(seqset);
    ajSeqsetTrim(seqset);
    AJCNEW0(matchlist, seqsetsize);
    AJCNEW0(seqsetftables, seqsetsize);
    AJCNEW0(nmatchesseqset, seqsetsize);

    if (dumpFeature)
    {
        ftoutforseqsetseq =  ajAcdGetFeatout("aoutfeat");
        ftoutforseqallseq =  ajAcdGetFeatout("boutfeat");
    }

    checkmode = !dumpFeature && !dumpAlign;
    embWordLength(wordlen);

    ajFmtPrintF(logfile, "Small sequence/file for constructing"
	    " target patterns: %S\n", ajSeqsetGetUsa(seqset));
    ajFmtPrintF(logfile, "Large sequence/file to be scanned"
	    " for patterns: %S\n", ajSeqallGetUsa(seqall));
    ajFmtPrintF(logfile, "Number of sequences in the patterns file: %u\n",
            seqsetsize);
    ajFmtPrintF(logfile, "Pattern/word length: %u\n", wordlen);

    for(i=0;i<seqsetsize;i++)
    {
        targetseq = ajSeqsetGetseqSeq(seqset, i);
        embWordGetTable(&wordsTable, targetseq);
    }

    AJCNEW0(lastlocation, seqsetsize);

    if(ajTableGetLength(wordsTable)>0)
    {
        npatterns = embWordRabinKarpInit(wordsTable,
                                       &wordsw, wordlen, seqset);
        ajFmtPrintF(logfile, "Number of patterns/words found: %u\n", npatterns);

        while(ajSeqallNext(seqall,&queryseq))
        {
            for(i=0;i<seqsetsize;i++)
            {
                lastlocation[i]=0;

                if (!checkmode)
                    matchlist[i] = ajListstrNew();
            }

            nmatches = embWordRabinKarpSearch(
                    ajSeqGetSeqS(queryseq), seqset,
                    (EmbPWordRK const *)wordsw, wordlen, npatterns,
                    matchlist, lastlocation, checkmode);
            nAllMatches += nmatches;

            if (checkmode)
        	continue;

            for(i=0;i<seqsetsize;i++)
            {
                if(ajListGetLength(matchlist[i])>0)
                {
                    iter = ajListIterNewread(matchlist[i]) ;

                    while(embWordMatchIter(iter, &targetstart, &querystart, &len,
                            &targetseq))
                    {
                        if(dumpAlign)
                        {
                            ajAlignDefineSS(align, targetseq, queryseq);
                            ajAlignSetScoreI(align, len);
                            /* ungapped alignment means same length
                             *  for both sequences
                            */
                            ajAlignSetSubRange(align, targetstart, 1, len,
                                    ajSeqIsReversed(targetseq),
                                    ajSeqGetLen(targetseq),
                                    querystart, 1, len,
                                    ajSeqIsReversed(queryseq),
                                    ajSeqGetLen(queryseq));
                        }
                    }

                    if(dumpAlign)
                    {
                	ajAlignWrite(align);
                	ajAlignReset(align);
                    }

                    if(ajListGetLength(matchlist[i])>0 && dumpFeature)
                    {
                        embWordMatchListConvToFeat(matchlist[i],
                                                   &seqsetftables[i],
                                                   &seqallseqftable,
                                                   targetseq, queryseq);
                        ajFeattableWrite(ftoutforseqallseq, seqallseqftable);
                        ajFeattableDel(&seqallseqftable);
                    }

                    ajListIterDel(&iter);
                }

                embWordMatchListDelete(&matchlist[i]);
            }
        }

        /* search completed, now report statistics */
        for(i=0;i<npatterns;i++)
        {
            sumAllScore += wordsw[i]->lenMatches;

            for(j=0;j<wordsw[i]->nseqs;j++)
        	nmatchesseqset[wordsw[i]->seqindxs[j]] +=
        		wordsw[i]->nSeqMatches[j];
        }

        ajFmtPrintF(logfile, "Number of sequences in the file scanned "
                "for patterns: %u\n", ajSeqallGetCount(seqall));
        ajFmtPrintF(logfile, "Number of all matches: %Lu"
                " (wordmatch finds exact matches only)\n", nAllMatches);

        if(nAllMatches>0)
        {
            ajFmtPrintF(logfile, "Sum of match lengths: %Lu\n", sumAllScore);
            ajFmtPrintF(logfile, "Average match length: %.2f\n",
        	    sumAllScore*1.0/nAllMatches);

            ajFmtPrintF(logfile, "\nDistribution of the matches among pattern"
        	    " sequences:\n");
            ajFmtPrintF(logfile, "-----------------------------------------"
        	    "-----------\n");

            for(i=0;i<ajSeqsetGetSize(seqset);i++)
            {
        	if (nmatchesseqset[i]>0)
        	    ajFmtPrintF(logfile, "%-42s: %8u\n",
        	                ajSeqGetNameC(ajSeqsetGetseqSeq(seqset, i)),
        	                nmatchesseqset[i]);

        	ajFeattableWrite(ftoutforseqsetseq, seqsetftables[i]);
        	ajFeattableDel(&seqsetftables[i]);
            }

            ajFmtPrintF(logfile, "\nPattern statistics:\n");
            ajFmtPrintF(logfile, "-------------------\n");
            if(wordlen>7)
        	ajStrAppendCountK(&padding, ' ', wordlen-7);
            paddedheader = ajFmtString(header,padding);
            ajFmtPrintF(logfile, paddedheader);

            for(i=0;i<npatterns;i++)
        	if (wordsw[i]->nMatches>0)
        	    ajFmtPrintF(logfile, "%-7s: %12u  %12u %17.2f\n",
        	                wordsw[i]->word->fword, wordsw[i]->nseqs,
        	                wordsw[i]->nMatches,
        	                wordsw[i]->lenMatches*1.0/wordsw[i]->nMatches);
        }

    }

    for(i=0;i<npatterns;i++)
    {
        for(j=0;j<wordsw[i]->nseqs;j++)
            AJFREE(wordsw[i]->locs[j]);

        AJFREE(wordsw[i]->locs);
        AJFREE(wordsw[i]->seqindxs);
        AJFREE(wordsw[i]->nnseqlocs);
        AJFREE(wordsw[i]->nSeqMatches);
        AJFREE(wordsw[i]);
    }

    embWordFreeTable(&wordsTable);

    AJFREE(wordsw);
    AJFREE(matchlist);
    AJFREE(lastlocation);
    AJFREE(nmatchesseqset);
    AJFREE(seqsetftables);

    if(dumpAlign)
    {
        ajAlignClose(align);
        ajAlignDel(&align);
    }

    if(dumpFeature)
    {
        ajFeattabOutDel(&ftoutforseqsetseq);
        ajFeattabOutDel(&ftoutforseqallseq);
    }

    ajFileClose(&logfile);

    ajSeqallDel(&seqall);
    ajSeqsetDel(&seqset);
    ajSeqDel(&queryseq);
    ajStrDel(&padding);
    AJFREE(paddedheader);

    embExit();

    return 0;
}