Beispiel #1
0
/*
** Constructor for a resource object.
**
** The object itself is allocated with AJNEW0; every string member is then
** set from ajStrNew() and every list member from ajListNew() or
** ajListstrNew(), so no member is left unset when the object is returned.
**
** Returns the newly built resource object.
*/
AjPResource ajResourceNew(void)
{
    AjPResource resource;

    AJNEW0(resource);

    /* Identifiers, accession, name and description */
    resource->Id = ajStrNew();
    resource->Idalt = ajListstrNew();
    resource->Acc = ajStrNew();
    resource->Name = ajStrNew();
    resource->Desc = ajStrNew();

    /* URL members */
    resource->Url = ajStrNew();
    resource->Urllink = ajStrNew();
    resource->Urlrest = ajStrNew();
    resource->Urlsoap = ajStrNew();

    /* Category and taxon lists */
    resource->Cat = ajListstrNew();
    resource->Taxon = ajListNew();

    /* EDAM lists */
    resource->Edamdat = ajListNew();
    resource->Edamfmt = ajListNew();
    resource->Edamid = ajListNew();
    resource->Edamtpc = ajListNew();

    /* Cross-reference, query and example lists */
    resource->Xref = ajListNew();
    resource->Query = ajListNew();
    resource->Example = ajListstrNew();

    return resource;
}
Beispiel #2
0
/* @funcstatic remap_NoCutList ************************************************
**
** Writes the names of the restriction enzymes that satisfy the selection
** criteria but are not among the cutters in the hit table (nor among the
** isoschizomers listed for those cutters), followed by the number of
** enzymes that were rejected by the selection criteria.
**
** @param [u] outfile [AjPFile] Output file
** @param [r] hittable [const AjPTable] Table keyed by the name of each
**                                      enzyme that cuts; each value is a
**                                      PValue whose 'iso' member is parsed
**                                      as a space/comma separated list of
**                                      isoschizomer names
** @param [r] html [AjBool] Wrap the output in HTML tags
** @param [r] enzymes [const AjPStr] Comma separated list of enzyme names,
**                                   "all", or NULL (treated as "all")
** @param [r] blunt [AjBool] Accept blunt-end cutters
** @param [r] sticky [AjBool] Accept sticky-end cutters
** @param [r] sitelen [ajuint] Minimum recognition site length
** @param [r] commercial [AjBool] Reject enzymes whose pattern starts with a
**                                lower-case letter (only when searching all)
** @param [r] ambiguity [AjBool] Accept enzymes with ambiguous patterns
** @param [r] limit [AjBool] Rename non-cutters to their preferred name and
**                           drop duplicates before the comparison
** @param [r] retable [const AjPTable] Table passed to remap_RenamePreferred
** @return [void]
******************************************************************************/
static void remap_NoCutList(AjPFile outfile, const AjPTable hittable,
                            AjBool html, const AjPStr enzymes, AjBool blunt,
                            AjBool sticky, ajuint sitelen, AjBool commercial,
                            AjBool ambiguity, AjBool limit,
                            const AjPTable retable)
{

    /* for iterating over hittable */
    PValue value;
    void **keyarray = NULL;             /* array for table */
    void **valarray = NULL;             /* array for table */
    ajint i;

    /* list of enzymes that cut */
    AjPList cutlist;
    AjIList citer;                      /* iterator for cutlist */
    AjPStr cutname = NULL;
    AjBool found;

    /* for parsing value->iso string */
    AjPStrTok tok;
    char tokens[] = " ,";
    AjPStr code = NULL;
    const char *p;

    /* for reading in enzymes names */
    AjPFile enzfile = NULL;
    AjPStr *ea = NULL;                  /* NULL until a list is parsed */
    ajint ne;                           /* number of enzymes */
    AjBool isall = ajTrue;

    /* list of enzymes that don't cut */
    AjPList nocutlist;
    AjIList niter;                      /* iterator for nocutlist */
    AjPStr nocutname = NULL;

    /* count of rejected enzymes not matching criteria */
    ajint rejected_count = 0;

    EmbPPatRestrict enz;

    /* for renaming preferred isoschizomers */
    AjPList newlist;

    /*
    **
    ** Make a list of enzymes('cutlist') that hit
    ** including the isoschizomer names
    **
    */
    ajDebug("Make a list of all enzymes that cut\n");

    cutlist   = ajListstrNew();
    nocutlist = ajListstrNew();

    ajTableToarrayKeysValues(hittable, &keyarray, &valarray);
    for(i = 0; keyarray[i]; i++)
    {
        value = (PValue) valarray[i];
        cutname = ajStrNew();
        ajStrAssignRef(&cutname, keyarray[i]);
        ajListstrPushAppend(cutlist, cutname);

        /* Add to cutlist all isoschizomers of enzymes that cut */
        ajDebug("Add to cutlist all isoschizomers of enzymes that cut\n");

        /* start token to parse isoschizomers names */
        tok = ajStrTokenNewC(value->iso,  tokens);
        while(ajStrTokenNextParseC(&tok, tokens, &code))
        {
            cutname = ajStrNew();
            ajStrAssignS(&cutname, code);
            ajListstrPushAppend(cutlist, cutname);
        }
        ajStrTokenDel(&tok);
    }
    ajStrDel(&code);
    AJFREE(keyarray);
    AJFREE(valarray);

    /*
    ** Read in list of enzymes ('nocutlist') - either all or
    ** the input enzyme list.
    ** Exclude those that don't match the selection criteria - count these.
    */

    ajDebug("Read in a list of all input enzyme names\n");

    ne = 0;
    if(!enzymes)
        isall = ajTrue;
    else
    {
        /* get input list of enzymes into ea[] */
        ne = ajArrCommaList(enzymes, &ea);

        /*
        ** An empty list is handled like a missing one: ea[0] must not be
        ** read when no entries were returned.
        */
        if(ne < 1 || ajStrMatchCaseC(ea[0], "all"))
            isall = ajTrue;
        else
        {
            isall = ajFalse;
            for(i=0; i<ne; ++i)
                ajStrRemoveWhite(&ea[i]);
        }
    }

    /* NOTE(review): enzfile is used below without a NULL check - confirm
    ** that a missing data file is reported before this point */
    enzfile = ajDatafileNewInNameC(ENZDATA);

    /* push all enzyme names without the required criteria onto nocutlist */

    enz = embPatRestrictNew();
    while(!ajFileIsEof(enzfile))
    {
        if(!embPatRestrictReadEntry(enz, enzfile))
            continue;

        /*
        ** If user entered explicit enzyme list, then check to see if
        ** this is one of that explicit list
        */
        if(!isall)
        {
            found = AJFALSE;
            for(i=0; i<ne; ++i)
                if(ajStrMatchCaseS(ea[i], enz->cod))
                {
                    found = AJTRUE;
                    break;
                }

            if(!found)                  /* not in the explicit list */
                continue;

            ajDebug("RE %S is in the input explicit list of REs\n", enz->cod);
        }

        /* ignore ncuts==0 as they are unknown */
        if(!enz->ncuts)
        {
            /* number of cut positions */
            ajDebug("RE %S has an unknown number of cut positions\n",
                    enz->cod);
            continue;
        }
        ajDebug("RE %S has a known number of cut sites\n", enz->cod);

        if(enz->len < sitelen)
        {
            /* recognition site length */
            ajDebug("RE %S does not have a long enough recognition site\n",
                    enz->cod);
            rejected_count++;
            continue;
        }

        if(!blunt && enz->blunt)
        {
            /* blunt/sticky */
            ajDebug("RE %S is blunt\n", enz->cod);
            rejected_count++;
            continue;
        }

        if(!sticky && !enz->blunt)
        {
            /* blunt/sticky */
            ajDebug("RE %S is sticky\n", enz->cod);
            rejected_count++;
            continue;
        }

        /* commercially available enzymes have uppercase patterns */
        p = ajStrGetPtr(enz->pat);

        /*
        ** The -commercial qualifier is only used if we are searching
        ** through 'all' of the REBASE database - if we have specified an
        ** explicit list of enzymes then they are searched for whether or
        ** not they are commercially available
        */
        if((*p >= 'a' && *p <= 'z') && commercial && isall)
        {
            ajDebug("RE %S is not commercial\n", enz->cod);
            rejected_count++;
            continue;
        }

        if(!ambiguity && remap_Ambiguous(enz->pat))
        {
            ajDebug("RE %S is ambiguous\n", enz->cod);
            rejected_count++;
            continue;
        }

        ajDebug("RE %S matches all required criteria\n", enz->cod);

        code = ajStrNew();
        ajStrAssignS(&code, enz->cod);
        ajListstrPushAppend(nocutlist, code);
    }
    embPatRestrictDel(&enz);
    ajFileClose(&enzfile);

    /* release the parsed enzyme name array, if one was created */
    for(i=0; i<ne; ++i)
        if(ea[i])
            ajStrDel(&ea[i]);

    if(ea)
        AJFREE(ea);

    /*
    ** Change names of enzymes in the non-cutter list
    ** to that of preferred (prototype)
    ** enzyme name so that the isoschizomers of cutters
    ** will be removed from the
    ** non-cutter list in the next bit.
    ** Remove duplicate prototype names.
    */
    if(limit)
    {
        newlist = ajListstrNew();
        remap_RenamePreferred(nocutlist, retable, newlist);
        ajListstrFreeData(&nocutlist);
        nocutlist = newlist;
        ajListSortUnique(nocutlist, remap_cmpcase, remap_strdel);
    }

    /*
    ** Iterate through the list of input enzymes removing those that are in
    ** the cutlist.
    */

    ajDebug("Remove from the nocutlist all enzymes and isoschizomers "
            "that cut\n");

    /*
    **  This steps down both lists at the same time, comparing names and
    **  iterating to the next name in whichever list whose name compares
    **  alphabetically before the other.  Where a match is found, the
    **  nocutlist item is deleted.
    */

    ajListSort(nocutlist, remap_cmpcase);
    ajListSort(cutlist, remap_cmpcase);

    citer = ajListIterNewread(cutlist);
    niter = ajListIterNew(nocutlist);

    nocutname = (AjPStr)ajListIterGet(niter);
    cutname   = (AjPStr)ajListIterGet(citer);

    ajDebug("initial cutname, nocutname: '%S' '%S'\n", cutname, nocutname);

    while(nocutname != NULL && cutname != NULL)
    {
        i = ajStrCmpCaseS(cutname, nocutname);
        ajDebug("compare cutname, nocutname: %S %S ", cutname, nocutname);
        ajDebug("ajStrCmpCase=%d\n", i);
        if(i == 0)
        {
            /* match - so remove from nocutlist */
            ajDebug("ajListstrRemove %S\n", nocutname);
            ajListstrIterRemove(niter);
            nocutname = (AjPStr)ajListIterGet(niter);
            /*
            ** Don't increment the cutname list pointer here
            ** - there may be more than one entry in the nocutname
            ** list with the same name because we have converted
            ** isoschizomers to their preferred name
            */
        }
        else if(i < 0)
            /*
            ** cutlist name sorts before nocutlist name.
            ** Only the sign of the comparison is tested so that one of
            ** the two iterators always advances, whatever non-zero
            ** magnitude the comparison returns.
            */
            cutname = (AjPStr)ajListIterGet(citer);
        else
            /* nocutlist name sorts before cutlist name */
            nocutname = (AjPStr)ajListIterGet(niter);
    }

    ajListIterDel(&citer);
    ajListIterDel(&niter);

    /* the cutter names are no longer needed; this is their only release */
    ajListstrFreeData(&cutlist);

    /* Print the resulting list of those that do not cut */

    ajDebug("Print out the list\n");

    /* print the title */
    if(html)
        ajFmtPrintF(outfile, "<H2>");
    ajFmtPrintF(outfile, "\n\n# Enzymes that do not cut\n\n");

    if(html)
        ajFmtPrintF(outfile, "</H2>\n");

    if(html)
        ajFmtPrintF(outfile, "<PRE>");

    niter = ajListIterNewread(nocutlist);
    i = 0;
    while((nocutname = (AjPStr)ajListIterGet(niter)) != NULL)
    {
        ajFmtPrintF(outfile, "%-10S", nocutname);
        /* new line after every 8 names printed (i runs from 0 to 7) */
        if(i++ == 7)
        {
            ajFmtPrintF(outfile, "\n");
            i = 0;
        }
    }
    ajListIterDel(&niter);

    /* end the output */
    ajFmtPrintF(outfile, "\n");
    if(html)
        ajFmtPrintF(outfile, "</PRE>\n");

    /*
    ** Print the count of rejected enzymes
    ** N.B. This is the count of ALL rejected enzymes including all
    ** isoschizomers
    */

    if(html)
        ajFmtPrintF(outfile, "<H2>");
    ajFmtPrintF(outfile,
                "\n\n# No. of cutting enzymes which do not match the\n"
                "# SITELEN, BLUNT, STICKY, COMMERCIAL, AMBIGUOUS criteria\n\n");
    if(html)
        ajFmtPrintF(outfile, "</H2>\n");
    ajFmtPrintF(outfile, "%d\n", rejected_count);

    ajDebug("Tidy up\n");
    ajListstrFreeData(&nocutlist);

    return;
}
Beispiel #3
0
/* @funcstatic seqwords_TermsRead *********************************************
 **
 ** Read the next Terms object from a file in embl-like format. The search 
 ** terms are modified with a leading and trailing space.
 **
 ** @param [r] inf  [AjPFile]   Input file stream
 ** @param [w] thys [AjPTerms*] Terms object
 **
 ** @return [AjBool] True on succcess
 ** @@
 *****************************************************************************/
static AjBool seqwords_TermsRead(AjPFile inf, 
				 AjPTerms *thys)
{    
    AjPStr   line           =NULL;	/* Line of text. */
    AjPStr   temp           =NULL;
    AjPList  list_terms     =NULL;	/* List of keywords for a scop node*/
    AjBool   ok             =ajFalse;
    AjPStr   type           = NULL;


    /* Memory management */
    (*thys)=seqwords_TermsNew();
    list_terms = ajListstrNew();
    line       = ajStrNew();
    type       = ajStrNew();
    
    /* Read first line. */
    ok = ajReadlineTrim(inf,&line);


    while(ok && !ajStrPrefixC(line,"//"))
    {
	if(ajStrPrefixC(line,"XX"))
	{
	    ok = ajReadlineTrim(inf,&line);
	    continue;
	}	
	else if(ajStrPrefixC(line,"TY"))
	{
	    ajFmtScanS(line, "%*s %S", &type);
	    
	    if(ajStrMatchC(type, "SCOP"))
		(*thys)->Type = ajSCOP;
	    else if(ajStrMatchC(type, "CATH"))
		(*thys)->Type = ajCATH;
	}
	else if(ajStrPrefixC(line,"CL"))
	{
	    ajStrAssignC(&(*thys)->Class,ajStrGetPtr(line)+3);
	    ajStrRemoveWhiteExcess(&(*thys)->Class);
	}
	else if(ajStrPrefixC(line,"AR"))
	{
	    ajStrAssignC(&(*thys)->Architecture,ajStrGetPtr(line)+3);
	    ajStrRemoveWhiteExcess(&(*thys)->Architecture);
	}
	else if(ajStrPrefixC(line,"TP"))
	{
	    ajStrAssignC(&(*thys)->Topology,ajStrGetPtr(line)+3);
	    ajStrRemoveWhiteExcess(&(*thys)->Topology);
	}
	else if(ajStrPrefixC(line,"FO"))
	{
	    ajStrAssignC(&(*thys)->Fold,ajStrGetPtr(line)+3);
	    while(ajReadlineTrim(inf,&line))
	    {
		if(ajStrPrefixC(line,"XX"))
		    break;
		ajStrAppendC(&(*thys)->Fold,ajStrGetPtr(line)+3);
	    }
	    ajStrRemoveWhiteExcess(&(*thys)->Fold);
	}
	else if(ajStrPrefixC(line,"SF"))
	{
	    ajStrAssignC(&(*thys)->Superfamily,ajStrGetPtr(line)+3);
	    while(ajReadlineTrim(inf,&line))
	    {
		if(ajStrPrefixC(line,"XX"))
		    break;
		ajStrAppendC(&(*thys)->Superfamily,ajStrGetPtr(line)+3);
	    }
	    ajStrRemoveWhiteExcess(&(*thys)->Superfamily);
	}
	else if(ajStrPrefixC(line,"FA"))
	{
	    ajStrAssignC(&(*thys)->Family,ajStrGetPtr(line)+3);
	    while(ajReadlineTrim(inf,&line))
	    {
		if(ajStrPrefixC(line,"XX"))
		    break;
		ajStrAppendC(&(*thys)->Family,ajStrGetPtr(line)+3);
	    }
	    ajStrRemoveWhiteExcess(&(*thys)->Family);
	}
	else if(ajStrPrefixC(line,"TE")) 
	{
	    /* Copy and clean up term. */
	    temp    = ajStrNew();
	    ajStrAssignC(&temp,ajStrGetPtr(line)+3);
	    ajStrRemoveWhiteExcess(&temp);

	    
	    /* Append a leading and trailing space to search term*/
	    ajStrAppendK(&temp, ' ');
	    ajStrInsertC(&temp, 0, " ");

	    
	    /* Add the current term to the list. */
	    ajListstrPush(list_terms,temp);		    
	}

	ok = ajReadlineTrim(inf,&line);
    }
    if(!ok)
    {
	/* Clean up. */
	ajListstrFree(&list_terms);
	ajStrDel(&line);
	
    
	/* Return. */
	return ajFalse;
    }
        
    
    /* Convert the AjPList of terms to array of AjPSeq's. */
    if(!((*thys)->N=ajListstrToarray((AjPList)list_terms,&(*thys)->Keywords)))
	ajWarn("Zero sized list of terms passed into seqwords_TermsRead");


    /* Clean up.  Free the list (not the nodes!). */
    ajListstrFree(&list_terms);
    ajStrDel(&line);
    ajStrDel(&type);
    
    return ajTrue;
} 
/*
** cacheensembl main
**
** Resolves the URL of the named server, loads the Ensembl registry from
** a database connection to that URL, and writes one "DBNAME" definition
** per Ensembl Database Adaptor to the cache file. For adaptors of the core
** group the species aliases are lower-cased, sorted, de-duplicated and
** written as "ALIAS" lines. Database names are also written one per line
** to the optional output file.
**
** Returns EXIT_SUCCESS.
*/
int main(int argc, char** argv)
{
    AjPFile outf = NULL;
    AjPFile cachef = NULL;

    AjIList iterator = NULL;
    AjPList aliases  = NULL;
    AjPList dbas     = NULL;
    AjPList species  = NULL;

    AjPStr alias   = NULL;
    AjPStr dbname  = NULL;
    AjPStr spname  = NULL;
    AjPStr svrname = NULL;
    AjPStr svrurl  = NULL;
    AjPStr dbcurl  = NULL;

    AjPTime svrtime = NULL;

    EnsEDatabaseadaptorGroup dbag = ensEDatabaseadaptorGroupNULL;

    EnsPDatabaseadaptor dba = NULL;

    EnsPDatabaseconnection dbc = NULL;

    embInit("cacheensembl", argc, argv);
    ensInit();

    svrname = ajAcdGetString("servername");
    outf    = ajAcdGetOutfile("outfile");
    cachef = ajAcdGetOutfile("cachefile");

    dbcurl = ajStrNew();
    svrurl = ajStrNew();
    dbname = ajStrNew();

    /*
    ** svrurl was allocated just above, so testing the pointer alone can
    ** never detect a failed lookup; the result of the lookup is tested
    ** instead.
    */
    if(!ajNamSvrGetUrl(svrname, &svrurl) || !svrurl)
        ajFatal("Could not resolve server name '%S'.", svrname);

    dbc = ensDatabaseconnectionNewUrl(svrurl);
    ensRegistryLoadDatabaseconnection(dbc);
    ensDatabaseconnectionDel(&dbc);

    /* Write the server file header. */

    svrtime = ajTimeNewTodayFmt("cachefile");
    ajFmtPrintF(cachef, "# %S %D\n", ajFileGetNameS(cachef), svrtime);
    ajTimeDel(&svrtime);

    ajFmtPrintF(cachef,
                "# Automatically generated by cacheensembl "
                "for server '%S'.\n\n",
                svrname);

    /*
    ** Get all Ensembl Database Adaptor objects and write them as
    ** EMBOSS Database definitions.
    */

    aliases = ajListstrNew();
    dbas    = ajListNew();
    species = ajListstrNew();

    ensRegistryRetrieveAllSpecies(species);

    while(ajListstrPop(species, &spname))
    {
        ensRegistryGetAllDatabaseadaptors(ensEDatabaseadaptorGroupNULL,
                                          spname,
                                          dbas);

        while(ajListPop(dbas, (void**) &dba))
        {
            dbag = ensDatabaseadaptorGetGroup(dba);

            if(dbag == ensEDatabaseadaptorGroupNULL)
            {
                ajDebug("cacheensembl main got unexpected "
                        "Ensembl Database Adaptor Group %d.\n",
                        dbag);

                continue;
            }

            /*
            ** The database name is the species name, with the group name
            ** appended for every group other than core.
            */
            ajStrAssignS(&dbname, ensDatabaseadaptorGetSpecies(dba));

            if(dbag != ensEDatabaseadaptorGroupCore)
            {
                ajStrAppendC(&dbname, "_");
                ajStrAppendC(&dbname, ensDatabaseadaptorGroupToChar(dbag));
            }

            dbc = ensDatabaseadaptorGetDatabaseconnection(dba);

            ensDatabaseconnectionFetchUrl(dbc, &dbcurl);

            if(outf)
                ajFmtPrintF(outf, "%S\n", dbname);

            ajFmtPrintF(cachef, "DBNAME %S [\n", dbname);
            ajFmtPrintF(cachef, "  release: \"%s\"\n", ensSoftwareGetVersion());
            ajFmtPrintF(cachef, "  server:  \"%S\"\n", svrname);
            ajFmtPrintF(cachef, "  url:     \"%S\"\n", dbcurl);
            ajFmtPrintF(cachef, "]\n");
            ajFmtPrintF(cachef, "\n");

            /* Aliases are only written for the core group. */
            if(dbag != ensEDatabaseadaptorGroupCore)
                continue;

            ensRegistryAliasFetchAllbySpecies(
                ensDatabaseadaptorGetSpecies(dba),
                aliases);

            /*
            ** Format all aliases to lower case,
            ** sort them alphabetically and remove duplicates.
            */

            iterator = ajListIterNew(aliases);
            while(!ajListIterDone(iterator))
            {
                alias = ajListstrIterGet(iterator);
                ajStrFmtLower(&alias);
            }
            ajListIterDel(&iterator);

            ajListSortUnique(aliases,
                             cacheensembl_stringcompare,
                             cacheensembl_stringdelete);

            alias = NULL;
            if(ajListGetLength(aliases) > 0)
            {
                while(ajListstrPop(aliases, &alias))
                {
                    /*
                    ** Reject any aliases with other than alpha-numeric
                    ** characters like white space.
                    */

                    if(ajStrIsAlnum(alias))
                        ajFmtPrintF(cachef,
                                    "ALIAS %S %S\n",
                                    alias,
                                    ensDatabaseadaptorGetSpecies(dba));

                    ajStrDel(&alias);
                }

                ajFmtPrintF(cachef, "\n");
            }

            /* Ensembl Database Adaptor objects *must not* be deleted. */
        }

        ajStrDel(&spname);
    }

    /* Every list created above is released, including the species list. */
    ajListstrFree(&aliases);
    ajListFree(&dbas);
    ajListstrFree(&species);

    ajStrDel(&dbcurl);
    ajStrDel(&svrurl);
    ajStrDel(&dbname);
    ajStrDel(&svrname);

    ajFileClose(&outf);
    ajFileClose(&cachef);

    embExit();

    return EXIT_SUCCESS;
}
Beispiel #5
0
/*
** supermatcher main
**
** Builds a word (k-mer) table from every sequence of the target set, then
** for each query sequence searches for word matches against the set. For
** each target with at least one match, the first maximum match is used as
** a seed, alignment end points are derived from it, and a banded
** Smith-Waterman alignment of the given width is calculated. Alignments
** scoring above 'minscore' are written to the alignment output; targets
** without any word match are reported to the error file.
**
** Returns 0.
*/
int main(int argc, char **argv)
{
    AjPSeqall queryseqs;
    AjPSeqset targetseqs;
    AjPSeq queryseq;
    const AjPSeq targetseq;
    AjPStr queryaln = 0;
    AjPStr targetaln = 0;

    AjPFile errorf;
    AjBool show = ajFalse;

    const char   *queryseqc;
    const char   *targetseqc;

    AjPMatrixf matrix;
    AjPSeqCvt cvt = 0;
    float **sub;
    ajint *compass = NULL;
    float *path = NULL;

    float gapopen;
    float gapextend;
    float score;
    float minscore;

    ajuint j, k;
    ajint querystart = 0;
    ajint targetstart = 0;
    ajint queryend   = 0;
    ajint targetend   = 0;
    ajint width  = 0;
    AjPTable kmers = 0;
    ajint wordlen = 6;
    ajint oldmax = 0;		/* current element capacity of path/compass */
    ajint newmax = 0;		/* element count needed for this alignment */

    ajuint ntargetseqs;
    ajuint nkmers;

    AjPAlign align = NULL;
    EmbPWordMatch maxmatch; /* match with maximum score */

    /* Cursors for the current sequence being scanned,
    ** i.e., until which location it was scanned.
    ** Separate cursor/location entries for each sequence in the seqset.
    */
    ajuint* lastlocation;

    EmbPWordRK* wordsw = NULL;
    AjPList* matchlist = NULL;

    embInit("supermatcher", argc, argv);

    matrix    = ajAcdGetMatrixf("datafile");
    queryseqs = ajAcdGetSeqall("asequence");
    targetseqs= ajAcdGetSeqset("bsequence");
    gapopen   = ajAcdGetFloat("gapopen");
    gapextend = ajAcdGetFloat("gapextend");
    wordlen   = ajAcdGetInt("wordlen");
    align     = ajAcdGetAlign("outfile");
    errorf    = ajAcdGetOutfile("errorfile");
    width     = ajAcdGetInt("width");	/* width for banded Smith-Waterman */
    minscore  = ajAcdGetFloat("minscore");

    gapopen   = ajRoundFloat(gapopen, 8);
    gapextend = ajRoundFloat(gapextend, 8);

    sub = ajMatrixfGetMatrix(matrix);
    cvt = ajMatrixfGetCvt(matrix);

    embWordLength(wordlen);

    /* seqset sequence is the reference sequence for SAM format */
    ajAlignSetRefSeqIndx(align, 1);

    ajSeqsetTrim(targetseqs);

    ntargetseqs = ajSeqsetGetSize(targetseqs);

    /* one match list slot per target sequence */
    AJCNEW0(matchlist, ntargetseqs);

    /* get tables of words */
    for(k=0;k<ntargetseqs;k++)
    {
	targetseq = ajSeqsetGetseqSeq(targetseqs, k);
	embWordGetTable(&kmers, targetseq);
	ajDebug("Number of distinct kmers found so far: %d\n",
		ajTableGetLength(kmers));
    }
    AJCNEW0(lastlocation, ntargetseqs);

    /* NOTE(review): no return or exit follows this error report here;
    ** confirm whether ajErr terminates the program */
    if(ajTableGetLength(kmers)<1)
	ajErr("no kmers found");

    nkmers = embWordRabinKarpInit(kmers, &wordsw, wordlen, targetseqs);

    while(ajSeqallNext(queryseqs,&queryseq))
    {
	ajSeqTrim(queryseq);

	queryaln = ajStrNewRes(1+ajSeqGetLen(queryseq));

	ajDebug("Read '%S'\n", ajSeqGetNameS(queryseq));

	/* reset the scan cursors and create fresh match lists per query */
	for(k=0;k<ntargetseqs;k++)
	{
	    lastlocation[k]=0;
	    matchlist[k] = ajListstrNew();
	}

	embWordRabinKarpSearch(ajSeqGetSeqS(queryseq), targetseqs,
		(const EmbPWordRK*)wordsw, wordlen, nkmers,
		matchlist, lastlocation, ajFalse);


	for(k=0;k<ajSeqsetGetSize(targetseqs);k++)
	{
	    targetseq      = ajSeqsetGetseqSeq(targetseqs, k);

	    ajDebug("Processing '%S'\n", ajSeqGetNameS(targetseq));

	    /* no seed available: report and move on to the next target */
	    if(ajListGetLength(matchlist[k])==0)
	    {
		ajFmtPrintF(errorf,
		            "No wordmatch start points for "
		            "%s vs %s. No alignment\n",
		            ajSeqGetNameC(queryseq),ajSeqGetNameC(targetseq));
		embWordMatchListDelete(&matchlist[k]);
		continue;
	    }


	    /* only the maximum match is used as seed
	     * (if there is more than one location with the maximum match
	     * only the first one is used)
	     * TODO: we should add a new option to make above limit optional
	     */
	    maxmatch = embWordMatchFirstMax(matchlist[k]);

	    supermatcher_findendpoints(maxmatch,targetseq, queryseq,
		    &targetstart, &querystart,
		    &targetend, &queryend);

	    targetaln=ajStrNewRes(1+ajSeqGetLen(targetseq));
	    queryseqc = ajSeqGetSeqC(queryseq);
	    targetseqc = ajSeqGetSeqC(targetseq);

	    /* both alignment strings start empty for every target */
	    ajStrAssignC(&queryaln,"");
	    ajStrAssignC(&targetaln,"");

	    ajDebug("++ %S v %S start:%d %d end:%d %d\n",
		    ajSeqGetNameS(targetseq), ajSeqGetNameS(queryseq),
		    targetstart, querystart, targetend, queryend);

	    /* element count requested for path/compass for this alignment */
	    newmax = (targetend-targetstart+2)*width;

	    /*
	    ** path and compass are kept across iterations: they are resized
	    ** only when more elements are needed than ever before, and
	    ** otherwise handed to AJCSET0 for the first newmax elements.
	    ** (presumably AJCSET0 zeroes them - confirm against the macro)
	    */
	    if(newmax > oldmax)
	    {
		AJCRESIZE0(path,oldmax,newmax);
		AJCRESIZE0(compass,oldmax,newmax);
		oldmax=newmax;
		ajDebug("++ memory re/allocation for path/compass arrays"
			" to size: %d\n", newmax);
	    }
	    else
	    {
		AJCSET0(path,newmax);
		AJCSET0(compass,newmax);
	    }

	    ajDebug("Calling embAlignPathCalcSWFast "
		    "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n",
		    querystart, queryend, (queryend - querystart + 1),
		    ajSeqGetLen(queryseq),
		    targetstart, targetend, (targetend - targetstart + 1),
		    ajSeqGetLen(targetseq),
		    width);

	    /* the target is passed first, the query second, throughout */
	    score = embAlignPathCalcSWFast(&targetseqc[targetstart],
	                                   &queryseqc[querystart],
	                                   targetend-targetstart+1,
	                                   queryend-querystart+1,
	                                   0,width,
	                                   gapopen,gapextend,
	                                   path,sub,cvt,
	                                   compass,show);
	    /* only alignments scoring strictly above minscore are reported */
	    if(score>minscore)
	    {
		embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend,
		                         targetseq,queryseq,
		                         &targetaln,&queryaln,
		                         targetend-targetstart+1,
		                         queryend-querystart+1,
		                         0,width,
		                         &targetstart,&querystart);

		if(!ajAlignFormatShowsSequences(align))
		{
		    ajAlignDefineCC(align, ajStrGetPtr(targetaln),
		                    ajStrGetPtr(queryaln),
		                    ajSeqGetNameC(targetseq),
		                    ajSeqGetNameC(queryseq));
		    ajAlignSetScoreR(align, score);
		}
		else
		{
		    ajDebug(" queryaln:%S \ntargetaln:%S\n",
		            queryaln,targetaln);
		    embAlignReportLocal(align,
			    queryseq, targetseq,
			    queryaln, targetaln,
			    querystart, targetstart,
			    gapopen, gapextend,
			    score, matrix,
			    1 + ajSeqGetOffset(queryseq),
			    1 + ajSeqGetOffset(targetseq)
		    );
		}
		ajAlignWrite(align);
		ajAlignReset(align);
	    }
	    ajStrDel(&targetaln);

	    embWordMatchListDelete(&matchlist[k]);
	}

	ajStrDel(&queryaln);
    }


    /* release each word entry of the search structure and its arrays */
    for(k=0;k<nkmers;k++)
    {
	AJFREE(wordsw[k]->seqindxs);
	AJFREE(wordsw[k]->nSeqMatches);

	for(j=0;j<wordsw[k]->nseqs;j++)
	    AJFREE(wordsw[k]->locs[j]);

	AJFREE(wordsw[k]->nnseqlocs);
	AJFREE(wordsw[k]->locs);
	AJFREE(wordsw[k]);
    }

    embWordFreeTable(&kmers);

    /* NOTE(review): the matrix is deleted here only for formats that do
    ** not show sequences; presumably it is released elsewhere in the other
    ** case - confirm */
    if(!ajAlignFormatShowsSequences(align))
	ajMatrixfDel(&matrix);
    
    AJFREE(path);
    AJFREE(compass);
    /* NOTE(review): kmers was already passed to embWordFreeTable above;
    ** confirm that call leaves it NULL so this free is a no-op */
    AJFREE(kmers);
    AJFREE(wordsw);

    AJFREE(matchlist);
    AJFREE(lastlocation);

    ajAlignClose(align);
    ajAlignDel(&align);
    ajSeqallDel(&queryseqs);
    ajSeqDel(&queryseq);
    ajSeqsetDel(&targetseqs);
    ajFileClose(&errorf);

    embExit();

    return 0;
}
/*
** wordmatch main
**
** Builds a word table from the sequences of the pattern set ("asequence")
** and scans every sequence of the stream ("bsequence") for exact word
** matches. Depending on the "dumpalign" and "dumpfeat" toggles the matches
** are written as alignments and/or as feature tables; when both are off
** the search runs in check mode and only counts matches. Summary
** statistics are written to the log file.
**
** Returns 0.
*/
int main(int argc, char **argv)
{
    AjPSeqset seqset;
    AjPSeqall seqall;
    AjPSeq queryseq = NULL;  /* stays NULL if the search loop never runs */
    const AjPSeq targetseq;
    ajint wordlen;
    AjPTable wordsTable = NULL;
    AjPList* matchlist = NULL;
    AjPFile logfile;
    AjPFeattable* seqsetftables = NULL;
    AjPFeattable seqallseqftable = NULL;
    AjPFeattabOut ftoutforseqsetseq = NULL;
    AjPFeattabOut ftoutforseqallseq = NULL;
    AjPAlign align = NULL;
    AjIList iter = NULL;
    ajint targetstart;
    ajint querystart;
    ajint len;
    ajuint i, j;
    ajulong nAllMatches = 0;
    ajulong sumAllScore = 0;
    AjBool dumpAlign = ajTrue;
    AjBool dumpFeature = ajTrue;
    AjBool checkmode = ajFalse;
    EmbPWordRK* wordsw = NULL;
    ajuint npatterns = 0;
    ajuint seqsetsize;
    ajuint nmatches;
    ajuint* nmatchesseqset;
    ajuint* lastlocation; /* Cursors for Rabin-Karp search. */
                          /* Shows until what point the query sequence was
                           *  scanned for a pattern sequences in the seqset.
                          */
    char* paddedheader = NULL;
    const char* header;
    AjPStr padding;

    header = "Pattern %S  #pat-sequences  #all-matches  avg-match-length\n";
    padding = ajStrNew();

    embInit("wordmatch", argc, argv);

    wordlen = ajAcdGetInt("wordsize");
    seqset  = ajAcdGetSeqset("asequence");
    seqall  = ajAcdGetSeqall("bsequence");
    logfile = ajAcdGetOutfile("logfile");
    dumpAlign = ajAcdGetToggle("dumpalign");
    dumpFeature = ajAcdGetToggle("dumpfeat");

    /* the alignment output is only obtained when it was asked for */
    if(dumpAlign)
    {
        align = ajAcdGetAlign("outfile");
        ajAlignSetExternal(align, ajTrue);
    }

    seqsetsize = ajSeqsetGetSize(seqset);
    ajSeqsetTrim(seqset);

    /* per pattern-sequence arrays, all sized by seqsetsize */
    AJCNEW0(matchlist, seqsetsize);
    AJCNEW0(seqsetftables, seqsetsize);
    AJCNEW0(nmatchesseqset, seqsetsize);

    /* the feature outputs are only obtained when they were asked for */
    if(dumpFeature)
    {
        ftoutforseqsetseq =  ajAcdGetFeatout("aoutfeat");
        ftoutforseqallseq =  ajAcdGetFeatout("boutfeat");
    }

    /* with neither output requested, matches are counted but not stored */
    checkmode = !dumpFeature && !dumpAlign;
    embWordLength(wordlen);

    ajFmtPrintF(logfile, "Small sequence/file for constructing"
                " target patterns: %S\n", ajSeqsetGetUsa(seqset));
    ajFmtPrintF(logfile, "Large sequence/file to be scanned"
                " for patterns: %S\n", ajSeqallGetUsa(seqall));
    ajFmtPrintF(logfile, "Number of sequences in the patterns file: %u\n",
                seqsetsize);
    /* wordlen is signed, so it is printed with a signed conversion */
    ajFmtPrintF(logfile, "Pattern/word length: %d\n", wordlen);

    for(i=0;i<seqsetsize;i++)
    {
        targetseq = ajSeqsetGetseqSeq(seqset, i);
        embWordGetTable(&wordsTable, targetseq);
    }

    AJCNEW0(lastlocation, seqsetsize);

    if(ajTableGetLength(wordsTable)>0)
    {
        npatterns = embWordRabinKarpInit(wordsTable,
                                         &wordsw, wordlen, seqset);
        ajFmtPrintF(logfile, "Number of patterns/words found: %u\n", npatterns);

        while(ajSeqallNext(seqall,&queryseq))
        {
            /* reset the scan cursors; match lists only outside check mode */
            for(i=0;i<seqsetsize;i++)
            {
                lastlocation[i]=0;

                if(!checkmode)
                    matchlist[i] = ajListstrNew();
            }

            nmatches = embWordRabinKarpSearch(
                    ajSeqGetSeqS(queryseq), seqset,
                    (EmbPWordRK const *)wordsw, wordlen, npatterns,
                    matchlist, lastlocation, checkmode);
            nAllMatches += nmatches;

            if(checkmode)
                continue;

            for(i=0;i<seqsetsize;i++)
            {
                if(ajListGetLength(matchlist[i])>0)
                {
                    iter = ajListIterNewread(matchlist[i]);

                    while(embWordMatchIter(iter, &targetstart, &querystart,
                                           &len, &targetseq))
                    {
                        if(dumpAlign)
                        {
                            ajAlignDefineSS(align, targetseq, queryseq);
                            ajAlignSetScoreI(align, len);
                            /* ungapped alignment means same length
                             *  for both sequences
                            */
                            ajAlignSetSubRange(align, targetstart, 1, len,
                                    ajSeqIsReversed(targetseq),
                                    ajSeqGetLen(targetseq),
                                    querystart, 1, len,
                                    ajSeqIsReversed(queryseq),
                                    ajSeqGetLen(queryseq));
                        }
                    }

                    if(dumpAlign)
                    {
                        ajAlignWrite(align);
                        ajAlignReset(align);
                    }

                    /* the list is known to be non-empty in this branch */
                    if(dumpFeature)
                    {
                        embWordMatchListConvToFeat(matchlist[i],
                                                   &seqsetftables[i],
                                                   &seqallseqftable,
                                                   targetseq, queryseq);
                        ajFeattableWrite(ftoutforseqallseq, seqallseqftable);
                        ajFeattableDel(&seqallseqftable);
                    }

                    ajListIterDel(&iter);
                }

                embWordMatchListDelete(&matchlist[i]);
            }
        }

        /* search completed, now report statistics */
        for(i=0;i<npatterns;i++)
        {
            sumAllScore += wordsw[i]->lenMatches;

            for(j=0;j<wordsw[i]->nseqs;j++)
                nmatchesseqset[wordsw[i]->seqindxs[j]] +=
                        wordsw[i]->nSeqMatches[j];
        }

        ajFmtPrintF(logfile, "Number of sequences in the file scanned "
                    "for patterns: %u\n", ajSeqallGetCount(seqall));
        ajFmtPrintF(logfile, "Number of all matches: %Lu"
                    " (wordmatch finds exact matches only)\n", nAllMatches);

        if(nAllMatches>0)
        {
            ajFmtPrintF(logfile, "Sum of match lengths: %Lu\n", sumAllScore);
            ajFmtPrintF(logfile, "Average match length: %.2f\n",
                        sumAllScore*1.0/nAllMatches);

            ajFmtPrintF(logfile, "\nDistribution of the matches among pattern"
                        " sequences:\n");
            ajFmtPrintF(logfile, "-----------------------------------------"
                        "-----------\n");

            /* the arrays indexed here were allocated with seqsetsize */
            for(i=0;i<seqsetsize;i++)
            {
                if(nmatchesseqset[i]>0)
                    ajFmtPrintF(logfile, "%-42s: %8u\n",
                                ajSeqGetNameC(ajSeqsetGetseqSeq(seqset, i)),
                                nmatchesseqset[i]);

                /*
                ** The pattern-set feature output only exists when feature
                ** dumping was requested, so it is only written to then.
                */
                if(dumpFeature)
                    ajFeattableWrite(ftoutforseqsetseq, seqsetftables[i]);

                ajFeattableDel(&seqsetftables[i]);
            }

            ajFmtPrintF(logfile, "\nPattern statistics:\n");
            ajFmtPrintF(logfile, "-------------------\n");

            /* widen the "Pattern" column for words longer than 7 */
            if(wordlen>7)
                ajStrAppendCountK(&padding, ' ', wordlen-7);

            paddedheader = ajFmtString(header,padding);

            /* print the prepared header as data, never as a format */
            ajFmtPrintF(logfile, "%s", paddedheader);

            for(i=0;i<npatterns;i++)
                if(wordsw[i]->nMatches>0)
                    ajFmtPrintF(logfile, "%-7s: %12u  %12u %17.2f\n",
                                wordsw[i]->word->fword, wordsw[i]->nseqs,
                                wordsw[i]->nMatches,
                                wordsw[i]->lenMatches*1.0/wordsw[i]->nMatches);
        }

    }

    /* release each word entry of the search structure and its arrays */
    for(i=0;i<npatterns;i++)
    {
        for(j=0;j<wordsw[i]->nseqs;j++)
            AJFREE(wordsw[i]->locs[j]);

        AJFREE(wordsw[i]->locs);
        AJFREE(wordsw[i]->seqindxs);
        AJFREE(wordsw[i]->nnseqlocs);
        AJFREE(wordsw[i]->nSeqMatches);
        AJFREE(wordsw[i]);
    }

    embWordFreeTable(&wordsTable);

    AJFREE(wordsw);
    AJFREE(matchlist);
    AJFREE(lastlocation);
    AJFREE(nmatchesseqset);
    AJFREE(seqsetftables);

    if(dumpAlign)
    {
        ajAlignClose(align);
        ajAlignDel(&align);
    }

    if(dumpFeature)
    {
        ajFeattabOutDel(&ftoutforseqsetseq);
        ajFeattabOutDel(&ftoutforseqallseq);
    }

    ajFileClose(&logfile);

    ajSeqallDel(&seqall);
    ajSeqsetDel(&seqset);
    ajSeqDel(&queryseq);
    ajStrDel(&padding);
    AJFREE(paddedheader);

    embExit();

    return 0;
}
Beispiel #7
0
/* @funcstatic acdrelations_procacdfile ***************************************
**
** Process ACD file and write new ACD file with new relations: attributes
** added (replaced if necessary).
**
** Lines are copied from inf to outf as follows:
**   - blank lines, variable:, endsection: and "#" lines are copied as-is;
**   - application: and section: definitions are copied as-is up to and
**     including their closing "]" line;
**   - any other line starts a data definition. Its attribute lines are
**     copied, except existing relations: lines which are dropped, and are
**     collected (whitespace removed) in a string list. When the closing "]"
**     is reached acdrelations_writerelations is called with the collected
**     lines, and then the "]" line itself is written.
**
** A line is classified by its first whitespace-delimited token. The token
** is only trusted when the scan actually produced one, so a blank line is
** never classified using the token left over from the previous line.
**
** @param [u] inf  [AjPFile] ACD input file
** @param [u] outf [AjPFile] ACD output file
** @param [r] P    [PEdam]   edam object
** @param [r] T    [PKtype]  ktype object
** @return [void] 
** @@
******************************************************************************/
static void acdrelations_procacdfile
            (AjPFile inf,
             AjPFile outf,
             PEdam P,
             PKtype T)
{
  AjPStr   line     = NULL;
  AjPStr   tok      = NULL;
  AjPStr   acdtype  = NULL;
  AjPStr   strtmp   = NULL;
  AjPList  strlist  = NULL;
  AjPStr  *strarr   = NULL;
  ajint    nstr     = 0;
  ajint    ntok     = 0;   /* number of tokens scanned from current line */


  /* Allocate memory */
  line        = ajStrNew();
  tok         = ajStrNew();
  acdtype     = ajStrNew();
  strlist     = ajListstrNew();


  /*  Read next line */
  while(ajReadline(inf,&line))
    {
      ntok = ajFmtScanS(line, "%S", &tok);

      /* Write out blank lines as-is. Tested first so that 'tok' (which
         may still hold the previous line's token) is never consulted. */
      if(ntok <= 0)
        {
          ajFmtPrintF(outf, "%S", line);
          continue;
        }

      /* Write application definition or section definition out as-is */
      if(ajStrMatchC(tok, "application:")  ||
         ajStrMatchC(tok, "section:"))
        {
          ajFmtPrintF(outf, "%S", line);

          while(ajReadline(inf,&line))
            {
              ajFmtPrintF(outf, "%S", line);
              ntok = ajFmtScanS(line, "%S", &tok);

              if(ntok > 0 && ajStrMatchC(tok, "]"))
                break;
            }

          continue;
        }

      /* Write variables, endsection definitions and comments out as-is */
      if(ajStrMatchC(tok, "variable:")   ||
         ajStrMatchC(tok, "endsection:") ||
         ajStrMatchC(tok, "#"))
        {
          ajFmtPrintF(outf, "%S", line);
          continue;
        }

      /* Anything else is the first line of a data definition:
         write the datatype line and keep the datatype name without ':' */
      ajFmtPrintF(outf, "%S", line);
      ajFmtScanS(line, "%S", &acdtype);
      ajStrRemoveSetC(&acdtype, ":");

      /* Process subsequent (attribute) lines */
      while(ajReadline(inf,&line))
        {
          /* Every line of the definition (including the closing "]" and
             any relations: line) is collected with whitespace removed */
          strtmp = ajStrNew();
          ajStrAssignS(&strtmp, line);
          ajStrRemoveWhite(&strtmp);
          ajListstrPushAppend(strlist, strtmp);

          ntok = ajFmtScanS(line, "%S", &tok);

          /* Reached end of data definition */
          if(ntok > 0 && ajStrMatchC(tok, "]"))
            {
              nstr = ajListstrToarray(strlist, &strarr);

              /* Write relations: line */
              acdrelations_writerelations(outf, acdtype, strarr, nstr, P, T);

              /* The array only borrows the strings; they are owned by
                 the list and released with it. Start a fresh list for
                 the next data definition. */
              AJFREE(strarr);
              ajListstrFreeData(&strlist);
              strlist = ajListstrNew();

              ajFmtPrintF(outf, "%S", line);
              break;
            }

          /* Ignore existing relations: lines */
          if(ntok > 0 && ajStrMatchC(tok, "relations:"))
            continue;

          ajFmtPrintF(outf, "%S", line);
        }
    }


  /* Free memory */
  ajStrDel(&line);
  ajStrDel(&tok);
  ajStrDel(&acdtype);
  ajListstrFreeData(&strlist);

  return;
}
Beispiel #8
0
/* @funcstatic acdrelations_readdatfile ***************************************
**
** Read a data file into an edam object.
**
** Lines beginning with '#' are discarded. Every other line is split on '|'
** into three fields: an ACD datatype, an EDAM relations: value and a block
** of attribute:value strings. The block is split in turn on ';'. One
** PEdamdat is built per line (datatype and attribute strings have their
** whitespace removed) and the resulting array is stored in (*P)->dat with
** its size in (*P)->n.
**
** @param [u] inf [AjPFile] Data input file
** @param [u] P   [PEdam*]  edam object to fill; *P is dereferenced and is
**                          not allocated here
** @return [void]
** @@
******************************************************************************/
static void acdrelations_readdatfile
            (AjPFile inf,
             PEdam *P)
{
  AjPStr  line           = NULL;
  const AjPStr  tok      = NULL;
  const AjPStr  subtok   = NULL;
  AjPStr  attrblock      = NULL;   /* owned copy of the third field */
  AjPStr  strtmp         = NULL;
  AjPList strlist        = NULL;

  AjPStr  acdtype        = NULL;
  AjPStr  relations      = NULL;

  PEdamdat dattmp        = NULL;
  AjPList  datlist       = NULL;

  if(!P)
    ajFatal("Null arg error 1 in acdrelations_readdatfile");
  if(!inf)
    ajFatal("Null arg error 3 in acdrelations_readdatfile");


  /* Allocate memory */
  line      = ajStrNew();
  acdtype   = ajStrNew();
  relations = ajStrNew();
  attrblock = ajStrNew();
  datlist   = ajListNew();

  /* Read data from file */
  while(ajReadline(inf,&line))
    {
      /* Discard comment lines */
      if(ajStrPrefixC(line,"#"))
        continue;


      /* Tokenise line, delimited by '|'.
         Parse first token (ACD datatype ) */
      ajStrAssignS(&acdtype, ajStrParseC(line, "|"));

      /* Parse second token (EDAM relations: value ) */
      ajStrAssignS(&relations, ajStrParseC(NULL, "|"));

      /* Parse third token (attribute:value strings block) */
      tok = ajStrParseC(NULL, "|");


      /* Create new string list */
      strlist = ajListstrNew();

      /* Tokenise third token itself into tokens delimited by ';'
         Parse tokens (individual attribute:value strings).

         'tok' is the string handed back by the tokeniser, so it must not
         be used to start a new parse: take a private copy first. A line
         with no third field (tok == NULL) simply has no attributes. */
      if(tok)
        {
          ajStrAssignS(&attrblock, tok);

          for(subtok = ajStrParseC(attrblock, ";");
              subtok;
              subtok = ajStrParseC(NULL, ";"))
            {
              strtmp = ajStrNew();
              ajStrAssignS(&strtmp, subtok);
              ajStrRemoveWhite(&strtmp);
              ajListstrPushAppend(strlist, strtmp);
            }
        }

      /* Write PEdamdat structure & push onto list */
      dattmp = ajEdamdatNew();
      ajStrRemoveWhite(&acdtype);
      ajStrAssignS(&dattmp->acdtype, acdtype);
      ajStrAssignS(&dattmp->edam, relations);
      dattmp->n = ajListstrToarray(strlist, &dattmp->acdattr);
      ajListPushAppend(datlist, dattmp);

      /* Clear nodes (but not strings) from string list; the strings are
         now referenced from dattmp->acdattr */
      ajListstrFree(&strlist);
    }


  /* Write PEdam structure */
  ((*P)->n) = ajListToarray(datlist, (void***) &((*P)->dat));

  /* Free memory (the PEdamdat objects stay referenced from (*P)->dat) */
  ajStrDel(&line);
  ajStrDel(&acdtype);
  ajStrDel(&relations);
  ajStrDel(&attrblock);
  ajListFree(&datlist);

  return;
}
Beispiel #9
0
/* @prog makeprotseq **********************************************************
**
** Writes 'amount' random protein sequences of 'length' residues to the
** 'outseq' output, optionally with the 'insert' string placed at position
** 'start' in each one.
**
** The pool of strings the random sequences are drawn from comes from
** makeprotseq_parse_pepstats when a file whose first line begins with
** "PEPSTATS" is supplied, and from makeprotseq_default_chars otherwise.
**
******************************************************************************/
int main(int argc, char **argv)
{
    AjPSeqout seqout     = NULL;
    AjPList   residues   = NULL;
    AjPSeq    newseq     = NULL;
    AjPStr    insert     = NULL;
    AjPStr    buf        = NULL;
    AjPStr*   pool       = NULL;
    AjPFile   statsfile  = NULL;
    ajint     inspos     = 0;
    ajint     randlen    = 0;
    ajint     nseqs      = 0;
    ajint     poolsize   = 0;
    ajint     extra      = 0;
    ajint     hasinsert  = 0;
    ajint     made       = 0;

    embInit("makeprotseq", argc, argv);

    statsfile = ajAcdGetInfile("pepstatsfile");
    insert    = ajAcdGetString("insert");
    inspos    = ajAcdGetInt("start");
    randlen   = ajAcdGetInt("length");
    nseqs     = ajAcdGetInt("amount");
    seqout    = ajAcdGetSeqoutall("outseq");

    residues = ajListstrNew();

    /* amount and length are not range-checked here; an earlier explicit
       check was commented out as "checked by acd" */

    /* The insert never changes, so test for its presence once */
    hasinsert = (ajStrGetLen(insert) != 0);

    /* With an insert, only the remainder of the sequence is random and
       the insert position becomes zero-based */
    if(hasinsert)
    {
        randlen -= ajStrGetLen(insert);
        inspos--;

        if(randlen <= 0)
            ajFatal("Sequence smaller than inserted part. "
                    "Unable to create sequences.");
    }

    /* Build the list of strings used for sequence creation */
    if(!statsfile)
    {
        makeprotseq_default_chars(&residues);
    }
    else
    {
        ajDebug("Distribution datafile '%s' given checking type\n",
                ajFileGetPrintnameC(statsfile));

        buf = ajStrNew();
        ajReadlineTrim(statsfile, &buf);

        /* The first line must start with the PEPSTATS tag */
        if(ajStrFindC(buf, "PEPSTATS") != 0)
        {
            ajWarn("Not pepstats file. Making completely random sequences.");
            makeprotseq_default_chars(&residues);
        }
        else
        {
            makeprotseq_parse_pepstats(&residues, statsfile);
        }

        ajStrDel(&buf);
        ajFileClose(&statsfile);
    }

    /* Type checking of the insert is not performed (the original check
       was disabled as not working) */

    /* Flatten the list into an array for fast indexed access */
    poolsize = (ajuint) ajListstrToarray(residues, &pool);

    if(!poolsize)
        ajFatal("No strings in list. No characters to make the sequence.");

    ajDebug("Distribution array done.\nscmax '%d', extra '%d', first '%S'\n",
            poolsize, extra, pool[0]);

    ajRandomSeed();

    /* Generate, optionally splice, lower-case and write each sequence */
    for(made = 0; made < nseqs; made++)
    {
        buf = makeprotseq_random_sequence(pool, poolsize, randlen);

        if(hasinsert)
            ajStrInsertS(&buf, inspos, insert);

        ajStrFmtLower(&buf);

        newseq = ajSeqNew();
        ajSeqAssignSeqS(newseq, buf);
        ajSeqSetProt(newseq);
        ajSeqoutWriteSeq(seqout, newseq);

        ajSeqDel(&newseq);
        ajStrDel(&buf);
    }

    /* Release everything; the array only borrowed the list's strings */
    ajSeqoutClose(seqout);
    ajSeqoutDel(&seqout);
    ajListstrFreeData(&residues);
    ajStrDel(&insert);
    AJFREE(pool);

    embExit();

    return 0;
}