AjPResource ajResourceNew(void) { AjPResource ret; AJNEW0(ret); ret->Id = ajStrNew(); ret->Idalt = ajListstrNew(); ret->Acc = ajStrNew(); ret->Name = ajStrNew(); ret->Desc = ajStrNew(); ret->Url = ajStrNew(); ret->Urllink = ajStrNew(); ret->Urlrest = ajStrNew(); ret->Urlsoap = ajStrNew(); ret->Cat = ajListstrNew(); ret->Taxon = ajListNew(); ret->Edamdat = ajListNew(); ret->Edamfmt = ajListNew(); ret->Edamid = ajListNew(); ret->Edamtpc = ajListNew(); ret->Xref = ajListNew(); ret->Query = ajListNew(); ret->Example = ajListstrNew(); return ret; }
static void remap_NoCutList(AjPFile outfile, const AjPTable hittable, AjBool html, const AjPStr enzymes, AjBool blunt, AjBool sticky, ajuint sitelen, AjBool commercial, AjBool ambiguity, AjBool limit, const AjPTable retable) { /* for iterating over hittable */ PValue value; void **keyarray = NULL; /* array for table */ void **valarray = NULL; /* array for table */ ajint i; /* list of enzymes that cut */ AjPList cutlist; AjIList citer; /* iterator for cutlist */ AjPStr cutname = NULL; AjBool found; /* for parsing value->iso string */ AjPStrTok tok; char tokens[] = " ,"; AjPStr code = NULL; const char *p; /* for reading in enzymes names */ AjPFile enzfile = NULL; AjPStr *ea; ajint ne; /* number of enzymes */ AjBool isall = ajTrue; /* list of enzymes that don't cut */ AjPList nocutlist; AjIList niter; /* iterator for nocutlist */ AjPStr nocutname = NULL; /* count of rejected enzymes not matching criteria */ ajint rejected_count = 0; EmbPPatRestrict enz; /* for renaming preferred isoschizomers */ AjPList newlist; /* ** ** Make a list of enzymes('cutlist') that hit ** including the isoschizomer names ** */ ajDebug("Make a list of all enzymes that cut\n"); cutlist = ajListstrNew(); nocutlist = ajListstrNew(); ajTableToarrayKeysValues(hittable, &keyarray, &valarray); for(i = 0; keyarray[i]; i++) { value = (PValue) valarray[i]; cutname = ajStrNew(); ajStrAssignRef(&cutname, keyarray[i]); ajListstrPushAppend(cutlist, cutname); /* Add to cutlist all isoschizomers of enzymes that cut */ ajDebug("Add to cutlist all isoschizomers of enzymes that cut\n"); /* start token to parse isoschizomers names */ tok = ajStrTokenNewC(value->iso, tokens); while(ajStrTokenNextParseC(&tok, tokens, &code)) { cutname = ajStrNew(); ajStrAssignS(&cutname, code); ajListstrPushAppend(cutlist, cutname); } ajStrTokenDel(&tok); } ajStrDel(&code); AJFREE(keyarray); AJFREE(valarray); /* ** Read in list of enzymes ('nocutlist') - either all or ** the input enzyme list. 
** Exclude those that don't match the selection criteria - count these. */ ajDebug("Read in a list of all input enzyme names\n"); ne = 0; if(!enzymes) isall = ajTrue; else { /* get input list of enzymes into ea[] */ ne = ajArrCommaList(enzymes, &ea); if(ajStrMatchCaseC(ea[0], "all")) isall = ajTrue; else { isall = ajFalse; for(i=0; i<ne; ++i) ajStrRemoveWhite(&ea[i]); } } enzfile = ajDatafileNewInNameC(ENZDATA); /* push all enzyme names without the required criteria onto nocutlist */ enz = embPatRestrictNew(); while(!ajFileIsEof(enzfile)) { if(!embPatRestrictReadEntry(enz, enzfile)) continue; /* ** If user entered explicit enzyme list, then check to see if ** this is one of that explicit list */ if(!isall) { found = AJFALSE; for(i=0; i<ne; ++i) if(ajStrMatchCaseS(ea[i], enz->cod)) { found = AJTRUE; break; } if(!found) /* not in the explicit list */ continue; ajDebug("RE %S is in the input explicit list of REs\n", enz->cod); } /* ignore ncuts==0 as they are unknown */ if(!enz->ncuts) { /* number of cut positions */ ajDebug("RE %S has an unknown number of cut positions\n", enz->cod); continue; } ajDebug("RE %S has a known number of cut sites\n", enz->cod); if(enz->len < sitelen) { /* recognition site length */ ajDebug("RE %S does not have a long enough recognition site\n", enz->cod); rejected_count++; continue; } if(!blunt && enz->blunt) { /* blunt/sticky */ ajDebug("RE %S is blunt\n", enz->cod); rejected_count++; continue; } if(!sticky && !enz->blunt) { /* blunt/sticky */ ajDebug("RE %S is sticky\n", enz->cod); rejected_count++; continue; } /* commercially available enzymes have uppercase patterns */ p = ajStrGetPtr(enz->pat); /* ** The -commercial qualifier is only used if we are searching ** through 'all' of the REBASE database - if we have specified an ** explicit list of enzymes then they are searched for whether or ** not they are commercially available */ if((*p >= 'a' && *p <= 'z') && commercial && isall) { ajDebug("RE %S is not commercial\n", enz->cod); 
rejected_count++; continue; } if(!ambiguity && remap_Ambiguous(enz->pat)) { ajDebug("RE %S is ambiguous\n", enz->cod); rejected_count++; continue; } ajDebug("RE %S matches all required criteria\n", enz->cod); code = ajStrNew(); ajStrAssignS(&code, enz->cod); ajListstrPushAppend(nocutlist, code); } embPatRestrictDel(&enz); ajFileClose(&enzfile); for(i=0; i<ne; ++i) if(ea[i]) ajStrDel(&ea[i]); if(ne) AJFREE(ea); /* ** Change names of enzymes in the non-cutter list ** to that of preferred (prototype) ** enzyme name so that the isoschizomers of cutters ** will be removed from the ** non-cutter list in the next bit. ** Remove duplicate prototype names. */ if(limit) { newlist = ajListstrNew(); remap_RenamePreferred(nocutlist, retable, newlist); ajListstrFreeData(&nocutlist); nocutlist = newlist; ajListSortUnique(nocutlist, remap_cmpcase, remap_strdel); } /* ** Iterate through the list of input enzymes removing those that are in ** the cutlist. */ ajDebug("Remove from the nocutlist all enzymes and isoschizomers " "that cut\n"); /* ** This steps down both lists at the same time, comparing names and ** iterating to the next name in whichever list whose name compares ** alphabetically before the other. Where a match is found, the ** nocutlist item is deleted. 
*/ ajListSort(nocutlist, remap_cmpcase); ajListSort(cutlist, remap_cmpcase); citer = ajListIterNewread(cutlist); niter = ajListIterNew(nocutlist); /* while((cutname = (AjPStr)ajListIterGet(citer)) != NULL) ajDebug("dbg cutname = %S\n", cutname); */ nocutname = (AjPStr)ajListIterGet(niter); cutname = (AjPStr)ajListIterGet(citer); ajDebug("initial cutname, nocutname: '%S' '%S'\n", cutname, nocutname); while(nocutname != NULL && cutname != NULL) { i = ajStrCmpCaseS(cutname, nocutname); ajDebug("compare cutname, nocutname: %S %S ", cutname, nocutname); ajDebug("ajStrCmpCase=%d\n", i); if(i == 0) { /* match - so remove from nocutlist */ ajDebug("ajListstrRemove %S\n", nocutname); ajListstrIterRemove(niter); nocutname = (AjPStr)ajListIterGet(niter); /* ** Don't increment the cutname list pointer here ** - there may be more than one entry in the nocutname ** list with the same name because we have converted ** isoschizomers to their preferred name */ /* cutname = (AjPStr)ajListIterGet(citer); */ } else if(i == -1) /* cutlist name sorts before nocutlist name */ cutname = (AjPStr)ajListIterGet(citer); else if(i == 1) /* nocutlist name sorts before cutlist name */ nocutname = (AjPStr)ajListIterGet(niter); } ajListIterDel(&citer); ajListIterDel(&niter); ajListstrFreeData(&cutlist); /* Print the resulting list of those that do not cut*/ ajDebug("Print out the list\n"); /* print the title */ if(html) ajFmtPrintF(outfile, "<H2>"); ajFmtPrintF(outfile, "\n\n# Enzymes that do not cut\n\n"); if(html) ajFmtPrintF(outfile, "</H2>\n"); if(html) ajFmtPrintF(outfile, "<PRE>"); /* ajListSort(nocutlist, ajStrVcmp);*/ niter = ajListIterNewread(nocutlist); i = 0; while((nocutname = (AjPStr)ajListIterGet(niter)) != NULL) { ajFmtPrintF(outfile, "%-10S", nocutname); /* new line after every 7 names printed */ if(i++ == 7) { ajFmtPrintF(outfile, "\n"); i = 0; } } ajListIterDel(&niter); /* end the output */ ajFmtPrintF(outfile, "\n"); if(html) {ajFmtPrintF(outfile, "</PRE>\n");} /* ** Print the 
count of rejected enzymes ** N.B. This is the count of ALL rejected enzymes including all ** isoschizomers */ if(html) ajFmtPrintF(outfile, "<H2>"); ajFmtPrintF(outfile, "\n\n# No. of cutting enzymes which do not match the\n" "# SITELEN, BLUNT, STICKY, COMMERCIAL, AMBIGUOUS criteria\n\n"); if(html) ajFmtPrintF(outfile, "</H2>\n"); ajFmtPrintF(outfile, "%d\n", rejected_count); ajDebug("Tidy up\n"); ajListstrFreeData(&nocutlist); ajListstrFreeData(&cutlist); return; }
/* @funcstatic seqwords_TermsRead *********************************************
**
** Read the next Terms object from a file in embl-like format. Records are
** read line by line until a line starting with "//" is found. The search
** terms (TE lines) are stored with a leading and trailing space.
**
** A new Terms object is always allocated into *thys, including when the
** function fails; it is not released here.
**
** @param [u] inf [AjPFile] Input file stream
** @param [w] thys [AjPTerms*] Terms object
**
** @return [AjBool] True on success, false if the input ended before a
**                  "//" terminator line was read
** @@
*****************************************************************************/

static AjBool seqwords_TermsRead(AjPFile inf, AjPTerms *thys)
{
    AjPStr line        = NULL;	/* Line of text.                     */
    AjPStr temp        = NULL;	/* Search term under construction.   */
    AjPList list_terms = NULL;	/* List of keywords for a scop node. */
    AjBool ok          = ajFalse;
    AjBool ret         = ajFalse;
    AjPStr type        = NULL;	/* Value of the TY line.             */

    /* Memory management */
    (*thys)    = seqwords_TermsNew();
    list_terms = ajListstrNew();
    line       = ajStrNew();
    type       = ajStrNew();

    /* Read first line. */
    ok = ajReadlineTrim(inf, &line);

    while(ok && !ajStrPrefixC(line, "//"))
    {
        if(ajStrPrefixC(line, "XX"))
        {
            ok = ajReadlineTrim(inf, &line);
            continue;
        }
        else if(ajStrPrefixC(line, "TY"))
        {
            ajFmtScanS(line, "%*s %S", &type);

            if(ajStrMatchC(type, "SCOP"))
                (*thys)->Type = ajSCOP;
            else if(ajStrMatchC(type, "CATH"))
                (*thys)->Type = ajCATH;
        }
        else if(ajStrPrefixC(line, "CL"))
        {
            /* The value starts after the 2-letter code and separator. */
            ajStrAssignC(&(*thys)->Class, ajStrGetPtr(line)+3);
            ajStrRemoveWhiteExcess(&(*thys)->Class);
        }
        else if(ajStrPrefixC(line, "AR"))
        {
            ajStrAssignC(&(*thys)->Architecture, ajStrGetPtr(line)+3);
            ajStrRemoveWhiteExcess(&(*thys)->Architecture);
        }
        else if(ajStrPrefixC(line, "TP"))
        {
            ajStrAssignC(&(*thys)->Topology, ajStrGetPtr(line)+3);
            ajStrRemoveWhiteExcess(&(*thys)->Topology);
        }
        else if(ajStrPrefixC(line, "FO"))
        {
            ajStrAssignC(&(*thys)->Fold, ajStrGetPtr(line)+3);

            /* Continuation lines are appended up to the next XX line. */
            while(ajReadlineTrim(inf, &line))
            {
                if(ajStrPrefixC(line, "XX"))
                    break;

                ajStrAppendC(&(*thys)->Fold, ajStrGetPtr(line)+3);
            }

            ajStrRemoveWhiteExcess(&(*thys)->Fold);
        }
        else if(ajStrPrefixC(line, "SF"))
        {
            ajStrAssignC(&(*thys)->Superfamily, ajStrGetPtr(line)+3);

            while(ajReadlineTrim(inf, &line))
            {
                if(ajStrPrefixC(line, "XX"))
                    break;

                ajStrAppendC(&(*thys)->Superfamily, ajStrGetPtr(line)+3);
            }

            ajStrRemoveWhiteExcess(&(*thys)->Superfamily);
        }
        else if(ajStrPrefixC(line, "FA"))
        {
            ajStrAssignC(&(*thys)->Family, ajStrGetPtr(line)+3);

            while(ajReadlineTrim(inf, &line))
            {
                if(ajStrPrefixC(line, "XX"))
                    break;

                ajStrAppendC(&(*thys)->Family, ajStrGetPtr(line)+3);
            }

            ajStrRemoveWhiteExcess(&(*thys)->Family);
        }
        else if(ajStrPrefixC(line, "TE"))
        {
            /* Copy and clean up term. */
            temp = ajStrNew();
            ajStrAssignC(&temp, ajStrGetPtr(line)+3);
            ajStrRemoveWhiteExcess(&temp);

            /* Append a leading and trailing space to search term */
            ajStrAppendK(&temp, ' ');
            ajStrInsertC(&temp, 0, " ");

            /* Add the current term to the list. */
            ajListstrPush(list_terms, temp);
        }

        ok = ajReadlineTrim(inf, &line);
    }

    if(ok)
    {
        /* Convert the AjPList of terms to an array of AjPStr's. */
        if(!((*thys)->N = ajListstrToarray((AjPList)list_terms,
                                           &(*thys)->Keywords)))
            ajWarn("Zero sized list of terms passed into seqwords_TermsRead");

        ret = ajTrue;
    }

    /*
    ** Clean up on both the success and the failure path, so that 'type'
    ** is released in either case. Free the list (not the nodes!).
    **
    ** NOTE(review): on the failure path the term strings have not been
    ** handed over to the Keywords array, so they are presumably leaked
    ** by freeing only the list - confirm against ajListstrFree.
    */
    ajListstrFree(&list_terms);
    ajStrDel(&line);
    ajStrDel(&type);

    return ret;
}
int main(int argc, char** argv) { AjPFile outf = NULL; AjPFile cachef = NULL; AjIList iterator = NULL; AjPList aliases = NULL; AjPList dbas = NULL; AjPList species = NULL; AjPStr alias = NULL; AjPStr dbname = NULL; AjPStr spname = NULL; AjPStr svrname = NULL; AjPStr svrurl = NULL; AjPStr dbcurl = NULL; AjPTime svrtime = NULL; EnsEDatabaseadaptorGroup dbag = ensEDatabaseadaptorGroupNULL; EnsPDatabaseadaptor dba = NULL; EnsPDatabaseconnection dbc = NULL; embInit("cacheensembl", argc, argv); ensInit(); svrname = ajAcdGetString("servername"); outf = ajAcdGetOutfile("outfile"); cachef = ajAcdGetOutfile("cachefile"); dbcurl = ajStrNew(); svrurl = ajStrNew(); dbname = ajStrNew(); ajNamSvrGetUrl(svrname, &svrurl); if(!svrurl) ajFatal("Could not resolve server name '%S'.", svrname); dbc = ensDatabaseconnectionNewUrl(svrurl); ensRegistryLoadDatabaseconnection(dbc); ensDatabaseconnectionDel(&dbc); /* Write the server file header. */ svrtime = ajTimeNewTodayFmt("cachefile"); ajFmtPrintF(cachef, "# %S %D\n", ajFileGetNameS(cachef), svrtime); ajTimeDel(&svrtime); ajFmtPrintF(cachef, "# Automatically generated by cacheensembl " "for server '%S'.\n\n", svrname); /* ** Get all Ensembl Database Adaptor objects and write them as ** EMBOSS Database definitions. 
*/ aliases = ajListstrNew(); dbas = ajListNew(); species = ajListstrNew(); ensRegistryRetrieveAllSpecies(species); while(ajListstrPop(species, &spname)) { ensRegistryGetAllDatabaseadaptors(ensEDatabaseadaptorGroupNULL, spname, dbas); while(ajListPop(dbas, (void**) &dba)) { dbag = ensDatabaseadaptorGetGroup(dba); if(dbag == ensEDatabaseadaptorGroupNULL) { ajDebug("cacheensembl main got unexpected " "Ensembl Database Adaptor Group %d.\n", dbag); continue; } ajStrAssignS(&dbname, ensDatabaseadaptorGetSpecies(dba)); if(dbag != ensEDatabaseadaptorGroupCore) { ajStrAppendC(&dbname, "_"); ajStrAppendC(&dbname, ensDatabaseadaptorGroupToChar(dbag)); } dbc = ensDatabaseadaptorGetDatabaseconnection(dba); ensDatabaseconnectionFetchUrl(dbc, &dbcurl); if(outf) ajFmtPrintF(outf, "%S\n", dbname); ajFmtPrintF(cachef, "DBNAME %S [\n", dbname); ajFmtPrintF(cachef, " release: \"%s\"\n", ensSoftwareGetVersion()); ajFmtPrintF(cachef, " server: \"%S\"\n", svrname); ajFmtPrintF(cachef, " url: \"%S\"\n", dbcurl); ajFmtPrintF(cachef, "]\n"); ajFmtPrintF(cachef, "\n"); if(dbag != ensEDatabaseadaptorGroupCore) continue; ensRegistryAliasFetchAllbySpecies( ensDatabaseadaptorGetSpecies(dba), aliases); /* ** Format all aliases to lower case, ** sort them alphabetically and remove duplicates. */ iterator = ajListIterNew(aliases); while(!ajListIterDone(iterator)) { alias = ajListstrIterGet(iterator); ajStrFmtLower(&alias); } ajListIterDel(&iterator); ajListSortUnique(aliases, cacheensembl_stringcompare, cacheensembl_stringdelete); alias = NULL; if(ajListGetLength(aliases) > 0) { while(ajListstrPop(aliases, &alias)) { /* ** Reject any aliases with other than alpha-numeric ** characters like white space. */ if(ajStrIsAlnum(alias)) ajFmtPrintF(cachef, "ALIAS %S %S\n", alias, ensDatabaseadaptorGetSpecies(dba)); ajStrDel(&alias); } ajFmtPrintF(cachef, "\n"); } /* Ensembl Database Adaptor objects *must not* be deleted. 
*/ } ajStrDel(&spname); } ajListstrFree(&aliases); ajListFree(&dbas); ajStrDel(&dbcurl); ajStrDel(&svrurl); ajStrDel(&dbname); ajStrDel(&svrname); ajFileClose(&outf); ajFileClose(&cachef); embExit(); return EXIT_SUCCESS; }
int main(int argc, char **argv) { AjPSeqall queryseqs; AjPSeqset targetseqs; AjPSeq queryseq; const AjPSeq targetseq; AjPStr queryaln = 0; AjPStr targetaln = 0; AjPFile errorf; AjBool show = ajFalse; const char *queryseqc; const char *targetseqc; AjPMatrixf matrix; AjPSeqCvt cvt = 0; float **sub; ajint *compass = NULL; float *path = NULL; float gapopen; float gapextend; float score; float minscore; ajuint j, k; ajint querystart = 0; ajint targetstart = 0; ajint queryend = 0; ajint targetend = 0; ajint width = 0; AjPTable kmers = 0; ajint wordlen = 6; ajint oldmax = 0; ajint newmax = 0; ajuint ntargetseqs; ajuint nkmers; AjPAlign align = NULL; EmbPWordMatch maxmatch; /* match with maximum score */ /* Cursors for the current sequence being scanned, ** i.e., until which location it was scanned. ** Separate cursor/location entries for each sequence in the seqset. */ ajuint* lastlocation; EmbPWordRK* wordsw = NULL; AjPList* matchlist = NULL; embInit("supermatcher", argc, argv); matrix = ajAcdGetMatrixf("datafile"); queryseqs = ajAcdGetSeqall("asequence"); targetseqs= ajAcdGetSeqset("bsequence"); gapopen = ajAcdGetFloat("gapopen"); gapextend = ajAcdGetFloat("gapextend"); wordlen = ajAcdGetInt("wordlen"); align = ajAcdGetAlign("outfile"); errorf = ajAcdGetOutfile("errorfile"); width = ajAcdGetInt("width"); /* width for banded Smith-Waterman */ minscore = ajAcdGetFloat("minscore"); gapopen = ajRoundFloat(gapopen, 8); gapextend = ajRoundFloat(gapextend, 8); sub = ajMatrixfGetMatrix(matrix); cvt = ajMatrixfGetCvt(matrix); embWordLength(wordlen); /* seqset sequence is the reference sequence for SAM format */ ajAlignSetRefSeqIndx(align, 1); ajSeqsetTrim(targetseqs); ntargetseqs = ajSeqsetGetSize(targetseqs); AJCNEW0(matchlist, ntargetseqs); /* get tables of words */ for(k=0;k<ntargetseqs;k++) { targetseq = ajSeqsetGetseqSeq(targetseqs, k); embWordGetTable(&kmers, targetseq); ajDebug("Number of distinct kmers found so far: %d\n", ajTableGetLength(kmers)); } 
AJCNEW0(lastlocation, ntargetseqs); if(ajTableGetLength(kmers)<1) ajErr("no kmers found"); nkmers = embWordRabinKarpInit(kmers, &wordsw, wordlen, targetseqs); while(ajSeqallNext(queryseqs,&queryseq)) { ajSeqTrim(queryseq); queryaln = ajStrNewRes(1+ajSeqGetLen(queryseq)); ajDebug("Read '%S'\n", ajSeqGetNameS(queryseq)); for(k=0;k<ntargetseqs;k++) { lastlocation[k]=0; matchlist[k] = ajListstrNew(); } embWordRabinKarpSearch(ajSeqGetSeqS(queryseq), targetseqs, (const EmbPWordRK*)wordsw, wordlen, nkmers, matchlist, lastlocation, ajFalse); for(k=0;k<ajSeqsetGetSize(targetseqs);k++) { targetseq = ajSeqsetGetseqSeq(targetseqs, k); ajDebug("Processing '%S'\n", ajSeqGetNameS(targetseq)); if(ajListGetLength(matchlist[k])==0) { ajFmtPrintF(errorf, "No wordmatch start points for " "%s vs %s. No alignment\n", ajSeqGetNameC(queryseq),ajSeqGetNameC(targetseq)); embWordMatchListDelete(&matchlist[k]); continue; } /* only the maximum match is used as seed * (if there is more than one location with the maximum match * only the first one is used) * TODO: we should add a new option to make above limit optional */ maxmatch = embWordMatchFirstMax(matchlist[k]); supermatcher_findendpoints(maxmatch,targetseq, queryseq, &targetstart, &querystart, &targetend, &queryend); targetaln=ajStrNewRes(1+ajSeqGetLen(targetseq)); queryseqc = ajSeqGetSeqC(queryseq); targetseqc = ajSeqGetSeqC(targetseq); ajStrAssignC(&queryaln,""); ajStrAssignC(&targetaln,""); ajDebug("++ %S v %S start:%d %d end:%d %d\n", ajSeqGetNameS(targetseq), ajSeqGetNameS(queryseq), targetstart, querystart, targetend, queryend); newmax = (targetend-targetstart+2)*width; if(newmax > oldmax) { AJCRESIZE0(path,oldmax,newmax); AJCRESIZE0(compass,oldmax,newmax); oldmax=newmax; ajDebug("++ memory re/allocation for path/compass arrays" " to size: %d\n", newmax); } else { AJCSET0(path,newmax); AJCSET0(compass,newmax); } ajDebug("Calling embAlignPathCalcSWFast " "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n", querystart, queryend, (queryend - 
querystart + 1), ajSeqGetLen(queryseq), targetstart, targetend, (targetend - targetstart + 1), ajSeqGetLen(targetseq), width); score = embAlignPathCalcSWFast(&targetseqc[targetstart], &queryseqc[querystart], targetend-targetstart+1, queryend-querystart+1, 0,width, gapopen,gapextend, path,sub,cvt, compass,show); if(score>minscore) { embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend, targetseq,queryseq, &targetaln,&queryaln, targetend-targetstart+1, queryend-querystart+1, 0,width, &targetstart,&querystart); if(!ajAlignFormatShowsSequences(align)) { ajAlignDefineCC(align, ajStrGetPtr(targetaln), ajStrGetPtr(queryaln), ajSeqGetNameC(targetseq), ajSeqGetNameC(queryseq)); ajAlignSetScoreR(align, score); } else { ajDebug(" queryaln:%S \ntargetaln:%S\n", queryaln,targetaln); embAlignReportLocal(align, queryseq, targetseq, queryaln, targetaln, querystart, targetstart, gapopen, gapextend, score, matrix, 1 + ajSeqGetOffset(queryseq), 1 + ajSeqGetOffset(targetseq) ); } ajAlignWrite(align); ajAlignReset(align); } ajStrDel(&targetaln); embWordMatchListDelete(&matchlist[k]); } ajStrDel(&queryaln); } for(k=0;k<nkmers;k++) { AJFREE(wordsw[k]->seqindxs); AJFREE(wordsw[k]->nSeqMatches); for(j=0;j<wordsw[k]->nseqs;j++) AJFREE(wordsw[k]->locs[j]); AJFREE(wordsw[k]->nnseqlocs); AJFREE(wordsw[k]->locs); AJFREE(wordsw[k]); } embWordFreeTable(&kmers); if(!ajAlignFormatShowsSequences(align)) ajMatrixfDel(&matrix); AJFREE(path); AJFREE(compass); AJFREE(kmers); AJFREE(wordsw); AJFREE(matchlist); AJFREE(lastlocation); ajAlignClose(align); ajAlignDel(&align); ajSeqallDel(&queryseqs); ajSeqDel(&queryseq); ajSeqsetDel(&targetseqs); ajFileClose(&errorf); embExit(); return 0; }
int main(int argc, char **argv) { AjPSeqset seqset; AjPSeqall seqall; AjPSeq queryseq; const AjPSeq targetseq; ajint wordlen; AjPTable wordsTable = NULL; AjPList* matchlist = NULL; AjPFile logfile; AjPFeattable* seqsetftables = NULL; AjPFeattable seqallseqftable = NULL; AjPFeattabOut ftoutforseqsetseq = NULL; AjPFeattabOut ftoutforseqallseq = NULL; AjPAlign align = NULL; AjIList iter = NULL; ajint targetstart; ajint querystart; ajint len; ajuint i, j; ajulong nAllMatches = 0; ajulong sumAllScore = 0; AjBool dumpAlign = ajTrue; AjBool dumpFeature = ajTrue; AjBool checkmode = ajFalse; EmbPWordRK* wordsw = NULL; ajuint npatterns = 0; ajuint seqsetsize; ajuint nmatches; ajuint* nmatchesseqset; ajuint* lastlocation; /* Cursors for Rabin-Karp search. */ /* Shows until what point the query sequence was * scanned for a pattern sequences in the seqset. */ char* paddedheader = NULL; const char* header; AjPStr padding; header = "Pattern %S #pat-sequences #all-matches avg-match-length\n"; padding = ajStrNew(); embInit("wordmatch", argc, argv); wordlen = ajAcdGetInt("wordsize"); seqset = ajAcdGetSeqset("asequence"); seqall = ajAcdGetSeqall("bsequence"); logfile = ajAcdGetOutfile("logfile"); dumpAlign = ajAcdGetToggle("dumpalign"); dumpFeature = ajAcdGetToggle("dumpfeat"); if(dumpAlign) { align = ajAcdGetAlign("outfile"); ajAlignSetExternal(align, ajTrue); } seqsetsize = ajSeqsetGetSize(seqset); ajSeqsetTrim(seqset); AJCNEW0(matchlist, seqsetsize); AJCNEW0(seqsetftables, seqsetsize); AJCNEW0(nmatchesseqset, seqsetsize); if (dumpFeature) { ftoutforseqsetseq = ajAcdGetFeatout("aoutfeat"); ftoutforseqallseq = ajAcdGetFeatout("boutfeat"); } checkmode = !dumpFeature && !dumpAlign; embWordLength(wordlen); ajFmtPrintF(logfile, "Small sequence/file for constructing" " target patterns: %S\n", ajSeqsetGetUsa(seqset)); ajFmtPrintF(logfile, "Large sequence/file to be scanned" " for patterns: %S\n", ajSeqallGetUsa(seqall)); ajFmtPrintF(logfile, "Number of sequences in the patterns file: 
%u\n", seqsetsize); ajFmtPrintF(logfile, "Pattern/word length: %u\n", wordlen); for(i=0;i<seqsetsize;i++) { targetseq = ajSeqsetGetseqSeq(seqset, i); embWordGetTable(&wordsTable, targetseq); } AJCNEW0(lastlocation, seqsetsize); if(ajTableGetLength(wordsTable)>0) { npatterns = embWordRabinKarpInit(wordsTable, &wordsw, wordlen, seqset); ajFmtPrintF(logfile, "Number of patterns/words found: %u\n", npatterns); while(ajSeqallNext(seqall,&queryseq)) { for(i=0;i<seqsetsize;i++) { lastlocation[i]=0; if (!checkmode) matchlist[i] = ajListstrNew(); } nmatches = embWordRabinKarpSearch( ajSeqGetSeqS(queryseq), seqset, (EmbPWordRK const *)wordsw, wordlen, npatterns, matchlist, lastlocation, checkmode); nAllMatches += nmatches; if (checkmode) continue; for(i=0;i<seqsetsize;i++) { if(ajListGetLength(matchlist[i])>0) { iter = ajListIterNewread(matchlist[i]) ; while(embWordMatchIter(iter, &targetstart, &querystart, &len, &targetseq)) { if(dumpAlign) { ajAlignDefineSS(align, targetseq, queryseq); ajAlignSetScoreI(align, len); /* ungapped alignment means same length * for both sequences */ ajAlignSetSubRange(align, targetstart, 1, len, ajSeqIsReversed(targetseq), ajSeqGetLen(targetseq), querystart, 1, len, ajSeqIsReversed(queryseq), ajSeqGetLen(queryseq)); } } if(dumpAlign) { ajAlignWrite(align); ajAlignReset(align); } if(ajListGetLength(matchlist[i])>0 && dumpFeature) { embWordMatchListConvToFeat(matchlist[i], &seqsetftables[i], &seqallseqftable, targetseq, queryseq); ajFeattableWrite(ftoutforseqallseq, seqallseqftable); ajFeattableDel(&seqallseqftable); } ajListIterDel(&iter); } embWordMatchListDelete(&matchlist[i]); } } /* search completed, now report statistics */ for(i=0;i<npatterns;i++) { sumAllScore += wordsw[i]->lenMatches; for(j=0;j<wordsw[i]->nseqs;j++) nmatchesseqset[wordsw[i]->seqindxs[j]] += wordsw[i]->nSeqMatches[j]; } ajFmtPrintF(logfile, "Number of sequences in the file scanned " "for patterns: %u\n", ajSeqallGetCount(seqall)); ajFmtPrintF(logfile, "Number of all 
matches: %Lu" " (wordmatch finds exact matches only)\n", nAllMatches); if(nAllMatches>0) { ajFmtPrintF(logfile, "Sum of match lengths: %Lu\n", sumAllScore); ajFmtPrintF(logfile, "Average match length: %.2f\n", sumAllScore*1.0/nAllMatches); ajFmtPrintF(logfile, "\nDistribution of the matches among pattern" " sequences:\n"); ajFmtPrintF(logfile, "-----------------------------------------" "-----------\n"); for(i=0;i<ajSeqsetGetSize(seqset);i++) { if (nmatchesseqset[i]>0) ajFmtPrintF(logfile, "%-42s: %8u\n", ajSeqGetNameC(ajSeqsetGetseqSeq(seqset, i)), nmatchesseqset[i]); ajFeattableWrite(ftoutforseqsetseq, seqsetftables[i]); ajFeattableDel(&seqsetftables[i]); } ajFmtPrintF(logfile, "\nPattern statistics:\n"); ajFmtPrintF(logfile, "-------------------\n"); if(wordlen>7) ajStrAppendCountK(&padding, ' ', wordlen-7); paddedheader = ajFmtString(header,padding); ajFmtPrintF(logfile, paddedheader); for(i=0;i<npatterns;i++) if (wordsw[i]->nMatches>0) ajFmtPrintF(logfile, "%-7s: %12u %12u %17.2f\n", wordsw[i]->word->fword, wordsw[i]->nseqs, wordsw[i]->nMatches, wordsw[i]->lenMatches*1.0/wordsw[i]->nMatches); } } for(i=0;i<npatterns;i++) { for(j=0;j<wordsw[i]->nseqs;j++) AJFREE(wordsw[i]->locs[j]); AJFREE(wordsw[i]->locs); AJFREE(wordsw[i]->seqindxs); AJFREE(wordsw[i]->nnseqlocs); AJFREE(wordsw[i]->nSeqMatches); AJFREE(wordsw[i]); } embWordFreeTable(&wordsTable); AJFREE(wordsw); AJFREE(matchlist); AJFREE(lastlocation); AJFREE(nmatchesseqset); AJFREE(seqsetftables); if(dumpAlign) { ajAlignClose(align); ajAlignDel(&align); } if(dumpFeature) { ajFeattabOutDel(&ftoutforseqsetseq); ajFeattabOutDel(&ftoutforseqallseq); } ajFileClose(&logfile); ajSeqallDel(&seqall); ajSeqsetDel(&seqset); ajSeqDel(&queryseq); ajStrDel(&padding); AJFREE(paddedheader); embExit(); return 0; }
/* @funcstatic acdrelations_procacdfile ***************************************
**
** Copy an ACD file to a new ACD file, inserting freshly generated
** relations: attributes into every data definition. Existing relations:
** lines inside a data definition are dropped from the output.
**
** Application and section definitions, variable: and endsection: lines,
** lines whose first token is "#", and blank lines are copied unchanged.
**
** @param [u] inf [AjPFile] ACD input file
** @param [u] outf [AjPFile] ACD output file
** @param [r] P [PEdam] edam object
** @param [r] T [PKtype] ktype object
** @return [void]
** @@
******************************************************************************/

static void acdrelations_procacdfile(AjPFile inf, AjPFile outf,
                                     PEdam P, PKtype T)
{
    AjPStr line      = NULL;	/* current input line             */
    AjPStr tok       = NULL;	/* first token of the line        */
    AjPStr acdtype   = NULL;	/* datatype of current definition */
    AjPStr attr      = NULL;	/* one attribute line, no spaces  */
    AjPList attrlist = NULL;	/* attribute lines of definition  */
    AjPStr *attrarr  = NULL;	/* attrlist as an array           */
    ajint nattr      = 0;

    line     = ajStrNew();
    tok      = ajStrNew();
    acdtype  = ajStrNew();
    attrlist = ajListstrNew();

    while(ajReadline(inf, &line))
    {
        ajFmtScanS(line, "%S", &tok);

        /* Application and section definitions: copy through to "]" */
        if(ajStrMatchC(tok, "application:") || ajStrMatchC(tok, "section:"))
        {
            ajFmtPrintF(outf, "%S", line);

            while(ajReadline(inf, &line))
            {
                ajFmtPrintF(outf, "%S", line);
                ajFmtScanS(line, "%S", &tok);

                if(ajStrMatchC(tok, "]"))
                    break;
            }

            continue;
        }

        /* Variables, endsection definitions and comments: copy as-is */
        if(ajStrMatchC(tok, "variable:")   ||
           ajStrMatchC(tok, "endsection:") ||
           ajStrMatchC(tok, "#"))
        {
            ajFmtPrintF(outf, "%S", line);
            continue;
        }

        /* Blank lines (nothing to scan): copy as-is */
        if(!ajFmtScanS(line, "%S", &tok))
        {
            ajFmtPrintF(outf, "%S", line);
            continue;
        }

        /*
        ** Anything else starts a data definition. Its first line names
        ** the datatype; the ':' is stripped from the stored name.
        */
        ajFmtPrintF(outf, "%S", line);
        ajFmtScanS(line, "%S", &acdtype);
        ajStrRemoveSetC(&acdtype, ":");

        /* Attribute lines follow, up to and including the "]" line */
        while(ajReadline(inf, &line))
        {
            /* every line read here is collected, whitespace removed */
            attr = ajStrNew();
            ajStrAssignS(&attr, line);
            ajStrRemoveWhite(&attr);
            ajListstrPushAppend(attrlist, attr);

            ajFmtScanS(line, "%S", &tok);

            if(ajStrMatchC(tok, "]"))
            {
                /* end of definition: emit relations before the "]" */
                nattr = ajListstrToarray(attrlist, &attrarr);

                acdrelations_writerelations(outf, acdtype,
                                            attrarr, nattr, P, T);

                AJFREE(attrarr);
                ajListstrFreeData(&attrlist);
                attrlist = ajListstrNew();

                ajFmtPrintF(outf, "%S", line);
                break;
            }

            /* existing relations: lines are not copied to the output */
            if(!ajStrMatchC(tok, "relations:"))
                ajFmtPrintF(outf, "%S", line);
        }
    }

    ajStrDel(&line);
    ajStrDel(&tok);
    ajStrDel(&acdtype);
    ajListstrFreeData(&attrlist);

    return;
}
/* @funcstatic acdrelations_readdatfile ***************************************
**
** Read a data file of '|'-delimited records into an edam object. Lines
** starting with '#' are skipped. For every other line the first field is
** stored as the ACD datatype (whitespace removed), the second as the EDAM
** relations value, and the third is split on ';' into attribute strings
** (whitespace removed).
**
** @param [u] inf [AjPFile] Input data file
** @param [u] P [PEdam*] Existing edam object; its dat array and count n
**                       are overwritten
** @return [void]
** @@
******************************************************************************/

static void acdrelations_readdatfile(AjPFile inf, PEdam *P)
{
    AjPStr line         = NULL;
    const AjPStr tok    = NULL;
    const AjPStr subtok = NULL;
    AjPStr strtmp       = NULL;
    AjPList strlist     = NULL;
    AjPStr acdtype      = NULL;
    AjPStr relations    = NULL;
    AjPStr attrblock    = NULL;	/* owned copy of the third field */
    PEdamdat dattmp     = NULL;
    AjPList datlist     = NULL;

    if(!P)
        ajFatal("Null arg error 1 in acdrelations_readdatfile");

    /* *P is dereferenced when the result is stored below */
    if(!*P)
        ajFatal("Null arg error 2 in acdrelations_readdatfile");

    if(!inf)
        ajFatal("Null arg error 3 in acdrelations_readdatfile");

    /* Allocate memory */
    line      = ajStrNew();
    acdtype   = ajStrNew();
    relations = ajStrNew();
    attrblock = ajStrNew();
    datlist   = ajListNew();

    /* Read data from file */
    while(ajReadline(inf, &line))
    {
        /* Discard comment lines */
        if(ajStrPrefixC(line, "#"))
            continue;

        /*
        ** Tokenise line, delimited by '|'.
        ** Parse first token (ACD datatype)
        */
        ajStrAssignS(&acdtype, ajStrParseC(line, "|"));

        /* Parse second token (EDAM relations: value) */
        ajStrAssignS(&relations, ajStrParseC(NULL, "|"));

        /* Parse third token (attribute:value strings block) */
        tok = ajStrParseC(NULL, "|");

        /* Create new string list */
        strlist = ajListstrNew();

        /*
        ** Tokenise the third token itself into tokens delimited by ';'
        ** (individual attribute:value strings).
        **
        ** The token is copied first: it is presumably held in storage
        ** owned by ajStrParseC, so using it directly as the seed of a
        ** new parse would read that storage while it is being replaced
        ** - TODO confirm against the ajStrParseC implementation.
        ** A missing third field simply yields no attributes.
        */
        if(tok)
        {
            ajStrAssignS(&attrblock, tok);

            subtok = ajStrParseC(attrblock, ";");

            while(subtok)
            {
                strtmp = ajStrNew();
                ajStrAssignS(&strtmp, subtok);
                ajStrRemoveWhite(&strtmp);
                ajListstrPushAppend(strlist, strtmp);

                subtok = ajStrParseC(NULL, ";");
            }
        }

        /* Write PEdamdat structure & push onto list */
        dattmp = ajEdamdatNew();
        ajStrRemoveWhite(&acdtype);
        ajStrAssignS(&dattmp->acdtype, acdtype);
        ajStrAssignS(&dattmp->edam, relations);
        dattmp->n = ajListstrToarray(strlist, &dattmp->acdattr);
        ajListPushAppend(datlist, dattmp);

        /* Clear nodes (but not strings) from string list */
        ajListstrFree(&strlist);
    }

    /* Write PEdam structure */
    ((*P)->n) = ajListToarray(datlist, (void***) &((*P)->dat));

    /* Free memory */
    ajStrDel(&line);
    ajStrDel(&acdtype);
    ajStrDel(&relations);
    ajStrDel(&attrblock);
    ajListFree(&datlist);

    return;
}
int main(int argc, char **argv) { AjPSeqout outseq = NULL; AjPList list = NULL; AjPSeq seq = NULL; AjPStr insert = NULL; AjPStr seqstr = NULL; AjPStr* seqr = NULL; AjPFile data = NULL; ajint start = 0; ajint length = 0; ajint amount = 0; ajint scmax = 0; ajint extra = 0; embInit("makeprotseq", argc, argv); data = ajAcdGetInfile("pepstatsfile"); insert = ajAcdGetString("insert"); start = ajAcdGetInt("start"); length = ajAcdGetInt("length"); amount = ajAcdGetInt("amount"); outseq = ajAcdGetSeqoutall("outseq"); list = ajListstrNew(); /* this is checked by acd if(amount <=0 || length <= 0) ajFatal("Amount or length is 0 or less. " "Unable to create any sequences"); */ /* if insert, make sure sequence is large enough */ if(ajStrGetLen(insert)) { length -= ajStrGetLen(insert); /* start= start <= 1 ? 0 : --start; */ /* checked in acd */ start--; if(length <= 0) ajFatal("Sequence smaller than inserted part. " "Unable to create sequences."); } /* make the list of AjPStr to be used in sequence creation */ if(data) { ajDebug("Distribution datafile '%s' given checking type\n", ajFileGetPrintnameC(data)); seqstr = ajStrNew(); ajReadlineTrim(data,&seqstr); if(ajStrFindC(seqstr,"PEPSTATS") == 0) { makeprotseq_parse_pepstats(&list,data); } else { ajWarn("Not pepstats file. Making completely random sequences."); makeprotseq_default_chars(&list); } ajStrDel(&seqstr); ajFileClose(&data); } else makeprotseq_default_chars(&list); /* if insert, make sure type is correct */ /* typecheking code is not working, uncomment and test after it is if(ajStrGetLen(insert)) { seqstr = ajStrNew(); if(prot) ajStrAssignC(&seqstr,"pureprotein"); if(!ajSeqTypeCheckS(&insert,seqstr)) ajFatal("Insert not the same sequence type as sequence itself."); ajStrDel(&seqstr); } */ /* array allows fast creation of a sequences */ scmax = (ajuint) ajListstrToarray(list,&seqr); if(!scmax) ajFatal("No strings in list. 
No characters to make the sequence."); ajDebug("Distribution array done.\nscmax '%d', extra '%d', first '%S'\n", scmax,extra,seqr[0]); ajRandomSeed(); while(amount-- > 0) { seqstr = makeprotseq_random_sequence(seqr,scmax,length); if(ajStrGetLen(insert)) ajStrInsertS(&seqstr,start,insert); ajStrFmtLower(&seqstr); seq = ajSeqNew(); ajSeqAssignSeqS(seq, seqstr); ajSeqSetProt(seq); ajSeqoutWriteSeq(outseq, seq); ajSeqDel(&seq); ajStrDel(&seqstr); } ajSeqoutClose(outseq); ajSeqoutDel(&outseq); ajListstrFreeData(&list); ajStrDel(&insert); AJFREE(seqr); embExit(); return 0; }