/*
** Read one EMBL-style entry from inf, one trimmed line at a time, stopping
** after the line that starts with "//".
**
** The ID line sets entry->fpos (the position reported by ajFileResetPos
** just before that line was read) and entry->id (second token, trailing
** ';' removed). Other line types are handed to the embBtreeEmbl* helpers
** when the matching file-level field (svfield, accfield, keyfield,
** desfield, orgfield) is non-NULL.
**
** Returns ajTrue when the terminator line was reached, ajFalse when a read
** failed first.
*/
static AjBool dbxflat_ParseEmbl(EmbPBtreeEntry entry, AjPFile inf)
{
    AjPStr rec    = NULL;
    ajlong recpos = 0L;
    AjBool ok     = ajTrue;

    rec = ajStrNewC("");

    do
    {
        /* position must be captured before the line is consumed */
        recpos = ajFileResetPos(inf);

        if(!ajReadlineTrim(inf, &rec))
        {
            ok = ajFalse;
            break;
        }

        if(ajStrPrefixC(rec, "ID"))
        {
            entry->fpos = recpos;
            ajFmtScanS(rec, "%*S%S", &entry->id);
            ajStrTrimEndC(&entry->id, ";");

            if(svfield)
                embBtreeEmblSV(rec, svfield->data);
        }

        /* "IV" is the emblcds database format equivalent of "SV" */
        if(svfield && (ajStrPrefixC(rec, "SV") || ajStrPrefixC(rec, "IV")))
            embBtreeEmblAC(rec, svfield->data);

        /* "PA" is the emblcds database format equivalent of "AC" */
        if(accfield && (ajStrPrefixC(rec, "AC") || ajStrPrefixC(rec, "PA")))
            embBtreeEmblAC(rec, accfield->data);

        if(keyfield && ajStrPrefixC(rec, "KW"))
            embBtreeEmblKW(rec, keyfield->data, keyfield->len);

        if(desfield && ajStrPrefixC(rec, "DE"))
            embBtreeEmblDE(rec, desfield->data, desfield->len);

        if(orgfield && (ajStrPrefixC(rec, "OC") || ajStrPrefixC(rec, "OS")))
            embBtreeEmblTX(rec, orgfield->data, orgfield->len);
    }
    while(!ajStrPrefixC(rec, "//"));

    ajStrDel(&rec);

    return ok;
}
/*
** Read one EMBL-style entry from inf, one trimmed line at a time, stopping
** after the line that starts with "//".
**
** The ID line sets entry->fpos (the position reported by ajFileResetPos
** just before that line was read) and entry->id (second token, trailing
** ';' removed). Other line types are handed to the embBtreeEmbl* helpers
** when the corresponding entry->do_* flag is set, filling entry->sv,
** entry->ac, entry->kw, entry->de and entry->tx.
**
** Returns ajTrue when the terminator line was reached, ajFalse when a read
** failed first.
*/
static AjBool dbxflat_ParseEmbl(EmbPBtreeEntry entry, AjPFile inf)
{
    AjPStr rec    = NULL;
    ajlong recpos = 0L;
    AjBool ok     = ajTrue;

    rec = ajStrNewC("");

    do
    {
        /* position must be captured before the line is consumed */
        recpos = ajFileResetPos(inf);

        if(!ajReadlineTrim(inf, &rec))
        {
            ok = ajFalse;
            break;
        }

        if(ajStrPrefixC(rec, "ID"))
        {
            entry->fpos = recpos;
            ajFmtScanS(rec, "%*S%S", &entry->id);
            ajStrTrimEndC(&entry->id, ";");

            if(entry->do_sv)
                embBtreeEmblSV(rec, entry->sv);
        }

        /* "IV" is the emblcds database format equivalent of "SV" */
        if(entry->do_sv &&
           (ajStrPrefixC(rec, "SV") || ajStrPrefixC(rec, "IV")))
            embBtreeEmblAC(rec, entry->sv);

        /* "PA" is the emblcds database format equivalent of "AC" */
        if(entry->do_accession &&
           (ajStrPrefixC(rec, "AC") || ajStrPrefixC(rec, "PA")))
            embBtreeEmblAC(rec, entry->ac);

        if(entry->do_keyword && ajStrPrefixC(rec, "KW"))
            embBtreeEmblKW(rec, entry->kw, entry->kwlen);

        if(entry->do_description && ajStrPrefixC(rec, "DE"))
            embBtreeEmblDE(rec, entry->de, entry->delen);

        if(entry->do_taxonomy &&
           (ajStrPrefixC(rec, "OC") || ajStrPrefixC(rec, "OS")))
            embBtreeEmblTX(rec, entry->tx, entry->txlen);
    }
    while(!ajStrPrefixC(rec, "//"));

    ajStrDel(&rec);

    return ok;
}
static AjBool dbifasta_ParseFasta(AjPFile libr, ajint* dpos, ajint* maxFieldLen, ajuint* countfield, AjPRegexp idexp, ajuint usertype, AjPFile* alistfile, AjBool systemsort, AjPStr const * fields) { char* fd; ajlong ipos; static AjPStr tstr = NULL; static ajint numFields; static ajint accfield = -1; static ajint desfield = -1; static ajint svnfield = -1; static AjBool reset = AJTRUE; ajuint type = usertype; if(!fields) { reset = ajTrue; accfield = svnfield = desfield = -1; return ajFalse; } if(reset) { numFields = 0; while(fields[numFields]) { if(ajStrMatchCaseC(fields[numFields], "acc")) accfield=numFields; else if(ajStrMatchCaseC(fields[numFields], "sv")) svnfield=numFields; else if(ajStrMatchCaseC(fields[numFields], "des")) desfield=numFields; else ajWarn("EMBL parsing unknown field '%S' ignored", fields[numFields]); numFields++; } reset = ajFalse; } if(!dbifastaGWrdexp) dbifastaGWrdexp = ajRegCompC("([A-Za-z0-9]+)"); if(!tstr) tstr = ajStrNew(); *dpos = (ajint) ajFileResetPos(libr); /* Lossy cast */ ajReadline(libr, &dbifastaGRline); if(!ajStrGetLen(dbifastaGRline)) return ajFalse; if(!ajRegExec(idexp,dbifastaGRline)) { ajStrDelStatic(&dbifastaGTmpAc); type = FASTATYPE_SIMPLE; idexp = dbifastaGIdexp; if(!ajRegExec(idexp, dbifastaGRline)) { ajFatal("Unrecognised ID line format: %S", dbifastaGRline); return ajFalse; } ajWarn("Invalid ID line for selected format: %S", dbifastaGRline); } /* ** each case needs to set id, tmpac, tmpsv, tmpdes ** using empty values if they are not found */ ajStrAssignC(&dbifastaGTmpSv, ""); ajStrAssignC(&dbifastaGTmpGi, ""); ajStrAssignC(&dbifastaGTmpDb, ""); ajStrAssignC(&dbifastaGTmpDes, ""); ajStrAssignC(&dbifastaGTmpAc, ""); ajStrAssignC(&dbifastaGTmpId, ""); switch(type) { case FASTATYPE_SIMPLE: ajRegSubI(idexp,2,&dbifastaGTmpId); ajStrAssignS(&dbifastaGTmpAc,dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_DBID: ajRegSubI(idexp,1,&dbifastaGTmpId); ajStrAssignS(&dbifastaGTmpAc,dbifastaGTmpId); 
ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_GCGID: ajRegSubI(idexp,1,&dbifastaGTmpId); ajStrAssignS(&dbifastaGTmpAc,dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_NCBI: if(!ajSeqParseNcbi(dbifastaGRline, &dbifastaGTmpId, &dbifastaGTmpAc, &dbifastaGTmpSv, &dbifastaGTmpGi, &dbifastaGTmpDb, &dbifastaGTmpDes)) { ajStrDelStatic(&dbifastaGTmpAc); return ajFalse; } break; case FASTATYPE_GCGIDACC: ajRegSubI(idexp,1,&dbifastaGTmpId); ajRegSubI(idexp,2,&dbifastaGTmpAc); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_GCGACCID: ajRegSubI(idexp,1,&dbifastaGTmpAc); ajRegSubI(idexp,2,&dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_IDACC: ajRegSubI(idexp,1,&dbifastaGTmpId); ajRegSubI(idexp,2,&dbifastaGTmpAc); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_ACCID: ajRegSubI(idexp,1,&dbifastaGTmpAc); ajRegSubI(idexp,2,&dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; default: ajStrDelStatic(&dbifastaGTmpAc); return ajFalse; } ajStrFmtUpper(&dbifastaGTmpId); ajStrFmtUpper(&dbifastaGTmpAc); if(accfield >= 0) embDbiMaxlen(&dbifastaGTmpAc, &maxFieldLen[accfield]); if(svnfield >= 0) { embDbiMaxlen(&dbifastaGTmpSv, &maxFieldLen[svnfield]); embDbiMaxlen(&dbifastaGTmpGi, &maxFieldLen[svnfield]); } if(systemsort) { if(accfield >= 0 && ajStrGetLen(dbifastaGTmpAc)) { countfield[accfield]++; ajFmtPrintF(alistfile[accfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpAc); } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpSv)) { countfield[svnfield]++; ajFmtPrintF(alistfile[svnfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpSv); } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpGi)) { countfield[svnfield]++; ajFmtPrintF(alistfile[svnfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpGi); } if(desfield >= 0 && ajStrGetLen(dbifastaGTmpDes)) while(ajRegExec(dbifastaGWrdexp, dbifastaGTmpDes)) { ajRegSubI(dbifastaGWrdexp, 1, &dbifastaGTmpFd); embDbiMaxlen(&dbifastaGTmpFd, &maxFieldLen[desfield]); 
ajStrFmtUpper(&dbifastaGTmpFd); ajDebug("++des '%S' tmpdes '%S\n", dbifastaGTmpFd, dbifastaGTmpDes); countfield[desfield]++; ajFmtPrintF(alistfile[desfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpFd); ajRegPost(dbifastaGWrdexp, &dbifastaGTmpDes); } } else { if(accfield >= 0 && ajStrGetLen(dbifastaGTmpAc)) { fd = ajCharNewS(dbifastaGTmpAc); ajListPushAppend(dbifastaGFdl[accfield],fd); countfield[accfield]++; } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpSv)) { fd = ajCharNewS(dbifastaGTmpSv); ajListPushAppend(dbifastaGFdl[svnfield], fd); countfield[svnfield]++; } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpGi)) { fd = ajCharNewS(dbifastaGTmpGi); ajListPushAppend(dbifastaGFdl[svnfield], fd); countfield[svnfield]++; } if(desfield >= 0 && ajStrGetLen(dbifastaGTmpDes)) while(ajRegExec(dbifastaGWrdexp, dbifastaGTmpDes)) { ajRegSubI(dbifastaGWrdexp, 1, &dbifastaGTmpFd); embDbiMaxlen(&dbifastaGTmpFd, &maxFieldLen[desfield]); ajStrFmtUpper(&dbifastaGTmpFd); ajDebug("++des '%S' tmpdes: '%S'\n", dbifastaGTmpFd, dbifastaGTmpDes); fd = ajCharNewS(dbifastaGTmpFd); ajListPushAppend(dbifastaGFdl[desfield], fd); countfield[desfield]++; ajRegPost(dbifastaGWrdexp, &dbifastaGTmpDes); } } ipos = ajFileResetPos(libr); while(ajReadline(libr, &dbifastaGRline)) { if(ajStrGetCharFirst(dbifastaGRline) == '>') { ajFileSeek(libr, ipos, 0); return ajTrue; } ipos = ajFileResetPos(libr); } ajFileSeek(libr, ipos, 0); /* end of file reached */ return ajTrue; }
/*
** Read one FASTQ record from inf.
**
** The first line must start with '@'. Its position is stored in
** entry->fpos; the text after '@' is split by ajStrExtractFirst into
** entry->id and a description. When the file-level desfield is set, each
** description word matching dbxflat_wrdexp is copied and pushed onto
** desfield->data (the list takes ownership of the copies).
**
** Sequence lines are then consumed up to the line starting with '+', and
** quality lines are consumed until their total length reaches the
** sequence length or a read fails.
**
** Returns ajFalse when the first line cannot be read, does not start with
** '@', or no '+' line is found; ajTrue otherwise. All temporary strings
** are released on every return path.
*/
static AjBool dbxflat_ParseFastq(EmbPBtreeEntry entry, AjPFile inf)
{
    AjPStr line  = NULL;
    AjPStr tmpfd = NULL;
    AjPStr str   = NULL;
    AjPStr de    = NULL;
    ajlong pos   = 0L;
    ajuint seqlen = 0;
    ajuint qlen   = 0;
    AjBool ok;
    AjBool ret = ajFalse;

    if(!dbxflat_wrdexp)
        dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9.:=]+)");

    line = ajStrNewC("");

    pos = ajFileResetPos(inf);

    /* first line of entry */
    if(ajReadlineTrim(inf, &line) && ajStrPrefixC(line, "@"))
    {
        entry->fpos = pos;
        ajStrCutStart(&line, 1);
        ajStrExtractFirst(line, &de, &entry->id);

        if(desfield && ajStrGetLen(de))
        {
            while(ajRegExec(dbxflat_wrdexp, de))
            {
                ajRegSubI(dbxflat_wrdexp, 1, &tmpfd);
                str = ajStrNew();
                ajStrAssignS(&str, tmpfd);
                ajListPush(desfield->data, (void *)str);
                ajRegPost(dbxflat_wrdexp, &de);
            }
        }

        /* now read sequence */
        ok = ajReadlineTrim(inf, &line);

        while(ok && !ajStrPrefixC(line, "+"))
        {
            ajStrRemoveWhite(&line);
            seqlen += MAJSTRGETLEN(line);
            ok = ajReadlineTrim(inf, &line);
        }

        if(ok)
        {
            /* quality lines: stop once as many characters as sequence */
            ok = ajReadlineTrim(inf, &line);

            while(ok)
            {
                qlen += MAJSTRGETLEN(line);

                if(qlen < seqlen)
                    ok = ajReadlineTrim(inf, &line);
                else
                    ok = ajFalse;
            }

            ret = ajTrue;
        }
    }

    /* single cleanup point so failure paths do not leak the work strings */
    ajStrDel(&de);
    ajStrDel(&tmpfd);
    ajStrDel(&line);

    return ret;
}
/*
** Read one GenBank-style entry from inf, up to the line starting with "//".
**
** LOCUS sets entry->fpos and entry->id (second token). VERSION and
** ACCESSION lines are passed to embBtreeGenBankAC. KEYWORDS, DEFINITION
** and SOURCE blocks are gathered together with their continuation lines
** (lines whose first character is a space) into one summary string before
** being passed to the matching embBtreeGenBank* helper. Each line type is
** only processed when the corresponding file-level field (svfield,
** accfield, keyfield, desfield, orgfield) is non-NULL.
**
** After a multi-line block the current line already holds the first line
** following the block, so the loop continues without reading again.
**
** Returns ajFalse if a read failed before the terminator was seen.
*/
static AjBool dbxflat_ParseGenbank(EmbPBtreeEntry entry, AjPFile inf)
{
    AjPStr line    = NULL;
    AjPStr sumline = NULL;
    ajlong pos     = 0L;
    AjBool ret     = ajTrue;

    line    = ajStrNewC("");
    sumline = ajStrNew();

    while(!ajStrPrefixC(line, "//") && ret)
    {
        if(ajStrPrefixC(line, "LOCUS"))
        {
            entry->fpos = pos;
            ajFmtScanS(line, "%*S%S", &entry->id);
        }

        if(svfield && ajStrPrefixC(line, "VERSION"))
            embBtreeGenBankAC(line, svfield->data);

        if(accfield && ajStrPrefixC(line, "ACCESSION"))
            embBtreeGenBankAC(line, accfield->data);

        if(keyfield && ajStrPrefixC(line, "KEYWORDS"))
        {
            ajStrAssignS(&sumline, line);
            ret = ajReadlineTrim(inf, &line);

            while(ret && *MAJSTRGETPTR(line) == ' ')
            {
                ajStrAppendS(&sumline, line);
                ret = ajReadlineTrim(inf, &line);
            }

            ajStrRemoveWhiteExcess(&sumline);
            embBtreeGenBankKW(sumline, keyfield->data, keyfield->len);

            continue;
        }

        if(desfield && ajStrPrefixC(line, "DEFINITION"))
        {
            ajStrAssignS(&sumline, line);
            ret = ajReadlineTrim(inf, &line);

            while(ret && *MAJSTRGETPTR(line) == ' ')
            {
                ajStrAppendS(&sumline, line);
                ret = ajReadlineTrim(inf, &line);
            }

            ajStrRemoveWhiteExcess(&sumline);
            embBtreeGenBankDE(sumline, desfield->data, desfield->len);

            continue;
        }

        if(orgfield && ajStrPrefixC(line, "SOURCE"))
        {
            /*
            ** Start from an empty summary: without this, text left over
            ** from a preceding KEYWORDS or DEFINITION block would be
            ** handed to the taxonomy indexer as well.
            */
            ajStrAssignC(&sumline, "");

            ret = ajReadlineTrim(inf, &line);
            ajStrAppendC(&line, ";");

            while(ret && *MAJSTRGETPTR(line) == ' ')
            {
                ajStrAppendS(&sumline, line);
                ret = ajReadlineTrim(inf, &line);
            }

            ajStrRemoveWhiteExcess(&sumline);
            embBtreeGenBankTX(sumline, orgfield->data, orgfield->len);

            continue;
        }

        pos = ajFileResetPos(inf);

        if(!ajReadlineTrim(inf, &line))
            ret = ajFalse;
    }

    ajStrDel(&line);
    ajStrDel(&sumline);

    return ret;
}
/*
** Read one GenBank-style entry from inf, up to the line starting with "//".
**
** LOCUS sets entry->fpos and entry->id (second token). VERSION and
** ACCESSION lines are passed to embBtreeGenBankAC. KEYWORDS, DEFINITION
** and SOURCE blocks are gathered together with their continuation lines
** (lines whose first character is a space) into one summary string before
** being passed to the matching embBtreeGenBank* helper. Each line type is
** only processed when the corresponding entry->do_* flag is set; results
** go to entry->sv, entry->ac, entry->kw, entry->de and entry->tx.
**
** After a multi-line block the current line already holds the first line
** following the block, so the loop continues without reading again.
**
** Returns ajFalse if a read failed before the terminator was seen.
*/
static AjBool dbxflat_ParseGenbank(EmbPBtreeEntry entry, AjPFile inf)
{
    AjPStr line    = NULL;
    AjPStr sumline = NULL;
    ajlong pos     = 0L;
    AjBool ret     = ajTrue;

    line    = ajStrNewC("");
    sumline = ajStrNew();

    while(!ajStrPrefixC(line, "//") && ret)
    {
        if(ajStrPrefixC(line, "LOCUS"))
        {
            entry->fpos = pos;
            ajFmtScanS(line, "%*S%S", &entry->id);
        }

        if(entry->do_sv && ajStrPrefixC(line, "VERSION"))
            embBtreeGenBankAC(line, entry->sv);

        if(entry->do_accession && ajStrPrefixC(line, "ACCESSION"))
            embBtreeGenBankAC(line, entry->ac);

        if(entry->do_keyword && ajStrPrefixC(line, "KEYWORDS"))
        {
            ajStrAssignS(&sumline, line);
            ret = ajReadlineTrim(inf, &line);

            while(ret && *MAJSTRGETPTR(line) == ' ')
            {
                ajStrAppendS(&sumline, line);
                ret = ajReadlineTrim(inf, &line);
            }

            ajStrRemoveWhiteExcess(&sumline);
            embBtreeGenBankKW(sumline, entry->kw, entry->kwlen);

            continue;
        }

        if(entry->do_description && ajStrPrefixC(line, "DEFINITION"))
        {
            ajStrAssignS(&sumline, line);
            ret = ajReadlineTrim(inf, &line);

            while(ret && *MAJSTRGETPTR(line) == ' ')
            {
                ajStrAppendS(&sumline, line);
                ret = ajReadlineTrim(inf, &line);
            }

            ajStrRemoveWhiteExcess(&sumline);
            embBtreeGenBankDE(sumline, entry->de, entry->delen);

            continue;
        }

        if(entry->do_taxonomy && ajStrPrefixC(line, "SOURCE"))
        {
            /*
            ** Start from an empty summary: without this, text left over
            ** from a preceding KEYWORDS or DEFINITION block would be
            ** handed to the taxonomy indexer as well.
            */
            ajStrAssignC(&sumline, "");

            ret = ajReadlineTrim(inf, &line);
            ajStrAppendC(&line, ";");

            while(ret && *MAJSTRGETPTR(line) == ' ')
            {
                ajStrAppendS(&sumline, line);
                ret = ajReadlineTrim(inf, &line);
            }

            ajStrRemoveWhiteExcess(&sumline);
            embBtreeGenBankTX(sumline, entry->tx, entry->txlen);

            continue;
        }

        pos = ajFileResetPos(inf);

        if(!ajReadlineTrim(inf, &line))
            ret = ajFalse;
    }

    ajStrDel(&line);
    ajStrDel(&sumline);

    return ret;
}
int main(int argc, char **argv) { AjPFile infdat = NULL; AjPFile infdoc = NULL; AjPFile outf = NULL; AjPFile outs = NULL; AjBool haspattern; const char *p; AjPStr line = NULL; AjPStr text = NULL; AjPStr dirname = NULL; AjPStr filename = NULL; AjPStr id = NULL; AjPStr ac = NULL; AjPStr de = NULL; AjPStr pa = NULL; AjPStr ps = NULL; AjPStr fn = NULL; AjPStr re = NULL; AjPStr fname = NULL; AjBool flag; AjBool isopen; AjBool goback; ajlong storepos = 0L; embInit("prosextract", argc, argv); dirname = ajAcdGetDirectoryName("prositedir"); line = ajStrNew(); text = ajStrNew(); id = ajStrNew(); ac = ajStrNew(); de = ajStrNew(); pa = ajStrNew(); ps = ajStrNew(); fn=ajStrNew(); ajStrAssignS(&fn,dirname); ajStrAppendC(&fn,"prosite.dat"); if(!(infdat=ajFileNewInNameS(fn))) ajFatal("Cannot open file %S",fn); ajStrDel(&fn); fn=ajStrNewC("PROSITE/prosite.lines"); outf = ajDatafileNewOutNameS(fn); ajStrDel(&fn); haspattern = ajFalse; while(ajReadlineTrim(infdat, &line) ) { if(ajStrPrefixC(line, "ID")) { if(ajStrSuffixC(line,"PATTERN.")) { haspattern = ajTrue; /*save id*/ p = ajStrGetPtr(line); p = ajSysFuncStrtok(p," \t;"); p = ajSysFuncStrtok(NULL," \t;"); ajStrAssignC(&id,p); ajFmtPrintF(outf, "%S ", id); continue; } else { haspattern = ajFalse; continue; } } if(!haspattern) continue; if(ajStrPrefixC(line, "AC") ) { p = ajStrGetPtr(line); p = ajSysFuncStrtok(p, " \t;"); p = ajSysFuncStrtok(NULL, " \t;"); ajStrAssignC(&ac,p); ajFmtPrintF(outf, "%S\n ", ac); continue; } if(ajStrPrefixC(line, "DE") ) { p = ajStrGetPtr(line); p = ajSysFuncStrtok(p, " \t."); p = ajSysFuncStrtok(NULL, " \t."); ajStrAssignC(&de,p); ajFmtPrintF(outf, "%S\n ", de); continue; } if(ajStrPrefixC(line, "PA")) { ajStrAssignC(&pa,""); while(ajStrPrefixC(line,"PA")) { p = ajStrGetPtr(line); p = ajSysFuncStrtok(p, " \t."); p = ajSysFuncStrtok(NULL, " \t."); ajStrAppendC(&pa,p); ajReadlineTrim(infdat, &line); } ajFmtPrintF(outf, "%S\n", pa); re = embPatPrositeToRegExp(pa); ajFmtPrintF(outf, "^%S\n\n", re); 
ajStrDel(&re); continue; } } /* Finished processing prosite.dat so look at prosite.doc */ fn = ajStrNew(); ajStrAssignS(&fn,dirname); ajStrAppendC(&fn,"prosite.doc"); if(!(infdoc=ajFileNewInNameS(fn))) ajFatal("Cannot open file %S",fn); ajStrDel(&fn); fname = ajStrNewC("PROSITE/"); flag = ajFalse; isopen = ajFalse; goback = ajFalse; while(ajReadlineTrim(infdoc, &text)) { if(ajStrPrefixC(text, "{PS") && isopen && !goback) goback = ajTrue; if(ajStrPrefixC(text, "{PS") && !isopen) { storepos = ajFileResetPos(infdoc); /* save out the documentation text to acc numbered outfiles . */ p = ajStrGetPtr(text)+1; p = ajSysFuncStrtok(p, ";"); ajStrAssignS(&filename, fname); ajStrAppendC(&filename, p); outs = ajDatafileNewOutNameS(filename); flag = ajTrue; isopen = ajTrue; continue; } if(ajStrPrefixC(text, "{BEGIN}") && flag) { while(ajReadlineTrim(infdoc, &text)) { if(ajStrPrefixC(text,"{END}")) break; ajFmtPrintF(outs, "%S\n", text); } ajFileClose(&outs); isopen = ajFalse; if(goback) { goback = ajFalse; ajFileSeek(infdoc,storepos,0); } } } ajStrDel(&line); ajStrDel(&text); ajStrDel(&dirname); ajStrDel(&filename); ajStrDel(&id); ajStrDel(&ac); ajStrDel(&de); ajStrDel(&pa); ajStrDel(&re); ajStrDel(&ps); ajStrDel(&fname); ajFileClose(&infdat); ajFileClose(&infdoc); ajFileClose(&outf); embExit(); return 0; }