static AjBool dbiblast_parseNcbi(const AjPStr line, AjPFile * alistfile, AjBool systemsort, AjPStr const * fields, ajint* maxFieldLen, ajuint* countfield, AjPStr* myid, AjPList* fdlist) { char* fd; static ajint numFields; static ajint accfield = -1; static ajint desfield = -1; static ajint svnfield = -1; static AjBool reset = AJTRUE; if(!fields) { reset = ajTrue; accfield = svnfield = desfield = -1; return ajFalse; } if(reset) { numFields = 0; while(fields[numFields]) { if(ajStrMatchCaseC(fields[numFields], "acc")) accfield=numFields; else if(ajStrMatchCaseC(fields[numFields], "sv")) svnfield=numFields; else if(ajStrMatchCaseC(fields[numFields], "des")) desfield=numFields; else ajWarn("EMBL parsing unknown field '%S' ignored", fields[numFields]); numFields++; } reset = ajFalse; } if(!wrdexp) wrdexp = ajRegCompC("([A-Za-z0-9]+)"); ajStrAssignC(&tmpdes,""); ajStrAssignC(&t,""); ajStrAssignC(&tmpac,""); ajStrAssignC(&tmpsv,""); ajStrAssignC(&tmpgi,""); ajStrAssignC(&tmpdb,""); ajFmtPrintS(&t,">%S",line); if(!ajSeqParseNcbi(t,myid,&tmpac,&tmpsv,&tmpgi,&tmpdb,&tmpdes)) return ajFalse; if(ajStrGetLen(tmpac)) ajStrFmtUpper(&tmpac); if(accfield >= 0) embDbiMaxlen(&tmpac, &maxFieldLen[accfield]); if(svnfield >= 0) { embDbiMaxlen(&tmpsv, &maxFieldLen[svnfield]); embDbiMaxlen(&tmpgi, &maxFieldLen[svnfield]); } ajStrFmtUpper(myid); /* ajDebug("parseNCBI success\n"); */ if(systemsort) { if(accfield >= 0 && ajStrGetLen(tmpac)) { countfield[accfield]++; ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, tmpac); } if(svnfield >= 0 && ajStrGetLen(tmpsv)) { countfield[svnfield]++; ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, tmpsv); } if(svnfield >= 0 && ajStrGetLen(tmpgi)) { countfield[svnfield]++; ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, tmpgi); } if(desfield >= 0 && ajStrGetLen(tmpdes)) while(ajRegExec(wrdexp, tmpdes)) { ajRegSubI(wrdexp, 1, &tmpfd); embDbiMaxlen(&tmpfd, &maxFieldLen[desfield]); ajStrFmtUpper(&tmpfd); ajDebug("++des '%S'\n", tmpfd); countfield[desfield]++; ajFmtPrintF(alistfile[desfield], "%S %S\n", *myid, tmpfd); ajRegPost(wrdexp, &tmpdes); } } else { if(accfield >= 0 && ajStrGetLen(tmpac)) { fd = ajCharNewS(tmpac); countfield[accfield]++; ajListPushAppend(fdlist[accfield], fd); } if(svnfield >= 0 && ajStrGetLen(tmpsv)) { fd = ajCharNewS(tmpsv); countfield[svnfield]++; ajListPushAppend(fdlist[svnfield], fd); } if(svnfield >= 0 && ajStrGetLen(tmpgi)) { fd = ajCharNewS(tmpgi); ajListPushAppend(fdlist[svnfield], fd); } if(desfield >= 0 && ajStrGetLen(tmpdes)) { while(ajRegExec(wrdexp, tmpdes)) { ajRegSubI(wrdexp, 1, &tmpfd); embDbiMaxlen(&tmpfd, &maxFieldLen[desfield]); ajStrFmtUpper(&tmpfd); ajDebug("++des '%S'\n", tmpfd); fd = ajCharNewS(tmpfd); countfield[desfield]++; ajListPushAppend(fdlist[desfield], fd); ajRegPost(wrdexp, &tmpdes); } } } /* ajDebug("parseNCBI '%S' '%S'\n", *myid, tmpac); */ return ajTrue; }
static AjBool dbiblast_parseSimple(const AjPStr line, AjPFile * alistfile, AjBool systemsort, AjPStr const * fields, ajint* maxFieldLen, ajuint* countfield, AjPStr* myid, AjPList* myfdl) { static AjPRegexp idexp = NULL; static AjPStr mytmpac = NULL; char* ac; static ajint numFields; static ajint accfield = -1; static AjBool reset = AJTRUE; if(!fields) { reset = ajTrue; accfield = -1; return ajFalse; } if(reset) { numFields = 0; while(fields[numFields]) { if(ajStrMatchCaseC(fields[numFields], "acc")) accfield=numFields; else if(!ajStrMatchCaseC(fields[numFields], "sv") && !ajStrMatchCaseC(fields[numFields], "des")) ajWarn("Simple ID parsing unknown field '%S' ignored", fields[numFields]); numFields++; } reset = ajFalse; } if(!idexp) idexp = ajRegCompC("^([^ ]+)( +([A-Za-z][A-Za-z0-9]+[0-9]))"); if(!ajRegExec(idexp, line)) return ajFalse; ajRegSubI(idexp, 1, myid); ajRegSubI(idexp, 3, &mytmpac); ajStrFmtUpper(myid); ajStrFmtUpper(&mytmpac); /* GCG mixes case on new SwissProt acnums */ if(accfield >= 0) { embDbiMaxlen(&mytmpac, &maxFieldLen[accfield]); countfield[accfield]++; if(systemsort) ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, mytmpac); else { ac = ajCharNewS(mytmpac); ajListPushAppend(myfdl[accfield], ac); } } ajDebug("parseSimple '%S' '%S'\n", *myid, mytmpac); return ajTrue; }
static AjBool dbifasta_ParseFasta(AjPFile libr, ajint* dpos, ajint* maxFieldLen, ajuint* countfield, AjPRegexp idexp, ajuint usertype, AjPFile* alistfile, AjBool systemsort, AjPStr const * fields) { char* fd; ajlong ipos; static AjPStr tstr = NULL; static ajint numFields; static ajint accfield = -1; static ajint desfield = -1; static ajint svnfield = -1; static AjBool reset = AJTRUE; ajuint type = usertype; if(!fields) { reset = ajTrue; accfield = svnfield = desfield = -1; return ajFalse; } if(reset) { numFields = 0; while(fields[numFields]) { if(ajStrMatchCaseC(fields[numFields], "acc")) accfield=numFields; else if(ajStrMatchCaseC(fields[numFields], "sv")) svnfield=numFields; else if(ajStrMatchCaseC(fields[numFields], "des")) desfield=numFields; else ajWarn("EMBL parsing unknown field '%S' ignored", fields[numFields]); numFields++; } reset = ajFalse; } if(!dbifastaGWrdexp) dbifastaGWrdexp = ajRegCompC("([A-Za-z0-9]+)"); if(!tstr) tstr = ajStrNew(); *dpos = (ajint) ajFileResetPos(libr); /* Lossy cast */ ajReadline(libr, &dbifastaGRline); if(!ajStrGetLen(dbifastaGRline)) return ajFalse; if(!ajRegExec(idexp,dbifastaGRline)) { ajStrDelStatic(&dbifastaGTmpAc); type = FASTATYPE_SIMPLE; idexp = dbifastaGIdexp; if(!ajRegExec(idexp, dbifastaGRline)) { ajFatal("Unrecognised ID line format: %S", dbifastaGRline); return ajFalse; } ajWarn("Invalid ID line for selected format: %S", dbifastaGRline); } /* ** each case needs to set id, tmpac, tmpsv, tmpdes ** using empty values if they are not found */ ajStrAssignC(&dbifastaGTmpSv, ""); ajStrAssignC(&dbifastaGTmpGi, ""); ajStrAssignC(&dbifastaGTmpDb, ""); ajStrAssignC(&dbifastaGTmpDes, ""); ajStrAssignC(&dbifastaGTmpAc, ""); ajStrAssignC(&dbifastaGTmpId, ""); switch(type) { case FASTATYPE_SIMPLE: ajRegSubI(idexp,2,&dbifastaGTmpId); ajStrAssignS(&dbifastaGTmpAc,dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_DBID: ajRegSubI(idexp,1,&dbifastaGTmpId); ajStrAssignS(&dbifastaGTmpAc,dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_GCGID: ajRegSubI(idexp,1,&dbifastaGTmpId); ajStrAssignS(&dbifastaGTmpAc,dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_NCBI: if(!ajSeqParseNcbi(dbifastaGRline, &dbifastaGTmpId, &dbifastaGTmpAc, &dbifastaGTmpSv, &dbifastaGTmpGi, &dbifastaGTmpDb, &dbifastaGTmpDes)) { ajStrDelStatic(&dbifastaGTmpAc); return ajFalse; } break; case FASTATYPE_GCGIDACC: ajRegSubI(idexp,1,&dbifastaGTmpId); ajRegSubI(idexp,2,&dbifastaGTmpAc); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_GCGACCID: ajRegSubI(idexp,1,&dbifastaGTmpAc); ajRegSubI(idexp,2,&dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_IDACC: ajRegSubI(idexp,1,&dbifastaGTmpId); ajRegSubI(idexp,2,&dbifastaGTmpAc); ajRegPost(idexp, &dbifastaGTmpDes); break; case FASTATYPE_ACCID: ajRegSubI(idexp,1,&dbifastaGTmpAc); ajRegSubI(idexp,2,&dbifastaGTmpId); ajRegPost(idexp, &dbifastaGTmpDes); break; default: ajStrDelStatic(&dbifastaGTmpAc); return ajFalse; } ajStrFmtUpper(&dbifastaGTmpId); ajStrFmtUpper(&dbifastaGTmpAc); if(accfield >= 0) embDbiMaxlen(&dbifastaGTmpAc, &maxFieldLen[accfield]); if(svnfield >= 0) { embDbiMaxlen(&dbifastaGTmpSv, &maxFieldLen[svnfield]); embDbiMaxlen(&dbifastaGTmpGi, &maxFieldLen[svnfield]); } if(systemsort) { if(accfield >= 0 && ajStrGetLen(dbifastaGTmpAc)) { countfield[accfield]++; ajFmtPrintF(alistfile[accfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpAc); } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpSv)) { countfield[svnfield]++; ajFmtPrintF(alistfile[svnfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpSv); } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpGi)) { countfield[svnfield]++; ajFmtPrintF(alistfile[svnfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpGi); } if(desfield >= 0 && ajStrGetLen(dbifastaGTmpDes)) while(ajRegExec(dbifastaGWrdexp, dbifastaGTmpDes)) { ajRegSubI(dbifastaGWrdexp, 1, &dbifastaGTmpFd); embDbiMaxlen(&dbifastaGTmpFd, &maxFieldLen[desfield]); ajStrFmtUpper(&dbifastaGTmpFd); ajDebug("++des '%S' tmpdes '%S\n", dbifastaGTmpFd, dbifastaGTmpDes); countfield[desfield]++; ajFmtPrintF(alistfile[desfield], "%S %S\n", dbifastaGTmpId, dbifastaGTmpFd); ajRegPost(dbifastaGWrdexp, &dbifastaGTmpDes); } } else { if(accfield >= 0 && ajStrGetLen(dbifastaGTmpAc)) { fd = ajCharNewS(dbifastaGTmpAc); ajListPushAppend(dbifastaGFdl[accfield],fd); countfield[accfield]++; } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpSv)) { fd = ajCharNewS(dbifastaGTmpSv); ajListPushAppend(dbifastaGFdl[svnfield], fd); countfield[svnfield]++; } if(svnfield >= 0 && ajStrGetLen(dbifastaGTmpGi)) { fd = ajCharNewS(dbifastaGTmpGi); ajListPushAppend(dbifastaGFdl[svnfield], fd); countfield[svnfield]++; } if(desfield >= 0 && ajStrGetLen(dbifastaGTmpDes)) while(ajRegExec(dbifastaGWrdexp, dbifastaGTmpDes)) { ajRegSubI(dbifastaGWrdexp, 1, &dbifastaGTmpFd); embDbiMaxlen(&dbifastaGTmpFd, &maxFieldLen[desfield]); ajStrFmtUpper(&dbifastaGTmpFd); ajDebug("++des '%S' tmpdes: '%S'\n", dbifastaGTmpFd, dbifastaGTmpDes); fd = ajCharNewS(dbifastaGTmpFd); ajListPushAppend(dbifastaGFdl[desfield], fd); countfield[desfield]++; ajRegPost(dbifastaGWrdexp, &dbifastaGTmpDes); } } ipos = ajFileResetPos(libr); while(ajReadline(libr, &dbifastaGRline)) { if(ajStrGetCharFirst(dbifastaGRline) == '>') { ajFileSeek(libr, ipos, 0); return ajTrue; } ipos = ajFileResetPos(libr); } ajFileSeek(libr, ipos, 0); /* end of file reached */ return ajTrue; }