static void getFastaOffsets(struct brokenRefPepTbl *brpTbl, struct sqlConnection *conn, struct extFileTbl* extFileTbl, char *faPath) /* parse fasta file to get offsets of proteins */ { struct gbFa *fa = gbFaOpen(faPath, "r"); char acc[GB_ACC_BUFSZ]; struct brokenRefPep *brp; HGID extId = extFileTblGet(extFileTbl, conn, faPath); gbVerbMsg(5, "scanning fasta: %s", faPath); while (gbFaReadNext(fa)) { gbVerbMsg(5, " %s: %lld", fa->id, (long long)fa->recOff); /* save only if same acecss, version, and file (to match mrna fa) */ short ver = gbSplitAccVer(fa->id, acc); brp = hashFindVal(brpTbl->protAccHash, acc); if ((brp != NULL) && (ver == brp->protVer) && sameString(faPath, brp->newFaPath)) { gbFaGetSeq(fa); /* force read of sequence data */ brp->newFaId = extId; brp->newFaOff = fa->recOff; brp->newSeqSize = fa->seqLen; brp->newRecSize = fa->off-fa->recOff; gbVerbMsg(5, " save: %s %lld for %lld\n", fa->id, (long long)fa->recOff, (long long)fa->off); } } gbFaClose(&fa); }
static void openByAccPrefix(char* accPrefix) /* Open up the by accession prefix */ { char *mode; char raPath[PATH_LEN], faPath[PATH_LEN], gbIdxPath[PATH_LEN]; carefulClose(&raFile); gbFaClose(&faFile); if (gbIdxName != NULL) carefulClose(&gbIdxFile); makeAccPrefixedFile(accPrefix, raName, raPath); mode = isFirstOpen(raPath) ? "w" : "a"; raFile = mustOpen(raPath, mode); makeAccPrefixedFile(accPrefix, faName, faPath); faFile = gbFaOpen(faPath, mode); if (gbIdxName != NULL) { makeAccPrefixedFile(accPrefix, gbIdxName, gbIdxPath); gbIdxFile = mustOpen(gbIdxPath, mode); } strcpy(gCurAccPrefix, accPrefix); }
void copySelectedFasta(struct gbSelect* select)
/* Copy the FASTA records that were selected for alignment.  Output is
 * segregated into native and xeno, and large files are partitioned. */
{
char srcPath[PATH_LEN];
struct gbFa* srcFa;

/* an output is created only for the categories requested in select->orgCats */
struct outFa* nativeOut
    = (select->orgCats & GB_NATIVE) ? outFaNew(select, GB_NATIVE) : NULL;
struct outFa* xenoOut
    = (select->orgCats & GB_XENO) ? outFaNew(select, GB_XENO) : NULL;

gbProcessedGetPath(select, "fa", srcPath);
gbVerbEnter(2, "copying from %s", srcPath);
srcFa = gbFaOpen(srcPath, "r");

/* keep copying until copyFastaRec reports there is nothing more to do */
for (;;)
    {
    if (!copyFastaRec(select, srcFa, nativeOut, xenoOut))
        break;
    }

outFaFree(&nativeOut);
outFaFree(&xenoOut);
gbFaClose(&srcFa);
gbVerbLeave(2, "copying from %s", srcPath);
}
void seqDataProcessUpdate(struct gbSelect* select)
/* Get the sequences for a partition and update.  The partition's processed
 * index should already be loaded, with the selected versions flagged. */
{
char faPath[PATH_LEN];
struct gbFa* faIn;

gbProcessedGetPath(select, "fa", faPath);
faIn = gbFaOpen(faPath, "r");

/* hand every record in the fasta file to processSeq */
for (;;)
    {
    if (!gbFaReadNext(faIn))
        break;
    processSeq(select, faIn);
    }

gbFaClose(&faIn);
}
int main(int argc, char *argv[]) /* Check parameters, set up, loop through each GenBank file. */ { char *gbName; int argi = 1; struct hash *estAuthorHash = NULL; char *pepFa; optionInit(&argc, argv, optionSpecs); if (argc < 4) usage(); gByAccPrefixSize = optionInt("byAccPrefix", 0); gbIdxName = optionVal("gbidx", NULL); pepFa = optionVal("pepFa", NULL); gbType = gbParseType(optionVal("type", "mrna,est")); gbOrg = optionVal("org", NULL); inclXMs = optionExists("inclXMs"); if (gByAccPrefixSize > 4) /* keep small to avoid tons of open files */ errAbort("max value of -byAccPrefix is 4"); gCurAccPrefix[0] = '\0'; faName = argv[argi++]; raName = argv[argi++]; estAuthorHash = newHash(23); kvt = newKvt(5*1024); gbfInit(); if (pepFa != NULL) gPepFa = gbFaOpen(pepFa,"w"); char *blackList = optionVal("blackList", NULL); if (blackList != NULL) blackListRanges = genbankBlackListParse(blackList); while (argi < argc) { gbName = argv[argi++]; printf("Processing %s into %s and %s\n", gbName, faName, raName); procOneGbFile(gbName, estAuthorHash); } gbFaClose(&faFile); gbFaClose(&gPepFa); carefulClose(&raFile); carefulClose(&gbIdxFile); return 0; }
void outFaOpen(struct outFa* outFa) /* Open the fasta file */ { char ext[64]; char path[PATH_LEN]; assert(outFa->fa == NULL); safef(ext, sizeof(ext), "%d.fa", outFa->nextPartNum); gbAlignedGetPath(&outFa->select, ext, workDir, path); outFa->fa = gbFaOpen(path, "w"); outFa->numSeqs = 0; outFa->numBases = 0; if (createPolyASizes) { safef(ext, sizeof(ext), "%d.polya", outFa->nextPartNum); gbAlignedGetPath(&outFa->select, ext, workDir, path); outFa->polyAFh = mustOpen(path, "w"); } outFa->nextPartNum++; }
static void setupOutputFiles(char *acc, char *org) /* Get the output files (in globals) for a sequence, opening as needed. */ { if (gByAccPrefixSize > 0) { char accPrefix[32]; strncpy(accPrefix, acc, gByAccPrefixSize); accPrefix[gByAccPrefixSize] = '\0'; tolowers(accPrefix); if (!sameString(accPrefix, gCurAccPrefix)) openByAccPrefix(accPrefix); } else { /* output to a single set of files */ if (raFile == NULL) { raFile = mustOpen(raName, "w"); faFile = gbFaOpen(faName, "w"); if (gbIdxName != NULL) gbIdxFile = mustOpen(gbIdxName, "w"); } } }
void seqDataOpen(boolean inclVersion, char *outFile)
/* Record the inclVersion option in gInclVersion and open outFile for
 * writing as the global output fasta (gOutFa). */
{
gInclVersion = inclVersion;

gOutFa = gbFaOpen(outFile, "w");
}