void gbAlignDataProcess(struct sqlConnection *conn, struct gbSelect* select, struct gbStatusTbl* statusTbl) /* Parse a psl file looking for accessions to add to the database. If the * entry matches the status->selectAlign field, it will be saved for loading * and the count of aligned entries will be incremented. */ { char pslPath[PATH_LEN]; char oiPath[PATH_LEN]; gbAlignedGetPath(select, "psl.gz", NULL, pslPath); /* shouldn't have called this method if there no alignments counted */ if (!fileExists(pslPath)) errAbort("PSL file does exist, yet genbank index indicates that it should: %s", pslPath); processPslFile(conn, select, statusTbl, pslPath); /* load the associated orientInfo file if native */ if (select->orgCats == GB_NATIVE) { strcpy(oiPath, pslPath); assert(endsWith(pslPath, ".psl.gz")); strcpy(oiPath + strlen(oiPath) - 7, ".oi.gz"); processOIFile(conn, select, statusTbl, oiPath); } /* for native ESTs, we might have an intronPsl file */ if ((select->type == GB_EST) && (select->orgCats == GB_NATIVE)) { char intronPslPath[PATH_LEN]; gbAlignedGetPath(select, "intronPsl.gz", NULL, intronPslPath); if (fileExists(intronPslPath)) processIntronPslFile(conn, select, statusTbl, intronPslPath); } }
void createAlignedIndex(struct gbSelect* select, char* alignIdx) /* create an alignment index from the alignRecs stored in the index. * it is not renamed from the tmp file name here, just closed */ { struct gbProcessed* processed; FILE *alignIdxFh; /* setup output PSL files */ gbAlignedGetPath(select, "alidx", NULL, alignIdx); alignIdxFh = gbMustOpenOutput(alignIdx); /* visit all processed entries for this update */ for (processed = select->update->processed; processed != NULL; processed = processed->updateLink) { struct gbEntry* entry = processed->entry; if ((entry->clientFlags & (MIGRATE_FLAG|ALIGN_FLAG)) && (entry->orgCat & select->orgCats)) { struct gbAligned* aligned = gbEntryFindAlignedVer(entry, processed->version); int numAligns = ((aligned != NULL) ? aligned->numAligns : 0); gbAlignedWriteIdxRec(alignIdxFh, entry->acc, processed->version, numAligns); } } carefulClose(&alignIdxFh); }
void migratePsls(struct migrateAligns* migrate, unsigned pslFileType, struct gbEntryCnts* counts, FILE* outPslFh) /* Migrate selected PSL records */ { char inPsl[PATH_LEN]; struct lineFile* inPslLf; struct psl* psl; gbAlignedGetPath(migrate->prevSelect, gPslFileGzExt[pslFileType], NULL, inPsl); /* It's possible to end up here and not have a file if none of the sequences * aligned */ if (fileExists(inPsl)) { gbVerbEnter(2, "migrating %ss from %s", gPslFileExt[pslFileType], inPsl); inPslLf = gzLineFileOpen(inPsl); while ((psl = pslNext(inPslLf)) != NULL) { migratePsl(migrate, pslFileType, counts, psl, inPsl, outPslFh); pslFree(&psl); } gzLineFileClose(&inPslLf); gbVerbLeave(2, "migrating %ss from %s", gPslFileExt[pslFileType], inPsl); } }
void outFaOpen(struct outFa* outFa) /* Open the fasta file */ { char ext[64]; char path[PATH_LEN]; assert(outFa->fa == NULL); safef(ext, sizeof(ext), "%d.fa", outFa->nextPartNum); gbAlignedGetPath(&outFa->select, ext, workDir, path); outFa->fa = gbFaOpen(path, "w"); outFa->numSeqs = 0; outFa->numBases = 0; if (createPolyASizes) { safef(ext, sizeof(ext), "%d.polya", outFa->nextPartNum); gbAlignedGetPath(&outFa->select, ext, workDir, path); outFa->polyAFh = mustOpen(path, "w"); } outFa->nextPartNum++; }
void markAligns(struct gbSelect* select, unsigned orgCat)
/* Create a file indicating that sequences either need to be aligned or
 * migrated for this partition.  This is used to determine what needs to be
 * installed after the alignment.  It is needed because the sequences might
 * all be migrated, so the fasta file can't be the indicator.
 * select->orgCats is temporarily set to orgCat and restored on return. */
{
char markerPath[PATH_LEN];
FILE* markerFh;
unsigned savedOrgCats = select->orgCats;

select->orgCats = orgCat;
gbAlignedGetPath(select, "aligns", workDir, markerPath);

/* nothing is written to the file; it is opened and immediately installed */
markerFh = gbMustOpenOutput(markerPath);
gbOutputRename(markerPath, &markerFh);

select->orgCats = savedOrgCats;
}
static void processOrgCatOi(struct gbSelect* select, unsigned orgCat)
/* Process the orientInfo file of an update for one organism category.  OIs
 * are only available for native, however this follows the structure of the
 * PSL code.  select->orgCats is temporarily set to orgCat and restored on
 * return. */
{
char oiPath[PATH_LEN];
char *cols[EST_ORIENT_INFO_NUM_COLS];
struct lineFile* lf;
unsigned savedOrgCats = select->orgCats;

select->orgCats = orgCat;
gbAlignedGetPath(select, "oi.gz", NULL, oiPath);

lf = gzLineFileOpen(oiPath);
for (;;)
    {
    struct estOrientInfo* oi;
    if (!lineFileNextRowTab(lf, cols, EST_ORIENT_INFO_NUM_COLS))
        break;
    oi = estOrientInfoLoad(cols);
    processOi(select, oi);
    estOrientInfoFree(&oi);
    }
gzLineFileClose(&lf);

select->orgCats = savedOrgCats;
}
void copyIntronPsls(struct gbSelect* select, FILE* outPslFh,
                    struct recCounts* recCounts)
/* Copy an intron PSL file from the work directory if it exists.  Each
 * record read is handed to copyIntronPsl() and then freed. */
{
char srcPath[PATH_LEN];
struct lineFile* lf;
struct psl* rec;

gbAlignedGetPath(select, "intronPsl", gWorkDir, srcPath);
if (!fileExists(srcPath))
    return;  /* no intron PSLs for this selection */

gbVerbEnter(2, "installing from %s", srcPath);
lf = gzLineFileOpen(srcPath);
for (;;)
    {
    rec = pslNext(lf);
    if (rec == NULL)
        break;
    copyIntronPsl(select, rec, srcPath, outPslFh, recCounts);
    pslFree(&rec);
    }
gzLineFileClose(&lf);
gbVerbLeave(2, "installing from %s", srcPath);
}
void copyPsls(struct gbSelect* select, unsigned pslFileType, FILE* outPslFh,
              struct gbEntryCnts* counts)
/* Copy a PSL file of the given type from the work directory if it exists,
 * counting alignments for the index.  Each record read is handed to
 * copyPsl() and then freed. */
{
char srcPath[PATH_LEN];
struct lineFile* lf;
struct psl* rec;

gbAlignedGetPath(select, gPslFileExt[pslFileType], gWorkDir, srcPath);
if (!fileExists(srcPath))
    return;  /* nothing of this type was produced */

gbVerbEnter(2, "installing from %s", srcPath);
lf = gzLineFileOpen(srcPath);
for (rec = pslNext(lf); rec != NULL; rec = pslNext(lf))
    {
    copyPsl(select, pslFileType, rec, srcPath, outPslFh, counts);
    pslFree(&rec);
    }
gzLineFileClose(&lf);
gbVerbLeave(2, "installing from %s", srcPath);
}
void migrateOrientInfos(struct migrateAligns* migrate, FILE* outOiFh)
/* Migrate estOrientInfo records from the "oi.gz" file of the previous
 * selection (migrate->prevSelect), if that file exists.  Each row is loaded,
 * handed to migrateOrientInfo() and then freed. */
{
char srcPath[PATH_LEN];
struct lineFile* lf;
char *row[EST_ORIENT_INFO_NUM_COLS];

gbAlignedGetPath(migrate->prevSelect, "oi.gz", NULL, srcPath);
if (!fileExists(srcPath))
    return;  /* nothing to migrate */

gbVerbEnter(2, "migrating from %s", srcPath);
lf = gzLineFileOpen(srcPath);
for (;;)
    {
    struct estOrientInfo *oi;
    if (!lineFileNextRowTab(lf, row, ArraySize(row)))
        break;
    oi = estOrientInfoLoad(row);
    migrateOrientInfo(migrate, oi, srcPath, outOiFh);
    estOrientInfoFree(&oi);
    }
gzLineFileClose(&lf);
gbVerbLeave(2, "migrating from %s", srcPath);
}
void copyOrientInfos(struct gbSelect* select, FILE* outOiFh,
                     struct recCounts* recCounts)
/* Copy an OI file from the work directory, if it exists, counting
 * alignments for the index.  Each row is loaded, handed to copyOrientInfo()
 * and then freed. */
{
char srcPath[PATH_LEN];
struct lineFile* lf;
char *row[EST_ORIENT_INFO_NUM_COLS];

gbAlignedGetPath(select, "oi", gWorkDir, srcPath);
if (!fileExists(srcPath))
    return;  /* no OI file was produced */

gbVerbEnter(2, "installing from %s", srcPath);
lf = gzLineFileOpen(srcPath);
for (;;)
    {
    struct estOrientInfo *oi;
    if (!lineFileNextRowTab(lf, row, ArraySize(row)))
        break;
    oi = estOrientInfoLoad(row);
    copyOrientInfo(select, oi, srcPath, outOiFh, recCounts);
    estOrientInfoFree(&oi);
    }
gzLineFileClose(&lf);
gbVerbLeave(2, "installing from %s", srcPath);
}
void installOrgCatAligned(struct gbSelect* select, unsigned orgCat, struct gbSelect* prevSelect, struct gbAlignInfo* alignInfo, char* alignIdx) /* Install alignments for either native or xeno. The alignment index is * created and named returned, but not renamed until both native and xeno are * processed. */ { unsigned holdOrgCats = select->orgCats; struct outputFiles out; struct recCounts recCounts; ZeroVar(&out); ZeroVar(&recCounts); select->orgCats = orgCat; if (prevSelect != NULL) prevSelect->orgCats = orgCat; /* setup out PSL and orientInfo files */ gbAlignedGetPath(select, "psl.gz", NULL, out.psl.path); out.psl.fh = openSortOutput(out.psl.path, PSL_SORT_SPEC); if (select->orgCats == GB_NATIVE) { gbAlignedGetPath(select, "oi.gz", NULL, out.oi.path); out.oi.fh = openSortOutput(out.oi.path, OI_SORT_SPEC); if (select->type == GB_EST) { gbAlignedGetPath(select, "intronPsl.gz", NULL, out.intronPsl.path); out.intronPsl.fh = openSortOutput(out.intronPsl.path, PSL_SORT_SPEC); } } if (select->type == GB_MRNA) { /* we don't bother sorting raw psl */ gbAlignedGetPath(select, "rawPsl.gz", NULL, out.rawPsl.path); out.rawPsl.fh = gbMustOpenOutput(out.rawPsl.path); } /* previous aligned if this is a full update */ if (prevSelect != NULL) migrateAligned(select, prevSelect, alignInfo, &out, &recCounts); /* copy currently aligned, if they exist */ copyPsls(select, MAIN_PSL_FILE, out.psl.fh, &recCounts.pslCnts); if (select->type == GB_MRNA) copyPsls(select, RAW_PSL_FILE, out.rawPsl.fh, &recCounts.rawPslCnts); if ((select->orgCats == GB_NATIVE) && (recCounts.pslCnts.recTotalCnt > 0)) { /* copy new OI and intronPsls */ copyOrientInfos(select, out.oi.fh, &recCounts); if (select->type == GB_EST) copyPsls(select, INTRON_PSL_FILE, out.intronPsl.fh, &recCounts.intronPslCnts); } /* Install or remove files. 
Done seperate from copy due to posibility of * all being migrated*/ if (recCounts.intronPslCnts.recTotalCnt > 0) gbOutputRename(out.intronPsl.path, &out.intronPsl.fh); else gbOutputRemove(out.intronPsl.path, &out.intronPsl.fh); if (recCounts.oiCnts.recTotalCnt > 0) gbOutputRename(out.oi.path, &out.oi.fh); else gbOutputRemove(out.oi.path, &out.oi.fh); if (recCounts.rawPslCnts.recTotalCnt > 0) gbOutputRename(out.rawPsl.path, &out.rawPsl.fh); else gbOutputRemove(out.rawPsl.path, &out.rawPsl.fh); if (recCounts.pslCnts.recTotalCnt > 0) gbOutputRename(out.psl.path, &out.psl.fh); else gbOutputRemove(out.psl.path, &out.psl.fh); createAlignedIndex(select, alignIdx); select->orgCats = holdOrgCats; if (prevSelect != NULL) prevSelect->orgCats = holdOrgCats; }