struct metaDataTbls* chkMetaDataTbls(struct gbSelect* select,
                                     struct sqlConnection* conn,
                                     boolean checkExtSeqRecs,
                                     unsigned descOrgCats,
                                     char* gbdbMapToCurrent)
/* Build a new metaDataTbls object and populate it from the database for the
 * given selection, validating as the individual loaders run.  descOrgCats
 * are the orgCats that should have descriptions; it is handed to the
 * gbStatus loader.  Returns the newly created tables. */
{
gbVerbEnter(1, "load and check metadata tables: %s", gbSelectDesc(select));
struct metaDataTbls* tbls = metaDataTblsNew();

/* The load order matters: later loaders cross-check against earlier ones. */
loadGbCdnaInfoData(tbls, select, conn);
if (select->release->srcDb == GB_REFSEQ)
    {
    /* RefSeq-only tables must be in place before the sequence data
     * because of the protein checks. */
    loadRefSeqStatus(tbls, conn);
    loadRefLink(tbls, conn);
    }
loadSeqData(tbls, select, conn, checkExtSeqRecs, gbdbMapToCurrent);
loadGbStatus(tbls, select, descOrgCats, conn);

gbVerbLeave(1, "load and check metadata tables: %s", gbSelectDesc(select));
return tbls;
}
void checkMrnaPartition(struct gbSelect* select) /* Check an mRNA partition. For genbank, check all ESTs against * this mRNA partation. */ { struct hashCookie cookie; struct hashEl* hel; gbReleaseLoadProcessed(select); struct gbSelect* prevSelect = gbProcessedGetPrevRel(select); if (prevSelect != NULL) gbReleaseLoadProcessed(prevSelect); gbVerbEnter(2, "checking %s", gbSelectDesc(select)); cookie = hashFirst(select->release->entryTbl); while ((hel = hashNext(&cookie)) != NULL) checkOrgCat(hel->val, prevSelect); gbVerbLeave(2, "checking %s", gbSelectDesc(select)); if (select->release->srcDb == GB_GENBANK) checkEstPartitions(select->release); gbReleaseUnload(select->release); if (prevSelect != NULL) { gbReleaseUnload(prevSelect->release); freeMem(prevSelect); } }
struct gbAlignInfo gbAlignGet(struct gbSelect* select, struct gbSelect* prevSelect) /* Build files to align in the work directory. If this is not a full release, * or there is no previously aligned release, prevSelect should be NULL. */ { struct gbAlignInfo alignInfo; gbVerbEnter(1, "gbAlignGet: %s", gbSelectDesc(select)); if (prevSelect != NULL) prevSelect->orgCats = select->orgCats; /* load the required entry data */ gbReleaseLoadProcessed(select); if (prevSelect != NULL) { gbReleaseLoadProcessed(prevSelect); gbReleaseLoadAligned(prevSelect); } /* select entries to align */ gbVerbEnter(2, "selecting seqs to align"); alignInfo = gbAlignFindNeedAligned(select, prevSelect); gbVerbLeave(2, "selecting seqs to align"); if (alignInfo.migrate.accTotalCnt > 0) gbVerbMsg(1, "gbAlignGet: %d %s entries, %d alignments will be migrated", alignInfo.migrate.accTotalCnt, gbFmtSelect(select->type), alignInfo.migrate.recTotalCnt); /* create fasta with sequences to align if not empty */ if (alignInfo.align.accTotalCnt > 0) { gbVerbMsg(1, "gbAlignGet: %d %s sequences will be align", alignInfo.align.accTotalCnt, gbFmtSelect(select->type)); copySelectedFasta(select); } /* leave calling cards */ if (select->orgCats & GB_NATIVE) markAligns(select, GB_NATIVE); if (select->orgCats & GB_XENO) markAligns(select, GB_XENO); /* print before releasing memory */ gbVerbLeave(1, "gbAlignGet: %s", gbSelectDesc(select)); /* unload entries to free memory */ gbReleaseUnload(select->release); if (prevSelect != NULL) gbReleaseUnload(prevSelect->release); return alignInfo; }
void parseUpdateMetaData(struct sqlConnection *conn,
                         struct gbSelect* select,
                         struct gbStatusTbl* statusTbl)
/* Process the metadata of the new and changed entries of a single update.
 * Working one update at a time lets the ra file (one per update) be read in
 * sequential order.  The mrna and seq tables are not loaded here, although
 * the unique string tables might be added to. */
{
gbVerbEnter(4, "process metadata for %s", gbSelectDesc(select));

gbMetaDataProcess(conn, statusTbl, select);
/* reset the per-update selection state once processing is done */
gbUpdateClearSelectVer(select->update);

gbVerbLeave(4, "process metadata for %s", gbSelectDesc(select));
}
static void chkGbRelease(struct gbSelect* select,
                         struct metaDataTbls* metaDataTbls)
/* Check one partition of a gbRelease: load its processed and aligned entry
 * data, run chkGbIndex() against the metadata tables, then unload. */
{
gbVerbEnter(1, "check: %s", gbSelectDesc(select));

/* entry data needed by the index check */
gbReleaseLoadProcessed(select);
gbReleaseLoadAligned(select);

chkGbIndex(select, metaDataTbls);

/* drop the entries again to free memory.
 * NOTE(review): gbSelectDesc() below is evaluated after the unload; confirm
 * it does not read data released by gbReleaseUnload(). */
gbReleaseUnload(select->release);

gbVerbLeave(1, "check: %s", gbSelectDesc(select));
}
static boolean checkShrinkage(struct gbSelect* select, float maxShrinkage, struct gbStatusTbl* statusTbl) /* Check for too much shrinkage, print deleted if exeeeded and return * FALSE. Return true if ok.*/ { float shrinkage = 0.0; unsigned numOld = statusTbl->numDelete + statusTbl->numSeqChg + statusTbl->numMetaChg + statusTbl->numRebuildDerived + statusTbl->numExtChg + statusTbl->numNoChg; unsigned numNew = statusTbl->numSeqChg + statusTbl->numMetaChg + statusTbl->numExtChg + +statusTbl->numRebuildDerived + statusTbl->numNoChg + statusTbl->numNew; if (numNew < numOld) { /* FIXME: the at least 50 feels like a hack */ shrinkage = 1.0 - ((float)numNew/(float)numOld); if ((maxShrinkage > 0) && ((numOld-numNew) < 50)) shrinkage = 0; /* allow for small partations */ if (shrinkage > maxShrinkage) { fprintf(stderr, "Error: size after deletion exceeds maximum shrinkage for %s,\n" "Rerun with -allowLargeDeletes to overrided.\n" "Will continue checking for other large deletes.\n" "delete=%u seqChg=%u metaChg=%u extChg=%u new=%u orphan=%u derived=%u noChg=%u\n", gbSelectDesc(select), statusTbl->numDelete, statusTbl->numSeqChg, statusTbl->numMetaChg, statusTbl->numExtChg, statusTbl->numNew, statusTbl->numOrphan, statusTbl->numRebuildDerived, statusTbl->numNoChg); listDeletedAcc(select, statusTbl); return FALSE; } } return TRUE; }
void doLoadPartition(struct gbSelect* select)
/* Sync the database with the state of the genbank repository for a single
 * partition: load the partition's processed and aligned entry data, run
 * databaseUpdate(), then unload the entries. */
{
gbVerbEnter(2, "load for %s", gbSelectDesc(select));

/* entry data required by the update */
gbReleaseLoadProcessed(select);
gbReleaseLoadAligned(select);

databaseUpdate(select);

gbVerbLeave(2, "load for %s", gbSelectDesc(select));

/* free the memory held by the loaded entries */
gbReleaseUnload(select->release);
}
static void listDeletedAcc(struct gbSelect* select,
                           struct gbStatusTbl* statusTbl)
/* Write to stdout a header line naming the selection, followed by one
 * tab-indented line per accession on the status table's delete list. */
{
struct gbStatus *deleted;

printf("deleted accessions for %s\n", gbSelectDesc(select));
deleted = statusTbl->deleteList;
while (deleted != NULL)
    {
    printf("\t%s\n", deleted->acc);
    deleted = deleted->next;
    }
}
void processUpdateAligns(struct sqlConnection *conn, struct gbSelect* select, struct gbUpdate* update, struct gbStatusTbl* statusTbl) /* Get alignements for an update. */ { select->update = update; gbVerbEnter(4, "process alignments: %s", gbSelectDesc(select)); if (select->orgCats & GB_NATIVE) processUpdateAlignsForOrgCat(conn, select, GB_NATIVE, statusTbl); if (select->orgCats & GB_XENO) processUpdateAlignsForOrgCat(conn, select, GB_XENO, statusTbl); gbUpdateClearSelectVer(select->update); gbVerbLeave(4, "process alignments: %s", gbSelectDesc(select)); select->update = NULL; }
void gbAlignInstall(struct gbSelect* select, struct gbSelect* prevSelect) /* Install alignments, optionally migrating unchanged ones from a previous * release. This does one update, accPrefix and either native or xeno */ { char nativeAlignIdx[PATH_LEN], xenoAlignIdx[PATH_LEN]; struct gbAlignInfo alignInfo; gbVerbEnter(1, "gbAlignInstall: %s", gbSelectDesc(select)); /* load required entry date */ gbReleaseLoadProcessed(select); if (prevSelect != NULL) { gbReleaseLoadProcessed(prevSelect); gbReleaseLoadAligned(prevSelect); } /* mark entries and updates to migrate or align */ alignInfo = gbAlignFindNeedAligned(select, prevSelect); /* Process each category */ if (select->orgCats & GB_NATIVE) installOrgCatAligned(select, GB_NATIVE, prevSelect, &alignInfo, nativeAlignIdx); if (select->orgCats & GB_XENO) installOrgCatAligned(select, GB_XENO, prevSelect, &alignInfo, xenoAlignIdx); /* now indices can be renamed, not completely atomic, but good enough */ if (select->orgCats & GB_NATIVE) gbOutputRename(nativeAlignIdx, NULL); if (select->orgCats & GB_XENO) gbOutputRename(xenoAlignIdx, NULL); /* print message before memory is freed */ gbVerbLeave(1, "gbAlignInstall: %s", gbSelectDesc(select)); /* unload entries to free memory */ gbReleaseUnload(select->release); if (prevSelect != NULL) gbReleaseUnload(prevSelect->release); }
int chkAlignTables(char *db, struct gbSelect* select, struct sqlConnection* conn, struct metaDataTbls* metaDataTbls, struct dbLoadOptions *options) /* Verify all of the alignment-related. */ { int cnt = 0; if (gChromSizes == NULL) buildChromSizes(db); gbVerbEnter(1, "validating alignment tables: %s", gbSelectDesc(select)); if (select->release->srcDb & GB_GENBANK) { chkGenBankAlignTables(select, conn, metaDataTbls, options); cnt++; } if (select->release->srcDb & GB_REFSEQ) { chkRefSeqAlignTables(select, conn, metaDataTbls, options); cnt++; } gbVerbLeave(1, "validated alignment tables: %s", gbSelectDesc(select)); return cnt; }
void checkEstPartition(struct gbRelease* mrnaRelease, struct gbSelect* select) /* Check an EST partition */ { struct hashCookie cookie; struct hashEl* hel; gbVerbEnter(2, "checking %s", gbSelectDesc(select)); gbReleaseLoadProcessed(select); struct gbSelect* prevSelect = gbProcessedGetPrevRel(select); if (prevSelect != NULL) gbReleaseLoadProcessed(prevSelect); cookie = hashFirst(select->release->entryTbl); while ((hel = hashNext(&cookie)) != NULL) checkEst(mrnaRelease, hel->val, prevSelect); gbReleaseUnload(select->release); if (prevSelect != NULL) { gbReleaseUnload(prevSelect->release); freeMem(prevSelect); } gbVerbLeave(2, "checking %s", gbSelectDesc(select)); }
static void testLoad(struct gbSelect* select, unsigned flags) /* do load testing of part of a release */ { char desc[512]; struct stepInfo info; select->type = (flags & DO_MRNA) ? GB_MRNA : GB_EST; safef(desc, sizeof(desc), "%s %s", ((flags & DO_PROCESSED) ? "processed" : "aligned"), gbSelectDesc(select)); info = beginStep(select->release->index, select->release, desc); if (flags & DO_PROCESSED) gbReleaseLoadProcessed(select); else { select->orgCats = GB_NATIVE|GB_XENO; gbReleaseLoadAligned(select); } endStep(select->release->index, &info); select->type = 0; }
void databaseUpdate(struct gbSelect* select) /* update the database from genbank state on disk */ { struct sqlConnection *conn = hAllocConn(gDatabase); struct gbStatusTbl* statusTbl; boolean maxShrinkageExceeded; char typePrefix[32], tmpDir[PATH_LEN]; gbVerbEnter(3, "update %s", gbSelectDesc(select)); /* Setup tmp dir for load, must be unique for each update due to * initialLoad feature */ if (select->accPrefix != NULL) safef(typePrefix, sizeof(typePrefix), "%s.%s", gbFmtSelect(select->type), select->accPrefix); else safef(typePrefix, sizeof(typePrefix), "%s", gbFmtSelect(select->type)); safef(tmpDir, sizeof(tmpDir), "%s/%s/%s/%s", gWorkDir, select->release->name, select->release->genome->database, typePrefix); if (!(gOptions.flags & DBLOAD_DRY_RUN)) gbMakeDirs(tmpDir); /* Build list of entries that need processed. This also flags updates that * have the change and new entries so we can limit the per-update processing. */ statusTbl = gbBuildState(conn, select, &gOptions, gMaxShrinkage, tmpDir, gbVerbose, FALSE, &maxShrinkageExceeded); if (maxShrinkageExceeded) { fprintf(stderr, "Warning: switching to dryRun mode due to maxShrinkage being exceeded\n"); gMaxShrinkageError = TRUE; gOptions.flags |= DBLOAD_DRY_RUN; } if (gOptions.flags & DBLOAD_DRY_RUN) { gbVerbLeave(3, "dry run, skipping update %s", gbSelectDesc(select)); gbStatusTblFree(&statusTbl); hFreeConn(&conn); return; } checkForStop(); /* last safe place */ /* count global number of extFileChgs */ gExtFileChged += statusTbl->numExtChg; /* first clean out old and changed */ deleteOutdated(conn, select, statusTbl, tmpDir); /* meta data MUST be done first, it sets some gbStatus data */ processMetaData(conn, select, statusTbl, tmpDir); processAligns(conn, select, statusTbl, tmpDir); /* now it's safe to update the status table, delay commit for initialLoad */ if (gOptions.flags & DBLOAD_INITIAL) slSafeAddHead(&gPendingStatusUpdates, gbStatusTblUpdate(statusTbl, conn, FALSE)); else gbStatusTblUpdate(statusTbl, conn, 
TRUE); /* add this and partition to the loaded table, if not already there. * set the extFile updated flag updates were done or this is the initial load */ updateLoadedTbl(select); if (gOptions.flags & DBLOAD_INITIAL) gbLoadedTblSetExtFileUpdated(gLoadedTbl, select); if ((gOptions.flags & DBLOAD_INITIAL) == 0) gbLoadedTblCommit(gLoadedTbl); /* print before freeing memory */ gbVerbLeave(3, "update %s", gbSelectDesc(select)); gbStatusTblFree(&statusTbl); hFreeConn(&conn); }
struct gbStatusTbl* gbBuildState(struct sqlConnection *conn,
                                 struct gbSelect* select,
                                 struct dbLoadOptions* options,
                                 float maxShrinkage, char* tmpDir,
                                 int verboseLevel, boolean extFileUpdate,
                                 boolean* maxShrinkageExceeded)
/* Load the status table and work out the state of every genbank entry in
 * the release compared to the database.  The module globals gOptions,
 * gbVerbose, gErrorCnt and loadNonCoding are (re)set on entry.
 * *maxShrinkageExceeded is set to TRUE when the shrinkage check fails, in
 * which case the orphan and type-change checks are skipped.  Aborts if any
 * errors were counted while building the table. */
{
struct gbStatusTbl* state;
struct selectStatusData selData;
unsigned selectFlags = (select->type | select->release->srcDb);
boolean allowLargeDeletes;

ZeroVar(&selData);
gOptions = options;
*maxShrinkageExceeded = FALSE;
gbVerbose = verboseLevel;
gErrorCnt = 0;

loadNonCoding = dbLoadNonCoding(sqlGetDatabase(conn), select);
if (loadNonCoding)
    {
    gbVerbMsg(1, "NOTE: loading non-coding");
    }

gbVerbEnter(3, "build state table");

gbVerbMsg(4, "reading gbSeq accessions");
selData.select = select;
selData.seqHash = seqTblLoadAcc(conn, select);

gbVerbMsg(4, "reading gbStatus");
state = gbStatusTblSelectLoad(conn, selectFlags, select->accPrefix,
                              selectStatus, &selData, tmpDir, extFileUpdate,
                              (gbVerbose >= 4));
findNewEntries(select, state);

allowLargeDeletes = ((gOptions->flags & DBLOAD_LARGE_DELETES) != 0);

/* Don't allow deletes when select criteria has changed */
if ((selData.orgCatDelCnt > 0) && !allowLargeDeletes)
    {
    errAbort("%u entries deleted due to organism category no longer being selected, specify -allowLargeDeletes to override",
             selData.orgCatDelCnt);
    }

/* check shrinkage unless override */
if (!allowLargeDeletes && !checkShrinkage(select, maxShrinkage, state))
    {
    *maxShrinkageExceeded = TRUE;
    }

/* don't do other setup if we are going to stop on maxShrinkageExceeded */
if (!*maxShrinkageExceeded)
    {
    gbVerbMsg(4, "checking for orphans");
    findOrphans(conn, select, selData.seqHash, state);

    if ((gOptions->flags & DBLOAD_INITIAL) == 0)
        {
        gbVerbMsg(4, "checking for type change");
        checkForTypeChange(conn, select, state);
        }
    }

#ifdef DUMP_HASH_STATS
hashPrintStats(selData.seqHash, "stateSeq", stderr);
#endif
hashFree(&selData.seqHash);

gbVerbLeave(3, "build state table");

/* always print stats */
fprintf(stderr, "gbLoadRna: selected %s: delete=%u seqChg=%u metaChg=%u extChg=%u new=%u orphan=%u derived=%u noChg=%u\n",
        gbSelectDesc(select), state->numDelete, state->numSeqChg,
        state->numMetaChg, state->numExtChg, state->numNew,
        state->numOrphan, state->numRebuildDerived, state->numNoChg);

/* this doesn't include large delete errors */
if (gErrorCnt > 0)
    {
    errAbort("Errors detecting when constructing state table");
    }
return state;
}