struct metaDataTbls* chkMetaDataTbls(struct gbSelect* select,
                                     struct sqlConnection* conn,
                                     boolean checkExtSeqRecs,
                                     unsigned descOrgCats,
                                     char* gbdbMapToCurrent)
/* Load the metadata tables, doing basic validation along the way.
 * descOrgCats are the orgCats that should have descriptions.  Returns the
 * object created by metaDataTblsNew() after all loaders have run on it. */
{
struct metaDataTbls* tbls;

gbVerbEnter(1, "load and check metadata tables: %s", gbSelectDesc(select));
tbls = metaDataTblsNew();

/* The load sequence below is deliberate: it allows checking between
 * tables. */
loadGbCdnaInfoData(tbls, select, conn);
if (select->release->srcDb == GB_REFSEQ)
    {
    /* RefSeq-only tables; these have to be loaded ahead of the seq data
     * because of the protein checks */
    loadRefSeqStatus(tbls, conn);
    loadRefLink(tbls, conn);
    }
loadSeqData(tbls, select, conn, checkExtSeqRecs, gbdbMapToCurrent);
loadGbStatus(tbls, select, descOrgCats, conn);

gbVerbLeave(1, "load and check metadata tables: %s", gbSelectDesc(select));
return tbls;
}
void checkMrnaPartition(struct gbSelect* select)
/* Check an mRNA partition.  For genbank, check all ESTs against
 * this mRNA partation. */
{
struct hashCookie cookie;
struct hashEl* hel;

gbReleaseLoadProcessed(select);
struct gbSelect* prevSelect = gbProcessedGetPrevRel(select);
if (prevSelect != NULL)
    gbReleaseLoadProcessed(prevSelect);

gbVerbEnter(2, "checking %s", gbSelectDesc(select));
cookie = hashFirst(select->release->entryTbl);
while ((hel = hashNext(&cookie)) != NULL)
    checkOrgCat(hel->val, prevSelect);
gbVerbLeave(2, "checking %s", gbSelectDesc(select));
if (select->release->srcDb == GB_GENBANK)
    checkEstPartitions(select->release);

gbReleaseUnload(select->release);
if (prevSelect != NULL)
    {
    gbReleaseUnload(prevSelect->release);
    freeMem(prevSelect);
    }
}
struct gbAlignInfo gbAlignGet(struct gbSelect* select,
                              struct gbSelect* prevSelect)
/* Build files to align in the work directory.  If this is not a full release,
 * or there is no previously aligned release, prevSelect should be NULL.
 */
{
struct gbAlignInfo alignInfo;

gbVerbEnter(1, "gbAlignGet: %s", gbSelectDesc(select));
if (prevSelect != NULL)
    prevSelect->orgCats = select->orgCats;

/* load the required entry data */
gbReleaseLoadProcessed(select);
if (prevSelect != NULL)
    {
    gbReleaseLoadProcessed(prevSelect);
    gbReleaseLoadAligned(prevSelect);
    }

/* select entries to align */
gbVerbEnter(2, "selecting seqs to align");
alignInfo = gbAlignFindNeedAligned(select, prevSelect);
gbVerbLeave(2, "selecting seqs to align");

if (alignInfo.migrate.accTotalCnt > 0)
    gbVerbMsg(1, "gbAlignGet: %d %s entries, %d alignments will be migrated",
              alignInfo.migrate.accTotalCnt, gbFmtSelect(select->type),
              alignInfo.migrate.recTotalCnt);

/* create fasta with sequences to align if not empty */
if (alignInfo.align.accTotalCnt > 0)
    {
    gbVerbMsg(1, "gbAlignGet: %d %s sequences will be align",
              alignInfo.align.accTotalCnt, gbFmtSelect(select->type));
    copySelectedFasta(select);
    }

/* leave calling cards */
if (select->orgCats & GB_NATIVE)
    markAligns(select, GB_NATIVE);
if (select->orgCats & GB_XENO)
    markAligns(select, GB_XENO);

/* print before releasing memory */
gbVerbLeave(1, "gbAlignGet: %s", gbSelectDesc(select));

/* unload entries to free memory */
gbReleaseUnload(select->release);
if (prevSelect != NULL)
    gbReleaseUnload(prevSelect->release);
return alignInfo;
}
void parseUpdateMetaData(struct sqlConnection *conn,
                         struct gbSelect* select, 
                         struct gbStatusTbl* statusTbl)
/* Parse the metadata of changed and new entries for a single update.
 * Updates are handled one at a time so that the ra file (there is one per
 * update) can be read in sequential order.  The mrna and seq tables are not
 * loaded here, although the unique string tables may be added to. */
{
gbVerbEnter(4, "process metadata for %s", gbSelectDesc(select));

/* do the processing, then clear the select version on the update */
gbMetaDataProcess(conn, statusTbl, select);
gbUpdateClearSelectVer(select->update);

gbVerbLeave(4, "process metadata for %s", gbSelectDesc(select));
}
Пример #5
0
static void chkGbRelease(struct gbSelect* select,
                         struct metaDataTbls* metaDataTbls)
/* Check a partition of gbRelease: load the processed and aligned entry
 * data for the selection, run chkGbIndex() against the metadata tables and
 * then unload the release. */
{
gbVerbEnter(1, "check: %s", gbSelectDesc(select));

/* load required entry data */
gbReleaseLoadProcessed(select);
gbReleaseLoadAligned(select);

chkGbIndex(select, metaDataTbls);

/* Print the leave message while the release is still loaded, as the other
 * load/check/unload functions in this code do, so that gbSelectDesc() is
 * never asked to format a description after the release was unloaded. */
gbVerbLeave(1, "check: %s", gbSelectDesc(select));

/* unload entries to free memory */
gbReleaseUnload(select->release);
}
Пример #6
0
static boolean checkShrinkage(struct gbSelect* select, float maxShrinkage,
                              struct gbStatusTbl* statusTbl)
/* Check whether the update would shrink the partition by more than
 * maxShrinkage (a fraction of the old size).  If it would, an error with
 * the status counts is written to stderr, the deleted accessions are listed
 * and FALSE is returned.  Returns TRUE when the shrinkage is acceptable. */
{
/* entry counts before and after the update would be applied */
unsigned numOld = statusTbl->numDelete + statusTbl->numSeqChg
    + statusTbl->numMetaChg + statusTbl->numRebuildDerived
    + statusTbl->numExtChg + statusTbl->numNoChg;
unsigned numNew = statusTbl->numSeqChg + statusTbl->numMetaChg
    + statusTbl->numExtChg + statusTbl->numRebuildDerived
    + statusTbl->numNoChg + statusTbl->numNew;
float shrinkage;

/* nothing to check unless the partition gets smaller */
if (numNew >= numOld)
    return TRUE;

/* FIXME: the at least 50 feels like a hack */
shrinkage = 1.0 - ((float)numNew/(float)numOld);
if ((maxShrinkage > 0) && ((numOld-numNew) < 50))
    {
    /* small partitions are allowed to lose a few entries */
    shrinkage = 0;
    }
if (!(shrinkage > maxShrinkage))
    return TRUE;

fprintf(stderr, 
        "Error: size after deletion exceeds maximum shrinkage for %s,\n"
        "Rerun with -allowLargeDeletes to overrided.\n"
        "Will continue checking for other large deletes.\n"
        "delete=%u seqChg=%u metaChg=%u extChg=%u new=%u orphan=%u derived=%u noChg=%u\n",
        gbSelectDesc(select), statusTbl->numDelete,
        statusTbl->numSeqChg, statusTbl->numMetaChg,
        statusTbl->numExtChg, statusTbl->numNew, statusTbl->numOrphan,
        statusTbl->numRebuildDerived, statusTbl->numNoChg);
listDeletedAcc(select, statusTbl);
return FALSE;
}
void doLoadPartition(struct gbSelect* select)
/* Do the work of syncing the database with the state in the genbank
 * repository for one partition: load the processed and aligned entry data,
 * run databaseUpdate() and unload the release again. */
{
gbVerbEnter(2, "load for %s", gbSelectDesc(select));

/* entry data needed by the update */
gbReleaseLoadProcessed(select);
gbReleaseLoadAligned(select);

databaseUpdate(select);

/* the leave message is printed ahead of the unload */
gbVerbLeave(2, "load for %s", gbSelectDesc(select));

/* release the entries to free memory */
gbReleaseUnload(select->release);
}
Пример #8
0
static void listDeletedAcc(struct gbSelect* select, struct gbStatusTbl* statusTbl)
/* Print to stdout a header line describing the selection, followed by one
 * tab-indented line for each accession on statusTbl->deleteList. */
{
struct gbStatus *cur = statusTbl->deleteList;

printf("deleted accessions for %s\n", gbSelectDesc(select));
while (cur != NULL)
    {
    printf("\t%s\n", cur->acc);
    cur = cur->next;
    }
}
void processUpdateAligns(struct sqlConnection *conn, struct gbSelect* select,
                         struct gbUpdate* update,
                         struct gbStatusTbl* statusTbl)
/* Get the alignments for one update.  select->update is pointed at the
 * given update for the duration of the call and is reset to NULL before
 * returning. */
{
select->update = update;
gbVerbEnter(4, "process alignments: %s", gbSelectDesc(select));

/* each selected organism category is handled separately, native first */
if (select->orgCats & GB_NATIVE)
    {
    processUpdateAlignsForOrgCat(conn, select, GB_NATIVE, statusTbl);
    }
if (select->orgCats & GB_XENO)
    {
    processUpdateAlignsForOrgCat(conn, select, GB_XENO, statusTbl);
    }

gbUpdateClearSelectVer(select->update);

/* the description still includes the update when the leave message is
 * printed; it is detached only afterwards */
gbVerbLeave(4, "process alignments: %s", gbSelectDesc(select));
select->update = NULL;
}
Пример #10
0
void gbAlignInstall(struct gbSelect* select, struct gbSelect* prevSelect)
/* Install alignments, optionally migrating unchanged ones from a previous
 * release.  This does one update, accPrefix and either native or xeno */
{
char nativeAlignIdx[PATH_LEN], xenoAlignIdx[PATH_LEN];
struct gbAlignInfo alignInfo;

gbVerbEnter(1, "gbAlignInstall: %s", gbSelectDesc(select));

/* load required entry date */
gbReleaseLoadProcessed(select);
if (prevSelect != NULL)
    {
    gbReleaseLoadProcessed(prevSelect);
    gbReleaseLoadAligned(prevSelect);
    }

/* mark entries and updates to migrate or align */
alignInfo = gbAlignFindNeedAligned(select, prevSelect);

/* Process each category */
if (select->orgCats & GB_NATIVE)
    installOrgCatAligned(select, GB_NATIVE, prevSelect, &alignInfo,
                         nativeAlignIdx);
if (select->orgCats & GB_XENO)
    installOrgCatAligned(select, GB_XENO, prevSelect, &alignInfo,
                         xenoAlignIdx);

/* now indices can be renamed, not completely atomic, but good enough */
if (select->orgCats & GB_NATIVE)
    gbOutputRename(nativeAlignIdx, NULL);
if (select->orgCats & GB_XENO)
    gbOutputRename(xenoAlignIdx, NULL);

/* print message before memory is freed */
gbVerbLeave(1, "gbAlignInstall: %s", gbSelectDesc(select));

/* unload entries to free memory */
gbReleaseUnload(select->release);
if (prevSelect != NULL)
    gbReleaseUnload(prevSelect->release);
}
Пример #11
0
int chkAlignTables(char *db, struct gbSelect* select, struct sqlConnection* conn,
                   struct metaDataTbls* metaDataTbls, struct dbLoadOptions *options)
/* Verify all of the alignment-related tables.  The GenBank and RefSeq
 * checks are each run when the corresponding bit is set in the release's
 * srcDb; the return value is the number of those checks that were run. */
{
int numChecked = 0;

/* chromosome sizes are built only if they have not been set up yet */
if (gChromSizes == NULL)
    {
    buildChromSizes(db);
    }

gbVerbEnter(1, "validating alignment tables: %s", gbSelectDesc(select));
if (select->release->srcDb & GB_GENBANK)
    {
    chkGenBankAlignTables(select, conn, metaDataTbls, options);
    numChecked += 1;
    }
if (select->release->srcDb & GB_REFSEQ)
    {
    chkRefSeqAlignTables(select, conn, metaDataTbls, options);
    numChecked += 1;
    }
gbVerbLeave(1, "validated alignment tables: %s", gbSelectDesc(select));

return numChecked;
}
void checkEstPartition(struct gbRelease* mrnaRelease,
                       struct gbSelect* select)
/* Check an EST partition.  Every entry of the selected release is passed to
 * checkEst() together with mrnaRelease and the previous release returned by
 * gbProcessedGetPrevRel() (NULL when there is none).  The releases loaded
 * here are unloaded again before returning, and the prevSelect object is
 * freed. */
{
struct hashCookie cookie;
struct hashEl* hel;
struct gbSelect* prevSelect;

gbVerbEnter(2, "checking %s", gbSelectDesc(select));
gbReleaseLoadProcessed(select);
prevSelect = gbProcessedGetPrevRel(select);
if (prevSelect != NULL)
    gbReleaseLoadProcessed(prevSelect);

cookie = hashFirst(select->release->entryTbl);
while ((hel = hashNext(&cookie)) != NULL)
    checkEst(mrnaRelease, hel->val, prevSelect);

/* Print the leave message while the release is still loaded, matching
 * checkMrnaPartition() and the other load/unload functions in this code, so
 * that gbSelectDesc() is never asked to format a description after the
 * release was unloaded. */
gbVerbLeave(2, "checking %s", gbSelectDesc(select));

/* unload entries to free memory */
gbReleaseUnload(select->release);
if (prevSelect != NULL)
    {
    gbReleaseUnload(prevSelect->release);
    freeMem(prevSelect);
    }
}
Пример #13
0
static void testLoad(struct gbSelect* select, unsigned flags)
/* Do load testing of part of a release.  flags chooses mRNA vs. EST
 * (DO_MRNA) and processed vs. aligned (DO_PROCESSED).  The load is bracketed
 * by beginStep()/endStep().  select->type is reset to 0 on return; for
 * aligned loads select->orgCats is left set to GB_NATIVE|GB_XENO. */
{
char desc[512];
struct stepInfo info;
const char *stage;

if (flags & DO_MRNA)
    select->type = GB_MRNA;
else
    select->type = GB_EST;

/* step description: which kind of data, then the selection */
if (flags & DO_PROCESSED)
    stage = "processed";
else
    stage = "aligned";
safef(desc, sizeof(desc), "%s %s", stage, gbSelectDesc(select));

info = beginStep(select->release->index, select->release, desc);
if (!(flags & DO_PROCESSED))
    {
    /* aligned data is loaded for both organism categories */
    select->orgCats = GB_NATIVE|GB_XENO;
    gbReleaseLoadAligned(select);
    }
else
    gbReleaseLoadProcessed(select);
endStep(select->release->index, &info);

select->type = 0;
}
void databaseUpdate(struct gbSelect* select)
/* Update the database from the genbank state on disk for the selected
 * partition.  When running in dry-run mode, either because it was requested
 * or because it is switched on here after the maximum shrinkage was
 * exceeded, the state table is built but all of the update steps are
 * skipped. */
{
struct sqlConnection *conn = hAllocConn(gDatabase);
struct gbStatusTbl* statusTbl;
boolean maxShrinkageExceeded;
char partName[32];
char loadDir[PATH_LEN];

gbVerbEnter(3, "update %s", gbSelectDesc(select));

/* Work out the tmp dir for the load; it must be unique for each update
 * due to the initialLoad feature.  The last path component is the type,
 * with the accPrefix appended when there is one. */
if (select->accPrefix == NULL)
    safef(partName, sizeof(partName), "%s", gbFmtSelect(select->type));
else
    safef(partName, sizeof(partName), "%s.%s", gbFmtSelect(select->type),
          select->accPrefix);

safef(loadDir, sizeof(loadDir), "%s/%s/%s/%s",
      gWorkDir, select->release->name, select->release->genome->database,
      partName);
if ((gOptions.flags & DBLOAD_DRY_RUN) == 0)
    gbMakeDirs(loadDir);

/* Build the list of entries that need processing.  This also flags the
 * updates that have changed and new entries, so per-update processing can
 * be limited. */
statusTbl = gbBuildState(conn, select, &gOptions, gMaxShrinkage, loadDir,
                         gbVerbose, FALSE, &maxShrinkageExceeded);
if (maxShrinkageExceeded)
    {
    /* too much would be deleted: remember the error and carry on as a
     * dry run */
    fprintf(stderr, "Warning: switching to dryRun mode due to maxShrinkage being exceeded\n");
    gMaxShrinkageError = TRUE;
    gOptions.flags |= DBLOAD_DRY_RUN;
    }
if (gOptions.flags & DBLOAD_DRY_RUN)
    {
    gbVerbLeave(3, "dry run, skipping update %s", gbSelectDesc(select));
    gbStatusTblFree(&statusTbl);
    hFreeConn(&conn);
    return;
    }

/* last safe place */
checkForStop();

/* keep the global count of extFileChgs up to date */
gExtFileChged += statusTbl->numExtChg;

/* old and changed entries are cleaned out first */
deleteOutdated(conn, select, statusTbl, loadDir);

/* meta data MUST come before alignments, it sets some gbStatus data */
processMetaData(conn, select, statusTbl, loadDir);
processAligns(conn, select, statusTbl, loadDir);

/* Now it's safe to update the status table.  For an initialLoad the
 * commit is delayed and the result of the update is queued on the pending
 * list instead. */
if (!(gOptions.flags & DBLOAD_INITIAL))
    gbStatusTblUpdate(statusTbl, conn, TRUE);
else
    slSafeAddHead(&gPendingStatusUpdates,
                  gbStatusTblUpdate(statusTbl, conn, FALSE));

/* Add this partition to the loaded table, if not already there.  On the
 * initial load the extFile updated flag is set; otherwise the loaded table
 * is committed right away. */
updateLoadedTbl(select);
if (gOptions.flags & DBLOAD_INITIAL)
    gbLoadedTblSetExtFileUpdated(gLoadedTbl, select);

if (!(gOptions.flags & DBLOAD_INITIAL))
    gbLoadedTblCommit(gLoadedTbl);

/* the leave message goes out before memory is freed */
gbVerbLeave(3, "update %s", gbSelectDesc(select));
gbStatusTblFree(&statusTbl);

hFreeConn(&conn);
}
Пример #15
0
struct gbStatusTbl* gbBuildState(struct sqlConnection *conn,
                                 struct gbSelect* select, 
                                 struct dbLoadOptions* options,
                                 float maxShrinkage,
                                 char* tmpDir,
                                 int verboseLevel,
                                 boolean extFileUpdate,
                                 boolean* maxShrinkageExceeded)
/* Load the status table and find the state of all genbank entries in the
 * release compared to the database.
 *
 * options and verboseLevel are saved in gOptions and gbVerbose, and
 * gErrorCnt is reset, before any work is done.  tmpDir and extFileUpdate
 * are passed through to gbStatusTblSelectLoad().  maxShrinkage is the limit
 * handed to checkShrinkage().
 *
 * *maxShrinkageExceeded is always written: FALSE on entry, then TRUE if
 * DBLOAD_LARGE_DELETES is not set and checkShrinkage() fails.  In that case
 * the orphan and type-change checks are skipped, but the table is still
 * returned.
 *
 * errAbort() is called if entries were deleted because their organism
 * category is no longer selected (unless DBLOAD_LARGE_DELETES is set), or if
 * gErrorCnt is non-zero once the table has been built.
 *
 * Returns the status table produced by gbStatusTblSelectLoad(). */
{
struct gbStatusTbl* statusTbl;
struct selectStatusData ssData;
unsigned selectFlags = (select->type | select->release->srcDb);
ZeroVar(&ssData);

/* save the caller's settings in variables declared outside this function
 * and start with a clean error count and shrinkage flag */
gOptions = options;
*maxShrinkageExceeded = FALSE;
gbVerbose = verboseLevel;
gErrorCnt = 0;

loadNonCoding = dbLoadNonCoding(sqlGetDatabase(conn), select);
if (loadNonCoding)
    gbVerbMsg(1, "NOTE: loading non-coding");

gbVerbEnter(3, "build state table");
gbVerbMsg(4, "reading gbSeq accessions");
/* ssData is the client data given to the selectStatus callback below */
ssData.select = select;
ssData.seqHash = seqTblLoadAcc(conn, select);

gbVerbMsg(4, "reading gbStatus");
statusTbl = gbStatusTblSelectLoad(conn, selectFlags, select->accPrefix,
                                  selectStatus, &ssData,
                                  tmpDir, extFileUpdate, (gbVerbose >= 4));
findNewEntries(select, statusTbl);

/* Don't allow deletes when select criteria has changed */
if ((ssData.orgCatDelCnt > 0) && !(gOptions->flags & DBLOAD_LARGE_DELETES))
    errAbort("%u entries deleted due to organism category no longer being selected, specify -allowLargeDeletes to override",
             ssData.orgCatDelCnt);

/* check shrinkage unless overridden with DBLOAD_LARGE_DELETES */
if ((gOptions->flags & DBLOAD_LARGE_DELETES) == 0)
    {
    if (!checkShrinkage(select, maxShrinkage, statusTbl))
        *maxShrinkageExceeded = TRUE;
    }

/* don't do other setup if we are going to stop on maxShrinkageExceeded */
if (!*maxShrinkageExceeded)
    {
    gbVerbMsg(4, "checking for orphans");
    findOrphans(conn, select, ssData.seqHash, statusTbl);

    /* the type-change check is skipped on the initial load */
    if (((gOptions->flags & DBLOAD_INITIAL) == 0))
        {
        gbVerbMsg(4, "checking for type change");
        checkForTypeChange(conn, select, statusTbl);
        }
    }

#ifdef DUMP_HASH_STATS
hashPrintStats(ssData.seqHash, "stateSeq", stderr);
#endif
/* the accession hash is only needed while building the table */
hashFree(&ssData.seqHash);

gbVerbLeave(3, "build state table");

/* always print stats, regardless of the verbose level */
fprintf(stderr, "gbLoadRna: selected %s: delete=%u seqChg=%u metaChg=%u extChg=%u new=%u orphan=%u derived=%u noChg=%u\n",
        gbSelectDesc(select), statusTbl->numDelete, statusTbl->numSeqChg,
        statusTbl->numMetaChg, statusTbl->numExtChg, statusTbl->numNew,
        statusTbl->numOrphan, statusTbl->numRebuildDerived, 
        statusTbl->numNoChg);

/* this doesn't include large delete errors, those are reported through
 * *maxShrinkageExceeded instead */
if (gErrorCnt > 0)
    errAbort("Errors detecting when constructing state table");
return statusTbl;
}