static void loadSeqCDnaRow(struct metaDataTbls* metaDataTbls,
                           struct extFileTbl* extFileTbl,
                           boolean checkExtSeqRecs,
                           char* gbdbMapToCurrent,
                           struct sqlConnection* conn, char **row)
/* Process one cDNA row of the seq table.  The row is parsed, the metadata
 * entry for the accession is flagged as being in seq and given the sequence
 * size.  If the entry is already flagged as being in gbCdnaInfo, the id,
 * type, srcDb and size/file_size relationship are cross-checked, with
 * problems reported through gbError.  The entry is flagged as being in the
 * external file when verifySeqExtFile succeeds.  The conn argument is not
 * used by this function. */
{
struct seqFields seqRow;
struct metaData* entry;

parseGbSeqRow(row, &seqRow);
entry = metaDataTblsGet(metaDataTbls, seqRow.acc);

/* an accession should be seen only once in the seq table */
if (entry->inSeq)
    gbError("%s: acc occurs multiple times in the seq table", seqRow.acc);
entry->inSeq = TRUE;
entry->seqSize = seqRow.size;

/* consistency checks against what was recorded from gbCdnaInfo */
if (entry->inGbCdnaInfo)
    {
    if (entry->gbCdnaInfoId != seqRow.id)
        gbError("%s: gbSeq.id (%d) not same gbCdnaInfo.id (%d)", seqRow.acc, seqRow.id, entry->gbCdnaInfoId);
    if (entry->gbCdnaInfoType != seqRow.type)
        gbError("%s: gbSeq.type (%s) not same as gbCdnaInfo.type (%s)", seqRow.acc,
                gbFmtSelect(seqRow.type), gbFmtSelect(entry->gbCdnaInfoType));
    /* srcDb must share at least one bit with the accumulated type flags */
    if (!(seqRow.srcDb & entry->typeFlags))
        gbError("%s: gbSeq.srcDb (%s) not same gbCdnaInfo.srcDb (%s)", seqRow.acc,
                gbFmtSelect(seqRow.srcDb), gbFmtSelect(entry->typeFlags));
    /* the sequence is expected to be smaller than the file record size */
    if (seqRow.file_size <= entry->seqSize)
        gbError("%s: gbSeq.size >= gbSeq.file_size", seqRow.acc);
    }

if (verifySeqExtFile(&seqRow, extFileTbl, checkExtSeqRecs, gbdbMapToCurrent))
    entry->inExtFile = TRUE;
}
Esempio n. 2
0
static bool checkForAccTypeChange(struct sqlConnection *conn,
                                  struct gbSelect* select,
                                  struct gbStatus* status)
/* Check if a sequence that appears to be new has really had its type
 * changed.  The gbSeq table is queried for status->acc; if a row is found,
 * an error is written to stderr (either a type change, or an unexplained
 * presence in the table when the type matches), gErrorCnt is incremented
 * and TRUE is returned.  FALSE is returned if no row is found.  The select
 * argument is not used by this function. */
{
char sql[128];
struct sqlResult* result;
char **fields;
unsigned seqType;

sqlSafef(sql, sizeof(sql),
      "SELECT type FROM gbSeq WHERE acc = '%s'", status->acc);
result = sqlGetResult(conn, sql);

/* nothing in gbSeq for this accession, so nothing detected */
if ((result == NULL) || ((fields = sqlNextRow(result)) == NULL))
    {
    sqlFreeResult(&result);
    return FALSE;
    }

seqType = gbParseType(fields[0]);
if (seqType == status->type)
    fprintf(stderr,
            "Error: %s %s is in the seq table, but shouldn't be, don't know why\n",
            status->acc, gbFormatDate(status->modDate));
else
    fprintf(stderr,
            "Error: %s %s type has changed from %s to %s; add to ignore file\n",
            status->acc, gbFormatDate(status->modDate),
            gbFmtSelect(seqType), gbFmtSelect(status->type));
gErrorCnt++;
sqlFreeResult(&result);
return TRUE;
}
struct gbAlignInfo gbAlignGet(struct gbSelect* select,
                              struct gbSelect* prevSelect)
/* Build files to align in the work directory.  If this is not a full release,
 * or there is no previously aligned release, prevSelect should be NULL.
 */
{
struct gbAlignInfo alignInfo;

gbVerbEnter(1, "gbAlignGet: %s", gbSelectDesc(select));
if (prevSelect != NULL)
    prevSelect->orgCats = select->orgCats;

/* load the required entry data */
gbReleaseLoadProcessed(select);
if (prevSelect != NULL)
    {
    gbReleaseLoadProcessed(prevSelect);
    gbReleaseLoadAligned(prevSelect);
    }

/* select entries to align */
gbVerbEnter(2, "selecting seqs to align");
alignInfo = gbAlignFindNeedAligned(select, prevSelect);
gbVerbLeave(2, "selecting seqs to align");

if (alignInfo.migrate.accTotalCnt > 0)
    gbVerbMsg(1, "gbAlignGet: %d %s entries, %d alignments will be migrated",
              alignInfo.migrate.accTotalCnt, gbFmtSelect(select->type),
              alignInfo.migrate.recTotalCnt);

/* create fasta with sequences to align if not empty */
if (alignInfo.align.accTotalCnt > 0)
    {
    gbVerbMsg(1, "gbAlignGet: %d %s sequences will be align",
              alignInfo.align.accTotalCnt, gbFmtSelect(select->type));
    copySelectedFasta(select);
    }

/* leave calling cards */
if (select->orgCats & GB_NATIVE)
    markAligns(select, GB_NATIVE);
if (select->orgCats & GB_XENO)
    markAligns(select, GB_XENO);

/* print before releasing memory */
gbVerbLeave(1, "gbAlignGet: %s", gbSelectDesc(select));

/* unload entries to free memory */
gbReleaseUnload(select->release);
if (prevSelect != NULL)
    gbReleaseUnload(prevSelect->release);
return alignInfo;
}
Esempio n. 4
0
static void chkPsl(struct psl* psl, unsigned iRow, char* database,
                   char* table, struct metaDataTbls* metaDataTbls,
                   unsigned typeFlags)
/* Validate one PSL of a mrna/est to genome alignment against the metadata.
 * The target name and size are compared with the chromosome size, and the
 * query is looked up in the metadata.  For a query that is flagged as being
 * in seq, gbIndex membership, type flags and query size are checked and the
 * entry's alignment count is incremented.  Problems are reported through
 * gbError; errorCnt is incremented if pslCheck reports a problem. */
{
unsigned targetSize = getChromSize(database, psl->tName);
struct metaData* entry = metaDataTblsFind(metaDataTbls, psl->qName);
char where[128];

if (gbVerbose >= 3)
    gbVerbMsg(3, "chkPsl %s:%d %s %s",  table, iRow, psl->qName, psl->tName);

/* label used as the prefix of every message about this row */
safef(where, sizeof(where), "psl %s.%s row %u", database, table, iRow);

/* target must be a known chromosome with a matching size */
if (targetSize == 0)
    gbError("%s: tName not a valid chromosome: \"%s\"", where, psl->tName);
else if (targetSize != psl->tSize)
    gbError("%s: tSize %u != chromosome %s size %u",
            where, psl->tSize, psl->tName, targetSize);

/* query must be known; detailed checks are done only if it is in seq */
if (entry == NULL)
    {
    gbError("%s: qName not in mrna table as type %s: \"%s\"",
            where, gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName);
    }
else if (entry->inSeq)
    {
    if (entry->inGbIndex)
        {
        if (entry->typeFlags != typeFlags)
            gbError("%s: alignment for %s type %s doesn't match expected %s",
                    where, psl->qName, gbFmtSelect(entry->typeFlags),
                    gbFmtSelect(typeFlags));
        }
    else
        {
        gbError("%s: qName not in gbIndex as type %s: \"%s\""
                " (Note: this can be caused by GenBank entries that were changed from type mRNA to other RNA types)", where,
                gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName);
        }
    if (entry->seqSize != psl->qSize)
        gbError("%s: qSize %u != %s size %u",
                where, psl->qSize, psl->qName, entry->seqSize);
    entry->numAligns += 1;
    }

/* internal consistency of the PSL itself */
if (pslCheck(where, stderr, psl) != 0)
    errorCnt += 1;
}
Esempio n. 5
0
static void chkGenePred(struct genePred* gene, char *geneName, unsigned iRow,
                        char* database, char* table,
                        struct metaDataTbls* metaDataTbls, unsigned typeFlags)
/* Validate one genePred of a refSeq to genome alignment against the
 * metadata.  genePredCheck is run first, with errorCnt incremented if it
 * reports a problem.  If the gene is found in the metadata, its type flags
 * are compared with typeFlags, its alignment count is incremented, and,
 * when geneName is not NULL, geneName is compared with the entry's rlName
 * (a NULL rlName is treated as an empty string).  Problems are reported
 * through gbError. */
{
char where[512];
unsigned targetSize = getChromSize(database, gene->chrom);
struct metaData* entry = metaDataTblsFind(metaDataTbls, gene->name);
char* refLinkName;

if (gbVerbose >= 3)
    gbVerbMsg(3, "chkGenePred %s:%d %s %s",  table, iRow, 
              gene->name, gene->chrom);

/* label used as the prefix of every message about this row */
safef(where, sizeof(where), "gene %s.%s:%u %s %s", database, table,
      iRow, gene->name, gene->chrom);

/* basic sanity checks */
if (genePredCheck(where, stderr, targetSize, gene) != 0)
    errorCnt += 1;

/* the remaining checks all need the metadata entry */
if (entry == NULL)
    {
    gbError("%s: %s in not in mrna table", where, gene->name);
    return;
    }

if (entry->typeFlags != typeFlags)
    gbError("%s: alignment of %s type %s doesn't match expected %s",
            where, gene->name, gbFmtSelect(entry->typeFlags),
            gbFmtSelect(typeFlags));
entry->numAligns += 1;

/* check the gene name, if one was supplied */
if (geneName != NULL)
    {
    refLinkName = entry->rlName;
    if (refLinkName == NULL)
        refLinkName = "";
    if (!sameString(geneName, refLinkName))
        gbError("%s: %s geneName \"%s\" does not match refLink name \"%s\"",
                where, gene->name, geneName, refLinkName);
    }
}
Esempio n. 6
0
void findEntries(int numAccs, unsigned type, struct gbRelease* release,
                 struct numRange* versions, struct numRange* modDates,
                 unsigned flags, unsigned orgCats,
                 struct hash* accTbl, int* accCount)
/* Find entries to copy based on number of versions and/or modDates; pass
 * NULL for versions or modDates to not use that criterion.  The updates of
 * the release are visited in list order, skipping full or daily updates
 * that are not selected by FE_FULL/FE_DAILY in flags, until the list ends
 * or numAccs entries have been counted by this call.  The number counted is
 * added to *accCount. */
{
struct gbUpdate* cur;
int numFound = 0;

/* at higher verbosity, describe the search criteria on stderr */
if (verboseLevel() > 1)
    {
    fprintf(stderr, "findEntries: num=%d", numAccs);
    if (flags & FE_FULL)
        fprintf(stderr, " full");
    if (flags & FE_DAILY)
        fprintf(stderr, " daily");
    fprintf(stderr, " %s", gbFmtSelect(type|orgCats));
    if (versions != NULL)
        fprintf(stderr, " numVers=%d-%d", versions->minNum, versions->maxNum);
    if (modDates != NULL)
        fprintf(stderr, " numModDates=%d-%d", modDates->minNum, modDates->maxNum);
    fprintf(stderr, "\n");
    }

/* scan by update to help minimize number of updates (by grouping) */
cur = release->updates;
while ((cur != NULL) && (numFound < numAccs))
    {
    /* full updates need FE_FULL, all others need FE_DAILY */
    if (cur->isFull ? (flags & FE_FULL) : (flags & FE_DAILY))
        findInUpdate(numAccs, type, release, versions,  modDates,
                     flags, orgCats, cur, accTbl, &numFound);
    cur = cur->next;
    }

*accCount = *accCount + numFound;
verbose(1, "  found: %d entries\n", numFound);
}
static void loadGbStatusRow(struct metaDataTbls* metaDataTbls,
                            struct sqlConnection* conn, char** row,
                            unsigned descOrgCats)
/* load a row of the gbStatus table */
{
struct metaData* md;
int iRow = 0;
boolean isOk;
HGID seqId;

/* columns: acc,version,modDate,type,srcDb,gbSeq,numAligns */

md = metaDataTblsGet(metaDataTbls, row[iRow++]);
if (md->inGbStatus)
    gbError("%s: occurs multiple times in the gbStatus table", md->acc);
md->inGbStatus = TRUE;
md->gbsVersion = strToUnsigned(row[iRow++], md->acc, "gbStatus.version", NULL);

isOk = TRUE;
md->gbsModDate = gbParseChkDate(row[iRow++], &isOk);
if (!isOk)
    gbError("%s: invalid gbStatus.moddate value: \"%s\"", md->acc, row[iRow-1]);

md->gbsType = gbParseType(row[iRow++]);
md->gbsSrcDb = gbParseSrcDb(row[iRow++]);
md->gbsOrgCat = gbParseOrgCat(row[iRow++]);
seqId = strToUnsigned(row[iRow++], md->acc, "gbStatus.gbSeq", NULL);
md->gbsNumAligns = strToUnsigned(row[iRow++], md->acc, "gbStatus.numAligns",
                                 NULL);

md->typeFlags |= md->gbsType;

if (md->inGbCdnaInfo)
    {
    if (seqId != md->gbCdnaInfoId)
        gbError("%s: gbStatus.gbSeq (%d) not same gbCdnaInfo.id (%d)", md->acc, seqId,
                md->gbCdnaInfoId);
    if (md->gbsType != md->gbCdnaInfoType)
        gbError("%s: gbStatus.type (%s) not same as gbCdnaInfo.type (%s)", md->acc,
                gbFmtSelect(md->gbsType), gbFmtSelect(md->gbCdnaInfoType));
    if (md->gbsSrcDb != (md->typeFlags & GB_SRC_DB_MASK))
        gbError("%s: gbStatus.srcDb (%s) not same gbCdnaInfo.srcDb (%s)", md->acc,
                gbFmtSelect(md->gbsSrcDb), gbFmtSelect(md->typeFlags));
    if (md->gbsVersion != md->gbCdnaInfoVersion)
        gbError("%s: gbStatus.version (%d) not same gbCdnaInfo.version (%d)", md->acc,
                md->gbsVersion, md->gbCdnaInfoVersion);
    if ((md->gbsModDate != md->gbCdnaInfoModdate))
        gbError("%s: gbStatus.modDate (%s) not same gbCdnaInfo.moddate (%s)", md->acc,
                gbFormatDate(md->gbsModDate), gbFormatDate(md->gbCdnaInfoModdate));
    /* verify either have or don't have a description */
    if (descOrgCats & md->gbsOrgCat)
        {
        if (!md->haveDesc)
            gbError("%s: should have gbCdnaInfo.description: %s", md->acc,
                    gbFmtSelect(md->gbsType|md->gbsOrgCat|md->gbsSrcDb));
        }
    else
        {
        if (md->haveDesc)
            gbError("%s: should not have gbCdnaInfo.description: %s", md->acc,
                    gbFmtSelect(md->gbsType|md->gbsOrgCat|md->gbsSrcDb));
        }
    }
}
void databaseUpdate(struct gbSelect* select)
/* Update the database from the genbank state on disk for one selection.
 * A per-selection tmp directory is set up, the status table is built with
 * gbBuildState, and then, unless running as a dry run, outdated entries are
 * deleted, metadata and alignments are processed and the status and loaded
 * tables are updated.  If gbBuildState reports that maxShrinkage was
 * exceeded, a warning is printed, gMaxShrinkageError is set and the
 * DBLOAD_DRY_RUN flag is turned on in gOptions. */
{
struct sqlConnection *conn = hAllocConn(gDatabase);
struct gbStatusTbl* statusTbl;
boolean maxShrinkageExceeded;
char typePrefix[32], tmpDir[PATH_LEN];

gbVerbEnter(3, "update %s", gbSelectDesc(select));

/* Setup tmp dir for load, must be unique for each update due to
 * initialLoad feature.  The accPrefix, if any, is appended to the type. */
if (select->accPrefix == NULL)
    safef(typePrefix, sizeof(typePrefix), "%s", gbFmtSelect(select->type));
else
    safef(typePrefix, sizeof(typePrefix), "%s.%s", gbFmtSelect(select->type),
          select->accPrefix);
safef(tmpDir, sizeof(tmpDir), "%s/%s/%s/%s",
      gWorkDir, select->release->name, select->release->genome->database,
      typePrefix);
if ((gOptions.flags & DBLOAD_DRY_RUN) == 0)
    gbMakeDirs(tmpDir);

/* Build list of entries that need processed.  This also flags updates that
 * have the change and new entries so we can limit the per-update processing.
 */
statusTbl = gbBuildState(conn, select, &gOptions, gMaxShrinkage, tmpDir,
                         gbVerbose, FALSE, &maxShrinkageExceeded);
if (maxShrinkageExceeded)
    {
    fprintf(stderr, "Warning: switching to dryRun mode due to maxShrinkage being exceeded\n");
    gMaxShrinkageError = TRUE;
    gOptions.flags |= DBLOAD_DRY_RUN;
    }

if (gOptions.flags & DBLOAD_DRY_RUN)
    {
    /* nothing is modified in a dry run */
    gbVerbLeave(3, "dry run, skipping update %s", gbSelectDesc(select));
    }
else
    {
    checkForStop(); /* last safe place */

    /* count global number of extFileChgs */
    gExtFileChged += statusTbl->numExtChg;

    /* first clean out old and changed */
    deleteOutdated(conn, select, statusTbl, tmpDir);

    /* meta data MUST be done first, it sets some gbStatus data */
    processMetaData(conn, select, statusTbl, tmpDir);
    processAligns(conn, select, statusTbl, tmpDir);

    /* now it's safe to update the status table; for initialLoad the
     * result of the update is saved on gPendingStatusUpdates instead of
     * being committed here */
    if ((gOptions.flags & DBLOAD_INITIAL) == 0)
        gbStatusTblUpdate(statusTbl, conn, TRUE);
    else
        slSafeAddHead(&gPendingStatusUpdates,
                      gbStatusTblUpdate(statusTbl, conn, FALSE));

    /* add this and partition to the loaded table, if not already there.
     * On the initial load the extFile updated flag is set, otherwise the
     * loaded table is committed. */
    updateLoadedTbl(select);
    if (gOptions.flags & DBLOAD_INITIAL)
        gbLoadedTblSetExtFileUpdated(gLoadedTbl, select);
    else
        gbLoadedTblCommit(gLoadedTbl);

    /* print before freeing memory */
    gbVerbLeave(3, "update %s", gbSelectDesc(select));
    }

/* both paths release the status table and the connection */
gbStatusTblFree(&statusTbl);
hFreeConn(&conn);
}