static void loadSeqCDnaRow(struct metaDataTbls* metaDataTbls,
                           struct extFileTbl* extFileTbl,
                           boolean checkExtSeqRecs,
                           char* gbdbMapToCurrent,
                           struct sqlConnection* conn,
                           char **row)
/* Load one cDNA row of the seq table into the metadata tables.  The row is
 * parsed, flagged as seen, and, when gbCdnaInfo data is already present for
 * the accession, cross-checked against it.  Problems are reported through
 * gbError.  The conn argument is not used by this function. */
{
    struct seqFields fields;
    struct metaData* entry;

    parseGbSeqRow(row, &fields);
    entry = metaDataTblsGet(metaDataTbls, fields.acc);

    /* each accession should appear only once in the seq table */
    if (entry->inSeq) {
        gbError("%s: acc occurs multiple times in the seq table", fields.acc);
    }
    entry->inSeq = TRUE;
    entry->seqSize = fields.size;

    /* consistency with gbCdnaInfo can only be checked if that row was seen */
    if (entry->inGbCdnaInfo) {
        if (fields.id != entry->gbCdnaInfoId) {
            gbError("%s: gbSeq.id (%d) not same gbCdnaInfo.id (%d)",
                    fields.acc, fields.id, entry->gbCdnaInfoId);
        }
        if (fields.type != entry->gbCdnaInfoType) {
            gbError("%s: gbSeq.type (%s) not same as gbCdnaInfo.type (%s)",
                    fields.acc, gbFmtSelect(fields.type),
                    gbFmtSelect(entry->gbCdnaInfoType));
        }
        if (!(fields.srcDb & entry->typeFlags)) {
            gbError("%s: gbSeq.srcDb (%s) not same gbCdnaInfo.srcDb (%s)",
                    fields.acc, gbFmtSelect(fields.srcDb),
                    gbFmtSelect(entry->typeFlags));
        }
        /* the sequence size is expected to be smaller than the file size */
        if (entry->seqSize >= fields.file_size) {
            gbError("%s: gbSeq.size >= gbSeq.file_size", fields.acc);
        }
    }

    if (verifySeqExtFile(&fields, extFileTbl, checkExtSeqRecs, gbdbMapToCurrent)) {
        entry->inExtFile = TRUE;
    }
}
static bool checkForAccTypeChange(struct sqlConnection *conn, struct gbSelect* select, struct gbStatus* status) /* Check if a sequence that appears new has really had it's type has changed. * Returns true if type changed (or other error), false if nothing detected. */ { char query[128]; struct sqlResult* sr; char **row; bool changed = FALSE; sqlSafef(query, sizeof(query), "SELECT type FROM gbSeq WHERE acc = '%s'", status->acc); sr = sqlGetResult(conn, query); if ((sr != NULL) && ((row = sqlNextRow(sr)) != NULL)) { unsigned type = gbParseType(row[0]); if (type != status->type) fprintf(stderr, "Error: %s %s type has changed from %s to %s; add to ignore file\n", status->acc, gbFormatDate(status->modDate), gbFmtSelect(type), gbFmtSelect(status->type)); else fprintf(stderr, "Error: %s %s is in the seq table, but shouldn't be, don't know why\n", status->acc, gbFormatDate(status->modDate)); changed = TRUE; gErrorCnt++; } sqlFreeResult(&sr); return changed; }
struct gbAlignInfo gbAlignGet(struct gbSelect* select, struct gbSelect* prevSelect) /* Build files to align in the work directory. If this is not a full release, * or there is no previously aligned release, prevSelect should be NULL. */ { struct gbAlignInfo alignInfo; gbVerbEnter(1, "gbAlignGet: %s", gbSelectDesc(select)); if (prevSelect != NULL) prevSelect->orgCats = select->orgCats; /* load the required entry data */ gbReleaseLoadProcessed(select); if (prevSelect != NULL) { gbReleaseLoadProcessed(prevSelect); gbReleaseLoadAligned(prevSelect); } /* select entries to align */ gbVerbEnter(2, "selecting seqs to align"); alignInfo = gbAlignFindNeedAligned(select, prevSelect); gbVerbLeave(2, "selecting seqs to align"); if (alignInfo.migrate.accTotalCnt > 0) gbVerbMsg(1, "gbAlignGet: %d %s entries, %d alignments will be migrated", alignInfo.migrate.accTotalCnt, gbFmtSelect(select->type), alignInfo.migrate.recTotalCnt); /* create fasta with sequences to align if not empty */ if (alignInfo.align.accTotalCnt > 0) { gbVerbMsg(1, "gbAlignGet: %d %s sequences will be align", alignInfo.align.accTotalCnt, gbFmtSelect(select->type)); copySelectedFasta(select); } /* leave calling cards */ if (select->orgCats & GB_NATIVE) markAligns(select, GB_NATIVE); if (select->orgCats & GB_XENO) markAligns(select, GB_XENO); /* print before releasing memory */ gbVerbLeave(1, "gbAlignGet: %s", gbSelectDesc(select)); /* unload entries to free memory */ gbReleaseUnload(select->release); if (prevSelect != NULL) gbReleaseUnload(prevSelect->release); return alignInfo; }
static void chkPsl(struct psl* psl, unsigned iRow, char* database, char* table, struct metaDataTbls* metaDataTbls, unsigned typeFlags) /* Validate a PSL of a mrna/est to genome alignment against the metadata. * Also count the number of alignments of a mrna. */ { unsigned chromSize = getChromSize(database, psl->tName); struct metaData* md = metaDataTblsFind(metaDataTbls, psl->qName); char pslDesc[128]; if (gbVerbose >= 3) gbVerbMsg(3, "chkPsl %s:%d %s %s", table, iRow, psl->qName, psl->tName); safef(pslDesc, sizeof(pslDesc), "psl %s.%s row %u", database, table, iRow); /* check that we have sequence info and compare sizes sizes */ if (chromSize == 0) gbError("%s: tName not a valid chromosome: \"%s\"", pslDesc, psl->tName); else if (chromSize != psl->tSize) gbError("%s: tSize %u != chromosome %s size %u", pslDesc, psl->tSize, psl->tName, chromSize); if (md == NULL) gbError("%s: qName not in mrna table as type %s: \"%s\"", pslDesc, gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName); else if (md->inSeq) { if (!md->inGbIndex) gbError("%s: qName not in gbIndex as type %s: \"%s\"" " (Note: this can be caused by GenBank entries that were changed from type mRNA to other RNA types)", pslDesc, gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName); else { if (typeFlags != md->typeFlags) gbError("%s: alignment for %s type %s doesn't match expected %s", pslDesc, psl->qName, gbFmtSelect(md->typeFlags), gbFmtSelect(typeFlags)); } if (md->seqSize != psl->qSize) gbError("%s: qSize %u != %s size %u", pslDesc, psl->qSize, psl->qName, md->seqSize); md->numAligns++; } /* validate consistency of PSL */ if (pslCheck(pslDesc, stderr, psl)) errorCnt++; }
static void chkGenePred(struct genePred* gene, char *geneName, unsigned iRow, char* database, char* table, struct metaDataTbls* metaDataTbls, unsigned typeFlags) /* Validate a genePred of a refSeq to genome alignment against the metadata. * Also count the number of alignments, and check the geneName, if available */ { char desc[512]; unsigned chromSize = getChromSize(database, gene->chrom); struct metaData* md = metaDataTblsFind(metaDataTbls, gene->name); if (gbVerbose >= 3) gbVerbMsg(3, "chkGenePred %s:%d %s %s", table, iRow, gene->name, gene->chrom); safef(desc, sizeof(desc), "gene %s.%s:%u %s %s", database, table, iRow, gene->name, gene->chrom); /* basic sanity checks */ if (genePredCheck(desc, stderr, chromSize, gene)) errorCnt++; /* check if in mrna table */ if (md == NULL) gbError("%s: %s in not in mrna table", desc, gene->name); else { if (typeFlags != md->typeFlags) gbError("%s: alignment of %s type %s doesn't match expected %s", desc, gene->name, gbFmtSelect(md->typeFlags), gbFmtSelect(typeFlags)); md->numAligns++; } /* check gene name */ if ((md != NULL) && (geneName != NULL)) { char* rlName = (md->rlName == NULL) ? "" : md->rlName; if (!sameString(geneName, rlName)) gbError("%s: %s geneName \"%s\" does not match refLink name \"%s\"", desc, gene->name, geneName, rlName); } }
void findEntries(int numAccs, unsigned type, struct gbRelease* release, struct numRange* versions, struct numRange* modDates, unsigned flags, unsigned orgCats, struct hash* accTbl, int* accCount) /* find entries to copy based on number of versions and/or modDates. * Specify NULL to not use criteria */ { /* scan by update to help minimize number of updates (by grouping) */ struct gbUpdate* update; int localAccCount = 0; if (verboseLevel() > 1) { fprintf(stderr, "findEntries: num=%d", numAccs); if (flags & FE_FULL) fprintf(stderr, " full"); if (flags & FE_DAILY) fprintf(stderr, " daily"); fprintf(stderr, " %s", gbFmtSelect(type|orgCats)); if (versions != NULL) fprintf(stderr, " numVers=%d-%d", versions->minNum, versions->maxNum); if (modDates != NULL) fprintf(stderr, " numModDates=%d-%d", modDates->minNum, modDates->maxNum); fprintf(stderr, "\n"); } for (update = release->updates; (update != NULL) && (localAccCount < numAccs); update = update->next) { if ((update->isFull && (flags & FE_FULL)) || (!update->isFull && (flags & FE_DAILY))) findInUpdate(numAccs, type, release, versions, modDates, flags, orgCats, update, accTbl, &localAccCount); } (*accCount) += localAccCount; verbose(1, " found: %d entries\n", localAccCount); }
static void loadGbStatusRow(struct metaDataTbls* metaDataTbls, struct sqlConnection* conn, char** row, unsigned descOrgCats) /* load a row of the gbStatus table */ { struct metaData* md; int iRow = 0; boolean isOk; HGID seqId; /* columns: acc,version,modDate,type,srcDb,gbSeq,numAligns */ md = metaDataTblsGet(metaDataTbls, row[iRow++]); if (md->inGbStatus) gbError("%s: occurs multiple times in the gbStatus table", md->acc); md->inGbStatus = TRUE; md->gbsVersion = strToUnsigned(row[iRow++], md->acc, "gbStatus.version", NULL); isOk = TRUE; md->gbsModDate = gbParseChkDate(row[iRow++], &isOk); if (!isOk) gbError("%s: invalid gbStatus.moddate value: \"%s\"", md->acc, row[iRow-1]); md->gbsType = gbParseType(row[iRow++]); md->gbsSrcDb = gbParseSrcDb(row[iRow++]); md->gbsOrgCat = gbParseOrgCat(row[iRow++]); seqId = strToUnsigned(row[iRow++], md->acc, "gbStatus.gbSeq", NULL); md->gbsNumAligns = strToUnsigned(row[iRow++], md->acc, "gbStatus.numAligns", NULL); md->typeFlags |= md->gbsType; if (md->inGbCdnaInfo) { if (seqId != md->gbCdnaInfoId) gbError("%s: gbStatus.gbSeq (%d) not same gbCdnaInfo.id (%d)", md->acc, seqId, md->gbCdnaInfoId); if (md->gbsType != md->gbCdnaInfoType) gbError("%s: gbStatus.type (%s) not same as gbCdnaInfo.type (%s)", md->acc, gbFmtSelect(md->gbsType), gbFmtSelect(md->gbCdnaInfoType)); if (md->gbsSrcDb != (md->typeFlags & GB_SRC_DB_MASK)) gbError("%s: gbStatus.srcDb (%s) not same gbCdnaInfo.srcDb (%s)", md->acc, gbFmtSelect(md->gbsSrcDb), gbFmtSelect(md->typeFlags)); if (md->gbsVersion != md->gbCdnaInfoVersion) gbError("%s: gbStatus.version (%d) not same gbCdnaInfo.version (%d)", md->acc, md->gbsVersion, md->gbCdnaInfoVersion); if ((md->gbsModDate != md->gbCdnaInfoModdate)) gbError("%s: gbStatus.modDate (%s) not same gbCdnaInfo.moddate (%s)", md->acc, gbFormatDate(md->gbsModDate), gbFormatDate(md->gbCdnaInfoModdate)); /* verify either have or don't have a description */ if (descOrgCats & md->gbsOrgCat) { if (!md->haveDesc) gbError("%s: should have 
gbCdnaInfo.description: %s", md->acc, gbFmtSelect(md->gbsType|md->gbsOrgCat|md->gbsSrcDb)); } else { if (md->haveDesc) gbError("%s: should not have gbCdnaInfo.description: %s", md->acc, gbFmtSelect(md->gbsType|md->gbsOrgCat|md->gbsSrcDb)); } } }
void databaseUpdate(struct gbSelect* select) /* update the database from genbank state on disk */ { struct sqlConnection *conn = hAllocConn(gDatabase); struct gbStatusTbl* statusTbl; boolean maxShrinkageExceeded; char typePrefix[32], tmpDir[PATH_LEN]; gbVerbEnter(3, "update %s", gbSelectDesc(select)); /* Setup tmp dir for load, must be unique for each update due to * initialLoad feature */ if (select->accPrefix != NULL) safef(typePrefix, sizeof(typePrefix), "%s.%s", gbFmtSelect(select->type), select->accPrefix); else safef(typePrefix, sizeof(typePrefix), "%s", gbFmtSelect(select->type)); safef(tmpDir, sizeof(tmpDir), "%s/%s/%s/%s", gWorkDir, select->release->name, select->release->genome->database, typePrefix); if (!(gOptions.flags & DBLOAD_DRY_RUN)) gbMakeDirs(tmpDir); /* Build list of entries that need processed. This also flags updates that * have the change and new entries so we can limit the per-update processing. */ statusTbl = gbBuildState(conn, select, &gOptions, gMaxShrinkage, tmpDir, gbVerbose, FALSE, &maxShrinkageExceeded); if (maxShrinkageExceeded) { fprintf(stderr, "Warning: switching to dryRun mode due to maxShrinkage being exceeded\n"); gMaxShrinkageError = TRUE; gOptions.flags |= DBLOAD_DRY_RUN; } if (gOptions.flags & DBLOAD_DRY_RUN) { gbVerbLeave(3, "dry run, skipping update %s", gbSelectDesc(select)); gbStatusTblFree(&statusTbl); hFreeConn(&conn); return; } checkForStop(); /* last safe place */ /* count global number of extFileChgs */ gExtFileChged += statusTbl->numExtChg; /* first clean out old and changed */ deleteOutdated(conn, select, statusTbl, tmpDir); /* meta data MUST be done first, it sets some gbStatus data */ processMetaData(conn, select, statusTbl, tmpDir); processAligns(conn, select, statusTbl, tmpDir); /* now it's safe to update the status table, delay commit for initialLoad */ if (gOptions.flags & DBLOAD_INITIAL) slSafeAddHead(&gPendingStatusUpdates, gbStatusTblUpdate(statusTbl, conn, FALSE)); else gbStatusTblUpdate(statusTbl, conn, 
TRUE); /* add this and partition to the loaded table, if not already there. * set the extFile updated flag updates were done or this is the initial load */ updateLoadedTbl(select); if (gOptions.flags & DBLOAD_INITIAL) gbLoadedTblSetExtFileUpdated(gLoadedTbl, select); if ((gOptions.flags & DBLOAD_INITIAL) == 0) gbLoadedTblCommit(gLoadedTbl); /* print before freeing memory */ gbVerbLeave(3, "update %s", gbSelectDesc(select)); gbStatusTblFree(&statusTbl); hFreeConn(&conn); }