void refPepRepair(char *db, char *accFile, boolean dryRun)
/* Fix dangling refPep gbSeq entries in database db.  If accFile is not NULL,
 * the list loaded from it by slNameLoadReal() is passed to
 * brokenRefPepTblNew(); otherwise NULL is passed.  dryRun is handed on to
 * makeRepairs() and noted in the log message. */
{
struct sqlConnection *conn = sqlConnect(db);
struct brokenRefPepTbl *brpTbl;
struct extFileTbl* extFileTbl;
struct slName *accs = (accFile == NULL) ? NULL : slNameLoadReal(accFile);

if (!checkForRefLink(conn))
    {
    /* early exit: release everything acquired so far, including the
     * accession list that was already loaded */
    slFreeList(&accs);
    sqlDisconnect(&conn);
    return;
    }

gbVerbMsg(1, "%s: repairing refseq protein gbExtFile entries%s",
          sqlGetDatabase(conn), (dryRun? " (dry run)" : ""));

/* collect the information about what needs to be fixed */
extFileTbl = extFileTblLoad(conn);
brpTbl = brokenRefPepTblNew(conn, accs);
brokenRefPepGetSeqScan(conn, extFileTbl, brpTbl);
brokenRefPepGetMrnas(conn, brpTbl);
fillInFastaOffsets(brpTbl, conn, extFileTbl);

if (brpTbl->numToRepair > 0)
    makeRepairs(brpTbl, conn, extFileTbl, dryRun);
else
    gbVerbMsg(1, "%s: no refseq proteins to repair", sqlGetDatabase(conn));

brokenRefPepTblFree(&brpTbl);
extFileTblFree(&extFileTbl);
sqlDisconnect(&conn);
slFreeList(&accs);
}
static void getFastaOffsets(struct brokenRefPepTbl *brpTbl, struct sqlConnection *conn, struct extFileTbl* extFileTbl, char *faPath) /* parse fasta file to get offsets of proteins */ { struct gbFa *fa = gbFaOpen(faPath, "r"); char acc[GB_ACC_BUFSZ]; struct brokenRefPep *brp; HGID extId = extFileTblGet(extFileTbl, conn, faPath); gbVerbMsg(5, "scanning fasta: %s", faPath); while (gbFaReadNext(fa)) { gbVerbMsg(5, " %s: %lld", fa->id, (long long)fa->recOff); /* save only if same acecss, version, and file (to match mrna fa) */ short ver = gbSplitAccVer(fa->id, acc); brp = hashFindVal(brpTbl->protAccHash, acc); if ((brp != NULL) && (ver == brp->protVer) && sameString(faPath, brp->newFaPath)) { gbFaGetSeq(fa); /* force read of sequence data */ brp->newFaId = extId; brp->newFaOff = fa->recOff; brp->newSeqSize = fa->seqLen; brp->newRecSize = fa->off-fa->recOff; gbVerbMsg(5, " save: %s %lld for %lld\n", fa->id, (long long)fa->recOff, (long long)fa->off); } } gbFaClose(&fa); }
void deleteOutdated(struct sqlConnection *conn, struct gbSelect* select, struct gbStatusTbl* statusTbl, char* tmpDir) /* delete outdated alignments and metadata from the database. */ { gbVerbEnter(3, "delete outdated"); /* first the alignments */ gbVerbMsg(4, "delete outdated alignments"); gbAlignDataDeleteOutdated(gDatabase, conn, select, statusTbl, &gOptions, tmpDir); /* now drop metadata entries */ gbVerbMsg(4, "delete outdated metadata"); gbMetaDataDeleteOutdated(conn, select, statusTbl, &gOptions, tmpDir); /* Now it's safe to drop deleted entries from the database status table. */ gbVerbMsg(4, "delete outdated gbStatus"); gbStatusTblRemoveDeleted(statusTbl, conn); /* orphaned now become new */ statusTbl->newList = slCat(statusTbl->newList, statusTbl->orphanList); statusTbl->orphanList = NULL; statusTbl->numNew += statusTbl->numOrphan; statusTbl->numOrphan = 0; gbVerbLeave(3, "delete outdated"); }
void refPepList(char *db, FILE* outFh) /* list of sequences needing repair */ { struct sqlConnection *conn = sqlConnect(db); struct brokenRefPepTbl *brpTbl; struct hashCookie cookie; struct hashEl *hel; struct extFileTbl* extFileTbl = NULL; if (!checkForRefLink(conn)) { sqlDisconnect(&conn); return; } extFileTbl = extFileTblLoad(conn); brpTbl = brokenRefPepTblNew(conn, NULL); brokenRefPepGetSeqScan(conn, extFileTbl, brpTbl); brokenRefPepGetMrnas(conn, brpTbl); extFileTblFree(&extFileTbl); cookie = hashFirst(brpTbl->protAccHash); while ((hel = hashNext(&cookie)) != NULL) { struct brokenRefPep *brp = hel->val; fprintf(outFh, "%s\t%s\t%s\n", sqlGetDatabase(conn), brp->protAcc, (brp->mrnaAcc != NULL)? "repair" : "drop"); } gbVerbMsg(1, "%s: need to repair %d refseq protein gbExtFile entries", sqlGetDatabase(conn), brpTbl->numToRepair); gbVerbMsg(1, "%s: need to drop %d refseq protein gbExtFile entries", sqlGetDatabase(conn), brpTbl->numToDrop); }
int main(int argc, char* argv[]) { char *relName, *updateName, *typeAccPrefix, *database, *sep; struct gbIndex* index; struct gbSelect select; struct gbSelect* prevSelect = NULL; boolean noMigrate; ZeroVar(&select); optionInit(&argc, argv, optionSpecs); if (argc != 5) usage(); gWorkDir = optionVal("workdir", "work/align"); gSortTmp = optionVal("sortTmp", NULL); noMigrate = optionExists("noMigrate"); gbVerbInit(optionInt("verbose", 0)); relName = argv[1]; updateName = argv[2]; typeAccPrefix = argv[3]; database = argv[4]; /* parse typeAccPrefix */ sep = strchr(typeAccPrefix, '.'); if (sep != NULL) *sep = '\0'; select.type = gbParseType(typeAccPrefix); if (sep != NULL) { select.accPrefix = sep+1; *sep = '.'; } index = gbIndexNew(database, NULL); select.release = gbIndexMustFindRelease(index, relName); select.update = gbReleaseMustFindUpdate(select.release, updateName); select.orgCats = gbParseOrgCat(optionVal("orgCats", "native,xeno")); gbVerbMsg(0, "gbAlignInstall: %s/%s/%s/%s", select.release->name, select.release->genome->database, select.update->name, typeAccPrefix); /* Get the release to migrate, if applicable */ if (!noMigrate) prevSelect = gbAlignGetMigrateRel(&select); gbAlignInstall(&select, prevSelect); /* must go to stderr to be logged */ gbVerbMsg(0, "gbAlignInstall: complete"); gbIndexFree(&index); return 0; }
struct gbAlignInfo gbAlignGet(struct gbSelect* select, struct gbSelect* prevSelect) /* Build files to align in the work directory. If this is not a full release, * or there is no previously aligned release, prevSelect should be NULL. */ { struct gbAlignInfo alignInfo; gbVerbEnter(1, "gbAlignGet: %s", gbSelectDesc(select)); if (prevSelect != NULL) prevSelect->orgCats = select->orgCats; /* load the required entry data */ gbReleaseLoadProcessed(select); if (prevSelect != NULL) { gbReleaseLoadProcessed(prevSelect); gbReleaseLoadAligned(prevSelect); } /* select entries to align */ gbVerbEnter(2, "selecting seqs to align"); alignInfo = gbAlignFindNeedAligned(select, prevSelect); gbVerbLeave(2, "selecting seqs to align"); if (alignInfo.migrate.accTotalCnt > 0) gbVerbMsg(1, "gbAlignGet: %d %s entries, %d alignments will be migrated", alignInfo.migrate.accTotalCnt, gbFmtSelect(select->type), alignInfo.migrate.recTotalCnt); /* create fasta with sequences to align if not empty */ if (alignInfo.align.accTotalCnt > 0) { gbVerbMsg(1, "gbAlignGet: %d %s sequences will be align", alignInfo.align.accTotalCnt, gbFmtSelect(select->type)); copySelectedFasta(select); } /* leave calling cards */ if (select->orgCats & GB_NATIVE) markAligns(select, GB_NATIVE); if (select->orgCats & GB_XENO) markAligns(select, GB_XENO); /* print before releasing memory */ gbVerbLeave(1, "gbAlignGet: %s", gbSelectDesc(select)); /* unload entries to free memory */ gbReleaseUnload(select->release); if (prevSelect != NULL) gbReleaseUnload(prevSelect->release); return alignInfo; }
void loadSeqData(struct metaDataTbls* metaDataTbls, struct gbSelect* select,
                 struct sqlConnection* conn, boolean checkExtSeqRecs,
                 char* gbdbMapToCurrent)
/* Load seq table data; the gbCdnaInfo table should already be loaded.  For
 * refseq, refLink should also have been loaded.  cDNA rows are always
 * loaded; peptide rows only when the release's srcDb is GB_REFSEQ. */
{
struct extFileTbl* extFiles;
boolean isRefSeq;

gbVerbMsg(2, "load gbExtFile table data");
extFiles = extFileTblLoad(conn);

/* create the file-level hashes on first use */
if (missingExtFileIds == NULL)
    {
    missingExtFileIds = hashNew(16);
    missingExtFiles = hashNew(16);
    }

loadSeqCDnaData(metaDataTbls, select, conn, checkExtSeqRecs,
                gbdbMapToCurrent, extFiles);

isRefSeq = (select->release->srcDb == GB_REFSEQ);
if (isRefSeq)
    {
    loadSeqPepData(metaDataTbls, conn, checkExtSeqRecs, gbdbMapToCurrent,
                   extFiles);
    }

extFileTblFree(&extFiles);
}
static void loadGbStatus(struct metaDataTbls* metaDataTbls, struct gbSelect* select, unsigned descOrgCats, struct sqlConnection* conn) /* load the gbStatus table */ { char accWhere[64]; char query[512]; struct sqlResult* result; char** row; gbVerbMsg(2, "load gbStatus table data"); accWhere[0] = '\0'; if (select->accPrefix != NULL) safef(accWhere, sizeof(accWhere), " AND (acc LIKE '%s%%')", select->accPrefix); safef(query, sizeof(query), "SELECT acc,version,modDate,type,srcDb,orgCat,gbSeq,numAligns " "FROM gbStatus WHERE (type='%s') AND (srcDb='%s')%s", ((select->type == GB_MRNA) ? "mRNA" : "EST"), ((select->release->srcDb == GB_GENBANK) ? "GenBank" : "RefSeq"), accWhere); result = sqlGetResult(conn, query); while ((row = sqlNextRow(result)) != NULL) loadGbStatusRow(metaDataTbls, conn, row, descOrgCats); sqlFreeResult(&result); }
static void loadSeqCDnaData(struct metaDataTbls* metaDataTbls,
                            struct gbSelect* select,
                            struct sqlConnection* conn,
                            boolean checkExtSeqRecs, char* gbdbMapToCurrent,
                            struct extFileTbl* extFileTbl)
/* Load the gbSeq rows for the selected cDNA type and source database (and
 * accession prefix, if one is set), handing each row to loadSeqCDnaRow(). */
{
char prefixClause[64] = "";
char sql[512];
char *typeName = (select->type == GB_MRNA) ? "mRNA" : "EST";
char *srcDbName = (select->release->srcDb == GB_GENBANK) ? "GenBank" : "RefSeq";
struct sqlResult* rows;
char **row;

gbVerbMsg(2, "load gbSeq cDNA table data");

/* optional restriction to a single accession prefix */
if (select->accPrefix != NULL)
    {
    safef(prefixClause, sizeof(prefixClause), " AND (acc LIKE '%s%%')",
          select->accPrefix);
    }

safef(sql, sizeof(sql),
      "SELECT * FROM gbSeq WHERE (type='%s') AND (srcDb='%s')%s",
      typeName, srcDbName, prefixClause);

rows = sqlGetResult(conn, sql);
for (row = sqlNextRow(rows); row != NULL; row = sqlNextRow(rows))
    {
    loadSeqCDnaRow(metaDataTbls, extFileTbl, checkExtSeqRecs,
                   gbdbMapToCurrent, conn, row);
    }
sqlFreeResult(&rows);
}
static void loadGbCdnaInfoData(struct metaDataTbls* metaDataTbls, struct gbSelect* select, struct sqlConnection* conn) /* load the gbCdnaInfo table */ { char accWhere[64]; char query[512]; struct sqlResult* result; char** row; gbVerbMsg(2, "load gbCdnaInfo table data"); accWhere[0] = '\0'; if (select->accPrefix != NULL) safef(accWhere, sizeof(accWhere), " AND (acc LIKE '%s%%')", select->accPrefix); safef(query, sizeof(query), "SELECT acc,id,version,moddate,type,direction," /* 0 1 2 3 4 5 */ "source,organism,library,mrnaClone,sex,tissue,development,cell,cds," /* 6 7 8 9 10 11 12 13 14 */ "keyword,description,geneName,productName,author " /* 15 16 17 18 19 */ "FROM gbCdnaInfo WHERE (type='%s')%s", ((select->type == GB_MRNA) ? "mRNA" : "EST"), accWhere); /* mrna doesn't have a srcDb, so we guess from acc */ result = sqlGetResult(conn, query); while ((row = sqlNextRow(result)) != NULL) { if (gbGuessSrcDb(row[0]) == select->release->srcDb) loadGbCdnaInfoRow(metaDataTbls, conn, row); } sqlFreeResult(&result); }
int main(int argc, char* argv[]) { char *relName, *updateName, *typeAccPrefix, *database, *sep; struct gbIndex* index; struct gbSelect select; struct gbSelect* prevSelect = NULL; struct gbAlignInfo alignInfo; boolean noMigrate; ZeroVar(&select); optionInit(&argc, argv, optionSpecs); if (argc != 5) usage(); maxFaSize = optionInt("fasize", -1); workDir = optionVal("workdir", "work/align"); noMigrate = optionExists("noMigrate"); createPolyASizes = optionExists("polyASizes"); gbVerbInit(optionInt("verbose", 0)); relName = argv[1]; updateName = argv[2]; typeAccPrefix = argv[3]; database = argv[4]; /* parse typeAccPrefix */ sep = strchr(typeAccPrefix, '.'); if (sep != NULL) *sep = '\0'; select.type = gbParseType(typeAccPrefix); if (sep != NULL) { select.accPrefix = sep+1; *sep = '.'; } select.orgCats = gbParseOrgCat(optionVal("orgCats", "native,xeno")); index = gbIndexNew(database, NULL); select.release = gbIndexMustFindRelease(index, relName); select.update = gbReleaseMustFindUpdate(select.release, updateName); gbVerbMsg(0, "gbAlignGet: %s/%s/%s/%s", select.release->name, select.release->genome->database, select.update->name, typeAccPrefix); /* Get the release to migrate, if applicable */ if (!noMigrate) prevSelect = gbAlignGetMigrateRel(&select); alignInfo = gbAlignGet(&select, prevSelect); /* always print stats */ fprintf(stderr, "gbAlignGet: %s/%s/%s/%s: align=%d, migrate=%d\n", select.release->name, select.release->genome->database, select.update->name, typeAccPrefix, alignInfo.align.accTotalCnt, alignInfo.migrate.accTotalCnt); gbIndexFree(&index); /* print alignment and migrate count, which is read by the driver program */ printf("alignCnt: %d %d\n", alignInfo.align.accTotalCnt, alignInfo.migrate.accTotalCnt); return 0; }
static void makeRepairs(struct brokenRefPepTbl *brpTbl,
                        struct sqlConnection *conn,
                        struct extFileTbl* extFileTbl, boolean dryRun)
/* Make repairs once data is collected.  Each entry in brpTbl->protAccHash
 * is repaired when it has an mRNA accession and a non-negative new fasta
 * offset, and is otherwise dropped.  Unless dryRun is set, the accumulated
 * updates are committed and the accumulated deletes are applied to
 * SEQ_TBL. */
{
static char *tmpDir = "/var/tmp";
struct hashCookie cookie;
struct hashEl *hel;
int repairCnt = 0;
int dropCnt = 0;
struct seqTbl* seqTbl = seqTblNew(conn, tmpDir, (gbVerbose > 3));
struct sqlDeleter* seqTblDeleter = sqlDeleterNew(tmpDir, (gbVerbose > 3));

cookie = hashFirst(brpTbl->protAccHash);
while ((hel = hashNext(&cookie)) != NULL)
    {
    struct brokenRefPep *brp = hel->val;
    if ((brp->mrnaAcc != NULL) && (brp->newFaOff >= 0))
        {
        refPepRepairOne(conn, brp, seqTbl, extFileTbl, dryRun);
        repairCnt++;
        }
    else
        {
        refPepDropOne(conn, brp, seqTblDeleter, dryRun);
        dropCnt++;
        }
    }

if (dryRun)
    {
    gbVerbMsg(1, "%s: would have repaired %d refseq protein gbExtFile entries",
              sqlGetDatabase(conn), repairCnt);
    gbVerbMsg(1, "%s: would have dropped %d refseq protein gbExtFile entries",
              sqlGetDatabase(conn), dropCnt);
    }
else
    {
    seqTblCommit(seqTbl, conn);
    gbVerbMsg(1, "%s: repaired %d refseq protein gbExtFile entries",
              sqlGetDatabase(conn), repairCnt);
    sqlDeleterDel(seqTblDeleter, conn, SEQ_TBL, "acc");
    gbVerbMsg(1, "%s: dropped %d refseq protein gbExtFile entries",
              sqlGetDatabase(conn), dropCnt);
    }

/* release the helper objects created above; they were previously leaked.
 * NOTE(review): relies on sqlDeleterFree()/seqTblFree() being the
 * destructors paired with sqlDeleterNew()/seqTblNew() -- confirm */
sqlDeleterFree(&seqTblDeleter);
seqTblFree(&seqTbl);
}
static struct extFile* extFileGet(struct extFileTbl* extFileTbl, char *acc,
                                  int extFileId)
/* Look up the extFile with id extFileId on behalf of accession acc.
 * Returns NULL, after logging at verbose level 3, if the id is not in
 * the table. */
{
struct extFile* found = extFileTblFindById(extFileTbl, extFileId);

if (found != NULL)
    return found;

gbVerbMsg(3, "%s: gbExtFile id %d not in gbExtFile table", acc, extFileId);
return NULL;
}
static void loadRefSeqStatus(struct metaDataTbls* metaDataTbls,
                             struct sqlConnection* conn)
/* Read every (mrnaAcc, status) row of refSeqStatus and hand each one to
 * loadRefSeqStatusRow(). */
{
struct sqlResult* rows;
char** row;

gbVerbMsg(2, "load refSeqStatus table data");
rows = sqlGetResult(conn, "SELECT mrnaAcc,status FROM refSeqStatus");
for (row = sqlNextRow(rows); row != NULL; row = sqlNextRow(rows))
    loadRefSeqStatusRow(metaDataTbls, conn, row);
sqlFreeResult(&rows);
}
/* Check a location of a sequence against bounds of a faFile */ static boolean extFileChkBounds(char *protAcc, struct extFile* extFile, off_t faOff, unsigned recSize) { off_t faSize = fileSize(extFile->path); if (faSize < 0) { gbVerbMsg(3, "%s: extFile does not exist or is not readable: %s", protAcc, extFile->path); return FALSE; } if (faSize != extFile->size) { gbVerbMsg(3, "%s: extFile size (%lld) does match actual fasta file size (%lld): %s", protAcc, (long long)extFile->size, (long long)faSize, extFile->path); return FALSE; } if ((faOff+recSize) > faSize) { gbVerbMsg(3, "%s: fasta record end (%lld) does past end of (%lld): %s", protAcc, (long long)(faOff+recSize), (long long)faSize, extFile->path); return FALSE; } return TRUE; }
static void loadRefLink(struct metaDataTbls* metaDataTbls,
                        struct sqlConnection* conn)
/* Read every row of refLink (eight selected columns) and hand each one to
 * loadRefLinkRow(). */
{
struct sqlResult* rows;
char** row;

gbVerbMsg(2, "load relLink table data");
rows = sqlGetResult(conn,
                    "SELECT mrnaAcc,name,product,protAcc,geneName,"
                    "prodName,locusLinkId,omimId from refLink");
for (row = sqlNextRow(rows); row != NULL; row = sqlNextRow(rows))
    loadRefLinkRow(metaDataTbls, conn, row);
sqlFreeResult(&rows);
}
static struct sqlDeleter* buildReloadDeleter(char *reloadList, unsigned srcDb, char *tmpDir) /* read reload list, building a deleter for the specified source DB */ { struct sqlDeleter* deleter = NULL; struct lineFile *lf = gzLineFileOpen(reloadList); int cnt = 0; char *row[1]; while (lineFileChopNext(lf, row, ArraySize(row))) { char *acc = trimSpaces(row[0]); if (gbGuessSrcDb(acc) == srcDb) { if (deleter == NULL) deleter = sqlDeleterNew(tmpDir, (gbVerbose >= 4)); sqlDeleterAddAcc(deleter, acc); cnt++; gbVerbMsg(5, "%s delete for reloading", acc); } } gzLineFileClose(&lf); gbVerbMsg(1, "delete %d entries for reloading", cnt); return deleter; }
static void chkPsl(struct psl* psl, unsigned iRow, char* database, char* table, struct metaDataTbls* metaDataTbls, unsigned typeFlags) /* Validate a PSL of a mrna/est to genome alignment against the metadata. * Also count the number of alignments of a mrna. */ { unsigned chromSize = getChromSize(database, psl->tName); struct metaData* md = metaDataTblsFind(metaDataTbls, psl->qName); char pslDesc[128]; if (gbVerbose >= 3) gbVerbMsg(3, "chkPsl %s:%d %s %s", table, iRow, psl->qName, psl->tName); safef(pslDesc, sizeof(pslDesc), "psl %s.%s row %u", database, table, iRow); /* check that we have sequence info and compare sizes sizes */ if (chromSize == 0) gbError("%s: tName not a valid chromosome: \"%s\"", pslDesc, psl->tName); else if (chromSize != psl->tSize) gbError("%s: tSize %u != chromosome %s size %u", pslDesc, psl->tSize, psl->tName, chromSize); if (md == NULL) gbError("%s: qName not in mrna table as type %s: \"%s\"", pslDesc, gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName); else if (md->inSeq) { if (!md->inGbIndex) gbError("%s: qName not in gbIndex as type %s: \"%s\"" " (Note: this can be caused by GenBank entries that were changed from type mRNA to other RNA types)", pslDesc, gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName); else { if (typeFlags != md->typeFlags) gbError("%s: alignment for %s type %s doesn't match expected %s", pslDesc, psl->qName, gbFmtSelect(md->typeFlags), gbFmtSelect(typeFlags)); } if (md->seqSize != psl->qSize) gbError("%s: qSize %u != %s size %u", pslDesc, psl->qSize, psl->qName, md->seqSize); md->numAligns++; } /* validate consistency of PSL */ if (pslCheck(pslDesc, stderr, psl)) errorCnt++; }
void sqlDeleterDel(struct sqlDeleter* sd, struct sqlConnection *conn,
                   char* table, char* column)
/* Delete the rows of table whose column value is among the keys held by sd.
 * Nothing is done when sd holds no keys or the table does not exist. */
{
if ((sd->accCount <= 0) || !sqlTableExists(conn, table))
    return;

/* logged at the current verbosity level, i.e. whenever sd->verbose is set */
if (sd->verbose)
    gbVerbMsg(gbVerbose, "deleting %d keys from %s", sd->accCount, table);

#ifdef COPY_TO_DELETE_HACK
if (sd->useDeleteJoin)
    {
    deleteJoin(sd, conn, table, column);
    return;
    }
#endif
deleteDirect(sd, conn, table, column);
}
static HGID getExtFileId(struct sqlConnection *conn, char* relPath)
/* Get the extFile id for a file.  When gGbdbGenBank is non-empty it is
 * prepended to relPath with a '/' separator; otherwise relPath is used as
 * is.  The extFile table is loaded into extFiles on first use. */
{
char path[PATH_LEN];

/* build the path with a bounded formatter rather than strcpy()/strcat(),
 * which could write past the end of path for long inputs.
 * NOTE(review): safef() is presumed to abort rather than truncate when
 * the result does not fit -- confirm */
if (gGbdbGenBank[0] != '\0')
    safef(path, sizeof(path), "%s/%s", gGbdbGenBank, relPath);
else
    safef(path, sizeof(path), "%s", relPath);

if (extFiles == NULL)
    {
    gbVerbMsg(4, "loading extFile table");
    extFiles = extFileTblLoad(conn);
    }
return extFileTblGet(extFiles, conn, path);
}
static void loadSeqPepData(struct metaDataTbls* metaDataTbls,
                           struct sqlConnection* conn,
                           boolean checkExtSeqRecs, char* gbdbMapToCurrent,
                           struct extFileTbl* extFileTbl)
/* Load the RefSeq peptide rows of gbSeq, handing each one to
 * loadSeqPepRow(). */
{
struct sqlResult* rows;
char **row;

gbVerbMsg(2, "load gbSeq peptide table data");

/* the query has no variable parts, so it is passed as a literal */
rows = sqlGetResult(conn,
                    "SELECT * FROM gbSeq WHERE (type='PEP') AND (srcDb='RefSeq')");
for (row = sqlNextRow(rows); row != NULL; row = sqlNextRow(rows))
    {
    loadSeqPepRow(metaDataTbls, extFileTbl, checkExtSeqRecs,
                  gbdbMapToCurrent, conn, row);
    }
sqlFreeResult(&rows);
}
static void chkGenePred(struct genePred* gene, char *geneName, unsigned iRow, char* database, char* table, struct metaDataTbls* metaDataTbls, unsigned typeFlags) /* Validate a genePred of a refSeq to genome alignment against the metadata. * Also count the number of alignments, and check the geneName, if available */ { char desc[512]; unsigned chromSize = getChromSize(database, gene->chrom); struct metaData* md = metaDataTblsFind(metaDataTbls, gene->name); if (gbVerbose >= 3) gbVerbMsg(3, "chkGenePred %s:%d %s %s", table, iRow, gene->name, gene->chrom); safef(desc, sizeof(desc), "gene %s.%s:%u %s %s", database, table, iRow, gene->name, gene->chrom); /* basic sanity checks */ if (genePredCheck(desc, stderr, chromSize, gene)) errorCnt++; /* check if in mrna table */ if (md == NULL) gbError("%s: %s in not in mrna table", desc, gene->name); else { if (typeFlags != md->typeFlags) gbError("%s: alignment of %s type %s doesn't match expected %s", desc, gene->name, gbFmtSelect(md->typeFlags), gbFmtSelect(typeFlags)); md->numAligns++; } /* check gene name */ if ((md != NULL) && (geneName != NULL)) { char* rlName = (md->rlName == NULL) ? "" : md->rlName; if (!sameString(geneName, rlName)) gbError("%s: %s geneName \"%s\" does not match refLink name \"%s\"", desc, gene->name, geneName, rlName); } }
static struct sqlDeleter* buildIgnoredDeleters(struct sqlConnection *conn,
                                               struct gbRelease* release,
                                               boolean force, char* workDir)
/* Build a deleter holding the ignored accessions of release that are in
 * gbStatus, or all ignored accessions when force is set.  The deleter is
 * only created once an accession qualifies, so NULL is returned when none
 * do. */
{
struct sqlDeleter* ignoreDeleter = NULL;
struct hashCookie scan;
struct hashEl* bucket;
char ignoreTmpDir[PATH_LEN];

/* Need to force load of ignore table, as release might not be initialized
 * yet */
gbReleaseLoadIgnore(release);

safef(ignoreTmpDir, sizeof(ignoreTmpDir), "%s/ignore", workDir);

/* each hash element carries a list of ignore records; walk them all */
scan = hashFirst(release->ignore->accHash);
for (bucket = hashNext(&scan); bucket != NULL; bucket = hashNext(&scan))
    {
    struct gbIgnoreAcc* ignored;
    for (ignored = bucket->val; ignored != NULL; ignored = ignored->next)
        {
        if (!(force || inGbStatusTable(conn, ignored->acc, ignored->modDate)))
            continue;
        if (ignoreDeleter == NULL)
            ignoreDeleter = sqlDeleterNew(ignoreTmpDir, (gbVerbose >= 4));
        sqlDeleterAddAcc(ignoreDeleter, ignored->acc);
        gbVerbMsg(4, "%s %s ignored, will delete", ignored->acc,
                  gbFormatDate(ignored->modDate));
        }
    }
return ignoreDeleter;
}
struct gbStatusTbl* gbBuildState(struct sqlConnection *conn,
                                 struct gbSelect* select,
                                 struct dbLoadOptions* options,
                                 float maxShrinkage,
                                 char* tmpDir,
                                 int verboseLevel,
                                 boolean extFileUpdate,
                                 boolean* maxShrinkageExceeded)
/* Load the status table and determine the state of all genbank entries in
 * the release compared to the database.  *maxShrinkageExceeded is set to
 * TRUE when the shrinkage check fails, in which case the orphan and type
 * change checks are skipped.  Sets the file-level gOptions, gbVerbose,
 * gErrorCnt and loadNonCoding variables as a side effect. */
{
struct gbStatusTbl* statusTbl;
struct selectStatusData ssData;
unsigned selectFlags = (select->type | select->release->srcDb);
boolean largeDeletesOk;

ZeroVar(&ssData);
gOptions = options;
*maxShrinkageExceeded = FALSE;
gbVerbose = verboseLevel;
gErrorCnt = 0;

loadNonCoding = dbLoadNonCoding(sqlGetDatabase(conn), select);
if (loadNonCoding)
    gbVerbMsg(1, "NOTE: loading non-coding");

gbVerbEnter(3, "build state table");

gbVerbMsg(4, "reading gbSeq accessions");
ssData.select = select;
ssData.seqHash = seqTblLoadAcc(conn, select);

gbVerbMsg(4, "reading gbStatus");
statusTbl = gbStatusTblSelectLoad(conn, selectFlags, select->accPrefix,
                                  selectStatus, &ssData, tmpDir,
                                  extFileUpdate, (gbVerbose >= 4));
findNewEntries(select, statusTbl);

largeDeletesOk = ((gOptions->flags & DBLOAD_LARGE_DELETES) != 0);

/* Don't allow deletes when select criteria has changed */
if (!largeDeletesOk && (ssData.orgCatDelCnt > 0))
    errAbort("%u entries deleted due to organism category no longer being selected, specify -allowLargeDeletes to override",
             ssData.orgCatDelCnt);

/* check shrinkage unless override */
if (!largeDeletesOk && !checkShrinkage(select, maxShrinkage, statusTbl))
    *maxShrinkageExceeded = TRUE;

/* don't do other setup if we are going to stop on maxShrinkageExceeded */
if (!(*maxShrinkageExceeded))
    {
    gbVerbMsg(4, "checking for orphans");
    findOrphans(conn, select, ssData.seqHash, statusTbl);

    if (!(gOptions->flags & DBLOAD_INITIAL))
        {
        gbVerbMsg(4, "checking for type change");
        checkForTypeChange(conn, select, statusTbl);
        }
    }

#ifdef DUMP_HASH_STATS
hashPrintStats(ssData.seqHash, "stateSeq", stderr);
#endif
hashFree(&ssData.seqHash);

gbVerbLeave(3, "build state table");

/* always print stats */
fprintf(stderr, "gbLoadRna: selected %s: delete=%u seqChg=%u metaChg=%u extChg=%u new=%u orphan=%u derived=%u noChg=%u\n",
        gbSelectDesc(select), statusTbl->numDelete, statusTbl->numSeqChg,
        statusTbl->numMetaChg, statusTbl->numExtChg, statusTbl->numNew,
        statusTbl->numOrphan, statusTbl->numRebuildDerived,
        statusTbl->numNoChg);

/* this doesn't include large delete errors */
if (gErrorCnt > 0)
    errAbort("Errors detecting when constructing state table");
return statusTbl;
}
/* Check a protein sequence, return FALSE if there is some reason it can't be * obtained or doesn't match */ static boolean faCheckProtRec(char *protAcc, short protVer, struct extFile* extFile, off_t faOff, unsigned seqSize, unsigned recSize) { static const int extraBytes = 8; /* extra bytes to read to allow checking next record */ int askSize = recSize+extraBytes; int readSize; char *faBuf, *p, gotAcc[GB_ACC_BUFSZ]; short gotVer; struct dnaSeq *protSeq; FILE *fh = mustOpen(extFile->path, "r"); /* bounds have already been check; so error if we can read the bytes */ if (fseeko(fh, faOff, SEEK_SET) < 0) errnoAbort("%s: can't seek to %lld in %s", protAcc, (long long)faOff, extFile->path); faBuf = needMem(askSize+1); readSize = fread(faBuf, 1, askSize, fh); if (readSize < 0) errnoAbort("%s: read failed at %lld in %s", protAcc, (long long)faOff, extFile->path); if (readSize < recSize) errAbort("%s: can't read %d bytes at %lld in %s", protAcc, recSize, (long long)faOff, extFile->path); carefulClose(&fh); faBuf[readSize] = '\0'; /* check that it starts with a '>' and that there are no extra bases after the * end of sequence */ if (faBuf[0] != '>') { gbVerbMsg(3, "%s: fasta record at %lld does not start with a '>': %s", protAcc, (long long)faOff, extFile->path); freeMem(faBuf); return FALSE; } p = skipLeadingSpaces(faBuf+recSize); if (!((*p == '>') || (*p == '\0'))) { gbVerbMsg(3, "%s: fasta record at %lld for %d has extra characters following the record: %s", protAcc, (long long)faOff, recSize, extFile->path); freeMem(faBuf); return FALSE; } protSeq = faSeqFromMemText(faBuf, FALSE); gotVer = gbSplitAccVer(protSeq->name, gotAcc); if (!(sameString(gotAcc, protAcc) && (gotVer == protVer))) { gbVerbMsg(3, "%s: expected sequence %s.%d, found %s.%d in fasta record at %lld : %s", protAcc, protAcc, protVer, gotAcc, gotVer, (long long)faOff, extFile->path); dnaSeqFree(&protSeq); return FALSE; } if (protSeq->size != seqSize) { gbVerbMsg(3, "%s: expected sequence of %d chars, got %d 
from fasta record at %lld : %s", protAcc, seqSize, protSeq->size, (long long)faOff, extFile->path); dnaSeqFree(&protSeq); return FALSE; } dnaSeqFree(&protSeq); return TRUE; }