Exemplo n.º 1
0
void refPepRepair(char *db,
                  char *accFile,
                  boolean dryRun)
/* Fix dangling refseq protein gbSeq entries in database db.  If accFile is
 * not NULL, the accession list loaded from it is handed to
 * brokenRefPepTblNew; otherwise NULL is passed.  dryRun is forwarded to
 * makeRepairs.  Returns without doing anything if checkForRefLink fails. */
{
struct sqlConnection *conn = sqlConnect(db);
struct brokenRefPepTbl *brpTbl;
struct extFileTbl* extFileTbl;
struct slName *accs = (accFile == NULL) ? NULL : slNameLoadReal(accFile);
if (!checkForRefLink(conn))
    {
    /* release everything acquired so far; the accession list used to be
     * leaked on this path */
    slFreeList(&accs);
    sqlDisconnect(&conn);
    return;
    }

gbVerbMsg(1, "%s: repairing refseq protein gbExtFile entries%s",
          sqlGetDatabase(conn), (dryRun? " (dry run)" : ""));

/* collect the set of broken proteins and the mRNAs they belong to */
extFileTbl = extFileTblLoad(conn);
brpTbl = brokenRefPepTblNew(conn, accs);
brokenRefPepGetSeqScan(conn, extFileTbl, brpTbl);
brokenRefPepGetMrnas(conn, brpTbl);

fillInFastaOffsets(brpTbl, conn, extFileTbl);
if (brpTbl->numToRepair > 0)
    makeRepairs(brpTbl, conn, extFileTbl, dryRun);
else
    gbVerbMsg(1, "%s: no refseq proteins to repair", sqlGetDatabase(conn));

/* cleanup */
brokenRefPepTblFree(&brpTbl);
extFileTblFree(&extFileTbl);
sqlDisconnect(&conn);
slFreeList(&accs);
}
Exemplo n.º 2
0
static void getFastaOffsets(struct brokenRefPepTbl *brpTbl,
                            struct sqlConnection *conn,
                            struct extFileTbl* extFileTbl,
                            char *faPath)
/* Scan the fasta file faPath and, for every record whose accession is in
 * brpTbl->protAccHash with a matching version and newFaPath, record the
 * ext file id, record offset, sequence length and record size. */
{
char acc[GB_ACC_BUFSZ];
struct gbFa *fa = gbFaOpen(faPath, "r");
HGID extId = extFileTblGet(extFileTbl, conn, faPath);

gbVerbMsg(5, "scanning fasta: %s", faPath);
while (gbFaReadNext(fa))
    {
    short ver;
    struct brokenRefPep *brp;

    gbVerbMsg(5, "   %s: %lld", fa->id, (long long)fa->recOff);
    ver = gbSplitAccVer(fa->id, acc);
    brp = hashFindVal(brpTbl->protAccHash, acc);

    /* keep only records with the same accession, version, and file
     * (to match the mrna fasta) */
    if (brp == NULL)
        continue;
    if (ver != brp->protVer)
        continue;
    if (!sameString(faPath, brp->newFaPath))
        continue;

    gbFaGetSeq(fa); /* force read of sequence data */
    brp->newFaId = extId;
    brp->newFaOff = fa->recOff;
    brp->newSeqSize = fa->seqLen;
    brp->newRecSize = fa->off-fa->recOff;
    gbVerbMsg(5, "      save: %s %lld for %lld\n", fa->id, (long long)fa->recOff, (long long)fa->off);
    }
gbFaClose(&fa);
}
void deleteOutdated(struct sqlConnection *conn, struct gbSelect* select,
                    struct gbStatusTbl* statusTbl, char* tmpDir)
/* Remove outdated data from the database in three steps: alignments,
 * metadata, then the deleted gbStatus rows.  Afterwards the orphan list of
 * statusTbl is folded into its new list. */
{
gbVerbEnter(3, "delete outdated");

/* step 1: alignments */
gbVerbMsg(4, "delete outdated alignments");
gbAlignDataDeleteOutdated(gDatabase, conn, select, statusTbl, &gOptions, tmpDir);

/* step 2: metadata entries */
gbVerbMsg(4, "delete outdated metadata");
gbMetaDataDeleteOutdated(conn, select, statusTbl, &gOptions, tmpDir);

/* step 3: only after the above is it safe to drop deleted entries from
 * the database status table */
gbVerbMsg(4, "delete outdated gbStatus");
gbStatusTblRemoveDeleted(statusTbl, conn);

/* orphans are treated as new from here on: move the count and the list */
statusTbl->numNew += statusTbl->numOrphan;
statusTbl->numOrphan = 0;
statusTbl->newList = slCat(statusTbl->newList, statusTbl->orphanList);
statusTbl->orphanList = NULL;

gbVerbLeave(3, "delete outdated");
}
Exemplo n.º 4
0
void refPepList(char *db,
                FILE* outFh)
/* List the refseq proteins needing repair.  One tab-separated line per
 * protein is written to outFh: database, protein accession, and "repair"
 * (an mRNA accession is known) or "drop" (it is not).  Summary counts are
 * reported through gbVerbMsg.  Returns without output if checkForRefLink
 * fails. */
{
struct sqlConnection *conn = sqlConnect(db);
struct brokenRefPepTbl *brpTbl;
struct hashCookie cookie;
struct hashEl *hel;
struct extFileTbl* extFileTbl = NULL;

if (!checkForRefLink(conn))
    {
    sqlDisconnect(&conn);
    return;
    }

/* the ext file table is only needed while collecting the broken entries */
extFileTbl = extFileTblLoad(conn);
brpTbl = brokenRefPepTblNew(conn, NULL);
brokenRefPepGetSeqScan(conn, extFileTbl, brpTbl);
brokenRefPepGetMrnas(conn, brpTbl);
extFileTblFree(&extFileTbl);

cookie = hashFirst(brpTbl->protAccHash);
while ((hel = hashNext(&cookie)) != NULL)
    {
    struct brokenRefPep *brp = hel->val;
    fprintf(outFh, "%s\t%s\t%s\n", sqlGetDatabase(conn), brp->protAcc, (brp->mrnaAcc != NULL)? "repair" : "drop");
    }
gbVerbMsg(1, "%s: need to repair %d refseq protein gbExtFile entries",
          sqlGetDatabase(conn), brpTbl->numToRepair);
gbVerbMsg(1, "%s: need to drop %d refseq protein gbExtFile entries",
          sqlGetDatabase(conn), brpTbl->numToDrop);

/* release the table and the connection; both used to be leaked on this
 * path */
brokenRefPepTblFree(&brpTbl);
sqlDisconnect(&conn);
}
Exemplo n.º 5
0
int main(int argc, char* argv[])
/* gbAlignInstall entry point.  Expects four positional arguments after
 * option parsing: release name, update name, type[.accPrefix], database. */
{
struct gbSelect select;
struct gbSelect* prevSelect = NULL;
struct gbIndex* index;
char *relName, *updateName, *typeAccPrefix, *database;
char *dot;
boolean noMigrate;

ZeroVar(&select);
optionInit(&argc, argv, optionSpecs);
if (argc != 5)
    usage();

/* options */
gWorkDir = optionVal("workdir", "work/align");
gSortTmp = optionVal("sortTmp", NULL);
noMigrate = optionExists("noMigrate");
gbVerbInit(optionInt("verbose", 0));

/* positional arguments */
relName = argv[1];
updateName = argv[2];
typeAccPrefix = argv[3];
database = argv[4];

/* typeAccPrefix is "type" or "type.accPrefix"; the string is split at the
 * first '.' only while the type is parsed, then restored */
dot = strchr(typeAccPrefix, '.');
if (dot == NULL)
    select.type = gbParseType(typeAccPrefix);
else
    {
    *dot = '\0';
    select.type = gbParseType(typeAccPrefix);
    select.accPrefix = dot+1;
    *dot = '.';
    }

index = gbIndexNew(database, NULL);
select.release = gbIndexMustFindRelease(index, relName);
select.update = gbReleaseMustFindUpdate(select.release, updateName);
select.orgCats = gbParseOrgCat(optionVal("orgCats", "native,xeno"));

gbVerbMsg(0, "gbAlignInstall: %s/%s/%s/%s", select.release->name,
          select.release->genome->database, select.update->name,
          typeAccPrefix);

/* a previous release is only looked up when migration is enabled */
if (!noMigrate)
    prevSelect = gbAlignGetMigrateRel(&select);

gbAlignInstall(&select, prevSelect);

/* must go to stderr to be logged */
gbVerbMsg(0, "gbAlignInstall: complete");

gbIndexFree(&index);
return 0;
}
struct gbAlignInfo gbAlignGet(struct gbSelect* select,
                              struct gbSelect* prevSelect)
/* Build files to align in the work directory.  If this is not a full release,
 * or there is no previously aligned release, prevSelect should be NULL.
 */
{
struct gbAlignInfo alignInfo;

gbVerbEnter(1, "gbAlignGet: %s", gbSelectDesc(select));
if (prevSelect != NULL)
    prevSelect->orgCats = select->orgCats;

/* load the required entry data */
gbReleaseLoadProcessed(select);
if (prevSelect != NULL)
    {
    gbReleaseLoadProcessed(prevSelect);
    gbReleaseLoadAligned(prevSelect);
    }

/* select entries to align */
gbVerbEnter(2, "selecting seqs to align");
alignInfo = gbAlignFindNeedAligned(select, prevSelect);
gbVerbLeave(2, "selecting seqs to align");

if (alignInfo.migrate.accTotalCnt > 0)
    gbVerbMsg(1, "gbAlignGet: %d %s entries, %d alignments will be migrated",
              alignInfo.migrate.accTotalCnt, gbFmtSelect(select->type),
              alignInfo.migrate.recTotalCnt);

/* create fasta with sequences to align if not empty */
if (alignInfo.align.accTotalCnt > 0)
    {
    gbVerbMsg(1, "gbAlignGet: %d %s sequences will be align",
              alignInfo.align.accTotalCnt, gbFmtSelect(select->type));
    copySelectedFasta(select);
    }

/* leave calling cards */
if (select->orgCats & GB_NATIVE)
    markAligns(select, GB_NATIVE);
if (select->orgCats & GB_XENO)
    markAligns(select, GB_XENO);

/* print before releasing memory */
gbVerbLeave(1, "gbAlignGet: %s", gbSelectDesc(select));

/* unload entries to free memory */
gbReleaseUnload(select->release);
if (prevSelect != NULL)
    gbReleaseUnload(prevSelect->release);
return alignInfo;
}
void loadSeqData(struct metaDataTbls* metaDataTbls,
                 struct gbSelect* select, struct sqlConnection* conn,
                 boolean checkExtSeqRecs, char* gbdbMapToCurrent)
/* Load seq table data.  The gbCdnaInfo table should already be loaded and,
 * for refseq, refLink as well.  cDNA rows are always loaded; peptide rows
 * only when the release source is RefSeq. */
{
struct extFileTbl* extFiles;
boolean isRefSeq;

gbVerbMsg(2,  "load gbExtFile table data");
extFiles = extFileTblLoad(conn);

/* the two missing-file hashes are globals, created once on first use */
if (missingExtFileIds == NULL)
    {
    missingExtFileIds = hashNew(16);
    missingExtFiles = hashNew(16);
    }

loadSeqCDnaData(metaDataTbls,select, conn, checkExtSeqRecs, gbdbMapToCurrent,
                extFiles);
isRefSeq = (select->release->srcDb == GB_REFSEQ);
if (isRefSeq)
    {
    loadSeqPepData(metaDataTbls, conn, checkExtSeqRecs, gbdbMapToCurrent,
                   extFiles);
    }

extFileTblFree(&extFiles);
}
static void loadGbStatus(struct metaDataTbls* metaDataTbls,
                         struct gbSelect* select, 
                         unsigned descOrgCats,
                         struct sqlConnection* conn)
/* Load the gbStatus rows matching the selected type and source database
 * (and accession prefix, if one is set), passing each row to
 * loadGbStatusRow. */
{
char *typeName = (select->type == GB_MRNA) ? "mRNA" : "EST";
char *srcDbName = (select->release->srcDb == GB_GENBANK) ? "GenBank" : "RefSeq";
char accWhere[64] = "";
char query[512];
struct sqlResult* result;
char** row;

gbVerbMsg(2, "load gbStatus table data");

/* optional accession-prefix restriction appended to the WHERE clause */
if (select->accPrefix != NULL)
    {
    safef(accWhere, sizeof(accWhere), " AND (acc LIKE '%s%%')",
          select->accPrefix);
    }
safef(query, sizeof(query), 
      "SELECT acc,version,modDate,type,srcDb,orgCat,gbSeq,numAligns "
      "FROM gbStatus WHERE (type='%s') AND (srcDb='%s')%s",
      typeName, srcDbName, accWhere);

result = sqlGetResult(conn, query);
for (row = sqlNextRow(result); row != NULL; row = sqlNextRow(result))
    loadGbStatusRow(metaDataTbls, conn, row, descOrgCats);
sqlFreeResult(&result);
}
static void loadSeqCDnaData(struct metaDataTbls* metaDataTbls,
                            struct gbSelect* select, struct sqlConnection* conn,
                            boolean checkExtSeqRecs, char* gbdbMapToCurrent,
                            struct extFileTbl* extFileTbl)
/* Load the gbSeq rows for the selected cDNA type and source database (and
 * accession prefix, if one is set), passing each row to loadSeqCDnaRow. */
{
char *typeName = (select->type == GB_MRNA) ? "mRNA" : "EST";
char *srcDbName = (select->release->srcDb == GB_GENBANK) ? "GenBank" : "RefSeq";
char accWhere[64] = "";
char query[512];
struct sqlResult* result;
char **row;

gbVerbMsg(2, "load gbSeq cDNA table data");

/* optional accession-prefix restriction appended to the WHERE clause */
if (select->accPrefix != NULL)
    {
    safef(accWhere, sizeof(accWhere), " AND (acc LIKE '%s%%')",
          select->accPrefix);
    }
safef(query, sizeof(query), 
      "SELECT * FROM gbSeq WHERE (type='%s') AND (srcDb='%s')%s",
      typeName, srcDbName, accWhere);

result = sqlGetResult(conn, query);
for (row = sqlNextRow(result); row != NULL; row = sqlNextRow(result))
    {
    loadSeqCDnaRow(metaDataTbls, extFileTbl, checkExtSeqRecs,gbdbMapToCurrent,
                   conn, row);
    }
sqlFreeResult(&result);
}
static void loadGbCdnaInfoData(struct metaDataTbls* metaDataTbls,
                               struct gbSelect* select, struct sqlConnection* conn)
/* Load the gbCdnaInfo rows of the selected type (and accession prefix, if
 * one is set).  Rows whose accession does not guess to the selected source
 * database are skipped; the rest go to loadGbCdnaInfoRow. */
{
char *typeName = (select->type == GB_MRNA) ? "mRNA" : "EST";
char accWhere[64] = "";
char query[512];
struct sqlResult* result;
char** row;

gbVerbMsg(2, "load gbCdnaInfo table data");

/* optional accession-prefix restriction appended to the WHERE clause */
if (select->accPrefix != NULL)
    {
    safef(accWhere, sizeof(accWhere), " AND (acc LIKE '%s%%')",
          select->accPrefix);
    }

/* selected columns, in row[] index order:
 *   0 acc       1 id           2 version   3 moddate      4 type
 *   5 direction 6 source       7 organism  8 library      9 mrnaClone
 *  10 sex      11 tissue      12 development 13 cell     14 cds
 *  15 keyword  16 description 17 geneName 18 productName 19 author */
safef(query, sizeof(query), 
      "SELECT acc,id,version,moddate,type,direction,"
      "source,organism,library,mrnaClone,sex,tissue,development,cell,cds,"
      "keyword,description,geneName,productName,author "
      "FROM gbCdnaInfo WHERE (type='%s')%s",
      typeName, accWhere);

/* this query has no srcDb filter, so the source is guessed from the acc */
result = sqlGetResult(conn, query);
for (row = sqlNextRow(result); row != NULL; row = sqlNextRow(result))
    {
    if (gbGuessSrcDb(row[0]) != select->release->srcDb)
        continue;
    loadGbCdnaInfoRow(metaDataTbls, conn, row);
    }
sqlFreeResult(&result);
}
int main(int argc, char* argv[])
/* gbAlignGet entry point.  Expects four positional arguments after option
 * parsing: release name, update name, type[.accPrefix], database.  Prints
 * statistics to stderr and an "alignCnt:" line to stdout. */
{
struct gbSelect select;
struct gbSelect* prevSelect = NULL;
struct gbAlignInfo alignInfo;
struct gbIndex* index;
char *relName, *updateName, *typeAccPrefix, *database;
char *dot;
boolean noMigrate;

ZeroVar(&select);
optionInit(&argc, argv, optionSpecs);
if (argc != 5)
    usage();

/* options */
maxFaSize = optionInt("fasize", -1);
workDir = optionVal("workdir", "work/align");
noMigrate = optionExists("noMigrate");
createPolyASizes = optionExists("polyASizes");
gbVerbInit(optionInt("verbose", 0));

/* positional arguments */
relName = argv[1];
updateName = argv[2];
typeAccPrefix = argv[3];
database = argv[4];

/* typeAccPrefix is "type" or "type.accPrefix"; the string is split at the
 * first '.' only while the type is parsed, then restored */
dot = strchr(typeAccPrefix, '.');
if (dot == NULL)
    select.type = gbParseType(typeAccPrefix);
else
    {
    *dot = '\0';
    select.type = gbParseType(typeAccPrefix);
    select.accPrefix = dot+1;
    *dot = '.';
    }
select.orgCats = gbParseOrgCat(optionVal("orgCats", "native,xeno"));

index = gbIndexNew(database, NULL);
select.release = gbIndexMustFindRelease(index, relName);
select.update = gbReleaseMustFindUpdate(select.release, updateName);
gbVerbMsg(0, "gbAlignGet: %s/%s/%s/%s", select.release->name,
          select.release->genome->database, select.update->name,
          typeAccPrefix);

/* a previous release is only looked up when migration is enabled */
if (!noMigrate)
    prevSelect = gbAlignGetMigrateRel(&select);

alignInfo = gbAlignGet(&select, prevSelect);

/* statistics are printed regardless of the verbose level */
fprintf(stderr, "gbAlignGet: %s/%s/%s/%s: align=%d, migrate=%d\n",
        select.release->name, select.release->genome->database,
        select.update->name, typeAccPrefix,
        alignInfo.align.accTotalCnt, alignInfo.migrate.accTotalCnt);
gbIndexFree(&index);

/* print alignment and migrate count, which is read by the driver program */
printf("alignCnt: %d %d\n", alignInfo.align.accTotalCnt, alignInfo.migrate.accTotalCnt);
return 0;
}
Exemplo n.º 12
0
static void makeRepairs(struct brokenRefPepTbl *brpTbl,
                        struct sqlConnection *conn,
                        struct extFileTbl* extFileTbl,
                        boolean dryRun)
/* Apply the collected repairs.  Every entry in brpTbl->protAccHash is
 * either repaired (mRNA accession known and a fasta offset >= 0) or
 * dropped.  Unless dryRun is set, the seq table changes are committed and
 * the drops are deleted from SEQ_TBL; counts are reported either way. */
{
static char *tmpDir = "/var/tmp";
int numRepaired = 0;
int numDropped = 0;
struct hashCookie cookie;
struct hashEl *hel;
/* NOTE(review): neither seqTbl nor seqTblDeleter is released in this
 * function -- confirm whether that is intentional */
struct seqTbl* seqTbl = seqTblNew(conn, tmpDir, (gbVerbose > 3));
struct sqlDeleter* seqTblDeleter = sqlDeleterNew(tmpDir, (gbVerbose > 3));

cookie = hashFirst(brpTbl->protAccHash);
for (hel = hashNext(&cookie); hel != NULL; hel = hashNext(&cookie))
    {
    struct brokenRefPep *brp = hel->val;
    if ((brp->mrnaAcc == NULL) || (brp->newFaOff < 0))
        {
        refPepDropOne(conn, brp, seqTblDeleter, dryRun);
        numDropped++;
        }
    else
        {
        refPepRepairOne(conn, brp, seqTbl, extFileTbl, dryRun);
        numRepaired++;
        }
    }

if (!dryRun)
    {
    seqTblCommit(seqTbl, conn);
    gbVerbMsg(1, "%s: repaired %d refseq protein gbExtFile entries",
              sqlGetDatabase(conn), numRepaired);
    sqlDeleterDel(seqTblDeleter, conn, SEQ_TBL, "acc");
    gbVerbMsg(1, "%s: dropped %d refseq protein gbExtFile entries",
              sqlGetDatabase(conn), numDropped);
    }
else
    {
    gbVerbMsg(1, "%s: would have repaired %d refseq protein gbExtFile entries", 
              sqlGetDatabase(conn), numRepaired);
    gbVerbMsg(1, "%s: would have dropped %d refseq protein gbExtFile entries", 
              sqlGetDatabase(conn), numDropped);
    }
}
Exemplo n.º 13
0
static struct extFile* extFileGet(struct extFileTbl* extFileTbl, char *acc, int extFileId)
/* Look up the extFile with id extFileId.  Returns NULL, after a level-3
 * verbose message naming acc, when the id is not in the table. */
{
struct extFile* found = extFileTblFindById(extFileTbl, extFileId);
if (found != NULL)
    return found;
gbVerbMsg(3, "%s: gbExtFile id %d not in gbExtFile table", acc, extFileId);
return NULL;
}
static void loadRefSeqStatus(struct metaDataTbls* metaDataTbls,
                            struct sqlConnection* conn)
/* Read mrnaAcc and status from every refSeqStatus row and pass each row to
 * loadRefSeqStatusRow. */
{
char** row;
struct sqlResult* result;

gbVerbMsg(2, "load refSeqStatus table data");

result = sqlGetResult(conn, "SELECT mrnaAcc,status FROM refSeqStatus");
for (row = sqlNextRow(result); row != NULL; row = sqlNextRow(result))
    {
    loadRefSeqStatusRow(metaDataTbls, conn, row);
    }
sqlFreeResult(&result);
}
Exemplo n.º 15
0
/* Check the location of a sequence record against the bounds of a fasta
 * file.  Returns FALSE, after a level-3 verbose message, if the file size
 * cannot be obtained, if the actual size differs from the size recorded in
 * extFile, or if the record [faOff, faOff+recSize) extends past the end of
 * the file.  Returns TRUE otherwise. */
static boolean extFileChkBounds(char *protAcc, struct extFile* extFile,
                                off_t faOff, unsigned recSize)
{
off_t faSize = fileSize(extFile->path);
off_t recEnd = faOff + recSize;
if (faSize < 0)
    {
    gbVerbMsg(3, "%s: extFile does not exist or is not readable: %s", protAcc, extFile->path);
    return FALSE;
    }
if (faSize != extFile->size)
    {
    /* message used to read "does match", the opposite of what is detected */
    gbVerbMsg(3, "%s: extFile size (%lld) does not match actual fasta file size (%lld): %s", protAcc, 
              (long long)extFile->size, (long long)faSize, extFile->path);
    return FALSE;
    }
if (recEnd > faSize)
    {
    gbVerbMsg(3, "%s: fasta record end (%lld) is past end of file (%lld): %s", protAcc, 
              (long long)recEnd, (long long)faSize, extFile->path);
    return FALSE;
    }
return TRUE;
}
static void loadRefLink(struct metaDataTbls* metaDataTbls,
                        struct sqlConnection* conn)
/* Read the selected columns of every refLink row and pass each row to
 * loadRefLinkRow. */
{
char** row;
struct sqlResult* result;

/* NOTE(review): the message says "relLink"; presumably a typo for refLink */
gbVerbMsg(2, "load relLink table data");

result = sqlGetResult(conn, "SELECT mrnaAcc,name,product,protAcc,geneName,"
                      "prodName,locusLinkId,omimId from refLink");
for (row = sqlNextRow(result); row != NULL; row = sqlNextRow(result))
    {
    loadRefLinkRow(metaDataTbls, conn, row);
    }
sqlFreeResult(&result);
}
static struct sqlDeleter* buildReloadDeleter(char *reloadList, unsigned srcDb, char *tmpDir)
/* Read the reload list file and build a deleter holding the accessions that
 * guess to the given source DB.  The deleter is created on the first match,
 * so NULL is returned when nothing matches. */
{
char *row[1];
int numAdded = 0;
struct sqlDeleter* deleter = NULL;
struct lineFile *lf = gzLineFileOpen(reloadList);

while (lineFileChopNext(lf, row, ArraySize(row)))
    {
    char *acc = trimSpaces(row[0]);
    if (gbGuessSrcDb(acc) != srcDb)
        continue;  /* belongs to a different source database */
    if (deleter == NULL)
        deleter = sqlDeleterNew(tmpDir, (gbVerbose >= 4));
    sqlDeleterAddAcc(deleter, acc);
    numAdded++;
    gbVerbMsg(5, "%s delete for reloading", acc);
    }
gzLineFileClose(&lf);
gbVerbMsg(1, "delete %d entries for reloading", numAdded);
return deleter;
}
Exemplo n.º 18
0
static void chkPsl(struct psl* psl, unsigned iRow, char* database,
                   char* table, struct metaDataTbls* metaDataTbls,
                   unsigned typeFlags)
/* Validate one PSL (mrna/est to genome alignment) against the metadata,
 * reporting problems through gbError, and count the alignment on the
 * metadata entry of its query sequence. */
{
char pslDesc[128];
unsigned chromSize = getChromSize(database, psl->tName);
struct metaData* md = metaDataTblsFind(metaDataTbls, psl->qName);

if (gbVerbose >= 3)
    gbVerbMsg(3, "chkPsl %s:%d %s %s",  table, iRow, psl->qName, psl->tName);

safef(pslDesc, sizeof(pslDesc), "psl %s.%s row %u", database, table, iRow);

/* target side: the chromosome must be known and its size must agree */
if (chromSize == 0)
    {
    gbError("%s: tName not a valid chromosome: \"%s\"", pslDesc, psl->tName);
    }
else if (chromSize != psl->tSize)
    {
    gbError("%s: tSize %u != chromosome %s size %u",
            pslDesc, psl->tSize, psl->tName, chromSize);
    }

/* query side: must have metadata; further checks only if it is in seq */
if (md == NULL)
    {
    gbError("%s: qName not in mrna table as type %s: \"%s\"",
            pslDesc, gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName);
    }
else if (md->inSeq)
    {
    if (md->inGbIndex)
        {
        if (typeFlags != md->typeFlags)
            gbError("%s: alignment for %s type %s doesn't match expected %s",
                    pslDesc, psl->qName, gbFmtSelect(md->typeFlags),
                    gbFmtSelect(typeFlags));
        }
    else
        {
        gbError("%s: qName not in gbIndex as type %s: \"%s\""
                " (Note: this can be caused by GenBank entries that were changed from type mRNA to other RNA types)", pslDesc,
                gbFmtSelect(typeFlags & GB_TYPE_MASK), psl->qName);
        }
    if (md->seqSize != psl->qSize)
        gbError("%s: qSize %u != %s size %u",
                pslDesc, psl->qSize, psl->qName, md->seqSize);
    md->numAligns++;
    }

/* internal consistency of the PSL itself */
if (pslCheck(pslDesc, stderr, psl))
    errorCnt++;
}
void sqlDeleterDel(struct sqlDeleter* sd, struct sqlConnection *conn,
                   char* table, char* column)
/* Delete rows of table where column is in the deleter's list.  Nothing is
 * done when the deleter holds no accessions or the table does not exist. */
{
if ((sd->accCount > 0) && sqlTableExists(conn, table))
    {
    /* the message level passed is the current gbVerbose value itself */
    if (sd->verbose)
        gbVerbMsg(gbVerbose, "deleting %d keys from %s", sd->accCount, table);
    /* when COPY_TO_DELETE_HACK is defined, the join-based delete is used if
     * the deleter asks for it; otherwise (and always when the macro is not
     * defined) the direct delete is used */
#ifdef COPY_TO_DELETE_HACK
    if (sd->useDeleteJoin)
        deleteJoin(sd, conn, table, column);
    else
#endif
        deleteDirect(sd, conn, table, column);
    }
}
Exemplo n.º 20
0
static HGID getExtFileId(struct sqlConnection *conn, char* relPath)
/* Get the extFile id for a file.  The path looked up is relPath with the
 * gbdb root dir (gGbdbGenBank) and a '/' prepended when that root is not
 * empty.  The extFile table is loaded into the extFiles global on first
 * use. */
{
char path[PATH_LEN];

/* build the path with size-bounded formatting; the previous strcpy/strcat
 * sequence could write past the end of path for long inputs */
if (gGbdbGenBank[0] != '\0')
    safef(path, sizeof(path), "%s/%s", gGbdbGenBank, relPath);
else
    safef(path, sizeof(path), "%s", relPath);

if (extFiles == NULL)
    {
    gbVerbMsg(4, "loading extFile table");
    extFiles = extFileTblLoad(conn);
    }
return extFileTblGet(extFiles, conn, path);
}
static void loadSeqPepData(struct metaDataTbls* metaDataTbls,
                           struct sqlConnection* conn,
                           boolean checkExtSeqRecs, char* gbdbMapToCurrent,
                           struct extFileTbl* extFileTbl)
/* Load the RefSeq peptide rows (type PEP) from the gbSeq table, passing
 * each row to loadSeqPepRow. */
{
char query[512];
struct sqlResult* result;
char **row;

gbVerbMsg(2, "load gbSeq peptide table data");
safef(query, sizeof(query), 
      "SELECT * FROM gbSeq WHERE (type='PEP') AND (srcDb='RefSeq')");

result = sqlGetResult(conn, query);
for (row = sqlNextRow(result); row != NULL; row = sqlNextRow(result))
    {
    loadSeqPepRow(metaDataTbls, extFileTbl, checkExtSeqRecs,gbdbMapToCurrent,
                  conn, row);
    }
sqlFreeResult(&result);
}
Exemplo n.º 22
0
static void chkGenePred(struct genePred* gene, char *geneName, unsigned iRow,
                        char* database, char* table,
                        struct metaDataTbls* metaDataTbls, unsigned typeFlags)
/* Validate one genePred (refSeq to genome alignment) against the metadata,
 * reporting problems through gbError, and count the alignment on its
 * metadata entry.  When geneName is not NULL it is compared with the
 * refLink name of the entry. */
{
char desc[512];
unsigned chromSize = getChromSize(database, gene->chrom);
struct metaData* md = metaDataTblsFind(metaDataTbls, gene->name);

if (gbVerbose >= 3)
    gbVerbMsg(3, "chkGenePred %s:%d %s %s",  table, iRow, 
              gene->name, gene->chrom);
safef(desc, sizeof(desc), "gene %s.%s:%u %s %s", database, table,
      iRow, gene->name, gene->chrom);

/* basic sanity checks on the genePred itself */
if (genePredCheck(desc, stderr, chromSize, gene))
    errorCnt++;

/* nothing more can be checked without a metadata entry */
if (md == NULL)
    {
    gbError("%s: %s in not in mrna table", desc, gene->name);
    return;
    }

if (typeFlags != md->typeFlags)
    gbError("%s: alignment of %s type %s doesn't match expected %s",
            desc, gene->name, gbFmtSelect(md->typeFlags),
            gbFmtSelect(typeFlags));
md->numAligns++;

/* gene name check; a missing refLink name compares as the empty string */
if (geneName != NULL)
    {
    char* rlName = md->rlName;
    if (rlName == NULL)
        rlName = "";
    if (!sameString(geneName, rlName))
        gbError("%s: %s geneName \"%s\" does not match refLink name \"%s\"",
                desc, gene->name, geneName, rlName);
    }
}
static struct sqlDeleter*  buildIgnoredDeleters(struct sqlConnection *conn,
                                                struct gbRelease* release,
                                                boolean force, char* workDir)
/* Build a deleter holding the ignored accessions of release that are in
 * gbStatus, or all ignored accessions when force is set.  The deleter is
 * created on the first accession added; NULL is returned if there are
 * none. */
{
char tmpDir[PATH_LEN];
struct sqlDeleter* deleter = NULL;
struct hashCookie cookie;
struct hashEl* hel;

/* Need to force load of ignore table, as release might not be initialized yet */
gbReleaseLoadIgnore(release);

safef(tmpDir, sizeof(tmpDir), "%s/ignore", workDir);

/* each hash value is a list of ignore entries; walk all of them */
cookie = hashFirst(release->ignore->accHash);
for (hel = hashNext(&cookie); hel != NULL; hel = hashNext(&cookie))
    {
    struct gbIgnoreAcc* igAcc = hel->val;
    for (; igAcc != NULL; igAcc = igAcc->next)
        {
        /* the gbStatus lookup is skipped entirely when force is set */
        if (!force && !inGbStatusTable(conn, igAcc->acc, igAcc->modDate))
            continue;
        if (deleter == NULL)
            deleter = sqlDeleterNew(tmpDir, (gbVerbose >= 4));
        sqlDeleterAddAcc(deleter, igAcc->acc);
        gbVerbMsg(4, "%s %s ignored, will delete", igAcc->acc, 
                  gbFormatDate(igAcc->modDate));
        }
    }
return deleter;
}
Exemplo n.º 24
0
struct gbStatusTbl* gbBuildState(struct sqlConnection *conn,
                                 struct gbSelect* select, 
                                 struct dbLoadOptions* options,
                                 float maxShrinkage,
                                 char* tmpDir,
                                 int verboseLevel,
                                 boolean extFileUpdate,
                                 boolean* maxShrinkageExceeded)
/* Load status table and find of state of all genbank entries in the release
 * compared to the database.
 *
 * Sets the file-level globals gOptions, gbVerbose, gErrorCnt and
 * loadNonCoding as a side effect.  *maxShrinkageExceeded is set to TRUE when
 * checkShrinkage fails (only checked when DBLOAD_LARGE_DELETES is not set),
 * in which case the orphan and type-change checks are skipped.  Aborts if
 * entries were deleted due to an organism category change without
 * DBLOAD_LARGE_DELETES, or if gErrorCnt is non-zero at the end.  Returns the
 * status table that was built. */
{
struct gbStatusTbl* statusTbl;
struct selectStatusData ssData;
unsigned selectFlags = (select->type | select->release->srcDb);
ZeroVar(&ssData);

/* reset the module state used while building */
gOptions = options;
*maxShrinkageExceeded = FALSE;
gbVerbose = verboseLevel;
gErrorCnt = 0;

loadNonCoding = dbLoadNonCoding(sqlGetDatabase(conn), select);
if (loadNonCoding)
    gbVerbMsg(1, "NOTE: loading non-coding");

gbVerbEnter(3, "build state table");
gbVerbMsg(4, "reading gbSeq accessions");
ssData.select = select;
ssData.seqHash = seqTblLoadAcc(conn, select);

/* selectStatus is handed to the loader together with ssData as its
 * client data */
gbVerbMsg(4, "reading gbStatus");
statusTbl = gbStatusTblSelectLoad(conn, selectFlags, select->accPrefix,
                                  selectStatus, &ssData,
                                  tmpDir, extFileUpdate, (gbVerbose >= 4));
findNewEntries(select, statusTbl);

/* Don't allow deletes when select criteria has changed */
if ((ssData.orgCatDelCnt > 0) && !(gOptions->flags & DBLOAD_LARGE_DELETES))
    errAbort("%u entries deleted due to organism category no longer being selected, specify -allowLargeDeletes to override",
             ssData.orgCatDelCnt);

/* check shrinkage unless override */
if ((gOptions->flags & DBLOAD_LARGE_DELETES) == 0)
    {
    if (!checkShrinkage(select, maxShrinkage, statusTbl))
        *maxShrinkageExceeded = TRUE;
    }

/* don't do other setup if we are going to stop on maxShrinkageExceeded */
if (!*maxShrinkageExceeded)
    {
    gbVerbMsg(4, "checking for orphans");
    findOrphans(conn, select, ssData.seqHash, statusTbl);

    /* the type-change check is skipped when DBLOAD_INITIAL is set */
    if (((gOptions->flags & DBLOAD_INITIAL) == 0))
        {
        gbVerbMsg(4, "checking for type change");
        checkForTypeChange(conn, select, statusTbl);
        }
    }

#ifdef DUMP_HASH_STATS
hashPrintStats(ssData.seqHash, "stateSeq", stderr);
#endif
/* the accession hash is only needed while building the state */
hashFree(&ssData.seqHash);

gbVerbLeave(3, "build state table");

/* always print stats */
fprintf(stderr, "gbLoadRna: selected %s: delete=%u seqChg=%u metaChg=%u extChg=%u new=%u orphan=%u derived=%u noChg=%u\n",
        gbSelectDesc(select), statusTbl->numDelete, statusTbl->numSeqChg,
        statusTbl->numMetaChg, statusTbl->numExtChg, statusTbl->numNew,
        statusTbl->numOrphan, statusTbl->numRebuildDerived, 
        statusTbl->numNoChg);

/* this doesn't include large delete errors */
if (gErrorCnt > 0)
    errAbort("Errors detecting when constructing state table");
return statusTbl;
}
Exemplo n.º 25
0
/* Check a protein sequence, return FALSE if there is some reason it can't be
 * obtained or doesn't match.  Reads recSize bytes (plus a few extra, to see
 * what follows the record) at faOff in extFile->path and verifies that the
 * data starts with '>', that nothing but whitespace precedes the next '>'
 * or the end of the data, and that the parsed accession, version and
 * sequence length equal protAcc, protVer and seqSize.  Aborts on seek or
 * read failure, or if fewer than recSize bytes could be read. */
static boolean faCheckProtRec(char *protAcc, short protVer, struct extFile* extFile,
                              off_t faOff, unsigned seqSize, unsigned recSize)
{
static const int extraBytes = 8;  /* extra bytes to read to allow checking next record */
size_t askSize = (size_t)recSize + extraBytes;
size_t readSize;
char *faBuf, *p, gotAcc[GB_ACC_BUFSZ];
short gotVer;
struct dnaSeq *protSeq;
FILE *fh = mustOpen(extFile->path, "r");

/* bounds have already been checked, so it is an error if the bytes can't
 * be read */
if (fseeko(fh, faOff, SEEK_SET) < 0)
    errnoAbort("%s: can't seek to %lld in %s", protAcc, (long long)faOff, extFile->path);
faBuf = needMem(askSize+1);
readSize = fread(faBuf, 1, askSize, fh);
/* fread never returns a negative count; a read error is reported through
 * the stream's error indicator */
if (ferror(fh))
    errnoAbort("%s: read failed at %lld in %s", protAcc, (long long)faOff, extFile->path);
if (readSize < recSize)
    errAbort("%s: can't read %u bytes at %lld in %s", protAcc, recSize, (long long)faOff, extFile->path);
carefulClose(&fh);
faBuf[readSize] = '\0';

/* check that it starts with a '>' and that there are no extra bases after the
 * end of sequence */
if (faBuf[0] != '>')
    {
    gbVerbMsg(3, "%s: fasta record at %lld does not start with a '>': %s", protAcc, 
              (long long)faOff, extFile->path);
    freeMem(faBuf);
    return FALSE;
    }
p = skipLeadingSpaces(faBuf+recSize);
if (!((*p == '>') || (*p == '\0')))
    {
    gbVerbMsg(3, "%s: fasta record at %lld for %u has extra characters following the record: %s", protAcc, 
              (long long)faOff, recSize, extFile->path);
    freeMem(faBuf);
    return FALSE;
    }

/* NOTE(review): faBuf is not freed on any path from here on; presumably
 * the parsed sequence takes over the buffer -- confirm against
 * faSeqFromMemText */
protSeq = faSeqFromMemText(faBuf, FALSE);
gotVer = gbSplitAccVer(protSeq->name, gotAcc);
if (!(sameString(gotAcc, protAcc) && (gotVer == protVer)))
    {
    gbVerbMsg(3, "%s: expected sequence %s.%d, found %s.%d in fasta record at %lld : %s", protAcc,
              protAcc, protVer, gotAcc, gotVer, (long long)faOff, extFile->path);
    dnaSeqFree(&protSeq);
    return FALSE;
    }

if (protSeq->size != seqSize)
    {
    gbVerbMsg(3, "%s: expected sequence of %u chars, got %d from fasta record at %lld : %s", protAcc,
              seqSize, protSeq->size, (long long)faOff, extFile->path);
    dnaSeqFree(&protSeq);
    return FALSE;
    }

dnaSeqFree(&protSeq);
return TRUE;
}