static boolean raRecLoad(struct raInfoTbl *rit, unsigned srcDb, struct lineFile *raLf, unsigned cdnaExtId, unsigned pepExtId) /* load next ra record */ { char *acc, *protAccVer, protAcc[GB_ACC_BUFSZ]; int ver; struct hash *raRec = raNextRecord(raLf); if (raRec == NULL) return FALSE; acc = hashMustFindVal(raRec, "acc"); ver = sqlSigned((char*)hashMustFindVal(raRec, "ver")); raInfoAdd(rit, raRec, acc, ver, "siz", "fao", "fas", cdnaExtId); if ((srcDb == GB_REFSEQ) && ((protAccVer = hashFindVal(raRec, "prt")) != NULL)) { if (pepExtId == 0) errAbort("%s has protein %s, but no pep.fa file", acc, protAccVer); ver = gbSplitAccVer(protAccVer, protAcc); raInfoAdd(rit, raRec, protAcc, ver, "prs", "pfo", "pfs", pepExtId); } #ifdef DUMP_HASH_STATS hashPrintStats(raRec, "raRec", stderr); #endif hashFree(&raRec); return TRUE; }
void expAdd(char *file)
/* Add rows from .ra file.
 * Every record in 'file' is converted with encodeExpFromRa and its key is
 * looked up among the keys obtained from the existing table.  Records whose
 * key is not found are added with encodeExpAdd; the others are only
 * reported at verbose level 2. */
{
struct hash *ra = NULL;
struct lineFile *lf = lineFileOpen(file, TRUE);
struct encodeExp *exp;
struct hash *oldExps;
char *key;

/* create hash of keys for existing experiments so we can distinguish new ones */
oldExps = expKeyHashFromTable(connExp, table);

verbose(1, "Adding experiments from file \'%s\' to table \'%s\'\n", file, table);
while ((ra = raNextRecord(lf)) != NULL)
    {
    exp = encodeExpFromRa(ra);
    key = encodeExpKey(exp);
    if (hashLookup(oldExps, key) == NULL)
        {
        verbose(2, "Adding new experiment: %s\n", key);
        encodeExpAdd(connExp, table, exp);
        }
    else
        verbose(2, "Old experiment: %s\n", key);
    }

/* Done with the input file; release it so the descriptor is not leaked. */
lineFileClose(&lf);
}
struct hash *raReadThreeLevels(char *fileName, char *lowKeyField, char *middleKeyField) /* Return 3 level hash that contains all ra records in file keyed by lowKeyField, which must exist. * and broken into sub hashes based upon middleKeyField that must exist. * Example raReadThreeLevels("cv.ra","term","type"): * returns hash of 'type' hashes of 'term' hashes of every stanza in cv.ra */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *topHash = hashNew(0); // Not expecting that many types struct hash *bottomHash; while ((bottomHash = raNextRecord(lf)) != NULL) { char *lowKey = hashFindVal(bottomHash, lowKeyField); if (lowKey == NULL) errAbort("Couldn't find key field %s line %d of %s", lowKeyField, lf->lineIx, lf->fileName); char *middleKey = hashFindVal(bottomHash, middleKeyField); if (middleKey == NULL) errAbort("Couldn't find middle key field %s line %d of %s", middleKeyField, lf->lineIx, lf->fileName); struct hash *middleHash = hashFindVal(topHash, middleKey); if (middleHash == NULL) { middleHash = hashNew(16); // could be quite a few terms per type. hashAdd(topHash, middleKey, middleHash); } hashAdd(middleHash, lowKey, bottomHash); } lineFileClose(&lf); if (hashNumEntries(topHash) == 0) hashFree(&topHash); return topHash; }
struct hash *raReadWithFilter(char *fileName, char *keyField,char *filterKey,char *filterValue) /* Return hash that contains all filtered ra records in file keyed by given field, which must exist. * The values of the hash are themselves hashes. The filter is a key/value pair that must exist. * Example raReadWithFilter(file,"term","type","antibody"): returns hash of hashes of every term with type=antibody */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *bigHash = hashNew(14); struct hash *hash; while ((hash = raNextRecord(lf)) != NULL) { char *key = hashFindVal(hash, keyField); if (key == NULL) errAbort("Couldn't find key field %s line %d of %s", keyField, lf->lineIx, lf->fileName); if (filterKey != NULL) { char *filter = hashFindVal(hash, filterKey); if (filter == NULL) { hashFree(&hash); continue; } if (filterValue != NULL && differentString(filterValue,filter)) { hashFree(&hash); continue; } } hashAdd(bigHash, key, hash); } lineFileClose(&lf); if (hashNumEntries(bigHash) == 0) hashFree(&bigHash); return bigHash; }
struct hash *raReadSingle(char *fileName)
/* Open fileName, read its first ra record and return it as a hash.
 * The file is closed again before returning.  The result is whatever
 * raNextRecord gives for the first call (presumably NULL when the file
 * holds no record -- confirm against raNextRecord). */
{
struct hash *firstRecord;
struct lineFile *lf = lineFileOpen(fileName, TRUE);

firstRecord = raNextRecord(lf);
lineFileClose(&lf);
return firstRecord;
}
struct hash *raReadAll(char *fileName, char *keyField) /* Return hash that contains all ra records in file keyed * by given field, which must exist. The values of the * hash are themselves hashes. */ { struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *bigHash = hashNew(0); struct hash *hash; while ((hash = raNextRecord(lf)) != NULL) { char *key = hashFindVal(hash, keyField); if (key == NULL) errAbort("Couldn't find key field %s line %d of %s", keyField, lf->lineIx, lf->fileName); hashAdd(bigHash, key, hash); } lineFileClose(&lf); return bigHash; }
void expRestoreTable(char *file) /* Fill empty table with experiments in .ra file with id's */ { struct hash *ra = NULL; struct lineFile *lf = lineFileOpen(file, TRUE); struct encodeExp *exp; int ix = 1; int expId; char *accession; char *key; verbose(1, "Restoring experiments from file \'%s\' to table \'%s\'\n", file, table); if (sqlRowCount(connExp, sqlCheckIdentifier(table)) != 0) errAbort("ERROR: table for restore must exist and be empty"); while ((ra = raNextRecord(lf)) != NULL) { exp = encodeExpFromRa(ra); /* save accession and id as we may stomp on these for to-delete experiments */ accession = cloneString(exp->lab); expId = exp->ix; key = encodeExpKey(exp); while (ix < expId) { exp->accession = "DELETED"; exp->ix = ix; verbose(3, "Adding row for deleted experiment %d\n", ix); encodeExpAdd(connExp, table, exp); ix++; } /* restore accession and id */ exp->accession = accession; exp->ix = expId; encodeExpAdd(connExp, table, exp); verbose(3, "Adding row for experiment %d: %s\n", ix, key); ix++; } verbose(1, "To complete restore, delete rows where accession=DELETED\n"); }
struct hash *readRefRa(char *fileName) /* Read in refSeq ra file and return bits we're interested * in in a hash full of refSeqInfos. */ { struct hash *hash = newHash(16); struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *ra; int count = 0, cdsCount = 0; while ((ra = raNextRecord(lf)) != NULL) { char *acc = hashFindVal(ra, "acc"); if (acc != NULL) { char *cds = hashFindVal(ra, "cds"); char *siz = hashFindVal(ra, "siz"); struct refSeqInfo *rsi; if (siz == NULL) { warn("No size for %s, skipping", acc); continue; } AllocVar(rsi); hashAddSaveName(hash, acc, rsi, &rsi->acc); rsi->size = atoi(siz); if (cds != NULL) { rsi->hasCds = parseCds(cds, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd); if (rsi->hasCds) ++cdsCount; } ++count; } hashFree(&ra); } lineFileClose(&lf); printf("Got %d cds of %d in %s\n", cdsCount, count, fileName); return hash; }
void txReadRa(char *mrnaRa, char *refSeqRa, char *outDir) /* txReadRa - Read ra files from genbank and parse out relevant info into some * tab-separated files. */ { struct lineFile *mrna = lineFileOpen(mrnaRa, TRUE); struct lineFile *refSeq = lineFileOpen(refSeqRa, TRUE); makeDir(outDir); FILE *fCds = openToWrite(outDir, "cds.tab"); FILE *fStatus = openToWrite(outDir, "refSeqStatus.tab"); FILE *fSize = openToWrite(outDir, "mrnaSize.tab"); FILE *fRefToPep = openToWrite(outDir, "refToPep.tab"); FILE *fPepStatus = openToWrite(outDir, "refPepStatus.tab"); FILE *fExceptions = openToWrite(outDir, "exceptions.tab"); FILE *fAccVer = openToWrite(outDir, "accVer.tab"); struct hash *ra; while ((ra = raNextRecord(refSeq)) != NULL) { char *acc = requiredField(ra, refSeq, "acc"); char *rss = requiredField(ra, refSeq, "rss"); char *siz = requiredField(ra, refSeq, "siz"); char *ver = requiredField(ra, mrna, "ver"); char *prt = hashFindVal(ra, "prt"); char *cds = hashFindVal(ra, "cds"); /* Translate rss into status. 
*/ char *status = NULL; if (sameString(rss, "rev")) status = "Reviewed"; else if (sameString(rss, "pro")) status = "Provisional"; else if (sameString(rss, "pre")) status = "Predicted"; else if (sameString(rss, "val")) status = "Validated"; else if (sameString(rss, "inf")) status = "Inferred"; else errAbort("Unrecognized rss field %s after line %d of %s", rss, refSeq->lineIx, refSeq->fileName); fprintf(fStatus, "%s.%s\t%s\n", acc, ver, status); if (prt != NULL) { fprintf(fPepStatus, "%s\t%s\n", prt, status); fprintf(fRefToPep, "%s.%s\t%s\n", acc, ver, prt); } fprintf(fSize, "%s.%s\t%s\n", acc, ver, siz); if (cds != NULL) fprintf(fCds, "%s.%s\t%s\n", acc, ver, cds); outputExceptions(acc, ver, ra, fExceptions); fprintf(fAccVer, "%s\t%s.%s\n", acc, acc, ver); hashFree(&ra); } while ((ra = raNextRecord(mrna)) != NULL) { char *acc = requiredField(ra, mrna, "acc"); char *siz = requiredField(ra, mrna, "siz"); char *ver = requiredField(ra, mrna, "ver"); char *cds = hashFindVal(ra, "cds"); fprintf(fSize, "%s.%s\t%s\n", acc, ver, siz); if (cds != NULL) fprintf(fCds, "%s.%s\t%s\n", acc, ver, cds); outputExceptions(acc, ver, ra, fExceptions); fprintf(fAccVer, "%s\t%s.%s\n", acc, acc, ver); hashFree(&ra); } carefulClose(&fCds); carefulClose(&fStatus); carefulClose(&fSize); carefulClose(&fRefToPep); carefulClose(&fPepStatus); carefulClose(&fExceptions); carefulClose(&fAccVer); }