static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
/* convert a genePred to a psl and CDS */
{
int chromSize = hashIntValDefault(chromHash, gp->chrom, 0);
if (chromSize == 0)
    errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom);
int qSize = 0;
if (qSizes != NULL)
    qSize = hashIntValDefault(qSizeHash, gp->name, 0);
struct psl *psl = genePredToPsl(gp, chromSize, qSize);
pslTabOut(psl, pslFh);
pslFree(&psl);
if (gp->cdsStart < gp->cdsEnd)
    cnvGenePredCds(gp, qSize, cdsFh);
}
static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
/* convert a genePred to a psl and CDS */
{
int chromSize = hashIntValDefault(chromHash, gp->chrom, 0);
if (chromSize == 0)
    errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom);
int e = 0, qSize = 0;
for (e = 0; e < gp->exonCount; ++e)
    qSize += (gp->exonEnds[e] - gp->exonStarts[e]);
struct psl *psl = pslNew(gp->name, qSize, 0, qSize, gp->chrom, chromSize, gp->txStart, gp->txEnd,
                         gp->strand, gp->exonCount, 0);
psl->blockCount = gp->exonCount;
for (e = 0; e < gp->exonCount; ++e)
    {
    psl->blockSizes[e] = (gp->exonEnds[e] - gp->exonStarts[e]);
    psl->qStarts[e] = e == 0 ? 0 : psl->qStarts[e-1] + psl->blockSizes[e-1];
    psl->tStarts[e] = gp->exonStarts[e];
    }
psl->match = qSize;
psl->tNumInsert = psl->blockCount - 1;
psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize;
pslTabOut(psl, pslFh);
pslFree(&psl);
if (gp->cdsStart < gp->cdsEnd)
    cnvGenePredCds(gp, qSize, cdsFh);
}
void gensatFixFull(char *captionFile)
/* Fix missing captions. */
{
struct lineFile *lf = lineFileOpen(captionFile, TRUE);
char *row[2];
struct dyString *sql = dyStringNew(0);
struct sqlConnection *conn = sqlConnect(database);
struct hash *capHash = newHash(16);
while (lineFileRowTab(lf, row))
    {
    int captionId;
    char *submitId = row[0];
    char *caption = row[1];
    captionId = hashIntValDefault(capHash, caption, 0);
    if (captionId == 0)
        {
        dyStringClear(sql);
        dyStringAppend(sql, "insert into caption values(default, \"");
        dyStringAppend(sql, caption);
        dyStringAppend(sql, "\")");
        sqlUpdate(conn, sql->string);
        verbose(1, "%s\n", sql->string);
        captionId = sqlLastAutoId(conn);
        hashAddInt(capHash, caption, captionId);
        }
    dyStringClear(sql);
    dyStringPrintf(sql, "update imageFile set caption=%d ", captionId);
    dyStringPrintf(sql, "where submissionSet=%d ", gensatId);
    dyStringPrintf(sql, "and submitId = \"%s\"", submitId);
    sqlUpdate(conn, sql->string);
    verbose(1, "%s\n", sql->string);
    }
dyStringFree(&sql);
}
static int idLookup(struct hash *hash, void *obj)
/* Look up object in hash. Return 0 if can't find it.
 * Otherwise return object id. */
{
char buf[17];
safef(buf, sizeof(buf), "%p", obj);
return hashIntValDefault(hash, buf, 0);
}
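/* idLookup() assumes the hash was keyed with the object's pointer value formatted via "%p".
 * The helper below is a hypothetical companion (not part of the original code), shown only
 * to illustrate that key scheme with kent's safef() and hashAddInt(). */
static void idAdd(struct hash *hash, void *obj, int id)
/* Register an object under its pointer-formatted key so idLookup() can find it later.
 * A sketch only; the real code may assign and store ids differently. */
{
char buf[17];
safef(buf, sizeof(buf), "%p", obj);   /* same pointer-as-string key used by idLookup() */
hashAddInt(hash, buf, id);
}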
double scoreLiftOverChain(struct liftOverChain *chain, char *fromOrg, char *fromDb, char *toOrg,
                          char *toDb, char *cartOrg, char *cartDb, struct hash *dbRank)
/* Score the chain in terms of best match for cart settings */
{
double score = 0;
char *chainFromOrg = hArchiveOrganism(chain->fromDb);
char *chainToOrg = hArchiveOrganism(chain->toDb);
int fromRank = hashIntValDefault(dbRank, chain->fromDb, 0);  /* values up to approx. #assemblies */
int toRank = hashIntValDefault(dbRank, chain->toDb, 0);
int maxRank = hashIntVal(dbRank, "maxRank");

if (sameOk(fromOrg, chainFromOrg) && sameOk(fromDb, chain->fromDb) &&
    sameOk(toOrg, chainToOrg) && sameOk(toDb, chain->toDb))
    score += 10000000;
if (sameOk(fromOrg, chainFromOrg))
    score += 2000000;
if (sameOk(fromDb, chain->fromDb))
    score += 1000000;
if (sameOk(toOrg, chainToOrg))
    score += 200000;
if (sameOk(toDb, chain->toDb))
    score += 100000;
if (sameOk(cartDb, chain->fromDb))
    score += 20000;
if (sameOk(cartDb, chain->toDb))
    score += 10000;
if (sameOk(cartOrg, chainFromOrg))
    score += 2000;
if (sameOk(cartOrg, chainToOrg))
    score += 1000;
score += 10*(maxRank - fromRank);
score += (maxRank - toRank);
return score;
}
static boolean columnIsIncluded(struct annoFormatTab *self, char *sourceName, char *colName)
// Return TRUE if column has not been explicitly deselected.
{
if (self->columnVis)
    {
    char fullName[PATH_LEN];
    makeFullColumnName(fullName, sizeof(fullName), sourceName, colName);
    int vis = hashIntValDefault(self->columnVis, fullName, 1);
    if (vis == 0)
        return FALSE;
    }
return TRUE;
}
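/* columnIsIncluded() treats a columnVis value of 0 as "explicitly deselected" and a missing
 * entry (default 1) as visible.  The sketch below is an assumption, not annoFormatTab's real
 * interface; it only shows how a caller might record a deselection so the lookup above
 * returns FALSE, reusing makeFullColumnName() for the same key. */
static void deselectColumn(struct annoFormatTab *self, char *sourceName, char *colName)
/* Hypothetical helper: mark a source/column pair as deselected. */
{
if (self->columnVis == NULL)
    self->columnVis = hashNew(0);
char fullName[PATH_LEN];
makeFullColumnName(fullName, sizeof(fullName), sourceName, colName);
hashAddInt(self->columnVis, fullName, 0);   /* 0 means "explicitly deselected" */
}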
double calcDistanceFromCluster(struct rnaBinder *rb, struct clusterMember *cmList,
                               struct dMatrix *sjIndex, struct dMatrix *psInten)
/* Calculate the distance from the rnaBinder intensity measurement to the
 * sjIndexes of the cluster members. If no intensity present use 0 as it
 * will fall in the middle of [-1,1]. */
{
double sum = 0;
int count = 0;
int sjIx = 0, gsIx = 0;
struct clusterMember *cm = NULL;
double corr = 0;
if (sjIndex->colCount != psInten->colCount)
    errAbort("Splice Junction and Intensity files must have same number of columns.");

/* Get the index of the gene set in the intensity file. */
gsIx = hashIntValDefault(psInten->nameIndex, rb->psName, -1);
if (gsIx == -1)
    {
    /* warn("Probe Set %s not found in intensity file.", rb->psName); */
    return 0;
    }
for (cm = cmList; cm != NULL; cm = cm->next)
    {
    /* For each member get the index in the splice junction file. */
    sjIx = hashIntValDefault(sjIndex->nameIndex, cm->psName, -1);
    if (sjIx == -1)
        errAbort("Probe Set %s not found in SJ index file.", cm->psName);
    corr = correlation(psInten->matrix[gsIx], sjIndex->matrix[sjIx], sjIndex->colCount);
    sum += corr;
    count++;
    }
if (count == 0)
    errAbort("No junctions in cluster.");
sum = sum / (double)count;
return sum;
}
struct psl *removeKillList(struct psl *pslList)
/* Remove all of the psls that are in the kill hash. */
{
struct psl *psl = NULL, *pslNext = NULL, *pslNewList = NULL;
if (killHash == NULL)
    return pslList;
for (psl = pslList; psl != NULL; psl = pslNext)
    {
    pslNext = psl->next;
    /* If the accession is in the kill list with value 1 don't add it. */
    if (hashIntValDefault(killHash, psl->qName, 0) == 0)
        {
        slAddHead(&pslNewList, psl);
        }
    }
slReverse(&pslNewList);
return pslNewList;
}
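/* removeKillList() keeps a psl only when its qName is absent from killHash (or maps to 0),
 * so the hash is expected to map killed accessions to a nonzero value.  Below is a minimal,
 * hypothetical loader for such a kill list, one accession per line; the real tool may build
 * killHash differently. */
static struct hash *loadKillList(char *fileName)
/* Sketch only: map each accession in the file to 1 so removeKillList() drops matching psls. */
{
struct hash *hash = hashNew(0);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
while (lineFileNextReal(lf, &line))
    hashAddInt(hash, nextWord(&line), 1);   /* first word on each line is an accession */
lineFileClose(&lf);
return hash;
}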
int oneHubTrackSettings(char *hubUrl, struct hash *totals)
/* Read hub trackDb files, noting settings used */
{
struct trackHub *hub = NULL;
struct errCatch *errCatch = errCatchNew();
if (errCatchStart(errCatch))
    hub = trackHubOpen(hubUrl, "hub_0");
errCatchEnd(errCatch);
errCatchFree(&errCatch);
if (hub == NULL)
    return 1;

printf("%s (%s)\n", hubUrl, hub->shortLabel);
struct trackHubGenome *genome;
struct hash *counts;
if (totals)
    counts = totals;
else
    counts = newHash(0);
struct hashEl *el;
for (genome = hub->genomeList; genome != NULL; genome = genome->next)
    {
    struct trackDb *tdb, *tdbs = trackHubTracksForGenome(hub, genome);
    for (tdb = tdbs; tdb != NULL; tdb = tdb->next)
        {
        struct hashCookie cookie = hashFirst(trackDbHashSettings(tdb));
        verbose(2, " track: %s\n", tdb->shortLabel);
        while ((el = hashNext(&cookie)) != NULL)
            {
            int count = hashIntValDefault(counts, el->name, 0);
            count++;
            hashReplace(counts, el->name, intToPt(count));
            }
        }
    }
if (!totals)
    printCounts(counts);
trackHubClose(&hub);
return 0;
}
int countBases(struct sqlConnection *conn, char *chrom, int chromSize, char *database)
/* Count bases, generally not including gaps, in chromosome. */
{
static boolean gapsLoaded = FALSE;
struct sqlResult *sr;
int totalGaps = 0;
char **row;
int rowOffset;

if (countGaps)
    return chromSize;

/* If doing all chroms, then load up all the gaps and be done with
 * it instead of re-reading the gap table for every chrom */
if (sameWord(clChrom, "all"))
    {
    if (!gapsLoaded)
        gapHash = loadAllGaps(conn, database);
    gapsLoaded = TRUE;
    totalGaps = hashIntValDefault(gapHash, chrom, 0);
    }
else
    {
    sr = hChromQuery(conn, "gap", chrom, NULL, &rowOffset);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        int gapSize;
        struct agpGap gap;
        agpGapStaticLoad(row+rowOffset, &gap);
        gapSize = gap.chromEnd - gap.chromStart;
        totalGaps += gapSize;
        }
    sqlFreeResult(&sr);
    }
return chromSize - totalGaps;
}
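/* loadAllGaps() is referenced above but not shown.  The caller expects gapHash to map each
 * chromosome name to its total gap length.  The stand-in below is a simplified sketch under
 * the assumptions that a single unsplit "gap" table exists and that a one-argument signature
 * is acceptable; the real helper takes the database name as well and may handle per-chromosome
 * split gap tables differently. */
static struct hash *loadAllGapsSketch(struct sqlConnection *conn)
/* Hypothetical stand-in: total up gap sizes per chromosome across the whole gap table. */
{
struct hash *hash = hashNew(0);
char query[256];
sqlSafef(query, sizeof(query), "select chrom,chromStart,chromEnd from gap");
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
    {
    int gapSize = sqlSigned(row[2]) - sqlSigned(row[1]);
    int total = hashIntValDefault(hash, row[0], 0) + gapSize;
    hashReplace(hash, row[0], intToPt(total));   /* adds the key if not yet present */
    }
sqlFreeResult(&sr);
return hash;
}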
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 4)
    usage();
char *lrgFile = argv[1];
char *chromSizes = argv[2];
char *pslFile = argv[3];
struct hash *chromHash = hChromSizeHashFromFile(chromSizes);
struct lrg *lrg, *lrgList = lrgLoadAllByTab(lrgFile);
FILE *f = mustOpen(pslFile, "w");
for (lrg = lrgList; lrg != NULL; lrg = lrg->next)
    {
    int chromSize = hashIntValDefault(chromHash, lrg->chrom, 0);
    if (chromSize == 0)
        errAbort("Can't find size of '%s' in chrom.sizes file %s.", lrg->chrom, chromSizes);
    struct psl *psl = lrgToPsl(lrg, chromSize);
    pslTabOut(psl, f);
    }
return 0;
}
void txInfoAssemble(char *txBedFile, char *cdsEvFile, char *txCdsPredictFile, char *altSpliceFile,
                    char *exceptionFile, char *sizePolyAFile, char *pslFile, char *flipFile,
                    char *outFile)
/* txInfoAssemble - Assemble information from various sources into txInfo table. */
{
/* Build up hash of evidence keyed by transcript name. */
struct hash *cdsEvHash = hashNew(18);
struct cdsEvidence *cdsEv, *cdsEvList = cdsEvidenceLoadAll(cdsEvFile);
for (cdsEv = cdsEvList; cdsEv != NULL; cdsEv = cdsEv->next)
    hashAddUnique(cdsEvHash, cdsEv->name, cdsEv);
verbose(2, "Loaded %d elements from %s\n", cdsEvHash->elCount, cdsEvFile);

/* Build up hash of bestorf structures keyed by transcript name */
struct hash *predictHash = hashNew(18);
struct cdsEvidence *predict, *predictList = cdsEvidenceLoadAll(txCdsPredictFile);
for (predict = predictList; predict != NULL; predict = predict->next)
    hashAddUnique(predictHash, predict->name, predict);
verbose(2, "Loaded %d predicts from %s\n", predictHash->elCount, txCdsPredictFile);

/* Build up structure for random access of retained introns */
struct bed *altSpliceList = bedLoadNAll(altSpliceFile, 6);
verbose(2, "Loaded %d alts from %s\n", slCount(altSpliceList), altSpliceFile);
struct hash *altSpliceHash = bedsIntoHashOfKeepers(altSpliceList);

/* Read in exception info. */
struct hash *selenocysteineHash, *altStartHash;
genbankExceptionsHash(exceptionFile, &selenocysteineHash, &altStartHash);

/* Read in polyA sizes */
struct hash *sizePolyAHash = hashNameIntFile(sizePolyAFile);
verbose(2, "Loaded %d from %s\n", sizePolyAHash->elCount, sizePolyAFile);

/* Read in psls */
struct hash *pslHash = hashNew(20);
struct psl *psl, *pslList = pslLoadAll(pslFile);
for (psl = pslList; psl != NULL; psl = psl->next)
    hashAdd(pslHash, psl->qName, psl);
verbose(2, "Loaded %d from %s\n", pslHash->elCount, pslFile);

/* Read in accessions that we flipped for better splice sites. */
struct hash *flipHash = hashWordsInFile(flipFile, 0);

/* Open primary gene input and output. */
struct lineFile *lf = lineFileOpen(txBedFile, TRUE);
FILE *f = mustOpen(outFile, "w");

/* Main loop - process each gene */
char *row[12];
while (lineFileRow(lf, row))
    {
    struct bed *bed = bedLoad12(row);
    verbose(3, "Processing %s\n", bed->name);

    /* Initialize info to zero */
    struct txInfo info;
    ZeroVar(&info);

    /* Figure out name, sourceAcc, and isRefSeq from bed->name */
    info.name = bed->name;
    info.category = "n/a";
    if (isRfam(bed->name) || stringIn("tRNA", bed->name) != NULL)
        {
        info.sourceAcc = cloneString(bed->name);
        }
    else
        {
        info.sourceAcc = txAccFromTempName(bed->name);
        }
    info.isRefSeq = startsWith("NM_", info.sourceAcc);

    if (startsWith("antibody.", info.sourceAcc) || startsWith("CCDS", info.sourceAcc)
        || isRfam(info.sourceAcc) || stringIn("tRNA", info.sourceAcc) != NULL)
        {
        /* Fake up some things for antibody frag and CCDS that don't have alignments. */
        info.sourceSize = bedTotalBlockSize(bed);
        info.aliCoverage = 1.0;
        info.aliIdRatio = 1.0;
        info.genoMapCount = 1;
        }
    else
        {
        /* Loop through all psl's associated with our RNA. Figure out
         * our overlap with each, and pick best one. */
        struct hashEl *hel, *firstPslHel = hashLookup(pslHash, info.sourceAcc);
        if (firstPslHel == NULL)
            errAbort("%s is not in %s", info.sourceAcc, pslFile);
        int mapCount = 0;
        struct psl *psl, *bestPsl = NULL;
        int coverage, bestCoverage = 0;
        boolean isFlipped = (hashLookup(flipHash, info.sourceAcc) != NULL);
        for (hel = firstPslHel; hel != NULL; hel = hashLookupNext(hel))
            {
            psl = hel->val;
            mapCount += 1;
            coverage = pslBedOverlap(psl, bed);
            if (coverage > bestCoverage)
                {
                bestCoverage = coverage;
                bestPsl = psl;
                }
            /* If we flipped it, try it on the opposite strand too. */
            if (isFlipped)
                {
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                coverage = pslBedOverlap(psl, bed);
                if (coverage > bestCoverage)
                    {
                    bestCoverage = coverage;
                    bestPsl = psl;
                    }
                psl->strand[0] = (psl->strand[0] == '+' ? '-' : '+');
                }
            }
        if (bestPsl == NULL)
            errAbort("%s has no overlapping alignments with %s in %s",
                     bed->name, info.sourceAcc, pslFile);

        /* Figure out and save alignment statistics. */
        int polyA = hashIntValDefault(sizePolyAHash, bed->name, 0);
        info.sourceSize = bestPsl->qSize - polyA;
        info.aliCoverage = (double)bestCoverage / info.sourceSize;
        info.aliIdRatio = (double)(bestPsl->match + bestPsl->repMatch) /
                          (bestPsl->match + bestPsl->misMatch + bestPsl->repMatch);
        info.genoMapCount = mapCount;
        }

    /* Get orf size and start/end complete from cdsEv. */
    if (bed->thickStart < bed->thickEnd)
        {
        cdsEv = hashFindVal(cdsEvHash, bed->name);
        if (cdsEv != NULL)
            {
            info.orfSize = cdsEv->end - cdsEv->start;
            info.startComplete = cdsEv->startComplete;
            info.endComplete = cdsEv->endComplete;
            }
        }

    /* Get score from prediction. */
    predict = hashFindVal(predictHash, bed->name);
    if (predict != NULL)
        info.cdsScore = predict->score;

    /* Figure out nonsense-mediated-decay from bed itself. */
    info.nonsenseMediatedDecay = isNonsenseMediatedDecayTarget(bed);

    /* Figure out if retained intron from bed and alt-splice keeper hash */
    info.retainedIntron = hasRetainedIntron(bed, altSpliceHash);
    info.strangeSplice = countStrangeSplices(bed, altSpliceHash);
    info.atacIntrons = countAtacIntrons(bed, altSpliceHash);
    info.bleedIntoIntron = addIntronBleed(bed, altSpliceHash);

    /* Look up selenocysteine info. */
    info.selenocysteine = (hashLookup(selenocysteineHash, bed->name) != NULL);

    /* Loop through bed looking for small gaps indicative of frame shift/stop */
    int i, lastBlock = bed->blockCount-1;
    int exonCount = 1;
    for (i=0; i < lastBlock; ++i)
        {
        int gapStart = bed->chromStarts[i] + bed->blockSizes[i];
        int gapEnd = bed->chromStarts[i+1];
        int gapSize = gapEnd - gapStart;
        switch (gapSize)
            {
            case 1:
            case 2:
                info.genomicFrameShift = TRUE;
                break;
            case 3:
                info.genomicStop = TRUE;
                break;
            default:
                exonCount += 1;
                break;
            }
        }
    info.exonCount = exonCount;

    /* Write info, free bed. */
    txInfoTabOut(&info, f);
    bedFree(&bed);
    }

/* Clean up and go home. */
carefulClose(&f);
}
void jsonQuery(char *inFile, char *path, char *outFile)
/* jsonQuery - Use a path syntax to retrieve elements/values from each line of JSON input. */
{
struct lineFile *lf = lineFileOpen(inFile, TRUE);
struct hash *uniqHash = NULL;
boolean countUniq = optionExists("countUniq");
boolean uniq = optionExists("uniq") || countUniq;
if (uniq)
    uniqHash = hashNew(0);
struct dyString *dy = dyStringNew(0);
FILE *outF = mustOpen(outFile, "w");
char *line;
while (lineFileNextReal(lf, &line))
    {
    struct lm *lm = lmInit(1<<16);
    struct jsonElement *topEl = jsonParseLm(line, lm);
    struct slRef topRef;
    topRef.next = NULL;
    topRef.val = topEl;
    char desc[1024];
    safef(desc, sizeof desc, "line %d of %s", lf->lineIx, inFile);
    struct slRef *results = jsonQueryElementList(&topRef, desc, path, lm);
    struct slRef *result;
    for (result = results; result != NULL; result = result->next)
        {
        struct jsonElement *el = result->val;
        if (uniq)
            {
            dyStringClear(dy);
            jsonDyStringPrint(dy, el, NULL, -1);
            char *elStr = dy->string;
            int count = hashIntValDefault(uniqHash, elStr, 0);
            if (count < 1)
                {
                hashAddInt(uniqHash, elStr, 1);
                verbose(2, "line %d: %s\n", lf->lineIx, elStr);
                if (!countUniq)
                    {
                    fprintf(outF, "%s\n", elStr);
                    fflush(outF);
                    }
                }
            else
                hashIncInt(uniqHash, elStr);
            }
        else
            jsonPrintToFile(el, NULL, outF, 2);
        }
    lmCleanup(&lm);
    }
lineFileClose(&lf);
if (countUniq)
    {
    struct hashEl *hel;
    struct hashCookie cookie = hashFirst(uniqHash);
    while ((hel = hashNext(&cookie)) != NULL)
        {
        fprintf(outF, "%10d %s\n", ptToInt(hel->val), hel->name);
        }
    }
carefulClose(&outF);
}