void doOneChrom(char *database, char *chrom, char *rnaTable, char *expTable, FILE *f) /* Process one chromosome. */ { int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; struct bed *exp, *rna; int rowOffset; struct binElement *be, *beList; int oneCount; /* Load up expTable into bin-keeper. */ sr = hChromQuery(conn, expTable, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { exp = bedLoadN(row + rowOffset, 12); binKeeperAdd(bk, exp->chromStart, exp->chromEnd, exp); } sqlFreeResult(&sr); /* Loop through rnaTable and look at intersections. */ sr = hChromQuery(conn, rnaTable, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { rna = bedLoadN(row + rowOffset, 12); beList = binKeeperFind(bk, rna->chromStart, rna->chromEnd); oneCount = 0; for (be = beList; be != NULL; be = be->next) { exp = be->val; if (exp->strand[0] == rna->strand[0]) { ++oneCount; ++hitCount; // fprintf(f, "%s:%d-%d\t%s\t%s\n", // rna->chrom, rna->chromStart, rna->chromEnd, rna->name, exp->name); } } slFreeList(&beList); if (oneCount == 0) { ++missCount; fprintf(f, "miss %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name); } else if (oneCount == 1) { fprintf(f, "uniq %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name); ++uniqCount; } else { fprintf(f, "dupe %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name); ++dupeCount; } } sqlFreeResult(&sr); hFreeConn(&conn); }
void getBinKeeper(char *chromName) /* put SNPs in binKeeper */ { char query[512]; struct sqlConnection *conn = hAllocConn(); struct sqlResult *sr; char **row; int start = 0; int end = 0; char *rsId = NULL; int chromSize = hChromSize(chromName); verbose(1, "constructing binKeeper...\n"); snps = binKeeperNew(0, chromSize); safef(query, sizeof(query), "select chromStart, chromEnd, name from %s where chrom = '%s'", snpTable, chromName); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { start = sqlUnsigned(row[0]); end = sqlUnsigned(row[1]); rsId = cloneString(row[2]); binKeeperAdd(snps, start, end, rsId); } sqlFreeResult(&sr); hFreeConn(&conn); }
struct hash *readChainToBinKeeper(char *sizeFileName, char *fileName) { struct binKeeper *bk; struct chain *chain; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct lineFile *sf = lineFileOpen(sizeFileName, TRUE); struct hash *hash = newHash(0); char *chromRow[2]; while (lineFileRow(sf, chromRow)) { char *name = chromRow[0]; int size = lineFileNeedNum(sf, chromRow, 1); if (hashLookup(hash, name) != NULL) warn("Duplicate %s, ignoring all but first\n", name); else { bk = binKeeperNew(0, size); assert(size > 1); hashAdd(hash, name, bk); } } while ((chain = chainRead(lf)) != NULL) { bk = hashMustFindVal(hash, chain->tName); binKeeperAdd(bk, chain->tStart, chain->tEnd, chain); } lineFileClose(&lf); return hash; }
struct binKeeper *fbToBinKeeper(struct featureBits *fbList, int chromSize)
/* Return a new binKeeper covering 0..chromSize that holds every element of
 * fbList, each one stored under its own start..end. */
{
struct binKeeper *keeper = binKeeperNew(0, chromSize);
struct featureBits *cur = fbList;

while (cur != NULL)
    {
    binKeeperAdd(keeper, cur->start, cur->end, cur);
    cur = cur->next;
    }
return keeper;
}
struct altGraphX *agFromAlignments(char *db, struct ggMrnaAli *maList, struct dnaSeq *seq, struct sqlConnection *conn, int chromStart, int chromEnd, FILE *out ) /** Custer overlaps from maList into altGraphX structure. */ { struct altGraphX *ag = NULL, *agList = NULL; struct ggMrnaCluster *mcList=NULL, *mc=NULL; struct ggMrnaInput *ci = NULL; struct geneGraph *gg = NULL; static int count = 0; ci = ggMrnaInputFromAlignments(maList, seq); mcList = ggClusterMrna(ci); if(mcList == NULL) { freeGgMrnaInput(&ci); return NULL; } clusterCount++; for(mc = mcList; mc != NULL; mc = mc->next) { if(optionExists("consensus")) { gg = ggGraphConsensusCluster(db, mc, ci, tissLibHash, !optionExists("skipTissues")); } else gg = ggGraphCluster(db, mc,ci); assert(checkEvidenceMatrix(gg)); ag = ggToAltGraphX(gg); if(ag != NULL) { char name[256]; freez(&ag->name); safef(name, sizeof(name), "%s.%d", ag->tName, count++); ag->name = cloneString(name); /* Convert back to genomic coordinates. */ altGraphXoffset(ag, chromStart); /* Sort vertices so that they are chromosomal order */ altGraphXVertPosSort(ag); /* write to file */ binKeeperAdd(agxSeenBin, ag->tStart, ag->tEnd, ag); slAddHead(&agList, ag); } } /* Sometimes get nested, partial transcripts. Want to filter those out. */ for(ag = agList; ag != NULL; ag = ag->next) { if(!agxIsRedundant(ag)) altGraphXTabOut(ag, out); } /* genoSeq and maList are freed with ci and gg */ ggFreeMrnaClusterList(&mcList); freeGgMrnaInput(&ci); freeGeneGraph(&gg); return agList; }
struct hash *readBed(char *fileName) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. */ { char *row[5]; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct hash *hash = newHash(0); int expectedCols = bScore ? 5 : 3; while (lineFileNextRow(lf, row, expectedCols)) { struct binKeeper *bk; struct bed5 *bed; struct hashEl *hel = hashLookup(hash, row[0]); if (hel == NULL) { bk = binKeeperNew(0, 1024*1024*1024); hel = hashAdd(hash, row[0], bk); } bk = hel->val; AllocVar(bed); bed->chrom = hel->name; bed->start = lineFileNeedNum(lf, row, 1); bed->end = lineFileNeedNum(lf, row, 2); if (bScore) bed->score = lineFileNeedNum(lf, row, 4); if (bed->start > bed->end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); if (bed->start == bed->end) { if (allowStartEqualEnd) // Note we are tweaking binKeeper coords here, so use bed->start and bed->end. binKeeperAdd(bk, max(0, bed->start-1), bed->end+1, bed); else lineFileAbort(lf, "start==end (if this is legit, use -allowStartEqualEnd)"); } else binKeeperAdd(bk, bed->start, bed->end, bed); } lineFileClose(&lf); return hash; }
void bestProbeOverlap(struct sqlConnection *conn, char *probeTable, struct genePred *gpList, struct hash *gpToProbeHash) /* Create hash of most overlapping probe if any for each gene. Require * at least 100 base overlap. */ { /* Create a hash of binKeepers filled with probes. */ struct hash *keeperHash = keepersForChroms(conn); struct hashCookie it = hashFirst(keeperHash); struct hashEl *hel; int pslCount = 0; while ((hel = hashNext(&it)) != NULL) { char *chrom = hel->name; struct binKeeper *bk = hel->val; int rowOffset; struct sqlResult *sr = hChromQuery(conn, probeTable, chrom, NULL, &rowOffset); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct psl *psl = pslLoad(row+rowOffset); binKeeperAdd(bk, psl->tStart, psl->tEnd, psl); ++pslCount; } sqlFreeResult(&sr); } verbose(2, "Loaded %d psls from %s\n", pslCount, probeTable); /* Loop through gene list, finding best probe if any for each gene. */ struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { struct rbTree *rangeTree = genePredToRangeTree(gp, FALSE); struct psl *bestPsl = NULL; int bestOverlap = 99; /* MinOverlap - 1 */ struct binKeeper *bk = hashMustFindVal(keeperHash, gp->chrom); struct binElement *bin, *binList = binKeeperFind(bk, gp->txStart, gp->txEnd); for (bin = binList; bin != NULL; bin = bin->next) { struct psl *psl = bin->val; if (psl->strand[0] == gp->strand[0]) { int overlap = pslRangeTreeOverlap(psl, rangeTree); if (overlap > bestOverlap) { bestOverlap = overlap; bestPsl = psl; } } } if (bestPsl != NULL) hashAdd(gpToProbeHash, gp->name, bestPsl->qName); } }
struct hash *readBed(char *fileName) /* Read in bed file into hash of binKeepers keyed by * target. */ { struct lineFile *lf = NULL; struct hash *hash = newHash(0); char *row[3]; struct chromInfo *ciList = NULL, *ci; int count = 0, chromCount = 0; /* Make first pass through just figuring out maximum size * of each chromosome info. */ lf = lineFileOpen(fileName, TRUE); while (lineFileRow(lf, row)) { char *chrom = row[0]; int e = lineFileNeedNum(lf, row, 2); ci = hashFindVal(hash, chrom); if (ci == NULL) { AllocVar(ci); hashAddSaveName(hash, chrom, ci, &ci->name); slAddHead(&ciList, ci); ++chromCount; } if (e > ci->maxEnd) ci->maxEnd = e; ++count; } lineFileClose(&lf); /* Allocate binKeeper on each chromosome. */ for (ci = ciList; ci != NULL; ci = ci->next) { ci->bk = binKeeperNew(0, ci->maxEnd); } /* Make second pass filling in binKeeper */ lf = lineFileOpen(fileName, TRUE); while (lineFileRow(lf, row)) { char *chrom = row[0]; int s = lineFileNeedNum(lf, row, 1); int e = lineFileNeedNum(lf, row, 2); ci = hashMustFindVal(hash, chrom); binKeeperAdd(ci->bk, s, e, NULL); } lineFileClose(&lf); printf("Read %d items in %d target chromosomes from %s\n", count, chromCount, fileName); return hash; }
struct hash *bedsIntoKeeperHash(struct bed *bedList)
/* Return a hash of binKeepers keyed by chromosome (or contig) name, with
 * every bed of bedList added to the keeper of its chromosome under
 * chromStart..chromEnd.  The keepers are created by minChromSizeKeeperHash
 * from the sizes minChromSizeFromBeds derives from bedList. */
{
struct hash *sizes = minChromSizeFromBeds(bedList);
struct hash *keepers = minChromSizeKeeperHash(sizes);
struct bed *cur = bedList;

while (cur != NULL)
    {
    struct binKeeper *keeper = hashMustFindVal(keepers, cur->chrom);
    binKeeperAdd(keeper, cur->chromStart, cur->chromEnd, cur);
    cur = cur->next;
    }
hashFree(&sizes);
return keepers;
}
struct mouseChromCache *newMouseChromCache(char *chrom, int chromSize, char *ratMouseDir) /* Create a new chromCache. */ { struct mouseChromCache *mcc; char fileName[512]; struct lineFile *lf; char *row[3]; int start,end; long long *pPos; /* Open up file with actual alignments. Warn and return NULL * if it doesn't exist. */ sprintf(fileName, "%s/%s.axt", ratMouseDir, chrom); lf = lineFileMayOpen(fileName, TRUE); /* Allocate structure and store basic info in it. */ AllocVar(mcc); mcc->name = cloneString(chrom); mcc->size = chromSize; mcc->lf = lf; if (lf == NULL) { warn("%s doesn't exist", fileName); if (!noDieMissing) noWarnAbort(); return mcc; } /* Read index file into bk. */ sprintf(fileName, "%s/%s.axt.ix", ratMouseDir, chrom); mcc->bk = binKeeperNew(0, chromSize); lf = lineFileOpen(fileName, TRUE); verbose(1, "Reading %s\n", fileName); while (lineFileRow(lf, row)) { start = lineFileNeedNum(lf, row, 0); end = lineFileNeedNum(lf, row, 1) + start; AllocVar(pPos); *pPos = atoll(row[2]); binKeeperAdd(mcc->bk, start, end, pPos); } lineFileClose(&lf); /* Return initialized object. */ return mcc; }
struct binKeeper *loadAxtsIntoRange(char *fileName, char *tPrefix, char *qPrefix)
/* Read every axt in fileName into a new binKeeper spanning 0..maxChromSize,
 * each stored under its tStart..tEnd, and return the keeper.  Reports the
 * number loaded through uglyf.  tPrefix and qPrefix are not used here. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct binKeeper *keeper = binKeeperNew(0, maxChromSize);
struct axt *axt;
int loaded = 0;

for (axt = axtRead(lf); axt != NULL; axt = axtRead(lf))
    {
    binKeeperAdd(keeper, axt->tStart, axt->tEnd, axt);
    ++loaded;
    }
uglyf("LOaded %d from %s\n", loaded, fileName);
lineFileClose(&lf);
return keeper;
}
void chromKeeperAdd(char *chrom, int chromStart, int chromEnd, void *val)
/* Add val under chromStart..chromEnd to the binKeeper belonging to chrom.
 * The chromosome is found by scanning chromNames; the first match wins.
 * Calls errAbort if chrom is not one of the known names. */
{
int ix;

for (ix = 0; ix < chromCount; ix++)
    {
    if (!sameString(chrom, chromNames[ix]))
	continue;
    binKeeperAdd(chromRanges[ix], chromStart, chromEnd, val);
    return;
    }
errAbort("chromKeeper::chromKeeperAdd() - Don't recognize chrom %s", chrom);
}
struct hash *netToBkHash(char *netFile) /* Read net file into a hash full of binKeepers keyed by chromosome. * The binKeepers are full of nets. */ { struct hash *netHash = hashNew(0); struct lineFile *lf = lineFileOpen(netFile, TRUE); struct chainNet *net, *netList = chainNetRead(lf); for (net = netList; net != NULL; net = net->next) { if (hashLookup(netHash, net->name)) errAbort("%s has multiple %s records", netFile, net->name); struct binKeeper *bk = binKeeperNew(0, net->size); hashAdd(netHash, net->name, bk); struct cnFill *fill; for(fill=net->fillList; fill != NULL; fill = fill->next) binKeeperAdd(bk, fill->tStart, fill->tStart+fill->tSize, fill); } lineFileClose(&lf); return netHash; }
struct binKeeper *readRepeats2(char *chrom, char *rmskFileName, struct hash *tSizeHash) /* read all repeats for a chromosome of size size, returns results in binKeeper structure for fast query*/ { boolean rmskRet; struct lineFile *rmskF = NULL; struct rmskOut2 *rmsk; struct binKeeper *bk; int size; size = hashIntVal(tSizeHash, chrom); bk = binKeeperNew(0, size); assert(size > 1); rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet); while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL) { binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk); } lineFileClose(&rmskF); return bk; }
struct hash *bedsIntoHashOfKeepers(struct bed *bedList) /* Return a hash full of binKeepers, keyed by chromosome (or contig) * that contains the bedList */ { struct hash *sizeHash = chromMinSizeHash(bedList); struct hash *keeperHash = hashNew(16); struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) { struct binKeeper *keeper = hashFindVal(keeperHash, bed->chrom); if (keeper == NULL) { struct minChromSize *chrom = hashMustFindVal(sizeHash, bed->chrom); keeper = binKeeperNew(0, chrom->minSize); hashAdd(keeperHash, chrom->name, keeper); } binKeeperAdd(keeper, bed->chromStart, bed->chromEnd, bed); } hashFree(&sizeHash); return keeperHash; }
static struct cdsExon *loadExon(struct gene *gene, struct binKeeper *chrBins, struct genePred *gp, int iExon, int start, int end, int cdsOff) /* load information about an exon into various structures */ { struct cdsExon *exon; checkOverlap(gene->genes, chrBins, gp, start, end); lmAllocVar(gene->genes->memPool, exon); exon->gene = gene; exon->chromStart = start; exon->chromEnd = end; exon->frame = gp->exonFrames[iExon]; if (exon->gene->strand == '+') exon->exonNum = iExon; else exon->exonNum = (gp->exonCount-1) - iExon; exon->cdsOff = cdsOff; binKeeperAdd(chrBins, start, end, exon); slAddHead(&gene->exons, exon); return exon; }
void loadPslsFromFile(char *pslFile, char *chrom, struct sqlConnection *conn) /** Load the psls from the directed file (instead of the database. */ { struct psl *psl = NULL, *pslNext = NULL, *pslList = NULL; pslList = pslLoadAll(pslFile); for(psl = pslList; psl != NULL; psl = psl->next) { minPslStart = min(psl->tStart, minPslStart); maxPslEnd = max(psl->tEnd, maxPslEnd); } chromPslBin = binKeeperNew(minPslStart, maxPslEnd); agxSeenBin = binKeeperNew(minPslStart, maxPslEnd); for(psl = pslList; psl != NULL; psl = pslNext) { pslNext = psl->next; if(sameString(psl->tName, chrom)) binKeeperAdd(chromPslBin, psl->tStart, psl->tEnd, psl); else pslFree(&psl); } }
struct genePred *loadGenePred(char *database, char *chrom, char *track, struct binKeeper *bk) /* Load in a gene prediction track to bk. */ { struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; struct genePred *list = NULL, *el; sr = hChromQuery(conn, track, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { el = genePredLoad(row + rowOffset); binKeeperAdd(bk, el->txStart, el->txEnd, el); slAddHead(&list, el); } sqlFreeResult(&sr); hFreeConn(&conn); slReverse(&list); return list; }
struct hash *txgIntoKeeperHash(struct txGraph *txgList) /* Create a hash full of bin keepers (one for each chromosome or contig. * The binKeepers are full of txGraphs. */ { struct hash *sizeHash = txgChromMinSizeHash(txgList); struct hash *bkHash = hashNew(16); struct txGraph *txg; for (txg = txgList; txg != NULL; txg = txg->next) { struct binKeeper *bk = hashFindVal(bkHash, txg->tName); if (bk == NULL) { struct minChromSize *chrom = hashMustFindVal(sizeHash, txg->tName); verbose(3, "New binKeeper for %s\n", txg->tName); bk = binKeeperNew(0, chrom->minSize); hashAdd(bkHash, txg->tName, bk); } binKeeperAdd(bk, txg->tStart, txg->tEnd, txg); } hashFree(&sizeHash); return bkHash; }
struct hash *readLiftOverMapChainHash(char *fileName) /* taken from kent/src/hg/lib/liftOver.c */ /* Read map file into hashes. */ { struct hash *chainHash = hashNew(10); struct lineFile *lf = lineFileOpen(fileName, TRUE); struct chain *chain; struct liftOverChromMap *map; while ((chain = chainRead(lf)) != NULL) { if ((map = hashFindVal(chainHash, chain->tName)) == NULL) { AllocVar(map); map->bk = binKeeperNew(0, chain->tSize); hashAddSaveName(chainHash, chain->tName, map, &map->name); } binKeeperAdd(map->bk, chain->tStart, chain->tEnd, chain); } lineFileClose(&lf); return chainHash; }
void addGenePred(struct hash *chromHash, char **row) /* add a genePred's exons to the approriate binkeeper object in hash */ { struct genePred *gene = genePredLoad(row); int iExon; struct binKeeper *chromBins = getChromBins(chromHash, gene->chrom, gene->strand); struct geneLoc *geneLoc = geneLocNew(chromHash->lm, gene->name, gene->chrom, gene->strand, gene->txStart, gene->txEnd); for (iExon = 0; iExon < gene->exonCount; iExon++) { int exonStart = gene->exonStarts[iExon]; int exonEnd = gene->exonEnds[iExon]; if (gCdsOnly) { exonStart = max(exonStart, gene->cdsStart); exonEnd = min(exonEnd, gene->cdsEnd); } if (exonStart < exonEnd) binKeeperAdd(chromBins, exonStart, exonEnd, geneLoc); } genePredFree(&gene); }
void loadPslsFromDatabase(struct sqlConnection *conn, char *db, char *chrom) /** Load all of the desired alignments into the chromkeeper structure from the desired pslTables. */ { int i = 0; struct sqlResult *sr = NULL; char **row = NULL; int rowOffset = 0; struct psl *pslList = NULL, *psl = NULL; for(i = 0; i < numDbTables; i++) { sr = hChromQuery(conn, dbTables[i], chrom, NULL, &rowOffset); while((row = sqlNextRow(sr)) != NULL) { psl = pslLoad(row+rowOffset); slAddHead(&pslList, psl); minPslStart = min(psl->tStart, minPslStart); maxPslEnd = max(psl->tEnd, maxPslEnd); /* This just adds the mrna twice to the list, cheat way to add more weight to certain tables. */ if(weightMrna && (stringIn("refSeqAli", dbTables[i]) || stringIn("mrna", dbTables[i]))) { psl = clonePsl(psl); slAddHead(&pslList, psl); } } sqlFreeResult(&sr); } chromPslBin = binKeeperNew(minPslStart, maxPslEnd); agxSeenBin = binKeeperNew(minPslStart, maxPslEnd); for(psl = pslList; psl != NULL; psl = psl->next) { binKeeperAdd(chromPslBin, psl->tStart, psl->tEnd, psl); } }
struct hash *readRepeatsAll2(char *sizeFileName, char *rmskDir) /* read all repeats for a all chromosomes getting sizes from sizeFileNmae , returns results in hash of binKeeper structure for fast query*/ { boolean rmskRet; struct binKeeper *bk; struct lineFile *rmskF = NULL; struct rmskOut2 *rmsk; struct lineFile *lf = lineFileOpen(sizeFileName, TRUE); struct hash *hash = newHash(0); char *row[2]; char rmskFileName[256]; while (lineFileRow(lf, row)) { char *name = row[0]; int size = lineFileNeedNum(lf, row, 1); if (hashLookup(hash, name) != NULL) warn("Duplicate %s, ignoring all but first\n", name); else { bk = binKeeperNew(0, size); assert(size > 1); safef(rmskFileName, sizeof(rmskFileName), "%s/%s.fa.out",rmskDir,name); rmskOut2OpenVerify(rmskFileName ,&rmskF , &rmskRet); while ((rmsk = rmskOut2ReadNext(rmskF)) != NULL) { binKeeperAdd(bk, rmsk->genoStart, rmsk->genoEnd, rmsk); } lineFileClose(&rmskF); hashAdd(hash, name, bk); } } lineFileClose(&lf); return hash; }
void oneChrom(char *database, char *chrom, char *refAliTrack, char *bedTrack, struct hash *otherHash, struct stats *stats) /* Process one chromosome. */ { struct bed *bedList = NULL, *bed; struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); struct psl *pslList = NULL; struct dnaSeq *chromSeq = NULL; if (endsWith(bedTrack, ".bed")) { struct lineFile *lf = lineFileOpen(bedTrack, TRUE); char *row[3]; while (lineFileRow(lf, row)) { if (sameString(chrom, row[0])) { bed = bedLoad3(row); slAddHead(&bedList, bed); } } lineFileClose(&lf); } else { sr = hChromQuery(conn, bedTrack, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { bed = bedLoad3(row+rowOffset); slAddHead(&bedList, bed); } sqlFreeResult(&sr); } slReverse(&bedList); uglyf("Loaded beds\n"); sr = hChromQuery(conn, refAliTrack, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct psl *psl = pslLoad(row + rowOffset); slAddHead(&pslList, psl); binKeeperAdd(bk, psl->tStart, psl->tEnd, psl); } sqlFreeResult(&sr); uglyf("Loaded psls\n"); chromSeq = hLoadChrom(database, chrom); /* Fetch entire chromosome into memory. */ uglyf("Loaded human seq\n"); for (bed = bedList; bed != NULL; bed = bed->next) { struct binElement *el, *list = binKeeperFind(bk, bed->chromStart, bed->chromEnd); for (el = list; el != NULL; el = el->next) { struct psl *fullPsl = el->val; struct psl *psl = pslTrimToTargetRange(fullPsl, bed->chromStart, bed->chromEnd); if (psl != NULL) { foldPslIntoStats(psl, chromSeq, otherHash, stats); pslFree(&psl); } } slFreeList(&list); stats->bedCount += 1; stats->bedBaseCount += bed->chromEnd - bed->chromStart; sqlFreeResult(&sr); } freeDnaSeq(&chromSeq); pslFreeList(&pslList); binKeeperFree(&bk); hFreeConn(&conn); }
void txGeneCanonical(char *codingCluster, char *infoFile, char *noncodingGraph, char *genesBed, char *nearCoding, char *outCanonical, char *outIsoforms, char *outClusters) /* txGeneCanonical - Pick a canonical version of each gene - that is the form * to use when just interested in a single splicing varient. Produces final * transcript clusters as well. */ { /* Read in input into lists in memory. */ struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster); struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph); struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12); struct txInfo *info, *infoList = txInfoLoadAll(infoFile); struct bed *nearList = bedLoadNAll(nearCoding, 12); /* Make hash of all beds. */ struct hash *bedHash = hashNew(18); for (bed = bedList; bed != NULL; bed = bed->next) hashAdd(bedHash, bed->name, bed); /* Make has of all info. */ struct hash *infoHash = hashNew(18); for (info = infoList; info != NULL; info = info->next) hashAdd(infoHash, info->name, info); /* Make a binKeeper structure that we'll populate with coding genes. */ struct hash *sizeHash = minChromSizeFromBeds(bedList); struct hash *keeperHash = minChromSizeKeeperHash(sizeHash); /* Make list of coding genes and toss them into binKeeper. * This will eat up bed list, but bedHash is ok. */ struct gene *gene, *geneList = NULL; for (coding = codingList; coding != NULL; coding = coding->next) { gene = geneFromCluster(coding, bedHash, infoHash); slAddHead(&geneList, gene); struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom); binKeeperAdd(bk, gene->start, gene->end, gene); } /* Go through near-coding genes and add them to the coding gene * they most overlap. */ for (bed = nearList; bed != NULL; bed = nextBed) { nextBed = bed->next; gene = mostOverlappingGene(keeperHash, bed); if (gene == NULL) errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name); geneAddBed(gene, bed); } /* Add non-coding genes. 
*/ for (graph = graphList; graph != NULL; graph = graph->next) { gene = geneFromGraph(graph, bedHash); slAddHead(&geneList, gene); } /* Sort so it all looks nicer. */ slSort(&geneList, geneCmp); /* Open up output files. */ FILE *fCan = mustOpen(outCanonical, "w"); FILE *fIso = mustOpen(outIsoforms, "w"); FILE *fClus = mustOpen(outClusters, "w"); /* Loop through, making up gene name, and writing output. */ int geneId = 0; for (gene = geneList; gene != NULL; gene = gene->next) { /* Make up name. */ char name[16]; safef(name, sizeof(name), "g%05d", ++geneId); /* Reverse transcript list just to make it look better. */ slReverse(&gene->txList); /* Write out canonical file output */ bed = hashMustFindVal(bedHash, gene->niceTx->name); fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n", bed->chrom, bed->chromStart, bed->chromEnd, geneId, gene->niceTx->name, gene->niceTx->name); /* Write out isoforms output. */ for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fIso, "%d\t%s\n", geneId, bed->name); /* Write out cluster output, starting with bed 6 standard fields. */ fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t", gene->chrom, gene->start, gene->end, name, 0, gene->strand); /* Write out thick-start/thick end. */ if (gene->isCoding) { int thickStart = gene->end, thickEnd = gene->start; for (bed = gene->txList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) { thickStart = min(thickStart, bed->thickStart); thickEnd = max(thickEnd, bed->thickEnd); } } fprintf(fClus, "%d\t%d\t", thickStart, thickEnd); } else { fprintf(fClus, "%d\t%d\t", gene->start, gene->start); } /* We got no rgb value, just write out zero. */ fprintf(fClus, "0\t"); /* Get exons from exonTree. 
*/ struct range *exon, *exonList = rangeTreeList(gene->exonTree); fprintf(fClus, "%d\t", slCount(exonList)); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->start - gene->start); fprintf(fClus, "\t"); for (exon = exonList; exon != NULL; exon = exon->next) fprintf(fClus, "%d,", exon->end - exon->start); fprintf(fClus, "\t"); /* Write out associated transcripts. */ fprintf(fClus, "%d\t", slCount(gene->txList)); for (bed = gene->txList; bed != NULL; bed = bed->next) fprintf(fClus, "%s,", bed->name); fprintf(fClus, "\t"); /* Write out nice value */ fprintf(fClus, "%s\t", gene->niceTx->name); /* Write out coding/noncoding value. */ fprintf(fClus, "%d\n", gene->isCoding); } /* Close up files. */ carefulClose(&fCan); carefulClose(&fIso); carefulClose(&fClus); }
void sortGenes(struct sqlConnection *conn) /* Put up sort gene page. */ { cartWebStart(cart, database, "Finding Candidate Genes for Gene Sorter"); if (!hgNearOk(database)) errAbort("Sorry, gene sorter not available for this database."); /* Get list of regions. */ struct genoGraph *gg = ggFirstVisible(); double threshold = getThreshold(); struct bed3 *bed, *bedList = regionsOverThreshold(gg); /* Figure out what table and column are the sorter's main gene set. */ struct hash *genomeRa = hgReadRa(genome, database, "hgNearData", "genome.ra", NULL); char *geneTable = hashMustFindVal(genomeRa, "geneTable"); char *idColumn = hashMustFindVal(genomeRa, "idColumn"); /* if marker labels were present when the file was uploaded, they are saved here */ char cgmName[256]; safef(cgmName, sizeof(cgmName), "%s.cgm", gg->binFileName); struct lineFile *m = lineFileMayOpen(cgmName, TRUE); char *cgmRow[4]; cgmRow[0] = ""; /* dummy row */ cgmRow[1] = ""; cgmRow[2] = "0"; cgmRow[3] = "0"; FILE *g = NULL; int markerCount = 0; struct tempName snpTn; if (m) { /* Create custom column output file. */ trashDirFile(&snpTn, "hgg", "marker", ".mrk"); g = mustOpen(snpTn.forCgi, "w"); fprintf(g, "column name=\"%s Markers\" shortLabel=\"%s Markers over threshold\" longLabel=\"%s Markers in regions over threshold\" " "visibility=on priority=99 " "\n" , gg->shortLabel , gg->shortLabel , gg->shortLabel ); } /*** Build up hash of all transcriptHash that are in region. */ struct hash *transcriptHash = hashNew(16); /* This loop handles one chromosome at a time. It depends on * the bedList being sorted by chromosome. */ for (bed = bedList; bed != NULL; ) { /* Make binKeeper and stuff in all regions in this chromosome into it. 
*/ char *chrom = bed->chrom; int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); while (bed != NULL && sameString(chrom, bed->chrom)) { binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed); bed = bed->next; } struct binKeeper *bkGenes = NULL; if (m) bkGenes = binKeeperNew(0, chromSize); /* Query database to find out bounds of all genes on this chromosome * and if they overlap any of the regions then put them in the hash. */ char query[512]; safef(query, sizeof(query), "select name,txStart,txEnd from %s where chrom='%s'", geneTable, chrom); struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; int start = sqlUnsigned(row[1]); int end = sqlUnsigned(row[2]); if (binKeeperAnyOverlap(bk, start, end)) { hashStore(transcriptHash, name); if (m) binKeeperAdd(bkGenes, start, end, cloneString(name)); } } sqlFreeResult(&sr); if (m) { /* Read cgm file if it exists, looking at all markers on this chromosome * and if they overlap any of the regions and genes then output them. */ do { // marker, chrom, chromStart, val char *marker = cgmRow[0]; char *chr = cgmRow[1]; int start = sqlUnsigned(cgmRow[2]); int end = start+1; double val = sqlDouble(cgmRow[3]); int cmp = strcmp(chr,chrom); if (cmp > 0) break; if (cmp == 0) { if (val >= threshold) { struct binElement *el, *bkList = binKeeperFind(bkGenes, start, end); for (el = bkList; el; el=el->next) { /* output to custom column trash file */ fprintf(g, "%s %s\n", (char *)el->val, marker); } if (bkList) { ++markerCount; slFreeList(&bkList); } } } } while (lineFileRow(m, cgmRow)); } /* Clean up for this chromosome. */ binKeeperFree(&bk); if (m) { /* For speed, we do not free up the values (cloned the kg names earlier) */ binKeeperFree(&bkGenes); } } /* Get list of all transcripts in regions. */ struct hashEl *el, *list = hashElListHash(transcriptHash); /* Create file with all matching gene IDs. 
*/ struct tempName keyTn; trashDirFile(&keyTn, "hgg", "key", ".key"); FILE *f = mustOpen(keyTn.forCgi, "w"); for (el = list; el != NULL; el = el->next) fprintf(f, "%s\n", el->name); carefulClose(&f); /* Print out some info. */ hPrintf("Thresholding <i>%s</i> at %g. ", gg->shortLabel, threshold); hPrintf("There are %d regions covering %lld bases.<BR>\n", slCount(bedList), bedTotalSize((struct bed*)bedList) ); hPrintf("Installed a Gene Sorter filter that selects only genes in these regions.<BR>\n"); if (m) { hPrintf("There are %d markers in the regions over threshold that overlap knownGenes.<BR>\n", markerCount); hPrintf("Installed a Gene Sorter custom column called \"%s Markers\" with these markers.<BR>\n", gg->shortLabel); } /* close custom column output file */ if (m) { lineFileClose(&m); carefulClose(&g); } /* Stuff cart variable with name of file. */ char keyCartName[256]; safef(keyCartName, sizeof(keyCartName), "%s%s.keyFile", advFilterPrefix, idColumn); cartSetString(cart, keyCartName, keyTn.forCgi); cartSetString(cart, customFileVarName, snpTn.forCgi); char snpVisCartNameTemp[256]; char *snpVisCartName = NULL; safef(snpVisCartNameTemp, sizeof(snpVisCartNameTemp), "%s%s Markers.vis", colConfigPrefix, gg->shortLabel); snpVisCartName = replaceChars(snpVisCartNameTemp, " ", "_"); cartSetString(cart, snpVisCartName, "1"); freeMem(snpVisCartName); hPrintf("<FORM ACTION=\"../cgi-bin/hgNear\" METHOD=GET>\n"); cartSaveSession(cart); hPrintf("<CENTER>"); cgiMakeButton("submit", "go to gene sorter"); hPrintf("</CENTER>"); hPrintf("</FORM>"); cartWebEnd(); }