void bedIntersect(char *aFile, char *bFile, char *outFile) /* bedIntersect - Intersect two bed files. */ { struct lineFile *lf = lineFileOpen(aFile, TRUE); struct hash *bHash = readBed(bFile); FILE *f = mustOpen(outFile, "w"); char *row[40]; int wordCount; while ((wordCount = (strictTab ? lineFileChopTab(lf, row) : lineFileChop(lf, row))) != 0) { char *chrom = row[0]; int start = lineFileNeedNum(lf, row, 1); int end = lineFileNeedNum(lf, row, 2); if (start > end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); if (start == end && !allowStartEqualEnd) lineFileAbort(lf, "start==end (if this is legit, use -allowStartEqualEnd)"); struct binKeeper *bk = hashFindVal(bHash, chrom); if (bk != NULL) { struct binElement *hitList = NULL, *hit; if (allowStartEqualEnd && start == end) hitList = binKeeperFind(bk, start-1, end+1); else hitList = binKeeperFind(bk, start, end); if (aHitAny) { for (hit = hitList; hit != NULL; hit = hit->next) { float cov = getCov(start, end, hit->val); if (cov >= minCoverage) { outputBed(f, row, wordCount, start, end, hit->val); break; } else { struct bed5 *b = hit->val; verbose(1, "filter out %s %d %d %d %d overlap %d %d %d %.3f\n", chrom, start, end, b->start, b->end, positiveRangeIntersection(start, end, b->start, b->end), end-start, b->end-b->start, cov); } } } else { for (hit = hitList; hit != NULL; hit = hit->next) { if (getCov(start, end, hit->val) >= minCoverage) outputBed(f, row, wordCount, start, end, hit->val); } } slFreeList(&hitList); } } }
boolean hasRetainedIntron(struct bed *bed, struct hash *altSpliceHash) /* See if any exons in bed enclose any retained introns in keeper-hash */ { struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom); boolean gotOne = FALSE; if (keeper == NULL) return FALSE; int i; for (i=0; i<bed->blockCount; ++i) { int start = bed->chromStarts[i] + bed->chromStart; int end = start + bed->blockSizes[i]; struct binElement *bin, *binList = binKeeperFind(keeper, start, end); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *intron = bin->val; if (sameString(intron->name, "retainedIntron")) { if (intron->strand[0] == bed->strand[0] && start < intron->chromStart && end > intron->chromEnd) { gotOne = TRUE; break; } } } slFreeList(&binList); if (gotOne) break; } return gotOne; }
int countMatchingIntrons(struct bed *bed, char *type, struct hash *altSpliceHash) /* Count number of introns of a particular type . */ { struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom); if (keeper == NULL) return 0; int total = 0; int i, lastBlock = bed->blockCount-1; for (i=0; i<lastBlock; ++i) { int start = bed->chromStarts[i] + bed->blockSizes[i] + bed->chromStart; int end = bed->chromStart + bed->chromStarts[i+1]; struct binElement *bin, *binList = binKeeperFind(keeper, start, end); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *intron = bin->val; if (sameString(intron->name, type)) { if (intron->strand[0] == bed->strand[0] && start == intron->chromStart && end == intron->chromEnd) { if (end - start > 3) { ++total; break; } } } } slFreeList(&binList); } return total; }
struct bed *mostOverlappingBed(struct bed *ref, struct hash *geneHash, double *retRatio) /* Find most overlapping gene to ref. */ { struct bed *bestBed = NULL; double bestRatio = 0; struct binKeeper *bk = hashFindVal(geneHash, ref->chrom); if (bk != NULL) { struct binElement *el, *list = binKeeperFind(bk, ref->chromStart, ref->chromEnd); for (el = list; el != NULL; el = el->next) { struct bed *bed = el->val; if (bed->strand[0] == ref->strand[0]) { double ratio = bedOverlapRatio(ref, bed); if (ratio > bestRatio) { bestRatio = ratio; bestBed = bed; } } } } *retRatio = bestRatio; return bestBed; }
struct bed *findCompatible(struct bed *newBed, struct hash *oldHash, struct hash *usedHash) /* Try and find an old bed compatible with new bed. */ { struct binKeeper *bk = hashFindVal(oldHash, newBed->chrom); int bestDiff = BIGNUM; struct bed *bestBed = NULL; if (bk == NULL) return NULL; struct binElement *bin, *binList = binKeeperFind(bk, newBed->chromStart, newBed->chromEnd); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *oldBed = bin->val; if (oldBed->strand[0] == newBed->strand[0]) { if (!hashLookup(usedHash, oldBed->name)) { if (bedCompatibleExtension(oldBed, newBed) || endUtrChangeOnly(oldBed, newBed)) { int diff = bedTotalBlockSize(oldBed) - bedTotalBlockSize(newBed); if (diff < 0) diff = -diff; if (diff < bestDiff) { bestDiff = diff; bestBed = oldBed; } } } } } slFreeList(&binList); return bestBed; }
struct bed *findMostOverlapping(struct bed *bed, struct hash *keeperHash) /* Try find most overlapping thing to bed in keeper hash. */ { struct bed *bestBed = NULL; int bestOverlap = 0; struct binKeeper *bk = hashFindVal(keeperHash, bed->chrom); if (bk == NULL) return NULL; struct binElement *bin, *binList = binKeeperFind(bk, bed->chromStart, bed->chromEnd); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *bed2 = bin->val; if (bed2->strand[0] == bed->strand[0]) { int overlap = bedSameStrandOverlap(bed2, bed); if (overlap > bestOverlap) { bestOverlap = overlap; bestBed = bed2; } } } slFreeList(&binList); return bestBed; }
void axtAndBed(char *inAxt, char *inBed, char *outAxt) /* axtAndBed - Intersect an axt with a bed file and output axt.. */ { struct hash *tHash = readBed(inBed); /* target keyed, binKeeper value */ struct lineFile *lf = lineFileOpen(inAxt, TRUE); struct axt *axt; struct binElement *list = NULL, *el; FILE *f = mustOpen(outAxt, "w"); struct axtScoreScheme *ss = axtScoreSchemeDefault(); while ((axt = axtRead(lf)) != NULL) { struct chromInfo *ci = hashFindVal(tHash, axt->tName); if (ci != NULL) { list = binKeeperFind(ci->bk, axt->tStart, axt->tEnd); if (list != NULL) { /* Flatten out any overlapping elements by projecting them * onto a 0/1 valued character array and then looking for * runs of 1 in this array. */ int tStart = axt->tStart; int tEnd = axt->tEnd; int tSize = tEnd - tStart; int i, s = 0; char c, lastC = 0; char *merger = NULL; AllocArray(merger, tSize+1); for (el = list; el != NULL; el = el->next) { int s = el->start - tStart; int e = el->end - tStart; int sz; if (s < 0) s = 0; if (e > tSize) e = tSize; sz = e - s; if (sz > 0) memset(merger + s, 1, sz); } for (i=0; i<=tSize; ++i) { c = merger[i]; if (c && !lastC) { s = i; lastC = c; } else if (!c && lastC) { axtSubsetOnT(axt, s+tStart, i+tStart, ss, f); lastC = c; } } freez(&merger); slFreeList(&list); } } axtFree(&axt); } }
void doOneChrom(char *database, char *chrom, char *rnaTable, char *expTable, FILE *f) /* Process one chromosome. */ { int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; struct bed *exp, *rna; int rowOffset; struct binElement *be, *beList; int oneCount; /* Load up expTable into bin-keeper. */ sr = hChromQuery(conn, expTable, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { exp = bedLoadN(row + rowOffset, 12); binKeeperAdd(bk, exp->chromStart, exp->chromEnd, exp); } sqlFreeResult(&sr); /* Loop through rnaTable and look at intersections. */ sr = hChromQuery(conn, rnaTable, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { rna = bedLoadN(row + rowOffset, 12); beList = binKeeperFind(bk, rna->chromStart, rna->chromEnd); oneCount = 0; for (be = beList; be != NULL; be = be->next) { exp = be->val; if (exp->strand[0] == rna->strand[0]) { ++oneCount; ++hitCount; // fprintf(f, "%s:%d-%d\t%s\t%s\n", // rna->chrom, rna->chromStart, rna->chromEnd, rna->name, exp->name); } } slFreeList(&beList); if (oneCount == 0) { ++missCount; fprintf(f, "miss %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name); } else if (oneCount == 1) { fprintf(f, "uniq %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name); ++uniqCount; } else { fprintf(f, "dupe %s:%d-%d %c %s\n", rna->chrom, rna->chromStart, rna->chromEnd, rna->strand[0], rna->name); ++dupeCount; } } sqlFreeResult(&sr); hFreeConn(&conn); }
struct bed *findExact(struct bed *newBed, struct hash *oldHash, struct hash *usedHash) /* Try and find an old bed identical with new bed. */ { struct binKeeper *bk = hashFindVal(oldHash, newBed->chrom); if (bk == NULL) return NULL; struct bed *matchingBed = NULL; struct binElement *bin, *binList = binKeeperFind(bk, newBed->chromStart, newBed->chromEnd); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *oldBed = bin->val; if (oldBed->strand[0] == newBed->strand[0]) { if (!hashLookup(usedHash, oldBed->name)) { if (bedExactMatch(oldBed, newBed)) { matchingBed = oldBed; break; } } } } slFreeList(&binList); return matchingBed; }
char *findCommonName(struct bed *range, struct binKeeper *knownBk, struct hash *refLinkHash) /* Try and find a common name for range based on overlap with * known genes. */ { struct binElement *beList=NULL, *be; struct refLink *link = NULL; struct genePred *gp; int matchCount = 0; beList = binKeeperFind(knownBk, range->chromStart, range->chromEnd); for (be = beList; be != NULL; be = be->next) { gp = be->val; if (gp->strand[0] == range->strand[0]) { ++matchCount; link = hashFindVal(refLinkHash, gp->name); } } slFreeList(&beList); if (matchCount == 1 && link != 0) return link->name; else return range->name; }
enum remapResult remapBase(struct hash *chainHash, char *orig_chrom, int orig_base, char **dest_chrom, int *dest_base) { struct liftOverChromMap *map = hashFindVal(chainHash, orig_chrom); struct binElement *list = NULL; struct chain *chainHit = NULL; struct chain *toFree; struct chain *subChain; int start = orig_base, end = start+1; if (map) list = binKeeperFind(map->bk, start, start+1); if (!list) return deleted; else if (list->next != NULL) { slFreeList(&list); return duplicated; } chainHit = list->val; if (!mapThroughChain(chainHit, 1, &start, &end, &subChain, &toFree)) { slFreeList(&list); return problem; } chainFree(&toFree); *dest_base = start; *dest_chrom = chainHit->qName; slFreeList(&list); return lifted; }
void bestProbeOverlap(struct sqlConnection *conn, char *probeTable, struct genePred *gpList, struct hash *gpToProbeHash) /* Create hash of most overlapping probe if any for each gene. Require * at least 100 base overlap. */ { /* Create a hash of binKeepers filled with probes. */ struct hash *keeperHash = keepersForChroms(conn); struct hashCookie it = hashFirst(keeperHash); struct hashEl *hel; int pslCount = 0; while ((hel = hashNext(&it)) != NULL) { char *chrom = hel->name; struct binKeeper *bk = hel->val; int rowOffset; struct sqlResult *sr = hChromQuery(conn, probeTable, chrom, NULL, &rowOffset); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct psl *psl = pslLoad(row+rowOffset); binKeeperAdd(bk, psl->tStart, psl->tEnd, psl); ++pslCount; } sqlFreeResult(&sr); } verbose(2, "Loaded %d psls from %s\n", pslCount, probeTable); /* Loop through gene list, finding best probe if any for each gene. */ struct genePred *gp; for (gp = gpList; gp != NULL; gp = gp->next) { struct rbTree *rangeTree = genePredToRangeTree(gp, FALSE); struct psl *bestPsl = NULL; int bestOverlap = 99; /* MinOverlap - 1 */ struct binKeeper *bk = hashMustFindVal(keeperHash, gp->chrom); struct binElement *bin, *binList = binKeeperFind(bk, gp->txStart, gp->txEnd); for (bin = binList; bin != NULL; bin = bin->next) { struct psl *psl = bin->val; if (psl->strand[0] == gp->strand[0]) { int overlap = pslRangeTreeOverlap(psl, rangeTree); if (overlap > bestOverlap) { bestOverlap = overlap; bestPsl = psl; } } } if (bestPsl != NULL) hashAdd(gpToProbeHash, gp->name, bestPsl->qName); } }
int bkCountOverlappingRange(struct binKeeper *bk, int start, int end)
/* Return biggest overlap of anything in binKeeper with given range.
 * The overlap of each hit is computed with rangeIntersection() on the
 * hit's own start/end.  Returns 0 when nothing overlaps. */
{
struct binElement *el, *list = binKeeperFind(bk, start, end);
int overlap, bestOverlap = 0;
for (el = list; el != NULL; el = el->next)
    {
    overlap = rangeIntersection(el->start, el->end, start, end);
    if (overlap > bestOverlap)
        bestOverlap = overlap;
    }
/* Free the result list, as the other binKeeperFind callers do. */
slFreeList(&list);
return bestOverlap;
}
struct psl *getPslsFromCache(char *chrom, int chromStart, int chromEnd)
/** Gather the psls that chromPslBin holds for chromStart-chromEnd into a
    list.  Each psl is pushed onto the head of the result, so the list is in
    the reverse of the order binKeeperFind returned.  The psls are the cached
    objects themselves, relinked through their next field, not copies.  The
    chrom argument is not consulted here. */
{
struct psl *result = NULL;
struct binElement *hits = binKeeperFind(chromPslBin, chromStart, chromEnd);
struct binElement *hit = hits;

while (hit != NULL)
    {
    struct psl *cached = hit->val;
    slAddHead(&result, cached);
    hit = hit->next;
    }
slFreeList(&hits);
return result;
}
void checkForClusters(char *chromName) /* describe collisions */ { char query[512]; struct sqlConnection *conn = hAllocConn(); struct sqlResult *sr; char **row; int start = 0; int end = 0; char *rsId = NULL; struct binElement *el, *elList = NULL; char *matchName = NULL; int candidateCount = 0; int matchCount = 0; verbose(1, "checking for collisions...\n"); safef(query, sizeof(query), "select chromStart, chromEnd, name from %s where chrom = '%s'", snpTable, chromName); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { candidateCount++; start = sqlUnsigned(row[0]); end = sqlUnsigned(row[1]); rsId = cloneString(row[2]); elList = binKeeperFind(snps, start, end); for (el = elList; el != NULL; el = el->next) { matchName = cloneString((char *)el->val); /* skip self hits */ if (sameString(matchName, rsId)) continue; fprintf(outputFileHandle, "%s\t%d\t%d\t%s\n", chromName, start, end, (char *)el->val); matchCount++; } } sqlFreeResult(&sr); hFreeConn(&conn); verbose(1, " candidate count = %d\n", candidateCount); verbose(1, " match count = %d\n", matchCount); }
int addIntronBleed(struct bed *bed, struct hash *altSpliceHash) /* Return the number of bases at start or end that bleed into introns * of other, probably better, transcripts. */ { struct binKeeper *keeper = hashFindVal(altSpliceHash, bed->chrom); if (keeper == NULL) return 0; int i; int total = 0; int lastBlock = bed->blockCount-1; if (lastBlock == 0) return 0; /* Single exon case. */ /* This funny loop just checks first and last block. */ for (i=0; i<=lastBlock; i += lastBlock) { int start = bed->chromStarts[i] + bed->chromStart; int end = start + bed->blockSizes[i]; struct binElement *bin, *binList = binKeeperFind(keeper, start, end); for (bin = binList; bin != NULL; bin = bin->next) { struct bed *bleeder = bin->val; if (sameString(bleeder->name, "bleedingExon")) { if (bleeder->strand[0] == bed->strand[0]) { if (i == 0) /* First block, want start to be same */ { if (bleeder->chromStart == start && bleeder->chromEnd < end) total += bleeder->chromEnd - bleeder->chromStart; break; } else if (i == lastBlock) { if (bleeder->chromEnd == end && bleeder->chromStart > start) total += bleeder->chromEnd - bleeder->chromStart; break; } } } } slFreeList(&binList); } return total; }
struct cnFill *netFillAt(char *chrom, int start, int end, struct hash *netHash)
/* Return the fills stored for start-end in the binKeeper kept under chrom
 * in netHash, as a list in the same order binKeeperFind reported them.
 * The fills are the stored objects relinked through their next field.
 * Returns NULL when the chromosome has no keeper or nothing is found. */
{
struct binKeeper *bk = hashFindVal(netHash, chrom);
struct cnFill *result = NULL;
struct binElement *hits, *hit;

if (bk == NULL)
    return NULL;
hits = binKeeperFind(bk, start, end);
hit = hits;
while (hit != NULL)
    {
    struct cnFill *fill = hit->val;
    slAddHead(&result, fill);
    hit = hit->next;
    }
slFreeList(&hits);
/* Head insertion reversed the order; put it back. */
slReverse(&result);
return result;
}
void findOverlapingExons(struct geneLoc **geneLocList, struct binKeeper *chromBins, int exonStart, int exonEnd) /* Find overlaping exons, add their genes to the list if not already there */ { struct binElement *overExons = binKeeperFind(chromBins, exonStart, exonEnd); struct binElement *overExon; int overLen; for (overExon = overExons; overExon != NULL; overExon = overExon->next) { if (overlapsByThreshold(overExon, exonStart, exonEnd, &overLen)) { struct geneLoc *gl = overExon->val; gl->numOverlap += overLen; if (!containsGeneLoc(geneLocList, gl)) slAddHead(geneLocList, gl); } } }
struct txGraph *agxForCoordinates(char *chrom, int chromStart, int chromEnd, char strand, struct hash *orthoChromHash) /* Get list of graphs that cover a particular region. */ { struct binElement *beList = NULL, *be = NULL; struct txGraph *agx = NULL, *agxList = NULL; struct binKeeper *bk = hashFindVal(orthoChromHash, chrom); if (bk != NULL) { beList = binKeeperFind(bk, chromStart, chromEnd); for(be = beList; be != NULL; be = be->next) { agx = be->val; if(agx->strand[0] == strand) slSafeAddHead(&agxList, agx); } slReverse(&agxList); slFreeList(&beList); } return agxList; }
boolean agxIsRedundant(struct altGraphX *agx) /** Return TRUE if there has already been an altGraphX record that was a superSet of this data. */ { struct binElement *be = NULL, *beList = NULL; struct altGraphX *agxSeen = NULL; boolean alreadySeen = FALSE; beList = binKeeperFind(agxSeenBin, agx->tStart, agx->tEnd); for(be = beList; be != NULL; be = be->next) { agxSeen = (struct altGraphX *)be->val; if(agxSeen == agx) continue; if(agxIsSubset(agx, agxSeen)) { alreadySeen = TRUE; break; } } slFreeList(&beList); return alreadySeen; }
struct gene *mostOverlappingGene(struct hash *keeperHash, struct bed *bed) /* Find most overlapping gene in hash of keepers of genes. */ { struct binKeeper *bk = hashMustFindVal(keeperHash, bed->chrom); struct binElement *bin, *binList = binKeeperFind(bk, bed->chromStart, bed->chromEnd); int bestOverlap = 0; struct gene *bestGene = NULL; for (bin = binList; bin != NULL; bin = bin->next) { struct gene *gene = bin->val; if (gene->strand == bed->strand[0]) { int overlap = bedRangeTreeOverlap(bed, gene->exonTree); if (overlap > bestOverlap) { bestOverlap = overlap; bestGene = gene; } } } slFreeList(&binList); return bestGene; }
struct binElement *chromKeeperFind(char *chrom, int chromStart, int chromEnd) /* Return a list of all items in chromKeeper that intersect range. Free this list with slFreeList. */ { int i; static boolean warned = FALSE; struct binElement *be = NULL; boolean found = FALSE; for(i=0; i<chromCount; i++) { if(sameString(chromNames[i], chrom)) { be = binKeeperFind(chromRanges[i], chromStart, chromEnd); found = TRUE; break; } } if(!found && !warned) { warn("chromKeeper::chromKeeperFind() - Don't recognize chrom %s", chrom); warned = TRUE; } return be; }
void checkExp(char *bedFileName, char *tNibDir, char *nibList)
/* For each pseudoGeneLink row read from bedFileName, write to outFile one
 * tab-separated line per mRNA overlapping the retrogene locus, carrying
 * three counts: positions recorded in the mismatch list where the mRNA base
 * equals the retro base, equals the parent base, or equals neither.
 *   bedFileName - input rows with PSEUDOGENELINK_NUM_COLS columns.
 *   tNibDir     - directory passed to nibInfoFromCache() for both the query
 *                 and target sequence of the chain.
 *   nibList     - passed through to the external pslPretty command.
 * Uses the file-level mrnaHash, chainHash, nibHash, twoBitFile, maxGap and
 * outFile.  Writes temporary files under /tmp named after ps->chrom and
 * runs pslPretty through system(); aborts if that command fails. */
{
struct lineFile *bf = lineFileOpen(bedFileName , TRUE), *af = NULL;
char *row[PSEUDOGENELINK_NUM_COLS] ;
struct pseudoGeneLink *ps;
char *tmpName[512], cmd[512];
struct axt *axtList = NULL, *axt, *mAxt = NULL;
struct dnaSeq *qSeq = NULL, *tSeq = NULL, *seqList = NULL;
struct nibInfo *qNib = NULL, *tNib = NULL;
FILE *op;
int ret;

if (nibHash == NULL)
    nibHash = hashNew(0);
while (lineFileNextRow(bf, row, ArraySize(row)))
    {
    struct misMatch *misMatchList = NULL;
    struct binKeeper *bk = NULL;
    struct binElement *el, *elist = NULL;
    struct psl *mPsl = NULL, *rPsl = NULL, *pPsl = NULL, *psl ;
    struct misMatch *mf = NULL;
    ps = pseudoGeneLinkLoad(row);
    /* NOTE(review): sizeof(tmpName) is the array's size in bytes, not its
     * element count - confirm what chopByChar expects as its last argument.
     * The pieces in tmpName are not read again in this function. */
    tmpName[0] = cloneString(ps->name);
    chopByChar(tmpName[0], '.', tmpName, sizeof(tmpName));
    verbose(2,"name %s %s:%d-%d\n", ps->name, ps->chrom, ps->chromStart,ps->chromEnd);

    /* get expressed retro from hash */
    /* NOTE(review): bk is not checked for NULL before use here or in the
     * lookups below - confirm every chrom is guaranteed to be in the hash.
     * This pass only logs; elist is overwritten without being freed. */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart, ps->chromEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        rPsl = el->val;
        verbose(2,"retroGene %s %s:%d-%d\n",rPsl->qName, ps->chrom, ps->chromStart,ps->chromEnd);
        }

    /* find mrnas that overlap parent gene (logging only) */
    bk = hashFindVal(mrnaHash, ps->gChrom);
    elist = binKeeperFindSorted(bk, ps->gStart , ps->gEnd ) ;
    for (el = elist; el != NULL ; el = el->next)
        {
        pPsl = el->val;
        verbose(2,"parent %s %s:%d %d,%d\n", pPsl->qName, pPsl->tName,pPsl->tStart, pPsl->match, pPsl->misMatch);
        }

    /* find self chain: chains over the retro locus are sorted with
     * chainCmpScoreDesc and the first one whose query side is on the parent
     * chromosome and intersects the parent range is used */
    bk = hashFindVal(chainHash, ps->chrom);
    elist = binKeeperFind(bk, ps->chromStart , ps->chromEnd ) ;
    slSort(&elist, chainCmpScoreDesc);
    for (el = elist; el != NULL ; el = el->next)
        {
        struct chain *chain = el->val, *subChain, *retChainToFree, *retChainToFree2;
        int qs = chain->qStart;
        int qe = chain->qEnd;
        int id = chain->id;
        /* For a '-' strand query, flip the coordinates against qSize. */
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        if (!sameString(chain->qName , ps->gChrom) ||
            !positiveRangeIntersection(qs, qe, ps->gStart, ps->gEnd))
            {
            verbose(2," wrong chain %s:%d-%d %s:%d-%d parent %s:%d-%d\n", chain->qName, qs, qe, chain->tName,chain->tStart,chain->tEnd, ps->gChrom,ps->gStart,ps->gEnd);
            continue;
            }
        verbose(2,"chain id %d %4.0f",chain->id, chain->score);
        /* Trim the chain to the retro locus shrunk by 7 bases at each end
         * on the target side, then to the parent range on the query side.
         * NOTE(review): the query subset is given ps->gStart/gEnd as-is even
         * for '-' strand chains - confirm the coordinate convention. */
        chainSubsetOnT(chain, ps->chromStart+7, ps->chromEnd-7, &subChain, &retChainToFree);
        if (subChain != NULL)
            chain = subChain;
        chainSubsetOnQ(chain, ps->gStart, ps->gEnd, &subChain, &retChainToFree2);
        if (subChain != NULL)
            chain = subChain;
        if (chain->qStrand == '-')
            {
            qs = chain->qSize - chain->qEnd;
            qe = chain->qSize - chain->qStart;
            }
        verbose(2," %s:%d-%d %s:%d-%d ", chain->qName, qs, qe, chain->tName,chain->tStart,chain->tEnd);
        if (subChain != NULL)
            verbose(2,"subChain %s:%d-%d %s:%d-%d\n", subChain->qName, subChain->qStart, subChain->qEnd, subChain->tName,subChain->tStart,subChain->tEnd);
        /* Load both sequences, convert the chain to axt blocks and feed
         * each block to addMisMatch() to build misMatchList. */
        qNib = nibInfoFromCache(nibHash, tNibDir, chain->qName);
        tNib = nibInfoFromCache(nibHash, tNibDir, chain->tName);
        tSeq = nibInfoLoadStrand(tNib, chain->tStart, chain->tEnd, '+');
        qSeq = nibInfoLoadStrand(qNib, chain->qStart, chain->qEnd, chain->qStrand);
        axtList = chainToAxt(chain, qSeq, chain->qStart, tSeq, chain->tStart, maxGap, BIGNUM);
        verbose(2,"axt count %d misMatch cnt %d\n",slCount(axtList), slCount(misMatchList));
        for (axt = axtList; axt != NULL ; axt = axt->next)
            {
            addMisMatch(&misMatchList, axt, chain->qSize);
            }
        verbose(2,"%d in mismatch list %s id %d \n",slCount(misMatchList), chain->qName, id);
        chainFree(&retChainToFree);
        chainFree(&retChainToFree2);
        /* Only the first acceptable chain is processed. */
        break;
        }

    /* create axt of each expressed retroGene to parent gene */
    /* get alignment for each mrna overlapping retroGene */
    bk = hashFindVal(mrnaHash, ps->chrom);
    elist = binKeeperFindSorted(bk, ps->chromStart , ps->chromEnd ) ;
        {
        char queryName[512];
        char axtName[512];
        char pslName[512];
        safef(queryName, sizeof(queryName), "/tmp/query.%s.fa", ps->chrom);
        safef(axtName, sizeof(axtName), "/tmp/tmp.%s.axt", ps->chrom);
        safef(pslName, sizeof(pslName), "/tmp/tmp.%s.psl", ps->chrom);
        /* NOTE(review): the fopen result is not checked before it is
         * written to. */
        op = fopen(pslName,"w");
        /* Write each overlapping mRNA psl and collect its sequence.
         * NOTE(review): seqList and mAxt are declared outside the row loop
         * and are not reset here, so they appear to carry over between
         * input rows - confirm this is intended. */
        for (el = elist ; el != NULL ; el = el->next)
            {
            psl = el->val;
            pslOutput(psl, op, '\t','\n');
            qSeq = twoBitReadSeqFrag(twoBitFile, psl->qName, 0, 0);
            if (qSeq != NULL)
                slAddHead(&seqList, qSeq);
            else
                errAbort("seq %s not found \n", psl->qName);
            }
        fclose(op);
        faWriteAll(queryName, seqList);
        /* Run pslPretty on the temporary files, then read back the axt
         * records it produced. */
        safef(cmd,sizeof(cmd),"pslPretty -long -axt %s %s %s %s",pslName , nibList, queryName, axtName);
        ret = system(cmd);
        if (ret != 0)
            errAbort("ret is %d %s\n",ret,cmd);
        verbose(2, "ret is %d %s\n",ret,cmd);
        af = lineFileOpen(axtName, TRUE);
        while ((axt = axtRead(af)) != NULL)
            slAddHead(&mAxt, axt);
        lineFileClose(&af);
        }
    slReverse(&mAxt);

    /* for each parent/retro pair, count bases matching retro and parent better */
    /* The mRNA list and the axt list are walked in step; the assert below
     * checks that their query names agree. */
    for (el = elist; el != NULL ; el = el->next)
        {
        int i, scoreRetro=0, scoreParent=0, scoreNeither=0;
        struct dyString *parentMatch = newDyString(16*1024);
        struct dyString *retroMatch = newDyString(16*1024);
        mPsl = el->val;
        if (mAxt != NULL)
            {
            verbose(2,"mrna %s %s:%d %d,%d axt %s\n", mPsl->qName, mPsl->tName,mPsl->tStart, mPsl->match, mPsl->misMatch, mAxt->qName);
            assert(sameString(mPsl->qName, mAxt->qName));
            for (i = 0 ; i< (mPsl->tEnd-mPsl->tStart) ; i++)
                {
                /* Offset between the axt's target start and the psl's. */
                int j = mAxt->tStart - mPsl->tStart;
                /* NOTE(review): this passes &misMatchList, the address of
                 * the list variable, while the other slCount calls pass the
                 * list itself - confirm intended. */
                verbose(5, "listLen = %d\n",slCount(&misMatchList));
                if ((mf = matchFound(&misMatchList, (mPsl->tStart)+i)) != NULL)
                    {
                    /* NOTE(review): qSym is indexed with j+i without a
                     * visible bounds check, and the index ignores any gap
                     * columns in the axt - confirm. */
                    if (toupper(mf->retroBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match retro[%d] %d %c == %c parent %c %d\n", i,mf->retroLoc, mf->retroBase, mAxt->qSym[j+i], mf->parentBase, mf->parentLoc);
                        dyStringPrintf(retroMatch, "%d,", mf->retroLoc);
                        scoreRetro++;
                        }
                    else if (toupper(mf->parentBase) == toupper(mAxt->qSym[j+i]))
                        {
                        verbose (3,"match parent[%d] %d %c == %c retro %c %d\n", i,mf->parentLoc, mf->parentBase, mAxt->qSym[j+i], mf->retroBase, mf->retroLoc);
                        dyStringPrintf(parentMatch, "%d,", mf->parentLoc);
                        scoreParent++;
                        }
                    else
                        {
                        verbose (3,"match neither[%d] %d %c != %c retro %c %d\n", i,mf->parentLoc, mf->parentBase, mAxt->tSym[j+i], mf->retroBase, mf->retroLoc);
                        scoreNeither++;
                        }
                    }
                }
            verbose(2,"final score %s parent %d retro %d neither %d\n", mPsl->qName, scoreParent, scoreRetro, scoreNeither);
            /* One output line per mRNA: locus, mRNA alignment, the three
             * counts, then the comma-separated parent and retro positions. */
            fprintf(outFile,"%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n", ps->chrom, ps->chromStart, ps->chromEnd, ps->name, ps->score, mPsl->tName, mPsl->tStart, mPsl->tEnd, mPsl->qName, scoreParent, scoreRetro, scoreNeither, parentMatch->string, retroMatch->string);
            mAxt = mAxt->next;
            }
        dyStringFree(&parentMatch);
        dyStringFree(&retroMatch);
        }
    }
}
void oneChromInput(char *database, char *chrom, int chromSize, char *rangeTrack, char *expTrack, struct hash *refLinkHash, struct hash *erHash, FILE *f) /* Read in info for one chromosome. */ { struct binKeeper *rangeBk = binKeeperNew(0, chromSize); struct binKeeper *expBk = binKeeperNew(0, chromSize); struct binKeeper *knownBk = binKeeperNew(0, chromSize); struct bed *rangeList = NULL, *range; struct bed *expList = NULL; struct genePred *knownList = NULL; struct rangeInfo *riList = NULL, *ri; struct hash *riHash = hashNew(0); /* rangeInfo values. */ struct binElement *rangeBeList = NULL, *rangeBe, *beList = NULL, *be; /* Load up data from database. */ rangeList = loadBed(database, chrom, rangeTrack, 12, rangeBk); expList = loadBed(database, chrom, expTrack, 15, expBk); knownList = loadGenePred(database, chrom, "refGene", knownBk); /* Build range info basics. */ rangeBeList = binKeeperFindAll(rangeBk); for (rangeBe = rangeBeList; rangeBe != NULL; rangeBe = rangeBe->next) { range = rangeBe->val; AllocVar(ri); slAddHead(&riList, ri); hashAddSaveName(riHash, range->name, ri, &ri->id); ri->range = range; ri->commonName = findCommonName(range, knownBk, refLinkHash); } slReverse(&riList); /* Mark split ones. */ beList = binKeeperFindAll(expBk); for (be = beList; be != NULL; be = be->next) { struct bed *exp = be->val; struct binElement *subList = binKeeperFind(rangeBk, exp->chromStart, exp->chromEnd); if (slCount(subList) > 1) { struct binElement *sub; for (sub = subList; sub != NULL; sub = sub->next) { struct bed *range = sub->val; struct rangeInfo *ri = hashMustFindVal(riHash, range->name); ri->isSplit = TRUE; } } slFreeList(&subList); } /* Output the nice ones: not split and having some expression info. */ for (ri = riList; ri != NULL; ri = ri->next) { if (!ri->isSplit) { struct bed *range = ri->range; beList = binKeeperFind(expBk, range->chromStart, range->chromEnd); if (beList != NULL) outputAveraged(f, ri, erHash, beList); slFreeList(&beList); } } /* Clean up time! 
*/ freeHash(&riHash); genePredFreeList(&knownList); bedFree(&rangeList); bedFree(&expList); slFreeList(&rangeBeList); slFreeList(&beList); slFreeList(&riList); binKeeperFree(&rangeBk); binKeeperFree(&expBk); binKeeperFree(&knownBk); }
void sortGenes(struct sqlConnection *conn) /* Put up sort gene page. */ { cartWebStart(cart, database, "Finding Candidate Genes for Gene Sorter"); if (!hgNearOk(database)) errAbort("Sorry, gene sorter not available for this database."); /* Get list of regions. */ struct genoGraph *gg = ggFirstVisible(); double threshold = getThreshold(); struct bed3 *bed, *bedList = regionsOverThreshold(gg); /* Figure out what table and column are the sorter's main gene set. */ struct hash *genomeRa = hgReadRa(genome, database, "hgNearData", "genome.ra", NULL); char *geneTable = hashMustFindVal(genomeRa, "geneTable"); char *idColumn = hashMustFindVal(genomeRa, "idColumn"); /* if marker labels were present when the file was uploaded, they are saved here */ char cgmName[256]; safef(cgmName, sizeof(cgmName), "%s.cgm", gg->binFileName); struct lineFile *m = lineFileMayOpen(cgmName, TRUE); char *cgmRow[4]; cgmRow[0] = ""; /* dummy row */ cgmRow[1] = ""; cgmRow[2] = "0"; cgmRow[3] = "0"; FILE *g = NULL; int markerCount = 0; struct tempName snpTn; if (m) { /* Create custom column output file. */ trashDirFile(&snpTn, "hgg", "marker", ".mrk"); g = mustOpen(snpTn.forCgi, "w"); fprintf(g, "column name=\"%s Markers\" shortLabel=\"%s Markers over threshold\" longLabel=\"%s Markers in regions over threshold\" " "visibility=on priority=99 " "\n" , gg->shortLabel , gg->shortLabel , gg->shortLabel ); } /*** Build up hash of all transcriptHash that are in region. */ struct hash *transcriptHash = hashNew(16); /* This loop handles one chromosome at a time. It depends on * the bedList being sorted by chromosome. */ for (bed = bedList; bed != NULL; ) { /* Make binKeeper and stuff in all regions in this chromosome into it. 
*/ char *chrom = bed->chrom; int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); while (bed != NULL && sameString(chrom, bed->chrom)) { binKeeperAdd(bk, bed->chromStart, bed->chromEnd, bed); bed = bed->next; } struct binKeeper *bkGenes = NULL; if (m) bkGenes = binKeeperNew(0, chromSize); /* Query database to find out bounds of all genes on this chromosome * and if they overlap any of the regions then put them in the hash. */ char query[512]; safef(query, sizeof(query), "select name,txStart,txEnd from %s where chrom='%s'", geneTable, chrom); struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { char *name = row[0]; int start = sqlUnsigned(row[1]); int end = sqlUnsigned(row[2]); if (binKeeperAnyOverlap(bk, start, end)) { hashStore(transcriptHash, name); if (m) binKeeperAdd(bkGenes, start, end, cloneString(name)); } } sqlFreeResult(&sr); if (m) { /* Read cgm file if it exists, looking at all markers on this chromosome * and if they overlap any of the regions and genes then output them. */ do { // marker, chrom, chromStart, val char *marker = cgmRow[0]; char *chr = cgmRow[1]; int start = sqlUnsigned(cgmRow[2]); int end = start+1; double val = sqlDouble(cgmRow[3]); int cmp = strcmp(chr,chrom); if (cmp > 0) break; if (cmp == 0) { if (val >= threshold) { struct binElement *el, *bkList = binKeeperFind(bkGenes, start, end); for (el = bkList; el; el=el->next) { /* output to custom column trash file */ fprintf(g, "%s %s\n", (char *)el->val, marker); } if (bkList) { ++markerCount; slFreeList(&bkList); } } } } while (lineFileRow(m, cgmRow)); } /* Clean up for this chromosome. */ binKeeperFree(&bk); if (m) { /* For speed, we do not free up the values (cloned the kg names earlier) */ binKeeperFree(&bkGenes); } } /* Get list of all transcripts in regions. */ struct hashEl *el, *list = hashElListHash(transcriptHash); /* Create file with all matching gene IDs. 
*/ struct tempName keyTn; trashDirFile(&keyTn, "hgg", "key", ".key"); FILE *f = mustOpen(keyTn.forCgi, "w"); for (el = list; el != NULL; el = el->next) fprintf(f, "%s\n", el->name); carefulClose(&f); /* Print out some info. */ hPrintf("Thresholding <i>%s</i> at %g. ", gg->shortLabel, threshold); hPrintf("There are %d regions covering %lld bases.<BR>\n", slCount(bedList), bedTotalSize((struct bed*)bedList) ); hPrintf("Installed a Gene Sorter filter that selects only genes in these regions.<BR>\n"); if (m) { hPrintf("There are %d markers in the regions over threshold that overlap knownGenes.<BR>\n", markerCount); hPrintf("Installed a Gene Sorter custom column called \"%s Markers\" with these markers.<BR>\n", gg->shortLabel); } /* close custom column output file */ if (m) { lineFileClose(&m); carefulClose(&g); } /* Stuff cart variable with name of file. */ char keyCartName[256]; safef(keyCartName, sizeof(keyCartName), "%s%s.keyFile", advFilterPrefix, idColumn); cartSetString(cart, keyCartName, keyTn.forCgi); cartSetString(cart, customFileVarName, snpTn.forCgi); char snpVisCartNameTemp[256]; char *snpVisCartName = NULL; safef(snpVisCartNameTemp, sizeof(snpVisCartNameTemp), "%s%s Markers.vis", colConfigPrefix, gg->shortLabel); snpVisCartName = replaceChars(snpVisCartNameTemp, " ", "_"); cartSetString(cart, snpVisCartName, "1"); freeMem(snpVisCartName); hPrintf("<FORM ACTION=\"../cgi-bin/hgNear\" METHOD=GET>\n"); cartSaveSession(cart); hPrintf("<CENTER>"); cgiMakeButton("submit", "go to gene sorter"); hPrintf("</CENTER>"); hPrintf("</FORM>"); cartWebEnd(); }
void oneChrom(char *database, char *chrom, char *refAliTrack, char *bedTrack, struct hash *otherHash, struct stats *stats) /* Process one chromosome. */ { struct bed *bedList = NULL, *bed; struct sqlConnection *conn = hAllocConn(database); struct sqlResult *sr; char **row; int rowOffset; int chromSize = hChromSize(database, chrom); struct binKeeper *bk = binKeeperNew(0, chromSize); struct psl *pslList = NULL; struct dnaSeq *chromSeq = NULL; if (endsWith(bedTrack, ".bed")) { struct lineFile *lf = lineFileOpen(bedTrack, TRUE); char *row[3]; while (lineFileRow(lf, row)) { if (sameString(chrom, row[0])) { bed = bedLoad3(row); slAddHead(&bedList, bed); } } lineFileClose(&lf); } else { sr = hChromQuery(conn, bedTrack, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { bed = bedLoad3(row+rowOffset); slAddHead(&bedList, bed); } sqlFreeResult(&sr); } slReverse(&bedList); uglyf("Loaded beds\n"); sr = hChromQuery(conn, refAliTrack, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { struct psl *psl = pslLoad(row + rowOffset); slAddHead(&pslList, psl); binKeeperAdd(bk, psl->tStart, psl->tEnd, psl); } sqlFreeResult(&sr); uglyf("Loaded psls\n"); chromSeq = hLoadChrom(database, chrom); /* Fetch entire chromosome into memory. */ uglyf("Loaded human seq\n"); for (bed = bedList; bed != NULL; bed = bed->next) { struct binElement *el, *list = binKeeperFind(bk, bed->chromStart, bed->chromEnd); for (el = list; el != NULL; el = el->next) { struct psl *fullPsl = el->val; struct psl *psl = pslTrimToTargetRange(fullPsl, bed->chromStart, bed->chromEnd); if (psl != NULL) { foldPslIntoStats(psl, chromSeq, otherHash, stats); pslFree(&psl); } } slFreeList(&list); stats->bedCount += 1; stats->bedBaseCount += bed->chromEnd - bed->chromStart; sqlFreeResult(&sr); } freeDnaSeq(&chromSeq); pslFreeList(&pslList); binKeeperFree(&bk); hFreeConn(&conn); }