void outputProt(struct protInfo *prot, struct hash *seedToScop, FILE *f, FILE *fKnownTo)
/* Collapse together all features of same type that overlap and output.
 * Writes one line per merged range to f (prot, start, end, feature name) and
 * to fKnownTo (prot, scop mapping of name, start, end, best eVal in range). */
{
/* Sort so features with the same name are adjacent. */
slSort(&prot->featureList, protFeatureCmpName);
struct protFeature *startFeature, *endFeature;
/* Walk the list one name-group at a time: [startFeature, endFeature)
 * covers all features sharing one name. */
for (startFeature = prot->featureList; startFeature != NULL; startFeature = endFeature)
    {
    /* Advance endFeature to the first feature with a different name (or NULL). */
    for (endFeature = startFeature->next; endFeature != NULL; endFeature = endFeature->next)
	if (!sameString(startFeature->name, endFeature->name))
	    break;
    /* Merge overlapping intervals of this group with a range tree. */
    struct rbTree *rangeTree = rangeTreeNew();
    struct protFeature *feature;
    for (feature = startFeature; feature != endFeature; feature = feature->next)
	rangeTreeAdd(rangeTree, feature->start, feature->end);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
	{
	/* Attribute each merged range to the best-scoring overlapping feature
	 * so its eVal is the one reported. */
	feature = highestScoringFeature(startFeature, endFeature, range->start, range->end);
	fprintf(f, "%s\t%d\t%d\t%s\n", prot->name, range->start, range->end,
	    startFeature->name);
	fprintf(fKnownTo, "%s\t%s\t%d\t%d\t%g\n", prot->name,
	    (char *)hashMustFindVal(seedToScop, startFeature->name),
	    range->start, range->end, feature->eVal);
	}
    rangeTreeFree(&rangeTree);
    }
}
static void addBwCorrelations(struct bbiChromInfo *chrom, struct genomeRangeTree *targetGrt,
	struct bigWigValsOnChrom *aVals, struct bigWigValsOnChrom *bVals,
	struct bbiFile *aBbi, struct bbiFile *bBbi,
	double aThreshold, double bThreshold,
	struct correlate *c, struct correlate *cInEnriched, struct correlate *cClipped)
/* Find bits of a and b that overlap and also overlap with targetRanges. Do correlations there.
 * Accumulates into three correlators: c (raw values genome-wide on this chrom),
 * cClipped (values capped at the given thresholds), and cInEnriched (raw values
 * restricted to this chromosome's target ranges). */
{
struct rbTree *targetRanges = genomeRangeTreeFindRangeTree(targetGrt, chrom->name);
/* Only correlate if both bigWigs have data on this chromosome. */
if (bigWigValsOnChromFetchData(aVals, chrom->name, aBbi)
 && bigWigValsOnChromFetchData(bVals, chrom->name, bBbi) )
    {
    double *a = aVals->valBuf, *b = bVals->valBuf;
    int i, end = chrom->size;
    /* Whole-chromosome pass: raw and threshold-clipped correlations. */
    for (i=0; i<end; ++i)
	{
	double aVal = a[i], bVal = b[i];
	correlateNext(c, aVal, bVal);
	if (aVal > aThreshold) aVal = aThreshold;
	if (bVal > bThreshold) bVal = bThreshold;
	correlateNext(cClipped, aVal, bVal);
	}
    /* Second pass restricted to enriched (target) regions, if any on this chrom. */
    if (targetRanges != NULL)
	{
	struct range *range, *rangeList = rangeTreeList(targetRanges);
	for (range = rangeList; range != NULL; range = range->next)
	    {
	    int start = range->start, end = range->end;
	    for (i=start; i<end; ++i)
		correlateNext(cInEnriched, a[i], b[i]);
	    }
	}
    }
}
void freen(char *chrom) /* Test something */ { uglyTime(NULL); struct sqlConnection *conn = sqlConnect("hg19"); uglyTime("connect"); char query[512]; sqlSafef(query, sizeof(query), "select * from knownGene where chrom='%s'", chrom); struct sqlResult *sr = sqlGetResult(conn, query); uglyTime("get result"); char **row; struct rbTree *rt = rangeTreeNew(); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row); int i; int exonCount = gp->exonCount; for (i=0; i<exonCount; ++i) rangeTreeAdd(rt, gp->exonStarts[i], gp->exonEnds[i]); } uglyTime("Add rows"); struct range *list = rangeTreeList(rt); uglyTime("Did list"); uglyf("%d items in chrom %s\n", slCount(list), chrom); }
void setArrayCountsFromRangeTree(struct rbTree *rangeTree, int *array, int seqSize)
/* Given a range tree that covers stuff from 0 to seqSize,
 * fill in array with amount of bases covered by range tree
 * before a given position in array.
 * That is, array[i] = number of covered bases in [0, i). */
{
struct range *range, *rangeList = rangeTreeList(rangeTree);
if (rangeList == NULL)
    return;   /* array left untouched when tree is empty */
range = rangeList;
int i, count = 0;
for (i=0; i<seqSize; ++i)
    {
    array[i] = count;   /* bases covered strictly before position i */
    if (i >= range->end)
	{
	/* Walked past current range; move to the next one, or if there
	 * is none, flat-fill the remaining tail with the final count. */
	range = range->next;
	if (range == NULL)
	    {
	    for (;i<seqSize; ++i)
		array[i] = count;
	    return;
	    }
	}
    /* Position i itself is covered; it counts toward later entries only. */
    if (range->start <= i)
	++count;
    }
}
static struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi,
	char *chrom, bits32 start, bits32 end, struct lm *lm)
/* Return intervals where the val is the depth of coverage. */
{
/* Fetch overlapping bigBed items; nothing to do if there are none. */
struct bigBedInterval *bi, *biList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
if (biList == NULL)
    return NULL;

/* Accumulate coverage depth in a range tree. */
struct rbTree *covTree = rangeTreeNew();
for (bi = biList; bi != NULL; bi = bi->next)
    rangeTreeAddToCoverageDepth(covTree, bi->start, bi->end);

/* Walk the depth ranges, clipping each interval to the query window. */
struct bbiInterval *bwi, *outList = NULL;
struct range *r;
for (r = rangeTreeList(covTree); r != NULL; r = r->next)
    {
    lmAllocVar(lm, bwi);
    bwi->start = r->start;
    if (bwi->start < start)
	bwi->start = start;
    bwi->end = r->end;
    if (bwi->end > end)
	bwi->end = end;
    bwi->val = ptToInt(r->val);
    slAddHead(&outList, bwi);
    }
slReverse(&outList);

/* Clean up and go home. */
rangeTreeFree(&covTree);
return outList;
}
void sortAndMergeBlocks(struct bed *oneBed) /* construct a range tree out of blocks, then remake blocks arrays */ /* this should automatically merge everything. */ { struct rbTree *rt = rangeTreeNew(); struct range *rangeList = NULL; struct range *oneRange; int i; for (i = 0; i < oneBed->blockCount; i++) { rangeTreeAdd(rt, oneBed->chromStarts[i], oneBed->chromStarts[i] + oneBed->blockSizes[i]); /* delete old (bad) blocks in bed */ oneBed->chromStarts[i] = oneBed->blockSizes[i] = 0; } rangeList = rangeTreeList(rt); oneBed->blockCount = slCount(rangeList); /* remake bed blocks out of range tree */ i = 0; for (oneRange = rangeList; oneRange != NULL; oneRange = oneRange->next) { oneBed->chromStarts[i] = oneRange->start; oneBed->blockSizes[i] = oneRange->end - oneRange->start; i++; } rangeTreeFree(&rt); }
void edwBamToWig(char *input, char *output)
/* edwBamToWig - Convert a bam file to a wig file by measuring depth of coverage,
 * optionally adjusting hit size to average for library.. */
{
FILE *f = mustOpen(output, "w");

/* Open file and get header for it. */
samfile_t *sf = samopen(input, "rb", NULL);
if (sf == NULL)
    errnoAbort("Couldn't open %s.\n", input);
bam_header_t *head = sf->header;
if (head == NULL)
    errAbort("Aborting ... Bad BAM header in file: %s", input);

/* Scan through input populating genome range trees */
struct genomeRangeTree *grt = genomeRangeTreeNew();
bam1_t one = {};
for (;;)
    {
    /* Read next record. */
    if (bam_read1(sf->x.bam, &one) < 0)
	break;
    /* Skip unmapped (tid < 0) and cigar-less records. */
    if (one.core.tid >= 0 && one.core.n_cigar > 0)
	{
	char *chrom = head->target_name[one.core.tid];
	int start = one.core.pos;
	int end = start + one.core.l_qseq;
	/* Pad on the upstream side of the read's strand by clPad.
	 * NOTE(review): start can go negative for reverse reads near
	 * position 0 -- confirm rangeTreeAddToCoverageDepth tolerates that. */
	if (one.core.flag & BAM_FREVERSE)
	    {
	    start -= clPad;
	    }
	else
	    {
	    end += clPad;
	    }
	struct rbTree *rt = genomeRangeTreeFindOrAddRangeTree(grt,chrom);
	rangeTreeAddToCoverageDepth(rt, start, end);
	}
    }

/* Convert genome range tree into output wig */
/* Get list of chromosomes. */
struct hashEl *hel, *helList = hashElListHash(grt->hash);
for (hel = helList; hel != NULL; hel = hel->next)
    {
    char *chrom = hel->name;
    struct rbTree *rt = hel->val;
    struct range *range, *rangeList = rangeTreeList(rt);
    for (range = rangeList; range != NULL; range = range->next)
	{
	/* One bedGraph-style line per constant-depth range. */
	fprintf(f, "%s\t%d\t%d\t%d\n", chrom, range->start, range->end,
	    ptToInt(range->val));
	}
    }
/* NOTE(review): sf is never samclose()'d and one's internal buffer is not
 * freed -- probably fine for a run-once command, but worth confirming. */
carefulClose(&f);
}
long rangeTreeTotalSize(struct rbTree *tree)
/* Return total size of all ranges in tree. */
{
long sum = 0;
struct range *r;
for (r = rangeTreeList(tree); r != NULL; r = r->next)
    sum += r->end - r->start;
return sum;
}
long rangeTreeRangeTreeOverlap(struct rbTree *a, struct rbTree *b)
/* Return total overlap between two range trees. */
{
long overlap = 0;
struct range *r = rangeTreeList(a);
while (r != NULL)
    {
    overlap += rangeTreeOverlapSize(b, r->start, r->end);
    r = r->next;
    }
return overlap;
}
/* free slRef objects in the compRangeMap */ static void destructCompRangeMap(struct malnSet *malnSet) { struct hashCookie cookie = hashFirst(malnSet->compRangeMap->hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) { struct rbTree *rangeTree = hel->val; for (struct range *rng = rangeTreeList(rangeTree); rng != NULL; rng = rng->next) { slFreeList(&rng->val); } } genomeRangeTreeFree(&malnSet->compRangeMap); }
void doEnrichmentsFromBed3Sample(struct bed3 *sampleList,
    struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf,
    struct cdwAssembly *assembly, struct target *targetList)
/* Given a bed3 list, calculate enrichments for targets.
 * For each target computes uniqOverlapBases (each genome base counted once,
 * via range trees on both sides) and overlapBases (bases counted once per
 * overlapping sample item), then saves a cdwQaEnrich row per target. */
{
struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList);
struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);

/* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
	continue;
    struct genomeRangeTree *grt = target->grt;
    long long uniqOverlapBases = 0;
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
	{
	struct rbTree *sampleTree = chrom->val;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct range *range, *rangeList = rangeTreeList(sampleTree);
	    for (range = rangeList; range != NULL; range = range->next)
		{
		/* Do unique base overlap counts (since using range trees both sides) */
		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
		uniqOverlapBases += overlap;
		}
	    }
	}

    /* Figure out how much we overlap allowing same bases in genome
     * to part of more than one overlap. */
    long long overlapBases = 0;
    struct bed3 *sample;
    for (sample = sampleList; sample != NULL; sample = sample->next)
	{
	int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom,
	    sample->chromStart, sample->chromEnd);
	overlapBases += overlap;
	}

    /* Save to database. */
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target,
	overlapBases, uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

genomeRangeTreeFree(&sampleGrt);
hashElFreeList(&chromList);
}
void encodeMergeReplicates(int inCount, char *inNames[], char *outName)
/* encodeMergeReplicates - Merge together replicates for a pooled output.
 * Only works on broadPeak and narrowPeak files currently.
 * Relies on command-line globals clAgree, clAdd, clGotThreshold, clThreshold,
 * clUniqueName for filtering and output options. */
{
/* Make list of sources out of input files.  Column indexes assume
 * broadPeak/narrowPeak layout (chrom/start/end in columns 0-2). */
struct peakSource *source, *sourceList = NULL;
int i;
for (i=0; i<inCount; ++i)
    {
    AllocVar(source);
    source->dataSource = inNames[i];
    source->chromColIx = 0;
    source->startColIx = 1;
    source->endColIx = 2;
    source->scoreColIx = SCORE_COL_IX;
    source->normFactor = 1.0;
    source->minColCount = SCORE_COL_IX+1;
    slAddTail(&sourceList, source);
    }

/* Load in from all sources. */
struct peakClusterMaker *maker = peakClusterMakerNew();
for (source = sourceList; source != NULL; source = source->next)
    peakClusterMakerAddFromSource(maker, source);

/* Cluster each chromosome. */
FILE *f = mustOpen(outName, "w");
struct hashEl *chrom, *chromList = peakClusterMakerChromList(maker);
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    struct rbTree *tree = chrom->val;
    struct range *range, *rangeList = rangeTreeList(tree);
    struct lm *lm = lmInit(0);   /* per-chromosome scratch memory */
    for (range = rangeList; range != NULL; range = range->next)
	{
	/* range->val carries the peak items overlapping this range. */
	struct peakCluster *cluster, *clusterList = peakClusterItems(lm, range->val,
	    BIGNUM, 0.0);
	for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
	    {
	    /* Only emit clusters supported by at least clAgree sources
	     * (or everything when agreement isn't required). */
	    if (clAgree < 2 || peakClusterSourceCount(cluster) >= clAgree)
		outputClusterNarrowPeak(cluster, f, clAdd, clGotThreshold,
		    clThreshold, clUniqueName);
	    }
	}
    lmCleanup(&lm);
    }
carefulClose(&f);
}
void biggestGapFromRangeTree(struct rbTree *rt, int chromSize, int *retStart,
	int *retEnd, int *retSize)
/* Given a range tree filled with data for given chromosome, figure out
 * location and size of biggest gap in data. */
{
struct range *r, *rangeList = rangeTreeList(rt);
if (rangeList == NULL)
    {
    /* No data at all: the whole chromosome is one big gap. */
    *retStart = 0;
    *retEnd = *retSize = chromSize;
    return;
    }
int bestSize = 0, bestStart = 0, bestEnd = 0;
int prevEnd = 0;
/* Consider the gap before each range in turn. */
for (r = rangeList; r != NULL; r = r->next)
    {
    int gap = r->start - prevEnd;
    if (gap > bestSize)
	{
	bestSize = gap;
	bestStart = prevEnd;
	bestEnd = r->start;
	}
    prevEnd = r->end;
    }
/* And finally the gap after the last range. */
int tailGap = chromSize - prevEnd;
if (tailGap > bestSize)
    {
    bestSize = tailGap;
    bestStart = prevEnd;
    bestEnd = chromSize;
    }
*retStart = bestStart;
*retEnd = bestEnd;
*retSize = bestSize;
}
struct bed *bedFromRangeTree(struct rbTree *rangeTree, char *chrom, char *name, char *strand) /* Create a bed based on range tree */ { struct range *range, *rangeList = rangeTreeList(rangeTree); if (rangeList == NULL) return NULL; /* Figure out overall start and end, and blockCount. Assumes * (rightly) that rangeList is sorted. */ int chromStart = rangeList->start, chromEnd = rangeList->end; int blockCount = 1; for (range = rangeList->next; range != NULL; range = range->next) { chromEnd = range->end; blockCount += 1; } struct bed *bed; AllocVar(bed); bed->chrom = cloneString(chrom); bed->chromStart = chromStart; bed->chromEnd = chromEnd; bed->name = cloneString(name); bed->strand[0] = strand[0]; bed->blockCount = blockCount; AllocArray(bed->blockSizes, blockCount); AllocArray(bed->chromStarts, blockCount); int i = 0; for (range = rangeList; range != NULL; range = range->next) { bed->blockSizes[i] = range->end - range->start; bed->chromStarts[i] = range->start - chromStart; ++i; } return bed; }
struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, double forceJoinScore, double weakLevel) /* Convert a list of items to a list of clusters of items. This may break up clusters that * have weakly linked parts. [ ] AAAAAAAAAAAAAAAAAA BBBBBB DDDDDD CCCC EEEE gets tranformed into [ ] [ ] AAAAAAAAAAAAAAAAAA BBBBBB DDDDDD CCCC EEEE The strategy is to build a rangeTree of coverage, which might look something like so: 123333211123333211 then define cluster ends that exceed the minimum limit, which is either weakLevel (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is more than forceJoinScore. This will go to something like so: [---] [----] Finally the items that are overlapping a cluster are assigned to it. Note that this may mean that an item may be in multiple clusters. [ABC] [ ADE] */ { int easyMax = round(1.0/weakLevel); int itemCount = slCount(itemList); struct peakCluster *clusterList = NULL; if (itemCount < easyMax) { struct peakItem *item = itemList; int chromStart = item->chromStart; int chromEnd = item->chromEnd; for (item = item->next; item != NULL; item = item->next) { if (item->chromStart < chromStart) chromStart = item->chromStart; if (item->chromEnd > chromEnd) chromEnd = item->chromEnd; } addCluster(lm, itemList, chromStart, chromEnd, &clusterList); } else { /* Make up coverage tree. */ struct rbTree *covTree = rangeTreeNew(); struct peakItem *item; for (item = itemList; item != NULL; item = item->next) rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd); struct range *range, *rangeList = rangeTreeList(covTree); /* Figure out maximum coverage. */ int maxCov = 0; for (range = rangeList; range != NULL; range = range->next) { int cov = ptToInt(range->val); if (cov > maxCov) maxCov = cov; } /* Figure coverage threshold. 
*/ int threshold = round(maxCov * weakLevel); if (threshold > forceJoinScore-1) threshold = forceJoinScore-1; /* Loop through emitting sections over threshold as clusters */ boolean inRange = FALSE; boolean start = 0, end = 0; for (range = rangeList; range != NULL; range = range->next) { int cov = ptToInt(range->val); if (cov > threshold) { if (inRange) end = range->end; else { inRange = TRUE; start = range->start; end = range->end; } } else { if (inRange) { addCluster(lm, itemList, start, end, &clusterList); inRange = FALSE; } } } if (inRange) addCluster(lm, itemList, start, end, &clusterList); } slReverse(&clusterList); return clusterList; }
static struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
	int fieldCount, struct lineFile *lf, bits32 initialReduction,
	bits32 initialReductionCount, int zoomIncrement, int blockSize, int itemsPerSlot,
	boolean doCompress, struct lm *lm, FILE *f,
	bits64 *retDataStart, bits64 *retIndexStart, struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction. Also calculate and keep in memory
 * next reduction level. This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory.
 * NOTE(review): fieldCount is unused in this body -- presumably kept for interface
 * symmetry with related writers; confirm before removing. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;
*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

/* This gets a little complicated I'm afraid. The strategy is to:
 * 1) Build up a range tree that represents coverage depth on that chromosome
 *    This also has the nice side effect of getting rid of overlaps.
 * 2) Stream through the range tree, outputting the initial summary level and
 *    further reducing. */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
	{
	/* Grab values we want from range. */
	double val = ptToInt(range->val);
	int start = range->start;
	int end = range->end;
	bits32 size = end - start;

	/* Add to total summary. */
	if (firstTime)
	    {
	    /* First range seeds min/max rather than comparing against zeroes. */
	    totalSum->validCount = size;
	    totalSum->minVal = totalSum->maxVal = val;
	    totalSum->sumData = val*size;
	    totalSum->sumSquares = val*val*size;
	    firstTime = FALSE;
	    }
	else
	    {
	    totalSum->validCount += size;
	    if (val < totalSum->minVal) totalSum->minVal = val;
	    if (val > totalSum->maxVal) totalSum->maxVal = val;
	    totalSum->sumData += val*size;
	    totalSum->sumSquares += val*val*size;
	    }

	/* If start past existing block then output it. */
	if (sum != NULL && sum->end <= start && sum->end < usage->size)
	    {
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	    sum = NULL;
	    }

	/* If don't have a summary we're working on now, make one. */
	if (sum == NULL)
	    {
	    oneSummary.chromId = usage->id;
	    oneSummary.start = start;
	    oneSummary.end = start + initialReduction;
	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	    oneSummary.minVal = oneSummary.maxVal = val;
	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
	    oneSummary.validCount = 0;
	    sum = &oneSummary;
	    }

	/* Deal with case where might have to split an item between multiple summaries. This
	 * loop handles all but the final affected summary in that case. */
	while (end > sum->end)
	    {
	    /* Fold in bits that overlap with existing summary and output. */
	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
	    assert(overlap > 0);
	    verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
	    sum->validCount += overlap;
	    if (sum->minVal > val) sum->minVal = val;
	    if (sum->maxVal < val) sum->maxVal = val;
	    sum->sumData += val * overlap;
	    sum->sumSquares += val*val * overlap;
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	    size -= overlap;

	    /* Move summary to next part. */
	    sum->start = start = sum->end;
	    sum->end = start + initialReduction;
	    if (sum->end > usage->size) sum->end = usage->size;
	    sum->minVal = sum->maxVal = val;
	    sum->sumData = sum->sumSquares = 0.0;
	    sum->validCount = 0;
	    }

	/* Add (remaining part of) range to current summary. */
	sum->validCount += size;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * size;
	sum->sumSquares += val*val * size;
	}
    /* Flush any summary still open at the end of the chromosome. */
    if (sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
	    &boundsPt, boundsEnd, lm, stream);
	}
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
void doEnrichmentsFromSampleBed(struct sqlConnection *conn,
    struct edwFile *ef, struct edwValidFile *vf,
    struct edwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from sample bed file.
 * For each target computes uniqOverlapBases (each genome base counted once)
 * and overlapBases (bases counted once per overlapping sample item), then
 * saves an edwQaEnrich row per target.  Warns and returns early when the
 * sample bed is missing or empty. */
{
char *sampleBed = vf->sampleBed;
if (isEmpty(sampleBed))
    {
    warn("No sample bed for %s", ef->edwFileName);
    return;
    }

/* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/
struct bed3 *sample, *sampleList = bed3LoadAll(sampleBed);
if (sampleList == NULL)
    {
    warn("Sample bed is empty for %s", ef->edwFileName);
    return;
    }
struct genomeRangeTree *sampleGrt = edwMakeGrtFromBed3List(sampleList);
struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);

/* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
	continue;
    struct genomeRangeTree *grt = target->grt;
    long long uniqOverlapBases = 0;
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
	{
	struct rbTree *sampleTree = chrom->val;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct range *range, *rangeList = rangeTreeList(sampleTree);
	    for (range = rangeList; range != NULL; range = range->next)
		{
		/* Do unique base overlap counts (since using range trees both sides) */
		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
		uniqOverlapBases += overlap;
		}
	    }
	}

    /* Figure out how much we overlap allowing same bases in genome
     * to part of more than one overlap. */
    long long overlapBases = 0;
    for (sample = sampleList; sample != NULL; sample = sample->next)
	{
	int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom,
	    sample->chromStart, sample->chromEnd);
	overlapBases += overlap;
	}

    /* Save to database. */
    struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target,
	overlapBases, uniqOverlapBases);
    edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128);
    edwQaEnrichFree(&enrich);
    }

genomeRangeTreeFree(&sampleGrt);
bed3FreeList(&sampleList);
hashElFreeList(&chromList);
}
void doEnrichmentsFromBigBed(struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf,
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file.
 * Accumulates target->overlapBases (bases counted once per interval) and
 * target->uniqOverlapBases (merged intervals, each base counted once),
 * then saves a cdwQaEnrich row per target. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigBedPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigBedFileOpen(bigBedPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigBed data in memory. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */
    struct lm *lm = lmInit(0);
    struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);
    struct bigBedInterval *iv;
    struct rbTree *bbTree = rangeTreeNew();
    for (iv = ivList; iv != NULL; iv = iv->next)
	rangeTreeAdd(bbTree, iv->start, iv->end);
    struct range *bbRange, *bbRangeList = rangeTreeList(bbTree);

    /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
	{
	if (target->skip)
	    continue;
	struct genomeRangeTree *grt = target->grt;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    /* Raw overlap: each interval contributes independently. */
	    struct bigBedInterval *iv;
	    for (iv = ivList; iv != NULL; iv = iv->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
		target->overlapBases += overlap;
		}
	    /* Unique overlap: merged ranges, so each base counts once. */
	    for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end);
		target->uniqOverlapBases += overlap;
		}
	    }
	}
    rangeTreeFree(&bbTree);
    lmCleanup(&lm);
    }

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
	continue;
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target,
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }
bbiChromInfoFreeList(&chromList);
bigBedFileClose(&bbi);
freez(&bigBedPath);
}
void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew(); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { long startBigQueryTime = clock1000(); boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; if (gotData) { double *valBuf = valsOnChrom->valBuf; Bits *covBuf = valsOnChrom->covBuf; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(targetTree); for (range = rangeList; range != NULL; range = range->next) { int s = range->start, e = range->end, i; for (i=s; i<=e; ++i) { if (bitReadOne(covBuf, i)) { double x = valBuf[i]; target->uniqOverlapBases += 1; target->overlapBases += x; } } 
} } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; } } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bigWigValsOnChromFree(&valsOnChrom); bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
void txGeneCanonical(char *codingCluster, char *infoFile,
	char *noncodingGraph, char *genesBed, char *nearCoding,
	char *outCanonical, char *outIsoforms, char *outClusters)
/* txGeneCanonical - Pick a canonical version of each gene - that is the form
 * to use when just interested in a single splicing varient. Produces final
 * transcript clusters as well. */
{
/* Read in input into lists in memory. */
struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster);
struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph);
struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12);
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct bed *nearList = bedLoadNAll(nearCoding, 12);

/* Make hash of all beds. */
struct hash *bedHash = hashNew(18);
for (bed = bedList; bed != NULL; bed = bed->next)
    hashAdd(bedHash, bed->name, bed);

/* Make hash of all info. */
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Make a binKeeper structure that we'll populate with coding genes. */
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *keeperHash = minChromSizeKeeperHash(sizeHash);

/* Make list of coding genes and toss them into binKeeper.
 * This will eat up bed list, but bedHash is ok. */
struct gene *gene, *geneList = NULL;
for (coding = codingList; coding != NULL; coding = coding->next)
    {
    gene = geneFromCluster(coding, bedHash, infoHash);
    slAddHead(&geneList, gene);
    struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom);
    binKeeperAdd(bk, gene->start, gene->end, gene);
    }

/* Go through near-coding genes and add them to the coding gene
 * they most overlap. */
for (bed = nearList; bed != NULL; bed = nextBed)
    {
    nextBed = bed->next;   /* geneAddBed may relink bed, so save next first */
    gene = mostOverlappingGene(keeperHash, bed);
    if (gene == NULL)
	errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name);
    geneAddBed(gene, bed);
    }

/* Add non-coding genes. */
for (graph = graphList; graph != NULL; graph = graph->next)
    {
    gene = geneFromGraph(graph, bedHash);
    slAddHead(&geneList, gene);
    }

/* Sort so it all looks nicer. */
slSort(&geneList, geneCmp);

/* Open up output files. */
FILE *fCan = mustOpen(outCanonical, "w");
FILE *fIso = mustOpen(outIsoforms, "w");
FILE *fClus = mustOpen(outClusters, "w");

/* Loop through, making up gene name, and writing output. */
int geneId = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Make up name. */
    char name[16];
    safef(name, sizeof(name), "g%05d", ++geneId);

    /* Reverse transcript list just to make it look better. */
    slReverse(&gene->txList);

    /* Write out canonical file output */
    bed = hashMustFindVal(bedHash, gene->niceTx->name);
    fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n", bed->chrom, bed->chromStart,
	bed->chromEnd, geneId, gene->niceTx->name, gene->niceTx->name);

    /* Write out isoforms output. */
    for (bed = gene->txList; bed != NULL; bed = bed->next)
	fprintf(fIso, "%d\t%s\n", geneId, bed->name);

    /* Write out cluster output, starting with bed 6 standard fields. */
    fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t",
	gene->chrom, gene->start, gene->end, name, 0, gene->strand);

    /* Write out thick-start/thick end. */
    if (gene->isCoding)
	{
	/* Tightest thick region covering all transcripts that have one. */
	int thickStart = gene->end, thickEnd = gene->start;
	for (bed = gene->txList; bed != NULL; bed = bed->next)
	    {
	    if (bed->thickStart < bed->thickEnd)
		{
		thickStart = min(thickStart, bed->thickStart);
		thickEnd = max(thickEnd, bed->thickEnd);
		}
	    }
	fprintf(fClus, "%d\t%d\t", thickStart, thickEnd);
	}
    else
	{
	/* NOTE(review): writes gene->start for both fields (zero-length thick
	 * region); presumably intentional for non-coding genes -- confirm. */
	fprintf(fClus, "%d\t%d\t", gene->start, gene->start);
	}

    /* We got no rgb value, just write out zero. */
    fprintf(fClus, "0\t");

    /* Get exons from exonTree. */
    struct range *exon, *exonList = rangeTreeList(gene->exonTree);
    fprintf(fClus, "%d\t", slCount(exonList));
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->start - gene->start);
    fprintf(fClus, "\t");
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->end - exon->start);
    fprintf(fClus, "\t");

    /* Write out associated transcripts. */
    fprintf(fClus, "%d\t", slCount(gene->txList));
    for (bed = gene->txList; bed != NULL; bed = bed->next)
	fprintf(fClus, "%s,", bed->name);
    fprintf(fClus, "\t");

    /* Write out nice value */
    fprintf(fClus, "%s\t", gene->niceTx->name);

    /* Write out coding/noncoding value. */
    fprintf(fClus, "%d\n", gene->isCoding);
    }

/* Close up files. */
carefulClose(&fCan);
carefulClose(&fIso);
carefulClose(&fClus);
}
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree) /* Merge together overlapping edges with soft ends. */ { struct mergedEdge /* Hold together info on a merged edge. */ { struct evidence *evidence; }; /* Traverse graph and build up range tree. Each node in the range tree * will represent the bounds of coordinates of overlapping double softs */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) rangeTreeAdd(rangeTree, start->position, end->position); } /* Traverse graph again merging edges */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start= edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { struct range *r = rangeTreeFindEnclosing(rangeTree, start->position, end->position); assert(r != NULL); /* At this point, r represents the bounds of a double-soft * region that encompasses this edge. Collect the set of * evidence of edges overlapping this range */ struct mergedEdge *mergeEdge = r->val; if (mergeEdge == NULL) { lmAllocVar(rangeTree->lm, mergeEdge); r->val = mergeEdge; } mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence); verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", start->position, end->position, r->start, r->end); edge->evList = NULL; rbTreeRemove(edgeTree, edge); } } /* Traverse merged edge list, making a single edge from each range. At this point, * each range will have some evidence attached to it, from each of the double softs * that fall within the range. 
From all of this evidence, make a single consensus edge */ struct range *r; struct lm *lm = lmInit(0); for (r = rangeTreeList(rangeTree); r != NULL; r = r->next) { struct mergedEdge *mergedEdge = r->val; struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm); if (edge != NULL) rbTreeAdd(edgeTree, edge); verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", edge->start->position, edge->end->position, r->start, r->end); } /* Clean up and go home. */ lmCleanup(&lm); removeUnusedVertices(vertexTree, edgeTree); slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }