void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f) /* Output one Ra record to file. */ { fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end); fprintf(f, "txName %s\n", seq->name); fprintf(f, "txSize %d\n", seq->size); fprintf(f, "cdsStart %d\n", start); fprintf(f, "cdsEnd %d\n", end); fprintf(f, "cdsSize %d\n", end-start); fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start)); fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3)); boolean gotKozak1 = FALSE; if (start >= 3) { char c = seq->dna[start-3]; gotKozak1 = (c == 'a' || c == 'g'); } fprintf(f, "gotKozak1 %d\n", gotKozak1); boolean gotKozak2 = FALSE; if (start+3 < seq->size) gotKozak2 = (seq->dna[start+3] == 'g'); fprintf(f, "gotKozak2 %d\n", gotKozak2); fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2); /* Count up upstream ATG and Kozak */ struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew(); int upAtg = 0, upKozak = 0; int i; for (i=0; i<start; ++i) { if (startsWith("atg", seq->dna + i)) { int orfEnd = findOrfEnd(seq, i); if (orfEnd < start) rangeTreeAdd(upAtgRanges, i, orfEnd); ++upAtg; if (isKozak(seq->dna, seq->size, i)) { ++upKozak; if (orfEnd < start) rangeTreeAdd(upKozakRanges, i, orfEnd); } } } fprintf(f, "upstreamAtgCount %d\n", upAtg); fprintf(f, "upstreamKozakCount %d\n", upKozak); fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start)); fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start)); fprintf(f, "\n"); /* Cluen up and go home. */ rangeTreeFree(&upAtgRanges); rangeTreeFree(&upKozakRanges); }
void sortAndMergeBlocks(struct bed *oneBed) /* construct a range tree out of blocks, then remake blocks arrays */ /* this should automatically merge everything. */ { struct rbTree *rt = rangeTreeNew(); struct range *rangeList = NULL; struct range *oneRange; int i; for (i = 0; i < oneBed->blockCount; i++) { rangeTreeAdd(rt, oneBed->chromStarts[i], oneBed->chromStarts[i] + oneBed->blockSizes[i]); /* delete old (bad) blocks in bed */ oneBed->chromStarts[i] = oneBed->blockSizes[i] = 0; } rangeList = rangeTreeList(rt); oneBed->blockCount = slCount(rangeList); /* remake bed blocks out of range tree */ i = 0; for (oneRange = rangeList; oneRange != NULL; oneRange = oneRange->next) { oneBed->chromStarts[i] = oneRange->start; oneBed->blockSizes[i] = oneRange->end - oneRange->start; i++; } rangeTreeFree(&rt); }
int pslBedOverlap(struct psl *psl, struct bed *bed)
/* Return the number of bases shared by the blocks of psl and the blocks of
 * bed.  Returns 0 when the first strand characters differ or when the psl
 * target name is not the bed chromosome. */
{
/* Wrong strand or wrong chromosome means no overlap. */
if (psl->strand[0] != bed->strand[0] || !sameString(psl->tName, bed->chrom))
    return 0;

/* Cover the bed blocks with a range tree.  Bed block starts are relative
 * to chromStart, so chromStart is added back in here. */
struct rbTree *bedTree = rangeTreeNew();
int blockIx;
for (blockIx = 0; blockIx < bed->blockCount; ++blockIx)
    {
    int blockStart = bed->chromStart + bed->chromStarts[blockIx];
    rangeTreeAdd(bedTree, blockStart, blockStart + bed->blockSizes[blockIx]);
    }

/* Sum the overlap of each psl target block with the tree. */
int overlapSum = 0;
for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
    {
    int tStart = psl->tStarts[blockIx];
    int tEnd = tStart + psl->blockSizes[blockIx];
    overlapSum += rangeTreeOverlapSize(bedTree, tStart, tEnd);
    }

rangeTreeFree(&bedTree);
return overlapSum;
}
static struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi,
	char *chrom, bits32 start, bits32 end, struct lm *lm)
/* Return a list of intervals over chrom:start-end whose val is the depth of
 * coverage.  The intervals are allocated from lm and clipped to start..end.
 * Returns NULL when the query finds no items. */
{
/* Fetch the overlapping items; nothing to do when there are none. */
struct bigBedInterval *itemList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
if (itemList == NULL)
    return NULL;

/* Accumulate coverage depth in a range tree. */
struct rbTree *depthTree = rangeTreeNew();
struct bigBedInterval *item = itemList;
while (item != NULL)
    {
    rangeTreeAddToCoverageDepth(depthTree, item->start, item->end);
    item = item->next;
    }

/* Turn each tree range into a clipped bbiInterval. */
struct bbiInterval *resultList = NULL;
struct range *depthRange;
for (depthRange = rangeTreeList(depthTree); depthRange != NULL; depthRange = depthRange->next)
    {
    struct bbiInterval *interval;
    lmAllocVar(lm, interval);
    interval->start = depthRange->start;
    interval->end = depthRange->end;
    if (interval->start < start)
	interval->start = start;
    if (interval->end > end)
	interval->end = end;
    interval->val = ptToInt(depthRange->val);
    slAddHead(&resultList, interval);
    }
slReverse(&resultList);

rangeTreeFree(&depthTree);
return resultList;
}
void outputProt(struct protInfo *prot, struct hash *seedToScop, FILE *f, FILE *fKnownTo) /* Callapse together all features of same type that overlap and output */ { slSort(&prot->featureList, protFeatureCmpName); struct protFeature *startFeature, *endFeature; for (startFeature = prot->featureList; startFeature != NULL; startFeature = endFeature) { for (endFeature = startFeature->next; endFeature != NULL; endFeature = endFeature->next) if (!sameString(startFeature->name, endFeature->name)) break; struct rbTree *rangeTree = rangeTreeNew(); struct protFeature *feature; for (feature = startFeature; feature != endFeature; feature = feature->next) rangeTreeAdd(rangeTree, feature->start, feature->end); struct range *range, *rangeList = rangeTreeList(rangeTree); for (range = rangeList; range != NULL; range = range->next) { feature = highestScoringFeature(startFeature, endFeature, range->start, range->end); fprintf(f, "%s\t%d\t%d\t%s\n", prot->name, range->start, range->end, startFeature->name); fprintf(fKnownTo, "%s\t%s\t%d\t%d\t%g\n", prot->name, (char *)hashMustFindVal(seedToScop, startFeature->name), range->start, range->end, feature->eVal); } rangeTreeFree(&rangeTree); } }
void constExons(struct txGraph *graph, FILE *f) /* Write out constituitive exons. */ { /* Create a tree with all introns. */ struct rbTree *tree = rangeTreeNew(); struct txEdge *edge; for (edge = graph->edgeList; edge != NULL; edge = edge->next) { if (edge->type == ggIntron) { rangeTreeAdd(tree, graph->vertices[edge->startIx].position, graph->vertices[edge->endIx].position); } } /* Scan through all exons looking for ones that don't intersect * introns. */ int eId = 0; for (edge = graph->edgeList; edge != NULL; edge = edge->next) { if (edge->type == ggExon) { struct txVertex *s = &graph->vertices[edge->startIx]; struct txVertex *e = &graph->vertices[edge->endIx]; if (s->type == ggHardStart && e->type == ggHardEnd) { int start = s->position; int end = e->position; if (!rangeTreeOverlaps(tree, start, end)) { char *refSource = refSourceAcc(graph, edge); if (refSource != NULL && edge->evCount >= 10) { /* Do one more scan making sure that it doesn't * intersect any exons except for us. */ boolean anyOtherExon = FALSE; struct txEdge *ed; for (ed = graph->edgeList; ed != NULL; ed = ed->next) { if (ed != edge) { int edStart = graph->vertices[ed->startIx].position; int edEnd = graph->vertices[ed->endIx].position; if (rangeIntersection(edStart, edEnd, start, end) > 0) { anyOtherExon = TRUE; break; } } } if (!anyOtherExon) fprintf(f, "%s\t%d\t%d\t%s.%d\t0\t%s\n", graph->tName, start, end, refSource, ++eId, graph->strand); } } } } } rangeTreeFree(&tree); }
void printBiggestGap(char *database, struct sqlConnection *conn, struct slName *chromList, struct hash *chromHash, char *track) /* Look up track in database, figure out which type it is, call * appropriate biggest gap finder, and then print result. */ { struct trackDb *tdb = hTrackInfo(conn, track); struct hTableInfo *hti = hFindTableInfo(database, chromList->name, tdb->table); char *typeWord = cloneFirstWord(tdb->type); boolean isBig = FALSE, isBigBed = FALSE; struct bbiFile *bbi = NULL; if (sameString(typeWord, "bigBed")) { isBig = TRUE; isBigBed = TRUE; bbi = bigBedFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) ); } else if (sameString(typeWord, "bigWig")) { isBig = TRUE; bbi = bigWigFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) ); } char *biggestChrom = NULL; int biggestSize = 0, biggestStart = 0, biggestEnd = 0; struct slName *chrom; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { if (!allParts && strchr(chrom->name, '_')) // Generally skip weird chroms continue; if (female && sameString(chrom->name, "chrY")) continue; int chromSize = hashIntVal(chromHash, chrom->name); struct rbTree *rt = rangeTreeNew(); int start = 0, end = 0, size = 0; if (isBig) bigCoverageIntoTree(tdb, bbi, chrom->name, chromSize, rt, isBigBed); else tableCoverageIntoTree(hti, tdb, conn, chrom->name, chromSize, rt); if (rt->n > 0) // Want to keep completely uncovered chromosome uncovered addGaps(conn, chrom->name, rt); biggestGapFromRangeTree(rt, chromSize, &start, &end, &size); if (size > biggestSize) { biggestSize = size; biggestStart = start; biggestEnd = end; biggestChrom = chrom->name; } rangeTreeFree(&rt); } printf("%s\t%s:%d-%d\t", track, biggestChrom, biggestStart+1, biggestEnd); if (noComma) printf("%d", biggestSize); else printLongWithCommas(stdout, biggestSize); putchar('\n'); freez(&typeWord); bbiFileClose(&bbi); }
void calcUpstreams(struct dnaSeq *seq, int *upAtgCount, int *upKozakCount)
/* Fill upAtgCount and upKozakCount from range trees of the ORFs that start
 * at each ATG in seq.  Every ATG's ORF goes into the ATG tree; the ones that
 * isKozak accepts also go into the Kozak tree.  Both trees are handed to
 * setArrayCountsFromRangeTree together with seq->size. */
{
struct rbTree *atgTree = rangeTreeNew();
struct rbTree *kozakTree = rangeTreeNew();

/* Scan every position that still has room for a three-base codon. */
int lastCodonStart = seq->size - 3;
int pos = 0;
while (pos <= lastCodonStart)
    {
    if (startsWith("atg", seq->dna + pos))
	{
	int orfEnd = orfEndInSeq(seq, pos);
	rangeTreeAdd(atgTree, pos, orfEnd);
	if (isKozak(seq->dna, seq->size, pos))
	    rangeTreeAdd(kozakTree, pos, orfEnd);
	}
    ++pos;
    }

setArrayCountsFromRangeTree(atgTree, upAtgCount, seq->size);
setArrayCountsFromRangeTree(kozakTree, upKozakCount, seq->size);
rangeTreeFree(&atgTree);
rangeTreeFree(&kozakTree);
}
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return the number of bases where the coding regions of a and b overlap
 * in the same frame.  Each block is clipped to thickStart..thickEnd first.
 * Its frame is (clipped start - coding bases seen in earlier blocks) mod 3. */
{
enum { frameCount = 3 };
struct rbTree *frameTrees[frameCount];
int frameIx;
for (frameIx = 0; frameIx < frameCount; ++frameIx)
    frameTrees[frameIx] = rangeTreeNew();

/* File each coding piece of a under its frame. */
int codingSoFar = 0;
int blockIx;
for (blockIx = 0; blockIx < a->blockCount; ++blockIx)
    {
    int blockStart = a->chromStart + a->chromStarts[blockIx];
    int blockEnd = blockStart + a->blockSizes[blockIx];
    int cdsStart = max(blockStart, a->thickStart);
    int cdsEnd = min(blockEnd, a->thickEnd);
    if (cdsStart >= cdsEnd)
	continue;	/* block has no coding part */
    rangeTreeAdd(frameTrees[(cdsStart - codingSoFar) % 3], cdsStart, cdsEnd);
    codingSoFar += cdsEnd - cdsStart;
    }

/* Measure each coding piece of b against the tree for its frame. */
int sameFrameBases = 0;
codingSoFar = 0;
for (blockIx = 0; blockIx < b->blockCount; ++blockIx)
    {
    int blockStart = b->chromStarts[blockIx] + b->chromStart;
    int blockEnd = blockStart + b->blockSizes[blockIx];
    int cdsStart = max(blockStart, b->thickStart);
    int cdsEnd = min(blockEnd, b->thickEnd);
    if (cdsStart >= cdsEnd)
	continue;
    sameFrameBases += rangeTreeOverlapSize(frameTrees[(cdsStart - codingSoFar) % 3],
	    cdsStart, cdsEnd);
    codingSoFar += cdsEnd - cdsStart;
    }

/* Clean up and go home. */
for (frameIx = 0; frameIx < frameCount; ++frameIx)
    rangeTreeFree(&frameTrees[frameIx]);
return sameFrameBases;
}
void doStrand(struct bed *start, struct bed *end, FILE *f)
/* Merge the blocks of all beds from start up to, but not including, end
 * into one bed and write it to f as a 12-column line.  The beds are assumed
 * to be on the same strand; the merged bed takes its chrom, name and strand
 * from start. */
{
struct rbTree *blockTree = rangeTreeNew();

/* Pour every bed in the run into the tree. */
struct bed *cur = start;
while (cur != end)
    {
    bedIntoRangeTree(cur, blockTree);
    cur = cur->next;
    }

/* Build the merged bed, write it, and release everything. */
struct bed *merged = bedFromRangeTree(blockTree, start->chrom, start->name, start->strand);
bedTabOutN(merged, 12, f);
bedFree(&merged);
rangeTreeFree(&blockTree);
}
double calcBaseCoverage(struct hash *refHash, struct hash *geneHash)
/* Figure proportion refBases covered by genes.  Both refHash and geneHash
 * are keyed by chromosome and filled with genes.
 * Returns total overlapping bases divided by total reference bases, summed
 * over every chromosome in refHash.  Chromosomes missing from geneHash add
 * to the reference total only.  If the reference total is zero the division
 * is by zero in floating point, so the result is not a finite number. */
{
struct hashEl *chrom, *chromList = hashElListHash(refHash);
long totalRef = 0, totalOverlap = 0;
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    char *chromName = chrom->name;
    struct binKeeper *refBk = chrom->val;
    struct rbTree *refTree = bedBkToTree(refBk);
    totalRef += rangeTreeTotalSize(refTree);
    struct binKeeper *geneBk = hashFindVal(geneHash, chromName);
    if (geneBk != NULL)
	{
	struct rbTree *geneTree = bedBkToTree(geneBk);
	totalOverlap += rangeTreeRangeTreeOverlap(refTree, geneTree);
	rangeTreeFree(&geneTree);
	}
    rangeTreeFree(&refTree);
    }
/* The list from hashElListHash belongs to the caller; free it so repeated
 * calls do not leak one list per call. */
hashElFreeList(&chromList);
return (double)totalOverlap / totalRef;
}
static struct bed* subset_beds(char* sectionString, struct bed** pRegions, struct hash* chromHash) /* in the situation where both a regions bed file is given AND the filename specifies subsections, */ /* intersect the two. For simplictity sake, */ { struct bed* fname_ranges = parseSectionString(sectionString, chromHash); struct bed* bed; struct bed* subset = NULL; struct bed* regions = *pRegions; slSort(&fname_ranges, bedCmp); bed = fname_ranges; while (bed != NULL) { /* each iteration of the loop should be a separate chrom */ struct bed* region; struct rbTree* tree = rangeTreeNew(); while ((bed != NULL) && (bed->next != NULL) && (sameString(bed->chrom, bed->next->chrom))) { rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); bed = bed->next; } rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); /* now we're at a point that we're dealing only with one chromosome. */ for (region = regions; region != NULL; region = region->next) { if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd) && rangeTreeFindEnclosing(tree, region->chromStart, region->chromEnd)) { struct bed* clone = cloneBed(region); slAddHead(&subset, clone); } else if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd)) errAbort("range specified in file overlaps but is not contained by range specified on command-line"); } rangeTreeFree(&tree); bed = bed->next; } if (subset == NULL) { errAbort("no ranges specified in file were contained in ranges specified on command-line"); } slReverse(&subset); bedFreeList(&fname_ranges); bedFreeList(pRegions); return subset; }
static struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory.
 *
 * The file position where the summary data starts goes in *retDataStart, and
 * the position where its index starts goes in *retIndexStart.  totalSum
 * accumulates count/min/max/sum/sum-of-squares over every range seen.
 * Returns the twice-reduced summary list, passed to
 * bbiOutputOneSummaryFurtherReduce together with lm.  fieldCount is not
 * used in this function. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

/* This gets a little complicated I'm afraid.  The strategy is to:
 *   1) Build up a range tree that represents coverage depth on that chromosome
 *      This also has the nice side effect of getting rid of overlaps.
 *   2) Stream through the range tree, outputting the initial summary level and
 *      further reducing. */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
	{
	/* Grab values we want from range. */
	double val = ptToInt(range->val);
	int start = range->start;
	int end = range->end;
	bits32 size = end - start;

	/* Add to total summary. */
	if (firstTime)
	    {
	    totalSum->validCount = size;
	    totalSum->minVal = totalSum->maxVal = val;
	    totalSum->sumData = val*size;
	    totalSum->sumSquares = val*val*size;
	    firstTime = FALSE;
	    }
	else
	    {
	    totalSum->validCount += size;
	    if (val < totalSum->minVal) totalSum->minVal = val;
	    if (val > totalSum->maxVal) totalSum->maxVal = val;
	    totalSum->sumData += val*size;
	    totalSum->sumSquares += val*val*size;
	    }

	/* If start past existing block then output it. */
	if (sum != NULL && sum->end <= start && sum->end < usage->size)
	    {
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	    sum = NULL;
	    }

	/* If don't have a summary we're working on now, make one. */
	if (sum == NULL)
	    {
	    oneSummary.chromId = usage->id;
	    oneSummary.start = start;
	    oneSummary.end = start + initialReduction;
	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	    oneSummary.minVal = oneSummary.maxVal = val;
	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
	    oneSummary.validCount = 0;
	    sum = &oneSummary;
	    }

	/* Deal with case where might have to split an item between multiple summaries.  This
	 * loop handles all but the final affected summary in that case. */
	while (end > sum->end)
	    {
	    /* Fold in bits that overlap with existing summary and output. */
	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
	    assert(overlap > 0);
	    verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
	    sum->validCount += overlap;
	    if (sum->minVal > val) sum->minVal = val;
	    if (sum->maxVal < val) sum->maxVal = val;
	    sum->sumData += val * overlap;
	    sum->sumSquares += val*val * overlap;
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, lm, stream);
	    size -= overlap;

	    /* Move summary to next part. */
	    sum->start = start = sum->end;
	    sum->end = start + initialReduction;
	    if (sum->end > usage->size) sum->end = usage->size;
	    sum->minVal = sum->maxVal = val;
	    sum->sumData = sum->sumSquares = 0.0;
	    sum->validCount = 0;
	    }

	/* Add to summary. */
	sum->validCount += size;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * size;
	sum->sumSquares += val*val * size;
	}
    /* Flush the summary still in progress for this chromosome. */
    if (sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
	    &boundsPt, boundsEnd, lm, stream);
	}
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index.  The offset is held in a 64-bit variable, the
 * same width as *retIndexStart, so that positions past 2 GB are not
 * truncated before being handed to the index writer. */
bits64 indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target 
= targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }