void calcUpstreams(struct dnaSeq *seq, int *upAtgCount, int *upKozakCount)
/* Scan seq for every "atg" and note the span from that position to the end
 * reported by orfEndInSeq().  Every such span goes into one range tree; spans
 * whose start also passes isKozak() additionally go into a second tree.  The
 * two trees are then handed to setArrayCountsFromRangeTree() together with
 * upAtgCount and upKozakCount respectively (and seq->size). */
{
struct rbTree *atgTree = rangeTreeNew();
struct rbTree *kozakTree = rangeTreeNew();
int lastCodonStart = seq->size - 3;    /* Last offset with room for three bases. */
int pos;

for (pos = 0; pos <= lastCodonStart; ++pos)
    {
    if (!startsWith("atg", seq->dna + pos))
        continue;

    /* Same span is recorded in both trees when the start is Kozak. */
    int orfEnd = orfEndInSeq(seq, pos);
    rangeTreeAdd(atgTree, pos, orfEnd);
    if (isKozak(seq->dna, seq->size, pos))
        rangeTreeAdd(kozakTree, pos, orfEnd);
    }

/* Turn each tree into counts, then let go of it. */
setArrayCountsFromRangeTree(atgTree, upAtgCount, seq->size);
setArrayCountsFromRangeTree(kozakTree, upKozakCount, seq->size);
rangeTreeFree(&atgTree);
rangeTreeFree(&kozakTree);
}
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return how many bases of coding sequence a and b share while being in the
 * same frame.  The coding part of a block is the block clipped to
 * thickStart..thickEnd.  Each coding piece is assigned to one of three
 * buckets by (clippedStart - codingBasesSeenSoFar) % 3; pieces of a are stored
 * in the range tree for their bucket, and pieces of b are measured against the
 * tree for theirs. */
{
struct rbTree *treeForFrame[3];
int total = 0;
int f, ix, nBlocks, codingSoFar;

for (f = 0; f < 3; ++f)
    treeForFrame[f] = rangeTreeNew();

/* Pass 1: file every coding piece of a under its frame. */
codingSoFar = 0;
nBlocks = a->blockCount;
for (ix = 0; ix < nBlocks; ++ix)
    {
    int start = a->chromStart + a->chromStarts[ix];
    int end = start + a->blockSizes[ix];
    start = max(start, a->thickStart);
    end = min(end, a->thickEnd);
    if (start >= end)
        continue;        /* Block has no coding part. */
    rangeTreeAdd(treeForFrame[(start - codingSoFar)%3], start, end);
    codingSoFar += end - start;
    }

/* Pass 2: measure every coding piece of b against the tree of its frame. */
codingSoFar = 0;
nBlocks = b->blockCount;
for (ix = 0; ix < nBlocks; ++ix)
    {
    int start = b->chromStarts[ix] + b->chromStart;
    int end = start + b->blockSizes[ix];
    start = max(start, b->thickStart);
    end = min(end, b->thickEnd);
    if (start >= end)
        continue;
    total += rangeTreeOverlapSize(treeForFrame[(start - codingSoFar)%3], start, end);
    codingSoFar += end - start;
    }

for (f = 0; f < 3; ++f)
    rangeTreeFree(&treeForFrame[f]);
return total;
}
static struct bed* subset_beds(char* sectionString, struct bed** pRegions, struct hash* chromHash) /* in the situation where both a regions bed file is given AND the filename specifies subsections, */ /* intersect the two. For simplictity sake, */ { struct bed* fname_ranges = parseSectionString(sectionString, chromHash); struct bed* bed; struct bed* subset = NULL; struct bed* regions = *pRegions; slSort(&fname_ranges, bedCmp); bed = fname_ranges; while (bed != NULL) { /* each iteration of the loop should be a separate chrom */ struct bed* region; struct rbTree* tree = rangeTreeNew(); while ((bed != NULL) && (bed->next != NULL) && (sameString(bed->chrom, bed->next->chrom))) { rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); bed = bed->next; } rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); /* now we're at a point that we're dealing only with one chromosome. */ for (region = regions; region != NULL; region = region->next) { if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd) && rangeTreeFindEnclosing(tree, region->chromStart, region->chromEnd)) { struct bed* clone = cloneBed(region); slAddHead(&subset, clone); } else if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd)) errAbort("range specified in file overlaps but is not contained by range specified on command-line"); } rangeTreeFree(&tree); bed = bed->next; } if (subset == NULL) { errAbort("no ranges specified in file were contained in ranges specified on command-line"); } slReverse(&subset); bedFreeList(&fname_ranges); bedFreeList(pRegions); return subset; }
void tableCoverageIntoTree(struct hTableInfo *hti, struct trackDb *tdb, struct sqlConnection *conn, char *chrom, int chromSize, struct rbTree *rt) /* Find biggest gap in given chromosome in database table with chromosome coordinates */ { char fields[512]; safef(fields, sizeof(fields), "%s,%s", hti->startField, hti->endField); struct sqlResult *sr = hExtendedChromQuery(conn, hti->rootName, chrom, NULL, FALSE, fields, NULL); char **row; while ((row = sqlNextRow(sr)) != NULL) { rangeTreeAdd(rt, sqlUnsigned(row[0]), sqlUnsigned(row[1])); } sqlFreeResult(&sr); }
struct genomeRangeTree *edwGrtFromBigBed(char *fileName) /* Return genome range tree for simple (unblocked) bed */ { struct bbiFile *bbi = bigBedFileOpen(fileName); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct genomeRangeTree *grt = genomeRangeTreeNew(); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *tree = genomeRangeTreeFindOrAddRangeTree(grt, chrom->name); struct lm *lm = lmInit(0); struct bigBedInterval *iv, *ivList = NULL; ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(tree, iv->start, iv->end); lmCleanup(&lm); } bigBedFileClose(&bbi); bbiChromInfoFreeList(&chromList); return grt; }
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up bed that excludes the parts of the gene lying
 * between CDS breaks.  Also adjusts cds->end to cope with the sequence we
 * remove: it is reduced by the size of every gap between consecutive CDS
 * pieces.  Deals with transcript to genome coordinate mapping including
 * negative strand.  Be afraid, be very afraid!
 *
 * The returned bed is newly allocated.  Only chrom, chromStart, chromEnd,
 * name, score, strand[0], blockCount, blockSizes and chromStarts are filled
 * in here; the input bed itself is not modified. */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're at it shrink outer CDS
 * since we are actually shrinking the transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm; /* Convenient place to allocate memory; presumably
                              * released along with the tree at the end - confirm. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    /* A gap runs from the end of CDS piece i to the start of piece i+1. */
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates.  Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
/* txStartPos/txEndPos track the current exon's span in transcript
 * coordinates, which simply accumulate exon sizes. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    /* Remember the successor now: in the easy case below the exon itself
     * is pushed onto newList, which overwrites exon->next. */
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
        verbose(3, "Splitting exon because of CDS gap\n");
        /* Make up exons from current position up to next gap.  This is a little
         * complicated by possibly the gap starting before the exon. */
        int exonStart = exon->start;   /* Current position in exon (genome-side) space. */
        int txStart = txStartPos;      /* Same position in transcript space. */
        struct range *gap;
        for (gap = gapList; gap != NULL; gap = gap->next)
            {
            int txEnd = gap->start;
            /* Only the part of the gap inside what is left of this exon counts. */
            int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
            int exonSize = txEnd - txStart;
            if (exonSize > 0)
                {
                /* Emit the piece of exon lying before this gap. */
                lmAllocVar(lm, newExon);
                newExon->start = exonStart;
                newExon->end = exonStart + exonSize;
                slAddHead(&newList, newExon);
                }
            else /* This case happens if gap starts before exon */
                {
                /* Clamp so the position updates below only skip the gap. */
                exonSize = 0;
                }
            /* Update current position in both transcript and genome space. */
            exonStart += exonSize + gapSize;
            txStart += exonSize + gapSize;
            }
        /* Make up final exon from last gap to end, at least if we don't end in a gap. */
        if (exonStart < exon->end)
            {
            lmAllocVar(lm, newExon);
            newExon->start = exonStart;
            newExon->end = exon->end;
            slAddHead(&newList, newExon);
            }
        }
    else
        {
        /* Easy case where we don't intersect any gaps. */
        slAddHead(&newList, exon);
        }
    txStartPos= txEndPos;
    }
/* Pieces were pushed on the head of newList, so put them back in order. */
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12.  Exon coordinates are offsets to which
 * bed->chromStart is added to obtain chromosome positions. */
/* NOTE(review): newList is dereferenced without a NULL check; this assumes
 * at least one exon survives - confirm against callers. */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes, newBed->blockCount);
AllocArray(newBed->chromStarts, newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    /* NOTE(review): this stores the offset from bed->chromStart, not from
     * newBed->chromStart; the two agree only when newList->start is 0 -
     * confirm that the first exon always starts at offset 0. */
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    /* Overwritten on every pass, so chromEnd finishes at the last exon's end. */
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home. */
rbTreeFree(&gapTree);
return newBed;
}
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target 
= targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }
struct range *genomeRangeTreeAdd(struct genomeRangeTree *tree, char *chrom, int start, int end)
/* Add start..end to the range tree kept for chrom, merging with existing
 * ranges if need be.  The per-chromosome tree comes from
 * genomeRangeTreeFindOrAddRangeTree(), so a chromosome not seen before gets
 * a tree of its own.  Returns the result of rangeTreeAdd(). */
{
struct rbTree *chromTree = genomeRangeTreeFindOrAddRangeTree(tree, chrom);
return rangeTreeAdd(chromTree, start, end);
}
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree) /* Merge together overlapping edges with soft ends. */ { struct mergedEdge /* Hold together info on a merged edge. */ { struct evidence *evidence; }; /* Traverse graph and build up range tree. Each node in the range tree * will represent the bounds of coordinates of overlapping double softs */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) rangeTreeAdd(rangeTree, start->position, end->position); } /* Traverse graph again merging edges */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start= edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { struct range *r = rangeTreeFindEnclosing(rangeTree, start->position, end->position); assert(r != NULL); /* At this point, r represents the bounds of a double-soft * region that encompasses this edge. Collect the set of * evidence of edges overlapping this range */ struct mergedEdge *mergeEdge = r->val; if (mergeEdge == NULL) { lmAllocVar(rangeTree->lm, mergeEdge); r->val = mergeEdge; } mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence); verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", start->position, end->position, r->start, r->end); edge->evList = NULL; rbTreeRemove(edgeTree, edge); } } /* Traverse merged edge list, making a single edge from each range. At this point, * each range will have some evidence attached to it, from each of the double softs * that fall within the range. 
From all of this evidence, make a single consensus edge */ struct range *r; struct lm *lm = lmInit(0); for (r = rangeTreeList(rangeTree); r != NULL; r = r->next) { struct mergedEdge *mergedEdge = r->val; struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm); if (edge != NULL) rbTreeAdd(edgeTree, edge); verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", edge->start->position, edge->end->position, r->start, r->end); } /* Clean up and go home. */ lmCleanup(&lm); removeUnusedVertices(vertexTree, edgeTree); slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }
void refSeparateButJoined(struct txGraph *graph, FILE *f) /* Flag graphs that have two non-overlapping refSeqs. */ { int sourceIx; boolean foundIt = FALSE; struct lm *lm = lmInit(0); struct rbTreeNode **stack; lmAllocArray(lm, stack, 128); /* Loop through sources looking for reference type. */ for (sourceIx=0; sourceIx<graph->sourceCount; ++sourceIx) { struct txSource *source = &graph->sources[sourceIx]; if (sameString(source->type, refType)) { /* Create a rangeTree including all exons of source. */ struct rbTree *tree = rangeTreeNewDetailed(lm, stack); struct txEdge *edge; for (edge = graph->edgeList; edge != NULL; edge = edge->next) { if (edge->type == ggExon && evOfSourceOnList(edge->evList, sourceIx)) rangeTreeAdd(tree, graph->vertices[edge->startIx].position, graph->vertices[edge->endIx].position); } /* Go through remaining reference sources looking for no overlap. */ int i; for (i=0; i<graph->sourceCount; ++i) { if (i == sourceIx) continue; struct txSource *s = &graph->sources[i]; if (sameString(s->type, refType)) { boolean gotOverlap = FALSE; for (edge = graph->edgeList; edge != NULL; edge = edge->next) { if (edge->type == ggExon && evOfSourceOnList(edge->evList, i)) { if (rangeTreeOverlaps(tree, graph->vertices[edge->startIx].position, graph->vertices[edge->endIx].position)) { gotOverlap = TRUE; break; } } } if (!gotOverlap) { foundIt = TRUE; break; } } } freez(&tree); } if (foundIt) break; } if (foundIt) { fprintf(f, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName, graph->tStart, graph->tEnd, "refJoined", graph->strand); } lmCleanup(&lm); }