Exemple #1
void calcUpstreams(struct dnaSeq *seq, int *upAtgCount, int *upKozakCount)
/* Count up upstream ATG and Kozak */
struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew();
int endPos = seq->size-3;
int i;
for (i=0; i<=endPos; ++i)
    if (startsWith("atg", seq->dna + i))
        int orfEnd = orfEndInSeq(seq, i);
	rangeTreeAdd(upAtgRanges, i, orfEnd);
        if (isKozak(seq->dna, seq->size, i))
	    rangeTreeAdd(upKozakRanges, i, orfEnd);
setArrayCountsFromRangeTree(upAtgRanges, upAtgCount, seq->size);
setArrayCountsFromRangeTree(upKozakRanges, upKozakCount, seq->size);
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return amount of overlap between coding regions (in same frame)
 * between two beds. */
int overlap = 0;

/* Allocate range trees for each frame. */
struct rbTree *frameTrees[3];
int frame;
for (frame = 0; frame<3; ++frame)
    frameTrees[frame] = rangeTreeNew();

/* Fill in frame trees with coding exons of a. */
int cdsPos = 0;
int block, blockCount = a->blockCount;
for (block = 0; block < blockCount; ++block)
    int start = a->chromStart + a->chromStarts[block];
    int end = start + a->blockSizes[block];
    start = max(start, a->thickStart);
    end = min(end, a->thickEnd);
    if (start < end)
	int size = end - start;
	int frame = (start - cdsPos)%3;
	rangeTreeAdd(frameTrees[frame], start, end);
	cdsPos += size;

/* Add up overlaps by comparing bed b against frameTrees */
cdsPos = 0;
blockCount = b->blockCount;
for (block = 0; block < blockCount; ++block)
    int start = b->chromStarts[block] + b->chromStart;
    int end = start + b->blockSizes[block];
    start = max(start, b->thickStart);
    end = min(end, b->thickEnd);
    if (start < end)
	int size = end - start;
	int frame = (start - cdsPos)%3;
	overlap += rangeTreeOverlapSize(frameTrees[frame], start, end);
	cdsPos += size;

/* Clean up and go home. */
for (frame = 0; frame<3; ++frame)
return overlap;
Exemple #3
static struct bed* subset_beds(char* sectionString, struct bed** pRegions, struct hash* chromHash)
/* in the situation where both a regions bed file is given AND the filename specifies subsections, */
/* intersect the two.  For simplictity sake,  */
    struct bed* fname_ranges = parseSectionString(sectionString, chromHash);
    struct bed* bed;
    struct bed* subset = NULL;
    struct bed* regions = *pRegions;
    slSort(&fname_ranges, bedCmp);
    bed = fname_ranges;
    while (bed != NULL) {
        /* each iteration of the loop should be a separate chrom */
        struct bed* region;
        struct rbTree* tree = rangeTreeNew();
        while ((bed != NULL) && (bed->next != NULL) && (sameString(bed->chrom, bed->next->chrom))) {
            rangeTreeAdd(tree, bed->chromStart, bed->chromEnd);
            bed = bed->next;
        rangeTreeAdd(tree, bed->chromStart, bed->chromEnd);
        /* now we're at a point that we're dealing only with one chromosome. */
        for (region = regions; region != NULL; region = region->next) {
            if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd)
                && rangeTreeFindEnclosing(tree, region->chromStart, region->chromEnd)) {
                struct bed* clone = cloneBed(region);
                slAddHead(&subset, clone);
            } else if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd))
                errAbort("range specified in file overlaps but is not contained by range specified on command-line");
        bed = bed->next;
    if (subset == NULL) {
        errAbort("no ranges specified in file were contained in ranges specified on command-line");
    return subset;
Exemple #4
void tableCoverageIntoTree(struct hTableInfo *hti, struct trackDb *tdb, struct sqlConnection *conn, 
	char *chrom, int chromSize, struct rbTree *rt)
/* Find biggest gap in given chromosome in database table with chromosome coordinates */
char fields[512];
safef(fields, sizeof(fields), "%s,%s", hti->startField, hti->endField);
struct sqlResult *sr = hExtendedChromQuery(conn, hti->rootName, chrom, NULL, FALSE,
	fields, NULL);
char **row;
while ((row = sqlNextRow(sr)) != NULL)
    rangeTreeAdd(rt, sqlUnsigned(row[0]), sqlUnsigned(row[1]));
Exemple #5
struct genomeRangeTree *edwGrtFromBigBed(char *fileName)
/* Return genome range tree for simple (unblocked) bed */
struct bbiFile *bbi = bigBedFileOpen(fileName);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
struct genomeRangeTree *grt = genomeRangeTreeNew();
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    struct rbTree *tree = genomeRangeTreeFindOrAddRangeTree(grt, chrom->name);
    struct lm *lm = lmInit(0);
    struct bigBedInterval *iv, *ivList = NULL;
    ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);
    for (iv = ivList; iv != NULL; iv = iv->next)
        rangeTreeAdd(tree, iv->start, iv->end);
return grt;
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up that excludes part of gene between CDS breaks.  
 * Also jiggles cds->end coordinate to cope with the sequence we remove.
 * Deals with transcript to genome coordinate mapping including negative
 * strand.  Be afraid, be very afraid! */
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're out it shrink outer CDS
 * since we are actually shrinking transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;	/* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);

/* Get list of exons in bed, flipped to reverse strand if need be. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates. Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
/* Copy exons to new list, breaking them up if need be. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    txEndPos = txStartPos + exon->end - exon->start;
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
	verbose(3, "Splitting exon because of CDS gap\n");

	/* Make up exons from current position up to next gap.  This is a little
	 * complicated by possibly the gap starting before the exon. */
	int exonStart = exon->start;
	int txStart = txStartPos;
	struct range *gap;
	for (gap = gapList; gap != NULL; gap = gap->next)
	    int txEnd = gap->start;
	    int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
	    int exonSize = txEnd - txStart;
	    if (exonSize > 0)
		lmAllocVar(lm, newExon);
		newExon->start = exonStart;
		newExon->end = exonStart + exonSize;
		slAddHead(&newList, newExon);
	    else /* This case happens if gap starts before exon */
		exonSize = 0;

	    /* Update current position in both transcript and genome space. */
	    exonStart += exonSize + gapSize;
	    txStart += exonSize + gapSize;

	/* Make up final exon from last gap to end, at least if we don't end in a gap. */
	if (exonStart < exon->end)
	    lmAllocVar(lm, newExon);
	    newExon->start = exonStart;
	    newExon->end = exon->end;
	    slAddHead(&newList, newExon);
	/* Easy case where we don't intersect any gaps. */
	slAddHead(&newList, exon);
    txStartPos= txEndPos;

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12 */
struct bed *newBed;
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name  = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes,  newBed->blockCount);
AllocArray(newBed->chromStarts,  newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    newBed->chromEnd = exon->end + bed->chromStart;

/* Clean up and go home. */
return newBed;
void doEnrichmentsFromBigBed(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file. */
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigBedPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigBedFileOpen(bigBedPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigBed data in memory. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */
    struct lm *lm = lmInit(0);
    struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);
    struct bigBedInterval *iv;
    struct rbTree *bbTree = rangeTreeNew();
    for (iv = ivList; iv != NULL; iv = iv->next)
	 rangeTreeAdd(bbTree, iv->start, iv->end);
    struct range *bbRange, *bbRangeList = rangeTreeList(bbTree);

    /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
	if (target->skip)
	struct genomeRangeTree *grt = target->grt;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    struct bigBedInterval *iv;
	    for (iv = ivList; iv != NULL; iv = iv->next)
		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
		target->overlapBases += overlap;
	    for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next)
		int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end);
		target->uniqOverlapBases += overlap;

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    if (target->skip)
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);

Exemple #8
struct range *genomeRangeTreeAdd(struct genomeRangeTree *tree, char *chrom, int start, int end)
/* Add range to tree, merging with existing ranges if need be. 
 * Adds new rangeTree if chrom not found.
 * Returns whatever rangeTreeAdd returns for the chromosome's tree. */
{
struct rbTree *chromTree = genomeRangeTreeFindOrAddRangeTree(tree, chrom);
return rangeTreeAdd(chromTree, start, end);
}
Exemple #9
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree)
/* Merge together overlapping edges with soft ends. */
struct mergedEdge
/* Hold together info on a merged edge. */
    struct evidence *evidence;

/* Traverse graph and build up range tree.  Each node in the range tree
 * will represent the bounds of coordinates of overlapping double softs */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        rangeTreeAdd(rangeTree, start->position, end->position);

/* Traverse graph again merging edges */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    struct edge *edge = edgeRef->val;
    struct vertex *start= edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
	struct range *r = rangeTreeFindEnclosing(rangeTree,
		start->position, end->position);
	assert(r != NULL);
	/* At this point, r represents the bounds of a double-soft
	 * region that encompasses this edge.  Collect the set of
	 * evidence of edges overlapping this range */
        struct mergedEdge *mergeEdge = r->val;
        if (mergeEdge == NULL)
            lmAllocVar(rangeTree->lm, mergeEdge);
            r->val = mergeEdge;
        mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence);
	verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", 
		start->position, end->position, r->start, r->end);
        edge->evList = NULL;
        rbTreeRemove(edgeTree, edge);

/* Traverse merged edge list, making a single edge from each range. At this point,
 * each range will have some evidence attached to it, from each of the double softs
 * that fall within the range.  From all of this evidence, make a single consensus edge */
struct range *r;
struct lm *lm = lmInit(0);
for (r = rangeTreeList(rangeTree); r != NULL; r = r->next)
    struct mergedEdge *mergedEdge = r->val;
    struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm);
    if (edge != NULL)
        rbTreeAdd(edgeTree, edge);
    verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", 
	    edge->start->position, edge->end->position, r->start, r->end);

/* Clean up and go home. */
removeUnusedVertices(vertexTree, edgeTree);
void refSeparateButJoined(struct txGraph *graph, FILE *f)
/* Flag graphs that have two non-overlapping refSeqs. */
    int sourceIx;
    boolean foundIt = FALSE;
    struct lm *lm = lmInit(0);
    struct rbTreeNode **stack;
    lmAllocArray(lm, stack, 128);

    /* Loop through sources looking for reference type. */
    for (sourceIx=0; sourceIx<graph->sourceCount; ++sourceIx)
        struct txSource *source = &graph->sources[sourceIx];
        if (sameString(source->type, refType))
            /* Create a rangeTree including all exons of source. */
            struct rbTree *tree = rangeTreeNewDetailed(lm, stack);
            struct txEdge *edge;
            for (edge = graph->edgeList; edge != NULL; edge = edge->next)
                if (edge->type == ggExon && evOfSourceOnList(edge->evList, sourceIx))
                    rangeTreeAdd(tree, graph->vertices[edge->startIx].position,

            /* Go through remaining reference sources looking for no overlap. */
            int i;
            for (i=0; i<graph->sourceCount; ++i)
                if (i == sourceIx)
                struct txSource *s = &graph->sources[i];
                if (sameString(s->type, refType))
                    boolean gotOverlap = FALSE;
                    for (edge = graph->edgeList; edge != NULL; edge = edge->next)
                        if (edge->type == ggExon && evOfSourceOnList(edge->evList, i))
                            if (rangeTreeOverlaps(tree,
                                gotOverlap = TRUE;
                    if (!gotOverlap)
                        foundIt = TRUE;
        if (foundIt)
    if (foundIt)
        fprintf(f, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName,
                graph->tStart, graph->tEnd, "refJoined", graph->strand);