void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f) /* Output one Ra record to file. */ { fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end); fprintf(f, "txName %s\n", seq->name); fprintf(f, "txSize %d\n", seq->size); fprintf(f, "cdsStart %d\n", start); fprintf(f, "cdsEnd %d\n", end); fprintf(f, "cdsSize %d\n", end-start); fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start)); fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3)); boolean gotKozak1 = FALSE; if (start >= 3) { char c = seq->dna[start-3]; gotKozak1 = (c == 'a' || c == 'g'); } fprintf(f, "gotKozak1 %d\n", gotKozak1); boolean gotKozak2 = FALSE; if (start+3 < seq->size) gotKozak2 = (seq->dna[start+3] == 'g'); fprintf(f, "gotKozak2 %d\n", gotKozak2); fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2); /* Count up upstream ATG and Kozak */ struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew(); int upAtg = 0, upKozak = 0; int i; for (i=0; i<start; ++i) { if (startsWith("atg", seq->dna + i)) { int orfEnd = findOrfEnd(seq, i); if (orfEnd < start) rangeTreeAdd(upAtgRanges, i, orfEnd); ++upAtg; if (isKozak(seq->dna, seq->size, i)) { ++upKozak; if (orfEnd < start) rangeTreeAdd(upKozakRanges, i, orfEnd); } } } fprintf(f, "upstreamAtgCount %d\n", upAtg); fprintf(f, "upstreamKozakCount %d\n", upKozak); fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start)); fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start)); fprintf(f, "\n"); /* Cluen up and go home. */ rangeTreeFree(&upAtgRanges); rangeTreeFree(&upKozakRanges); }
int pslBedOverlap(struct psl *psl, struct bed *bed)
/* Return how many bases the blocks of psl and the blocks of bed have in common.
 * The answer is 0 when the first strand characters differ or when psl->tName
 * is not the same string as bed->chrom. */
{
    /* Strand is compared first, then the chromosome name. */
    if (psl->strand[0] != bed->strand[0] || !sameString(psl->tName, bed->chrom))
        return 0;

    /* Load every bed block into a temporary range tree. */
    struct rbTree *bedBlocks = rangeTreeNew();
    int ix;
    for (ix = 0; ix < bed->blockCount; ++ix) {
        int blockStart = bed->chromStart + bed->chromStarts[ix];
        int blockEnd = blockStart + bed->blockSizes[ix];
        rangeTreeAdd(bedBlocks, blockStart, blockEnd);
    }

    /* Query the tree with each psl target block and sum what comes back. */
    int sharedBases = 0;
    for (ix = 0; ix < psl->blockCount; ++ix) {
        int blockStart = psl->tStarts[ix];
        int blockEnd = blockStart + psl->blockSizes[ix];
        sharedBases += rangeTreeOverlapSize(bedBlocks, blockStart, blockEnd);
    }

    rangeTreeFree(&bedBlocks);
    return sharedBases;
}
int rangeTreeOverlapTotalSize(struct rbTree *tree)
/* Return the size of the intersection between the items in tree and the
 * widest interval an int can express (INT_MIN to INT_MAX), i.e. the total
 * size of all ranges in the tree.
 * The original author notes this is not thread-safe, and that on 32 bit
 * machines start, end and the returned total must not overflow. */
{
    int totalSize = rangeTreeOverlapSize(tree, INT_MIN, INT_MAX);
    return totalSize;
}
long rangeTreeRangeTreeOverlap(struct rbTree *a, struct rbTree *b)
/* Return the summed overlap between two range trees: each range listed
 * from a is looked up in b and the overlap sizes are added together. */
{
    long sum = 0;
    struct range *cur = rangeTreeList(a);
    while (cur != NULL) {
        sum += rangeTreeOverlapSize(b, cur->start, cur->end);
        cur = cur->next;
    }
    return sum;
}
static void addBbCorrelations(struct bbiChromInfo *chrom, struct genomeRangeTree *targetGrt, struct bbiFile *aBbi, struct bbiFile *bBbi, int numColIx, struct correlate *c, struct correlate *cInEnriched, long long *aTotalSpan, long long *bTotalSpan, long long *overlapTotalSpan) /* Find bits of a and b that overlap and also overlap with targetRanges. Try to extract * some number from the bed (which number depends on format). Returns total number of * overlapping bases between the two big-beds. */ { struct lm *lm = lmInit(0); struct rbTree *targetRanges = NULL; if (targetGrt != NULL) targetRanges = genomeRangeTreeFindRangeTree(targetGrt, chrom->name); struct bigBedInterval *a, *aList = bigBedIntervalQuery(aBbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *b, *bList = bigBedIntervalQuery(bBbi, chrom->name, 0, chrom->size, 0, lm); long long totalOverlap = 0; /* This is a slightly complex but useful loop for two sorted lists that will get overlaps between * the two in linear time. */ a = aList; b = bList; for (;;) { if (a == NULL || b == NULL) break; int s = max(a->start,b->start); int e = min(a->end,b->end); int overlap = e - s; if (overlap > 0) { totalOverlap += overlap; /* Do correlation over a/b overlap */ double aVal = getDoubleValAt(a->rest, numColIx); double bVal = getDoubleValAt(b->rest, numColIx); correlateNextMulti(c, aVal, bVal, overlap); /* Got intersection of a and b - is it also in targetRange? */ if (targetRanges) { int targetOverlap = rangeTreeOverlapSize(targetRanges, s, e); if (targetOverlap > 0) { correlateNextMulti(cInEnriched, aVal, bVal, targetOverlap); } } } if (a->end < b->end) a = a->next; else b = b->next; } *overlapTotalSpan += totalOverlap; *aTotalSpan += bbIntervalListTotalSpan(aList); *bTotalSpan += bbIntervalListTotalSpan(bList); lmCleanup(&lm); }
void doEnrichmentsFromBed3Sample(struct bed3 *sampleList, struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Given a bed3 list, calculate enrichments for targets */ { struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList); struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash); /* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; long long uniqOverlapBases = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *sampleTree = chrom->val; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(sampleTree); for (range = rangeList; range != NULL; range = range->next) { /* Do unique base overlap counts (since using range trees both sides) */ int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end); uniqOverlapBases += overlap; } } } /* Figure out how much we overlap allowing same bases in genome * to part of more than one overlap. */ long long overlapBases = 0; struct bed3 *sample; for (sample = sampleList; sample != NULL; sample = sample->next) { int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom, sample->chromStart, sample->chromEnd); overlapBases += overlap; } /* Save to database. */ struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, overlapBases, uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } genomeRangeTreeFree(&sampleGrt); hashElFreeList(&chromList); }
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return the number of bases where the coding parts of a and b overlap while
 * being in the same frame.  Coding parts are the blocks clipped to
 * thickStart..thickEnd.  Frame is computed for each coding piece as
 * (piece start - coding bases seen so far) modulo 3. */
{
    /* One range tree per frame, holding a's coding pieces. */
    struct rbTree *treeForFrame[3];
    int f;
    for (f = 0; f < 3; ++f)
        treeForFrame[f] = rangeTreeNew();

    /* File each coding piece of a under its frame. */
    int codingSoFar = 0;
    int ix, count = a->blockCount;
    for (ix = 0; ix < count; ++ix) {
        int pieceStart = a->chromStart + a->chromStarts[ix];
        int pieceEnd = pieceStart + a->blockSizes[ix];
        pieceStart = max(pieceStart, a->thickStart);
        pieceEnd = min(pieceEnd, a->thickEnd);
        if (pieceStart >= pieceEnd)
            continue;   /* block has no coding portion */
        int pieceFrame = (pieceStart - codingSoFar)%3;
        rangeTreeAdd(treeForFrame[pieceFrame], pieceStart, pieceEnd);
        codingSoFar += pieceEnd - pieceStart;
    }

    /* Look up each coding piece of b in the tree matching its own frame. */
    int sameFrameBases = 0;
    codingSoFar = 0;
    count = b->blockCount;
    for (ix = 0; ix < count; ++ix) {
        int pieceStart = b->chromStarts[ix] + b->chromStart;
        int pieceEnd = pieceStart + b->blockSizes[ix];
        pieceStart = max(pieceStart, b->thickStart);
        pieceEnd = min(pieceEnd, b->thickEnd);
        if (pieceStart >= pieceEnd)
            continue;
        int pieceFrame = (pieceStart - codingSoFar)%3;
        sameFrameBases += rangeTreeOverlapSize(treeForFrame[pieceFrame], pieceStart, pieceEnd);
        codingSoFar += pieceEnd - pieceStart;
    }

    /* Free the per-frame trees. */
    for (f = 0; f < 3; ++f)
        rangeTreeFree(&treeForFrame[f]);
    return sameFrameBases;
}
int bedOverlapWithRangeTree(struct rbTree *rangeTree, struct bed *bed)
/* Return the summed overlap between rangeTree and each block of bed. */
{
    int sum = 0;
    int count = bed->blockCount;
    int ix = 0;
    while (ix < count) {
        int blockStart = bed->chromStart + bed->chromStarts[ix];
        int blockEnd = blockStart + bed->blockSizes[ix];
        sum += rangeTreeOverlapSize(rangeTree, blockStart, blockEnd);
        ++ix;
    }
    return sum;
}
/* This old way is ~3 times as slow */ void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. 
*/ struct lm *lm = lmInit(0); long startBigQueryTime = clock1000(); struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; struct bbiInterval *iv; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->uniqOverlapBases += overlap; target->overlapBases += overlap * iv->val; } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; lmCleanup(&lm); } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target 
= targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }
void doEnrichmentsFromSampleBed(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf, struct edwAssembly *assembly, struct target *targetList) /* Figure out enrichments from sample bed file. */ { char *sampleBed = vf->sampleBed; if (isEmpty(sampleBed)) { warn("No sample bed for %s", ef->edwFileName); return; } /* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/ struct bed3 *sample, *sampleList = bed3LoadAll(sampleBed); if (sampleList == NULL) { warn("Sample bed is empty for %s", ef->edwFileName); return; } struct genomeRangeTree *sampleGrt = edwMakeGrtFromBed3List(sampleList); struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash); /* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; long long uniqOverlapBases = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *sampleTree = chrom->val; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(sampleTree); for (range = rangeList; range != NULL; range = range->next) { /* Do unique base overlap counts (since using range trees both sides) */ int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end); uniqOverlapBases += overlap; } } } /* Figure out how much we overlap allowing same bases in genome * to part of more than one overlap. */ long long overlapBases = 0; for (sample = sampleList; sample != NULL; sample = sample->next) { int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom, sample->chromStart, sample->chromEnd); overlapBases += overlap; } /* Save to database. 
*/ struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, overlapBases, uniqOverlapBases); edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128); edwQaEnrichFree(&enrich); } genomeRangeTreeFree(&sampleGrt); bed3FreeList(&sampleList); hashElFreeList(&chromList); }
static void removeEnclosedDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree, int maxBleedOver, double singleExonMaxOverlap) /* Move double-softs that overlap spliced things to a very great extent into * the spliced things. Also remove tiny double-softs (no more than 2*maxBleedOver). */ { /* Traverse graph and build up range tree covering spliced exons. For each * range of overlapping exons, assemble a singly-linked list of all exons in * the range */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); int removedCount = 0; for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggHardStart || end->type == ggHardEnd) { rangeTreeAddValList(rangeTree, start->position, end->position, edge); } } /* Traverse graph yet one more time looking for doubly-soft exons * that are overlapping the spliced exons. */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { int s = start->position; int e = end->position; int size = e - s; if (size <= maxBleedOver+maxBleedOver) { /* Tiny case, just remove edge and forget it. */ verbose(3, "Removing tiny double-soft edge from %d to %d\n", s, e); rbTreeRemove(edgeTree, edge); ++removedCount; } else { /* Normal case, look for exon list that encloses us, and * if any single exon in that list encloses us, merge into it. */ int splicedOverlap = rangeTreeOverlapSize(rangeTree, s, e); if (splicedOverlap > 0 && splicedOverlap > singleExonMaxOverlap*size) { if (!trustedEdge(edge)) { /* Once we find a range that overlaps the doubly-soft edge, find * (half-hard or better) edge from that range that encloses the * doubly soft edge. 
*/ struct range *r = rangeTreeMaxOverlapping(rangeTree, s, e); struct edge *nextEdge, *edgeList = r->val; struct edge *enclosingEdge = NULL; for (nextEdge = edgeList; edgeList != NULL; edgeList = edgeList->next) { if (encloses(nextEdge, edge)) { enclosingEdge = nextEdge; } } if (enclosingEdge != NULL) { enclosingEdge->evList = slCat(enclosingEdge->evList, edge->evList); edge->evList = NULL; verbose(3, "Removing doubly-soft edge %d-%d, reassigning to %d-%d\n", s, e, enclosingEdge->start->position, enclosingEdge->end->position); rbTreeRemove(edgeTree, edge); ++removedCount; } } } } } } /* Clean up and go home. */ if (removedCount > 0) removeUnusedVertices(vertexTree, edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *nextEdge, *edge = edgeRef->val; while (edge != NULL) { nextEdge = edge->next; edge->next = NULL; edge = nextEdge; } } slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }