void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f) /* Output one Ra record to file. */ { fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end); fprintf(f, "txName %s\n", seq->name); fprintf(f, "txSize %d\n", seq->size); fprintf(f, "cdsStart %d\n", start); fprintf(f, "cdsEnd %d\n", end); fprintf(f, "cdsSize %d\n", end-start); fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start)); fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3)); boolean gotKozak1 = FALSE; if (start >= 3) { char c = seq->dna[start-3]; gotKozak1 = (c == 'a' || c == 'g'); } fprintf(f, "gotKozak1 %d\n", gotKozak1); boolean gotKozak2 = FALSE; if (start+3 < seq->size) gotKozak2 = (seq->dna[start+3] == 'g'); fprintf(f, "gotKozak2 %d\n", gotKozak2); fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2); /* Count up upstream ATG and Kozak */ struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew(); int upAtg = 0, upKozak = 0; int i; for (i=0; i<start; ++i) { if (startsWith("atg", seq->dna + i)) { int orfEnd = findOrfEnd(seq, i); if (orfEnd < start) rangeTreeAdd(upAtgRanges, i, orfEnd); ++upAtg; if (isKozak(seq->dna, seq->size, i)) { ++upKozak; if (orfEnd < start) rangeTreeAdd(upKozakRanges, i, orfEnd); } } } fprintf(f, "upstreamAtgCount %d\n", upAtg); fprintf(f, "upstreamKozakCount %d\n", upKozak); fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start)); fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start)); fprintf(f, "\n"); /* Cluen up and go home. */ rangeTreeFree(&upAtgRanges); rangeTreeFree(&upKozakRanges); }
void constExons(struct txGraph *graph, FILE *f) /* Write out constituitive exons. */ { /* Create a tree with all introns. */ struct rbTree *tree = rangeTreeNew(); struct txEdge *edge; for (edge = graph->edgeList; edge != NULL; edge = edge->next) { if (edge->type == ggIntron) { rangeTreeAdd(tree, graph->vertices[edge->startIx].position, graph->vertices[edge->endIx].position); } } /* Scan through all exons looking for ones that don't intersect * introns. */ int eId = 0; for (edge = graph->edgeList; edge != NULL; edge = edge->next) { if (edge->type == ggExon) { struct txVertex *s = &graph->vertices[edge->startIx]; struct txVertex *e = &graph->vertices[edge->endIx]; if (s->type == ggHardStart && e->type == ggHardEnd) { int start = s->position; int end = e->position; if (!rangeTreeOverlaps(tree, start, end)) { char *refSource = refSourceAcc(graph, edge); if (refSource != NULL && edge->evCount >= 10) { /* Do one more scan making sure that it doesn't * intersect any exons except for us. */ boolean anyOtherExon = FALSE; struct txEdge *ed; for (ed = graph->edgeList; ed != NULL; ed = ed->next) { if (ed != edge) { int edStart = graph->vertices[ed->startIx].position; int edEnd = graph->vertices[ed->endIx].position; if (rangeIntersection(edStart, edEnd, start, end) > 0) { anyOtherExon = TRUE; break; } } } if (!anyOtherExon) fprintf(f, "%s\t%d\t%d\t%s.%d\t0\t%s\n", graph->tName, start, end, refSource, ++eId, graph->strand); } } } } } rangeTreeFree(&tree); }
void sortAndMergeBlocks(struct bed *oneBed) /* construct a range tree out of blocks, then remake blocks arrays */ /* this should automatically merge everything. */ { struct rbTree *rt = rangeTreeNew(); struct range *rangeList = NULL; struct range *oneRange; int i; for (i = 0; i < oneBed->blockCount; i++) { rangeTreeAdd(rt, oneBed->chromStarts[i], oneBed->chromStarts[i] + oneBed->blockSizes[i]); /* delete old (bad) blocks in bed */ oneBed->chromStarts[i] = oneBed->blockSizes[i] = 0; } rangeList = rangeTreeList(rt); oneBed->blockCount = slCount(rangeList); /* remake bed blocks out of range tree */ i = 0; for (oneRange = rangeList; oneRange != NULL; oneRange = oneRange->next) { oneBed->chromStarts[i] = oneRange->start; oneBed->blockSizes[i] = oneRange->end - oneRange->start; i++; } rangeTreeFree(&rt); }
void outputProt(struct protInfo *prot, struct hash *seedToScop, FILE *f, FILE *fKnownTo) /* Callapse together all features of same type that overlap and output */ { slSort(&prot->featureList, protFeatureCmpName); struct protFeature *startFeature, *endFeature; for (startFeature = prot->featureList; startFeature != NULL; startFeature = endFeature) { for (endFeature = startFeature->next; endFeature != NULL; endFeature = endFeature->next) if (!sameString(startFeature->name, endFeature->name)) break; struct rbTree *rangeTree = rangeTreeNew(); struct protFeature *feature; for (feature = startFeature; feature != endFeature; feature = feature->next) rangeTreeAdd(rangeTree, feature->start, feature->end); struct range *range, *rangeList = rangeTreeList(rangeTree); for (range = rangeList; range != NULL; range = range->next) { feature = highestScoringFeature(startFeature, endFeature, range->start, range->end); fprintf(f, "%s\t%d\t%d\t%s\n", prot->name, range->start, range->end, startFeature->name); fprintf(fKnownTo, "%s\t%s\t%d\t%d\t%g\n", prot->name, (char *)hashMustFindVal(seedToScop, startFeature->name), range->start, range->end, feature->eVal); } rangeTreeFree(&rangeTree); } }
static struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi,
	char *chrom, bits32 start, bits32 end, struct lm *lm)
/* Return intervals where the val is the depth of coverage of the bigBed
 * items overlapping chrom:start-end.  Intervals are clipped to start..end and
 * allocated out of lm.  Returns NULL when the query finds no items. */
{
struct bigBedInterval *itemList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
if (itemList == NULL)
    return NULL;

/* Accumulate coverage depth over all items. */
struct rbTree *depthTree = rangeTreeNew();
struct bigBedInterval *item = itemList;
while (item != NULL)
    {
    rangeTreeAddToCoverageDepth(depthTree, item->start, item->end);
    item = item->next;
    }

/* Copy each range into an lm-allocated interval, appending at the tail so
 * the output keeps the order of the range list. */
struct bbiInterval *resultList = NULL;
struct bbiInterval **tail = &resultList;
struct range *span;
for (span = rangeTreeList(depthTree); span != NULL; span = span->next)
    {
    struct bbiInterval *cov;
    lmAllocVar(lm, cov);
    cov->start = span->start;
    if (cov->start < start)
	cov->start = start;
    cov->end = span->end;
    if (cov->end > end)
	cov->end = end;
    cov->val = ptToInt(span->val);
    cov->next = NULL;
    *tail = cov;
    tail = &cov->next;
    }

/* Clean up and go home. */
rangeTreeFree(&depthTree);
return resultList;
}
void getRange(char *regionFile){ FILE *fp; char buf[500],chr[50]; char str[2][500]; int i, b, e; struct hashEl *el; struct rbTree *tr; fp = mustOpen(regionFile, "r"); aliHash = newHash(8); while (fgets(buf, 500, fp)) { if (sscanf(buf, "%[^\t]\t%[^\t]\t%*s", str[0], str[1]) != 2) errAbort("error: %s", buf); for (i = 0; i < 2; i++) { if (sscanf(str[i], "%[^:]:%d-%d", chr, &b, &e) != 3) errAbort("error: %s", str[i]); // aliHash = newHash(8); el = hashLookup(aliHash, chr); if (el == NULL) { tr = rangeTreeNew(); hashAdd(aliHash, chr, tr); } else tr = (struct rbTree *)(el->val); rangeTreeAdd(tr, b, e); } } // printf("range\n"); }
void freen(char *chrom) /* Test something */ { uglyTime(NULL); struct sqlConnection *conn = sqlConnect("hg19"); uglyTime("connect"); char query[512]; sqlSafef(query, sizeof(query), "select * from knownGene where chrom='%s'", chrom); struct sqlResult *sr = sqlGetResult(conn, query); uglyTime("get result"); char **row; struct rbTree *rt = rangeTreeNew(); while ((row = sqlNextRow(sr)) != NULL) { struct genePred *gp = genePredLoad(row); int i; int exonCount = gp->exonCount; for (i=0; i<exonCount; ++i) rangeTreeAdd(rt, gp->exonStarts[i], gp->exonEnds[i]); } uglyTime("Add rows"); struct range *list = rangeTreeList(rt); uglyTime("Did list"); uglyf("%d items in chrom %s\n", slCount(list), chrom); }
int pslBedOverlap(struct psl *psl, struct bed *bed)
/* Return number of bases psl and bed overlap at the block level.  The
 * result is 0 when the first strand characters differ or when the psl target
 * name is not the bed chromosome. */
{
if (psl->strand[0] != bed->strand[0] || !sameString(psl->tName, bed->chrom))
    return 0;

/* Cover every bed block, converted to chromosome coordinates. */
struct rbTree *bedBlocks = rangeTreeNew();
int blockIx = 0;
while (blockIx < bed->blockCount)
    {
    int blockStart = bed->chromStart + bed->chromStarts[blockIx];
    rangeTreeAdd(bedBlocks, blockStart, blockStart + bed->blockSizes[blockIx]);
    ++blockIx;
    }

/* Sum how much of each psl block lands on the bed blocks. */
int sharedBases = 0;
for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
    {
    int blockStart = psl->tStarts[blockIx];
    int blockEnd = blockStart + psl->blockSizes[blockIx];
    sharedBases += rangeTreeOverlapSize(bedBlocks, blockStart, blockEnd);
    }

/* Clean up and return result. */
rangeTreeFree(&bedBlocks);
return sharedBases;
}
void printBiggestGap(char *database, struct sqlConnection *conn, struct slName *chromList, struct hash *chromHash, char *track) /* Look up track in database, figure out which type it is, call * appropriate biggest gap finder, and then print result. */ { struct trackDb *tdb = hTrackInfo(conn, track); struct hTableInfo *hti = hFindTableInfo(database, chromList->name, tdb->table); char *typeWord = cloneFirstWord(tdb->type); boolean isBig = FALSE, isBigBed = FALSE; struct bbiFile *bbi = NULL; if (sameString(typeWord, "bigBed")) { isBig = TRUE; isBigBed = TRUE; bbi = bigBedFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) ); } else if (sameString(typeWord, "bigWig")) { isBig = TRUE; bbi = bigWigFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) ); } char *biggestChrom = NULL; int biggestSize = 0, biggestStart = 0, biggestEnd = 0; struct slName *chrom; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { if (!allParts && strchr(chrom->name, '_')) // Generally skip weird chroms continue; if (female && sameString(chrom->name, "chrY")) continue; int chromSize = hashIntVal(chromHash, chrom->name); struct rbTree *rt = rangeTreeNew(); int start = 0, end = 0, size = 0; if (isBig) bigCoverageIntoTree(tdb, bbi, chrom->name, chromSize, rt, isBigBed); else tableCoverageIntoTree(hti, tdb, conn, chrom->name, chromSize, rt); if (rt->n > 0) // Want to keep completely uncovered chromosome uncovered addGaps(conn, chrom->name, rt); biggestGapFromRangeTree(rt, chromSize, &start, &end, &size); if (size > biggestSize) { biggestSize = size; biggestStart = start; biggestEnd = end; biggestChrom = chrom->name; } rangeTreeFree(&rt); } printf("%s\t%s:%d-%d\t", track, biggestChrom, biggestStart+1, biggestEnd); if (noComma) printf("%d", biggestSize); else printLongWithCommas(stdout, biggestSize); putchar('\n'); freez(&typeWord); bbiFileClose(&bbi); }
struct gene *geneNew()
/* Allocate a gene with AllocVar and give it a new, empty exon range tree. */
{
struct gene *result;
AllocVar(result);
result->exonTree = rangeTreeNew();
return result;
}
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return amount of overlap between coding regions (in same frame)
 * between two beds.  The coding part of each block of a (the block clipped
 * to thickStart..thickEnd) is filed into one of three range trees according
 * to (start - codingBasesSoFar) % 3; the coding parts of b are then measured
 * against the tree selected the same way. */
{
struct rbTree *byFrame[3];
int treeIx;
for (treeIx = 0; treeIx < 3; ++treeIx)
    byFrame[treeIx] = rangeTreeNew();

/* File the coding exons of a under their frame. */
int codingSoFar = 0;
int blockIx;
for (blockIx = 0; blockIx < a->blockCount; ++blockIx)
    {
    int cdsStart = a->chromStart + a->chromStarts[blockIx];
    int cdsEnd = cdsStart + a->blockSizes[blockIx];
    cdsStart = max(cdsStart, a->thickStart);
    cdsEnd = min(cdsEnd, a->thickEnd);
    if (cdsStart >= cdsEnd)
	continue;	/* Block has no coding part. */
    rangeTreeAdd(byFrame[(cdsStart - codingSoFar)%3], cdsStart, cdsEnd);
    codingSoFar += cdsEnd - cdsStart;
    }

/* Measure the coding exons of b against the tree for their frame. */
int total = 0;
codingSoFar = 0;
for (blockIx = 0; blockIx < b->blockCount; ++blockIx)
    {
    int cdsStart = b->chromStarts[blockIx] + b->chromStart;
    int cdsEnd = cdsStart + b->blockSizes[blockIx];
    cdsStart = max(cdsStart, b->thickStart);
    cdsEnd = min(cdsEnd, b->thickEnd);
    if (cdsStart >= cdsEnd)
	continue;
    total += rangeTreeOverlapSize(byFrame[(cdsStart - codingSoFar)%3], cdsStart, cdsEnd);
    codingSoFar += cdsEnd - cdsStart;
    }

/* Clean up and go home. */
for (treeIx = 0; treeIx < 3; ++treeIx)
    rangeTreeFree(&byFrame[treeIx]);
return total;
}
struct rbTree *codingTree(struct bed *bedList) /* Return rangeTree for all coding transcripts in list. */ { struct rbTree *rangeTree = rangeTreeNew(); struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) { if (bed->thickStart < bed->thickEnd) addBedBlocksToRangeTree(rangeTree, bed); } return rangeTree; }
void calcUpstreams(struct dnaSeq *seq, int *upAtgCount, int *upKozakCount)
/* Count up upstream ATG and Kozak.  Every "atg" in seq contributes the range
 * from its position to orfEndInSeq() to an ATG tree, and also to a Kozak tree
 * when isKozak() accepts it.  Both trees are then handed, with seq->size, to
 * setArrayCountsFromRangeTree() to fill upAtgCount and upKozakCount. */
{
struct rbTree *atgTree = rangeTreeNew(), *kozakTree = rangeTreeNew();
int lastCodonStart = seq->size-3;
int pos = 0;
while (pos <= lastCodonStart)
    {
    if (startsWith("atg", seq->dna + pos))
	{
	int orfEnd = orfEndInSeq(seq, pos);
	rangeTreeAdd(atgTree, pos, orfEnd);
	if (isKozak(seq->dna, seq->size, pos))
	    rangeTreeAdd(kozakTree, pos, orfEnd);
	}
    ++pos;
    }
setArrayCountsFromRangeTree(atgTree, upAtgCount, seq->size);
setArrayCountsFromRangeTree(kozakTree, upKozakCount, seq->size);
rangeTreeFree(&atgTree);
rangeTreeFree(&kozakTree);
}
struct rbTree *bedBkToTree(struct binKeeper *bk)
/* Make a rangeTree covering all exons of the beds stored in bk.  The
 * temporary element list from binKeeperFindAll() is freed before returning;
 * the caller receives the new tree. */
{
struct binElement *elList = binKeeperFindAll(bk);
struct rbTree *exonTree = rangeTreeNew();
struct binElement *cur = elList;
while (cur != NULL)
    {
    struct bed *item = cur->val;
    bedIntoRangeTree(item, exonTree);
    cur = cur->next;
    }
slFreeList(&elList);
return exonTree;
}
void doStrand(struct bed *start, struct bed *end, FILE *f)
/* Assuming all beds from start up to (but not including) end are on the same
 * strand, make a merged bed with all their blocks and write it to f as 12
 * columns.  The merged bed takes chrom, name and strand from start. */
{
struct rbTree *blocks = rangeTreeNew();
struct bed *cur = start;
while (cur != end)
    {
    bedIntoRangeTree(cur, blocks);
    cur = cur->next;
    }
struct bed *merged = bedFromRangeTree(blocks, start->chrom, start->name, start->strand);
bedTabOutN(merged, 12, f);
bedFree(&merged);
rangeTreeFree(&blocks);
}
void altFivePrime(struct txGraph *graph, struct range *exonsWithIntrons, FILE *f) /* Write out instances of alt 5' prime splice sites on plus strand * (and alt 3' splice sites on minus strand). */ { struct txEdge *e1, *e2; struct txVertex *v = graph->vertices; struct lm *lm = lmInit(0); struct rbTree *tree = rangeTreeNew(); struct range *range, *rangeList = NULL; for (e1 = graph->edgeList; e1 != NULL; e1 = e1->next) { if (e1->type == ggExon) { int e1Start = v[e1->startIx].position; int e1End = v[e1->endIx].position; boolean e1HardStart = (v[e1->startIx].type == ggHardStart); if (e1HardStart) { for (e2 = graph->edgeList; e2 != NULL; e2 = e2->next) { if (e2->type == ggExon) { int e2Start = v[e2->startIx].position; int e2End = v[e2->endIx].position; boolean e2HardStart = (v[e2->startIx].type == ggHardStart); if (e2HardStart && e1Start != e2Start && e1End == e2End) { int aStart = min(e1Start, e2Start); int aEnd = max(e1Start, e2Start); if (!inRangeList(exonsWithIntrons, e1Start, e1End) && !inRangeList(exonsWithIntrons, e2Start, e2End) && !inRangeList(rangeList, aStart, aEnd)) { lmAllocVar(lm, range); range->start = aStart; range->end = aEnd; slAddHead(&rangeList, range); fprintf(f, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName, aStart, aEnd, (graph->strand[0] == '-' ? "altFivePrime" : "altThreePrime"), graph->strand); } } } } } } } rbTreeFree(&tree); lmCleanup(&lm); }
static void readMappingResults(char **argv) { FILE *fp; char buf[500], chr[50], id[500]; int beg, i, j, len; char *ch; struct slName *ali; struct hashEl *el; struct rbTree *tr; aliHash = newHash(8); for (i = 4; i <= 5; i++) { fp = mustOpen(argv[i], "r"); while (fgets(buf, 500, fp)) { if (ncbi && i >= 6 && i <= 7) { if (sscanf(buf, "%[^\t]\t%*c %s %d %d", id, chr, &beg, &len) != 4) errAbort("error: %s", buf); if ((ch = strchr(id, ' '))) *ch = '\0'; if (i >= 6 && i <= 7) { j = strlen(id); sprintf(id+j, "/%d", i-5); } } else { if (sscanf(buf, "%[^\t]\t%*c %s %d %d", id, chr, &beg, &len) != 4){ errAbort("error: %s", buf); } if ((ch = strchr(id, ' '))) *ch = '\0'; } ali = newSlName(id); el = hashLookup(aliHash, chr); if (el == NULL) { tr = rangeTreeNew(); hashAdd(aliHash, chr, tr); } else tr = (struct rbTree *)(el->val); rangeTreeAddValHead(tr, beg, beg + len - 1, &ali); } fclose(fp); } }
struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
/* Read lines from bed file as long as they match chrom.  Return a rangeTree
 * that corresponds to the coverage.  The first line that does not start with
 * chrom is pushed back onto lf for the next caller.  Lines with fewer than
 * three words are reported as errors through lineFileExpectAtLeast(). */
{
struct rbTree *tree = rangeTreeNew();
char *line;
while (lineFileNextReal(lf, &line))
    {
    if (!startsWithWord(chrom, line))
	{
	lineFileReuse(lf);
	break;
	}
    char *row[3];
    /* Check the word count before touching row[1] and row[2]; on a short
     * line those slots would otherwise be uninitialized. */
    int wordCount = chopLine(line, row);
    lineFileExpectAtLeast(lf, 3, wordCount);
    unsigned start = sqlUnsigned(row[1]);
    unsigned end = sqlUnsigned(row[2]);
    rangeTreeAddToCoverageDepth(tree, start, end);
    }
return tree;
}
static struct bed* subset_beds(char* sectionString, struct bed** pRegions, struct hash* chromHash) /* in the situation where both a regions bed file is given AND the filename specifies subsections, */ /* intersect the two. For simplictity sake, */ { struct bed* fname_ranges = parseSectionString(sectionString, chromHash); struct bed* bed; struct bed* subset = NULL; struct bed* regions = *pRegions; slSort(&fname_ranges, bedCmp); bed = fname_ranges; while (bed != NULL) { /* each iteration of the loop should be a separate chrom */ struct bed* region; struct rbTree* tree = rangeTreeNew(); while ((bed != NULL) && (bed->next != NULL) && (sameString(bed->chrom, bed->next->chrom))) { rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); bed = bed->next; } rangeTreeAdd(tree, bed->chromStart, bed->chromEnd); /* now we're at a point that we're dealing only with one chromosome. */ for (region = regions; region != NULL; region = region->next) { if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd) && rangeTreeFindEnclosing(tree, region->chromStart, region->chromEnd)) { struct bed* clone = cloneBed(region); slAddHead(&subset, clone); } else if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd)) errAbort("range specified in file overlaps but is not contained by range specified on command-line"); } rangeTreeFree(&tree); bed = bed->next; } if (subset == NULL) { errAbort("no ranges specified in file were contained in ranges specified on command-line"); } slReverse(&subset); bedFreeList(&fname_ranges); bedFreeList(pRegions); return subset; }
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up bed that excludes the parts of the gene lying
 * between CDS breaks.  Also reduces cds->end by the size of each gap removed
 * to cope with the sequence we take out.  Deals with transcript to genome
 * coordinate mapping including negative strand.  Be afraid, be very afraid!
 * The returned bed is freshly allocated here (AllocVar/AllocArray, with
 * cloned chrom and name); score and strand[0] are copied from bed. */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're at it, shrink outer CDS
 * since we are actually shrinking transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;	/* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    /* A gap runs from the end of CDS block i to the start of block i+1. */
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates.  Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
/* Copy exons to new list, breaking them up if need be.  txStartPos and
 * txEndPos track the current exon's span in transcript coordinates. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    /* Save next now: slAddHead below may relink exon onto newList. */
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
	verbose(3, "Splitting exon because of CDS gap\n");
	/* Make up exons from current position up to next gap.  This is a little
	 * complicated by possibly the gap starting before the exon. */
	int exonStart = exon->start;
	int txStart = txStartPos;
	struct range *gap;
	for (gap = gapList; gap != NULL; gap = gap->next)
	    {
	    int txEnd = gap->start;
	    /* Only the part of the gap inside the rest of this exon counts. */
	    int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
	    int exonSize = txEnd - txStart;
	    if (exonSize > 0)
		{
		lmAllocVar(lm, newExon);
		newExon->start = exonStart;
		newExon->end = exonStart + exonSize;
		slAddHead(&newList, newExon);
		}
	    else /* This case happens if gap starts before exon */
	        {
		exonSize = 0;
		}

	    /* Update current position in both transcript and genome space. */
	    exonStart += exonSize + gapSize;
	    txStart += exonSize + gapSize;
	    }

	/* Make up final exon from last gap to end, at least if we don't end in a gap. */
	if (exonStart < exon->end)
	    {
	    lmAllocVar(lm, newExon);
	    newExon->start = exonStart;
	    newExon->end = exon->end;
	    slAddHead(&newList, newExon);
	    }
	}
    else
        {
	/* Easy case where we don't intersect any gaps. */
	slAddHead(&newList, exon);
	}
    txStartPos= txEndPos;
    }
/* The list was built head-first, so put it back in order. */
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12.
 * NOTE(review): newList is dereferenced without a NULL check, so this
 * assumes at least one exon survives - confirm against callers. */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name  = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes,  newBed->blockCount);
AllocArray(newBed->chromStarts,  newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    /* Overwritten each pass, so chromEnd finishes at the last exon's end. */
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home.  The exon ranges were allocated from gapTree->lm
 * (see lm above); they are not referenced after this point. */
rbTreeFree(&gapTree);
return newBed;
}
static void removeEnclosedDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree, int maxBleedOver, double singleExonMaxOverlap) /* Move double-softs that overlap spliced things to a very great extent into * the spliced things. Also remove tiny double-softs (no more than 2*maxBleedOver). */ { /* Traverse graph and build up range tree covering spliced exons. For each * range of overlapping exons, assemble a singly-linked list of all exons in * the range */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); int removedCount = 0; for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggHardStart || end->type == ggHardEnd) { rangeTreeAddValList(rangeTree, start->position, end->position, edge); } } /* Traverse graph yet one more time looking for doubly-soft exons * that are overlapping the spliced exons. */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { int s = start->position; int e = end->position; int size = e - s; if (size <= maxBleedOver+maxBleedOver) { /* Tiny case, just remove edge and forget it. */ verbose(3, "Removing tiny double-soft edge from %d to %d\n", s, e); rbTreeRemove(edgeTree, edge); ++removedCount; } else { /* Normal case, look for exon list that encloses us, and * if any single exon in that list encloses us, merge into it. */ int splicedOverlap = rangeTreeOverlapSize(rangeTree, s, e); if (splicedOverlap > 0 && splicedOverlap > singleExonMaxOverlap*size) { if (!trustedEdge(edge)) { /* Once we find a range that overlaps the doubly-soft edge, find * (half-hard or better) edge from that range that encloses the * doubly soft edge. 
*/ struct range *r = rangeTreeMaxOverlapping(rangeTree, s, e); struct edge *nextEdge, *edgeList = r->val; struct edge *enclosingEdge = NULL; for (nextEdge = edgeList; edgeList != NULL; edgeList = edgeList->next) { if (encloses(nextEdge, edge)) { enclosingEdge = nextEdge; } } if (enclosingEdge != NULL) { enclosingEdge->evList = slCat(enclosingEdge->evList, edge->evList); edge->evList = NULL; verbose(3, "Removing doubly-soft edge %d-%d, reassigning to %d-%d\n", s, e, enclosingEdge->start->position, enclosingEdge->end->position); rbTreeRemove(edgeTree, edge); ++removedCount; } } } } } } /* Clean up and go home. */ if (removedCount > 0) removeUnusedVertices(vertexTree, edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *nextEdge, *edge = edgeRef->val; while (edge != NULL) { nextEdge = edge->next; edge->next = NULL; edge = nextEdge; } } slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }
struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, double forceJoinScore, double weakLevel) /* Convert a list of items to a list of clusters of items. This may break up clusters that * have weakly linked parts. [ ] AAAAAAAAAAAAAAAAAA BBBBBB DDDDDD CCCC EEEE gets tranformed into [ ] [ ] AAAAAAAAAAAAAAAAAA BBBBBB DDDDDD CCCC EEEE The strategy is to build a rangeTree of coverage, which might look something like so: 123333211123333211 then define cluster ends that exceed the minimum limit, which is either weakLevel (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is more than forceJoinScore. This will go to something like so: [---] [----] Finally the items that are overlapping a cluster are assigned to it. Note that this may mean that an item may be in multiple clusters. [ABC] [ ADE] */ { int easyMax = round(1.0/weakLevel); int itemCount = slCount(itemList); struct peakCluster *clusterList = NULL; if (itemCount < easyMax) { struct peakItem *item = itemList; int chromStart = item->chromStart; int chromEnd = item->chromEnd; for (item = item->next; item != NULL; item = item->next) { if (item->chromStart < chromStart) chromStart = item->chromStart; if (item->chromEnd > chromEnd) chromEnd = item->chromEnd; } addCluster(lm, itemList, chromStart, chromEnd, &clusterList); } else { /* Make up coverage tree. */ struct rbTree *covTree = rangeTreeNew(); struct peakItem *item; for (item = itemList; item != NULL; item = item->next) rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd); struct range *range, *rangeList = rangeTreeList(covTree); /* Figure out maximum coverage. */ int maxCov = 0; for (range = rangeList; range != NULL; range = range->next) { int cov = ptToInt(range->val); if (cov > maxCov) maxCov = cov; } /* Figure coverage threshold. 
*/ int threshold = round(maxCov * weakLevel); if (threshold > forceJoinScore-1) threshold = forceJoinScore-1; /* Loop through emitting sections over threshold as clusters */ boolean inRange = FALSE; boolean start = 0, end = 0; for (range = rangeList; range != NULL; range = range->next) { int cov = ptToInt(range->val); if (cov > threshold) { if (inRange) end = range->end; else { inRange = TRUE; start = range->start; end = range->end; } } else { if (inRange) { addCluster(lm, itemList, start, end, &clusterList); inRange = FALSE; } } } if (inRange) addCluster(lm, itemList, start, end, &clusterList); } slReverse(&clusterList); return clusterList; }
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target 
= targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree) /* Merge together overlapping edges with soft ends. */ { struct mergedEdge /* Hold together info on a merged edge. */ { struct evidence *evidence; }; /* Traverse graph and build up range tree. Each node in the range tree * will represent the bounds of coordinates of overlapping double softs */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) rangeTreeAdd(rangeTree, start->position, end->position); } /* Traverse graph again merging edges */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start= edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { struct range *r = rangeTreeFindEnclosing(rangeTree, start->position, end->position); assert(r != NULL); /* At this point, r represents the bounds of a double-soft * region that encompasses this edge. Collect the set of * evidence of edges overlapping this range */ struct mergedEdge *mergeEdge = r->val; if (mergeEdge == NULL) { lmAllocVar(rangeTree->lm, mergeEdge); r->val = mergeEdge; } mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence); verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", start->position, end->position, r->start, r->end); edge->evList = NULL; rbTreeRemove(edgeTree, edge); } } /* Traverse merged edge list, making a single edge from each range. At this point, * each range will have some evidence attached to it, from each of the double softs * that fall within the range. 
From all of this evidence, make a single consensus edge */ struct range *r; struct lm *lm = lmInit(0); for (r = rangeTreeList(rangeTree); r != NULL; r = r->next) { struct mergedEdge *mergedEdge = r->val; struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm); if (edge != NULL) rbTreeAdd(edgeTree, edge); verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", edge->start->position, edge->end->position, r->start, r->end); } /* Clean up and go home. */ lmCleanup(&lm); removeUnusedVertices(vertexTree, edgeTree); slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }