struct txGraph *makeGraph(struct linkedBeds *lbList, int maxBleedOver, int maxUncheckedBleed, struct nibTwoCache *seqCache, double singleExonMaxOverlap, char *name) /* Create a graph corresponding to linkedBedsList. * The maxBleedOver parameter controls how much of a soft edge that * can be cut off when snapping to a hard edge. The singleExonMaxOverlap * controls what ratio of a single exon transcript can overlap spliced * transcripts */ { char *chromName = lbList->bedList->chrom; /* Create tree of all unique vertices. */ struct rbTree *vertexTree = makeVertexTree(lbList); verbose(2, "%d unique vertices\n", vertexTree->n); /* Create tree of all unique edges */ struct rbTree *edgeTree = makeEdgeTree(lbList, vertexTree); verbose(2, "%d unique edges\n", edgeTree->n); snapSoftToCloseHard(vertexTree, edgeTree, maxBleedOver, maxUncheckedBleed, seqCache, chromName); verbose(2, "%d edges, %d vertices after snapSoftToCloseHard\n", edgeTree->n, vertexTree->n); removeEmptyEdges(vertexTree, edgeTree); verbose(2, "%d edges, %d vertices after removeEmptyEdges\n", edgeTree->n, vertexTree->n); snapHalfHards(vertexTree, edgeTree); verbose(2, "%d edges, %d vertices after snapHalfHards\n", edgeTree->n, vertexTree->n); halfHardConsensuses(vertexTree, edgeTree); verbose(2, "%d edges, %d vertices after medianHalfHards\n", edgeTree->n, vertexTree->n); removeEnclosedDoubleSofts(vertexTree, edgeTree, maxBleedOver, singleExonMaxOverlap); verbose(2, "%d edges, %d vertices after mergeEnclosedDoubleSofts\n", edgeTree->n, vertexTree->n); mergeDoubleSofts(vertexTree, edgeTree); verbose(2, "%d edges, %d vertices after mergeDoubleSofts\n", edgeTree->n, vertexTree->n); struct txGraph *txg = treeTreeToTxg(vertexTree, edgeTree, name, lbList); /* Clean up and go home. */ rbTreeFree(&vertexTree); rbTreeFree(&edgeTree); return txg; }
void rbTreeFreeList(struct rbTree **pList)
/* Free up a list of rbTrees that are chained through their next fields.
 * On return *pList is set to NULL so the caller is not left holding a
 * pointer to freed memory. */
{
struct rbTree *tree, *next;
for (tree = *pList; tree != NULL; tree = next)
    {
    next = tree->next;      /* Save the link before the node is freed. */
    rbTreeFree(&tree);
    }
*pList = NULL;
}
static void visiSearcherFree(struct visiSearcher **pSearcher)
/* Release a visiSearcher: its match list, its tree, and then the
 * structure itself via freez.  Does nothing if *pSearcher is NULL. */
{
struct visiSearcher *vs = *pSearcher;
if (vs == NULL)
    return;
visiMatchFreeList(&vs->matchList);
rbTreeFree(&vs->tree);
freez(pSearcher);
}
void separateChrom(struct chrom *chrom, struct hash *infoHash, struct bed **retCoding, struct bed **retNearCoding, struct bed **retNearCodingJunk, struct bed **retAntisense, struct bed **retNoncoding) /* Separate bed list into four parts depending on whether or not * it's coding. */ { *retCoding = *retNearCoding = *retNearCodingJunk = *retAntisense = *retNoncoding = NULL; /* Make trees that cover coding on both strands. */ struct rbTree *plusCoding = codingTree(chrom->plusList); struct rbTree *minusCoding = codingTree(chrom->minusList); /* Split things up. */ separateStrand(chrom->plusList, infoHash, plusCoding, minusCoding, retCoding, retNearCoding, retNearCodingJunk, retAntisense, retNoncoding); separateStrand(chrom->minusList, infoHash, minusCoding, plusCoding, retCoding, retNearCoding, retNearCodingJunk, retAntisense, retNoncoding); /* Clean up and go home. */ rbTreeFree(&plusCoding); rbTreeFree(&minusCoding); }
void altFivePrime(struct txGraph *graph, struct range *exonsWithIntrons, FILE *f) /* Write out instances of alt 5' prime splice sites on plus strand * (and alt 3' splice sites on minus strand). */ { struct txEdge *e1, *e2; struct txVertex *v = graph->vertices; struct lm *lm = lmInit(0); struct rbTree *tree = rangeTreeNew(); struct range *range, *rangeList = NULL; for (e1 = graph->edgeList; e1 != NULL; e1 = e1->next) { if (e1->type == ggExon) { int e1Start = v[e1->startIx].position; int e1End = v[e1->endIx].position; boolean e1HardStart = (v[e1->startIx].type == ggHardStart); if (e1HardStart) { for (e2 = graph->edgeList; e2 != NULL; e2 = e2->next) { if (e2->type == ggExon) { int e2Start = v[e2->startIx].position; int e2End = v[e2->endIx].position; boolean e2HardStart = (v[e2->startIx].type == ggHardStart); if (e2HardStart && e1Start != e2Start && e1End == e2End) { int aStart = min(e1Start, e2Start); int aEnd = max(e1Start, e2Start); if (!inRangeList(exonsWithIntrons, e1Start, e1End) && !inRangeList(exonsWithIntrons, e2Start, e2End) && !inRangeList(rangeList, aStart, aEnd)) { lmAllocVar(lm, range); range->start = aStart; range->end = aEnd; slAddHead(&rangeList, range); fprintf(f, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName, aStart, aEnd, (graph->strand[0] == '-' ? "altFivePrime" : "altThreePrime"), graph->strand); } } } } } } } rbTreeFree(&tree); lmCleanup(&lm); }
void rbTest(int count)
/* Fill up an rbTree with count integer nodes (0 .. count-1), look every
 * one of them up ten times over, and then free the tree.  Aborts via
 * errAbort if any lookup fails. */
{
int i, j;
struct rbTree *tree = rbTreeNew(rbTreeCmpInt);
struct lm *lm = tree->lm;       /* Items are allocated from the tree's own lm. */

for (i=0; i<count; ++i)
    {
    int *pt;
    lmAllocVar(lm, pt);
    *pt = i;
    rbTreeAdd(tree, pt);
    }

for (j=0; j<10; ++j)
    {
    for (i=0; i<count; ++i)
        {
        if (!rbTreeFind(tree, &i))
            errAbort("Couldn't find %d", i);
        }
    }

rbTreeFree(&tree);
}
void netClass(char *inName, char *tDb, char *qDb, char *outName) /* netClass - Add classification info to net. */ { struct chainNet *net; struct lineFile *lf = lineFileOpen(inName, TRUE); FILE *f = mustOpen(outName, "w"); struct chrom *qChromList, *chrom; struct hash *qChromHash; struct hash *arHash = NULL; struct sqlConnection *tConn = sqlConnect(tDb); struct sqlConnection *qConn = sqlConnect(qDb); qLm = lmInit(0); if (!noAr) arHash = getAncientRepeats(tConn, qConn); getChroms(qConn, &qChromHash, &qChromList); verbose(1, "Reading gaps in %s\n", qDb); if (sqlTableExists(qConn, "gap")) { getSeqGapsUnsplit(qConn, qChromHash); } else { for (chrom = qChromList; chrom != NULL; chrom = chrom->next) chrom->nGaps = getSeqGaps(qConn, chrom->name); } if (qNewR) { verbose(1, "Reading new repeats from %s\n", qNewR); for (chrom = qChromList; chrom != NULL; chrom = chrom->next) chrom->newRepeats = getNewRepeats(qNewR, chrom->name); } verbose(1, "Reading simpleRepeats in %s\n", qDb); getTrfUnsplit(qConn, qChromHash); if (qRepeatTable) { verbose(1, "Reading repeats in %s from table %s\n", qDb, qRepeatTable); getRepeatsUnsplitTable(qConn, qChromHash, qRepeatTable); } else { verbose(1, "Reading repeats in %s\n", qDb); if (sqlTableExists(qConn, "rmsk")) getRepeatsUnsplit(qConn, qChromHash, arHash); else { for (chrom = qChromList; chrom != NULL; chrom = chrom->next) getRepeats(qConn, arHash, chrom->name, &chrom->repeats, &chrom->oldRepeats); } } while ((net = chainNetRead(lf)) != NULL) { struct rbTree *tN, *tRepeats, *tOldRepeats, *tTrf; char *tName = net->name; if (liftHashT != NULL) { struct liftSpec *lft = hashMustFindVal(liftHashT, net->name); tName = lft->newName; } verbose(1, "Processing %s.%s\n", tDb, net->name); tN = getSeqGaps(tConn, tName); tAddN(net, net->fillList, tN); rbTreeFree(&tN); qAddN(net, net->fillList, qChromHash); if (tRepeatTable) getRepeatsTable(tConn, tRepeatTable, tName, &tRepeats, &tOldRepeats); else getRepeats(tConn, arHash, tName, &tRepeats, &tOldRepeats); 
tAddR(net, net->fillList, tRepeats); if (!noAr) tAddOldR(net, net->fillList, tOldRepeats); rbTreeFree(&tRepeats); rbTreeFree(&tOldRepeats); qAddR(net, net->fillList, qChromHash); if (!noAr) qAddOldR(net, net->fillList, qChromHash); tTrf = getTrf(tConn, tName); tAddTrf(net, net->fillList, tTrf); rbTreeFree(&tTrf); qAddTrf(net, net->fillList, qChromHash); if (tNewR) { struct rbTree *tree = getNewRepeats(tNewR, tName); tAddNewR(net, net->fillList, tree); rbTreeFree(&tree); } if (qNewR) qAddNewR(net, net->fillList, qChromHash); chainNetWrite(net, f); chainNetFree(&net); } sqlDisconnect(&tConn); sqlDisconnect(&qConn); }
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new, broken-up bed that excludes the parts of the gene that lie
 * between CDS breaks.
 * Also jiggles the cds->end coordinate to cope with the sequence we remove:
 * cds->end is reduced by the size of every gap between consecutive CDS
 * blocks, so cds is modified as a side effect.
 * Deals with transcript to genome coordinate mapping including negative
 * strand.  Be afraid, be very afraid!
 * Returns a newly allocated bed in which only the fields assigned at the
 * bottom of this function (chrom, chromStart, chromEnd, name, score,
 * strand[0], blockCount, blockSizes, chromStarts) are filled in. */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're at it, shrink the outer CDS
 * since we are actually shrinking the transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;    /* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    /* A gap runs from the end of CDS block i to the start of block i+1. */
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates.  Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
/* Copy exons to new list, breaking them up if need be.  txStartPos and
 * txEndPos track the current exon's extent in transcript coordinates. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    /* Save next now; slAddHead below may put exon itself on newList. */
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
        verbose(3, "Splitting exon because of CDS gap\n");
        /* Make up exons from current position up to next gap.  This is a little
         * complicated by possibly the gap starting before the exon. */
        int exonStart = exon->start;
        int txStart = txStartPos;
        struct range *gap;
        for (gap = gapList; gap != NULL; gap = gap->next)
            {
            int txEnd = gap->start;
            /* Only the part of the gap from txStart to the exon end counts. */
            int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
            int exonSize = txEnd - txStart;
            if (exonSize > 0)
                {
                lmAllocVar(lm, newExon);
                newExon->start = exonStart;
                newExon->end = exonStart + exonSize;
                slAddHead(&newList, newExon);
                }
            else    /* This case happens if gap starts before exon */
                {
                exonSize = 0;
                }
            /* Update current position in both transcript and genome space. */
            exonStart += exonSize + gapSize;
            txStart += exonSize + gapSize;
            }
        /* Make up final exon from last gap to end, at least if we don't end in a gap. */
        if (exonStart < exon->end)
            {
            lmAllocVar(lm, newExon);
            newExon->start = exonStart;
            newExon->end = exon->end;
            slAddHead(&newList, newExon);
            }
        }
    else
        {
        /* Easy case where we don't intersect any gaps. */
        slAddHead(&newList, exon);
        }
    txStartPos= txEndPos;
    }
/* List was built with slAddHead, so put it back in forward order. */
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12 */
/* NOTE(review): newList is dereferenced without a NULL check, so this
 * assumes at least one exon piece survived the splitting - confirm. */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes, newBed->blockCount);
AllocArray(newBed->chromStarts, newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    /* NOTE(review): block starts are stored as exon->start unchanged, while
     * chromStart above is offset by newList->start; these presumably agree
     * only when the first exon piece starts at 0 - verify. */
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    /* Overwritten on every pass, so it ends up at the last exon's end. */
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home. */
rbTreeFree(&gapTree);
return newBed;
}
static void removeEnclosedDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree, int maxBleedOver, double singleExonMaxOverlap) /* Move double-softs that overlap spliced things to a very great extent into * the spliced things. Also remove tiny double-softs (no more than 2*maxBleedOver). */ { /* Traverse graph and build up range tree covering spliced exons. For each * range of overlapping exons, assemble a singly-linked list of all exons in * the range */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); int removedCount = 0; for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggHardStart || end->type == ggHardEnd) { rangeTreeAddValList(rangeTree, start->position, end->position, edge); } } /* Traverse graph yet one more time looking for doubly-soft exons * that are overlapping the spliced exons. */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { int s = start->position; int e = end->position; int size = e - s; if (size <= maxBleedOver+maxBleedOver) { /* Tiny case, just remove edge and forget it. */ verbose(3, "Removing tiny double-soft edge from %d to %d\n", s, e); rbTreeRemove(edgeTree, edge); ++removedCount; } else { /* Normal case, look for exon list that encloses us, and * if any single exon in that list encloses us, merge into it. */ int splicedOverlap = rangeTreeOverlapSize(rangeTree, s, e); if (splicedOverlap > 0 && splicedOverlap > singleExonMaxOverlap*size) { if (!trustedEdge(edge)) { /* Once we find a range that overlaps the doubly-soft edge, find * (half-hard or better) edge from that range that encloses the * doubly soft edge. 
*/ struct range *r = rangeTreeMaxOverlapping(rangeTree, s, e); struct edge *nextEdge, *edgeList = r->val; struct edge *enclosingEdge = NULL; for (nextEdge = edgeList; edgeList != NULL; edgeList = edgeList->next) { if (encloses(nextEdge, edge)) { enclosingEdge = nextEdge; } } if (enclosingEdge != NULL) { enclosingEdge->evList = slCat(enclosingEdge->evList, edge->evList); edge->evList = NULL; verbose(3, "Removing doubly-soft edge %d-%d, reassigning to %d-%d\n", s, e, enclosingEdge->start->position, enclosingEdge->end->position); rbTreeRemove(edgeTree, edge); ++removedCount; } } } } } } /* Clean up and go home. */ if (removedCount > 0) removeUnusedVertices(vertexTree, edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *nextEdge, *edge = edgeRef->val; while (edge != NULL) { nextEdge = edge->next; edge->next = NULL; edge = nextEdge; } } slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree) /* Merge together overlapping edges with soft ends. */ { struct mergedEdge /* Hold together info on a merged edge. */ { struct evidence *evidence; }; /* Traverse graph and build up range tree. Each node in the range tree * will represent the bounds of coordinates of overlapping double softs */ struct rbTree *rangeTree = rangeTreeNew(0); struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree); for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start = edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) rangeTreeAdd(rangeTree, start->position, end->position); } /* Traverse graph again merging edges */ for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next) { struct edge *edge = edgeRef->val; struct vertex *start= edge->start; struct vertex *end = edge->end; if (start->type == ggSoftStart && end->type == ggSoftEnd) { struct range *r = rangeTreeFindEnclosing(rangeTree, start->position, end->position); assert(r != NULL); /* At this point, r represents the bounds of a double-soft * region that encompasses this edge. Collect the set of * evidence of edges overlapping this range */ struct mergedEdge *mergeEdge = r->val; if (mergeEdge == NULL) { lmAllocVar(rangeTree->lm, mergeEdge); r->val = mergeEdge; } mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence); verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", start->position, end->position, r->start, r->end); edge->evList = NULL; rbTreeRemove(edgeTree, edge); } } /* Traverse merged edge list, making a single edge from each range. At this point, * each range will have some evidence attached to it, from each of the double softs * that fall within the range. 
From all of this evidence, make a single consensus edge */ struct range *r; struct lm *lm = lmInit(0); for (r = rangeTreeList(rangeTree); r != NULL; r = r->next) { struct mergedEdge *mergedEdge = r->val; struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm); if (edge != NULL) rbTreeAdd(edgeTree, edge); verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", edge->start->position, edge->end->position, r->start, r->end); } /* Clean up and go home. */ lmCleanup(&lm); removeUnusedVertices(vertexTree, edgeTree); slFreeList(&edgeRefList); rbTreeFree(&rangeTree); }