Пример #1
0
struct txGraph *makeGraph(struct linkedBeds *lbList, int maxBleedOver, 
	int maxUncheckedBleed, struct nibTwoCache *seqCache,
	double singleExonMaxOverlap, char *name)
/* Create a graph corresponding to lbList.
 * The maxBleedOver parameter controls how much of a soft edge that
 * can be cut off when snapping to a hard edge.  The singleExonMaxOverlap
 * controls what ratio of a single exon transcript can overlap spliced 
 * transcripts.  maxUncheckedBleed and seqCache are only passed through to
 * snapSoftToCloseHard, and name is only passed through to treeTreeToTxg.
 * lbList and lbList->bedList are dereferenced unconditionally, so both must
 * be non-NULL.  Returns the graph built by treeTreeToTxg; the intermediate
 * vertex and edge trees are freed before returning. */
{
char *chromName = lbList->bedList->chrom;

/* Create tree of all unique vertices. */
struct rbTree *vertexTree = makeVertexTree(lbList);
verbose(2, "%d unique vertices\n", vertexTree->n);

/* Create tree of all unique edges */
struct rbTree *edgeTree = makeEdgeTree(lbList, vertexTree);
verbose(2, "%d unique edges\n", edgeTree->n);

/* Run the clean-up passes in order.  After each pass report the tree
 * sizes, labelled with the name of the pass that just ran. */
snapSoftToCloseHard(vertexTree, edgeTree, maxBleedOver, maxUncheckedBleed, seqCache, chromName);
verbose(2, "%d edges, %d vertices after snapSoftToCloseHard\n", 
	edgeTree->n, vertexTree->n);

removeEmptyEdges(vertexTree, edgeTree);
verbose(2, "%d edges, %d vertices after removeEmptyEdges\n",
	edgeTree->n, vertexTree->n);

snapHalfHards(vertexTree, edgeTree);
verbose(2, "%d edges, %d vertices after snapHalfHards\n", 
	edgeTree->n, vertexTree->n);

halfHardConsensuses(vertexTree, edgeTree);
verbose(2, "%d edges, %d vertices after halfHardConsensuses\n", 
	edgeTree->n, vertexTree->n);

removeEnclosedDoubleSofts(vertexTree, edgeTree, maxBleedOver, singleExonMaxOverlap);
verbose(2, "%d edges, %d vertices after removeEnclosedDoubleSofts\n",
	edgeTree->n, vertexTree->n);

mergeDoubleSofts(vertexTree, edgeTree);
verbose(2, "%d edges, %d vertices after mergeDoubleSofts\n",
	edgeTree->n, vertexTree->n);

/* Convert the two trees into the final graph structure. */
struct txGraph *txg = treeTreeToTxg(vertexTree, edgeTree, name, lbList);

/* Clean up and go home. */
rbTreeFree(&vertexTree);
rbTreeFree(&edgeTree);
return txg;
}
Пример #2
0
void rbTreeFreeList(struct rbTree **pList)
/* Free up a list of rbTrees linked through their next field.
 * On return *pList is NULL, so the caller is not left holding a
 * pointer to trees that have already been freed. */
{
struct rbTree *tree, *next;
for (tree = *pList; tree != NULL; tree = next)
    {
    next = tree->next;	/* Save the link before the tree goes away. */
    rbTreeFree(&tree);
    }
*pList = NULL;
}
static void visiSearcherFree(struct visiSearcher **pSearcher)
/* Release the match list and the tree owned by *pSearcher, then hand
 * pSearcher itself to freez.  Does nothing when *pSearcher is NULL. */
{
struct visiSearcher *doomed = *pSearcher;
if (doomed == NULL)
    return;
visiMatchFreeList(&doomed->matchList);
rbTreeFree(&doomed->tree);
freez(pSearcher);
}
Пример #4
0
void separateChrom(struct chrom *chrom, struct hash *infoHash,
	struct bed **retCoding, struct bed **retNearCoding, struct bed **retNearCodingJunk,
	struct bed **retAntisense, struct bed **retNoncoding)
/* Run separateStrand over the plus and minus strand lists of chrom,
 * passing along the five output lists (coding, near-coding, near-coding
 * junk, antisense and noncoding).  All five lists are reset to NULL
 * before anything is added to them. */
{
struct rbTree *plusTree, *minusTree;

/* Start with every output list empty. */
*retNoncoding = NULL;
*retAntisense = NULL;
*retNearCodingJunk = NULL;
*retNearCoding = NULL;
*retCoding = NULL;

/* Build one coding tree per strand. */
plusTree = codingTree(chrom->plusList);
minusTree = codingTree(chrom->minusList);

/* Each strand's list is processed with its own tree given first and
 * the opposite strand's tree given second. */
separateStrand(chrom->plusList, infoHash, plusTree, minusTree,
	retCoding, retNearCoding, retNearCodingJunk, retAntisense, retNoncoding);
separateStrand(chrom->minusList, infoHash, minusTree, plusTree,
	retCoding, retNearCoding, retNearCodingJunk, retAntisense, retNoncoding);

/* The trees are only needed while separating. */
rbTreeFree(&plusTree);
rbTreeFree(&minusTree);
}
Пример #5
0
void altFivePrime(struct txGraph *graph, struct range *exonsWithIntrons, FILE *f)
/* Write to f one tab-separated line for every pair of exon edges in graph
 * that both begin at a hard start vertex, end at the same position, and
 * begin at different positions.  The line spans the interval between the
 * two start positions and is named "altThreePrime", or "altFivePrime" when
 * graph->strand begins with '-'.
 * A pair is skipped when inRangeList reports either exon's (start,end) in
 * exonsWithIntrons, or reports the interval among those already written
 * by this call.
 * NOTE(review): the function name, and the header comment this replaces
 * ("alt 5' on plus strand, alt 3' on minus strand"), read as the opposite
 * of the labels actually printed -- confirm which is intended. */
{
    struct txEdge *e1, *e2;
    struct txVertex *v = graph->vertices;
    struct lm *lm = lmInit(0);    /* Backs the entries of rangeList. */
    struct range *range, *rangeList = NULL;
    for (e1 = graph->edgeList; e1 != NULL; e1 = e1->next)
    {
        if (e1->type == ggExon)
        {
            int e1Start = v[e1->startIx].position;
            int e1End = v[e1->endIx].position;
            boolean  e1HardStart = (v[e1->startIx].type == ggHardStart);
            if (e1HardStart)
            {
                /* Compare against every exon edge, including ones earlier
                 * in the list; rangeList keeps mirrored pairs from being
                 * reported twice. */
                for (e2 = graph->edgeList; e2 != NULL; e2 = e2->next)
                {
                    if (e2->type == ggExon)
                    {
                        int e2Start = v[e2->startIx].position;
                        int e2End = v[e2->endIx].position;
                        boolean  e2HardStart = (v[e2->startIx].type == ggHardStart);
                        if (e2HardStart && e1Start != e2Start && e1End == e2End)
                        {
                            /* Region that differs between the two exons. */
                            int aStart = min(e1Start, e2Start);
                            int aEnd = max(e1Start, e2Start);
                            if (!inRangeList(exonsWithIntrons, e1Start, e1End)
                                    && !inRangeList(exonsWithIntrons, e2Start, e2End)
                                    && !inRangeList(rangeList, aStart, aEnd))
                            {
                                /* Remember the interval so it is only written once. */
                                lmAllocVar(lm, range);
                                range->start = aStart;
                                range->end = aEnd;
                                slAddHead(&rangeList, range);
                                fprintf(f, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName,
                                        aStart, aEnd,
                                        (graph->strand[0] == '-' ? "altFivePrime" : "altThreePrime"),
                                        graph->strand);
                            }
                        }
                    }
                }
            }
        }
    }
    lmCleanup(&lm);
}
Пример #6
0
void rbTest(int count)
/* Exercise an rbTree: insert the integers 0 through count-1, look every
 * one of them up in ten separate passes, then free the tree.  The first
 * lookup that fails is reported through errAbort. */
{
    struct rbTree *tree = rbTreeNew(rbTreeCmpInt);
    struct lm *pool = tree->lm;
    int val, pass;

    /* Keys are allocated out of the tree's own lm. */
    for (val = 0; val < count; val++)
    {
        int *key;
        lmAllocVar(pool, key);
        *key = val;
        rbTreeAdd(tree, key);
    }

    /* Ten full lookup passes over every key. */
    for (pass = 0; pass < 10; pass++)
    {
        val = 0;
        while (val < count)
        {
            if (!rbTreeFind(tree, &val))
                errAbort("Couldnt' find %d", val);
            val += 1;
        }
    }

    rbTreeFree(&tree);
}
Пример #7
0
void netClass(char *inName, char *tDb, char *qDb, char *outName)
/* netClass - Add classification info to net.
 * Reads chain nets one at a time from inName, annotates each net's fill
 * list through the tAdd.../qAdd... helpers, and writes the result to
 * outName.  tDb and qDb are the database names given to sqlConnect and
 * echoed in the progress messages.
 * Several names used here are not declared in this function (qLm, noAr,
 * qNewR, tNewR, qRepeatTable, tRepeatTable, liftHashT) -- presumably
 * file-level option variables; verify against the rest of the file.
 * NOTE(review): lf and f are never closed and qLm is never cleaned up in
 * this function -- confirm that this is deliberately left to process exit. */
{
struct chainNet *net;
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
struct chrom *qChromList, *chrom;
struct hash *qChromHash;
struct hash *arHash = NULL;
struct sqlConnection *tConn = sqlConnect(tDb);
struct sqlConnection *qConn = sqlConnect(qDb);

qLm = lmInit(0);

/* arHash stays NULL when noAr is set. */
if (!noAr)
    arHash = getAncientRepeats(tConn, qConn);

getChroms(qConn, &qChromHash, &qChromList);

/* Load q-side gaps: in one call when a "gap" table exists, otherwise
 * one chromosome at a time. */
verbose(1, "Reading gaps in %s\n", qDb);
if (sqlTableExists(qConn, "gap"))
    {
    getSeqGapsUnsplit(qConn, qChromHash);
    }
else
    {
    for (chrom = qChromList; chrom != NULL; chrom = chrom->next)
	chrom->nGaps = getSeqGaps(qConn, chrom->name);
    }

/* Optional q-side new repeats, read per chromosome from qNewR. */
if (qNewR)
    {
    verbose(1, "Reading new repeats from %s\n", qNewR);
    for (chrom = qChromList; chrom != NULL; chrom = chrom->next)
        chrom->newRepeats = getNewRepeats(qNewR, chrom->name);
    }

verbose(1, "Reading simpleRepeats in %s\n", qDb);
getTrfUnsplit(qConn, qChromHash);

/* Load q-side repeats: from qRepeatTable when one is given, else from
 * "rmsk" when that table exists, else one chromosome at a time. */
if (qRepeatTable)
    {
    verbose(1, "Reading repeats in %s from table %s\n", qDb, qRepeatTable);
    getRepeatsUnsplitTable(qConn, qChromHash, qRepeatTable);
    }
else
    {
    verbose(1, "Reading repeats in %s\n", qDb);
    if (sqlTableExists(qConn, "rmsk"))
	getRepeatsUnsplit(qConn, qChromHash, arHash);
    else
	{
	for (chrom = qChromList; chrom != NULL; chrom = chrom->next)
	    getRepeats(qConn, arHash, chrom->name, &chrom->repeats,
		       &chrom->oldRepeats);
	}
    }

/* Main loop: one net per iteration.  The t-side trees (tN, tRepeats,
 * tOldRepeats, tTrf, tree) are built for this net, applied to its fill
 * list and freed again; the q-side helpers are given qChromHash instead. */
while ((net = chainNetRead(lf)) != NULL)
    {
    struct rbTree *tN, *tRepeats, *tOldRepeats, *tTrf;
    char *tName = net->name;
    /* When liftHashT is set, look the net's name up there and use the
     * entry's newName for the t-side queries below. */
    if (liftHashT != NULL)
	{
	struct liftSpec *lft = hashMustFindVal(liftHashT, net->name);
	tName = lft->newName;
	}

    verbose(1, "Processing %s.%s\n", tDb, net->name);
    tN = getSeqGaps(tConn, tName);
    tAddN(net, net->fillList, tN);
    rbTreeFree(&tN);
    qAddN(net, net->fillList, qChromHash);

    /* tRepeats and tOldRepeats are filled in through the pointers passed
     * to getRepeatsTable/getRepeats.  Old repeats are only applied when
     * noAr is not set, but both trees are freed either way. */
    if (tRepeatTable)
	getRepeatsTable(tConn, tRepeatTable, tName, &tRepeats, &tOldRepeats);
    else
	getRepeats(tConn, arHash, tName, &tRepeats, &tOldRepeats);
    tAddR(net, net->fillList, tRepeats);
    if (!noAr)
	tAddOldR(net, net->fillList, tOldRepeats);
    rbTreeFree(&tRepeats);
    rbTreeFree(&tOldRepeats);
    qAddR(net, net->fillList, qChromHash);
    if (!noAr)
	qAddOldR(net, net->fillList, qChromHash);

    tTrf = getTrf(tConn, tName);
    tAddTrf(net, net->fillList, tTrf);
    rbTreeFree(&tTrf);
    qAddTrf(net, net->fillList, qChromHash);

    /* New-repeat annotation is optional on both sides. */
    if (tNewR)
        {
	struct rbTree *tree = getNewRepeats(tNewR, tName);
	tAddNewR(net, net->fillList, tree);
	rbTreeFree(&tree);
	}
    if (qNewR)
        qAddNewR(net, net->fillList, qChromHash);
    chainNetWrite(net, f);
    chainNetFree(&net);
    }
sqlDisconnect(&tConn);
sqlDisconnect(&qConn);
}
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up bed that excludes part of gene between CDS breaks.
 * Also jiggles cds->end coordinate to cope with the sequence we remove.
 * Deals with transcript to genome coordinate mapping including negative
 * strand.  Be afraid, be very afraid!
 * The input bed is only read; cds->end is modified in place.  The returned
 * bed is newly allocated with AllocVar/AllocArray/cloneString, and only
 * chrom, chromStart, chromEnd, name, score, strand[0], blockCount,
 * blockSizes and chromStarts are assigned here.
 * NOTE(review): newList is dereferenced without a NULL check when the new
 * bed is filled in, so this assumes at least one exon survives -- confirm. */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're at it shrink outer CDS
 * since we are actually shrinking transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;	/* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    /* The gap runs from the end of CDS block i to the start of block i+1. */
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be.
 * The list is allocated out of lm, i.e. out of gapTree's memory. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates. Check if exon needs breaking up, and if
 * so do so, as we copy it to new list.  txStartPos/txEndPos track the
 * current exon's span as a running sum of exon sizes. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    /* Save the link now: in the easy case below the exon itself is moved
     * onto newList, which overwrites exon->next. */
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
	verbose(3, "Splitting exon because of CDS gap\n");

	/* Make up exons from current position up to next gap.  This is a little
	 * complicated by possibly the gap starting before the exon.
	 * exonStart advances in the exon list's coordinates and txStart in
	 * transcript coordinates, in lock step. */
	int exonStart = exon->start;
	int txStart = txStartPos;
	struct range *gap;
	for (gap = gapList; gap != NULL; gap = gap->next)
	    {
	    int txEnd = gap->start;
	    /* Presumably the size of the part of the gap that lies between
	     * txStart and the end of this exon -- verify rangeIntersection. */
	    int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
	    int exonSize = txEnd - txStart;
	    if (exonSize > 0)
		{
		lmAllocVar(lm, newExon);
		newExon->start = exonStart;
		newExon->end = exonStart + exonSize;
		slAddHead(&newList, newExon);
		}
	    else /* This case happens if gap starts before exon */
	        {
		/* Nothing to emit; clamp so the position updates below only
		 * skip over the gap. */
		exonSize = 0;
		}

	    /* Update current position in both transcript and genome space. */
	    exonStart += exonSize + gapSize;
	    txStart += exonSize + gapSize;
	    }

	/* Make up final exon from last gap to end, at least if we don't end in a gap. */
	if (exonStart < exon->end)
	    {
	    lmAllocVar(lm, newExon);
	    newExon->start = exonStart;
	    newExon->end = exon->end;
	    slAddHead(&newList, newExon);
	    }
	}
    else
        {
	/* Easy case where we don't intersect any gaps. */
	slAddHead(&newList, exon);
	}
    txStartPos= txEndPos;
    }
/* newList was built head-first, so put it back in exon order. */
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12.  Exon coordinates get bed->chromStart added to
 * form chromStart/chromEnd, but are stored as-is in chromStarts. */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name  = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes,  newBed->blockCount);
AllocArray(newBed->chromStarts,  newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    /* Overwritten on every pass, so chromEnd ends up at the last exon's end. */
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home.  Freeing gapTree is the only release done here for
 * the lm-allocated exon ranges, which came out of gapTree->lm. */
rbTreeFree(&gapTree);
return newBed;
}
Пример #9
0
static void removeEnclosedDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree, 
	int maxBleedOver, double singleExonMaxOverlap)
/* Move double-softs that overlap spliced things to a very great extent into
 * the spliced things. Also remove tiny double-softs (no more than 2*maxBleedOver). */
{
/* Traverse graph and build up range tree covering spliced exons.  For each 
 * range of overlapping exons, assemble a singly-linked list of all exons in 
 * the range */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
int removedCount = 0;
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggHardStart || end->type == ggHardEnd)
	{
	rangeTreeAddValList(rangeTree, start->position, end->position, edge);
	}
    }

/* Traverse graph yet one more time looking for doubly-soft exons
 * that are overlapping the spliced exons. */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        {
	int s = start->position;
	int e = end->position;
	int size = e - s;
	if (size <= maxBleedOver+maxBleedOver)
	     {
	     /* Tiny case, just remove edge and forget it. */
	     verbose(3, "Removing tiny double-soft edge from %d to %d\n", s, e);
	     rbTreeRemove(edgeTree, edge);
	     ++removedCount;
	     }
	else
	     {
	     /* Normal case, look for exon list that encloses us, and
	      * if any single exon in that list encloses us, merge into it. */
	     int splicedOverlap = rangeTreeOverlapSize(rangeTree, s, e);
	     if (splicedOverlap > 0 && splicedOverlap > singleExonMaxOverlap*size)
	         {
		 if (!trustedEdge(edge))
		     {
		     /* Once we find a range that overlaps the doubly-soft edge, find 
		      * (half-hard or better) edge from that range that encloses the 
		      * doubly soft edge. */
		     struct range *r = rangeTreeMaxOverlapping(rangeTree, s, e);
		     struct edge *nextEdge, *edgeList = r->val;
		     struct edge *enclosingEdge = NULL;
		     for (nextEdge = edgeList; edgeList != NULL; edgeList = edgeList->next)
			 {
			 if (encloses(nextEdge, edge))
			     {
			     enclosingEdge = nextEdge;
			     }
			 }
		     if (enclosingEdge != NULL) 
			 {
			 enclosingEdge->evList = slCat(enclosingEdge->evList, edge->evList);
			 edge->evList = NULL;
			 verbose(3, "Removing doubly-soft edge %d-%d, reassigning to %d-%d\n",
				 s, e, enclosingEdge->start->position, 
				 enclosingEdge->end->position);
			 rbTreeRemove(edgeTree, edge);
			 ++removedCount;
			 }
		     }
		 }
	     }
	}
    }

/* Clean up and go home. */
if (removedCount > 0)
    removeUnusedVertices(vertexTree, edgeTree);
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *nextEdge, *edge = edgeRef->val;
    while (edge != NULL) 
	{
	nextEdge = edge->next;
	edge->next = NULL;
	edge = nextEdge;
	}
    }
slFreeList(&edgeRefList);
rbTreeFree(&rangeTree);
}
Пример #10
0
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree)
/* Merge together overlapping edges with soft ends.
 * Every edge with a soft start and a soft end is removed from edgeTree and
 * its evidence pooled per range of overlapping double-softs.  For each range
 * edgeFromConsensusOfEvidence is asked for a single replacement edge, which
 * is added to edgeTree when one is returned.  removeUnusedVertices is run
 * afterwards. */
{
struct mergedEdge
/* Hold together info on a merged edge. */
    {
    struct evidence *evidence;
    };

/* Traverse graph and build up range tree.  Each node in the range tree
 * will represent the bounds of coordinates of overlapping double softs */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        rangeTreeAdd(rangeTree, start->position, end->position);
    }

/* Traverse graph again merging edges */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start= edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        {
	struct range *r = rangeTreeFindEnclosing(rangeTree,
		start->position, end->position);
	assert(r != NULL);
	/* At this point, r represents the bounds of a double-soft
	 * region that encompasses this edge.  Collect the set of
	 * evidence of edges overlapping this range.  The holder is created
	 * the first time a range is visited. */
        struct mergedEdge *mergedEdge = r->val;
        if (mergedEdge == NULL)
            {
            lmAllocVar(rangeTree->lm, mergedEdge);
            r->val = mergedEdge;
            }
        mergedEdge->evidence = slCat(edge->evList, mergedEdge->evidence);
	verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", 
		start->position, end->position, r->start, r->end);
        edge->evList = NULL;
        rbTreeRemove(edgeTree, edge);
	}
    }

/* Traverse merged edge list, making a single edge from each range. At this point,
 * each range will have some evidence attached to it, from each of the double softs
 * that fall within the range.  From all of this evidence, make a single consensus edge */
struct range *r;
struct lm *lm = lmInit(0);
for (r = rangeTreeList(rangeTree); r != NULL; r = r->next)
    {
    struct mergedEdge *mergedEdge = r->val;
    struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm);
    /* No edge may come back; only touch it (including for the log message,
     * whose arguments are evaluated regardless of verbosity) when it exists. */
    if (edge != NULL)
	{
        rbTreeAdd(edgeTree, edge);
	verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", 
		edge->start->position, edge->end->position, r->start, r->end);
	}
    }


/* Clean up and go home. */
lmCleanup(&lm);
removeUnusedVertices(vertexTree, edgeTree);
slFreeList(&edgeRefList);
rbTreeFree(&rangeTree);
}