void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f)
/* Output one Ra record to file. */
{
    fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end);
    fprintf(f, "txName %s\n", seq->name);
    fprintf(f, "txSize %d\n", seq->size);
    fprintf(f, "cdsStart %d\n", start);
    fprintf(f, "cdsEnd %d\n", end);
    fprintf(f, "cdsSize %d\n", end-start);
    fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start));
    fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3));
    boolean gotKozak1 = FALSE;
    if (start >= 3)
    {
        char c = seq->dna[start-3];
        gotKozak1 = (c == 'a' || c == 'g');
    }
    fprintf(f, "gotKozak1 %d\n", gotKozak1);
    boolean gotKozak2 = FALSE;
    if (start+3 < seq->size)
        gotKozak2 = (seq->dna[start+3] == 'g');
    fprintf(f, "gotKozak2 %d\n", gotKozak2);
    fprintf(f, "gotKozak %d\n", gotKozak1 + gotKozak2);

    /* Count up upstream ATG and Kozak */
    struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew();
    int upAtg = 0, upKozak = 0;
    int i;
    for (i=0; i<start; ++i)
    {
        if (startsWith("atg", seq->dna + i))
        {
            int orfEnd = findOrfEnd(seq, i);
            if (orfEnd < start)
                rangeTreeAdd(upAtgRanges, i, orfEnd);
            ++upAtg;
            if (isKozak(seq->dna, seq->size, i))
            {
                ++upKozak;
                if (orfEnd < start)
                    rangeTreeAdd(upKozakRanges, i, orfEnd);
            }
        }
    }
    fprintf(f, "upstreamAtgCount %d\n", upAtg);
    fprintf(f, "upstreamKozakCount %d\n", upKozak);
    fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start));
    fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start));
    fprintf(f, "\n");

    /* Cluen up and go home. */
    rangeTreeFree(&upAtgRanges);
    rangeTreeFree(&upKozakRanges);
}
예제 #2
0
void constExons(struct txGraph *graph, FILE *f)
/* Write out constituitive exons. */
{
    /* Create a tree with all introns. */
    struct rbTree *tree = rangeTreeNew();
    struct txEdge *edge;
    for (edge = graph->edgeList; edge != NULL; edge = edge->next)
    {
        if (edge->type == ggIntron)
        {
            rangeTreeAdd(tree, graph->vertices[edge->startIx].position,
                         graph->vertices[edge->endIx].position);
        }
    }

    /* Scan through all exons looking for ones that don't intersect
     * introns. */
    int eId = 0;
    for (edge = graph->edgeList; edge != NULL; edge = edge->next)
    {
        if (edge->type == ggExon)
        {
            struct txVertex *s = &graph->vertices[edge->startIx];
            struct txVertex *e = &graph->vertices[edge->endIx];
            if (s->type == ggHardStart && e->type == ggHardEnd)
            {
                int start = s->position;
                int end = e->position;
                if (!rangeTreeOverlaps(tree, start, end))
                {
                    char *refSource = refSourceAcc(graph, edge);
                    if (refSource != NULL && edge->evCount >= 10)
                    {
                        /* Do one more scan making sure that it doesn't
                         * intersect any exons except for us. */
                        boolean anyOtherExon = FALSE;
                        struct txEdge *ed;
                        for (ed = graph->edgeList; ed != NULL; ed = ed->next)
                        {
                            if (ed != edge)
                            {
                                int edStart = graph->vertices[ed->startIx].position;
                                int edEnd = graph->vertices[ed->endIx].position;
                                if (rangeIntersection(edStart, edEnd, start, end) > 0)
                                {
                                    anyOtherExon = TRUE;
                                    break;
                                }
                            }
                        }
                        if (!anyOtherExon)
                            fprintf(f, "%s\t%d\t%d\t%s.%d\t0\t%s\n",
                                    graph->tName, start, end, refSource, ++eId, graph->strand);
                    }
                }
            }
        }
    }
    rangeTreeFree(&tree);
}
void sortAndMergeBlocks(struct bed *oneBed)
/* construct a range tree out of blocks, then remake blocks arrays */
/* this should automatically merge everything. */
{
struct rbTree *rt = rangeTreeNew();
struct range *rangeList = NULL;
struct range *oneRange;
int i;
for (i = 0; i < oneBed->blockCount; i++)
    {
    rangeTreeAdd(rt, oneBed->chromStarts[i], oneBed->chromStarts[i] + oneBed->blockSizes[i]);
    /* delete old (bad) blocks in bed */
    oneBed->chromStarts[i] = oneBed->blockSizes[i] = 0;
    }
rangeList = rangeTreeList(rt);
oneBed->blockCount = slCount(rangeList);
/* remake bed blocks out of range tree */
i = 0;
for (oneRange = rangeList; oneRange != NULL; oneRange = oneRange->next)
    {
    oneBed->chromStarts[i] = oneRange->start;
    oneBed->blockSizes[i] = oneRange->end - oneRange->start;
    i++;
    }
rangeTreeFree(&rt);
}
예제 #4
0
void outputProt(struct protInfo *prot, struct hash *seedToScop, FILE *f, FILE *fKnownTo)
/* Callapse together all features of same type that overlap and output */
{
slSort(&prot->featureList, protFeatureCmpName);
struct protFeature *startFeature, *endFeature;
for (startFeature = prot->featureList; startFeature != NULL; startFeature = endFeature)
    {
    for (endFeature = startFeature->next; endFeature != NULL; endFeature = endFeature->next)
        if (!sameString(startFeature->name, endFeature->name))
	    break;
    struct rbTree *rangeTree = rangeTreeNew();
    struct protFeature *feature;
    for (feature = startFeature; feature != endFeature; feature = feature->next)
	rangeTreeAdd(rangeTree, feature->start, feature->end);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
	{
	feature = highestScoringFeature(startFeature, endFeature, range->start, range->end);
        fprintf(f, "%s\t%d\t%d\t%s\n", prot->name, range->start, range->end, startFeature->name);
	fprintf(fKnownTo, "%s\t%s\t%d\t%d\t%g\n",
		prot->name, (char *)hashMustFindVal(seedToScop, startFeature->name),
		range->start, range->end, feature->eVal);
	}

    rangeTreeFree(&rangeTree);
    }
}
예제 #5
0
파일: bigBed.c 프로젝트: cestmoi7/AGAPE
static struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi,
	char *chrom, bits32 start, bits32 end, struct lm *lm)
/* Return intervals where the val is the depth of coverage. */
{
/* Get list of overlapping intervals */
struct bigBedInterval *bi, *biList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
if (biList == NULL)
    return NULL;

/* Make a range tree that collects coverage. */
struct rbTree *rangeTree = rangeTreeNew();
for (bi = biList; bi != NULL; bi = bi->next)
    rangeTreeAddToCoverageDepth(rangeTree, bi->start, bi->end);
struct range *range, *rangeList = rangeTreeList(rangeTree);

/* Convert rangeList to bbiInterval list. */
struct bbiInterval *bwi, *bwiList = NULL;
for (range = rangeList; range != NULL; range = range->next)
    {
    lmAllocVar(lm, bwi);
    bwi->start = range->start;
    if (bwi->start < start)
       bwi->start = start;
    bwi->end = range->end;
    if (bwi->end > end)
       bwi->end = end;
    bwi->val = ptToInt(range->val);
    slAddHead(&bwiList, bwi);
    }
slReverse(&bwiList);

/* Clean up and go home. */
rangeTreeFree(&rangeTree);
return bwiList;
}
예제 #6
0
void getRange(char *regionFile){
                FILE *fp;
        char buf[500],chr[50];
        char str[2][500];
        int i, b, e;
        struct hashEl *el;
        struct rbTree *tr;
        fp = mustOpen(regionFile, "r");
        aliHash = newHash(8);
        while (fgets(buf, 500, fp)) {
		if (sscanf(buf, "%[^\t]\t%[^\t]\t%*s", str[0], str[1]) != 2)
                        errAbort("error: %s", buf);
		for (i = 0; i < 2; i++) {
                        if (sscanf(str[i], "%[^:]:%d-%d", chr, &b, &e) != 3)
                                errAbort("error: %s", str[i]);
     //   aliHash = newHash(8);
                	el = hashLookup(aliHash, chr);
                	if (el == NULL) {
                                tr = rangeTreeNew();
                                hashAdd(aliHash, chr, tr);
                        }
                	else
                        	tr = (struct rbTree *)(el->val);
	                rangeTreeAdd(tr, b, e);
        	}
	}
//      printf("range\n");
}
예제 #7
0
void freen(char *chrom)
/* Test something */
{
uglyTime(NULL);
struct sqlConnection *conn = sqlConnect("hg19");
uglyTime("connect");
char query[512];
sqlSafef(query, sizeof(query), "select * from knownGene where chrom='%s'", chrom);
struct sqlResult *sr = sqlGetResult(conn, query);
uglyTime("get result");
char **row;
struct rbTree *rt = rangeTreeNew();
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *gp = genePredLoad(row);
    int i;
    int exonCount = gp->exonCount;
    for (i=0; i<exonCount; ++i)
        rangeTreeAdd(rt, gp->exonStarts[i], gp->exonEnds[i]);
    }
uglyTime("Add rows");
struct range *list = rangeTreeList(rt);
uglyTime("Did list");
uglyf("%d items in chrom %s\n", slCount(list), chrom);
}
예제 #8
0
int pslBedOverlap(struct psl *psl, struct bed *bed)
/* Return number of bases psl and bed overlap at the block level */
{
/* No overlap if on wrong chromosome or wrong strand. */
if (psl->strand[0] != bed->strand[0])
    return 0;
if (!sameString(psl->tName, bed->chrom)) 
    return 0;

/* Build up range tree covering bed */
struct rbTree *rangeTree = rangeTreeNew();
int i;
for (i=0; i<bed->blockCount; ++i)
    {
    int start = bed->chromStart + bed->chromStarts[i];
    int end = start + bed->blockSizes[i];
    rangeTreeAdd(rangeTree, start, end);
    }

/* Loop through psl accumulating total overlap. */
int totalOverlap = 0;
for (i=0; i<psl->blockCount; ++i)
    {
    int start = psl->tStarts[i];
    int end = start + psl->blockSizes[i];
    totalOverlap += rangeTreeOverlapSize(rangeTree, start, end);
    }

/* Clean up and return result. */
rangeTreeFree(&rangeTree);
return totalOverlap;
}
예제 #9
0
void printBiggestGap(char *database, struct sqlConnection *conn, 
	struct slName *chromList, struct hash *chromHash, char *track)
/* Look up track in database, figure out which type it is, call
 * appropriate biggest gap finder, and then print result. */
{
struct trackDb *tdb = hTrackInfo(conn, track);
struct hTableInfo *hti = hFindTableInfo(database, chromList->name, tdb->table);
char *typeWord = cloneFirstWord(tdb->type);
boolean isBig = FALSE, isBigBed = FALSE;
struct bbiFile *bbi = NULL;
if (sameString(typeWord, "bigBed"))
    {
    isBig = TRUE;
    isBigBed = TRUE;
    bbi = bigBedFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) );
    }
else if (sameString(typeWord, "bigWig"))
    {
    isBig = TRUE;
    bbi = bigWigFileOpen( bbiNameFromSettingOrTable(tdb, conn, tdb->table) );
    }
char *biggestChrom = NULL;
int biggestSize = 0, biggestStart = 0, biggestEnd = 0;

struct slName *chrom;
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    if (!allParts && strchr(chrom->name, '_'))	// Generally skip weird chroms
        continue;
    if (female && sameString(chrom->name, "chrY"))
        continue;
    int chromSize = hashIntVal(chromHash, chrom->name);
    struct rbTree *rt = rangeTreeNew();
    int start = 0, end = 0, size = 0;
    if (isBig)
	bigCoverageIntoTree(tdb, bbi, chrom->name, chromSize, rt, isBigBed);
    else
        tableCoverageIntoTree(hti, tdb, conn, chrom->name, chromSize, rt);
    if (rt->n > 0)	// Want to keep completely uncovered chromosome uncovered
	addGaps(conn, chrom->name, rt);
    biggestGapFromRangeTree(rt, chromSize, &start, &end, &size);
    if (size > biggestSize)
        {
	biggestSize = size;
	biggestStart = start;
	biggestEnd = end;
	biggestChrom = chrom->name;
	}
    rangeTreeFree(&rt);
    }
printf("%s\t%s:%d-%d\t", track, biggestChrom, biggestStart+1, biggestEnd);
if (noComma)
    printf("%d", biggestSize);
else
    printLongWithCommas(stdout, biggestSize);
putchar('\n');
freez(&typeWord);
bbiFileClose(&bbi);
}
struct gene *geneNew()
/* Create empty gene. */
{
struct gene *gene;
AllocVar(gene);
gene->exonTree = rangeTreeNew();
return gene;
}
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return amount of overlap between coding regions (in same frame)
 * between two beds. */
{
int overlap = 0;

/* Allocate range trees for each frame. */
struct rbTree *frameTrees[3];
int frame;
for (frame = 0; frame<3; ++frame)
    frameTrees[frame] = rangeTreeNew();

/* Fill in frame trees with coding exons of a. */
int cdsPos = 0;
int block, blockCount = a->blockCount;
for (block = 0; block < blockCount; ++block)
    {
    int start = a->chromStart + a->chromStarts[block];
    int end = start + a->blockSizes[block];
    start = max(start, a->thickStart);
    end = min(end, a->thickEnd);
    if (start < end)
	{
	int size = end - start;
	int frame = (start - cdsPos)%3;
	rangeTreeAdd(frameTrees[frame], start, end);
	cdsPos += size;
	}
    }

/* Add up overlaps by comparing bed b against frameTrees */
cdsPos = 0;
blockCount = b->blockCount;
for (block = 0; block < blockCount; ++block)
    {
    int start = b->chromStarts[block] + b->chromStart;
    int end = start + b->blockSizes[block];
    start = max(start, b->thickStart);
    end = min(end, b->thickEnd);
    if (start < end)
	{
	int size = end - start;
	int frame = (start - cdsPos)%3;
	overlap += rangeTreeOverlapSize(frameTrees[frame], start, end);
	cdsPos += size;
	}
    }

/* Clean up and go home. */
for (frame = 0; frame<3; ++frame)
    rangeTreeFree(&frameTrees[frame]);
return overlap;
}
예제 #12
0
struct rbTree *codingTree(struct bed *bedList)
/* Return rangeTree for all coding transcripts in list. */
{
struct rbTree *rangeTree = rangeTreeNew();
struct bed *bed;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    if (bed->thickStart < bed->thickEnd)
	addBedBlocksToRangeTree(rangeTree, bed);
    }
return rangeTree;
}
예제 #13
0
void calcUpstreams(struct dnaSeq *seq, int *upAtgCount, int *upKozakCount)
/* Count up upstream ATG and Kozak */
{
struct rbTree *upAtgRanges = rangeTreeNew(), *upKozakRanges = rangeTreeNew();
int endPos = seq->size-3;
int i;
for (i=0; i<=endPos; ++i)
    {
    if (startsWith("atg", seq->dna + i))
        {
        int orfEnd = orfEndInSeq(seq, i);
	rangeTreeAdd(upAtgRanges, i, orfEnd);
        if (isKozak(seq->dna, seq->size, i))
	    rangeTreeAdd(upKozakRanges, i, orfEnd);
        }
    }
setArrayCountsFromRangeTree(upAtgRanges, upAtgCount, seq->size);
setArrayCountsFromRangeTree(upKozakRanges, upKozakCount, seq->size);
rangeTreeFree(&upAtgRanges);
rangeTreeFree(&upKozakRanges);
}
예제 #14
0
struct rbTree *bedBkToTree(struct binKeeper *bk)
/* Make a rangeTree covering all exons in tree. */
{
struct binElement *el, *list = binKeeperFindAll(bk);
struct rbTree *tree = rangeTreeNew();
for (el = list; el != NULL; el = el->next)
    {
    struct bed *bed = el->val;
    bedIntoRangeTree(bed, tree);
    }
slFreeList(&list);
return tree;
}
void doStrand(struct bed *start, struct bed *end, FILE *f)
/* Assuming all beds from start up to end are on same strand,
 * make a merged bed with all their blocks and output it. */
{
struct rbTree *rangeTree = rangeTreeNew();
struct bed *bed;
for (bed = start; bed != end; bed = bed->next)
    bedIntoRangeTree(bed, rangeTree);
bed = bedFromRangeTree(rangeTree, start->chrom, start->name, start->strand);
bedTabOutN(bed, 12, f);
bedFree(&bed);
rangeTreeFree(&rangeTree);
}
예제 #16
0
void altFivePrime(struct txGraph *graph, struct range *exonsWithIntrons, FILE *f)
/* Write out instances of alt 5' prime splice sites on plus strand
 * (and alt 3' splice sites on minus strand). */
{
    struct txEdge *e1, *e2;
    struct txVertex *v = graph->vertices;
    struct lm *lm = lmInit(0);
    struct rbTree *tree = rangeTreeNew();
    struct range *range, *rangeList = NULL;
    for (e1 = graph->edgeList; e1 != NULL; e1 = e1->next)
    {
        if (e1->type == ggExon)
        {
            int e1Start = v[e1->startIx].position;
            int e1End = v[e1->endIx].position;
            boolean  e1HardStart = (v[e1->startIx].type == ggHardStart);
            if (e1HardStart)
            {
                for (e2 = graph->edgeList; e2 != NULL; e2 = e2->next)
                {
                    if (e2->type == ggExon)
                    {
                        int e2Start = v[e2->startIx].position;
                        int e2End = v[e2->endIx].position;
                        boolean  e2HardStart = (v[e2->startIx].type == ggHardStart);
                        if (e2HardStart && e1Start != e2Start && e1End == e2End)
                        {
                            int aStart = min(e1Start, e2Start);
                            int aEnd = max(e1Start, e2Start);
                            if (!inRangeList(exonsWithIntrons, e1Start, e1End)
                                    && !inRangeList(exonsWithIntrons, e2Start, e2End)
                                    && !inRangeList(rangeList, aStart, aEnd))
                            {
                                lmAllocVar(lm, range);
                                range->start = aStart;
                                range->end = aEnd;
                                slAddHead(&rangeList, range);
                                fprintf(f, "%s\t%d\t%d\t%s\t0\t%s\n", graph->tName,
                                        aStart, aEnd,
                                        (graph->strand[0] == '-' ? "altFivePrime" : "altThreePrime"),
                                        graph->strand);
                            }
                        }
                    }
                }
            }
        }
    }
    rbTreeFree(&tree);
    lmCleanup(&lm);
}
예제 #17
0
static void readMappingResults(char **argv) {
	FILE *fp;
	char buf[500], chr[50], id[500];
	int beg, i, j, len;
	char *ch;
	struct slName *ali;
	struct hashEl *el;
	struct rbTree *tr;

	aliHash = newHash(8);
	
	for (i = 4; i <= 5; i++) {
		fp = mustOpen(argv[i], "r");
		while (fgets(buf, 500, fp)) {
			
			if (ncbi && i >= 6 && i <= 7) {
				if (sscanf(buf, "%[^\t]\t%*c %s %d %d", id, chr, &beg, &len) != 4)
					errAbort("error: %s", buf);
				if ((ch = strchr(id, ' ')))
					*ch = '\0';
				if (i >= 6 && i <= 7) {
					j = strlen(id);
					sprintf(id+j, "/%d", i-5);
				}
			}
			
			else {	
				if (sscanf(buf, "%[^\t]\t%*c %s %d %d", id, chr, &beg, &len) != 4){
					errAbort("error: %s", buf);
				}
					if ((ch = strchr(id, ' ')))
                                        	*ch = '\0';
				
			}
			ali = newSlName(id);
			el = hashLookup(aliHash, chr);
			if (el == NULL) {
				tr = rangeTreeNew();
				hashAdd(aliHash, chr, tr);
			}
			else 
				tr = (struct rbTree *)(el->val);
			rangeTreeAddValHead(tr, beg, beg + len - 1, &ali);
		}
		fclose(fp);
	}
}
예제 #18
0
struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
/* Read lines from bed file as long as they match chrom.  Return a rangeTree that
 * corresponds to the coverage. */
{
struct rbTree *tree = rangeTreeNew();
char *line;
while (lineFileNextReal(lf, &line))
    {
    if (!startsWithWord(chrom, line))
        {
	lineFileReuse(lf);
	break;
	}
    char *row[3];
    chopLine(line, row);
    unsigned start = sqlUnsigned(row[1]);
    unsigned end = sqlUnsigned(row[2]);
    rangeTreeAddToCoverageDepth(tree, start, end);
    }
return tree;
}
예제 #19
0
static struct bed* subset_beds(char* sectionString, struct bed** pRegions, struct hash* chromHash)
/* in the situation where both a regions bed file is given AND the filename specifies subsections, */
/* intersect the two.  For simplictity sake,  */
{
    struct bed* fname_ranges = parseSectionString(sectionString, chromHash);
    struct bed* bed;
    struct bed* subset = NULL;
    struct bed* regions = *pRegions;
    slSort(&fname_ranges, bedCmp);
    bed = fname_ranges;
    while (bed != NULL) {
        /* each iteration of the loop should be a separate chrom */
        struct bed* region;
        struct rbTree* tree = rangeTreeNew();
        while ((bed != NULL) && (bed->next != NULL) && (sameString(bed->chrom, bed->next->chrom))) {
            rangeTreeAdd(tree, bed->chromStart, bed->chromEnd);
            bed = bed->next;
        }
        rangeTreeAdd(tree, bed->chromStart, bed->chromEnd);
        /* now we're at a point that we're dealing only with one chromosome. */
        for (region = regions; region != NULL; region = region->next) {
            if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd)
                && rangeTreeFindEnclosing(tree, region->chromStart, region->chromEnd)) {
                struct bed* clone = cloneBed(region);
                slAddHead(&subset, clone);
            } else if (sameString(region->chrom, bed->chrom) && rangeTreeOverlaps(tree, region->chromStart, region->chromEnd))
                errAbort("range specified in file overlaps but is not contained by range specified on command-line");
        }
        rangeTreeFree(&tree);
        bed = bed->next;
    }
    if (subset == NULL) {
        errAbort("no ranges specified in file were contained in ranges specified on command-line");
    }
    slReverse(&subset);
    bedFreeList(&fname_ranges);
    bedFreeList(pRegions);
    return subset;
}
struct bed *breakUpBedAtCdsBreaks(struct cdsEvidence *cds, struct bed *bed)
/* Create a new broken-up that excludes part of gene between CDS breaks.  
 * Also jiggles cds->end coordinate to cope with the sequence we remove.
 * Deals with transcript to genome coordinate mapping including negative
 * strand.  Be afraid, be very afraid! */
{
/* Create range tree covering all breaks.  The coordinates here
 * are transcript coordinates.  While we're out it shrink outer CDS
 * since we are actually shrinking transcript. */
struct rbTree *gapTree = rangeTreeNew();
int bedSize = bed->chromEnd - bed->chromStart;
struct lm *lm = gapTree->lm;	/* Convenient place to allocate memory. */
int i, lastCds = cds->cdsCount-1;
for (i=0; i<lastCds; ++i)
    {
    int gapStart = cds->cdsStarts[i] + cds->cdsSizes[i];
    int gapEnd = cds->cdsStarts[i+1];
    int gapSize = gapEnd - gapStart;
    cds->end -= gapSize;
    rangeTreeAdd(gapTree, gapStart, gapEnd);
    }

/* Get list of exons in bed, flipped to reverse strand if need be. */
struct range *exon, *exonList = bedToExonList(bed, lm);
if (bed->strand[0] == '-')
    flipExonList(&exonList, bedSize);

/* Go through exon list, mapping each exon to transcript
 * coordinates. Check if exon needs breaking up, and if
 * so do so, as we copy it to new list. */
/* Copy exons to new list, breaking them up if need be. */
struct range *newList = NULL, *nextExon, *newExon;
int txStartPos = 0, txEndPos;
for (exon = exonList; exon != NULL; exon = nextExon)
    {
    txEndPos = txStartPos + exon->end - exon->start;
    nextExon = exon->next;
    struct range *gapList = rangeTreeAllOverlapping(gapTree, txStartPos, txEndPos);
    if (gapList != NULL)
        {
	verbose(3, "Splitting exon because of CDS gap\n");

	/* Make up exons from current position up to next gap.  This is a little
	 * complicated by possibly the gap starting before the exon. */
	int exonStart = exon->start;
	int txStart = txStartPos;
	struct range *gap;
	for (gap = gapList; gap != NULL; gap = gap->next)
	    {
	    int txEnd = gap->start;
	    int gapSize = rangeIntersection(gap->start, gap->end, txStart, txEndPos);
	    int exonSize = txEnd - txStart;
	    if (exonSize > 0)
		{
		lmAllocVar(lm, newExon);
		newExon->start = exonStart;
		newExon->end = exonStart + exonSize;
		slAddHead(&newList, newExon);
		}
	    else /* This case happens if gap starts before exon */
	        {
		exonSize = 0;
		}

	    /* Update current position in both transcript and genome space. */
	    exonStart += exonSize + gapSize;
	    txStart += exonSize + gapSize;
	    }

	/* Make up final exon from last gap to end, at least if we don't end in a gap. */
	if (exonStart < exon->end)
	    {
	    lmAllocVar(lm, newExon);
	    newExon->start = exonStart;
	    newExon->end = exon->end;
	    slAddHead(&newList, newExon);
	    }
	}
    else
        {
	/* Easy case where we don't intersect any gaps. */
	slAddHead(&newList, exon);
	}
    txStartPos= txEndPos;
    }
slReverse(&newList);

/* Flip exons back to forward strand if need be */
if (bed->strand[0] == '-')
    flipExonList(&newList, bedSize);

/* Convert exons to bed12 */
struct bed *newBed;
AllocVar(newBed);
newBed->chrom = cloneString(bed->chrom);
newBed->chromStart = newList->start + bed->chromStart;
newBed->chromEnd = newList->end + bed->chromStart;
newBed->name  = cloneString(bed->name);
newBed->score = bed->score;
newBed->strand[0] = bed->strand[0];
newBed->blockCount = slCount(newList);
AllocArray(newBed->blockSizes,  newBed->blockCount);
AllocArray(newBed->chromStarts,  newBed->blockCount);
for (exon = newList, i=0; exon != NULL; exon = exon->next, i++)
    {
    newBed->chromStarts[i] = exon->start;
    newBed->blockSizes[i] = exon->end - exon->start;
    newBed->chromEnd = exon->end + bed->chromStart;
    }

/* Clean up and go home. */
rbTreeFree(&gapTree);
return newBed;
}
예제 #21
0
static void removeEnclosedDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree, 
	int maxBleedOver, double singleExonMaxOverlap)
/* Move double-softs that overlap spliced things to a very great extent into
 * the spliced things. Also remove tiny double-softs (no more than 2*maxBleedOver). */
{
/* Traverse graph and build up range tree covering spliced exons.  For each 
 * range of overlapping exons, assemble a singly-linked list of all exons in 
 * the range */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
int removedCount = 0;
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggHardStart || end->type == ggHardEnd)
	{
	rangeTreeAddValList(rangeTree, start->position, end->position, edge);
	}
    }

/* Traverse graph yet one more time looking for doubly-soft exons
 * that are overlapping the spliced exons. */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        {
	int s = start->position;
	int e = end->position;
	int size = e - s;
	if (size <= maxBleedOver+maxBleedOver)
	     {
	     /* Tiny case, just remove edge and forget it. */
	     verbose(3, "Removing tiny double-soft edge from %d to %d\n", s, e);
	     rbTreeRemove(edgeTree, edge);
	     ++removedCount;
	     }
	else
	     {
	     /* Normal case, look for exon list that encloses us, and
	      * if any single exon in that list encloses us, merge into it. */
	     int splicedOverlap = rangeTreeOverlapSize(rangeTree, s, e);
	     if (splicedOverlap > 0 && splicedOverlap > singleExonMaxOverlap*size)
	         {
		 if (!trustedEdge(edge))
		     {
		     /* Once we find a range that overlaps the doubly-soft edge, find 
		      * (half-hard or better) edge from that range that encloses the 
		      * doubly soft edge. */
		     struct range *r = rangeTreeMaxOverlapping(rangeTree, s, e);
		     struct edge *nextEdge, *edgeList = r->val;
		     struct edge *enclosingEdge = NULL;
		     for (nextEdge = edgeList; edgeList != NULL; edgeList = edgeList->next)
			 {
			 if (encloses(nextEdge, edge))
			     {
			     enclosingEdge = nextEdge;
			     }
			 }
		     if (enclosingEdge != NULL) 
			 {
			 enclosingEdge->evList = slCat(enclosingEdge->evList, edge->evList);
			 edge->evList = NULL;
			 verbose(3, "Removing doubly-soft edge %d-%d, reassigning to %d-%d\n",
				 s, e, enclosingEdge->start->position, 
				 enclosingEdge->end->position);
			 rbTreeRemove(edgeTree, edge);
			 ++removedCount;
			 }
		     }
		 }
	     }
	}
    }

/* Clean up and go home. */
if (removedCount > 0)
    removeUnusedVertices(vertexTree, edgeTree);
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *nextEdge, *edge = edgeRef->val;
    while (edge != NULL) 
	{
	nextEdge = edge->next;
	edge->next = NULL;
	edge = nextEdge;
	}
    }
slFreeList(&edgeRefList);
rbTreeFree(&rangeTree);
}
예제 #22
0
struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, 
	double forceJoinScore, double weakLevel)
/* Convert a list of items to a list of clusters of items.  This may break up clusters that
 * have weakly linked parts. 
      [                ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   gets tranformed into
       [    ]   [    ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   The strategy is to build a rangeTree of coverage, which might look something like so:
      123333211123333211 
   then define cluster ends that exceed the minimum limit, which is either weakLevel
   (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is 
   more than forceJoinScore.  This will go to something like so:
        [---]   [----]   
   Finally the items that are overlapping a cluster are assigned to it.  Note that this
   may mean that an item may be in multiple clusters.
        [ABC]   [ ADE]
 */
{
int easyMax = round(1.0/weakLevel);
int itemCount = slCount(itemList);
struct peakCluster *clusterList = NULL;
if (itemCount < easyMax)
    {
    struct peakItem *item = itemList;
    int chromStart = item->chromStart;
    int chromEnd = item->chromEnd;
    for (item = item->next; item != NULL; item = item->next)
        {
	if (item->chromStart < chromStart) chromStart = item->chromStart;
	if (item->chromEnd > chromEnd) chromEnd = item->chromEnd;
	}
    addCluster(lm, itemList, chromStart, chromEnd, &clusterList);
    }
else
    {
    /* Make up coverage tree. */
    struct rbTree *covTree = rangeTreeNew();
    struct peakItem *item;
    for (item = itemList; item != NULL; item = item->next)
	rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
    struct range *range, *rangeList = rangeTreeList(covTree);

    /* Figure out maximum coverage. */
    int maxCov = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int cov = ptToInt(range->val);
	if (cov > maxCov) maxCov = cov;
	}

    /* Figure coverage threshold. */
    int threshold = round(maxCov * weakLevel);
    if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;

    /* Loop through emitting sections over threshold as clusters */
    boolean inRange = FALSE;
    boolean start = 0, end = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int cov = ptToInt(range->val);
	if (cov > threshold)
	    {
	    if (inRange)
	       end = range->end;
	    else
	       {
	       inRange = TRUE;
	       start = range->start;
	       end = range->end;
	       }
	    }
	else
	    {
	    if (inRange)
		{
		addCluster(lm, itemList, start, end, &clusterList);
		inRange = FALSE;
		}
	    }
	}
    if (inRange)
        addCluster(lm, itemList, start, end, &clusterList);
    }
slReverse(&clusterList);
return clusterList;
}
예제 #23
0
void doEnrichmentsFromBigBed(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigBedPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigBedFileOpen(bigBedPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigBed data in memory. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */
    struct lm *lm = lmInit(0);
    struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);
    struct bigBedInterval *iv;
    struct rbTree *bbTree = rangeTreeNew();
    for (iv = ivList; iv != NULL; iv = iv->next)
	 rangeTreeAdd(bbTree, iv->start, iv->end);
    struct range *bbRange, *bbRangeList = rangeTreeList(bbTree);

    /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
        {
	if (target->skip)
	    continue;
	struct genomeRangeTree *grt = target->grt;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct bigBedInterval *iv;
	    for (iv = ivList; iv != NULL; iv = iv->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
		target->overlapBases += overlap;
		}
	    for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end);
		target->uniqOverlapBases += overlap;
		}
	    }
	}
    rangeTreeFree(&bbTree);
    lmCleanup(&lm);
    }

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
	continue;
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bbiChromInfoFreeList(&chromList);
bigBedFileClose(&bbi);
freez(&bigBedPath);
}
예제 #24
0
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree)
/* Merge together overlapping edges with soft ends. */
{
struct mergedEdge
/* Hold together info on a merged edge. */
    {
    struct evidence *evidence;
    };

/* Traverse graph and build up range tree.  Each node in the range tree
 * will represent the bounds of coordinates of overlapping double softs */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        rangeTreeAdd(rangeTree, start->position, end->position);
    }

/* Traverse graph again merging edges */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start= edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        {
	struct range *r = rangeTreeFindEnclosing(rangeTree,
		start->position, end->position);
	assert(r != NULL);
	/* At this point, r represents the bounds of a double-soft
	 * region that encompasses this edge.  Collect the set of
	 * evidence of edges overlapping this range */
        struct mergedEdge *mergeEdge = r->val;
        if (mergeEdge == NULL)
            {
            lmAllocVar(rangeTree->lm, mergeEdge);
            r->val = mergeEdge;
            }
        mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence);
	verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", 
		start->position, end->position, r->start, r->end);
        edge->evList = NULL;
        rbTreeRemove(edgeTree, edge);
	}
    }

/* Traverse merged edge list, making a single edge from each range. At this point,
 * each range will have some evidence attached to it, from each of the double softs
 * that fall within the range.  From all of this evidence, make a single consensus edge */
struct range *r;
struct lm *lm = lmInit(0);
for (r = rangeTreeList(rangeTree); r != NULL; r = r->next)
    {
    struct mergedEdge *mergedEdge = r->val;
    struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm);
    if (edge != NULL)
        rbTreeAdd(edgeTree, edge);
    verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", 
	    edge->start->position, edge->end->position, r->start, r->end);
    }


/* Clean up and go home. */
lmCleanup(&lm);
removeUnusedVertices(vertexTree, edgeTree);
slFreeList(&edgeRefList);
rbTreeFree(&rangeTree);
}