コード例 #1
0
void outputOneRa(struct dnaSeq *seq, int start, int end, FILE *f)
/* Write one ra stanza (tag/value lines followed by a blank line) to f
 * describing the ORF that spans start..end in seq.  Besides the basic
 * coordinates this reports start/stop codon flags, Kozak flags at
 * positions start-3 and start+3, and statistics on ATGs found upstream
 * of start. */
{
    /* Identity and coordinates. */
    fprintf(f, "orfName %s_%d_%d\n", seq->name, start, end);
    fprintf(f, "txName %s\n", seq->name);
    fprintf(f, "txSize %d\n", seq->size);
    fprintf(f, "cdsStart %d\n", start);
    fprintf(f, "cdsEnd %d\n", end);
    fprintf(f, "cdsSize %d\n", end-start);

    /* Start and stop codon flags. */
    fprintf(f, "gotStart %d\n", startsWith("atg", seq->dna+start));
    fprintf(f, "gotEnd %d\n", isStopCodon(seq->dna+end-3));

    /* Kozak part 1: an 'a' or 'g' three bases before start (only checked
     * when that position exists). */
    boolean kozakBefore = FALSE;
    if (start >= 3)
    {
        char base = seq->dna[start-3];
        kozakBefore = (base == 'a' || base == 'g');
    }

    /* Kozak part 2: a 'g' three bases after start (only checked when that
     * position is inside the sequence). */
    boolean kozakAfter = FALSE;
    if (start+3 < seq->size)
        kozakAfter = (seq->dna[start+3] == 'g');

    fprintf(f, "gotKozak1 %d\n", kozakBefore);
    fprintf(f, "gotKozak2 %d\n", kozakAfter);
    fprintf(f, "gotKozak %d\n", kozakBefore + kozakAfter);

    /* Scan every position before start for an ATG.  All of them are counted,
     * but only ORFs that end before start are added to the range trees. */
    struct rbTree *upAtgRanges = rangeTreeNew();
    struct rbTree *upKozakRanges = rangeTreeNew();
    int upAtg = 0, upKozak = 0;
    int pos;
    for (pos = 0; pos < start; ++pos)
    {
        if (!startsWith("atg", seq->dna + pos))
            continue;

        int orfEnd = findOrfEnd(seq, pos);
        boolean endsUpstream = (orfEnd < start);
        if (endsUpstream)
            rangeTreeAdd(upAtgRanges, pos, orfEnd);
        ++upAtg;

        if (!isKozak(seq->dna, seq->size, pos))
            continue;
        ++upKozak;
        if (endsUpstream)
            rangeTreeAdd(upKozakRanges, pos, orfEnd);
    }
    fprintf(f, "upstreamAtgCount %d\n", upAtg);
    fprintf(f, "upstreamKozakCount %d\n", upKozak);
    fprintf(f, "upstreamSize %d\n", rangeTreeOverlapSize(upAtgRanges, 0, start));
    fprintf(f, "upstreamKozakSize %d\n", rangeTreeOverlapSize(upKozakRanges, 0, start));
    fprintf(f, "\n");

    /* Clean up and go home. */
    rangeTreeFree(&upAtgRanges);
    rangeTreeFree(&upKozakRanges);
}
コード例 #2
0
ファイル: txInfoAssemble.c プロジェクト: elmargb/kentUtils
int pslBedOverlap(struct psl *psl, struct bed *bed)
/* Count the bases shared by the blocks of psl (target coordinates) and the
 * blocks of bed.  Returns 0 when the first strand characters differ or when
 * psl->tName is not the same string as bed->chrom. */
{
/* Bail out early on strand or chromosome mismatch. */
if (psl->strand[0] != bed->strand[0] || !sameString(psl->tName, bed->chrom))
    return 0;

/* Load every bed block into a range tree. */
struct rbTree *bedBlocks = rangeTreeNew();
int blockIx;
for (blockIx = 0; blockIx < bed->blockCount; ++blockIx)
    {
    int blockStart = bed->chromStart + bed->chromStarts[blockIx];
    int blockEnd = blockStart + bed->blockSizes[blockIx];
    rangeTreeAdd(bedBlocks, blockStart, blockEnd);
    }

/* Query the tree with each psl block and sum what comes back. */
int sharedBases = 0;
for (blockIx = 0; blockIx < psl->blockCount; ++blockIx)
    {
    int blockStart = psl->tStarts[blockIx];
    int blockEnd = blockStart + psl->blockSizes[blockIx];
    sharedBases += rangeTreeOverlapSize(bedBlocks, blockStart, blockEnd);
    }

rangeTreeFree(&bedBlocks);
return sharedBases;
}
コード例 #3
0
ファイル: rangeTree.c プロジェクト: Puneet-Shivanand/zinba
int rangeTreeOverlapTotalSize(struct rbTree *tree)
/* Return the combined size of every range stored in tree, computed by
 * asking for the overlap with the widest possible int interval.
 * Carried over from the original notes: not thread-safe, and on 32 bit
 * machines take care that start, end and the returned total do not
 * overflow. */
{
int totalSize = rangeTreeOverlapSize(tree, INT_MIN, INT_MAX);
return totalSize;
}
コード例 #4
0
long rangeTreeRangeTreeOverlap(struct rbTree *a, struct rbTree *b)
/* Sum, over every range in tree a, how much of that range is covered by
 * tree b, and return the sum. */
{
long overlapSum = 0;
struct range *cur = rangeTreeList(a);
while (cur != NULL)
    {
    overlapSum += rangeTreeOverlapSize(b, cur->start, cur->end);
    cur = cur->next;
    }
return overlapSum;
}
コード例 #5
0
static void addBbCorrelations(struct bbiChromInfo *chrom, struct genomeRangeTree *targetGrt,
    struct bbiFile *aBbi, struct bbiFile *bBbi, 
    int numColIx, struct correlate *c, struct correlate *cInEnriched,
    long long *aTotalSpan, long long *bTotalSpan, long long *overlapTotalSpan)
/* For one chromosome, find where intervals of the a and b big-beds overlap
 * and feed the numeric value at column numColIx of each into correlation c,
 * weighted by overlap size.  When targetGrt is given and has ranges on this
 * chromosome, the part of each a/b overlap that also falls in those ranges
 * is fed into cInEnriched as well.
 * Nothing is returned; instead *overlapTotalSpan is incremented by the a/b
 * overlap in bases, and *aTotalSpan / *bTotalSpan by the total span of each
 * interval list. */
{
struct lm *lm = lmInit(0);

/* Target ranges for this chromosome, if a target set was supplied. */
struct rbTree *targetRanges = NULL;
if (targetGrt != NULL)
    targetRanges = genomeRangeTreeFindRangeTree(targetGrt, chrom->name);

/* Fetch every interval on the chromosome from both files. */
struct bigBedInterval *aList = bigBedIntervalQuery(aBbi, chrom->name, 0, chrom->size, 0, lm);
struct bigBedInterval *bList = bigBedIntervalQuery(bBbi, chrom->name, 0, chrom->size, 0, lm);
long long totalOverlap = 0;

/* Walk both lists in step, always advancing whichever interval ends first.
 * This relies on the two lists being sorted and finds all overlaps in
 * linear time. */
struct bigBedInterval *aIv = aList;
struct bigBedInterval *bIv = bList;
while (aIv != NULL && bIv != NULL)
    {
    int s = max(aIv->start,bIv->start);
    int e = min(aIv->end,bIv->end);
    int overlap = e - s;
    if (overlap > 0)
        {
        totalOverlap += overlap;

        /* Correlate the two values over the shared bases. */
        double aVal = getDoubleValAt(aIv->rest, numColIx);
        double bVal = getDoubleValAt(bIv->rest, numColIx);
        correlateNextMulti(c, aVal, bVal, overlap);

        /* Repeat for the portion that is also inside the target ranges. */
        if (targetRanges)
            {
            int targetOverlap = rangeTreeOverlapSize(targetRanges, s, e);
            if (targetOverlap > 0)
                correlateNextMulti(cInEnriched, aVal, bVal, targetOverlap);
            }
        }

    if (aIv->end < bIv->end)
        aIv = aIv->next;
    else
        bIv = bIv->next;
    }

/* Publish totals to the caller's accumulators, then release local memory. */
*overlapTotalSpan += totalOverlap;
*aTotalSpan += bbIntervalListTotalSpan(aList);
*bTotalSpan += bbIntervalListTotalSpan(bList);
lmCleanup(&lm);
}
コード例 #6
0
ファイル: cdwMakeEnrichments.c プロジェクト: maximilianh/kent
void doEnrichmentsFromBed3Sample(struct bed3 *sampleList,
    struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* For each non-skipped target in targetList, measure how much sampleList
 * overlaps it in two ways (unique bases, and bases counted once per sample
 * item) and save the resulting enrichment record to the cdwQaEnrich table. */
{
/* Merge the samples into per-chromosome range trees so that each base is
 * counted once, and get the list of chromosomes that have samples. */
struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList);
struct hashEl *chromList = hashElListHash(sampleGrt->hash);

struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct genomeRangeTree *targetGrt = target->grt;

    /* Unique overlap: merged sample ranges against the target's tree,
     * one chromosome at a time. */
    long long uniqOverlapBases = 0;
    struct hashEl *chromEl;
    for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next)
        {
        struct rbTree *sampleTree = chromEl->val;
        struct rbTree *targetTree = genomeRangeTreeFindRangeTree(targetGrt, chromEl->name);
        if (targetTree == NULL)
            continue;
        struct range *merged;
        for (merged = rangeTreeList(sampleTree); merged != NULL; merged = merged->next)
            {
            int uniqPart = rangeTreeOverlapSize(targetTree, merged->start, merged->end);
            uniqOverlapBases += uniqPart;
            }
        }

    /* Total overlap: every sample item on its own, so a base covered by
     * several items is counted several times. */
    long long overlapBases = 0;
    struct bed3 *item;
    for (item = sampleList; item != NULL; item = item->next)
        {
        int itemPart = genomeRangeTreeOverlapSize(targetGrt, 
            item->chrom, item->chromStart, item->chromEnd);
        overlapBases += itemPart;
        }

    /* Build the record, store it, and free it. */
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
        target, overlapBases, uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

genomeRangeTreeFree(&sampleGrt);
hashElFreeList(&chromList);
}
コード例 #7
0
int overlapInSameFrame(struct bed *a, struct bed *b)
/* Return how many bases of the coding regions (thickStart..thickEnd,
 * restricted to blocks) of a and b overlap while being in the same frame.
 * Frame of a coding piece is (genomic start - coding bases seen so far) % 3. */
{
/* One range tree per frame, filled from a. */
struct rbTree *frameTrees[3];
int treeIx;
for (treeIx = 0; treeIx < 3; ++treeIx)
    frameTrees[treeIx] = rangeTreeNew();

int codingSoFar = 0;
int blockIx;
for (blockIx = 0; blockIx < a->blockCount; ++blockIx)
    {
    /* Clip the block to the coding region; skip it if nothing remains. */
    int start = a->chromStart + a->chromStarts[blockIx];
    int end = start + a->blockSizes[blockIx];
    start = max(start, a->thickStart);
    end = min(end, a->thickEnd);
    if (start >= end)
        continue;
    int blockFrame = (start - codingSoFar)%3;
    rangeTreeAdd(frameTrees[blockFrame], start, end);
    codingSoFar += end - start;
    }

/* Query each coding piece of b against the tree for its own frame. */
int sameFrameBases = 0;
codingSoFar = 0;
for (blockIx = 0; blockIx < b->blockCount; ++blockIx)
    {
    int start = b->chromStarts[blockIx] + b->chromStart;
    int end = start + b->blockSizes[blockIx];
    start = max(start, b->thickStart);
    end = min(end, b->thickEnd);
    if (start >= end)
        continue;
    int blockFrame = (start - codingSoFar)%3;
    sameFrameBases += rangeTreeOverlapSize(frameTrees[blockFrame], start, end);
    codingSoFar += end - start;
    }

/* Release the trees and report. */
for (treeIx = 0; treeIx < 3; ++treeIx)
    rangeTreeFree(&frameTrees[treeIx]);
return sameFrameBases;
}
コード例 #8
0
int bedOverlapWithRangeTree(struct rbTree *rangeTree, struct bed *bed)
/* Add up, block by block, how many bases of bed are covered by rangeTree
 * and return the sum. */
{
int coveredBases = 0;
int blockIx = 0;
int blockTotal = bed->blockCount;
while (blockIx < blockTotal)
    {
    int blockStart = bed->chromStart + bed->chromStarts[blockIx];
    int blockEnd = blockStart + bed->blockSizes[blockIx];
    coveredBases += rangeTreeOverlapSize(rangeTree, blockStart, blockEnd);
    ++blockIx;
    }
return coveredBases;
}
コード例 #9
0
ファイル: cdwMakeEnrichments.c プロジェクト: maximilianh/kent
/* This old way is ~3 times as slow */
void doEnrichmentsFromBigWig(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigWigPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* This takes a while, so let's figure out what parts take the time. */
long totalBigQueryTime = 0;
long totalOverlapTime = 0;

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigWig data in memory. Also just for performance we do a lookup of target range tree to
 * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. */
    struct lm *lm = lmInit(0);
    long startBigQueryTime = clock1000();
    struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm);
    long endBigQueryTime = clock1000();
    totalBigQueryTime += endBigQueryTime - startBigQueryTime;
    struct bbiInterval *iv;

    /* Loop through all targets adding overlaps from ivList */
    long startOverlapTime = clock1000();
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
        {
	struct genomeRangeTree *grt = target->grt;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    for (iv = ivList; iv != NULL; iv = iv->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
		target->uniqOverlapBases += overlap;
		target->overlapBases += overlap * iv->val;
		}
	    }
	}
    long endOverlapTime = clock1000();
    totalOverlapTime += endOverlapTime - startOverlapTime;
    lmCleanup(&lm);
    }

verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime);

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bbiChromInfoFreeList(&chromList);
bigWigFileClose(&bbi);
freez(&bigWigPath);
}
コード例 #10
0
ファイル: cdwMakeEnrichments.c プロジェクト: maximilianh/kent
void doEnrichmentsFromBigBed(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file: accumulate overlapBases and
 * uniqOverlapBases into every non-skipped target, then save one enrichment
 * record per non-skipped target to the cdwQaEnrich table. */
{
/* Locate and open the bigBed, and list its chromosomes. */
char *bigBedPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigBedFileOpen(bigBedPath);
struct bbiChromInfo *chromList = bbiChromList(bbi);

/* Work one chromosome at a time so only that chromosome's intervals are
 * held in memory. */
struct bbiChromInfo *chrom;
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    struct lm *lm = lmInit(0);
    struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);

    /* Merge the raw intervals into a range tree; its range list is used for
     * the unique-base counts. */
    struct rbTree *bbTree = rangeTreeNew();
    struct bigBedInterval *iv;
    for (iv = ivList; iv != NULL; iv = iv->next)
        rangeTreeAdd(bbTree, iv->start, iv->end);
    struct range *mergedList = rangeTreeList(bbTree);

    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
        {
        if (target->skip)
            continue;
        struct rbTree *targetTree = genomeRangeTreeFindRangeTree(target->grt, chrom->name);
        if (targetTree == NULL)
            continue;

        /* Raw intervals: bases covered by several intervals count each time. */
        for (iv = ivList; iv != NULL; iv = iv->next)
            {
            int rawPart = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
            target->overlapBases += rawPart;
            }

        /* Merged ranges: every base counts once. */
        struct range *merged;
        for (merged = mergedList; merged != NULL; merged = merged->next)
            {
            int uniqPart = rangeTreeOverlapSize(targetTree, merged->start, merged->end);
            target->uniqOverlapBases += uniqPart;
            }
        }

    rangeTreeFree(&bbTree);
    lmCleanup(&lm);
    }

/* Store one enrichment record per non-skipped target. */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
        target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

/* Release file-level resources. */
bbiChromInfoFreeList(&chromList);
bigBedFileClose(&bbi);
freez(&bigBedPath);
}
コード例 #11
0
void doEnrichmentsFromSampleBed(struct sqlConnection *conn, 
    struct edwFile *ef, struct edwValidFile *vf, 
    struct edwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from the sample bed file named in vf->sampleBed.
 * Warns and returns without doing anything when that name is empty or the
 * file loads as an empty list.  Otherwise, for each non-skipped target,
 * computes unique and total overlap with the samples and saves an
 * enrichment record to the edwQaEnrich table. */
{
char *sampleBed = vf->sampleBed;
if (isEmpty(sampleBed))
    {
    warn("No sample bed for %s", ef->edwFileName);
    return;
    }

struct bed3 *sampleList = bed3LoadAll(sampleBed);
if (sampleList == NULL)
    {
    warn("Sample bed is empty for %s", ef->edwFileName);
    return;
    }

/* Merge the samples into per-chromosome range trees so that each base is
 * counted once, and get the list of chromosomes that have samples. */
struct genomeRangeTree *sampleGrt = edwMakeGrtFromBed3List(sampleList);
struct hashEl *chromList = hashElListHash(sampleGrt->hash);

struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct genomeRangeTree *targetGrt = target->grt;

    /* Unique overlap: merged sample ranges against the target's tree,
     * one chromosome at a time. */
    long long uniqOverlapBases = 0;
    struct hashEl *chromEl;
    for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next)
        {
        struct rbTree *sampleTree = chromEl->val;
        struct rbTree *targetTree = genomeRangeTreeFindRangeTree(targetGrt, chromEl->name);
        if (targetTree == NULL)
            continue;
        struct range *merged;
        for (merged = rangeTreeList(sampleTree); merged != NULL; merged = merged->next)
            {
            int uniqPart = rangeTreeOverlapSize(targetTree, merged->start, merged->end);
            uniqOverlapBases += uniqPart;
            }
        }

    /* Total overlap: every sample item on its own, so a base covered by
     * several items is counted several times. */
    long long overlapBases = 0;
    struct bed3 *item;
    for (item = sampleList; item != NULL; item = item->next)
        {
        int itemPart = genomeRangeTreeOverlapSize(targetGrt, 
            item->chrom, item->chromStart, item->chromEnd);
        overlapBases += itemPart;
        }

    /* Build the record, store it, and free it. */
    struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
        target, overlapBases, uniqOverlapBases);
    edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128);
    edwQaEnrichFree(&enrich);
    }

genomeRangeTreeFree(&sampleGrt);
bed3FreeList(&sampleList);
hashElFreeList(&chromList);
}
コード例 #12
0
ファイル: makeGraph.c プロジェクト: davidhoover/kent
static void removeEnclosedDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree, 
	int maxBleedOver, double singleExonMaxOverlap)
/* Move double-softs that overlap spliced things to a very great extent into
 * the spliced things. Also remove tiny double-softs (no more than 2*maxBleedOver).
 *   vertexTree, edgeTree - the graph; edges may be removed from edgeTree, and
 *       vertices left unused are removed when at least one edge was removed.
 *   maxBleedOver - double-soft edges of size <= 2*maxBleedOver are dropped.
 *   singleExonMaxOverlap - fraction of a double-soft edge's size that must be
 *       exceeded by its overlap with spliced exons before it is merged. */
{
/* Traverse graph and build up range tree covering spliced exons (edges with
 * a hard start or a hard end).  For each range of overlapping exons, the
 * range's val holds a singly-linked list of all exons in the range. */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
int removedCount = 0;
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggHardStart || end->type == ggHardEnd)
	{
	rangeTreeAddValList(rangeTree, start->position, end->position, edge);
	}
    }

/* Traverse graph yet one more time looking for doubly-soft exons
 * that are overlapping the spliced exons. */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        {
	int s = start->position;
	int e = end->position;
	int size = e - s;
	if (size <= maxBleedOver+maxBleedOver)
	     {
	     /* Tiny case, just remove edge and forget it. */
	     verbose(3, "Removing tiny double-soft edge from %d to %d\n", s, e);
	     rbTreeRemove(edgeTree, edge);
	     ++removedCount;
	     }
	else
	     {
	     /* Normal case, look for exon list that encloses us, and
	      * if any single exon in that list encloses us, merge into it. */
	     int splicedOverlap = rangeTreeOverlapSize(rangeTree, s, e);
	     if (splicedOverlap > 0 && splicedOverlap > singleExonMaxOverlap*size)
	         {
		 if (!trustedEdge(edge))
		     {
		     /* Once we find a range that overlaps the doubly-soft edge, find 
		      * (half-hard or better) edge from that range that encloses the 
		      * doubly soft edge. */
		     struct range *r = rangeTreeMaxOverlapping(rangeTree, s, e);
		     struct edge *nextEdge, *edgeList = r->val;
		     struct edge *enclosingEdge = NULL;
		     /* Walk the whole exon list with nextEdge.  The loop used to
		      * advance edgeList while testing nextEdge, so only the head
		      * of the list was ever examined.  Stop at the first match. */
		     for (nextEdge = edgeList; nextEdge != NULL; nextEdge = nextEdge->next)
			 {
			 if (encloses(nextEdge, edge))
			     {
			     enclosingEdge = nextEdge;
			     break;
			     }
			 }
		     if (enclosingEdge != NULL) 
			 {
			 /* Hand our evidence over to the enclosing edge, then drop
			  * this edge from the graph. */
			 enclosingEdge->evList = slCat(enclosingEdge->evList, edge->evList);
			 edge->evList = NULL;
			 verbose(3, "Removing doubly-soft edge %d-%d, reassigning to %d-%d\n",
				 s, e, enclosingEdge->start->position, 
				 enclosingEdge->end->position);
			 rbTreeRemove(edgeTree, edge);
			 ++removedCount;
			 }
		     }
		 }
	     }
	}
    }

/* Clean up and go home. */
if (removedCount > 0)
    removeUnusedVertices(vertexTree, edgeTree);
/* Clear the next pointers of every edge reachable from the original edge
 * list; presumably these were set while building the per-range exon lists
 * above (verify against rangeTreeAddValList). */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *nextEdge, *edge = edgeRef->val;
    while (edge != NULL) 
	{
	nextEdge = edge->next;
	edge->next = NULL;
	edge = nextEdge;
	}
    }
slFreeList(&edgeRefList);
rbTreeFree(&rangeTree);
}