Ejemplo n.º 1
0
void outputProt(struct protInfo *prot, struct hash *seedToScop, FILE *f, FILE *fKnownTo)
/* Callapse together all features of same type that overlap and output */
{
slSort(&prot->featureList, protFeatureCmpName);
struct protFeature *startFeature, *endFeature;
for (startFeature = prot->featureList; startFeature != NULL; startFeature = endFeature)
    {
    for (endFeature = startFeature->next; endFeature != NULL; endFeature = endFeature->next)
        if (!sameString(startFeature->name, endFeature->name))
	    break;
    struct rbTree *rangeTree = rangeTreeNew();
    struct protFeature *feature;
    for (feature = startFeature; feature != endFeature; feature = feature->next)
	rangeTreeAdd(rangeTree, feature->start, feature->end);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
	{
	feature = highestScoringFeature(startFeature, endFeature, range->start, range->end);
        fprintf(f, "%s\t%d\t%d\t%s\n", prot->name, range->start, range->end, startFeature->name);
	fprintf(fKnownTo, "%s\t%s\t%d\t%d\t%g\n",
		prot->name, (char *)hashMustFindVal(seedToScop, startFeature->name),
		range->start, range->end, feature->eVal);
	}

    rangeTreeFree(&rangeTree);
    }
}
Ejemplo n.º 2
0
static void addBwCorrelations(struct bbiChromInfo *chrom, struct genomeRangeTree *targetGrt,
    struct bigWigValsOnChrom *aVals, struct bigWigValsOnChrom *bVals,
    struct bbiFile *aBbi, struct bbiFile *bBbi, 
    double aThreshold, double bThreshold,
    struct correlate *c, struct correlate *cInEnriched, struct correlate *cClipped)
/* Find bits of a and b that overlap and also overlap with targetRanges.  Do correlations there */
{
struct rbTree *targetRanges = genomeRangeTreeFindRangeTree(targetGrt, chrom->name);
if (bigWigValsOnChromFetchData(aVals, chrom->name, aBbi) &&
    bigWigValsOnChromFetchData(bVals, chrom->name, bBbi) )
    {
    double *a = aVals->valBuf, *b = bVals->valBuf;
    int i, end = chrom->size;
    for (i=0; i<end; ++i)
	{
	double aVal = a[i], bVal = b[i];
	correlateNext(c, aVal, bVal);
	if (aVal > aThreshold) aVal = aThreshold;
	if (bVal > bThreshold) bVal = bThreshold;
	correlateNext(cClipped, aVal, bVal);
	}
    if (targetRanges != NULL)
	{
	struct range *range, *rangeList = rangeTreeList(targetRanges);
	for (range = rangeList; range != NULL; range = range->next)
	    {
	    int start = range->start, end = range->end;
	    for (i=start; i<end; ++i)
		correlateNext(cInEnriched, a[i], b[i]);
	    }
	}
    }
}
Ejemplo n.º 3
0
void freen(char *chrom)
/* Test something */
{
uglyTime(NULL);
struct sqlConnection *conn = sqlConnect("hg19");
uglyTime("connect");
char query[512];
sqlSafef(query, sizeof(query), "select * from knownGene where chrom='%s'", chrom);
struct sqlResult *sr = sqlGetResult(conn, query);
uglyTime("get result");
char **row;
struct rbTree *rt = rangeTreeNew();
while ((row = sqlNextRow(sr)) != NULL)
    {
    struct genePred *gp = genePredLoad(row);
    int i;
    int exonCount = gp->exonCount;
    for (i=0; i<exonCount; ++i)
        rangeTreeAdd(rt, gp->exonStarts[i], gp->exonEnds[i]);
    }
uglyTime("Add rows");
struct range *list = rangeTreeList(rt);
uglyTime("Did list");
uglyf("%d items in chrom %s\n", slCount(list), chrom);
}
Ejemplo n.º 4
0
void setArrayCountsFromRangeTree(struct rbTree *rangeTree, int *array, int seqSize)
/* Given a range tree that covers stuff from 0 to seqSize, 
 * fill in array with amount of bases covered by range tree
 * before a given position in array. */
{
struct range *range, *rangeList = rangeTreeList(rangeTree);
if (rangeList == NULL)
    return;
range = rangeList;
int i, count = 0;
for (i=0; i<seqSize; ++i)
    {
    array[i] = count;
    if (i >= range->end)
	{
        range = range->next;
	if (range == NULL)
	    {
	    for (;i<seqSize; ++i)
	       array[i] = count; 
	    return;
	    }
	}
    if (range->start <= i)
        ++count;
    }
}
Ejemplo n.º 5
0
static struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi,
	char *chrom, bits32 start, bits32 end, struct lm *lm)
/* Return intervals where the val is the depth of coverage. */
{
/* Get list of overlapping intervals */
struct bigBedInterval *bi, *biList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm);
if (biList == NULL)
    return NULL;

/* Make a range tree that collects coverage. */
struct rbTree *rangeTree = rangeTreeNew();
for (bi = biList; bi != NULL; bi = bi->next)
    rangeTreeAddToCoverageDepth(rangeTree, bi->start, bi->end);
struct range *range, *rangeList = rangeTreeList(rangeTree);

/* Convert rangeList to bbiInterval list. */
struct bbiInterval *bwi, *bwiList = NULL;
for (range = rangeList; range != NULL; range = range->next)
    {
    lmAllocVar(lm, bwi);
    bwi->start = range->start;
    if (bwi->start < start)
       bwi->start = start;
    bwi->end = range->end;
    if (bwi->end > end)
       bwi->end = end;
    bwi->val = ptToInt(range->val);
    slAddHead(&bwiList, bwi);
    }
slReverse(&bwiList);

/* Clean up and go home. */
rangeTreeFree(&rangeTree);
return bwiList;
}
void sortAndMergeBlocks(struct bed *oneBed)
/* construct a range tree out of blocks, then remake blocks arrays */
/* this should automatically merge everything. */
{
struct rbTree *rt = rangeTreeNew();
struct range *rangeList = NULL;
struct range *oneRange;
int i;
for (i = 0; i < oneBed->blockCount; i++)
    {
    rangeTreeAdd(rt, oneBed->chromStarts[i], oneBed->chromStarts[i] + oneBed->blockSizes[i]);
    /* delete old (bad) blocks in bed */
    oneBed->chromStarts[i] = oneBed->blockSizes[i] = 0;
    }
rangeList = rangeTreeList(rt);
oneBed->blockCount = slCount(rangeList);
/* remake bed blocks out of range tree */
i = 0;
for (oneRange = rangeList; oneRange != NULL; oneRange = oneRange->next)
    {
    oneBed->chromStarts[i] = oneRange->start;
    oneBed->blockSizes[i] = oneRange->end - oneRange->start;
    i++;
    }
rangeTreeFree(&rt);
}
Ejemplo n.º 7
0
void edwBamToWig(char *input, char *output)
/* edwBamToWig - Convert a bam file to a wig file by measuring depth of coverage, optionally adjusting hit size to average for library.. */
{
FILE *f = mustOpen(output, "w");
/* Open file and get header for it. */
samfile_t *sf = samopen(input, "rb", NULL);
if (sf == NULL)
    errnoAbort("Couldn't open %s.\n", input);
bam_header_t *head = sf->header;
if (head == NULL)
    errAbort("Aborting ... Bad BAM header in file: %s", input);


/* Scan through input populating genome range trees */
struct genomeRangeTree *grt = genomeRangeTreeNew();
bam1_t one = {};
for (;;)
    {
    /* Read next record. */
    if (bam_read1(sf->x.bam, &one) < 0)
	break;
    if (one.core.tid >= 0 && one.core.n_cigar > 0)
	{
	char *chrom = head->target_name[one.core.tid];
	int start = one.core.pos;
	int end = start + one.core.l_qseq;
	if (one.core.flag & BAM_FREVERSE)
	    {
	    start -= clPad;
	    }
	else
	    {
	    end += clPad;
	    }
	struct rbTree *rt = genomeRangeTreeFindOrAddRangeTree(grt,chrom);
	rangeTreeAddToCoverageDepth(rt, start, end);
	}
    }


/* Convert genome range tree into output wig */

/* Get list of chromosomes. */
struct hashEl *hel, *helList = hashElListHash(grt->hash);
for (hel = helList; hel != NULL; hel = hel->next)
    {
    char *chrom = hel->name;
    struct rbTree *rt = hel->val;
    struct range *range, *rangeList = rangeTreeList(rt);
    for (range = rangeList; range != NULL; range = range->next)
         {
	 fprintf(f, "%s\t%d\t%d\t%d\n",  chrom, range->start, range->end, ptToInt(range->val));
	 }
    }

carefulClose(&f);
}
Ejemplo n.º 8
0
long rangeTreeTotalSize(struct rbTree *tree)
/* Return total size of all ranges in tree. */
{
struct range *range, *list = rangeTreeList(tree);
long total = 0;
for (range = list; range != NULL; range = range->next)
    total += range->end - range->start;
return total;
}
Ejemplo n.º 9
0
long rangeTreeRangeTreeOverlap(struct rbTree *a, struct rbTree *b)
/* Return total overlap between two range trees. */
{
struct range *range, *list = rangeTreeList(a);
long total = 0;
for (range = list; range != NULL; range = range->next)
    total += rangeTreeOverlapSize(b, range->start, range->end);
return total;
}
Ejemplo n.º 10
0
/* free slRef objects in the compRangeMap */
static void destructCompRangeMap(struct malnSet *malnSet) {
    struct hashCookie cookie = hashFirst(malnSet->compRangeMap->hash);
    struct hashEl *hel;
    while ((hel = hashNext(&cookie)) != NULL) {
        struct rbTree *rangeTree = hel->val;
        for (struct range *rng = rangeTreeList(rangeTree); rng != NULL; rng = rng->next) {
            slFreeList(&rng->val);
        }
    }
    genomeRangeTreeFree(&malnSet->compRangeMap);
}
Ejemplo n.º 11
0
void doEnrichmentsFromBed3Sample(struct bed3 *sampleList,
    struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Given a bed3 list,  calculate enrichments for targets */
{
struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList);
struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);

/* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct genomeRangeTree *grt = target->grt;
    long long uniqOverlapBases = 0;
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
        {
	struct rbTree *sampleTree = chrom->val;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct range *range, *rangeList = rangeTreeList(sampleTree);
	    for (range = rangeList; range != NULL; range = range->next)
		{
		/* Do unique base overlap counts (since using range trees both sides) */
		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
		uniqOverlapBases += overlap;
		}
	    }
	}

    /* Figure out how much we overlap allowing same bases in genome
     * to part of more than one overlap. */ 
    long long overlapBases = 0;
    struct bed3 *sample;
    for (sample = sampleList; sample != NULL; sample = sample->next)
        {
	int overlap = genomeRangeTreeOverlapSize(grt, 
	    sample->chrom, sample->chromStart, sample->chromEnd);
	overlapBases += overlap;
	}

    /* Save to database. */
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
	target, overlapBases, uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }
genomeRangeTreeFree(&sampleGrt);
hashElFreeList(&chromList);
}
void encodeMergeReplicates(int inCount, char *inNames[], char *outName)
/* encodeMergeReplicates - Merge together replicates for a pooled output.  
 * Only works on broadPeak and narrowPeak files currently. */
{
/* Make list of sources out of input files. */
struct peakSource *source, *sourceList = NULL;
int i;
for (i=0; i<inCount; ++i)
    {
    AllocVar(source);
    source->dataSource = inNames[i];
    source->chromColIx = 0;
    source->startColIx = 1;
    source->endColIx = 2;
    source->scoreColIx = SCORE_COL_IX;
    source->normFactor = 1.0;
    source->minColCount = SCORE_COL_IX+1;
    slAddTail(&sourceList, source);
    }

/* Load in from all sources. */
struct peakClusterMaker *maker = peakClusterMakerNew();
for (source = sourceList; source != NULL; source = source->next)
    peakClusterMakerAddFromSource(maker, source);

/* Cluster each chromosome. */
FILE *f = mustOpen(outName, "w");
struct hashEl *chrom, *chromList = peakClusterMakerChromList(maker);
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    struct rbTree *tree = chrom->val;
    struct range *range, *rangeList = rangeTreeList(tree);
    struct lm *lm = lmInit(0);
    for (range = rangeList; range != NULL; range = range->next)
         {
	 struct peakCluster *cluster, *clusterList = peakClusterItems(lm, range->val,
		 BIGNUM, 0.0);
	 for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
	     {
	     if (clAgree < 2 || peakClusterSourceCount(cluster) >= clAgree)
		 outputClusterNarrowPeak(cluster, f, clAdd, clGotThreshold, clThreshold,
		 	clUniqueName);
	     }
	 }
    lmCleanup(&lm);
    }
carefulClose(&f);
}
Ejemplo n.º 13
0
void biggestGapFromRangeTree(struct rbTree *rt, int chromSize, 
	int *retStart, int *retEnd, int *retSize)
/* Given a range tree filled with data for given chromosome, figure out
 * location and size of biggest gap in data. */
{
struct range *range, *rangeList = rangeTreeList(rt);
if (rangeList == NULL)
    {
    *retStart = 0;
    *retEnd = *retSize = chromSize;
    }
else
    {
    int lastEnd = 0;
    int biggestSize = 0, biggestStart = 0, biggestEnd = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int size = range->start - lastEnd;
	if (size > biggestSize)
	    {
	    biggestSize = size;
	    biggestStart = lastEnd;
	    biggestEnd = range->start;
	    }
	lastEnd = range->end;
	if (range->next == NULL)
	    {
	    size = chromSize - lastEnd;
	    if (size > biggestSize)
	        {
		biggestSize = size;
		biggestStart = lastEnd;
		biggestEnd = chromSize;
		}
	    }
	}
    *retStart = biggestStart;
    *retEnd = biggestEnd;
    *retSize = biggestSize;
    }
}
struct bed *bedFromRangeTree(struct rbTree *rangeTree, char *chrom, char *name, char *strand)
/* Create a bed based on range tree */
{
struct range *range, *rangeList = rangeTreeList(rangeTree);
if (rangeList == NULL)
    return NULL;

/* Figure out overall start and end, and blockCount.  Assumes
 * (rightly) that rangeList is sorted. */
int chromStart = rangeList->start, chromEnd = rangeList->end;
int blockCount = 1;
for (range = rangeList->next; range != NULL; range = range->next)
    {
    chromEnd = range->end;
    blockCount += 1;
    }

struct bed *bed;
AllocVar(bed);
bed->chrom = cloneString(chrom);
bed->chromStart = chromStart;
bed->chromEnd = chromEnd;
bed->name = cloneString(name);
bed->strand[0] = strand[0];
bed->blockCount = blockCount;
AllocArray(bed->blockSizes, blockCount);
AllocArray(bed->chromStarts, blockCount);
int i = 0;
for (range = rangeList; range != NULL; range = range->next)
    {
    bed->blockSizes[i] = range->end - range->start;
    bed->chromStarts[i] = range->start - chromStart;
    ++i;
    }
return bed;
}
Ejemplo n.º 15
0
struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList, 
	double forceJoinScore, double weakLevel)
/* Convert a list of items to a list of clusters of items.  This may break up clusters that
 * have weakly linked parts. 
      [                ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   gets tranformed into
       [    ]   [    ]
      AAAAAAAAAAAAAAAAAA 
       BBBBBB   DDDDDD
        CCCC     EEEE
   The strategy is to build a rangeTree of coverage, which might look something like so:
      123333211123333211 
   then define cluster ends that exceed the minimum limit, which is either weakLevel
   (usually 10%) of the highest or forceJoinScore if weakLevel times the highest is 
   more than forceJoinScore.  This will go to something like so:
        [---]   [----]   
   Finally the items that are overlapping a cluster are assigned to it.  Note that this
   may mean that an item may be in multiple clusters.
        [ABC]   [ ADE]
 */
{
int easyMax = round(1.0/weakLevel);
int itemCount = slCount(itemList);
struct peakCluster *clusterList = NULL;
if (itemCount < easyMax)
    {
    struct peakItem *item = itemList;
    int chromStart = item->chromStart;
    int chromEnd = item->chromEnd;
    for (item = item->next; item != NULL; item = item->next)
        {
	if (item->chromStart < chromStart) chromStart = item->chromStart;
	if (item->chromEnd > chromEnd) chromEnd = item->chromEnd;
	}
    addCluster(lm, itemList, chromStart, chromEnd, &clusterList);
    }
else
    {
    /* Make up coverage tree. */
    struct rbTree *covTree = rangeTreeNew();
    struct peakItem *item;
    for (item = itemList; item != NULL; item = item->next)
	rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
    struct range *range, *rangeList = rangeTreeList(covTree);

    /* Figure out maximum coverage. */
    int maxCov = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int cov = ptToInt(range->val);
	if (cov > maxCov) maxCov = cov;
	}

    /* Figure coverage threshold. */
    int threshold = round(maxCov * weakLevel);
    if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;

    /* Loop through emitting sections over threshold as clusters */
    boolean inRange = FALSE;
    boolean start = 0, end = 0;
    for (range = rangeList; range != NULL; range = range->next)
        {
	int cov = ptToInt(range->val);
	if (cov > threshold)
	    {
	    if (inRange)
	       end = range->end;
	    else
	       {
	       inRange = TRUE;
	       start = range->start;
	       end = range->end;
	       }
	    }
	else
	    {
	    if (inRange)
		{
		addCluster(lm, itemList, start, end, &clusterList);
		inRange = FALSE;
		}
	    }
	}
    if (inRange)
        addCluster(lm, itemList, start, end, &clusterList);
    }
slReverse(&clusterList);
return clusterList;
}
Ejemplo n.º 16
0
static struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, 
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

/* This gets a little complicated I'm afraid.  The strategy is to:
 *   1) Build up a range tree that represents coverage depth on that chromosome
 *      This also has the nice side effect of getting rid of overlaps.
 *   2) Stream through the range tree, outputting the initial summary level and
 *      further reducing. 
 */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
        {
	/* Grab values we want from range. */
	double val = ptToInt(range->val);
	int start = range->start;
	int end = range->end;
	bits32 size = end - start;

	/* Add to total summary. */
	if (firstTime)
	    {
	    totalSum->validCount = size;
	    totalSum->minVal = totalSum->maxVal = val;
	    totalSum->sumData = val*size;
	    totalSum->sumSquares = val*val*size;
	    firstTime = FALSE;
	    }
	else
	    {
	    totalSum->validCount += size;
	    if (val < totalSum->minVal) totalSum->minVal = val;
	    if (val > totalSum->maxVal) totalSum->maxVal = val;
	    totalSum->sumData += val*size;
	    totalSum->sumSquares += val*val*size;
	    }

	/* If start past existing block then output it. */
	if (sum != NULL && sum->end <= start && sum->end < usage->size)
	    {
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		&boundsPt, boundsEnd, lm, stream);
	    sum = NULL;
	    }
	/* If don't have a summary we're working on now, make one. */
	if (sum == NULL)
	    {
	    oneSummary.chromId = usage->id;
	    oneSummary.start = start;
	    oneSummary.end = start + initialReduction;
	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	    oneSummary.minVal = oneSummary.maxVal = val;
	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
	    oneSummary.validCount = 0;
	    sum = &oneSummary;
	    }
	/* Deal with case where might have to split an item between multiple summaries.  This
	 * loop handles all but the final affected summary in that case. */
	while (end > sum->end)
	    {
	    /* Fold in bits that overlap with existing summary and output. */
	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
	    assert(overlap > 0);
	    verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
	    sum->validCount += overlap;
	    if (sum->minVal > val) sum->minVal = val;
	    if (sum->maxVal < val) sum->maxVal = val;
	    sum->sumData += val * overlap;
	    sum->sumSquares += val*val * overlap;
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
		    &boundsPt, boundsEnd, lm, stream);
	    size -= overlap;

	    /* Move summary to next part. */
	    sum->start = start = sum->end;
	    sum->end = start + initialReduction;
	    if (sum->end > usage->size) sum->end = usage->size;
	    sum->minVal = sum->maxVal = val;
	    sum->sumData = sum->sumSquares = 0.0;
	    sum->validCount = 0;
	    }

	/* Add to summary. */
	sum->validCount += size;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * size;
	sum->sumSquares += val*val * size;
	}
    if (sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
	    &boundsPt, boundsEnd, lm, stream);
	}
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, 
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
Ejemplo n.º 17
0
void doEnrichmentsFromSampleBed(struct sqlConnection *conn, 
    struct edwFile *ef, struct edwValidFile *vf, 
    struct edwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from sample bed file. */
{
char *sampleBed = vf->sampleBed;
if (isEmpty(sampleBed))
    {
    warn("No sample bed for %s", ef->edwFileName);
    return;
    }

/* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/
struct bed3 *sample, *sampleList = bed3LoadAll(sampleBed);
if (sampleList == NULL)
    {
    warn("Sample bed is empty for %s", ef->edwFileName);
    return;
    }
struct genomeRangeTree *sampleGrt = edwMakeGrtFromBed3List(sampleList);
struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);

/* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct genomeRangeTree *grt = target->grt;
    long long uniqOverlapBases = 0;
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
        {
	struct rbTree *sampleTree = chrom->val;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct range *range, *rangeList = rangeTreeList(sampleTree);
	    for (range = rangeList; range != NULL; range = range->next)
		{
		/* Do unique base overlap counts (since using range trees both sides) */
		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
		uniqOverlapBases += overlap;
		}
	    }
	}

    /* Figure out how much we overlap allowing same bases in genome
     * to part of more than one overlap. */ 
    long long overlapBases = 0;
    for (sample = sampleList; sample != NULL; sample = sample->next)
        {
	int overlap = genomeRangeTreeOverlapSize(grt, 
	    sample->chrom, sample->chromStart, sample->chromEnd);
	overlapBases += overlap;
	}

    /* Save to database. */
    struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
	target, overlapBases, uniqOverlapBases);
    edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128);
    edwQaEnrichFree(&enrich);
    }
genomeRangeTreeFree(&sampleGrt);
bed3FreeList(&sampleList);
hashElFreeList(&chromList);
}
Ejemplo n.º 18
0
void doEnrichmentsFromBigBed(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigBedPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigBedFileOpen(bigBedPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigBed data in memory. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */
    struct lm *lm = lmInit(0);
    struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm);
    struct bigBedInterval *iv;
    struct rbTree *bbTree = rangeTreeNew();
    for (iv = ivList; iv != NULL; iv = iv->next)
	 rangeTreeAdd(bbTree, iv->start, iv->end);
    struct range *bbRange, *bbRangeList = rangeTreeList(bbTree);

    /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
        {
	if (target->skip)
	    continue;
	struct genomeRangeTree *grt = target->grt;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct bigBedInterval *iv;
	    for (iv = ivList; iv != NULL; iv = iv->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
		target->overlapBases += overlap;
		}
	    for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end);
		target->uniqOverlapBases += overlap;
		}
	    }
	}
    rangeTreeFree(&bbTree);
    lmCleanup(&lm);
    }

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
	continue;
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bbiChromInfoFreeList(&chromList);
bigBedFileClose(&bbi);
freez(&bigBedPath);
}
Ejemplo n.º 19
0
void doEnrichmentsFromBigWig(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigWigPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew();

/* This takes a while, so let's figure out what parts take the time. */
long totalBigQueryTime = 0;
long totalOverlapTime = 0;

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigWig data in memory. Also just for performance we do a lookup of target range tree to
 * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    long startBigQueryTime = clock1000();
    boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi);
    long endBigQueryTime = clock1000();
    totalBigQueryTime += endBigQueryTime - startBigQueryTime;
    if (gotData)
	{
	double *valBuf = valsOnChrom->valBuf;
	Bits *covBuf = valsOnChrom->covBuf;

	/* Loop through all targets adding overlaps from ivList */
	long startOverlapTime = clock1000();
	struct target *target;
	for (target = targetList; target != NULL; target = target->next)
	    {
	    if (target->skip)
		continue;
	    struct genomeRangeTree *grt = target->grt;
	    struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	    if (targetTree != NULL)
		{
		struct range *range, *rangeList = rangeTreeList(targetTree);
		for (range = rangeList; range != NULL; range = range->next)
		    {
		    int s = range->start, e = range->end, i;
		    for (i=s; i<=e; ++i)
		        {
			if (bitReadOne(covBuf, i))
			    {
			    double x = valBuf[i];
			    target->uniqOverlapBases += 1;
			    target->overlapBases += x;
			    }
			}
		    }
		}
	    }
	long endOverlapTime = clock1000();
	totalOverlapTime += endOverlapTime - startOverlapTime;
	}
    }

verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime);

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
	continue;
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bigWigValsOnChromFree(&valsOnChrom);
bbiChromInfoFreeList(&chromList);
bigWigFileClose(&bbi);
freez(&bigWigPath);
}
void txGeneCanonical(char *codingCluster, char *infoFile, 
	char *noncodingGraph, char *genesBed, char *nearCoding, 
	char *outCanonical, char *outIsoforms, char *outClusters)
/* txGeneCanonical - Pick a canonical version of each gene - that is the form
 * to use when just interested in a single splicing varient. Produces final
 * transcript clusters as well. */
{
/* Read in input into lists in memory. */
struct txCluster *coding, *codingList = txClusterLoadAll(codingCluster);
struct txGraph *graph, *graphList = txGraphLoadAll(noncodingGraph);
struct bed *bed, *nextBed, *bedList = bedLoadNAll(genesBed, 12);
struct txInfo *info, *infoList = txInfoLoadAll(infoFile);
struct bed *nearList = bedLoadNAll(nearCoding, 12);

/* Make hash of all beds. */
struct hash *bedHash = hashNew(18);
for (bed = bedList; bed != NULL; bed = bed->next)
    hashAdd(bedHash, bed->name, bed);

/* Make has of all info. */
struct hash *infoHash = hashNew(18);
for (info = infoList; info != NULL; info = info->next)
    hashAdd(infoHash, info->name, info);

/* Make a binKeeper structure that we'll populate with coding genes. */
struct hash *sizeHash = minChromSizeFromBeds(bedList);
struct hash *keeperHash = minChromSizeKeeperHash(sizeHash);

/* Make list of coding genes and toss them into binKeeper.
 * This will eat up bed list, but bedHash is ok. */
struct gene *gene, *geneList = NULL;
for (coding = codingList; coding != NULL; coding = coding->next)
    {
    gene = geneFromCluster(coding, bedHash, infoHash);
    slAddHead(&geneList, gene);
    struct binKeeper *bk = hashMustFindVal(keeperHash, gene->chrom);
    binKeeperAdd(bk, gene->start, gene->end, gene);
    }

/* Go through near-coding genes and add them to the coding gene
 * they most overlap. */
for (bed = nearList; bed != NULL; bed = nextBed)
    {
    nextBed = bed->next;
    gene = mostOverlappingGene(keeperHash, bed);
    if (gene == NULL)
        errAbort("%s is near coding, but doesn't overlap any coding!?", bed->name);
    geneAddBed(gene, bed);
    }

/* Add non-coding genes. */
for (graph = graphList; graph != NULL; graph = graph->next)
    {
    gene = geneFromGraph(graph, bedHash);
    slAddHead(&geneList, gene);
    }

/* Sort so it all looks nicer. */
slSort(&geneList, geneCmp);

/* Open up output files. */
FILE *fCan = mustOpen(outCanonical, "w");
FILE *fIso = mustOpen(outIsoforms, "w");
FILE *fClus = mustOpen(outClusters, "w");

/* Loop through, making up gene name, and writing output. */
int geneId = 0;
for (gene = geneList; gene != NULL; gene = gene->next)
    {
    /* Make up name. */
    char name[16];
    safef(name, sizeof(name), "g%05d", ++geneId);

    /* Reverse transcript list just to make it look better. */
    slReverse(&gene->txList);

    /* Write out canonical file output */
    bed = hashMustFindVal(bedHash, gene->niceTx->name);
    fprintf(fCan, "%s\t%d\t%d\t%d\t%s\t%s\n",
    	bed->chrom, bed->chromStart, bed->chromEnd, geneId,
	gene->niceTx->name, gene->niceTx->name);

    /* Write out isoforms output. */
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fIso, "%d\t%s\n", geneId, bed->name);

    /* Write out cluster output, starting with bed 6 standard fields. */
    fprintf(fClus, "%s\t%d\t%d\t%s\t%d\t%c\t",
    	gene->chrom, gene->start, gene->end, name, 0, gene->strand);

    /* Write out thick-start/thick end. */
    if (gene->isCoding)
        {
	int thickStart = gene->end, thickEnd  = gene->start;
	for (bed = gene->txList; bed != NULL; bed = bed->next)
	    {
	    if (bed->thickStart < bed->thickEnd)
	        {
		thickStart = min(thickStart, bed->thickStart);
		thickEnd = max(thickEnd, bed->thickEnd);
		}
	    }
	fprintf(fClus, "%d\t%d\t", thickStart, thickEnd);
	}
    else
        {
	fprintf(fClus, "%d\t%d\t", gene->start, gene->start);
	}

    /* We got no rgb value, just write out zero. */
    fprintf(fClus, "0\t");

    /* Get exons from exonTree. */
    struct range *exon, *exonList = rangeTreeList(gene->exonTree);
    fprintf(fClus, "%d\t", slCount(exonList));
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->start - gene->start);
    fprintf(fClus, "\t");
    for (exon = exonList; exon != NULL; exon = exon->next)
	fprintf(fClus, "%d,", exon->end - exon->start);
    fprintf(fClus, "\t");

    /* Write out associated transcripts. */
    fprintf(fClus, "%d\t", slCount(gene->txList));
    for (bed = gene->txList; bed != NULL; bed = bed->next)
        fprintf(fClus, "%s,", bed->name);
    fprintf(fClus, "\t");

    /* Write out nice value */
    fprintf(fClus, "%s\t", gene->niceTx->name);

    /* Write out coding/noncoding value. */
    fprintf(fClus, "%d\n", gene->isCoding);
    }

/* Close up files. */
carefulClose(&fCan);
carefulClose(&fIso);
carefulClose(&fClus);
}
Ejemplo n.º 21
0
static void mergeDoubleSofts(struct rbTree *vertexTree, struct rbTree *edgeTree)
/* Merge together overlapping edges with soft ends. */
{
struct mergedEdge
/* Hold together info on a merged edge. */
    {
    struct evidence *evidence;
    };

/* Traverse graph and build up range tree.  Each node in the range tree
 * will represent the bounds of coordinates of overlapping double softs */
struct rbTree *rangeTree = rangeTreeNew(0);
struct slRef *edgeRef, *edgeRefList = rbTreeItems(edgeTree);
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start = edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        rangeTreeAdd(rangeTree, start->position, end->position);
    }

/* Traverse graph again merging edges */
for (edgeRef = edgeRefList; edgeRef != NULL; edgeRef = edgeRef->next)
    {
    struct edge *edge = edgeRef->val;
    struct vertex *start= edge->start;
    struct vertex *end = edge->end;
    if (start->type == ggSoftStart && end->type == ggSoftEnd)
        {
	struct range *r = rangeTreeFindEnclosing(rangeTree,
		start->position, end->position);
	assert(r != NULL);
	/* At this point, r represents the bounds of a double-soft
	 * region that encompasses this edge.  Collect the set of
	 * evidence of edges overlapping this range */
        struct mergedEdge *mergeEdge = r->val;
        if (mergeEdge == NULL)
            {
            lmAllocVar(rangeTree->lm, mergeEdge);
            r->val = mergeEdge;
            }
        mergeEdge->evidence = slCat(edge->evList, mergeEdge->evidence);
	verbose(3, "Merging doubly-soft edge (%d,%d) into range (%d,%d)\n", 
		start->position, end->position, r->start, r->end);
        edge->evList = NULL;
        rbTreeRemove(edgeTree, edge);
	}
    }

/* Traverse merged edge list, making a single edge from each range. At this point,
 * each range will have some evidence attached to it, from each of the double softs
 * that fall within the range.  From all of this evidence, make a single consensus edge */
struct range *r;
struct lm *lm = lmInit(0);
for (r = rangeTreeList(rangeTree); r != NULL; r = r->next)
    {
    struct mergedEdge *mergedEdge = r->val;
    struct edge *edge = edgeFromConsensusOfEvidence(vertexTree, mergedEdge->evidence, lm);
    if (edge != NULL)
        rbTreeAdd(edgeTree, edge);
    verbose(3, "Deriving edge (%d,%d) from all the double softs in range (%d,%d)\n", 
	    edge->start->position, edge->end->position, r->start, r->end);
    }


/* Clean up and go home. */
lmCleanup(&lm);
removeUnusedVertices(vertexTree, edgeTree);
slFreeList(&edgeRefList);
rbTreeFree(&rangeTree);
}