Esempio n. 1
0
void doBed3Replicate(struct sqlConnection *conn, char *format, struct cdwAssembly *assembly,
    struct cdwFile *elderEf, struct cdwValidFile *elderVf, struct bed3 *elderBedList,
    struct cdwFile *youngerEf, struct cdwValidFile *youngerVf, struct bed3 *youngerBedList)
/* Do correlation analysis between elder and younger bedLists and save result to
 * a new cdwQaPairSampleOverlap record. Do this for a format where we have a bed3 sample file. */
{
struct cdwQaPairSampleOverlap *sam;
AllocVar(sam);
sam->elderFileId = elderVf->fileId;
sam->youngerFileId = youngerVf->fileId;
sam->elderSampleBases = elderVf->basesInSample;
sam->youngerSampleBases = youngerVf->basesInSample;

/* Load up elder into genome range tree. */
struct genomeRangeTree *elderGrt = cdwMakeGrtFromBed3List(elderBedList);

/* Load up younger as bed, and loop through to get overlap */
long long totalOverlap = 0;
struct bed3 *bed;
for (bed = youngerBedList; bed != NULL; bed = bed->next)
    {
    int overlap = genomeRangeTreeOverlapSize(elderGrt, 
	bed->chrom, bed->chromStart, bed->chromEnd);
    totalOverlap += overlap;
    }
sam->sampleOverlapBases = totalOverlap;
setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf);

/* Save to database, clean up, go home. */
cdwQaPairSampleOverlapSaveToDb(conn, sam, "cdwQaPairSampleOverlap", 128);
freez(&sam);
genomeRangeTreeFree(&elderGrt);
}
Esempio n. 2
0
void doBigWigReplicate(struct sqlConnection *conn, struct edwAssembly *assembly,
    struct edwFile *elderEf, struct edwValidFile *elderVf,
    struct edwFile *youngerEf, struct edwValidFile *youngerVf)
/* Do correlation analysis between elder and younger and save result to
 * a new edwQaPairCorrelation record. Do this for a format where we have a bigWig file. */
{
if (pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation"))
    return;
char *enrichedIn = elderVf->enrichedIn;
if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown"))
    {
    struct genomeRangeTree *targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn);

    /* Get open big wig files for both younger and older. */
    char *elderPath = edwPathForFileId(conn, elderEf->id);
    char *youngerPath = edwPathForFileId(conn, youngerEf->id);
    struct bbiFile *elderBbi = bigWigFileOpen(elderPath);
    struct bbiFile *youngerBbi = bigWigFileOpen(youngerPath);

    /* Figure out thresholds */
    double elderThreshold = twoStdsOverMean(elderBbi);
    double youngerThreshold = twoStdsOverMean(youngerBbi);

    /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/
    struct correlate *c = correlateNew(), *cInEnriched = correlateNew(), *cClipped = correlateNew();
    struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi);
    struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew();
    struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew();
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
        {
	addBwCorrelations(chrom, targetGrt, aVals, bVals, elderBbi, youngerBbi, 
	    elderThreshold, youngerThreshold, c, cInEnriched, cClipped);
	}

    /* Make up correlation structure . */
    struct edwQaPairCorrelation *cor;
    AllocVar(cor);
    cor->elderFileId = elderVf->fileId;
    cor->youngerFileId = youngerVf->fileId;
    cor->pearsonOverall = correlateResult(c);
    cor->pearsonInEnriched = correlateResult(cInEnriched);
    cor->pearsonClipped = correlateResult(cClipped);
    edwQaPairCorrelationSaveToDb(conn, cor, "edwQaPairCorrelation", 128);


    bigWigValsOnChromFree(&bVals);
    bigWigValsOnChromFree(&aVals);
    genomeRangeTreeFree(&targetGrt);
    freez(&cor);
    correlateFree(&c);
    bigWigFileClose(&youngerBbi);
    bigWigFileClose(&elderBbi);
    freez(&youngerPath);
    freez(&elderPath);
    }
}
Esempio n. 3
0
/* free slRef objects in the compRangeMap */
static void destructCompRangeMap(struct malnSet *malnSet) {
    struct hashCookie cookie = hashFirst(malnSet->compRangeMap->hash);
    struct hashEl *hel;
    while ((hel = hashNext(&cookie)) != NULL) {
        struct rbTree *rangeTree = hel->val;
        for (struct range *rng = rangeTreeList(rangeTree); rng != NULL; rng = rng->next) {
            slFreeList(&rng->val);
        }
    }
    genomeRangeTreeFree(&malnSet->compRangeMap);
}
Esempio n. 4
0
void doEnrichmentsFromBed3Sample(struct bed3 *sampleList,
    struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Given a bed3 list,  calculate enrichments for targets */
{
struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList);
struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);

/* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct genomeRangeTree *grt = target->grt;
    long long uniqOverlapBases = 0;
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
        {
	struct rbTree *sampleTree = chrom->val;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct range *range, *rangeList = rangeTreeList(sampleTree);
	    for (range = rangeList; range != NULL; range = range->next)
		{
		/* Do unique base overlap counts (since using range trees both sides) */
		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
		uniqOverlapBases += overlap;
		}
	    }
	}

    /* Figure out how much we overlap allowing same bases in genome
     * to part of more than one overlap. */ 
    long long overlapBases = 0;
    struct bed3 *sample;
    for (sample = sampleList; sample != NULL; sample = sample->next)
        {
	int overlap = genomeRangeTreeOverlapSize(grt, 
	    sample->chrom, sample->chromStart, sample->chromEnd);
	overlapBases += overlap;
	}

    /* Save to database. */
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
	target, overlapBases, uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }
genomeRangeTreeFree(&sampleGrt);
hashElFreeList(&chromList);
}
Esempio n. 5
0
void edwAlignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly,
    char *fastqPath, struct edwValidFile *vf, FILE *bedF,
    double *retMapRatio,  double *retDepth,  double *retSampleCoverage)
/* Take a sample fastq and run bwa on it, and then convert that file to a bed. 
 * bedF and all the ret parameters can be NULL. */
{
/* Hmm, tried doing this with Mark's pipeline code, but somehow it would be flaky the
 * second time it was run in same app.  Resorting therefore to temp files. */
char genoFile[PATH_LEN];
safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", 
    edwValDataDir, assembly->ucscDb, assembly->ucscDb);

char cmd[3*PATH_LEN];
char *saiName = cloneString(rTempName(edwTempDir(), "edwSample1", ".sai"));
safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, fastqPath, saiName);
mustSystem(cmd);

char *samName = cloneString(rTempName(edwTempDir(), "ewdSample1", ".sam"));
safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", genoFile, saiName, fastqPath, samName);
mustSystem(cmd);
remove(saiName);

/* Scan sam file to calculate vf->mapRatio, vf->sampleCoverage and vf->depth. 
 * and also to produce little bed file for enrichment step. */
struct genomeRangeTree *grt = genomeRangeTreeNew();
long long hitCount=0, missCount=0, totalBasesInHits=0;
scanSam(samName, bedF, grt, &hitCount, &missCount, &totalBasesInHits);
verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", 
    hitCount, missCount, totalBasesInHits, grt);
if (retMapRatio)
    *retMapRatio = (double)hitCount/(hitCount+missCount);
if (retDepth)
    *retDepth = (double)totalBasesInHits/assembly->baseCount 
	    * (double)vf->itemCount/vf->sampleCount;
long long basesHitBySample = genomeRangeTreeSumRanges(grt);
if (retSampleCoverage)
    *retSampleCoverage = (double)basesHitBySample/assembly->baseCount;
genomeRangeTreeFree(&grt);
remove(samName);
}
struct correlate *bigWigCorrelate(char *aFileName, char *bFileName)
/* bigWigCorrelate - Correlate bigWig files, optionally only on target regions.. */
{
struct genomeRangeTree *targetGrt = NULL;
if (restrictFile)
    targetGrt = grtFromBigBed(restrictFile);
struct bbiFile *aBbi = bigWigFileOpen(aFileName);
struct bbiFile *bBbi = bigWigFileOpen(bFileName);
struct correlate *c = correlateNew();
struct bbiChromInfo *chrom, *chromList = bbiChromList(aBbi);
struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew();
struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew();
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    addBwCorrelations(chrom, targetGrt, aVals, bVals, aBbi, bBbi, threshold, threshold, c);
    }
bigWigValsOnChromFree(&aVals);
bigWigValsOnChromFree(&bVals);
bbiFileClose(&aBbi);
bbiFileClose(&bBbi);
genomeRangeTreeFree(&targetGrt);
return c;
}
Esempio n. 7
0
void calculateBinomialP(char* regdomFn, char* antigapFn, int totalRegions, int hitRegions)
/* Calculate binomial p-value of enrichment based on regulatory domains and regions hit */
{
	struct regdom* regdoms = readInitializedRegdomFile(regdomFn);

	// This will hold the union of all regulatory domains for quick search
	struct genomeRangeTree *ranges = getRangeTreeOfRegdoms(regdoms);

	// NOTE: Each of these regions must be non-overlapping.
	struct bed* antigaps = bedLoadAll(antigapFn);
	long totalNonGapBases = getTotalNonGapBases(antigaps);
	long annotatedNonGapBases = getAnnotatedNonGapBases(ranges, antigaps);

	double annotationWeight = (double)annotatedNonGapBases/(double)totalNonGapBases;

	double binomP = getBinomPval(totalRegions, hitRegions, annotationWeight);

	printf("%e\n", binomP);

	regdomFreeList(&regdoms);
	bedFreeList(&antigaps);
	genomeRangeTreeFree(&ranges);
}
/* entry */
int main(int argc, char** argv)
{
char *baseMask1, *baseMask2, *obama;
struct genomeRangeTreeFile *tf1, *tf2;
struct genomeRangeTree *t1, *t2;
unsigned size = 0;
int nodes, numChroms;
optionInit(&argc, argv, optionSpecs);
boolean and = optionExists("and");
boolean or = optionExists("or");
boolean quiet = optionExists("quiet");
boolean saveMem = optionExists("saveMem");
boolean orDirectToFile = optionExists("orDirectToFile");
--argc;
++argv;
if (argc==0)
    usage("");
if (argc > 3)
    usage("wrong # args\n");
if (argc == 1 && (and || or))
    usage("specify second file for options: -and or -or\n");
if (argc >= 2 && ((and && or) || (!and && !or)))
    usage("specify only one of the options: -and or -or\n");

baseMask1 = argv[0];
baseMask2 = (argc > 1 ? argv[1] : NULL);
obama = (argc > 2 ? argv[2] : NULL);

if (argc == 1)
    {
    if (!quiet)
	{
	genomeRangeTreeFileStats(baseMask1, &numChroms, &nodes, &size);
	fprintf(stderr, "%d bases in %d ranges in %d chroms in baseMask\n", size, nodes, numChroms);
	}
    }
else
    {
    tf1 = genomeRangeTreeFileReadHeader(baseMask1);
    tf2 = genomeRangeTreeFileReadHeader(baseMask2);
    if (and)
	{
	genomeRangeTreeFileIntersectionDetailed(tf1, tf2, obama, &numChroms, &nodes, 
	    (quiet ? NULL : &size), saveMem);
	if (!quiet)
	    fprintf(stderr, "%d bases in %d ranges in %d chroms in intersection\n", size, nodes, numChroms);
	}
    else if (or)
	{
	genomeRangeTreeFileUnionDetailed(tf1, tf2, obama, &numChroms, &nodes, 
	    (quiet ? NULL : &size), saveMem, orDirectToFile);
	if (!quiet)
	    fprintf(stderr, "%d bases in %d ranges in %d chroms in union\n", size, nodes, numChroms);
	}
    t1 = genomeRangeTreeFileFree(&tf1);
    genomeRangeTreeFree(&t1);
    t2 = genomeRangeTreeFileFree(&tf2);
    genomeRangeTreeFree(&t2);
    }
return 0;
}
Esempio n. 9
0
void doBigBedReplicate(struct sqlConnection *conn, char *format, struct edwAssembly *assembly,
    struct edwFile *elderEf, struct edwValidFile *elderVf,
    struct edwFile *youngerEf, struct edwValidFile *youngerVf)
/* Do correlation analysis between elder and younger and save result to
 * a new edwQaPairCorrelation record. Do this for a format where we have a bigBed file. */
{
/* If got both pairs, work is done already */
if (pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairSampleOverlap") 
    && pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation"))
    return;

int numColIx = 0;
if (sameString(format, "narrowPeak") || sameString(format, "broadPeak"))
    numColIx = 6;	// signalVal
else
    numColIx = 4;	// score
numColIx -= 3;		// Subtract off chrom/start/end
char *enrichedIn = elderVf->enrichedIn;
struct genomeRangeTree *targetGrt = NULL;
if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown"))
    targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn);

/* Get open big bed files for both younger and older. */
char *elderPath = edwPathForFileId(conn, elderEf->id);
char *youngerPath = edwPathForFileId(conn, youngerEf->id);
struct bbiFile *elderBbi = bigBedFileOpen(elderPath);
struct bbiFile *youngerBbi = bigBedFileOpen(youngerPath);

/* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/
struct correlate *c = correlateNew(), *cInEnriched = correlateNew();
struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi);
long long elderTotalSpan = 0, youngerTotalSpan = 0, overlapTotalSpan = 0;
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    addBbCorrelations(chrom, targetGrt, elderBbi, youngerBbi, numColIx, c, cInEnriched,
	&elderTotalSpan, &youngerTotalSpan, &overlapTotalSpan);
    }

/* Make up correlation structure and save. */
if (!pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation"))
    {
    struct edwQaPairCorrelation *cor;
    AllocVar(cor);
    cor->elderFileId = elderVf->fileId;
    cor->youngerFileId = youngerVf->fileId;
    cor->pearsonOverall = correlateResult(c);
    cor->pearsonInEnriched = correlateResult(cInEnriched);
    edwQaPairCorrelationSaveToDb(conn, cor, "edwQaPairCorrelation", 128);
    freez(&cor);
    }

/* Also make up sample structure and save.  */
if (!pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairSampleOverlap"))
    {
    struct edwQaPairSampleOverlap *sam;
    AllocVar(sam);
    sam->elderFileId = elderVf->fileId;
    sam->youngerFileId = youngerVf->fileId;
    sam->elderSampleBases = elderTotalSpan;
    sam->youngerSampleBases = youngerTotalSpan;
    sam->sampleOverlapBases = overlapTotalSpan;
    setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf);
    edwQaPairSampleOverlapSaveToDb(conn, sam, "edwQaPairSampleOverlap", 128);
    freez(&sam);
    }

genomeRangeTreeFree(&targetGrt);
correlateFree(&c);
bigBedFileClose(&youngerBbi);
bigBedFileClose(&elderBbi);
freez(&youngerPath);
freez(&elderPath);
}
Esempio n. 10
0
void doEnrichmentsFromSampleBed(struct sqlConnection *conn, 
    struct edwFile *ef, struct edwValidFile *vf, 
    struct edwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from sample bed file. */
{
char *sampleBed = vf->sampleBed;
if (isEmpty(sampleBed))
    {
    warn("No sample bed for %s", ef->edwFileName);
    return;
    }

/* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/
struct bed3 *sample, *sampleList = bed3LoadAll(sampleBed);
if (sampleList == NULL)
    {
    warn("Sample bed is empty for %s", ef->edwFileName);
    return;
    }
struct genomeRangeTree *sampleGrt = edwMakeGrtFromBed3List(sampleList);
struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash);

/* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct genomeRangeTree *grt = target->grt;
    long long uniqOverlapBases = 0;
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
        {
	struct rbTree *sampleTree = chrom->val;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    struct range *range, *rangeList = rangeTreeList(sampleTree);
	    for (range = rangeList; range != NULL; range = range->next)
		{
		/* Do unique base overlap counts (since using range trees both sides) */
		int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end);
		uniqOverlapBases += overlap;
		}
	    }
	}

    /* Figure out how much we overlap allowing same bases in genome
     * to part of more than one overlap. */ 
    long long overlapBases = 0;
    for (sample = sampleList; sample != NULL; sample = sample->next)
        {
	int overlap = genomeRangeTreeOverlapSize(grt, 
	    sample->chrom, sample->chromStart, sample->chromEnd);
	overlapBases += overlap;
	}

    /* Save to database. */
    struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly,
	target, overlapBases, uniqOverlapBases);
    edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128);
    edwQaEnrichFree(&enrich);
    }
genomeRangeTreeFree(&sampleGrt);
bed3FreeList(&sampleList);
hashElFreeList(&chromList);
}