void doBed3Replicate(struct sqlConnection *conn, char *format, struct cdwAssembly *assembly, struct cdwFile *elderEf, struct cdwValidFile *elderVf, struct bed3 *elderBedList, struct cdwFile *youngerEf, struct cdwValidFile *youngerVf, struct bed3 *youngerBedList) /* Do correlation analysis between elder and younger bedLists and save result to * a new cdwQaPairSampleOverlap record. Do this for a format where we have a bed3 sample file. */ { struct cdwQaPairSampleOverlap *sam; AllocVar(sam); sam->elderFileId = elderVf->fileId; sam->youngerFileId = youngerVf->fileId; sam->elderSampleBases = elderVf->basesInSample; sam->youngerSampleBases = youngerVf->basesInSample; /* Load up elder into genome range tree. */ struct genomeRangeTree *elderGrt = cdwMakeGrtFromBed3List(elderBedList); /* Load up younger as bed, and loop through to get overlap */ long long totalOverlap = 0; struct bed3 *bed; for (bed = youngerBedList; bed != NULL; bed = bed->next) { int overlap = genomeRangeTreeOverlapSize(elderGrt, bed->chrom, bed->chromStart, bed->chromEnd); totalOverlap += overlap; } sam->sampleOverlapBases = totalOverlap; setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf); /* Save to database, clean up, go home. */ cdwQaPairSampleOverlapSaveToDb(conn, sam, "cdwQaPairSampleOverlap", 128); freez(&sam); genomeRangeTreeFree(&elderGrt); }
void doBigWigReplicate(struct sqlConnection *conn, struct edwAssembly *assembly, struct edwFile *elderEf, struct edwValidFile *elderVf, struct edwFile *youngerEf, struct edwValidFile *youngerVf) /* Do correlation analysis between elder and younger and save result to * a new edwQaPairCorrelation record. Do this for a format where we have a bigWig file. */ { if (pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation")) return; char *enrichedIn = elderVf->enrichedIn; if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown")) { struct genomeRangeTree *targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn); /* Get open big wig files for both younger and older. */ char *elderPath = edwPathForFileId(conn, elderEf->id); char *youngerPath = edwPathForFileId(conn, youngerEf->id); struct bbiFile *elderBbi = bigWigFileOpen(elderPath); struct bbiFile *youngerBbi = bigWigFileOpen(youngerPath); /* Figure out thresholds */ double elderThreshold = twoStdsOverMean(elderBbi); double youngerThreshold = twoStdsOverMean(youngerBbi); /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/ struct correlate *c = correlateNew(), *cInEnriched = correlateNew(), *cClipped = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi); struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew(); struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew(); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBwCorrelations(chrom, targetGrt, aVals, bVals, elderBbi, youngerBbi, elderThreshold, youngerThreshold, c, cInEnriched, cClipped); } /* Make up correlation structure . 
*/ struct edwQaPairCorrelation *cor; AllocVar(cor); cor->elderFileId = elderVf->fileId; cor->youngerFileId = youngerVf->fileId; cor->pearsonOverall = correlateResult(c); cor->pearsonInEnriched = correlateResult(cInEnriched); cor->pearsonClipped = correlateResult(cClipped); edwQaPairCorrelationSaveToDb(conn, cor, "edwQaPairCorrelation", 128); bigWigValsOnChromFree(&bVals); bigWigValsOnChromFree(&aVals); genomeRangeTreeFree(&targetGrt); freez(&cor); correlateFree(&c); bigWigFileClose(&youngerBbi); bigWigFileClose(&elderBbi); freez(&youngerPath); freez(&elderPath); } }
/* free slRef objects in the compRangeMap */ static void destructCompRangeMap(struct malnSet *malnSet) { struct hashCookie cookie = hashFirst(malnSet->compRangeMap->hash); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) { struct rbTree *rangeTree = hel->val; for (struct range *rng = rangeTreeList(rangeTree); rng != NULL; rng = rng->next) { slFreeList(&rng->val); } } genomeRangeTreeFree(&malnSet->compRangeMap); }
void doEnrichmentsFromBed3Sample(struct bed3 *sampleList, struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Given a bed3 list, calculate enrichments for targets */ { struct genomeRangeTree *sampleGrt = cdwMakeGrtFromBed3List(sampleList); struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash); /* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; long long uniqOverlapBases = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *sampleTree = chrom->val; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(sampleTree); for (range = rangeList; range != NULL; range = range->next) { /* Do unique base overlap counts (since using range trees both sides) */ int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end); uniqOverlapBases += overlap; } } } /* Figure out how much we overlap allowing same bases in genome * to part of more than one overlap. */ long long overlapBases = 0; struct bed3 *sample; for (sample = sampleList; sample != NULL; sample = sample->next) { int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom, sample->chromStart, sample->chromEnd); overlapBases += overlap; } /* Save to database. */ struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, overlapBases, uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } genomeRangeTreeFree(&sampleGrt); hashElFreeList(&chromList); }
void edwAlignFastqMakeBed(struct edwFile *ef, struct edwAssembly *assembly, char *fastqPath, struct edwValidFile *vf, FILE *bedF, double *retMapRatio, double *retDepth, double *retSampleCoverage) /* Take a sample fastq and run bwa on it, and then convert that file to a bed. * bedF and all the ret parameters can be NULL. */ { /* Hmm, tried doing this with Mark's pipeline code, but somehow it would be flaky the * second time it was run in same app. Resorting therefore to temp files. */ char genoFile[PATH_LEN]; safef(genoFile, sizeof(genoFile), "%s%s/bwaData/%s.fa", edwValDataDir, assembly->ucscDb, assembly->ucscDb); char cmd[3*PATH_LEN]; char *saiName = cloneString(rTempName(edwTempDir(), "edwSample1", ".sai")); safef(cmd, sizeof(cmd), "bwa aln -t 3 %s %s > %s", genoFile, fastqPath, saiName); mustSystem(cmd); char *samName = cloneString(rTempName(edwTempDir(), "ewdSample1", ".sam")); safef(cmd, sizeof(cmd), "bwa samse %s %s %s > %s", genoFile, saiName, fastqPath, samName); mustSystem(cmd); remove(saiName); /* Scan sam file to calculate vf->mapRatio, vf->sampleCoverage and vf->depth. * and also to produce little bed file for enrichment step. */ struct genomeRangeTree *grt = genomeRangeTreeNew(); long long hitCount=0, missCount=0, totalBasesInHits=0; scanSam(samName, bedF, grt, &hitCount, &missCount, &totalBasesInHits); verbose(1, "hitCount=%lld, missCount=%lld, totalBasesInHits=%lld, grt=%p\n", hitCount, missCount, totalBasesInHits, grt); if (retMapRatio) *retMapRatio = (double)hitCount/(hitCount+missCount); if (retDepth) *retDepth = (double)totalBasesInHits/assembly->baseCount * (double)vf->itemCount/vf->sampleCount; long long basesHitBySample = genomeRangeTreeSumRanges(grt); if (retSampleCoverage) *retSampleCoverage = (double)basesHitBySample/assembly->baseCount; genomeRangeTreeFree(&grt); remove(samName); }
struct correlate *bigWigCorrelate(char *aFileName, char *bFileName) /* bigWigCorrelate - Correlate bigWig files, optionally only on target regions.. */ { struct genomeRangeTree *targetGrt = NULL; if (restrictFile) targetGrt = grtFromBigBed(restrictFile); struct bbiFile *aBbi = bigWigFileOpen(aFileName); struct bbiFile *bBbi = bigWigFileOpen(bFileName); struct correlate *c = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(aBbi); struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew(); struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew(); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBwCorrelations(chrom, targetGrt, aVals, bVals, aBbi, bBbi, threshold, threshold, c); } bigWigValsOnChromFree(&aVals); bigWigValsOnChromFree(&bVals); bbiFileClose(&aBbi); bbiFileClose(&bBbi); genomeRangeTreeFree(&targetGrt); return c; }
void calculateBinomialP(char* regdomFn, char* antigapFn, int totalRegions, int hitRegions)
/* Calculate binomial p-value of enrichment based on regulatory domains and regions hit,
 * and print it to stdout in %e format followed by a newline.
 *   regdomFn     - regulatory domain file, read with readInitializedRegdomFile()
 *   antigapFn    - bed file read with bedLoadAll(); its regions must not overlap
 *   totalRegions, hitRegions - passed straight through to getBinomPval() */
{
struct regdom *regdoms = readInitializedRegdomFile(regdomFn);

/* This will hold the union of all regulatory domains for quick search */
struct genomeRangeTree *ranges = getRangeTreeOfRegdoms(regdoms);

/* NOTE: Each of these regions must be non-overlapping. */
struct bed *antigaps = bedLoadAll(antigapFn);

long totalNonGapBases = getTotalNonGapBases(antigaps);
long annotatedNonGapBases = getAnnotatedNonGapBases(ranges, antigaps);

/* Fraction of non-gap bases covered by a regulatory domain. */
double annotationWeight = (double)annotatedNonGapBases/(double)totalNonGapBases;

double binomP = getBinomPval(totalRegions, hitRegions, annotationWeight);
printf("%e\n", binomP);

/* The argument here had been corrupted into a (R) symbol followed by "doms";
 * it is the address of regdoms. */
regdomFreeList(&regdoms);
bedFreeList(&antigaps);
genomeRangeTreeFree(&ranges);
}
/* Entry point.  With one file argument, prints statistics for that base mask to
 * stderr (unless -quiet).  With two or three arguments, combines the first two
 * base masks using exactly one of -and / -or, forwarding the optional third
 * argument to the combining routine, and reports the result to stderr unless
 * -quiet.  Always returns 0 when it reaches the end. */
int main(int argc, char **argv)
{
unsigned size = 0;
int nodes = 0, numChroms = 0;   /* zeroed in case a callee leaves them untouched */

optionInit(&argc, argv, optionSpecs);
boolean doAnd = optionExists("and");
boolean doOr = optionExists("or");
boolean quiet = optionExists("quiet");
boolean saveMem = optionExists("saveMem");
boolean orDirectToFile = optionExists("orDirectToFile");

/* Skip the program name so argc counts file arguments only. */
--argc;
++argv;
if (argc == 0)
    usage("");
if (argc > 3)
    usage("wrong # args\n");
if (argc == 1 && (doAnd || doOr))
    usage("specify second file for options: -and or -or\n");
if (argc >= 2 && ((doAnd && doOr) || (!doAnd && !doOr)))
    usage("specify only one of the options: -and or -or\n");

char *baseMask1 = argv[0];
char *baseMask2 = (argc > 1 ? argv[1] : NULL);
/* Optional third argument; presumably the output base mask - confirm against usage(). */
char *outMask = (argc > 2 ? argv[2] : NULL);

if (argc == 1)
    {
    if (!quiet)
        {
        genomeRangeTreeFileStats(baseMask1, &numChroms, &nodes, &size);
        /* size is unsigned, so print it with %u rather than %d. */
        fprintf(stderr, "%u bases in %d ranges in %d chroms in baseMask\n",
            size, nodes, numChroms);
        }
    }
else
    {
    struct genomeRangeTreeFile *tf1 = genomeRangeTreeFileReadHeader(baseMask1);
    struct genomeRangeTreeFile *tf2 = genomeRangeTreeFileReadHeader(baseMask2);
    if (doAnd)
        {
        /* A NULL size pointer is passed in quiet mode. */
        genomeRangeTreeFileIntersectionDetailed(tf1, tf2, outMask, &numChroms, &nodes,
            (quiet ? NULL : &size), saveMem);
        if (!quiet)
            fprintf(stderr, "%u bases in %d ranges in %d chroms in intersection\n",
                size, nodes, numChroms);
        }
    else if (doOr)
        {
        genomeRangeTreeFileUnionDetailed(tf1, tf2, outMask, &numChroms, &nodes,
            (quiet ? NULL : &size), saveMem, orDirectToFile);
        if (!quiet)
            fprintf(stderr, "%u bases in %d ranges in %d chroms in union\n",
                size, nodes, numChroms);
        }
    /* Freeing each file handle hands back a tree, which is then freed in turn. */
    struct genomeRangeTree *t1 = genomeRangeTreeFileFree(&tf1);
    genomeRangeTreeFree(&t1);
    struct genomeRangeTree *t2 = genomeRangeTreeFileFree(&tf2);
    genomeRangeTreeFree(&t2);
    }
return 0;
}
void doBigBedReplicate(struct sqlConnection *conn, char *format, struct edwAssembly *assembly, struct edwFile *elderEf, struct edwValidFile *elderVf, struct edwFile *youngerEf, struct edwValidFile *youngerVf) /* Do correlation analysis between elder and younger and save result to * a new edwQaPairCorrelation record. Do this for a format where we have a bigBed file. */ { /* If got both pairs, work is done already */ if (pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairSampleOverlap") && pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation")) return; int numColIx = 0; if (sameString(format, "narrowPeak") || sameString(format, "broadPeak")) numColIx = 6; // signalVal else numColIx = 4; // score numColIx -= 3; // Subtract off chrom/start/end char *enrichedIn = elderVf->enrichedIn; struct genomeRangeTree *targetGrt = NULL; if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown")) targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn); /* Get open big bed files for both younger and older. */ char *elderPath = edwPathForFileId(conn, elderEf->id); char *youngerPath = edwPathForFileId(conn, youngerEf->id); struct bbiFile *elderBbi = bigBedFileOpen(elderPath); struct bbiFile *youngerBbi = bigBedFileOpen(youngerPath); /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/ struct correlate *c = correlateNew(), *cInEnriched = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi); long long elderTotalSpan = 0, youngerTotalSpan = 0, overlapTotalSpan = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBbCorrelations(chrom, targetGrt, elderBbi, youngerBbi, numColIx, c, cInEnriched, &elderTotalSpan, &youngerTotalSpan, &overlapTotalSpan); } /* Make up correlation structure and save. 
*/ if (!pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation")) { struct edwQaPairCorrelation *cor; AllocVar(cor); cor->elderFileId = elderVf->fileId; cor->youngerFileId = youngerVf->fileId; cor->pearsonOverall = correlateResult(c); cor->pearsonInEnriched = correlateResult(cInEnriched); edwQaPairCorrelationSaveToDb(conn, cor, "edwQaPairCorrelation", 128); freez(&cor); } /* Also make up sample structure and save. */ if (!pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairSampleOverlap")) { struct edwQaPairSampleOverlap *sam; AllocVar(sam); sam->elderFileId = elderVf->fileId; sam->youngerFileId = youngerVf->fileId; sam->elderSampleBases = elderTotalSpan; sam->youngerSampleBases = youngerTotalSpan; sam->sampleOverlapBases = overlapTotalSpan; setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf); edwQaPairSampleOverlapSaveToDb(conn, sam, "edwQaPairSampleOverlap", 128); freez(&sam); } genomeRangeTreeFree(&targetGrt); correlateFree(&c); bigBedFileClose(&youngerBbi); bigBedFileClose(&elderBbi); freez(&youngerPath); freez(&elderPath); }
void doEnrichmentsFromSampleBed(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf, struct edwAssembly *assembly, struct target *targetList) /* Figure out enrichments from sample bed file. */ { char *sampleBed = vf->sampleBed; if (isEmpty(sampleBed)) { warn("No sample bed for %s", ef->edwFileName); return; } /* Load sample bed, make a range tree to track unique coverage, and get list of all chroms .*/ struct bed3 *sample, *sampleList = bed3LoadAll(sampleBed); if (sampleList == NULL) { warn("Sample bed is empty for %s", ef->edwFileName); return; } struct genomeRangeTree *sampleGrt = edwMakeGrtFromBed3List(sampleList); struct hashEl *chrom, *chromList = hashElListHash(sampleGrt->hash); /* Iterate through each target - and in lockstep each associated grt to calculate unique overlap */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; long long uniqOverlapBases = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { struct rbTree *sampleTree = chrom->val; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(sampleTree); for (range = rangeList; range != NULL; range = range->next) { /* Do unique base overlap counts (since using range trees both sides) */ int overlap = rangeTreeOverlapSize(targetTree, range->start, range->end); uniqOverlapBases += overlap; } } } /* Figure out how much we overlap allowing same bases in genome * to part of more than one overlap. */ long long overlapBases = 0; for (sample = sampleList; sample != NULL; sample = sample->next) { int overlap = genomeRangeTreeOverlapSize(grt, sample->chrom, sample->chromStart, sample->chromEnd); overlapBases += overlap; } /* Save to database. 
*/ struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, overlapBases, uniqOverlapBases); edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128); edwQaEnrichFree(&enrich); } genomeRangeTreeFree(&sampleGrt); bed3FreeList(&sampleList); hashElFreeList(&chromList); }