void eapFixSortedBams(char *outSql) /* eapFixSortedBams - Help fix early eap run that left bams sorted by read rather than by chromosome.. */ { FILE *f = mustOpen(outSql, "w"); struct sqlConnection *conn = edwConnect(); char query[512]; sqlSafef(query, sizeof(query), "select * from edwAnalysisRun"); struct edwAnalysisRun *run, *runList = edwAnalysisRunLoadByQuery(conn, query); for (run = runList; run != NULL; run = run->next) { if (run->createStatus > 0 && run->createCount == 1) { struct edwFile *create = getFilesFromIds(conn, run->createFileIds, run->createCount); if (create->submitId == badSubmit) { char *fileName = edwPathForFileId(conn, create->id); if (!bamIsSortedByTarget(fileName, 1000)) { hookupFix(conn, run, create, f); } else errAbort("Looks like bad %s is already sorted\n", fileName); } else if (create->submitId == goodSubmit) { char *fileName = edwPathForFileId(conn, create->id); if (!bamIsSortedByTarget(fileName, 1000)) { errAbort("Looks like good %s needs sorting\n", fileName); } } } } carefulClose(&f); }
void doBigWigReplicate(struct sqlConnection *conn, struct edwAssembly *assembly, struct edwFile *elderEf, struct edwValidFile *elderVf, struct edwFile *youngerEf, struct edwValidFile *youngerVf) /* Do correlation analysis between elder and younger and save result to * a new edwQaPairCorrelation record. Do this for a format where we have a bigWig file. */ { if (pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation")) return; char *enrichedIn = elderVf->enrichedIn; if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown")) { struct genomeRangeTree *targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn); /* Get open big wig files for both younger and older. */ char *elderPath = edwPathForFileId(conn, elderEf->id); char *youngerPath = edwPathForFileId(conn, youngerEf->id); struct bbiFile *elderBbi = bigWigFileOpen(elderPath); struct bbiFile *youngerBbi = bigWigFileOpen(youngerPath); /* Figure out thresholds */ double elderThreshold = twoStdsOverMean(elderBbi); double youngerThreshold = twoStdsOverMean(youngerBbi); /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/ struct correlate *c = correlateNew(), *cInEnriched = correlateNew(), *cClipped = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi); struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew(); struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew(); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBwCorrelations(chrom, targetGrt, aVals, bVals, elderBbi, youngerBbi, elderThreshold, youngerThreshold, c, cInEnriched, cClipped); } /* Make up correlation structure . */ struct edwQaPairCorrelation *cor; AllocVar(cor); cor->elderFileId = elderVf->fileId; cor->youngerFileId = youngerVf->fileId; cor->pearsonOverall = correlateResult(c); cor->pearsonInEnriched = correlateResult(cInEnriched); cor->pearsonClipped = correlateResult(cClipped); edwQaPairCorrelationSaveToDb(conn, cor, "edwQaPairCorrelation", 128); bigWigValsOnChromFree(&bVals); bigWigValsOnChromFree(&aVals); genomeRangeTreeFree(&targetGrt); freez(&cor); correlateFree(&c); bigWigFileClose(&youngerBbi); bigWigFileClose(&elderBbi); freez(&youngerPath); freez(&elderPath); } }
void edwFixTargetSeq(char *when) /* edwFixTargetSeq - Fill in new fields about target seq to edwBamFile and edwAssembly.. */ { struct sqlConnection *conn = edwConnectReadWrite(); struct edwAssembly *as, *asList = edwAssemblyLoadByQuery(conn, "select * from edwAssembly"); char query[512]; for (as = asList; as != NULL; as = as->next) { char *twoBitFileName = edwPathForFileId(conn, as->twoBitId); struct twoBitFile *tbf = twoBitOpen(twoBitFileName); safef(query, sizeof(query), "update edwAssembly set seqCount=%u where id=%u", tbf->seqCount, as->id); sqlUpdate(conn, query); freez(&twoBitFileName); twoBitClose(&tbf); } edwAssemblyFreeList(&asList); struct edwBamFile *bam, *bamList = edwBamFileLoadByQuery(conn, "select * from edwBamFile"); for (bam = bamList; bam != NULL; bam = bam->next) { char *fileName = edwPathForFileId(conn, bam->fileId); samfile_t *sf = samopen(fileName, "rb", NULL); if (sf == NULL) errnoAbort("Couldn't open %s.\n", fileName); bam_header_t *head = sf->header; if (head == NULL) errAbort("Aborting ... Bad BAM header in file: %s", fileName); /* Sum up some target sizes. */ long long targetBaseCount = 0; /* Total size of all bases in target seq */ int i; for (i=0; i<head->n_targets; ++i) targetBaseCount += head->target_len[i]; safef(query, sizeof(query), "update edwBamFile set targetBaseCount=%lld,targetSeqCount=%u where id=%u", targetBaseCount, (unsigned)head->n_targets, bam->id); sqlUpdate(conn, query); samclose(sf); freez(&fileName); } }
static struct genomeRangeTree *genomeRangeTreeForTarget(struct sqlConnection *conn, struct edwAssembly *assembly, char *enrichedIn) /* Return genome range tree filled with enrichment target for assembly */ { char query[256]; sqlSafef(query, sizeof(query), "select * from edwQaEnrichTarget where assemblyId=%d and name='%s'", assembly->id, enrichedIn); struct edwQaEnrichTarget *target = edwQaEnrichTargetLoadByQuery(conn, query); if (target == NULL) errAbort("Can't find %s enrichment target for assembly %s", enrichedIn, assembly->name); char *targetPath = edwPathForFileId(conn, target->fileId); struct genomeRangeTree *targetGrt = edwGrtFromBigBed(targetPath); edwQaEnrichTargetFree(&target); freez(&targetPath); return targetGrt; }
struct target *targetsForAssembly(struct sqlConnection *conn, struct edwAssembly *assembly) /* Get list of enrichment targets for given assembly */ { char query[128]; sqlSafef(query, sizeof(query), "select * from edwQaEnrichTarget where assemblyId=%d", assembly->id); struct edwQaEnrichTarget *et, *etList = edwQaEnrichTargetLoadByQuery(conn, query); /* Wrap a new structure around the enrichment targets where we'll store summary info. */ struct target *target, *targetList = NULL, **targetTail = &targetList; for (et = etList; et != NULL; et = et->next) { char *targetBed = edwPathForFileId(conn, et->fileId); struct genomeRangeTree *grt = edwGrtFromBigBed(targetBed); target = targetNew(et, grt); *targetTail = target; targetTail = &target->next; freez(&targetBed); } return targetList; }
void doBigBedReplicate(struct sqlConnection *conn, char *format, struct edwAssembly *assembly, struct edwFile *elderEf, struct edwValidFile *elderVf, struct edwFile *youngerEf, struct edwValidFile *youngerVf) /* Do correlation analysis between elder and younger and save result to * a new edwQaPairCorrelation record. Do this for a format where we have a bigBed file. */ { /* If got both pairs, work is done already */ if (pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairSampleOverlap") && pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation")) return; int numColIx = 0; if (sameString(format, "narrowPeak") || sameString(format, "broadPeak")) numColIx = 6; // signalVal else numColIx = 4; // score numColIx -= 3; // Subtract off chrom/start/end char *enrichedIn = elderVf->enrichedIn; struct genomeRangeTree *targetGrt = NULL; if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown")) targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn); /* Get open big bed files for both younger and older. */ char *elderPath = edwPathForFileId(conn, elderEf->id); char *youngerPath = edwPathForFileId(conn, youngerEf->id); struct bbiFile *elderBbi = bigBedFileOpen(elderPath); struct bbiFile *youngerBbi = bigBedFileOpen(youngerPath); /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/ struct correlate *c = correlateNew(), *cInEnriched = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi); long long elderTotalSpan = 0, youngerTotalSpan = 0, overlapTotalSpan = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBbCorrelations(chrom, targetGrt, elderBbi, youngerBbi, numColIx, c, cInEnriched, &elderTotalSpan, &youngerTotalSpan, &overlapTotalSpan); } /* Make up correlation structure and save. */ if (!pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairCorrelation")) { struct edwQaPairCorrelation *cor; AllocVar(cor); cor->elderFileId = elderVf->fileId; cor->youngerFileId = youngerVf->fileId; cor->pearsonOverall = correlateResult(c); cor->pearsonInEnriched = correlateResult(cInEnriched); edwQaPairCorrelationSaveToDb(conn, cor, "edwQaPairCorrelation", 128); freez(&cor); } /* Also make up sample structure and save. */ if (!pairExists(conn, elderEf->id, youngerEf->id, "edwQaPairSampleOverlap")) { struct edwQaPairSampleOverlap *sam; AllocVar(sam); sam->elderFileId = elderVf->fileId; sam->youngerFileId = youngerVf->fileId; sam->elderSampleBases = elderTotalSpan; sam->youngerSampleBases = youngerTotalSpan; sam->sampleOverlapBases = overlapTotalSpan; setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf); edwQaPairSampleOverlapSaveToDb(conn, sam, "edwQaPairSampleOverlap", 128); freez(&sam); } genomeRangeTreeFree(&targetGrt); correlateFree(&c); bigBedFileClose(&youngerBbi); bigBedFileClose(&elderBbi); freez(&youngerPath); freez(&elderPath); }
void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf, struct edwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = edwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew(); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { long startBigQueryTime = clock1000(); boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; if (gotData) { double *valBuf = valsOnChrom->valBuf; Bits *covBuf = valsOnChrom->covBuf; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(targetTree); for (range = rangeList; range != NULL; range = range->next) { int s = range->start, e = range->end, i; for (i=s; i<=e; ++i) { if (bitReadOne(covBuf, i)) { double x = valBuf[i]; target->uniqOverlapBases += 1; target->overlapBases += x; } } } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; } } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128); edwQaEnrichFree(&enrich); } bigWigValsOnChromFree(&valsOnChrom); bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
/* This old way is ~3 times as slow */ void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf, struct edwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = edwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); long startBigQueryTime = clock1000(); struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; struct bbiInterval *iv; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->uniqOverlapBases += overlap; target->overlapBases += overlap * iv->val; } } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; lmCleanup(&lm); } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128); edwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct edwFile *ef, struct edwValidFile *vf, struct edwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = edwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct edwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); edwQaEnrichSaveToDb(conn, enrich, "edwQaEnrich", 128); edwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }