void doBedReplicate(struct sqlConnection *conn, char *format, struct cdwAssembly *assembly,
    struct cdwFile *elderEf, struct cdwValidFile *elderVf,
    struct cdwFile *youngerEf, struct cdwValidFile *youngerVf)
/* Run the replicate comparison for an elder/younger pair whose files can be read
 * with bed3LoadAll().  Both files are loaded as bed3 lists and handed, together
 * with all the other arguments, to doBed3Replicate().  Returns without doing
 * anything when pairExists() already reports this pair for cdwQaPairSampleOverlap. */
{
if (pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairSampleOverlap"))
    return;     /* Pair was handled on an earlier run. */

/* Look up where each replicate lives. */
char *pathOfElder = cdwPathForFileId(conn, elderEf->id);
char *pathOfYounger = cdwPathForFileId(conn, youngerEf->id);

/* Pull both files into memory as bed3 lists and compare them. */
struct bed3 *elderRegions = bed3LoadAll(pathOfElder);
struct bed3 *youngerRegions = bed3LoadAll(pathOfYounger);
doBed3Replicate(conn, format, assembly,
    elderEf, elderVf, elderRegions,
    youngerEf, youngerVf, youngerRegions);

/* Give back the lists and the path strings. */
bed3FreeList(&elderRegions);
bed3FreeList(&youngerRegions);
freez(&pathOfYounger);
freez(&pathOfElder);
}
void doBigWigReplicate(struct sqlConnection *conn, struct cdwAssembly *assembly, struct cdwFile *elderEf, struct cdwValidFile *elderVf, struct cdwFile *youngerEf, struct cdwValidFile *youngerVf) /* Do correlation analysis between elder and younger and save result to * a new cdwQaPairCorrelation record. Do this for a format where we have a bigWig file. */ { if (pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairCorrelation")) return; char *enrichedIn = elderVf->enrichedIn; if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown")) { struct genomeRangeTree *targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn); /* Get open big wig files for both younger and older. */ char *elderPath = cdwPathForFileId(conn, elderEf->id); char *youngerPath = cdwPathForFileId(conn, youngerEf->id); struct bbiFile *elderBbi = bigWigFileOpen(elderPath); struct bbiFile *youngerBbi = bigWigFileOpen(youngerPath); /* Figure out thresholds */ double elderThreshold = twoStdsOverMean(elderBbi); double youngerThreshold = twoStdsOverMean(youngerBbi); /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/ struct correlate *c = correlateNew(), *cInEnriched = correlateNew(), *cClipped = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi); struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew(); struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew(); for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBwCorrelations(chrom, targetGrt, aVals, bVals, elderBbi, youngerBbi, elderThreshold, youngerThreshold, c, cInEnriched, cClipped); } /* Make up correlation structure . 
*/ struct cdwQaPairCorrelation *cor; AllocVar(cor); cor->elderFileId = elderVf->fileId; cor->youngerFileId = youngerVf->fileId; cor->pearsonOverall = correlateResult(c); cor->pearsonInEnriched = correlateResult(cInEnriched); cor->pearsonClipped = correlateResult(cClipped); cdwQaPairCorrelationSaveToDb(conn, cor, "cdwQaPairCorrelation", 128); bigWigValsOnChromFree(&bVals); bigWigValsOnChromFree(&aVals); genomeRangeTreeFree(&targetGrt); freez(&cor); correlateFree(&c); bigWigFileClose(&youngerBbi); bigWigFileClose(&elderBbi); freez(&youngerPath); freez(&elderPath); } }
void sendFileByAcc(struct sqlConnection *conn, char* acc) /* send file identified by acc (=cdwValidFile.licensePlate), suggests a canonical filename of the format * <licensePlate>.<originalExtension> * Example URL: http://hgwdev.soe.ucsc.edu/cgi-bin/cdwGetFile?acc=SCH000FSW */ { struct cdwValidFile *vf = cdwValidFileFromLicensePlate(conn, acc); if (vf==NULL) errExit("%s is not a valid accession in the CDW.", acc); struct cdwFile *ef = cdwFileFromId(conn, vf->fileId); char* filePath = cdwPathForFileId(conn, vf->fileId); mustHaveAccess(conn, ef); // use the license plate as the basename of the downloaded file. // Take the extension from the submitted filename, as cdwFile.format is not the same as the extension // e.g. format=fasta -> fa.gz char *submitExt = skipBeyondDelimit(basename(ef->submitFileName), '.'); char suggestName[8000]; safef(suggestName, sizeof(suggestName), "%s.%s", vf->licensePlate, submitExt); apacheSendX(vf->format, filePath, suggestName); }
void sendFileByPath(struct sqlConnection *conn, char *path) /* send file identified by a submission pathname (cdwFile.submitFileName), * suggests the original filename. */ /* path can be a suffix that matches a filename, so the initial '/hive' or * '/data' can be omitted. * * Example URL for testing: * http://hgwdev.soe.ucsc.edu/cgi-bin/cdwGetFile/hive/groups/cirm/pilot/labs/quake/130625_M00361_0080_000000000-A43D1/Sample_1_L13_C31_IL3541-701-506/1_L13_C31_IL3541-701-506_TAAGGCGA-ACTGCATA_L001_R1_001.fastq.gz */ { int fileId = cdwFileIdFromPathSuffix(conn, path); if (fileId == 0) errExit("A file with suffix %s does not exist in the database", path); char *localPath = cdwPathForFileId(conn, fileId); if (localPath == NULL) errExit("A local file with suffix %s was not found in the database. This is an internal error.", path); struct cdwFile *ef = cdwFileFromId(conn, fileId); if (ef == NULL) errExit("Could not find cdwFile for path %s", path); mustHaveAccess(conn, ef); apacheSendX(NULL, localPath, basename(ef->submitFileName)); }
void doEnrichmentsFromBed(struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf,
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bed file: load the file for ef as a bed3 list and
 * pass it, along with the other arguments, to doEnrichmentsFromBed3Sample(). */
{
char *pathToBed = cdwPathForFileId(conn, ef->id);
struct bed3 *regions = bed3LoadAll(pathToBed);

doEnrichmentsFromBed3Sample(regions, conn, ef, vf, assembly, targetList);

/* The list and the path string are only needed for the call above. */
bed3FreeList(&regions);
freez(&pathToBed);
}
void maybeRemoveFile(struct sqlConnection *conn, long long fileId, boolean really)
/* Remove references to file, and file itself from database.  If really is FALSE just
 * print out what we would do.
 *
 * Every delete statement is handed to maybeDoUpdate() together with the really flag.
 * The file on disk is removed only when really is TRUE; a failed removal is reported
 * on stderr instead of being silently ignored. */
{
char query[256];

/* Delete from all the auxiliary tables. */
sqlSafef(query, sizeof(query), "delete from cdwBamFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwFastqFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwVcfFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwTrackViz where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaContam where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaEnrich where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaFail where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query),
    "delete from cdwQaPairCorrelation where elderFileId=%lld or youngerFileId=%lld",
    fileId, fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query),
    "delete from cdwQaPairSampleOverlap where elderFileId=%lld or youngerFileId=%lld",
    fileId, fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query),
    "delete from cdwQaPairedEndFastq where fileId1=%lld or fileId2=%lld",
    fileId, fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaRepeat where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwValidFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);

/* Get file name.  This is done before the cdwFile row is deleted below
 * (presumably the path lookup reads that row - keep this ordering). */
char *path = cdwPathForFileId(conn, fileId);

/* Delete from cdwFile table */
sqlSafef(query, sizeof(query), "delete from cdwFile where id=%lld", fileId);
maybeDoUpdate(conn, query, really);

/* Delete file */
if (really)
    {
    if (remove(path) != 0)
        perror(path);   /* Database rows may already be gone; tell the operator the file is not. */
    }
else
    printf("remove: %s\n", path);
}
static struct genomeRangeTree *genomeRangeTreeForTarget(struct sqlConnection *conn, struct cdwAssembly *assembly, char *enrichedIn) /* Return genome range tree filled with enrichment target for assembly */ { char query[256]; sqlSafef(query, sizeof(query), "select * from cdwQaEnrichTarget where assemblyId=%d and name='%s'", assembly->id, enrichedIn); struct cdwQaEnrichTarget *target = cdwQaEnrichTargetLoadByQuery(conn, query); if (target == NULL) errAbort("Can't find %s enrichment target for assembly %s", enrichedIn, assembly->name); char *targetPath = cdwPathForFileId(conn, target->fileId); struct genomeRangeTree *targetGrt = cdwGrtFromBigBed(targetPath); cdwQaEnrichTargetFree(&target); freez(&targetPath); return targetGrt; }
struct target *targetsForAssembly(struct sqlConnection *conn, struct cdwAssembly *assembly) /* Get list of enrichment targets for given assembly */ { char query[128]; sqlSafef(query, sizeof(query), "select * from cdwQaEnrichTarget where assemblyId=%d", assembly->id); struct cdwQaEnrichTarget *et, *etList = cdwQaEnrichTargetLoadByQuery(conn, query); /* Wrap a new structure around the enrichment targets where we'll store summary info. */ struct target *target, *targetList = NULL, **targetTail = &targetList; for (et = etList; et != NULL; et = et->next) { char *targetBed = cdwPathForFileId(conn, et->fileId); struct genomeRangeTree *grt = cdwGrtFromBigBed(targetBed); target = targetNew(et, grt); *targetTail = target; targetTail = &target->next; freez(&targetBed); } return targetList; }
void doEnrichmentsFromBigWig(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigWigPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigWigFileOpen(bigWigPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew(); /* This takes a while, so let's figure out what parts take the time. */ long totalBigQueryTime = 0; long totalOverlapTime = 0; /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigWig data in memory. Also just for performance we do a lookup of target range tree to * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { long startBigQueryTime = clock1000(); boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi); long endBigQueryTime = clock1000(); totalBigQueryTime += endBigQueryTime - startBigQueryTime; if (gotData) { double *valBuf = valsOnChrom->valBuf; Bits *covBuf = valsOnChrom->covBuf; /* Loop through all targets adding overlaps from ivList */ long startOverlapTime = clock1000(); struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct range *range, *rangeList = rangeTreeList(targetTree); for (range = rangeList; range != NULL; range = range->next) { int s = range->start, e = range->end, i; for (i=s; i<=e; ++i) { if (bitReadOne(covBuf, i)) { double x = valBuf[i]; target->uniqOverlapBases += 1; target->overlapBases += x; } } 
} } } long endOverlapTime = clock1000(); totalOverlapTime += endOverlapTime - startOverlapTime; } } verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime); /* Now loop through targets and save enrichment info to database */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bigWigValsOnChromFree(&valsOnChrom); bbiChromInfoFreeList(&chromList); bigWigFileClose(&bbi); freez(&bigWigPath); }
#ifdef OLD
/* This old way is ~3 times as slow.  It is kept for reference only and compiled just
 * when OLD is defined, because the file already contains the current definition of
 * doEnrichmentsFromBigWig() and two definitions of one function cannot coexist.
 * NOTE(review): unlike the current version this one does not honor target->skip. */
void doEnrichmentsFromBigWig(struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf,
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigWig file by querying its intervals one chromosome
 * at a time and measuring their overlap with each target's range tree. */
{
/* Get path to bigWig, open it, and read all chromosomes. */
char *bigWigPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* This takes a while, so let's figure out what parts take the time. */
long totalBigQueryTime = 0;
long totalOverlapTime = 0;

/* Do a pretty complex loop that just aims to set target->overlapBases and
 * ->uniqOverlapBases for all targets.  This is complicated by just wanting to keep
 * one chromosome worth of bigWig data in memory.  Also just for performance we do a
 * lookup of target range tree to get the chromosome specific one to use, which
 * avoids a hash lookup in the inner loop. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Get list of intervals in bigWig for this chromosome; they live in lm and go
     * away with lmCleanup() at the bottom of the loop. */
    struct lm *lm = lmInit(0);
    long startBigQueryTime = clock1000();
    struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm);
    long endBigQueryTime = clock1000();
    totalBigQueryTime += endBigQueryTime - startBigQueryTime;
    struct bbiInterval *iv;

    /* Loop through all targets adding overlaps from ivList */
    long startOverlapTime = clock1000();
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
        {
        struct genomeRangeTree *grt = target->grt;
        struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
        if (targetTree != NULL)
            {
            for (iv = ivList; iv != NULL; iv = iv->next)
                {
                int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
                target->uniqOverlapBases += overlap;
                target->overlapBases += overlap * iv->val;
                }
            }
        }
    long endOverlapTime = clock1000();
    totalOverlapTime += endOverlapTime - startOverlapTime;
    lmCleanup(&lm);
    }
verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n",
    0.001*totalBigQueryTime, 0.001*totalOverlapTime);

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target,
        target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bbiChromInfoFreeList(&chromList);
bigWigFileClose(&bbi);
freez(&bigWigPath);
}
#endif /* OLD */
void doEnrichmentsFromBigBed(struct sqlConnection *conn, struct cdwFile *ef, struct cdwValidFile *vf, struct cdwAssembly *assembly, struct target *targetList) /* Figure out enrichments from a bigBed file. */ { /* Get path to bigBed, open it, and read all chromosomes. */ char *bigBedPath = cdwPathForFileId(conn, ef->id); struct bbiFile *bbi = bigBedFileOpen(bigBedPath); struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); /* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases * for all targets. This is complicated by just wanting to keep one chromosome worth of * bigBed data in memory. */ for (chrom = chromList; chrom != NULL; chrom = chrom->next) { /* Get list of intervals in bigBed for this chromosome, and feed it to a rangeTree. */ struct lm *lm = lmInit(0); struct bigBedInterval *ivList = bigBedIntervalQuery(bbi, chrom->name, 0, chrom->size, 0, lm); struct bigBedInterval *iv; struct rbTree *bbTree = rangeTreeNew(); for (iv = ivList; iv != NULL; iv = iv->next) rangeTreeAdd(bbTree, iv->start, iv->end); struct range *bbRange, *bbRangeList = rangeTreeList(bbTree); /* Loop through all targets adding overlaps from ivList and unique overlaps from bbRangeList */ struct target *target; for (target = targetList; target != NULL; target = target->next) { if (target->skip) continue; struct genomeRangeTree *grt = target->grt; struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name); if (targetTree != NULL) { struct bigBedInterval *iv; for (iv = ivList; iv != NULL; iv = iv->next) { int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end); target->overlapBases += overlap; } for (bbRange = bbRangeList; bbRange != NULL; bbRange = bbRange->next) { int overlap = rangeTreeOverlapSize(targetTree, bbRange->start, bbRange->end); target->uniqOverlapBases += overlap; } } } rangeTreeFree(&bbTree); lmCleanup(&lm); } /* Now loop through targets and save enrichment info to database */ struct target *target; for (target 
= targetList; target != NULL; target = target->next) { if (target->skip) continue; struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, target->overlapBases, target->uniqOverlapBases); cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128); cdwQaEnrichFree(&enrich); } bbiChromInfoFreeList(&chromList); bigBedFileClose(&bbi); freez(&bigBedPath); }
void doBigBedReplicate(struct sqlConnection *conn, char *format, struct cdwAssembly *assembly, struct cdwFile *elderEf, struct cdwValidFile *elderVf, struct cdwFile *youngerEf, struct cdwValidFile *youngerVf) /* Do correlation analysis between elder and younger and save result to * a new cdwQaPairCorrelation record. Do this for a format where we have a bigBed file. */ { /* If got both pairs, work is done already */ if (pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairSampleOverlap") && pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairCorrelation")) return; int numColIx = 0; if (sameString(format, "narrowPeak") || sameString(format, "broadPeak")) numColIx = 6; // signalVal else numColIx = 4; // score numColIx -= 3; // Subtract off chrom/start/end char *enrichedIn = elderVf->enrichedIn; struct genomeRangeTree *targetGrt = NULL; if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown")) targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn); /* Get open big bed files for both younger and older. */ char *elderPath = cdwPathForFileId(conn, elderEf->id); char *youngerPath = cdwPathForFileId(conn, youngerEf->id); struct bbiFile *elderBbi = bigBedFileOpen(elderPath); struct bbiFile *youngerBbi = bigBedFileOpen(youngerPath); /* Loop through a chromosome at a time adding to correlation, and at the end save result in r.*/ struct correlate *c = correlateNew(), *cInEnriched = correlateNew(); struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi); long long elderTotalSpan = 0, youngerTotalSpan = 0, overlapTotalSpan = 0; for (chrom = chromList; chrom != NULL; chrom = chrom->next) { addBbCorrelations(chrom, targetGrt, elderBbi, youngerBbi, numColIx, c, cInEnriched, &elderTotalSpan, &youngerTotalSpan, &overlapTotalSpan); } /* Make up correlation structure and save. 
*/ if (!pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairCorrelation")) { struct cdwQaPairCorrelation *cor; AllocVar(cor); cor->elderFileId = elderVf->fileId; cor->youngerFileId = youngerVf->fileId; cor->pearsonOverall = correlateResult(c); cor->pearsonInEnriched = correlateResult(cInEnriched); cdwQaPairCorrelationSaveToDb(conn, cor, "cdwQaPairCorrelation", 128); freez(&cor); } /* Also make up sample structure and save. */ if (!pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairSampleOverlap")) { struct cdwQaPairSampleOverlap *sam; AllocVar(sam); sam->elderFileId = elderVf->fileId; sam->youngerFileId = youngerVf->fileId; sam->elderSampleBases = elderTotalSpan; sam->youngerSampleBases = youngerTotalSpan; sam->sampleOverlapBases = overlapTotalSpan; setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf); cdwQaPairSampleOverlapSaveToDb(conn, sam, "cdwQaPairSampleOverlap", 128); freez(&sam); } genomeRangeTreeFree(&targetGrt); correlateFree(&c); bigBedFileClose(&youngerBbi); bigBedFileClose(&elderBbi); freez(&youngerPath); freez(&elderPath); }