Ejemplo n.º 1
0
void doBedReplicate(struct sqlConnection *conn, char *format, struct cdwAssembly *assembly,
    struct cdwFile *elderEf, struct cdwValidFile *elderVf,
    struct cdwFile *youngerEf, struct cdwValidFile *youngerVf)
/* Replicate comparison for a pair of files in plain bed format.  Each file is read in
 * as a bed3 list and the two lists are handed to doBed3Replicate, which does the
 * actual comparison.  Nothing is done when pairExists reports the pair as already
 * present for the cdwQaPairSampleOverlap table. */
{
if (!pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairSampleOverlap"))
    {
    /* Look up the path of each file. */
    char *pathOfElder = cdwPathForFileId(conn, elderEf->id);
    char *pathOfYounger = cdwPathForFileId(conn, youngerEf->id);

    /* Read both files fully, then run the bed3 based replicate calculations. */
    struct bed3 *bedsOfElder = bed3LoadAll(pathOfElder);
    struct bed3 *bedsOfYounger = bed3LoadAll(pathOfYounger);
    doBed3Replicate(conn, format, assembly,
        elderEf, elderVf, bedsOfElder,
        youngerEf, youngerVf, bedsOfYounger);

    /* Release everything allocated above. */
    bed3FreeList(&bedsOfElder);
    bed3FreeList(&bedsOfYounger);
    freez(&pathOfYounger);
    freez(&pathOfElder);
    }
}
Ejemplo n.º 2
0
void doBigWigReplicate(struct sqlConnection *conn, struct cdwAssembly *assembly,
    struct cdwFile *elderEf, struct cdwValidFile *elderVf,
    struct cdwFile *youngerEf, struct cdwValidFile *youngerVf)
/* Do correlation analysis between elder and younger bigWig files and save the result
 * as a new cdwQaPairCorrelation record.  Three values are stored: pearsonOverall,
 * pearsonInEnriched and pearsonClipped, each taken from a correlate accumulator that
 * addBwCorrelations fills in one chromosome at a time.
 * Nothing is done if the pair is already in cdwQaPairCorrelation, or if the elder
 * file's enrichedIn is empty or "unknown". */
{
if (pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairCorrelation"))
    return;
char *enrichedIn = elderVf->enrichedIn;
if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown"))
    {
    struct genomeRangeTree *targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn);

    /* Get open big wig files for both younger and older. */
    char *elderPath = cdwPathForFileId(conn, elderEf->id);
    char *youngerPath = cdwPathForFileId(conn, youngerEf->id);
    struct bbiFile *elderBbi = bigWigFileOpen(elderPath);
    struct bbiFile *youngerBbi = bigWigFileOpen(youngerPath);

    /* Figure out thresholds */
    double elderThreshold = twoStdsOverMean(elderBbi);
    double youngerThreshold = twoStdsOverMean(youngerBbi);

    /* Loop through a chromosome at a time adding to the three correlations.  The
     * chromosome list comes from the elder file only. */
    struct correlate *c = correlateNew(), *cInEnriched = correlateNew(), *cClipped = correlateNew();
    struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi);
    struct bigWigValsOnChrom *aVals = bigWigValsOnChromNew();
    struct bigWigValsOnChrom *bVals = bigWigValsOnChromNew();
    for (chrom = chromList; chrom != NULL; chrom = chrom->next)
        {
        addBwCorrelations(chrom, targetGrt, aVals, bVals, elderBbi, youngerBbi,
            elderThreshold, youngerThreshold, c, cInEnriched, cClipped);
        }

    /* Make up correlation structure and save it. */
    struct cdwQaPairCorrelation *cor;
    AllocVar(cor);
    cor->elderFileId = elderVf->fileId;
    cor->youngerFileId = youngerVf->fileId;
    cor->pearsonOverall = correlateResult(c);
    cor->pearsonInEnriched = correlateResult(cInEnriched);
    cor->pearsonClipped = correlateResult(cClipped);
    cdwQaPairCorrelationSaveToDb(conn, cor, "cdwQaPairCorrelation", 128);

    /* Clean up.  All three correlate accumulators are released, not just c. */
    bigWigValsOnChromFree(&bVals);
    bigWigValsOnChromFree(&aVals);
    genomeRangeTreeFree(&targetGrt);
    freez(&cor);
    correlateFree(&cClipped);
    correlateFree(&cInEnriched);
    correlateFree(&c);
    /* NOTE(review): chromList from bbiChromList() is not released here.
     * doEnrichmentsFromBigWig in this file frees its list with
     * bbiChromInfoFreeList(&chromList) before closing the file - consider the same. */
    bigWigFileClose(&youngerBbi);
    bigWigFileClose(&elderBbi);
    freez(&youngerPath);
    freez(&elderPath);
    }
}
Ejemplo n.º 3
0
void sendFileByAcc(struct sqlConnection *conn, char* acc)
/* Send file identified by acc (=cdwValidFile.licensePlate).  Suggests a canonical
 * filename of the format <licensePlate>.<originalExtension>, or just <licensePlate>
 * when the submitted file name has no extension.
 * Exits with an error message if acc is unknown, if the cdwFile record or the local
 * path can't be found.  mustHaveAccess is called before anything is sent.
 * Example URL: http://hgwdev.soe.ucsc.edu/cgi-bin/cdwGetFile?acc=SCH000FSW */
{
struct cdwValidFile *vf = cdwValidFileFromLicensePlate(conn, acc);
if (vf == NULL)
    errExit("%s is not a valid accession in the CDW.", acc);

/* Same NULL check sendFileByPath does before using the cdwFile record. */
struct cdwFile *ef = cdwFileFromId(conn, vf->fileId);
if (ef == NULL)
    errExit("Could not find cdwFile for accession %s", acc);

char *filePath = cdwPathForFileId(conn, vf->fileId);

mustHaveAccess(conn, ef);

/* Checked only after the access check, so this message is never shown to a user
 * who is refused access. */
if (filePath == NULL)
    errExit("A local file for accession %s was not found in the database. This is an internal error.", acc);

// use the license plate as the basename of the downloaded file.
// Take the extension from the submitted filename, as cdwFile.format is not the same as the extension
// e.g. format=fasta -> fa.gz
char *submitExt = skipBeyondDelimit(basename(ef->submitFileName), '.');

// NOTE(review): skipBeyondDelimit is expected to return NULL when the name contains
// no '.' (or nothing after it) - confirm against common.h.  Passing NULL to %s is
// undefined, so fall back to the bare license plate in that case.
char suggestName[8000];
if (submitExt == NULL)
    safef(suggestName, sizeof(suggestName), "%s", vf->licensePlate);
else
    safef(suggestName, sizeof(suggestName), "%s.%s", vf->licensePlate, submitExt);

apacheSendX(vf->format, filePath, suggestName);
}
Ejemplo n.º 4
0
void sendFileByPath(struct sqlConnection *conn, char *path) 
/* Send the file whose submission pathname (cdwFile.submitFileName) ends with path,
 * suggesting the original file name to the client.
 * Since path only needs to match as a suffix, the initial '/hive' or '/data' can be
 * left off.  Exits with an error message when no file id, local path or cdwFile
 * record is found.  mustHaveAccess is called before the file is sent.
 * Example URL for testing:
 * http://hgwdev.soe.ucsc.edu/cgi-bin/cdwGetFile/hive/groups/cirm/pilot/labs/quake/130625_M00361_0080_000000000-A43D1/Sample_1_L13_C31_IL3541-701-506/1_L13_C31_IL3541-701-506_TAAGGCGA-ACTGCATA_L001_R1_001.fastq.gz */
{
/* Translate the suffix into a file id; zero means no match. */
int id = cdwFileIdFromPathSuffix(conn, path);
if (!id)
    errExit("A file with suffix %s does not exist in the database", path);

/* Where the file lives locally. */
char *diskPath = cdwPathForFileId(conn, id);
if (!diskPath)
    errExit("A local file with suffix %s was not found in the database. This is an internal error.", path);

/* The cdwFile record supplies both the access check and the download name. */
struct cdwFile *fileRecord = cdwFileFromId(conn, id);
if (!fileRecord)
    errExit("Could not find cdwFile for path %s", path);

mustHaveAccess(conn, fileRecord);

char *downloadName = basename(fileRecord->submitFileName);
apacheSendX(NULL, diskPath, downloadName);
}
Ejemplo n.º 5
0
void doEnrichmentsFromBed(struct sqlConnection *conn,
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bed file: load the whole file as a bed3 list and
 * pass that list on to doEnrichmentsFromBed3Sample. */
{
/* Locate and read the file. */
char *pathToBed = cdwPathForFileId(conn, ef->id);
struct bed3 *bedItems = bed3LoadAll(pathToBed);

/* The bed3 based routine does the real work. */
doEnrichmentsFromBed3Sample(bedItems, conn, ef, vf, assembly, targetList);

/* Release list and path. */
bed3FreeList(&bedItems);
freez(&pathToBed);
}
Ejemplo n.º 6
0
void maybeRemoveFile(struct sqlConnection *conn, long long fileId, boolean really)
/* Remove references to file, and file itself from database. If really is FALSE just print out
 * what we would do.
 * Each delete statement is passed to maybeDoUpdate together with the really flag.
 * The file on disk is only removed when really is TRUE; a failure to remove it is
 * reported on stderr and does not stop the program. */
{
char query[256];

/* Delete from all the auxiliary tables. */
sqlSafef(query, sizeof(query), "delete from cdwBamFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwFastqFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwVcfFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwTrackViz where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaContam where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaEnrich where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaFail where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);

/* Pair tables refer to the file in either of two columns. */
sqlSafef(query, sizeof(query), 
    "delete from cdwQaPairCorrelation where elderFileId=%lld or youngerFileId=%lld", 
    fileId, fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), 
    "delete from cdwQaPairSampleOverlap where elderFileId=%lld or youngerFileId=%lld", 
    fileId, fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), 
    "delete from cdwQaPairedEndFastq where fileId1=%lld or fileId2=%lld", 
    fileId, fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwQaRepeat where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);
sqlSafef(query, sizeof(query), "delete from cdwValidFile where fileId=%lld", fileId);
maybeDoUpdate(conn, query, really);

/* Get file name.  This is done before the cdwFile row is deleted below; presumably
 * the path lookup needs that row. */
char *path = cdwPathForFileId(conn, fileId);

/* Delete from cdwFile table */
sqlSafef(query, sizeof(query), "delete from cdwFile where id=%lld", fileId);
maybeDoUpdate(conn, query, really);

/* Delete file.  Guard against a missing path rather than handing NULL to remove()
 * or to a %s format. */
if (path == NULL)
    fprintf(stderr, "maybeRemoveFile: no path found for file id %lld, nothing removed from disk\n",
        fileId);
else if (really)
    {
    if (remove(path) != 0)
        perror(path);
    }
else
    printf("remove: %s\n", path);

/* NOTE(review): path is not released here.  Other functions in this file release
 * the result of cdwPathForFileId with freez(&path) - consider doing the same. */
}
Ejemplo n.º 7
0
static struct genomeRangeTree *genomeRangeTreeForTarget(struct sqlConnection *conn,
    struct cdwAssembly *assembly, char *enrichedIn)
/* Look up the cdwQaEnrichTarget named enrichedIn for the given assembly and return a
 * genome range tree built from that target's bigBed file.  Aborts if there is no
 * such target. */
{
/* Fetch the target record. */
char sql[256];
sqlSafef(sql, sizeof(sql), "select * from cdwQaEnrichTarget where assemblyId=%d and name='%s'", 
    assembly->id, enrichedIn);
struct cdwQaEnrichTarget *enrichTarget = cdwQaEnrichTargetLoadByQuery(conn, sql);
if (!enrichTarget)
   errAbort("Can't find %s enrichment target for assembly %s", enrichedIn, assembly->name);

/* Build the tree from the file behind the record. */
char *bigBedPath = cdwPathForFileId(conn, enrichTarget->fileId);
struct genomeRangeTree *result = cdwGrtFromBigBed(bigBedPath);

/* Only the tree is handed back; record and path are released here. */
cdwQaEnrichTargetFree(&enrichTarget);
freez(&bigBedPath);
return result;
}
Ejemplo n.º 8
0
struct target *targetsForAssembly(struct sqlConnection *conn, struct cdwAssembly *assembly)
/* Return a list of targets, one per cdwQaEnrichTarget row of the given assembly, in
 * the order the rows were loaded.  Each target is made by targetNew from the row and
 * a genome range tree built from the row's bigBed file. */
{
char sql[128];
sqlSafef(sql, sizeof(sql), "select * from cdwQaEnrichTarget where assemblyId=%d", assembly->id);
struct cdwQaEnrichTarget *enrichTargets = cdwQaEnrichTargetLoadByQuery(conn, sql);

/* Build the wrapper list by appending at the tail so the order is preserved. */
struct target *head = NULL, *last = NULL;
struct cdwQaEnrichTarget *row = enrichTargets;
while (row != NULL)
    {
    char *bigBedPath = cdwPathForFileId(conn, row->fileId);
    struct genomeRangeTree *tree = cdwGrtFromBigBed(bigBedPath);
    struct target *wrapped = targetNew(row, tree);
    if (last == NULL)
        head = wrapped;
    else
        last->next = wrapped;
    last = wrapped;
    freez(&bigBedPath);
    row = row->next;
    }
return head;
}
Ejemplo n.º 9
0
void doEnrichmentsFromBigWig(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigWig file.  For every target in targetList that is
 * not marked skip this adds up, over all chromosomes of the file, the number of
 * covered bases inside the target (uniqOverlapBases) and the sum of the bigWig values
 * on those bases (overlapBases), then saves one cdwQaEnrich row per target. */
{
/* Get path to bigWig, open it, and read all chromosomes. */
char *bigWigPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);
struct bigWigValsOnChrom *valsOnChrom = bigWigValsOnChromNew();

/* This takes a while, so let's figure out what parts take the time. */
long totalBigQueryTime = 0;
long totalOverlapTime = 0;

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigWig data in memory. Also just for performance we do a lookup of target range tree to
 * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    long startBigQueryTime = clock1000();
    boolean gotData = bigWigValsOnChromFetchData(valsOnChrom, chrom->name, bbi);
    long endBigQueryTime = clock1000();
    totalBigQueryTime += endBigQueryTime - startBigQueryTime;
    if (gotData)
        {
        double *valBuf = valsOnChrom->valBuf;
        Bits *covBuf = valsOnChrom->covBuf;

        /* Loop through all targets adding overlaps with the covered bases */
        long startOverlapTime = clock1000();
        struct target *target;
        for (target = targetList; target != NULL; target = target->next)
            {
            if (target->skip)
                continue;
            struct genomeRangeTree *grt = target->grt;
            struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
            if (targetTree != NULL)
                {
                struct range *range, *rangeList = rangeTreeList(targetTree);
                for (range = rangeList; range != NULL; range = range->next)
                    {
                    /* Ranges are treated as half open, [start,end), the same way
                     * rangeTreeOverlapSize is used on them elsewhere in this file.
                     * So stop before end: including it would count one extra base per
                     * range and could index one past the end of the chromosome. */
                    int s = range->start, e = range->end, i;
                    for (i=s; i<e; ++i)
                        {
                        if (bitReadOne(covBuf, i))
                            {
                            double x = valBuf[i];
                            target->uniqOverlapBases += 1;
                            target->overlapBases += x;
                            }
                        }
                    }
                }
            }
        long endOverlapTime = clock1000();
        totalOverlapTime += endOverlapTime - startOverlapTime;
        }
    }

/* Times are accumulated from clock1000(); scale by 0.001 for reporting. */
verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime);

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    if (target->skip)
        continue;
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
        target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bigWigValsOnChromFree(&valsOnChrom);
bbiChromInfoFreeList(&chromList);
bigWigFileClose(&bbi);
freez(&bigWigPath);
}
Ejemplo n.º 10
0
/* This old way is ~3 times as slow */
void doEnrichmentsFromBigWig(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bigWigPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bbi = bigWigFileOpen(bigWigPath);
struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi);

/* This takes a while, so let's figure out what parts take the time. */
long totalBigQueryTime = 0;
long totalOverlapTime = 0;

/* Do a pretty complex loop that just aims to set target->overlapBases and ->uniqOverlapBases
 * for all targets.  This is complicated by just wanting to keep one chromosome worth of
 * bigWig data in memory. Also just for performance we do a lookup of target range tree to
 * get chromosome specific one to use, which avoids a hash lookup in the inner loop. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    /* Get list of intervals in bigWig for this chromosome, and feed it to a rangeTree. */
    struct lm *lm = lmInit(0);
    long startBigQueryTime = clock1000();
    struct bbiInterval *ivList = bigWigIntervalQuery(bbi, chrom->name, 0, chrom->size, lm);
    long endBigQueryTime = clock1000();
    totalBigQueryTime += endBigQueryTime - startBigQueryTime;
    struct bbiInterval *iv;

    /* Loop through all targets adding overlaps from ivList */
    long startOverlapTime = clock1000();
    struct target *target;
    for (target = targetList; target != NULL; target = target->next)
        {
	struct genomeRangeTree *grt = target->grt;
	struct rbTree *targetTree = genomeRangeTreeFindRangeTree(grt, chrom->name);
	if (targetTree != NULL)
	    {
	    for (iv = ivList; iv != NULL; iv = iv->next)
		{
		int overlap = rangeTreeOverlapSize(targetTree, iv->start, iv->end);
		target->uniqOverlapBases += overlap;
		target->overlapBases += overlap * iv->val;
		}
	    }
	}
    long endOverlapTime = clock1000();
    totalOverlapTime += endOverlapTime - startOverlapTime;
    lmCleanup(&lm);
    }

verbose(1, "totalBig %0.3f, totalOverlap %0.3f\n", 0.001*totalBigQueryTime, 0.001*totalOverlapTime);

/* Now loop through targets and save enrichment info to database */
struct target *target;
for (target = targetList; target != NULL; target = target->next)
    {
    struct cdwQaEnrich *enrich = enrichFromOverlaps(ef, vf, assembly, target, 
	target->overlapBases, target->uniqOverlapBases);
    cdwQaEnrichSaveToDb(conn, enrich, "cdwQaEnrich", 128);
    cdwQaEnrichFree(&enrich);
    }

bbiChromInfoFreeList(&chromList);
bigWigFileClose(&bbi);
freez(&bigWigPath);
}
Ejemplo n.º 11
0
void doEnrichmentsFromBigBed(struct sqlConnection *conn, 
    struct cdwFile *ef, struct cdwValidFile *vf, 
    struct cdwAssembly *assembly, struct target *targetList)
/* Figure out enrichments from a bigBed file.  For every target not marked skip,
 * overlapBases gets the overlap of each bigBed item with the target, item by item,
 * while uniqOverlapBases gets the overlap of the target with a range tree built from
 * all the items on the chromosome.  One cdwQaEnrich row is saved per such target. */
{
/* Get path to bigBed, open it, and read all chromosomes. */
char *bedPath = cdwPathForFileId(conn, ef->id);
struct bbiFile *bedFile = bigBedFileOpen(bedPath);
struct bbiChromInfo *chromInfoList = bbiChromList(bedFile);
struct bbiChromInfo *chromInfo;

/* Work one chromosome at a time so only one chromosome worth of bigBed data is in
 * memory. */
for (chromInfo = chromInfoList; chromInfo != NULL; chromInfo = chromInfo->next)
    {
    /* Fetch the items on this chromosome and feed them all to a range tree. */
    struct lm *localMem = lmInit(0);
    struct bigBedInterval *items = bigBedIntervalQuery(bedFile, chromInfo->name, 0,
        chromInfo->size, 0, localMem);
    struct bigBedInterval *item;
    struct rbTree *itemTree = rangeTreeNew();
    for (item = items; item != NULL; item = item->next)
        rangeTreeAdd(itemTree, item->start, item->end);
    struct range *itemRanges = rangeTreeList(itemTree);

    /* Add overlaps to each target: per item for overlapBases, per range-tree range
     * for uniqOverlapBases. */
    struct target *tg;
    for (tg = targetList; tg != NULL; tg = tg->next)
        {
        if (tg->skip)
            continue;
        struct rbTree *chromTree = genomeRangeTreeFindRangeTree(tg->grt, chromInfo->name);
        if (chromTree == NULL)
            continue;

        for (item = items; item != NULL; item = item->next)
            {
            int itemOverlap = rangeTreeOverlapSize(chromTree, item->start, item->end);
            tg->overlapBases += itemOverlap;
            }

        struct range *itemRange;
        for (itemRange = itemRanges; itemRange != NULL; itemRange = itemRange->next)
            {
            int rangeOverlap = rangeTreeOverlapSize(chromTree, itemRange->start, itemRange->end);
            tg->uniqOverlapBases += rangeOverlap;
            }
        }
    rangeTreeFree(&itemTree);
    lmCleanup(&localMem);
    }

/* Save enrichment info for each target that was not skipped. */
struct target *tg;
for (tg = targetList; tg != NULL; tg = tg->next)
    {
    if (!tg->skip)
        {
        struct cdwQaEnrich *enrichRow = enrichFromOverlaps(ef, vf, assembly, tg, 
            tg->overlapBases, tg->uniqOverlapBases);
        cdwQaEnrichSaveToDb(conn, enrichRow, "cdwQaEnrich", 128);
        cdwQaEnrichFree(&enrichRow);
        }
    }

/* Clean up. */
bbiChromInfoFreeList(&chromInfoList);
bigBedFileClose(&bedFile);
freez(&bedPath);
}
Ejemplo n.º 12
0
void doBigBedReplicate(struct sqlConnection *conn, char *format, struct cdwAssembly *assembly,
    struct cdwFile *elderEf, struct cdwValidFile *elderVf,
    struct cdwFile *youngerEf, struct cdwValidFile *youngerVf)
/* Do replicate analysis between elder and younger bigBed files.  Saves up to two
 * records, each only if the pair does not have it yet:
 *   cdwQaPairCorrelation   - pearsonOverall and pearsonInEnriched
 *   cdwQaPairSampleOverlap - bases covered by each sample and by both
 * The column correlated is signalVal for narrowPeak/broadPeak and score otherwise.
 * The enrichment target tree is only loaded when the elder file's enrichedIn is
 * non-empty and not "unknown"; otherwise NULL is passed to addBbCorrelations. */
{
/* If got both pairs, work is done already */
if (pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairSampleOverlap") 
    && pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairCorrelation"))
    return;

/* Pick the numerical column to correlate, as an index into the fields that follow
 * chrom/start/end. */
int numColIx = 0;
if (sameString(format, "narrowPeak") || sameString(format, "broadPeak"))
    numColIx = 6;	// signalVal
else
    numColIx = 4;	// score
numColIx -= 3;		// Subtract off chrom/start/end
char *enrichedIn = elderVf->enrichedIn;
struct genomeRangeTree *targetGrt = NULL;
if (!isEmpty(enrichedIn) && !sameString(enrichedIn, "unknown"))
    targetGrt = genomeRangeTreeForTarget(conn, assembly, enrichedIn);

/* Get open big bed files for both younger and older. */
char *elderPath = cdwPathForFileId(conn, elderEf->id);
char *youngerPath = cdwPathForFileId(conn, youngerEf->id);
struct bbiFile *elderBbi = bigBedFileOpen(elderPath);
struct bbiFile *youngerBbi = bigBedFileOpen(youngerPath);

/* Loop through a chromosome at a time adding to correlations and span totals.  The
 * chromosome list comes from the elder file only. */
struct correlate *c = correlateNew(), *cInEnriched = correlateNew();
struct bbiChromInfo *chrom, *chromList = bbiChromList(elderBbi);
long long elderTotalSpan = 0, youngerTotalSpan = 0, overlapTotalSpan = 0;
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    addBbCorrelations(chrom, targetGrt, elderBbi, youngerBbi, numColIx, c, cInEnriched,
        &elderTotalSpan, &youngerTotalSpan, &overlapTotalSpan);
    }

/* Make up correlation structure and save. */
if (!pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairCorrelation"))
    {
    struct cdwQaPairCorrelation *cor;
    AllocVar(cor);
    cor->elderFileId = elderVf->fileId;
    cor->youngerFileId = youngerVf->fileId;
    cor->pearsonOverall = correlateResult(c);
    cor->pearsonInEnriched = correlateResult(cInEnriched);
    cdwQaPairCorrelationSaveToDb(conn, cor, "cdwQaPairCorrelation", 128);
    freez(&cor);
    }

/* Also make up sample structure and save.  */
if (!pairExists(conn, elderEf->id, youngerEf->id, "cdwQaPairSampleOverlap"))
    {
    struct cdwQaPairSampleOverlap *sam;
    AllocVar(sam);
    sam->elderFileId = elderVf->fileId;
    sam->youngerFileId = youngerVf->fileId;
    sam->elderSampleBases = elderTotalSpan;
    sam->youngerSampleBases = youngerTotalSpan;
    sam->sampleOverlapBases = overlapTotalSpan;
    setSampleSampleEnrichment(sam, format, assembly, elderVf, youngerVf);
    cdwQaPairSampleOverlapSaveToDb(conn, sam, "cdwQaPairSampleOverlap", 128);
    freez(&sam);
    }

/* Clean up.  Both correlate accumulators are released, not just c. */
genomeRangeTreeFree(&targetGrt);
correlateFree(&cInEnriched);
correlateFree(&c);
/* NOTE(review): chromList from bbiChromList() is not released here.
 * doEnrichmentsFromBigBed in this file frees its list with
 * bbiChromInfoFreeList(&chromList) before closing the file - consider the same. */
bigBedFileClose(&youngerBbi);
bigBedFileClose(&elderBbi);
freez(&youngerPath);
freez(&elderPath);
}