void doSummaryStatsBed(struct sqlConnection *conn) /* Put up page showing summary stats for track that is in database * or that is bed-format custom. */ { struct bed *bedList = NULL; struct region *regionList = getRegions(), *region; char *regionName = getRegionName(); long long regionSize = 0, gapTotal = 0, realSize = 0; long startTime, midTime, endTime; long loadTime = 0, calcTime = 0, freeTime = 0; struct covStats *itemCovList = NULL, *blockCovList = NULL, *cov; int itemCount = 0; struct hTableInfo *hti = getHti(database, curTable, conn); int minScore = BIGNUM, maxScore = -BIGNUM; long long sumScores = 0; boolean hasBlocks = hti->hasBlocks; boolean hasScore = (hti->scoreField[0] != 0); int fieldCount; htmlOpen("%s (%s) Summary Statistics", curTableLabel(), curTable); for (region = regionList; region != NULL; region = region->next) { struct lm *lm = lmInit(64*1024); startTime = clock1000(); bedList = cookedBedList(conn, curTable, region, lm, &fieldCount); if (fieldCount < 12) hasBlocks = FALSE; if (fieldCount < 5) hasScore = FALSE; midTime = clock1000(); loadTime += midTime - startTime; if (bedList != NULL) { itemCount += slCount(bedList); regionSize += region->end - region->start; cov = calcSpanOverRegion(region, bedList); slAddHead(&itemCovList, cov); if (hasBlocks) { cov = calcBlocksOverRegion(region, bedList); slAddHead(&blockCovList, cov); } if (hti->scoreField[0] != 0) { struct bed *bed; for (bed = bedList; bed != NULL; bed = bed->next) { int score = bed->score; if (score < minScore) minScore = score; if (score > maxScore) maxScore = score; sumScores += score; } } } endTime = clock1000(); calcTime += endTime - midTime; lmCleanup(&lm); bedList = NULL; freeTime += clock1000() - endTime; } regionSize = basesInRegion(regionList, 0); gapTotal = gapsInRegion(conn, regionList, 0); realSize = regionSize - gapTotal; hTableStart(); startTime = clock1000(); numberStatRow("item count", itemCount); if (itemCount > 0) { cov = covStatsSum(itemCovList); percentStatRow("item bases", cov->basesCovered, realSize); percentStatRow("item total", cov->sumBases, realSize); numberStatRow("smallest item", cov->minBases); numberStatRow("average item", round((double)cov->sumBases/cov->itemCount)); numberStatRow("biggest item", cov->maxBases); } if (hasBlocks && itemCount > 0) { cov = covStatsSum(blockCovList); hPrintf("<TR><TD>block count</TD><TD ALIGN=RIGHT>"); printLongWithCommas(stdout, cov->itemCount); hPrintf("</TD></TR>\n"); percentStatRow("block bases", cov->basesCovered, realSize); percentStatRow("block total", cov->sumBases, realSize); numberStatRow("smallest block", cov->minBases); numberStatRow("average block", round((double)cov->sumBases/cov->itemCount)); numberStatRow("biggest block", cov->maxBases); } if (hasScore != 0 && itemCount > 0 && sumScores != 0) { numberStatRow("smallest score", minScore); numberStatRow("average score", round((double)sumScores/itemCount)); numberStatRow("biggest score", maxScore); } hTableEnd(); /* Show region and time stats part of stats page. */ webNewSection("Region and Timing Statistics"); hTableStart(); stringStatRow("region", regionName); numberStatRow("bases in region", regionSize); numberStatRow("bases in gaps", gapTotal); floatStatRow("load time", 0.001*loadTime); floatStatRow("calculation time", 0.001*calcTime); floatStatRow("free memory time", 0.001*freeTime); stringStatRow("filter", (anyFilter() ? "on" : "off")); stringStatRow("intersection", (anyIntersection() ? "on" : "off")); hTableEnd(); covStatsFreeList(&itemCovList); covStatsFreeList(&blockCovList); htmlClose(); }
void doSummaryStatsWiggle(struct sqlConnection *conn) /* Put up page showing summary stats for wiggle track. */ { // grab the right trackDb for the current table. The curTrack variable // has the composite trackDb in it struct trackDb *track = hTrackDbForTrack(database, curTable); char *table = curTable; struct region *region, *regionList = getRegions(); char *regionName = getRegionName(); long long regionSize = 0; long long gapTotal = 0; long startTime = 0, wigFetchTime = 0; char splitTableOrFileName[HDB_MAX_TABLE_STRING]; struct customTrack *ct = NULL; boolean isCustom = FALSE; struct wiggleDataStream *wds = NULL; unsigned long long valuesMatched = 0; int regionCount = 0; int regionsDone = 0; unsigned span = 0; char *dataConstraint; double ll = 0.0; double ul = 0.0; boolean hasConstraint = FALSE; char *table2 = NULL; boolean fullGenome = FALSE; boolean statsHeaderDone = FALSE; boolean gotSome = FALSE; char *shortLabel = table; long long statsItemCount = 0; /* global accumulators for overall */ int statsSpan = 0; /* stats summary on a multiple region */ double statsSumData = 0.0; /* output */ double statsSumSquares = 0.0; /* " " */ double lowerLimit = INFINITY; /* " " */ double upperLimit = -1.0 * INFINITY; /* " " */ startTime = clock1000(); if (track != NULL) shortLabel = track->shortLabel; /* Count the regions, when only one, we can do more stats */ for (region = regionList; region != NULL; region = region->next) ++regionCount; htmlOpen("%s (%s) Wiggle Summary Statistics", shortLabel, table); if (anySubtrackMerge(database, curTable)) hPrintf("<P><EM><B>Note:</B> subtrack merge is currently ignored on this " "page (not implemented yet). Statistics shown here are only for " "the primary table %s (%s).</EM>", shortLabel, table); fullGenome = fullGenomeRegion(); WIG_INIT; /* ct, isCustom, hasConstraint, wds and table2 are set here */ for (region = regionList; region != NULL; region = region->next) { struct bed *intersectBedList = NULL; int operations; ++regionsDone; if (table2) intersectBedList = bedTable2(conn, region, table2); operations = wigFetchStats; #if defined(NOT) /* can't do the histogram now, that operation times out */ if (1 == regionCount) operations |= wigFetchAscii; #endif wds->setChromConstraint(wds, region->chrom); if (fullGenome) wds->setPositionConstraint(wds, 0, 0); else wds->setPositionConstraint(wds, region->start, region->end); if (hasConstraint) wds->setDataConstraint(wds, dataConstraint, ll, ul); /* depending on what is coming in on regionList, we may need to be * smart about how often we call getData for these custom tracks * since that is potentially a large file read each time. */ if (isCustom) { if (ct->dbTrack) { struct sqlConnection *trashConn = hAllocConn(CUSTOM_TRASH); struct trackDb *tdb = findTdbForTable(database, curTrack, table, ctLookupName); span = minSpan(trashConn, splitTableOrFileName, region->chrom, region->start, region->end, cart, tdb); wds->setSpanConstraint(wds, span); valuesMatched = getWigglePossibleIntersection(wds, region, CUSTOM_TRASH, table2, &intersectBedList, splitTableOrFileName, operations); hFreeConn(&trashConn); } else { valuesMatched = getWigglePossibleIntersection(wds, region, NULL, table2, &intersectBedList, splitTableOrFileName, operations); /* XXX We need to properly get the smallest span for custom tracks */ /* This is not necessarily the correct answer here */ if (wds->stats) span = wds->stats->span; else span = 1; } } else { if (hFindSplitTable(database, region->chrom, table, splitTableOrFileName, sizeof splitTableOrFileName, NULL)) { span = minSpan(conn, splitTableOrFileName, region->chrom, region->start, region->end, cart, track); wds->setSpanConstraint(wds, span); valuesMatched = getWigglePossibleIntersection(wds, region, database, table2, &intersectBedList, splitTableOrFileName, operations); if (intersectBedList) span = 1; } } /* when doing multiple regions, we need to print out each result as * it happens to keep the connection open to the browser and * prevent any timeout since this could take a while. * (worst case test is quality track on panTro1) */ if (wds->stats) statsItemCount += wds->stats->count; if (wds->stats && (regionCount > 1) && (valuesMatched > 0)) { double sumData = wds->stats->mean * wds->stats->count; double sumSquares; if (wds->stats->count > 1) sumSquares = (wds->stats->variance * (wds->stats->count - 1)) + ((sumData * sumData)/wds->stats->count); else sumSquares = sumData * sumData; /* global accumulators for overall summary */ statsSpan = wds->stats->span; statsSumData += sumData; statsSumSquares += sumSquares; if (wds->stats->lowerLimit < lowerLimit) lowerLimit = wds->stats->lowerLimit; if ((wds->stats->lowerLimit + wds->stats->dataRange) > upperLimit) upperLimit = wds->stats->lowerLimit + wds->stats->dataRange; if (statsHeaderDone) wds->statsOut(wds, database, "stdout", TRUE, TRUE, FALSE, TRUE); else { wds->statsOut(wds, database, "stdout", TRUE, TRUE, TRUE, TRUE); statsHeaderDone = TRUE; } wds->freeStats(wds); gotSome = TRUE; } if ((regionCount > MAX_REGION_DISPLAY) && (regionsDone >= MAX_REGION_DISPLAY)) { hPrintf("<TR><TH ALIGN=CENTER COLSPAN=12> Can not display more " "than %d regions, <BR> would take too much time </TH></TR>\n", MAX_REGION_DISPLAY); break; /* exit this for loop */ } } /*for (region = regionList; region != NULL; region = region->next) */ if (hasConstraint) freeMem(dataConstraint); /* been cloned into wds */ if (1 == regionCount) { statsPreamble(wds, regionList->chrom, regionList->start, regionList->end, span, valuesMatched, table2); /* 3 X TRUE = sort results, html table output, with header, * the FALSE means close the table after printing, no more rows to * come. The case in the if() statement was already taken care of * in the statsPreamble() printout. No need to do that again. */ if ( ! ((valuesMatched == 0) && table2) ) wds->statsOut(wds, database, "stdout", TRUE, TRUE, TRUE, FALSE); regionSize = basesInRegion(regionList,0); gapTotal = gapsInRegion(conn, regionList,0); } else { /* this is a bit of a kludge here since these printouts are done in the * library source wigDataStream.c statsOut() function and * this is a clean up of that. That function should be * pulled out of there and made independent and more * versatile. */ long long realSize; double variance; double stddev; /* Too expensive to lookup the numbers for thousands of regions */ regionSize = basesInRegion(regionList,MAX_REGION_DISPLAY); gapTotal = gapsInRegion(conn, regionList,MAX_REGION_DISPLAY); realSize = regionSize - gapTotal; /* close the table which was left open in the loop above */ if (!gotSome) hPrintf("<TR><TH ALIGN=CENTER COLSPAN=12> No data found matching this request </TH></TR>\n"); hPrintf("<TR><TH ALIGN=LEFT> SUMMARY: </TH>\n"); hPrintf("\t<TD> </TD>\n"); /* chromStart */ hPrintf("\t<TD> </TD>\n"); /* chromEnd */ hPrintf("\t<TD ALIGN=RIGHT> "); printLongWithCommas(stdout, statsItemCount); hPrintf(" </TD>\n" ); hPrintf("\t<TD ALIGN=RIGHT> %d </TD>\n", statsSpan); hPrintf("\t<TD ALIGN=RIGHT> "); printLongWithCommas(stdout, statsItemCount*statsSpan); hPrintf(" (%.2f%%) </TD>\n", 100.0*(double)(statsItemCount*statsSpan)/(double)realSize); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", lowerLimit); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", upperLimit); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", upperLimit - lowerLimit); if (statsItemCount > 0) hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", statsSumData/statsItemCount); else hPrintf("\t<TD ALIGN=RIGHT> 0.0 </TD>\n"); stddev = 0.0; variance = 0.0; if (statsItemCount > 1) { variance = (statsSumSquares - ((statsSumData * statsSumData)/(double) statsItemCount)) / (double) (statsItemCount - 1); if (variance > 0.0) stddev = sqrt(variance); } hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", variance); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", stddev); hPrintf("</TR>\n"); wigStatsTableHeading(stdout, TRUE); hPrintf("</TABLE></TD></TR></TABLE></P>\n"); } #if defined(NOT) /* can't do the histogram now, that operation times out */ /* Single region, we can do the histogram */ if ((valuesMatched > 1) && (1 == regionCount)) { float *valuesArray = NULL; size_t valueCount = 0; struct histoResult *histoGramResult; /* convert the ascii data listings to one giant float array */ valuesArray = wds->asciiToDataArray(wds, valuesMatched, &valueCount); /* histoGram() may return NULL if it doesn't work */ histoGramResult = histoGram(valuesArray, valueCount, NAN, (unsigned) 0, NAN, (float) wds->stats->lowerLimit, (float) (wds->stats->lowerLimit + wds->stats->dataRange), (struct histoResult *)NULL); printHistoGram(histoGramResult, TRUE); /* TRUE == html output */ freeHistoGram(&histoGramResult); wds->freeAscii(wds); wds->freeArray(wds); } #endif wds->freeStats(wds); wiggleDataStreamFree(&wds); wigFetchTime = clock1000() - startTime; webNewSection("Region and Timing Statistics"); hTableStart(); stringStatRow("region", regionName); numberStatRow("bases in region", regionSize); numberStatRow("bases in gaps", gapTotal); floatStatRow("load and calc time", 0.001*wigFetchTime); wigFilterStatRow(conn); stringStatRow("intersection", cartUsualString(cart, hgtaIntersectTable, "off")); hTableEnd(); htmlClose(); } /* void doSummaryStatsWiggle(struct sqlConnection *conn) */
void doSummaryStatsBigWig(struct sqlConnection *conn) /* Put up page showing summary stats for bigWig track. */ { struct trackDb *track = curTrack; char *table = curTable; char *shortLabel = (track == NULL ? table : track->shortLabel); char *fileName = bigWigFileName(table, conn); long startTime = clock1000(); htmlOpen("%s (%s) Big Wig Summary Statistics", shortLabel, table); if (anySubtrackMerge(database, curTable)) hPrintf("<P><EM><B>Note:</B> subtrack merge is currently ignored on this " "page (not implemented yet). Statistics shown here are only for " "the primary table %s (%s).</EM>", shortLabel, table); struct bbiFile *bwf = bigWigFileOpen(fileName); struct region *region, *regionList = getRegions(); double sumData = 0, sumSquares = 0, minVal = 0, maxVal = 0; bits64 validCount = 0; if (!anyFilter() && !anyIntersection()) { for (region = regionList; region != NULL; region = region->next) { struct bbiSummaryElement sum; if (bbiSummaryArrayExtended(bwf, region->chrom, region->start, region->end, bigWigIntervalQuery, 1, &sum)) { if (validCount == 0) { minVal = sum.minVal; maxVal = sum.maxVal; } else { if (sum.minVal < minVal) minVal = sum.minVal; if (sum.maxVal > maxVal) maxVal = sum.maxVal; } sumData += sum.sumData; sumSquares += sum.sumSquares; validCount += sum.validCount; } } } else { double ll, ul; enum wigCompare cmp; getWigFilter(database, curTable, &cmp, &ll, &ul); for (region = regionList; region != NULL; region = region->next) { struct lm *lm = lmInit(0); struct bbiInterval *iv, *ivList; ivList = intersectedFilteredBbiIntervalsOnRegion(conn, bwf, region, cmp, ll, ul, lm); for (iv = ivList; iv != NULL; iv = iv->next) { double val = iv->val; double size = iv->end - iv->start; if (validCount == 0) minVal = maxVal = val; else { if (val < minVal) minVal = val; if (val > maxVal) maxVal = val; } sumData += size*val; sumSquares += size*val*val; validCount += size; } lmCleanup(&lm); } } hTableStart(); floatStatRow("mean", sumData/validCount); floatStatRow("min", minVal); floatStatRow("max", maxVal); floatStatRow("standard deviation", calcStdFromSums(sumData, sumSquares, validCount)); numberStatRow("bases with data", validCount); long long regionSize = basesInRegion(regionList,0); long long gapTotal = gapsInRegion(conn, regionList,0); numberStatRow("bases with sequence", regionSize - gapTotal); numberStatRow("bases in region", regionSize); wigFilterStatRow(conn); stringStatRow("intersection", cartUsualString(cart, hgtaIntersectTable, "off")); long wigFetchTime = clock1000() - startTime; floatStatRow("load and calc time", 0.001*wigFetchTime); hTableEnd(); bbiFileClose(&bwf); htmlClose(); }