void doSummaryStatsWiggle(struct sqlConnection *conn) /* Put up page showing summary stats for wiggle track. */ { // grab the right trackDb for the current table. The curTrack variable // has the composite trackDb in it struct trackDb *track = hTrackDbForTrack(database, curTable); char *table = curTable; struct region *region, *regionList = getRegions(); char *regionName = getRegionName(); long long regionSize = 0; long long gapTotal = 0; long startTime = 0, wigFetchTime = 0; char splitTableOrFileName[HDB_MAX_TABLE_STRING]; struct customTrack *ct = NULL; boolean isCustom = FALSE; struct wiggleDataStream *wds = NULL; unsigned long long valuesMatched = 0; int regionCount = 0; int regionsDone = 0; unsigned span = 0; char *dataConstraint; double ll = 0.0; double ul = 0.0; boolean hasConstraint = FALSE; char *table2 = NULL; boolean fullGenome = FALSE; boolean statsHeaderDone = FALSE; boolean gotSome = FALSE; char *shortLabel = table; long long statsItemCount = 0; /* global accumulators for overall */ int statsSpan = 0; /* stats summary on a multiple region */ double statsSumData = 0.0; /* output */ double statsSumSquares = 0.0; /* " " */ double lowerLimit = INFINITY; /* " " */ double upperLimit = -1.0 * INFINITY; /* " " */ startTime = clock1000(); if (track != NULL) shortLabel = track->shortLabel; /* Count the regions, when only one, we can do more stats */ for (region = regionList; region != NULL; region = region->next) ++regionCount; htmlOpen("%s (%s) Wiggle Summary Statistics", shortLabel, table); if (anySubtrackMerge(database, curTable)) hPrintf("<P><EM><B>Note:</B> subtrack merge is currently ignored on this " "page (not implemented yet). Statistics shown here are only for " "the primary table %s (%s).</EM>", shortLabel, table); fullGenome = fullGenomeRegion(); WIG_INIT; /* ct, isCustom, hasConstraint, wds and table2 are set here */ for (region = regionList; region != NULL; region = region->next) { struct bed *intersectBedList = NULL; int operations; ++regionsDone; if (table2) intersectBedList = bedTable2(conn, region, table2); operations = wigFetchStats; #if defined(NOT) /* can't do the histogram now, that operation times out */ if (1 == regionCount) operations |= wigFetchAscii; #endif wds->setChromConstraint(wds, region->chrom); if (fullGenome) wds->setPositionConstraint(wds, 0, 0); else wds->setPositionConstraint(wds, region->start, region->end); if (hasConstraint) wds->setDataConstraint(wds, dataConstraint, ll, ul); /* depending on what is coming in on regionList, we may need to be * smart about how often we call getData for these custom tracks * since that is potentially a large file read each time. */ if (isCustom) { if (ct->dbTrack) { struct sqlConnection *trashConn = hAllocConn(CUSTOM_TRASH); struct trackDb *tdb = findTdbForTable(database, curTrack, table, ctLookupName); span = minSpan(trashConn, splitTableOrFileName, region->chrom, region->start, region->end, cart, tdb); wds->setSpanConstraint(wds, span); valuesMatched = getWigglePossibleIntersection(wds, region, CUSTOM_TRASH, table2, &intersectBedList, splitTableOrFileName, operations); hFreeConn(&trashConn); } else { valuesMatched = getWigglePossibleIntersection(wds, region, NULL, table2, &intersectBedList, splitTableOrFileName, operations); /* XXX We need to properly get the smallest span for custom tracks */ /* This is not necessarily the correct answer here */ if (wds->stats) span = wds->stats->span; else span = 1; } } else { if (hFindSplitTable(database, region->chrom, table, splitTableOrFileName, sizeof splitTableOrFileName, NULL)) { span = minSpan(conn, splitTableOrFileName, region->chrom, region->start, region->end, cart, track); wds->setSpanConstraint(wds, span); valuesMatched = getWigglePossibleIntersection(wds, region, database, table2, &intersectBedList, splitTableOrFileName, operations); if (intersectBedList) span = 1; } } /* when doing multiple regions, we need to print out each result as * it happens to keep the connection open to the browser and * prevent any timeout since this could take a while. * (worst case test is quality track on panTro1) */ if (wds->stats) statsItemCount += wds->stats->count; if (wds->stats && (regionCount > 1) && (valuesMatched > 0)) { double sumData = wds->stats->mean * wds->stats->count; double sumSquares; if (wds->stats->count > 1) sumSquares = (wds->stats->variance * (wds->stats->count - 1)) + ((sumData * sumData)/wds->stats->count); else sumSquares = sumData * sumData; /* global accumulators for overall summary */ statsSpan = wds->stats->span; statsSumData += sumData; statsSumSquares += sumSquares; if (wds->stats->lowerLimit < lowerLimit) lowerLimit = wds->stats->lowerLimit; if ((wds->stats->lowerLimit + wds->stats->dataRange) > upperLimit) upperLimit = wds->stats->lowerLimit + wds->stats->dataRange; if (statsHeaderDone) wds->statsOut(wds, database, "stdout", TRUE, TRUE, FALSE, TRUE); else { wds->statsOut(wds, database, "stdout", TRUE, TRUE, TRUE, TRUE); statsHeaderDone = TRUE; } wds->freeStats(wds); gotSome = TRUE; } if ((regionCount > MAX_REGION_DISPLAY) && (regionsDone >= MAX_REGION_DISPLAY)) { hPrintf("<TR><TH ALIGN=CENTER COLSPAN=12> Can not display more " "than %d regions, <BR> would take too much time </TH></TR>\n", MAX_REGION_DISPLAY); break; /* exit this for loop */ } } /*for (region = regionList; region != NULL; region = region->next) */ if (hasConstraint) freeMem(dataConstraint); /* been cloned into wds */ if (1 == regionCount) { statsPreamble(wds, regionList->chrom, regionList->start, regionList->end, span, valuesMatched, table2); /* 3 X TRUE = sort results, html table output, with header, * the FALSE means close the table after printing, no more rows to * come. The case in the if() statement was already taken care of * in the statsPreamble() printout. No need to do that again. */ if ( ! ((valuesMatched == 0) && table2) ) wds->statsOut(wds, database, "stdout", TRUE, TRUE, TRUE, FALSE); regionSize = basesInRegion(regionList,0); gapTotal = gapsInRegion(conn, regionList,0); } else { /* this is a bit of a kludge here since these printouts are done in the * library source wigDataStream.c statsOut() function and * this is a clean up of that. That function should be * pulled out of there and made independent and more * versatile. */ long long realSize; double variance; double stddev; /* Too expensive to lookup the numbers for thousands of regions */ regionSize = basesInRegion(regionList,MAX_REGION_DISPLAY); gapTotal = gapsInRegion(conn, regionList,MAX_REGION_DISPLAY); realSize = regionSize - gapTotal; /* close the table which was left open in the loop above */ if (!gotSome) hPrintf("<TR><TH ALIGN=CENTER COLSPAN=12> No data found matching this request </TH></TR>\n"); hPrintf("<TR><TH ALIGN=LEFT> SUMMARY: </TH>\n"); hPrintf("\t<TD> </TD>\n"); /* chromStart */ hPrintf("\t<TD> </TD>\n"); /* chromEnd */ hPrintf("\t<TD ALIGN=RIGHT> "); printLongWithCommas(stdout, statsItemCount); hPrintf(" </TD>\n" ); hPrintf("\t<TD ALIGN=RIGHT> %d </TD>\n", statsSpan); hPrintf("\t<TD ALIGN=RIGHT> "); printLongWithCommas(stdout, statsItemCount*statsSpan); hPrintf(" (%.2f%%) </TD>\n", 100.0*(double)(statsItemCount*statsSpan)/(double)realSize); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", lowerLimit); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", upperLimit); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", upperLimit - lowerLimit); if (statsItemCount > 0) hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", statsSumData/statsItemCount); else hPrintf("\t<TD ALIGN=RIGHT> 0.0 </TD>\n"); stddev = 0.0; variance = 0.0; if (statsItemCount > 1) { variance = (statsSumSquares - ((statsSumData * statsSumData)/(double) statsItemCount)) / (double) (statsItemCount - 1); if (variance > 0.0) stddev = sqrt(variance); } hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", variance); hPrintf("\t<TD ALIGN=RIGHT> %g </TD>\n", stddev); hPrintf("</TR>\n"); wigStatsTableHeading(stdout, TRUE); hPrintf("</TABLE></TD></TR></TABLE></P>\n"); } #if defined(NOT) /* can't do the histogram now, that operation times out */ /* Single region, we can do the histogram */ if ((valuesMatched > 1) && (1 == regionCount)) { float *valuesArray = NULL; size_t valueCount = 0; struct histoResult *histoGramResult; /* convert the ascii data listings to one giant float array */ valuesArray = wds->asciiToDataArray(wds, valuesMatched, &valueCount); /* histoGram() may return NULL if it doesn't work */ histoGramResult = histoGram(valuesArray, valueCount, NAN, (unsigned) 0, NAN, (float) wds->stats->lowerLimit, (float) (wds->stats->lowerLimit + wds->stats->dataRange), (struct histoResult *)NULL); printHistoGram(histoGramResult, TRUE); /* TRUE == html output */ freeHistoGram(&histoGramResult); wds->freeAscii(wds); wds->freeArray(wds); } #endif wds->freeStats(wds); wiggleDataStreamFree(&wds); wigFetchTime = clock1000() - startTime; webNewSection("Region and Timing Statistics"); hTableStart(); stringStatRow("region", regionName); numberStatRow("bases in region", regionSize); numberStatRow("bases in gaps", gapTotal); floatStatRow("load and calc time", 0.001*wigFetchTime); wigFilterStatRow(conn); stringStatRow("intersection", cartUsualString(cart, hgtaIntersectTable, "off")); hTableEnd(); htmlClose(); } /* void doSummaryStatsWiggle(struct sqlConnection *conn) */
void genericWiggleClick(struct sqlConnection *conn, struct trackDb *tdb, char *item, int start) /* Display details for Wiggle data tracks. * conn may be NULL for custom tracks when from file */ { char *chrom = cartString(cart, "c"); char table[64]; boolean hasBin; unsigned span = 0; struct wiggleDataStream *wds = wiggleDataStreamNew(); unsigned long long valuesMatched = 0; struct histoResult *histoGramResult; float *valuesArray = NULL; size_t valueCount = 0; struct customTrack *ct = NULL; boolean isCustom = FALSE; int operations = wigFetchStats; /* default operation */ if (startsWith("ct_", tdb->table)) { ct = lookupCt(tdb->table); if (!ct) { warn("<P>wiggleClick: can not find custom wiggle track '%s'</P>", tdb->table); return; } if (! ct->wiggle) { warn("<P>wiggleClick: called to do stats on a custom track that isn't wiggle data ?</P>"); return; } if (ct->dbTrack) { safef(table,ArraySize(table), "%s", ct->dbTableName); span = minSpan(conn, table, chrom, winStart, winEnd, cart, tdb); } else { safef(table,ArraySize(table), "%s", ct->wigFile); span = 0; /* cause all spans to be examined */ } isCustom = TRUE; } else { hFindSplitTable(database, seqName, tdb->table, table, &hasBin); /*span = spanInUse(conn, table, chrom, winStart, winEnd, cart);*/ span = minSpan(conn, table, chrom, winStart, winEnd, cart, tdb); } /* if for some reason we don't have a chrom and win positions, this * should be run in a loop that does one chrom at a time. In the * case of hgc, there seems to be a chrom and a position. */ wds->setSpanConstraint(wds, span); wds->setChromConstraint(wds, chrom); wds->setPositionConstraint(wds, winStart, winEnd); /* If our window is less than some number of points, we can do * the histogram too. */ #define MAX_WINDOW_ALLOW_STATS 100000001 #define MAX_WINDOW_ALLOW_STRING "100,000,000" if ((winEnd - winStart) < MAX_WINDOW_ALLOW_STATS) operations |= wigFetchAscii; /* We want to also fetch the actual data values so we can run a * histogram function on them. You can't fetch the data in the * form of the data array since the span information is then lost. * We have to do the ascii data list format, and prepare that to * send to the histogram function. */ if (isCustom) { if (ct->dbTrack) valuesMatched = wds->getData(wds, CUSTOM_TRASH, table, operations); else valuesMatched = wds->getData(wds, (char *)NULL, table, operations); } else valuesMatched = wds->getData(wds, database, table, operations); statsPreamble(wds, chrom, winStart, winEnd, span, valuesMatched, NULL); /* output statistics table * (+sort, +html output, +with header, +close table) */ wds->statsOut(wds, database, "stdout", TRUE, TRUE, TRUE, FALSE); if ((winEnd - winStart) < MAX_WINDOW_ALLOW_STATS) { char *words[16]; int wordCount = 0; char *dupe = cloneString(tdb->type); double minY, maxY, tDbMinY, tDbMaxY; float hMin, hMax, hRange; wordCount = chopLine(dupe, words); wigFetchMinMaxY(tdb, &minY, &maxY, &tDbMinY, &tDbMaxY, wordCount, words); hMin = min(minY,tDbMinY); hMax = max(maxY,tDbMaxY); hRange = hMax - hMin; /* convert the ascii data listings to one giant float array */ valuesArray = wds->asciiToDataArray(wds, valuesMatched, &valueCount); /* let's see if we really want to use the range from the track type * line, or the actual range in this data. If there is a good * actual range in the data, use that instead */ if (hRange > 0.0) { if (wds->stats->dataRange != 0) hRange = 0.0; } /* If we have a valid range, use a specified 20 bin histogram * NOTE: pass 21 as binCount to get a 20 bin histogram */ if (hRange > 0.0) histoGramResult = histoGram(valuesArray, valueCount, (hRange/20.0), (unsigned) 21, hMin, hMin, hMax, (struct histoResult *)NULL); else histoGramResult = histoGram(valuesArray, valueCount, NAN, (unsigned) 0, NAN, (float) wds->stats->lowerLimit, (float) (wds->stats->lowerLimit + wds->stats->dataRange), (struct histoResult *)NULL); /* histoGram() may return NULL if it doesn't work, that's OK, the * print out will indicate no results (TRUE == html output) */ printHistoGram(histoGramResult, TRUE); freeHistoGram(&histoGramResult); freeMem(valuesArray); } else { printf("<P>(viewing windows of fewer than %s bases will also" " display a histogram)</P>\n", MAX_WINDOW_ALLOW_STRING); } wiggleDataStreamFree(&wds); }