// Prints the linear correlation coefficient between two attributes of a
// dataset.
//
// Arguments consumed from args, in order:
//   1. a string passed to loadData() to obtain the dataset
//   2. the index of the first attribute
//   3. the index of the second attribute
//   4. optionally "-aboutorigin", in which case 0 is used as the mean of both
//      attributes instead of their actual means
// Any other trailing argument is reported through ThrowError.
void correlation(GArgReader& args)
{
	GMatrix* pA = loadData(args.pop_string());
	Holder<GMatrix> hA(pA);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Consume the remaining arguments; only "-aboutorigin" is recognized
	bool centerOnOrigin = false;
	while(args.size() > 0)
	{
		if(!args.if_pop("-aboutorigin"))
			ThrowError("Invalid option: ", args.peek());
		else
			centerOnOrigin = true;
	}

	// Pick the centring values: zero when requested, otherwise each column's mean
	const double center1 = centerOnOrigin ? 0.0 : pA->mean(attr1);
	const double center2 = centerOnOrigin ? 0.0 : pA->mean(attr2);

	const double coefficient = pA->linearCorrelationCoefficient(attr1, center1, attr2, center2);
	cout.precision(14);
	cout << coefficient << "\n";
}
// Builds a histogram of column `col` of `data`.
//
// data       - the matrix whose rows supply the samples
// col        - the column to histogram
// xmin, xmax - bounds of the histogram. When a bound equals
//              UNKNOWN_REAL_VALUE it is derived from the data instead: the
//              column's extreme value, clipped to lie within 4 standard
//              deviations of the median.
// maxBuckets - upper limit on the number of bins; the actual number is the
//              smaller of this and floor(sqrt(number of rows)).
//
// Every row is added through addSample() with a weight of 1.0.
GHistogram::GHistogram(GMatrix& data, size_t col, double xmin, double xmax, size_t maxBuckets)
{
	// Gather the column statistics needed to choose default bounds
	double lowest, extent;
	data.minAndRangeUnbiased(col, &lowest, &extent);
	const double columnMean = data.mean(col);
	const double columnMedian = data.median(col);
	const double stdDev = sqrt(data.variance(col, columnMean));

	// Use the caller's bounds when given, otherwise clip to median +/- 4 deviations
	m_min = (xmin == UNKNOWN_REAL_VALUE) ? std::max(lowest, columnMedian - 4 * stdDev) : xmin;
	m_max = (xmax == UNKNOWN_REAL_VALUE) ? std::min(lowest + extent, columnMedian + 4 * stdDev) : xmax;

	// Allocate and zero the bins
	const size_t sqrtRows = static_cast<size_t>(floor(sqrt(static_cast<double>(data.rows()))));
	m_binCount = std::min(maxBuckets, sqrtRows);
	m_bins = new double[m_binCount];
	GVec::setAll(m_bins, 0.0, m_binCount);
	m_sum = 0.0;

	// Feed every row's value into the histogram with unit weight
	for(size_t row = 0; row < data.rows(); ++row)
		addSample(data[row][col], 1.0);
}
// Prints summary statistics and several paired significance tests comparing
// two attributes of a dataset.
//
// Arguments consumed from args, in order:
//   1. a string passed to loadData() to obtain the dataset
//   2. the index of the first attribute
//   3. the index of the second attribute
//   4. optionally "-tol <value>": the tolerance below which two paired values
//      are counted as equal (default 0.001). It is also forwarded to the
//      Wilcoxon signed ranks test.
// Any other trailing argument is reported through ThrowError.
//
// Output (to cout): medians, means, standard deviations and less/same/greater
// counts; a paired T-test; a paired T-test with normalized values; and a
// Wilcoxon signed ranks test with P-values from a normal approximation.
void significance(GArgReader& args)
{
	GMatrix* pData = loadData(args.pop_string());
	Holder<GMatrix> hData(pData);
	int attr1 = args.pop_uint();
	int attr2 = args.pop_uint();

	// Parse options
	double tolerance = 0.001;
	while(args.size() > 0)
	{
		if(args.if_pop("-tol"))
			tolerance = args.pop_double();
		else
			ThrowError("Invalid option: ", args.peek());
	}

	// Print some basic stats
	cout.precision(8);
	{
		cout << "### Some basic stats\n";
		cout << "Medians = " << pData->median(attr1) << ", " << pData->median(attr2) << "\n";
		double mean1 = pData->mean(attr1);
		double mean2 = pData->mean(attr2);
		cout << "Means = " << mean1 << ", " << mean2 << "\n";
		double var1 = pData->variance(attr1, mean1);
		double var2 = pData->variance(attr2, mean2);
		cout << "Standard deviations = " << sqrt(var1) << ", " << sqrt(var2) << "\n";

		// Count how the paired values compare, treating differences smaller
		// than the tolerance as ties
		int less = 0;
		int eq = 0;
		int more = 0;
		for(size_t i = 0; i < pData->rows(); i++)
		{
			double* pRow = pData->row(i);
			if(std::abs(pRow[attr1] - pRow[attr2]) < tolerance)
				eq++;
			else if(pRow[attr1] < pRow[attr2])
				less++;
			else
				more++;
		}
		cout << less << " less, " << eq << " same, " << more << " greater\n";
	}

	// Perform the significance tests
	{
		cout << "\n### Paired T-test\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, false);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		cout << "\n### Paired T-test with normalized values\n";
		size_t v;
		double t;
		pData->pairedTTest(&v, &t, attr1, attr2, true);
		double p = GMath::tTestAlphaValue(v, t);
		cout << "v=" << v << ", t=" << t << ", p=" << p << "\n";
	}
	{
		// The heading ends with a newline (like the other section headings) so
		// that the first result line is not glued onto it.
		cout << "\n### Wilcoxon Signed Ranks Test\n";
		int num;
		double wMinus, wPlus;
		pData->wilcoxonSignedRanksTest(attr1, attr2, tolerance, &num, &wMinus, &wPlus);
		cout << "Number of signed ranks: " << num << "\n";
		double w_min = std::min(wMinus, wPlus);
		double w_sum = wPlus - wMinus;
		cout << "W- = " << wMinus << ", W+ = " << wPlus << ", W_min = " << w_min << ", W_sum = " << w_sum << "\n";

		// NOTE(review): p_min is already half of what wilcoxonPValue returns,
		// and the one-tailed line below halves it again. Whether both halvings
		// are intended depends on what GMath::wilcoxonPValue returns -- confirm.
		double p_min = 0.5 * GMath::wilcoxonPValue(num, w_min);
		if(num < 10)
			cout << "Because the number of signed ranks is small, you should use a lookup table, rather than rely on the normal approximation for the P-value.\n";
		cout << "One-tailed P-value (for directional comparisons) computed with a normal approximation using W_min = " << 0.5 * p_min << "\n";
		cout << "Two-tailed P-value (for non-directional comparisons) computed with a normal approximation using W_min = " << p_min << "\n";
		cout << "To show that something is \"better\" than something else, use the one-tailed P-value.\n";
		cout << "Commonly, a P-value less than 0.05 is considered to be significant.\n";
/*
		double p_sum = GMath::wilcoxonPValue(num, w_sum);
		cout << "Directional (one-tailed) P-value computed with W_sum = " << p_sum << "\n";
*/
	}
}