// // CRITCHI -- Compute critical chi-square value to // produce given p. We just do a bisection // search for a value within CHI_EPSILON, // relying on the monotonicity of pochisq(). // double critchi( double p, int df) { double CHI_EPSILON = 0.000001; // Accuracy of critchi approximation double CHI_MAX = 99999.0; // Maximum chi-square value double minchisq = 0.0; double maxchisq = CHI_MAX; double chisqval; if (p <= 0.0) { return maxchisq; } else { if (p >= 1.0) { return 0.0; } } chisqval = df / sqrt(p); // fair first value while ((maxchisq - minchisq) > CHI_EPSILON) { if (pochisq(chisqval, df) < p) { maxchisq = chisqval; } else { minchisq = chisqval; } chisqval = (maxchisq + minchisq) * 0.5; } return chisqval; }
// Chi-square probability for statistic z at df degrees of freedom.
//
//   df == 1  -> 2 * ntail(sqrt(z))
//   df == 2  -> exp(-z / 2)
//   else     -> pochisq(z, df), except that results below 1e-6 are
//               recomputed with rtlg(df / 2, z / 2) instead
//               (presumably for accuracy in the far tail -- confirm).
double rtlchsq(int df, double z)
{
    switch (df) {
    case 1:
        return 2.0 * ntail(sqrt(z));
    case 2:
        return exp(-0.5 * z);
    default:
        break;
    }

    const double tail = pochisq(z, df);
    if (tail < 1.0e-6) {
        const double half_df = 0.5 * (double) df;
        const double half_z = 0.5 * z;
        return rtlg(half_df, half_z);
    }
    return tail;
}
/*
 * Return, as a Ruby number, the chi-square probability of the wrapped
 * context expressed as a percentage.
 *
 * rt_end() is invoked first when the context is not yet flagged as ended.
 * The probability comes from pochisq() on the context's r_chisq, with 1
 * degree of freedom when the context is in binary mode and 255 otherwise.
 * Probabilities below 0.0001 are reported as 0.009 and those above 0.9999
 * as 99.991; everything else is the probability times 100.
 */
VALUE rb_rt_chisquare_probability_force(VALUE self)
{
    rt_ctx *ctx;

    Data_Get_Struct(self, rt_ctx, ctx);

    if (!ctx->ended) {
        rt_end(ctx);
    }

    const int df = ctx->binary ? 1 : 255;
    const double chip = pochisq(ctx->r_chisq, df);

    /* Pin the extremes to fixed sentinel percentages. */
    if (chip < 0.0001) {
        return DBL2NUM(0.009);
    }
    if (chip > 0.9999) {
        return DBL2NUM(99.991);
    }
    return DBL2NUM(chip * 100);
}
/**
 *  Calculate how frequently each byte occurs in data.
 *  If every possible byte occurs with the same frequency then the
 *  data is perfectly random.
 *
 *  The expected occurrence of each possible byte is data.size / 256
 *
 *  If one byte occurs more often than another you get a small error
 *  (1*1) / expected.  If it occurs much more the error grows by
 *  the square.
 *
 *  @param data  bytes to examine
 *  @return true when data is empty or shorter than RAND_THRESHOLD (too
 *          small to judge), or when the chi-square probability reported
 *          by pochisq() for 255 degrees of freedom lies strictly between
 *          0.20 and 0.80; false otherwise.
 */
bool is_random( const fc::vector<char>& data )
{
   // An empty input has no distribution to test (and would divide by zero).
   if( data.size() == 0 || data.size() < RAND_THRESHOLD )
      return true;

   // 64-bit counters: a 16-bit bucket wraps as soon as one byte value
   // occurs more than 65535 times, which corrupts the statistic.
   uint64_t buckets[256] = { 0 };
   for( auto itr = data.begin(); itr != data.end(); ++itr )
      buckets[(uint8_t)*itr]++;

   // Divide in floating point; integer division truncates the expected
   // count (to 0 for inputs shorter than 256 bytes).
   const double expected = data.size() / 256.0;

   double x2 = 0;
   for( const uint64_t count : buckets )
   {
      const double de = static_cast<double>( count ) - expected;
      x2 += (de*de) / expected;
   }
   //slog( "%s", fc::to_hex( data.data(), 128 ).c_str() );
   //slog( "%d", data.size() );

   const double prob = pochisq( x2, 255 );
   //slog( "Prob %f", prob );

   // smaller chunks have a higher chance of 'low' entropy
   return prob < .80 && prob > .20;
}
int main(int argc, char *argv[]) { if (!getenv("FORCE_PRNG_VERF")) { fprintf(stderr, "prng: Skipping PRNG verification test\n" "prng: Set FORCE_PRNG_VERF=1 to enable PRNG verification\n"); return 77; } uint8_t ob[BUFFER_SIZE]; unsigned long totalc = 0; /* Total character count */ double montepi, chip, scc, ent, mean, chisq; /* Initialise for calculations */ rt_init(0); /* Scan input and count character occurrences */ for (totalc = 0; totalc < SAMPLE_SIZE_BYTES; totalc += BUFFER_SIZE) { assert(prng_get_random_bytes(ob, BUFFER_SIZE) >= 0); rt_add(ob, BUFFER_SIZE); } /* Complete calculation and return sequence metrics */ rt_end(&ent, &chisq, &mean, &montepi, &scc); /* Calculate probability of observed distribution occurring from the results of the Chi-Square test */ chip = pochisq(chisq, 255); /* Print calculated results */ printf("Entropy:\n"); printf("========\n"); printf("Entropy = %f bits per byte.\n", ent); printf("\nOptimum compression would reduce the size\n"); printf("of this %ld byte input by %d percent.\n\n", totalc, (int16_t)(100 * (8 - ent) / 8.0)); // Optimum compression would reduction equal to 0% assert((int16_t)(100 * (8 - ent) / 8.0) == 0); printf("Chi-square Test:\n"); printf("================\n"); printf("Chi square distribution for %ld samples is %1.2f, and randomly\n", totalc, chisq); if (chip < 0.0001) { printf( "would exceed this value less than 0.01 percent of the times.\n\n"); } else if (chip > 0.9999) { printf("would exceed this value more than than 99.99 percent of the " "times.\n\n"); } else { printf("would exceed this value %1.2f percent of the times.\n\n", chip * 100); } // Chi-square test result between 10% and 90% assert(90 > (chip * 100)); assert((chip * 100) > 10); printf("Arithmetic Mean:\n"); printf("================\n"); printf("Arithmetic mean value of data bytes is %1.4f (%.1f = random).\n\n", mean, 127.5); // Arithmetic Mean between 127 and 128 assert(127.0 < mean); assert(mean < 128.0); printf("Monte Carlo Value for Pi:\n"); 
printf("=========================\n"); printf("Monte Carlo value for Pi is %1.9f (error %1.2f percent).\n\n", montepi, 100.0 * (fabs(PI - montepi) / PI)); // Monte Carlo Value for Pi less than 0.5 assert(0.5 > 100.0 * (fabs(PI - montepi) / PI)); printf("Serial Correlation Coefficient:\n"); printf("===============================\n"); printf("Serial correlation coefficient is "); if (scc >= -99999) { printf("%1.6f (totally uncorrelated = 0.0).\n", scc); } else { printf("undefined (all values equal!).\n"); } printf("\nSee https://www.fourmilab.ch/random/ for detailed description of " "output\n"); // Serial Correlation Coefficient between -0.005 and 0.005 assert(0.005 > scc); assert(scc > -0.005); return 0; }
// Runs experiments with one inputed data set and varies the memory // Input: takes 5 command line arguments, lower and upper fraction of memory, // names of files the data are in, and number of bins // Output: creates 4 files, the log file holds all the data generated. the tabl // file holds the pvalues deliminated by tabs, the extra file holds all the // calculated statistics. and the pvalue holds the actual and estimated pvalues int main(int argc, char* argv[]) { if (argc < 6) { cout << "usage: VaryMemoryReal lower-memory upper-memory filename1 filename2 num_bins\n"; exit(1); } double lower = atof(argv[1]), upper = atof(argv[2]); char *filename1 = argv[3], *filename2 = argv[4]; int num_buckets = atoi(argv[5]); if (lower <=0 || upper <= 0) { cout << "The memory must be greater than 0.\n"; exit(1); } if (num_buckets <= 0) { cout << "The number of buckets must be greater than 0.\n"; exit(1); } // finds the number of times the experiment will run double memory_percent; int repeats = 0; double mem = lower; while (mem <= (upper + 0.0000001)) // accounts for rounding error { repeats++; mem *= sqrt(10); } double actual_values[repeats]; double GK_values[repeats]; double QD_values[repeats]; double RS_values[repeats]; double percents[repeats]; std::vector<double> data1, data2; // creates and initializes the log file ofstream data_file; char str[100]; name_file(str, argv, 0); data_file.open(str); ifstream input_file; char output[100]; int stream_size1 = 0, stream_size2 = 0; std::default_random_engine generator(1); std::uniform_real_distribution<double> distribution(0.0, 1.0); input_file.open(filename1); while (!input_file.eof()) { input_file >> output; data1.push_back(atof(output) + (distribution(generator) * 0.000000001)); stream_size1++; } input_file.close(); input_file.open(filename2); while (!input_file.eof()) { input_file >> output; data2.push_back(atof(output) + (distribution(generator) * 0.000000001)); stream_size2++; } input_file.close(); data_file << filename1 << 
endl << filename2 << endl; data_file << "num_buckets: " << num_buckets << endl; data_file << "stream 1 size: " << stream_size1 << endl; data_file << "stream 2 size: " << stream_size2 << endl; memory_percent = lower; int i = 0; while (memory_percent <= (upper + 0.00000001)) //accounts for rounding { data_file << "memory percent: " << memory_percent << endl; percents[i] = memory_percent; int sample_size1 = memory_percent * stream_size1; int sample_size2 = memory_percent * stream_size2; // calculates GK statistic ChiSquareContinuous GK_sketch1(sample_size1,1); for (std::vector<double>::iterator j = data1.begin(); j != data1.end();j++) GK_sketch1.insert(*j); ChiSquareContinuous GK_sketch2(sample_size2,1); for (std::vector<double>::iterator j = data2.begin(); j != data2.end();j++) GK_sketch2.insert(*j); double GK_stat = GK_sketch1.two_sample_statistic(GK_sketch2, num_buckets); GK_values[i] = GK_stat; data_file << "GK = " << GK_stat << endl; // calculates real statistic double *upper_intervals = GK_sketch1.get_upper(); double *lower_intervals = GK_sketch1.get_lower(); double constant_1 = sqrt((double)stream_size2/stream_size1); double constant_2 = sqrt((double)stream_size1/stream_size2); double chi_squared = 0; for (int i = 0; i < num_buckets; i++) { double frequency1 = 0, frequency2 = 0; for (std::vector<double>::iterator j = data1.begin(); j!=data1.end();j++) { if (*j <= upper_intervals[i+1] && *j > lower_intervals[i+1]) frequency1++; } for (std::vector<double>::iterator j = data2.begin(); j!=data2.end();j++) { if (*j <= upper_intervals[i+1] && *j > lower_intervals[i+1]) frequency2++; } double lambda = frequency1 * constant_1 - frequency2 * constant_2; chi_squared += (lambda * lambda) / (frequency1 + frequency2); } actual_values[i] = chi_squared; data_file << "actual = " << chi_squared << endl; memory_percent *= sqrt(10); i++; } data_file.close(); // creates pvalue file name_file(str, argv, 3); data_file.open(str); // creates table file ofstream data2_file; 
name_file(str, argv, 1); data2_file.open(str); // creates extra file ofstream data3_file; name_file(str, argv, 2); data3_file.open(str); int deg_freedom = num_buckets; if (stream_size1 != stream_size2) deg_freedom--; for (int i = 0; i < repeats; i++) { // adds values to the table data2_file << percents[i] * 100 << "\t"; double error = abs(pochisq(GK_values[i], deg_freedom) - pochisq(actual_values[i], deg_freedom)); data2_file << error << endl; // adds values to pvalue file data_file << pochisq(actual_values[i], deg_freedom) << " actual" << endl; data_file << pochisq(GK_values[i], deg_freedom) << " GK" << endl; // adds values to extra file data3_file << percents[i] * 100 << "\t"; error = abs(GK_values[i] - actual_values[i]) / actual_values[i]; data3_file << error << endl; } data3_file.close(); data2_file.close(); data_file.close(); return 0; }
unsigned char ob = ocb; for (b = 0; b < 8; b++) { ccount[ob & 1]++; ob >>= 1; } } else { ccount[ocb]++; } rt_add(&ocb, 1); } /* Complete calculation and return sequence metrics */ rt_end(&ent, &chisq, &mean, &montepi, &scc); /* Calculate probability of observed distribution occurring from the results of the Chi-Square test */ chip = pochisq(chisq, binary ? 1 : 255); /* Print calculated results */ printf("%ld samples, entropy %f, chisq %1.2f, mean %1.4f, chip %1.2f %s pi %f scc %f\n", totalc, ent, chisq, mean, chip*100, attn(chip), montepi, scc); *csq = chisq; } void test(void) { double chip,chisq; int i;
// Runs experiments with one inputed data set and varies the memory // Input: takes 5 command line arguments, lower and upper fraction of memory, // names of files the data are in, and number of bins // Output: creates 4 files, the log file holds all the data generated, the // table file holds the pvalues deliminated by tabs, the extra file holds all // the calculated statistics. and the pvalue holds the actual and estimated // pvalues int main(int argc, char* argv[]) { if (argc < 5) { cout << "usage: VaryMemoryCategorical lower-memory upper-memory filename1 filename2" << endl; exit(1); } double lower = atof(argv[1]), upper = atof(argv[2]); char *filename1 = argv[3], *filename2 = argv[4]; double memory_percent, mem = lower; int repeats = 0; while (mem <= (upper + 0.00000001)) { repeats++; mem += 10; } double actual_values[repeats], estimated_values[repeats], percents[repeats]; long times[repeats]; long times2[repeats]; std::vector<double> data1, data2; std::unordered_map<double,int> stream1, stream2; // creates and initializes the log file ofstream data_file; char str[100]; name_file(str, argv, 0); data_file.open(str); ifstream input_file; char output[100]; int stream_size1 = 0, stream_size2 = 0; input_file.open(filename1); while (!input_file.eof()) { input_file >> output; data1.push_back(atof(output)); stream_size1++; if (stream1.find(atof(output)) == stream1.end()) stream1.insert(std::make_pair(atof(output),1)); else stream1[atof(output)] += 1; } input_file.close(); input_file.open(filename2); while (!input_file.eof()) { input_file >> output; data2.push_back(atof(output)); stream_size2++; if (stream2.find(atof(output)) == stream2.end()) stream2.insert(std::make_pair(atof(output),1)); else stream2[atof(output)] += 1; } input_file.close(); data_file << filename1 << endl << filename2 << endl; data_file << "stream 1 size: " << stream_size1 << endl; data_file << "stream 2 size: " << stream_size2 << endl; data_file << "number of categories: " << stream1.size() << endl; 
memory_percent = lower; int i = 0, num_categories; while (memory_percent <= (upper + 0.00000001)) //accounts for rounding { data_file << "memory percent: " << memory_percent << endl; percents[i] = memory_percent; // calculates the estimated statistic ChiSquareCategorical sketch1(memory_percent); ChiSquareCategorical sketch2(memory_percent); timeval timeBefore, timeAfter; // initializes variables long diffSeconds, diffUSeconds; gettimeofday(&timeBefore, NULL); for (std::vector<double>::iterator j = data1.begin(); j != data1.end();j++) sketch1.insert(*j); for (std::vector<double>::iterator j = data2.begin(); j != data2.end();j++) sketch2.insert(*j); gettimeofday(&timeAfter, NULL); // get time for insertion diffSeconds = timeAfter.tv_sec - timeBefore.tv_sec; diffUSeconds = timeAfter.tv_usec - timeBefore.tv_usec; times[i] = diffSeconds; times2[i] = diffUSeconds; double estimated_stat = sketch1.calculate_statistic(sketch2, 0); estimated_values[i] = estimated_stat; data_file << "estimate = " << estimated_stat << endl; // calculates actual statistic double constant1 = sqrt(double(stream_size2) / double(stream_size1)); double constant2 = sqrt(double(stream_size1) / double(stream_size2)); double actual_stat = 0; for (std::unordered_map<double,int>::const_iterator j = stream1.begin(); j!= stream1.end(); j++) { double frequency1 = j->second; double frequency2 = 0; num_categories++; if (stream2.find(j->first) != stream2.end()) frequency2 = stream2[j->first]; double value = frequency1 * constant1 - frequency2 * constant2; actual_stat += (value * value) / (frequency1 + frequency2); } // have to loop through other stream to find when first one is 0 for (std::unordered_map<double,int>::const_iterator j = stream2.begin(); j!= stream2.end(); j++) { if (stream1.find(j->first) == stream1.end()) { num_categories++; int frequency1 = 0; int frequency2 = j->second; double value = frequency1 * constant1 - frequency2 * constant2; actual_stat += (value * value) / (frequency1 + frequency2); } 
} actual_values[i] = actual_stat; data_file << "actual = " << actual_stat << endl; i++; memory_percent += 10; } data_file.close(); // creates pvalues file name_file(str, argv, 3); data_file.open(str); // creates table file ofstream data2_file; name_file(str, argv, 1); data2_file.open(str); // creates time table file ofstream time_file; char str2[150]; name_file(str2, argv, 5); time_file.open(str2); // creates extra file ofstream data3_file; name_file(str, argv, 2); data3_file.open(str); int deg_freedom = num_categories - 1; for (int i = 0; i < repeats; i++) { // adds values to the table data2_file << percents[i] << "\t"; double error = abs(pochisq(estimated_values[i], deg_freedom) - pochisq(actual_values[i], deg_freedom)); data2_file << error << endl; // adds values to the time table time_file << percents[i] << "\t"; long double avg_time = times[i] + times2[i]/1000000.0; time_file << avg_time << endl; // adds values to pvalue file data_file << pochisq(actual_values[i], deg_freedom) << " actual" << endl; data_file << pochisq(estimated_values[i], deg_freedom) << " estimated" << endl; // adds values to extra file data3_file << percents[i] << "\t"; error = abs(estimated_values[i] - actual_values[i]) / actual_values[i]; data3_file << error << endl; } data3_file.close(); data2_file.close(); data_file.close(); return 0; }