Пример #1
0
//
// CRITCHI  --  Locate the chi-square value at which pochisq()
//              reports probability p for df degrees of freedom.
//              The answer is found by interval halving until the
//              bracket is narrower than the tolerance below; this
//              relies on pochisq() being monotonic (the bracket is
//              lowered whenever pochisq(x, df) < p).
//
//              p <= 0 yields the upper search limit (99999.0) and
//              p >= 1 yields 0.0 without searching.
//
double critchi( double p, int df)
{
    const double tolerance = 0.000001;   // stop once the bracket is this narrow
    const double ceiling = 99999.0;      // largest chi-square value considered

    // Probabilities outside (0, 1) have fixed answers.
    if (p <= 0.0)
        return ceiling;
    if (p >= 1.0)
        return 0.0;

    double lo = 0.0;
    double hi = ceiling;

    // Start from a rough estimate, then halve the bracket around it.
    double guess = df / sqrt(p);
    for (; (hi - lo) > tolerance; guess = (hi + lo) * 0.5) {
        if (pochisq(guess, df) < p)
            hi = guess;
        else
            lo = guess;
    }
    return guess;
}
// RTLCHSQ  --  Tail probability for chi-square value z with df
//              degrees of freedom.
//              df == 1 and df == 2 use closed-form expressions
//              (via ntail() and exp() respectively).  Otherwise
//              pochisq() is used, and when its result is below
//              1.0e-6 the value is recomputed with rtlg(df/2, z/2)
//              (presumably for better accuracy in the far tail).
double rtlchsq(int df, double z)
{
    switch (df) {
    case 1:
        return 2.0 * ntail(sqrt(z));
    case 2:
        return exp(-0.5 * z);
    default:
        break;
    }

    double tail = pochisq(z, df);
    if (tail < 1.0e-6) {
        // Very small probability: defer to rtlg() on the halved arguments.
        double half_df = 0.5 * (double) df;
        double half_z = 0.5 * z;
        return rtlg(half_df, half_z);
    }
    return tail;
}
Пример #3
0
/*
 * Returns, as a Ruby numeric, the chi-square probability of the wrapped
 * rt_ctx expressed as a percentage.
 *
 * The context is finalized with rt_end() first if it has not been ended
 * yet.  pochisq() is evaluated on ctx->r_chisq with 1 as its second
 * argument in binary mode and 255 otherwise.  Probabilities below 0.0001
 * are reported as 0.009 and probabilities above 0.9999 as 99.991; every
 * other probability is multiplied by 100.
 */
VALUE rb_rt_chisquare_probability_force(VALUE self) {
    rt_ctx *state;
    double probability;

    Data_Get_Struct(self, rt_ctx, state);

    /* Make sure the running totals have been finalized. */
    if (!state->ended) {
        rt_end(state);
    }

    probability = pochisq(state->r_chisq, (state->binary ? 1 : 255));

    /* Extreme probabilities are replaced by fixed sentinel percentages. */
    if (probability < 0.0001) {
        return DBL2NUM(0.009);
    }
    if (probability > 0.9999) {
        return DBL2NUM(99.991);
    }
    return DBL2NUM(probability * 100);
}
Пример #4
0
/**
 *  Calculate how frequently each byte occurs in data.
 *  If every possible byte occurs with the same frequency then the
 *  data is perfectly random.
 *
 *  The expected occurrence of each possible byte is data.size / 256
 *  (computed in floating point so that it is not truncated).
 *
 *  If one byte occurs more often than another you get a small error
 *  (1*1) / expected.  If it occurs much more the error grows by
 *  the square.
 *
 *  The accumulated error is passed to pochisq() with 255 as its second
 *  argument and the data is reported as random when the resulting
 *  probability lies strictly between 0.20 and 0.80.
 *
 *  @param data  bytes to examine
 *  @return true when data is shorter than RAND_THRESHOLD (or empty), or
 *          when the probability falls inside (0.20, 0.80); false otherwise.
 */
bool is_random( const fc::vector<char>& data ) {
   // Too little data to judge; also avoids dividing by a zero expectation.
   if( data.size() < RAND_THRESHOLD || data.size() == 0 )
      return true;

   // 32-bit counters: a 16-bit counter wraps once a byte value occurs
   // more than 65535 times, which corrupts the statistic for large or
   // highly repetitive inputs.
   fc::vector<uint32_t> buckets(256);
   memset( buckets.data(), 0, buckets.size() * sizeof(uint32_t) );
   for( auto itr = data.begin(); itr != data.end(); ++itr )
     buckets[(uint8_t)*itr]++;

   // Floating-point division: data.size() / 256 would truncate the
   // expectation (and yield 0 for inputs shorter than 256 bytes).
   const double expected = data.size() / 256.0;

   double x2 = 0;
   for( auto itr = buckets.begin(); itr != buckets.end(); ++itr ) {
       const double de = *itr - expected;
       x2 +=  (de*de) / expected;
   }
   //slog( "%s", fc::to_hex( data.data(), 128 ).c_str() );
   //slog( "%d", data.size() );
   // Keep full double precision; narrowing to float can flip the
   // comparison for probabilities right at the thresholds.
   const double prob = pochisq( x2, 255 );
   //slog( "Prob %f", prob );

   // smaller chunks have a higher chance of 'low' entropy
   return prob < .80 && prob > .20;
}
Пример #5
0
/*
 * PRNG verification test.
 *
 * Unless the FORCE_PRNG_VERF environment variable is set the test is
 * skipped and 77 is returned.  Otherwise SAMPLE_SIZE_BYTES bytes are drawn
 * from prng_get_random_bytes() in BUFFER_SIZE pieces, fed to the rt_*
 * randomness-test routines, and the resulting entropy, chi-square
 * probability, arithmetic mean, Monte Carlo Pi estimate and serial
 * correlation coefficient are printed and checked with assert().
 *
 * Returns 0 when every check passes; a failed check aborts the process.
 */
int main(int argc, char *argv[]) {

    if (!getenv("FORCE_PRNG_VERF")) {
        fprintf(stderr,
                "prng: Skipping PRNG verification test\n"
                "prng: Set FORCE_PRNG_VERF=1 to enable PRNG verification\n");
        return 77;
    }

    uint8_t ob[BUFFER_SIZE];
    unsigned long totalc = 0; /* Total character count */
    double montepi, chip, scc, ent, mean, chisq;

    /* Initialise for calculations */

    rt_init(0);

    /* Scan input and count character occurrences */

    for (totalc = 0; totalc < SAMPLE_SIZE_BYTES; totalc += BUFFER_SIZE) {
        /* The call must not live inside assert(): with NDEBUG defined the
           whole expression would be dropped and `ob` would be analysed
           uninitialised.  abort() keeps the failure mode of a failed
           assertion. */
        if (prng_get_random_bytes(ob, BUFFER_SIZE) < 0) {
            fprintf(stderr, "prng: prng_get_random_bytes failed\n");
            abort();
        }
        rt_add(ob, BUFFER_SIZE);
    }

    /* Complete calculation and return sequence metrics */

    rt_end(&ent, &chisq, &mean, &montepi, &scc);

    /* Calculate probability of observed distribution occurring from
    the results of the Chi-Square test */

    chip = pochisq(chisq, 255);

    /* Percentage by which optimum compression could shrink the input */
    const int16_t compression_pct = (int16_t)(100 * (8 - ent) / 8.0);

    /* Print calculated results */
    printf("Entropy:\n");
    printf("========\n");
    printf("Entropy = %f bits per byte.\n", ent);
    printf("\nOptimum compression would reduce the size\n");
    /* totalc is unsigned long, so it is printed with %lu */
    printf("of this %lu byte input by %d percent.\n\n", totalc,
           compression_pct);

    // Optimum compression would reduction equal to 0%
    assert(compression_pct == 0);

    printf("Chi-square Test:\n");
    printf("================\n");
    printf("Chi square distribution for %lu samples is %1.2f, and randomly\n",
           totalc, chisq);
    if (chip < 0.0001) {
        printf(
            "would exceed this value less than 0.01 percent of the times.\n\n");
    } else if (chip > 0.9999) {
        printf("would exceed this value more than than 99.99 percent of the "
               "times.\n\n");
    } else {
        printf("would exceed this value %1.2f percent of the times.\n\n",
               chip * 100);
    }

    // Chi-square test result between 10% and 90%
    assert(90 > (chip * 100));
    assert((chip * 100) > 10);

    printf("Arithmetic Mean:\n");
    printf("================\n");
    printf("Arithmetic mean value of data bytes is %1.4f (%.1f = random).\n\n",
           mean, 127.5);

    // Arithmetic Mean between 127 and 128
    assert(127.0 < mean);
    assert(mean < 128.0);

    printf("Monte Carlo Value for Pi:\n");
    printf("=========================\n");
    printf("Monte Carlo value for Pi is %1.9f (error %1.2f percent).\n\n",
           montepi, 100.0 * (fabs(PI - montepi) / PI));

    // Monte Carlo Value for Pi less than 0.5
    assert(0.5 > 100.0 * (fabs(PI - montepi) / PI));

    printf("Serial Correlation Coefficient:\n");
    printf("===============================\n");
    printf("Serial correlation coefficient is ");
    if (scc >= -99999) {
        printf("%1.6f (totally uncorrelated = 0.0).\n", scc);
    } else {
        printf("undefined (all values equal!).\n");
    }
    printf("\nSee https://www.fourmilab.ch/random/ for detailed description of "
           "output\n");

    // Serial Correlation Coefficient between -0.005 and 0.005
    assert(0.005 > scc);
    assert(scc > -0.005);

    return 0;
}
Пример #6
0
// Reads whitespace-separated numeric tokens from `filename` and appends them
// to `data`, adding a tiny uniform jitter (at most 1e-9) to every value
// (presumably to break ties between equal values -- confirm with the author).
// Returns false when the file cannot be opened.
static bool read_jittered_stream(const char *filename,
                                 std::default_random_engine &generator,
                                 std::uniform_real_distribution<double> &distribution,
                                 std::vector<double> &data)
{
  ifstream input_file(filename);
  if (!input_file)
    return false;

  char token[100];
  for (;;)
  {
    // Bound the extraction so an over-long token cannot overflow `token`.
    input_file.width(sizeof(token));
    // Test the extraction itself: looping on eof() would process one bogus
    // trailing token and never terminate on a stream in a failed state.
    if (!(input_file >> token))
      break;
    data.push_back(atof(token) + (distribution(generator) * 0.000000001));
  }
  return true;
}

// Runs experiments with one input data set while varying the memory.
// Input: takes 5 command line arguments: lower and upper fraction of memory,
// names of the two files the data are in, and number of bins.
// Output: creates 4 files. The log file holds all the data generated, the
// table file holds the p-value errors delimited by tabs, the extra file holds
// the relative error of the statistics, and the pvalue file holds the actual
// and estimated p-values.
// The memory fraction starts at `lower` and is multiplied by sqrt(10) until
// it exceeds `upper`.
int main(int argc, char* argv[])
{
  if (argc < 6)
  {
    cout << "usage: VaryMemoryReal lower-memory upper-memory filename1 filename2 num_bins\n";
    exit(1);
  }

  double lower = atof(argv[1]), upper = atof(argv[2]);
  char *filename1 = argv[3], *filename2 = argv[4];
  int num_buckets = atoi(argv[5]);

  if (lower <=0 || upper <= 0)
  {
    cout << "The memory must be greater than 0.\n";
    exit(1);
  }
  if (num_buckets <= 0)
  {
    cout << "The number of buckets must be greater than 0.\n";
    exit(1);
  }

  // Builds the list of memory fractions once so that the number of results
  // and the number of experiments actually run can never disagree.
  std::vector<double> percents;
  for (double mem = lower; mem <= (upper + 0.00000001); mem *= sqrt(10.0)) // tolerance accounts for rounding error
    percents.push_back(mem);
  const int repeats = static_cast<int>(percents.size());

  std::vector<double> actual_values(repeats);
  std::vector<double> GK_values(repeats);
  std::vector<double> data1, data2;

  // creates and initializes the log file
  ofstream data_file;
  char str[100];
  name_file(str, argv, 0);
  data_file.open(str);

  std::default_random_engine generator(1);
  std::uniform_real_distribution<double> distribution(0.0, 1.0);

  if (!read_jittered_stream(filename1, generator, distribution, data1))
  {
    cout << "Could not open " << filename1 << ".\n";
    exit(1);
  }
  if (!read_jittered_stream(filename2, generator, distribution, data2))
  {
    cout << "Could not open " << filename2 << ".\n";
    exit(1);
  }

  const int stream_size1 = static_cast<int>(data1.size());
  const int stream_size2 = static_cast<int>(data2.size());
  if (stream_size1 == 0 || stream_size2 == 0)
  {
    // The scaling constants below divide by both stream sizes.
    cout << "Both input files must contain at least one value.\n";
    exit(1);
  }

  data_file << filename1 << endl << filename2 << endl;
  data_file << "num_buckets: " << num_buckets << endl;
  data_file << "stream 1 size: " << stream_size1 << endl;
  data_file << "stream 2 size: " << stream_size2 << endl;

  for (int i = 0; i < repeats; i++)
  {
    double memory_percent = percents[i];
    data_file << "memory percent: " << memory_percent << endl;
    int sample_size1 = memory_percent * stream_size1;
    int sample_size2 = memory_percent * stream_size2;

    // calculates GK statistic
    ChiSquareContinuous GK_sketch1(sample_size1,1);
    for (std::vector<double>::iterator j = data1.begin(); j != data1.end(); ++j)
      GK_sketch1.insert(*j);
    ChiSquareContinuous GK_sketch2(sample_size2,1);
    for (std::vector<double>::iterator j = data2.begin(); j != data2.end(); ++j)
      GK_sketch2.insert(*j);

    double GK_stat = GK_sketch1.two_sample_statistic(GK_sketch2, num_buckets);
    GK_values[i] = GK_stat;
    data_file << "GK = " << GK_stat << endl;

    // calculates real statistic over the bucket boundaries of the first sketch
    double *upper_intervals = GK_sketch1.get_upper();
    double *lower_intervals = GK_sketch1.get_lower();
    double constant_1 = sqrt((double)stream_size2/stream_size1);
    double constant_2 = sqrt((double)stream_size1/stream_size2);
    double chi_squared = 0;

    for (int bucket = 0; bucket < num_buckets; bucket++)
    {
      // Boundaries are read at index bucket+1, exactly as in the sketch layout
      // used by the original code.
      const double bucket_upper = upper_intervals[bucket+1];
      const double bucket_lower = lower_intervals[bucket+1];

      double frequency1 = 0, frequency2 = 0;
      for (std::vector<double>::iterator j = data1.begin(); j != data1.end(); ++j)
      {
        if (*j <= bucket_upper && *j > bucket_lower)
          frequency1++;
      }
      for (std::vector<double>::iterator j = data2.begin(); j != data2.end(); ++j)
      {
        if (*j <= bucket_upper && *j > bucket_lower)
          frequency2++;
      }

      // A bucket that is empty in both streams contributes nothing; dividing
      // would produce 0/0 = NaN and poison the whole statistic.
      if (frequency1 + frequency2 > 0)
      {
        double lambda = frequency1 * constant_1 - frequency2 * constant_2;
        chi_squared += (lambda * lambda) / (frequency1 + frequency2);
      }
    }

    actual_values[i] = chi_squared;
    data_file << "actual = " << chi_squared << endl;
  }
  data_file.close();

  // creates pvalue file
  name_file(str, argv, 3);
  data_file.open(str);

  // creates table file
  ofstream data2_file;
  name_file(str, argv, 1);
  data2_file.open(str);

  // creates extra file
  ofstream data3_file;
  name_file(str, argv, 2);
  data3_file.open(str);

  int deg_freedom = num_buckets;
  if (stream_size1 != stream_size2)
    deg_freedom--;
  for (int i = 0; i < repeats; i++)
  {
    const double actual_p = pochisq(actual_values[i], deg_freedom);
    const double GK_p = pochisq(GK_values[i], deg_freedom);

    // adds values to the table
    // fabs is used so the difference can never be truncated by abs(int).
    data2_file << percents[i] * 100 << "\t";
    double error = fabs(GK_p - actual_p);
    data2_file << error << endl;

    // adds values to pvalue file
    data_file << actual_p << " actual" << endl;
    data_file << GK_p << " GK" << endl;

    // adds values to extra file
    data3_file << percents[i] * 100 << "\t";
    error = fabs(GK_values[i] - actual_values[i]) / actual_values[i];
    data3_file << error << endl;
  }
  data3_file.close();
  data2_file.close();
  data_file.close();
  return 0;
}
Пример #7
0
    	    unsigned char ob = ocb;
	    for (b = 0; b < 8; b++) {
		ccount[ob & 1]++;
		ob >>= 1;
	    }    
	} else {
	    ccount[ocb]++;
	}
	rt_add(&ocb, 1);
    }

    /* Complete calculation and return sequence metrics */
    rt_end(&ent, &chisq, &mean, &montepi, &scc);

    /* Calculate probability of observed distribution occurring from the results of the Chi-Square test */
    chip = pochisq(chisq, binary ? 1 : 255);

    /* Print calculated results */
    printf("%ld samples, entropy %f, chisq %1.2f, mean %1.4f, chip %1.2f %s pi %f scc %f\n",
				 totalc, ent, chisq, mean, chip*100, attn(chip), montepi, scc);

    *csq = chisq;
}



void test(void)
{
double chip,chisq;
int i;
Пример #8
0
// Reads whitespace-separated numeric tokens from `filename`, appending each
// value to `data` and counting its occurrences in `counts`.
// Returns false when the file cannot be opened.
static bool read_categorical_stream(const char *filename,
                                    std::vector<double> &data,
                                    std::unordered_map<double,int> &counts)
{
  ifstream input_file(filename);
  if (!input_file)
    return false;

  char token[100];
  for (;;)
  {
    // Bound the extraction so an over-long token cannot overflow `token`.
    input_file.width(sizeof(token));
    // Test the extraction itself: looping on eof() would count one bogus
    // trailing token and never terminate on a stream in a failed state.
    if (!(input_file >> token))
      break;
    const double value = atof(token);
    data.push_back(value);
    ++counts[value]; // operator[] starts a new category at 0
  }
  return true;
}

// Runs experiments with one input data set while varying the memory.
// Input: takes 4 command line arguments: lower and upper memory setting and
// the names of the two files the data are in. The memory setting starts at
// `lower` and grows in steps of 10 until it exceeds `upper`.
// Output: creates the log file (all the data generated), the table file
// (p-value errors delimited by tabs), the extra file (relative error of the
// statistics), the pvalue file (actual and estimated p-values) and a time
// table file (insertion time per memory setting).
int main(int argc, char* argv[])
{
  if (argc < 5)
  {
    cout << "usage: VaryMemoryCategorical lower-memory upper-memory filename1 filename2" << endl;
    exit(1);
  }

  double lower = atof(argv[1]), upper = atof(argv[2]);
  char *filename1 = argv[3], *filename2 = argv[4];

  // Builds the list of memory settings once; it drives both the experiments
  // and the result tables.
  std::vector<double> percents;
  for (double mem = lower; mem <= (upper + 0.00000001); mem += 10) // tolerance accounts for rounding
    percents.push_back(mem);
  const int repeats = static_cast<int>(percents.size());

  std::vector<double> estimated_values(repeats);
  std::vector<long> times(repeats);   // whole seconds spent inserting
  std::vector<long> times2(repeats);  // microsecond remainder (may be negative)
  std::vector<double> data1, data2;
  std::unordered_map<double,int> stream1, stream2;

  // creates and initializes the log file
  ofstream data_file;
  char str[100];
  name_file(str, argv, 0);
  data_file.open(str);

  if (!read_categorical_stream(filename1, data1, stream1))
  {
    cout << "Could not open " << filename1 << "." << endl;
    exit(1);
  }
  if (!read_categorical_stream(filename2, data2, stream2))
  {
    cout << "Could not open " << filename2 << "." << endl;
    exit(1);
  }

  const int stream_size1 = static_cast<int>(data1.size());
  const int stream_size2 = static_cast<int>(data2.size());
  if (stream_size1 == 0 || stream_size2 == 0)
  {
    // The scaling constants below divide by both stream sizes.
    cout << "Both input files must contain at least one value." << endl;
    exit(1);
  }

  data_file << filename1 << endl << filename2 << endl;
  data_file << "stream 1 size: " << stream_size1 << endl;
  data_file << "stream 2 size: " << stream_size2 << endl;
  data_file << "number of categories: " << stream1.size() << endl;

  // calculates the actual statistic. It depends only on the input data, not
  // on the memory setting, so it is computed once. The categories are counted
  // here exactly once as well: counting them inside the per-memory loop from
  // an uninitialized variable made the degrees of freedom meaningless.
  const double constant1 = sqrt(double(stream_size2) / double(stream_size1));
  const double constant2 = sqrt(double(stream_size1) / double(stream_size2));
  double actual_stat = 0;
  int num_categories = 0;

  for (std::unordered_map<double,int>::const_iterator j = stream1.begin(); j != stream1.end(); ++j)
  {
    const double frequency1 = j->second;
    double frequency2 = 0;
    const std::unordered_map<double,int>::const_iterator match = stream2.find(j->first);
    if (match != stream2.end())
      frequency2 = match->second;
    const double value = frequency1 * constant1 - frequency2 * constant2;
    actual_stat += (value * value) / (frequency1 + frequency2);
    num_categories++;
  }
  // have to loop through other stream to find categories absent from the first
  for (std::unordered_map<double,int>::const_iterator j = stream2.begin(); j != stream2.end(); ++j)
  {
    if (stream1.find(j->first) == stream1.end())
    {
      const double frequency1 = 0;
      const double frequency2 = j->second;
      const double value = frequency1 * constant1 - frequency2 * constant2;
      actual_stat += (value * value) / (frequency1 + frequency2);
      num_categories++;
    }
  }

  for (int i = 0; i < repeats; i++)
  {
    double memory_percent = percents[i];
    data_file << "memory percent: " << memory_percent << endl;

    // calculates the estimated statistic
    ChiSquareCategorical sketch1(memory_percent);
    ChiSquareCategorical sketch2(memory_percent);

    timeval timeBefore, timeAfter;
    gettimeofday(&timeBefore, NULL);

    for (std::vector<double>::iterator j = data1.begin(); j != data1.end(); ++j)
      sketch1.insert(*j);
    for (std::vector<double>::iterator j = data2.begin(); j != data2.end(); ++j)
      sketch2.insert(*j);

    gettimeofday(&timeAfter, NULL); // get time for insertion
    times[i] = timeAfter.tv_sec - timeBefore.tv_sec;
    times2[i] = timeAfter.tv_usec - timeBefore.tv_usec;

    double estimated_stat = sketch1.calculate_statistic(sketch2, 0);

    estimated_values[i] = estimated_stat;
    data_file << "estimate = " << estimated_stat << endl;
    data_file << "actual = " << actual_stat << endl;
  }
  data_file.close();

  // creates pvalues file
  name_file(str, argv, 3);
  data_file.open(str);

  // creates table file
  ofstream data2_file;
  name_file(str, argv, 1);
  data2_file.open(str);

  // creates time table file
  ofstream time_file;
  char str2[150];
  name_file(str2, argv, 5);
  time_file.open(str2);

  // creates extra file
  ofstream data3_file;
  name_file(str, argv, 2);
  data3_file.open(str);

  const int deg_freedom = num_categories - 1;
  const double actual_p = pochisq(actual_stat, deg_freedom);
  for (int i = 0; i < repeats; i++)
  {
    const double estimated_p = pochisq(estimated_values[i], deg_freedom);

    // adds values to the table
    // fabs is used so the difference can never be truncated by abs(int).
    data2_file << percents[i] << "\t";
    double error = fabs(estimated_p - actual_p);
    data2_file << error << endl;

    // adds values to the time table (seconds plus microsecond remainder)
    time_file << percents[i] << "\t";
    long double elapsed_seconds = times[i] + times2[i]/1000000.0;
    time_file << elapsed_seconds << endl;

    // adds values to pvalue file
    data_file << actual_p << " actual" << endl;
    data_file << estimated_p << " estimated" << endl;

    // adds values to extra file
    data3_file << percents[i] << "\t";
    error = fabs(estimated_values[i] - actual_stat) / actual_stat;
    data3_file << error << endl;
  }
  data3_file.close();
  data2_file.close();
  time_file.close();
  data_file.close();
  return 0;
}