/*
 * Estimate entropy and Markov models.
 *
 * Rewinds fp, reads it to the end and counts every byte, every pair of
 * consecutive bytes and every triple of consecutive bytes.  From these
 * counts it derives:
 *   - entropy.memoryless: entropy of the single-byte distribution
 *   - entropy.markov1:    pair entropy minus entropy.memoryless
 *   - entropy.markov2:    triple entropy minus entropy.markov1 minus
 *                         entropy.memoryless
 * The byte occurrences, the total and the three estimates are printed to
 * stdout.
 *
 * stats: statistics object; it is passed to init_statistics() on entry and
 *        to free_statistics() before returning
 * fp:    readable stream; its position is reset with rewind()
 *
 * Streams shorter than two bytes are handled: only the bytes that really
 * exist are counted.
 */
void get_statistics(struct Statistics *stats, FILE* fp)
{
  unsigned char *buffer;
  unsigned char old_buffer[BUFFER_SIZE+2];
  int total = 0;
  int c;
  double sum = 0.0;
  double probability;
  size_t i, bytes_read;

  init_statistics(stats);
  rewind(fp);
  /* The two slots in front of buffer always hold the two bytes that
     precede buffer[0] in the stream, so buffer[-1] and buffer[-2] are
     valid context for the pair/triple counters. */
  buffer = old_buffer+2;

  /* Read the first character, if there is one */
  c = fgetc(fp);
  if(c != EOF)
  {
    old_buffer[0] = (unsigned char)c;
    stats->occurrence.single[old_buffer[0]]++;
    total++;
    /* Read the second character */
    c = fgetc(fp);
  }

  if(c != EOF)
  {
    old_buffer[1] = (unsigned char)c;
    stats->occurrence.single[old_buffer[1]]++;
    increment_count(2, old_buffer, stats->occurrence.multi[0]);
    total++;

    /* Fill buffer until fread() delivers nothing more; this ends the loop
       on end-of-file and on a read error alike */
    while((bytes_read = fread(buffer, 1, BUFFER_SIZE, fp)) > 0)
    {
      for(i=0; i<bytes_read; i++)
      {
        /* Count occurrence of the byte */
        stats->occurrence.single[buffer[i]]++;
        /* Count occurrence of the latest two bytes */
        increment_count(2, buffer+i-1, stats->occurrence.multi[0]);
        /* Count occurrence of the latest three bytes */
        increment_count(3, buffer+i-2, stats->occurrence.multi[1]);
        /* Count total number of bytes */
        total++;
      }

      /* Carry the last two bytes seen so far into the context slots for
         the next chunk.  old_buffer[bytes_read] and
         old_buffer[bytes_read+1] are the two most recent bytes; the
         assignment order is also correct when bytes_read is 1. */
      old_buffer[0] = old_buffer[bytes_read];
      old_buffer[1] = old_buffer[bytes_read+1];
    }
  }

  /* Report occurrences */
  for(i=0; i<256; i++)
    printf("occurrence[%3d] = %d\n", (int)i, stats->occurrence.single[i]);
  /* Report file size */
  printf("total = %d\n", total);

  /* Estimate memoryless entropy */
  for(i=0; i<256; i++)
  {
    if(stats->occurrence.single[i] > 0)
    {
      /* Calculate probability of the byte */
      probability = stats->occurrence.single[i] / (double)total;
      /* Add to sum */
      sum = sum - (probability * log2(probability));
    }
  }
  stats->entropy.memoryless = sum;

  /* Calculate entropy rate of first-order Markov model.
     A stream of total bytes contains total-1 pairs; the count test keeps
     the division from running when there are none. */
  sum = 0;
  for(i=0; i<256*256; i++)
  {
    if(stats->occurrence.multi[0]->mem[i] > 0)
    {
      /* Calculate probability of the sequence */
      probability = stats->occurrence.multi[0]->mem[i] / (double)(total-1);
      /* Add to sum */
      sum = sum - (probability * log2(probability));
    }
  }
  stats->entropy.markov1 = sum - stats->entropy.memoryless;

  /* Calculate entropy rate of second-order Markov model.
     A stream of total bytes contains total-2 triples. */
  sum = 0;
  for(i=0; i<256*256*256; i++)
  {
    if(stats->occurrence.multi[1]->mem[i] > 0)
    {
      /* Calculate probability of the sequence */
      probability = stats->occurrence.multi[1]->mem[i] / (double)(total-2);
      /* Add to sum */
      sum = sum - (probability * log2(probability));
    }
  }
  stats->entropy.markov2 = sum
    - stats->entropy.markov1
    - stats->entropy.memoryless;

  /* Report memoryless entropy */
  printf("Memoryless entropy: %f bit(s)\n", stats->entropy.memoryless);
  /* Report entropy rate of first-order Markov model */
  printf("Markov model (Order 1) entropy rate: %f bit(s)\n", stats->entropy.markov1);
  /* Report entropy rate of second-order Markov model */
  printf("Markov model (Order 2) entropy rate: %f bit(s)\n", stats->entropy.markov2);

  free_statistics(stats);
  return;
}
Example #2
0
/*
 * Classify every series of `test` against the series of `train`.
 *
 * Steps, in order:
 *   1. For every distance flagged in distanceSet->compute, evaluate it
 *      between each test series and each train series, and dump the matrix
 *      to "<output>/<train name>/distances/test_<distance name>.txt".
 *   2. If param->combineAll is set, walk through every non-empty subset of
 *      the flagged distances.  For each subset and each test series, the
 *      selected distances are gathered per train series, passed through
 *      normalize_distance(), and handed to every criterion flagged in
 *      criterias->compute; the returned class is tallied in that criterion's
 *      confusion matrix and error counters.
 *      If param->combineAll is not set, no subset is evaluated.
 *   3. Per subset, the statistics are written to one file per criterion
 *      ("<output>/<train name>/results/<criterion name>.txt") and the error
 *      rates to "<output>/<train name>/results/globalResults.txt".
 *   4. The lowest-error statistics per criterion, and the statistics of the
 *      subset whose index equals criterias->bestSpaceID[k], are appended to
 *      those files.
 *
 * A result file that cannot be opened aborts the run (after releasing
 * everything acquired so far); a distance dump that cannot be opened is
 * reported on stderr and skipped.
 */
void                    classifyDatasetTT(dataset *train, dataset *test, distances *distanceSet, criteria *criterias, mParameters *param)
{
    char                outputStr[1024];
    FILE                *distFile, **outputFile, *globalFile = NULL;
    int                 t1, t2, i, k, c, l;
    float               ***distSet, **tmpSet;
    float               *bestScores, *spaceScores;
    int                 *trueCompute, *curCompute;
    int                 classFound, nbDist, nbComb = 0;
    statistics          **criteriaStats, **bestStats, **spaceStats;

    /* Zero-filled so the cleanup path can recognise rows never allocated */
    distSet = calloc(test->cardinality, sizeof(float **));
    tmpSet = malloc(train->cardinality * sizeof(float *));
    trueCompute = distanceSet->compute;
    curCompute = calloc(distanceSet->nb_distances, sizeof(int));
    outputFile = calloc(criterias->nb_criteria + 1, sizeof(FILE *));
    criteriaStats = calloc(criterias->nb_criteria, sizeof(statistics *));
    bestStats = calloc(criterias->nb_criteria, sizeof(statistics *));
    spaceStats = calloc(criterias->nb_criteria, sizeof(statistics *));
    bestScores = calloc(criterias->nb_criteria, sizeof(float));
    spaceScores = calloc(criterias->nb_criteria, sizeof(float));
    for (i = 0; i < criterias->nb_criteria; i++)
    {
        bestScores[i] = 1.0;
        spaceScores[i] = 1.0;
        criteriaStats[i] = init_statistics(train->nb_classes);
        snprintf(outputStr, sizeof outputStr, "%s/%s/results/%s.txt", param->output, train->name, criterias->name[i]);
        outputFile[i] = fopen(outputStr, "w");
        if (outputFile[i] == NULL)
        {
            perror(outputStr);
            goto cleanup;
        }
    }
    /* The extra slot past the per-criterion files is the global summary */
    snprintf(outputStr, sizeof outputStr, "%s/%s/results/globalResults.txt", param->output, train->name);
    outputFile[criterias->nb_criteria] = fopen(outputStr, "w");
    if (outputFile[criterias->nb_criteria] == NULL)
    {
        perror(outputStr);
        goto cleanup;
    }
    globalFile = outputFile[criterias->nb_criteria];

    /* Step 1: distance of every test series to every train series */
    for (t1 = 0; t1 < test->cardinality; t1++)
    {
        distSet[t1] = calloc(train->cardinality, sizeof(float *));
        for (t2 = 0; t2 < train->cardinality; t2++)
            distSet[t1][t2] = calloc(distanceSet->nb_distances, sizeof(float));
        for (i = 0; i < distanceSet->nb_distances; i++)
            if (distanceSet->compute[i])
            {
                #pragma omp parallel for
                for (t2 = 0; t2 < train->cardinality; t2++)
                {
                    /* Declared inside the loop body so each thread has its
                       own copy; function-scope variables would be shared
                       between the threads of the parallel region. */
                    series *ts1 = test->data[t1];
                    series *ts2 = train->data[t2];

                    distSet[t1][t2][i] = distanceSet->functions[i](ts1, ts2, distanceSet->best[i]);
                }
            }
    }
    for (i = 0; i < distanceSet->nb_distances; i++)
        if (distanceSet->compute[i])
        {
            snprintf(outputStr, sizeof outputStr, "%s/%s/distances/test_%s.txt", param->output, train->name, distanceSet->name[i]);
            distFile = fopen(outputStr, "w");
            if (distFile == NULL)
            {
                /* The dump is auxiliary output: report and carry on */
                perror(outputStr);
                continue;
            }
            for (t1 = 0; t1 < test->cardinality; t1++)
            {
                for (t2 = 0; t2 < train->cardinality; t2++)
                    fprintf(distFile, "%f ", distSet[t1][t2][i]);
                fprintf(distFile, "\n");
            }
            fclose(distFile);
        }

    /* Step 2: number of subsets of the flagged distances (2^n); subset 0,
       the empty one, is skipped by starting the loop below at 1 */
    if (param->combineAll)
    {
        for (i = 0; i < distanceSet->nb_distances; i++)
            if (distanceSet->compute[i])
                nbComb++;
        nbComb = 1 << nbComb;
    }
    for (c = 1; c < nbComb; c++)
    {
        /* Spread the bits of c over the flagged distances: bit n of c
           selects the n-th flagged distance */
        for (k = c, l = 0, nbDist = 0; l < distanceSet->nb_distances; l++)
            if (trueCompute[l]) { curCompute[l] = (k & 0x1); k >>= 1; nbDist += curCompute[l];}
        for (i = 0; i < criterias->nb_criteria; i++)
            empty_statistics(criteriaStats[i]);
        for (t1 = 0; t1 < test->cardinality; t1++)
        {
            /* Gather the selected distances of this test series */
            for (t2 = 0; t2 < train->cardinality; t2++)
            {
                tmpSet[t2] = calloc(nbDist, sizeof(float));
                for (l = 0, k = 0; l < distanceSet->nb_distances; l++)
                    if (curCompute[l])
                        tmpSet[t2][k++] = distSet[t1][t2][l];
            }
            normalize_distance(tmpSet, nbDist, train->cardinality);
            for (i = 0; i < criterias->nb_criteria; i++)
                if (criterias->compute[i])
                {
                    classFound = criterias->functions[i](tmpSet, train->classes, train->cardinality, nbDist, -1, train->nb_classes);
                    criteriaStats[i]->confusionMatrix[test->classes[t1]][classFound]++;
                    if (classFound != test->classes[t1])
                    {
                        criteriaStats[i]->classesErrors[test->classes[t1]]++;
                        criteriaStats[i]->nbErrors++;
                    }
                }
            for (t2 = 0; t2 < train->cardinality; t2++)
                free(tmpSet[t2]);
        }
        /* Step 3: k walks the per-criterion files, then the global file */
        for (k = 0; k < (criterias->nb_criteria + 1); k++)
        {
            for (i = 0; i < distanceSet->nb_distances; i++)
                if (curCompute[i])
                    fprintf(outputFile[k], "%s ", distanceSet->name[i]);
            fprintf(outputFile[k], ":\n");
            if (k < criterias->nb_criteria)
            {
                criteriaStats[k]->error = (float)criteriaStats[k]->nbErrors / (float)test->cardinality;
                export_statistics(outputFile[k], criteriaStats[k]);
                if (criteriaStats[k]->error < bestScores[k])
                {
                    bestScores[k] = criteriaStats[k]->error;
                    /* Release the superseded copy the same way every other
                       statistics object in this function is released */
                    if (bestStats[k] != NULL)
                        free_statistics(bestStats[k]);
                    bestStats[k] = duplicate_statistics(criteriaStats[k]);
                }
                if (c == criterias->bestSpaceID[k])
                {
                    spaceStats[k] = duplicate_statistics(criteriaStats[k]);
                    spaceScores[k] = criteriaStats[k]->error;
                }
                continue;
            }
            /* Global file: k-loop has already refreshed every ->error */
            for (i = 0; i < criterias->nb_criteria; i++)
                fprintf(outputFile[k], "%f ", criteriaStats[i]->error);
            fprintf(outputFile[k], "\n");
        }
    }

    /* Step 4: summaries.  bestStats[i] / spaceStats[i] stay NULL when no
       subset was evaluated, when no subset scored below the initial 1.0,
       or when bestSpaceID[i] matched no subset; the recorded score (still
       at its initial value in that case) is printed instead. */
    fprintf(globalFile, "Best statistics :\n");
    for (i = 0; i < criterias->nb_criteria; i++)
        fprintf(globalFile, "%f ", (bestStats[i] != NULL) ? bestStats[i]->error : bestScores[i]);
    fprintf(globalFile, "\n");
    fprintf(globalFile, "Space statistics :\n");
    for (i = 0; i < criterias->nb_criteria; i++)
        fprintf(globalFile, "%f ", (spaceStats[i] != NULL) ? spaceStats[i]->error : spaceScores[i]);
    fprintf(globalFile, "\n");
    for (i = 0; i < criterias->nb_criteria; i++)
    {
        fprintf(outputFile[i], "Best statistics :\n");
        if (bestStats[i] != NULL)
            export_statistics(outputFile[i], bestStats[i]);
        fprintf(outputFile[i], "Space statistics :\n");
        if (spaceStats[i] != NULL)
            export_statistics(outputFile[i], spaceStats[i]);
    }

cleanup:
    /* Reached on the normal path and on early abort; every slot is either
       a valid object or still NULL from calloc */
    for (i = 0; i < criterias->nb_criteria; i++)
    {
        if (criteriaStats[i] != NULL)
            free_statistics(criteriaStats[i]);
        if (bestStats[i] != NULL)
            free_statistics(bestStats[i]);
        if (spaceStats[i] != NULL)
            free_statistics(spaceStats[i]);
        if (outputFile[i] != NULL)
            fclose(outputFile[i]);
    }
    if (outputFile[criterias->nb_criteria] != NULL)
        fclose(outputFile[criterias->nb_criteria]);
    for (t1 = 0; t1 < test->cardinality; t1++)
    {
        if (distSet[t1] == NULL)
            continue;
        for (t2 = 0; t2 < train->cardinality; t2++)
            free(distSet[t1][t2]);
        free(distSet[t1]);
    }
    free(bestScores);
    free(spaceScores);
    free(bestStats);
    free(spaceStats);
    free(criteriaStats);
    free(tmpSet);
    free(curCompute);
    free(outputFile);
    free(distSet);
    return;
}