/*
 * Estimate entropy and Markov models.
 *
 * Rewinds fp, reads it to the end and accumulates in stats:
 *   - occurrence.single[b]   : count of every byte value b,
 *   - occurrence.multi[0]    : counts of every 2-byte sequence,
 *   - occurrence.multi[1]    : counts of every 3-byte sequence
 *     (both updated through increment_count(); the exact table layout is
 *     defined by that helper, not visible here).
 *
 * From those counts it derives and prints the memoryless entropy and the
 * entropy rates of the first- and second-order Markov models, storing them
 * in stats->entropy.{memoryless,markov1,markov2}.
 *
 * Files shorter than two bytes are handled: only the bytes that really exist
 * are counted, and the higher-order tables stay empty.
 *
 * NOTE(review): free_statistics(stats) is called before returning, as in the
 * original; whether the entropy fields stay readable afterwards depends on
 * that helper - confirm against its definition.
 * NOTE(review): `total` is an int, so inputs larger than INT_MAX bytes would
 * overflow it.
 */
void get_statistics(struct Statistics *stats, FILE* fp)
{
    /*
     * old_buffer[0..1] hold the two bytes that precede the read window,
     * so buffer[i-1] and buffer[i-2] are always valid for i >= 0.
     */
    unsigned char old_buffer[BUFFER_SIZE+2];
    unsigned char *buffer = old_buffer + 2;
    int total = 0;
    double sum = 0.0;
    double probability;
    size_t i, bytes_read;
    int c;

    init_statistics(stats);
    rewind(fp);

    /* Read the first two characters, stopping early on a short file */
    c = fgetc(fp);
    if (c != EOF)
    {
        old_buffer[0] = (unsigned char)c;
        stats->occurrence.single[old_buffer[0]]++;
        total++;

        c = fgetc(fp);
        if (c != EOF)
        {
            old_buffer[1] = (unsigned char)c;
            stats->occurrence.single[old_buffer[1]]++;
            increment_count(2, old_buffer, stats->occurrence.multi[0]);
            total++;
        }
    }

    /* Only continue when both priming bytes exist */
    if (total == 2)
    {
        /* Looping on fread's result terminates on EOF and on read errors */
        while ((bytes_read = fread(buffer, 1, BUFFER_SIZE, fp)) > 0)
        {
            for (i = 0; i < bytes_read; i++)
            {
                /* Count occurrence of the byte */
                stats->occurrence.single[buffer[i]]++;

                /* Count occurrence of the latest two bytes */
                increment_count(2, buffer+i-1, stats->occurrence.multi[0]);

                /* Count occurrence of the latest three bytes */
                increment_count(3, buffer+i-2, stats->occurrence.multi[1]);

                /* Count total number of bytes */
                total++;
            }

            /*
             * Carry the last two bytes seen into the slots in front of the
             * window so sequences spanning two chunks are counted correctly.
             * Order matters when bytes_read == 1 (source and destination
             * overlap): slot 0 must be written before slot 1.
             */
            old_buffer[0] = old_buffer[bytes_read];
            old_buffer[1] = old_buffer[bytes_read + 1];
        }
    }

    /* Report occurrences */
    for (i = 0; i < 256; i++)
        printf("occurrence[%3d] = %d\n", (int)i, stats->occurrence.single[i]);

    /* Report file size */
    printf("total = %d\n", total);

    /* Estimate memoryless entropy: H(X) = -sum p(x) log2 p(x) */
    for (i = 0; i < 256; i++)
    {
        if (stats->occurrence.single[i] > 0)
        {
            /* Calculate probability of the byte */
            probability = stats->occurrence.single[i] / (double)total;

            /* Add to sum */
            sum = sum - (probability * log2(probability));
        }
    }
    stats->entropy.memoryless = sum;

    /*
     * Calculate entropy rate of first-order Markov model:
     * joint entropy of byte pairs minus the memoryless entropy.
     * There are total-1 overlapping pairs in the input.
     */
    sum = 0;
    for (i = 0; i < 256*256; i++)
    {
        if (stats->occurrence.multi[0]->mem[i] > 0)
        {
            /* Calculate probability of the sequence */
            probability = stats->occurrence.multi[0]->mem[i] / (double)(total-1);

            /* Add to sum */
            sum = sum - (probability * log2(probability));
        }
    }
    stats->entropy.markov1 = sum - stats->entropy.memoryless;

    /*
     * Calculate entropy rate of second-order Markov model:
     * joint entropy of byte triples minus the pair entropy
     * (markov1 + memoryless). There are total-2 overlapping triples.
     */
    sum = 0;
    for (i = 0; i < 256*256*256; i++)
    {
        if (stats->occurrence.multi[1]->mem[i] > 0)
        {
            /* Calculate probability of the sequence */
            probability = stats->occurrence.multi[1]->mem[i] / (double)(total-2);

            /* Add to sum */
            sum = sum - (probability * log2(probability));
        }
    }
    stats->entropy.markov2 = sum - stats->entropy.markov1 - stats->entropy.memoryless;

    /* Report memoryless entropy */
    printf("Memoryless entropy: %f bit(s)\n", stats->entropy.memoryless);

    /* Report entropy rate of first-order Markov model */
    printf("Markov model (Order 1) entropy rate: %f bit(s)\n", stats->entropy.markov1);

    /* Report entropy rate of second-order Markov model */
    printf("Markov model (Order 2) entropy rate: %f bit(s)\n", stats->entropy.markov2);

    free_statistics(stats);

    return;
}
void classifyDatasetTT(dataset *train, dataset *test, distances *distanceSet, criteria *criterias, mParameters *param) { char *outputStr; FILE *distFile, **outputFile; int t1, t2, i, k, c, l = 0; float ***distSet, **tmpSet; float *bestScores, *spaceScores; int *trueCompute, *curCompute; int classFound, nbDist, nbComb = 0; statistics **criteriaStats, **bestStats, **spaceStats; series *ts1, *ts2; distSet = malloc(test->cardinality * sizeof(float **)); tmpSet = malloc(train->cardinality * sizeof(float *)); trueCompute = distanceSet->compute; curCompute = calloc(distanceSet->nb_distances, sizeof(int)); outputStr = calloc(1024, sizeof(char)); outputFile = calloc(criterias->nb_criteria + 1, sizeof(FILE *)); criteriaStats = calloc(criterias->nb_criteria, sizeof(statistics *)); bestStats = calloc(criterias->nb_criteria, sizeof(statistics *)); spaceStats = calloc(criterias->nb_criteria, sizeof(statistics *)); bestScores = calloc(criterias->nb_criteria, sizeof(float)); spaceScores = calloc(criterias->nb_criteria, sizeof(float)); for (i = 0; i < criterias->nb_criteria; i++) { bestScores[i] = 1.0; spaceScores[i] = 1.0; criteriaStats[i] = init_statistics(train->nb_classes); sprintf(outputStr, "%s/%s/results/%s.txt", param->output, train->name, criterias->name[i]); outputFile[i] = fopen(outputStr, "w"); } sprintf(outputStr, "%s/%s/results/globalResults.txt", param->output, train->name); outputFile[criterias->nb_criteria] = fopen(outputStr, "w"); for (t1 = 0; t1 < test->cardinality; t1++) { distSet[t1] = calloc(train->cardinality, sizeof(float *)); for (t2 = 0; t2 < train->cardinality; t2++) distSet[t1][t2] = calloc(distanceSet->nb_distances, sizeof(float)); for (i = 0; i < distanceSet->nb_distances; i++) if (distanceSet->compute[i]) { #pragma omp parallel for for (t2 = 0; t2 < train->cardinality; t2++) { ts1 = test->data[t1]; ts2 = train->data[t2]; distSet[t1][t2][i] = distanceSet->functions[i](ts1, ts2, distanceSet->best[i]); } } } for (i = 0; i < distanceSet->nb_distances; 
i++) if (distanceSet->compute[i]) { sprintf(outputStr, "%s/%s/distances/test_%s.txt", param->output, train->name, distanceSet->name[i]); distFile = fopen(outputStr, "w"); for (t1 = 0; t1 < test->cardinality; t1++) { for (t2 = 0; t2 < train->cardinality; t2++) fprintf(distFile, "%f ", distSet[t1][t2][i]); fprintf(distFile, "\n"); } fclose(distFile); } if (param->combineAll) { for (i = 0; i < distanceSet->nb_distances; i++) if (distanceSet->compute[i]) nbComb++; nbComb = pow(2, nbComb); } for (c = 1; c < nbComb; c++) { for (k = c, l = 0, nbDist = 0; l < distanceSet->nb_distances; l++) if (trueCompute[l]) { curCompute[l] = (k & 0x1); k >>= 1; nbDist += curCompute[l];} for (i = 0; i < criterias->nb_criteria; i++) empty_statistics(criteriaStats[i]); for (t1 = 0; t1 < test->cardinality; t1++) { for (t2 = 0; t2 < train->cardinality; t2++) { tmpSet[t2] = calloc(nbDist, sizeof(float)); for (l = 0, k = 0; l < distanceSet->nb_distances; l++) if (curCompute[l]) tmpSet[t2][k++] = distSet[t1][t2][l]; } normalize_distance(tmpSet, nbDist, train->cardinality); for (i = 0; i < criterias->nb_criteria; i++) if (criterias->compute[i]) { classFound = criterias->functions[i](tmpSet, train->classes, train->cardinality, nbDist, -1, train->nb_classes); criteriaStats[i]->confusionMatrix[test->classes[t1]][classFound]++; if (classFound != test->classes[t1]) { criteriaStats[i]->classesErrors[test->classes[t1]]++; criteriaStats[i]->nbErrors++; } } for (t2 = 0; t2 < train->cardinality; t2++) free(tmpSet[t2]); } for (k = 0; k < (criterias->nb_criteria + 1); k++) { for (i = 0; i < distanceSet->nb_distances; i++) if (curCompute[i]) fprintf(outputFile[k], "%s ", distanceSet->name[i]); fprintf(outputFile[k], ":\n"); if (k < criterias->nb_criteria) { criteriaStats[k]->error = (float)criteriaStats[k]->nbErrors / (float)test->cardinality; export_statistics(outputFile[k], criteriaStats[k]); if (criteriaStats[k]->error < bestScores[k]) { bestScores[k] = criteriaStats[k]->error; if (bestStats[k] != NULL) 
free(bestStats[k]); bestStats[k] = duplicate_statistics(criteriaStats[k]); } if (c == criterias->bestSpaceID[k]) { spaceStats[k] = duplicate_statistics(criteriaStats[k]); spaceScores[k] = criteriaStats[k]->error; } continue; } for (i = 0; i < criterias->nb_criteria; i++) fprintf(outputFile[k], "%f ", criteriaStats[i]->error); fprintf(outputFile[k], "\n"); } } fprintf(outputFile[criterias->nb_criteria], "Best statistics :\n"); for (i = 0; i < criterias->nb_criteria; i++) fprintf(outputFile[criterias->nb_criteria], "%f ", bestStats[i]->error); fprintf(outputFile[criterias->nb_criteria], "\n"); fprintf(outputFile[criterias->nb_criteria], "Space statistics :\n"); for (i = 0; i < criterias->nb_criteria; i++) fprintf(outputFile[criterias->nb_criteria], "%f ", spaceStats[i]->error); fprintf(outputFile[criterias->nb_criteria], "\n"); for (i = 0; i < criterias->nb_criteria; i++) { fprintf(outputFile[i], "Best statistics :\n"); export_statistics(outputFile[i], bestStats[i]); fprintf(outputFile[i], "Space statistics :\n"); export_statistics(outputFile[i], spaceStats[i]); free_statistics(criteriaStats[i]); free_statistics(bestStats[i]); free_statistics(spaceStats[i]); fclose(outputFile[i]); } fclose(outputFile[i]); for (t1 = 0; t1 < test->cardinality; t1++) { for (t2 = 0; t2 < train->cardinality; t2++) free(distSet[t1][t2]); free(distSet[t1]); } free(bestScores); free(spaceScores); free(bestStats); free(spaceStats); free(criteriaStats); free(tmpSet); free(curCompute); free(outputStr); free(outputFile); free(distSet); return; }