/**
 * Bag distance between two strings. Per the original documentation, the
 * value approximates the Levenshtein distance and bounds it from below.
 *
 * Both strings are turned into bags (symbol -> count hash tables). Every
 * symbol of x is looked up in the bag of y: an absent symbol contributes
 * its full count, a shared symbol contributes the absolute difference of
 * the two counts. The symbols of y that were never matched are added last.
 *
 * @param x first string
 * @param y second string
 * @return Bag distance after it has been passed through lnorm()
 */
float dist_bag_compare(hstring_t x, hstring_t y)
{
    bag_t *bag_x = bag_create(x);
    bag_t *bag_y = bag_create(y);
    bag_t *cur, *match;
    float dist = 0;

    /* Starts at the length of y and shrinks by each count matched in x. */
    int missing = y.len;

    cur = bag_x;
    while (cur != NULL) {
        HASH_FIND(hh, bag_y, &(cur->sym), sizeof(sym_t), match);
        if (match) {
            dist += fabs(cur->cnt - match->cnt);
            missing -= match->cnt;
        } else {
            dist += cur->cnt;
        }
        cur = cur->hh.next;
    }
    dist += missing;

    bag_destroy(bag_x);
    bag_destroy(bag_y);

    /* NOTE(review): n is not declared in this function; presumably a
     * file-scope normalization setting -- confirm against the full file. */
    return lnorm(n, dist, x, y);
}
/**
 * Frees a chain of bag nodes. The remainder of the chain (b->next) is
 * released first, then this node's pennant via treenode_destroy(), and
 * finally the node itself.
 *
 * @param b head of the chain; a null value is accepted and ignored
 */
void bag_destroy(BAG b)
{
    if (!b) {
        return;
    }

    bag_destroy(b->next);
    treenode_destroy(b->pennant);
    free(b);
}
/**
 * Releases one index entry together with everything it owns: its word
 * string, its page index, and the entry structure itself.
 *
 * @param e the entry to destroy, treated as an entry_t pointer
 */
void entry_destroy(bag_elem_t e)
{
    entry_t *entry = e;

    free(entry->entry_word);

    /* Run page_destroy over the page index, then dispose of the index. */
    bag_traverse(entry->page_index, page_destroy);
    bag_destroy(entry->page_index);

    free(entry);
}
int main(int argc, char *argv[]) { FILE *input, *log; int min_word_len = 0; bag_t *index; clock_t ticks; /* First, check that there is a first command line argument and * that it is the name of a file that can be opened for reading. */ if (argc <= 1 || ! (input = fopen(argv[1], "r"))) { fprintf(stderr, "ERROR: missing or incorrect argument!\n" "USAGE: %s <filename> [minimum_word_length]\n" " . <filename> is the name of a text file (required)\n" " . [minimum_word_length] is a positive integer (optional)\n", argv[0]); exit(EXIT_FAILURE); } /* If we get here, the file has been opened for reading. */ /* Next, check if there is a second command line argument to specify * a minimum word length. */ if (argc < 3 || (min_word_len = (int) strtol(argv[2], NULL, 10)) <= 0) min_word_len = MIN_WORD_LEN; /* If we get here, the minimum word length has a positive value. */ //creat or append to a runtime log file log = fopen("runtime_log.txt", "a"); fprintf(log, "For %s and word %d characters and larger:\n", argv[1], min_word_len); /* Next, generate the index, close the input file (because we're done with * it at this point), and print timing data. */ ticks = clock(); index = generate_index(input, min_word_len); ticks = clock() - ticks; fclose(input); fprintf(log, "Elapsed time for generating the index: %gms\n", 1000.0 * ticks / CLOCKS_PER_SEC); /* Timing data is printed on stderr so we can isolate it from the rest of * the output below, if desired. */ /* Finally, print the index on stdout and clean up: free the memory * allocated for each index entry, then the memory for the index itself. 
*/ if (index) { // timing how long it takes to print the index ticks = clock(); bag_traverse(index, entry_print); ticks = clock() - ticks; fprintf(log, "Elapsed time for printing the index: %gms\n", 1000.0 * ticks / CLOCKS_PER_SEC); // timing how long it takes to destroy the index ticks = clock(); bag_traverse(index, entry_destroy); bag_destroy(index); ticks = clock() - ticks; fprintf(log, "Elapsed time for destroy the index: %gms\n\n", 1000.0 * ticks / CLOCKS_PER_SEC); } fclose(log); return EXIT_SUCCESS; }