/*
 * Recursive search that extends sol[0..depth-1] to size_set entries.
 *
 * Candidates are taken from nums[start..num_nums-1] in index order.  A
 * candidate is accepted only when hamming_dist() between it and every entry
 * already stored in sol[0..depth-1] is at least dist.  An accepted candidate
 * is written to sol[depth] and the search continues with the candidates that
 * follow it.
 *
 * Returns 1 as soon as depth equals size_set (sol then holds the chosen
 * entries), or 0 when no extension from this state succeeds.
 */
int backtrack(int depth, int start, int size_set, int dist, short* nums, int num_nums, short* sol) {
    if (depth == size_set) {
        return 1;
    }

    for (int cand = start; cand < num_nums; ++cand) {
        /* Walk the chosen prefix until an entry that is too close is found. */
        int k = 0;
        while (k < depth && hamming_dist(nums[cand], sol[k]) >= dist) {
            ++k;
        }
        if (k < depth) {
            continue; /* sol[k] conflicts with this candidate */
        }

        sol[depth] = nums[cand];
        if (backtrack(depth + 1, cand + 1, size_set, dist, nums, num_nums, sol)) {
            return 1;
        }
    }

    return 0;
}
/*
 * Compute the distance between str_a (len_a elements) and str_b (len_b
 * elements) using the method selected by S->distance, passing along the
 * parameters and work areas stored in S.
 *
 * Returns whatever the selected *_dist routine returns, or -1.0 when
 * S->distance matches none of the cases below.
 */
double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *str_b, int len_b){
  double result = -1.0;

  switch (S->distance) {
    case osa:
      result = osa_dist(str_a, len_a, str_b, len_b, S->weight, S->work);
      break;
    case lv:
      result = lv_dist(str_a, len_a, str_b, len_b, S->weight, S->work);
      break;
    case dl:
      result = dl_dist(str_a, len_a, str_b, len_b, S->weight, S->dict, S->work);
      break;
    case hamming:
      result = hamming_dist(str_a, len_a, str_b, len_b);
      break;
    case lcs:
      result = lcs_dist(str_a, len_a, str_b, len_b, S->work);
      break;
    /* The three q-gram based methods share qgram_dist(); only the final
     * mode argument differs (0L, 1L, 2L). */
    case qgram:
      result = qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 0L);
      break;
    case cosine:
      result = qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 1L);
      break;
    case jaccard:
      result = qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 2L);
      break;
    case jw:
      result = jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->weight, S->work);
      break;
    case soundex:
      result = soundex_dist(str_a, len_a, str_b, len_b, &(S->ifail));
      break;
    default:
      /* No matching method: the -1.0 sentinel is returned (errno is not set here). */
      break;
  }

  return result;
}
/*
 * Reads size_set, num_bits and dist from "hamming.in", collects every value
 * in [1, 2^num_bits) whose hamming_dist() to 0 is at least dist, and asks
 * backtrack() to complete a set of size_set values whose first member is 0.
 * On success the set is written to "hamming.out", ten values per line
 * separated by single spaces; when no set is found the output file is left
 * empty.
 *
 * Returns 0 on success, or 1 when the input cannot be opened or parsed,
 * num_bits lies outside [0, MAX_NUM_BITS], or the output cannot be written.
 */
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    int size_set;
    int num_bits;
    int dist;
    /* NOTE(review): candidates are stored as short, which presumes
     * MAX_NUM_BITS is small enough for every value to fit -- confirm. */
    short nums[1 << MAX_NUM_BITS];
    short sol[1 << MAX_NUM_BITS];
    int i;

    FILE* fin = fopen("hamming.in", "r");
    if (fin == NULL) {
        fprintf(stderr, "Error: cannot open hamming.in\n");
        return 1;
    }
    int fields = fscanf(fin, "%d%d%d", &size_set, &num_bits, &dist);
    fclose(fin);
    if (fields != 3) {
        /* Fewer than three conversions would leave the parameters unset. */
        fprintf(stderr, "Error: hamming.in must contain three integers\n");
        return 1;
    }

    /* Both arrays hold 1 << MAX_NUM_BITS entries; a larger (or negative)
     * num_bits would make the shift undefined or overrun nums below. */
    if (num_bits < 0 || num_bits > MAX_NUM_BITS) {
        fprintf(stderr, "Error: number of bits out of range\n");
        return 1;
    }

    int bound = 1 << num_bits;
    int num_nums = 0;
    for (i = 1; i < bound; i++) {
        if (hamming_dist(0, i) >= dist) {
            nums[num_nums++] = i;
        }
    }

    FILE* fout = fopen("hamming.out", "w");
    if (fout == NULL) {
        fprintf(stderr, "Error: cannot open hamming.out\n");
        return 1;
    }

    /* 0 is always the first member; the search starts at depth 1. */
    sol[0] = 0;
    if (backtrack(1, 0, size_set, dist, nums, num_nums, sol)) {
        for (i = 0; i < size_set; i++) {
            if (i > 0) {
                /* Newline after every tenth value, a space otherwise. */
                if (i % 10 == 0) {
                    fprintf(fout, "\n");
                } else {
                    fprintf(fout, " ");
                }
            }
            fprintf(fout, "%hd", sol[i]);
        }
        fprintf(fout, "\n");
    }

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fout) != 0) {
        fprintf(stderr, "Error: cannot write hamming.out\n");
        return 1;
    }
    return 0;
}
/*
 * Command-line entry point: hashes every file named after the options and,
 * in diff mode, reports how the hashes of the first two files differ.
 *
 * Options:
 *   -s N  super-token size; must be > 0.  1 selects charikar_hash64(),
 *         anything else charikar_hash64_wide().
 *   -d    diff mode: only the first two files are hashed, then the XOR of
 *         the two hashes and their hamming_dist() are printed.
 *   -q    quiet: suppress the per-file hash lines (and, in diff mode, print
 *         only the distance).
 *   -h    show help.
 *
 * Returns 0 on success, 6 for an invalid super-token size, and 7 when diff
 * mode is requested with fewer than two files.
 */
int main( int argc, char **argv)
{
    /* Print help if there are no parameters.
     * NOTE(review): print_help() is presumably non-returning (the option
     * switch below marks the code after it as unreachable) -- confirm. */
    if (argc < 2)
        print_help();

    /* Parse input line */
    int opt;
    int stoken_size = 1;
    bool diff = false;
    bool quiet = false;
    /* 'h' is listed so that -h reaches its case instead of being reported
     * by getopt as an invalid option. */
    while ((opt = getopt( argc, argv, "dhqs:")) != -1) {
        switch (opt) {
        case 's':
            stoken_size = atoi( optarg);
            break;
        case 'd':
            diff = true;
            break;
        case 'q':
            quiet = true;
            break;
        case 'h':
        default:
            print_help();
            /* Unreachable */
        }
    }

    if (stoken_size <= 0) {
        fprintf( stderr, "Error: Invalid super-token size\n");
        return 6;
    }
    if (diff) {
        if (argc-optind < 2) {
            fprintf( stderr, "Error: At least two files must be specified.\n");
            return 7;
        }
        /* Diff mode only ever looks at the first two files. */
        argc = optind+2;
    }

    /* Do the work.  Both hashes start at 0 so that the hash2 = hash1 shift
     * on the first iteration never reads an indeterminate value. */
    uint64_t hash1 = 0;
    uint64_t hash2 = 0;
    for (int c = optind; c < argc; c++) {
        char *data;
        long data_length;
        get_file( argv[ c], &data, &data_length);

        /* Keep the previous file's hash so diff mode can compare the pair. */
        hash2 = hash1;
        hash1 = stoken_size == 1
              ? charikar_hash64( data, data_length)                    /* Fast version */
              : charikar_hash64_wide( data, data_length, stoken_size); /* Super-token version */
        free( data);

        if (!quiet)
            printf( "%016llx %s\n", (unsigned long long)hash1, argv[ c]);
    }

    /* Print results */
    if (diff) {
        uint64_t hashx = hash1^hash2;
        int hamm = hamming_dist( hash1, hash2);
        if (quiet) {
            printf( "%d\n", hamm);
        } else {
            printf( "%016llx\n", (unsigned long long)hashx);
            printf( "difference: %d\n", hamm);
        }
    }

    return 0;
}
void *search(void *unused) { // Various size constants for convenience const size_t prelen = strlen(PREFIX_STRING); const size_t sufflen = strlen(SUFFIX_STRING); const size_t suffstart = LEN - sufflen; // Buffer used to store candidate string, extra for null terminator. char str[LEN + 1]; str[LEN] = 0; // Put in our hardcoded prefix/suffix strings strcpy(str, PREFIX_STRING); strcpy(str + suffstart, SUFFIX_STRING); // Track how many hashes we've tried, for throughput estimate. uint64_t count = 0; char counting = 1; // Thread-local best score achieved, used to avoid // unnecessary accesses to shared global_best in the common case. int best = INT_MAX; // Buffer for storing computed hash bytes char hash[128]; start: // Fill the middle of the string with random data // (not including prefix, suffix, or portion we'll exhaustively search) gen_rand(str + prelen, LEN - prelen - sufflen - SEARCH_CHARS); // Iteration index array // Indirection used to keep character set flexible. unsigned idx[SEARCH_CHARS]; memset(idx, 0, sizeof(idx)); // Initialize enumeration part of string to first letter in charset char *iterstr = str + suffstart - SEARCH_CHARS; memset(iterstr, CHARSET[0], SEARCH_CHARS); while (1) { // Try string in current form Hash(1024, (BitSequence *)str, LEN * 8, (BitSequence *)hash); // How'd we do? int d = hamming_dist(hash, GOAL_BITS, 128); // If this is the best we've seen, print it and update best. if (d < best) { best = d; lock(); if (d < global_best) { global_best = d; printf("%d - '%s'\n", d, str); fflush(stdout); } unlock(); } // Increment string index array, updating str as we go // aaaaaa // baaaaa // caaaaa // ... // abaaaa // bbaaaa // cbaaaa // ... // (etc) int cur = 0; while (++idx[cur] >= CHARSET_SIZE) { idx[cur] = 0; iterstr[cur] = CHARSET[idx[cur]]; // Advance to next position. // If we've used all of our search characters, // time to start over with new random prefix. 
if (++cur == SEARCH_CHARS) goto start; } iterstr[cur] = CHARSET[idx[cur]]; // Throughput calculation. // Once this thread hits a limit, increment global_done // and print throughput estimate if we're the last thread to do so. const uint64_t iters = 1 << 24; // ~16M if (counting && ++count == iters) { counting = 0; time_t end = time(NULL); int elapsed = end - global_start; lock(); global_count += count; assert(global_count >= count && "counter overflow"); if (++global_done == num_threads) { printf("\n*** Total throughput ~= %f hash/S\n\n", ((double)(global_count)) / elapsed); } unlock(); } } }