int main(int argc, char **argv) { cmph_uint32 verbosity = 0; char generate = 0; char *mphf_file = NULL; FILE *mphf_fd = stdout; const char *keys_file = NULL; FILE *keys_fd; cmph_uint32 nkeys = UINT_MAX; cmph_uint32 seed = UINT_MAX; CMPH_HASH *hashes = NULL; cmph_uint32 nhashes = 0; cmph_uint32 i; CMPH_ALGO mph_algo = CMPH_CHM; double c = 0; cmph_config_t *config = NULL; cmph_t *mphf = NULL; char * tmp_dir = NULL; cmph_io_adapter_t *source; cmph_uint32 memory_availability = 0; cmph_uint32 b = 0; cmph_uint32 keys_per_bin = 1; while (1) { char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:"); if (ch == -1) break; switch (ch) { case 's': { char *cptr; seed = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Invalid seed %s\n", optarg); exit(1); } } break; case 'c': { char *endptr; c = strtod(optarg, &endptr); if(*endptr != 0) { fprintf(stderr, "Invalid c value %s\n", optarg); exit(1); } } break; case 'g': generate = 1; break; case 'k': { char *endptr; nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10); if(*endptr != 0) { fprintf(stderr, "Invalid number of keys %s\n", optarg); exit(1); } } break; case 'm': mphf_file = strdup(optarg); break; case 'd': tmp_dir = strdup(optarg); break; case 'M': { char *cptr; memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Invalid memory availability %s\n", optarg); exit(1); } } break; case 'b': { char *cptr; b = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Parameter b was not found: %s\n", optarg); exit(1); } } break; case 't': { char *cptr; keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Parameter t was not found: %s\n", optarg); exit(1); } } break; case 'v': ++verbosity; break; case 'V': printf("%s\n", VERSION); return 0; case 'h': usage_long(argv[0]); return 0; case 'a': { char valid = 0; for (i = 0; i < CMPH_COUNT; ++i) { if (strcmp(cmph_names[i], optarg) == 0) { mph_algo = 
(CMPH_ALGO)i; valid = 1; break; } } if (!valid) { fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION); return -1; } } break; case 'f': { char valid = 0; for (i = 0; i < CMPH_HASH_COUNT; ++i) { if (strcmp(cmph_hash_names[i], optarg) == 0) { hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 )); hashes[nhashes] = (CMPH_HASH)i; hashes[nhashes + 1] = CMPH_HASH_COUNT; ++nhashes; valid = 1; break; } } if (!valid) { fprintf(stderr, "Invalid hash function: %s\n", optarg); return -1; } } break; default: usage(argv[0]); return 1; } } if (optind != argc - 1) { usage(argv[0]); return 1; } keys_file = argv[optind]; if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); srand(seed); int ret = 0; if (mphf_file == NULL) { mphf_file = (char *)malloc(strlen(keys_file) + 5); memcpy(mphf_file, keys_file, strlen(keys_file)); memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); } keys_fd = fopen(keys_file, "r"); if (keys_fd == NULL) { fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); return -1; } if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd); else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys); if (generate) { //Create mphf mphf_fd = fopen(mphf_file, "w"); config = cmph_config_new(source); cmph_config_set_algo(config, mph_algo); if (nhashes) cmph_config_set_hashfuncs(config, hashes); cmph_config_set_verbosity(config, verbosity); cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir); cmph_config_set_mphf_fd(config, mphf_fd); cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); cmph_config_set_keys_per_bin(config, keys_per_bin); //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); mphf = cmph_new(config); cmph_config_destroy(config); if (mphf == NULL) { 
fprintf(stderr, "Unable to create minimum perfect hashing function\n"); //cmph_config_destroy(config); free(mphf_file); return -1; } if (mphf_fd == NULL) { fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } cmph_dump(mphf, mphf_fd); cmph_destroy(mphf); fclose(mphf_fd); } else { cmph_uint8 * hashtable = NULL; mphf_fd = fopen(mphf_file, "r"); if (mphf_fd == NULL) { fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } mphf = cmph_load(mphf_fd); fclose(mphf_fd); if (!mphf) { fprintf(stderr, "Unable to parser input file %s\n", mphf_file); free(mphf_file); return -1; } cmph_uint32 siz = cmph_size(mphf); hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8)); memset(hashtable, 0,(size_t) siz); //check all keys for (i = 0; i < source->nkeys; ++i) { cmph_uint32 h; char *buf; cmph_uint32 buflen = 0; source->read(source->data, &buf, &buflen); h = cmph_search(mphf, buf, buflen); if (!(h < siz)) { fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf); ret = 1; } else if(hashtable[h] >= keys_per_bin) { fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h); fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf); ret = 1; } else hashtable[h]++; if (verbosity) { printf("%s -> %u\n", buf, h); } source->dispose(source->data, buf, buflen); } cmph_destroy(mphf); free(hashtable); } fclose(keys_fd); free(mphf_file); free(tmp_dir); cmph_io_nlfile_adapter_destroy(source); return ret; }
/**
 * Build a CHD hash function from the configuration in `mph`.
 *
 * Steps:
 *   1. Generate a CHD_PH perfect hash function with load factor `c` from
 *      the nested configuration and pack it into a flat buffer.
 *   2. Collect the indices of all bins whose bit in the CHD_PH occupancy
 *      table is clear, and build and pack a compressed rank structure over
 *      those indices.
 *   3. Wrap both packed buffers in a newly allocated cmph_t.
 *
 * @param mph  configuration; mph->data must be a chd_config_data_t
 * @param c    load factor forwarded to the CHD_PH construction
 * @return the new function, which owns both packed buffers, or NULL when
 *         the CHD_PH construction or any memory allocation fails
 */
cmph_t *chd_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	chd_data_t *chdf = NULL;
	chd_config_data_t *chd = (chd_config_data_t *)mph->data;
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data;

	compressed_rank_t cr;

	cmph_t *chd_phf = NULL;
	cmph_uint32 packed_chd_phf_size = 0;
	cmph_uint8 *packed_chd_phf = NULL;

	cmph_uint32 packed_cr_size = 0;
	cmph_uint8 *packed_cr = NULL;

	cmph_uint32 i, idx, nkeys, nvals, nbins;
	cmph_uint32 *vals_table = NULL;
	cmph_uint32 *occup_table = NULL;
#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif

	cmph_config_set_verbosity(chd->chd_ph, mph->verbosity);
	cmph_config_set_graphsize(chd->chd_ph, c);

	if (mph->verbosity)
	{
		fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c);
	}

	chd_phf = cmph_new(chd->chd_ph);

	if (chd_phf == NULL)
	{
		return NULL;
	}

	packed_chd_phf_size = cmph_packed_size(chd_phf);
	DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size);

	/* Make sure that we have enough space to pack the mphf. */
	packed_chd_phf = calloc((size_t)packed_chd_phf_size, (size_t)1);
	if (packed_chd_phf == NULL && packed_chd_phf_size != 0)
	{
		cmph_destroy(chd_phf);
		return NULL;
	}

	/* Pack the mphf. */
	cmph_pack(chd_phf, packed_chd_phf);

	cmph_destroy(chd_phf);

	if (mph->verbosity)
	{
		fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n");
	}

	nbins = chd_ph->n;
	nkeys = chd_ph->m;
	nvals = nbins - nkeys;

	vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32));
	/* calloc may legitimately return NULL for a zero-element request. */
	if (vals_table == NULL && nvals != 0)
	{
		free(packed_chd_phf);
		return NULL;
	}
	occup_table = (cmph_uint32 *)chd_ph->occup_table;

	/* Record the index of every bin whose occupancy bit is clear. */
	for (i = 0, idx = 0; i < nbins; i++)
	{
		if (!GETBIT32(occup_table, i))
		{
			vals_table[idx++] = i;
		}
	}

	/* Initialised only once vals_table exists, so the failure path above
	 * never has to tear down the rank structure. */
	compressed_rank_init(&cr);
	compressed_rank_generate(&cr, vals_table, nvals);
	free(vals_table);

	packed_cr_size = compressed_rank_packed_size(&cr);
	packed_cr = (cmph_uint8 *)calloc(packed_cr_size, sizeof(cmph_uint8));
	if (packed_cr == NULL && packed_cr_size != 0)
	{
		compressed_rank_destroy(&cr);
		free(packed_chd_phf);
		return NULL;
	}
	compressed_rank_pack(&cr, packed_cr);
	compressed_rank_destroy(&cr);

	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	chdf = (chd_data_t *)malloc(sizeof(chd_data_t));
	if (mphf == NULL || chdf == NULL)
	{
		free(chdf);
		free(mphf);
		free(packed_cr);
		free(packed_chd_phf);
		return NULL;
	}

	mphf->algo = mph->algo;

	chdf->packed_cr = packed_cr;
	packed_cr = NULL; /* transfer memory ownership */

	chdf->packed_chd_phf = packed_chd_phf;
	packed_chd_phf = NULL; /* transfer memory ownership */

	chdf->packed_chd_phf_size = packed_chd_phf_size;
	chdf->packed_cr_size = packed_cr_size;

	mphf->data = chdf;
	mphf->size = nkeys;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	cmph_uint32 space_usage = chd_packed_size(mphf)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys);
#endif

	return mphf;
}