int create_disk_hash(struct cmph_hash_obj *o) { char mphfile[1024]; FILE *mph; int count = 0; o->keys_fd = fopen(o->keyfile, "r"); if (o->keys_fd == NULL) { fprintf(stderr, "File %s not found\n", o->keyfile); return(-1); } if((count = convert_textfile_to_ip4_bin(o->keyfile)) > 0) { fprintf(stderr,"binary file dumped\n"); } o->source = cmph_io_nlfile_adapter(o->keys_fd); o->config = cmph_config_new(o->source); // cmph_config_set_algo(o->config, CMPH_CHM); cmph_config_set_algo(o->config, CMPH_BDZ); // seems like a win o->hash = cmph_new(o->config); sprintf(mphfile,"%s.mph",o->keyfile); mph = fopen(mphfile, "w+"); if (mph == NULL) { fprintf(stderr, "File %s not found\n",mphfile); return(1); } else { cmph_dump(o->hash, mph); pack_n_save(o,"backup.mph"); fclose(mph); } cmph_config_destroy(o->config); return (0); }
// Create minimal perfect hash function from in-disk keys using BDZ algorithm int main(int argc, char **argv) { //Open file with newline separated list of keys FILE * keys_fd = fopen("keys.txt", "r"); cmph_t *hash = NULL; if (keys_fd == NULL) { fprintf(stderr, "File \"keys.txt\" not found\n"); exit(1); } // Source of keys cmph_io_adapter_t *source = cmph_io_nlfile_adapter(keys_fd); cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_BDZ); hash = cmph_new(config); cmph_config_destroy(config); //Find key const char *key = "jjjjjjjjjj"; unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key)); fprintf(stderr, "Id:%u\n", id); //Destroy hash cmph_destroy(hash); cmph_io_nlfile_adapter_destroy(source); fclose(keys_fd); return 0; }
void BlockHashIndex::CalcHash(size_t current, void* source_void) { #ifdef HAVE_CMPH cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void; cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_CHD); cmph_t* hash = cmph_new(config); PairedPackedArray<> *pv = new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits); size_t i = 0; source->rewind(source->data); std::string lastKey = ""; while(i < source->nkeys) { unsigned keylen; char* key; source->read(source->data, &key, &keylen); std::string temp(key, keylen); source->dispose(source->data, key, keylen); if(lastKey > temp) { if(source->nkeys != 2 || temp != "###DUMMY_KEY###") { std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl; std::cerr << "1: " << lastKey << std::endl; std::cerr << "2: " << temp << std::endl; abort(); } } lastKey = temp; size_t fprint = GetFprint(temp.c_str()); size_t idx = cmph_search(hash, temp.c_str(), (cmph_uint32) temp.size()); pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits); i++; } cmph_config_destroy(config); #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif if(m_hashes.size() <= current) { m_hashes.resize(current + 1, 0); m_arrays.resize(current + 1, 0); m_clocks.resize(current + 1, 0); } m_hashes[current] = (void*)hash; m_arrays[current] = pv; m_clocks[current] = clock(); m_queue.push(-current); #endif }
/*
 * Build a hash with algorithm alg_n over items_len fixed-width cmph_uint32
 * keys and print the packed size and the value each key maps to.
 *
 * For CMPH_BRZ the function is additionally dumped to "<algo>_<items_len>.mph"
 * and loaded back before use.
 *
 * Returns 0 on success and 1 if a file cannot be opened or the hash cannot be
 * built or reloaded.
 */
int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
{
    cmph_t *hash = NULL;
    cmph_config_t *config = NULL;
    cmph_io_adapter_t *source = NULL;
    cmph_uint32 i;
    char filename[256];
    FILE* mphf_fd = NULL;

    printf("%s (%u)\n", cmph_names[alg_n], alg_n);

    source = cmph_io_struct_vector_adapter(items_to_hash,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           0,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           items_len);
    config = cmph_config_new(source);
    cmph_config_set_algo(config, alg_n);

    if (alg_n == CMPH_BRZ) {
        snprintf(filename, sizeof(filename), "%s_%u.mph", cmph_names[alg_n], items_len);
        mphf_fd = fopen(filename, "w");
        if (mphf_fd == NULL) {
            fprintf(stderr, "Unable to open %s for writing\n", filename);
            cmph_config_destroy(config);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        cmph_config_set_mphf_fd(config, mphf_fd);
    }

    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL) {
        fprintf(stderr, "Unable to build hash with %s\n", cmph_names[alg_n]);
        if (mphf_fd != NULL) fclose(mphf_fd);
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }

    if (alg_n == CMPH_BRZ) {
        /* Round-trip through the file: dump, drop, reopen, load. */
        cmph_dump(hash, mphf_fd);
        cmph_destroy(hash);
        hash = NULL;
        fclose(mphf_fd);

        mphf_fd = fopen(filename, "r");
        if (mphf_fd == NULL) {
            fprintf(stderr, "Unable to open %s for reading\n", filename);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        hash = cmph_load(mphf_fd);
        if (hash == NULL) {
            fprintf(stderr, "Unable to load hash from %s\n", filename);
            fclose(mphf_fd);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
    }

    printf("packed_size %u\n", cmph_packed_size(hash));

    for (i = 0; i < items_len; ++i) {
        /* Keys are unsigned, so print them with %u rather than %d. */
        printf("%u -> %u\n",
               items_to_hash[i],
               cmph_search(hash,
                           (char*)(items_to_hash + i),
                           (cmph_uint32)sizeof(cmph_uint32)));
    }
    printf("\n");

    cmph_io_vector_adapter_destroy(source);
    cmph_destroy(hash);

    if (alg_n == CMPH_BRZ) {
        fclose(mphf_fd);
    }
    return 0;
}
/**
 * Builds the CHD hash for one block of keys and registers it under index
 * `current`.
 *
 * @param current      slot in m_hashes / m_arrays / m_clocks to fill
 * @param source_void  a cmph_io_adapter_t* passed as void*
 *
 * Every key is read back from the adapter and its (position, fingerprint)
 * pair is stored at the index the hash assigns to it.  Compiles to an empty
 * body without HAVE_CMPH.
 */
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
  cmph_config_t *config = cmph_config_new(source);
  cmph_config_set_algo(config, CMPH_CHD);

  cmph_t* hash = cmph_new(config);
  PairedPackedArray<> *pv =
    new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);

  size_t i = 0;

  source->rewind(source->data);

  while(i < source->nkeys) {
    unsigned keylen;
    char* key;
    source->read(source->data, &key, &keylen);

    std::string temp(key, keylen);
    // The key has been copied into `temp`; hand the adapter's buffer back so
    // it is not leaked on every iteration.
    source->dispose(source->data, key, keylen);

    size_t fprint = GetFprint(temp.c_str());
    size_t idx = cmph_search(hash, temp.c_str(),
                             (cmph_uint32) temp.size());

    pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
    i++;
  }

  cmph_config_destroy(config);

#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  // Grow the bookkeeping vectors together so that `current` is a valid index.
  if(m_hashes.size() <= current) {
    m_hashes.resize(current + 1, 0);
    m_arrays.resize(current + 1, 0);
    m_clocks.resize(current + 1, 0);
  }

  m_hashes[current] = (void*)hash;
  m_arrays[current] = pv;
  m_clocks[current] = clock();
  m_queue.push(-current);
#endif
}
// Create minimal perfect hash function from in-memory vector int main(int argc, char **argv) { // Creating a filled vector unsigned int i = 0; const char *vector[] = {"aaaaaaaaaa", "bbbbbbbbbb", "cccccccccc", "dddddddddd", "eeeeeeeeee", "ffffffffff", "gggggggggg", "hhhhhhhhhh", "iiiiiiiiii", "jjjjjjjjjj"}; unsigned int nkeys = 10; FILE* mphf_fd = fopen("temp.mph", "w"); // Source of keys cmph_io_adapter_t *source = cmph_io_vector_adapter((char **)vector, nkeys); //Create minimal perfect hash function using the brz algorithm. cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_BRZ); cmph_config_set_mphf_fd(config, mphf_fd); cmph_t *hash = cmph_new(config); cmph_config_destroy(config); cmph_dump(hash, mphf_fd); cmph_destroy(hash); fclose(mphf_fd); //Find key mphf_fd = fopen("temp.mph", "r"); hash = cmph_load(mphf_fd); while (i < nkeys) { const char *key = vector[i]; unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key)); fprintf(stderr, "key:%s -- hash:%u\n", key, id); i++; } //Destroy hash cmph_destroy(hash); cmph_io_vector_adapter_destroy(source); fclose(mphf_fd); return 0; }
int main(int argc, char **argv) { cmph_uint32 verbosity = 0; char generate = 0; char *mphf_file = NULL; FILE *mphf_fd = stdout; const char *keys_file = NULL; FILE *keys_fd; cmph_uint32 nkeys = UINT_MAX; cmph_uint32 seed = UINT_MAX; CMPH_HASH *hashes = NULL; cmph_uint32 nhashes = 0; cmph_uint32 i; CMPH_ALGO mph_algo = CMPH_CHM; double c = 0; cmph_config_t *config = NULL; cmph_t *mphf = NULL; char * tmp_dir = NULL; cmph_io_adapter_t *source; cmph_uint32 memory_availability = 0; cmph_uint32 b = 0; cmph_uint32 keys_per_bin = 1; while (1) { char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:"); if (ch == -1) break; switch (ch) { case 's': { char *cptr; seed = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Invalid seed %s\n", optarg); exit(1); } } break; case 'c': { char *endptr; c = strtod(optarg, &endptr); if(*endptr != 0) { fprintf(stderr, "Invalid c value %s\n", optarg); exit(1); } } break; case 'g': generate = 1; break; case 'k': { char *endptr; nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10); if(*endptr != 0) { fprintf(stderr, "Invalid number of keys %s\n", optarg); exit(1); } } break; case 'm': mphf_file = strdup(optarg); break; case 'd': tmp_dir = strdup(optarg); break; case 'M': { char *cptr; memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Invalid memory availability %s\n", optarg); exit(1); } } break; case 'b': { char *cptr; b = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Parameter b was not found: %s\n", optarg); exit(1); } } break; case 't': { char *cptr; keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Parameter t was not found: %s\n", optarg); exit(1); } } break; case 'v': ++verbosity; break; case 'V': printf("%s\n", VERSION); return 0; case 'h': usage_long(argv[0]); return 0; case 'a': { char valid = 0; for (i = 0; i < CMPH_COUNT; ++i) { if (strcmp(cmph_names[i], optarg) == 0) { mph_algo = 
(CMPH_ALGO)i; valid = 1; break; } } if (!valid) { fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION); return -1; } } break; case 'f': { char valid = 0; for (i = 0; i < CMPH_HASH_COUNT; ++i) { if (strcmp(cmph_hash_names[i], optarg) == 0) { hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 )); hashes[nhashes] = (CMPH_HASH)i; hashes[nhashes + 1] = CMPH_HASH_COUNT; ++nhashes; valid = 1; break; } } if (!valid) { fprintf(stderr, "Invalid hash function: %s\n", optarg); return -1; } } break; default: usage(argv[0]); return 1; } } if (optind != argc - 1) { usage(argv[0]); return 1; } keys_file = argv[optind]; if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); srand(seed); int ret = 0; if (mphf_file == NULL) { mphf_file = (char *)malloc(strlen(keys_file) + 5); memcpy(mphf_file, keys_file, strlen(keys_file)); memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); } keys_fd = fopen(keys_file, "r"); if (keys_fd == NULL) { fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); return -1; } if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd); else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys); if (generate) { //Create mphf mphf_fd = fopen(mphf_file, "w"); config = cmph_config_new(source); cmph_config_set_algo(config, mph_algo); if (nhashes) cmph_config_set_hashfuncs(config, hashes); cmph_config_set_verbosity(config, verbosity); cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir); cmph_config_set_mphf_fd(config, mphf_fd); cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); cmph_config_set_keys_per_bin(config, keys_per_bin); //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); mphf = cmph_new(config); cmph_config_destroy(config); if (mphf == NULL) { 
fprintf(stderr, "Unable to create minimum perfect hashing function\n"); //cmph_config_destroy(config); free(mphf_file); return -1; } if (mphf_fd == NULL) { fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } cmph_dump(mphf, mphf_fd); cmph_destroy(mphf); fclose(mphf_fd); } else { cmph_uint8 * hashtable = NULL; mphf_fd = fopen(mphf_file, "r"); if (mphf_fd == NULL) { fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } mphf = cmph_load(mphf_fd); fclose(mphf_fd); if (!mphf) { fprintf(stderr, "Unable to parser input file %s\n", mphf_file); free(mphf_file); return -1; } cmph_uint32 siz = cmph_size(mphf); hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8)); memset(hashtable, 0,(size_t) siz); //check all keys for (i = 0; i < source->nkeys; ++i) { cmph_uint32 h; char *buf; cmph_uint32 buflen = 0; source->read(source->data, &buf, &buflen); h = cmph_search(mphf, buf, buflen); if (!(h < siz)) { fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf); ret = 1; } else if(hashtable[h] >= keys_per_bin) { fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h); fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf); ret = 1; } else hashtable[h]++; if (verbosity) { printf("%s -> %u\n", buf, h); } source->dispose(source->data, buf, buflen); } cmph_destroy(mphf); free(hashtable); } fclose(keys_fd); free(mphf_file); free(tmp_dir); cmph_io_nlfile_adapter_destroy(source); return ret; }
/*
 * Build a CHD function on top of a CHD_PH perfect hash.
 *
 * A CHD_PH function is built from the nested config with load factor c and
 * packed into a byte buffer.  The bins left unoccupied in the CHD_PH
 * occupancy table are then collected, turned into a compressed rank structure
 * and packed as well.  Both packed buffers are handed over to the returned
 * object.
 *
 * Returns the new function, or NULL when the CHD_PH construction fails.
 */
cmph_t *chd_new(cmph_config_t *mph, double c)
{
	chd_config_data_t *chd_cfg = (chd_config_data_t *)mph->data;
	chd_ph_config_data_t *ph_cfg = (chd_ph_config_data_t *)chd_cfg->chd_ph->data;
	compressed_rank_t rank;
	cmph_t *ph = NULL;
	cmph_t *result = NULL;
	chd_data_t *payload = NULL;
	cmph_uint8 *ph_bytes = NULL;
	cmph_uint8 *rank_bytes = NULL;
	cmph_uint32 ph_bytes_len = 0;
	cmph_uint32 rank_bytes_len = 0;
	cmph_uint32 bin, next_free, key_count, free_count, bin_count;
	cmph_uint32 *free_bins = NULL;
	cmph_uint32 *occupancy = NULL;

	#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
	#endif

	cmph_config_set_verbosity(chd_cfg->chd_ph, mph->verbosity);
	cmph_config_set_graphsize(chd_cfg->chd_ph, c);

	if (mph->verbosity)
	{
		fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c);
	}

	ph = cmph_new(chd_cfg->chd_ph);
	if (ph == NULL)
	{
		return NULL;
	}

	/* Serialise the CHD_PH function into a zeroed buffer of its packed size. */
	ph_bytes_len = cmph_packed_size(ph);
	DEBUGP("packed_chd_phf_size = %u\n", ph_bytes_len);
	ph_bytes = (cmph_uint8 *)calloc((size_t)ph_bytes_len, (size_t)1);
	cmph_pack(ph, ph_bytes);
	cmph_destroy(ph);

	if (mph->verbosity)
	{
		fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n");
	}

	compressed_rank_init(&rank);
	bin_count = ph_cfg->n;
	key_count = ph_cfg->m;
	free_count = bin_count - key_count;

	/* List, in increasing order, every bin whose occupancy bit is clear. */
	free_bins = (cmph_uint32 *)calloc(free_count, sizeof(cmph_uint32));
	occupancy = (cmph_uint32 *)ph_cfg->occup_table;
	next_free = 0;
	bin = 0;
	while (bin < bin_count)
	{
		if (!GETBIT32(occupancy, bin))
		{
			free_bins[next_free] = bin;
			next_free++;
		}
		bin++;
	}

	compressed_rank_generate(&rank, free_bins, free_count);
	free(free_bins);

	rank_bytes_len = compressed_rank_packed_size(&rank);
	rank_bytes = (cmph_uint8 *)calloc(rank_bytes_len, sizeof(cmph_uint8));
	compressed_rank_pack(&rank, rank_bytes);
	compressed_rank_destroy(&rank);

	/* Assemble the result; the packed buffers now belong to the payload. */
	payload = (chd_data_t *)malloc(sizeof(chd_data_t));
	payload->packed_cr = rank_bytes;
	rank_bytes = NULL;
	payload->packed_chd_phf = ph_bytes;
	ph_bytes = NULL;
	payload->packed_chd_phf_size = ph_bytes_len;
	payload->packed_cr_size = rank_bytes_len;

	result = (cmph_t *)malloc(sizeof(cmph_t));
	result->algo = mph->algo;
	result->data = payload;
	result->size = key_count;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}

	#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	cmph_uint32 space_usage = chd_packed_size(result)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", key_count, c, ph_cfg->keys_per_bucket, construction_time, space_usage/(double)key_count);
	#endif

	return result;
}
ifq_codes_t ifq_create_index(char *fastq_path, char *index_prefix) { char *hash_path = concatenate( index_prefix, ".hsh" ); char *seek_path = concatenate( index_prefix, ".lup" ); ifq_codes_t ret = IFQ_OK; /* Open output files */ BGZF *fastq_file = bgzf_open( fastq_path, "r" ); if( fastq_file == NULL ) { ret = IFQ_BAD_FASTQ; goto index_fastq_fail; } FILE *hash_file = fopen( hash_path, "w" ); if( hash_file == NULL ) { ret = IFQ_BAD_PREFIX; goto index_prefix_fail; } /* Create hash function */ cmph_io_adapter_t *source = cmph_io_fastq_adapter( fastq_file ); if( source == NULL ) { ret = IFQ_BAD_HASH; goto index_prefix_fail; } cmph_config_t *config = cmph_config_new( source ); cmph_config_set_algo( config, CMPH_CHD ); cmph_config_set_mphf_fd( config, hash_file ); cmph_t *hash = cmph_new( config ); if( hash == NULL ) { ret = IFQ_BAD_HASH; goto index_hash_fail; } /* Create the file index using the hash */ bgzf_seek( fastq_file, 0, SEEK_SET ); if( create_index( fastq_file, hash, seek_path ) != 1 ) { ret = IFQ_BAD_INDEX; goto index_create_fail; } index_fastq_fail: free( hash_path ); free( seek_path ); index_create_fail: cmph_config_destroy( config ); cmph_dump( hash, hash_file ); cmph_destroy( hash ); free( source ); index_hash_fail: fclose( hash_file ); index_prefix_fail: bgzf_close( fastq_file ); return ret; }
/*
 * brz_gen_mphf: build the BRZ function for the keys of mph->key_source and
 * write it to brz->mphf_fd.  Returns 1 on success and 0 on failure.
 *
 * Phase 1 - partitioning.  brz->m keys are read from the key source and
 * appended to a buffer of brz->memory_availability bytes, each stored as
 * [keylen][key bytes].  For every key h0 = hash(brz->h0, key) % brz->k is
 * computed; brz->size[h0] counts the keys of bucket h0 over the whole input
 * and buckets_size[h0] counts them in the current buffer.  The function
 * returns 0 as soon as a bucket already holds MAX_BUCKET_SIZE keys, or, for
 * CMPH_BMZ8 with c >= 1.0, when (cmph_uint8)(c * size) is smaller than size.
 *
 * When the next key would not fit in the buffer (and once more after the last
 * key if the buffer is not empty) the buffer is flushed: the counts in
 * buckets_size are turned into starting offsets (empty buckets are skipped),
 * keys_index is filled with the buffer offsets of the keys ordered by h0, and
 * the keys are written in that order to the file "<tmp_dir><nflushes>.cmph".
 * More than 1024 flush files makes the function return 0.
 *
 * Phase 2 - generation.  A header (algorithm name, m, c, algo, k and the
 * brz->size array) is written to brz->mphf_fd.  The flush files are opened
 * through a buffer manager and merged: brz_min_index() selects the file whose
 * pending key has the smallest h0, and keys are moved into keys_vd until
 * nkeys_vd equals brz->size[cur_bucket].  A function is then built for that
 * bucket with brz->algo (only CMPH_FCH and CMPH_BMZ8 are handled; any other
 * value hits assert(0)) and its serialised form is appended to brz->mphf_fd.
 * If a bucket function cannot be built the merge loop stops and 0 is
 * returned.
 *
 * NOTE(review): the results of malloc/calloc, of fopen on the flush files and
 * of fwrite (stored in nbytes) are not checked here.
 * NOTE(review): on the early "return 0" inside the partitioning loop the key
 * just read has not been passed to key_source->dispose - confirm whether the
 * adapter expects that.
 */
static int brz_gen_mphf(cmph_config_t *mph) { cmph_uint32 i, e, error; brz_config_data_t *brz = (brz_config_data_t *)mph->data; cmph_uint32 memory_usage = 0; cmph_uint32 nkeys_in_buffer = 0; cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability); cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); cmph_uint32 *keys_index = NULL; cmph_uint8 **buffer_merge = NULL; cmph_uint32 *buffer_h0 = NULL; cmph_uint32 nflushes = 0; cmph_uint32 h0; register size_t nbytes; FILE * tmp_fd = NULL; buffer_manager_t * buff_manager = NULL; char *filename = NULL; char *key = NULL; cmph_uint32 keylen; cmph_uint32 cur_bucket = 0; cmph_uint8 nkeys_vd = 0; cmph_uint8 ** keys_vd = NULL; mph->key_source->rewind(mph->key_source->data); DEBUGP("Generating graphs from %u keys\n", brz->m); // Partitioning for (e = 0; e < brz->m; ++e) { mph->key_source->read(mph->key_source->data, &key, &keylen); /* Buffers management */ if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers { if(mph->verbosity) { fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); } cmph_uint32 value = buckets_size[0]; cmph_uint32 sum = 0; cmph_uint32 keylen1 = 0; buckets_size[0] = 0; for(i = 1; i < brz->k; i++) { if(buckets_size[i] == 0) continue; sum += value; value = buckets_size[i]; buckets_size[i] = sum; } memory_usage = 0; keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); for(i = 0; i < nkeys_in_buffer; i++) { memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1)); h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k; keys_index[buckets_size[h0]] = memory_usage; buckets_size[h0]++; memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1); } filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); tmp_fd = fopen(filename, "wb"); free(filename); filename = NULL; for(i = 0; i < nkeys_in_buffer; i++) 
{ memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1)); nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd); } nkeys_in_buffer = 0; memory_usage = 0; memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32)); nflushes++; free(keys_index); fclose(tmp_fd); } memcpy(buffer + memory_usage, &keylen, sizeof(keylen)); memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen); memory_usage += keylen + (cmph_uint32)sizeof(keylen); h0 = hash(brz->h0, key, keylen) % brz->k; if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) { free(buffer); free(buckets_size); return 0; } brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U); buckets_size[h0] ++; nkeys_in_buffer++; mph->key_source->dispose(mph->key_source->data, key, keylen); } if (memory_usage != 0) // flush buffers { if(mph->verbosity) { fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); } cmph_uint32 value = buckets_size[0]; cmph_uint32 sum = 0; cmph_uint32 keylen1 = 0; buckets_size[0] = 0; for(i = 1; i < brz->k; i++) { if(buckets_size[i] == 0) continue; sum += value; value = buckets_size[i]; buckets_size[i] = sum; } memory_usage = 0; keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); for(i = 0; i < nkeys_in_buffer; i++) { memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1)); h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k; keys_index[buckets_size[h0]] = memory_usage; buckets_size[h0]++; memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1); } filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); tmp_fd = fopen(filename, "wb"); free(filename); filename = NULL; for(i = 0; i < nkeys_in_buffer; i++) { memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1)); nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), 
tmp_fd); } nkeys_in_buffer = 0; memory_usage = 0; memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32)); nflushes++; free(keys_index); fclose(tmp_fd); } free(buffer); free(buckets_size); if(nflushes > 1024) return 0; // Too many files generated. // mphf generation if(mph->verbosity) { fprintf(stderr, "\nMPHF generation \n"); } /* Starting to dump to disk the resultant MPHF: __cmph_dump function */ nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd); //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); buff_manager = buffer_manager_new(brz->memory_availability, nflushes); buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *)); buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32)); memory_usage = 0; for(i = 0; i < nflushes; i++) { filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); sprintf(filename, "%s%u.cmph",brz->tmp_dir, i); buffer_manager_open(buff_manager, i, filename); free(filename); filename = NULL; key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; buffer_h0[i] = h0; buffer_merge[i] = (cmph_uint8 *)key; key = NULL; //transfer memory ownership } e = 0; keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *)); nkeys_vd = 0; error = 0; while(e < brz->m) { i = brz_min_index(buffer_h0, nflushes); cur_bucket = buffer_h0[i]; key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); if(key) { while(key) { //keylen = strlen(key); h0 = hash(brz->h0, 
key+sizeof(keylen), keylen) % brz->k; if (h0 != buffer_h0[i]) break; keys_vd[nkeys_vd++] = (cmph_uint8 *)key; key = NULL; //transfer memory ownership e++; key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); } if (key) { assert(nkeys_vd < brz->size[cur_bucket]); keys_vd[nkeys_vd++] = buffer_merge[i]; buffer_merge[i] = NULL; //transfer memory ownership e++; buffer_h0[i] = h0; buffer_merge[i] = (cmph_uint8 *)key; } } if(!key) { assert(nkeys_vd < brz->size[cur_bucket]); keys_vd[nkeys_vd++] = buffer_merge[i]; buffer_merge[i] = NULL; //transfer memory ownership e++; buffer_h0[i] = UINT_MAX; } if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket. { cmph_io_adapter_t *source = NULL; cmph_config_t *config = NULL; cmph_t *mphf_tmp = NULL; char *bufmphf = NULL; cmph_uint32 buflenmphf = 0; // Source of keys source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); config = cmph_config_new(source); cmph_config_set_algo(config, brz->algo); //cmph_config_set_algo(config, CMPH_BMZ8); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); if (mphf_tmp == NULL) { if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k); error = 1; cmph_config_destroy(config); brz_destroy_keys_vd(keys_vd, nkeys_vd); cmph_io_byte_vector_adapter_destroy(source); break; } if(mph->verbosity) { if (cur_bucket % 1000 == 0) { fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k); } } switch(brz->algo) { case CMPH_FCH: { fch_data_t * fchf = NULL; fchf = (fch_data_t *)mphf_tmp->data; bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf); } break; case CMPH_BMZ8: { bmz8_data_t * bmzf = NULL; bmzf = (bmz8_data_t *)mphf_tmp->data; bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf); } break; default: assert(0); } nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd); free(bufmphf); bufmphf = NULL; 
cmph_config_destroy(config); brz_destroy_keys_vd(keys_vd, nkeys_vd); cmph_destroy(mphf_tmp); cmph_io_byte_vector_adapter_destroy(source); nkeys_vd = 0; } } buffer_manager_destroy(buff_manager); free(keys_vd); free(buffer_merge); free(buffer_h0); if (error) return 0; return 1; }
/*
 * Build a BRZ hash over the space-separated words of `string` and keep the
 * loaded function in *cmphx.
 *
 * The words are tokenised, sorted and de-duplicated.  The function is built,
 * dumped to "/tmp/cmph.mph", destroyed, and loaded back from that file.  If
 * *cmphx is NULL a new cmphx_t is allocated and stored there.  If it already
 * holds a hash, that hash is cleared first.
 *
 * Returns `str`, which is never assigned here, so the result is always NULL
 * on both the success and the failure path.
 * NOTE(review): callers cannot tell success from failure by the return value;
 * confirm whether that is intended.
 */
char *cmph_op_build(bot_t * bot, cmphx_t ** cmphx, char *string)
{
	cmphx_t *cmphx_ptr = NULL;
	char *str = NULL;
	char **keys = NULL;
	int nkeys = 0;

	debug(NULL, "cmph_op_build: Entered: %p %p %p\n", bot, cmphx, string);

	if (!bot || !cmphx || !_sNULL(string))
		return NULL;

	cmphx_ptr = *cmphx;

	if (!cmphx_ptr) {
		cmphx_ptr = (cmphx_t *) calloc(1, sizeof(cmphx_t));
		if (!cmphx_ptr)
			return NULL;
		*cmphx = cmphx_ptr;
	}

	if (cmphx_ptr->hash) {
		cmph_op_clear(bot, cmphx, string);
	}

	keys =
	    tokenize_array(NULL, string,
			   TOKENIZE_NORMAL | TOKENIZE_EATWHITESPACE, " ",
			   &nkeys);
	if (!keys)
		goto cleanup;

	tokenize_sort_strings(keys, &nkeys,
			      TOKENIZE_SORT_STRINGS_FORWARD |
			      TOKENIZE_SORT_STRINGS_UNIQ);

	cmphx_ptr->fp = fopen("/tmp/cmph.mph", "w");
	if (!cmphx_ptr->fp)
		goto cleanup;

	cmphx_ptr->source = cmph_io_vector_adapter((char **)keys, nkeys);
	if (!cmphx_ptr->source)
		goto cleanup;

	cmphx_ptr->config = cmph_config_new(cmphx_ptr->source);
	if (!cmphx_ptr->config)
		goto cleanup;

	cmph_config_set_algo(cmphx_ptr->config, CMPH_BRZ);
	cmph_config_set_tmp_dir(cmphx_ptr->config, (cmph_uint8 *) "/tmp/");
	cmph_config_set_mphf_fd(cmphx_ptr->config, cmphx_ptr->fp);

	cmphx_ptr->hash = cmph_new(cmphx_ptr->config);
	if (!cmphx_ptr->hash)
		goto cleanup;

	cmph_config_destroy(cmphx_ptr->config);
	cmphx_ptr->config = NULL;

	cmph_dump(cmphx_ptr->hash, cmphx_ptr->fp);
	cmph_destroy(cmphx_ptr->hash);
	/* Do not leave a dangling hash pointer for the cleanup path to see. */
	cmphx_ptr->hash = NULL;
	fclose(cmphx_ptr->fp);

	cmphx_ptr->fp = fopen("/tmp/cmph.mph", "r");
	if (!cmphx_ptr->fp)
		goto cleanup;

	cmphx_ptr->hash = cmph_load(cmphx_ptr->fp);
	if (!cmphx_ptr->hash)
		goto cleanup;

	tokenize_destroy_array(NULL, keys);

	debug(NULL, "cmph_op_build: Success\n");

	return str;

/* error */
 cleanup:
	cmph_op_clear(bot, cmphx, string);

	/* The token array is owned by this function on every path. */
	if (keys)
		tokenize_destroy_array(NULL, keys);

	debug(NULL, "cmph_op_build: Failure\n");

	return str;
}