/*
 * Record, for every '@'-introduced line found in fastq_file, the position
 * reported by bgzf_tell() right after the '@'.
 *
 * The rest of that line is read as the accession, hashed with cmph_search(),
 * and the position is stored in table at the resulting slot.
 *
 * table       output array indexed by the value cmph_search() returns
 * hash        hash function used to map an accession to a slot
 * fastq_file  input stream, scanned from its current position to the end
 */
void populate_index(uint64_t *table, cmph_t *hash, BGZF *fastq_file)
{
    while( 1 )
    {
        /* Find @.  The byte is kept in an int: narrowing it to char makes the
         * negative end-of-stream codes undetectable where char is unsigned and
         * makes data bytes >= 0x80 look negative where char is signed. */
        int c;
        while( ( c = bgzf_getc( fastq_file ) ) != '@' && c >= 0 )
        {
        }
        if( c < 0 )
        {
            /* End of stream or read error: nothing left to index. */
            break;
        }

        /* The stream now points at the first character of the accession. */
        int64_t pos = bgzf_tell( fastq_file );
        if( pos == -1 )
        {
            break;
        }

        /* NOTE(review): read_one_line() is handed a NULL pointer on every
         * iteration; if it allocates the line, that buffer is never released
         * here - confirm who owns it. */
        char *accession = NULL;
        cmph_uint32 accession_length;
        if( read_one_line( &accession, &accession_length, fastq_file ) != 1 )
        {
            break;
        }

        unsigned int id = cmph_search( hash, accession, accession_length );
        table[ id ] = (uint64_t) pos;
    }
}
// Look up `query` in the index and, when the stored record carries exactly
// that name, load the record's lines into `record`.
//
// index   provides the hash, the offset table and the open FASTQ stream
// query   NUL-terminated record name to look for
// record  receives name, sequence and quality
//
// Returns IFQ_OK when the record was found, IFQ_NOT_FOUND when the seek or a
// read fails or when the name stored at the hashed position differs from query.
ifq_codes_t ifq_query_index(ifq_index_t *index, char *query, ifq_record_t *record)
{
    const size_t query_length = strlen( query );

    // Find key
    unsigned int id = cmph_search( index->hash, query, (cmph_uint32) query_length );
    uint64_t pos = index->table[ id ];
    if( bgzf_seek( index->fastq_file, pos, SEEK_SET ) < 0 )
    {
        return IFQ_NOT_FOUND;
    }

    cmph_uint32 length;
    if( read_one_line( &record->name, &length, index->fastq_file ) != 1 )
    {
        // Without a name there is nothing valid to compare against.
        return IFQ_NOT_FOUND;
    }

    // The lengths must agree as well: comparing only `length` bytes would
    // accept a query that merely starts with the stored name.
    if( length != query_length || strncmp( record->name, query, length ) != 0 )
    {
        return IFQ_NOT_FOUND;
    }

    if( read_one_line( &record->sequence, &length, index->fastq_file ) != 1 )
    {
        return IFQ_NOT_FOUND;
    }

    // Two consecutive reads go into `quality`; the second overwrites what the
    // first stored.  NOTE(review): presumably the first consumes the '+'
    // separator line - confirm, and confirm read_one_line() does not leak the
    // first buffer.
    if( read_one_line( &record->quality, &length, index->fastq_file ) != 1 )
    {
        return IFQ_NOT_FOUND;
    }

    // NOTE(review): the result of this last read is left unchecked on purpose;
    // it is unclear what read_one_line() returns for a final line that has no
    // trailing newline.
    read_one_line( &record->quality, &length, index->fastq_file );

    return IFQ_OK;
}
// Create minimal perfect hash function from in-disk keys using BDZ algorithm int main(int argc, char **argv) { //Open file with newline separated list of keys FILE * keys_fd = fopen("keys.txt", "r"); cmph_t *hash = NULL; if (keys_fd == NULL) { fprintf(stderr, "File \"keys.txt\" not found\n"); exit(1); } // Source of keys cmph_io_adapter_t *source = cmph_io_nlfile_adapter(keys_fd); cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_BDZ); hash = cmph_new(config); cmph_config_destroy(config); //Find key const char *key = "jjjjjjjjjj"; unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key)); fprintf(stderr, "Id:%u\n", id); //Destroy hash cmph_destroy(hash); cmph_io_nlfile_adapter_destroy(source); fclose(keys_fd); return 0; }
/*
 * Hash `string` with the hash held by *cmphx and describe the resulting slot.
 *
 * bot     must be non-NULL; not otherwise used here
 * cmphx   address of the cmphx_t pointer whose `hash` member is searched
 * string  key to hash; rejected when the _sNULL() check fails
 *
 * Returns the string produced by str_unite(), or NULL when an argument is
 * missing or no hash is loaded.
 */
char *cmph_op_find(bot_t * bot, cmphx_t ** cmphx, char *string)
{
    cmphx_t *cmphx_ptr = NULL;
    unsigned int key_id = 0;
    char *str = NULL;

    debug(NULL, "cmph_op_find: Entered\n");

    if (!bot || !cmphx || !_sNULL(string))
        return NULL;

    cmphx_ptr = *cmphx;
    if (!cmphx_ptr)
        return NULL;

    if (!cmphx_ptr->hash)
        return NULL;

    key_id = cmph_search(cmphx_ptr->hash, string, (cmph_uint32) strlen(string));

    /* key_id is unsigned, so the former `key_id < 0` test could never fire and
     * has been dropped.  The slot is reported as-is. */

    /* The message uses %i, so hand it an int rather than an unsigned. */
    str = str_unite("key [%s] found at slot %i", string, (int) key_id);

    return str;
}
void BlockHashIndex::CalcHash(size_t current, void* source_void) { #ifdef HAVE_CMPH cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void; cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_CHD); cmph_t* hash = cmph_new(config); PairedPackedArray<> *pv = new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits); size_t i = 0; source->rewind(source->data); std::string lastKey = ""; while(i < source->nkeys) { unsigned keylen; char* key; source->read(source->data, &key, &keylen); std::string temp(key, keylen); source->dispose(source->data, key, keylen); if(lastKey > temp) { if(source->nkeys != 2 || temp != "###DUMMY_KEY###") { std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl; std::cerr << "1: " << lastKey << std::endl; std::cerr << "2: " << temp << std::endl; abort(); } } lastKey = temp; size_t fprint = GetFprint(temp.c_str()); size_t idx = cmph_search(hash, temp.c_str(), (cmph_uint32) temp.size()); pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits); i++; } cmph_config_destroy(config); #ifdef WITH_THREADS boost::mutex::scoped_lock lock(m_mutex); #endif if(m_hashes.size() <= current) { m_hashes.resize(current + 1, 0); m_arrays.resize(current + 1, 0); m_clocks.resize(current + 1, 0); } m_hashes[current] = (void*)hash; m_arrays[current] = pv; m_clocks[current] = clock(); m_queue.push(-current); #endif }
/*
 * Build a hash over `items_len` fixed-size integers with algorithm `alg_n`,
 * print its packed size and the slot of every item.
 *
 * For CMPH_BRZ the function is additionally dumped to "<algo>_<n>.mph" and
 * loaded back before it is queried.
 *
 * Returns 0 on success and 1 when a file cannot be opened or the hash cannot
 * be created or reloaded.
 */
int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
{
    cmph_t *hash;
    cmph_config_t *config;
    cmph_io_adapter_t *source;
    cmph_uint32 i;
    char filename[256];
    FILE* mphf_fd = NULL;

    printf("%s (%u)\n", cmph_names[alg_n], alg_n);

    source = cmph_io_struct_vector_adapter(items_to_hash,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           0,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           items_len);
    config = cmph_config_new(source);
    cmph_config_set_algo(config, alg_n);

    if (alg_n == CMPH_BRZ)
    {
        /* Bounded formatting: the name can never overrun filename[]. */
        snprintf(filename, sizeof(filename), "%s_%u.mph", cmph_names[alg_n], items_len);
        mphf_fd = fopen(filename, "w");
        if (mphf_fd == NULL)
        {
            fprintf(stderr, "Unable to open %s for writing\n", filename);
            cmph_config_destroy(config);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        cmph_config_set_mphf_fd(config, mphf_fd);
    }

    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL)
    {
        fprintf(stderr, "Unable to create hash function\n");
        if (mphf_fd != NULL) fclose(mphf_fd);
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }

    if (alg_n == CMPH_BRZ)
    {
        /* Round-trip through the file: dump, drop, reopen, load. */
        cmph_dump(hash, mphf_fd);
        cmph_destroy(hash);
        fclose(mphf_fd);

        mphf_fd = fopen(filename, "r");
        if (mphf_fd == NULL)
        {
            fprintf(stderr, "Unable to open %s for reading\n", filename);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        hash = cmph_load(mphf_fd);
        if (hash == NULL)
        {
            fprintf(stderr, "Unable to load hash function from %s\n", filename);
            fclose(mphf_fd);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
    }

    printf("packed_size %u\n", cmph_packed_size(hash));

    for (i = 0; i < items_len; ++i)
        printf("%d -> %u\n",
               items_to_hash[i],
               cmph_search(hash, (char*)(items_to_hash + i), (cmph_uint32)sizeof(cmph_uint32)));
    printf("\n");

    cmph_io_vector_adapter_destroy(source);
    cmph_destroy(hash);

    if (alg_n == CMPH_BRZ)
    {
        fclose(mphf_fd);
    }
    return 0;
}
/*
 * Demo driver: opens (or creates) the hash for "keys.txt", sorts and walks
 * the mapped "keys.txt.bin", then prints the ids of two sample addresses to
 * stderr.  Always returns 0; a failure to obtain the hash is only reported.
 */
int main(int argc, char **argv)
{
    struct cmph_hash_obj o;
    struct mapfile ip_map;
    const char *key = "1.11.145.30";
    const char *other_key = "192.168.176.1";
    unsigned int id;
    int err;

    o.keyfile = "keys.txt";

    err = create_or_open_disk_hash(&o);
    if (err)
    {
        fprintf(stderr, "Cannot create hash, err %d, errno %d\n", err, errno);
        return 0;
    }

    /* Map the binary key file and sort it as an array of 4-byte elements. */
    fmmap_ip_rw(&ip_map, "keys.txt.bin");
    qsort(ip_map.map, ip_map.s.st_size / 4, 4, int_cmp);
    walk_ips(&ip_map);

    id = cmph_search(o.hash, key, (cmph_uint32)strlen(key));
    fprintf(stderr, "Id:%u\n", id);

    id = cmph_search(o.hash, other_key, (cmph_uint32)strlen(other_key));
    fprintf(stderr, "Invalid Id:%u\n", id);

    closeit(&o);
    return 0;
}
// Build the CHD hash and the packed (order, fingerprint) array for one block
// of keys and store both at index `current`.
//
// current      slot in m_hashes / m_arrays / m_clocks to fill
// source_void  a cmph_io_adapter_t* passed as void*
//
// Does nothing when HAVE_CMPH is not defined.
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
  cmph_config_t *config = cmph_config_new(source);
  cmph_config_set_algo(config, CMPH_CHD);

  cmph_t* hash = cmph_new(config);
  PairedPackedArray<> *pv =
    new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);

  size_t i = 0;

  source->rewind(source->data);

  while(i < source->nkeys) {
    unsigned keylen;
    char* key;
    source->read(source->data, &key, &keylen);
    std::string temp(key, keylen);
    // The key has been copied into `temp`; give the buffer back to the
    // adapter so it is not leaked once per key.
    source->dispose(source->data, key, keylen);

    // Store the key's input position and fingerprint at its hashed slot.
    size_t fprint = GetFprint(temp.c_str());
    size_t idx = cmph_search(hash, temp.c_str(), (cmph_uint32) temp.size());

    pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
    i++;
  }

  cmph_config_destroy(config);

#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  // Grow the per-block containers on demand before publishing the results.
  if(m_hashes.size() <= current) {
    m_hashes.resize(current + 1, 0);
    m_arrays.resize(current + 1, 0);
    m_clocks.resize(current + 1, 0);
  }

  m_hashes[current] = (void*)hash;
  m_arrays[current] = pv;
  m_clocks[current] = clock();
  // NOTE(review): the negated index is what gets queued - confirm the
  // intended ordering of m_queue.
  m_queue.push(-current);
#endif
}
size_t BlockHashIndex::GetHash(size_t i, const char* key) { if(m_hashes[i] == 0) LoadRange(i); #ifdef HAVE_CMPH size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key)); #else size_t idx = 0; #endif std::pair<size_t, size_t> orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits); m_clocks[i] = clock(); if(GetFprint(key) == orderPrint.second) return orderPrint.first; else return GetSize(); }
// Create minimal perfect hash function from in-memory vector int main(int argc, char **argv) { // Creating a filled vector unsigned int i = 0; const char *vector[] = {"aaaaaaaaaa", "bbbbbbbbbb", "cccccccccc", "dddddddddd", "eeeeeeeeee", "ffffffffff", "gggggggggg", "hhhhhhhhhh", "iiiiiiiiii", "jjjjjjjjjj"}; unsigned int nkeys = 10; FILE* mphf_fd = fopen("temp.mph", "w"); // Source of keys cmph_io_adapter_t *source = cmph_io_vector_adapter((char **)vector, nkeys); //Create minimal perfect hash function using the brz algorithm. cmph_config_t *config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_BRZ); cmph_config_set_mphf_fd(config, mphf_fd); cmph_t *hash = cmph_new(config); cmph_config_destroy(config); cmph_dump(hash, mphf_fd); cmph_destroy(hash); fclose(mphf_fd); //Find key mphf_fd = fopen("temp.mph", "r"); hash = cmph_load(mphf_fd); while (i < nkeys) { const char *key = vector[i]; unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key)); fprintf(stderr, "key:%s -- hash:%u\n", key, id); i++; } //Destroy hash cmph_destroy(hash); cmph_io_vector_adapter_destroy(source); fclose(mphf_fd); return 0; }
int main(int argc, char **argv) { char verbosity = 0; char *mphf_file = NULL; const char *keys_file = NULL; FILE *mphf_fd = stdout; FILE *keys_fd; cmph_uint32 nkeys = UINT_MAX; cmph_uint32 i = 0; cmph_t *mphf = NULL; cmph_io_adapter_t *source; while (1) { char ch = (char)getopt(argc, argv, "hVvk:m:"); if (ch == -1) break; switch (ch) { case 'k': { char *endptr; nkeys = (cmph_uint32) strtoul(optarg, &endptr, 10); if(*endptr != 0) { fprintf(stderr, "Invalid number of keys %s\n", optarg); exit(1); } } break; case 'm': mphf_file = strdup(optarg); break; case 'v': ++verbosity; break; case 'V': printf("%s\n", VERSION); return 0; case 'h': usage_long(argv[0]); return 0; default: usage(argv[0]); return 1; } } if (optind != argc - 1) { usage(argv[0]); return 1; } keys_file = argv[optind]; int ret = 0; if (mphf_file == NULL) { mphf_file = (char *)malloc(strlen(keys_file) + 5); memcpy(mphf_file, keys_file, strlen(keys_file)); memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); } keys_fd = fopen(keys_file, "r"); if (keys_fd == NULL) { fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); return -1; } if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd); else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys); cmph_uint8 * hashtable = NULL; mphf_fd = fopen(mphf_file, "r"); if (mphf_fd == NULL) { fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } mphf = cmph_load(mphf_fd); fclose(mphf_fd); if (!mphf) { fprintf(stderr, "Unable to parser input file %s\n", mphf_file); free(mphf_file); return -1; } cmph_uint32 siz = cmph_size(mphf); hashtable = (cmph_uint8*)malloc(siz*sizeof(cmph_uint8)); memset(hashtable, 0, (size_t)siz); //check all keys for (i = 0; i < source->nkeys; ++i) { cmph_uint32 h; char *buf; cmph_uint32 buflen = 0; source->read(source->data, &buf, &buflen); h = cmph_search(mphf, buf, buflen); if (!(h < siz)) { fprintf(stderr, "Unknown key %*s in the input.\n", buflen, 
buf); ret = 1; } else if(hashtable[h]) { fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf); ret = 1; } else hashtable[h] = 1; if (verbosity) { printf("%s -> %u\n", buf, h); } source->dispose(source->data, buf, buflen); } cmph_destroy(mphf); free(hashtable); fclose(keys_fd); free(mphf_file); cmph_io_nlfile_adapter_destroy(source); return ret; }
int main(int argc, char **argv) { cmph_uint32 verbosity = 0; char generate = 0; char *mphf_file = NULL; FILE *mphf_fd = stdout; const char *keys_file = NULL; FILE *keys_fd; cmph_uint32 nkeys = UINT_MAX; cmph_uint32 seed = UINT_MAX; CMPH_HASH *hashes = NULL; cmph_uint32 nhashes = 0; cmph_uint32 i; CMPH_ALGO mph_algo = CMPH_CHM; double c = 0; cmph_config_t *config = NULL; cmph_t *mphf = NULL; char * tmp_dir = NULL; cmph_io_adapter_t *source; cmph_uint32 memory_availability = 0; cmph_uint32 b = 0; cmph_uint32 keys_per_bin = 1; while (1) { char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:"); if (ch == -1) break; switch (ch) { case 's': { char *cptr; seed = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Invalid seed %s\n", optarg); exit(1); } } break; case 'c': { char *endptr; c = strtod(optarg, &endptr); if(*endptr != 0) { fprintf(stderr, "Invalid c value %s\n", optarg); exit(1); } } break; case 'g': generate = 1; break; case 'k': { char *endptr; nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10); if(*endptr != 0) { fprintf(stderr, "Invalid number of keys %s\n", optarg); exit(1); } } break; case 'm': mphf_file = strdup(optarg); break; case 'd': tmp_dir = strdup(optarg); break; case 'M': { char *cptr; memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Invalid memory availability %s\n", optarg); exit(1); } } break; case 'b': { char *cptr; b = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Parameter b was not found: %s\n", optarg); exit(1); } } break; case 't': { char *cptr; keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10); if(*cptr != 0) { fprintf(stderr, "Parameter t was not found: %s\n", optarg); exit(1); } } break; case 'v': ++verbosity; break; case 'V': printf("%s\n", VERSION); return 0; case 'h': usage_long(argv[0]); return 0; case 'a': { char valid = 0; for (i = 0; i < CMPH_COUNT; ++i) { if (strcmp(cmph_names[i], optarg) == 0) { mph_algo = 
(CMPH_ALGO)i; valid = 1; break; } } if (!valid) { fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION); return -1; } } break; case 'f': { char valid = 0; for (i = 0; i < CMPH_HASH_COUNT; ++i) { if (strcmp(cmph_hash_names[i], optarg) == 0) { hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 )); hashes[nhashes] = (CMPH_HASH)i; hashes[nhashes + 1] = CMPH_HASH_COUNT; ++nhashes; valid = 1; break; } } if (!valid) { fprintf(stderr, "Invalid hash function: %s\n", optarg); return -1; } } break; default: usage(argv[0]); return 1; } } if (optind != argc - 1) { usage(argv[0]); return 1; } keys_file = argv[optind]; if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); srand(seed); int ret = 0; if (mphf_file == NULL) { mphf_file = (char *)malloc(strlen(keys_file) + 5); memcpy(mphf_file, keys_file, strlen(keys_file)); memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); } keys_fd = fopen(keys_file, "r"); if (keys_fd == NULL) { fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); return -1; } if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd); else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys); if (generate) { //Create mphf mphf_fd = fopen(mphf_file, "w"); config = cmph_config_new(source); cmph_config_set_algo(config, mph_algo); if (nhashes) cmph_config_set_hashfuncs(config, hashes); cmph_config_set_verbosity(config, verbosity); cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir); cmph_config_set_mphf_fd(config, mphf_fd); cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); cmph_config_set_keys_per_bin(config, keys_per_bin); //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); mphf = cmph_new(config); cmph_config_destroy(config); if (mphf == NULL) { 
fprintf(stderr, "Unable to create minimum perfect hashing function\n"); //cmph_config_destroy(config); free(mphf_file); return -1; } if (mphf_fd == NULL) { fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } cmph_dump(mphf, mphf_fd); cmph_destroy(mphf); fclose(mphf_fd); } else { cmph_uint8 * hashtable = NULL; mphf_fd = fopen(mphf_file, "r"); if (mphf_fd == NULL) { fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } mphf = cmph_load(mphf_fd); fclose(mphf_fd); if (!mphf) { fprintf(stderr, "Unable to parser input file %s\n", mphf_file); free(mphf_file); return -1; } cmph_uint32 siz = cmph_size(mphf); hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8)); memset(hashtable, 0,(size_t) siz); //check all keys for (i = 0; i < source->nkeys; ++i) { cmph_uint32 h; char *buf; cmph_uint32 buflen = 0; source->read(source->data, &buf, &buflen); h = cmph_search(mphf, buf, buflen); if (!(h < siz)) { fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf); ret = 1; } else if(hashtable[h] >= keys_per_bin) { fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h); fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf); ret = 1; } else hashtable[h]++; if (verbosity) { printf("%s -> %u\n", buf, h); } source->dispose(source->data, buf, buflen); } cmph_destroy(mphf); free(hashtable); } fclose(keys_fd); free(mphf_file); free(tmp_dir); cmph_io_nlfile_adapter_destroy(source); return ret; }
int lookup_key(struct cmph_hash_obj *o,char *key) { return(cmph_search(o->hash, key, (cmph_uint32)strlen(key))); }