Example #1
0
/*
 * Fill `table` with the stream position of every record header found in
 * `fastq_file`.
 *
 * The stream is scanned for '@' characters.  For each one, the position
 * reported by bgzf_tell() immediately after the '@' is stored in
 * table[id], where id is the slot cmph_search() assigns to the text that
 * follows on the same line.
 *
 * table      - output array indexed by the hash value of each accession
 * hash       - minimal perfect hash used to map accessions to slots
 * fastq_file - open BGZF stream to scan; reading stops at end of input or
 *              when read_one_line() does not return 1
 *
 * NOTE(review): the scan stops at *any* '@' byte, so an '@' inside a quality
 * line would be treated as a record header - confirm the input cannot contain
 * one, or skip the remaining record lines explicitly.
 */
void
populate_index(uint64_t *table, cmph_t *hash, BGZF *fastq_file)
{
    while( 1 )
    {
        /* Find @.  The result must be kept in an int: bgzf_getc() signals
         * end-of-input with a negative value, which a plain char cannot hold
         * on platforms where char is unsigned (the loop would never end). */
        int c;
        while( ( c = bgzf_getc( fastq_file ) ) != '@' && c >= 0 )
        {
        }
        if( c < 0 )
        {
            /* End of input (or read error): nothing left to index. */
            break;
        }

        /* Keep the full 64-bit offset; long is only 32 bits on some ABIs. */
        int64_t pos = bgzf_tell( fastq_file );
        if( pos == -1 )
        {
            break;
        }

        /* NOTE(review): accession starts as NULL on every iteration and is
         * never released here - confirm whether read_one_line() allocates. */
        char *accession = NULL;
        cmph_uint32 accession_length;
        if( read_one_line( &accession, &accession_length, fastq_file ) != 1 )
        {
            break;
        }

        /* pos was taken right after the '@', i.e. at the first character of
         * the accession line that was just read. */
        unsigned int id = cmph_search( hash, accession, accession_length );
        table[ id ] = (uint64_t) pos;
    }
}
Example #2
0
/*
 * Look up the record named `query` and load it into `record`.
 *
 * The query is hashed with cmph_search(), the stored position for that slot
 * is read from index->table, and the stream is repositioned there.  The name
 * found at that position is then compared with the query, because the hash
 * returns a slot for any input, including names that were never indexed.
 *
 * index  - index holding the hash, the position table and the BGZF stream
 * query  - NUL-terminated record name to look for
 * record - receives name, sequence and quality of the matching record
 *
 * Returns IFQ_OK when the stored name equals `query`, IFQ_NOT_FOUND when the
 * seek fails, the name line cannot be read, or the names differ.
 */
ifq_codes_t
ifq_query_index(ifq_index_t *index, char *query, ifq_record_t *record)
{
    // Find key
    const size_t query_length = strlen( query );
    unsigned int id = cmph_search( index->hash, query, (cmph_uint32) query_length );
    uint64_t pos = index->table[ id ];
    if( bgzf_seek( index->fastq_file, pos, SEEK_SET ) < 0 )
    {
        return IFQ_NOT_FOUND;
    }

    // Without this check a failed read would leave record->name and length
    // unset (or stale) and the comparison below would run on garbage.
    cmph_uint32 length;
    if( read_one_line( &record->name, &length, index->fastq_file ) != 1 )
    {
        return IFQ_NOT_FOUND;
    }

    // Require equal lengths as well: comparing only `length` bytes would
    // accept a query that merely starts with the stored name.
    if( length != query_length || strncmp( record->name, query, length ) != 0 )
    {
        return IFQ_NOT_FOUND;
    }

    read_one_line( &record->sequence, &length, index->fastq_file );
    // Two reads into quality: the first consumes the line between sequence
    // and quality, the second overwrites it with the quality string itself.
    // NOTE(review): these results are not checked, so a truncated record
    // still returns IFQ_OK - confirm read_one_line()'s result for a final
    // line without a trailing newline before tightening this.
    read_one_line( &record->quality, &length, index->fastq_file );
    read_one_line( &record->quality, &length, index->fastq_file );

    return IFQ_OK;
}
Example #3
0
 // Create minimal perfect hash function from in-disk keys using BDZ algorithm
int main(int argc, char **argv)
{
	// Open file with newline separated list of keys
	FILE * keys_fd = fopen("keys.txt", "r");
	cmph_t *hash = NULL;
	if (keys_fd == NULL)
	{
	  fprintf(stderr, "File \"keys.txt\" not found\n");
	  exit(1);
	}
	// Source of keys
	cmph_io_adapter_t *source = cmph_io_nlfile_adapter(keys_fd);

	// Build the function with the BDZ algorithm; the config is only needed
	// during construction.
	cmph_config_t *config = cmph_config_new(source);
	cmph_config_set_algo(config, CMPH_BDZ);
	hash = cmph_new(config);
	cmph_config_destroy(config);

	// Construction can fail; searching a NULL hash would crash.
	if (hash == NULL)
	{
	  fprintf(stderr, "Unable to create minimal perfect hash from \"keys.txt\"\n");
	  cmph_io_nlfile_adapter_destroy(source);
	  fclose(keys_fd);
	  return 1;
	}

	// Find key
	const char *key = "jjjjjjjjjj";
	unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key));
	fprintf(stderr, "Id:%u\n", id);

	// Destroy hash, then the key source, then the underlying file
	cmph_destroy(hash);
	cmph_io_nlfile_adapter_destroy(source);
	fclose(keys_fd);
	return 0;
}
Example #4
0
/*
 * Report the slot that the hash in *cmphx assigns to `string`.
 *
 * bot    - must be non-NULL; not otherwise used here
 * cmphx  - address of the cmphx_t pointer whose ->hash is searched
 * string - key to look up; rejected when _sNULL(string) is false
 *
 * Returns the message built by str_unite(), or NULL when an argument is
 * rejected or no hash is loaded.
 *
 * NOTE(review): cmph_search() yields an unsigned slot and has no "not found"
 * value, so a returned slot does not prove that `string` was one of the
 * original keys.
 */
char *cmph_op_find(bot_t * bot, cmphx_t ** cmphx, char *string)
{
	cmphx_t *cmphx_ptr = NULL;
	unsigned int key_id = 0;

	debug(NULL, "cmph_op_find: Entered\n");

	if (!bot || !cmphx || !_sNULL(string))
		return NULL;

	cmphx_ptr = *cmphx;
	if (!cmphx_ptr || !cmphx_ptr->hash)
		return NULL;

	/* The former `key_id < 0` test was removed: key_id is unsigned, so the
	 * comparison could never be true. */
	key_id = cmph_search(cmphx_ptr->hash, string, strlen(string));

	return str_unite("key [%s] found at slot %i", string, key_id);
}
Example #5
0
/// Builds the CHD hash and the packed (order, fingerprint) array for range
/// `current` from the keys supplied by a cmph key adapter, then stores both
/// in the per-range member vectors.
///
/// @param current      index of the range being built
/// @param source_void  a cmph_io_adapter_t* passed as void*
///
/// Aborts the process when a key compares lower than its predecessor, unless
/// the adapter holds exactly two keys and the offending one is
/// "###DUMMY_KEY###". Does nothing when HAVE_CMPH is not defined.
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* adapter = (cmph_io_adapter_t*) source_void;

  cmph_config_t* cfg = cmph_config_new(adapter);
  cmph_config_set_algo(cfg, CMPH_CHD);
  cmph_t* mph = cmph_new(cfg);

  PairedPackedArray<>* packed =
    new PairedPackedArray<>(adapter->nkeys, m_orderBits, m_fingerPrintBits);

  // Start again from the first key.
  adapter->rewind(adapter->data);

  std::string previousKey;
  for(size_t position = 0; position < adapter->nkeys; ++position) {
    char* rawKey;
    unsigned rawLength;
    adapter->read(adapter->data, &rawKey, &rawLength);
    // Copy the key so the adapter's buffer can be handed back immediately.
    std::string keyText(rawKey, rawLength);
    adapter->dispose(adapter->data, rawKey, rawLength);

    const bool outOfOrder = previousKey > keyText;
    const bool dummyPair = adapter->nkeys == 2 && keyText == "###DUMMY_KEY###";
    if(outOfOrder && !dummyPair) {
      std::cerr << "ERROR: Input file does not appear to be sorted with  LC_ALL=C sort" << std::endl;
      std::cerr << "1: " << previousKey << std::endl;
      std::cerr << "2: " << keyText << std::endl;
      abort();
    }
    previousKey = keyText;

    // Store the key's position and fingerprint at the slot the hash assigns.
    const size_t fingerprint = GetFprint(keyText.c_str());
    const size_t slot = cmph_search(mph, keyText.c_str(),
                                    (cmph_uint32) keyText.size());
    packed->Set(slot, position, fingerprint, m_orderBits, m_fingerPrintBits);
  }

  cmph_config_destroy(cfg);

#ifdef WITH_THREADS
  // Held until the end of the function while the member containers change.
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  // Grow the three per-range vectors together so `current` is a valid index.
  const size_t needed = current + 1;
  if(m_hashes.size() < needed) {
    m_hashes.resize(needed, 0);
    m_arrays.resize(needed, 0);
    m_clocks.resize(needed, 0);
  }

  m_hashes[current] = (void*)mph;
  m_arrays[current] = packed;
  m_clocks[current] = clock();
  m_queue.push(-current);
#endif
}
Example #6
0
/*
 * Build a minimal perfect hash over `items_to_hash` with algorithm `alg_n`
 * and print the slot assigned to every item.
 *
 * For CMPH_BRZ the function is built against "<name>_<items_len>.mph",
 * dumped to that file, destroyed, and loaded back before the lookups.
 *
 * items_to_hash - array of keys, each hashed as sizeof(cmph_uint32) raw bytes
 * items_len     - number of entries in items_to_hash
 * alg_n         - algorithm to use
 *
 * Returns 0 on success, 1 when the file cannot be opened or the hash cannot
 * be built or reloaded.
 */
int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
{
    cmph_t *hash = NULL;
    cmph_config_t *config;
    cmph_io_adapter_t *source;
    cmph_uint32 i;
    char filename[256];
    FILE* mphf_fd = NULL;
    int status = 0;

    printf("%s (%u)\n", cmph_names[alg_n], alg_n);

    source = cmph_io_struct_vector_adapter(items_to_hash,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           0,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           items_len);
    config = cmph_config_new(source);
    cmph_config_set_algo(config, alg_n);
    if (alg_n == CMPH_BRZ) {
        /* Bounded formatting: the name comes from a table outside this block. */
        snprintf(filename, sizeof(filename), "%s_%u.mph", cmph_names[alg_n], items_len);
        mphf_fd = fopen(filename, "w");
        if (mphf_fd == NULL) {
            fprintf(stderr, "Unable to open %s for writing\n", filename);
            cmph_config_destroy(config);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        cmph_config_set_mphf_fd(config, mphf_fd);
    }
    hash = cmph_new(config);
    cmph_config_destroy(config);

    if (hash != NULL && alg_n == CMPH_BRZ) {
        /* Round-trip through the file so the lookups run on the loaded copy. */
        cmph_dump(hash, mphf_fd);
        cmph_destroy(hash);
        hash = NULL;
        fclose(mphf_fd);
        mphf_fd = fopen(filename, "r");
        if (mphf_fd != NULL)
            hash = cmph_load(mphf_fd);
    }

    if (hash == NULL) {
        fprintf(stderr, "Unable to build or reload the hash for %s\n", cmph_names[alg_n]);
        status = 1;
    } else {
        printf("packed_size %u\n",cmph_packed_size(hash));

        /* Items are unsigned, so print them with %u rather than %d. */
        for (i=0; i<items_len; ++i)
            printf("%u -> %u\n",
                   items_to_hash[i],
                   cmph_search(hash,
                               (char*)(items_to_hash+i),
                               (cmph_uint32)sizeof(cmph_uint32)));
        printf("\n");

        cmph_destroy(hash);
    }

    /* NOTE(review): the source was created by the struct-vector adapter but is
     * released with the plain vector destroy - confirm the two are equivalent. */
    cmph_io_vector_adapter_destroy(source);

    if (mphf_fd != NULL) {
        fclose(mphf_fd);
    }
    return status;
}
Example #7
0
File: plook.c Project: dtaht/Cruft
/*
 * Open (or create) the disk hash for "keys.txt", sort and walk the mapped
 * "keys.txt.bin" file, then print the slot for one fixed key and for one
 * address labelled "Invalid" in the output.
 *
 * Returns 0 on success and 1 when the hash cannot be created or opened, so
 * the failure is visible in the exit status as well as on stderr.
 */
int main(int argc, char **argv)
{
  struct cmph_hash_obj o;
  o.keyfile = "keys.txt";
  const char *key = "1.11.145.30";
  struct mapfile ip_map;
  int err;

  if ((err = create_or_open_disk_hash(&o))) {
    fprintf(stderr, "Cannot create hash, err %d, errno %d\n", err, errno);
    return 1;
  }

  fmmap_ip_rw(&ip_map,"keys.txt.bin");
  /* The mapped file is sorted as an array of 4-byte elements. */
  qsort(ip_map.map,ip_map.s.st_size/4, 4, int_cmp);
  walk_ips(&ip_map);

  unsigned int id = cmph_search(o.hash, key, (cmph_uint32)strlen(key));
  fprintf(stderr, "Id:%u\n", id);
  /* cmph_search() still returns a slot for this second address. */
  id = cmph_search(o.hash, "192.168.176.1", (cmph_uint32)strlen("192.168.176.1"));
  fprintf(stderr, "Invalid Id:%u\n", id);
  closeit(&o);
  return 0;
}
Example #8
0
/// Builds the CHD hash and the packed (order, fingerprint) array for range
/// `current` from the keys supplied by a cmph key adapter, then stores both
/// in the per-range member vectors.
///
/// @param current      index of the range being built
/// @param source_void  a cmph_io_adapter_t* passed as void*
///
/// Does nothing when HAVE_CMPH is not defined.
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
  cmph_config_t *config = cmph_config_new(source);
  cmph_config_set_algo(config, CMPH_CHD);

  cmph_t* hash = cmph_new(config);

  PairedPackedArray<> *pv =
    new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);

  size_t i = 0;

  // Start again from the first key.
  source->rewind(source->data);
  while(i < source->nkeys)
  {
    unsigned keylen;
    char* key;
    source->read(source->data, &key, &keylen);
    std::string temp(key, keylen);
    // Hand the buffer back to the adapter: cmph adapters pair every read()
    // with a dispose(), and without it each key's buffer is leaked. The
    // std::string copy above is what the rest of the loop uses.
    source->dispose(source->data, key, keylen);

    // Store the key's position and fingerprint at the slot the hash assigns.
    size_t fprint = GetFprint(temp.c_str());
    size_t idx = cmph_search(hash, temp.c_str(),
                             (cmph_uint32) temp.size());

    pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
    i++;
  }

  cmph_config_destroy(config);

#ifdef WITH_THREADS
  // Held until the end of the function while the member containers change.
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  // Grow the three per-range vectors together so `current` is a valid index.
  if(m_hashes.size() <= current)
  {
    m_hashes.resize(current + 1, 0);
    m_arrays.resize(current + 1, 0);
    m_clocks.resize(current + 1, 0);
  }

  m_hashes[current] = (void*)hash;
  m_arrays[current] = pv;
  m_clocks[current] = clock();
  m_queue.push(-current);
#endif
}
Example #9
0
/// Looks up `key` in range `i`.
///
/// @param i    index of the range to search
/// @param key  NUL-terminated key
/// @return the stored order value when the stored fingerprint equals
///         GetFprint(key); otherwise GetSize().
///
/// Calls LoadRange(i) first when no hash is present for the range, and
/// refreshes the range's entry in m_clocks on every call. Without HAVE_CMPH
/// slot 0 is always used.
size_t BlockHashIndex::GetHash(size_t i, const char* key)
{
  if(m_hashes[i] == 0) {
    LoadRange(i);
  }

#ifdef HAVE_CMPH
  cmph_t* rangeHash = (cmph_t*)m_hashes[i];
  const cmph_uint32 keyLength = (cmph_uint32) strlen(key);
  size_t slot = cmph_search(rangeHash, key, keyLength);
#else
  size_t slot = 0;
#endif

  std::pair<size_t, size_t> entry = m_arrays[i]->Get(slot, m_orderBits, m_fingerPrintBits);
  m_clocks[i] = clock();

  // The hash yields a slot for any input, so the fingerprint decides whether
  // the key really belongs to this slot.
  return (GetFprint(key) == entry.second) ? entry.first : GetSize();
}
Example #10
0
// Create minimal perfect hash function from in-memory vector, dump it to
// "temp.mph", load it back and print the slot of every key.
// Returns 0 on success, 1 when a file cannot be opened or the function cannot
// be built or loaded.
int main(int argc, char **argv)
{

    // Creating a filled vector
    unsigned int i = 0;
    const char *vector[] = {"aaaaaaaaaa", "bbbbbbbbbb", "cccccccccc", "dddddddddd", "eeeeeeeeee",
        "ffffffffff", "gggggggggg", "hhhhhhhhhh", "iiiiiiiiii", "jjjjjjjjjj"};
    // Derive the count from the array so the two cannot drift apart.
    unsigned int nkeys = (unsigned int)(sizeof(vector) / sizeof(vector[0]));
    FILE* mphf_fd = fopen("temp.mph", "w");
    if (mphf_fd == NULL) {
        perror("temp.mph");
        return 1;
    }
    // Source of keys
    cmph_io_adapter_t *source = cmph_io_vector_adapter((char **)vector, nkeys);

    //Create minimal perfect hash function using the brz algorithm.
    cmph_config_t *config = cmph_config_new(source);
    cmph_config_set_algo(config, CMPH_BRZ);
    cmph_config_set_mphf_fd(config, mphf_fd);
    cmph_t *hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL) {
        fprintf(stderr, "Unable to create minimal perfect hash function\n");
        cmph_io_vector_adapter_destroy(source);
        fclose(mphf_fd);
        return 1;
    }
    cmph_dump(hash, mphf_fd);
    cmph_destroy(hash);
    fclose(mphf_fd);

    //Find key, using the copy loaded back from disk
    mphf_fd = fopen("temp.mph", "r");
    if (mphf_fd == NULL) {
        perror("temp.mph");
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }
    hash = cmph_load(mphf_fd);
    if (hash == NULL) {
        fprintf(stderr, "Unable to load temp.mph\n");
        fclose(mphf_fd);
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }
    while (i < nkeys) {
        const char *key = vector[i];
        unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key));
        fprintf(stderr, "key:%s -- hash:%u\n", key, id);
        i++;
    }

    //Destroy hash
    cmph_destroy(hash);
    cmph_io_vector_adapter_destroy(source);
    fclose(mphf_fd);
    return 0;
}
Example #11
0
/*
 * Verify a stored minimal perfect hash against a key file.
 *
 * Options: -k <n> number of keys to read, -m <file> hash file (defaults to
 * "<keys_file>.mph"), -v verbose, -V version, -h help.  The single
 * positional argument is the key file.
 *
 * Every key is looked up; a slot outside [0, cmph_size) or a slot that was
 * already hit is reported on stderr.
 *
 * Returns 0 when all keys map to distinct valid slots, 1 when a key is
 * reported or usage is wrong, -1 on file, memory or load errors.
 */
int main(int argc, char **argv)
{
	char verbosity = 0;
	char *mphf_file = NULL;
	const char *keys_file = NULL;
	FILE *mphf_fd = stdout;
	FILE *keys_fd;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 i = 0;
	cmph_t *mphf = NULL;
	cmph_io_adapter_t *source;
	while (1)
	{
		/* getopt() returns an int.  Narrowing it to char makes the -1
		 * end marker undetectable where char is unsigned, which turns
		 * this loop into an endless one. */
		int ch = getopt(argc, argv, "hVvk:m:");
		if (ch == -1) break;
		switch (ch)
		{
			case 'k':
				{
					char *endptr;
					nkeys = (cmph_uint32) strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				/* Drop the value of an earlier -m, if any. */
				free(mphf_file);
				mphf_file = strdup(optarg);
				break;
			case 'v':
				++verbosity;
				break;
			case 'V':
				printf("%s\n", VERSION);
				return 0;
			case 'h':
				usage_long(argv[0]);
				return 0;
			default:
				usage(argv[0]);
				return 1;
		}
	}

	if (optind != argc - 1)
	{
		usage(argv[0]);
		return 1;
	}
	keys_file = argv[optind];

	int ret = 0;
	if (mphf_file == NULL)
	{
		/* Default hash file name: key file name plus ".mph". */
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		if (mphf_file == NULL)
		{
			fprintf(stderr, "Out of memory\n");
			return -1;
		}
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}

	keys_fd = fopen(keys_file, "r");

	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		free(mphf_file);
		return -1;
	}

	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);

	cmph_uint8 * hashtable = NULL;
	mphf_fd = fopen(mphf_file, "r");
	if (mphf_fd == NULL)
	{
		fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
		free(mphf_file);
		return -1;
	}
	mphf = cmph_load(mphf_fd);
	fclose(mphf_fd);
	if (!mphf)
	{
		fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
		free(mphf_file);
		return -1;
	}
	cmph_uint32 siz = cmph_size(mphf);
	/* One zero-initialised "seen" flag per slot. */
	hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
	if (hashtable == NULL && siz != 0)
	{
		fprintf(stderr, "Out of memory\n");
		cmph_destroy(mphf);
		free(mphf_file);
		return -1;
	}
	//check all keys
	for (i = 0; i < source->nkeys; ++i)
	{
		cmph_uint32 h;
		char *buf;
		cmph_uint32 buflen = 0;
		source->read(source->data, &buf, &buflen);
		h = cmph_search(mphf, buf, buflen);
		/* "%.*s" bounds the print to buflen bytes and takes an int. */
		if (!(h < siz))
		{
			fprintf(stderr, "Unknown key %.*s in the input.\n", (int)buflen, buf);
			ret = 1;
		} else if(hashtable[h])
		{
			fprintf(stderr, "Duplicated or unknown key %.*s in the input\n", (int)buflen, buf);
			ret = 1;
		} else hashtable[h] = 1;

		if (verbosity)
		{
			printf("%s -> %u\n", buf, h);
		}
		source->dispose(source->data, buf, buflen);
	}

	cmph_destroy(mphf);
	free(hashtable);

	fclose(keys_fd);
	free(mphf_file);
	cmph_io_nlfile_adapter_destroy(source);
	return ret;

}
Example #12
0
File: main.c Project: AlgoLab/Tagli
/*
 * Command-line front end: generate a minimal perfect hash from a key file
 * (-g) or verify an existing one against it.
 *
 * Options: -g generate, -a <algo>, -f <hash function> (repeatable), -c <c>,
 * -k <nkeys>, -m <hash file> (defaults to "<keys_file>.mph"), -d <tmp dir>,
 * -M <memory>, -b <b>, -t <keys per bin>, -s <seed>, -v verbose, -V version,
 * -h help.  The single positional argument is the key file.
 *
 * Returns 0 on success, 1 on usage errors or when verification reports a
 * key, -1 on file, memory, or construction/load errors.
 */
int main(int argc, char **argv)
{
	cmph_uint32 verbosity = 0;
	char generate = 0;
	char *mphf_file = NULL;
	FILE *mphf_fd = stdout;
	const char *keys_file = NULL;
	FILE *keys_fd;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 seed = UINT_MAX;
	CMPH_HASH *hashes = NULL;
	cmph_uint32 nhashes = 0;
	cmph_uint32 i;
	CMPH_ALGO mph_algo = CMPH_CHM;
	double c = 0;
	cmph_config_t *config = NULL;
	cmph_t *mphf = NULL;
	char * tmp_dir = NULL;
	cmph_io_adapter_t *source;
	cmph_uint32 memory_availability = 0;
	cmph_uint32 b = 0;
	cmph_uint32 keys_per_bin = 1;
	while (1)
	{
		/* getopt() returns an int.  Narrowing it to char makes the -1
		 * end marker undetectable where char is unsigned, which turns
		 * this loop into an endless one. */
		int ch = getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
		if (ch == -1) break;
		switch (ch)
		{
			case 's':
				{
					char *cptr;
					seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid seed %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'c':
				{
					char *endptr;
					c = strtod(optarg, &endptr);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid c value %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'g':
				generate = 1;
				break;
			case 'k':
				{
					char *endptr;
					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				/* Drop the value of an earlier -m, if any. */
				free(mphf_file);
				mphf_file = strdup(optarg);
				break;
			case 'd':
				/* Drop the value of an earlier -d, if any. */
				free(tmp_dir);
				tmp_dir = strdup(optarg);
				break;
			case 'M':
				{
					char *cptr;
					memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid memory availability %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'b':
				{
					char *cptr;
					b =  (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter b was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 't':
				{
					char *cptr;
					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'v':
				++verbosity;
				break;
			case 'V':
				printf("%s\n", VERSION);
				return 0;
			case 'h':
				usage_long(argv[0]);
				return 0;
			case 'a':
				{
				char valid = 0;
				for (i = 0; i < CMPH_COUNT; ++i)
				{
					if (strcmp(cmph_names[i], optarg) == 0)
					{
						mph_algo = (CMPH_ALGO)i;
						valid = 1;
						break;
					}
				}
				if (!valid)
				{
					fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
					return -1;
				}
				}
				break;
			case 'f':
				{
				char valid = 0;
				for (i = 0; i < CMPH_HASH_COUNT; ++i)
				{
					if (strcmp(cmph_hash_names[i], optarg) == 0)
					{
						/* Grow through a temporary so a failed realloc
						 * neither loses the list nor is dereferenced. */
						CMPH_HASH *grown = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
						if (grown == NULL)
						{
							fprintf(stderr, "Out of memory while recording hash function %s\n", optarg);
							free(hashes);
							return -1;
						}
						hashes = grown;
						hashes[nhashes] = (CMPH_HASH)i;
						/* CMPH_HASH_COUNT terminates the list. */
						hashes[nhashes + 1] = CMPH_HASH_COUNT;
						++nhashes;
						valid = 1;
						break;
					}
				}
				if (!valid)
				{
					fprintf(stderr, "Invalid hash function: %s\n", optarg);
					return -1;
				}
				}
				break;
			default:
				usage(argv[0]);
				return 1;
		}
	}

	if (optind != argc - 1)
	{
		usage(argv[0]);
		return 1;
	}
	keys_file = argv[optind];

	/* Without -s the seed is taken from the clock. */
	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
	srand(seed);
	int ret = 0;
	if (mphf_file == NULL)
	{
		/* Default hash file name: key file name plus ".mph". */
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		if (mphf_file == NULL)
		{
			fprintf(stderr, "Out of memory\n");
			return -1;
		}
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}

	keys_fd = fopen(keys_file, "r");

	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		free(mphf_file);
		return -1;
	}

	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
	if (generate)
	{
		//Create mphf
		mphf_fd = fopen(mphf_file, "w");
		/* Check before the handle is given to the config: cmph_new() may
		 * use it, and errno still belongs to the failed fopen() here. */
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		config = cmph_config_new(source);
		cmph_config_set_algo(config, mph_algo);
		if (nhashes) cmph_config_set_hashfuncs(config, hashes);
		cmph_config_set_verbosity(config, verbosity);
		cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
		cmph_config_set_mphf_fd(config, mphf_fd);
		cmph_config_set_memory_availability(config, memory_availability);
		cmph_config_set_b(config, b);
		cmph_config_set_keys_per_bin(config, keys_per_bin);

		//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
		if(mph_algo == CMPH_BMZ  && c >= 2.0) c=1.15;
		if (c != 0) cmph_config_set_graphsize(config, c);
		mphf = cmph_new(config);

		cmph_config_destroy(config);
		if (mphf == NULL)
		{
			fprintf(stderr, "Unable to create minimum perfect hashing function\n");
			fclose(mphf_fd);
			free(mphf_file);
			return -1;
		}

		cmph_dump(mphf, mphf_fd);
		cmph_destroy(mphf);
		fclose(mphf_fd);
	}
	else
	{
		cmph_uint8 * hashtable = NULL;
		mphf_fd = fopen(mphf_file, "r");
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		mphf = cmph_load(mphf_fd);
		fclose(mphf_fd);
		if (!mphf)
		{
			fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
			free(mphf_file);
			return -1;
		}
		cmph_uint32 siz = cmph_size(mphf);
		/* One zero-initialised hit counter per slot. */
		hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
		if (hashtable == NULL && siz != 0)
		{
			fprintf(stderr, "Out of memory\n");
			cmph_destroy(mphf);
			free(mphf_file);
			return -1;
		}
		//check all keys
		for (i = 0; i < source->nkeys; ++i)
		{
			cmph_uint32 h;
			char *buf;
			cmph_uint32 buflen = 0;
			source->read(source->data, &buf, &buflen);
			h = cmph_search(mphf, buf, buflen);
			/* "%.*s" bounds the print to buflen bytes and takes an int. */
			if (!(h < siz))
			{
				fprintf(stderr, "Unknown key %.*s in the input.\n", (int)buflen, buf);
				ret = 1;
			} else if(hashtable[h] >= keys_per_bin)
			{
				fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
				fprintf(stderr, "Duplicated or unknown key %.*s in the input\n", (int)buflen, buf);
				ret = 1;
			} else hashtable[h]++;

			if (verbosity)
			{
				printf("%s -> %u\n", buf, h);
			}
			source->dispose(source->data, buf, buflen);
		}

		cmph_destroy(mphf);
		free(hashtable);
	}
	fclose(keys_fd);
	free(mphf_file);
	free(tmp_dir);
	free(hashes);
	cmph_io_nlfile_adapter_destroy(source);
	return ret;

}
Example #13
0
File: plook.c Project: dtaht/Cruft
/*
 * Return the slot that o->hash assigns to the NUL-terminated `key`.
 * The unsigned result of cmph_search() is converted to int on return.
 */
int lookup_key(struct cmph_hash_obj *o,char *key) {
  const cmph_uint32 key_length = (cmph_uint32)strlen(key);
  return (int)cmph_search(o->hash, key, key_length);
}