Ejemplo n.º 1
0
Archivo: plook.c Proyecto: dtaht/Cruft
/*
 * Build a BDZ minimal perfect hash over the newline-separated keys stored in
 * o->keyfile and dump it to "<keyfile>.mph" (plus a packed copy written by
 * pack_n_save() to "backup.mph").
 *
 * On return o->keys_fd, o->source and o->hash hold the opened key file, the
 * key adapter and the generated hash; they are left for the caller to
 * release (this function never closed them in the original either).
 *
 * Returns 0 on success, -1 when the key file cannot be opened, the output
 * name does not fit, or the hash cannot be generated, and 1 when the output
 * file cannot be opened.
 */
int create_disk_hash(struct cmph_hash_obj *o) {
  char mphfile[1024];
  FILE *mph;
  int count = 0;
  int written;

  o->keys_fd = fopen(o->keyfile, "r");
  if (o->keys_fd == NULL)
    {
      fprintf(stderr, "File %s not found\n", o->keyfile);
      return(-1);
    }
  if((count = convert_textfile_to_ip4_bin(o->keyfile)) > 0) {
    fprintf(stderr,"binary file dumped\n");
  }

  /* Bounded formatting: an over-long key file name used to overflow mphfile. */
  written = snprintf(mphfile, sizeof(mphfile), "%s.mph", o->keyfile);
  if (written < 0 || (size_t)written >= sizeof(mphfile))
    {
      fprintf(stderr, "File name %s.mph is too long\n", o->keyfile);
      return(-1);
    }

  o->source = cmph_io_nlfile_adapter(o->keys_fd);
  o->config = cmph_config_new(o->source);
  // cmph_config_set_algo(o->config, CMPH_CHM);
  cmph_config_set_algo(o->config, CMPH_BDZ); // seems like a win
  o->hash = cmph_new(o->config);
  if (o->hash == NULL)
    {
      /* Generation failed: do not hand a NULL hash to cmph_dump(). */
      fprintf(stderr, "Unable to create hash for %s\n", o->keyfile);
      cmph_config_destroy(o->config);
      return(-1);
    }

  mph = fopen(mphfile, "w+");
  if (mph == NULL)
    {
      fprintf(stderr, "File %s not found\n",mphfile);
      /* The config is only needed for construction; release it here too. */
      cmph_config_destroy(o->config);
      return(1);
    }

  cmph_dump(o->hash, mph);
  pack_n_save(o,"backup.mph");
  fclose(mph);

  cmph_config_destroy(o->config);
  return (0);
}
Ejemplo n.º 2
0
 // Create minimal perfect hash function from in-disk keys using BDZ algorithm
/*
 * Example: build a minimal perfect hash with the BDZ algorithm from the
 * newline-separated keys in "keys.txt", then look up one key and print its
 * id on stderr.
 *
 * Returns 0 on success and 1 when the hash cannot be generated; exits with
 * status 1 when "keys.txt" cannot be opened.
 */
int main(int argc, char **argv)
{
	// Open file with newline separated list of keys
	FILE * keys_fd = fopen("keys.txt", "r");
	cmph_t *hash = NULL;
	if (keys_fd == NULL)
	{
	  fprintf(stderr, "File \"keys.txt\" not found\n");
	  exit(1);
	}
	// Source of keys
	cmph_io_adapter_t *source = cmph_io_nlfile_adapter(keys_fd);

	cmph_config_t *config = cmph_config_new(source);
	cmph_config_set_algo(config, CMPH_BDZ);
	hash = cmph_new(config);
	cmph_config_destroy(config);

	// Generation can fail; searching a NULL hash would crash.
	if (hash == NULL)
	{
	  fprintf(stderr, "Unable to create minimal perfect hash function\n");
	  cmph_io_nlfile_adapter_destroy(source);
	  fclose(keys_fd);
	  return 1;
	}

	// Find key
	const char *key = "jjjjjjjjjj";
	unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key));
	fprintf(stderr, "Id:%u\n", id);

	// Destroy hash, then the key source, then the file it reads from
	cmph_destroy(hash);
	cmph_io_nlfile_adapter_destroy(source);
	fclose(keys_fd);
	return 0;
}
Ejemplo n.º 3
0
// Builds the CHD hash for one block of keys and stores it in slot `current`.
//
// `source_void` is a cmph_io_adapter_t* passed as void*. Every key is read
// back from the adapter, checked to be in non-decreasing byte order, and its
// fingerprint plus original position are recorded in a PairedPackedArray at
// the index the hash assigns to the key. The resulting hash, array and a
// timestamp are then published in m_hashes / m_arrays / m_clocks (under
// m_mutex when WITH_THREADS is defined) and `-current` is pushed on m_queue.
//
// The whole body is compiled out unless HAVE_CMPH is defined.
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* keySource = (cmph_io_adapter_t*) source_void;

  cmph_config_t* cfg = cmph_config_new(keySource);
  cmph_config_set_algo(cfg, CMPH_CHD);
  cmph_t* mph = cmph_new(cfg);

  PairedPackedArray<>* packed =
    new PairedPackedArray<>(keySource->nkeys, m_orderBits, m_fingerPrintBits);

  // cmph_new consumed the adapter; start again from the first key.
  keySource->rewind(keySource->data);

  std::string previousKey = "";
  for(size_t position = 0; position < keySource->nkeys; ++position) {
    unsigned rawLen;
    char* rawKey;
    keySource->read(keySource->data, &rawKey, &rawLen);
    std::string currentKey(rawKey, rawLen);
    // The key has been copied, so the adapter's buffer can go back.
    keySource->dispose(keySource->data, rawKey, rawLen);

    // Keys must arrive sorted; the only tolerated exception is the dummy key
    // in a two-key block.
    const bool outOfOrder = previousKey > currentKey;
    if(outOfOrder
        && (keySource->nkeys != 2 || currentKey != "###DUMMY_KEY###")) {
      std::cerr << "ERROR: Input file does not appear to be sorted with  LC_ALL=C sort" << std::endl;
      std::cerr << "1: " << previousKey << std::endl;
      std::cerr << "2: " << currentKey << std::endl;
      abort();
    }
    previousKey = currentKey;

    size_t fingerprint = GetFprint(currentKey.c_str());
    size_t slot = cmph_search(mph, currentKey.c_str(),
                              (cmph_uint32) currentKey.size());

    packed->Set(slot, position, fingerprint, m_orderBits, m_fingerPrintBits);
  }

  cmph_config_destroy(cfg);

#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  // Grow the per-block tables so that index `current` exists.
  const size_t needed = current + 1;
  if(m_hashes.size() < needed) {
    m_hashes.resize(needed, 0);
    m_arrays.resize(needed, 0);
    m_clocks.resize(needed, 0);
  }

  m_hashes[current] = (void*)mph;
  m_arrays[current] = packed;
  m_clocks[current] = clock();
  m_queue.push(-current);
#endif
}
Ejemplo n.º 4
0
/*
 * Build a minimal perfect hash over `items_len` 32-bit integers using
 * algorithm `alg_n`, then print the packed size and the hash value of every
 * item on stdout.
 *
 * For CMPH_BRZ the function is generated into "<algo-name>_<items_len>.mph",
 * dumped, destroyed and loaded back from that file before being queried.
 *
 * Returns 0 on success and 1 when the BRZ file cannot be opened or the hash
 * cannot be generated or reloaded.
 */
int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
{
    cmph_t *hash;
    cmph_config_t *config;
    cmph_io_adapter_t *source;
    cmph_uint32 i;
    char filename[256];
    FILE* mphf_fd = NULL;

    printf("%s (%u)\n", cmph_names[alg_n], alg_n);

    /* Each key is one cmph_uint32 taken at offset 0 of a cmph_uint32-sized record. */
    source = cmph_io_struct_vector_adapter(items_to_hash,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           0,
                                           (cmph_uint32)sizeof(cmph_uint32),
                                           items_len);
    config = cmph_config_new(source);
    cmph_config_set_algo(config, alg_n);
    if (alg_n == CMPH_BRZ) {
        snprintf(filename, sizeof(filename), "%s_%u.mph", cmph_names[alg_n], items_len);
        mphf_fd = fopen(filename, "w");
        if (mphf_fd == NULL) {
            fprintf(stderr, "Unable to open %s for writing\n", filename);
            cmph_config_destroy(config);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
        cmph_config_set_mphf_fd(config, mphf_fd);
    }
    hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL) {
        fprintf(stderr, "Unable to create hash function with %s\n", cmph_names[alg_n]);
        if (mphf_fd != NULL) fclose(mphf_fd);
        cmph_io_vector_adapter_destroy(source);
        return 1;
    }

    if (alg_n == CMPH_BRZ) {
        /* Round-trip through the file: dump, drop the in-memory copy, reload. */
        cmph_dump(hash, mphf_fd);
        cmph_destroy(hash);
        fclose(mphf_fd);
        mphf_fd = fopen(filename, "r");
        hash = (mphf_fd != NULL) ? cmph_load(mphf_fd) : NULL;
        if (hash == NULL) {
            fprintf(stderr, "Unable to load hash function from %s\n", filename);
            if (mphf_fd != NULL) fclose(mphf_fd);
            cmph_io_vector_adapter_destroy(source);
            return 1;
        }
    }
    printf("packed_size %u\n",cmph_packed_size(hash));

    /* Items are unsigned 32-bit values, so print them with %u (was %d). */
    for (i=0; i<items_len; ++i)
        printf("%u -> %u\n",
               items_to_hash[i],
               cmph_search(hash,
                           (char*)(items_to_hash+i),
                           (cmph_uint32)sizeof(cmph_uint32)));
    printf("\n");

    cmph_io_vector_adapter_destroy(source);
    cmph_destroy(hash);

    if (alg_n == CMPH_BRZ) {
        fclose(mphf_fd);
    }
    return 0;
}
Ejemplo n.º 5
0
// Builds the CHD hash for one block of keys and stores it in slot `current`.
//
// `source_void` is a cmph_io_adapter_t* passed as void*. Every key is read
// back from the adapter and its fingerprint plus original position are
// recorded in a PairedPackedArray at the index the hash assigns to the key.
// The resulting hash, array and a timestamp are then published in
// m_hashes / m_arrays / m_clocks (under m_mutex when WITH_THREADS is defined)
// and `-current` is pushed on m_queue.
//
// The whole body is compiled out unless HAVE_CMPH is defined.
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
  cmph_config_t *config = cmph_config_new(source);
  cmph_config_set_algo(config, CMPH_CHD);

  cmph_t* hash = cmph_new(config);

  PairedPackedArray<> *pv =
    new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);

  size_t i = 0;

  // cmph_new consumed the adapter; start again from the first key.
  source->rewind(source->data);
  while(i < source->nkeys)
  {
    unsigned keylen;
    char* key;
    source->read(source->data, &key, &keylen);
    std::string temp(key, keylen);
    // The key has been copied into `temp`; hand the adapter's buffer back.
    // Without this call every key read here was never released.
    source->dispose(source->data, key, keylen);

    size_t fprint = GetFprint(temp.c_str());
    size_t idx = cmph_search(hash, temp.c_str(),
                             (cmph_uint32) temp.size());

    pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
    i++;
  }

  cmph_config_destroy(config);

#ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  // Grow the per-block tables so that index `current` exists.
  if(m_hashes.size() <= current)
  {
    m_hashes.resize(current + 1, 0);
    m_arrays.resize(current + 1, 0);
    m_clocks.resize(current + 1, 0);
  }

  m_hashes[current] = (void*)hash;
  m_arrays[current] = pv;
  m_clocks[current] = clock();
  m_queue.push(-current);
#endif
}
Ejemplo n.º 6
0
// Create minimal perfect hash function from in-memory vector.
//
// The function is generated with the BRZ algorithm into "temp.mph", dumped,
// destroyed, loaded back from that file and then queried for every key; each
// key and its hash value are printed on stderr.
//
// Returns 0 on success and 1 when "temp.mph" cannot be opened or the hash
// cannot be generated or reloaded.
int main(int argc, char **argv)
{

    // Creating a filled vector
    unsigned int i = 0;
    const char *vector[] = {"aaaaaaaaaa", "bbbbbbbbbb", "cccccccccc", "dddddddddd", "eeeeeeeeee",
        "ffffffffff", "gggggggggg", "hhhhhhhhhh", "iiiiiiiiii", "jjjjjjjjjj"};
    unsigned int nkeys = 10;
    FILE* mphf_fd = fopen("temp.mph", "w");
    if (mphf_fd == NULL) {
        fprintf(stderr, "Unable to open \"temp.mph\" for writing\n");
        return 1;
    }
    // Source of keys
    cmph_io_adapter_t *source = cmph_io_vector_adapter((char **)vector, nkeys);

    //Create minimal perfect hash function using the brz algorithm.
    cmph_config_t *config = cmph_config_new(source);
    cmph_config_set_algo(config, CMPH_BRZ);
    cmph_config_set_mphf_fd(config, mphf_fd);
    cmph_t *hash = cmph_new(config);
    cmph_config_destroy(config);
    if (hash == NULL) {
        fprintf(stderr, "Unable to create minimal perfect hash function\n");
        cmph_io_vector_adapter_destroy(source);
        fclose(mphf_fd);
        return 1;
    }
    cmph_dump(hash, mphf_fd);
    cmph_destroy(hash);
    fclose(mphf_fd);

    //Find key
    mphf_fd = fopen("temp.mph", "r");
    hash = (mphf_fd != NULL) ? cmph_load(mphf_fd) : NULL;
    if (hash == NULL) {
        fprintf(stderr, "Unable to load hash function from \"temp.mph\"\n");
        cmph_io_vector_adapter_destroy(source);
        if (mphf_fd != NULL) fclose(mphf_fd);
        return 1;
    }
    while (i < nkeys) {
        const char *key = vector[i];
        unsigned int id = cmph_search(hash, key, (cmph_uint32)strlen(key));
        fprintf(stderr, "key:%s -- hash:%u\n", key, id);
        i++;
    }

    //Destroy hash
    cmph_destroy(hash);
    cmph_io_vector_adapter_destroy(source);
    fclose(mphf_fd);
    return 0;
}
Ejemplo n.º 7
0
Archivo: main.c Proyecto: AlgoLab/Tagli
/*
 * Command-line driver: either generates a minimal perfect hash function for
 * a key file (-g) or loads an existing one and checks every key against it.
 *
 * Options handled below:
 *   -s seed   -c graph size value   -g generate   -k number of keys
 *   -m mphf file   -d temporary directory   -M memory availability
 *   -b parameter b   -t keys per bin   -v increase verbosity
 *   -V print version   -h long usage   -a algorithm name
 *   -f hash function name (may be repeated)
 * Exactly one positional argument, the key file, must follow the options.
 *
 * Without -m the mphf file defaults to "<keys_file>.mph".
 *
 * Returns 0 on success, 1 on usage errors or when verification finds an
 * unknown/duplicated key, and -1 on I/O, allocation or generation failures.
 */
int main(int argc, char **argv)
{
	cmph_uint32 verbosity = 0;
	char generate = 0;
	char *mphf_file = NULL;
	FILE *mphf_fd = stdout;
	const char *keys_file = NULL;
	FILE *keys_fd;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 seed = UINT_MAX;
	CMPH_HASH *hashes = NULL;
	cmph_uint32 nhashes = 0;
	cmph_uint32 i;
	CMPH_ALGO mph_algo = CMPH_CHM;
	double c = 0;
	cmph_config_t *config = NULL;
	cmph_t *mphf = NULL;
	char * tmp_dir = NULL;
	cmph_io_adapter_t *source;
	cmph_uint32 memory_availability = 0;
	cmph_uint32 b = 0;
	cmph_uint32 keys_per_bin = 1;
	while (1)
	{
		/* getopt() returns int. Storing it in a char made the -1 end marker
		 * unmatchable on platforms where plain char is unsigned. */
		int ch = getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
		if (ch == -1) break;
		switch (ch)
		{
			case 's':
				{
					char *cptr;
					seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid seed %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'c':
				{
					char *endptr;
					c = strtod(optarg, &endptr);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid c value %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'g':
				generate = 1;
				break;
			case 'k':
				{
					char *endptr;
					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				mphf_file = strdup(optarg);
				break;
			case 'd':
				tmp_dir = strdup(optarg);
				break;
			case 'M':
				{
					char *cptr;
					memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid memory availability %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'b':
				{
					char *cptr;
					b =  (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter b was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 't':
				{
					char *cptr;
					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'v':
				++verbosity;
				break;
			case 'V':
				printf("%s\n", VERSION);
				return 0;
			case 'h':
				usage_long(argv[0]);
				return 0;
			case 'a':
				{
				/* Look the algorithm name up in cmph_names. */
				char valid = 0;
				for (i = 0; i < CMPH_COUNT; ++i)
				{
					if (strcmp(cmph_names[i], optarg) == 0)
					{
						mph_algo = (CMPH_ALGO)i;
						valid = 1;
						break;
					}
				}
				if (!valid)
				{
					fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
					return -1;
				}
				}
				break;
			case 'f':
				{
				/* Append the named hash function to `hashes`, keeping the
				 * array terminated by CMPH_HASH_COUNT. */
				char valid = 0;
				for (i = 0; i < CMPH_HASH_COUNT; ++i)
				{
					if (strcmp(cmph_hash_names[i], optarg) == 0)
					{
						CMPH_HASH *grown = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
						if (grown == NULL)
						{
							fprintf(stderr, "Out of memory while storing hash function %s\n", optarg);
							free(hashes);
							return -1;
						}
						hashes = grown;
						hashes[nhashes] = (CMPH_HASH)i;
						hashes[nhashes + 1] = CMPH_HASH_COUNT;
						++nhashes;
						valid = 1;
						break;
					}
				}
				if (!valid)
				{
					fprintf(stderr, "Invalid hash function: %s\n", optarg);
					return -1;
				}
				}
				break;
			default:
				usage(argv[0]);
				return 1;
		}
	}

	if (optind != argc - 1)
	{
		usage(argv[0]);
		return 1;
	}
	keys_file = argv[optind];

	/* No -s given: seed the generator from the clock. */
	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
	srand(seed);
	int ret = 0;
	if (mphf_file == NULL)
	{
		/* Default output name: keys_file followed by ".mph" and a NUL. */
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}

	keys_fd = fopen(keys_file, "r");

	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		return -1;
	}

	/* Without -k the adapter is built from the file alone; with -k the
	 * given key count is passed to the adapter. */
	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
	if (generate)
	{
		//Create mphf
		mphf_fd = fopen(mphf_file, "w");
		/* Check the stream before it is handed to the config, and while
		 * errno still describes the fopen failure. */
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		config = cmph_config_new(source);
		cmph_config_set_algo(config, mph_algo);
		if (nhashes) cmph_config_set_hashfuncs(config, hashes);
		cmph_config_set_verbosity(config, verbosity);
		cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
		cmph_config_set_mphf_fd(config, mphf_fd);
		cmph_config_set_memory_availability(config, memory_availability);
		cmph_config_set_b(config, b);
		cmph_config_set_keys_per_bin(config, keys_per_bin);

		//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
		if(mph_algo == CMPH_BMZ  && c >= 2.0) c=1.15;
		if (c != 0) cmph_config_set_graphsize(config, c);
		mphf = cmph_new(config);

		cmph_config_destroy(config);
		if (mphf == NULL)
		{
			fprintf(stderr, "Unable to create minimum perfect hashing function\n");
			fclose(mphf_fd);
			free(mphf_file);
			return -1;
		}

		cmph_dump(mphf, mphf_fd);
		cmph_destroy(mphf);
		fclose(mphf_fd);
	}
	else
	{
		cmph_uint8 * hashtable = NULL;
		mphf_fd = fopen(mphf_file, "r");
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		mphf = cmph_load(mphf_fd);
		fclose(mphf_fd);
		if (!mphf)
		{
			fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
			free(mphf_file);
			return -1;
		}
		cmph_uint32 siz = cmph_size(mphf);
		/* One counter per bin; calloc already zero-fills it. */
		hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
		if (hashtable == NULL && siz != 0)
		{
			fprintf(stderr, "Out of memory while checking %s\n", mphf_file);
			cmph_destroy(mphf);
			free(mphf_file);
			return -1;
		}
		//check all keys
		for (i = 0; i < source->nkeys; ++i)
		{
			cmph_uint32 h;
			char *buf;
			cmph_uint32 buflen = 0;
			source->read(source->data, &buf, &buflen);
			h = cmph_search(mphf, buf, buflen);
			if (!(h < siz))
			{
				fprintf(stderr, "Unknown key %*s in the input.\n", (int)buflen, buf);
				ret = 1;
			} else if(hashtable[h] >= keys_per_bin)
			{
				fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
				fprintf(stderr, "Duplicated or unknown key %*s in the input\n", (int)buflen, buf);
				ret = 1;
			} else hashtable[h]++;

			if (verbosity)
			{
				printf("%s -> %u\n", buf, h);
			}
			source->dispose(source->data, buf, buflen);
		}

		cmph_destroy(mphf);
		free(hashtable);
	}
	fclose(keys_fd);
	free(mphf_file);
	free(tmp_dir);
	free(hashes);
	cmph_io_nlfile_adapter_destroy(source);
	return ret;

}
Ejemplo n.º 8
0
/*
 * Builds a CHD function on top of a CHD_PH perfect hash.
 *
 * Steps, in order:
 *   1. generate the underlying function from chd->chd_ph with load factor c
 *      and pack it into a byte buffer;
 *   2. collect the indices of all bins whose bit in the occupancy table is
 *      clear, and build a compressed rank structure over them;
 *   3. pack the rank structure and return a new cmph_t whose data owns both
 *      packed buffers.
 *
 * Returns NULL when the underlying function cannot be generated.
 */
cmph_t *chd_new(cmph_config_t *mph, double c)
{
	chd_config_data_t *chd_cfg = (chd_config_data_t *)mph->data;
	chd_ph_config_data_t *ph_cfg = (chd_ph_config_data_t *)chd_cfg->chd_ph->data;
	compressed_rank_t rank;

	cmph_t *inner_phf = NULL;
	cmph_uint8 *phf_blob = NULL;
	cmph_uint32 phf_blob_size = 0;

	cmph_uint8 *rank_blob = NULL;
	cmph_uint32 rank_blob_size = 0;

	cmph_uint32 bin, next_free, key_count, free_count, bin_count;
	cmph_uint32 *free_bins = NULL;
	cmph_uint32 *occupied = NULL;

	cmph_t *result = NULL;
	chd_data_t *result_data = NULL;
	#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
	#endif

	/* The inner configuration inherits verbosity and load factor. */
	cmph_config_set_verbosity(chd_cfg->chd_ph, mph->verbosity);
	cmph_config_set_graphsize(chd_cfg->chd_ph, c);

	if (mph->verbosity)
	{
		fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c);
	}

	inner_phf = cmph_new(chd_cfg->chd_ph);
	if (inner_phf == NULL)
	{
		return NULL;
	}

	/* Step 1: serialise the inner function, then drop the live object. */
	phf_blob_size = cmph_packed_size(inner_phf);
	DEBUGP("packed_chd_phf_size = %u\n", phf_blob_size);
	phf_blob = (cmph_uint8 *)calloc((size_t)phf_blob_size, (size_t)1);
	cmph_pack(inner_phf, phf_blob);
	cmph_destroy(inner_phf);

	if (mph->verbosity)
	{
		fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n");
	}

	/* Step 2: list every bin whose occupancy bit is clear. */
	compressed_rank_init(&rank);
	bin_count = ph_cfg->n;
	key_count = ph_cfg->m;
	free_count = bin_count - key_count;

	free_bins = (cmph_uint32 *)calloc(free_count, sizeof(cmph_uint32));
	occupied = (cmph_uint32 *)ph_cfg->occup_table;

	next_free = 0;
	for (bin = 0; bin < bin_count; bin++)
	{
		if (GETBIT32(occupied, bin))
		{
			continue;
		}
		free_bins[next_free] = bin;
		next_free++;
	}

	compressed_rank_generate(&rank, free_bins, free_count);
	free(free_bins);

	/* Step 3: serialise the rank structure and assemble the result. */
	rank_blob_size = compressed_rank_packed_size(&rank);
	rank_blob = (cmph_uint8 *)calloc(rank_blob_size, sizeof(cmph_uint8));
	compressed_rank_pack(&rank, rank_blob);
	compressed_rank_destroy(&rank);

	result = (cmph_t *)malloc(sizeof(cmph_t));
	result->algo = mph->algo;
	result_data = (chd_data_t *)malloc(sizeof(chd_data_t));

	/* Both buffers now belong to result_data; clear the local handles. */
	result_data->packed_cr = rank_blob;
	rank_blob = NULL;
	result_data->packed_chd_phf = phf_blob;
	phf_blob = NULL;

	result_data->packed_chd_phf_size = phf_blob_size;
	result_data->packed_cr_size = rank_blob_size;

	result->data = result_data;
	result->size = key_count;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	cmph_uint32 space_usage =  chd_packed_size(result)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", key_count, c, ph_cfg->keys_per_bucket, construction_time, space_usage/(double)key_count);
	#endif

	return result;
}
Ejemplo n.º 9
0
/*
 * Build the index files for a FASTQ file.
 *
 * A CHD hash over the keys supplied by cmph_io_fastq_adapter() is generated
 * and dumped to "<index_prefix>.hsh"; create_index() is then called with the
 * rewound FASTQ file, the hash and the path "<index_prefix>.lup".
 *
 * Returns IFQ_OK on success, IFQ_BAD_FASTQ when the FASTQ file cannot be
 * opened, IFQ_BAD_PREFIX when the hash file cannot be created, IFQ_BAD_HASH
 * when the key source or the hash cannot be built, and IFQ_BAD_INDEX when
 * create_index() does not return 1.
 *
 * Every resource starts out NULL and is released in one place, so each
 * failure path frees exactly what was acquired. The previous chain of labels
 * ran cleanup code on uninitialised pointers after an early failure and
 * leaked the path strings, hash file, config or key source on other paths.
 */
ifq_codes_t ifq_create_index(char *fastq_path, char *index_prefix)
{
    ifq_codes_t ret = IFQ_OK;
    BGZF *fastq_file = NULL;
    FILE *hash_file = NULL;
    cmph_io_adapter_t *source = NULL;
    cmph_config_t *config = NULL;
    cmph_t *hash = NULL;
    char *hash_path = concatenate( index_prefix, ".hsh" );
    char *seek_path = concatenate( index_prefix, ".lup" );

    /* Open the input FASTQ file and the output hash file */
    fastq_file = bgzf_open( fastq_path, "r" );
    if( fastq_file == NULL )
    {
        ret = IFQ_BAD_FASTQ;
        goto cleanup;
    }

    hash_file = fopen( hash_path, "w" );
    if( hash_file == NULL )
    {
        ret = IFQ_BAD_PREFIX;
        goto cleanup;
    }

    /* Create hash function */
    source = cmph_io_fastq_adapter( fastq_file );
    if( source == NULL )
    {
        ret = IFQ_BAD_HASH;
        goto cleanup;
    }

    config = cmph_config_new( source );
    cmph_config_set_algo( config, CMPH_CHD );
    cmph_config_set_mphf_fd( config, hash_file );
    hash = cmph_new( config );
    if( hash == NULL )
    {
        ret = IFQ_BAD_HASH;
        goto cleanup;
    }

    /* Create the file index using the hash */
    bgzf_seek( fastq_file, 0, SEEK_SET );
    if( create_index( fastq_file, hash, seek_path ) != 1 )
    {
        ret = IFQ_BAD_INDEX;
    }

cleanup:
    if( config != NULL )
    {
        cmph_config_destroy( config );
    }
    if( hash != NULL )
    {
        /* As before, the hash is written out whenever it was built, even if
         * create_index() failed afterwards. */
        cmph_dump( hash, hash_file );
        cmph_destroy( hash );
    }
    /* NOTE(review): the adapter is released with plain free(), as in the
     * original; confirm cmph_io_fastq_adapter() has no dedicated destroy. */
    free( source );
    if( hash_file != NULL )
    {
        fclose( hash_file );
    }
    if( fastq_file != NULL )
    {
        bgzf_close( fastq_file );
    }
    free( hash_path );
    free( seek_path );

    return ret;
}
Ejemplo n.º 10
0
/*
 * Generates the BRZ function for the keys of mph->key_source and writes it
 * to brz->mphf_fd.
 *
 * Phase 1 (partitioning): the brz->m keys are read one by one and appended to
 * an in-memory buffer as [keylen][key bytes] records. Whenever the next
 * record would not fit in brz->memory_availability bytes, the buffered
 * records are written, grouped by bucket (hash under brz->h0 modulo brz->k),
 * to a temporary file "<tmp_dir><flush-number>.cmph". brz->size[] counts the
 * keys per bucket.
 *
 * Phase 2 (generation): a header is written to brz->mphf_fd, then the
 * temporary files are merged bucket by bucket through a buffer manager; for
 * each completed bucket a function is built with brz->algo (CMPH_FCH or
 * CMPH_BMZ8) and its partial representation is appended to brz->mphf_fd.
 *
 * Returns 1 on success and 0 when a bucket grows too large, more than 1024
 * temporary files were produced, or a per-bucket function cannot be built.
 *
 * NOTE(review): the results of fopen() and fwrite() are not checked, and
 * `nbytes` is assigned but never read.
 */
static int brz_gen_mphf(cmph_config_t *mph)
{
	cmph_uint32 i, e, error;
	brz_config_data_t *brz = (brz_config_data_t *)mph->data;
	cmph_uint32 memory_usage = 0;      /* bytes of `buffer` currently in use */
	cmph_uint32 nkeys_in_buffer = 0;   /* records currently held in `buffer` */
	cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability);
	/* Per-bucket key counts for the current buffer; turned into write offsets at flush time. */
	cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32));
	cmph_uint32 *keys_index = NULL;    /* buffer offsets of the records, in bucket order */
	cmph_uint8 **buffer_merge = NULL;  /* pending (already read) record of each temporary file */
	cmph_uint32 *buffer_h0 = NULL;     /* bucket of each pending record */
	cmph_uint32 nflushes = 0;          /* number of temporary files written */
	cmph_uint32 h0;
	register size_t nbytes;
	FILE *  tmp_fd = NULL;
	buffer_manager_t * buff_manager = NULL;
	char *filename = NULL;
	char *key = NULL;
	cmph_uint32 keylen;
	cmph_uint32 cur_bucket = 0;
	cmph_uint8 nkeys_vd = 0;           /* keys collected so far for the current bucket */
	cmph_uint8 ** keys_vd = NULL;      /* records of the bucket being assembled */
	
	mph->key_source->rewind(mph->key_source->data);
	DEBUGP("Generating graphs from %u keys\n", brz->m);
	// Partitioning
	for (e = 0; e < brz->m; ++e)
	{
		mph->key_source->read(mph->key_source->data, &key, &keylen);

		/* Buffers management */
		if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers 
		{
			if(mph->verbosity)
			{
				fprintf(stderr, "Flushing  %u\n", nkeys_in_buffer);
			}
			/* Replace each non-empty bucket's count with the running total of
			 * the counts before it, i.e. its first slot in keys_index. */
			cmph_uint32 value = buckets_size[0];
			cmph_uint32 sum = 0;
			cmph_uint32 keylen1 = 0;
			buckets_size[0]   = 0;
			for(i = 1; i < brz->k; i++)
			{
				if(buckets_size[i] == 0) continue;
				sum += value;
				value = buckets_size[i];
				buckets_size[i] = sum;
				
			}	
			memory_usage = 0;
			keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
			/* Walk the buffer record by record and place each record's offset
			 * in the next free slot of its bucket. */
			for(i = 0; i < nkeys_in_buffer; i++)
			{
				memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
				h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
				keys_index[buckets_size[h0]] = memory_usage;
				buckets_size[h0]++;
				memory_usage +=  keylen1 + (cmph_uint32)sizeof(keylen1);
			}
			/* NOTE(review): room is reserved for tmp_dir plus 11 bytes, which
			 * covers "<number>.cmph" only while the number has at most 5 digits. */
			filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
			sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
			tmp_fd = fopen(filename, "wb");
			free(filename);
			filename = NULL;
			/* Write the records (length prefix included) in bucket order. */
			for(i = 0; i < nkeys_in_buffer; i++)
			{
				memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
				nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
			}
			/* Start an empty buffer for the following keys. */
			nkeys_in_buffer = 0;
			memory_usage = 0;
			memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
			nflushes++;
			free(keys_index);
			fclose(tmp_fd);
		}
		/* Append the current key to the buffer as [keylen][key bytes]. */
		memcpy(buffer + memory_usage, &keylen, sizeof(keylen));
		memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen);
		memory_usage += keylen + (cmph_uint32)sizeof(keylen);
		h0 = hash(brz->h0, key, keylen) % brz->k;
		
		/* Give up when the bucket is already at MAX_BUCKET_SIZE, or, for
		 * BMZ8 with c >= 1.0, when c * size no longer fits in 8 bits. */
		if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) 
		{
			/* NOTE(review): `key` is not handed to dispose() on this path. */
			free(buffer);
			free(buckets_size);
			return 0;
		}
		brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U);
		buckets_size[h0] ++;
		nkeys_in_buffer++;
		mph->key_source->dispose(mph->key_source->data, key, keylen);
	}
	/* Final flush: same steps as the flush inside the loop, for whatever is
	 * still buffered once all keys have been read. */
	if (memory_usage != 0) // flush buffers 
	{ 
		if(mph->verbosity)
		{
			fprintf(stderr, "Flushing  %u\n", nkeys_in_buffer);
		}
		cmph_uint32 value = buckets_size[0];
		cmph_uint32 sum = 0;
		cmph_uint32 keylen1 = 0;
		buckets_size[0]   = 0;
		for(i = 1; i < brz->k; i++)
		{
			if(buckets_size[i] == 0) continue;
			sum += value;
			value = buckets_size[i];
			buckets_size[i] = sum;
		}
		memory_usage = 0;
		keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32));
		for(i = 0; i < nkeys_in_buffer; i++)
		{
			memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1));
			h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k;
			keys_index[buckets_size[h0]] = memory_usage;
			buckets_size[h0]++;
			memory_usage +=  keylen1 + (cmph_uint32)sizeof(keylen1);
		}
		filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
		sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes);
		tmp_fd = fopen(filename, "wb");
		free(filename);
		filename = NULL;
		for(i = 0; i < nkeys_in_buffer; i++)
		{
			memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1));
			nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd);
		}
		nkeys_in_buffer = 0;
		memory_usage = 0;
		memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32));
		nflushes++;
		free(keys_index);
		fclose(tmp_fd);
	}

	free(buffer);
	free(buckets_size);
	if(nflushes > 1024) return 0; // Too many files generated.
	// mphf generation
	if(mph->verbosity)
	{
		fprintf(stderr, "\nMPHF generation \n");
	}
	/* Starting to dump to disk the resultant MPHF: __cmph_dump function */
	/* Header: algorithm name (with its NUL), m, c, per-bucket algorithm, k, bucket sizes. */
	nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd);
	nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd);
	nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd);
	nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd);
	nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs
	nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd);
	
	//tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *));
	buff_manager = buffer_manager_new(brz->memory_availability, nflushes);
	buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *));
	buffer_h0    = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32));
	
	memory_usage = 0;
	/* Open every temporary file and read its first record; that record
	 * becomes the file's pending entry. The key bytes are taken to start
	 * sizeof(keylen) bytes into the returned record. */
	for(i = 0; i < nflushes; i++)
	{
		filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char));
		sprintf(filename, "%s%u.cmph",brz->tmp_dir, i);
		buffer_manager_open(buff_manager, i, filename);
		free(filename);
		filename = NULL;
		key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
		h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
		buffer_h0[i] = h0;
                buffer_merge[i] = (cmph_uint8 *)key;
                key = NULL; //transfer memory ownership                 
	}
	e = 0;
	keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *));
	nkeys_vd = 0;
	error = 0;
	/* Merge loop: `e` counts the keys moved into keys_vd so far. */
	while(e < brz->m)
	{
		/* Pick a file via brz_min_index() (presumably the one whose pending
		 * record has the smallest bucket number) and work on that bucket. */
		i = brz_min_index(buffer_h0, nflushes);
		cur_bucket = buffer_h0[i];
		key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
		if(key)
		{
			/* Take further records from this file while they belong to the
			 * same bucket as the pending one. */
			while(key)
			{
				//keylen = strlen(key);
				h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k;
				if (h0 != buffer_h0[i]) break;
				keys_vd[nkeys_vd++] = (cmph_uint8 *)key;
				key = NULL; //transfer memory ownership
				e++;
				key = (char *)buffer_manager_read_key(buff_manager, i, &keylen);
			}
			/* A record of another bucket was read: collect the old pending
			 * record and make the new one pending for this file. */
			if (key)
			{
				assert(nkeys_vd < brz->size[cur_bucket]);
				keys_vd[nkeys_vd++] = buffer_merge[i];
				buffer_merge[i] = NULL; //transfer memory ownership
				e++;
				buffer_h0[i] = h0;
				buffer_merge[i] = (cmph_uint8 *)key;
			}
		}
		/* No more records in this file: collect its pending record and mark
		 * the file with UINT_MAX as its bucket. */
		if(!key)
		{
			assert(nkeys_vd < brz->size[cur_bucket]);
			keys_vd[nkeys_vd++] = buffer_merge[i];
			buffer_merge[i] = NULL; //transfer memory ownership
			e++;
			buffer_h0[i] = UINT_MAX;
		}
		
		if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket.
		{
			cmph_io_adapter_t *source = NULL;
			cmph_config_t *config = NULL;
			cmph_t *mphf_tmp = NULL;
			char *bufmphf = NULL;
			cmph_uint32 buflenmphf = 0;
			// Source of keys
			source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd);
			config = cmph_config_new(source);
			cmph_config_set_algo(config, brz->algo);
			//cmph_config_set_algo(config, CMPH_BMZ8);
			cmph_config_set_graphsize(config, brz->c);
			mphf_tmp = cmph_new(config);
			if (mphf_tmp == NULL) 
			{
				/* NOTE(review): pending records still held in buffer_merge
				 * are not released on this path. */
				if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k);
				error = 1;
				cmph_config_destroy(config);
 				brz_destroy_keys_vd(keys_vd, nkeys_vd);
				cmph_io_byte_vector_adapter_destroy(source);
				break;
			}
			if(mph->verbosity) 
			{
			  if (cur_bucket % 1000 == 0) 
  			  {
			  	fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k);
			  }
			}
			/* Obtain the bucket function's partial representation; only FCH
			 * and BMZ8 are handled. */
			switch(brz->algo)
			{
				case CMPH_FCH:
				{
					fch_data_t * fchf = NULL;
					fchf = (fch_data_t *)mphf_tmp->data;			
					bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf);
				}
					break;
				case CMPH_BMZ8:
				{
					bmz8_data_t * bmzf = NULL;
					bmzf = (bmz8_data_t *)mphf_tmp->data;
					bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket,  &buflenmphf);
				}
					break;
				default: assert(0);
			}
		        nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd);
			free(bufmphf);
			bufmphf = NULL;
			cmph_config_destroy(config);
 			brz_destroy_keys_vd(keys_vd, nkeys_vd);
			cmph_destroy(mphf_tmp);
			cmph_io_byte_vector_adapter_destroy(source);
			/* Begin collecting the next bucket. */
			nkeys_vd = 0;
		}
	}
	buffer_manager_destroy(buff_manager);
	free(keys_vd);
	free(buffer_merge);
	free(buffer_h0);
	if (error) return 0;
	return 1;
}
Ejemplo n.º 11
0
/*
 * Build a BRZ minimal perfect hash over the space-separated words of
 * `string` and leave it loaded in *cmphx.
 *
 * *cmphx is allocated when it is NULL, and an already present hash is
 * released with cmph_op_clear() first. The words are tokenized, sorted and
 * de-duplicated, hashed into "/tmp/cmph.mph" (temporary files go to "/tmp/"),
 * and the function is then reloaded from that file into cmphx_ptr->hash with
 * cmphx_ptr->fp left open for reading.
 *
 * Always returns NULL (`str` is never assigned); callers can tell success
 * from failure by whether (*cmphx)->hash ends up set — presumably
 * cmph_op_clear() resets it on the failure path (TODO confirm).
 *
 * Fixes over the previous version: the token array is released on failure
 * paths too, the hash pointer is cleared after cmph_destroy() so that the
 * failure path cannot see a dangling pointer, and a failed reopen/reload of
 * the hash file is treated as a failure instead of being passed on.
 */
char *cmph_op_build(bot_t * bot, cmphx_t ** cmphx, char *string)
{
	cmphx_t *cmphx_ptr = NULL;
	char *str = NULL;

	char **keys = NULL;
	int nkeys = 0;

	debug(NULL, "cmph_op_build: Entered: %p %p %p\n", bot, cmphx, string);

	if (!bot || !cmphx || !_sNULL(string))
		return NULL;

	cmphx_ptr = *cmphx;

	if (!cmphx_ptr) {
		cmphx_ptr = (cmphx_t *) calloc(1, sizeof(cmphx_t));
		if (!cmphx_ptr)
			return NULL;
		*cmphx = cmphx_ptr;
	}

	/* Drop whatever was built by an earlier call. */
	if (cmphx_ptr->hash) {
		cmph_op_clear(bot, cmphx, string);
	}

	keys =
	    tokenize_array(NULL, string,
			   TOKENIZE_NORMAL | TOKENIZE_EATWHITESPACE, " ",
			   &nkeys);
	if (!keys)
		goto cleanup;

	tokenize_sort_strings(keys, &nkeys,
			      TOKENIZE_SORT_STRINGS_FORWARD |
			      TOKENIZE_SORT_STRINGS_UNIQ);

	cmphx_ptr->fp = fopen("/tmp/cmph.mph", "w");
	if (!cmphx_ptr->fp)
		goto cleanup;

	cmphx_ptr->source = cmph_io_vector_adapter((char **)keys, nkeys);
	if (!cmphx_ptr->source)
		goto cleanup;

	cmphx_ptr->config = cmph_config_new(cmphx_ptr->source);
	if (!cmphx_ptr->config)
		goto cleanup;

	cmph_config_set_algo(cmphx_ptr->config, CMPH_BRZ);
	cmph_config_set_tmp_dir(cmphx_ptr->config, (cmph_uint8 *) "/tmp/");
	cmph_config_set_mphf_fd(cmphx_ptr->config, cmphx_ptr->fp);
	cmphx_ptr->hash = cmph_new(cmphx_ptr->config);
	if (!cmphx_ptr->hash)
		goto cleanup;

	cmph_config_destroy(cmphx_ptr->config);
	cmphx_ptr->config = NULL;

	/* Write the function out, then replace the in-memory copy with one
	 * loaded back from the file. */
	cmph_dump(cmphx_ptr->hash, cmphx_ptr->fp);

	cmph_destroy(cmphx_ptr->hash);
	cmphx_ptr->hash = NULL;

	fclose(cmphx_ptr->fp);

	cmphx_ptr->fp = fopen("/tmp/cmph.mph", "r");
	if (!cmphx_ptr->fp)
		goto cleanup;

	cmphx_ptr->hash = cmph_load(cmphx_ptr->fp);
	if (!cmphx_ptr->hash)
		goto cleanup;

	tokenize_destroy_array(NULL, keys);

	debug(NULL, "cmph_op_build: Success\n");

	return str;

/* error */
 cleanup:
	cmph_op_clear(bot, cmphx, string);

	/* The token array is owned by this function on every path. */
	if (keys)
		tokenize_destroy_array(NULL, keys);

	debug(NULL, "cmph_op_build: Failure\n");

	return str;
}