コード例 #1
0
ファイル: cmph.c プロジェクト: dguthrie/ShefLM
/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf);
 *  \brief Report how much space is required to store mphf in packed form.
 *
 *  The request is forwarded to the packed-size routine that matches
 *  mphf->algo.
 *
 *  \param mphf pointer to the function whose packed size is requested
 *  \return the value reported by the algorithm-specific routine, or zero
 *          when mphf->algo matches none of the known algorithms
 */
cmph_uint32 cmph_packed_size(cmph_t *mphf)
{
	cmph_uint32 size = 0; /* remains zero (failure) when no case matches */

	switch (mphf->algo)
	{
		case CMPH_CHM:
			size = chm_packed_size(mphf);
			break;
		case CMPH_BMZ:
			size = bmz_packed_size(mphf);
			break;
		case CMPH_BMZ8:
			size = bmz8_packed_size(mphf);
			break;
		case CMPH_BRZ:
			size = brz_packed_size(mphf);
			break;
		case CMPH_FCH:
			size = fch_packed_size(mphf);
			break;
		case CMPH_BDZ:
			size = bdz_packed_size(mphf);
			break;
		case CMPH_BDZ_PH:
			size = bdz_ph_packed_size(mphf);
			break;
		case CMPH_CHD_PH:
			size = chd_ph_packed_size(mphf);
			break;
		case CMPH_CHD:
			size = chd_packed_size(mphf);
			break;
		default:
			/* Unknown algorithm: aborts when assertions are enabled;
			 * with NDEBUG defined the zero result is returned instead. */
			assert(0);
			break;
	}

	return size;
}
コード例 #2
0
ファイル: chd_ph.c プロジェクト: Blei/gobject-introspection
/**
 * Build a CHD_PH hash function for the key source attached to mph.
 *
 * The routine sizes the bucket and bin tables, then repeats the
 * mapping -> ordering -> searching steps (at most 100 attempts) until the
 * searching step succeeds, and finally compresses the displacement table.
 *
 * \param mph configuration; mph->data is used as a chd_ph_config_data_t
 * \param c   requested load factor, clamped to the range [0.5, 0.99]
 * \return a newly allocated cmph_t that takes over the hash state (hl) and
 *         the compressed sequence (cs) from the configuration, or NULL when
 *         the mapping step fails, every attempt is used up, or a memory
 *         allocation fails. On every NULL return the hash state held by the
 *         configuration is destroyed and reset to NULL.
 */
cmph_t *chd_ph_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	chd_ph_data_t *chd_phf = NULL;
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
	
	register double load_factor = c;
	register cmph_uint8 searching_success = 0;
	register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
	register cmph_uint32 iterations = 100;     // maximum number of mapping/ordering/searching attempts
	chd_ph_bucket_t * buckets = NULL;
	chd_ph_item_t * items = NULL;
	register cmph_uint8 failure = 0;
	cmph_uint32 max_bucket_size = 0;
	chd_ph_sorted_list_t * sorted_lists = NULL;
	cmph_uint32 * disp_table = NULL;
	register double space_lower_bound = 0;
	#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	// Declared up front so that no "goto cleanup" jumps over its initialisation
	double entropy = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
	#endif


	chd_ph->m = mph->key_source->nkeys;
	DEBUGP("m = %u\n", chd_ph->m);
	
	chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
	DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
	
	// Clamp the load factor to [0.5, 0.99]
	if(load_factor < 0.5 )
	{
		load_factor = 0.5;
	}
	
	if(load_factor >= 0.99)
	{
		load_factor = 0.99;
	}
	
	DEBUGP("load_factor = %.3f\n", load_factor);
	
	chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
	
	//Round the number of bins to the prime immediately above
	if(chd_ph->n % 2 == 0) chd_ph->n++;
	for(;;)
	{
		if(check_primality(chd_ph->n) == 1)
			break;
		chd_ph->n += 2; // just odd numbers can be primes for n > 2
	}
	
	DEBUGP("n = %u \n", chd_ph->n);
	if(chd_ph->keys_per_bin == 1)
	{
		space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
	}
	
	if(mph->verbosity)
	{
		fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
	}

	// We allocate the working tables
	buckets = chd_ph_bucket_new(chd_ph->nbuckets); 
	items   = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));

	// Scale the probe budget by log2(m)/20.
	// NOTE(review): m == 0 makes log() return -inf and m == 1 yields a budget
	// of zero probes -- confirm callers never pass such key sets.
	max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
	
	// With one key per bin the table is sized as ceil(n/32) 32-bit words
	// (enough for one bit per bin); otherwise it holds one byte per bin.
	if(chd_ph->keys_per_bin == 1)
		chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
	else
		chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
		
	disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));

	// Bail out before any table is dereferenced if an allocation failed.
	// calloc(0, ...) may legitimately return NULL, hence the m > 0 test for items.
	if(buckets == NULL || (items == NULL && chd_ph->m > 0) ||
	   chd_ph->occup_table == NULL || disp_table == NULL)
	{
		if (mph->verbosity)
		{
			fprintf(stderr, "Failure allocating memory for the working tables\n");
		}
		failure = 1;
		goto cleanup;
	}
	
	while(1)
	{
		iterations --;
		if (mph->verbosity)
		{
			fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
		}
		
		if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
		{
			if (mph->verbosity)
			{
				fprintf(stderr, "Failure in mapping step\n");		
			}
			failure = 1;
			goto cleanup;
		}

		if (mph->verbosity)
		{
			fprintf(stderr, "Starting ordering step\n");
		}

		// Drop the lists of the previous attempt; free(NULL) is a no-op
		free(sorted_lists);
		sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
		
		if (mph->verbosity)
		{
			fprintf(stderr, "Starting searching step\n");
		}
		
		searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
		if(searching_success) break;
		
		// reset occup_table (same sizes as used for its allocation)
		if(chd_ph->keys_per_bin > 1)
			memset(chd_ph->occup_table, 0, chd_ph->n);
		else
			memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
		if(iterations == 0)
		{
			// Cleanup memory
			if (mph->verbosity)
			{
				fprintf(stderr, "Failure because the max trials was exceeded\n");
			}
			failure = 1;
			goto cleanup;
		}
	}

	#ifdef DEBUG
	{
		if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
		{
		
			DEBUGP("Error for bin packing generation");
			failure = 1;
			goto cleanup;
		}
	}
	#endif
	
	if (mph->verbosity)
	{
		fprintf(stderr, "Starting compressing step\n");
	}
	
	// Replace any compressed sequence left in the configuration
	free(chd_ph->cs);
	chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
	if(chd_ph->cs == NULL)
	{
		if (mph->verbosity)
		{
			fprintf(stderr, "Failure allocating memory for the compressed sequence\n");
		}
		failure = 1;
		goto cleanup;
	}
	compressed_seq_init(chd_ph->cs);
	compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
	
	#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
	DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
	#endif

cleanup:
	// The working tables are released on both the success and the failure path
	if(buckets != NULL)
	{
		chd_ph_bucket_destroy(buckets); 
	}
	free(items);
	free(sorted_lists);
	free(disp_table);

	if(!failure)
	{
		// Allocate the result objects; a failure here joins the common failure path
		mphf = (cmph_t *)malloc(sizeof(cmph_t));
		chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
		if(mphf == NULL || chd_phf == NULL)
		{
			failure = 1;
		}
	}

	if(failure) 
	{
		free(mphf);
		free(chd_phf);
		if(chd_ph->hl)
		{
			hash_state_destroy(chd_ph->hl);
		}
		chd_ph->hl = NULL;
		return NULL;
	}

	mphf->algo = mph->algo;
	
	chd_phf->cs = chd_ph->cs;
	chd_ph->cs = NULL; //transfer memory ownership
	chd_phf->hl = chd_ph->hl;
	chd_ph->hl = NULL; //transfer memory ownership
	chd_phf->n = chd_ph->n;
	chd_phf->nbuckets = chd_ph->nbuckets;
	
	mphf->data = chd_phf;
	mphf->size = chd_ph->n;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	
	#ifdef CMPH_TIMING	
	register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
	#endif	

	return mphf;
}