Example #1
0
File: main.c Project: AlgoLab/Tagli
int main(int argc, char **argv)
{
	cmph_uint32 verbosity = 0;
	char generate = 0;
	char *mphf_file = NULL;
	FILE *mphf_fd = stdout;
	const char *keys_file = NULL;
	FILE *keys_fd;
	cmph_uint32 nkeys = UINT_MAX;
	cmph_uint32 seed = UINT_MAX;
	CMPH_HASH *hashes = NULL;
	cmph_uint32 nhashes = 0;
	cmph_uint32 i;
	CMPH_ALGO mph_algo = CMPH_CHM;
	double c = 0;
	cmph_config_t *config = NULL;
	cmph_t *mphf = NULL;
	char * tmp_dir = NULL;
	cmph_io_adapter_t *source;
	cmph_uint32 memory_availability = 0;
	cmph_uint32 b = 0;
	cmph_uint32 keys_per_bin = 1;
	while (1)
	{
		char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
		if (ch == -1) break;
		switch (ch)
		{
			case 's':
				{
					char *cptr;
					seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid seed %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'c':
				{
					char *endptr;
					c = strtod(optarg, &endptr);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid c value %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'g':
				generate = 1;
				break;
			case 'k':
			        {
					char *endptr;
					nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
					if(*endptr != 0) {
						fprintf(stderr, "Invalid number of keys %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'm':
				mphf_file = strdup(optarg);
				break;
			case 'd':
				tmp_dir = strdup(optarg);
				break;
			case 'M':
				{
					char *cptr;
					memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Invalid memory availability %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'b':
				{
					char *cptr;
					b =  (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter b was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 't':
				{
					char *cptr;
					keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
					if(*cptr != 0) {
						fprintf(stderr, "Parameter t was not found: %s\n", optarg);
						exit(1);
					}
				}
				break;
			case 'v':
				++verbosity;
				break;
			case 'V':
				printf("%s\n", VERSION);
				return 0;
			case 'h':
				usage_long(argv[0]);
				return 0;
			case 'a':
				{
				char valid = 0;
				for (i = 0; i < CMPH_COUNT; ++i)
				{
					if (strcmp(cmph_names[i], optarg) == 0)
					{
						mph_algo = (CMPH_ALGO)i;
						valid = 1;
						break;
					}
				}
				if (!valid)
				{
					fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
					return -1;
				}
				}
				break;
			case 'f':
				{
				char valid = 0;
				for (i = 0; i < CMPH_HASH_COUNT; ++i)
				{
					if (strcmp(cmph_hash_names[i], optarg) == 0)
					{
						hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
						hashes[nhashes] = (CMPH_HASH)i;
						hashes[nhashes + 1] = CMPH_HASH_COUNT;
						++nhashes;
						valid = 1;
						break;
					}
				}
				if (!valid)
				{
					fprintf(stderr, "Invalid hash function: %s\n", optarg);
					return -1;
				}
				}
				break;
			default:
				usage(argv[0]);
				return 1;
		}
	}

	if (optind != argc - 1)
	{
		usage(argv[0]);
		return 1;
	}
	keys_file = argv[optind];

	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
	srand(seed);
	int ret = 0;
	if (mphf_file == NULL)
	{
		mphf_file = (char *)malloc(strlen(keys_file) + 5);
		memcpy(mphf_file, keys_file, strlen(keys_file));
		memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
	}

	keys_fd = fopen(keys_file, "r");

	if (keys_fd == NULL)
	{
		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
		return -1;
	}

	if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
	if (generate)
	{
		//Create mphf
		mphf_fd = fopen(mphf_file, "w");
		config = cmph_config_new(source);
		cmph_config_set_algo(config, mph_algo);
		if (nhashes) cmph_config_set_hashfuncs(config, hashes);
		cmph_config_set_verbosity(config, verbosity);
		cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
		cmph_config_set_mphf_fd(config, mphf_fd);
		cmph_config_set_memory_availability(config, memory_availability);
		cmph_config_set_b(config, b);
		cmph_config_set_keys_per_bin(config, keys_per_bin);

		//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
		if(mph_algo == CMPH_BMZ  && c >= 2.0) c=1.15;
		if (c != 0) cmph_config_set_graphsize(config, c);
		mphf = cmph_new(config);

		cmph_config_destroy(config);
		if (mphf == NULL)
		{
			fprintf(stderr, "Unable to create minimum perfect hashing function\n");
			//cmph_config_destroy(config);
			free(mphf_file);
			return -1;
		}

		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		cmph_dump(mphf, mphf_fd);
		cmph_destroy(mphf);
		fclose(mphf_fd);
	}
	else
	{
		cmph_uint8 * hashtable = NULL;
		mphf_fd = fopen(mphf_file, "r");
		if (mphf_fd == NULL)
		{
			fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
			free(mphf_file);
			return -1;
		}
		mphf = cmph_load(mphf_fd);
		fclose(mphf_fd);
		if (!mphf)
		{
			fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
			free(mphf_file);
			return -1;
		}
		cmph_uint32 siz = cmph_size(mphf);
		hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
		memset(hashtable, 0,(size_t) siz);
		//check all keys
		for (i = 0; i < source->nkeys; ++i)
		{
			cmph_uint32 h;
			char *buf;
			cmph_uint32 buflen = 0;
			source->read(source->data, &buf, &buflen);
			h = cmph_search(mphf, buf, buflen);
			if (!(h < siz))
			{
				fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf);
				ret = 1;
			} else if(hashtable[h] >= keys_per_bin)
			{
				fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
				fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf);
				ret = 1;
			} else hashtable[h]++;

			if (verbosity)
			{
				printf("%s -> %u\n", buf, h);
			}
			source->dispose(source->data, buf, buflen);
		}

		cmph_destroy(mphf);
		free(hashtable);
	}
	fclose(keys_fd);
	free(mphf_file);
	free(tmp_dir);
        cmph_io_nlfile_adapter_destroy(source);
	return ret;

}
Example #2
0
cmph_t *chd_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	chd_data_t *chdf = NULL;
	chd_config_data_t *chd = (chd_config_data_t *)mph->data;
	chd_ph_config_data_t * chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data;
	compressed_rank_t cr;
	
	register cmph_t * chd_phf = NULL;
	register cmph_uint32 packed_chd_phf_size = 0; 
	cmph_uint8 * packed_chd_phf = NULL;
	
	register cmph_uint32 packed_cr_size = 0; 
	cmph_uint8 * packed_cr = NULL;

	register cmph_uint32 i, idx, nkeys, nvals, nbins;
	cmph_uint32 * vals_table = NULL;
	register cmph_uint32 * occup_table = NULL;
	#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
	#endif

	cmph_config_set_verbosity(chd->chd_ph, mph->verbosity);	
	cmph_config_set_graphsize(chd->chd_ph, c);
	
	if (mph->verbosity)
	{
		fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c);
	}
	
	chd_phf = cmph_new(chd->chd_ph);
	
	if(chd_phf == NULL) 
	{
		return NULL;
	}
	
	packed_chd_phf_size = cmph_packed_size(chd_phf); 
	DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size);
	
	/* Make sure that we have enough space to pack the mphf. */
	packed_chd_phf = calloc((size_t)packed_chd_phf_size,(size_t)1);

	/* Pack the mphf. */
	cmph_pack(chd_phf, packed_chd_phf);

	cmph_destroy(chd_phf);
	
	
	if (mph->verbosity)
	{
		fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n");
	}

	compressed_rank_init(&cr);
	nbins = chd_ph->n;
	nkeys = chd_ph->m;
	nvals =  nbins - nkeys; 
	
	vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32));
	occup_table = (cmph_uint32 *)chd_ph->occup_table;
	
	for(i = 0, idx = 0; i < nbins; i++)
	{
		if(!GETBIT32(occup_table, i))
		{
			vals_table[idx++] = i;
		}
	}
	
	compressed_rank_generate(&cr, vals_table, nvals);
	free(vals_table);
	
	packed_cr_size = compressed_rank_packed_size(&cr);
	packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8));
	compressed_rank_pack(&cr, packed_cr);
	compressed_rank_destroy(&cr);

	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	mphf->algo = mph->algo;
	chdf = (chd_data_t *)malloc(sizeof(chd_data_t));
	
	chdf->packed_cr = packed_cr;
	packed_cr = NULL; //transfer memory ownership

	chdf->packed_chd_phf = packed_chd_phf;
	packed_chd_phf = NULL; //transfer memory ownership
	
	chdf->packed_chd_phf_size = packed_chd_phf_size;
	chdf->packed_cr_size = packed_cr_size;
	
	mphf->data = chdf;
	mphf->size = nkeys;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	#ifdef CMPH_TIMING	
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	register cmph_uint32 space_usage =  chd_packed_size(mphf)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys);
	#endif	

	return mphf;
}