Exemplo n.º 1
0
/*
 * Dump the map to stdout: one "count: N" line with the value reported by
 * map_count(), then whatever hashmap_print emits for each entry visited by
 * map_foreach(), and finally a terminating newline.
 */
static void
print_all(void)
{
	printf("count: %lu\n", map_count(mapc, map));

	/* No user data is handed to the per-entry callback. */
	map_foreach(mapc, map, hashmap_print, NULL);

	putchar('\n');
}
Exemplo n.º 2
0
/*
** Read the file behind fd in chunks of at most BUFFSIZE bytes and feed each
** chunk to the map helpers, leaving the result in mp->mem.
**
** mp:  map to (re)initialise and fill; mp->error is set to 1 when read() or
**      close() fails.
** fd:  open descriptor to read from; it is closed before returning.
** buf: caller-provided scratch buffer of at least BUFFSIZE bytes.
**
** The first chunk goes through cut_first_line_take_char(); every later chunk
** goes through map_count() and ft_strstr(). ft_map_max() runs once at the end.
*/
void ft_count_clone_file(map *mp, int fd, char *buf)
{
	ssize_t	nread;
	int		first_chunk;

	first_chunk = 1;
	map_init(mp);
	/* Stop on end-of-file (0) AND on error (-1); -1 used to keep the loop alive. */
	while ((nread = read(fd, buf, BUFFSIZE)) > 0)
	{
		/*
		** A short read leaves bytes of the previous chunk behind the fresh
		** data; cut them off so the helpers do not process them again.
		** NOTE(review): a full BUFFSIZE read is only terminated if the caller
		** allocated BUFFSIZE + 1 zeroed bytes - confirm at the call site.
		*/
		if (nread < BUFFSIZE)
			buf[nread] = '\0';
		if (first_chunk)
		{
			mp->mem = cut_first_line_take_char(buf, mp);
			first_chunk = 0;
		}
		else
		{
			map_count(mp, buf);
			/*
			** NOTE(review): if ft_strstr behaves like strstr this searches
			** for buf inside mp->mem rather than appending - confirm intent.
			*/
			mp->mem = ft_strstr(mp->mem, buf);
		}
		printf("mp.mem:%s\n", mp->mem);
	}
	if (nread < 0)
		mp->error = 1;
	if (close(fd))
		mp->error = 1;
	ft_map_max(mp);
}
Exemplo n.º 3
0
// Number of entries in the set, as reported by map_count() on the
// implementation object's map.
size_t PackageSet::size() const
{
	return map_count(&pImpl->map);
}
Exemplo n.º 4
0
int main(int argc, char **argv) {
	setlocale(LC_ALL, ""); // Comment-out on non-Posix systems
	clock_t time_start = clock();
	time_t time_t_start;
	time(&time_t_start);
	argv_0_basename = basename(argv[0]);
	get_usage_string(usage, USAGE_LEN); // This is a big scary string, so build it elsewhere

	//printf("sizeof(cmd_args)=%zd\n", sizeof(cmd_args));
	parse_cmd_args(argc, argv, usage, &cmd_args);

	if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN)
		memusage += sizeof(float) * ENTROPY_TERMS_MAX; // We'll build the precomputed entropy terms after reporting memusage

	struct_model_metadata global_metadata;

	// The list of unique words should always include <s>, unknown word, and </s>
	map_update_count(&word_map, UNKNOWN_WORD, 0, 0); // Should always be first
	map_update_count(&word_map, "<s>", 0, 1);
	map_update_count(&word_map, "</s>", 0, 2);

	// Open input
	FILE *in_train_file = stdin;
	if (in_train_file_string)
		in_train_file = fopen(in_train_file_string, "r");
	if (in_train_file == NULL) {
		fprintf(stderr, "%s: Error: Unable to open input file  %s\n", argv_0_basename, in_train_file_string); fflush(stderr);
		exit(15);
	}

	// Process input sentences
	size_t input_memusage = 0;
	const struct_model_metadata input_model_metadata = process_input(cmd_args, in_train_file, &word_map, &initial_bigram_map, &input_memusage);
	memusage += input_memusage;
	fclose(in_train_file);

	clock_t time_input_processed = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Corpus processed in %'.2f CPU secs. %'lu lines, %'u types, %'lu tokens, current memusage: %'.1fMB\n", argv_0_basename, (double)(time_input_processed - time_start)/CLOCKS_PER_SEC, input_model_metadata.line_count, input_model_metadata.type_count, input_model_metadata.token_count, (double)memusage / 1048576); fflush(stderr);

	global_metadata.token_count = input_model_metadata.token_count;
	global_metadata.type_count  = map_count(&word_map);

	// Filter out infrequent words, reassign word_id's, and build a mapping from old word_id's to new word_id's
	sort_by_count(&word_map);
	word_id_t * restrict word_id_remap = calloc(sizeof(word_id_t), input_model_metadata.type_count);
	get_ids(&word_map, word_id_remap);
	word_id_t number_of_deleted_words = filter_infrequent_words(cmd_args, &global_metadata, &word_map, word_id_remap);

	// Get list of unique words
	char * * restrict word_list = (char **)malloc(sizeof(char*) * global_metadata.type_count);
	memusage += sizeof(char*) * global_metadata.type_count;
	reassign_word_ids(&word_map, word_list, word_id_remap);
	get_keys(&word_map, word_list);
	sort_by_id(&word_map);


	// Check or set number of classes
	if (cmd_args.num_classes >= global_metadata.type_count) { // User manually set number of classes is too low
		fprintf(stderr, "%s: Error: Number of classes (%u) is not less than vocabulary size (%u).  Decrease the value of --classes\n", argv_0_basename, cmd_args.num_classes, global_metadata.type_count); fflush(stderr);
		exit(3);
	} else if (cmd_args.num_classes == 0) { // User did not manually set number of classes at all
		cmd_args.num_classes = (wclass_t) (sqrt(global_metadata.type_count) * 1.2);
	}

	// Build array of word_counts
	word_count_t * restrict word_counts = malloc(sizeof(word_count_t) * global_metadata.type_count);
	memusage += sizeof(word_count_t) * global_metadata.type_count;
	build_word_count_array(&word_map, word_list, word_counts, global_metadata.type_count);

	// Initialize clusters, and possibly read-in external class file
	wclass_t * restrict word2class = malloc(sizeof(wclass_t) * global_metadata.type_count);
	memusage += sizeof(wclass_t) * global_metadata.type_count;
	init_clusters(cmd_args, global_metadata.type_count, word2class, word_counts, word_list);
	if (initial_class_file != NULL)
		import_class_file(&word_map, word2class, initial_class_file, cmd_args.num_classes); // Overwrite subset of word mappings, from user-provided initial_class_file

	// Remap word_id's in initial_bigram_map
	remap_and_rev_bigram_map(&initial_bigram_map, &new_bigram_map, &new_bigram_map_rev, word_id_remap, map_find_id(&word_map, UNKNOWN_WORD, -1));
	global_metadata.start_sent_id = map_find_id(&word_map, "<s>", -1);; // need this for tallying emission probs
	global_metadata.end_sent_id   = map_find_id(&word_map, "</s>", -1);; // need this for tallying emission probs
	global_metadata.line_count    = map_find_count(&word_map, "</s>"); // Used for calculating perplexity

	if (global_metadata.line_count == 0) {
		fprintf(stderr, "%s: Warning: Number of lines is 0.  Include <s> and </s> in your ngram counts, or perplexity values will be unreliable.\n", argv_0_basename); fflush(stderr);
	}

	//printf("init_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); fflush(stdout);
	//printf("new_bigram_map hash_count=%u\n", HASH_COUNT(new_bigram_map)); fflush(stdout);
	free(word_id_remap);
	memusage -= sizeof(word_id_t) * input_model_metadata.type_count;
	delete_all(&word_map); // static
	delete_all_bigram(&initial_bigram_map); // static
	memusage -= input_memusage;

	// Initialize and set word bigram listing
	clock_t time_bigram_start = clock();
	size_t bigram_memusage = 0; size_t bigram_rev_memusage = 0;
	struct_word_bigram_entry * restrict word_bigrams = NULL;
	struct_word_bigram_entry * restrict word_bigrams_rev = NULL;

	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Word bigram listing ... ", argv_0_basename); fflush(stderr);

	#pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel
	{
		#pragma omp section
		{
			//sort_bigrams(&new_bigram_map); // speeds things up later
			word_bigrams = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
			memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
			bigram_memusage = set_bigram_counts(word_bigrams, new_bigram_map);
			// Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
			for (word_id_t word = 0; word < global_metadata.type_count; word++)
				word_bigrams[word].headword_count = word_counts[word];
		}

		// Initialize and set *reverse* word bigram listing
		#pragma omp section
		{
			if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
				//sort_bigrams(&new_bigram_map_rev); // speeds things up later
				word_bigrams_rev = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
				memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
				bigram_rev_memusage = set_bigram_counts(word_bigrams_rev, new_bigram_map_rev);
				// Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
				for (word_id_t word = 0; word < global_metadata.type_count; word++)
					word_bigrams_rev[word].headword_count = word_counts[word];
			}
		}
	}

	delete_all_bigram(&new_bigram_map);
	delete_all_bigram(&new_bigram_map_rev);
	memusage += bigram_memusage + bigram_rev_memusage;
	clock_t time_bigram_end = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "in %'.2f CPU secs.  Bigram memusage: %'.1f MB\n", (double)(time_bigram_end - time_bigram_start)/CLOCKS_PER_SEC, (bigram_memusage + bigram_rev_memusage)/(double)1048576); fflush(stderr);

	//print_word_bigrams(global_metadata, word_bigrams, word_list);

	// Build <v,c> counts, which consists of a word followed by a given class
	word_class_count_t * restrict word_class_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
	if (word_class_counts == NULL) {
		fprintf(stderr,  "%s: Error: Unable to allocate enough memory for <v,c>.  %'.1f MB needed.  Maybe increase --min-count\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
		exit(13);
	}
	memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
	fprintf(stderr, "%s: Allocating %'.1f MB for word_class_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
	build_word_class_counts(cmd_args, word_class_counts, word2class, word_bigrams, global_metadata.type_count/*, word_list*/);
	//print_word_class_counts(cmd_args, global_metadata, word_class_counts);

	// Build reverse: <c,v> counts: class followed by word.  This and the normal one are both pretty fast, so no need to parallelize this
	word_class_count_t * restrict word_class_rev_counts = NULL;
	if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
		word_class_rev_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
		if (word_class_rev_counts == NULL) {
			fprintf(stderr,  "%s: Warning: Unable to allocate enough memory for <v,c>.  %'.1f MB needed.  Falling back to --rev-alternate 0\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
			cmd_args.rev_alternate = 0;
		} else {
			memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
			fprintf(stderr, "%s: Allocating %'.1f MB for word_class_rev_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
			build_word_class_counts(cmd_args, word_class_rev_counts, word2class, word_bigrams_rev, global_metadata.type_count/*, word_list*/);
		}

	}

	// Calculate memusage for count_arrays
	for (unsigned char i = 1; i <= cmd_args.max_array; i++) {
		memusage += 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t));
		//printf("11 memusage += %zu (now=%zu) count_arrays\n", 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), memusage); fflush(stdout);
	}

	clock_t time_model_built = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Finished loading %'lu tokens and %'u types (%'u filtered) from %'lu lines in %'.2f CPU secs\n", argv_0_basename, global_metadata.token_count, global_metadata.type_count, number_of_deleted_words, global_metadata.line_count, (double)(time_model_built - time_start)/CLOCKS_PER_SEC); fflush(stderr);
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Approximate memory usage at clustering: %'.1fMB\n", argv_0_basename, (double)memusage / 1048576); fflush(stderr);

	cluster(cmd_args, global_metadata, word_counts, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);

	// Now print the final word2class mapping
	if (cmd_args.verbose >= 0) {
		FILE *out_file = stdout;
		if (out_file_string)
			out_file = fopen(out_file_string, "w");
		if (out_file == NULL) {
			fprintf(stderr, "%s: Error: Unable to open output file  %s\n", argv_0_basename, out_file_string); fflush(stderr);
			exit(16);
		}
		if (cmd_args.class_algo == EXCHANGE && (!cmd_args.print_word_vectors)) {
			print_words_and_classes(out_file, global_metadata.type_count, word_list, word_counts, word2class, (int)cmd_args.class_offset, cmd_args.print_freqs);
		} else if (cmd_args.class_algo == EXCHANGE && cmd_args.print_word_vectors) {
			print_words_and_vectors(out_file, cmd_args, global_metadata, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);
		}
		fclose(out_file);
	}

	clock_t time_clustered = clock();
	time_t time_t_end;
	time(&time_t_end);
	double time_secs_total = difftime(time_t_end, time_t_start);
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Finished clustering in %'.2f CPU seconds.  Total wall clock time was about %lim %lis\n", argv_0_basename, (double)(time_clustered - time_model_built)/CLOCKS_PER_SEC, (long)time_secs_total/60, ((long)time_secs_total % 60)  );

	free(word2class);
	free(word_bigrams);
	free(word_list);
	free(word_counts);
	exit(0);
}