Example #1
/**	Function clustering_via_seeds ()
 *
 *	Given fragments, each represented by a list of seeds in [list_seeds],
 *	cluster the fragments by pairwise comparison of those sharing the same
 *	seed. Two fragments are placed in the same cluster if they meet the
 *	max_mismatch criterion. Output: [uf_clst], which maps each fragID to its
 *	clsID, where the clsID is the smallest fragID in the cluster.
 *
 *	Method: a global union-find structure [uf_clst] is initialized with one
 *	entry per fragment. Clustering iterates through each seed: boundaries in
 *	[list_seeds] are identified such that each chunk of fragments shares the
 *	same seed. Clustering can then be applied in parallel: within the
 *	clustering function, a local union-find structure is built, and
 *	[uf_clst] is only read, never written, during fragment comparison. Once
 *	the local clusters are generated, [uf_clst] is updated to reflect them.
 *	This read-during/merge-after split is what makes the OpenMP
 *	parallelization safe.
 */
void clustering_via_seeds (ivec_t& uf_clst, ii64vec_t& list_seeds,
		int num_seed, int max_mismatch, bool silent) {
	bool debug = false;

	if (list_seeds.size() == 0 || list_seeds[0].size() == 0) {
		abording ("DuplRm.cpp -- clustering () SC failed");
	}

	int sz = list_seeds.size();

	// -------- cluster according to seed i ---------
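	// Pigeonhole: two fragments with at most max_mismatch mismatches must
	// agree on at least one of any (max_mismatch + 1) disjoint seeds, so
	// checking that many seeds is sufficient (assuming the seeds partition
	// the fragment).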
	int num_seed_to_check = std::min(num_seed, max_mismatch + 1);
	num_seed_to_check = std::min (5, num_seed_to_check); // cap at 5 iterations
	for (int seed_i = 0; seed_i < num_seed_to_check; ++ seed_i) {

		if (!silent) {
			std::cout << "\t\tcluster by seed " << seed_i << "\n";
		}

		std::sort (list_seeds.begin(), list_seeds.end(), cmp_seed(seed_i));

		// linear scan the sorted [list_seeds] wrt the ith seed, and
		// generate 2d vector, where each dimension stores the indices of
		// [list_seeds] that share the same seed
		iivec_t init_clusters (1, ivec_t{0});
		for (int i = 1; i < sz; ++ i) {
			if (list_seeds[i][seed_i] == list_seeds[i - 1][seed_i]) {
				init_clusters.rbegin()->push_back(i);

				/*
				if (debug) { // print out substring w/ large count
					if (init_clusters.rbegin()->size() > 100000) {
						std::string fwd_str = xny::ID2Str<int64_t>
							(list_seeds[i][seed_i], 31);
						std::cout << fwd_str << "\t";
						std::cout << xny::get_rvc_str(fwd_str) << "\n";
						exit(1);
					}
				} */
			} else init_clusters.push_back({i});
		}

		int init_sz = init_clusters.size();

		if (!silent){
			std::cout << "\t\t" << init_sz << " clusters to validate\n";
		}
		//------- generate clusters: parallel clustering for each chunk
		// of boundary then merge to the global cluster -------------
		validate_clusters (uf_clst, init_clusters, list_seeds, max_mismatch,
				20000);

	} // for (int seed_i = 0; seed_i < num_seed_to_check; ++ seed_i)

} // clustering_via_seeds
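
A note on the union-find structure assumed above: [uf_clst] is an ivec_t, i.e. a flat parent array indexed by fragID. The helpers below are a minimal, hypothetical sketch of the find/unite operations such an array supports (they are not part of the original source; the names and the path-halving detail are assumptions), with the smaller fragID kept as representative so that the clsID ends up being the smallest fragID in the cluster:

#include <vector>

using ivec_t = std::vector<int>;

// Follow parent links to the cluster representative, halving the path as we go.
int find_root (ivec_t& uf, int x) {
	while (uf[x] != x) {
		uf[x] = uf[uf[x]]; // path halving keeps later lookups cheap
		x = uf[x];
	}
	return x;
}

// Merge the clusters containing fragments a and b; the smaller root wins.
void unite (ivec_t& uf, int a, int b) {
	int ra = find_root(uf, a), rb = find_root(uf, b);
	if (ra == rb) return;
	if (ra < rb) uf[rb] = ra;
	else uf[ra] = rb;
}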
Example #2
/**	Function clustering_via_ss ()
 *
 *	Input: 1) fragments in binary representation [list_seeds]
 *	       2) the read files [f] and [f2], from which a super-sketch is
 *	          generated for each fragment
 */
void clustering_via_ss (ivec_t& uf_clst, const ii64vec_t& list_seeds,
	const std::string& f, const std::string& f2, int batch,
	xny::sketch_list& slistgen, xny::super_sketch& ssgen, int max_mismatch,
	bool silent) {

	int sz = list_seeds.size();
	if (sz == 0) return;

	// ------- multiple iterations of sketching ------------------

	int pre_sz = 1, iter = 0;

	while (true) {

		if (! silent) std::cout << "\n\t\tsketching iteration: " << iter << "\n";
		++ iter;

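		// re-seed the hash for this pass so each iteration samples different
		// sketches and can merge near-duplicates the previous pass missed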
		jaz::murmur264 hashfunc (rand() % RAND_MAX);

		// -----------------------  sketching ---------------------------
		if (! silent) std::cout << "\t\t\tgenerate super sketches ...\n";
		std::vector<sketch_t> super_sketches;
		get_super_sketches (super_sketches, f, f2, slistgen, ssgen,
				hashfunc, batch, silent);
		std::sort(super_sketches.begin(), super_sketches.end(),
					xny::cmp_sketch());

		if (sz != (int) list_seeds.size()) abording ("clustering_via_ss SC failed.");


		// In [init_clusters], each 1d elem stores the indices of
		// [list_seeds] that share the same super sketch, where in [list_seeds],
		// the index i should be equal to fragID
		iivec_t init_clusters (1, ivec_t{super_sketches[0].second});
		for (int i = 1; i < sz; ++ i) {
			if (super_sketches[i].first == super_sketches[i-1].first) {
				init_clusters.rbegin()->push_back(super_sketches[i].second);
			} else init_clusters.push_back({super_sketches[i].second});
		}

		int init_sz = init_clusters.size();

		if (!silent){
			std::cout << "\t\t\t" << init_sz << " clusters to validate\n";
		}

		validate_clusters (uf_clst, init_clusters, list_seeds, max_mismatch,
				INT_MAX);

		// ---- generate final union find clusters ------
		iivec_t clusters;
		uf_generate_cls (clusters, uf_clst);

		// ---- generate the duplicated fragment IDs ----------------
		// iteration ending criteria % duplicate ID increase < 5%
		iset_t duplIDs;
		for (int i = 0; i < (int) clusters.size(); ++ i) {
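			// the first entry in each cluster is the representative (smallest
			// fragID); everything after it counts as a duplicate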
			duplIDs.insert(clusters[i].begin() + 1, clusters[i].end());
		}
		int perc_incr = 100 * ((int) duplIDs.size() - pre_sz) / pre_sz;
		if (!silent){
			std::cout << "\n\t\t\tduplicates: " << duplIDs.size() << " (" <<
					perc_incr << " % increase)\n" ;
		}
		if (perc_incr < 5) break;
		else pre_sz = (int) duplIDs.size();
	} // while (true)

} // clustering_via_ss
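
The sketch machinery used above (xny::sketch_list, xny::super_sketch, jaz::murmur264) is external to this listing. In the listing, each super-sketch is a (value, fragID) pair; sorting by value and grouping equal values yields the initial clusters. As a rough illustration of the underlying MinHash idea (not the library's actual API; hash_kmer and min_hash_sketch below are hypothetical), a fragment's sketch keeps the k smallest k-mer hashes, so near-duplicate fragments are likely to produce identical sketches and land in the same initial cluster:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Hypothetical stand-in for a seeded hash such as jaz::murmur264.
static uint64_t hash_kmer (const std::string& kmer, uint64_t seed) {
	return std::hash<std::string>{}(kmer) ^ (seed * 0x9e3779b97f4a7c15ULL);
}

// Keep the k smallest distinct k-mer hashes of a fragment.
std::vector<uint64_t> min_hash_sketch (const std::string& frag,
		size_t kmer_len, size_t k, uint64_t seed) {
	std::vector<uint64_t> hashes;
	for (size_t i = 0; i + kmer_len <= frag.size(); ++ i)
		hashes.push_back(hash_kmer(frag.substr(i, kmer_len), seed));
	std::sort(hashes.begin(), hashes.end());
	hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
	if (hashes.size() > k) hashes.resize(k);
	return hashes; // near-duplicates tend to share this sketch
}

Re-seeding the hash (as the loop above does once per iteration) gives each pass an independent chance to group fragments whose sketches happened to differ in earlier passes.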
Example #3
int main(int argc, char **argv) {
	setlocale(LC_ALL, ""); // Comment-out on non-Posix systems
	clock_t time_start = clock();
	time_t time_t_start;
	time(&time_t_start);
	argv_0_basename = basename(argv[0]);
	get_usage_string(usage, USAGE_LEN); // This is a big scary string, so build it elsewhere

	//printf("sizeof(cmd_args)=%zd\n", sizeof(cmd_args));
	parse_cmd_args(argc, argv, usage, &cmd_args);

	if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN)
		memusage += sizeof(float) * ENTROPY_TERMS_MAX; // We'll build the precomputed entropy terms after reporting memusage

	struct_model_metadata global_metadata;

	// The list of unique words should always include <s>, unknown word, and </s>
	map_update_count(&word_map, UNKNOWN_WORD, 0, 0); // Should always be first
	map_update_count(&word_map, "<s>", 0, 1);
	map_update_count(&word_map, "</s>", 0, 2);

	// Open input
	FILE *in_train_file = stdin;
	if (in_train_file_string)
		in_train_file = fopen(in_train_file_string, "r");
	if (in_train_file == NULL) {
		fprintf(stderr, "%s: Error: Unable to open input file  %s\n", argv_0_basename, in_train_file_string); fflush(stderr);
		exit(15);
	}

	// Process input sentences
	size_t input_memusage = 0;
	const struct_model_metadata input_model_metadata = process_input(cmd_args, in_train_file, &word_map, &initial_bigram_map, &input_memusage);
	memusage += input_memusage;
	fclose(in_train_file);

	clock_t time_input_processed = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Corpus processed in %'.2f CPU secs. %'lu lines, %'u types, %'lu tokens, current memusage: %'.1fMB\n", argv_0_basename, (double)(time_input_processed - time_start)/CLOCKS_PER_SEC, input_model_metadata.line_count, input_model_metadata.type_count, input_model_metadata.token_count, (double)memusage / 1048576); fflush(stderr);

	global_metadata.token_count = input_model_metadata.token_count;
	global_metadata.type_count  = map_count(&word_map);

	// Filter out infrequent words, reassign word_id's, and build a mapping from old word_id's to new word_id's
	sort_by_count(&word_map);
	word_id_t * restrict word_id_remap = calloc(input_model_metadata.type_count, sizeof(word_id_t));
	get_ids(&word_map, word_id_remap);
	word_id_t number_of_deleted_words = filter_infrequent_words(cmd_args, &global_metadata, &word_map, word_id_remap);

	// Get list of unique words
	char * * restrict word_list = (char **)malloc(sizeof(char*) * global_metadata.type_count);
	memusage += sizeof(char*) * global_metadata.type_count;
	reassign_word_ids(&word_map, word_list, word_id_remap);
	get_keys(&word_map, word_list);
	sort_by_id(&word_map);


	// Check or set number of classes
	if (cmd_args.num_classes >= global_metadata.type_count) { // User-specified number of classes is at least the vocabulary size
		fprintf(stderr, "%s: Error: Number of classes (%u) is not less than vocabulary size (%u).  Decrease the value of --classes\n", argv_0_basename, cmd_args.num_classes, global_metadata.type_count); fflush(stderr);
		exit(3);
	} else if (cmd_args.num_classes == 0) { // User did not manually set number of classes at all
		cmd_args.num_classes = (wclass_t) (sqrt(global_metadata.type_count) * 1.2);
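		// e.g., a 100,000-type vocabulary defaults to 1.2 * sqrt(100000), i.e. ~379 classes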
	}

	// Build array of word_counts
	word_count_t * restrict word_counts = malloc(sizeof(word_count_t) * global_metadata.type_count);
	memusage += sizeof(word_count_t) * global_metadata.type_count;
	build_word_count_array(&word_map, word_list, word_counts, global_metadata.type_count);

	// Initialize clusters, and possibly read-in external class file
	wclass_t * restrict word2class = malloc(sizeof(wclass_t) * global_metadata.type_count);
	memusage += sizeof(wclass_t) * global_metadata.type_count;
	init_clusters(cmd_args, global_metadata.type_count, word2class, word_counts, word_list);
	if (initial_class_file != NULL)
		import_class_file(&word_map, word2class, initial_class_file, cmd_args.num_classes); // Overwrite subset of word mappings, from user-provided initial_class_file

	// Remap word_id's in initial_bigram_map
	remap_and_rev_bigram_map(&initial_bigram_map, &new_bigram_map, &new_bigram_map_rev, word_id_remap, map_find_id(&word_map, UNKNOWN_WORD, -1));
	global_metadata.start_sent_id = map_find_id(&word_map, "<s>", -1); // need this for tallying emission probs
	global_metadata.end_sent_id   = map_find_id(&word_map, "</s>", -1); // need this for tallying emission probs
	global_metadata.line_count    = map_find_count(&word_map, "</s>"); // Used for calculating perplexity

	if (global_metadata.line_count == 0) {
		fprintf(stderr, "%s: Warning: Number of lines is 0.  Include <s> and </s> in your ngram counts, or perplexity values will be unreliable.\n", argv_0_basename); fflush(stderr);
	}

	//printf("init_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); fflush(stdout);
	//printf("new_bigram_map hash_count=%u\n", HASH_COUNT(new_bigram_map)); fflush(stdout);
	free(word_id_remap);
	memusage -= sizeof(word_id_t) * input_model_metadata.type_count;
	delete_all(&word_map); // static
	delete_all_bigram(&initial_bigram_map); // static
	memusage -= input_memusage;

	// Initialize and set word bigram listing
	clock_t time_bigram_start = clock();
	size_t bigram_memusage = 0; size_t bigram_rev_memusage = 0;
	struct_word_bigram_entry * restrict word_bigrams = NULL;
	struct_word_bigram_entry * restrict word_bigrams_rev = NULL;

	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Word bigram listing ... ", argv_0_basename); fflush(stderr);

	#pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel
	{
		#pragma omp section
		{
			//sort_bigrams(&new_bigram_map); // speeds things up later
			word_bigrams = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
			memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
			bigram_memusage = set_bigram_counts(word_bigrams, new_bigram_map);
			// Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
			for (word_id_t word = 0; word < global_metadata.type_count; word++)
				word_bigrams[word].headword_count = word_counts[word];
		}

		// Initialize and set *reverse* word bigram listing
		#pragma omp section
		{
			if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
				//sort_bigrams(&new_bigram_map_rev); // speeds things up later
				word_bigrams_rev = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
				memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
				bigram_rev_memusage = set_bigram_counts(word_bigrams_rev, new_bigram_map_rev);
				// Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
				for (word_id_t word = 0; word < global_metadata.type_count; word++)
					word_bigrams_rev[word].headword_count = word_counts[word];
			}
		}
	}

	delete_all_bigram(&new_bigram_map);
	delete_all_bigram(&new_bigram_map_rev);
	memusage += bigram_memusage + bigram_rev_memusage;
	clock_t time_bigram_end = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "in %'.2f CPU secs.  Bigram memusage: %'.1f MB\n", (double)(time_bigram_end - time_bigram_start)/CLOCKS_PER_SEC, (bigram_memusage + bigram_rev_memusage)/(double)1048576); fflush(stderr);

	//print_word_bigrams(global_metadata, word_bigrams, word_list);

	// Build <v,c> counts, which consists of a word followed by a given class
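	// flattened 2-D array: num_classes * type_count slots (plus one spare)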
	word_class_count_t * restrict word_class_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
	if (word_class_counts == NULL) {
		fprintf(stderr,  "%s: Error: Unable to allocate enough memory for <v,c>.  %'.1f MB needed.  Maybe increase --min-count\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
		exit(13);
	}
	memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
	fprintf(stderr, "%s: Allocating %'.1f MB for word_class_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
	build_word_class_counts(cmd_args, word_class_counts, word2class, word_bigrams, global_metadata.type_count/*, word_list*/);
	//print_word_class_counts(cmd_args, global_metadata, word_class_counts);

	// Build reverse: <c,v> counts: class followed by word.  This and the normal one are both pretty fast, so no need to parallelize this
	word_class_count_t * restrict word_class_rev_counts = NULL;
	if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
		word_class_rev_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
		if (word_class_rev_counts == NULL) {
			fprintf(stderr,  "%s: Warning: Unable to allocate enough memory for <v,c>.  %'.1f MB needed.  Falling back to --rev-alternate 0\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
			cmd_args.rev_alternate = 0;
		} else {
			memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
			fprintf(stderr, "%s: Allocating %'.1f MB for word_class_rev_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
			build_word_class_counts(cmd_args, word_class_rev_counts, word2class, word_bigrams_rev, global_metadata.type_count/*, word_list*/);
		}

	}

	// Calculate memusage for count_arrays
	for (unsigned char i = 1; i <= cmd_args.max_array; i++) {
		memusage += 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t));
		//printf("11 memusage += %zu (now=%zu) count_arrays\n", 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), memusage); fflush(stdout);
	}

	clock_t time_model_built = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Finished loading %'lu tokens and %'u types (%'u filtered) from %'lu lines in %'.2f CPU secs\n", argv_0_basename, global_metadata.token_count, global_metadata.type_count, number_of_deleted_words, global_metadata.line_count, (double)(time_model_built - time_start)/CLOCKS_PER_SEC); fflush(stderr);
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Approximate memory usage at clustering: %'.1fMB\n", argv_0_basename, (double)memusage / 1048576); fflush(stderr);

	cluster(cmd_args, global_metadata, word_counts, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);

	// Now print the final word2class mapping
	if (cmd_args.verbose >= 0) {
		FILE *out_file = stdout;
		if (out_file_string)
			out_file = fopen(out_file_string, "w");
		if (out_file == NULL) {
			fprintf(stderr, "%s: Error: Unable to open output file  %s\n", argv_0_basename, out_file_string); fflush(stderr);
			exit(16);
		}
		if (cmd_args.class_algo == EXCHANGE && (!cmd_args.print_word_vectors)) {
			print_words_and_classes(out_file, global_metadata.type_count, word_list, word_counts, word2class, (int)cmd_args.class_offset, cmd_args.print_freqs);
		} else if (cmd_args.class_algo == EXCHANGE && cmd_args.print_word_vectors) {
			print_words_and_vectors(out_file, cmd_args, global_metadata, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);
		}
		fclose(out_file);
	}

	clock_t time_clustered = clock();
	time_t time_t_end;
	time(&time_t_end);
	double time_secs_total = difftime(time_t_end, time_t_start);
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Finished clustering in %'.2f CPU seconds.  Total wall clock time was about %lim %lis\n", argv_0_basename, (double)(time_clustered - time_model_built)/CLOCKS_PER_SEC, (long)time_secs_total/60, ((long)time_secs_total % 60)  );

	free(word2class);
	free(word_bigrams);
	free(word_list);
	free(word_counts);
	exit(0);
}