Example #1
/**	Function clustering_via_seeds ()
 *
 *	Given fragments, each represented by a list of seeds in [list_seeds],
 *	cluster the fragments by pairwise comparison of those sharing the same
 *	seed. Two fragments are placed in the same cluster if they meet the
 *	max_mismatch criterion. Output: [uf_clst], which maps each fragID to its
 *	clsID, where the clsID is the smallest fragID in the cluster.
 *
 *	Method: a global union-find structure [uf_clst] is initialized with one
 *	entry per fragment. Clustering iterates through each seed: boundaries in
 *	[list_seeds] are identified such that each chunk of fragments shares the
 *	same seed. Clustering can then be applied in parallel: within the
 *	clustering function, a local union-find structure is built, and
 *	[uf_clst] is only read, never written, during fragment comparison. Once
 *	the local clusters are generated, [uf_clst] is updated to reflect them.
 *	This read-during/merge-after split is what makes the OpenMP
 *	parallelization safe.
 */
void clustering_via_seeds (ivec_t& uf_clst, ii64vec_t& list_seeds,
		int num_seed, int max_mismatch, bool silent) {
	bool debug = false;

	if (list_seeds.size() == 0 || list_seeds[0].size() == 0) {
		abording ("DuplRm.cpp -- clustering () SC failed");
	}

	int sz = list_seeds.size();

	// -------- cluster according to seed i ---------
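	// Pigeonhole: two fragments with at most max_mismatch mismatches must
	// agree on at least one of any (max_mismatch + 1) disjoint seeds, so
	// checking that many seeds is sufficient (assuming the seeds partition
	// the fragment).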
	int num_seed_to_check = std::min(num_seed, max_mismatch + 1);
	num_seed_to_check = std::min (5, num_seed_to_check); // cap at 5 iterations
	for (int seed_i = 0; seed_i < num_seed_to_check; ++ seed_i) {

		if (!silent) {
			std::cout << "\t\tcluster by seed " << seed_i << "\n";
		}

		std::sort (list_seeds.begin(), list_seeds.end(), cmp_seed(seed_i));

		// linear scan the sorted [list_seeds] wrt the ith seed, and
		// generate 2d vector, where each dimension stores the indices of
		// [list_seeds] that share the same seed
		iivec_t init_clusters (1, ivec_t{0});
		for (int i = 1; i < sz; ++ i) {
			if (list_seeds[i][seed_i] == list_seeds[i - 1][seed_i]) {
				init_clusters.rbegin()->push_back(i);

				/*
				if (debug) { // print out substring w/ large count
					if (init_clusters.rbegin()->size() > 100000) {
						std::string fwd_str = xny::ID2Str<int64_t>
							(list_seeds[i][seed_i], 31);
						std::cout << fwd_str << "\t";
						std::cout << xny::get_rvc_str(fwd_str) << "\n";
						exit(1);
					}
				} */
			} else init_clusters.push_back({i});
		}

		int init_sz = init_clusters.size();

		if (!silent){
			std::cout << "\t\t" << init_sz << " clusters to validate\n";
		}
		//------- generate clusters: parallel clustering for each chunk
		// of boundary then merge to the global cluster -------------
		validate_clusters (uf_clst, init_clusters, list_seeds, max_mismatch,
				20000);

	} // for (int seed_i = 0; seed_i < num_seed_to_check; ++ seed_i)

} // clustering_via_seeds
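
A note on the union-find structure assumed above: [uf_clst] is an ivec_t, i.e. a flat parent array indexed by fragID. The helpers below are a minimal, hypothetical sketch of the find/unite operations such an array supports (they are not part of the original source; the names and the path-halving detail are assumptions), with the smaller fragID kept as representative so that the clsID ends up being the smallest fragID in the cluster:

#include <vector>

using ivec_t = std::vector<int>;

// Follow parent links to the cluster representative, halving the path as we go.
int find_root (ivec_t& uf, int x) {
	while (uf[x] != x) {
		uf[x] = uf[uf[x]]; // path halving keeps later lookups cheap
		x = uf[x];
	}
	return x;
}

// Merge the clusters containing fragments a and b; the smaller root wins.
void unite (ivec_t& uf, int a, int b) {
	int ra = find_root(uf, a), rb = find_root(uf, b);
	if (ra == rb) return;
	if (ra < rb) uf[rb] = ra;
	else uf[ra] = rb;
}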
Example #2
/**	Function clustering_via_ss ()
 *
 *	Input: 1) fragments in binary representation [list_seeds]
 *	       2) the read files [f] and [f2], from which a super-sketch is
 *	          generated for each fragment
 */
void clustering_via_ss (ivec_t& uf_clst, const ii64vec_t& list_seeds,
	const std::string& f, const std::string& f2, int batch,
	xny::sketch_list& slistgen, xny::super_sketch& ssgen, int max_mismatch,
	bool silent) {

	int sz = list_seeds.size();
	if (sz == 0) return;

	// ------- multiple iterations of sketching ------------------

	int pre_sz = 1, iter = 0;

	while (true) {

		if (! silent) std::cout << "\n\t\tsketching iteration: " << iter << "\n";
		++ iter;

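		// re-seed the hash for this pass so each iteration samples different
		// sketches and can merge near-duplicates the previous pass missed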
		jaz::murmur264 hashfunc (rand() % RAND_MAX);

		// -----------------------  sketching ---------------------------
		if (! silent) std::cout << "\t\t\tgenerate super sketches ...\n";
		std::vector<sketch_t> super_sketches;
		get_super_sketches (super_sketches, f, f2, slistgen, ssgen,
				hashfunc, batch, silent);
		std::sort(super_sketches.begin(), super_sketches.end(),
					xny::cmp_sketch());

		if (sz != (int) list_seeds.size()) abording ("clustering_via_ss SC failed.");


		// In [init_clusters], each 1d elem stores the indices of
		// [list_seeds] that share the same super sketch, where in [list_seeds],
		// the index i should be equal to fragID
		iivec_t init_clusters (1, ivec_t{super_sketches[0].second});
		for (int i = 1; i < sz; ++ i) {
			if (super_sketches[i].first == super_sketches[i-1].first) {
				init_clusters.rbegin()->push_back(super_sketches[i].second);
			} else init_clusters.push_back({super_sketches[i].second});
		}

		int init_sz = init_clusters.size();

		if (!silent){
			std::cout << "\t\t\t" << init_sz << " clusters to validate\n";
		}

		validate_clusters (uf_clst, init_clusters, list_seeds, max_mismatch,
				INT_MAX);

		// ---- generate final union find clusters ------
		iivec_t clusters;
		uf_generate_cls (clusters, uf_clst);

		// ---- generate the duplicated fragment IDs ----------------
		// iteration ending criteria % duplicate ID increase < 5%
		iset_t duplIDs;
		for (int i = 0; i < (int) clusters.size(); ++ i) {
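			// the first entry in each cluster is the representative (smallest
			// fragID); everything after it counts as a duplicate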
			duplIDs.insert(clusters[i].begin() + 1, clusters[i].end());
		}
		int perc_incr = 100 * ((int) duplIDs.size() - pre_sz) / pre_sz;
		if (!silent){
			std::cout << "\n\t\t\tduplicates: " << duplIDs.size() << " (" <<
					perc_incr << " % increase)\n" ;
		}
		if (perc_incr < 5) break;
		else pre_sz = (int) duplIDs.size();
	} // while (true)

} // clustering_via_ss
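
The sketch machinery used above (xny::sketch_list, xny::super_sketch, jaz::murmur264) is external to this listing. In the listing, each super-sketch is a (value, fragID) pair; sorting by value and grouping equal values yields the initial clusters. As a rough illustration of the underlying MinHash idea (not the library's actual API; hash_kmer and min_hash_sketch below are hypothetical), a fragment's sketch keeps the k smallest k-mer hashes, so near-duplicate fragments are likely to produce identical sketches and land in the same initial cluster:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Hypothetical stand-in for a seeded hash such as jaz::murmur264.
static uint64_t hash_kmer (const std::string& kmer, uint64_t seed) {
	return std::hash<std::string>{}(kmer) ^ (seed * 0x9e3779b97f4a7c15ULL);
}

// Keep the k smallest distinct k-mer hashes of a fragment.
std::vector<uint64_t> min_hash_sketch (const std::string& frag,
		size_t kmer_len, size_t k, uint64_t seed) {
	std::vector<uint64_t> hashes;
	for (size_t i = 0; i + kmer_len <= frag.size(); ++ i)
		hashes.push_back(hash_kmer(frag.substr(i, kmer_len), seed));
	std::sort(hashes.begin(), hashes.end());
	hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
	if (hashes.size() > k) hashes.resize(k);
	return hashes; // near-duplicates tend to share this sketch
}

Re-seeding the hash (as the loop above does once per iteration) gives each pass an independent chance to group fragments whose sketches happened to differ in earlier passes.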
Example #3
int main(int argc, char **argv) {
	setlocale(LC_ALL, ""); // Comment-out on non-Posix systems
	clock_t time_start = clock();
	time_t time_t_start;
	time(&time_t_start);
	argv_0_basename = basename(argv[0]);
	get_usage_string(usage, USAGE_LEN); // This is a big scary string, so build it elsewhere

	//printf("sizeof(cmd_args)=%zd\n", sizeof(cmd_args));
	parse_cmd_args(argc, argv, usage, &cmd_args);

	if (cmd_args.class_algo == EXCHANGE || cmd_args.class_algo == EXCHANGE_BROWN)
		memusage += sizeof(float) * ENTROPY_TERMS_MAX; // We'll build the precomputed entropy terms after reporting memusage

	struct_model_metadata global_metadata;

	// The list of unique words should always include <s>, unknown word, and </s>
	map_update_count(&word_map, UNKNOWN_WORD, 0, 0); // Should always be first
	map_update_count(&word_map, "<s>", 0, 1);
	map_update_count(&word_map, "</s>", 0, 2);

	// Open input
	FILE *in_train_file = stdin;
	if (in_train_file_string)
		in_train_file = fopen(in_train_file_string, "r");
	if (in_train_file == NULL) {
		fprintf(stderr, "%s: Error: Unable to open input file  %s\n", argv_0_basename, in_train_file_string); fflush(stderr);
		exit(15);
	}

	// Process input sentences
	size_t input_memusage = 0;
	const struct_model_metadata input_model_metadata = process_input(cmd_args, in_train_file, &word_map, &initial_bigram_map, &input_memusage);
	memusage += input_memusage;
	fclose(in_train_file);

	clock_t time_input_processed = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Corpus processed in %'.2f CPU secs. %'lu lines, %'u types, %'lu tokens, current memusage: %'.1fMB\n", argv_0_basename, (double)(time_input_processed - time_start)/CLOCKS_PER_SEC, input_model_metadata.line_count, input_model_metadata.type_count, input_model_metadata.token_count, (double)memusage / 1048576); fflush(stderr);

	global_metadata.token_count = input_model_metadata.token_count;
	global_metadata.type_count  = map_count(&word_map);

	// Filter out infrequent words, reassign word_id's, and build a mapping from old word_id's to new word_id's
	sort_by_count(&word_map);
	word_id_t * restrict word_id_remap = calloc(input_model_metadata.type_count, sizeof(word_id_t));
	get_ids(&word_map, word_id_remap);
	word_id_t number_of_deleted_words = filter_infrequent_words(cmd_args, &global_metadata, &word_map, word_id_remap);

	// Get list of unique words
	char * * restrict word_list = (char **)malloc(sizeof(char*) * global_metadata.type_count);
	memusage += sizeof(char*) * global_metadata.type_count;
	reassign_word_ids(&word_map, word_list, word_id_remap);
	get_keys(&word_map, word_list);
	sort_by_id(&word_map);


	// Check or set number of classes
	if (cmd_args.num_classes >= global_metadata.type_count) { // User-specified number of classes is at least the vocabulary size
		fprintf(stderr, "%s: Error: Number of classes (%u) is not less than vocabulary size (%u).  Decrease the value of --classes\n", argv_0_basename, cmd_args.num_classes, global_metadata.type_count); fflush(stderr);
		exit(3);
	} else if (cmd_args.num_classes == 0) { // User did not manually set number of classes at all
		cmd_args.num_classes = (wclass_t) (sqrt(global_metadata.type_count) * 1.2);
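		// e.g., a 100,000-type vocabulary defaults to 1.2 * sqrt(100000), i.e. ~379 classes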
	}

	// Build array of word_counts
	word_count_t * restrict word_counts = malloc(sizeof(word_count_t) * global_metadata.type_count);
	memusage += sizeof(word_count_t) * global_metadata.type_count;
	build_word_count_array(&word_map, word_list, word_counts, global_metadata.type_count);

	// Initialize clusters, and possibly read-in external class file
	wclass_t * restrict word2class = malloc(sizeof(wclass_t) * global_metadata.type_count);
	memusage += sizeof(wclass_t) * global_metadata.type_count;
	init_clusters(cmd_args, global_metadata.type_count, word2class, word_counts, word_list);
	if (initial_class_file != NULL)
		import_class_file(&word_map, word2class, initial_class_file, cmd_args.num_classes); // Overwrite subset of word mappings, from user-provided initial_class_file

	// Remap word_id's in initial_bigram_map
	remap_and_rev_bigram_map(&initial_bigram_map, &new_bigram_map, &new_bigram_map_rev, word_id_remap, map_find_id(&word_map, UNKNOWN_WORD, -1));
	global_metadata.start_sent_id = map_find_id(&word_map, "<s>", -1); // need this for tallying emission probs
	global_metadata.end_sent_id   = map_find_id(&word_map, "</s>", -1); // need this for tallying emission probs
	global_metadata.line_count    = map_find_count(&word_map, "</s>"); // Used for calculating perplexity

	if (global_metadata.line_count == 0) {
		fprintf(stderr, "%s: Warning: Number of lines is 0.  Include <s> and </s> in your ngram counts, or perplexity values will be unreliable.\n", argv_0_basename); fflush(stderr);
	}

	//printf("init_bigram_map hash_count=%u\n", HASH_COUNT(initial_bigram_map)); fflush(stdout);
	//printf("new_bigram_map hash_count=%u\n", HASH_COUNT(new_bigram_map)); fflush(stdout);
	free(word_id_remap);
	memusage -= sizeof(word_id_t) * input_model_metadata.type_count;
	delete_all(&word_map); // static
	delete_all_bigram(&initial_bigram_map); // static
	memusage -= input_memusage;

	// Initialize and set word bigram listing
	clock_t time_bigram_start = clock();
	size_t bigram_memusage = 0; size_t bigram_rev_memusage = 0;
	struct_word_bigram_entry * restrict word_bigrams = NULL;
	struct_word_bigram_entry * restrict word_bigrams_rev = NULL;

	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Word bigram listing ... ", argv_0_basename); fflush(stderr);

	#pragma omp parallel sections // Both bigram listing and reverse bigram listing can be done in parallel
	{
		#pragma omp section
		{
			//sort_bigrams(&new_bigram_map); // speeds things up later
			word_bigrams = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
			memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
			bigram_memusage = set_bigram_counts(word_bigrams, new_bigram_map);
			// Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
			for (word_id_t word = 0; word < global_metadata.type_count; word++)
				word_bigrams[word].headword_count = word_counts[word];
		}

		// Initialize and set *reverse* word bigram listing
		#pragma omp section
		{
			if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
				//sort_bigrams(&new_bigram_map_rev); // speeds things up later
				word_bigrams_rev = calloc(global_metadata.type_count, sizeof(struct_word_bigram_entry));
				memusage += sizeof(struct_word_bigram_entry) * global_metadata.type_count;
				bigram_rev_memusage = set_bigram_counts(word_bigrams_rev, new_bigram_map_rev);
				// Copy entries in word_counts to struct_word_bigram_entry.headword_count since that struct entry is already loaded when clustering
				for (word_id_t word = 0; word < global_metadata.type_count; word++)
					word_bigrams_rev[word].headword_count = word_counts[word];
			}
		}
	}

	delete_all_bigram(&new_bigram_map);
	delete_all_bigram(&new_bigram_map_rev);
	memusage += bigram_memusage + bigram_rev_memusage;
	clock_t time_bigram_end = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "in %'.2f CPU secs.  Bigram memusage: %'.1f MB\n", (double)(time_bigram_end - time_bigram_start)/CLOCKS_PER_SEC, (bigram_memusage + bigram_rev_memusage)/(double)1048576); fflush(stderr);

	//print_word_bigrams(global_metadata, word_bigrams, word_list);

	// Build <v,c> counts, which consists of a word followed by a given class
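	// flattened 2-D array: num_classes * type_count slots (plus one spare)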
	word_class_count_t * restrict word_class_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
	if (word_class_counts == NULL) {
		fprintf(stderr,  "%s: Error: Unable to allocate enough memory for <v,c>.  %'.1f MB needed.  Maybe increase --min-count\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
		exit(13);
	}
	memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
	fprintf(stderr, "%s: Allocating %'.1f MB for word_class_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
	build_word_class_counts(cmd_args, word_class_counts, word2class, word_bigrams, global_metadata.type_count/*, word_list*/);
	//print_word_class_counts(cmd_args, global_metadata, word_class_counts);

	// Build reverse: <c,v> counts: class followed by word.  This and the normal one are both pretty fast, so no need to parallelize this
	word_class_count_t * restrict word_class_rev_counts = NULL;
	if (cmd_args.rev_alternate) { // Don't bother building this if it won't be used
		word_class_rev_counts = calloc(1 + cmd_args.num_classes * global_metadata.type_count , sizeof(word_class_count_t));
		if (word_class_rev_counts == NULL) {
			fprintf(stderr,  "%s: Warning: Unable to allocate enough memory for <v,c>.  %'.1f MB needed.  Falling back to --rev-alternate 0\n", argv_0_basename, ((cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / (double)1048576 )); fflush(stderr);
			cmd_args.rev_alternate = 0;
		} else {
			memusage += cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t);
			fprintf(stderr, "%s: Allocating %'.1f MB for word_class_rev_counts: num_classes=%u x type_count=%u x sizeof(w-cl-count_t)=%zu\n", argv_0_basename, (double)(cmd_args.num_classes * global_metadata.type_count * sizeof(word_class_count_t)) / 1048576 , cmd_args.num_classes, global_metadata.type_count, sizeof(word_class_count_t)); fflush(stderr);
			build_word_class_counts(cmd_args, word_class_rev_counts, word2class, word_bigrams_rev, global_metadata.type_count/*, word_list*/);
		}

	}

	// Calculate memusage for count_arrays
	for (unsigned char i = 1; i <= cmd_args.max_array; i++) {
		memusage += 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t));
		//printf("11 memusage += %zu (now=%zu) count_arrays\n", 2 * (powi(cmd_args.num_classes, i) * sizeof(wclass_count_t)), memusage); fflush(stdout);
	}

	clock_t time_model_built = clock();
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Finished loading %'lu tokens and %'u types (%'u filtered) from %'lu lines in %'.2f CPU secs\n", argv_0_basename, global_metadata.token_count, global_metadata.type_count, number_of_deleted_words, global_metadata.line_count, (double)(time_model_built - time_start)/CLOCKS_PER_SEC); fflush(stderr);
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Approximate memory usage at clustering: %'.1fMB\n", argv_0_basename, (double)memusage / 1048576); fflush(stderr);

	cluster(cmd_args, global_metadata, word_counts, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);

	// Now print the final word2class mapping
	if (cmd_args.verbose >= 0) {
		FILE *out_file = stdout;
		if (out_file_string)
			out_file = fopen(out_file_string, "w");
		if (out_file == NULL) {
			fprintf(stderr, "%s: Error: Unable to open output file  %s\n", argv_0_basename, out_file_string); fflush(stderr);
			exit(16);
		}
		if (cmd_args.class_algo == EXCHANGE && (!cmd_args.print_word_vectors)) {
			print_words_and_classes(out_file, global_metadata.type_count, word_list, word_counts, word2class, (int)cmd_args.class_offset, cmd_args.print_freqs);
		} else if (cmd_args.class_algo == EXCHANGE && cmd_args.print_word_vectors) {
			print_words_and_vectors(out_file, cmd_args, global_metadata, word_list, word2class, word_bigrams, word_bigrams_rev, word_class_counts, word_class_rev_counts);
		}
		fclose(out_file);
	}

	clock_t time_clustered = clock();
	time_t time_t_end;
	time(&time_t_end);
	double time_secs_total = difftime(time_t_end, time_t_start);
	if (cmd_args.verbose >= -1)
		fprintf(stderr, "%s: Finished clustering in %'.2f CPU seconds.  Total wall clock time was about %lim %lis\n", argv_0_basename, (double)(time_clustered - time_model_built)/CLOCKS_PER_SEC, (long)time_secs_total/60, ((long)time_secs_total % 60)  );

	free(word2class);
	free(word_bigrams);
	free(word_list);
	free(word_counts);
	exit(0);
}