// Builds the TF/IDF data set for every file named in dir_list: words are
// counted per file inside a cilk_for loop, then the per-file catalogs are
// handed to the TF/IDF builder.
//
// dir_list: listing of the input files. Its contents are swapped out into a
//           shared listing that is passed on to the TF/IDF builder, so the
//           caller's object no longer holds the entries on return.
// Returns the constructed data_set_type. Phase timings are reported through
// print_time and the resulting sizes are written to std::cerr.
//
// NOTE(review): intl_map_type, intm_map_type, agg_map_type, can_sort, do_sort
// and by_words are not declared in this block -- presumably template
// parameters and global option flags; confirm against the surrounding file.
data_set_type tfidf_driver( directory_listing_type & dir_list ) {
    using tfidf_vector_type = typename data_set_type::vector_type;

    struct timespec count_done, run_start, run_done;

    // ---- Phase 1: word count ----
    get_time( run_start );

    size_t file_count = dir_list.size();
    std::vector<intm_map_type> per_file;
    per_file.resize( file_count );

    asap::word_container_reducer<agg_map_type> presence;

    cilk_for( size_t idx = 0; idx < file_count; ++idx ) {
        // Name of the file handled by this iteration.
        std::string path = *std::next( dir_list.cbegin(), idx );

        // Counting happens internally in an intl_map_type; the result is
        // merged into this file's slot of the catalog.
        asap::word_catalog<intl_map_type>( std::string( path ),
                                           per_file[idx] );

        // A std::map-backed intl_map_type already yields sorted pairs; an
        // std::unordered_map-backed one does not, hence the optional sort.
        if( do_sort ) {
            kv_sort<intm_map_type,can_sort>( per_file[idx] );
        }

        // Feeding the reducer from a std::vector rather than a std::map was
        // measured to be faster (about 10% on word count, 20% on TF/IDF,
        // 16 threads).
        // TODO: replace by post-processing parallel multi-way merge?
        presence.count_presence( per_file[idx] );
    }
    get_time( count_done );

    // ---- Phase 2: TF/IDF ----
    // Move the aggregated words and the file listing into shared objects
    // that are passed on to the TF/IDF builder.
    auto shared_words = std::make_shared<agg_map_type>();
    shared_words->swap( presence.get_value() );

    auto shared_listing = std::make_shared<directory_listing_type>();
    shared_listing->swap( dir_list );

    asap::internal::assign_ids( shared_words->begin(), shared_words->end() );

    // NOTE(review): the non-by_words branch passes the literals false/true
    // for the trailing flags instead of do_sort -- confirm this is intended
    // when the catalogs have not been sorted.
    data_set_type tfidf(
        by_words
        ? asap::tfidf_by_words<tfidf_vector_type>(
              per_file.cbegin(), per_file.cend(),
              shared_words, shared_listing,
              do_sort )       // whether catalogs are sorted
        : asap::tfidf<tfidf_vector_type>(
              per_file.cbegin(), per_file.cend(),
              shared_words, shared_listing,
              false, true )   // whether catalogs are sorted
    );
    get_time( run_done );

    // ---- Report ----
    print_time( "word count", run_start, count_done );
    print_time( "TF/IDF", count_done, run_done );
    std::cerr << "TF/IDF vectors: " << tfidf.get_num_points() << '\n';
    std::cerr << "TF/IDF dimensions: " << tfidf.get_dimensions() << '\n';
    print_time( "library", run_start, run_done );

    return tfidf;
}
// N-gram variant of the TF/IDF driver: catalogs the n-grams of every file
// named in dir_list inside a cilk_for loop, then builds the TF/IDF data set
// from the per-file catalogs.
//
// dir_list: listing of the input files. Its contents are swapped out into a
//           shared listing that is passed on to the TF/IDF builder, so the
//           caller's object no longer holds the entries on return.
// Returns the constructed data_set_type. Phase timings are reported through
// print_time and the resulting sizes are written to std::cerr.
//
// NOTE(review): intl_map_type, intm_map_type, agg_map_type, can_sort, do_sort
// and by_words are not declared in this block -- presumably template
// parameters and global option flags; confirm against the surrounding file.
data_set_type tfidf_driver( directory_listing_type & dir_list ) {
    using tfidf_vector_type = typename data_set_type::vector_type;

    struct timespec count_done, run_start, run_done;

    // ---- Phase 1: n-gram count ----
    get_time( run_start );

    size_t file_count = dir_list.size();
    std::vector<intm_map_type> per_file;
    per_file.resize( file_count );

    asap::ngram_container_reducer<agg_map_type> presence;
    presence.get_value().set_growth( 1, 2 );

    cilk_for( size_t idx = 0; idx < file_count; ++idx ) {
        // Name of the file handled by this iteration.
        std::string path = *std::next( dir_list.cbegin(), idx );

        per_file[idx].set_growth( 1, 2 );
        size_t ngram_total =
            asap::ngram_catalog<intl_map_type>( path, per_file[idx] );

        // A std::map-backed intl_map_type already yields sorted pairs; an
        // std::unordered_map-backed one does not, hence the optional sort.
        if( do_sort ) {
            kv_sort<intm_map_type,can_sort>( per_file[idx] );
        }

        // Debug aid:
        // std::cerr << path << ": " << ngram_total << " ngrams\n";

        presence.count_presence( per_file[idx] );
    }
    get_time( count_done );

    // ---- Phase 2: TF/IDF ----
    // Move the aggregated n-grams and the file listing into shared objects
    // that are passed on to the TF/IDF builder.
    auto shared_words = std::make_shared<agg_map_type>();
    shared_words->swap( presence.get_value() );

    auto shared_listing = std::make_shared<directory_listing_type>();
    shared_listing->swap( dir_list );

    asap::internal::assign_ids( shared_words->begin(), shared_words->end() );

    data_set_type tfidf(
        by_words
        ? asap::tfidf_by_words<tfidf_vector_type>(
              per_file.cbegin(), per_file.cend(),
              shared_words, shared_listing,
              do_sort )   // whether catalogs are sorted
        : asap::tfidf<tfidf_vector_type>(
              per_file.cbegin(), per_file.cend(),
              shared_words, shared_listing,
              do_sort,    // whether sorted by word
              do_sort )   // whether catalogs are sorted
    );
    get_time( run_done );

    // ---- Report ----
    print_time( "ngram count", run_start, count_done );
    print_time( "TF/IDF", count_done, run_done );
    std::cerr << "TF/IDF vectors: " << tfidf.get_num_points() << '\n';
    std::cerr << "TF/IDF dimensions: " << tfidf.get_dimensions() << '\n';
    print_time( "library", run_start, run_done );

    return tfidf;
}
int tester(char* path) { printf("started.\n"); /// 文書集合. index_table table; assert(parse(path, &table) == 0); assert(table.n > 0); assert(table.m > 0); int tf_val = -1; assert((tf_val = tf(1552, 20, &table)) > -1); assert((tf_val = tf(152, 520, &table)) > -1); assert((tf_val = tf(1234, 324, &table)) > -1); printf("%d\n", tf_val); assert(tfidf(24, &table) > -1.0); return 0; }
int main(int argc, char **argv) { struct timespec begin, end; struct timespec veryStart; srand( time(NULL) ); get_time( begin ); get_time( veryStart ); // read args parse_args(argc,argv); std::cerr << "Available threads: " << __cilkrts_get_nworkers() << "\n"; get_time (end); print_time("init", begin, end); // Directory listing get_time( begin ); typedef asap::word_list<std::deque<const char*>, asap::word_bank_managed> directory_listing_type; directory_listing_type dir_list; asap::get_directory_listing( indir, dir_list ); get_time (end); print_time("directory listing", begin, end); typedef size_t index_type; typedef asap::word_bank_pre_alloc word_bank_type; typedef asap::sparse_vector<index_type, float, false, asap::mm_no_ownership_policy> vector_type; /* typedef asap::word_map< std::unordered_map<const char *, size_t, asap::text::charp_hash, asap::text::charp_eql>, word_bank_type> internal_map_type; typedef asap::kv_list<std::vector<std::pair<const char *, size_t>>, word_bank_type> intermediate_map_type; typedef asap::word_map< std::unordered_map<const char *, asap::appear_count<size_t, index_type>, asap::text::charp_hash, asap::text::charp_eql>, word_bank_type> aggregate_map_type; */ typedef asap::hash_table<const char *, size_t, asap::text::charp_hash, asap::text::charp_eql> wc_unordered_map; typedef asap::hash_table<const char *, asap::appear_count<size_t, index_type>, asap::text::charp_hash, asap::text::charp_eql> dc_unordered_map; typedef asap::word_map<wc_unordered_map, word_bank_type> internal_map_type; typedef asap::kv_list<std::vector<std::pair<const char*, size_t>>, word_bank_type> intermediate_map_type; typedef asap::word_map<dc_unordered_map, word_bank_type> aggregate_map_type; typedef asap::data_set<vector_type, aggregate_map_type, directory_listing_type> data_set_type; data_set_type tfidf( intm_map ? 
tfidf_driver<directory_listing_type, internal_map_type, internal_map_type, aggregate_map_type, data_set_type, false>( dir_list ) : tfidf_driver<directory_listing_type, internal_map_type, intermediate_map_type, aggregate_map_type, data_set_type, true>( dir_list ) ); get_time( begin ); if( outfile ) asap::arff_write( outfile, tfidf ); get_time (end); print_time("output", begin, end); print_time("complete time", veryStart, end); return 0; }