Ejemplo n.º 1
0
// Builds the TF/IDF data set for every file named in dir_list.
//
// Phase 1 ("word count"): one catalog per file is filled inside a cilk_for
// loop, optionally sorted, and registered with the allwords reducer.
// Phase 2 ("TF/IDF"): ids are assigned to the aggregated words and the data
// set is assembled from the per-file catalogs.
//
// Side effect: the contents of dir_list are swapped into a shared list that
// is handed to the data set, so the caller's dir_list no longer holds the
// listing when this function returns.
//
// The map/data-set types and the do_sort, can_sort and by_words switches are
// declared outside this snippet (presumably template parameters or globals --
// confirm against the full file).
data_set_type tfidf_driver( directory_listing_type & dir_list ) {
    struct timespec t_start;
    struct timespec t_counted;
    struct timespec t_finished;

    // ---- Phase 1: word count ----
    get_time( t_start );
    const size_t file_count = dir_list.size();
    std::vector<intm_map_type> catalog( file_count );

    asap::word_container_reducer<agg_map_type> allwords;
    cilk_for( size_t i=0; i < file_count; ++i ) {
        // Catalog slot owned by this iteration.
        intm_map_type & file_catalog = catalog[i];

        // Name of the i-th file in the listing.
        std::string path = *std::next( dir_list.cbegin(), i );

        // Words are gathered using intl_map_type internally and then merged
        // into this file's catalog slot.
        asap::word_catalog<intl_map_type>( std::string(path), file_catalog );

        // A std::map-based intl_map_type already yields sorted pairs; a
        // std::unordered_map-based one does not, hence the optional sort.
        if( do_sort ) {
            kv_sort<intm_map_type,can_sort>( file_catalog );
        }

        // Feeding the reducer from a std::vector rather than a std::map was
        // measured at about 10% faster on word count and 20% on TF/IDF
        // (16 threads).
        // TODO: replace by post-processing parallel multi-way merge?
        allwords.count_presence( file_catalog );
    }
    get_time( t_counted );

    // ---- Phase 2: TF/IDF ----
    // Move the aggregated words out of the reducer into shared storage.
    auto allwords_ptr = std::make_shared<agg_map_type>();
    allwords_ptr->swap( allwords.get_value() );

    // Move the directory listing into shared storage as well.
    auto dir_list_ptr = std::make_shared<directory_listing_type>();
    dir_list_ptr->swap( dir_list );

    asap::internal::assign_ids( allwords_ptr->begin(), allwords_ptr->end() );

    data_set_type data(
        by_words
            ? asap::tfidf_by_words<typename data_set_type::vector_type>(
                  catalog.cbegin(), catalog.cend(),
                  allwords_ptr, dir_list_ptr,
                  do_sort )       // whether catalogs are sorted
            : asap::tfidf<typename data_set_type::vector_type>(
                  catalog.cbegin(), catalog.cend(),
                  allwords_ptr, dir_list_ptr,
                  // NOTE(review): constant flags here, while the by_words
                  // branch passes do_sort -- confirm this is intended.
                  false, true )
        );
    get_time( t_finished );

    // ---- Report ----
    print_time( "word count", t_start, t_counted );
    print_time( "TF/IDF", t_counted, t_finished );
    std::cerr << "TF/IDF vectors: " << data.get_num_points() << '\n';
    std::cerr << "TF/IDF dimensions: " << data.get_dimensions() << '\n';
    print_time( "library", t_start, t_finished );

    return data;
}
Ejemplo n.º 2
0
// Builds the TF/IDF data set over n-grams for every file named in dir_list.
//
// Phase 1 ("ngram count"): one catalog per file is filled inside a cilk_for
// loop, optionally sorted, and registered with the allwords reducer.
// Phase 2 ("TF/IDF"): ids are assigned to the aggregated entries and the data
// set is assembled from the per-file catalogs.
//
// Side effect: the contents of dir_list are swapped into a shared list that
// is handed to the data set, so the caller's dir_list no longer holds the
// listing when this function returns.
//
// The map/data-set types and the do_sort, can_sort and by_words switches are
// declared outside this snippet (presumably template parameters or globals --
// confirm against the full file).
data_set_type tfidf_driver( directory_listing_type & dir_list ) {
    struct timespec t_start;
    struct timespec t_counted;
    struct timespec t_finished;

    // ---- Phase 1: n-gram count ----
    get_time( t_start );
    const size_t file_count = dir_list.size();
    std::vector<intm_map_type> catalog( file_count );

    asap::ngram_container_reducer<agg_map_type> allwords;
    // Growth parameters (1, 2); their meaning is defined by the container.
    allwords.get_value().set_growth( 1, 2 );

    cilk_for( size_t i=0; i < file_count; ++i ) {
        // Catalog slot owned by this iteration.
        intm_map_type & file_catalog = catalog[i];

        // Name of the i-th file in the listing.
        std::string path = *std::next( dir_list.cbegin(), i );

        file_catalog.set_growth( 1, 2 );
        size_t ngram_count
            = asap::ngram_catalog<intl_map_type>( path, file_catalog );

        // A std::map-based intl_map_type already yields sorted pairs; a
        // std::unordered_map-based one does not, hence the optional sort.
        if( do_sort ) {
            kv_sort<intm_map_type,can_sort>( file_catalog );
        }

        // std::cerr << path << ": " << ngram_count << " ngrams\n";
        allwords.count_presence( file_catalog );
    }
    get_time( t_counted );

    // ---- Phase 2: TF/IDF ----
    // Move the aggregated entries out of the reducer into shared storage.
    auto allwords_ptr = std::make_shared<agg_map_type>();
    allwords_ptr->swap( allwords.get_value() );

    // Move the directory listing into shared storage as well.
    auto dir_list_ptr = std::make_shared<directory_listing_type>();
    dir_list_ptr->swap( dir_list );

    asap::internal::assign_ids( allwords_ptr->begin(), allwords_ptr->end() );

    data_set_type data(
        by_words
            ? asap::tfidf_by_words<typename data_set_type::vector_type>(
                  catalog.cbegin(), catalog.cend(),
                  allwords_ptr, dir_list_ptr,
                  do_sort )       // whether catalogs are sorted
            : asap::tfidf<typename data_set_type::vector_type>(
                  catalog.cbegin(), catalog.cend(),
                  allwords_ptr, dir_list_ptr,
                  do_sort,        // whether sorted by word
                  do_sort )       // whether catalogs are sorted
        );
    get_time( t_finished );

    // ---- Report ----
    print_time( "ngram count", t_start, t_counted );
    print_time( "TF/IDF", t_counted, t_finished );
    std::cerr << "TF/IDF vectors: " << data.get_num_points() << '\n';
    std::cerr << "TF/IDF dimensions: " << data.get_dimensions() << '\n';
    print_time( "library", t_start, t_finished );

    return data;
}
Ejemplo n.º 3
0
// Smoke test for the index-table routines: parses the file at `path` into an
// index_table, then sanity-checks tf() and tfidf() on a few fixed arguments.
//
// Every call whose result is checked is made OUTSIDE assert(). assert()
// expands to nothing when NDEBUG is defined, so a call written inside it is
// skipped entirely in such builds; previously that left `table` unparsed and
// printed tf_val as -1.
//
// Returns 0. A failed check aborts through assert() (when asserts are enabled).
int tester(char* path) {

  printf("started.\n");

  // Document collection.
  index_table table;

  const int parse_status = parse(path, &table);
  assert(parse_status == 0);
  (void)parse_status;  // silence "unused" warnings when asserts are disabled
  assert(table.n > 0);
  assert(table.m > 0);

  // Each tf() result must be non-negative; the last one is printed.
  int tf_val = tf(1552, 20, &table);
  assert(tf_val > -1);
  tf_val = tf(152, 520, &table);
  assert(tf_val > -1);
  tf_val = tf(1234, 324, &table);
  assert(tf_val > -1);
  printf("%d\n", tf_val);

  const double tfidf_val = tfidf(24, &table);
  assert(tfidf_val > -1.0);
  (void)tfidf_val;  // silence "unused" warnings when asserts are disabled

  return 0;
}
Ejemplo n.º 4
0
// Program entry point.
//
// Steps, each timed and reported through print_time():
//   1. "init": seed rand(), parse the command line, report the number of
//      Cilk workers on std::cerr.
//   2. "directory listing": list the input directory `indir`.
//   3. Run tfidf_driver; the `intm_map` flag selects which intermediate
//      catalog type is used.
//   4. "output": write the data set as ARFF when `outfile` is set.
// Finally "complete time" covers the whole run.
//
// indir, outfile and intm_map are declared outside this snippet (presumably
// set by parse_args -- confirm against the full file).
int main(int argc, char **argv) {
    // ---- Compile-time type configuration ----
    using directory_listing_type =
        asap::word_list<std::deque<const char*>, asap::word_bank_managed>;

    using index_type = size_t;
    using word_bank_type = asap::word_bank_pre_alloc;

    using vector_type =
        asap::sparse_vector<index_type, float, false,
                            asap::mm_no_ownership_policy>;

    // Alternative configuration based on std::unordered_map, kept for
    // reference:
    /*
    using internal_map_type = asap::word_map<
        std::unordered_map<const char *, size_t, asap::text::charp_hash,
                           asap::text::charp_eql>,
        word_bank_type>;

    using intermediate_map_type =
        asap::kv_list<std::vector<std::pair<const char *, size_t>>,
                      word_bank_type>;

    using aggregate_map_type = asap::word_map<
        std::unordered_map<const char *,
                           asap::appear_count<size_t, index_type>,
                           asap::text::charp_hash, asap::text::charp_eql>,
        word_bank_type>;
    */

    // Hash tables for per-file word counts and for cross-document counts.
    using wc_unordered_map =
        asap::hash_table<const char *, size_t,
                         asap::text::charp_hash, asap::text::charp_eql>;
    using dc_unordered_map =
        asap::hash_table<const char *,
                         asap::appear_count<size_t, index_type>,
                         asap::text::charp_hash, asap::text::charp_eql>;

    using internal_map_type =
        asap::word_map<wc_unordered_map, word_bank_type>;
    using intermediate_map_type =
        asap::kv_list<std::vector<std::pair<const char*, size_t>>,
                      word_bank_type>;
    using aggregate_map_type =
        asap::word_map<dc_unordered_map, word_bank_type>;

    using data_set_type =
        asap::data_set<vector_type, aggregate_map_type,
                       directory_listing_type>;

    // ---- Initialisation ----
    struct timespec phase_begin, phase_end;
    struct timespec program_begin;

    srand( time(nullptr) );

    get_time( phase_begin );
    get_time( program_begin );

    // Read the command-line arguments.
    parse_args( argc, argv );

    std::cerr << "Available threads: " << __cilkrts_get_nworkers() << "\n";

    get_time( phase_end );
    print_time( "init", phase_begin, phase_end );

    // ---- Directory listing ----
    get_time( phase_begin );
    directory_listing_type dir_list;
    asap::get_directory_listing( indir, dir_list );
    get_time( phase_end );
    print_time( "directory listing", phase_begin, phase_end );

    // ---- TF/IDF ----
    // With intm_map set, the internal map type doubles as the intermediate
    // catalog type (last template argument false); otherwise the kv_list
    // based intermediate type is used (last template argument true).
    data_set_type tfidf(
        intm_map
            ? tfidf_driver<directory_listing_type, internal_map_type,
                           internal_map_type, aggregate_map_type,
                           data_set_type, false>( dir_list )
            : tfidf_driver<directory_listing_type, internal_map_type,
                           intermediate_map_type, aggregate_map_type,
                           data_set_type, true>( dir_list )
        );

    // ---- Output ----
    get_time( phase_begin );
    if( outfile ) {
        asap::arff_write( outfile, tfidf );
    }
    get_time( phase_end );
    print_time( "output", phase_begin, phase_end );

    print_time( "complete time", program_begin, phase_end );

    return 0;
}