Beispiel #1
0
int main(int argc, const char ** argv) {
  print_copyright();

  /* CE_Graph initialization will read the command line 
     arguments and the configuration file. */
  CE_Graph_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("item-cf2");    
  /* Basic arguments for application */
  min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);

  distance_metric          = get_option_int("distance", JACCARD_WEIGHT);
      if (distance_metric != JACCARD_WEIGHT)
    logstream(LOG_FATAL)<<"--distance_metrix=XX should be one of:9= JACCARD_WEIGHT" << std::endl;
  debug                    = get_option_int("debug", 0);
  parse_command_line_args();

  //if (distance_metric != JACKARD && distance_metric != AA && distance_metric != RA)
  //  logstream(LOG_FATAL)<<"Wrong distance metric. --distance_metric=XX, where XX should be either 0) JACKARD, 1) AA, 2) RA" << std::endl;  

  mytimer.start();
  int nshards          = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, true);

  assert(M > 0 && N > 0);

  //initialize data structure which saves a subset of the items (pivots) in memory
  adjcontainer = new adjlist_container();

  /* Run */
  ItemDistanceProgram program;
  CE_Graph_engine<VertexDataType, EdgeDataType> engine(training, nshards, true, m); 
  set_engine_flags(engine);

  //open output files as the number of operating threads
  out_files.resize(number_of_omp_threads());
  for (uint i=0; i< out_files.size(); i++){
    char buf[256];
    sprintf(buf, "%s.out%d", training.c_str(), i);
    out_files[i] = open_file(buf, "w");
  }

  //run the program
  engine.run(program, niters);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  std::cout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << written_pairs << std::endl;

  for (uint i=0; i< out_files.size(); i++)
    fclose(out_files[i]);

  std::cout<<"Created output files with the format: " << training << ".outXX, where XX is the output thread number" << std::endl; 

  return 0;
}
int main(int argc, const char ** argv) {
    print_copyright();

    /* GraphChi initialization will read the command line
       arguments and the configuration file. */
    graphchi_init(argc, argv);

    /* Metrics object for keeping track of performance counters
       and other information. Currently required. */
    metrics m("itemsim2rating2");

    /* Basic arguments for application */
    min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);
    debug                    = get_option_int("debug", 0);
    parse_command_line_args();
    std::string similarity   = get_option_string("similarity", "");
    if (similarity == "")
        Rcpp::Rcerr<<"Missing similarity input file. Please specify one using the --similarity=filename command line flag" << std::endl;
    undirected               = get_option_int("undirected", 1);
    Q                        = get_option_float("Q", Q);
    K 			   = get_option_int("K");

    mytimer.start();
    vec unused;
    int nshards          = convert_matrixmarket_and_item_similarity<edge_data>(training, similarity, 3, unused);

    assert(M > 0 && N > 0);

    //initialize data structure which saves a subset of the items (pivots) in memory
    adjcontainer = new adjlist_container();

    //array for marking which items are conected to the pivot items via users.
    relevant_items = new bool[N];

    /* Run */
    ItemDistanceProgram program;
    graphchi_engine<VertexDataType, edge_data> engine(training, nshards, true, m);
    set_engine_flags(engine);

    out_file = open_file((training + "-rec").c_str(), "w");

    //run the program
    engine.run(program, niters);

    /* Report execution metrics */
    if (!quiet)
        metrics_report(m);

    Rcpp::Rcout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << written_pairs << std::endl;

    if (zero_edges)
        Rcpp::Rcout<<"Found: " << zero_edges<< " user edges with weight zero. Those are ignored." <<std::endl;

    delete[] relevant_items;
    fclose(out_file);
    return 0;
}
Beispiel #3
0
int main(int argc, const char ** argv) {
  //* GraphChi initialization will read the command line arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("climf-inmemory-factors");

  /* Basic arguments for application. NOTE: File will be automatically 'sharded'. */
  sgd_lambda    = get_option_float("sgd_lambda", 1e-3);
  sgd_gamma     = get_option_float("sgd_gamma", 1e-4);
  sgd_step_dec  = get_option_float("sgd_step_dec", 1.0);
  binary_relevance_thresh = get_option_float("binary_relevance_thresh", 0);
  halt_on_mrr_decrease = get_option_int("halt_on_mrr_decrease", 0);
  num_ratings = get_option_int("num_ratings", 10000); //number of top predictions over which we compute actual MRR
  verbose     = get_option_int("verbose", 0);
  debug       = get_option_int("debug", 0);

  parse_command_line_args();
  parse_implicit_command_line();

  /* Preprocess data if needed, or discover preprocess files */
  bool allow_square = false;
  int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, allow_square);
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, !load_factors_from_file, 0.01);

  if (validation != ""){
    int vshards = convert_matrixmarket<EdgeDataType>(validation, 0, 0, 3, VALIDATION);
    init_mrr_engine<VertexDataType, EdgeDataType>(pvalidation_engine, vshards);
  }

  if (load_factors_from_file)
  {
    load_matrix_market_matrix(training + "_U.mm", 0, D);
    load_matrix_market_matrix(training + "_V.mm", M, D);
  }

  print_config();

  /* Run */
  SGDVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m);
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, niters);

  /* Output latent factor matrices in matrix-market format */
  output_sgd_result(training);
  test_predictions(&climf_predict);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  return 0;
}
 binary_adjacency_list_writer(std::string filename) : filename(filename) {
     bufsize = (int) get_option_int("preprocessing.bufsize", 64 * 1024 * 1024);
     assert(bufsize > 1024 * 1024);
     fd = open(filename.c_str(), O_WRONLY | O_CREAT, S_IROTH | S_IWOTH | S_IWUSR | S_IRUSR);
     if (fd < 0) {
         logstream(LOG_FATAL) << "Could not open file " << filename << " for writing. " <<
         " Error: " << strerror(errno) << std::endl;
     }
     int res = ftruncate(fd, 0);
     if (res != 0) {
         logstream(LOG_FATAL) << "Could not truncate file " << filename <<
             " Error: " << strerror(errno) << std::endl;
     }
     assert(res == 0);
     
     header.format_version = FORMAT_VERSION;
     header.max_vertex_id = 0;
     header.contains_edge_values = false;
     header.numedges = 0;
     header.edge_value_size = (uint32_t) sizeof(EdgeDataType);
     
     buf = (char*) malloc(bufsize);
     bufptr = buf;
     bwrite<bin_adj_header>(fd, buf, bufptr,  header);
     counter = 0;
     lastid = 0;
     initialized = false;
     assert(fd >= 0);
 }
Beispiel #5
0
int main(int argc, const char ** argv) {

  print_copyright();
 
  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("als-inmemory-factors");

  lambda        = get_option_float("lambda", 0.065);
  user_sparsity = get_option_float("user_sparsity", 0.9);
  movie_sparsity = get_option_float("movie_sparsity", 0.9);
  algorithm      = get_option_int("algorithm", SPARSE_USR_FACTOR);

  parse_command_line_args();
  parse_implicit_command_line(); 

  if (user_sparsity < 0.5 || user_sparsity >= 1)
    logstream(LOG_FATAL)<<"Sparsity level should be [0.5,1). Please run again using --user_sparsity=XX in this range" << std::endl;

  if (movie_sparsity < 0.5 || movie_sparsity >= 1)
    logstream(LOG_FATAL)<<"Sparsity level should be [0.5,1). Please run again using --movie_sparsity=XX in this range" << std::endl;

if (algorithm != SPARSE_USR_FACTOR && algorithm != SPARSE_BOTH_FACTORS && algorithm != SPARSE_ITM_FACTOR)
    logstream(LOG_FATAL)<<"Algorithm should be 1 for SPARSE_USR_FACTOR, 2 for SPARSE_ITM_FACTOR and 3 for SPARSE_BOTH_FACTORS" << std::endl;

  /* Preprocess data if needed, or discover preprocess files */
  int nshards = convert_matrixmarket<EdgeDataType>(training);
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, !load_factors_from_file);
  if (validation != ""){
    int vshards = convert_matrixmarket<EdgeDataType>(validation, 0, 0, 3, VALIDATION);
    init_validation_rmse_engine<VertexDataType, EdgeDataType>(pvalidation_engine, vshards, &sparse_als_predict);
  }
 

  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
    load_matrix_market_matrix(training + "_V.mm", M, D);
  }

  /* Run */
  ALSVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, niters);

  /* Output latent factor matrices in matrix-market format */
  output_als_result(training);
  test_predictions(&sparse_als_predict);    

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  return 0;
}
Beispiel #6
0
int main(int argc, const char ** argv) {

    print_copyright();

    //* GraphChi initialization will read the command line arguments and the configuration file. */
    graphchi_init(argc, argv);

    /* Metrics object for keeping track of performance counters
       and other information. Currently required. */
    metrics m("rbm-inmemory-factors");

    /* Basic arguments for RBM algorithm */
    rbm_bins      = get_option_int("rbm_bins", rbm_bins);
    rbm_alpha     = get_option_float("rbm_alpha", rbm_alpha);
    rbm_beta      = get_option_float("rbm_beta", rbm_beta);
    rbm_mult_step_dec  = get_option_float("rbm_mult_step_dec", rbm_mult_step_dec);
    rbm_scaling   = get_option_float("rbm_scaling", rbm_scaling);

    parse_command_line_args();
    parse_implicit_command_line();

    mytimer.start();

    /* Preprocess data if needed, or discover preprocess files */
    int nshards = convert_matrixmarket<float>(training);

    rbm_init();

    if (validation != "") {
        int vshards = convert_matrixmarket<EdgeDataType>(validation, 0, 0, 3, VALIDATION);
        init_validation_rmse_engine<VertexDataType, EdgeDataType>(pvalidation_engine, vshards, &rbm_predict);
    }

    /* load initial state from disk (optional) */
    if (load_factors_from_file) {
        load_matrix_market_matrix(training + "_U.mm", 0, 3*D);
        load_matrix_market_matrix(training + "_V.mm", M, rbm_bins*(D+1));
    }

    print_config();

    /* Run */
    RBMVerticesInMemProgram program;
    graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m);
    set_engine_flags(engine);
    pengine = &engine;
    engine.run(program, niters);

    /* Output latent factor matrices in matrix-market format */
    output_rbm_result(training);
    test_predictions(&rbm_predict);


    /* Report execution metrics */
    if (!quiet)
        metrics_report(m);
    return 0;
}
Beispiel #7
0
 /**
   * Constructor
   * @param base_filename base file prefix
   */
 degree_data(std::string base_filename, stripedio * iomgr) : iomgr(iomgr), loaded_chunk(NULL) {
     vertex_st = vertex_en = 0;
     this->base_filename = base_filename;
     use_mmap = get_option_int("mmap", 0);  // Whether to mmap the degree file to memory
     if (use_mmap) {
         logstream(LOG_INFO) << "Use memory mapping for degree data." << std::endl;
     }
     open_file(base_filename);
 }
Beispiel #8
0
int main(int argc, const char ** argv) {

	print_copyright();

	//* GraphChi initialization will read the command line arguments and the configuration file. */
	graphchi_init(argc, argv);

	/* Metrics object for keeping track of performance counters
	   and other information. Currently required. */

	/* Basic arguments for application. NOTE: File will be automatically 'sharded'. */
	beta       = get_option_float("beta", 1);
	debug      = get_option_int("debug", 0);

	parse_command_line_args();
	parse_implicit_command_line();
	D          = 0; //no feature vector is needed
	binary_relevance_threshold = 0; //treat all edge values as binary

	/* Preprocess data if needed, or discover preprocess files */
	int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, false);
	init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, !load_factors_from_file);
       
	//read initial vector from file 
	std::cout << "Load CTR vector from file" << training << ":vec" << std::endl;
	load_matrix_market_vector(training + ":vec", Y_POS, false, false);

        mu_ij    = zeros(M+N);
        sigma_ij = ones(M+N);

	if (validation != ""){
                //read validation data (optional)
		vshards = convert_matrixmarket<EdgeDataType>(validation, 0, 0, 3, VALIDATION, false);
		validation_targets = load_matrix_market_vector(validation + ":vec", false, false);                
                Me = validation_targets.size();
	}


	print_config();

	/* Run */
	AdPredictorVerticesInMemProgram program;
	metrics m("adpredictor");
	graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
        set_engine_flags(engine);
	pengine = &engine;
	engine.run(program, niters);

	/* Output latent factor matrices in matrix-market format */
	output_adpredictor_result(training);

	/* Report execution metrics */
	if (!quiet)
		metrics_report(m);

	return 0;
}
Beispiel #9
0
int main(int argc, const char ** argv) {

  print_copyright();
 
  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("pmf-inmemory-factors");

  lambda        = get_option_float("lambda", 0.065);
  debug        = get_option_int("debug", debug);
  pmf_burn_in  = get_option_int("pmf_burn_in", pmf_burn_in);
  pmf_additional_output = get_option_int("pmf_additional_output", pmf_additional_output);
  
  parse_command_line_args();
  parse_implicit_command_line();


  /* Preprocess data if needed, or discover preprocess files */
  int nshards = convert_matrixmarket<edge_data>(training, NULL, 0, 0, 3, TRAINING, false);
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, !load_factors_from_file);
  init_pmf();

  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
    load_matrix_market_matrix(training + "_V.mm", M, D);
  }

  /* Run */
  PMFVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine, true);
  pengine = &engine;
  engine.run(program, niters);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);

  return 0;
}
Beispiel #10
0
void parse_implicit_command_line(){
   implicitratingweight = get_option_float("implicitratingweight", implicitratingweight);
   implicitratingvalue = get_option_float("implicitratingvalue", implicitratingvalue);
   implicitratingtype = get_option_int("implicitratingtype", implicitratingtype);
   if (implicitratingtype != IMPLICIT_RATING_RANDOM && implicitratingtype != IMPLICIT_RATING_DISABLED)
     logstream(LOG_FATAL)<<"Implicit rating type should be either 0 (IMPLICIT_RATING_DISABLED) or 1 (IMPLICIT_RATING_RANDOM)" << std::endl;
   implicitratingpercentage = get_option_float("implicitratingpercentage", implicitratingpercentage);
   if (implicitratingpercentage < 1e-8 && implicitratingpercentage > 0.8)
     logstream(LOG_FATAL)<<"Implicit rating percentage should be (1e-8, 0.8)" << std::endl;
}
Beispiel #11
0
 vertex_data_store(std::string base_filename, size_t nvertices, stripedio * iomgr) : iomgr(iomgr), loaded_chunk(NULL){
     vertex_st = vertex_en = 0;
     filename = filename_vertex_data<VertexDataType>(base_filename);
     
     mmap_file = NULL;
     last_nvertices = 0;
     use_mmap = get_option_int("mmap", 0);  // Whether to mmap the degree file to memory
     if (use_mmap) {
         logstream(LOG_INFO) << "Use memory mapping for vertex data." << std::endl;
         check_size(nvertices);
     } else {
         check_size(nvertices);
         open_file();
     }
     
 }    
 virtual void do_report(std::string name, std::string ident, std::map<std::string, metrics_entry> & entries) {
     if (ident != name) {
         fprintf(f, "[%s:%s]\n", name.c_str(), ident.c_str());
     } else {
         fprintf(f, "[%s]\n", name.c_str());
     }
     std::map<std::string, metrics_entry>::iterator it;
     
     for(it = entries.begin(); it != entries.end(); ++it) {
         metrics_entry ent = it->second;
         switch(ent.valtype) {
             case INTEGER:
                 
                 fprintf(f, "%s.%s=%ld\n", ident.c_str(), it->first.c_str(), (long int) (ent.value));
                 fprintf(f, "%s.%s.count=%lu\n", ident.c_str(), it->first.c_str(), ent.count);
                 fprintf(f, "%s.%s.min=%ld\n", ident.c_str(), it->first.c_str(), (long int) (ent.minvalue));
                 fprintf(f, "%s.%s.max=%ld\n", ident.c_str(), it->first.c_str(), (long int) (ent.maxvalue));
                 fprintf(f, "%s.%s.avg=%lf\n", ident.c_str(), it->first.c_str(), ent.cumvalue/ent.count);
                 break;
             case REAL:
             case TIME:
                 fprintf(f, "%s.%s=%lf\n", ident.c_str(), it->first.c_str(),  (ent.value));
                 fprintf(f, "%s.%s.count=%lu\n", ident.c_str(), it->first.c_str(), ent.count);
                 fprintf(f, "%s.%s.min=%lf\n", ident.c_str(), it->first.c_str(),  (ent.minvalue));
                 fprintf(f, "%s.%s.max=%lf\n", ident.c_str(), it->first.c_str(),  (ent.maxvalue));
                 fprintf(f, "%s.%s.avg=%lf\n", ident.c_str(), it->first.c_str(), ent.cumvalue/ent.count);
                 break;
             case STRING:
                 fprintf(f, "%s.%s=%s\n", ident.c_str(), it->first.c_str(), it->second.stringval.c_str());                                
                 break;
             case VECTOR:
                 break;
         }
     }
     
     fflush(f);        
     fclose(f);
     
   // Following code used only for research purposes.
     if (get_option_int("metrics.insert_to_db", 0) == 1) {
         std::string cmd = "python2.7 benchtodb.py " + filename;
         int err = system(cmd.c_str());
         if (err != 0) {
           std::cout << "Error running the python script." << std::endl;
         }
     }
 };
Beispiel #13
0
 bool check_origfile_modification_earlier(std::string basefilename, int nshards) {
     /* Compare last modified dates of the original graph and the shards */
     if (file_exists(basefilename) && get_option_int("disable-modtime-check", 0) == 0) {
         struct stat origstat, shardstat;
         int err1 = stat(basefilename.c_str(), &origstat);
         
         std::string adjfname = filename_shard_adj(basefilename, 0, nshards);
         int err2 = stat(adjfname.c_str(), &shardstat);
         
         if (err1 != 0 || err2 != 0) {
             logstream(LOG_ERROR) << "Error when checking file modification times:  " << strerror(errno) << std::endl;
             return nshards;
         }
         
         if (origstat.st_mtime > shardstat.st_mtime) {
             logstream(LOG_INFO) << "The input graph modification date was newer than of the shards." << std::endl;
             logstream(LOG_INFO) << "Going to delete old shards and recreate new ones. To disable " << std::endl;
             logstream(LOG_INFO) << "functionality, specify --disable-modtime-check=1" << std::endl;
             
             // Delete shards
             delete_shards<EdgeDataType>(basefilename, nshards);
             
             // Delete the bin-file
             std::string preprocfile = preprocess_filename<EdgeDataType>(basefilename);
             if (file_exists(preprocfile)) {
                 logstream(LOG_DEBUG) << "Deleting: " << preprocfile << std::endl;
                 int err = remove(preprocfile.c_str());
                 if (err != 0) {
                     logstream(LOG_ERROR) << "Error deleting file: " << preprocfile << ", " <<
                     strerror(errno) << std::endl;
                 }
             }
             return false;
         } else {
             return true;
         }
     
         
     }
     return true;
 }
Beispiel #14
0
void parse_parser_command_line_arges(){
  fc.string_features = get_option_string("features", fc.default_feature_str);
   csv = get_option_int("csv", 0);
  if (csv) 
    ptokens = csv_tokens;
  
  file_columns = get_option_int("file_columns", file_columns); //get the number of columns in the edge file
  //input sanity checks
  if (file_columns < 3)
    logstream(LOG_FATAL)<<"You must have at least 3 columns in input file: [from] [to] [value] on each line"<<std::endl;
  if (file_columns >= FEATURE_WIDTH)
    logstream(LOG_FATAL)<<"file_columns exceeds the allowed storage limit - please increase FEATURE_WIDTH and recompile." << std::endl;
  fc.from_pos = get_option_int("from_pos", fc.from_pos);
  fc.to_pos = get_option_int("to_pos", fc.to_pos);
  fc.val_pos = get_option_int("val_pos", fc.val_pos);
  if (fc.from_pos >= file_columns || fc.to_pos >= file_columns || fc.val_pos >= file_columns)
    logstream(LOG_FATAL)<<"Please note that column numbering of from_pos, to_pos and val_pos starts from zero and should be smaller than file_columns" << std::endl;
  if (fc.from_pos == fc.to_pos || fc.from_pos == fc.val_pos || fc.to_pos == fc.val_pos)
    logstream(LOG_FATAL)<<"from_pos, to_pos and val_pos should have different values" << std::endl; 
  if (fc.val_pos == -1)
    logstream(LOG_FATAL)<<"you must specify a target column using --val_pos=XXX. Colmn index starts from 0." << std::endl;
  has_header_titles = get_option_int("has_header_titles", has_header_titles);
  limit_rating= get_option_int("limit_rating", 0); 
  //parse features (optional)
  if (fc.string_features != ""){
    char * pfeatures = strdup(fc.string_features.c_str());
    char * pch = strtok(pfeatures, ptokens);
    int node = atoi(pch);
    if (node < 0 || node >= MAX_FEATURES+3)
      logstream(LOG_FATAL)<<"Feature id using the --features=XX command should be non negative, starting from zero"<<std::endl;
    if (node >= file_columns)
      logstream(LOG_FATAL)<<"Feature id using the --feature=XX command should be < file_columns (counting starts from zero)" << std::endl;
    if (node == fc.from_pos || node == fc.to_pos || node == fc.val_pos)
      logstream(LOG_FATAL)<<"Feature id " << node << " can not be equal to --from_pos, --to_pos or --val_pos " << std::endl;
    fc.feature_selection[node] = true;
    fc.total_features++;
    while ((pch = strtok(NULL, ptokens))!= NULL){
      node = atoi(pch);
      if (node < 0 || node >= MAX_FEATURES+3)
        logstream(LOG_FATAL)<<"Feature id using the --features=XX command should be non negative, starting from zero"<<std::endl;
      fc.feature_selection[node] = true;
      fc.total_features++;
    }
  }
  train_only = get_option_int("train_only", 0);
  validation_only = get_option_int("validation_only", 0);
  fc.node_id_maps.resize(2+fc.total_features);
  real_features_string = get_option_string("real_features", real_features_string);
   //parse real features (optional)
  if (real_features_string != ""){
    int i=0;
    char * pfeatures = strdup(real_features_string.c_str());
    char * pch = strtok(pfeatures, ptokens);
    int node = atoi(pch);
    if (node < 0 || node >= MAX_FEATURES+3)
      logstream(LOG_FATAL)<<"Feature id using the --real_features=XX command should be non negative, starting from zero"<<std::endl;
    if (node >= file_columns)
      logstream(LOG_FATAL)<<"Feature id using the --real_feature=XX command should be < file_columns (counting starts from zero)" << std::endl;
    if (node == fc.from_pos || node == fc.to_pos || node == fc.val_pos)
      logstream(LOG_FATAL)<<"Feature id " << node << " can not be equal to --from_pos, --to_pos or --val_pos " << std::endl;
    fc.real_features_indicators[node] = true;
    fc.feature_positions[node] = i;
    i++;
    while ((pch = strtok(NULL, ptokens))!= NULL){
      node = atoi(pch);
      if (node < 0 || node >= MAX_FEATURES+3)
        logstream(LOG_FATAL)<<"Feature id using the --real_features=XX command should be non negative, starting from zero"<<std::endl;
      fc.real_features_indicators[node] = true;
      fc.feature_positions[node] = i;
      i++;
    }
  }

 
}
Beispiel #15
0
int main(int argc, const char ** argv) {

  print_copyright();

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("item-cf");    
  /* Basic arguments for application */
  min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);
  distance_metric          = get_option_int("distance", JACCARD);
  asym_cosine_alpha        = get_option_float("asym_cosine_alpha", 0.5);
  debug                    = get_option_int("debug", debug);
  if (distance_metric != JACCARD && distance_metric != AA && distance_metric != RA && distance_metric != ASYM_COSINE && distance_metric != PROB)
    logstream(LOG_FATAL)<<"Wrong distance metric. --distance_metric=XX, where XX should be either 0= JACCARD, 1= AA, 2= RA, 3= ASYM_COSINE, 4 = PROB" << std::endl;  
  parse_command_line_args();

  mytimer.start();
  int nshards          = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, false);
  if (nshards != 1)
    logstream(LOG_FATAL)<<"This application currently supports only 1 shard" << std::endl;
  K                        = get_option_int("K", K);
  if (K <= 0)
    logstream(LOG_FATAL)<<"Please specify the number of ratings to generate for each user using the --K command" << std::endl;

 logstream(LOG_INFO) << "M = " << M << std::endl;
  assert(M > 0 && N > 0);
  //initialize data structure which saves a subset of the items (pivots) in memory
  adjcontainer = new adjlist_container();
  //array for marking which items are conected to the pivot items via users.
  relevant_items = new bool[N];

  //store node degrees in an array to be used for AA distance metric
  if (distance_metric == AA || distance_metric == RA || distance_metric == PROB)
    latent_factors_inmem.resize(M);
  if (distance_metric == PROB)
    prob_sim_normalization_constant = (double)L / (double)(M*N-L);


  /* Run */
  ItemDistanceProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, 1, true, m); 
  set_engine_flags(engine);
  engine.set_maxwindow(M+N+1);

  //open output files as the number of operating threads
  out_files.resize(number_of_omp_threads());
  for (uint i=0; i< out_files.size(); i++){
    char buf[256];
    sprintf(buf, "%s.out%d", training.c_str(), i);
    out_files[i] = open_file(buf, "w");
  }

  //run the program
  engine.run(program, niters);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  std::cout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << sum(written_pairs) << " pairs with zero distance: " << zero_dist << std::endl;
  if (not_enough)
    logstream(LOG_WARNING)<<"Items that did not have enough similar items: " << not_enough << std::endl;
 
  for (uint i=0; i< out_files.size(); i++)
    fclose(out_files[i]);

  delete[] relevant_items;

  /* write the matrix market info header to be used later */
  FILE * pmm = fopen((training + "-topk:info").c_str(), "w");
  if (pmm == NULL)
    logstream(LOG_FATAL)<<"Failed to open " << training << ":info to file" << std::endl;
  fprintf(pmm, "%%%%MatrixMarket matrix coordinate real general\n");
  fprintf(pmm, "%u %u %u\n", N, N, (unsigned int)sum(written_pairs));
  fclose(pmm);

  /* sort output files */
  logstream(LOG_INFO)<<"Going to sort and merge output files " << std::endl;
  std::string dname= dirname(strdup(argv[0]));
  system(("bash " + dname + "/topk.sh " + std::string(basename(strdup(training.c_str())))).c_str()); 

  return 0;
}
Beispiel #16
0
int main(int argc, const char ** argv) {
  print_copyright();

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("itemsim2rating2");    

  /* Basic arguments for application */
  min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);
  debug                    = get_option_int("debug", 0);
  parse_command_line_args();
  std::string similarity   = get_option_string("similarity", "");
  if (similarity == "")
    logstream(LOG_FATAL)<<"Missing similarity input file. Please specify one using the --similarity=filename command line flag" << std::endl;
  undirected               = get_option_int("undirected", 0);
  
  mytimer.start();

  int nshards          = convert_matrixmarket_and_item_similarity<edge_data>(training, similarity, 3, &degrees);

  assert(M > 0 && N > 0);
  prob_sim_normalization_constant = (double)L / (double)(M*N-L);
  
  //initialize data structure which saves a subset of the items (pivots) in memory
  adjcontainer = new adjlist_container();

  //array for marking which items are conected to the pivot items via users.
  relevant_items = new bool[N];

  /* Run */
  ItemDistanceProgram program;
  graphchi_engine<VertexDataType, edge_data> engine(training, nshards, true, m); 
  set_engine_flags(engine);

  //open output files as the number of operating threads
  out_files.resize(number_of_omp_threads());
  for (uint i=0; i< out_files.size(); i++){
    char buf[256];
    sprintf(buf, "%s-rec.out%d", training.c_str(), i);
    out_files[i] = open_file(buf, "w");
  }


  K 			   = get_option_int("K");
  assert(K > 0);
  //run the program
  engine.run(program, niters);

  for (uint i=0; i< out_files.size(); i++)
    fclose(out_files[i]);
  
  delete[] relevant_items;


  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);

  std::cout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << sum(written_pairs) << std::endl;

  logstream(LOG_INFO)<<"Going to sort and merge output files " << std::endl;
  std::string dname= dirname(strdup(argv[0]));
  system(("bash " + dname + "/topk.sh " + std::string(basename(strdup((training+"-rec").c_str())))).c_str()); 


  return 0;
}
int main(int argc, const char ** argv) {

  print_copyright();
 
  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("label_propagation");

  alpha        = get_option_float("alpha", alpha);
  debug        = get_option_int("debug", debug);
  
  parse_command_line_args();


  //load graph (adj matrix) from file
  int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, true);
  if (M != N)
    logstream(LOG_FATAL)<<"Label propagation supports only square matrices" << std::endl;

  init_feature_vectors<std::vector<vertex_data> >(M, latent_factors_inmem, false);
  
  //load seed initialization from file
  load_matrix_market_matrix(training + ".seeds", 0, D);

  #pragma omp parallel for
  for (int i=0; i< (int)M; i++){

    //normalize seed probabilities to sum up to one
    if (latent_factors_inmem[i].seed){
      assert(sum(latent_factors_inmem[i].pvec) != 0);
      latent_factors_inmem[i].pvec /= sum(latent_factors_inmem[i].pvec);
      continue;
    }
    //other nodes get random label probabilities
    for (int j=0; j< D; j++)
       latent_factors_inmem[i].pvec[j] = drand48();
  }

  /* load initial state from disk (optional) */
  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
  }

  /* Run */
  LPVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, niters);

  /* Output latent factor matrices in matrix-market format */
  output_lp_result(training);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  return 0;
}
Beispiel #18
0
int main(int argc, const char ** argv) {


  print_copyright();  

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("libfm");

  //specific command line parameters for libfm
  libfm_rate = get_option_float("libfm_rate", libfm_rate);
  libfm_regw = get_option_float("libfm_regw", libfm_regw);
  libfm_regv = get_option_float("libfm_regv", libfm_regv);
  libfm_mult_dec = get_option_float("libfm_mult_dec", libfm_mult_dec);
  D = get_option_int("D", D);

  parse_command_line_args();
  parse_implicit_command_line();

  /* Preprocess data if needed, or discover preprocess files */
  int nshards = convert_matrixmarket4<edge_data>(training, false);
  init_libfm();
  if (validation != ""){
    int vshards = convert_matrixmarket4<EdgeDataType>(validation, true, M==N, VALIDATION);
    init_validation_rmse_engine<VertexDataType, EdgeDataType>(pvalidation_engine, vshards, &libfm_predict, false, true, 1);
   }


  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
    load_matrix_market_matrix(training + "_V.mm", M, D);
    load_matrix_market_matrix(training + "_T.mm", M+N, D);
    load_matrix_market_matrix(training + "_L.mm", M+N+K, D);
    vec user_bias =      load_matrix_market_vector(training +"_U_bias.mm", false, true);
    vec item_bias =      load_matrix_market_vector(training +"_V_bias.mm", false, true);
    vec time_bias =      load_matrix_market_vector(training+ "_T_bias.mm", false, true);
    vec last_item_bias = load_matrix_market_vector(training+"_L_bias.m", false, true);
    for (uint i=0; i<M+N+K+M; i++){
      if (i < M)
        latent_factors_inmem[i].bias = user_bias[i];
      else if (i <M+N)
        latent_factors_inmem[i].bias = item_bias[i-M];
      else if (i <M+N+K)
        latent_factors_inmem[i].bias = time_bias[i-M-N];
      else 
        latent_factors_inmem[i].bias = last_item_bias[i-M-N-K];
    }
    vec gm = load_matrix_market_vector(training + "_global_mean.mm", false, true);
    globalMean = gm[0];
}


  /* Run */
  LIBFMVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, niters);

  /* Output test predictions in matrix-market format */
  output_libfm_result(training);
  test_predictions3(&libfm_predict, 1);    

  /* Report execution metrics */
  if (!quiet) 
    metrics_report(m);
  return 0;
}
int main(int argc, const char ** argv) {

 // print_copyright();
  write_copyright();
  //* GraphChi initialization will read the command line arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("sgd-inmemory-factors");

  /* Basic arguments for application. NOTE: File will be automatically 'sharded'. */
  sgd_lambda    = get_option_float("sgd_lambda", 1e-3);
  sgd_gamma     = get_option_float("sgd_gamma", 1e-3);
  sgd_step_dec  = get_option_float("sgd_step_dec", 0.9);

  int file_format   = get_option_int("ff", 3);


  parse_command_line_args();
  parse_implicit_command_line();

  /* Preprocess data if needed, or discover preprocess files */
  int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, file_format, TRAINING, false);
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, !load_factors_from_file);
  if (validation != ""){
    int vshards = convert_matrixmarket<EdgeDataType>(validation, 0, 0, 3, VALIDATION, false);
    init_validation_rmse_engine<VertexDataType, EdgeDataType>(pvalidation_engine, vshards, &sgd_predict);
  }

  /* load initial state from disk (optional) */
  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
    load_matrix_market_matrix(training + "_V.mm", M, D);
  }

  print_config();

  /* Run */
  SGDVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine);
  pengine = &engine;

  timer train_timer;
  engine.run(program, niters);
 // std::cout << "Trn Time for file test: " << std::setw(10) << train_timer.current_time() / niters << std::endl;

  std::ofstream ofs(result.c_str(), std::ofstream::out | std::ofstream::app);
  ofs << D << " " << train_timer.current_time() << " ";


  /* Run TopN program */
  n_top = get_option_int("n_int", 10);
  /*timer test_timer1;

  ofs << test_timer1.current_time() << " ";*/
  //run_general_topn_program(pengine, &latent_factors_inmem, &sgd_predict);
  timer index_timer;
  kd_Node* mroot = init_kdtree(&latent_factors_inmem);
  ofs << index_timer.current_time() << " ";
  timer test_timer;
  /* construct kd tree index */ 
//  ofs << "constructing index: " << test_timer.current_time() << " ";
  run_kd_topn_program(pengine, &latent_factors_inmem, mroot);

 // std::coua << "Tst Time: " << std::setw(10) << test_timer.current_time() << std::endl;
  ofs << test_timer.current_time() << std::endl;
  ofs.close();
  /* Output latent factor matrices in matrix-market format */
  output_sgd_result(training);
  test_predictions(&sgd_predict);    
  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  return 0;
}
Beispiel #20
0
int main(int argc, const char ** argv) {

  print_copyright();

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("item-cf");    
  /* Basic arguments for application */
  min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);
  distance_metric          = get_option_int("distance", JACCARD);
  asym_cosine_alpha        = get_option_float("asym_cosine_alpha", 0.5);
  if (distance_metric != JACCARD && distance_metric != AA && distance_metric != RA && distance_metric != ASYM_COSINE)
    logstream(LOG_FATAL)<<"Wrong distance metric. --distance_metric=XX, where XX should be either 0) JACCARD, 1) AA, 2) RA, 3) ASYM_COSINE" << std::endl;  
  parse_command_line_args();

  mytimer.start();
  int nshards          = convert_matrixmarket<EdgeDataType>(training/*, orderByDegreePreprocessor*/);
  if (nshards != 1)
    logstream(LOG_FATAL)<<"This application currently supports only 1 shard" << std::endl;
  K                        = get_option_int("K", K);
  if (K <= 0)
    logstream(LOG_FATAL)<<"Please specify the number of ratings to generate for each user using the --K command" << std::endl;

  assert(M > 0 && N > 0);
  //initialize data structure which saves a subset of the items (pivots) in memory
  adjcontainer = new adjlist_container();
  //array for marking which items are conected to the pivot items via users.
  relevant_items = new bool[N];

  //store node degrees in an array to be used for AA distance metric
  if (distance_metric == AA || distance_metric == RA)
    latent_factors_inmem.resize(M);

  /* Run */
  ItemDistanceProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, true, m); 
  set_engine_flags(engine);
  engine.set_maxwindow(M+N+1);

  //open output files as the number of operating threads
  out_files.resize(number_of_omp_threads());
  for (uint i=0; i< out_files.size(); i++){
    char buf[256];
    sprintf(buf, "%s.out%d", training.c_str(), i);
    out_files[i] = open_file(buf, "w");
  }

  //run the program
  engine.run(program, niters);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  std::cout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << sum(written_pairs) << " pairs with zero distance: " << zero_dist << std::endl;
  if (not_enough)
    logstream(LOG_WARNING)<<"Items that did not have enough similar items: " << not_enough << std::endl;
  for (uint i=0; i< out_files.size(); i++){
    fflush(out_files[i]);
    fclose(out_files[i]);
  }

  std::cout<<"Created "  << number_of_omp_threads() << " output files with the format: " << training << ".outXX, where XX is the output thread number" << std::endl; 

  delete[] relevant_items;
  return 0;
}
Beispiel #21
0
int main(int argc, const char ** argv) {

  mytimer.start();
  print_copyright();

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("rating2");

  knn_sample_percent = get_option_float("knn_sample_percent", 1.0);
  if (knn_sample_percent <= 0 || knn_sample_percent > 1)
    logstream(LOG_FATAL)<<"Sample percente should be in the range (0, 1] " << std::endl;

  num_ratings   = get_option_int("num_ratings", 10);
  if (num_ratings <= 0)
    logstream(LOG_FATAL)<<"num_ratings, the number of recomended items for each user, should be >=1 " << std::endl;

  debug         = get_option_int("debug", 0);
  tokens_per_row = get_option_int("tokens_per_row", tokens_per_row);
  std::string algorithm     = get_option_string("algorithm");
  /* Basic arguments for RBM algorithm */
  rbm_bins      = get_option_int("rbm_bins", rbm_bins);
  rbm_scaling   = get_option_float("rbm_scaling", rbm_scaling);

  if (algorithm == "svdpp" || algorithm == "svd++")
    algo = SVDPP;
  else if (algorithm == "biassgd")
    algo = BIASSGD;
  else if (algorithm == "rbm")
    algo = RBM;
  else logstream(LOG_FATAL)<<"--algorithm should be svd++ or biassgd or rbm"<<std::endl;

  parse_command_line_args();

  /* Preprocess data if needed, or discover preprocess files */
  int nshards = 0;
  if (tokens_per_row == 3)
    nshards = convert_matrixmarket<edge_data>(training, 0, 0, 3, TRAINING, false);
  else if (tokens_per_row == 4)
    nshards = convert_matrixmarket4<edge_data4>(training);
  else logstream(LOG_FATAL)<<"--tokens_per_row should be either 3 or 4" << std::endl;

  assert(M > 0 && N > 0);
  latent_factors_inmem.resize(M+N); // Initialize in-memory vertices.

  //initialize data structure to hold the matrix read from file
  if (algo == RBM){
#pragma omp parallel for
    for (uint i=0; i< M+N; i++){
      if (i < M){
        latent_factors_inmem[i].pvec = zeros(D*3);
      }
      else {  
        latent_factors_inmem[i].pvec = zeros(rbm_bins + rbm_bins * D);
      }
    } 
  }
 
  read_factors(training);
  if ((uint)num_ratings > N){
    logstream(LOG_WARNING)<<"num_ratings is too big - setting it to: " << N << std::endl;
    num_ratings = N;
  }
  srand(time(NULL));

  /* Run */
  if (tokens_per_row == 3){
    RatingVerticesInMemProgram<VertexDataType, EdgeDataType> program;
    graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
    set_engine_flags(engine);
    engine.run(program, 1);
  } 
  else if (tokens_per_row == 4){
    RatingVerticesInMemProgram<VertexDataType, edge_data4> program;
    graphchi_engine<VertexDataType, edge_data4> engine(training, nshards, false, m); 
    set_engine_flags(engine);
    engine.run(program, 1);
  }
  /* Output latent factor matrices in matrix-market format */
  output_knn_result(training);

  rating_stats();

  if (users_without_ratings > 0)
    logstream(LOG_WARNING)<<"Found " << users_without_ratings << " without ratings. For those users no items are recommended (item id 0)" << std::endl;

  if (users_no_ratings > 0)
    logstream(LOG_WARNING)<<"Failed to compute ratings for " << users_no_ratings << " Users. For those users no items are recommended (item id 0)" << std::endl;


  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  return 0;
}