int main(int argc, const char ** argv) {
    print_copyright();

    /* GraphChi initialization will read the command line
       arguments and the configuration file. */
    graphchi_init(argc, argv);

    /* Metrics object for keeping track of performance counters
       and other information. Currently required. */
    metrics m("itemsim2rating2");

    /* Basic arguments for application */
    min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);
    debug                    = get_option_int("debug", 0);
    parse_command_line_args();
    std::string similarity   = get_option_string("similarity", "");
    if (similarity == "") {
        Rcpp::Rcerr<<"Missing similarity input file. Please specify one using the --similarity=filename command line flag" << std::endl;
        return 1;
    }
    undirected               = get_option_int("undirected", 1);
    Q                        = get_option_float("Q", Q);
    K                        = get_option_int("K");

    mytimer.start();
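    // item degrees are not needed by this program, so a throwaway vector is passed to the converter below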
    vec unused;
    int nshards          = convert_matrixmarket_and_item_similarity<edge_data>(training, similarity, 3, unused);

    assert(M > 0 && N > 0);

    //initialize data structure which saves a subset of the items (pivots) in memory
    adjcontainer = new adjlist_container();

    //array for marking which items are connected to the pivot items via users.
    relevant_items = new bool[N];

    /* Run */
    ItemDistanceProgram program;
    graphchi_engine<VertexDataType, edge_data> engine(training, nshards, true, m);
    set_engine_flags(engine);

    out_file = open_file((training + "-rec").c_str(), "w");

    //run the program
    engine.run(program, niters);

    /* Report execution metrics */
    if (!quiet)
        metrics_report(m);

    Rcpp::Rcout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << written_pairs << std::endl;

    if (zero_edges)
        Rcpp::Rcout<<"Found: " << zero_edges<< " user edges with weight zero. Those are ignored." <<std::endl;

    delete[] relevant_items;
    fclose(out_file);
    return 0;
}
    static VARIABLE_IS_NOT_USED void metrics_report(metrics &m) {
        std::string reporters = get_option_string("metrics.reporter", "console");
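        // "metrics.reporter" may name several reporters separated by commas, e.g. "console,file"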
        char * creps = strdup(reporters.c_str()); // strtok modifies its argument, so work on a private copy
        const char * delims = ",";
        char * t = strtok(creps, delims);

        while(t != NULL) {            
            std::string repname(t);
            if (repname == "basic" || repname == "console") {
                basic_reporter rep;
                m.report(rep);
            } else if (repname == "file") {
                file_reporter rep(get_option_string("metrics.reporter.filename", "metrics.txt"));
                m.report(rep);
            } else if (repname == "html") {
                html_reporter rep(get_option_string("metrics.reporter.htmlfile", "metrics.html"));
                m.report(rep);
            } else {
                logstream(LOG_WARNING) << "Could not find metrics reporter with name [" << repname << "], ignoring." << std::endl;
            }
            t = strtok(NULL, delims);
        }
        free(creps);
    }
int main(int argc, const char ** argv) {

  print_copyright();

  /* GraphChi initialization will read the command line arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("sgd-inmemory-factors");

  algorithm     = get_option_string("algorithm", "global_mean");
  if (algorithm == "global_mean")
    algo = GLOBAL_MEAN;
  else if (algorithm == "user_mean")
    algo = USER_MEAN;
  else if (algorithm == "item_mean")
    algo = ITEM_MEAN;
  else logstream(LOG_FATAL)<<"Unsupported algorithm name. Should be --algorithm=XX where XX is one of [global_mean,user_mean,item_mean] for example --algorithm=global_mean" << std::endl;


  parse_command_line_args();
  mytimer.start();

  /* Preprocess data if needed, or discover preprocess files */
  int nshards = convert_matrixmarket<float>(training, NULL, 0, 0, 3, TRAINING, false);
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, false);
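  // one RMSE accumulator per OpenMP thread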
  rmse_vec = zeros(number_of_omp_threads());
  print_config();

  /* Run */
  BaselineVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine); 
  pengine = &engine;
  engine.run(program, 1);

  if (algo == USER_MEAN || algo == ITEM_MEAN)
    output_baseline_result(training);
  test_predictions(&baseline_predict);    

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  return 0;
}
Example #4
int convert_matrixmarket(std::string base_filename, size_t nodes = 0, size_t edges = 0, int tokens_per_row = 3, int type = TRAINING, int allow_square = true) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  FILE *f;
  size_t nz;

  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {
    if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) {
      logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
      read_global_mean(base_filename, type);
      return nshards;
    }
  }

  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();

  detect_matrix_size(base_filename, f, type == TRAINING?M:Me, type == TRAINING?N:Ne, nz, nodes, edges, type);
  if (f == NULL){
    if (type == TRAINING){
      logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl;
    }
    else if (type == VALIDATION){
      logstream(LOG_INFO)<<"Validation file: "  << base_filename << " is not found. " << std::endl;
      return -1;
    }
  }

  compute_matrix_size(nz, type);   
  uint I, J;
  double val = 1.0;
  bool active_edge = true;
  int zero_entries = 0;

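  // each data line holds either "row col value" (tokens_per_row == 3) or "row col" (tokens_per_row == 2);
  // indices are 1-based in the file and shifted by input_file_offset below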
  for (size_t i=0; i<nz; i++)
    {
      if (tokens_per_row == 3){
        int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val);
        if (rc != 3)
          logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
        if (val == 0 && ! allow_zeros)
          logstream(LOG_FATAL)<<"Encountered zero edge [ " << I << " " <<J << " 0] in line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." << std::endl;
        else if (val == 0){
           zero_entries++;
           continue;
        }
      }
      else if (tokens_per_row == 2){
        int rc = fscanf(f, "%u %u\n", &I, &J);
        if (rc != 2)
          logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
      }
      else assert(false);

      if (I ==987654321 || J== 987654321) //hack - to be removed later
        continue;
      I-=(uint)input_file_offset;  /* adjust from 1-based to 0-based */
      J-=(uint)input_file_offset;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I+1 << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J+1 << " > " << N << " in line: " << i << std::endl;
      if (minval != -1e100 && val < minval)
        logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where min value is: " << minval << std::endl;
      if (maxval != 1e100 && val > maxval)
        logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where max value is: " << maxval << std::endl;

      active_edge = decide_if_edge_is_active(i, type);

      if (active_edge){
        if (type == TRAINING)
          globalMean += val;
        else globalMean2 += val;
        sharderobj.preprocessing_add_edge(I, (M==N && allow_square)?J:M + J, als_edge_type((float)val));
      } 
    }

    if (type == TRAINING){
      uint toadd = 0;
      if (implicitratingtype == IMPLICIT_RATING_RANDOM)
        toadd = add_implicit_edges(implicitratingtype, sharderobj);
      globalMean += implicitratingvalue * toadd;
      L += toadd;
      globalMean /= L;
      logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl;
    }
    else {
      globalMean2 /= Le;
      logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " Now creating shards." << std::endl;
    }
    write_global_mean(base_filename, type);
    sharderobj.end_preprocessing();

  if (zero_entries)
     logstream(LOG_WARNING)<<"Found " << zero_entries << " zero edges!" << std::endl; 
  fclose(f);


  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));
  logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename<< std::endl;
  logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl;

  return nshards;
}
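
For reference, the three-token rows parsed above follow the Matrix Market coordinate format (1-based indices); a minimal illustrative training file, with made-up values, could look like:

%%MatrixMarket matrix coordinate real general
3 4 5
1 1 2.5
1 3 1.0
2 2 4.0
3 1 3.5
3 4 2.0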
Example #5
int convert_matrixmarket_and_item_similarity(std::string base_filename, std::string similarity_file, int tokens_per_row, vec & degrees) {
  FILE *f = NULL, *fsim = NULL;
  size_t nz, nz_sim;
  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {
    if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) {
      logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
      read_global_mean(base_filename, TRAINING);
      return nshards;
    }
  }

  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();

  detect_matrix_size(base_filename, f, M, N, nz);
  if (f == NULL)
    logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl;
  uint N_row = 0, N_col = 0;
  detect_matrix_size(similarity_file, fsim, N_row, N_col, nz_sim);
  if (fsim == NULL || nz_sim == 0)
    logstream(LOG_FATAL)<<"Failed to open item similarity input file: " << similarity_file << std::endl;
  if (N_row != N || N_col != N)
    logstream(LOG_FATAL)<<"Wrong item similarity file matrix size: " << N_row <<" x " << N_col << "  Instead of " << N << " x " << N << std::endl;
  L=nz + nz_sim;

  degrees.resize(M+N);

  uint I, J;
  double val = 1.0;
  int zero_entries = 0;
  unsigned int actual_edges = 0;
    logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: "
      << M << " x " << N << ", non-zeros: " << nz << std::endl;

    for (size_t i=0; i<nz; i++){
      if (tokens_per_row == 3){
        int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val);
        if (rc != 3)
          logstream(LOG_FATAL)<<"Error when reading input file in line: " << i << std::endl;
        if (val == 0 && ! allow_zeros)
          logstream(LOG_FATAL)<<"Zero weight encountered at input file line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." << std::endl;
        else if (val == 0) { zero_entries++; continue; }
      }
      else if (tokens_per_row == 2){
        int rc = fscanf(f, "%u %u\n", &I, &J);
        if (rc != 2)
          logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
      }
      else assert(false);

      I-=input_file_offset;  /* adjust from 1-based to 0-based */
      J-=input_file_offset;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line: " << i << std::endl;
      degrees[J+M]++;
      degrees[I]++;
      if (I< (uint)start_user || I >= (uint)end_user){
         continue;
      }
      sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val, 0));
      //std::cout<<"adding an edge: " <<I << " -> " << M+J << std::endl;
      actual_edges++;
    }

    logstream(LOG_DEBUG)<<"Finished loading " << actual_edges << " ratings from file: " << base_filename << std::endl;

    for (size_t i=0; i<nz_sim; i++){
      if (tokens_per_row == 3){
        int rc = fscanf(fsim, "%u %u %lg\n", &I, &J, &val);
        if (rc != 3)
          logstream(LOG_FATAL)<<"Error when reading input file: " << similarity_file << " line: " << i << std::endl;
      }
      else if (tokens_per_row == 2){
        int rc = fscanf(fsim, "%u %u\n", &I, &J);
        if (rc != 2)
          logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
      }
      else assert(false);

      I-=input_file_offset;  /* adjust from 1-based to 0-based */
      J-=input_file_offset;
      if (I >= N)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << N << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line: " << i << std::endl;
      if (I == J)
        logstream(LOG_FATAL)<<"Item similarity to itself found for item " << I << " in line: " << i << std::endl;
      //std::cout<<"Adding an edge between "<<M+I<< " : " << M+J << "  " << (I<J)  << " " << val << std::endl; 
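      // the similarity weight is stored in the first edge field when I < J and in the second field when I > J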
      sharderobj.preprocessing_add_edge(M+I, M+J, als_edge_type(I < J? val: 0, I>J? val: 0));
      actual_edges++;
    }

    L = actual_edges;
    logstream(LOG_DEBUG)<<"Finished loading " << nz_sim << " similarity entries from file: " << similarity_file << std::endl;
    write_global_mean(base_filename, TRAINING);
    sharderobj.end_preprocessing();

    if (zero_entries)
      logstream(LOG_WARNING)<<"Found " << zero_entries << " edges with zero weight!" << std::endl;
    
  fclose(f);
  fclose(fsim);


  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));
  logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename << std::endl;
  logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl;

  return nshards;
}
Example #6
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false, int type = TRAINING, int matlab_time_offset = 1) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  FILE *f = NULL;
  size_t nz;
  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {

    if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) {
      logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
      read_global_mean(base_filename, type);
    }
    if (type == TRAINING)
      time_nodes_offset = M+N;
    return nshards;
  }

  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();


  detect_matrix_size(base_filename, f, type == TRAINING? M:Me, type == TRAINING? N:Ne, nz);
  if (f == NULL){
    if (type == VALIDATION){
      logstream(LOG_INFO)<< "Did not find validation file: " << base_filename << std::endl;
      return -1;
    }
    else if (type == TRAINING)
      logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl;
  }

  if (type == TRAINING)
     time_nodes_offset = M+N;
 
  compute_matrix_size(nz, type);

  uint I, J;
  double val, time;
  bool active_edge = true;

    for (size_t i=0; i<nz; i++)
    {
      int rc = fscanf(f, "%u %u %lg %lg\n", &I, &J, &time, &val);
      if (rc != 4)
        logstream(LOG_FATAL)<<"Error when reading input file - line " << i << std::endl;
      if (time < 0)
        logstream(LOG_FATAL)<<"Time (third column) should be >= 0 " << std::endl;
      I-=input_file_offset;  /* adjust from 1-based to 0-based */
      J-=input_file_offset;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line: " << i << std::endl;
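      // K tracks the largest time bin seen so far, i.e. the number of time bins in the tensor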
      K = std::max((int)time, (int)K);
      time -= matlab_time_offset;
      if (time < 0 && add_time_edges)
        logstream(LOG_FATAL)<<"Time bins should be >= " << matlab_time_offset << " in row " << i << std::endl;

      //only for tensor ALS do we add edges between user and time bin and between item and time bin
      //time bins are numbered between M+N and M+N+K
      if (!weighted_als)
         time += time_nodes_offset;

      //avoid self edges
      if (square && I == J)
        continue;

      active_edge = decide_if_edge_is_active(i, type);

      if (active_edge){
        if (type == TRAINING)
          globalMean += val;
        else globalMean2 += val;
        sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time));
      }
      //in case of a tensor, in addition to the user->movie edge, also add
      //time->user and time->movie edges
      if (add_time_edges){
        sharderobj.preprocessing_add_edge((uint)time, I, als_edge_type(val, M+J));
        sharderobj.preprocessing_add_edge((uint)time, M+J , als_edge_type(val, I));
      }
    }

    if (type == TRAINING){
      uint toadd = 0;
      if (implicitratingtype == IMPLICIT_RATING_RANDOM)
        toadd = add_implicit_edges4(implicitratingtype, sharderobj);
      globalMean += implicitratingvalue * toadd;
      L += toadd;
      globalMean /= L;
      logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl;
    }
    else {
      globalMean2 /= Le;
      logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " time bins: " << K << " . Now creating shards." << std::endl;
    }
    write_global_mean(base_filename, type);

    sharderobj.end_preprocessing();

 
  fclose(f);
  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));

  return nshards;
}
Example #7
void parse_parser_command_line_arges(){
  fc.string_features = get_option_string("features", fc.default_feature_str);
  csv = get_option_int("csv", 0);
  if (csv) 
    ptokens = csv_tokens;
  
  file_columns = get_option_int("file_columns", file_columns); //get the number of columns in the edge file
  //input sanity checks
  if (file_columns < 3)
    logstream(LOG_FATAL)<<"You must have at least 3 columns in input file: [from] [to] [value] on each line"<<std::endl;
  if (file_columns >= FEATURE_WIDTH)
    logstream(LOG_FATAL)<<"file_columns exceeds the allowed storage limit - please increase FEATURE_WIDTH and recompile." << std::endl;
  fc.from_pos = get_option_int("from_pos", fc.from_pos);
  fc.to_pos = get_option_int("to_pos", fc.to_pos);
  fc.val_pos = get_option_int("val_pos", fc.val_pos);
  if (fc.from_pos >= file_columns || fc.to_pos >= file_columns || fc.val_pos >= file_columns)
    logstream(LOG_FATAL)<<"Please note that column numbering of from_pos, to_pos and val_pos starts from zero and should be smaller than file_columns" << std::endl;
  if (fc.from_pos == fc.to_pos || fc.from_pos == fc.val_pos || fc.to_pos == fc.val_pos)
    logstream(LOG_FATAL)<<"from_pos, to_pos and val_pos should have different values" << std::endl; 
  if (fc.val_pos == -1)
    logstream(LOG_FATAL)<<"you must specify a target column using --val_pos=XXX. Column index starts from 0." << std::endl;
  has_header_titles = get_option_int("has_header_titles", has_header_titles);
  limit_rating= get_option_int("limit_rating", 0); 
  //parse features (optional)
  if (fc.string_features != ""){
    char * pfeatures = strdup(fc.string_features.c_str());
    char * pch = strtok(pfeatures, ptokens);
    int node = atoi(pch);
    if (node < 0 || node >= MAX_FEATURES+3)
      logstream(LOG_FATAL)<<"Feature id using the --features=XX command should be non negative, starting from zero"<<std::endl;
    if (node >= file_columns)
      logstream(LOG_FATAL)<<"Feature id using the --feature=XX command should be < file_columns (counting starts from zero)" << std::endl;
    if (node == fc.from_pos || node == fc.to_pos || node == fc.val_pos)
      logstream(LOG_FATAL)<<"Feature id " << node << " can not be equal to --from_pos, --to_pos or --val_pos " << std::endl;
    fc.feature_selection[node] = true;
    fc.total_features++;
    while ((pch = strtok(NULL, ptokens))!= NULL){
      node = atoi(pch);
      if (node < 0 || node >= MAX_FEATURES+3)
        logstream(LOG_FATAL)<<"Feature id using the --features=XX command should be non negative, starting from zero"<<std::endl;
      fc.feature_selection[node] = true;
      fc.total_features++;
    }
  }
  train_only = get_option_int("train_only", 0);
  validation_only = get_option_int("validation_only", 0);
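  // one string-to-id map is kept for the [from] and [to] columns plus one per selected feature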
  fc.node_id_maps.resize(2+fc.total_features);
  real_features_string = get_option_string("real_features", real_features_string);
   //parse real features (optional)
  if (real_features_string != ""){
    int i=0;
    char * pfeatures = strdup(real_features_string.c_str());
    char * pch = strtok(pfeatures, ptokens);
    int node = atoi(pch);
    if (node < 0 || node >= MAX_FEATURES+3)
      logstream(LOG_FATAL)<<"Feature id using the --real_features=XX command should be non negative, starting from zero"<<std::endl;
    if (node >= file_columns)
      logstream(LOG_FATAL)<<"Feature id using the --real_feature=XX command should be < file_columns (counting starts from zero)" << std::endl;
    if (node == fc.from_pos || node == fc.to_pos || node == fc.val_pos)
      logstream(LOG_FATAL)<<"Feature id " << node << " can not be equal to --from_pos, --to_pos or --val_pos " << std::endl;
    fc.real_features_indicators[node] = true;
    fc.feature_positions[node] = i;
    i++;
    while ((pch = strtok(NULL, ptokens))!= NULL){
      node = atoi(pch);
      if (node < 0 || node >= MAX_FEATURES+3)
        logstream(LOG_FATAL)<<"Feature id using the --real_features=XX command should be non negative, starting from zero"<<std::endl;
      fc.real_features_indicators[node] = true;
      fc.feature_positions[node] = i;
      i++;
    }
  }

 
}
Example #8
int convert_matrixmarket_N(std::string base_filename, bool square, int limit_rating = 0) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  FILE *f;
  size_t nz;

  int nshards;
  if (validation_only && (nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {
    if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) {
      logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
      FILE * infile = fopen((base_filename + ".gm").c_str(), "r");
      int node_id_maps_size = 0;
      assert( fscanf(infile, "%d\n%d\n%ld\n%d\n%lf\n%d\n%d\n%d\n", &M, &N, &L, &fc.total_features, &globalMean, &node_id_maps_size, &latent_factors_inmem_size,&num_feature_bins_size) ==8);
      assert(node_id_maps_size >= 0);
      assert(latent_factors_inmem_size >=M+N);
      fclose(infile);
      fc.node_id_maps.resize(node_id_maps_size);
      for (int i=0; i < (int)fc.node_id_maps.size(); i++){
        char buf[256];
        sprintf(buf, "%s.map.%d", training.c_str(), i);
        load_map_from_txt_file(fc.node_id_maps[i].string2nodeid, buf, 2);
        assert(fc.node_id_maps[i].string2nodeid.size() > 0);
      }
      logstream(LOG_INFO)<<"Finished loading " << node_id_maps_size << " maps. "<<std::endl;
      return nshards;
    }
  }

  /**
   * Create sharder object
   */
  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();

  detect_matrix_size(base_filename, f, M, N, nz, 0, 0, 0);
  if (f == NULL)
    logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl;
  if (M == 0 && N == 0)
    logstream(LOG_FATAL)<<"Failed to detect matrix size. Please prepare a file named: " << base_filename << ":info with matrix market header, as explained here: http://bickson.blogspot.co.il/2012/12/collaborative-filtering-3rd-generation_14.html " << std::endl;

  logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl;


  if (has_header_titles){
    char * linebuf = NULL;
    size_t linesize;
    char linebuf_debug[1024];

    /* READ LINE */
    int rc = getline(&linebuf, &linesize, f);
    if (rc == -1)
      logstream(LOG_FATAL)<<"Error reading header line from file: " << base_filename << std::endl;

    strncpy(linebuf_debug, linebuf, 1024);
    char *pch = strtok(linebuf,ptokens);
    if (pch == NULL)
      logstream(LOG_FATAL)<<"Error header line " << " [ " << linebuf_debug << " ] " << std::endl;

    header_titles.push_back(std::string(pch));

    while (pch != NULL){
      pch = strtok(NULL, ptokens);
      if (pch == NULL)
        break;
      header_titles.push_back(pch);
    }
  }

  compute_matrix_size(nz, TRAINING);
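  // valarray holds the selected feature columns parsed from each input line, while val holds the rating (target) value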
  uint I, J;
  int val_array_len = std::max(1, fc.total_features);
  assert(val_array_len < FEATURE_WIDTH);
  std::vector<float> valarray; valarray.resize(val_array_len);
  float val = 0.0f;

  if (limit_rating > 0 && limit_rating < (int)nz)
    nz = limit_rating;

  char linebuf_debug[1024];
  for (size_t i=0; i<nz; i++)
  {

    if (!read_line(f, base_filename, i,I, J, val, valarray, TRAINING, linebuf_debug))
      logstream(LOG_FATAL)<<"Failed to read line: " <<i<< " in file: " << base_filename << std::endl;

    if (I>= M || J >= N || I < 0 || J < 0){
      if (i == 0)
        logstream(LOG_FATAL)<<"Failed to parse first line, there are too many tokens. Did you forget the --has_header_titles=1 flag when file has string column headers? [ " << linebuf_debug << " ] " << " I : " << I << " J: " << J << std::endl;
      else 
        logstream(LOG_FATAL)<<"Problem parsing input line number: " << i <<" in file: " << base_filename << ".  Can not add edge from " << I << " to  J " << J << 
                            " since matrix size is: " << M <<"x" <<N<< " [ original line: " << linebuf_debug << " ] . You probably need to increase the matrix size in the matrix market header." << std::endl;
    }

    bool active_edge = decide_if_edge_is_active(i, TRAINING);

    if (active_edge){
      //calc stats
      globalMean += val;
      sharderobj.preprocessing_add_edge(I, square?J:M+J, als_edge_type(val, &valarray[0], val_array_len));
    }
  }

  sharderobj.end_preprocessing();

  //calc stats
  assert(L > 0);
  //assert(globalMean != 0);
  if (globalMean == 0)
    logstream(LOG_WARNING)<<"Found global mean of the data to be zero (val_pos). Please verify this is correct." << std::endl;
  globalMean /= L;
  logstream(LOG_INFO)<<"Computed global mean is: " << globalMean << std::endl;
  inputGlobalMean = globalMean;

  fclose(f);

  if (fc.hash_strings){
    for (int i=0; i< fc.total_features+2; i++){
      if (fc.node_id_maps[i].string2nodeid.size() == 0)
        logstream(LOG_FATAL)<<"Failed sanity check for feature number: " << i << ", no values found in the data " << std::endl;
    }
  }

  logstream(LOG_INFO) << "Now creating shards." << std::endl;
  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));

  return nshards;
}
int main(int argc, const char ** argv) {
  print_copyright();

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("itemsim2rating2");    

  /* Basic arguments for application */
  min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection);
  debug                    = get_option_int("debug", 0);
  parse_command_line_args();
  std::string similarity   = get_option_string("similarity", "");
  if (similarity == "")
    logstream(LOG_FATAL)<<"Missing similarity input file. Please specify one using the --similarity=filename command line flag" << std::endl;
  undirected               = get_option_int("undirected", 0);
  
  mytimer.start();

  int nshards          = convert_matrixmarket_and_item_similarity<edge_data>(training, similarity, 3, &degrees);

  assert(M > 0 && N > 0);
  prob_sim_normalization_constant = (double)L / (double)(M*N-L);
  
  //initialize data structure which saves a subset of the items (pivots) in memory
  adjcontainer = new adjlist_container();

  //array for marking which items are connected to the pivot items via users.
  relevant_items = new bool[N];

  /* Run */
  ItemDistanceProgram program;
  graphchi_engine<VertexDataType, edge_data> engine(training, nshards, true, m); 
  set_engine_flags(engine);

  //open one output file per OpenMP thread
  out_files.resize(number_of_omp_threads());
  for (uint i=0; i< out_files.size(); i++){
    char buf[256];
    sprintf(buf, "%s-rec.out%d", training.c_str(), i);
    out_files[i] = open_file(buf, "w");
  }


  K                        = get_option_int("K");
  assert(K > 0);
  //run the program
  engine.run(program, niters);

  for (uint i=0; i< out_files.size(); i++)
    fclose(out_files[i]);
  
  delete[] relevant_items;


  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);

  std::cout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << sum(written_pairs) << std::endl;

  logstream(LOG_INFO)<<"Going to sort and merge output files " << std::endl;
  std::string dname= dirname(strdup(argv[0]));
  system(("bash " + dname + "/topk.sh " + std::string(basename(strdup((training+"-rec").c_str())))).c_str()); 


  return 0;
}
Example #10
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  int ret_code;
  MM_typecode matcode;
  FILE *f;
  size_t nz;
  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {
    logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
    FILE * inf = fopen((base_filename + ".gm").c_str(), "r");
    int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K);
    if (rc != 5)
      logstream(LOG_FATAL)<<"Failed to read global mean from file " << base_filename << ".gm" << std::endl;
    fclose(inf);
    if (K <= 0)
      logstream(LOG_FATAL)<<"Incorrect number of time bins K in .gm file " << base_filename << ".gm" << std::endl;

    logstream(LOG_INFO) << "Read matrix of size " << M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl;
    return nshards;
  }   

  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();


  if ((f = fopen(base_filename.c_str(), "r")) == NULL) {
    logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl;
  }


  if (mm_read_banner(f, &matcode) != 0)
    logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl;


  /*  This is how one can screen matrix types if their application */
  /*  only supports a subset of the Matrix Market data types.      */

  if (mm_is_complex(matcode) || !mm_is_sparse(matcode))
    logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl;

  /* find out size of sparse matrix .... */

  if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) {
    logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl;
  }

  logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " 
    << M << " x " << N << ", non-zeros: " << nz << std::endl;

  uint I, J;
  double val, time;

  if (!sharderobj.preprocessed_file_exists()) {
    for (size_t i=0; i<nz; i++)
    {
      int rc = fscanf(f, "%u %u %lg %lg\n", &I, &J, &time, &val);
      if (rc != 4)
        logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
      if (time < 0)
        logstream(LOG_FATAL)<<"Time (third column) should be >= 0 " << std::endl;
      I--;  /* adjust from 1-based to 0-based */
      J--;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line: " << i << std::endl;
      K = std::max((int)time, (int)K);
      //avoid self edges
      if (square && I == J)
        continue;
      globalMean += val; 
      L++;
      sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time+M+N));
      //in case of a tensor, in addition to the user->movie edge, also add
      //time->user and time->movie edges
      if (add_time_edges){
        sharderobj.preprocessing_add_edge((uint)time + M + N, I, als_edge_type(val, M+J));
        sharderobj.preprocessing_add_edge((uint)time + M + N, M+J , als_edge_type(val, I));
      }
    }

    uint toadd = 0;
    if (implicitratingtype == IMPLICIT_RATING_RANDOM)
      toadd = add_implicit_edges4(implicitratingtype, sharderobj);
    globalMean += implicitratingvalue * toadd;
    L += toadd;

    sharderobj.end_preprocessing();
    globalMean /= L;
    logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl;
    FILE * outf = fopen((base_filename + ".gm").c_str(), "w");
    fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K);
    fclose(outf);


  } else {
    logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl;
  }

  fclose(f);
  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));

  return nshards;
}
Example #11
int convert_matrixmarket(std::string base_filename, SharderPreprocessor<als_edge_type> * preprocessor = NULL) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  int ret_code;
  MM_typecode matcode;
  FILE *f;
  size_t nz;   

  std::string suffix = "";
  if (preprocessor != NULL) {
    suffix = preprocessor->getSuffix();
  }

  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename+ suffix, get_option_string("nshards", "auto")))) {
    logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
    FILE * inf = fopen((base_filename + ".gm").c_str(), "r");
    int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K);
    if (rc != 5)
      logstream(LOG_FATAL)<<"Failed to read global mean from file " << base_filename + suffix << ".gm" << std::endl;
    fclose(inf);
    logstream(LOG_INFO) << "Opened matrix size: " <<M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl;
    return nshards;
  }   

  sharder<als_edge_type> sharderobj(base_filename + suffix);
  sharderobj.start_preprocessing();


  if ((f = fopen(base_filename.c_str(), "r")) == NULL) {
    logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl;
  }


  if (mm_read_banner(f, &matcode) != 0)
    logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl;


  /*  This is how one can screen matrix types if their application */
  /*  only supports a subset of the Matrix Market data types.      */

  if (mm_is_complex(matcode) || !mm_is_sparse(matcode))
    logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl;

  /* find out size of sparse matrix .... */

  if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) {
    logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl;
  }

  L=nz;

  logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " 
    << M << " x " << N << ", non-zeros: " << nz << std::endl;

  uint I, J;
  double val;
  if (!sharderobj.preprocessed_file_exists()) {
    for (size_t i=0; i<nz; i++)
    {

      int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val);
      if (rc != 3)
        logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
      I--;  /* adjust from 1-based to 0-based */
      J--;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line: " << i << std::endl;
      globalMean += val; 
      sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val));
    }
    uint toadd = 0;
    if (implicitratingtype == IMPLICIT_RATING_RANDOM)
      toadd = add_implicit_edges(implicitratingtype, sharderobj);
    globalMean += implicitratingvalue * toadd;
    L += toadd;

    sharderobj.end_preprocessing();
    globalMean /= L;
    logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl;

    if (preprocessor != NULL) {
      preprocessor->reprocess(sharderobj.preprocessed_name(), base_filename);
    }

    FILE * outf = fopen((base_filename + ".gm").c_str(), "w");
    fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K);
    fclose(outf);


  } else {
    logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl;
  }
  fclose(f);


  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));
  logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename + suffix << std::endl;
  logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl;

  return nshards;
}
Example #12
 void run_functional_unweighted_semisynchronous(std::string filename, int niters, metrics &_m) {
     FunctionalProgramProxySemisync<KERNEL> program;
      
     /* Process input file - if not already preprocessed */
     int nshards           
         = convert_if_notexists<typename FunctionalProgramProxySemisync<KERNEL>::EdgeDataType>(filename, get_option_string("nshards", "auto"));
     functional_engine<typename FunctionalProgramProxySemisync<KERNEL>::VertexDataType, 
         typename FunctionalProgramProxySemisync<KERNEL>::EdgeDataType,
         typename FunctionalProgramProxySemisync<KERNEL>::fvertex_t > 
             engine(filename, nshards, false, _m);
     
     
     engine.set_modifies_inedges(false); // Important
     engine.set_modifies_outedges(true); // Important
     engine.run(program, niters);
 }
Example #13
 void run_functional_unweighted_synchronous(std::string filename, int niters, metrics &_m) {
     FunctionalProgramProxyBulkSync<KERNEL> program;
     int nshards           
         = convert_if_notexists<typename FunctionalProgramProxyBulkSync<KERNEL>::EdgeDataType>(filename, get_option_string("nshards", "auto"));
     functional_engine<typename FunctionalProgramProxyBulkSync<KERNEL>::VertexDataType,
         typename FunctionalProgramProxyBulkSync<KERNEL>::EdgeDataType,
         typename FunctionalProgramProxyBulkSync<KERNEL>::fvertex_t > 
             engine(filename, nshards, false, _m);
     engine.set_modifies_inedges(false); // Important
     engine.set_modifies_outedges(true); // Important
     engine.set_enable_deterministic_parallelism(false); // Bulk synchronous does not need consistency.
     engine.run(program, niters);
 }
Example #14
int main(int argc, const char ** argv) {

  print_copyright();
 
  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("label_propagation");

  contexts_file = get_option_string("contexts");
  nouns_file = get_option_string("nouns"); 
  pos_seeds = get_option_string("pos_seeds");
  neg_seeds = get_option_string("neg_seeds");
  parse_command_line_args();

  load_map_from_txt_file(contexts.string2nodeid, contexts_file, 1);
  load_map_from_txt_file(nouns.string2nodeid, nouns_file, 1);
    //load graph (adj matrix) from file
  int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, true);

  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem);

  load_seeds_from_txt_file(nouns.string2nodeid, pos_seeds, false);
  load_seeds_from_txt_file(nouns.string2nodeid, neg_seeds, true); 
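  // seed nouns keep their given label probabilities (normalized below); the boolean argument presumably flags the negative seed set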

#pragma omp parallel for
  for (int i=0; i< (int)M; i++){

    //normalize seed probabilities to sum up to one
    if (latent_factors_inmem[i].seed){
      if (sum(latent_factors_inmem[i].pvec) != 0)
        latent_factors_inmem[i].pvec /= sum(latent_factors_inmem[i].pvec);
      continue;
    }
    //other nodes get random label probabilities
    for (int j=0; j< D; j++)
       latent_factors_inmem[i].pvec[j] = drand48();
  }

  /* load initial state from disk (optional) */
  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
  }

  /* Run */
  COEMVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, niters);

  /* Output latent factor matrices in matrix-market format */
  output_coem_result(training);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  
  return 0;
}
Example #15
/* ---------------------------------------------------------------------- */
int
spread_row_to_solution (struct spread_row *heading, struct spread_row *units,
			struct spread_row *data, struct defaults defaults)
/* ---------------------------------------------------------------------- */
{
  int i, j, n, l, next_keyword_save;
  int n_user, n_user_end;
  int default_pe, alk;
  int count_isotopes;
  int max_mass_balance, count_mass_balance;
  char *ptr, *ptr1;
  char *description;
  char token[MAX_LENGTH], token1[MAX_LENGTH];
  char string[2 * MAX_LENGTH];
  LDBLE dummy;

  int return_value, opt;
  char *next_char;
  const char *opt_list[] = {
    "temp",			/* 0 */
    "temperature",		/* 1 */
    "dens",			/* 2 */
    "density",			/* 3 */
    "units",			/* 4 */
    "redox",			/* 5 */
    "ph",			/* 6 */
    "pe",			/* 7 */
    "unit",			/* 8 */
    "isotope",			/* 9 */
    "water",			/* 10 */
    "description",		/* 11 */
    "desc",			/* 12 */
    "descriptor"		/* 13 */
  };
  int count_opt_list = 14;

/*
 *      look for solution number
 */
  n_user = -1;
  n_user_end = -1;
  description = string_duplicate ("");
  for (i = 0; i < heading->count; i++)
  {
    if (strcmp_nocase (heading->char_vector[i], "number") == 0)
    {
      break;
    }
  }
  if (i == heading->count || data->type_vector[i] == EMPTY
      || data->count <= i)
  {
    n_user = -1;
#ifdef SKIP
    for (i = 0; i < count_solution; i++)
    {
      if (n_user <= solution[i]->n_user)
      {
	n_user = solution[i]->n_user + 1;
      }
    }
#endif
  }
  else if (data->type_vector[i] == STRING)
  {
    input_error++;
    sprintf (error_string,
	     "Expected solution number or number range in 'number' column, found:  %s.",
	     data->char_vector[i]);
    error_msg (error_string, CONTINUE);
  }
  else
  {
    strcpy (string, "solution_s ");
    strcat (string, data->char_vector[i]);
    ptr = string;
    description = (char *) free_check_null (description);
    next_keyword_save = next_keyword;
    next_keyword = 42;
    read_number_description (ptr, &n_user, &n_user_end, &description);
    next_keyword = next_keyword_save;
  }
/*
 *   set up solution
 */

  if (n_user >= 0 && solution_bsearch (n_user, &n, FALSE) != NULL)
  {
    solution_free (solution[n]);
  }
  else
  {
    n = count_solution++;
    if (count_solution >= max_solution)
    {
      space ((void **) ((void *) &(solution)), count_solution, &max_solution,
	     sizeof (struct solution *));
    }
  }
  solution[n] = solution_alloc ();

  solution[n]->n_user = n_user;
  solution[n]->n_user_end = n_user_end;
  if (use.solution_in == FALSE)
  {
    use.solution_in = TRUE;
    use.n_solution_user = n_user;
  }
  max_mass_balance = MAX_MASS_BALANCE;
/*
 *   Set default ph, temp, density, pe, units
 */
  solution[n]->description = description;
  solution[n]->tc = defaults.temp;
  solution[n]->ph = defaults.ph;
  solution[n]->density = defaults.density;
  solution[n]->solution_pe = defaults.pe;
  solution[n]->mass_water = defaults.water;
  solution[n]->ah2o = 1.0;
  solution[n]->mu = 1e-7;
  solution[n]->cb = 0.0;
  default_pe = 0;
  solution[n]->units = defaults.units;
  solution[n]->totals[0].description = NULL;
  count_mass_balance = 0;
  count_isotopes = 0;
  default_pe = pe_data_store (&(solution[n]->pe), defaults.redox);
/*
 *   Read concentration data
 */
  return_value = UNKNOWN;
  for (i = 0; i < heading->count; i++)
  {
    if (strcmp_nocase (heading->char_vector[i], "number") == 0)
      continue;
    if (strcmp_nocase (heading->char_vector[i], "uncertainty") == 0)
      continue;
    if (strcmp_nocase (heading->char_vector[i], "uncertainties") == 0)
      continue;
    if (strcmp_nocase (heading->char_vector[i], "isotope_uncertainty") == 0)
      continue;
    /*
     *  Copy in element name
     */
    if (heading->type_vector[i] == EMPTY)
      continue;
    strcpy (string, heading->char_vector[i]);
    strcat (string, " ");
    /*
     *  Copy in concentration data
     */
    if (i >= data->count || data->type_vector[i] == EMPTY)
      continue;
    strcat (string, data->char_vector[i]);
    strcat (string, " ");
    /*
     *  Copy in units
     */
    if (units != NULL && i < units->count && units->type_vector[i] != EMPTY)
    {
      strcat (string, units->char_vector[i]);
    }
/*
 *   Parse string just like read_solution input 
 */
    next_char = string;
    opt = get_option_string (opt_list, count_opt_list, &next_char);
    if (opt == OPTION_DEFAULT && heading->type_vector[i] == NUMBER)
    {
      opt = 9;
    }
    switch (opt)
    {
    case OPTION_EOF:		/* end of file */
      return_value = EOF;
      break;
    case OPTION_KEYWORD:	/* keyword */
      return_value = KEYWORD;
      break;
    case OPTION_ERROR:
      input_error++;
      error_msg ("Unknown input in SOLUTION keyword.", CONTINUE);
      error_msg (line_save, CONTINUE);
      break;
    case 0:			/* temperature */
    case 1:
      sscanf (next_char, SCANFORMAT, &(solution[n]->tc));
      break;
    case 2:			/* density */
    case 3:
      sscanf (next_char, SCANFORMAT, &(solution[n]->density));
      break;
    case 4:			/* units */
    case 8:			/* unit */
      if (copy_token (token, &next_char, &l) == EMPTY)
	break;
      if (check_units (token, FALSE, FALSE, solution[n]->units, TRUE) == OK)
      {
	solution[n]->units = string_hsave (token);
      }
      else
      {
	input_error++;
      }
      break;
    case 5:			/* redox */
      if (copy_token (token, &next_char, &l) == EMPTY)
	break;
      if (parse_couple (token) == OK)
      {
	default_pe = pe_data_store (&(solution[n]->pe), token);
      }
      else
      {
	input_error++;
      }
      break;
    case 6:			/* ph */
      next_char = string;
      if (read_conc (n, count_mass_balance, next_char) == ERROR)
      {
	input_error++;
	break;
      }
      solution[n]->ph = solution[n]->totals[count_mass_balance].input_conc;
      if (solution[n]->totals[count_mass_balance].equation_name == NULL)
      {
	break;
      }
      solution[n]->totals[count_mass_balance].description =
	string_hsave ("H(1)");
      count_mass_balance++;
      break;
    case 7:			/* pe */
      next_char = string;
      if (read_conc (n, count_mass_balance, next_char) == ERROR)
      {
	input_error++;
	break;
      }
      solution[n]->solution_pe =
	solution[n]->totals[count_mass_balance].input_conc;
      if (solution[n]->totals[count_mass_balance].equation_name == NULL)
      {
	break;
      }
      solution[n]->totals[count_mass_balance].description =
	string_hsave ("E");
      count_mass_balance++;
      break;
    case 9:			/* isotope */
      next_char = string;
      if (copy_token (token, &next_char, &l) != DIGIT)
      {
	input_error++;
	sprintf (error_string, "Expected isotope name to"
		 " begin with an isotopic number.");
	error_msg (error_string, CONTINUE);
	continue;
      }
      solution[n]->isotopes =
	(struct isotope *) PHRQ_realloc (solution[n]->isotopes,
					 (size_t) (count_isotopes +
						   1) *
					 sizeof (struct isotope));
      if (solution[n]->isotopes == NULL)
	malloc_error ();
      /* read and save element name */
      ptr1 = token;
      get_num (&ptr1,
	       &(solution[n]->isotopes[count_isotopes].isotope_number));
      if (ptr1[0] == '\0' || isupper ((int) ptr1[0]) == FALSE)
      {
	error_msg ("Expecting element name.", CONTINUE);
	error_msg (line_save, CONTINUE);
	input_error++;
	return (ERROR);
      }
      solution[n]->isotopes[count_isotopes].elt_name = string_hsave (ptr1);

      /* read and store isotope ratio */
      if (copy_token (token, &next_char, &l) != DIGIT)
      {
	input_error++;
	sprintf (error_string, "Expected numeric value for isotope ratio.");
	error_msg (error_string, CONTINUE);
	continue;
      }
      sscanf (token, SCANFORMAT,
	      &(solution[n]->isotopes[count_isotopes].ratio));

      /* read and store isotope ratio uncertainty */
      /* first choice is next column */
      if ((i + 1) < heading->count &&
	  (strcmp_nocase (heading->char_vector[i + 1], "uncertainty") == 0 ||
	   strcmp_nocase (heading->char_vector[i + 1],
			  "isotope_uncertainty") == 0
	   || strcmp_nocase (heading->char_vector[i + 1],
			     "uncertainties") == 0) && (i + 1) < data->count
	  && data->type_vector[i + 1] == NUMBER)
      {
	solution[n]->isotopes[count_isotopes].ratio_uncertainty =
	  data->d_vector[i + 1];
      }
      else
      {
	next_char = string;
	copy_token (token, &next_char, &l);
	for (j = 0; j < defaults.count_iso; j++)
	{
	  if (strcmp (token, defaults.iso[j].name) == 0)
	  {
	    solution[n]->isotopes[count_isotopes].ratio_uncertainty =
	      defaults.iso[j].uncertainty;
	    break;
	  }
	}
	if (j == defaults.count_iso)
	{
	  solution[n]->isotopes[count_isotopes].ratio_uncertainty = NAN;
	}
      }
      count_isotopes++;
      break;
    case 10:			/* water */
      j = copy_token (token, &next_char, &l);
      if (j == EMPTY)
      {
	solution[n]->mass_water = 1.0;
      }
      else if (j != DIGIT)
      {
	input_error++;
	sprintf (error_string,
		 "Expected numeric value for mass of water in solution.");
	error_msg (error_string, CONTINUE);
      }
      else
      {
	sscanf (token, SCANFORMAT, &dummy);
	solution[n]->mass_water = (LDBLE) dummy;
      }
      break;
    case 11:			/* description */
    case 12:			/* desc */
    case 13:			/* descriptor */
      solution[n]->description =
	(char *) free_check_null (solution[n]->description);
      solution[n]->description = string_duplicate (next_char);
      break;
    case OPTION_DEFAULT:
/*
 *   Read concentration
 */
      next_char = string;
      if (copy_token (token, &next_char, &l) == LOWER)
	continue;
      next_char = string;
      if (read_conc (n, count_mass_balance, next_char) == ERROR)
      {
#ifdef SKIP
	input_error++;
	break;
#endif
      }
      count_mass_balance++;
      break;
    }
    if (count_mass_balance + 1 >= max_mass_balance)
    {
      space ((void **) ((void *) &(solution[n]->totals)), count_mass_balance + 1,
	     &max_mass_balance, sizeof (struct conc));
    }
    if (return_value == EOF || return_value == KEYWORD)
      break;
  }
/*
 *   Sort totals by description
 */
  qsort (solution[n]->totals,
	 (size_t) count_mass_balance,
	 (size_t) sizeof (struct conc), conc_compare);
/*
 *   fix up default units and default pe
 */
  for (i = 0; i < count_mass_balance; i++)
  {
    strcpy (token, solution[n]->totals[i].description);
    str_tolower (token);
    if (solution[n]->totals[i].units == NULL)
    {
      solution[n]->totals[i].units = solution[n]->units;
    }
    else
    {
      alk = FALSE;
      if (strstr (token, "alk") == token)
	alk = TRUE;
      strcpy (token1, solution[n]->totals[i].units);
      if (check_units (token1, alk, TRUE, solution[n]->units, TRUE) == ERROR)
      {
	input_error++;
      }
      else
      {
	solution[n]->totals[i].units = string_hsave (token1);
      }
    }
    if (solution[n]->totals[i].n_pe < 0)
    {
      solution[n]->totals[i].n_pe = default_pe;
    }
  }
  solution[n]->default_pe = default_pe;
/*
 *   Mark end of solution
 */
  solution[n]->totals[count_mass_balance].description = NULL;
  solution[n]->count_isotopes = count_isotopes;
  if (count_isotopes > 0)
  {
    qsort (solution[n]->isotopes,
	   (size_t) count_isotopes,
	   (size_t) sizeof (struct isotope), isotope_compare);
  }
  else
  {
    solution[n]->isotopes =
      (struct isotope *) free_check_null (solution[n]->isotopes);
  }
  return (return_value);
}
Example #16
int main(int argc, const char ** argv) {

  mytimer.start();
  print_copyright();

  /* GraphChi initialization will read the command line 
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("rating2");

  knn_sample_percent = get_option_float("knn_sample_percent", 1.0);
  if (knn_sample_percent <= 0 || knn_sample_percent > 1)
    logstream(LOG_FATAL)<<"Sample percent should be in the range (0, 1] " << std::endl;

  num_ratings   = get_option_int("num_ratings", 10);
  if (num_ratings <= 0)
    logstream(LOG_FATAL)<<"num_ratings, the number of recommended items for each user, should be >= 1 " << std::endl;

  debug         = get_option_int("debug", 0);
  tokens_per_row = get_option_int("tokens_per_row", tokens_per_row);
  std::string algorithm     = get_option_string("algorithm");
  /* Basic arguments for RBM algorithm */
  rbm_bins      = get_option_int("rbm_bins", rbm_bins);
  rbm_scaling   = get_option_float("rbm_scaling", rbm_scaling);

  if (algorithm == "svdpp" || algorithm == "svd++")
    algo = SVDPP;
  else if (algorithm == "biassgd")
    algo = BIASSGD;
  else if (algorithm == "rbm")
    algo = RBM;
  else logstream(LOG_FATAL)<<"--algorithm should be svd++ or biassgd or rbm"<<std::endl;

  parse_command_line_args();

  /* Preprocess data if needed, or discover preprocess files */
  int nshards = 0;
  if (tokens_per_row == 3)
    nshards = convert_matrixmarket<edge_data>(training, 0, 0, 3, TRAINING, false);
  else if (tokens_per_row == 4)
    nshards = convert_matrixmarket4<edge_data4>(training);
  else logstream(LOG_FATAL)<<"--tokens_per_row should be either 3 or 4" << std::endl;

  assert(M > 0 && N > 0);
  latent_factors_inmem.resize(M+N); // Initialize in-memory vertices.

  //initialize RBM-specific latent factor vectors
  if (algo == RBM){
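    // users get a pvec of length 3*D, items a pvec of length rbm_bins + rbm_bins*D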
#pragma omp parallel for
    for (uint i=0; i< M+N; i++){
      if (i < M){
        latent_factors_inmem[i].pvec = zeros(D*3);
      }
      else {  
        latent_factors_inmem[i].pvec = zeros(rbm_bins + rbm_bins * D);
      }
    } 
  }
 
  read_factors(training);
  if ((uint)num_ratings > N){
    logstream(LOG_WARNING)<<"num_ratings is too big - setting it to: " << N << std::endl;
    num_ratings = N;
  }
  srand(time(NULL));

  /* Run */
  if (tokens_per_row == 3){
    RatingVerticesInMemProgram<VertexDataType, EdgeDataType> program;
    graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); 
    set_engine_flags(engine);
    engine.run(program, 1);
  } 
  else if (tokens_per_row == 4){
    RatingVerticesInMemProgram<VertexDataType, edge_data4> program;
    graphchi_engine<VertexDataType, edge_data4> engine(training, nshards, false, m); 
    set_engine_flags(engine);
    engine.run(program, 1);
  }
  /* Output latent factor matrices in matrix-market format */
  output_knn_result(training);

  rating_stats();

  if (users_without_ratings > 0)
    logstream(LOG_WARNING)<<"Found " << users_without_ratings << " users without ratings. For those users no items are recommended (item id 0)" << std::endl;

  if (users_no_ratings > 0)
    logstream(LOG_WARNING)<<"Failed to compute ratings for " << users_no_ratings << " users. For those users no items are recommended (item id 0)" << std::endl;


  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  return 0;
}