예제 #1
0
파일: io.hpp 프로젝트: abello/graphchi-cpp
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false, int type = TRAINING, int matlab_time_offset = 1) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  FILE *f = NULL;
  size_t nz;
  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {

    if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) {
      logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
      read_global_mean(base_filename, type);
    }
    if (type == TRAINING)
      time_nodes_offset = M+N;
     return nshards;
  }

  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();


  detect_matrix_size(base_filename, f, type == TRAINING? M:Me, type == TRAINING? N:Ne, nz);
  if (f == NULL){
    if (type == VALIDATION){
      logstream(LOG_INFO)<< "Did not find validation file: " << base_filename << std::endl;
      return -1;
    }
    else if (type == TRAINING)
      logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl;
  }

  if (type == TRAINING)
     time_nodes_offset = M+N;
 
 compute_matrix_size(nz, type); 

  uint I, J;
  double val, time;
  bool active_edge = true;

    for (size_t i=0; i<nz; i++)
    {
      int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val);
      if (rc != 4)
        logstream(LOG_FATAL)<<"Error when reading input file - line " << i << std::endl;
      if (time < 0)
        logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl;
      I-=input_file_offset;  /* adjust from 1-based to 0-based */
      J-=input_file_offset;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl;
      K = std::max((int)time, (int)K);
      time -= matlab_time_offset;
      if (time < 0 && add_time_edges)
        logstream(LOG_FATAL)<<"Time bins should be >= " << matlab_time_offset << " in row " << i << std::endl;

      //only for tensor ALS we add edges between user and time bin and also item and time bin
      //time bins are numbered beteen M+N to M+N+K
      if (!weighted_als)
         time += time_nodes_offset;

      //avoid self edges
      if (square && I == J)
        continue;

      active_edge = decide_if_edge_is_active(i, type);

      if (active_edge){
        if (type == TRAINING)
        globalMean += val;
        else globalMean2 += val;
        sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time));
      }
      //in case of a tensor, add besides of the user-> movie edge also
      //time -> user and time-> movie edges
      if (add_time_edges){
        sharderobj.preprocessing_add_edge((uint)time, I, als_edge_type(val, M+J));
        sharderobj.preprocessing_add_edge((uint)time, M+J , als_edge_type(val, I));
      }
    }

    if (type == TRAINING){
      uint toadd = 0;
      if (implicitratingtype == IMPLICIT_RATING_RANDOM)
        toadd = add_implicit_edges4(implicitratingtype, sharderobj);
      globalMean += implicitratingvalue * toadd;
      L += toadd;
      globalMean /= L;
      logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl;
    }
    else {
      globalMean2 /= Le;
      logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " time bins: " << K << " . Now creating shards." << std::endl;
    }
    write_global_mean(base_filename, type);

    sharderobj.end_preprocessing();

 
  fclose(f);
  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));

  return nshards;
}
예제 #2
0
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false) {
  // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c
  int ret_code;
  MM_typecode matcode;
  FILE *f;
  size_t nz;
  /**
   * Create sharder object
   */
  int nshards;
  if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) {
    logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl;
    FILE * inf = fopen((base_filename + ".gm").c_str(), "r");
    int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K);
    if (rc != 5)
      logstream(LOG_FATAL)<<"Failed to read global mean from file" << base_filename << ".gm" << std::endl;
    fclose(inf);
    if (K <= 0)
      logstream(LOG_FATAL)<<"Incorrect number of time bins K in .gm file " << base_filename << ".gm" << std::endl;

    logstream(LOG_INFO) << "Read matrix of size " << M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl;
    return nshards;
  }   

  sharder<als_edge_type> sharderobj(base_filename);
  sharderobj.start_preprocessing();


  if ((f = fopen(base_filename.c_str(), "r")) == NULL) {
    logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl;
  }


  if (mm_read_banner(f, &matcode) != 0)
    logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl;


  /*  This is how one can screen matrix types if their application */
  /*  only supports a subset of the Matrix Market data types.      */

  if (mm_is_complex(matcode) || !mm_is_sparse(matcode))
    logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl;

  /* find out size of sparse matrix .... */

  if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) {
    logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl;
  }

  logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " 
    << M << " x " << N << ", non-zeros: " << nz << std::endl;

  uint I, J;
  double val, time;

  if (!sharderobj.preprocessed_file_exists()) {
    for (size_t i=0; i<nz; i++)
    {
      int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val);
      if (rc != 4)
        logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl;
      if (time < 0)
        logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl;
      I--;  /* adjust from 1-based to 0-based */
      J--;
      if (I >= M)
        logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl;
      if (J >= N)
        logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl;
      K = std::max((int)time, (int)K);
      //avoid self edges
      if (square && I == J)
        continue;
      globalMean += val; 
      L++;
      sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time+M+N));
      //in case of a tensor, add besides of the user-> movie edge also
      //time -> user and time-> movie edges
      if (add_time_edges){
        sharderobj.preprocessing_add_edge((uint)time + M + N, I, als_edge_type(val, M+J));
        sharderobj.preprocessing_add_edge((uint)time + M + N, M+J , als_edge_type(val, I));
      }
    }

    uint toadd = 0;
    if (implicitratingtype == IMPLICIT_RATING_RANDOM)
      toadd = add_implicit_edges4(implicitratingtype, sharderobj);
    globalMean += implicitratingvalue * toadd;
    L += toadd;

    sharderobj.end_preprocessing();
    globalMean /= L;
    logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl;
    FILE * outf = fopen((base_filename + ".gm").c_str(), "w");
    fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K);
    fclose(outf);


  } else {
    logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl;
  }

  fclose(f);
  logstream(LOG_INFO) << "Now creating shards." << std::endl;

  // Shard with a specified number of shards, or determine automatically if not defined
  nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto"));

  return nshards;
}