uint add_implicit_edges(int type, sharder<als_edge_type>& shrd ){ switch(type){ case IMPLICIT_RATING_DISABLED: return 0; case IMPLICIT_RATING_RANDOM: break; default: assert(false); }; uint added = 0; uint toadd = implicitratingpercentage*N*M; logstream(LOG_INFO)<<"Going to add: " << toadd << " implicit edges. " << std::endl; assert(toadd >= 1); for (uint j=0; j< toadd; j++){ ivec item = ::randi(1,0,N-1); ivec user = ::randi(1,0,M-1); shrd.preprocessing_add_edge(user[0], item[0], als_edge_type(implicitratingvalue)); added++; } logstream(LOG_INFO)<<"Finished adding " << toadd << " implicit edges. " << std::endl; return added; }
int convert_matrixmarket(std::string base_filename, size_t nodes = 0, size_t edges = 0, int tokens_per_row = 3, int type = TRAINING, int allow_square = true) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING?M:Me, type == TRAINING?N:Ne, nz, nodes, edges, type); if (f == NULL){ if (type == TRAINING){ logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } else if (type == VALIDATION){ logstream(LOG_INFO)<<"Validation file: " << base_filename << " is not found. " << std::endl; return -1; } } compute_matrix_size(nz, type); uint I, J; double val = 1.0; bool active_edge = true; int zero_entries = 0; for (size_t i=0; i<nz; i++) { if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Encountered zero edge [ " << I << " " <<J << " 0] in line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." << std::endl; else if (val == 0){ zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); if (I ==987654321 || J== 987654321) //hack - to be removed later continue; I-=(uint)input_file_offset; /* adjust from 1-based to 0-based */ J-=(uint)input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I+1 << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J+1 << " > " << N << " in line; " << i << std::endl; if (minval != -1e100 && val < minval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where min value is: " << minval << std::endl; if (maxval != 1e100 && val > maxval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where max value is: " << maxval << std::endl; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (M==N && allow_square)?J:M + J, als_edge_type((float)val)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " zero edges!" << std::endl; fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename<< std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket_and_item_similarity(std::string base_filename, std::string similarity_file, int tokens_per_row, vec & degrees) { FILE *f = NULL, *fsim = NULL; size_t nz, nz_sim; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, TRAINING); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, M, N, nz); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; uint N_row = 0 ,N_col = 0; detect_matrix_size(similarity_file, fsim, N_row, N_col, nz_sim); if (fsim == NULL || nz_sim == 0) logstream(LOG_FATAL)<<"Failed to open item similarity input file: " << similarity_file << std::endl; if (N_row != N || N_col != N) logstream(LOG_FATAL)<<"Wrong item similarity file matrix size: " << N_row <<" x " << N_col << " Instead of " << N << " x " << N << std::endl; L=nz + nz_sim; degrees.resize(M+N); uint I, J; double val = 1.0; int zero_entries = 0; unsigned int actual_edges = 0; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; for (size_t i=0; i<nz; i++){ if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file in line: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Zero weight encountered at input file line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." << std::endl; else if (val == 0) { zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; degrees[J+M]++; degrees[I]++; if (I< (uint)start_user || I >= (uint)end_user){ continue; } sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val, 0)); //std::cout<<"adding an edge: " <<I << " -> " << M+J << std::endl; actual_edges++; } logstream(LOG_DEBUG)<<"Finished loading " << actual_edges << " ratings from file: " << base_filename << std::endl; for (size_t i=0; i<nz_sim; i++){ if (tokens_per_row == 3){ int rc = fscanf(fsim, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << similarity_file << " line: " << i << std::endl; } else if (tokens_per_row == 2){ int rc = fscanf(fsim, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= N) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; if (I == J) logstream(LOG_FATAL)<<"Item similarity to itself found for item " << I << " in line; " << i << std::endl; //std::cout<<"Adding an edge between "<<M+I<< " : " << M+J << " " << (I<J) << " " << val << std::endl; sharderobj.preprocessing_add_edge(M+I, M+J, als_edge_type(I < J? val: 0, I>J? val: 0)); actual_edges++; } L = actual_edges; logstream(LOG_DEBUG)<<"Finished loading " << nz_sim << " ratings from file: " << similarity_file << std::endl; write_global_mean(base_filename, TRAINING); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " edges with zero weight!" << std::endl; fclose(f); fclose(fsim); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename << std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false, int type = TRAINING, int matlab_time_offset = 1) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f = NULL; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); } if (type == TRAINING) time_nodes_offset = M+N; return nshards; } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING? M:Me, type == TRAINING? N:Ne, nz); if (f == NULL){ if (type == VALIDATION){ logstream(LOG_INFO)<< "Did not find validation file: " << base_filename << std::endl; return -1; } else if (type == TRAINING) logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } if (type == TRAINING) time_nodes_offset = M+N; compute_matrix_size(nz, type); uint I, J; double val, time; bool active_edge = true; for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val); if (rc != 4) logstream(LOG_FATAL)<<"Error when reading input file - line " << i << std::endl; if (time < 0) logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl; I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; K = std::max((int)time, (int)K); time -= matlab_time_offset; if (time < 0 && add_time_edges) logstream(LOG_FATAL)<<"Time bins should be >= " << matlab_time_offset << " in row " << i << std::endl; //only for tensor ALS we add edges between user and time bin and also item and time bin //time bins are numbered beteen M+N to M+N+K if (!weighted_als) time += time_nodes_offset; //avoid self edges if (square && I == J) continue; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time)); } //in case of a tensor, add besides of the user-> movie edge also //time -> user and time-> movie edges if (add_time_edges){ sharderobj.preprocessing_add_edge((uint)time, I, als_edge_type(val, M+J)); sharderobj.preprocessing_add_edge((uint)time, M+J , als_edge_type(val, I)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges4(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " time bins: " << K << " . Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
int convert_matrixmarket_N(std::string base_filename, bool square, int limit_rating = 0) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f; size_t nz; int nshards; if (validation_only && (nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * infile = fopen((base_filename + ".gm").c_str(), "r"); int node_id_maps_size = 0; assert( fscanf(infile, "%d\n%d\n%ld\n%d\n%lf\n%d\n%d\n%d\n", &M, &N, &L, &fc.total_features, &globalMean, &node_id_maps_size, &latent_factors_inmem_size,&num_feature_bins_size) ==8); assert(node_id_maps_size >= 0); assert(latent_factors_inmem_size >=M+N); fclose(infile); fc.node_id_maps.resize(node_id_maps_size); for (int i=0; i < (int)fc.node_id_maps.size(); i++){ char buf[256]; sprintf(buf, "%s.map.%d", training.c_str(), i); load_map_from_txt_file(fc.node_id_maps[i].string2nodeid, buf, 2); assert(fc.node_id_maps[i].string2nodeid.size() > 0); } logstream(LOG_INFO)<<"Finished loading " << node_id_maps_size << " maps. "<<std::endl; return nshards; } } /** * Create sharder object */ sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, M, N, nz, 0, 0, 0); if (f == NULL) logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; if (M == 0 && N == 0) logstream(LOG_FATAL)<<"Failed to detect matrix size. Please prepare a file named: " << base_filename << ":info with matrix market header, as explained here: http://bickson.blogspot.co.il/2012/12/collaborative-filtering-3rd-generation_14.html " << std::endl; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; if (has_header_titles){ char * linebuf = NULL; size_t linesize; char linebuf_debug[1024]; /* READ LINE */ int rc = getline(&linebuf, &linesize, f); if (rc == -1) logstream(LOG_FATAL)<<"Error header line " << " [ " << linebuf_debug << " ] " << std::endl; strncpy(linebuf_debug, linebuf, 1024); char *pch = strtok(linebuf,ptokens); if (pch == NULL) logstream(LOG_FATAL)<<"Error header line " << " [ " << linebuf_debug << " ] " << std::endl; header_titles.push_back(std::string(pch)); while (pch != NULL){ pch = strtok(NULL, ptokens); if (pch == NULL) break; header_titles.push_back(pch); } } compute_matrix_size(nz, TRAINING); uint I, J; int val_array_len = std::max(1, fc.total_features); assert(val_array_len < FEATURE_WIDTH); std::vector<float> valarray; valarray.resize(val_array_len); float val = 0.0f; if (limit_rating > 0 && limit_rating < (int)nz) nz = limit_rating; char linebuf_debug[1024]; for (size_t i=0; i<nz; i++) { if (!read_line(f, base_filename, i,I, J, val, valarray, TRAINING, linebuf_debug)) logstream(LOG_FATAL)<<"Failed to read line: " <<i<< " in file: " << base_filename << std::endl; if (I>= M || J >= N || I < 0 || J < 0){ if (i == 0) logstream(LOG_FATAL)<<"Failed to parse first line, there are too many tokens. Did you forget the --has_header_titles=1 flag when file has string column headers? [ " << linebuf_debug << " ] " << " I : " << I << " J: " << J << std::endl; else logstream(LOG_FATAL)<<"Problem parsing input line number: " << i <<" in file: " << base_filename << ". Can not add edge from " << I << " to J " << J << " since matrix size is: " << M <<"x" <<N<< " [ original line: " << linebuf_debug << " ] . You probaably need to increase matrix size in the matrix market header." << std::endl; } bool active_edge = decide_if_edge_is_active(i, TRAINING); if (active_edge){ //calc stats globalMean += val; sharderobj.preprocessing_add_edge(I, square?J:M+J, als_edge_type(val, &valarray[0], val_array_len)); } } sharderobj.end_preprocessing(); //calc stats assert(L > 0); //assert(globalMean != 0); if (globalMean == 0) logstream(LOG_WARNING)<<"Found global mean of the data to be zero (val_pos). Please verify this is correct." << std::endl; globalMean /= L; logstream(LOG_INFO)<<"Computed global mean is: " << globalMean << std::endl; inputGlobalMean = globalMean; fclose(f); if (fc.hash_strings){ for (int i=0; i< fc.total_features+2; i++){ if (fc.node_id_maps[i].string2nodeid.size() == 0) logstream(LOG_FATAL)<<"Failed sanity check for feature number : " << i << " no values find in data " << std::endl; } } logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c int ret_code; MM_typecode matcode; FILE *f; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * inf = fopen((base_filename + ".gm").c_str(), "r"); int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K); if (rc != 5) logstream(LOG_FATAL)<<"Failed to read global mean from file" << base_filename << ".gm" << std::endl; fclose(inf); if (K <= 0) logstream(LOG_FATAL)<<"Incorrect number of time bins K in .gm file " << base_filename << ".gm" << std::endl; logstream(LOG_INFO) << "Read matrix of size " << M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl; return nshards; } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); if ((f = fopen(base_filename.c_str(), "r")) == NULL) { logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; } if (mm_read_banner(f, &matcode) != 0) logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl; /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ if (mm_is_complex(matcode) || !mm_is_sparse(matcode)) logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl; /* find out size of sparse matrix .... */ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) { logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl; } logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; uint I, J; double val, time; if (!sharderobj.preprocessed_file_exists()) { for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val); if (rc != 4) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; if (time < 0) logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl; I--; /* adjust from 1-based to 0-based */ J--; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; K = std::max((int)time, (int)K); //avoid self edges if (square && I == J) continue; globalMean += val; L++; sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time+M+N)); //in case of a tensor, add besides of the user-> movie edge also //time -> user and time-> movie edges if (add_time_edges){ sharderobj.preprocessing_add_edge((uint)time + M + N, I, als_edge_type(val, M+J)); sharderobj.preprocessing_add_edge((uint)time + M + N, M+J , als_edge_type(val, I)); } } uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges4(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; sharderobj.end_preprocessing(); globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl; FILE * outf = fopen((base_filename + ".gm").c_str(), "w"); fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K); fclose(outf); } else { logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl; } fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
int convert_matrixmarket(std::string base_filename, SharderPreprocessor<als_edge_type> * preprocessor = NULL) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c int ret_code; MM_typecode matcode; FILE *f; size_t nz; std::string suffix = ""; if (preprocessor != NULL) { suffix = preprocessor->getSuffix(); } /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename+ suffix, get_option_string("nshards", "auto")))) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * inf = fopen((base_filename + ".gm").c_str(), "r"); int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K); if (rc != 5) logstream(LOG_FATAL)<<"Failed to read global mean from file" << base_filename+ suffix << ".gm" << std::endl; fclose(inf); logstream(LOG_INFO) << "Opened matrix size: " <<M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl; return nshards; } sharder<als_edge_type> sharderobj(base_filename + suffix); sharderobj.start_preprocessing(); if ((f = fopen(base_filename.c_str(), "r")) == NULL) { logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; } if (mm_read_banner(f, &matcode) != 0) logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl; /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ if (mm_is_complex(matcode) || !mm_is_sparse(matcode)) logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl; /* find out size of sparse matrix .... */ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) { logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl; } L=nz; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; uint I, J; double val; if (!sharderobj.preprocessed_file_exists()) { for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; I--; /* adjust from 1-based to 0-based */ J--; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; globalMean += val; sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val)); } uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; sharderobj.end_preprocessing(); globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; if (preprocessor != NULL) { preprocessor->reprocess(sharderobj.preprocessed_name(), base_filename); } FILE * outf = fopen((base_filename + ".gm").c_str(), "w"); fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K); fclose(outf); } else { logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl; } fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename + suffix << std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }