int convert_matrixmarket(std::string base_filename, size_t nodes = 0, size_t edges = 0, int tokens_per_row = 3, int type = TRAINING, int allow_square = true) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING?M:Me, type == TRAINING?N:Ne, nz, nodes, edges, type); if (f == NULL){ if (type == TRAINING){ logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } else if (type == VALIDATION){ logstream(LOG_INFO)<<"Validation file: " << base_filename << " is not found. " << std::endl; return -1; } } compute_matrix_size(nz, type); uint I, J; double val = 1.0; bool active_edge = true; int zero_entries = 0; for (size_t i=0; i<nz; i++) { if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Encountered zero edge [ " << I << " " <<J << " 0] in line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." << std::endl; else if (val == 0){ zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); if (I ==987654321 || J== 987654321) //hack - to be removed later continue; I-=(uint)input_file_offset; /* adjust from 1-based to 0-based */ J-=(uint)input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I+1 << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J+1 << " > " << N << " in line; " << i << std::endl; if (minval != -1e100 && val < minval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where min value is: " << minval << std::endl; if (maxval != 1e100 && val > maxval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where max value is: " << maxval << std::endl; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (M==N && allow_square)?J:M + J, als_edge_type((float)val)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " zero edges!" << std::endl; fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename<< std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket(std::string base_filename, SharderPreprocessor<als_edge_type> * preprocessor = NULL) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c int ret_code; MM_typecode matcode; FILE *f; size_t nz; std::string suffix = ""; if (preprocessor != NULL) { suffix = preprocessor->getSuffix(); } /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename+ suffix, get_option_string("nshards", "auto")))) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * inf = fopen((base_filename + ".gm").c_str(), "r"); int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K); if (rc != 5) logstream(LOG_FATAL)<<"Failed to read global mean from file" << base_filename+ suffix << ".gm" << std::endl; fclose(inf); logstream(LOG_INFO) << "Opened matrix size: " <<M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl; return nshards; } sharder<als_edge_type> sharderobj(base_filename + suffix); sharderobj.start_preprocessing(); if ((f = fopen(base_filename.c_str(), "r")) == NULL) { logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; } if (mm_read_banner(f, &matcode) != 0) logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl; /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ if (mm_is_complex(matcode) || !mm_is_sparse(matcode)) logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl; /* find out size of sparse matrix .... */ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) { logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl; } L=nz; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; uint I, J; double val; if (!sharderobj.preprocessed_file_exists()) { for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; I--; /* adjust from 1-based to 0-based */ J--; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; globalMean += val; sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val)); } uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; sharderobj.end_preprocessing(); globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; if (preprocessor != NULL) { preprocessor->reprocess(sharderobj.preprocessed_name(), base_filename); } FILE * outf = fopen((base_filename + ".gm").c_str(), "w"); fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K); fclose(outf); } else { logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl; } fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename + suffix << std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }