int main(int argc, const char ** argv) { print_copyright(); /* GraphChi initialization will read the command line arguments and the configuration file. */ graphchi_init(argc, argv); /* Metrics object for keeping track of performance counters and other information. Currently required. */ metrics m("itemsim2rating2"); /* Basic arguments for application */ min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection); debug = get_option_int("debug", 0); parse_command_line_args(); std::string similarity = get_option_string("similarity", ""); if (similarity == "") Rcpp::Rcerr<<"Missing similarity input file. Please specify one using the --similarity=filename command line flag" << std::endl; undirected = get_option_int("undirected", 1); Q = get_option_float("Q", Q); K = get_option_int("K"); mytimer.start(); vec unused; int nshards = convert_matrixmarket_and_item_similarity<edge_data>(training, similarity, 3, unused); assert(M > 0 && N > 0); //initialize data structure which saves a subset of the items (pivots) in memory adjcontainer = new adjlist_container(); //array for marking which items are conected to the pivot items via users. relevant_items = new bool[N]; /* Run */ ItemDistanceProgram program; graphchi_engine<VertexDataType, edge_data> engine(training, nshards, true, m); set_engine_flags(engine); out_file = open_file((training + "-rec").c_str(), "w"); //run the program engine.run(program, niters); /* Report execution metrics */ if (!quiet) metrics_report(m); Rcpp::Rcout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << written_pairs << std::endl; if (zero_edges) Rcpp::Rcout<<"Found: " << zero_edges<< " user edges with weight zero. Those are ignored." <<std::endl; delete[] relevant_items; fclose(out_file); return 0; }
/**
 * Emit collected metrics through every reporter named in the comma-separated
 * "metrics.reporter" option. Supported reporters: basic/console, file, html;
 * unknown names are logged and skipped.
 */
static VARIABLE_IS_NOT_USED void metrics_report(metrics &m) {
  std::string reporters = get_option_string("metrics.reporter", "console");
  // Fix: strtok() writes NUL bytes into the buffer it tokenizes. Mutating the
  // internal buffer of a std::string through a cast-away c_str() is undefined
  // behavior, so tokenize a private copy instead.
  char * creps = strdup(reporters.c_str());
  const char * delims = ",";
  char * t = strtok(creps, delims);
  while(t != NULL) {
    std::string repname(t);
    if (repname == "basic" || repname == "console") {
      basic_reporter rep;
      m.report(rep);
    }
    else if (repname == "file") {
      file_reporter rep(get_option_string("metrics.reporter.filename", "metrics.txt"));
      m.report(rep);
    }
    else if (repname == "html") {
      html_reporter rep(get_option_string("metrics.reporter.htmlfile", "metrics.html"));
      m.report(rep);
    }
    else {
      logstream(LOG_WARNING) << "Could not find metrics reporter with name [" << repname << "], ignoring." << std::endl;
    }
    t = strtok(NULL, delims);
  }
  free(creps);  // release the strdup'd copy
}
/* Entry point for the baseline estimators: computes the global, per-user, or
   per-item mean of the training ratings (selected via --algorithm) and
   evaluates it against the validation/test set. */
int main(int argc, const char ** argv) {
  print_copyright();
  /* GraphChi initialization will read the command line arguments and the configuration file. */
  graphchi_init(argc, argv);
  /* Metrics object for keeping track of performance counters and other information. Currently required. */
  metrics m("sgd-inmemory-factors");
  /* Select which baseline estimator to run; defaults to the global training mean. */
  algorithm = get_option_string("algorithm", "global_mean");
  if (algorithm == "global_mean")
    algo = GLOBAL_MEAN;
  else if (algorithm == "user_mean")
    algo = USER_MEAN;
  else if (algorithm == "item_mean")
    algo = ITEM_MEAN;
  else
    logstream(LOG_FATAL)<<"Unsupported algorithm name. Should be --algorithm=XX where XX is one of [global_mean,user_mean,item_mean] for example --algorithm=global_mean" << std::endl;
  parse_command_line_args();
  mytimer.start();
  /* Preprocess data if needed, or discover preprocess files */
  int nshards = convert_matrixmarket<float>(training, NULL, 0, 0, 3, TRAINING, false);
  // One in-memory factor slot per user and per item (M users + N items).
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, false);
  // Per-OpenMP-thread RMSE accumulators.
  rmse_vec = zeros(number_of_omp_threads());
  print_config();
  /* Run */
  BaselineVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m);
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, 1);  // a single iteration over the graph
  if (algo == USER_MEAN || algo == ITEM_MEAN)
    output_baseline_result(training);
  test_predictions(&baseline_predict);
  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  return 0;
}
int convert_matrixmarket(std::string base_filename, size_t nodes = 0, size_t edges = 0, int tokens_per_row = 3, int type = TRAINING, int allow_square = true) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING?M:Me, type == TRAINING?N:Ne, nz, nodes, edges, type); if (f == NULL){ if (type == TRAINING){ logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } else if (type == VALIDATION){ logstream(LOG_INFO)<<"Validation file: " << base_filename << " is not found. " << std::endl; return -1; } } compute_matrix_size(nz, type); uint I, J; double val = 1.0; bool active_edge = true; int zero_entries = 0; for (size_t i=0; i<nz; i++) { if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Encountered zero edge [ " << I << " " <<J << " 0] in line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." 
<< std::endl; else if (val == 0){ zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); if (I ==987654321 || J== 987654321) //hack - to be removed later continue; I-=(uint)input_file_offset; /* adjust from 1-based to 0-based */ J-=(uint)input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I+1 << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J+1 << " > " << N << " in line; " << i << std::endl; if (minval != -1e100 && val < minval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where min value is: " << minval << std::endl; if (maxval != 1e100 && val > maxval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where max value is: " << maxval << std::endl; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (M==N && allow_square)?J:M + J, als_edge_type((float)val)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " zero edges!" << std::endl; fclose(f); logstream(LOG_INFO) << "Now creating shards." 
<< std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename<< std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket_and_item_similarity(std::string base_filename, std::string similarity_file, int tokens_per_row, vec & degrees) { FILE *f = NULL, *fsim = NULL; size_t nz, nz_sim; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, TRAINING); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, M, N, nz); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; uint N_row = 0 ,N_col = 0; detect_matrix_size(similarity_file, fsim, N_row, N_col, nz_sim); if (fsim == NULL || nz_sim == 0) logstream(LOG_FATAL)<<"Failed to open item similarity input file: " << similarity_file << std::endl; if (N_row != N || N_col != N) logstream(LOG_FATAL)<<"Wrong item similarity file matrix size: " << N_row <<" x " << N_col << " Instead of " << N << " x " << N << std::endl; L=nz + nz_sim; degrees.resize(M+N); uint I, J; double val = 1.0; int zero_entries = 0; unsigned int actual_edges = 0; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; for (size_t i=0; i<nz; i++){ if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file in line: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Zero weight encountered at input file line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." 
<< std::endl; else if (val == 0) { zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; degrees[J+M]++; degrees[I]++; if (I< (uint)start_user || I >= (uint)end_user){ continue; } sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val, 0)); //std::cout<<"adding an edge: " <<I << " -> " << M+J << std::endl; actual_edges++; } logstream(LOG_DEBUG)<<"Finished loading " << actual_edges << " ratings from file: " << base_filename << std::endl; for (size_t i=0; i<nz_sim; i++){ if (tokens_per_row == 3){ int rc = fscanf(fsim, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << similarity_file << " line: " << i << std::endl; } else if (tokens_per_row == 2){ int rc = fscanf(fsim, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= N) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; if (I == J) logstream(LOG_FATAL)<<"Item similarity to itself found for item " << I << " in line; " << i << std::endl; //std::cout<<"Adding an edge between "<<M+I<< " : " << M+J << " " << (I<J) << " " << val << std::endl; sharderobj.preprocessing_add_edge(M+I, M+J, 
als_edge_type(I < J? val: 0, I>J? val: 0)); actual_edges++; } L = actual_edges; logstream(LOG_DEBUG)<<"Finished loading " << nz_sim << " ratings from file: " << similarity_file << std::endl; write_global_mean(base_filename, TRAINING); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " edges with zero weight!" << std::endl; fclose(f); fclose(fsim); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename << std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false, int type = TRAINING, int matlab_time_offset = 1) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f = NULL; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); } if (type == TRAINING) time_nodes_offset = M+N; return nshards; } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING? M:Me, type == TRAINING? N:Ne, nz); if (f == NULL){ if (type == VALIDATION){ logstream(LOG_INFO)<< "Did not find validation file: " << base_filename << std::endl; return -1; } else if (type == TRAINING) logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } if (type == TRAINING) time_nodes_offset = M+N; compute_matrix_size(nz, type); uint I, J; double val, time; bool active_edge = true; for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val); if (rc != 4) logstream(LOG_FATAL)<<"Error when reading input file - line " << i << std::endl; if (time < 0) logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl; I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; K = std::max((int)time, (int)K); time -= matlab_time_offset; if (time < 0 && add_time_edges) 
logstream(LOG_FATAL)<<"Time bins should be >= " << matlab_time_offset << " in row " << i << std::endl; //only for tensor ALS we add edges between user and time bin and also item and time bin //time bins are numbered beteen M+N to M+N+K if (!weighted_als) time += time_nodes_offset; //avoid self edges if (square && I == J) continue; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time)); } //in case of a tensor, add besides of the user-> movie edge also //time -> user and time-> movie edges if (add_time_edges){ sharderobj.preprocessing_add_edge((uint)time, I, als_edge_type(val, M+J)); sharderobj.preprocessing_add_edge((uint)time, M+J , als_edge_type(val, I)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges4(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " time bins: " << K << " . Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
void parse_parser_command_line_arges(){ fc.string_features = get_option_string("features", fc.default_feature_str); csv = get_option_int("csv", 0); if (csv) ptokens = csv_tokens; file_columns = get_option_int("file_columns", file_columns); //get the number of columns in the edge file //input sanity checks if (file_columns < 3) logstream(LOG_FATAL)<<"You must have at least 3 columns in input file: [from] [to] [value] on each line"<<std::endl; if (file_columns >= FEATURE_WIDTH) logstream(LOG_FATAL)<<"file_columns exceeds the allowed storage limit - please increase FEATURE_WIDTH and recompile." << std::endl; fc.from_pos = get_option_int("from_pos", fc.from_pos); fc.to_pos = get_option_int("to_pos", fc.to_pos); fc.val_pos = get_option_int("val_pos", fc.val_pos); if (fc.from_pos >= file_columns || fc.to_pos >= file_columns || fc.val_pos >= file_columns) logstream(LOG_FATAL)<<"Please note that column numbering of from_pos, to_pos and val_pos starts from zero and should be smaller than file_columns" << std::endl; if (fc.from_pos == fc.to_pos || fc.from_pos == fc.val_pos || fc.to_pos == fc.val_pos) logstream(LOG_FATAL)<<"from_pos, to_pos and val_pos should have different values" << std::endl; if (fc.val_pos == -1) logstream(LOG_FATAL)<<"you must specify a target column using --val_pos=XXX. Colmn index starts from 0." 
<< std::endl; has_header_titles = get_option_int("has_header_titles", has_header_titles); limit_rating= get_option_int("limit_rating", 0); //parse features (optional) if (fc.string_features != ""){ char * pfeatures = strdup(fc.string_features.c_str()); char * pch = strtok(pfeatures, ptokens); int node = atoi(pch); if (node < 0 || node >= MAX_FEATURES+3) logstream(LOG_FATAL)<<"Feature id using the --features=XX command should be non negative, starting from zero"<<std::endl; if (node >= file_columns) logstream(LOG_FATAL)<<"Feature id using the --feature=XX command should be < file_columns (counting starts from zero)" << std::endl; if (node == fc.from_pos || node == fc.to_pos || node == fc.val_pos) logstream(LOG_FATAL)<<"Feature id " << node << " can not be equal to --from_pos, --to_pos or --val_pos " << std::endl; fc.feature_selection[node] = true; fc.total_features++; while ((pch = strtok(NULL, ptokens))!= NULL){ node = atoi(pch); if (node < 0 || node >= MAX_FEATURES+3) logstream(LOG_FATAL)<<"Feature id using the --features=XX command should be non negative, starting from zero"<<std::endl; fc.feature_selection[node] = true; fc.total_features++; } } train_only = get_option_int("train_only", 0); validation_only = get_option_int("validation_only", 0); fc.node_id_maps.resize(2+fc.total_features); real_features_string = get_option_string("real_features", real_features_string); //parse real features (optional) if (real_features_string != ""){ int i=0; char * pfeatures = strdup(real_features_string.c_str()); char * pch = strtok(pfeatures, ptokens); int node = atoi(pch); if (node < 0 || node >= MAX_FEATURES+3) logstream(LOG_FATAL)<<"Feature id using the --real_features=XX command should be non negative, starting from zero"<<std::endl; if (node >= file_columns) logstream(LOG_FATAL)<<"Feature id using the --real_feature=XX command should be < file_columns (counting starts from zero)" << std::endl; if (node == fc.from_pos || node == fc.to_pos || node == fc.val_pos) 
logstream(LOG_FATAL)<<"Feature id " << node << " can not be equal to --from_pos, --to_pos or --val_pos " << std::endl; fc.real_features_indicators[node] = true; fc.feature_positions[node] = i; i++; while ((pch = strtok(NULL, ptokens))!= NULL){ node = atoi(pch); if (node < 0 || node >= MAX_FEATURES+3) logstream(LOG_FATAL)<<"Feature id using the --real_features=XX command should be non negative, starting from zero"<<std::endl; fc.real_features_indicators[node] = true; fc.feature_positions[node] = i; i++; } } }
int convert_matrixmarket_N(std::string base_filename, bool square, int limit_rating = 0) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f; size_t nz; int nshards; if (validation_only && (nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * infile = fopen((base_filename + ".gm").c_str(), "r"); int node_id_maps_size = 0; assert( fscanf(infile, "%d\n%d\n%ld\n%d\n%lf\n%d\n%d\n%d\n", &M, &N, &L, &fc.total_features, &globalMean, &node_id_maps_size, &latent_factors_inmem_size,&num_feature_bins_size) ==8); assert(node_id_maps_size >= 0); assert(latent_factors_inmem_size >=M+N); fclose(infile); fc.node_id_maps.resize(node_id_maps_size); for (int i=0; i < (int)fc.node_id_maps.size(); i++){ char buf[256]; sprintf(buf, "%s.map.%d", training.c_str(), i); load_map_from_txt_file(fc.node_id_maps[i].string2nodeid, buf, 2); assert(fc.node_id_maps[i].string2nodeid.size() > 0); } logstream(LOG_INFO)<<"Finished loading " << node_id_maps_size << " maps. "<<std::endl; return nshards; } } /** * Create sharder object */ sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, M, N, nz, 0, 0, 0); if (f == NULL) logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; if (M == 0 && N == 0) logstream(LOG_FATAL)<<"Failed to detect matrix size. Please prepare a file named: " << base_filename << ":info with matrix market header, as explained here: http://bickson.blogspot.co.il/2012/12/collaborative-filtering-3rd-generation_14.html " << std::endl; logstream(LOG_INFO) << "Starting to read matrix-market input. 
Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; if (has_header_titles){ char * linebuf = NULL; size_t linesize; char linebuf_debug[1024]; /* READ LINE */ int rc = getline(&linebuf, &linesize, f); if (rc == -1) logstream(LOG_FATAL)<<"Error header line " << " [ " << linebuf_debug << " ] " << std::endl; strncpy(linebuf_debug, linebuf, 1024); char *pch = strtok(linebuf,ptokens); if (pch == NULL) logstream(LOG_FATAL)<<"Error header line " << " [ " << linebuf_debug << " ] " << std::endl; header_titles.push_back(std::string(pch)); while (pch != NULL){ pch = strtok(NULL, ptokens); if (pch == NULL) break; header_titles.push_back(pch); } } compute_matrix_size(nz, TRAINING); uint I, J; int val_array_len = std::max(1, fc.total_features); assert(val_array_len < FEATURE_WIDTH); std::vector<float> valarray; valarray.resize(val_array_len); float val = 0.0f; if (limit_rating > 0 && limit_rating < (int)nz) nz = limit_rating; char linebuf_debug[1024]; for (size_t i=0; i<nz; i++) { if (!read_line(f, base_filename, i,I, J, val, valarray, TRAINING, linebuf_debug)) logstream(LOG_FATAL)<<"Failed to read line: " <<i<< " in file: " << base_filename << std::endl; if (I>= M || J >= N || I < 0 || J < 0){ if (i == 0) logstream(LOG_FATAL)<<"Failed to parse first line, there are too many tokens. Did you forget the --has_header_titles=1 flag when file has string column headers? [ " << linebuf_debug << " ] " << " I : " << I << " J: " << J << std::endl; else logstream(LOG_FATAL)<<"Problem parsing input line number: " << i <<" in file: " << base_filename << ". Can not add edge from " << I << " to J " << J << " since matrix size is: " << M <<"x" <<N<< " [ original line: " << linebuf_debug << " ] . You probaably need to increase matrix size in the matrix market header." 
<< std::endl; } bool active_edge = decide_if_edge_is_active(i, TRAINING); if (active_edge){ //calc stats globalMean += val; sharderobj.preprocessing_add_edge(I, square?J:M+J, als_edge_type(val, &valarray[0], val_array_len)); } } sharderobj.end_preprocessing(); //calc stats assert(L > 0); //assert(globalMean != 0); if (globalMean == 0) logstream(LOG_WARNING)<<"Found global mean of the data to be zero (val_pos). Please verify this is correct." << std::endl; globalMean /= L; logstream(LOG_INFO)<<"Computed global mean is: " << globalMean << std::endl; inputGlobalMean = globalMean; fclose(f); if (fc.hash_strings){ for (int i=0; i< fc.total_features+2; i++){ if (fc.node_id_maps[i].string2nodeid.size() == 0) logstream(LOG_FATAL)<<"Failed sanity check for feature number : " << i << " no values find in data " << std::endl; } } logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
int main(int argc, const char ** argv) { print_copyright(); /* GraphChi initialization will read the command line arguments and the configuration file. */ graphchi_init(argc, argv); /* Metrics object for keeping track of performance counters and other information. Currently required. */ metrics m("itemsim2rating2"); /* Basic arguments for application */ min_allowed_intersection = get_option_int("min_allowed_intersection", min_allowed_intersection); debug = get_option_int("debug", 0); parse_command_line_args(); std::string similarity = get_option_string("similarity", ""); if (similarity == "") logstream(LOG_FATAL)<<"Missing similarity input file. Please specify one using the --similarity=filename command line flag" << std::endl; undirected = get_option_int("undirected", 0); mytimer.start(); int nshards = convert_matrixmarket_and_item_similarity<edge_data>(training, similarity, 3, °rees); assert(M > 0 && N > 0); prob_sim_normalization_constant = (double)L / (double)(M*N-L); //initialize data structure which saves a subset of the items (pivots) in memory adjcontainer = new adjlist_container(); //array for marking which items are conected to the pivot items via users. 
relevant_items = new bool[N]; /* Run */ ItemDistanceProgram program; graphchi_engine<VertexDataType, edge_data> engine(training, nshards, true, m); set_engine_flags(engine); //open output files as the number of operating threads out_files.resize(number_of_omp_threads()); for (uint i=0; i< out_files.size(); i++){ char buf[256]; sprintf(buf, "%s-rec.out%d", training.c_str(), i); out_files[i] = open_file(buf, "w"); } K = get_option_int("K"); assert(K > 0); //run the program engine.run(program, niters); for (uint i=0; i< out_files.size(); i++) fclose(out_files[i]); delete[] relevant_items; /* Report execution metrics */ if (!quiet) metrics_report(m); std::cout<<"Total item pairs compared: " << item_pairs_compared << " total written to file: " << sum(written_pairs) << std::endl; logstream(LOG_INFO)<<"Going to sort and merge output files " << std::endl; std::string dname= dirname(strdup(argv[0])); system(("bash " + dname + "/topk.sh " + std::string(basename(strdup((training+"-rec").c_str())))).c_str()); return 0; }
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c int ret_code; MM_typecode matcode; FILE *f; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * inf = fopen((base_filename + ".gm").c_str(), "r"); int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K); if (rc != 5) logstream(LOG_FATAL)<<"Failed to read global mean from file" << base_filename << ".gm" << std::endl; fclose(inf); if (K <= 0) logstream(LOG_FATAL)<<"Incorrect number of time bins K in .gm file " << base_filename << ".gm" << std::endl; logstream(LOG_INFO) << "Read matrix of size " << M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl; return nshards; } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); if ((f = fopen(base_filename.c_str(), "r")) == NULL) { logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; } if (mm_read_banner(f, &matcode) != 0) logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl; /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ if (mm_is_complex(matcode) || !mm_is_sparse(matcode)) logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl; /* find out size of sparse matrix .... 
*/ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) { logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl; } logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; uint I, J; double val, time; if (!sharderobj.preprocessed_file_exists()) { for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val); if (rc != 4) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; if (time < 0) logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl; I--; /* adjust from 1-based to 0-based */ J--; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; K = std::max((int)time, (int)K); //avoid self edges if (square && I == J) continue; globalMean += val; L++; sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time+M+N)); //in case of a tensor, add besides of the user-> movie edge also //time -> user and time-> movie edges if (add_time_edges){ sharderobj.preprocessing_add_edge((uint)time + M + N, I, als_edge_type(val, M+J)); sharderobj.preprocessing_add_edge((uint)time + M + N, M+J , als_edge_type(val, I)); } } uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges4(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; sharderobj.end_preprocessing(); globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." 
<< std::endl; FILE * outf = fopen((base_filename + ".gm").c_str(), "w"); fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K); fclose(outf); } else { logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl; } fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
int convert_matrixmarket(std::string base_filename, SharderPreprocessor<als_edge_type> * preprocessor = NULL) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c int ret_code; MM_typecode matcode; FILE *f; size_t nz; std::string suffix = ""; if (preprocessor != NULL) { suffix = preprocessor->getSuffix(); } /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename+ suffix, get_option_string("nshards", "auto")))) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; FILE * inf = fopen((base_filename + ".gm").c_str(), "r"); int rc = fscanf(inf,"%d\n%d\n%ld\n%lg\n%d\n",&M, &N, &L, &globalMean, &K); if (rc != 5) logstream(LOG_FATAL)<<"Failed to read global mean from file" << base_filename+ suffix << ".gm" << std::endl; fclose(inf); logstream(LOG_INFO) << "Opened matrix size: " <<M << " x " << N << " Global mean is: " << globalMean << " time bins: " << K << " Now creating shards." << std::endl; return nshards; } sharder<als_edge_type> sharderobj(base_filename + suffix); sharderobj.start_preprocessing(); if ((f = fopen(base_filename.c_str(), "r")) == NULL) { logstream(LOG_FATAL) << "Could not open file: " << base_filename << ", error: " << strerror(errno) << std::endl; } if (mm_read_banner(f, &matcode) != 0) logstream(LOG_FATAL) << "Could not process Matrix Market banner. File: " << base_filename << std::endl; /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ if (mm_is_complex(matcode) || !mm_is_sparse(matcode)) logstream(LOG_FATAL) << "Sorry, this application does not support complex values and requires a sparse matrix." << std::endl; /* find out size of sparse matrix .... 
*/ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0) { logstream(LOG_FATAL) << "Failed reading matrix size: error=" << ret_code << std::endl; } L=nz; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; uint I, J; double val; if (!sharderobj.preprocessed_file_exists()) { for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; I--; /* adjust from 1-based to 0-based */ J--; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; globalMean += val; sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val)); } uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; sharderobj.end_preprocessing(); globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; if (preprocessor != NULL) { preprocessor->reprocess(sharderobj.preprocessed_name(), base_filename); } FILE * outf = fopen((base_filename + ".gm").c_str(), "w"); fprintf(outf, "%d\n%d\n%ld\n%lg\n%d\n", M, N, L, globalMean, K); fclose(outf); } else { logstream(LOG_INFO) << "Matrix already preprocessed, just run sharder." << std::endl; } fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename + suffix << std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." 
<< std::endl; return nshards; }
void run_functional_unweighted_semisynchronous(std::string filename, int niters, metrics &_m) { FunctionalProgramProxySemisync<KERNEL> program; /* Process input file - if not already preprocessed */ int nshards = convert_if_notexists<typename FunctionalProgramProxySemisync<KERNEL>::EdgeDataType>(filename, get_option_string("nshards", "auto")); functional_engine<typename FunctionalProgramProxySemisync<KERNEL>::VertexDataType, typename FunctionalProgramProxySemisync<KERNEL>::EdgeDataType, typename FunctionalProgramProxySemisync<KERNEL>::fvertex_t > engine(filename, nshards, false, _m); engine.set_modifies_inedges(false); // Important engine.set_modifies_outedges(true); // Important engine.run(program, niters); }
void run_functional_unweighted_synchronous(std::string filename, int niters, metrics &_m) { FunctionalProgramProxyBulkSync<KERNEL> program; int nshards = convert_if_notexists<typename FunctionalProgramProxyBulkSync<KERNEL>::EdgeDataType>(filename, get_option_string("nshards", "auto")); functional_engine<typename FunctionalProgramProxyBulkSync<KERNEL>::VertexDataType, typename FunctionalProgramProxyBulkSync<KERNEL>::EdgeDataType, typename FunctionalProgramProxyBulkSync<KERNEL>::fvertex_t > engine(filename, nshards, false, _m); engine.set_modifies_inedges(false); // Important engine.set_modifies_outedges(true); // Important engine.set_enable_deterministic_parallelism(false); // Bulk synchronous does not need consistency. engine.run(program, niters); }
/**
 * Entry point for the "label_propagation" (COEM) program: loads context/noun
 * id maps and seed labels, shards the training graph, initializes per-vertex
 * label-probability vectors, runs the COEM engine, and writes the result.
 */
int main(int argc, const char ** argv) {
  print_copyright();

  /* GraphChi initialization will read the command line
     arguments and the configuration file. */
  graphchi_init(argc, argv);

  /* Metrics object for keeping track of performance counters
     and other information. Currently required. */
  metrics m("label_propagation");

  /* Required input files: string->id maps and positive/negative seed lists. */
  contexts_file = get_option_string("contexts");
  nouns_file = get_option_string("nouns");
  pos_seeds = get_option_string("pos_seeds");
  neg_seeds = get_option_string("neg_seeds");
  parse_command_line_args();

  load_map_from_txt_file(contexts.string2nodeid, contexts_file, 1);
  load_map_from_txt_file(nouns.string2nodeid, nouns_file, 1);

  //load graph (adj matrix) from file
  int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, 3, TRAINING, true);
  init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem);
  load_seeds_from_txt_file(nouns.string2nodeid, pos_seeds, false);
  load_seeds_from_txt_file(nouns.string2nodeid, neg_seeds, true);

  /* Initialize label-probability vectors for the first M vertices. */
#pragma omp parallel for
  for (int i=0; i< (int)M; i++){
    //normalize seed probabilities to sum up to one
    if (latent_factors_inmem[i].seed){
      if (sum(latent_factors_inmem[i].pvec) != 0)
        latent_factors_inmem[i].pvec /= sum(latent_factors_inmem[i].pvec);
      continue;
    }
    //other nodes get random label probabilities
    //NOTE(review): drand48() inside an OpenMP parallel loop is not
    //thread-safe/reproducible on all platforms - confirm this is intended.
    for (int j=0; j< D; j++)
      latent_factors_inmem[i].pvec[j] = drand48();
  }

  /* load initial state from disk (optional) */
  if (load_factors_from_file){
    load_matrix_market_matrix(training + "_U.mm", 0, D);
  }

  /* Run */
  COEMVerticesInMemProgram program;
  graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m);
  set_engine_flags(engine);
  pengine = &engine;
  engine.run(program, niters);

  /* Output latent factor matrices in matrix-market format */
  output_coem_result(training);

  /* Report execution metrics */
  if (!quiet)
    metrics_report(m);
  return 0;
}
/* ---------------------------------------------------------------------- */
/*
 *   spread_row_to_solution -- build one SOLUTION definition from a single
 *   spreadsheet-style data row.
 *
 *   heading  - column headings row (element names / option keywords)
 *   units    - optional per-column units row (may be NULL)
 *   data     - the data row for this solution
 *   defaults - fallback temperature, pH, density, pe, redox, units, water,
 *              and per-isotope uncertainties
 *
 *   Each column is reassembled into the string "<heading> <datum> <units>"
 *   and parsed exactly like a SOLUTION keyword input line. The parsed
 *   solution is stored in the global solution[] array (reusing the slot of
 *   an existing solution with the same user number, otherwise appending).
 *
 *   Returns return_value (UNKNOWN, or EOF/KEYWORD when parsing signalled
 *   end of input), or ERROR on a malformed isotope entry. Increments the
 *   global input_error counter for recoverable input problems.
 */
int
spread_row_to_solution (struct spread_row *heading, struct spread_row *units,
                        struct spread_row *data, struct defaults defaults)
/* ---------------------------------------------------------------------- */
{
  int i, j, n, l, next_keyword_save;
  int n_user, n_user_end;             /* solution number (range) from the "number" column */
  int default_pe, alk;
  int count_isotopes;
  int max_mass_balance, count_mass_balance;
  char *ptr, *ptr1;
  char *description;
  char token[MAX_LENGTH], token1[MAX_LENGTH];
  char string[2 * MAX_LENGTH];        /* scratch: "<heading> <datum> <units>" per column */
  LDBLE dummy;
  int return_value, opt;
  char *next_char;
  const char *opt_list[] = {
    "temp",                     /* 0 */
    "temperature",              /* 1 */
    "dens",                     /* 2 */
    "density",                  /* 3 */
    "units",                    /* 4 */
    "redox",                    /* 5 */
    "ph",                       /* 6 */
    "pe",                       /* 7 */
    "unit",                     /* 8 */
    "isotope",                  /* 9 */
    "water",                    /* 10 */
    "description",              /* 11 */
    "desc",                     /* 12 */
    "descriptor"                /* 13 */
  };
  int count_opt_list = 14;
/*
 *   look for solution number in the "number" column
 */
  n_user = -1;
  n_user_end = -1;
  description = string_duplicate ("");
  for (i = 0; i < heading->count; i++)
  {
    if (strcmp_nocase (heading->char_vector[i], "number") == 0)
    {
      break;
    }
  }
  if (i == heading->count || data->type_vector[i] == EMPTY || data->count <= i)
  {
    /* no usable "number" cell: leave numbering to the caller (-1) */
    n_user = -1;
#ifdef SKIP
    for (i = 0; i < count_solution; i++)
    {
      if (n_user <= solution[i]->n_user)
      {
        n_user = solution[i]->n_user + 1;
      }
    }
#endif
  }
  else if (data->type_vector[i] == STRING)
  {
    input_error++;
    sprintf (error_string,
             "Expected solution number or number range in 'number' column, found: %s.",
             data->char_vector[i]);
    error_msg (error_string, CONTINUE);
  }
  else
  {
    /* parse "solution_s <number>" the same way the SOLUTION keyword is read */
    strcpy (string, "solution_s ");
    strcat (string, data->char_vector[i]);
    ptr = string;
    description = (char *) free_check_null (description);
    next_keyword_save = next_keyword;
    next_keyword = 42;  /* NOTE(review): 42 appears to be the SOLUTION keyword
                           index expected by read_number_description - confirm
                           against the keyword table */
    read_number_description (ptr, &n_user, &n_user_end, &description);
    next_keyword = next_keyword_save;
  }
/*
 *   set up solution: reuse the slot of an existing solution with the same
 *   user number, otherwise append (growing solution[] as needed)
 */
  if (n_user >= 0 && solution_bsearch (n_user, &n, FALSE) != NULL)
  {
    solution_free (solution[n]);
  }
  else
  {
    n = count_solution++;
    if (count_solution >= max_solution)
    {
      space ((void **) ((void *) &(solution)), count_solution, &max_solution,
             sizeof (struct solution *));
    }
  }
  solution[n] = solution_alloc ();
  solution[n]->n_user = n_user;
  solution[n]->n_user_end = n_user_end;
  if (use.solution_in == FALSE)
  {
    use.solution_in = TRUE;
    use.n_solution_user = n_user;
  }
  max_mass_balance = MAX_MASS_BALANCE;
/*
 *   Set default ph, temp, density, pe, units
 */
  solution[n]->description = description;
  solution[n]->tc = defaults.temp;
  solution[n]->ph = defaults.ph;
  solution[n]->density = defaults.density;
  solution[n]->solution_pe = defaults.pe;
  solution[n]->mass_water = defaults.water;
  solution[n]->ah2o = 1.0;
  solution[n]->mu = 1e-7;
  solution[n]->cb = 0.0;
  default_pe = 0;
  solution[n]->units = defaults.units;
  solution[n]->totals[0].description = NULL;
  count_mass_balance = 0;
  count_isotopes = 0;
  default_pe = pe_data_store (&(solution[n]->pe), defaults.redox);
/*
 *   Read concentration data, one column at a time
 */
  return_value = UNKNOWN;
  for (i = 0; i < heading->count; i++)
  {
    /* columns consumed together with the preceding data column */
    if (strcmp_nocase (heading->char_vector[i], "number") == 0)
      continue;
    if (strcmp_nocase (heading->char_vector[i], "uncertainty") == 0)
      continue;
    if (strcmp_nocase (heading->char_vector[i], "uncertainties") == 0)
      continue;
    if (strcmp_nocase (heading->char_vector[i], "isotope_uncertainty") == 0)
      continue;
/*
 *   Copy in element name
 */
    if (heading->type_vector[i] == EMPTY)
      continue;
    strcpy (string, heading->char_vector[i]);
    strcat (string, " ");
/*
 *   Copy in concentration data
 */
    if (i >= data->count || data->type_vector[i] == EMPTY)
      continue;
    strcat (string, data->char_vector[i]);
    strcat (string, " ");
/*
 *   Copy in units (if a units row was given and has an entry for this column)
 */
    if (units != NULL && i < units->count && units->type_vector[i] != EMPTY)
    {
      strcat (string, units->char_vector[i]);
    }
/*
 *   Parse string just like read_solution input
 */
    next_char = string;
    opt = get_option_string (opt_list, count_opt_list, &next_char);
    if (opt == OPTION_DEFAULT && heading->type_vector[i] == NUMBER)
    {
      /* a numeric heading denotes an isotope column */
      opt = 9;
    }
    switch (opt)
    {
    case OPTION_EOF:            /* end of file */
      return_value = EOF;
      break;
    case OPTION_KEYWORD:        /* keyword */
      return_value = KEYWORD;
      break;
    case OPTION_ERROR:
      input_error++;
      error_msg ("Unknown input in SOLUTION keyword.", CONTINUE);
      error_msg (line_save, CONTINUE);
      break;
    case 0:                     /* temperature */
    case 1:
      sscanf (next_char, SCANFORMAT, &(solution[n]->tc));
      break;
    case 2:                     /* density */
    case 3:
      sscanf (next_char, SCANFORMAT, &(solution[n]->density));
      break;
    case 4:                     /* units */
    case 8:                     /* unit */
      if (copy_token (token, &next_char, &l) == EMPTY)
        break;
      if (check_units (token, FALSE, FALSE, solution[n]->units, TRUE) == OK)
      {
        solution[n]->units = string_hsave (token);
      }
      else
      {
        input_error++;
      }
      break;
    case 5:                     /* redox */
      if (copy_token (token, &next_char, &l) == EMPTY)
        break;
      if (parse_couple (token) == OK)
      {
        default_pe = pe_data_store (&(solution[n]->pe), token);
      }
      else
      {
        input_error++;
      }
      break;
    case 6:                     /* ph */
      next_char = string;
      if (read_conc (n, count_mass_balance, next_char) == ERROR)
      {
        input_error++;
        break;
      }
      solution[n]->ph = solution[n]->totals[count_mass_balance].input_conc;
      /* a plain pH (no charge-balance/phase equation) is not kept as a
         mass-balance entry */
      if (solution[n]->totals[count_mass_balance].equation_name == NULL)
      {
        break;
      }
      solution[n]->totals[count_mass_balance].description = string_hsave ("H(1)");
      count_mass_balance++;
      break;
    case 7:                     /* pe */
      next_char = string;
      if (read_conc (n, count_mass_balance, next_char) == ERROR)
      {
        input_error++;
        break;
      }
      solution[n]->solution_pe = solution[n]->totals[count_mass_balance].input_conc;
      if (solution[n]->totals[count_mass_balance].equation_name == NULL)
      {
        break;
      }
      solution[n]->totals[count_mass_balance].description = string_hsave ("E");
      count_mass_balance++;
      break;
    case 9:                     /* isotope */
      next_char = string;
      if (copy_token (token, &next_char, &l) != DIGIT)
      {
        input_error++;
        sprintf (error_string, "Expected isotope name to"
                 " begin with an isotopic number.");
        error_msg (error_string, CONTINUE);
        continue;
      }
      solution[n]->isotopes =
        (struct isotope *) PHRQ_realloc (solution[n]->isotopes,
                                         (size_t) (count_isotopes + 1) *
                                         sizeof (struct isotope));
      if (solution[n]->isotopes == NULL)
        malloc_error ();
      /* read and save element name */
      ptr1 = token;
      get_num (&ptr1, &(solution[n]->isotopes[count_isotopes].isotope_number));
      if (ptr1[0] == '\0' || isupper ((int) ptr1[0]) == FALSE)
      {
        error_msg ("Expecting element name.", CONTINUE);
        error_msg (line_save, CONTINUE);
        input_error++;
        return (ERROR);
      }
      solution[n]->isotopes[count_isotopes].elt_name = string_hsave (ptr1);
      /* read and store isotope ratio */
      if (copy_token (token, &next_char, &l) != DIGIT)
      {
        input_error++;
        sprintf (error_string, "Expected numeric value for isotope ratio.");
        error_msg (error_string, CONTINUE);
        continue;
      }
      sscanf (token, SCANFORMAT,
              &(solution[n]->isotopes[count_isotopes].ratio));
      /* read and store isotope ratio uncertainty */
      /* first choice is next column */
      if ((i + 1) < heading->count
          && (strcmp_nocase (heading->char_vector[i + 1], "uncertainty") == 0
              || strcmp_nocase (heading->char_vector[i + 1],
                                "isotope_uncertainty") == 0
              || strcmp_nocase (heading->char_vector[i + 1],
                                "uncertainties") == 0)
          && (i + 1) < data->count && data->type_vector[i + 1] == NUMBER)
      {
        solution[n]->isotopes[count_isotopes].ratio_uncertainty =
          data->d_vector[i + 1];
      }
      else
      {
        /* second choice: per-isotope default from the defaults table;
           NAN when the isotope has no default either */
        next_char = string;
        copy_token (token, &next_char, &l);
        for (j = 0; j < defaults.count_iso; j++)
        {
          if (strcmp (token, defaults.iso[j].name) == 0)
          {
            solution[n]->isotopes[count_isotopes].ratio_uncertainty =
              defaults.iso[j].uncertainty;
            break;
          }
        }
        if (j == defaults.count_iso)
        {
          solution[n]->isotopes[count_isotopes].ratio_uncertainty = NAN;
        }
      }
      count_isotopes++;
      break;
    case 10:                    /* water */
      j = copy_token (token, &next_char, &l);
      if (j == EMPTY)
      {
        solution[n]->mass_water = 1.0;  /* default: 1 kg of water */
      }
      else if (j != DIGIT)
      {
        input_error++;
        sprintf (error_string,
                 "Expected numeric value for mass of water in solution.");
        error_msg (error_string, CONTINUE);
      }
      else
      {
        sscanf (token, SCANFORMAT, &dummy);
        solution[n]->mass_water = (LDBLE) dummy;
      }
      break;
    case 11:                    /* description */
    case 12:                    /* desc */
    case 13:                    /* descriptor */
      solution[n]->description =
        (char *) free_check_null (solution[n]->description);
      solution[n]->description = string_duplicate (next_char);
      break;
    case OPTION_DEFAULT:
/*
 *   Read concentration
 */
      next_char = string;
      if (copy_token (token, &next_char, &l) == LOWER)
        continue;
      next_char = string;
      if (read_conc (n, count_mass_balance, next_char) == ERROR)
      {
#ifdef SKIP
        input_error++;
        break;
#endif
      }
      count_mass_balance++;
      break;
    }
    /* grow the totals array before it overflows */
    if (count_mass_balance + 1 >= max_mass_balance)
    {
      space ((void **) ((void *) &(solution[n]->totals)),
             count_mass_balance + 1, &max_mass_balance,
             sizeof (struct conc));
    }
    if (return_value == EOF || return_value == KEYWORD)
      break;
  }
/*
 *   Sort totals by description
 */
  qsort (solution[n]->totals, (size_t) count_mass_balance,
         (size_t) sizeof (struct conc), conc_compare);
/*
 *   fix up default units and default pe
 */
  for (i = 0; i < count_mass_balance; i++)
  {
    strcpy (token, solution[n]->totals[i].description);
    str_tolower (token);
    if (solution[n]->totals[i].units == NULL)
    {
      solution[n]->totals[i].units = solution[n]->units;
    }
    else
    {
      /* alkalinity units are validated with the alk flag set */
      alk = FALSE;
      if (strstr (token, "alk") == token)
        alk = TRUE;
      strcpy (token1, solution[n]->totals[i].units);
      if (check_units (token1, alk, TRUE, solution[n]->units, TRUE) == ERROR)
      {
        input_error++;
      }
      else
      {
        solution[n]->totals[i].units = string_hsave (token1);
      }
    }
    if (solution[n]->totals[i].n_pe < 0)
    {
      solution[n]->totals[i].n_pe = default_pe;
    }
  }
  solution[n]->default_pe = default_pe;
/*
 *   Mark end of solution
 */
  solution[n]->totals[count_mass_balance].description = NULL;
  solution[n]->count_isotopes = count_isotopes;
  if (count_isotopes > 0)
  {
    qsort (solution[n]->isotopes, (size_t) count_isotopes,
           (size_t) sizeof (struct isotope), isotope_compare);
  }
  else
  {
    solution[n]->isotopes =
      (struct isotope *) free_check_null (solution[n]->isotopes);
  }
  return (return_value);
}
int main(int argc, const char ** argv) { mytimer.start(); print_copyright(); /* GraphChi initialization will read the command line arguments and the configuration file. */ graphchi_init(argc, argv); /* Metrics object for keeping track of performance counters and other information. Currently required. */ metrics m("rating2"); knn_sample_percent = get_option_float("knn_sample_percent", 1.0); if (knn_sample_percent <= 0 || knn_sample_percent > 1) logstream(LOG_FATAL)<<"Sample percente should be in the range (0, 1] " << std::endl; num_ratings = get_option_int("num_ratings", 10); if (num_ratings <= 0) logstream(LOG_FATAL)<<"num_ratings, the number of recomended items for each user, should be >=1 " << std::endl; debug = get_option_int("debug", 0); tokens_per_row = get_option_int("tokens_per_row", tokens_per_row); std::string algorithm = get_option_string("algorithm"); /* Basic arguments for RBM algorithm */ rbm_bins = get_option_int("rbm_bins", rbm_bins); rbm_scaling = get_option_float("rbm_scaling", rbm_scaling); if (algorithm == "svdpp" || algorithm == "svd++") algo = SVDPP; else if (algorithm == "biassgd") algo = BIASSGD; else if (algorithm == "rbm") algo = RBM; else logstream(LOG_FATAL)<<"--algorithm should be svd++ or biassgd or rbm"<<std::endl; parse_command_line_args(); /* Preprocess data if needed, or discover preprocess files */ int nshards = 0; if (tokens_per_row == 3) nshards = convert_matrixmarket<edge_data>(training, 0, 0, 3, TRAINING, false); else if (tokens_per_row == 4) nshards = convert_matrixmarket4<edge_data4>(training); else logstream(LOG_FATAL)<<"--tokens_per_row should be either 3 or 4" << std::endl; assert(M > 0 && N > 0); latent_factors_inmem.resize(M+N); // Initialize in-memory vertices. 
//initialize data structure to hold the matrix read from file if (algo == RBM){ #pragma omp parallel for for (uint i=0; i< M+N; i++){ if (i < M){ latent_factors_inmem[i].pvec = zeros(D*3); } else { latent_factors_inmem[i].pvec = zeros(rbm_bins + rbm_bins * D); } } } read_factors(training); if ((uint)num_ratings > N){ logstream(LOG_WARNING)<<"num_ratings is too big - setting it to: " << N << std::endl; num_ratings = N; } srand(time(NULL)); /* Run */ if (tokens_per_row == 3){ RatingVerticesInMemProgram<VertexDataType, EdgeDataType> program; graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m); set_engine_flags(engine); engine.run(program, 1); } else if (tokens_per_row == 4){ RatingVerticesInMemProgram<VertexDataType, edge_data4> program; graphchi_engine<VertexDataType, edge_data4> engine(training, nshards, false, m); set_engine_flags(engine); engine.run(program, 1); } /* Output latent factor matrices in matrix-market format */ output_knn_result(training); rating_stats(); if (users_without_ratings > 0) logstream(LOG_WARNING)<<"Found " << users_without_ratings << " without ratings. For those users no items are recommended (item id 0)" << std::endl; if (users_no_ratings > 0) logstream(LOG_WARNING)<<"Failed to compute ratings for " << users_no_ratings << " Users. For those users no items are recommended (item id 0)" << std::endl; /* Report execution metrics */ if (!quiet) metrics_report(m); return 0; }