/**
 * Hook invoked once an iteration has completed.
 *
 * Logs the timer value, the iteration number and the current `changes`
 * counter. When no changes were recorded, the iteration is reported to the
 * context through set_last_iteration(). Finally the change counter is reset
 * and the `iter` counter is advanced.
 */
void after_iteration(int iteration, graphchi_context &ginfo) {
    logstream(LOG_DEBUG) << mytimer.current_time()
                         << "iteration: " << iteration
                         << " changes: " << changes
                         << std::endl;

    const bool nothing_changed = (changes == 0);
    if (nothing_changed) {
        ginfo.set_last_iteration(iteration);
    }

    // Start the next iteration with a clean change counter.
    changes = 0;
    iter++;
}
void training_rmse(int iteration, graphchi_context &gcontext){ last_training_rmse = dtraining_rmse; dtraining_rmse = 0; #pragma omp parallel for reduction(+:dtraining_rmse) for (int i=0; i< (int)M; i++){ dtraining_rmse += latent_factors_inmem[i].rmse; } dtraining_rmse = sqrt(dtraining_rmse / pengine->num_edges()); std::cout<< std::setw(10) << mytimer.current_time() << ") Iteration: " << std::setw(3) <<iteration<<" Training RMSE: " << std::setw(10)<< dtraining_rmse; }
/** * Vertex update function - computes the least square step */ void update(graphchi_vertex<VertexDataType, EdgeDataType> &vertex, graphchi_context &gcontext) { if (vertex.id() >= M) return; vertex_data & vdata = latent_factors_inmem[vertex.id()]; int howmany = N*knn_sample_percent; assert(howmany > 0 ); vec distances = vec::Zero(howmany); ivec indices = ivec(howmany); for (int i=0; i< howmany; i++){ indices[i]= -2; } std::vector<bool> curratings; curratings.resize(N); for(int e=0; e < vertex.num_edges(); e++) { //no need to calculate this rating since it is given in the training data reference curratings[vertex.edge(e)->vertex_id() - M] = true; } if (knn_sample_percent == 1.0){ for (uint i=M; i< M+N; i++){ if (curratings[i-M]) continue; vertex_data & other = latent_factors_inmem[i]; double dist; als_predict(vdata, other, 0, dist); indices[i-M] = i-M; distances[i-M] = dist; } } else for (int i=0; i<howmany; i++){ int random_other = ::randi(M, M+N-1); vertex_data & other = latent_factors_inmem[random_other]; double dist; als_predict(vdata, other, 0, dist); indices[i-M] = i-M; distances[i-M] = dist; } vec out_dist(num_ratings); ivec indices_sorted = reverse_sort_index2(distances, indices, out_dist, num_ratings); assert(indices_sorted.size() <= num_ratings); assert(out_dist.size() <= num_ratings); vdata.ids = indices_sorted; vdata.ratings = out_dist; if (debug) printf("Closest is: %d with distance %g\n", (int)vdata.ids[0], vdata.ratings[0]); if (vertex.id() % 1000 == 0) printf("Computing recommendaitons for user %d at time: %g\n", vertex.id()+1, mytimer.current_time()); }
int main(int argc, const char *argv[]) { logstream(LOG_WARNING)<<"CE_Graph parsers library is written by Danny Bickson (c). Send any " " comments or bug reports to [email protected] " << std::endl; global_logger().set_log_level(LOG_INFO); global_logger().set_log_to_console(true); CE_Graph_init(argc, argv); debug = get_option_int("debug", 0); dir = get_option_string("file_list"); lines = get_option_int("lines", 0); omp_set_num_threads(get_option_int("ncpus", 1)); mytime.start(); FILE * f = fopen(dir.c_str(), "r"); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open file list!"<<std::endl; while(true){ char buf[256]; int rc = fscanf(f, "%s\n", buf); if (rc < 1) break; in_files.push_back(buf); } if (in_files.size() == 0) logstream(LOG_FATAL)<<"Failed to read any file names from the list file: " << dir << std::endl; #pragma omp parallel for for (uint i=0; i< in_files.size(); i++) parse(i); std::cout << "Finished in " << mytime.current_time() << std::endl << "\t direct tweets found: " << links_found << " \t global tweets: " << wide_tweets << "\t http links: " << http_links << "\t retweets: " << retweet_found << "\t total lines in input file : " << total_lines << " \t invalid records (missing names) " << missing_names << std::endl; save_map_to_text_file(string2nodeid, outdir + "map.text"); save_map_to_text_file(nodeid2hash, outdir + "reverse.map.text"); save_map_to_text_file(tweets_per_user, outdir + "tweets_per_user.text"); out_file fout("mm.info"); fprintf(fout.outf, "%%%%MatrixMarket matrix coordinate real general\n"); fprintf(fout.outf, "%u %u %lu\n", maxfrom+1, maxto+1, links_found); return 0; }
int main(int argc, const char *argv[]) { logstream(LOG_WARNING)<<"GraphChi parsers library is written by Danny Bickson (c). Send any " " comments or bug reports to [email protected] " << std::endl; global_logger().set_log_level(LOG_INFO); global_logger().set_log_to_console(true); graphchi_init(argc, argv); debug = get_option_int("debug", 0); dir = get_option_string("file_list"); lines = get_option_int("lines", 0); omp_set_num_threads(get_option_int("ncpus", 1)); from_val = get_option_int("from_val", from_val); to_val = get_option_int("to_val", to_val); mid_val = get_option_int("mid_val", mid_val); if (from_val == -1) logstream(LOG_FATAL)<<"Must set from/to " << std::endl; mytime.start(); FILE * f = fopen(dir.c_str(), "r"); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open file list!"<<std::endl; while(true){ char buf[256]; int rc = fscanf(f, "%s\n", buf); if (rc < 1) break; in_files.push_back(buf); } if (in_files.size() == 0) logstream(LOG_FATAL)<<"Failed to read any file frommap from the list file: " << dir << std::endl; #pragma omp parallel for for (int i=0; i< (int)in_files.size(); i++) parse(i); std::cout << "Finished in " << mytime.current_time() << std::endl; save_map_to_text_file(frommap.string2nodeid, outdir + dir + "map.text"); return 0; }
/** * Vertex update function. */ void update(CE_Graph_vertex<VertexDataType, EdgeDataType> &v, CE_Graph_context &gcontext) { if (debug) printf("Entered iteration %d with %d - edges %d\n", gcontext.iteration, v.id(), v.num_edges()); /* even iteration numbers: * 1) load a subset of items into memory (pivots) * 2) Find which subset of items needs to compared to the users */ if (gcontext.iteration % 2 == 0) { if (adjcontainer->is_pivot(v.id())){ adjcontainer->load_edges_into_memory(v); if (debug) printf("Loading pivot %d intro memory\n", v.id()); } } else { for (vid_t i=adjcontainer->pivot_st; i< adjcontainer->pivot_en; i++){ //since metric is symmetric, compare only to pivots which are smaller than this item id if (i >= v.id()) continue; dense_adj &pivot_edges = adjcontainer->adjs[i - adjcontainer->pivot_st]; //pivot is not connected to this item, continue if (get_val(pivot_edges.edges, v.id()) == 0) continue; double dist = adjcontainer->calc_distance(v, i, distance_metric); item_pairs_compared++; if (item_pairs_compared % 1000000 == 0) logstream(LOG_INFO)<< std::setw(10) << mytimer.current_time() << ") " << std::setw(10) << item_pairs_compared << " pairs compared " << std::endl; if (debug) printf("comparing %d to pivot %d distance is %lg\n", i+ 1, v.id() + 1, dist); if (dist != 0){ fprintf(out_files[omp_get_thread_num()], "%u %u %.12lg\n", v.id()+1, i+1, (double)dist);//write item similarity to file //where the output format is: //[item A] [ item B ] [ distance ] written_pairs++; } } }//end of iteration % 2 == 1 }//end of update function
int main(int argc, const char *argv[]) { Rcpp::Rcout<<"GraphChi parsers library is written by Danny Bickson (c). Send any " " comments or bug reports to [email protected] " << std::endl; global_logger().set_log_level(LOG_INFO); global_logger().set_log_to_console(true); graphchi_init(argc, argv); debug = get_option_int("debug", 0); dir = get_option_string("file_list"); lines = get_option_int("lines", 0); omp_set_num_threads(get_option_int("ncpus", 1)); mytime.start(); FILE * f = fopen(dir.c_str(), "r"); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open file list!"<<std::endl; while(true){ char buf[256]; int rc = fscanf(f, "%s\n", buf); if (rc < 1) break; in_files.push_back(buf); } if (in_files.size() == 0) logstream(LOG_FATAL)<<"Failed to read any file names from the list file: " << dir << std::endl; //#pragma omp parallel for for (uint i=0; i< in_files.size(); i++) parse(i); std::cout << "Finished in " << mytime.current_time() << std::endl << "\t total lines in input file : " << total_lines << "\t max from: " << maxfrom << "\t max to: " <<maxto << std::endl; return 0; }
/** * Vertex update function. */ void update(graphchi_vertex<VertexDataType, edge_data> &v, graphchi_context &gcontext) { if (debug) printf("Entered iteration %d with %d\n", gcontext.iteration, is_item(v.id()) ? (v.id() - M + 1): v.id()); /* Even iteration numbers: * 1) load a subset of users into memory (pivots) * 2) Find which subset of items is connected to the users */ if (gcontext.iteration % 2 == 0) { if (adjcontainer->is_pivot(v.id()) && is_user(v.id())) { adjcontainer->load_edges_into_memory(v); if (debug) printf("Loading pivot %d intro memory\n", v.id()); } } /* odd iteration number: * 1) For any item connected to a pivot item * compute itersection */ else { assert(is_item(v.id())); for (int i=0; i< v.num_edges(); i++) { if (!adjcontainer->is_pivot(v.edge(i)->vertex_id())) continue; if (debug) printf("comparing user pivot %d to item %d\n", v.edge(i)->vertex_id()+1 , v.id() - M + 1); adjcontainer->compute_ratings(v, v.edge(i)->vertex_id(), v.edge(i)->get_data().up_weight); item_pairs_compared++; if (item_pairs_compared % 1000000 == 0) Rcpp::Rcout<< std::setw(10) << mytimer.current_time() << ") " << std::setw(10) << item_pairs_compared << " pairs compared " << std::endl; } }//end of iteration % 2 == 1 }//end of update function
void parse(int i){ in_file fin(in_files[i]); out_file fout((outdir + ".out")); size_t linesize = 0; char * saveptr = NULL, * linebuf = NULL; size_t line = 1; uint from,to; bool matrix_market = false; while(true){ int rc = getline(&linebuf, &linesize, fin.outf); if (rc < 1) break; if (strlen(linebuf) <= 1){ //skip empty lines line++; continue; } if (has_header_titles && line == 1){ line++; continue; } //skipping over matrix market header (if any) if (!strncmp(linebuf, "%%MatrixMarket", 14)){ matrix_market = true; continue; } if (matrix_market && linebuf[0] == '%'){ continue; } if (matrix_market && linebuf[0] != '%'){ matrix_market = false; continue; } //read [FROM] char *pch = strtok_r(linebuf,string_to_tokenize, &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf << "]" << std::endl; return; } assign_id(string2nodeid, from, pch, true); //read [TO] pch = strtok_r(NULL,string_to_tokenize, &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf << "]" << std::endl; return; } assign_id(single_domain ? string2nodeid:string2nodeid2, to, pch, single_domain ? true : false); //read the rest of the line if (!binary){ if (ignore_rest_of_line) pch = strtok_r(NULL, string_to_tokenize, &saveptr); else pch = strtok_r(NULL, "\n", &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf << "]" << std::endl; return; } } if (tsv) fprintf(fout.outf, "%u\t%u\t%s\n", from, to, binary? "": pch); else if (csv) fprintf(fout.outf, "%u %u %s\n", from, to, binary? "" : pch); else fprintf(fout.outf, "%u %u %s\n", from, to, binary? 
"" : pch); nnz++; line++; total_lines++; if (lines && line>=lines) break; if (debug && (line % 50000 == 0)) logstream(LOG_INFO) << mytimer.current_time() << ") Parsed line: " << line << " map size is: " << string2nodeid.size() << std::endl; if (string2nodeid.size() % 500000 == 0) logstream(LOG_INFO) << mytimer.current_time() << ") Hash map size: " << string2nodeid.size() << " at time: " << mytime.current_time() << " edges: " << total_lines << std::endl; } logstream(LOG_INFO) <<"Finished parsing total of " << line << " lines in file " << in_files[i] << endl << "total map size: " << string2nodeid.size() << endl; }
/// \cond GRAPHLAB_INTERNAL inline double get_current_time() const { return ti.current_time(); }
int main(int argc, const char *argv[]) { logstream(LOG_WARNING)<<"GraphChi parsers library is written by Danny Bickson (c). Send any " " comments or bug reports to [email protected] " << std::endl; global_logger().set_log_level(LOG_INFO); global_logger().set_log_to_console(true); graphchi_init(argc, argv); mytimer.start(); outdir = get_option_string("output",""); debug = get_option_int("debug", 0); dir = get_option_string("file_list",""); filename = get_option_string("training",""); lines = get_option_int("lines", 0); omp_set_num_threads(get_option_int("ncpus", 1)); tsv = get_option_int("tsv", 0); //is this tab seperated file? csv = get_option_int("csv", 0); // is the comma seperated file? binary = get_option_int("binary", 0); single_domain = get_option_int("single_domain", 0); has_header_titles = get_option_int("has_header_titles", has_header_titles); ignore_rest_of_line = get_option_int("ignore_rest_of_line", ignore_rest_of_line); mytime.start(); string_to_tokenize = spaces; if (tsv) string_to_tokenize = tsv_spaces; else if (csv) string_to_tokenize = csv_spaces; if (dir != ""){ FILE * f = fopen(dir.c_str(), "r"); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open file list!"<<std::endl; while(true){ char buf[256]; int rc = fscanf(f, "%s\n", buf); if (rc < 1) break; in_files.push_back(buf); } } else if (filename != "") in_files.push_back(filename); if (in_files.size() == 0) logstream(LOG_FATAL)<<"Failed to read any file names from the list file: " << dir << std::endl; #pragma omp parallel for for (uint i=0; i< in_files.size(); i++) parse(i); std::cout << "Finished in " << mytime.current_time() << std::endl; M = string2nodeid.size(); if (single_domain) N = M; else N = string2nodeid2.size(); save_map_to_text_file(string2nodeid, outdir + ".user.map"); if (!single_domain){ save_map_to_text_file(string2nodeid2, outdir + ".item.map"); } std::string filename = "matrix_market.info"; if (in_files.size() == 1) filename = in_files[0] + ".out:info"; 
logstream(LOG_INFO)<<"Writing matrix market header into file: " << filename << std::endl; out_file fout(filename.c_str()); MM_typecode out_typecode; mm_clear_typecode(&out_typecode); mm_set_integer(&out_typecode); mm_set_sparse(&out_typecode); mm_set_matrix(&out_typecode); mm_write_banner(fout.outf, out_typecode); mm_write_mtx_crd_size(fout.outf, M, N, nnz); return 0; }
void parse(int i){ in_file fin(in_files[i]); out_file fout((outdir + in_files[i] + ".out")); size_t linesize = 0; char * saveptr = NULL, * linebuf = NULL, buf1[256], linebuf_debug[1024]; size_t line = 1; uint id; long int ptime; bool ok; bool first = true; while(true){ int rc = getline(&linebuf, &linesize, fin.outf); strncpy(linebuf_debug, linebuf, 1024); total_lines++; if (rc < 1) break; if (strlen(linebuf) <= 1) //skip empty lines continue; if (first){ first = false; continue; } //skip first line char *pch = strtok_r(linebuf," \r\n\t:/-", &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf << "]" << std::endl; return; } switch(*pch){ case 'T': ok = convert_string_to_time(linebuf_debug, total_lines, i, saveptr, ptime); if (!ok) return; break; case 'U': ok = extract_user_name(linebuf_debug, total_lines, i, saveptr, buf1); if (ok) assign_id(id, buf1, line, in_files[i]); tweets_per_user[id]++; break; case 'W': ok = parse_links(linebuf_debug, total_lines, i, saveptr, id, ptime, fout.outf); if (debug && line < 20) printf("Found user: %s id %u time %ld\n", buf1, id, ptime); if (!ok) wide_tweets++; break; default: logstream(LOG_ERROR)<<"Error: expecting with T U or W first character" << std::endl; return; } line++; if (lines && line>=lines) break; if (debug && (line % 50000 == 0)) logstream(LOG_INFO) << "Parsed line: " << line << " map size is: " << string2nodeid.size() << std::endl; if (string2nodeid.size() % 500000 == 0) logstream(LOG_INFO) << "Hash map size: " << string2nodeid.size() << " at time: " << mytime.current_time() << " edges: " << total_lines << std::endl; } logstream(LOG_INFO) <<"Finished parsing total of " << line << " lines in file " << in_files[i] << endl << "total map size: " << string2nodeid.size() << endl; }
/** * Vertex update function - computes the least square step */ void update(graphchi_vertex<VertexDataType, EdgeDataType> &vertex, graphchi_context &gcontext) { //compute only for user nodes if (vertex.id() >= std::min(M,(uint)end_user) || vertex.id() < (uint)start_user) return; vertex_data & vdata = latent_factors_inmem[vertex.id()]; int howmany = (int)(N*knn_sample_percent); assert(howmany > 0 ); if (vertex.num_outedges() == 0){ mymutex.lock(); users_without_ratings++; mymutex.unlock(); } vec distances = zeros(howmany); ivec indices = ivec::Zero(howmany); for (int i=0; i< howmany; i++){ indices[i]= -1; } std::vector<bool> curratings; curratings.resize(N); for(int e=0; e < vertex.num_edges(); e++) { //no need to calculate this rating since it is given in the training data reference assert(vertex.edge(e)->vertex_id() - M >= 0 && vertex.edge(e)->vertex_id() - M < N); curratings[vertex.edge(e)->vertex_id() - M] = true; } if (knn_sample_percent == 1.0){ for (uint i=M; i< M+N; i++){ if (curratings[i-M]) continue; vertex_data & other = latent_factors_inmem[i]; double dist; if (algo == SVDPP) svdpp_predict(vdata, other, 0, dist); else if (algo == BIASSGD) biassgd_predict(vdata, other, 0, dist); else if (algo == RBM) rbm_predict(vdata, other, 0, dist); else assert(false); indices[i-M] = i-M; distances[i-M] = dist + 1e-10; } } else for (int i=0; i<howmany; i++){ int random_other = ::randi(M, M+N-1); vertex_data & other = latent_factors_inmem[random_other]; double dist; if (algo == SVDPP) svdpp_predict(vdata, other, 0, dist); else if (algo == BIASSGD) biassgd_predict(vdata, other, 0, dist); else if (algo == RBM) rbm_predict(vdata, other, 0, dist); else assert(false); indices[i] = random_other-M; distances[i] = dist; } vec out_dist(num_ratings); ivec indices_sorted = reverse_sort_index2(distances, indices, out_dist, num_ratings); assert(indices_sorted.size() <= num_ratings); assert(out_dist.size() <= num_ratings); vdata.ids = indices_sorted; vdata.ratings = out_dist; if 
(debug) printf("Closest is: %d with distance %g\n", (int)vdata.ids[0], vdata.ratings[0]); if (vertex.id() % 1000 == 0) printf("Computing recommendations for user %d at time: %g\n", vertex.id()+1, mytimer.current_time()); }
/**
 * Restarted Lanczos iteration that computes singular values of the matrix
 * described by `info`.
 *
 * The outer loop runs while fewer than nsv values have converged and fewer
 * than max_iter restarts were done. Each pass builds U and V vectors by
 * alternating multiplications with A and its transpose, takes the SVD of the
 * small matrix T assembled from alpha (diagonal) and beta (super-diagonal),
 * estimates the error of every candidate value and keeps those below tol.
 *
 * @param info    descriptor of the input graph / matrix
 * @param mytimer timer used for the progress messages
 * @param errest  output: error estimates, resized to nv entries
 * @param vecfile when empty, v_0 is initialised with random values
 * @return sigma, the vector holding the computed singular values
 *
 * Reads the file-level settings nv, nsv, max_iter, tol, data_size,
 * save_vectors and training, and sets `finished`.
 */
vec lanczos( bipartite_graph_descriptor & info, timer & mytimer, vec & errest, const std::string & vecfile){

    int nconv = 0; // number of converged values so far
    int its = 1;   // restart counter
    DistMat A(info);
    DistSlicedMat U(info.is_square() ? data_size : 0, info.is_square() ? 2*data_size : data_size, true, info, "U");
    DistSlicedMat V(0, data_size, false, info, "V");
    vec alpha, beta, b;
    vec sigma = zeros(data_size);
    errest = zeros(nv);
    DistVec v_0(info, 0, false, "v_0");
    // Random start vector unless a vector file name was given.
    if (vecfile.size() == 0)
        v_0 = randu(size(A,2));
    PRINT_VEC2("svd->V", v_0);

    // Normalise the start vector.
    DistDouble vnorm = norm(v_0);
    v_0=v_0/vnorm;
    PRINT_INT(nv);

    while(nconv < nsv && its < max_iter){
        std::cout<<"Starting iteration: " << its << " at time: " << mytimer.current_time() << std::endl;
        int k = nconv;
        int n = nv;
        PRINT_INT(k);
        PRINT_INT(n);

        alpha = zeros(n);
        beta = zeros(n);

        U[k] = V[k]*A._transpose();
        orthogonalize_vs_all(U, k, alpha(0));
        //alpha(0)=norm(U[k]).toDouble();
        PRINT_VEC3("alpha", alpha, 0);
        //U[k] = U[k]/alpha(0);

        // Alternate between V and U, filling beta and alpha along the way.
        for (int i=k+1; i<n; i++){
            std::cout <<"Starting step: " << i << " at time: " << mytimer.current_time() << std::endl;
            PRINT_INT(i);

            V[i]=U[i-1]*A;
            orthogonalize_vs_all(V, i, beta(i-k-1));
            //beta(i-k-1)=norm(V[i]).toDouble();
            //V[i] = V[i]/beta(i-k-1);
            PRINT_VEC3("beta", beta, i-k-1);

            U[i] = V[i]*A._transpose();
            orthogonalize_vs_all(U, i, alpha(i-k));
            //alpha(i-k)=norm(U[i]).toDouble();
            //U[i] = U[i]/alpha(i-k);
            PRINT_VEC3("alpha", alpha, i-k);
        }

        V[n]= U[n-1]*A;
        orthogonalize_vs_all(V, n, beta(n-k-1));
        //beta(n-k-1)=norm(V[n]).toDouble();
        PRINT_VEC3("beta", beta, n-k-1);

        //compute svd of bidiagonal matrix
        PRINT_INT(nv);
        PRINT_NAMED_INT("svd->nconv", nconv);
        n = nv - nconv;
        PRINT_INT(n);
        alpha.conservativeResize(n);
        beta.conservativeResize(n);

        PRINT_MAT2("Q",eye(n));
        PRINT_MAT2("PT",eye(n));
        PRINT_VEC2("alpha",alpha);
        PRINT_VEC2("beta",beta);

        // T has alpha on the diagonal and beta on the first super-diagonal.
        mat T=diag(alpha);
        for (int i=0; i<n-1; i++)
            set_val(T, i, i+1, beta(i));
        PRINT_MAT2("T", T);
        mat a,PT;
        svd(T, a, PT, b);
        PRINT_MAT2("Q", a);
        alpha=b.transpose();
        PRINT_MAT2("alpha", alpha);
        // Only the last beta entry is kept; it is used in the error estimate below.
        for (int t=0; t< n-1; t++)
            beta(t) = 0;
        PRINT_VEC2("beta",beta);
        PRINT_MAT2("PT", PT.transpose());

        //estimate the error
        int kk = 0; // number of values that converged in this pass
        for (int i=nconv; i < nv; i++){
            int j = i-nconv;
            PRINT_INT(j);
            sigma(i) = alpha(j);
            PRINT_NAMED_DBL("svd->sigma[i]", sigma(i));
            PRINT_NAMED_DBL("Q[j*n+n-1]",a(n-1,j));
            PRINT_NAMED_DBL("beta[n-1]",beta(n-1));
            errest(i) = abs(a(n-1,j)*beta(n-1));
            PRINT_NAMED_DBL("svd->errest[i]", errest(i));
            // Make the estimate relative when the value is larger than tol.
            if (alpha(j) > tol){
                errest(i) = errest(i) / alpha(j);
                PRINT_NAMED_DBL("svd->errest[i]", errest(i));
            }
            if (errest(i) < tol){
                kk = kk+1;
                PRINT_NAMED_INT("k",kk);
            }

            if (nconv +kk >= nsv){
                printf("set status to tol\n");
                finished = true;
            }
        }//end for
        PRINT_NAMED_INT("k",kk);

        // Build the restart vector v from column kk of PT and the current V vectors.
        vec v;
        if (!finished){
            vec swork=get_col(PT,kk);
            PRINT_MAT2("swork", swork);
            v = zeros(size(A,1));
            for (int ttt=nconv; ttt < nconv+n; ttt++){
                v = v+swork(ttt-nconv)*(V[ttt].to_vec());
            }
            PRINT_VEC2("svd->V",V[nconv]);
            PRINT_VEC2("v[0]",v);
        }

        //compute the ritz eigenvectors of the converged singular triplets
        if (kk > 0){
            PRINT_VEC2("svd->V", V[nconv]);
            mat tmp= V.get_cols(nconv,nconv+n)*PT;
            V.set_cols(nconv, nconv+kk, get_cols(tmp, 0, kk));
            PRINT_VEC2("svd->V", V[nconv]);
            PRINT_VEC2("svd->U", U[nconv]);
            tmp= U.get_cols(nconv, nconv+n)*a;
            U.set_cols(nconv, nconv+kk,get_cols(tmp,0,kk));
            PRINT_VEC2("svd->U", U[nconv]);
        }

        nconv=nconv+kk;
        if (finished)
            break;

        // Restart from v in the first non-converged slot.
        V[nconv]=v;
        PRINT_VEC2("svd->V", V[nconv]);
        PRINT_NAMED_INT("svd->nconv", nconv);

        its++;
        PRINT_NAMED_INT("svd->its", its);
        PRINT_NAMED_INT("svd->nconv", nconv);
        //nv = min(nconv+mpd, N);
        //if (nsv < 10)
        //  nv = 10;
        PRINT_NAMED_INT("nv",nv);

    } // end(while)

    printf(" Number of computed signular values %d",nconv);
    printf("\n");

    // Report each computed value together with an error computed from the
    // two residual norms n1 and n2.
    DistVec normret(info, nconv, false, "normret");
    DistVec normret_tranpose(info, nconv, true, "normret_tranpose");
    for (int i=0; i < nconv; i++){
        normret = V[i]*A._transpose() -U[i]*sigma(i);
        double n1 = norm(normret).toDouble();
        PRINT_DBL(n1);
        normret_tranpose = U[i]*A -V[i]*sigma(i);
        double n2 = norm(normret_tranpose).toDouble();
        PRINT_DBL(n2);
        double err=sqrt(n1*n1+n2*n2);
        PRINT_DBL(err);
        PRINT_DBL(tol);
        // Relative error when the value is larger than tol.
        if (sigma(i)>tol){
            err = err/sigma(i);
        }
        PRINT_DBL(err);
        PRINT_DBL(sigma(i));
        printf("Singular value %d \t%13.6g\tError estimate: %13.6g\n", i, sigma(i),err);
    }

    if (save_vectors){
        std::cout<<"Going to save output vectors U and V" << std::endl;
        if (nconv == 0)
            logstream(LOG_FATAL)<<"No converged vectors. Aborting the save operation" << std::endl;
        // One output file per vector: "<training>.U.<i>" and "<training>.V.<i>".
        char output_filename[256];
        for (int i=0; i< nconv; i++){
            sprintf(output_filename, "%s.U.%d", training.c_str(), i);
            write_output_vector(output_filename, U[i].to_vec(), false, "GraphLab v2 SVD output. This file contains eigenvector number i of the matrix U");
            sprintf(output_filename, "%s.V.%d", training.c_str(), i);
            write_output_vector(output_filename, V[i].to_vec(), false, "GraphLab v2 SVD output. This file contains eigenvector number i of the matrix V'");
        }
    }
    return sigma;
}
void parse(int i){ in_file fin(in_files[i]); out_file fout((outdir + in_files[i] + ".out")); size_t linesize = 0; char * saveptr = NULL, * linebuf = NULL; size_t line = 1; uint id; while(true){ std::map<uint,uint> wordcount; int rc = getline(&linebuf, &linesize, fin.outf); if (rc < 1) break; if (strlen(linebuf) <= 1) //skip empty lines continue; char *pch = strtok_r(linebuf, spaces, &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf << "]" << std::endl; return; } assign_id(frommap, id, pch); wordcount[id]+= 1; while(pch != NULL){ pch = strtok_r(NULL, spaces ,&saveptr); if (pch != NULL && strlen(pch) > 1){ assign_id(frommap, id, pch); wordcount[id]+= 1; } } total_lines++; std::map<uint,uint>::const_iterator it; for (it = wordcount.begin(); it != wordcount.end(); it++){ if ((int)it->second >= min_threshold && (int)it->second <= max_threshold) fprintf(fout.outf, "%lu %u %u\n", line, it->first, it->second); } line++; if (lines && line>=lines) break; if (debug && (line % 50000 == 0)) logstream(LOG_INFO) << "Parsed line: " << line << " map size is: " << frommap.string2nodeid.size() << std::endl; if (frommap.string2nodeid.size() % 500000 == 0) logstream(LOG_INFO) << "Hash map size: " << frommap.string2nodeid.size() << " at time: " << mytime.current_time() << " edges: " << total_lines << std::endl; } logstream(LOG_INFO) <<"Finished parsing total of " << line << " lines in file " << in_files[i] << endl << "total map size: " << frommap.string2nodeid.size() << endl; }
int main(int argc, const char *argv[]) { Rcpp::Rcout<<"GraphChi parsers library is written by Danny Bickson (c). Send any " " comments or bug reports to [email protected] " << std::endl; global_logger().set_log_level(LOG_INFO); global_logger().set_log_to_console(true); graphchi_init(argc, argv); debug = get_option_int("debug", 0); dir = get_option_string("file_list"); lines = get_option_int("lines", 0); omp_set_num_threads(get_option_int("ncpus", 1)); from_val = get_option_int("from_val", from_val); to_val = get_option_int("to_val", to_val); if (from_val == -1) logstream(LOG_FATAL)<<"Must set from/to " << std::endl; mytime.start(); FILE * f = fopen(dir.c_str(), "r"); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open file list!"<<std::endl; while(true){ char buf[256]; int rc = fscanf(f, "%s\n", buf); if (rc < 1) break; in_files.push_back(buf); } if (in_files.size() == 0) logstream(LOG_FATAL)<<"Failed to read any file frommap from the list file: " << dir << std::endl; #pragma omp parallel for for (int i=0; i< (int)in_files.size(); i++) parse(i); std::cout << "Finished in " << mytime.current_time() << std::endl; int total_x =0 , total_y = 0; std::map<std::string, int>::iterator it; double h = 0; for (it = p_x.begin(); it != p_x.end(); it++){ total_x+= it->second; h-= (it->second / (double)n)*log2(it->second / (double)n); } for (it = p_y.begin(); it != p_y.end(); it++) total_y+= it->second; assert(total_x == n); assert(total_y == n); double mi = 0; std::map<std::string, uint>::iterator iter; assert(n != 0); int total_p_xy = 0; for (iter = frommap.string2nodeid.begin() ; iter != frommap.string2nodeid.end(); iter++){ double p_xy = iter->second / (double)n; assert(p_xy > 0); char buf[256]; strncpy(buf, iter->first.c_str(), 256); char * first = strtok(buf, "_"); char * second = strtok(NULL, "\n\r "); assert(first && second); double px = p_x[first] / (double)n; double py = p_y[second] / (double)n; assert(px > 0 && py > 0); mi += p_xy * log2(p_xy / (px * py)); total_p_xy 
+= iter->second; } assert(total_p_xy == n); logstream(LOG_INFO)<<"Total examples: " <<n << std::endl; logstream(LOG_INFO)<<"Unique p(x) " << p_x.size() << std::endl; logstream(LOG_INFO)<<"Unique p(y) " << p_y.size() << std::endl; logstream(LOG_INFO)<<"Average F(x) " << total_x / (double)p_x.size() << std::endl; logstream(LOG_INFO)<<"Average F(y) " << total_y / (double)p_y.size() << std::endl; std::cout<<"Mutual information of " << from_val << " [" << header_titles[from_val-1] << "] <-> " << to_val << " [" << header_titles[to_val-1] << "] is: " ; if (mi/h > 1e-3) std::cout<<std::setprecision(3) << mi << std::endl; else std::cout<<"-"<<std::endl; save_map_to_text_file(frommap.string2nodeid, outdir + dir + "map.text"); logstream(LOG_INFO)<<"Saving map file " << outdir << dir << "map.text" << std::endl; return 0; }
int main(int argc, const char ** argv) { /* GraphChi initialization will read the command line arguments and the configuration file. */ graphchi_init(argc, argv); /* Metrics object for keeping track of performance counters and other information. Currently required. */ metrics m("connected-components-inmem"); /* Basic arguments for application */ std::string filename = get_option_string("file"); // Base filename int niters = get_option_int("niters", 100); // Number of iterations (max) int output_labels = get_option_int("output_labels", 0); //output node labels to file? bool scheduler = true; // Always run with scheduler /* Process input file - if not already preprocessed */ float p = get_option_float("p", -1); int n = get_option_int("n", -1); int quiet = get_option_int("quiet", 0); if (quiet) global_logger().set_log_level(LOG_ERROR); int nshards = (int) convert_if_notexists<EdgeDataType>(filename, get_option_string("nshards", "auto")); mytimer.start(); /* Run */ ConnectedComponentsProgram program; graphchi_engine<VertexDataType, EdgeDataType> engine(filename, nshards, scheduler, m); engine.set_disable_vertexdata_storage(); engine.set_enable_deterministic_parallelism(false); engine.set_modifies_inedges(false); engine.set_modifies_outedges(false); engine.set_preload_commit(false); engine.set_maxwindow(engine.num_vertices()); mytimer.start(); active_nodes = new bool[engine.num_vertices()]; for (int i=0; i< engine.num_vertices(); i++) active_nodes[i] = true; engine.run(program, niters); /* Run analysis of the connected components (output is written to a file) */ if (output_labels){ FILE * pfile = fopen((filename + "-components").c_str(), "w"); if (!pfile) logstream(LOG_FATAL)<<"Failed to open file: " << filename << std::endl; fprintf(pfile, "%%%%MatrixMarket matrix array real general\n"); fprintf(pfile, "%lu %u\n", engine.num_vertices()-1, 1); for (uint i=1; i< engine.num_vertices(); i++){ fprintf(pfile, "%u\n", vertex_values[i]); assert(vertex_values[i] >= 0 && 
vertex_values[i] < engine.num_vertices()); } fclose(pfile); logstream(LOG_INFO)<<"Saved succesfully to out file: " << filename << "-components" << " time for saving: " << mytimer.current_time() << std::endl; } std::cout<<"Total runtime: " << mytimer.current_time() << std::endl; if (p > 0) std::cout << "site fraction p= " << p << std::endl; if (n > 0){ std::cout << "n=" << n*p << std::endl; std::cout << "isolated sites: " << p*(double)n-actual_vertices << std::endl; } std::cout << "Number of sites: " << actual_vertices << std::endl; std::cout << "Number of bonds: " << engine.num_edges() << std::endl; if (n){ std::cout << "Percentage of sites: " << (double)actual_vertices / (double)n << std::endl; std::cout << "Percentage of bonds: " << (double)engine.num_edges() / (2.0*n) << std::endl; } std::cout << "Number of iterations: " << iter << std::endl; std::cout << "SITES RESULT:\nsize\tcount\n"; std::map<uint,uint> final_countsv; std::map<uint,uint> final_countse; std::map<uint,uint> statv; for (int i=0; i< engine.num_vertices(); i++) statv[vertex_values[i]]++; uint total_sites = 0; for (std::map<uint, uint>::const_iterator iter = statv.begin(); iter != statv.end(); iter++) { //std::cout << iter->first << "\t" << iter->second << "\n"; final_countsv[iter->second] += 1; total_sites += iter->second; } for (std::map<uint, uint>::const_iterator iter = final_countsv.begin(); iter != final_countsv.end(); iter++) { std::cout << iter->first << "\t" << iter->second << "\n"; } edge_count = 1; engine.run(program, 1); std::cout << "BONDS RESULT:\nsize\tcount\n"; uint total_bonds = 0; for (std::map<uint, uint>::const_iterator iter = state.begin(); iter != state.end(); iter++) { //std::cout << iter->first << "\t" << iter->second << "\n"; final_countse[iter->second] += 1; total_bonds += iter->second; } for (std::map<uint, uint>::const_iterator iter = final_countse.begin(); iter != final_countse.end(); iter++) { std::cout << iter->first << "\t" << iter->second << "\n"; } 
assert(total_sites == graph.num_vertices()); assert(total_bonds == graph.num_edges()); return 0; }
/* example file format: * 2884424247 11 1210682095 1789803763 1879013170 1910216645 2014570227 * 2109318072 2268277425 2289674265 2340794623 2513611825 2770280793 * 2884596247 31 1191220232 1191258123 1225281292 1240067740 * 2885009247 16 1420862042 1641392180 1642909335 1775498871 1781379945 * 1784537661 1846581167 1934183965 2011304655 2016713117 2017390697 * 2128869911 2132021133 2645747085 2684129850 2866009832 */ void parse(int i){ in_file fin(in_files[i]); out_file fout((outdir + in_files[i] + ".out")); size_t linesize = 0; char * saveptr = NULL, * linebuf = NULL; char linebuf_debug[1024]; size_t line = 1; uint from,to; bool matrix_market = false; while(true){ int rc = getline(&linebuf, &linesize, fin.outf); strncpy(linebuf_debug, linebuf, 1024); if (rc < 1) break; if (strlen(linebuf) <= 1) //skip empty lines continue; //skipping over matrix market header (if any) if (!strncmp(linebuf, "%%MatrixMarket", 14)){ matrix_market = true; continue; } if (matrix_market && linebuf[0] == '%'){ continue; } if (matrix_market && linebuf[0] != '%'){ matrix_market = false; continue; } //read [FROM] char *pch = strtok_r(linebuf,string_to_tokenize, &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf_debug << "]" << std::endl; return; } assign_id(string2nodeid,nodeid2hash, from, pch, true); //read [NUMBER OF EDGES] pch = strtok_r(NULL,string_to_tokenize, &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf_debug << "]" << std::endl; return; } int num_edges = atoi(pch); if (num_edges < 0) { logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf_debug << "] - number of edges < 0" << std::endl; return; } for (int k=0; k< num_edges; k++){ pch = strtok_r(NULL, "\n\t\r, ", &saveptr); if (!pch){ logstream(LOG_ERROR) << "Error when parsing file: " << in_files[i] << ":" << line << "[" << linebuf_debug << 
"]" << std::endl; return; } assign_id(single_domain ? string2nodeid:string2nodeid2, single_domain ? nodeid2hash : nodeid2hash2, to, pch, single_domain ? true : false); if (tsv) fprintf(fout.outf, "%u\t%u\n", from, to); else if (csv) fprintf(fout.outf, "%u,%un", from, to); else fprintf(fout.outf, "%u %u\n", from, to); nnz++; } line++; total_lines++; if (lines && line>=lines) break; if (debug && (line % 50000 == 0)) logstream(LOG_INFO) << "Parsed line: " << line << " map size is: " << string2nodeid.size() << std::endl; if (string2nodeid.size() % 500000 == 0) logstream(LOG_INFO) << "Hash map size: " << string2nodeid.size() << " at time: " << mytime.current_time() << " edges: " << total_lines << std::endl; } logstream(LOG_INFO) <<"Finished parsing total of " << line << " lines in file " << in_files[i] << endl << "total map size: " << string2nodeid.size() << endl; }
/** * Vertex update function. */ void update(graphchi_vertex<VertexDataType, EdgeDataType> &v, graphchi_context &gcontext) { if (debug) printf("Entered iteration %d with %d\n", gcontext.iteration, v.id()); /* even iteration numbers: * 1) load a subset of items into memory (pivots) * 2) Find which subset of items needs to compared to the users */ if (gcontext.iteration % 2 == 0) { if (adjcontainer->is_pivot(v.id()) && is_item(v.id())){ adjcontainer->load_edges_into_memory(v); if (debug) printf("Loading pivot %dintro memory\n", v.id()); } else if (is_user(v.id())){ //in the zero iteration, if using AA/RA/PROB distance metric, initialize array //with node degrees if (gcontext.iteration == 0 && (distance_metric == AA || distance_metric == RA || distance_metric == PROB)){ latent_factors_inmem[v.id()].degree = v.num_edges(); } //check if this user is connected to any pivot item bool has_pivot = false; int pivot = -1; for(int i=0; i<v.num_edges(); i++) { graphchi_edge<uint32_t> * e = v.edge(i); //assert(is_item(e->vertexid)); if (adjcontainer->is_pivot(e->vertexid)) { has_pivot = true; pivot = e->vertexid; break; } } if (debug) printf("user %d is linked to pivot %d\n", v.id(), pivot); if (!has_pivot) //this user is not connected to any of the pivot item nodes and thus //it is not relevant at this point return; //this user is connected to a pivot items, thus all connected items should be compared for(int i=0; i<v.num_edges(); i++) { graphchi_edge<uint32_t> * e = v.edge(i); //assert(v.id() != e->vertexid); relevant_items[e->vertexid - M] = true; } }//is_user } //iteration % 2 = 1 /* odd iteration number: * 1) For any item connected to a pivot item * compute itersection */ else { if (!relevant_items[v.id() - M]){ if (debug) logstream(LOG_DEBUG)<<"Skipping item: " << v.id() << " since not relevant" << std::endl; return; } std::vector<index_val> heap; for (vid_t i=adjcontainer->pivot_st; i< adjcontainer->pivot_en; i++){ //if JACCARD which is symmetric, compare only to pivots 
which are smaller than this item id if ((distance_metric != ASYM_COSINE && i >= v.id()) || (!relevant_items[i-M])) continue; else if (distance_metric == ASYM_COSINE && i == v.id()) continue; double dist = adjcontainer->calc_distance(v, i, distance_metric); item_pairs_compared++; if (item_pairs_compared % 10000000 == 0) logstream(LOG_INFO)<< std::setw(10) << mytimer.current_time() << ") " << std::setw(10) << item_pairs_compared << " pairs compared " << std::setw(10) <<sum(written_pairs) << " written. " << std::endl; if (debug) printf("comparing %d to pivot %d distance is %g\n", i - M + 1, v.id() - M + 1, dist); if (dist != 0){ heap.push_back(index_val(i, dist)); } else zero_dist++; } sort(heap.begin(), heap.end(), &Greater); int thread_num = omp_get_thread_num(); if (heap.size() < K) not_enough++; for (uint i=0; i< std::min(heap.size(), (size_t)K); i++){ int rc = fprintf(out_files[thread_num], "%u %u %.12lg\n", v.id()-M+1, heap[i].index-M+1, (double)heap[i].val);//write item similarity to file written_pairs[omp_get_thread_num()]++; if (rc <= 0){ perror("Failed to write output"); logstream(LOG_FATAL)<<"Failed to write output to: file: " << training << omp_get_thread_num() << ".out" << std::endl; } } }//end of iteration % 2 == 1 }//end of update function
/** * Vertex update function. */ void update(CE_Graph_vertex<VertexDataType, EdgeDataType> &v, CE_Graph_context &gcontext) { if (debug) printf("Entered iteration %d with %d\n", gcontext.iteration, v.id()); //in the zero iteration compute the mean if (gcontext.iteration == 0){ if (is_item(v.id())){ for(int i=0; i<v.num_edges(); i++) { CE_Graph_edge<float> * e = v.edge(i); vid_t user = e->vertexid; mean[user] += e->get_data() / (float)N; } } } //at the first iteration compute the stddev of each item from the mean else if (gcontext.iteration == 1){ if (is_item(v.id())){ dense_adj item_edges; for(int i=0; i < v.num_edges(); i++) set_new(item_edges.edges, v.edge(i)->vertexid, v.edge(i)->get_data()); stddev[v.id() - M] = sum(minus(item_edges.edges, mean).array().pow(2)) / (M-1.0); if (debug) std::cout<<"item: " << v.id() - M+1 << " stddev: " << stddev[v.id() - M] << std::endl; } } /* even iteration numbers: * 1) load a subset of items into memory (pivots) * 2) Find which subset of items needs to compared to the users */ else if (gcontext.iteration % 2 == 0) { if (adjcontainer->is_pivot(v.id()) && is_item(v.id())){ adjcontainer->load_edges_into_memory(v); if (debug) printf("Loading pivot %d intro memory\n", v.id()); } else if (is_user(v.id())){ //check if this user is connected to any pivot item bool has_pivot = false; int pivot = -1; for(int i=0; i<v.num_edges(); i++) { CE_Graph_edge<float> * e = v.edge(i); //assert(is_item(e->vertexid)); if (adjcontainer->is_pivot(e->vertexid) && relevant_items[e->vertexid-M]) { has_pivot = true; pivot = e->vertexid; break; } } if (debug) printf("user %d is linked to pivot %d\n", v.id(), pivot); if (!has_pivot) //this user is not connected to any of the pivot item nodes and thus //it is not relevant at this point return; //this user is connected to a pivot items, thus all connected items should be compared for(int i=0; i<v.num_edges(); i++) { CE_Graph_edge<float> * e = v.edge(i); //assert(v.id() != e->vertexid); 
relevant_items[e->vertexid - M] = true; } }//is_user } //iteration % 2 = 1 /* odd iteration number: * 1) For any item connected to a pivot item * compute itersection */ else { if (!relevant_items[v.id() - M]){ return; } for (vid_t i=adjcontainer->pivot_st; i< adjcontainer->pivot_en; i++){ //since metric is symmetric, compare only to pivots which are smaller than this item id if (i >= v.id() || (!relevant_items[i-M])) continue; double dist = adjcontainer->calc_distance(v, i, distance_metric); item_pairs_compared++; if (item_pairs_compared % 1000000 == 0) logstream(LOG_INFO)<< std::setw(10) << mytimer.current_time() << ") " << std::setw(10) << item_pairs_compared << " pairs compared " << std::endl; if (debug) printf("comparing %d to pivot %d distance is %lg\n", i - M + 1, v.id() - M + 1, dist); if (dist != 0){ fprintf(out_files[omp_get_thread_num()], "%u %u %.12lg\n", v.id()-M+1, i-M+1, (double)dist);//write item similarity to file //where the output format is: //[item A] [ item B ] [ distance ] written_pairs++; } } }//end of iteration % 2 == 1 }//end of update function