/**
 * Receives one serialized element from the peer with rank \p id.
 *
 * Two blocking MPI receives are issued on MPI_COMM_WORLD with the same tag:
 * first a single int carrying the payload length, then the payload bytes.
 * The bytes are then deserialized into \p elem through a graphlab::iarchive.
 *
 * \param elem  destination object; overwritten by the deserialized value
 * \param id    rank of the peer to receive from (asserted to be < size())
 * \param tag   MPI tag used for both receives
 *
 * When HAS_MPI is not defined, this only logs a fatal "MPI not installed!"
 * message.
 */
void recv(T& elem, const size_t id, const int tag = 0) {
#ifdef HAS_MPI
  assert(id < size());
  int source_rank(id);
  MPI_Status mpi_status;

  // Step 1: receive the length of the serialized payload.
  int payload_len(-1);
  int rc = MPI_Recv(&payload_len, 1, MPI_INT,
                    source_rank, tag, MPI_COMM_WORLD, &mpi_status);
  assert(rc == MPI_SUCCESS);
  assert(payload_len > 0);

  // Step 2: receive the payload itself.
  std::vector<char> payload(payload_len);
  rc = MPI_Recv(&(payload[0]), payload_len, MPI_BYTE,
                source_rank, tag, MPI_COMM_WORLD, &mpi_status);
  assert(rc == MPI_SUCCESS);

  // Step 3: deserialize straight out of the receive buffer.
  namespace bio = boost::iostreams;
  typedef bio::stream<bio::array_source> icharstream;
  icharstream strm(&(payload[0]), payload.size());
  graphlab::iarchive iarc(strm);
  iarc >> elem;
#else
  logstream(LOG_FATAL) << "MPI not installed!" << std::endl;
#endif
}
std::vector<sgraph_edge_data> graph_pylambda_evaluator::eval_triple_apply( const std::vector<sgraph_edge_data>& all_edge_data, size_t src_partition, size_t dst_partition, const std::vector<size_t>& mutated_edge_field_ids) { std::lock_guard<mutex> lg(m_mutex); logstream(LOG_INFO) << "graph_lambda_worker eval triple apply " << src_partition << ", " << dst_partition << std::endl; DASSERT_TRUE(is_loaded(src_partition)); DASSERT_TRUE(is_loaded(dst_partition)); auto& source_partition = m_graph_sync.get_partition(src_partition); auto& target_partition = m_graph_sync.get_partition(dst_partition); std::vector<std::string> mutated_edge_keys; for (size_t fid: mutated_edge_field_ids) { mutated_edge_keys.push_back(m_edge_keys[fid]); } std::vector<sgraph_edge_data> ret(all_edge_data.size()); lambda_graph_triple_apply_data lgt; lgt.all_edge_data = &all_edge_data; lgt.out_edge_data = &ret; lgt.source_partition = &source_partition; lgt.target_partition = &target_partition; lgt.vertex_keys = &m_vertex_keys; lgt.edge_keys = &m_edge_keys; lgt.mutated_edge_keys = &mutated_edge_keys; lgt.srcid_column = m_srcid_column; lgt.dstid_column = m_dstid_column; evaluation_functions.eval_graph_triple_apply(m_lambda_id, &lgt); python::check_for_python_exception(); return ret; }
/** load a matrix market file into a matrix */ void load_matrix_market_matrix(const std::string & filename, int offset, int D){ MM_typecode matcode; uint i,I,J; double val; uint rows, cols; size_t nnz; FILE * f = open_file(filename.c_str() ,"r"); int rc = mm_read_banner(f, &matcode); if (rc != 0) logstream(LOG_FATAL)<<"Failed to load matrix market banner in file: " << filename << std::endl; if (mm_is_sparse(matcode)){ int rc = mm_read_mtx_crd_size(f, &rows, &cols, &nnz); if (rc != 0) logstream(LOG_FATAL)<<"Failed to load matrix market banner in file: " << filename << std::endl; } else { //dense matrix rc = mm_read_mtx_array_size(f, &rows, &cols); if (rc != 0) logstream(LOG_FATAL)<<"Failed to load matrix market banner in file: " << filename << std::endl; nnz = rows * cols; } for (i=0; i<nnz; i++){ if (mm_is_sparse(matcode)){ rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error reading input line " << i << std::endl; I--; J--; assert(I >= 0 && I < rows); assert(J >= 0 && J < cols); //set_val(a, I, J, val); latent_factors_inmem[I+offset].pvec[J] = val; } else { rc = fscanf(f, "%lg", &val); if (rc != 1) logstream(LOG_FATAL)<<"Error reading nnz " << i << std::endl; I = i / D; J = i % cols; //set_val(a, I, J, val); latent_factors_inmem[I+offset].pvec[J] = val; } } logstream(LOG_INFO) << "Factors from file: loaded matrix of size " << rows << " x " << cols << " from file: " << filename << " total of " << nnz << " entries. "<< i << std::endl; }
void thread_pool::set_cpu_affinity(bool affinity) { if (affinity != cpu_affinity) { cpu_affinity = affinity; // stop the queue from blocking spawn_queue.stop_blocking(); // join the threads in the thread group while(1) { try { threads.join(); break; } catch (const char* c) { // this should not be possible! logstream(LOG_FATAL) << "Unexpected exception caught in thread pool destructor: " << c << std::endl; // ASSERT_TRUE(false); // unnecessary } } spawn_queue.start_blocking(); spawn_thread_group(); } } // end of set_cpu_affinity
void make_pds_constraint() { int p = 0; if (!is_pds_compatible(nshards, p)) { logstream(LOG_FATAL) << "Num shards: " << nshards << " cannot be used for pdsingress." << std::endl; }; pds pds_generator; std::vector<size_t> results; if (p == 1) { results.push_back(0); results.push_back(2); } else { results = pds_generator.get_pds(p); } for (size_t i = 0; i < nshards; i++) { std::vector<procid_t> adjlist; for (size_t j = 0; j < results.size(); j++) { adjlist.push_back( (results[j] + i) % nshards); } std::sort(adjlist.begin(), adjlist.end()); constraint_graph.push_back(adjlist); } }
bool process::kill(bool async) { if(!m_launched) log_and_throw("No process launched!"); if(m_proc_handle != NULL) { BOOL ret = TerminateProcess(m_proc_handle, 1); auto err_code = GetLastError(); if(!async) WaitForSingleObject(m_proc_handle, 10000); CloseHandle(m_proc_handle); m_proc_handle = NULL; if(!ret) { logstream(LOG_INFO) << get_last_err_str(err_code); return false; } return true; } return false; }
/**
 * Shuts the distributed control object down.
 *
 * Sequence: reset the global instance pointers, full barrier, free the
 * callback events, run the deletion callbacks, flush the senders, close the
 * communication layer, destroy the senders, delete the thread-local keys,
 * shut down and destroy the receivers, stop the function-call handlers, log
 * the traffic counters and finally delete the communication layer.
 */
distributed_control::~distributed_control() {
  // Detach the global instance.
  last_dc = NULL;
  last_dc_procid = 0;

  distributed_services->full_barrier();
  logstream(LOG_INFO) << "Shutting down distributed control " << std::endl;
  FREE_CALLBACK_EVENT(EVENT_NETWORK_BYTES);
  FREE_CALLBACK_EVENT(EVENT_RPC_CALLS);

  // Run every registered deletion callback.
  for (size_t cb = 0; cb < deletion_callbacks.size(); ++cb) {
    deletion_callbacks[cb]();
  }

  // Sample the sent counter before the final flush.
  size_t bytessent = bytes_sent();
  for (size_t s = 0; s < senders.size(); ++s) {
    senders[s]->flush();
  }

  comm->close();

  for (size_t s = 0; s < senders.size(); ++s) {
    delete senders[s];
  }
  senders.clear();

  pthread_key_delete(dc_impl::thrlocal_sequentialization_key);
  pthread_key_delete(dc_impl::thrlocal_send_buffer_key);

  // Sample the received counter before the receivers go away.
  size_t bytesreceived = bytes_received();
  for (size_t r = 0; r < receivers.size(); ++r) {
    receivers[r]->shutdown();
    delete receivers[r];
  }
  receivers.clear();

  // Stop the function-call handler threads.
  for (size_t q = 0; q < fcallqueue.size(); ++q) {
    fcallqueue[q].stop_blocking();
  }
  fcallhandlers.join();

  logstream(LOG_INFO) << "Bytes Sent: " << bytessent << std::endl;
  logstream(LOG_INFO) << "Calls Sent: " << calls_sent() << std::endl;
  logstream(LOG_INFO) << "Network Sent: " << network_bytes_sent() << std::endl;
  logstream(LOG_INFO) << "Bytes Received: " << bytesreceived << std::endl;
  logstream(LOG_INFO) << "Calls Received: " << calls_received() << std::endl;

  delete comm;
}
void fiber_async_consensus::cancel() { /* Assertion: numactive > 0 if there is work to do. If there are fibers trying to sleep, lets wake them up */ if (trying_to_sleep > 0 || numactive < ncpus) { m.lock(); size_t oldnumactive = numactive; // once I acquire this lock, all fibers must be // in the following states // 1: still running and has not reached begin_critical_section() // 2: is sleeping in cond.wait() // 3: has called begin_critical_section() but has not acquired // the mutex // In the case of 1,3: These fibers will perform one more sweep // of their task queues. Therefore they will see any new job if available // in the case of 2: numactive must be < ncpus since numactive // is mutex protected. Then I can wake them up by // clearing their sleeping flags and broadcasting. if (numactive < ncpus) { // this is safe. Note that it is done from within // the critical section. for (size_t i = 0;i < ncpus; ++i) { numactive += sleeping[i]; if (sleeping[i]) { sleeping[i] = 0; // this here was basically cond[i].signal(); if (cond[i] != 0) fiber_control::schedule_tid(cond[i]); } } if (oldnumactive == 0 && !done) { logstream(LOG_INFO) << rmi.procid() << ": Waking" << std::endl; } } m.unlock(); } }
/**
 * Reports the metrics \p m through every reporter named in the
 * "metrics.reporter" option (default "console").
 *
 * The option value is a comma-separated list; empty entries are skipped.
 * Recognized names: "basic" / "console", "file" (target taken from
 * "metrics.reporter.filename", default "metrics.txt") and "html" (target
 * taken from "metrics.reporter.htmlfile", default "metrics.html"). Any other
 * name is logged as a warning and ignored.
 *
 * The list is split with std::string::find rather than strtok: strtok would
 * write into the buffer returned by c_str() and keeps hidden global state.
 */
static VARIABLE_IS_NOT_USED void metrics_report(metrics &m) {
  const std::string reporters = get_option_string("metrics.reporter", "console");
  const char delim = ',';

  std::string::size_type start = 0;
  while (start < reporters.size()) {
    std::string::size_type stop = reporters.find(delim, start);
    if (stop == std::string::npos) stop = reporters.size();

    // Like strtok, consecutive delimiters do not produce empty tokens.
    if (stop > start) {
      const std::string repname = reporters.substr(start, stop - start);
      if (repname == "basic" || repname == "console") {
        basic_reporter rep;
        m.report(rep);
      } else if (repname == "file") {
        file_reporter rep(get_option_string("metrics.reporter.filename", "metrics.txt"));
        m.report(rep);
      } else if (repname == "html") {
        html_reporter rep(get_option_string("metrics.reporter.htmlfile", "metrics.html"));
        m.report(rep);
      } else {
        logstream(LOG_WARNING) << "Could not find metrics reporter with name [" << repname << "], ignoring." << std::endl;
      }
    }
    start = stop + 1;
  }
}
binary_adjacency_list_writer(std::string filename) : filename(filename) { bufsize = (int) get_option_int("preprocessing.bufsize", 64 * 1024 * 1024); assert(bufsize > 1024 * 1024); fd = open(filename.c_str(), O_WRONLY | O_CREAT, S_IROTH | S_IWOTH | S_IWUSR | S_IRUSR); if (fd < 0) { logstream(LOG_FATAL) << "Could not open file " << filename << " for writing. " << " Error: " << strerror(errno) << std::endl; } header.format_version = FORMAT_VERSION; header.max_vertex_id = 0; header.contains_edge_values = false; header.numedges = 0; header.edge_value_size = (uint32_t) sizeof(EdgeDataType); buf = (char*) malloc(bufsize); bufptr = buf; bwrite<bin_adj_header>(fd, buf, bufptr, header); counter = 0; lastid = 0; initialized = false; assert(fd >= 0); }
/**
 * Applies \p transform_functor to every locally owned vertex of \p g that is
 * contained in \p vset.
 *
 * The graph must be finalized; otherwise a fatal message is logged. The local
 * vertices are visited through parallel_for, bracketed by two barriers on the
 * graph's distributed control, and g.synchronize() is called at the end.
 */
void transform_vertices(GraphType& g,
                        TransformType transform_functor,
                        const vertex_set vset = GraphType::complete_set()) {
  typedef typename GraphType::vertex_type vertex_type;

  if(!g.is_finalized()) {
    logstream(LOG_FATAL)
      << "\n\tAttempting to call graph.transform_vertices(...)"
      << "\n\tbefore finalizing the graph."
      << std::endl;
  }

  g.dc().barrier();

  size_t first_lvid = 0;
  size_t end_lvid = g.num_local_vertices();
  parallel_for (first_lvid, end_lvid, [&](size_t lvid) {
    auto lvertex = g.l_vertex(lvid);
    // Only owned vertices that belong to the requested set are transformed.
    if (!lvertex.owned()) return;
    if (!vset.l_contains(lvid_type(lvid))) return;
    vertex_type vtx(lvertex);
    transform_functor(vtx);
  });

  g.dc().barrier();
  g.synchronize();
}
/**
 * Creates a sliding shard over the given edge-data and adjacency files.
 *
 * \param iomgr            I/O manager used to open the adjacency session
 * \param _filename_edata  edge data file name
 * \param _filename_adj    adjacency file name
 * \param _range_st        first vertex id of the range
 * \param _range_en        last vertex id of the range
 * \param _blocksize       block size; rounded up to a multiple of sizeof(int)
 * \param _m               metrics object
 * \param _disable_writes  stored in disable_writes
 * \param onlyadj          stored in only_adjacency
 *
 * NOTE(review): earlier revisions initialised disable_writes from
 * _disable_writes and then immediately reset it to false in the body, so the
 * argument had no effect. The reset has been removed so that the argument is
 * honored; confirm that no caller relied on writes always being enabled.
 */
sliding_shard(stripedio * iomgr, std::string _filename_edata, std::string _filename_adj, vid_t _range_st, vid_t _range_en, size_t _blocksize, metrics &_m, bool _disable_writes=false, bool onlyadj = false) :
  iomgr(iomgr),
  filename_edata(_filename_edata),
  filename_adj(_filename_adj),
  range_st(_range_st),
  range_end(_range_en),
  blocksize(_blocksize),
  m(_m),
  disable_writes(_disable_writes) {
  curvid = 0;
  adjoffset = 0;
  edataoffset = 0;
  only_adjacency = onlyadj;
  curblock = NULL;
  curadjblock = NULL;
  window_start_edataoffset = 0;
  disable_async_writes = false;

  // Round the block size up to a multiple of sizeof(int).
  while(blocksize % sizeof(int) != 0) blocksize++;
  assert(blocksize % sizeof(int)==0);

  adjfilesize = get_filesize(filename_adj);
  edatafilesize = get_shard_edata_filesize<int>(filename_edata);
  if (!only_adjacency) {
    logstream(LOG_DEBUG) << "Total edge data size: " << edatafilesize << std::endl;
  }

  adjfile_session = iomgr->open_session(filename_adj, true);
  save_offset();

  async_edata_loading = false; // With dynamic edge data size, do not load
}
float time_svdpp_predict(const time_svdpp_usr & usr, const time_svdpp_movie & mov, const time_svdpp_time & ptime, const float rating, double & prediction){ //prediction = global_mean + user_bias + movie_bias double pui = globalMean + *usr.bu + *mov.bi; for(int k=0;k<D;k++){ // + user x movie factors pui += (usr.ptemp[k] * mov.q[k]); // + user x time factors pui += usr.x[k] * ptime.z[k]; // + user x time x movies factors pui += usr.pu[k] * ptime.pt[k] * mov.q[k]; } pui = std::min(pui,maxval); pui = std::max(pui,minval); prediction = pui; if (std::isnan(prediction)) logstream(LOG_FATAL)<<"Got into numerical errors! Try to decrease --lrate, --gamma, --beta" <<std::endl; float err = rating - prediction; return err*err; }
void make_grid_constraint() { int ncols, nrows; if (!is_grid_compatible(nshards, nrows, ncols)) { logstream(LOG_FATAL) << "Num shards: " << nshards << " cannot be used for grid ingress." << std::endl; }; for (size_t i = 0; i < nshards; i++) { std::vector<procid_t> adjlist; // add self adjlist.push_back(i); // add the row of i size_t rowbegin = (i/ncols) * ncols; for (size_t j = rowbegin; j < rowbegin + ncols; ++j) if (i != j) adjlist.push_back(j); // add the col of i for (size_t j = i % ncols; j < nshards; j+=ncols) if (i != j) adjlist.push_back(j); std::sort(adjlist.begin(), adjlist.end()); constraint_graph.push_back(adjlist); } }
/**
 * Grab pivot's adjacency list into memory.
 *
 * Copies every edge of \p v (neighbor id and edge value) into a dense_adj
 * and stores it in adjs at index v.id() - pivot_st.
 *
 * \return the number of edges stored, or 0 when the vertex has fewer than
 *         min_allowed_intersection edges and is skipped
 *
 * grabbed_edges is incremented atomically by the number of edges stored.
 */
int load_edges_into_memory(CE_Graph_vertex<VertexDataType, EdgeDataType> &v) {
  int num_edges = v.num_edges();

  //not enough user rated this item, we don't need to compare to it
  if (num_edges < min_allowed_intersection){
    if (debug)
      logstream(LOG_DEBUG)<<"Skipping since num edges: " << num_edges << std::endl;
    return 0;
  }

  // Validate the slot BEFORE writing to it; the check used to run only after
  // the (possibly out-of-range) store had already happened.
  assert(v.id() - pivot_st < adjs.size());

  dense_adj dadj;
  for(int i=0; i<num_edges; i++)
    set_new( dadj.edges, v.edge(i)->vertex_id(), v.edge(i)->get_data());
  adjs[v.id() - pivot_st] = dadj;

  __sync_add_and_fetch(&grabbed_edges, num_edges /*edges_to_larger_id*/);
  return num_edges;
}
std::vector<size_t> get_pds(size_t p) { std::vector<size_t> result = find_pds(p); // verify pdsness size_t pdslength = p *p + p + 1; std::vector<size_t> count(pdslength, 0); for (size_t i = 0;i < result.size(); ++i) { for (size_t j = 0;j < result.size(); ++j) { if (i == j) continue; count[(result[i] - result[j] + pdslength) % pdslength]++; } } bool ispds = true; for (size_t i = 1;i < count.size(); ++i) { if (count[i] != 1) ispds = false; } // If success, return the result, else, return empty vector. if (ispds) { return result; } else { logstream(LOG_ERROR) << "Fail to generate pds for p = " << p << std::endl; return std::vector<size_t>(); } }
/**
 * Attaches every object of the global glshared registry to this manager.
 *
 * A modification trigger bound to invalidate() is installed on the DHT.
 * Each registered object is then logged, given this manager and its registry
 * index as id, and entered into objrevmap. If this process is the owning
 * machine of the index according to the DHT, the object is serialized and
 * stored in the DHT under that index. A warning is logged for objects that
 * still point to a manager.
 */
distributed_glshared_manager::distributed_glshared_manager(distributed_control &dc):
    rmi(dc, this),
    glsharedobjs(distgl_impl::get_global_dist_glshared_registry()),
    dht(dc){
  dht.attach_modification_trigger(
      boost::bind(&distributed_glshared_manager::invalidate, this, _1, _2, _3));

  for (size_t idx = 0; idx < glsharedobjs.size(); ++idx) {
    logstream(LOG_INFO) << "registered entry " << idx << " with type "
                        << glsharedobjs[idx]->type_name() << std::endl;

    if (glsharedobjs[idx]->manager != NULL) {
      logger(LOG_WARNING, "glshared objects are still attached to a previous manager!");
    }

    // Take over the object and remember its index.
    glsharedobjs[idx]->manager = this;
    glsharedobjs[idx]->id = idx;
    objrevmap[glsharedobjs[idx]] = idx;

    // Only the owning machine publishes the serialized value.
    if (dht.owning_machine(idx) == rmi.procid()) {
      std::stringstream strm;
      oarchive oarc(strm);
      glsharedobjs[idx]->save(oarc);
      dht.set(idx, strm.str());
    }
  }
}
/**
 * Saves the graph \p g under \p prefix using the writer selected by
 * \p format.
 *
 * Supported formats: "snap" / "tsv", "graphjrl" and "bin". An empty prefix
 * makes the call a no-op.
 *
 * \throws std::string when \p format is not recognized (an error is logged
 *         first)
 */
void save_format(GraphType& g,
                 const std::string& prefix,
                 const std::string& format,
                 bool gzip = true,
                 size_t files_per_machine = 4) {
  if (prefix.empty()) return;

  if (format == "bin") {
    save_binary(g, prefix);
    return;
  }
  if (format == "graphjrl") {
    save(g, prefix, builtin_parsers::graphjrl_writer<GraphType>(),
         gzip, true, true, files_per_machine);
    return;
  }
  if (format == "snap" || format == "tsv") {
    save(g, prefix, builtin_parsers::tsv_writer<GraphType>(),
         gzip, false, true, files_per_machine);
    return;
  }
  // "bintsv4" is not handled here (its branch is disabled in the source).

  logstream(LOG_ERROR)
    << "Unrecognized Format \"" << format << "\"!" << std::endl;
  throw(std::string("Unrecognized Format \"" + format + "\""));
} // end of save structure
bool check_origfile_modification_earlier(std::string basefilename, int nshards) { /* Compare last modified dates of the original graph and the shards */ if (file_exists(basefilename) && get_option_int("disable-modtime-check", 0) == 0) { struct stat origstat, shardstat; int err1 = stat(basefilename.c_str(), &origstat); std::string adjfname = filename_shard_adj(basefilename, 0, nshards); int err2 = stat(adjfname.c_str(), &shardstat); if (err1 != 0 || err2 != 0) { logstream(LOG_ERROR) << "Error when checking file modification times: " << strerror(errno) << std::endl; return nshards; } if (origstat.st_mtime > shardstat.st_mtime) { logstream(LOG_INFO) << "The input graph modification date was newer than of the shards." << std::endl; logstream(LOG_INFO) << "Going to delete old shards and recreate new ones. To disable " << std::endl; logstream(LOG_INFO) << "functionality, specify --disable-modtime-check=1" << std::endl; // Delete shards delete_shards<EdgeDataType>(basefilename, nshards); // Delete the bin-file std::string preprocfile = preprocess_filename<EdgeDataType>(basefilename); if (file_exists(preprocfile)) { logstream(LOG_DEBUG) << "Deleting: " << preprocfile << std::endl; int err = remove(preprocfile.c_str()); if (err != 0) { logstream(LOG_ERROR) << "Error deleting file: " << preprocfile << ", " << strerror(errno) << std::endl; } } return false; } else { return true; } } return true; }
/**
 * Shuts the distributed control object down.
 *
 * Sequence: full barrier, free the callback events, run the deletion
 * callbacks, flush the senders, close the communication layer, destroy the
 * senders and receivers, stop the function-call handlers and delete the
 * communication layer. The shutdown banner and the traffic counters are only
 * logged when last_dc_procid is 0.
 */
distributed_control::~distributed_control() {
  distributed_services->full_barrier();
  if(last_dc_procid==0) {
    logstream(LOG_INFO) << "Shutting down distributed control " << std::endl;
  }
  FREE_CALLBACK_EVENT(EVENT_NETWORK_BYTES);
  FREE_CALLBACK_EVENT(EVENT_RPC_CALLS);

  // Run every registered deletion callback.
  for (size_t cb = 0; cb < deletion_callbacks.size(); ++cb) {
    deletion_callbacks[cb]();
  }

  // Sample the sent counter before the final flush.
  size_t bytessent = bytes_sent();
  for (size_t s = 0; s < senders.size(); ++s) {
    senders[s]->flush();
  }

  comm->close();

  for (size_t s = 0; s < senders.size(); ++s) {
    delete senders[s];
  }

  // Sample the received counter before the receivers go away.
  size_t bytesreceived = bytes_received();
  for (size_t r = 0; r < receivers.size(); ++r) {
    receivers[r]->shutdown();
    delete receivers[r];
  }
  senders.clear();
  receivers.clear();

  // Stop the function-call handler threads.
  for (size_t q = 0; q < fcallqueue.size(); ++q) {
    fcallqueue[q].stop_blocking();
  }
  fcallhandlers.join();

  if(last_dc_procid==0){
    logstream(LOG_INFO) << "Bytes Sent: " << bytessent << std::endl;
    logstream(LOG_INFO) << "Calls Sent: " << calls_sent() << std::endl;
    logstream(LOG_INFO) << "Network Sent: " << network_bytes_sent() << std::endl;
    logstream(LOG_INFO) << "Bytes Received: " << bytesreceived << std::endl;
    logstream(LOG_INFO) << "Calls Received: " << calls_received() << std::endl;
  }

  delete comm;
}
void fiber_async_consensus::pass_the_token() { // note that this function does not acquire the token lock // the caller must acquire it assert(hastoken); // first check if we are done if (cur_token.last_change == rmi.procid() && cur_token.total_calls_received == cur_token.total_calls_sent) { logstream(LOG_INFO) << "Completed Token: " << cur_token.total_calls_received << " " << cur_token.total_calls_sent << std::endl; // we have completed a loop around! // broadcast a completion for (procid_t i = 0;i < rmi.numprocs(); ++i) { if (i != rmi.procid()) { rmi.control_call(i, &fiber_async_consensus::force_done); } } // set the complete flag // we can't call consensus() since it will deadlock done = true; // this is the same code as cancel(), but we can't call cancel // since we are holding on to a lock if (numactive < ncpus) { // this is safe. Note that it is done from within // the critical section. for (size_t i = 0;i < ncpus; ++i) { numactive += sleeping[i]; if (sleeping[i]) { sleeping[i] = 0; // this here is basically cond[i].signal(); size_t ch = cond[i]; if (ch != 0) fiber_control::schedule_tid(ch); } } } } else { // update the token size_t callsrecv; size_t callssent; if (attachedobj) { callsrecv = attachedobj->calls_received(); callssent = attachedobj->calls_sent(); } else { callsrecv = rmi.dc().calls_received(); callssent = rmi.dc().calls_sent(); } if (callssent != last_calls_sent || callsrecv != last_calls_received) { cur_token.total_calls_sent += callssent - last_calls_sent; cur_token.total_calls_received += callsrecv - last_calls_received; cur_token.last_change = rmi.procid(); } //std::cout << "Sending token: (" << cur_token.total_calls_sent //<< ", " << cur_token.total_calls_received << ")" << std::endl; last_calls_sent = callssent; last_calls_received = callsrecv; // send it along. 
hastoken = false; /*logstream(LOG_INFO) << "Passing Token " << rmi.procid() << "-->" << (rmi.procid() + 1) % rmi.numprocs() << ": " << cur_token.total_calls_received << " " << cur_token.total_calls_sent << std::endl; */ rmi.control_call((procid_t)((rmi.procid() + 1) % rmi.numprocs()), &fiber_async_consensus::receive_the_token, cur_token); } }
/**
 * Logs the target file and the vector size, then writes \p output to
 * \p datafile through save_matrix_market_format_vector().
 *
 * \param issparse  forwarded to the writer
 * \param comment   forwarded to the writer
 */
inline void write_output_vector(const std::string & datafile,
                                const vec& output,
                                bool issparse,
                                std::string comment = ""){
  logstream(LOG_INFO)
      << "Going to write output to file: " << datafile
      << " (vector of size: " << output.size() << ") "
      << std::endl;
  save_matrix_market_format_vector(datafile, output, issparse, comment);
}
/**
 * Reads a vector from a matrix market file.
 *
 * \param filename        input file
 * \param optional_field  passed to open_file(); when true and the file could
 *                        not be opened, zeros(1) is returned
 * \param allow_zeros     when false, a zero entry is reported as fatal
 * \return the vector that was read (M entries)
 *
 * Both the sparse (coordinate) and the dense (array) layout are accepted.
 * A dense row vector is transposed. Sparse indices are 1-based in the file.
 * Indices are validated with LOG_FATAL checks (previously assert-only, which
 * is compiled out under NDEBUG): the row must be below M and the column must
 * be the first one.
 */
vec load_matrix_market_vector(const std::string & filename, bool optional_field, bool allow_zeros) {
  int ret_code;
  MM_typecode matcode;
  uint M, N;
  size_t i,nz;

  logstream(LOG_INFO) <<"Going to read matrix market vector from input file: " << filename << std::endl;

  FILE * f = open_file(filename.c_str(), "r", optional_field);
  //if optional file not found return
  if (f== NULL && optional_field){
    return zeros(1);
  }

  if (mm_read_banner(f, &matcode) != 0)
    logstream(LOG_FATAL) << "Could not process Matrix Market banner." << std::endl;

  /* This is how one can screen matrix types if their application */
  /* only supports a subset of the Matrix Market data types. */
  if (mm_is_complex(matcode) && mm_is_matrix(matcode) &&
      mm_is_sparse(matcode) )
    logstream(LOG_FATAL) << "sorry, this application does not support " << std::endl << "Market Market type: " << mm_typecode_to_str(matcode) << std::endl;

  /* find out size of sparse matrix .... */
  if (mm_is_sparse(matcode)){
    if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) !=0)
      logstream(LOG_FATAL) << "failed to read matrix market cardinality size " << std::endl;
  }
  else {
    if ((ret_code = mm_read_mtx_array_size(f, &M, &N))!= 0)
      logstream(LOG_FATAL) << "failed to read matrix market vector size " << std::endl;
    if (N > M){ //if this is a row vector, transpose
      uint tmp = N;
      N = M;
      M = tmp;
    }
    nz = M*N;
  }

  vec ret = zeros(M);
  uint row,col;
  double val;

  for (i=0; i<nz; i++) {
    if (mm_is_sparse(matcode)){
      int rc = fscanf(f, "%u %u %lg\n", &row, &col, &val);
      if (rc != 3){
        logstream(LOG_FATAL) << "Failed reading input file: " << filename << "Problm at data row " << i << " (not including header and comment lines)" << std::endl;
      }
      row--; /* adjust from 1-based to 0-based */
      col--;
    }
    else {
      int rc = fscanf(f, "%lg\n", &val);
      if (rc != 1){
        logstream(LOG_FATAL) << "Failed reading input file: " << filename << "Problm at data row " << i << " (not including header and comment lines)" << std::endl;
      }
      row = i;
      col = 0;
    }

    // Some input files contain garbage. row and col are unsigned, so a 0 in
    // a sparse file wraps to a huge value and is caught by these tests too.
    if (row >= M)
      logstream(LOG_FATAL) << "Row index " << row << " is out of range (vector size " << M << ") in file: " << filename << " at data row " << i << std::endl;
    if (col != 0)
      logstream(LOG_FATAL) << "Column index of a vector entry must be 1 in file: " << filename << " at data row " << i << std::endl;

    if (val == 0 && !allow_zeros)
      logstream(LOG_FATAL)<<"Zero entries are not allowed in a sparse matrix market vector. Use --zero=true to avoid this error"<<std::endl;

    //set observation value
    ret[row] = val;
  }

  fclose(f);
  logstream(LOG_INFO)<<"Succesfully read a vector of size: " << M << " [ " << nz << "]" << std::endl;
  return ret;
}
int convert_matrixmarket(std::string base_filename, size_t nodes = 0, size_t edges = 0, int tokens_per_row = 3, int type = TRAINING, int allow_square = true) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING?M:Me, type == TRAINING?N:Ne, nz, nodes, edges, type); if (f == NULL){ if (type == TRAINING){ logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } else if (type == VALIDATION){ logstream(LOG_INFO)<<"Validation file: " << base_filename << " is not found. " << std::endl; return -1; } } compute_matrix_size(nz, type); uint I, J; double val = 1.0; bool active_edge = true; int zero_entries = 0; for (size_t i=0; i<nz; i++) { if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Encountered zero edge [ " << I << " " <<J << " 0] in line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." 
<< std::endl; else if (val == 0){ zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); if (I ==987654321 || J== 987654321) //hack - to be removed later continue; I-=(uint)input_file_offset; /* adjust from 1-based to 0-based */ J-=(uint)input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I+1 << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J+1 << " > " << N << " in line; " << i << std::endl; if (minval != -1e100 && val < minval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where min value is: " << minval << std::endl; if (maxval != 1e100 && val > maxval) logstream(LOG_FATAL)<<"Found illegal rating value: " << val << " where max value is: " << maxval << std::endl; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (M==N && allow_square)?J:M + J, als_edge_type((float)val)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " zero edges!" << std::endl; fclose(f); logstream(LOG_INFO) << "Now creating shards." 
<< std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename<< std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket_and_item_similarity(std::string base_filename, std::string similarity_file, int tokens_per_row, vec & degrees) { FILE *f = NULL, *fsim = NULL; size_t nz, nz_sim; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, TRAINING); return nshards; } } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, M, N, nz); if (f == NULL) logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; uint N_row = 0 ,N_col = 0; detect_matrix_size(similarity_file, fsim, N_row, N_col, nz_sim); if (fsim == NULL || nz_sim == 0) logstream(LOG_FATAL)<<"Failed to open item similarity input file: " << similarity_file << std::endl; if (N_row != N || N_col != N) logstream(LOG_FATAL)<<"Wrong item similarity file matrix size: " << N_row <<" x " << N_col << " Instead of " << N << " x " << N << std::endl; L=nz + nz_sim; degrees.resize(M+N); uint I, J; double val = 1.0; int zero_entries = 0; unsigned int actual_edges = 0; logstream(LOG_INFO) << "Starting to read matrix-market input. Matrix dimensions: " << M << " x " << N << ", non-zeros: " << nz << std::endl; for (size_t i=0; i<nz; i++){ if (tokens_per_row == 3){ int rc = fscanf(f, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file in line: " << i << std::endl; if (val == 0 && ! allow_zeros) logstream(LOG_FATAL)<<"Zero weight encountered at input file line: " << i << " . Run with --allow_zeros=1 to ignore zero weights." 
<< std::endl; else if (val == 0) { zero_entries++; continue; } } else if (tokens_per_row == 2){ int rc = fscanf(f, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; degrees[J+M]++; degrees[I]++; if (I< (uint)start_user || I >= (uint)end_user){ continue; } sharderobj.preprocessing_add_edge(I, M + J, als_edge_type((float)val, 0)); //std::cout<<"adding an edge: " <<I << " -> " << M+J << std::endl; actual_edges++; } logstream(LOG_DEBUG)<<"Finished loading " << actual_edges << " ratings from file: " << base_filename << std::endl; for (size_t i=0; i<nz_sim; i++){ if (tokens_per_row == 3){ int rc = fscanf(fsim, "%u %u %lg\n", &I, &J, &val); if (rc != 3) logstream(LOG_FATAL)<<"Error when reading input file: " << similarity_file << " line: " << i << std::endl; } else if (tokens_per_row == 2){ int rc = fscanf(fsim, "%u %u\n", &I, &J); if (rc != 2) logstream(LOG_FATAL)<<"Error when reading input file: " << i << std::endl; } else assert(false); I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= N) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; if (I == J) logstream(LOG_FATAL)<<"Item similarity to itself found for item " << I << " in line; " << i << std::endl; //std::cout<<"Adding an edge between "<<M+I<< " : " << M+J << " " << (I<J) << " " << val << std::endl; sharderobj.preprocessing_add_edge(M+I, M+J, 
als_edge_type(I < J? val: 0, I>J? val: 0)); actual_edges++; } L = actual_edges; logstream(LOG_DEBUG)<<"Finished loading " << nz_sim << " ratings from file: " << similarity_file << std::endl; write_global_mean(base_filename, TRAINING); sharderobj.end_preprocessing(); if (zero_entries) logstream(LOG_WARNING)<<"Found " << zero_entries << " edges with zero weight!" << std::endl; fclose(f); fclose(fsim); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); logstream(LOG_INFO) << "Successfully finished sharding for " << base_filename << std::endl; logstream(LOG_INFO) << "Created " << nshards << " shards." << std::endl; return nshards; }
int convert_matrixmarket4(std::string base_filename, bool add_time_edges = false, bool square = false, int type = TRAINING, int matlab_time_offset = 1) { // Note, code based on: http://math.nist.gov/MatrixMarket/mmio/c/example_read.c FILE *f = NULL; size_t nz; /** * Create sharder object */ int nshards; if ((nshards = find_shards<als_edge_type>(base_filename, get_option_string("nshards", "auto")))) { if (check_origfile_modification_earlier<als_edge_type>(base_filename, nshards)) { logstream(LOG_INFO) << "File " << base_filename << " was already preprocessed, won't do it again. " << std::endl; read_global_mean(base_filename, type); } if (type == TRAINING) time_nodes_offset = M+N; return nshards; } sharder<als_edge_type> sharderobj(base_filename); sharderobj.start_preprocessing(); detect_matrix_size(base_filename, f, type == TRAINING? M:Me, type == TRAINING? N:Ne, nz); if (f == NULL){ if (type == VALIDATION){ logstream(LOG_INFO)<< "Did not find validation file: " << base_filename << std::endl; return -1; } else if (type == TRAINING) logstream(LOG_FATAL)<<"Failed to open training input file: " << base_filename << std::endl; } if (type == TRAINING) time_nodes_offset = M+N; compute_matrix_size(nz, type); uint I, J; double val, time; bool active_edge = true; for (size_t i=0; i<nz; i++) { int rc = fscanf(f, "%d %d %lg %lg\n", &I, &J, &time, &val); if (rc != 4) logstream(LOG_FATAL)<<"Error when reading input file - line " << i << std::endl; if (time < 0) logstream(LOG_FATAL)<<"Time (third columns) should be >= 0 " << std::endl; I-=input_file_offset; /* adjust from 1-based to 0-based */ J-=input_file_offset; if (I >= M) logstream(LOG_FATAL)<<"Row index larger than the matrix row size " << I << " > " << M << " in line: " << i << std::endl; if (J >= N) logstream(LOG_FATAL)<<"Col index larger than the matrix col size " << J << " > " << N << " in line; " << i << std::endl; K = std::max((int)time, (int)K); time -= matlab_time_offset; if (time < 0 && add_time_edges) 
logstream(LOG_FATAL)<<"Time bins should be >= " << matlab_time_offset << " in row " << i << std::endl; //only for tensor ALS we add edges between user and time bin and also item and time bin //time bins are numbered beteen M+N to M+N+K if (!weighted_als) time += time_nodes_offset; //avoid self edges if (square && I == J) continue; active_edge = decide_if_edge_is_active(i, type); if (active_edge){ if (type == TRAINING) globalMean += val; else globalMean2 += val; sharderobj.preprocessing_add_edge(I, (square? J : (M + J)), als_edge_type(val, time)); } //in case of a tensor, add besides of the user-> movie edge also //time -> user and time-> movie edges if (add_time_edges){ sharderobj.preprocessing_add_edge((uint)time, I, als_edge_type(val, M+J)); sharderobj.preprocessing_add_edge((uint)time, M+J , als_edge_type(val, I)); } } if (type == TRAINING){ uint toadd = 0; if (implicitratingtype == IMPLICIT_RATING_RANDOM) toadd = add_implicit_edges4(implicitratingtype, sharderobj); globalMean += implicitratingvalue * toadd; L += toadd; globalMean /= L; logstream(LOG_INFO) << "Global mean is: " << globalMean << " time bins: " << K << " . Now creating shards." << std::endl; } else { globalMean2 /= Le; logstream(LOG_INFO) << "Global mean is: " << globalMean2 << " time bins: " << K << " . Now creating shards." << std::endl; } write_global_mean(base_filename, type); sharderobj.end_preprocessing(); fclose(f); logstream(LOG_INFO) << "Now creating shards." << std::endl; // Shard with a specified number of shards, or determine automatically if not defined nshards = sharderobj.execute_sharding(get_option_string("nshards", "auto")); return nshards; }
/**
 * Fallback parser for edge values.
 *
 * Performs no parsing: it logs a fatal message asking for a type-specific
 * parse() to be defined and then trips an assertion. Neither argument is read.
 * (T is declared outside this block.)
 */
void parse(T &x, const char * s) {
    logstream(LOG_FATAL)
        << "You need to define parse<your-type>(your-type &x, const char *s) function"
           " to support parsing the edge value."
        << std::endl;
    assert(false);
}
/**
 * Scheduler hook that records no task; it only emits a throttled warning.
 *
 * The warning is logged whenever the value of nwarnings before the increment
 * is a multiple of 10000; nwarnings is incremented on every call.
 *
 * @param vid Vertex id of the requested task (not used here).
 */
virtual void add_task(vid_t vid) {
    if (nwarnings++ % 10000 != 0) {
        return;
    }
    logstream(LOG_WARNING) << "Tried to add task to scheduler, but scheduling was not enabled!" << std::endl;
}
void f() { LogStream logstream("test.log",New); enableModes(logstream); LOG(logstream,Debug) << "Debug message from a threaded function.\n"; FLUSH(logstream); }
static void delete_shards(std::string base_filename, int nshards) { #ifdef DYNAMICEDATA typedef int EdgeDataType; #else typedef EdgeDataType_ EdgeDataType; #endif logstream(LOG_DEBUG) << "Deleting files for " << base_filename << " shards=" << nshards << std::endl; std::string intervalfname = filename_intervals(base_filename, nshards); if (file_exists(intervalfname)) { int err = remove(intervalfname.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing file " << intervalfname << ", " << strerror(errno) << std::endl; } /* Note: degree file is not removed, because same graph with different number of shards share the file. This should be probably change. std::string degreefname = filename_degree_data(base_filename); if (file_exists(degreefname)) { remove(degreefname.c_str()); } */ size_t blocksize = 4096 * 1024; while (blocksize % sizeof(EdgeDataType) != 0) blocksize++; for(int p=0; p < nshards; p++) { int blockid = 0; std::string filename_edata = filename_shard_edata<EdgeDataType>(base_filename, p, nshards); std::string fsizename = filename_edata + ".size"; if (file_exists(fsizename)) { int err = remove(fsizename.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing file " << fsizename << ", " << strerror(errno) << std::endl; } while(true) { std::string block_filename = filename_shard_edata_block(filename_edata, blockid, blocksize); if (file_exists(block_filename)) { int err = remove(block_filename.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing file " << block_filename << ", " << strerror(errno) << std::endl; } else { break; } #ifdef DYNAMICEDATA delete_block_uncompressed_sizefile(block_filename); #endif blockid++; } std::string dirname = dirname_shard_edata_block(filename_edata, blocksize); if (file_exists(dirname)) { int err = remove(dirname.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing directory " << dirname << ", " << strerror(errno) << std::endl; } std::string adjname = filename_shard_adj(base_filename, p, 
nshards); logstream(LOG_DEBUG) << "Deleting " << adjname << " exists: " << file_exists(adjname) << std::endl; if (file_exists(adjname)) { int err = remove(adjname.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing file " << adjname << ", " << strerror(errno) << std::endl; } } std::string numv_filename = base_filename + ".numvertices"; if (file_exists(numv_filename)) { int err = remove(numv_filename.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing file " << numv_filename << ", " << strerror(errno) << std::endl; } /* Degree file */ std::string deg_filename = filename_degree_data(base_filename); if (file_exists(deg_filename)) { int err = remove(deg_filename.c_str()); if (err != 0) logstream(LOG_ERROR) << "Error removing file " << deg_filename << ", " << strerror(errno) << std::endl; } }