DWORD WINAPI Slave::SPEHandler(LPVOID p) #endif { Slave* self = ((Param4*)p)->serv_instance; const string ip = ((Param4*)p)->client_ip; const int ctrlport = ((Param4*)p)->client_ctrl_port; const int dataport = ((Param4*)p)->client_data_port; const int speid = ((Param4*)p)->speid; const int transid = ((Param4*)p)->transid; const int key = ((Param4*)p)->key; const string function = ((Param4*)p)->function; const int rows = ((Param4*)p)->rows; const char* param = ((Param4*)p)->param; const int psize = ((Param4*)p)->psize; const int type = ((Param4*)p)->type; const string master_ip = ((Param4*)p)->master_ip; const int master_port = ((Param4*)p)->master_port; delete (Param4*)p; SectorMsg msg; bool init_success = true; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "SPE starts " << ip << " " << dataport << LogEnd(); if (self->m_DataChn.connect(ip, dataport) < 0) { self->m_SectorLog << LogStart(LogLevel::LEVEL_2) << "failed to connect to spe client " << ip << ":" << ctrlport << " " << function << LogEnd(); init_success = false; } self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "connected." << LogEnd(); // read outupt parameters int buckets = 0; if (self->m_DataChn.recv4(ip, dataport, transid, buckets) < 0) init_success = false; SPEDestination dest; if (buckets > 0) { if (self->m_DataChn.recv4(ip, dataport, transid, dest.m_iLocNum) < 0) init_success = false; int len = dest.m_iLocNum * 80; if (self->m_DataChn.recv(ip, dataport, transid, dest.m_pcOutputLoc, len) < 0) init_success = false; len = buckets * 4; if (self->m_DataChn.recv(ip, dataport, transid, (char*&)dest.m_piLocID, len) < 0) init_success = false; } else if (buckets < 0) { int32_t len = 0; if (self->m_DataChn.recv(ip, dataport, transid, dest.m_pcOutputLoc, len) < 0) init_success = false; dest.m_strLocalFile = dest.m_pcOutputLoc; } dest.init(buckets); // initialize processing function self->acceptLibrary(key, ip, dataport, transid); SPHERE_PROCESS process = NULL; MR_MAP map = NULL; MR_PARTITION partition = NULL; void* lh = NULL; self->openLibrary(key, function, lh); if (NULL == lh) { self->m_SectorLog << LogStart(LogLevel::LEVEL_2) << "failed to open SPE library " << ip << ":" << ctrlport << " " << function << LogEnd(); init_success = false; } if (type == 0) { if (self->getSphereFunc(lh, function, process) < 0) init_success = false; } else if (type == 1) { if (self->getMapFunc(lh, function, map, partition) < 0) init_success = false; } else { init_success = false; } timeval t1, t2, t3, t4; gettimeofday(&t1, 0); msg.setType(1); // success, return result msg.setData(0, (char*)&(speid), 4); SPEResult result; result.init(buckets); // processing... while (init_success) { char* dataseg = NULL; int size = 0; if (self->m_DataChn.recv(ip, dataport, transid, dataseg, size) < 0) break; // client request to close this SPE if (size < 20) break; // read data segment parameters int64_t offset = *(int64_t*)(dataseg); int64_t totalrows = *(int64_t*)(dataseg + 8); int32_t dsid = *(int32_t*)(dataseg + 16); string datafile = dataseg + 20; sprintf(dest.m_pcLocalFileID, ".%d", dsid); delete [] dataseg; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "new job " << datafile << " " << offset << " " << totalrows << LogEnd(); int64_t* index = NULL; if ((totalrows > 0) && (rows != 0)) index = new int64_t[totalrows + 1]; char* block = NULL; int unitrows = (rows != -1) ? rows : totalrows; int progress = 0; // read data if (0 != rows) { size = 0; if (self->SPEReadData(datafile, offset, size, index, totalrows, block) <= 0) { delete [] index; delete [] block; progress = SectorError::E_SPEREAD; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); continue; } } else { // store file name in "process" parameter block = new char[datafile.length() + 1]; strcpy(block, datafile.c_str()); size = datafile.length() + 1; totalrows = 0; } SInput input; input.m_pcUnit = NULL; input.m_pcParam = (char*)param; input.m_iPSize = psize; SOutput output; output.m_iBufSize = (size < 64000000) ? 64000000 : size; output.m_pcResult = new char[output.m_iBufSize]; output.m_iIndSize = (totalrows < 640000) ? 640000 : totalrows + 2; output.m_pllIndex = new int64_t[output.m_iIndSize]; output.m_piBucketID = new int[output.m_iIndSize]; SFile file; file.m_strHomeDir = self->m_strHomeDir; char path[64]; sprintf(path, "%d", key); file.m_strLibDir = self->m_strHomeDir + ".sphere/" + path + "/"; file.m_strTempDir = self->m_strHomeDir + ".tmp/"; file.m_iSlaveID = self->m_iSlaveID; file.m_pInMemoryObjects = &self->m_InMemoryObjects; result.clear(); gettimeofday(&t3, 0); int deliverystatus = 0; int processstatus = 0; // process data segments for (int i = 0; i < totalrows; i += unitrows) { if (unitrows > totalrows - i) unitrows = totalrows - i; input.m_pcUnit = block + index[i] - index[0]; input.m_iRows = unitrows; input.m_pllIndex = index + i; output.m_iResSize = 0; output.m_iRows = 0; output.m_strError = ""; processstatus = self->processData(input, output, file, result, buckets, process, map, partition); if (processstatus < 0) { progress = SectorError::E_SPEPROC; break; } timeval t; gettimeofday(&t, NULL); unsigned int seed = t.tv_sec * 1000000 + t.tv_usec; srand(seed); int ds_thresh = 32000000 * ((rand() % 7) + 1); if ((result.m_llTotalDataSize >= ds_thresh) && (buckets != 0)) deliverystatus = self->deliverResult(buckets, result, dest); if (deliverystatus < 0) { progress = SectorError::E_SPEWRITE; break; } gettimeofday(&t4, 0); if (t4.tv_sec - t3.tv_sec > 1) { progress = i * 100 / totalrows; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); t3 = t4; } } // process files if (0 == unitrows) { SNode s; LocalFS::stat(self->m_strHomeDir + datafile, s); int64_t filesize = s.m_llSize; input.m_pcUnit = block; input.m_iRows = -1; input.m_pllIndex = NULL; output.m_llOffset = 0; for (int i = 0; (i == 0) || (output.m_llOffset > 0); ++ i) { // re-initialize output everytime UDF is called, except for offset output.m_iResSize = 0; output.m_iRows = 0; output.m_strError = ""; processstatus = self->processData(input, output, file, result, buckets, process, map, partition); if (processstatus < 0) { progress = SectorError::E_SPEPROC; break; } timeval t; gettimeofday(&t, NULL); unsigned int seed = t.tv_sec * 1000000 + t.tv_usec; srand(seed); int ds_thresh = 32000000 * ((rand() % 7) + 1); if ((result.m_llTotalDataSize >= ds_thresh) && (buckets != 0)) deliverystatus = self->deliverResult(buckets, result, dest); if (deliverystatus < 0) { progress = SectorError::E_SPEWRITE; break; } if (output.m_llOffset > 0) { progress = output.m_llOffset * 100LL / filesize; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); } } } // if buckets = 0, send back to clients, otherwise deliver to local or network locations if ((buckets != 0) && (progress >= 0)) deliverystatus = self->deliverResult(buckets, result, dest); if (deliverystatus < 0) progress = SectorError::E_SPEWRITE; else progress = 100; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "SPE completed " << progress << " " << ip << " " << ctrlport << LogEnd(); msg.setData(4, (char*)&progress, 4); if (100 == progress) { msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); self->sendResultToClient(buckets, dest.m_piSArray, dest.m_piRArray, result, ip, dataport, transid); dest.reset(buckets); // report new files vector<string> filelist; for (set<string>::iterator i = file.m_sstrFiles.begin(); i != file.m_sstrFiles.end(); ++ i) filelist.push_back(*i); self->report(master_ip, master_port, transid, filelist, +FileChangeType::FILE_UPDATE_NEW); self->reportMO(master_ip, master_port, transid); } else { msg.setData(8, (char*)&processstatus, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 12; if (output.m_strError.length() > 0) msg.setData(12, output.m_strError.c_str(), output.m_strError.length() + 1); else if (deliverystatus < 0) { string tmp = "System Error: data transfer to buckets failed."; msg.setData(12, tmp.c_str(), tmp.length() + 1); } int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); } delete [] index; delete [] block; delete [] output.m_pcResult; delete [] output.m_pllIndex; delete [] output.m_piBucketID; index = NULL; block = NULL; } gettimeofday(&t2, 0); int duration = t2.tv_sec - t1.tv_sec; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "comp server closed " << ip << " " << ctrlport << " " << duration << LogEnd(); delete [] param; vector<Address> bad; if (init_success) { self->closeLibrary(lh); multimap<int64_t, Address> sndspd; for (int i = 0; i < dest.m_iLocNum; ++ i) { Address addr; addr.m_strIP = dest.m_pcOutputLoc + i * 80; addr.m_iPort = *(int32_t*)(dest.m_pcOutputLoc + i * 80 + 64); int dataport = *(int32_t*)(dest.m_pcOutputLoc + i * 80 + 68); int64_t spd = self->m_DataChn.getRealSndSpeed(addr.m_strIP, dataport); if (spd > 0) sndspd.insert(pair<int64_t, Address>(spd, addr)); } vector<Address> bad; self->checkBadDest(sndspd, bad); } else { // this SPE failed to initialize. send the error to the client int progress = SectorError::E_SPEUDF; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); } self->reportSphere(master_ip, master_port, transid, &bad); // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); return NULL; }
DWORD WINAPI Slave::SPEShuffler(LPVOID p) #endif { Slave* self = ((Param5*)p)->serv_instance; int transid = ((Param5*)p)->transid; string client_ip = ((Param5*)p)->client_ip; int client_port = ((Param5*)p)->client_ctrl_port; int client_data_port = ((Param5*)p)->client_data_port; string path = ((Param5*)p)->path; string localfile = ((Param5*)p)->filename; int bucketnum = ((Param5*)p)->bucketnum; CGMP* gmp = ((Param5*)p)->gmp; string function = ((Param5*)p)->function; int bucketid = ((Param5*)p)->bucketid; const int key = ((Param5*)p)->key; const int type = ((Param5*)p)->type; string master_ip = ((Param5*)p)->master_ip; int master_port = ((Param5*)p)->master_port; queue<Bucket>* bq = NULL; CMutex* bqlock = NULL; CCond* bqcond = NULL; int64_t* pendingSize = NULL; pthread_t shufflerex; bool init_success = true; //set up data connection, for keep-alive purpose if (self->m_DataChn.connect(client_ip, client_data_port) < 0) { init_success = false; } else { // read library files for MapReduce, no need for Sphere UDF if (type == 1) self->acceptLibrary(key, client_ip, client_data_port, transid); bq = new queue<Bucket>; bqlock = new CMutex; bqcond = new CCond; pendingSize = new int64_t; *pendingSize = 0; ((Param5*)p)->bq = bq; ((Param5*)p)->bqlock = bqlock; ((Param5*)p)->bqcond = bqcond; ((Param5*)p)->pending = pendingSize; #ifndef WIN32 pthread_create(&shufflerex, NULL, SPEShufflerEx, p); #else DWORD ThreadID; shufflerex = CreateThread(NULL, 0, SPEShufflerEx, p, NULL, &ThreadID); #endif self->m_SectorLog << LogStart(LogLevel::SCREEN) << "SPE Shuffler " << path << " " << localfile << " " << bucketnum << LogEnd(); } while (init_success) { string speip; int speport; SectorMsg msg; int msgid; int r = gmp->recvfrom(speip, speport, msgid, &msg, false); // client releases the task or client has already been shutdown if (((r > 0) && (speip == client_ip) && (speport == client_port)) || ((r < 0) && (!self->m_DataChn.isConnected(client_ip, client_data_port)))) { Bucket b; b.totalnum = -1; b.totalsize = 0; bqlock->acquire(); bq->push(b); bqcond->signal(); bqlock->release(); break; } if (r < 0) continue; if (*pendingSize > 256000000) { // too many incoming results, ask the sender to wait // the receiver buffer size threshold is set to 256MB. This prevents the shuffler from being overflowed // it also helps direct the traffic to less congested shuffler and leads to better load balance msg.setType(-msg.getType()); gmp->sendto(speip, speport, msgid, &msg); } else { Bucket b; b.totalnum = *(int32_t*)(msg.getData() + 8);; b.totalsize = *(int32_t*)(msg.getData() + 12); b.src_ip = speip; b.src_dataport = *(int32_t*)msg.getData(); b.session = *(int32_t*)(msg.getData() + 4); gmp->sendto(speip, speport, msgid, &msg); if (!self->m_DataChn.isConnected(speip, b.src_dataport)) self->m_DataChn.connect(speip, b.src_dataport); bqlock->acquire(); bq->push(b); *pendingSize += b.totalsize; bqcond->signal(); bqlock->release(); } } if (init_success) { #ifndef WIN32 pthread_join(shufflerex, NULL); #else WaitForSingleObject(shufflerex, INFINITE); #endif delete bqlock; delete bqcond; delete pendingSize; SectorMsg msg; msg.setType(1); // success, return result msg.setData(0, (char*)&(bucketid), 4); int progress = 100; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(client_ip.c_str(), client_port, id, &msg); self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "bucket completed 100 " << client_ip << " " << client_port << LogEnd(); } gmp->close(); delete gmp; self->reportSphere(master_ip, master_port, transid); // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); return NULL; }
void* Slave::SPEShufflerEx(void* p) { Slave* self = ((Param5*)p)->serv_instance; int transid = ((Param5*)p)->transid; string client_ip = ((Param5*)p)->client_ip; int client_port = ((Param5*)p)->client_ctrl_port; int client_data_port = ((Param5*)p)->client_data_port; string path = ((Param5*)p)->path; string localfile = ((Param5*)p)->filename; int bucketnum = ((Param5*)p)->bucketnum; int bucketid = ((Param5*)p)->bucketid; const int key = ((Param5*)p)->key; const int type = ((Param5*)p)->type; string function = ((Param5*)p)->function; queue<Bucket>* bq = ((Param5*)p)->bq; pthread_mutex_t* bqlock = ((Param5*)p)->bqlock; pthread_cond_t* bqcond = ((Param5*)p)->bqcond; int64_t* pendingSize = ((Param5*)p)->pending; string master_ip = ((Param5*)p)->master_ip; int master_port = ((Param5*)p)->master_port; delete (Param5*)p; self->createDir(path); // remove old result data files for (int i = 0; i < bucketnum; ++ i) { char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), i); unlink(tmp); sprintf(tmp, "%s.%d.idx", (self->m_strHomeDir + path + "/" + localfile).c_str(), i); unlink(tmp); delete [] tmp; } // index file initial offset vector<int64_t> offset; offset.resize(bucketnum); for (vector<int64_t>::iterator i = offset.begin(); i != offset.end(); ++ i) *i = 0; set<int> fileid; while (true) { pthread_mutex_lock(bqlock); while (bq->empty()) pthread_cond_wait(bqcond, bqlock); Bucket b = bq->front(); bq->pop(); *pendingSize -= b.totalsize; pthread_mutex_unlock(bqlock); if (b.totalnum == -1) break; string speip = b.src_ip; int dataport = b.src_dataport; int session = b.session; for (int i = 0; i < b.totalnum; ++ i) { int bucket = 0; if (self->m_DataChn.recv4(speip, dataport, session, bucket) < 0) continue; fileid.insert(bucket); char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), bucket); fstream datafile(tmp, ios::out | ios::binary | ios::app); sprintf(tmp, "%s.%d.idx", (self->m_strHomeDir + path + "/" + localfile).c_str(), bucket); fstream indexfile(tmp, ios::out | ios::binary | ios::app); delete [] tmp; int64_t start = offset[bucket]; if (0 == start) indexfile.write((char*)&start, 8); int32_t len; char* data = NULL; if (self->m_DataChn.recv(speip, dataport, session, data, len) < 0) continue; datafile.write(data, len); delete [] data; tmp = NULL; if (self->m_DataChn.recv(speip, dataport, session, tmp, len) < 0) continue; int64_t* index = (int64_t*)tmp; for (int j = 0; j < len / 8; ++ j) index[j] += start; offset[bucket] = index[len / 8 - 1]; indexfile.write(tmp, len); delete [] tmp; datafile.close(); indexfile.close(); } // update total received data self->m_SlaveStat.updateIO(speip, b.totalsize, 0); } pthread_mutex_destroy(bqlock); pthread_cond_destroy(bqcond); delete bqlock; delete bqcond; delete pendingSize; // sort and reduce if (type == 1) { void* lh = NULL; self->openLibrary(key, function, lh); //if (NULL == lh) // break; MR_COMPARE comp = NULL; MR_REDUCE reduce = NULL; self->getReduceFunc(lh, function, comp, reduce); if (NULL != comp) { char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; for (set<int>::iterator i = fileid.begin(); i != fileid.end(); ++ i) { sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), *i); self->sort(tmp, comp, reduce); } delete [] tmp; } self->closeLibrary(lh); } // report sphere output files char* tmp = new char[path.length() + localfile.length() + 64]; vector<string> filelist; for (set<int>::iterator i = fileid.begin(); i != fileid.end(); ++ i) { sprintf(tmp, "%s.%d", (path + "/" + localfile).c_str(), *i); filelist.push_back(tmp); sprintf(tmp, "%s.%d.idx", (path + "/" + localfile).c_str(), *i); filelist.push_back(tmp); } delete [] tmp; self->report(master_ip, master_port, transid, filelist, 1); self->reportSphere(master_ip, master_port, transid); // cout << "bucket completed 100 " << client_ip << " " << client_port << endl; SectorMsg msg; msg.setType(1); // success, return result msg.setData(0, (char*)&(bucketid), 4); int progress = 100; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(client_ip.c_str(), client_port, id, &msg); //remove this client data channel self->m_DataChn.remove(client_ip, client_data_port); return NULL; }