// Background sender thread: periodically scans the pending-send queue.
// Messages older than 1 second are retransmitted over UDP; messages still
// unacknowledged after 10 seconds are removed from the queue, sent once via
// UDT (reliable transport), and freed. (This is the Windows branch of the
// thread-entry signature; the POSIX branch is declared in the preceding
// #ifndef block outside this view.)
DWORD WINAPI CGMP::sndHandler(LPVOID s)
#endif
{
   CGMP* self = (CGMP*)s;

   while (!self->m_bClosed)
   {
      // Wait up to 1 second, or until new messages are queued (condition
      // variable signaled by the enqueuing side).
      #ifndef WIN32
         timespec timeout;
         timeval now;
         gettimeofday(&now, 0);
         timeout.tv_sec = now.tv_sec + 1;
         timeout.tv_nsec = now.tv_usec * 1000;
         pthread_mutex_lock(&self->m_SndQueueLock);
         pthread_cond_timedwait(&self->m_SndQueueCond, &self->m_SndQueueLock, &timeout);
         pthread_mutex_unlock(&self->m_SndQueueLock);
      #else
         WaitForSingleObject(self->m_SndQueueCond, 1000);
      #endif

      // Records that timed out on UDP and must be re-sent via UDT; collected
      // under the lock, sent after the lock is released (UDT send may block).
      vector<CMsgRecord*> udtsend;
      udtsend.clear();

      CGuard::enterCS(self->m_SndQueueLock);

      int64_t ts = CTimer::getTime();  // current time in microseconds
      for (list<CMsgRecord*>::iterator i = self->m_lSndQueue.begin(); i != self->m_lSndQueue.end();)
      {
         int64_t diff = ts - (*i)->m_llTimeStamp;

         if (diff > 10 * 1000000)
         {
            // timeout, send with UDT...
            // Unlink the record here; the actual send happens below, outside
            // the critical section.
            list<CMsgRecord*>::iterator j = i;
            i ++;
            udtsend.push_back(*j);
            self->m_lSndQueue.erase(j);
            continue;
         }
         else if (diff > 1000000)
            // Older than 1 second: retransmit over UDP but keep it queued
            // (it is resent on every scan until acknowledged or expired).
            self->UDPsend((*i)->m_strIP.c_str(), (*i)->m_iPort, (*i)->m_pMsg);

         // check next msg
         ++ i;
      }
      CGuard::leaveCS(self->m_SndQueueLock);

      // Final attempt over UDT, then free the records.
      // NOTE(review): a UDTsend failure is ignored here, so the message is
      // silently dropped in that case.
      for (vector<CMsgRecord*>::iterator i = udtsend.begin(); i != udtsend.end(); ++ i)
      {
         self->UDTsend((*i)->m_strIP.c_str(), (*i)->m_iPort, (*i)->m_pMsg);
         delete (*i)->m_pMsg;
         delete (*i);
      }
      udtsend.clear();
   }

   return NULL;
}
DWORD WINAPI CGMP::udtRcvHandler(LPVOID s) #endif { CGMP* self = (CGMP*)s; int32_t header[CGMPMessage::g_iHdrField]; while (!self->m_bClosed) { //TODO: use timeout. set<UDTSOCKET> readfds; UDT::epoll_wait(self->m_iUDTEPollID, &readfds, NULL, -1); for (set<UDTSOCKET>::iterator i = readfds.begin(); i != readfds.end(); ++ i) { // TODO: This code is dangerous. We should either put all data all togeter into one recv(), // or make this asynchronous using a state machine. int port; if (self->UDTRecv(*i, (char*)&port, 4) < 0) continue; // recv "header" information if (self->UDTRecv(*i, (char*)header, CGMPMessage::g_iHdrSize) < 0) continue; // TODO: this may be retrieved from UDT connection cache as well. sockaddr_in addr; int addrlen = sizeof(sockaddr_in); UDT::getpeername(*i, (sockaddr*)&addr, &addrlen); char peer_ip[NI_MAXHOST]; char peer_udt_port[NI_MAXSERV]; getnameinfo((sockaddr*)&addr, addrlen, peer_ip, sizeof(peer_ip), peer_udt_port, sizeof(peer_udt_port), NI_NUMERICHOST|NI_NUMERICSERV); CMsgRecord* rec = new CMsgRecord; rec->m_strIP = peer_ip; rec->m_iPort = port; rec->m_pMsg = new CGMPMessage; //rec->m_pMsg->m_iType = type; rec->m_pMsg->m_iSession = header[1]; rec->m_pMsg->m_iSrcChn = header[2]; rec->m_pMsg->m_iDstChn = header[3]; rec->m_pMsg->m_iID = header[4]; rec->m_pMsg->m_iInfo = header[5]; cout << "udt recv " << peer_ip << " " << port << endl; // recv parameter size if (self->UDTRecv(*i, (char*)&(rec->m_pMsg->m_iLength), 4) < 0) { delete rec->m_pMsg; delete rec; continue; } rec->m_pMsg->m_pcData = new char[rec->m_pMsg->m_iLength]; if (self->UDTRecv(*i, rec->m_pMsg->m_pcData, rec->m_pMsg->m_iLength) < 0) { delete rec->m_pMsg; delete rec; continue; } CChannelRec* chn = self->getChnHandle(rec->m_pMsg->m_iDstChn); if (chn == NULL) { delete rec->m_pMsg; delete rec; continue; } if (self->m_pPeerMgmt->hit(rec->m_strIP, rec->m_iPort, rec->m_pMsg->m_iSession, rec->m_pMsg->m_iID)) continue; self->m_pPeerMgmt->insert(rec->m_strIP, rec->m_iPort, rec->m_pMsg->m_iSession, 
rec->m_pMsg->m_iID); int tmp; self->storeMsg(rec->m_pMsg->m_iInfo, chn, rec, tmp); } } return NULL; }
DWORD WINAPI CGMP::sndHandler(LPVOID s) #endif { CGMP* self = (CGMP*)s; while (!self->m_bClosed) { #ifndef WIN32 timespec timeout; timeval now; gettimeofday(&now, 0); timeout.tv_sec = now.tv_sec + 1; timeout.tv_nsec = now.tv_usec * 1000; pthread_mutex_lock(&self->m_SndQueueLock); pthread_cond_timedwait(&self->m_SndQueueCond, &self->m_SndQueueLock, &timeout); pthread_mutex_unlock(&self->m_SndQueueLock); #else WaitForSingleObject(self->m_SndQueueCond, 1000); #endif list<list<CMsgRecord*>::iterator> udtsend; udtsend.clear(); CGuard::enterCS(self->m_SndQueueLock); int64_t ts = CTimer::getTime(); for (list<CMsgRecord*>::iterator i = self->m_lSndQueue.begin(); i != self->m_lSndQueue.end(); ++i) { if ((*i)->m_pMsg->m_iLength > m_iMaxUDPMsgSize) { // Send large message using UDT. udtsend.push_back(i); continue; } int64_t diff = ts - (*i)->m_llTimeStamp; if ((diff > 10 * 1000000) && ((*i)->m_pMsg->m_piHeader[0] == 0)) { // timeout, send with UDT... udtsend.push_back(i); //TODO: should probably drop this msg instead of send using UDT continue; } else if (diff > 1000000) { // Don't send out UDP packets too often. self->UDPsend((*i)->m_strIP.c_str(), (*i)->m_iPort, (*i)->m_pMsg); } } CGuard::leaveCS(self->m_SndQueueLock); //Use UDT to send large & undelivered messages. for (list<list<CMsgRecord*>::iterator>::iterator i = udtsend.begin(); i != udtsend.end(); ++ i) { // TODO: erase this msg if send failure caused by connection problem. if (self->UDTsend((**i)->m_strIP.c_str(), (**i)->m_iPort, (**i)->m_pMsg) >= 0) { CGuard::enterCS(self->m_SndQueueLock); delete (**i)->m_pMsg; delete (**i); self->m_lSndQueue.erase(*i); CGuard::leaveCS(self->m_SndQueueLock); } } } return NULL; }
// Control thread for one SPE (Sphere Processing Engine) bucket-shuffling
// task. It receives per-bucket announcements from SPEs over GMP, queues them
// for the companion SPEShufflerEx thread (which moves the actual data), and
// applies back-pressure when too much data is pending. On completion it
// reports the result to the client and the master. (Windows branch of the
// thread-entry signature; the POSIX branch is declared in the preceding
// #ifndef block outside this view.)
DWORD WINAPI Slave::SPEShuffler(LPVOID p)
#endif
{
   // Unpack the heap-allocated parameter record prepared by processDCCmd.
   Slave* self = ((Param5*)p)->serv_instance;
   int transid = ((Param5*)p)->transid;
   string client_ip = ((Param5*)p)->client_ip;
   int client_port = ((Param5*)p)->client_ctrl_port;
   int client_data_port = ((Param5*)p)->client_data_port;
   string path = ((Param5*)p)->path;
   string localfile = ((Param5*)p)->filename;
   int bucketnum = ((Param5*)p)->bucketnum;
   CGMP* gmp = ((Param5*)p)->gmp;
   string function = ((Param5*)p)->function;
   int bucketid = ((Param5*)p)->bucketid;
   const int key = ((Param5*)p)->key;
   const int type = ((Param5*)p)->type;
   string master_ip = ((Param5*)p)->master_ip;
   int master_port = ((Param5*)p)->master_port;

   // Work queue and synchronization shared with SPEShufflerEx (via p).
   queue<Bucket>* bq = NULL;
   CMutex* bqlock = NULL;
   CCond* bqcond = NULL;
   int64_t* pendingSize = NULL;  // bytes queued but not yet consumed by Ex
   pthread_t shufflerex;

   bool init_success = true;

   //set up data connection, for keep-alive purpose
   if (self->m_DataChn.connect(client_ip, client_data_port) < 0)
   {
      init_success = false;
   }
   else
   {
      // read library files for MapReduce, no need for Sphere UDF
      if (type == 1)
         self->acceptLibrary(key, client_ip, client_data_port, transid);

      bq = new queue<Bucket>;
      bqlock = new CMutex;
      bqcond = new CCond;
      pendingSize = new int64_t;
      *pendingSize = 0;
      // Publish the shared state through the parameter record before
      // starting the worker thread.
      ((Param5*)p)->bq = bq;
      ((Param5*)p)->bqlock = bqlock;
      ((Param5*)p)->bqcond = bqcond;
      ((Param5*)p)->pending = pendingSize;

      #ifndef WIN32
         pthread_create(&shufflerex, NULL, SPEShufflerEx, p);
      #else
         DWORD ThreadID;
         shufflerex = CreateThread(NULL, 0, SPEShufflerEx, p, NULL, &ThreadID);
      #endif

      self->m_SectorLog << LogStart(LogLevel::SCREEN) << "SPE Shuffler " << path << " " << localfile << " " << bucketnum << LogEnd();
   }

   // Main control loop: one iteration per GMP message from an SPE (or from
   // the client releasing the task).
   while (init_success)
   {
      string speip;
      int speport;
      SectorMsg msg;
      int msgid;
      int r = gmp->recvfrom(speip, speport, msgid, &msg, false);

      // client releases the task or client has already been shutdown
      if (((r > 0) && (speip == client_ip) && (speport == client_port)) || ((r < 0) && (!self->m_DataChn.isConnected(client_ip, client_data_port))))
      {
         // Push a sentinel bucket (totalnum == -1) so SPEShufflerEx exits.
         Bucket b;
         b.totalnum = -1;
         b.totalsize = 0;
         bqlock->acquire();
         bq->push(b);
         bqcond->signal();
         bqlock->release();
         break;
      }

      if (r < 0)
         continue;

      if (*pendingSize > 256000000)
      {
         // too many incoming results, ask the sender to wait
         // the receiver buffer size threshold is set to 256MB. This prevents the shuffler from being overflowed
         // it also helps direct the traffic to less congested shuffler and leads to better load balance
         // A negated message type tells the SPE to back off and retry.
         msg.setType(-msg.getType());
         gmp->sendto(speip, speport, msgid, &msg);
      }
      else
      {
         // Decode the bucket announcement: [0]=src data port, [4]=session,
         // [8]=total records, [12]=total bytes.
         Bucket b;
         b.totalnum = *(int32_t*)(msg.getData() + 8);;
         b.totalsize = *(int32_t*)(msg.getData() + 12);
         b.src_ip = speip;
         b.src_dataport = *(int32_t*)msg.getData();
         b.session = *(int32_t*)(msg.getData() + 4);
         // Acknowledge acceptance back to the SPE.
         gmp->sendto(speip, speport, msgid, &msg);

         // Ensure a data connection to the source before Ex needs it.
         if (!self->m_DataChn.isConnected(speip, b.src_dataport))
            self->m_DataChn.connect(speip, b.src_dataport);

         bqlock->acquire();
         bq->push(b);
         *pendingSize += b.totalsize;
         bqcond->signal();
         bqlock->release();
      }
   }

   if (init_success)
   {
      // Wait for the worker to drain the queue and exit, then tear down the
      // shared state. NOTE(review): bq itself is never deleted here even
      // though bqlock/bqcond/pendingSize are — looks like a leak unless
      // SPEShufflerEx frees it; verify ownership and add `delete bq;` if not.
      #ifndef WIN32
         pthread_join(shufflerex, NULL);
      #else
         WaitForSingleObject(shufflerex, INFINITE);
      #endif
      delete bqlock;
      delete bqcond;
      delete pendingSize;

      // Report 100% completion of this bucket to the client.
      SectorMsg msg;
      msg.setType(1); // success, return result
      msg.setData(0, (char*)&(bucketid), 4);
      int progress = 100;
      msg.setData(4, (char*)&progress, 4);
      msg.m_iDataLength = SectorMsg::m_iHdrSize + 8;
      int id = 0;
      self->m_GMP.sendto(client_ip.c_str(), client_port, id, &msg);

      self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "bucket completed 100 " << client_ip << " " << client_port << LogEnd();
   }

   gmp->close();
   delete gmp;

   // Notify the master that this Sphere task has finished.
   self->reportSphere(master_ip, master_port, transid);

   // clear this transaction
   self->m_TransManager.updateSlave(transid, self->m_iSlaveID);

   return NULL;
}
// Older (pthread-only) variant of the SPE bucket-shuffler control thread.
// Receives per-bucket announcements from SPEs over GMP, queues them for the
// detached SPEShufflerEx worker, and applies 256MB back-pressure. Unlike the
// newer variant it detaches the worker instead of joining it, so ownership
// of the heap-allocated shared state (bq, bqlock, bqcond, pendingSize, and p)
// presumably transfers to SPEShufflerEx — TODO confirm; nothing here frees it.
void* Slave::SPEShuffler(void* p)
{
   // Unpack the heap-allocated parameter record.
   Slave* self = ((Param5*)p)->serv_instance;
   string client_ip = ((Param5*)p)->client_ip;
   int client_port = ((Param5*)p)->client_ctrl_port;
   int client_data_port = ((Param5*)p)->client_data_port;
   string path = ((Param5*)p)->path;
   string localfile = ((Param5*)p)->filename;
   // int bucketnum = ((Param5*)p)->bucketnum;
   CGMP* gmp = ((Param5*)p)->gmp;
   string function = ((Param5*)p)->function;

   //set up data connection, for keep-alive purpose
   // NOTE(review): on failure, gmp and p are not released on this early
   // return — verify whether the caller cleans them up.
   if (self->m_DataChn.connect(client_ip, client_data_port) < 0)
      return NULL;

   // Work queue and synchronization shared with SPEShufflerEx (via p).
   queue<Bucket>* bq = new queue<Bucket>;
   pthread_mutex_t* bqlock = new pthread_mutex_t;
   pthread_mutex_init(bqlock, NULL);
   pthread_cond_t* bqcond = new pthread_cond_t;
   pthread_cond_init(bqcond, NULL);
   int64_t* pendingSize = new int64_t;  // bytes queued but not yet consumed
   *pendingSize = 0;
   ((Param5*)p)->bq = bq;
   ((Param5*)p)->bqlock = bqlock;
   ((Param5*)p)->bqcond = bqcond;
   ((Param5*)p)->pending = pendingSize;

   // Start the data-moving worker; it runs detached.
   pthread_t ex;
   pthread_create(&ex, NULL, SPEShufflerEx, p);
   pthread_detach(ex);

   // cout << "SPE Shuffler " << path << " " << localfile << " " << bucketnum << endl;

   // Main control loop: one iteration per GMP message from an SPE (or from
   // the client releasing the task).
   while (true)
   {
      string speip;
      int speport;
      SectorMsg msg;
      int msgid;
      int r = gmp->recvfrom(speip, speport, msgid, &msg, false);

      // client releases the task or client has already been shutdown
      if (((r > 0) && (speip == client_ip) && (speport == client_port)) || ((r < 0) && (!self->m_DataChn.isConnected(client_ip, client_data_port))))
      {
         // Push a sentinel bucket (totalnum == -1) so SPEShufflerEx exits.
         Bucket b;
         b.totalnum = -1;
         b.totalsize = 0;
         pthread_mutex_lock(bqlock);
         bq->push(b);
         pthread_cond_signal(bqcond);
         pthread_mutex_unlock(bqlock);
         break;
      }

      if (r < 0)
         continue;

      if (*pendingSize > 256000000)
      {
         // too many incoming results, ask the sender to wait
         // the receiver buffer size threshold is set to 256MB. This prevents the shuffler from being overflowed
         // it also helps direct the traffic to less congested shuffler and leads to better load balance
         // A negated message type tells the SPE to back off and retry.
         msg.setType(-msg.getType());
         gmp->sendto(speip, speport, msgid, &msg);
      }
      else
      {
         // Decode the bucket announcement: [0]=src data port, [4]=session,
         // [8]=total records, [12]=total bytes.
         Bucket b;
         b.totalnum = *(int32_t*)(msg.getData() + 8);;
         b.totalsize = *(int32_t*)(msg.getData() + 12);
         b.src_ip = speip;
         b.src_dataport = *(int32_t*)msg.getData();
         b.session = *(int32_t*)(msg.getData() + 4);
         // Acknowledge acceptance back to the SPE.
         gmp->sendto(speip, speport, msgid, &msg);

         // Ensure a data connection to the source before Ex needs it.
         if (!self->m_DataChn.isConnected(speip, b.src_dataport))
            self->m_DataChn.connect(speip, b.src_dataport);

         pthread_mutex_lock(bqlock);
         bq->push(b);
         *pendingSize += b.totalsize;
         pthread_cond_signal(bqcond);
         pthread_mutex_unlock(bqlock);
      }
   }

   gmp->close();
   delete gmp;

   return NULL;
}
// Dispatch a data-processing control command received from the master.
// Type 203 starts an SPE (processing engine) handler thread; type 204 starts
// an SPE bucket-shuffler thread. Both parse a packed wire format out of
// msg->getData() at fixed offsets, spawn a detached thread owning a
// heap-allocated Param record, and reply to the sender.
// Returns 0 on success, -1 for an unknown command type.
int Slave::processDCCmd(const string& ip, const int port, int id, SectorMsg* msg)
{
   printf ("~~~> processDCCmd: %d\n", msg->getType());
   switch (msg->getType())
   {
   case 203: // processing engine
   {
      // Wire layout: [0..63] client IP string, [64] ctrl port, [68] data
      // port, [72] SPE id, [76] key, [80..] NUL-terminated function name,
      // then rows, param size, optional param bytes; the last 8 bytes of the
      // payload carry type and transid.
      Param4* p = new Param4;
      p->serv_instance = this;
      p->client_ip = msg->getData();
      p->client_ctrl_port = *(int32_t*)(msg->getData() + 64);
      p->client_data_port = *(int32_t*)(msg->getData() + 68);
      p->speid = *(int32_t*)(msg->getData() + 72);
      p->key = *(int32_t*)(msg->getData() + 76);
      p->function = msg->getData() + 80;
      int offset = 80 + p->function.length() + 1;
      p->rows = *(int32_t*)(msg->getData() + offset);
      p->psize = *(int32_t*)(msg->getData() + offset + 4);
      if (p->psize > 0)
      {
         // Copy the opaque UDF parameter blob; freed by the handler thread
         // (presumably — ownership of p transfers with the thread).
         p->param = new char[p->psize];
         memcpy(p->param, msg->getData() + offset + 8, p->psize);
      }
      else
         p->param = NULL;
      p->type = *(int32_t*)(msg->getData() + msg->m_iDataLength - SectorMsg::m_iHdrSize - 8);
      p->transid = *(int32_t*)(msg->getData() + msg->m_iDataLength - SectorMsg::m_iHdrSize - 4);
      p->master_ip = ip;
      p->master_port = port;

      cout << "starting SPE ... " << p->speid << " " << p->client_data_port << " " << p->function << " " << p->transid << endl;

      // Log the startup; 64 bytes covers the fixed text plus three ints,
      // the function name length is added explicitly.
      char* tmp = new char[64 + p->function.length()];
      sprintf(tmp, "starting SPE ... %d %d %s %d.", p->speid, p->client_data_port, p->function.c_str(), p->transid);
      m_SectorLog.insert(tmp);
      delete [] tmp;

      // Run the SPE handler on its own detached thread; p is handed off.
      #ifndef WIN32 // <slr>
         pthread_t spe_handler;
         pthread_create(&spe_handler, NULL, SPEHandler, p);
         pthread_detach(spe_handler);
      #else
         unsigned int ThreadID;
         HANDLE spe_handler = (HANDLE)_beginthreadex(NULL, 0, SPEHandler, p, NULL, &ThreadID);
         if (spe_handler)
            CloseHandle(spe_handler);
      #endif

      // Acknowledge the command with an empty payload.
      msg->m_iDataLength = SectorMsg::m_iHdrSize;
      m_GMP.sendto(ip, port, id, msg);

      break;
   }

   case 204: // accept SPE buckets
   {
      // Dedicated GMP instance for the shuffler; its port is returned to the
      // sender in the reply below.
      CGMP* gmp = new CGMP;
      gmp->init();

      // Wire layout: [0..63] client IP, [64] ctrl port, [68] bucket count,
      // [72] bucket id, [80..] NUL-terminated path, +4 skip, then filename,
      // key, type, optional function; the last 8 bytes carry transid and
      // client data port.
      Param5* p = new Param5;
      p->serv_instance = this;
      p->client_ip = msg->getData();
      p->client_ctrl_port = *(int32_t*)(msg->getData() + 64);
      p->bucketnum = *(int32_t*)(msg->getData() + 68);
      p->bucketid = *(int32_t*)(msg->getData() + 72);
      p->path = msg->getData() + 80;
      int offset = 80 + p->path.length() + 1 + 4;
      p->filename = msg->getData() + offset;
      p->gmp = gmp;
      offset += p->filename.length() + 1;
      p->key = *(int32_t*)(msg->getData() + offset);
      p->type = *(int32_t*)(msg->getData() + offset + 4);
      if (p->type == 1)
      {
         // MapReduce task: a reduce function name follows key/type.
         p->function = msg->getData() + offset + 4 + 4 + 4;
      }
      p->transid = *(int32_t*)(msg->getData() + msg->m_iDataLength - SectorMsg::m_iHdrSize - 8);
      p->client_data_port = *(int32_t*)(msg->getData() + msg->m_iDataLength - SectorMsg::m_iHdrSize - 4);
      p->master_ip = ip;
      p->master_port = port;

      char* tmp = new char[64 + p->filename.length()];
      sprintf(tmp, "starting SPE Bucket... %s %d %d %d.", p->filename.c_str(), p->key, p->type, p->transid);
      m_SectorLog.insert(tmp);
      delete [] tmp;

      // Run the shuffler on its own detached thread; p and gmp are handed off.
      #ifndef WIN32 // <slr>
         pthread_t spe_shuffler;
         pthread_create(&spe_shuffler, NULL, SPEShuffler, p);
         pthread_detach(spe_shuffler);
      #else
         unsigned int ThreadID;
         HANDLE spe_shuffler = (HANDLE)_beginthreadex(NULL, 0, SPEShuffler, p, NULL, &ThreadID);
         if (spe_shuffler)
            CloseHandle(spe_shuffler);
      #endif

      // Reply with the shuffler's GMP port so SPEs know where to send
      // bucket announcements.
      *(int32_t*)msg->getData() = gmp->getPort();
      msg->m_iDataLength = SectorMsg::m_iHdrSize + 4;
      m_GMP.sendto(ip, port, id, msg);

      break;
   }

   default:
      return -1;
   }

   return 0;
}