DWORD WINAPI Slave::copy(LPVOID p) #endif { Slave* self = ((Param3*)p)->serv_instance; int transid = ((Param3*)p)->transid; int dir = ((Param3*)p)->dir; string src = ((Param3*)p)->src; string dst = ((Param3*)p)->dst; string master_ip = ((Param3*)p)->master_ip; int master_port = ((Param3*)p)->master_port; delete (Param3*)p; if (src.c_str()[0] == '\0') src = "/" + src; if (dst.c_str()[0] == '\0') dst = "/" + dst; bool success = true; queue<string> tr; // files to be replicated queue<string> td; // directories to be explored if (dir > 0) td.push(src); else tr.push(src); while (!td.empty()) { // If the file to be replicated is a directory, recursively list all files first string src_path = td.front(); td.pop(); // try list this path SectorMsg msg; msg.setType(101); msg.setKey(0); msg.setData(0, src_path.c_str(), src_path.length() + 1); Address addr; self->m_Routing.lookup(src_path, addr); if (self->m_GMP.rpc(addr.m_strIP.c_str(), addr.m_iPort, &msg, &msg) < 0) { success = false; break; } // the master only returns positive if this is a directory if (msg.getType() >= 0) { // if this is a directory, create it, and put all files and sub-directories into the queue of files to be copied // create a local dir string dst_path = dst; if (src != src_path) dst_path += "/" + src_path.substr(src.length() + 1, src_path.length() - src.length() - 1); //create at .tmp first, then move to real location self->createDir(string(".tmp") + dst_path); string filelist = msg.getData(); unsigned int s = 0; while (s < filelist.length()) { int t = filelist.find(';', s); SNode sn; sn.deserialize(filelist.substr(s, t - s).c_str()); if (sn.m_bIsDir) td.push(src_path + "/" + sn.m_strName); else tr.push(src_path + "/" + sn.m_strName); s = t + 1; } continue; } } while (!tr.empty()) { string src_path = tr.front(); tr.pop(); SNode tmp; if (self->m_pLocalFile->lookup(src_path.c_str(), tmp) >= 0) { //if file is local, copy directly //note that in this case, src != dst, therefore this is a regular "cp" command, not a system replication //IMPORTANT!!! //local files must be read directly from local disk, and cannot be read via datachn due to its limitation string dst_path = dst; if (src != src_path) dst_path += "/" + src_path.substr(src.length() + 1, src_path.length() - src.length() - 1); //copy to .tmp first, then move to real location self->createDir(string(".tmp") + dst_path.substr(0, dst_path.rfind('/'))); LocalFS::copy(self->m_strHomeDir + src_path, self->m_strHomeDir + ".tmp" + dst_path); } else { // open the file and copy it to local SectorMsg msg; msg.setType(110); msg.setKey(0); int32_t mode = SF_MODE::READ; msg.setData(0, (char*)&mode, 4); int32_t localport = self->m_DataChn.getPort(); msg.setData(4, (char*)&localport, 4); int32_t len_name = src_path.length() + 1; msg.setData(8, (char*)&len_name, 4); msg.setData(12, src_path.c_str(), len_name); int32_t len_opt = 0; msg.setData(12 + len_name, (char*)&len_opt, 4); Address addr; self->m_Routing.lookup(src_path, addr); if ((self->m_GMP.rpc(addr.m_strIP.c_str(), addr.m_iPort, &msg, &msg) < 0) || (msg.getType() < 0)) { success = false; break; } int32_t session = *(int32_t*)msg.getData(); int64_t size = *(int64_t*)(msg.getData() + 4); time_t ts = *(int64_t*)(msg.getData() + 12); string ip = msg.getData() + 24; int32_t port = *(int32_t*)(msg.getData() + 64 + 24); if (!self->m_DataChn.isConnected(ip, port)) { if (self->m_DataChn.connect(ip, port) < 0) { success = false; break; } } // download command: 3 int32_t cmd = 3; self->m_DataChn.send(ip, port, session, (char*)&cmd, 4); int64_t offset = 0; self->m_DataChn.send(ip, port, session, (char*)&offset, 8); int response = -1; if ((self->m_DataChn.recv4(ip, port, session, response) < 0) || (-1 == response)) { success = false; break; } string dst_path = dst; if (src != src_path) dst_path += "/" + src_path.substr(src.length() + 1, src_path.length() - src.length() - 1); //copy to .tmp first, then move to real location self->createDir(string(".tmp") + dst_path.substr(0, dst_path.rfind('/'))); fstream ofs; ofs.open((self->m_strHomeDir + ".tmp" + dst_path).c_str(), ios::out | ios::binary | ios::trunc); int64_t unit = 64000000; //send 64MB each time int64_t torecv = size; int64_t recd = 0; while (torecv > 0) { int64_t block = (torecv < unit) ? torecv : unit; if (self->m_DataChn.recvfile(ip, port, session, ofs, offset + recd, block) < 0) { success = false; break; } recd += block; torecv -= block; } ofs.close(); // update total received data size self->m_SlaveStat.updateIO(ip, size, +SlaveStat::SYS_IN); cmd = 5; self->m_DataChn.send(ip, port, session, (char*)&cmd, 4); self->m_DataChn.recv4(ip, port, session, cmd); if (src == dst) { //utime: update timestamp according to the original copy, for replica only; files created by "cp" have new timestamp utimbuf ut; ut.actime = ts; ut.modtime = ts; utime((self->m_strHomeDir + ".tmp" + dst_path).c_str(), &ut); } } } if (success) { // move from temporary dir to the real dir when the copy is completed self->createDir(dst.substr(0, dst.rfind('/'))); LocalFS::rename(self->m_strHomeDir + ".tmp" + dst, self->m_strHomeDir + dst); // if the file has been modified during the replication, remove this replica int32_t type = (src == dst) ? +FileChangeType::FILE_UPDATE_REPLICA : +FileChangeType::FILE_UPDATE_NEW; if (self->report(master_ip, master_port, transid, dst, type) < 0) LocalFS::erase(self->m_strHomeDir + dst); } else { // failed, remove all temporary files LocalFS::erase(self->m_strHomeDir + ".tmp" + dst); self->report(master_ip, master_port, transid, "", +FileChangeType::FILE_UPDATE_NO); } // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); return NULL; }
DWORD WINAPI Slave::fileHandler(LPVOID p) #endif { Slave* self = ((Param2*)p)->serv_instance; string filename = self->m_strHomeDir + ((Param2*)p)->filename; string sname = ((Param2*)p)->filename; int key = ((Param2*)p)->key; int mode = ((Param2*)p)->mode; int transid = ((Param2*)p)->transid; string client_ip = ((Param2*)p)->client_ip; int client_port = ((Param2*)p)->client_port; unsigned char crypto_key[16]; unsigned char crypto_iv[8]; memcpy(crypto_key, ((Param2*)p)->crypto_key, 16); memcpy(crypto_iv, ((Param2*)p)->crypto_iv, 8); string master_ip = ((Param2*)p)->master_ip; int master_port = ((Param2*)p)->master_port; delete (Param2*)p; // uplink and downlink addresses for write, no need for read string src_ip = client_ip; int src_port = client_port; string dst_ip; int dst_port = -1; // IO permissions bool bRead = mode & 1; bool bWrite = mode & 2; bool trunc = mode & 4; bool bSecure = mode & 16; bool m_bChange = false; int last_timestamp = 0; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "connecting to " << client_ip << " " << client_port << " " << filename << LogEnd(); if ((!self->m_DataChn.isConnected(client_ip, client_port)) && (self->m_DataChn.connect(client_ip, client_port) < 0)) { self->m_SectorLog << LogStart(LogLevel::LEVEL_2) << "failed to connect to file client " << client_ip << " " << client_port << " " << filename << LogEnd(); // release transactions and file locks self->m_TransManager.updateSlave(transid, self->m_iSlaveID); self->m_pLocalFile->unlock(sname, key, mode); self->report(master_ip, master_port, transid, sname, +FileChangeType::FILE_UPDATE_NO); return NULL; } Crypto* encoder = NULL; Crypto* decoder = NULL; if (bSecure) { encoder = new Crypto; encoder->initEnc(crypto_key, crypto_iv); decoder = new Crypto; decoder->initDec(crypto_key, crypto_iv); } //create a new directory or file in case it does not exist if (bWrite) { self->createDir(sname.substr(0, sname.rfind('/'))); SNode s; if (LocalFS::stat(filename, s) < 0) { ofstream newfile(filename.c_str(), ios::out | ios::binary | ios::trunc); newfile.close(); } } timeval t1, t2; gettimeofday(&t1, 0); int64_t rb = 0; int64_t wb = 0; WriteLog writelog; fstream fhandle; if (!trunc) fhandle.open(filename.c_str(), ios::in | ios::out | ios::binary); else fhandle.open(filename.c_str(), ios::in | ios::out | ios::binary | ios::trunc); // a file session is successful only if the client issue a close() request bool success = true; bool run = true; int32_t cmd = 0; while (run) { if (self->m_DataChn.recv4(client_ip, client_port, transid, cmd) < 0) break; switch (cmd) { case 1: // read { char* param = NULL; int tmp = 8 * 2; if (self->m_DataChn.recv(client_ip, client_port, transid, param, tmp) < 0) { success = false; break; } int64_t offset = *(int64_t*)param; int64_t size = *(int64_t*)(param + 8); delete [] param; int32_t response = bRead ? 0 : -1; if (fhandle.fail() || !success || !self->m_bDiskHealth || !self->m_bNetworkHealth) response = -1; if (self->m_DataChn.send(client_ip, client_port, transid, (char*)&response, 4) < 0) break; if (response == -1) break; if (self->m_DataChn.sendfile(client_ip, client_port, transid, fhandle, offset, size, encoder) < 0) success = false; else rb += size; // update total sent data size self->m_SlaveStat.updateIO(client_ip, param[1], (key == 0) ? +SlaveStat::SYS_OUT : +SlaveStat::CLI_OUT); break; } case 2: // write { if (!bWrite) { // if the client does not have write permission, disconnect it immediately success = false; break; } //receive offset and size information from uplink char* param = NULL; int tmp = 8 * 2; if (self->m_DataChn.recv(src_ip, src_port, transid, param, tmp) < 0) break; int64_t offset = *(int64_t*)param; int64_t size = *(int64_t*)(param + 8); delete [] param; // no secure transfer between two slaves Crypto* tmp_decoder = decoder; if ((client_ip != src_ip) || (client_port != src_port)) tmp_decoder = NULL; bool io_status = (size > 0); if (!io_status || (self->m_DataChn.recvfile(src_ip, src_port, transid, fhandle, offset, size, tmp_decoder) < size)) io_status = false; //TODO: send incomplete write to next slave on chain, rather than -1 if (dst_port > 0) { // send offset and size parameters char req[16]; *(int64_t*)req = offset; if (io_status) *(int64_t*)(req + 8) = size; else *(int64_t*)(req + 8) = -1; self->m_DataChn.send(dst_ip, dst_port, transid, req, 16); // send the data to the next replica in the chain if (size > 0) self->m_DataChn.sendfile(dst_ip, dst_port, transid, fhandle, offset, size); } if (!io_status) break; wb += size; // update total received data size self->m_SlaveStat.updateIO(src_ip, size, (key == 0) ? +SlaveStat::SYS_IN : +SlaveStat::CLI_IN); // update write log writelog.insert(offset, size); m_bChange = true; break; } case 3: // download { int64_t offset; if (self->m_DataChn.recv8(client_ip, client_port, transid, offset) < 0) { success = false; break; } int32_t response = bRead ? 0 : -1; if (fhandle.fail() || !success || !self->m_bDiskHealth || !self->m_bNetworkHealth) response = -1; if (self->m_DataChn.send(client_ip, client_port, transid, (char*)&response, 4) < 0) break; if (response == -1) break; fhandle.seekg(0, ios::end); int64_t size = (int64_t)(fhandle.tellg()); fhandle.seekg(0, ios::beg); size -= offset; int64_t unit = 64000000; //send 64MB each time int64_t tosend = size; int64_t sent = 0; while (tosend > 0) { int64_t block = (tosend < unit) ? tosend : unit; if (self->m_DataChn.sendfile(client_ip, client_port, transid, fhandle, offset + sent, block, encoder) < 0) { success = false; break; } sent += block; tosend -= block; } rb += sent; // update total sent data size self->m_SlaveStat.updateIO(client_ip, size, (key == 0) ? +SlaveStat::SYS_OUT : +SlaveStat::CLI_OUT); break; } case 4: // upload { if (!bWrite) { // if the client does not have write permission, disconnect it immediately success = false; break; } int64_t offset = 0; int64_t size; if (self->m_DataChn.recv8(client_ip, client_port, transid, size) < 0) { success = false; break; } //TODO: check available size int32_t response = 0; if (fhandle.fail() || !success || !self->m_bDiskHealth || !self->m_bNetworkHealth) response = -1; if (self->m_DataChn.send(client_ip, client_port, transid, (char*)&response, 4) < 0) break; if (response == -1) break; int64_t unit = 64000000; //send 64MB each time int64_t torecv = size; int64_t recd = 0; // no secure transfer between two slaves Crypto* tmp_decoder = decoder; if ((client_ip != src_ip) || (client_port != src_port)) tmp_decoder = NULL; while (torecv > 0) { int64_t block = (torecv < unit) ? torecv : unit; if (self->m_DataChn.recvfile(src_ip, src_port, transid, fhandle, offset + recd, block, tmp_decoder) < 0) { success = false; break; } if (dst_port > 0) { // write to uplink for next replica in the chain if (self->m_DataChn.sendfile(dst_ip, dst_port, transid, fhandle, offset + recd, block) < 0) break; } recd += block; torecv -= block; } wb += recd; // update total received data size self->m_SlaveStat.updateIO(src_ip, size, (key == 0) ? +SlaveStat::SYS_IN : +SlaveStat::CLI_IN); // update write log writelog.insert(0, size); m_bChange = true; break; } case 5: // end session // the file has been successfully closed run = false; break; case 6: // read file path for local IO optimization self->m_DataChn.send(client_ip, client_port, transid, self->m_strHomeDir.c_str(), self->m_strHomeDir.length() + 1); break; case 7: // synchronize with the client, make sure write is correct { //TODO: merge all three recv() to one int32_t size = 0; if (self->m_DataChn.recv4(client_ip, client_port, transid, size) < 0) break; char* buf = NULL; if (self->m_DataChn.recv(client_ip, client_port, transid, buf, size) < 0) break; last_timestamp = 0; if (self->m_DataChn.recv4(client_ip, client_port, transid, last_timestamp) < 0) break; WriteLog log; log.deserialize(buf, size); delete [] buf; int32_t confirm = -1; if (writelog.compare(log)) confirm = 1; writelog.clear(); if (confirm > 0) { //synchronize timestamp utimbuf ut; ut.actime = last_timestamp; ut.modtime = last_timestamp; utime(filename.c_str(), &ut); } self->m_DataChn.send(client_ip, client_port, transid, (char*)&confirm, 4); break; } case 8: // specify up and down links { char* buf = NULL; int size = 136; if (self->m_DataChn.recv(client_ip, client_port, transid, buf, size) < 0) break; int32_t response = bWrite ? 0 : -1; if (fhandle.fail() || !success || !self->m_bDiskHealth || !self->m_bNetworkHealth) response = -1; if (self->m_DataChn.send(client_ip, client_port, transid, (char*)&response, 4) < 0) break; if (response == -1) break; src_ip = buf; src_port = *(int32_t*)(buf + 64); dst_ip = buf + 68; dst_port = *(int32_t*)(buf + 132); delete [] buf; if (src_port > 0) { // connect to uplink in the write chain if (!self->m_DataChn.isConnected(src_ip, src_port)) self->m_DataChn.connect(src_ip, src_port); } else { // first node in the chain, read from client src_ip = client_ip; src_port = client_port; } if (dst_port > 0) { //connect downlink in the write chain if (!self->m_DataChn.isConnected(dst_ip, dst_port)) self->m_DataChn.connect(dst_ip, dst_port); } break; } default: break; } } // close local file fhandle.close(); // update final timestamp if (last_timestamp > 0) { utimbuf ut; ut.actime = last_timestamp; ut.modtime = last_timestamp; utime(filename.c_str(), &ut); } gettimeofday(&t2, 0); int duration = t2.tv_sec - t1.tv_sec; double avgRS = 0; double avgWS = 0; if (duration > 0) { avgRS = rb / duration * 8.0 / 1000000.0; avgWS = wb / duration * 8.0 / 1000000.0; } self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "file server closed " << src_ip << " " << src_port << " " << (long long)avgWS << " " << (long long)avgRS << LogEnd(); // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); // unlock the file // this must be done before the client is disconnected, otherwise if the client immediately re-open the file, the lock may not be released yet self->m_pLocalFile->unlock(sname, key, mode); // report to master the task is completed // this also must be done before the client is disconnected, otherwise client may not be able to immediately re-open the file as the master is not updated int change = m_bChange ? +FileChangeType::FILE_UPDATE_WRITE : +FileChangeType::FILE_UPDATE_NO; self->report(master_ip, master_port, transid, sname, change); if (bSecure) { encoder->release(); delete encoder; decoder->release(); delete decoder; } if (success) self->m_DataChn.send(client_ip, client_port, transid, (char*)&cmd, 4); else self->m_DataChn.sendError(client_ip, client_port, transid); return NULL; }
void* Slave::copy(void* p) { Slave* self = ((Param3*)p)->serv_instance; int transid = ((Param3*)p)->transid; string src = ((Param3*)p)->src; string dst = ((Param3*)p)->dst; string master_ip = ((Param3*)p)->master_ip; int master_port = ((Param3*)p)->master_port; delete (Param3*)p; if (src.c_str()[0] == '\0') src = "/" + src; if (dst.c_str()[0] == '\0') dst = "/" + dst; SNode tmp; if (self->m_pLocalFile->lookup(src.c_str(), tmp) >= 0) { //if file is local, copy directly //note that in this case, src != dst, therefore this is a regular "cp" command, not a system replication //TODO: check disk space self->createDir(dst.substr(0, dst.rfind('/'))); string rhome = self->reviseSysCmdPath(self->m_strHomeDir); string rsrc = self->reviseSysCmdPath(src); string rdst = self->reviseSysCmdPath(dst); system(("cp " + rhome + rsrc + " " + rhome + rdst).c_str()); // if the file has been modified during the replication, remove this replica int type = (src == dst) ? +FileChangeType::FILE_UPDATE_REPLICA : +FileChangeType::FILE_UPDATE_NEW; struct stat64 s; if (stat64((self->m_strHomeDir + dst).c_str(), &s) < 0) type = +FileChangeType::FILE_UPDATE_NO; if (self->report(master_ip, master_port, transid, dst, type) < 0) system(("rm " + rhome + rdst).c_str()); // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); return NULL; } bool success = true; queue<string> tr; tr.push(src); while (!tr.empty()) { string src_path = tr.front(); tr.pop(); // try list this path SectorMsg msg; msg.setType(101); msg.setKey(0); msg.setData(0, src_path.c_str(), src_path.length() + 1); Address addr; self->m_Routing.lookup(src_path, addr); if (self->m_GMP.rpc(addr.m_strIP.c_str(), addr.m_iPort, &msg, &msg) < 0) { success = false; break; } if (msg.getType() >= 0) { // if this is a directory, put all files and sub-drectories into the queue of files to be copied string filelist = msg.getData(); unsigned int s = 0; while (s < filelist.length()) { int t = filelist.find(';', s); SNode sn; sn.deserialize(filelist.substr(s, t - s).c_str()); tr.push(src_path + "/" + sn.m_strName); s = t + 1; } continue; } // open the file and copy it to local msg.setType(110); msg.setKey(0); int32_t mode = SF_MODE::READ; msg.setData(0, (char*)&mode, 4); int64_t reserve = 0; msg.setData(4, (char*)&reserve, 8); int32_t localport = self->m_DataChn.getPort(); msg.setData(12, (char*)&localport, 4); msg.setData(16, "\0", 1); msg.setData(80, src_path.c_str(), src_path.length() + 1); if ((self->m_GMP.rpc(addr.m_strIP.c_str(), addr.m_iPort, &msg, &msg) < 0) || (msg.getType() < 0)) { success = false; break; } int32_t session = *(int32_t*)msg.getData(); int64_t size = *(int64_t*)(msg.getData() + 4); time_t ts = *(int64_t*)(msg.getData() + 12); string ip = msg.getData() + 24; int32_t port = *(int32_t*)(msg.getData() + 64 + 24); if (!self->m_DataChn.isConnected(ip, port)) { if (self->m_DataChn.connect(ip, port) < 0) { success = false; break; } } // download command: 3 int32_t cmd = 3; self->m_DataChn.send(ip, port, session, (char*)&cmd, 4); int64_t offset = 0; self->m_DataChn.send(ip, port, session, (char*)&offset, 8); int response = -1; if ((self->m_DataChn.recv4(ip, port, session, response) < 0) || (-1 == response)) { success = false; break; } string dst_path = dst; if (src != src_path) dst_path += "/" + src_path.substr(src.length() + 1, src_path.length() - src.length() - 1); //copy to .tmp first, then move to real location self->createDir(string(".tmp") + dst_path.substr(0, dst_path.rfind('/'))); fstream ofs; ofs.open((self->m_strHomeDir + ".tmp" + dst_path).c_str(), ios::out | ios::binary | ios::trunc); int64_t unit = 64000000; //send 64MB each time int64_t torecv = size; int64_t recd = 0; while (torecv > 0) { int64_t block = (torecv < unit) ? torecv : unit; if (self->m_DataChn.recvfile(ip, port, session, ofs, offset + recd, block) < 0) { success = false; break; } recd += block; torecv -= block; } ofs.close(); // update total received data size self->m_SlaveStat.updateIO(ip, size, +SlaveStat::SYS_IN); cmd = 5; self->m_DataChn.send(ip, port, session, (char*)&cmd, 4); self->m_DataChn.recv4(ip, port, session, cmd); if (src == dst) { //utime: update timestamp according to the original copy, for replica only; files created by "cp" have new timestamp utimbuf ut; ut.actime = ts; ut.modtime = ts; utime((self->m_strHomeDir + ".tmp" + dst_path).c_str(), &ut); } } string rhome = self->reviseSysCmdPath(self->m_strHomeDir); string rfile = self->reviseSysCmdPath(dst); if (success) { // move from temporary dir to the real dir when the copy is completed self->createDir(dst.substr(0, dst.rfind('/'))); system(("mv " + rhome + ".tmp" + rfile + " " + rhome + rfile).c_str()); // if the file has been modified during the replication, remove this replica int32_t type = (src == dst) ? +FileChangeType::FILE_UPDATE_REPLICA : +FileChangeType::FILE_UPDATE_NEW; if (self->report(master_ip, master_port, transid, dst, type) < 0) unlink((rhome + rfile).c_str()); } else { // failed, remove all temporary files system(("rm -rf " + rhome + ".tmp" + rfile).c_str()); self->report(master_ip, master_port, transid, "", +FileChangeType::FILE_UPDATE_NO); } // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); return NULL; }
DWORD WINAPI Slave::SPEShufflerEx(LPVOID p) #endif { Slave* self = ((Param5*)p)->serv_instance; int transid = ((Param5*)p)->transid; string path = ((Param5*)p)->path; string localfile = ((Param5*)p)->filename; int bucketnum = ((Param5*)p)->bucketnum; const int key = ((Param5*)p)->key; const int type = ((Param5*)p)->type; string function = ((Param5*)p)->function; string master_ip = ((Param5*)p)->master_ip; int master_port = ((Param5*)p)->master_port; queue<Bucket>* bq = ((Param5*)p)->bq; CMutex* bqlock = ((Param5*)p)->bqlock; CCond* bqcond = ((Param5*)p)->bqcond; int64_t* pendingSize = ((Param5*)p)->pending; delete (Param5*)p; self->createDir(path); // remove old result data files for (int i = 0; i < bucketnum; ++ i) { int size = self->m_strHomeDir.length() + path.length() + localfile.length() + 64; char* tmp = new char[size]; snprintf(tmp, size, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), i); LocalFS::erase(tmp); snprintf(tmp, size, "%s.%d.idx", (self->m_strHomeDir + path + "/" + localfile).c_str(), i); LocalFS::erase(tmp); delete [] tmp; } // index file initial offset vector<int64_t> offset; offset.resize(bucketnum); for (vector<int64_t>::iterator i = offset.begin(); i != offset.end(); ++ i) *i = 0; set<int> fileid; while (true) { bqlock->acquire(); while (bq->empty()) bqcond->wait(*bqlock); Bucket b = bq->front(); bq->pop(); *pendingSize -= b.totalsize; bqlock->release(); if (b.totalnum == -1) break; string speip = b.src_ip; int dataport = b.src_dataport; int session = b.session; for (int i = 0; i < b.totalnum; ++ i) { int bucket = 0; if (self->m_DataChn.recv4(speip, dataport, session, bucket) < 0) continue; fileid.insert(bucket); char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), bucket); fstream datafile(tmp, ios::out | ios::binary | ios::app); sprintf(tmp, "%s.%d.idx", (self->m_strHomeDir + path + "/" + localfile).c_str(), bucket); fstream indexfile(tmp, ios::out | ios::binary | ios::app); delete [] tmp; int64_t start = offset[bucket]; if (0 == start) indexfile.write((char*)&start, 8); int32_t len; char* data = NULL; if (self->m_DataChn.recv(speip, dataport, session, data, len) < 0) continue; datafile.write(data, len); delete [] data; tmp = NULL; if (self->m_DataChn.recv(speip, dataport, session, tmp, len) < 0) continue; int64_t* index = (int64_t*)tmp; for (int j = 0; j < len / 8; ++ j) index[j] += start; offset[bucket] = index[len / 8 - 1]; indexfile.write(tmp, len); delete [] tmp; datafile.close(); indexfile.close(); } // update total received data self->m_SlaveStat.updateIO(speip, b.totalsize, +SlaveStat::SYS_IN); } // sort and reduce if (type == 1) { void* lh = NULL; self->openLibrary(key, function, lh); if (NULL != lh) { MR_COMPARE comp = NULL; MR_REDUCE reduce = NULL; self->getReduceFunc(lh, function, comp, reduce); if (NULL != comp) { char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; for (set<int>::iterator i = fileid.begin(); i != fileid.end(); ++ i) { sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), *i); self->sort(tmp, comp, reduce); } delete [] tmp; } self->closeLibrary(lh); } } // report sphere output files char* tmp = new char[path.length() + localfile.length() + 64]; vector<string> filelist; for (set<int>::iterator i = fileid.begin(); i != fileid.end(); ++ i) { sprintf(tmp, "%s.%d", (path + "/" + localfile).c_str(), *i); filelist.push_back(tmp); sprintf(tmp, "%s.%d.idx", (path + "/" + localfile).c_str(), *i); filelist.push_back(tmp); } delete [] tmp; self->report(master_ip, master_port, transid, filelist, 1); return NULL; }
DWORD WINAPI Slave::SPEHandler(LPVOID p) #endif { Slave* self = ((Param4*)p)->serv_instance; const string ip = ((Param4*)p)->client_ip; const int ctrlport = ((Param4*)p)->client_ctrl_port; const int dataport = ((Param4*)p)->client_data_port; const int speid = ((Param4*)p)->speid; const int transid = ((Param4*)p)->transid; const int key = ((Param4*)p)->key; const string function = ((Param4*)p)->function; const int rows = ((Param4*)p)->rows; const char* param = ((Param4*)p)->param; const int psize = ((Param4*)p)->psize; const int type = ((Param4*)p)->type; const string master_ip = ((Param4*)p)->master_ip; const int master_port = ((Param4*)p)->master_port; delete (Param4*)p; SectorMsg msg; bool init_success = true; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "SPE starts " << ip << " " << dataport << LogEnd(); if (self->m_DataChn.connect(ip, dataport) < 0) { self->m_SectorLog << LogStart(LogLevel::LEVEL_2) << "failed to connect to spe client " << ip << ":" << ctrlport << " " << function << LogEnd(); init_success = false; } self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "connected." << LogEnd(); // read outupt parameters int buckets = 0; if (self->m_DataChn.recv4(ip, dataport, transid, buckets) < 0) init_success = false; SPEDestination dest; if (buckets > 0) { if (self->m_DataChn.recv4(ip, dataport, transid, dest.m_iLocNum) < 0) init_success = false; int len = dest.m_iLocNum * 80; if (self->m_DataChn.recv(ip, dataport, transid, dest.m_pcOutputLoc, len) < 0) init_success = false; len = buckets * 4; if (self->m_DataChn.recv(ip, dataport, transid, (char*&)dest.m_piLocID, len) < 0) init_success = false; } else if (buckets < 0) { int32_t len = 0; if (self->m_DataChn.recv(ip, dataport, transid, dest.m_pcOutputLoc, len) < 0) init_success = false; dest.m_strLocalFile = dest.m_pcOutputLoc; } dest.init(buckets); // initialize processing function self->acceptLibrary(key, ip, dataport, transid); SPHERE_PROCESS process = NULL; MR_MAP map = NULL; MR_PARTITION partition = NULL; void* lh = NULL; self->openLibrary(key, function, lh); if (NULL == lh) { self->m_SectorLog << LogStart(LogLevel::LEVEL_2) << "failed to open SPE library " << ip << ":" << ctrlport << " " << function << LogEnd(); init_success = false; } if (type == 0) { if (self->getSphereFunc(lh, function, process) < 0) init_success = false; } else if (type == 1) { if (self->getMapFunc(lh, function, map, partition) < 0) init_success = false; } else { init_success = false; } timeval t1, t2, t3, t4; gettimeofday(&t1, 0); msg.setType(1); // success, return result msg.setData(0, (char*)&(speid), 4); SPEResult result; result.init(buckets); // processing... while (init_success) { char* dataseg = NULL; int size = 0; if (self->m_DataChn.recv(ip, dataport, transid, dataseg, size) < 0) break; // client request to close this SPE if (size < 20) break; // read data segment parameters int64_t offset = *(int64_t*)(dataseg); int64_t totalrows = *(int64_t*)(dataseg + 8); int32_t dsid = *(int32_t*)(dataseg + 16); string datafile = dataseg + 20; sprintf(dest.m_pcLocalFileID, ".%d", dsid); delete [] dataseg; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "new job " << datafile << " " << offset << " " << totalrows << LogEnd(); int64_t* index = NULL; if ((totalrows > 0) && (rows != 0)) index = new int64_t[totalrows + 1]; char* block = NULL; int unitrows = (rows != -1) ? rows : totalrows; int progress = 0; // read data if (0 != rows) { size = 0; if (self->SPEReadData(datafile, offset, size, index, totalrows, block) <= 0) { delete [] index; delete [] block; progress = SectorError::E_SPEREAD; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); continue; } } else { // store file name in "process" parameter block = new char[datafile.length() + 1]; strcpy(block, datafile.c_str()); size = datafile.length() + 1; totalrows = 0; } SInput input; input.m_pcUnit = NULL; input.m_pcParam = (char*)param; input.m_iPSize = psize; SOutput output; output.m_iBufSize = (size < 64000000) ? 64000000 : size; output.m_pcResult = new char[output.m_iBufSize]; output.m_iIndSize = (totalrows < 640000) ? 640000 : totalrows + 2; output.m_pllIndex = new int64_t[output.m_iIndSize]; output.m_piBucketID = new int[output.m_iIndSize]; SFile file; file.m_strHomeDir = self->m_strHomeDir; char path[64]; sprintf(path, "%d", key); file.m_strLibDir = self->m_strHomeDir + ".sphere/" + path + "/"; file.m_strTempDir = self->m_strHomeDir + ".tmp/"; file.m_iSlaveID = self->m_iSlaveID; file.m_pInMemoryObjects = &self->m_InMemoryObjects; result.clear(); gettimeofday(&t3, 0); int deliverystatus = 0; int processstatus = 0; // process data segments for (int i = 0; i < totalrows; i += unitrows) { if (unitrows > totalrows - i) unitrows = totalrows - i; input.m_pcUnit = block + index[i] - index[0]; input.m_iRows = unitrows; input.m_pllIndex = index + i; output.m_iResSize = 0; output.m_iRows = 0; output.m_strError = ""; processstatus = self->processData(input, output, file, result, buckets, process, map, partition); if (processstatus < 0) { progress = SectorError::E_SPEPROC; break; } timeval t; gettimeofday(&t, NULL); unsigned int seed = t.tv_sec * 1000000 + t.tv_usec; srand(seed); int ds_thresh = 32000000 * ((rand() % 7) + 1); if ((result.m_llTotalDataSize >= ds_thresh) && (buckets != 0)) deliverystatus = self->deliverResult(buckets, result, dest); if (deliverystatus < 0) { progress = SectorError::E_SPEWRITE; break; } gettimeofday(&t4, 0); if (t4.tv_sec - t3.tv_sec > 1) { progress = i * 100 / totalrows; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); t3 = t4; } } // process files if (0 == unitrows) { SNode s; LocalFS::stat(self->m_strHomeDir + datafile, s); int64_t filesize = s.m_llSize; input.m_pcUnit = block; input.m_iRows = -1; input.m_pllIndex = NULL; output.m_llOffset = 0; for (int i = 0; (i == 0) || (output.m_llOffset > 0); ++ i) { // re-initialize output everytime UDF is called, except for offset output.m_iResSize = 0; output.m_iRows = 0; output.m_strError = ""; processstatus = self->processData(input, output, file, result, buckets, process, map, partition); if (processstatus < 0) { progress = SectorError::E_SPEPROC; break; } timeval t; gettimeofday(&t, NULL); unsigned int seed = t.tv_sec * 1000000 + t.tv_usec; srand(seed); int ds_thresh = 32000000 * ((rand() % 7) + 1); if ((result.m_llTotalDataSize >= ds_thresh) && (buckets != 0)) deliverystatus = self->deliverResult(buckets, result, dest); if (deliverystatus < 0) { progress = SectorError::E_SPEWRITE; break; } if (output.m_llOffset > 0) { progress = output.m_llOffset * 100LL / filesize; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); } } } // if buckets = 0, send back to clients, otherwise deliver to local or network locations if ((buckets != 0) && (progress >= 0)) deliverystatus = self->deliverResult(buckets, result, dest); if (deliverystatus < 0) progress = SectorError::E_SPEWRITE; else progress = 100; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "SPE completed " << progress << " " << ip << " " << ctrlport << LogEnd(); msg.setData(4, (char*)&progress, 4); if (100 == progress) { msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); self->sendResultToClient(buckets, dest.m_piSArray, dest.m_piRArray, result, ip, dataport, transid); dest.reset(buckets); // report new files vector<string> filelist; for (set<string>::iterator i = file.m_sstrFiles.begin(); i != file.m_sstrFiles.end(); ++ i) filelist.push_back(*i); self->report(master_ip, master_port, transid, filelist, +FileChangeType::FILE_UPDATE_NEW); self->reportMO(master_ip, master_port, transid); } else { msg.setData(8, (char*)&processstatus, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 12; if (output.m_strError.length() > 0) msg.setData(12, output.m_strError.c_str(), output.m_strError.length() + 1); else if (deliverystatus < 0) { string tmp = "System Error: data transfer to buckets failed."; msg.setData(12, tmp.c_str(), tmp.length() + 1); } int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); } delete [] index; delete [] block; delete [] output.m_pcResult; delete [] output.m_pllIndex; delete [] output.m_piBucketID; index = NULL; block = NULL; } gettimeofday(&t2, 0); int duration = t2.tv_sec - t1.tv_sec; self->m_SectorLog << LogStart(LogLevel::LEVEL_3) << "comp server closed " << ip << " " << ctrlport << " " << duration << LogEnd(); delete [] param; vector<Address> bad; if (init_success) { self->closeLibrary(lh); multimap<int64_t, Address> sndspd; for (int i = 0; i < dest.m_iLocNum; ++ i) { Address addr; addr.m_strIP = dest.m_pcOutputLoc + i * 80; addr.m_iPort = *(int32_t*)(dest.m_pcOutputLoc + i * 80 + 64); int dataport = *(int32_t*)(dest.m_pcOutputLoc + i * 80 + 68); int64_t spd = self->m_DataChn.getRealSndSpeed(addr.m_strIP, dataport); if (spd > 0) sndspd.insert(pair<int64_t, Address>(spd, addr)); } vector<Address> bad; self->checkBadDest(sndspd, bad); } else { // this SPE failed to initialize. send the error to the client int progress = SectorError::E_SPEUDF; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(ip.c_str(), ctrlport, id, &msg); } self->reportSphere(master_ip, master_port, transid, &bad); // clear this transaction self->m_TransManager.updateSlave(transid, self->m_iSlaveID); return NULL; }
void* Slave::SPEShufflerEx(void* p) { Slave* self = ((Param5*)p)->serv_instance; int transid = ((Param5*)p)->transid; string client_ip = ((Param5*)p)->client_ip; int client_port = ((Param5*)p)->client_ctrl_port; int client_data_port = ((Param5*)p)->client_data_port; string path = ((Param5*)p)->path; string localfile = ((Param5*)p)->filename; int bucketnum = ((Param5*)p)->bucketnum; int bucketid = ((Param5*)p)->bucketid; const int key = ((Param5*)p)->key; const int type = ((Param5*)p)->type; string function = ((Param5*)p)->function; queue<Bucket>* bq = ((Param5*)p)->bq; pthread_mutex_t* bqlock = ((Param5*)p)->bqlock; pthread_cond_t* bqcond = ((Param5*)p)->bqcond; int64_t* pendingSize = ((Param5*)p)->pending; string master_ip = ((Param5*)p)->master_ip; int master_port = ((Param5*)p)->master_port; delete (Param5*)p; self->createDir(path); // remove old result data files for (int i = 0; i < bucketnum; ++ i) { char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), i); unlink(tmp); sprintf(tmp, "%s.%d.idx", (self->m_strHomeDir + path + "/" + localfile).c_str(), i); unlink(tmp); delete [] tmp; } // index file initial offset vector<int64_t> offset; offset.resize(bucketnum); for (vector<int64_t>::iterator i = offset.begin(); i != offset.end(); ++ i) *i = 0; set<int> fileid; while (true) { pthread_mutex_lock(bqlock); while (bq->empty()) pthread_cond_wait(bqcond, bqlock); Bucket b = bq->front(); bq->pop(); *pendingSize -= b.totalsize; pthread_mutex_unlock(bqlock); if (b.totalnum == -1) break; string speip = b.src_ip; int dataport = b.src_dataport; int session = b.session; for (int i = 0; i < b.totalnum; ++ i) { int bucket = 0; if (self->m_DataChn.recv4(speip, dataport, session, bucket) < 0) continue; fileid.insert(bucket); char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), bucket); fstream datafile(tmp, ios::out | ios::binary | ios::app); sprintf(tmp, "%s.%d.idx", (self->m_strHomeDir + path + "/" + localfile).c_str(), bucket); fstream indexfile(tmp, ios::out | ios::binary | ios::app); delete [] tmp; int64_t start = offset[bucket]; if (0 == start) indexfile.write((char*)&start, 8); int32_t len; char* data = NULL; if (self->m_DataChn.recv(speip, dataport, session, data, len) < 0) continue; datafile.write(data, len); delete [] data; tmp = NULL; if (self->m_DataChn.recv(speip, dataport, session, tmp, len) < 0) continue; int64_t* index = (int64_t*)tmp; for (int j = 0; j < len / 8; ++ j) index[j] += start; offset[bucket] = index[len / 8 - 1]; indexfile.write(tmp, len); delete [] tmp; datafile.close(); indexfile.close(); } // update total received data self->m_SlaveStat.updateIO(speip, b.totalsize, 0); } pthread_mutex_destroy(bqlock); pthread_cond_destroy(bqcond); delete bqlock; delete bqcond; delete pendingSize; // sort and reduce if (type == 1) { void* lh = NULL; self->openLibrary(key, function, lh); //if (NULL == lh) // break; MR_COMPARE comp = NULL; MR_REDUCE reduce = NULL; self->getReduceFunc(lh, function, comp, reduce); if (NULL != comp) { char* tmp = new char[self->m_strHomeDir.length() + path.length() + localfile.length() + 64]; for (set<int>::iterator i = fileid.begin(); i != fileid.end(); ++ i) { sprintf(tmp, "%s.%d", (self->m_strHomeDir + path + "/" + localfile).c_str(), *i); self->sort(tmp, comp, reduce); } delete [] tmp; } self->closeLibrary(lh); } // report sphere output files char* tmp = new char[path.length() + localfile.length() + 64]; vector<string> filelist; for (set<int>::iterator i = fileid.begin(); i != fileid.end(); ++ i) { sprintf(tmp, "%s.%d", (path + "/" + localfile).c_str(), *i); filelist.push_back(tmp); sprintf(tmp, "%s.%d.idx", (path + "/" + localfile).c_str(), *i); filelist.push_back(tmp); } delete [] tmp; self->report(master_ip, master_port, transid, filelist, 1); self->reportSphere(master_ip, master_port, transid); // cout << "bucket completed 100 " << client_ip << " " << client_port << endl; SectorMsg msg; msg.setType(1); // success, return result msg.setData(0, (char*)&(bucketid), 4); int progress = 100; msg.setData(4, (char*)&progress, 4); msg.m_iDataLength = SectorMsg::m_iHdrSize + 8; int id = 0; self->m_GMP.sendto(client_ip.c_str(), client_port, id, &msg); //remove this client data channel self->m_DataChn.remove(client_ip, client_data_port); return NULL; }