int RedistributeWorkerThread::sendData()
{
	WriteEngine::FileOp fileOp;  // just to get filename, not for file operations
	bool remotePM = (fMyId.second != fPeerId.second);
	uint32_t dbroot = fPlanEntry.source;
	uint32_t partition = fPlanEntry.partition;
	int16_t source = fPlanEntry.source;
	int16_t dest = fPlanEntry.destination;

	IDBDataFile::Types fileType = 
		(IDBPolicy::useHdfs() ? IDBDataFile::HDFS : IDBDataFile::UNBUFFERED);
	IDBFileSystem& fs = IDBFileSystem::getFs( fileType );

	if ((remotePM) && (fileType != IDBDataFile::HDFS))
	{
		if (connectToWes(fPeerId.second) != 0)
		{
			fErrorCode = RED_EC_CONNECT_FAIL;
			ostringstream oss;
			oss << "Failed to connect to PM" << fPeerId.second << " from PM" << fMyId.second;
			fErrorMsg = oss.str();
			logMessage(fErrorMsg, __LINE__);
			return fErrorCode;
		}

		// start to send each segment file
		uint32_t seq = 0;
		ByteStream bs;

		// start conversion with peer, hand shaking.
		RedistributeMsgHeader header(dest, source, seq++, RED_DATA_INIT);
		bs << (ByteStream::byte) WriteEngine::WE_SVR_REDISTRIBUTE;
		bs.append((const ByteStream::byte*) &header, sizeof(header));
		fMsgQueueClient->write(bs);

		SBS sbs = fMsgQueueClient->read();
		if (!checkDataTransferAck(sbs, 0))
			return fErrorCode;

		for (vector<int64_t>::iterator i = fOids.begin(); i != fOids.end(); i++)
		{
			for (set<int16_t>::iterator j = fSegments.begin(); j != fSegments.end(); ++j)
			{
				char fileName[WriteEngine::FILE_NAME_SIZE];
				int rc = fileOp.oid2FileName(*i, fileName, false, dbroot, partition, *j);
				if (rc == WriteEngine::NO_ERROR)
				{
					ostringstream oss;
					oss << "<=redistributing: " << fileName << ", oid=" << *i << ", db="
						<< source << ", part=" << partition << ", seg=" << *j << " to db="
						<< dest;
					logMessage(oss.str(), __LINE__);
				}
				else
				{
					fErrorCode = RED_EC_OID_TO_FILENAME;
					ostringstream oss;
					oss << "Failed to get file name: oid=" << *i << ", dbroot=" << dbroot
						<< ", partition=" << partition << ", segment=" << *j;
					fErrorMsg = oss.str();
					logMessage(fErrorMsg, __LINE__);
					return fErrorCode;
				}

				if (fOldFilePtr != NULL)
					closeFile(fOldFilePtr);

				errno = 0;
				FILE* fOldFilePtr = fopen(fileName, "rb");
				if (fOldFilePtr != NULL)
				{
					ostringstream oss;
					oss << "open " << fileName << ", oid=" << *i << ", dbroot=" << dbroot
						<< ", partition=" << partition << ", segment=" << *j
						<< ". " << fOldFilePtr;
					logMessage(oss.str(), __LINE__);
				}
				else
				{
					int e = errno;
					fErrorCode = RED_EC_OPEN_FILE_FAIL;
					ostringstream oss;
					oss << "Failed to open " << fileName << ", oid=" << *i << ", dbroot=" << dbroot
						<< ", partition=" << partition << ", segment=" << *j
						<< ". " << strerror(e) << " (" << e << ")";
					fErrorMsg = oss.str();
					logMessage(fErrorMsg, __LINE__);
					return fErrorCode;
				}

				// add to set for remove after commit
				addToDirSet(fileName, true);

				char chunk[CHUNK_SIZE];
				errno = 0;
				fseek(fOldFilePtr, 0, SEEK_END);       // go to end of file
				long fileSize = ftell(fOldFilePtr);    // get current file size
				if (fileSize < 0)
				{
					int e = errno;
					ostringstream oss;
					oss << "Fail to tell file size: " << strerror(e) << " (" << e << ")";
					fErrorMsg = oss.str();
					fErrorCode = RED_EC_FSEEK_FAIL;
					logMessage(fErrorMsg, __LINE__);
					return fErrorCode;
				}

				// send start message to have the file of fileSize created at target dbroot.
				bs.restart();
				RedistributeMsgHeader header(dest, source, seq++, RED_DATA_START);
				RedistributeDataControl dataControl(*i, dest, partition, *j, fileSize);
				bs << (ByteStream::byte) WriteEngine::WE_SVR_REDISTRIBUTE;
				bs.append((const ByteStream::byte*) &header, sizeof(header));
				bs.append((const ByteStream::byte*) &dataControl, sizeof(dataControl));
				fMsgQueueClient->write(bs);

				sbs = fMsgQueueClient->read();
				if (!checkDataTransferAck(sbs, fileSize))
					return fErrorCode;

				// now send the file chunk by chunk.
				rewind(fOldFilePtr);
				int64_t bytesLeft = fileSize;
				size_t  bytesSend = CHUNK_SIZE;
				header.messageId = RED_DATA_CONT;
				while (bytesLeft > 0)
				{
					if (fStopAction)
					{
						closeFile(fOldFilePtr);
						fOldFilePtr = NULL;
						return RED_EC_USER_STOP;
					}

					if (bytesLeft < (long) CHUNK_SIZE)
						bytesSend = bytesLeft;

					errno = 0;
					size_t n = fread(chunk, 1, bytesSend, fOldFilePtr);
					if (n != bytesSend)
					{
						int e = errno;
						ostringstream oss;
						oss << "Fail to read: " << strerror(e) << " (" << e << ")";
						fErrorMsg = oss.str();
						fErrorCode = RED_EC_FREAD_FAIL;
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}

					header.sequenceNum = seq++;
					bs.restart();
					bs << (ByteStream::byte) WriteEngine::WE_SVR_REDISTRIBUTE;
   				 	bs.append((const ByteStream::byte*) &header, sizeof(header));
   				 	bs << (size_t) bytesSend;
					bs.append((const ByteStream::byte*) chunk, bytesSend);
   				 	fMsgQueueClient->write(bs);

					sbs = fMsgQueueClient->read();
					if (!checkDataTransferAck(sbs, bytesSend))
						return fErrorCode;

					bytesLeft -= bytesSend;
				}

				closeFile(fOldFilePtr);
				fOldFilePtr = NULL;

				header.messageId = RED_DATA_FINISH;
				header.sequenceNum = seq++;
				bs.restart();
				bs << (ByteStream::byte) WriteEngine::WE_SVR_REDISTRIBUTE;
  			 	bs.append((const ByteStream::byte*) &header, sizeof(header));
  			 	bs << (uint64_t) fileSize;
  			 	fMsgQueueClient->write(bs);

				sbs = fMsgQueueClient->read();
				if (!checkDataTransferAck(sbs, fileSize))
					return fErrorCode;

			}  // segments
		}  // for oids
	}   // remote peer non-hdfs
	else                                           // local or HDFS file copy
	{
		std::map<int,std::string> rootToPathMap;

		// use cp, in case failed in middle.  May consider to use rename if possible.
		for (vector<int64_t>::iterator i = fOids.begin(); i != fOids.end(); i++)
		{
			for (set<int16_t>::iterator j = fSegments.begin(); j != fSegments.end(); ++j)
			{
				if (fStopAction)
					return RED_EC_USER_STOP;

				if (fileType == IDBDataFile::HDFS) // HDFS file copy
				{
					string sourceName;
					int rc = buildFullHdfsPath(
						rootToPathMap, // map of root to path
                        *i,            // OID
						source,        // dbroot
						partition,     // partition
						*j,            // segment
						sourceName );  // full path name
					if (rc != 0)
					{
						fErrorCode = RED_EC_OID_TO_FILENAME;
						ostringstream oss;
						oss << "Failed to get src file name: oid=" << *i
							<< ", dbroot=" << source
							<< ", partition=" << partition
							<< ", segment=" << *j;
						fErrorMsg = oss.str();
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}

					string destName;
					rc = buildFullHdfsPath(
						rootToPathMap, // map of root to path
                        *i,            // OID
						dest,          // dbroot
						partition,     // partition
						*j,            // segment
						destName );    // full path name
					if (rc != 0)
					{
						fErrorCode = RED_EC_OID_TO_FILENAME;
						ostringstream oss;
						oss << "Failed to get dest file name: oid=" << *i
							<< ", dbroot=" << dest
							<< ", partition=" << partition
							<< ", segment=" << *j;
						fErrorMsg = oss.str();
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}

					ostringstream oss;
					oss << "<=redistributing(hdfs): " << sourceName << ", oid="
						<< *i << ", db=" << source << ", part=" << partition
						<< ", seg=" << *j << " to db=" << dest;
					logMessage(oss.str(), __LINE__);

					// add to set for remove after commit/abort
					addToDirSet(sourceName.c_str(), true);
					addToDirSet(destName.c_str(), false);

					int ret = fs.copyFile(sourceName.c_str(), destName.c_str());
					if (ret != 0)
					{
						fErrorCode = RED_EC_COPY_FILE_FAIL;
						ostringstream oss;
						oss << "Failed to copy " << sourceName << " to " <<
							destName << "; error is: " << strerror(errno);
						fErrorMsg = oss.str();
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}
				}
				else                               // local file copy
				{
					char sourceName[WriteEngine::FILE_NAME_SIZE];
					int rc = fileOp.oid2FileName(*i, sourceName, false, source,
						partition, *j);
					if (rc != WriteEngine::NO_ERROR)
					{
						fErrorCode = RED_EC_OID_TO_FILENAME;
						ostringstream oss;
						oss << "Failed to get file name: oid=" << *i
							<< ", dbroot=" << source
							<< ", partition=" << partition
							<< ", segment=" << *j;
						fErrorMsg = oss.str();
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}

					char destName[WriteEngine::FILE_NAME_SIZE];
					rc = fileOp.oid2FileName(*i, destName, true,
						dest, partition, *j);
					if (rc != WriteEngine::NO_ERROR)
					{
						fErrorCode = RED_EC_OID_TO_FILENAME;
						ostringstream oss;
						oss << "Failed to get file name: oid=" << *i
							<< ", dbroot=" << dest
							<< ", partition=" << partition
							<< ", segment=" << *j;
						fErrorMsg = oss.str();
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}

					ostringstream oss;
					oss << "<=redistributing(copy): " << sourceName << ", oid="
						<< *i << ", db=" << source << ", part=" << partition
						<< ", seg=" << *j << " to db=" << dest;
					logMessage(oss.str(), __LINE__);

					// add to set for remove after commit/abort
					addToDirSet(sourceName, true);
					addToDirSet(destName, false);

					// Using boost::copy_file() instead of IDBFileSystem::copy-
					// File() so we can capture/report any boost exception error
					// msg that IDBFileSystem::copyFile() currently swallows.
					try
					{
						filesystem::copy_file(sourceName, destName);
					}
#if BOOST_VERSION >= 105200
					catch(filesystem::filesystem_error& e)
#else
					catch(filesystem::basic_filesystem_error<filesystem::path>& e)
#endif
					{
						fErrorCode = RED_EC_COPY_FILE_FAIL;
						ostringstream oss;
						oss << "Failed to copy " << sourceName << " to " <<
							destName << "; error is: " << e.what();
						fErrorMsg = oss.str();
						logMessage(fErrorMsg, __LINE__);
						return fErrorCode;
					}
				}
			}  // segment
		}  // oid
	}  // !remote

	return 0;
}
void RedistributeWorkerThread::handleDataStart(SBS& sbs, size_t& size)
{
	char fileName[WriteEngine::FILE_NAME_SIZE];

	try
	{
		// extract the control data for the segment file
		RedistributeDataControl dc;
		if (sbs->length() >= sizeof(RedistributeDataControl))
		{
			memcpy(&dc, sbs->buf(), sizeof(RedistributeDataControl));
			sbs->advance(sizeof(RedistributeDataControl));
			size = dc.size;
		}
		else
		{
			ostringstream oss;
			oss << "Short message, length=" << sbs->length();
			fErrorMsg = oss.str();
			fErrorCode = RED_EC_WKR_MSG_SHORT;
			logMessage(fErrorMsg, __LINE__);
			throw runtime_error(fErrorMsg);
		}

		// create and open the file for writing.
		WriteEngine::FileOp fileOp;  // just to get filename, not for file operations
		int rc = fileOp.oid2FileName(dc.oid, fileName, true, dc.dbroot, dc.partition, dc.segment);
		if (rc == WriteEngine::NO_ERROR)
		{
			ostringstream oss;
			oss << "=>redistributing: " << fileName << ", oid=" << dc.oid << ", db=" << dc.dbroot
				<< ", part=" << dc.partition << ", seg=" << dc.segment << " from db="
				<< fMsgHeader.destination;  // fMsgHeader has swapped source and destination.
			logMessage(oss.str(), __LINE__);
		}
		else
		{
			fErrorCode = RED_EC_OID_TO_FILENAME;
			ostringstream oss;
			oss << "Failed to get file name: oid=" << dc.oid << ", dbroot=" << dc.dbroot
				<< ", partition=" << dc.partition << ", segment=" << dc.segment;
			fErrorMsg = oss.str();
			logMessage(fErrorMsg, __LINE__);
			throw runtime_error(fErrorMsg);
		}

		if (fNewFilePtr != NULL)
			closeFile(fNewFilePtr);

		errno = 0;
		fNewFilePtr = fopen(fileName, "wb");
		if (fNewFilePtr != NULL)
		{
			ostringstream oss;
			oss << "open " << fileName << ", oid=" << dc.oid << ", dbroot="
				<< dc.dbroot << ", partition=" << dc.partition << ", segment=" << dc.segment
				<< ". " << fNewFilePtr;
			logMessage(oss.str(), __LINE__);
		}
		else
		{
			int e = errno;
			fErrorCode = RED_EC_OPEN_FILE_FAIL;
			ostringstream oss;
			oss << "Failed to open " << fileName << ", oid=" << dc.oid << ", dbroot="
				<< dc.dbroot << ", partition=" << dc.partition << ", segment=" << dc.segment
				<< ". " << strerror(e) << " (" << e << ")";
			fErrorMsg = oss.str();
			logMessage(fErrorMsg, __LINE__);
			throw runtime_error(fErrorMsg);
		}

		// set output buffering
		errno = 0;
		if (setvbuf(fNewFilePtr, fWriteBuffer.get(), _IOFBF, CHUNK_SIZE))
		{
			int e = errno;
			ostringstream oss;
			oss << "Failed to set i/o buffer: " << strerror(e) << " (" << e << ")";
			fErrorMsg = oss.str();
			logMessage(fErrorMsg, __LINE__);

			// not throwing an exception now.
		}

		// add to set for remove after abort
		addToDirSet(fileName, false);

		// do a fseek will show the right size, but will not actually allocate the continuous block.
		// do write 4k block till file size.
		char buf[PRE_ALLOC_SIZE] = {1};
		size_t nmemb = size / PRE_ALLOC_SIZE;
		while (nmemb-- > 0)
		{
			errno = 0;
			size_t n = fwrite(buf, PRE_ALLOC_SIZE, 1, fNewFilePtr);
			if (n != 1)
			{
				int e = errno;
				ostringstream oss;
				oss << "Fail to preallocate file: " << strerror(e) << " (" << e << ")";
				fErrorMsg = oss.str();
				fErrorCode = RED_EC_FWRITE_FAIL;
				logMessage(fErrorMsg, __LINE__);
				throw runtime_error(fErrorMsg);
			}
		}

		// move back to beging to write real data
		fflush(fNewFilePtr);
		rewind(fNewFilePtr);
	}
	catch (const std::exception& ex)
	{
		// NACK
		size = -1;
		logMessage(ex.what(), __LINE__);
	}
	catch (...)
	{
		// NACK
		size = -1;
	}

	// ack file size
	fMsgHeader.messageId = RED_DATA_ACK;
	fBs.restart();
	fBs << (ByteStream::byte) WriteEngine::WE_SVR_REDISTRIBUTE;  // dummy, keep for now.
	fBs.append((const ByteStream::byte*) &fMsgHeader, sizeof(fMsgHeader));
	fBs << size;
	fIOSocket.write(fBs);

	// reset to count the data received
	size = 0;
	sbs.reset();
}