// Determine which locales own the HDFS blocks covering the byte range
// [start_byte, end_byte] of 'file', and hand the caller a NULL-terminated
// array of locale names for the FIRST block in that range.
//
// file          - opaque qio hdfs file handle (unwrapped via to_hdfs_file)
// start_byte    - first byte of the range of interest
// end_byte      - last byte of the range of interest
// loc_names_out - out: NULL-terminated array of locale-name strings
//                 (ownership of the array passes to the caller)
// num_locs_out  - out: number of entries in *loc_names_out
// fs            - opaque qio hdfs filesystem handle (unwrapped via to_hdfs_fs)
//
// Returns 0 on success, or an EREMOTEIO qio error when libhdfs cannot
// report the owners for the range.
qioerr hdfs_locales_for_range(void* file, off_t start_byte, off_t end_byte,
                              const char*** loc_names_out, int* num_locs_out,
                              void* fs)
{
  int i = 0;
  int j = 0;
  char*** info = NULL;

  // hdfsGetHosts returns a NULL-terminated array of blocks; each block is a
  // NULL-terminated array of host-name strings.
  info = hdfsGetHosts(to_hdfs_fs(fs)->hfs, to_hdfs_file(file)->pathnm,
                      start_byte, end_byte);

  // Unable to get hosts for this byte range.
  if (!info || !info[0]) {
    *num_locs_out = 0;
    hdfsFreeHosts(info);
    QIO_RETURN_CONSTANT_ERROR(EREMOTEIO, "Unable to get owners for byterange");
  }

  // Translate each host name of the first block into a locale name, in place.
  while (info[0][i]) {
    info[0][i] = get_locale_name(info[0][i]);
    i++;
  }

  // BUG FIX: the loop above stops on the NULL terminator, so 'i' already IS
  // the entry count. The previous 'i - 1' silently dropped the last host and
  // reported -1 for a non-NULL but empty host list.
  *num_locs_out = i;
  *loc_names_out = (const char**)info[0];

  // Free the host lists of the remaining blocks; only block 0 is handed back.
  for (i = 1; info[i]; i++) {
    for (j = 0; info[i][j]; j++)
      qio_free(info[i][j]);
    qio_free(info[i]);
  }

  return 0;
}
// Populate this file-stats object from a libhdfs hdfsFileInfo entry.
//
// What it does (all grounded in the body below):
//  - Copies name/replication/size/block size/modification time from fileInfo.
//  - Optionally (doEstimation) opens the file and scans up to 10 record
//    terminators in block-sized reads to estimate bytes-per-row, accumulating
//    into sampledBytes_/sampledRows_ and the caller's 'samples' counter.
//  - Computes numBlocks_ (rounding up for a trailing partial block).
//  - Builds blockHosts_, a replication_ x numBlocks_ table (laid out
//    host-major: index r*numBlocks_+blockNum) of HostIds per block replica,
//    using hdfsGetHosts for each block.
//
// Parameters:
//  fs               - open libhdfs filesystem handle
//  fileInfo         - libhdfs metadata for the file to describe
//  samples          - in/out: incremented by the number of records sampled
//  doEstimation     - when TRUE, sample the file to estimate record length
//  recordTerminator - byte that ends a record (e.g. '\n') for sampling
//  isSequenceFile   - stored into isSequenceFile_ as-is
//
// Returns TRUE on success; FALSE when blockSize_ is 0 or hdfsGetHosts fails
// (both paths also hit CMPASSERT first).
NABoolean HHDFSFileStats::populate(hdfsFS fs, hdfsFileInfo *fileInfo,
                                   Int32& samples,
                                   NABoolean doEstimation,
                                   char recordTerminator,
                                   NABoolean isSequenceFile)
{
  NABoolean result = TRUE;

  // copy fields from fileInfo
  fileName_       = fileInfo->mName;
  replication_    = (Int32) fileInfo->mReplication;
  totalSize_      = (Int64) fileInfo->mSize;
  blockSize_      = (Int64) fileInfo->mBlockSize;
  modificationTS_ = fileInfo->mLastMod;
  numFiles_       = 1;

  isSequenceFile_ = isSequenceFile;

  // Sample buffer: at most one block, at most 64KB, at most 10% of the file.
  Int64 sampleBufferSize = MINOF(blockSize_, 65536);
  NABoolean sortHosts = (CmpCommon::getDefault(HIVE_SORT_HDFS_HOSTS) == DF_ON);

  sampleBufferSize = MINOF(sampleBufferSize,totalSize_/10);

  // Skip sampling for tiny files (<= ~1KB, since sampleBufferSize is already
  // capped at totalSize_/10).
  if (doEstimation && sampleBufferSize > 100) {

    //
    // Open the hdfs file to estimate record length. Read one block at
    // a time searching for <s> instances of record separators. Stop reading
    // when either <s> instances have been found or a partial number of
    // instances have and we have exhausted all data content in the block.
    // We will keep reading if the current block does not contain
    // any instance of the record separator.
    //
    hdfsFile file =
               hdfsOpenFile(fs, fileInfo->mName,
                            O_RDONLY,
                            sampleBufferSize, // buffer size
                            0, // replication, take the default size
                            fileInfo->mBlockSize // blocksize
                            );

    if ( file != NULL ) {
      tOffset offset = 0;
      tSize bufLen = sampleBufferSize;
      char* buffer = new (heap_) char[bufLen+1];

      buffer[bufLen] = 0; // extra null at the end to protect strchr()
                          // to run over the buffer.

      NABoolean sampleDone = FALSE;

      Int32 totalSamples = 10;   // stop after this many records
      Int32 totalLen     = 0;    // accumulated sampled record bytes

      while (!sampleDone) {

        tSize szRead = hdfsPread(fs, file, offset, buffer, bufLen);
        char* pos = NULL;

        if ( szRead > 0 ) {
          // NOTE(review): dead code kept from an earlier sequence-file probe.
          //if (isSequenceFile && offset==0 && memcmp(buffer, "SEQ6", 4) == 0)
          //  isSequenceFile_ = TRUE;

          char* start = buffer;

          // Scan the buffer for up to totalSamples record terminators.
          for (Int32 i=0; i<totalSamples; i++ ) {

            if ( (pos=strchr(start, recordTerminator)) ) {

              // NOTE(review): adding 'offset' once per record looks odd —
              // it inflates totalLen by offset*samples whenever the first
              // terminator is found past the first read window; confirm
              // whether '+ offset' was meant to apply only once.
              totalLen += pos - start + 1 + offset;
              samples++;

              start = pos+1;

              // NOTE(review): '>' lets start == buffer+bufLen (the guard
              // NUL) through for one more strchr; likely benign, but '>='
              // may have been intended.
              if ( start > buffer + bufLen ) {
                sampleDone = TRUE;
                break;
              }
            } else
               break;
          }

          // Any records found in this window end the sampling; otherwise
          // advance to the next window and retry.
          if ( samples > 0 )
            break;
          else
            offset += bufLen;

        } else
          break; // fail to read any bytes. Bail out.
      }

      NADELETEBASIC(buffer, heap_);

      if ( samples > 0 ) {
        sampledBytes_ += totalLen;
        sampledRows_  += samples;
      }

      hdfsCloseFile(fs, file);
    } else {
      // can not do hdfs open on the file. Assume the file is empty.
      // NOTE(review): the open failure is silently ignored; sampling stats
      // simply stay at zero for this file.
    }
  }

  if (blockSize_) {
    numBlocks_ = totalSize_ / blockSize_;
    if (totalSize_ % blockSize_ > 0)
      numBlocks_++; // partial block at the end
  } else {
    // Zero block size would divide by zero above; assert and fail the call.
    CMPASSERT(blockSize_); // TBD:DIAGS
    result = FALSE;
  }

  if ( totalSize_ > 0 ) {
    // Host-major table: entry for replica r of block b lives at
    // blockHosts_[r*numBlocks_ + b].
    blockHosts_ = new(heap_) HostId[replication_*numBlocks_];

    // walk through blocks and record their locations
    tOffset o = 0;
    Int64 blockNum;

    for (blockNum=0; blockNum < numBlocks_ && result; blockNum++) {
      // Ask for the hosts of the single block starting at offset o.
      char*** blockHostNames = hdfsGetHosts(fs,
                                            fileInfo->mName,
                                            o,
                                            fileInfo->mBlockSize);
      o += blockSize_;

      if (blockHostNames == NULL) {
        CMPASSERT(blockHostNames); // TBD:DIAGS
        result = FALSE;
      } else {
        char **h = *blockHostNames;
        HostId hostId;

        // NOTE(review): this indexes h[0..replication_-1] unconditionally;
        // hdfsGetHosts returns a NULL-terminated list, so if fewer than
        // replication_ hosts are reported, h[r] past the terminator may be
        // read — confirm libhdfs allocates replication-sized arrays here.
        for (Int32 r=0; r<replication_; r++) {
          if (h[r])
            hostId = HHDFSMasterHostList::getHostNum(h[r]);
          else
            hostId = HHDFSMasterHostList::InvalidHostId;
          blockHosts_[r*numBlocks_+blockNum] = hostId;
        }

        // NOTE(review): sorting the whole table once per block is O(blocks^2)
        // work; presumably idempotent, but hoisting it after the loop would
        // be cheaper — confirm sortHostArray's intent.
        if (sortHosts)
          sortHostArray(blockHosts_,
                        (Int32) numBlocks_,
                        replication_,
                        getFileName());
      }
      hdfsFreeHosts(blockHostNames);
    }
  }
  return result;
}