// Populate statistics for one list partition directory.
//
// Lists all files directly under <dir>; every file belongs to this
// partition and is assigned either to a specific bucket (for bucketed
// tables) or to the default bucket defaultBucketIdx_.
//
// Parameters:
//   fs               - connected HDFS handle
//   dir              - HDFS path of the partition directory
//   numOfBuckets     - number of buckets; values < 1 mean unbucketed
//   doEstimation     - whether to sample files to estimate row counts
//   recordTerminator - record delimiter used when sampling
//   isSequenceFile   - TRUE if the files are SequenceFiles
// Returns: TRUE on success, FALSE if the directory could not be
//          listed or any file could not be processed.
NABoolean HHDFSListPartitionStats::populate(hdfsFS fs,
                                            const NAString &dir,
                                            Int32 numOfBuckets,
                                            NABoolean doEstimation,
                                            char recordTerminator,
                                            NABoolean isSequenceFile)
{
  NABoolean result = TRUE;
  int numFiles = 0;

  // remember parameters
  partitionDir_     = dir;
  defaultBucketIdx_ = (numOfBuckets >= 1) ? numOfBuckets : 0;
  doEstimation_     = doEstimation;
  recordTerminator_ = recordTerminator;
  isSequenceFile_   = isSequenceFile;

  // list all the files in this directory, they all belong
  // to this partition and either belong to a specific bucket
  // or to the default bucket
  hdfsFileInfo *fileInfos = hdfsListDirectory(fs,
                                              dir.data(),
                                              &numFiles);

  // hdfsListDirectory() returns NULL on error (e.g. lacking
  // permissions); previously this was silently treated as an empty
  // directory and NULL was handed to hdfsFreeFileInfo()
  if (fileInfos)
    {
      // populate partition stats
      for (int f=0; f<numFiles && result; f++)
        if (fileInfos[f].mKind == kObjectKindFile)
          {
            // the default (unbucketed) bucket number is
            // defaultBucketIdx_
            Int32 bucketNum = determineBucketNum(fileInfos[f].mName);
            HHDFSBucketStats *bucketStats = NULL;

            if (! bucketStatsList_.used(bucketNum))
              {
                bucketStats = new(heap_) HHDFSBucketStats(heap_);
                bucketStatsList_.insertAt(bucketNum, bucketStats);
              }
            else
              bucketStats = bucketStatsList_[bucketNum];

            if (! bucketStats->addFile(fs,
                                       &fileInfos[f],
                                       doEstimation,
                                       recordTerminator,
                                       isSequenceFile))
              result = FALSE;
          }

      hdfsFreeFileInfo(fileInfos, numFiles);
    }
  else
    result = FALSE; // directory listing failed

  // aggregate statistics over all buckets
  for (Int32 b=0; b<=defaultBucketIdx_; b++)
    if (bucketStatsList_.used(b))
      add(bucketStatsList_[b]);

  return result;
}
// Check whether the cached statistics for this partition still match
// the current contents of the HDFS directory and, if requested,
// refresh them in place.
//
// Relies on hdfsListDirectory() returning the files sorted by name,
// i.e. in the same order in which they were originally added to the
// per-bucket file stats lists.
//
// Parameters:
//   fs      - connected HDFS handle
//   refresh - TRUE:  update cached stats for changed/added files
//             FALSE: only validate, do not modify this object
// Returns: TRUE if the stats are (now) up to date, FALSE if they are
//          stale and could not be refreshed (caller must repopulate).
NABoolean HHDFSListPartitionStats::validateAndRefresh(hdfsFS fs, NABoolean refresh)
{
  NABoolean result = TRUE;
  // assume we get the files sorted by file name
  int numFiles = 0;
  Int32 lastBucketNum = -1;
  ARRAY(Int32) fileNumInBucket(getLastValidBucketIndx()+1);
  HHDFSBucketStats *bucketStats = NULL;

  // -1 means "no file of this bucket visited yet"
  for (CollIndex i=0; i<=getLastValidBucketIndx(); i++)
    fileNumInBucket.insertAt(i, (Int32) -1);

  // list the current contents of the partition directory
  hdfsFileInfo *fileInfos = hdfsListDirectory(fs,
                                              partitionDir_.data(),
                                              &numFiles);

  // if the directory can no longer be listed, the cached stats
  // are stale and cannot be refreshed here
  if (fileInfos == NULL)
    return FALSE;

  // populate partition stats
  for (int f=0; f<numFiles && result; f++)
    if (fileInfos[f].mKind == kObjectKindFile)
      {
        Int32 bucketNum = determineBucketNum(fileInfos[f].mName);

        if (bucketNum != lastBucketNum)
          {
            if (! bucketStatsList_.used(bucketNum))
              {
                // first file for a new bucket got added
                if (!refresh)
                  {
                    // bug fix: free the directory listing before the
                    // early return (it was leaked here before)
                    hdfsFreeFileInfo(fileInfos, numFiles);
                    return FALSE;
                  }
                bucketStats = new(heap_) HHDFSBucketStats(heap_);
                bucketStatsList_.insertAt(bucketNum, bucketStats);
              }
            else
              bucketStats = bucketStatsList_[bucketNum];
            lastBucketNum = bucketNum;
          }

        // file stats for an existing file, or NULL
        // for a new file
        HHDFSFileStats *fileStats = NULL;

        // position in bucketStats of the file (existing or new)
        fileNumInBucket[bucketNum] = fileNumInBucket[bucketNum] + 1;

        if (fileNumInBucket[bucketNum] < bucketStats->entries())
          fileStats = (*bucketStats)[fileNumInBucket[bucketNum]];
        // else this is a new file, indicated by fileStats==NULL

        if (fileStats &&
            fileStats->getFileName() == fileInfos[f].mName)
          {
            // file still exists, check modification timestamp
            if (fileStats->getModificationTS() != fileInfos[f].mLastMod ||
                fileStats->getTotalSize() != (Int64) fileInfos[f].mSize)
              {
                if (refresh)
                  {
                    // redo this file, it changed
                    subtract(fileStats);
                    bucketStats->removeAt(fileNumInBucket[bucketNum]);
                    fileStats = NULL;
                  }
                else
                  result = FALSE;
              }
            // else this file is unchanged from last time
          } // file name matches
        else
          {
            if (refresh)
              {
                if (fileStats)
                  {
                    // We are looking at a file in the directory, fileInfos[f]
                    // and at a file stats entry, with names that do not match.
                    // This could be because a new file got inserted or because
                    // the file of our file stats entry got deleted or both.
                    // We can only refresh this object in the first case, if
                    // a file got deleted we will return FALSE and not refresh.

                    // check whether fileStats got deleted,
                    // search for fileStats->getFileName() in the directory
                    int f2;
                    for (f2=f+1; f2<numFiles; f2++)
                      if (fileStats->getFileName() == fileInfos[f2].mName)
                        break;

                    if (f2<numFiles)
                      {
                        // file fileInfos[f] got added, don't consume
                        // a FileStats entry, instead add it below
                        fileStats = NULL;
                      }
                    else
                      {
                        // file fileStats->getFileName() got deleted,
                        // it's gone from the HDFS directory,
                        // give up and redo the whole thing
                        result = FALSE;
                      }
                  }
                // else file was inserted (fileStats is NULL)
              }
            else
              result = FALSE;
          } // file names for HHDFSFileStats and directory don't match

        if (result && !fileStats)
          {
            // add this file
            if (! bucketStats->addFile(fs,
                                       &fileInfos[f],
                                       doEstimation_,
                                       recordTerminator_,
                                       isSequenceFile_,
                                       fileNumInBucket[bucketNum]))
              result = FALSE;
            add((*bucketStats)[fileNumInBucket[bucketNum]]);
          }
      } // loop over actual files in the directory

  hdfsFreeFileInfo(fileInfos, numFiles);

  // check for file stats that we did not visit at the end of each bucket
  for (CollIndex i=0; i<=getLastValidBucketIndx() && result; i++)
    if (bucketStatsList_.used(i) &&
        bucketStatsList_[i]->entries() != fileNumInBucket[i] + 1)
      result = FALSE; // some files got deleted at the end

  return result;
}
void HHDFSListPartitionStats::populate(hdfsFS fs, const NAString &dir, Int32 numOfBuckets, HHDFSDiags &diags, NABoolean doEstimation, char recordTerminator) { int numFiles = 0; // remember parameters partitionDir_ = dir; defaultBucketIdx_ = (numOfBuckets >= 1) ? numOfBuckets : 0; doEstimation_ = doEstimation; recordTerminator_ = recordTerminator; // to avoid a crash, due to lacking permissions, check the directory // itself first hdfsFileInfo *dirInfo = hdfsGetPathInfo(fs, dir.data()); if (!dirInfo) { diags.recordError(NAString("Could not access HDFS directory ") + dir, "HHDFSListPartitionStats::populate"); } else { dirInfo_ = *dirInfo; // list all the files in this directory, they all belong // to this partition and either belong to a specific bucket // or to the default bucket hdfsFileInfo *fileInfos = hdfsListDirectory(fs, dir.data(), &numFiles); // populate partition stats for (int f=0; f<numFiles && diags.isSuccess(); f++) if (fileInfos[f].mKind == kObjectKindFile) { // the default (unbucketed) bucket number is // defaultBucketIdx_ Int32 bucketNum = determineBucketNum(fileInfos[f].mName); HHDFSBucketStats *bucketStats = NULL; if (! bucketStatsList_.used(bucketNum)) { bucketStats = new(heap_) HHDFSBucketStats(heap_, getTable()); bucketStatsList_.insertAt(bucketNum, bucketStats); } else bucketStats = bucketStatsList_[bucketNum]; bucketStats->addFile(fs, &fileInfos[f], diags, doEstimation, recordTerminator); } hdfsFreeFileInfo(fileInfos, numFiles); hdfsFreeFileInfo(dirInfo,1); // aggregate statistics over all buckets for (Int32 b=0; b<=defaultBucketIdx_; b++) if (bucketStatsList_.used(b)) add(bucketStatsList_[b]); } }