// Add statistics for one HDFS file to this bucket.
// Sampling (doEstimate) is suppressed once more than 10 files of this
// bucket have already contributed sampled records.
// Returns the result of HHDFSFileStats::populate(); note that the new
// file stats entry is inserted into fileStatsList_ regardless of that
// result (callers such as validateAndRefresh() rely on the entry
// being present at the requested position).
NABoolean HHDFSBucketStats::addFile(hdfsFS fs,
                                    hdfsFileInfo *fileInfo,
                                    NABoolean doEstimate,
                                    char recordTerminator,
                                    NABoolean isSequenceFile,
                                    CollIndex pos)
{
  HHDFSFileStats *newFileStats = new(heap_) HHDFSFileStats(heap_);

  // enough files of this bucket sampled already, stop estimating
  if (scount_ > 10)
    doEstimate = FALSE;

  Int32 numSampledRecords = 0;
  NABoolean ok = newFileStats->populate(fs, fileInfo, numSampledRecords,
                                        doEstimate, recordTerminator,
                                        isSequenceFile);

  if (numSampledRecords > 0)
    scount_++;

  // append at the end, unless the caller asked for a specific slot
  if (pos == NULL_COLL_INDEX)
    fileStatsList_.insert(newFileStats);
  else
    fileStatsList_.insertAt(pos, newFileStats);

  // fold the per-file numbers into the bucket totals
  add(newFileStats);

  return ok;
}
// Add statistics for one HDFS file to this bucket, reporting failures
// through diags instead of a return value.
// Sampling (doEstimate) is suppressed once more than 10 files of this
// bucket have already contributed sampled records. On populate()
// failure the file is not recorded in the bucket.
// NOTE(review): on failure the freshly allocated HHDFSFileStats is not
// released here (it stays on heap_) — presumably reclaimed with the
// heap; confirm against NAHeap ownership conventions.
void HHDFSBucketStats::addFile(hdfsFS fs,
                               hdfsFileInfo *fileInfo,
                               HHDFSDiags &diags,
                               NABoolean doEstimate,
                               char recordTerminator,
                               CollIndex pos)
{
  HHDFSFileStats *newFileStats = new(heap_) HHDFSFileStats(heap_, getTable());

  // enough files of this bucket sampled already, stop estimating
  if (scount_ > 10)
    doEstimate = FALSE;

  Int32 numSampledRecords = 0;
  newFileStats->populate(fs, fileInfo, numSampledRecords, diags,
                         doEstimate, recordTerminator);

  // don't record anything for this file if populate() reported an error
  if (!diags.isSuccess())
    return;

  if (numSampledRecords > 0)
    scount_++;

  // append at the end, unless the caller asked for a specific slot
  if (pos == NULL_COLL_INDEX)
    fileStatsList_.insert(newFileStats);
  else
    fileStatsList_.insertAt(pos, newFileStats);

  // fold the per-file numbers into the bucket totals
  add(newFileStats);
}
// Re-read the partition directory and check it against the cached
// per-bucket file statistics.
//
// Returns TRUE if the cached stats are still valid (or, with
// refresh==TRUE, were successfully brought up to date in place).
// Returns FALSE if the cached stats no longer match the directory and
// could not (or were not allowed to) be refreshed incrementally; the
// caller is then expected to rebuild the stats from scratch.
//
// Assumes hdfsListDirectory() returns the files sorted by file name,
// in the same order as the entries of each HHDFSBucketStats list.
//
// Fix vs. previous version: the early "return FALSE" taken when a new
// bucket shows up and refresh is not allowed skipped
// hdfsFreeFileInfo(), leaking the directory listing. All failure
// paths now fall through to the single cleanup point.
NABoolean HHDFSListPartitionStats::validateAndRefresh(hdfsFS fs, NABoolean refresh)
{
  NABoolean result = TRUE;
  int numFiles = 0;
  Int32 lastBucketNum = -1;
  ARRAY(Int32) fileNumInBucket(getLastValidBucketIndx()+1);
  HHDFSBucketStats *bucketStats = NULL;

  // -1 means "no file of this bucket visited yet"; incremented to the
  // position of the current file within its bucket as we scan
  for (CollIndex i=0; i<=getLastValidBucketIndx(); i++)
    fileNumInBucket.insertAt(i, (Int32) -1);

  // list the actual files currently in the partition directory
  hdfsFileInfo *fileInfos =
    hdfsListDirectory(fs, partitionDir_.data(), &numFiles);

  // compare cached stats against the actual files
  for (int f=0; f<numFiles && result; f++)
    if (fileInfos[f].mKind == kObjectKindFile)
      {
        Int32 bucketNum = determineBucketNum(fileInfos[f].mName);

        if (bucketNum != lastBucketNum)
          {
            if (! bucketStatsList_.used(bucketNum))
              {
                // first file for a bucket we have no stats for yet
                if (!refresh)
                  {
                    // not allowed to refresh; fail, but fall through
                    // to the cleanup below instead of returning here
                    // (an early return leaked fileInfos)
                    result = FALSE;
                    break;
                  }
                bucketStats = new(heap_) HHDFSBucketStats(heap_);
                bucketStatsList_.insertAt(bucketNum, bucketStats);
              }
            else
              bucketStats = bucketStatsList_[bucketNum];
            lastBucketNum = bucketNum;
          }

        // file stats for an existing file, or NULL for a new file
        HHDFSFileStats *fileStats = NULL;

        // position in bucketStats of the file (existing or new)
        fileNumInBucket[bucketNum] = fileNumInBucket[bucketNum] + 1;

        if (fileNumInBucket[bucketNum] < bucketStats->entries())
          fileStats = (*bucketStats)[fileNumInBucket[bucketNum]];
        // else this is a new file, indicated by fileStats==NULL

        if (fileStats &&
            fileStats->getFileName() == fileInfos[f].mName)
          {
            // file still exists, check modification timestamp and size
            if (fileStats->getModificationTS() != fileInfos[f].mLastMod ||
                fileStats->getTotalSize() != (Int64) fileInfos[f].mSize)
              {
                if (refresh)
                  {
                    // redo this file, it changed
                    subtract(fileStats);
                    bucketStats->removeAt(fileNumInBucket[bucketNum]);
                    fileStats = NULL;
                  }
                else
                  result = FALSE;
              }
            // else this file is unchanged from last time
          } // file name matches
        else
          {
            if (refresh)
              {
                if (fileStats)
                  {
                    // We are looking at a file in the directory,
                    // fileInfos[f], and at a file stats entry, with
                    // names that do not match. This could be because a
                    // new file got inserted or because the file of our
                    // file stats entry got deleted or both. We can only
                    // refresh this object in the first case; if a file
                    // got deleted we will return FALSE and not refresh.

                    // check whether fileStats got deleted: search for
                    // fileStats->getFileName() in the directory
                    int f2;
                    for (f2=f+1; f2<numFiles; f2++)
                      if (fileStats->getFileName() == fileInfos[f2].mName)
                        break;
                    if (f2<numFiles)
                      {
                        // file fileInfos[f] got added, don't consume
                        // a FileStats entry, instead add it below
                        fileStats = NULL;
                      }
                    else
                      {
                        // file fileStats->getFileName() got deleted,
                        // it's gone from the HDFS directory,
                        // give up and redo the whole thing
                        result = FALSE;
                      }
                  }
                // else file was inserted (fileStats is NULL)
              }
            else
              result = FALSE;
          } // file names for HHDFSFileStats and directory don't match

        if (result && !fileStats)
          {
            // add this new (or changed) file at its bucket position
            if (! bucketStats->addFile(fs,
                                       &fileInfos[f],
                                       doEstimation_,
                                       recordTerminator_,
                                       isSequenceFile_,
                                       fileNumInBucket[bucketNum]))
              result = FALSE;
            // addFile inserts the entry even on failure, so this
            // lookup is safe either way
            add((*bucketStats)[fileNumInBucket[bucketNum]]);
          }
      } // loop over actual files in the directory

  // single cleanup point for the directory listing
  if (fileInfos)
    hdfsFreeFileInfo(fileInfos, numFiles);

  // check for file stats that we did not visit at the end of each bucket
  for (CollIndex i=0; i<=getLastValidBucketIndx() && result; i++)
    if (bucketStatsList_.used(i) &&
        bucketStatsList_[i]->entries() != fileNumInBucket[i] + 1)
      result = FALSE; // some files got deleted at the end

  return result;
}