Example #1
0
// Collect stats for one HDFS file and record it in this bucket.
// Samples the file (unless sampling is disabled or the per-bucket
// sample quota is exhausted), inserts the new HHDFSFileStats at the
// requested position (or appends when pos == NULL_COLL_INDEX), and
// folds the per-file numbers into the bucket totals.
// Returns the result of HHDFSFileStats::populate().
NABoolean HHDFSBucketStats::addFile(hdfsFS fs, hdfsFileInfo *fileInfo, 
                                    NABoolean doEstimate, 
                                    char recordTerminator,
                                    NABoolean isSequenceFile,
                                    CollIndex pos)
{
  HHDFSFileStats *newFileStats = new(heap_) HHDFSFileStats(heap_);

  // Cap the number of sampled files per bucket: estimation is
  // expensive and a handful of samples is representative enough.
  if (scount_ > 10)
    doEstimate = FALSE;

  Int32 numSampledRecs = 0;

  const NABoolean populateOk =
    newFileStats->populate(fs, fileInfo, numSampledRecs,
                           doEstimate, recordTerminator, isSequenceFile);

  if (numSampledRecs > 0)
    scount_++;

  // Place the entry at the caller-specified slot, or append when no
  // position was given.
  if (pos != NULL_COLL_INDEX)
    fileStatsList_.insertAt(pos, newFileStats);
  else
    fileStatsList_.insert(newFileStats);

  // Aggregate this file's stats into the bucket-level totals.
  add(newFileStats);

  return populateOk;
}
Example #2
0
// Collect stats for one HDFS file and record it in this bucket,
// reporting any problems through diags instead of a return value.
// On success the new HHDFSFileStats is inserted at the requested
// position (appended when pos == NULL_COLL_INDEX) and aggregated
// into the bucket totals; on failure nothing is recorded.
void HHDFSBucketStats::addFile(hdfsFS fs, hdfsFileInfo *fileInfo, 
                               HHDFSDiags &diags,
                               NABoolean doEstimate, 
                               char recordTerminator,
                               CollIndex pos)
{
  HHDFSFileStats *newFileStats = new(heap_) HHDFSFileStats(heap_, getTable());

  // Cap the number of sampled files per bucket: estimation is
  // expensive and a handful of samples is representative enough.
  if (scount_ > 10)
    doEstimate = FALSE;

  Int32 numSampledRecs = 0;

  newFileStats->populate(fs, fileInfo, numSampledRecs, diags,
                         doEstimate, recordTerminator);

  // Only record the file when populate() succeeded.
  // NOTE(review): on failure newFileStats stays allocated on heap_ —
  // presumably reclaimed when the heap is destroyed; confirm.
  if (!diags.isSuccess())
    return;

  if (numSampledRecs > 0)
    scount_++;

  if (pos != NULL_COLL_INDEX)
    fileStatsList_.insertAt(pos, newFileStats);
  else
    fileStatsList_.insert(newFileStats);

  // Aggregate this file's stats into the bucket-level totals.
  add(newFileStats);
}
Example #3
0
// Re-list the partition directory in HDFS and check whether the cached
// stats still match what is on disk. With refresh==FALSE this is a pure
// validation: any discrepancy (new file, changed file, deleted file)
// makes it return FALSE without modifying this object. With
// refresh==TRUE it additionally tries to bring the cached stats up to
// date in place (re-reading changed files, adding new ones); it still
// returns FALSE when an in-place refresh is not possible (e.g. a file
// was deleted), signalling the caller to rebuild from scratch.
//
// Assumes hdfsListDirectory() returns the files sorted by file name,
// in the same order in which they were originally added per bucket.
NABoolean HHDFSListPartitionStats::validateAndRefresh(hdfsFS fs, NABoolean refresh)
{
  NABoolean result = TRUE;

  // assume we get the files sorted by file name
  int numFiles = 0;
  Int32 lastBucketNum = -1;
  ARRAY(Int32) fileNumInBucket(getLastValidBucketIndx()+1);
  HHDFSBucketStats *bucketStats = NULL;

  // per-bucket cursor: index of the last file stats entry consumed,
  // -1 means none consumed yet
  for (CollIndex i=0; i<=getLastValidBucketIndx(); i++)
    fileNumInBucket.insertAt(i, (Int32) -1);

  // list the partition directory (non-recursive)
  hdfsFileInfo *fileInfos = hdfsListDirectory(fs,
                                              partitionDir_.data(),
                                              &numFiles);

  // walk the actual files and compare against the cached stats
  for (int f=0; f<numFiles && result; f++)
    if (fileInfos[f].mKind == kObjectKindFile)
      {
        Int32 bucketNum = determineBucketNum(fileInfos[f].mName);

        if (bucketNum != lastBucketNum)
          {
            if (! bucketStatsList_.used(bucketNum))
              {
                // first file for a new bucket got added
                if (!refresh)
                  {
                    // Bug fix: this used to "return FALSE" directly,
                    // leaking the fileInfos array returned by
                    // hdfsListDirectory(). Fall through to the common
                    // cleanup path instead.
                    result = FALSE;
                    break;
                  }
                bucketStats = new(heap_) HHDFSBucketStats(heap_);
                bucketStatsList_.insertAt(bucketNum, bucketStats);
              }
            else
              bucketStats = bucketStatsList_[bucketNum];
            lastBucketNum = bucketNum;
          }

        // file stats for an existing file, or NULL
        // for a new file
        HHDFSFileStats *fileStats = NULL;
        // position in bucketStats of the file (existing or new)
        fileNumInBucket[bucketNum] = fileNumInBucket[bucketNum] + 1;

        if (fileNumInBucket[bucketNum] < bucketStats->entries())
          fileStats = (*bucketStats)[fileNumInBucket[bucketNum]];
        // else this is a new file, indicated by fileStats==NULL

        if (fileStats &&
            fileStats->getFileName() == fileInfos[f].mName)
          {
            // file still exists, check modification timestamp and size
            if (fileStats->getModificationTS() !=
                fileInfos[f].mLastMod ||
                fileStats->getTotalSize() !=
                (Int64) fileInfos[f].mSize)
              {
                if (refresh)
                  {
                    // redo this file, it changed: remove its
                    // contribution from the partition totals and from
                    // the bucket, then re-add it below as a new file
                    subtract(fileStats);
                    bucketStats->removeAt(fileNumInBucket[bucketNum]);
                    fileStats = NULL;
                  }
                else
                  result = FALSE;
              }
            // else this file is unchanged from last time
          } // file name matches
        else
          {
            if (refresh)
              {
                if (fileStats)
                  {
                    // We are looking at a file in the directory, fileInfos[f]
                    // and at a file stats entry, with names that do not match.
                    // This could be because a new file got inserted or because
                    // the file of our file stats entry got deleted or both.
                    // We can only refresh this object in the first case, if
                    // a file got deleted we will return FALSE and not refresh.

                    // check whether fileStats got deleted,
                    // search for fileStats->getFileName() in the directory
                    int f2;
                    for (f2=f+1; f2<numFiles; f2++)
                      if (fileStats->getFileName() == fileInfos[f2].mName)
                        break;

                    if (f2<numFiles)
                      {
                        // file fileInfos[f] got added, don't consume
                        // a FileStats entry, instead add it below
                        fileStats = NULL;
                      }
                    else
                      {
                        // file fileStats->getFileName() got deleted,
                        // it's gone from the HDFS directory,
                        // give up and redo the whole thing
                        result = FALSE;
                      }
                  }
                // else file was inserted (fileStats is NULL)
              }
            else
              result = FALSE;
          } // file names for HHDFSFileStats and directory don't match

        if (result && !fileStats)
          {
            // add this file (new or changed) to the bucket stats
            if (! bucketStats->addFile(fs,
                                       &fileInfos[f],
                                       doEstimation_,
                                       recordTerminator_,
                                       isSequenceFile_,
                                       fileNumInBucket[bucketNum]))
              result = FALSE;
            // NOTE(review): the partition totals are updated even when
            // addFile() just failed; harmless today because result is
            // FALSE and the caller rebuilds, but worth confirming.
            add((*bucketStats)[fileNumInBucket[bucketNum]]);
          }
      } // loop over actual files in the directory

  // single cleanup path for the directory listing; guard against a
  // NULL return from hdfsListDirectory (error or empty directory)
  if (fileInfos)
    hdfsFreeFileInfo(fileInfos, numFiles);

  // check for file stats that we did not visit at the end of each bucket
  for (CollIndex i=0; i<=getLastValidBucketIndx() && result; i++)
    if (bucketStatsList_.used(i) &&
        bucketStatsList_[i]->entries() != fileNumInBucket[i] + 1)
      result = FALSE; // some files got deleted at the end

  return result;
}