Example #1
0
// Populate the statistics of one list partition: remember the partition
// parameters, list the partition's HDFS directory, and collect per-bucket
// file statistics, then aggregate them into this object.
//
// fs               : connected HDFS file system handle
// dir              : HDFS path of the partition directory
// numOfBuckets     : number of buckets for a bucketed table
//                    (values < 1 mean "not bucketed")
// doEstimation     : if TRUE, addFile() samples files to estimate stats
// recordTerminator : record terminator passed on to addFile()
// isSequenceFile   : TRUE if the files are in sequence file format
//
// Returns TRUE on success, FALSE if stats for some file could not be
// collected (addFile() failed).
NABoolean HHDFSListPartitionStats::populate(hdfsFS fs,
                                            const NAString &dir,
                                            Int32 numOfBuckets, 
                                            NABoolean doEstimation,
                                            char recordTerminator,
                                            NABoolean isSequenceFile)
{
  NABoolean result = TRUE;
  int numFiles = 0;

  // remember parameters
  partitionDir_     = dir;
  defaultBucketIdx_ = (numOfBuckets >= 1) ? numOfBuckets : 0;
  doEstimation_     = doEstimation;
  recordTerminator_ = recordTerminator;
  isSequenceFile_   = isSequenceFile;

  // list all the files in this directory, they all belong
  // to this partition and either belong to a specific bucket
  // or to the default bucket
  hdfsFileInfo *fileInfos = hdfsListDirectory(fs,
                                              dir.data(),
                                              &numFiles);

  // hdfsListDirectory() returns NULL on error and also for an empty
  // directory; guard against dereferencing or freeing a NULL array
  // (the original code passed NULL to hdfsFreeFileInfo() unguarded)
  if (fileInfos)
    {
      // populate partition stats
      for (int f=0; f<numFiles && result; f++)
        if (fileInfos[f].mKind == kObjectKindFile)
          {
            // the default (unbucketed) bucket number is
            // defaultBucketIdx_
            Int32 bucketNum = determineBucketNum(fileInfos[f].mName);
            HHDFSBucketStats *bucketStats = NULL;

            if (! bucketStatsList_.used(bucketNum))
              {
                // first file seen for this bucket
                bucketStats = new(heap_) HHDFSBucketStats(heap_);
                bucketStatsList_.insertAt(bucketNum, bucketStats);
              }
            else
              bucketStats = bucketStatsList_[bucketNum];

            if (! bucketStats->addFile(fs, &fileInfos[f], doEstimation, recordTerminator, isSequenceFile))
              result = FALSE;
          }

      hdfsFreeFileInfo(fileInfos, numFiles);
    }
  // else: numFiles stays 0 and the partition is treated as empty

  // aggregate statistics over all buckets
  for (Int32 b=0; b<=defaultBucketIdx_; b++)
    if (bucketStatsList_.used(b))
      add(bucketStatsList_[b]);

  return result;
}
Example #2
0
// Check whether the cached per-bucket/per-file statistics of this
// partition still match the current contents of its HDFS directory.
// If "refresh" is TRUE, simple changes are folded into the cached
// stats in place: modified files are re-read, newly added files are
// added. Deleted files (and, with refresh==FALSE, any change at all)
// cause a return of FALSE, telling the caller to rebuild the stats
// from scratch. Returns TRUE if the cached stats are (now) current.
NABoolean HHDFSListPartitionStats::validateAndRefresh(hdfsFS fs, NABoolean refresh)
{
  NABoolean result = TRUE;

  // assume we get the files sorted by file name
  int numFiles = 0;
  Int32 lastBucketNum = -1;
  // per-bucket cursor: index of the file stats entry most recently
  // matched (or created) in that bucket's HHDFSBucketStats list
  ARRAY(Int32) fileNumInBucket(getLastValidBucketIndx()+1);
  HHDFSBucketStats *bucketStats = NULL;

  // start each cursor before the first entry of its bucket
  for (CollIndex i=0; i<=getLastValidBucketIndx(); i++)
    fileNumInBucket.insertAt(i, (Int32) -1);

  // list the current files of the partition directory
  // (NOTE(review): original comment mentioned a recursive
  // processDirectory() call, but this method only lists one level)
  hdfsFileInfo *fileInfos = hdfsListDirectory(fs,
                                              partitionDir_.data(),
                                              &numFiles);

  // walk the actual directory entries and compare them, bucket by
  // bucket and in file-name order, against the cached file stats
  for (int f=0; f<numFiles && result; f++)
    if (fileInfos[f].mKind == kObjectKindFile)
      {
        Int32 bucketNum = determineBucketNum(fileInfos[f].mName);

        // look up the bucket's stats only when the bucket changes;
        // consecutive files of the same bucket reuse bucketStats
        if (bucketNum != lastBucketNum)
          {
            if (! bucketStatsList_.used(bucketNum))
              {
                // first file for a new bucket got added
                if (!refresh)
                  return FALSE;
                bucketStats = new(heap_) HHDFSBucketStats(heap_);
                bucketStatsList_.insertAt(bucketNum, bucketStats);
              }
            else
              bucketStats = bucketStatsList_[bucketNum];
            lastBucketNum = bucketNum;
          }

        // file stats for an existing file, or NULL
        // for a new file
        HHDFSFileStats *fileStats = NULL;
        // position in bucketStats of the file (existing or new)
        fileNumInBucket[bucketNum] = fileNumInBucket[bucketNum] + 1;

        if (fileNumInBucket[bucketNum] < bucketStats->entries())
          fileStats = (*bucketStats)[fileNumInBucket[bucketNum]];
        // else this is a new file, indicated by fileStats==NULL

        if (fileStats &&
            fileStats->getFileName() == fileInfos[f].mName)
          {
            // file still exists, check modification timestamp
            // and size to detect in-place changes
            if (fileStats->getModificationTS() !=
                fileInfos[f].mLastMod ||
                fileStats->getTotalSize() !=
                (Int64) fileInfos[f].mSize)
              {
                if (refresh)
                  {
                    // redo this file, it changed: back its numbers out
                    // of the partition aggregate and drop its entry, so
                    // the addFile() below re-reads it at the same slot
                    subtract(fileStats);
                    bucketStats->removeAt(fileNumInBucket[bucketNum]);
                    fileStats = NULL;
                  }
                else
                  result = FALSE;
              }
            // else this file is unchanged from last time
          } // file name matches
        else
          {
            if (refresh)
              {
                if (fileStats)
                  {
                    // We are looking at a file in the directory, fileInfos[f]
                    // and at a file stats entry, with names that do not match.
                    // This could be because a new file got inserted or because
                    // the file of our file stats entry got deleted or both.
                    // We can only refresh this object in the first case, if
                    // a file got deleted we will return FALSE and not refresh.

                    // check whether fileStats got deleted,
                    // search for fileStats->getFileName() in the directory
                    int f2;
                    for (f2=f+1; f2<numFiles; f2++)
                      if (fileStats->getFileName() == fileInfos[f2].mName)
                        break;

                    if (f2<numFiles)
                      {
                        // file fileInfos[f] got added, don't consume
                        // a FileStats entry, instead add it below
                        fileStats = NULL;
                      }
                    else
                      {
                        // file fileStats->getFileName() got deleted,
                        // it's gone from the HDFS directory,
                        // give up and redo the whole thing
                        result = FALSE;
                      }
                  }
                // else file was inserted (fileStats is NULL)
              }
            else
              result = FALSE;
          } // file names for HHDFSFileStats and directory don't match

        if (result && !fileStats)
          {
            // add this file (new, or changed and removed above) at the
            // cursor position, then fold its stats into the aggregate
            // NOTE(review): add() is called even if addFile() returned
            // FALSE — assumes addFile() still inserted an entry at that
            // position on failure; confirm against HHDFSBucketStats
            if (! bucketStats->addFile(fs,
                                       &fileInfos[f],
                                       doEstimation_,
                                       recordTerminator_,
                                       isSequenceFile_,
                                       fileNumInBucket[bucketNum]))
              result = FALSE;
            add((*bucketStats)[fileNumInBucket[bucketNum]]);
          }
      } // loop over actual files in the directory

  hdfsFreeFileInfo(fileInfos, numFiles);

  // check for file stats that we did not visit at the end of each bucket
  // (cached entries beyond the cursor mean files got deleted at the end)
  for (CollIndex i=0; i<=getLastValidBucketIndx() && result; i++)
    if (bucketStatsList_.used(i) &&
        bucketStatsList_[i]->entries() != fileNumInBucket[i] + 1)
      result = FALSE; // some files got deleted at the end

  return result;
}
Example #3
0
// Populate the statistics of one list partition: remember the partition
// parameters, verify the directory is accessible, list its files, and
// collect and aggregate per-bucket file statistics. Errors are reported
// through "diags" rather than a return value.
//
// fs               : connected HDFS file system handle
// dir              : HDFS path of the partition directory
// numOfBuckets     : number of buckets for a bucketed table
//                    (values < 1 mean "not bucketed")
// diags            : collects errors raised here or by addFile()
// doEstimation     : if TRUE, addFile() samples files to estimate stats
// recordTerminator : record terminator passed on to addFile()
void HHDFSListPartitionStats::populate(hdfsFS fs,
                                       const NAString &dir,
                                       Int32 numOfBuckets,
                                       HHDFSDiags &diags,
                                       NABoolean doEstimation,
                                       char recordTerminator)
{
  int numFiles = 0;

  // remember parameters
  partitionDir_     = dir;
  defaultBucketIdx_ = (numOfBuckets >= 1) ? numOfBuckets : 0;
  doEstimation_     = doEstimation;
  recordTerminator_ = recordTerminator;

  // to avoid a crash, due to lacking permissions, check the directory
  // itself first
  hdfsFileInfo *dirInfo = hdfsGetPathInfo(fs, dir.data());
  
  if (!dirInfo)
    {
      diags.recordError(NAString("Could not access HDFS directory ") + dir,
                        "HHDFSListPartitionStats::populate");
    }
  else
    {
      dirInfo_ = *dirInfo;

      // list all the files in this directory, they all belong
      // to this partition and either belong to a specific bucket
      // or to the default bucket
      hdfsFileInfo *fileInfos = hdfsListDirectory(fs,
                                                  dir.data(),
                                                  &numFiles);

      // hdfsListDirectory() returns NULL on error and also for an
      // empty directory; the original code dereferenced/freed the
      // array unguarded, unlike the dirInfo check above
      if (fileInfos)
        {
          // populate partition stats
          for (int f=0; f<numFiles && diags.isSuccess(); f++)
            if (fileInfos[f].mKind == kObjectKindFile)
              {
                // the default (unbucketed) bucket number is
                // defaultBucketIdx_
                Int32 bucketNum = determineBucketNum(fileInfos[f].mName);
                HHDFSBucketStats *bucketStats = NULL;

                if (! bucketStatsList_.used(bucketNum))
                  {
                    // first file seen for this bucket
                    bucketStats = new(heap_) HHDFSBucketStats(heap_, getTable());
                    bucketStatsList_.insertAt(bucketNum, bucketStats);
                  }
                else
                  bucketStats = bucketStatsList_[bucketNum];

                // addFile() records any failure in diags
                bucketStats->addFile(fs, &fileInfos[f], diags, doEstimation, recordTerminator);
              }

          hdfsFreeFileInfo(fileInfos, numFiles);
        }
      // else: numFiles stays 0 and the partition is treated as empty

      hdfsFreeFileInfo(dirInfo,1);

      // aggregate statistics over all buckets
      for (Int32 b=0; b<=defaultBucketIdx_; b++)
        if (bucketStatsList_.used(b))
          add(bucketStatsList_[b]);
    }
}