Example #1
NABoolean HHDFSTableStats::populate(struct hive_tbl_desc *htd)
{
  // here is the basic outline of how this works:
  //
  // 1. Walk the SD descriptors of the table, one for the table
  //    itself and one for each partition. Each one represents
  //    one HDFS directory with files for the table.
  // 2. For each list partition directory (or the directory of
  //    an unpartitioned table):
  //     3. Walk through every file. For every file:
  //         4. Determine the bucket number (0 if the file is not bucketed)
  //         5. Add the file to its bucket
  //         6. Walk through the blocks of the file. For every block:
  //             7. Get the host list for this block and add it
  //         8. Get file stats
  //     9. Aggregate file stats for all files and buckets
  // 10. Aggregate bucket stats for all buckets of the partition
  // 11. Aggregate partition stats for all partitions of the table
  //
  // (a standalone sketch of this traversal follows this function)

  NABoolean result = TRUE;
  struct hive_sd_desc *hsd = htd->getSDs();
  if (hsd == NULL)
    return result; // no storage descriptors, nothing to do

  tableDir_ = hsd->location_;
  numOfPartCols_ = htd->getNumOfPartCols();
  recordTerminator_ = hsd->getRecordTerminator();
  fieldTerminator_ = hsd->getFieldTerminator();
  NAString hdfsHost;
  Int32 hdfsPort = -1;
  NAString tableDir;

  while (hsd)
    {
      // split table URL into host, port and filename
      splitLocation(hsd->location_, hdfsHost, hdfsPort, tableDir);
      if (! connectHDFS(hdfsHost, hdfsPort))
        CMPASSERT(fs_); // a failed connect leaves fs_ unset, so this assert fires

      // put back fully qualified URI
      tableDir = hsd->location_;

      // visit the directory
      result = processDirectory(tableDir, hsd->buckets_, 
                                hsd->isTrulyText(), 
                                hsd->getRecordTerminator(),
                                hsd->isSequenceFile());

      hsd = hsd->next_;
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  return result;
}
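
The traversal described in the numbered outline above is easiest to see stripped of the HDFS plumbing. The following standalone sketch mirrors that bottom-up aggregation; PartitionDir, FileEntry, BlockEntry, and Stats are simplified, hypothetical stand-ins for illustration, not the actual HHDFS* classes:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Hypothetical, simplified stand-ins for the HHDFS* stats hierarchy.
struct Stats
{
  int64_t files = 0;
  int64_t bytes = 0;
  void add(const Stats &o) { files += o.files; bytes += o.bytes; }
};
struct BlockEntry { std::vector<std::string> hosts; int64_t bytes = 0; };
struct FileEntry  { int bucket = 0; std::vector<BlockEntry> blocks; };
struct PartitionDir { std::vector<FileEntry> files; };

// Bottom-up aggregation: block -> file -> bucket -> partition -> table.
Stats aggregateTable(const std::vector<PartitionDir> &partitions)
{
  Stats tableStats;
  for (const PartitionDir &part : partitions)      // step 2
    {
      std::map<int, Stats> bucketStats;            // bucket number -> stats
      for (const FileEntry &f : part.files)        // step 3
        {
          Stats fileStats;                         // step 8
          fileStats.files = 1;
          for (const BlockEntry &b : f.blocks)     // steps 6 and 7
            fileStats.bytes += b.bytes;
          bucketStats[f.bucket].add(fileStats);    // steps 4, 5 and 9
        }
      Stats partStats;
      for (const auto &bs : bucketStats)           // step 10
        partStats.add(bs.second);
      tableStats.add(partStats);                   // step 11
    }
  return tableStats;
}
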
NABoolean HHDFSTableStats::validateAndRefresh(Int64 expirationJTimestamp, NABoolean refresh)
{
  NABoolean result = TRUE;
  // initial heap allocation size
  Int32 initialSize = heap_->getAllocSize();

  diags_.reset();

  // unless a refresh was explicitly requested, skip the revalidation
  // when the stats still count as valid for the given expiration timestamp
  if (! refresh && (expirationJTimestamp == -1 ||
      (expirationJTimestamp > 0 &&
       validationJTimestamp_ < expirationJTimestamp)))
    return result; // consider the stats still valid

  // if partitions get added or deleted, that gets
  // caught in the Hive metadata, so no need to check for
  // that here
  for (int p=0; p<totalNumPartitions_ && result && diags_.isSuccess(); p++)
    {
      HHDFSListPartitionStats *partStats = listPartitionStatsList_[p];
      NAString hdfsHost;
      Int32 hdfsPort;
      NAString partDir;

      result = splitLocation(partStats->getDirName(),
                             hdfsHost,
                             hdfsPort, 
                             partDir,
                             diags_,
                             hdfsPortOverride_);
      if (! result)
        break;

      if (! connectHDFS(hdfsHost, hdfsPort))
        return FALSE;

      subtract(partStats);
      result = partStats->validateAndRefresh(fs_, diags_, refresh);
      if (result)
        add(partStats);
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();
  // account for the heap used by the stats; heap released
  // during the stats refresh is included in the delta as well
  hiveStatsSize_ += (heap_->getAllocSize() - initialSize);

  return result;
}
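
The early return above encodes the whole expiration policy in one predicate. Restated as a standalone function (the typedefs below are stand-ins for the platform typedefs), it returns true exactly when the cached stats are accepted without touching HDFS, mirroring the test in validateAndRefresh():

#include <cstdint>

typedef int64_t Int64;     // stand-in for the platform typedef
typedef bool    NABoolean; // stand-in for the platform typedef

// Mirrors the early-return test of validateAndRefresh():
//   refresh   -> never accept, always revalidate
//   -1        -> always accept the cached stats
//    0        -> never accept, always revalidate
//   positive  -> accept only while the last validation predates it
NABoolean cachedStatsAccepted(Int64 expirationJTimestamp,
                              Int64 validationJTimestamp,
                              NABoolean refresh)
{
  return !refresh &&
         (expirationJTimestamp == -1 ||
          (expirationJTimestamp > 0 &&
           validationJTimestamp < expirationJTimestamp));
}
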
Example #3
NABoolean HHDFSTableStats::validateAndRefresh(Int64 expirationJTimestamp, NABoolean refresh)
{
  NABoolean result = TRUE;
  // initial heap allocation size
  Int32 initialSize = heap_->getAllocSize();

  // check at most once within the specified time interval
  // (-1 means the cached stats are always accepted)
  if (expirationJTimestamp == -1 ||
      (expirationJTimestamp > 0 &&
       validationJTimestamp_ < expirationJTimestamp))
    return result; // consider the stats still valid

  // if partitions get added or deleted, that gets
  // caught in the Hive metadata, so no need to check for
  // that here
  for (int p=0; p<totalNumPartitions_ && result; p++)
    {
      HHDFSListPartitionStats *partStats = listPartitionStatsList_[p];
      NAString hdfsHost;
      Int32 hdfsPort;
      NAString partDir;

      splitLocation(partStats->getDirName(), hdfsHost, hdfsPort, partDir);
      if (! connectHDFS(hdfsHost, hdfsPort))
        CMPASSERT(fs_); // a failed connect leaves fs_ unset, so this assert fires

      subtract(partStats);
      result = partStats->validateAndRefresh(fs_, refresh);
      add(partStats);
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();
  // account for the heap used by the stats; heap released
  // during the stats refresh is included in the delta as well
  hiveStatsSize_ += (heap_->getAllocSize() - initialSize);

  return result;
}
NABoolean HHDFSTableStats::populate(struct hive_tbl_desc *htd)
{
  // here is the basic outline of how this works:
  //
  // 1. Walk the SD descriptors of the table, one for the table
  //    itself and one for each partition. Each one represents
  //    one HDFS directory with files for the table.
  // 2. For each list partition directory (or the directory of
  //    an unpartitioned table):
  //     3. Walk through every file. For every file:
  //         4. Determine the bucket number (0 if the file is not bucketed)
  //         5. Add the file to its bucket
  //         6. Walk through the blocks of the file. For every block:
  //             7. Get the host list for this block and add it
  //         8. Get file stats
  //     9. Aggregate file stats for all files and buckets
  // 10. Aggregate bucket stats for all buckets of the partition
  // 11. Aggregate partition stats for all partitions of the table

  struct hive_sd_desc *hsd = htd->getSDs();
  if (hsd == NULL)
    return TRUE; // nothing needs to be done

  diags_.reset();
  tableDir_ = hsd->location_;
  numOfPartCols_ = htd->getNumOfPartCols();
  recordTerminator_ = hsd->getRecordTerminator();
  fieldTerminator_ = hsd->getFieldTerminator();
  nullFormat_ = hsd->getNullFormat();
  NAString hdfsHost;
  Int32 hdfsPort = -1;
  NAString tableDir;

  // determine the file format of the table from its first SD
  // (hsd is known to be non-NULL at this point)
  if (hsd->isTextFile())
     type_ = TEXT_;
  else if (hsd->isSequenceFile())
     type_ = SEQUENCE_;
  else if (hsd->isOrcFile())
     type_ = ORC_;
  else
     type_ = UNKNOWN_;
  // split table URL into host, port and filename
  if (! splitLocation(hsd->location_,
                hdfsHost,
                hdfsPort,
                tableDir,
                diags_,
                hdfsPortOverride_)) 
      return FALSE;
  if (! connectHDFS(hdfsHost, hdfsPort)) 
     return FALSE; // diags_ is set
  // put back fully qualified URI
  tableDir = hsd->location_;
  computeModificationTSmsec();
  if (diags_.isSuccess()) {
     modificationTSInMillisec_ = htd->setRedeftime(modificationTSInMillisec_);
     while (hsd && diags_.isSuccess()) {
        // visit the directory
        processDirectory(hsd->location_, hsd->buckets_, 
                       hsd->isTrulyText(), 
                       hsd->getRecordTerminator());

        hsd = hsd->next_;
     }
  }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  return diags_.isSuccess();
}
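
Both populate() variants depend on splitLocation() to break the table URI into host, port, and directory before connecting. Its implementation is not shown in these examples; the sketch below is a plausible simplified version, assuming locations of the form hdfs://host:port/path, with a port-override parameter in the spirit of hdfsPortOverride_:

#include <cstdint>
#include <cstdlib>
#include <string>

typedef int32_t Int32; // stand-in for the platform typedef

// Hypothetical, simplified take on splitLocation():
// "hdfs://myhost:8020/hive/tpcds/store" ->
//    host "myhost", port 8020, dir "/hive/tpcds/store".
// Returns false if the URI lacks the expected scheme.
bool splitLocationSketch(const std::string &location,
                         std::string &host, Int32 &port, std::string &dir,
                         Int32 portOverride = -1)
{
  const std::string scheme = "hdfs://";
  if (location.compare(0, scheme.size(), scheme) != 0)
    return false;

  // isolate the authority (host[:port]) from the path
  size_t hostStart = scheme.size();
  size_t slash = location.find('/', hostStart);
  std::string authority = location.substr(hostStart, slash - hostStart);
  dir = (slash == std::string::npos) ? "/" : location.substr(slash);

  // split host from the optional port, honoring the override
  size_t colon = authority.find(':');
  host = authority.substr(0, colon);
  port = (portOverride > 0) ? portOverride
       : (colon == std::string::npos)
           ? -1 // no port given, caller falls back to a default
           : static_cast<Int32>(atoi(authority.c_str() + colon + 1));
  return true;
}
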