NABoolean HHDFSTableStats::populate(struct hive_tbl_desc *htd)
{
  // Here is the basic outline of how this works:
  //
  // 1. Walk the SD descriptors of the table, one for the table
  //    itself and one for each partition. Each one represents
  //    one HDFS directory with files for the table.
  // 2. For each list partition directory (or the directory for
  //    an unpartitioned table):
  // 3.   Walk through every file. For every file:
  // 4.     Determine the bucket number (0 if the file is not bucketed)
  // 5.     Add the file to its bucket
  // 6.     Walk through the blocks of the file. For every block:
  // 7.       Get the host list for this block and add it
  // 8.     Get file stats
  // 9. Aggregate file stats for all files and buckets
  // 10. Aggregate bucket stats for all buckets of the partition
  // 11. Aggregate partition stats for all partitions of the table

  NABoolean result = TRUE;
  struct hive_sd_desc *hsd = htd->getSDs();

  tableDir_         = hsd->location_;
  numOfPartCols_    = htd->getNumOfPartCols();
  recordTerminator_ = hsd->getRecordTerminator();
  fieldTerminator_  = hsd->getFieldTerminator();

  NAString hdfsHost;
  Int32 hdfsPort = -1;
  NAString tableDir;

  while (hsd)
    {
      // split the table URL into host, port and file name
      splitLocation(hsd->location_, hdfsHost, hdfsPort, tableDir);
      if (! connectHDFS(hdfsHost, hdfsPort))
        CMPASSERT(fs_); // assertion fires if the connection failed

      // put back the fully qualified URI
      tableDir = hsd->location_;

      // visit the directory
      result = processDirectory(tableDir,
                                hsd->buckets_,
                                hsd->isTrulyText(),
                                hsd->getRecordTerminator(),
                                hsd->isSequenceFile());

      hsd = hsd->next_;
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  return result;
}
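// A hypothetical illustration (not part of the original source) of the
// URL-splitting step inside the loop above. The exact decomposition is an
// assumption based on the "split table URL into host, port and filename"
// comment; the literal URL below is made up:
//
//   NAString hdfsHost;      // would become "myhost"
//   Int32    hdfsPort = -1; // would become 9000
//   NAString tableDir;      // would become "/hive/warehouse/t1"
//
//   splitLocation("hdfs://myhost:9000/hive/warehouse/t1",
//                 hdfsHost, hdfsPort, tableDir);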
NABoolean HHDFSTableStats::validateAndRefresh(Int64 expirationJTimestamp,
                                              NABoolean refresh)
{
  NABoolean result = TRUE;

  // initial heap allocation size
  Int32 initialSize = heap_->getAllocSize();

  diags_.reset();

  // when not asked to refresh, check whether the stats need to be
  // fetched again within the specified time interval
  if (! refresh &&
      (expirationJTimestamp == -1 ||
       (expirationJTimestamp > 0 &&
        validationJTimestamp_ < expirationJTimestamp)))
    return result; // consider the stats still valid

  // if partitions get added or deleted, that gets
  // caught in the Hive metadata, so no need to check for
  // that here
  for (int p=0; p<totalNumPartitions_ && result && diags_.isSuccess(); p++)
    {
      HHDFSListPartitionStats *partStats = listPartitionStatsList_[p];
      NAString hdfsHost;
      Int32 hdfsPort;
      NAString partDir;

      result = splitLocation(partStats->getDirName(),
                             hdfsHost, hdfsPort, partDir,
                             diags_, hdfsPortOverride_);
      if (! result)
        break;

      if (! connectHDFS(hdfsHost, hdfsPort))
        return FALSE;

      subtract(partStats);
      result = partStats->validateAndRefresh(fs_, diags_, refresh);
      if (result)
        add(partStats);
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  // account for the heap used by the stats; heap released during
  // the stats refresh is also included
  hiveStatsSize_ += (heap_->getAllocSize() - initialSize);

  return result;
}
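// A hypothetical caller sketch (not part of the original source) showing
// the two unambiguous modes of the expiration check above. The function
// name below is made up; only the validateAndRefresh() signature is taken
// from the code.
static void exampleRefreshUsage(HHDFSTableStats *tableStats)
{
  // expirationJTimestamp == -1 with refresh == FALSE accepts the
  // cached stats without any HDFS round trips
  NABoolean stillValid = tableStats->validateAndRefresh(-1, FALSE);

  // refresh == TRUE bypasses the timestamp check and revalidates
  // every partition against HDFS
  if (stillValid)
    stillValid = tableStats->validateAndRefresh(-1, TRUE);
}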
NABoolean HHDFSTableStats::validateAndRefresh(Int64 expirationJTimestamp,
                                              NABoolean refresh)
{
  NABoolean result = TRUE;

  // initial heap allocation size
  Int32 initialSize = heap_->getAllocSize();

  // check only once within a specified time interval
  if (expirationJTimestamp == -1 ||
      (expirationJTimestamp > 0 &&
       validationJTimestamp_ < expirationJTimestamp))
    return result; // consider the stats still valid

  // if partitions get added or deleted, that gets
  // caught in the Hive metadata, so no need to check for
  // that here
  for (int p=0; p<totalNumPartitions_ && result; p++)
    {
      HHDFSListPartitionStats *partStats = listPartitionStatsList_[p];
      NAString hdfsHost;
      Int32 hdfsPort;
      NAString partDir;

      splitLocation(partStats->getDirName(), hdfsHost, hdfsPort, partDir);
      if (! connectHDFS(hdfsHost, hdfsPort))
        CMPASSERT(fs_);

      subtract(partStats);
      result = partStats->validateAndRefresh(fs_, refresh);
      add(partStats);
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  // account for the heap used by stats. Heap released during
  // stats refresh will also be included
  hiveStatsSize_ += (heap_->getAllocSize() - initialSize);

  return result;
}
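// A minimal sketch (not in the original source) of the incremental
// aggregation pattern used in the loop above: a partition's old
// contribution is subtracted from the table-level totals, the partition
// refreshes itself from HDFS, and its new contribution is added back.
// This assumes subtract(), add(), and the hdfsFS handle type are
// accessible in this context; the function name is made up.
static void refreshOnePartitionSketch(HHDFSTableStats &tbl,
                                      HHDFSListPartitionStats *partStats,
                                      hdfsFS fs)
{
  tbl.subtract(partStats);                  // remove stale contribution
  partStats->validateAndRefresh(fs, FALSE); // re-read file stats from HDFS
  tbl.add(partStats);                       // add refreshed contribution
}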
NABoolean HHDFSTableStats::populate(struct hive_tbl_desc *htd)
{
  // Here is the basic outline of how this works:
  //
  // 1. Walk the SD descriptors of the table, one for the table
  //    itself and one for each partition. Each one represents
  //    one HDFS directory with files for the table.
  // 2. For each list partition directory (or the directory for
  //    an unpartitioned table):
  // 3.   Walk through every file. For every file:
  // 4.     Determine the bucket number (0 if the file is not bucketed)
  // 5.     Add the file to its bucket
  // 6.     Walk through the blocks of the file. For every block:
  // 7.       Get the host list for this block and add it
  // 8.     Get file stats
  // 9. Aggregate file stats for all files and buckets
  // 10. Aggregate bucket stats for all buckets of the partition
  // 11. Aggregate partition stats for all partitions of the table

  struct hive_sd_desc *hsd = htd->getSDs();

  if (hsd == NULL)
    return TRUE; // nothing needs to be done

  diags_.reset();

  tableDir_         = hsd->location_;
  numOfPartCols_    = htd->getNumOfPartCols();
  recordTerminator_ = hsd->getRecordTerminator();
  fieldTerminator_  = hsd->getFieldTerminator();
  nullFormat_       = hsd->getNullFormat();

  NAString hdfsHost;
  Int32 hdfsPort = -1;
  NAString tableDir;

  // hsd is known to be non-NULL here
  if (hsd->isTextFile())
    type_ = TEXT_;
  else if (hsd->isSequenceFile())
    type_ = SEQUENCE_;
  else if (hsd->isOrcFile())
    type_ = ORC_;
  else
    type_ = UNKNOWN_;

  // split the table URL into host, port and file name
  if (! splitLocation(hsd->location_, hdfsHost, hdfsPort, tableDir,
                      diags_, hdfsPortOverride_))
    return FALSE;

  if (! connectHDFS(hdfsHost, hdfsPort))
    return FALSE; // diags_ is set

  // put back the fully qualified URI
  tableDir = hsd->location_;

  computeModificationTSmsec();

  if (diags_.isSuccess())
    {
      modificationTSInMillisec_ = htd->setRedeftime(modificationTSInMillisec_);

      while (hsd && diags_.isSuccess())
        {
          // visit the directory
          processDirectory(hsd->location_,
                           hsd->buckets_,
                           hsd->isTrulyText(),
                           hsd->getRecordTerminator());

          hsd = hsd->next_;
        }
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  return diags_.isSuccess();
}
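// A hypothetical end-to-end sketch (not part of the original source): as
// far as this file shows, the intended lifecycle is a one-time populate()
// from the Hive metadata descriptor, followed by periodic
// validateAndRefresh() calls before the cached stats are reused. The
// function name and control flow below are assumptions.
static NABoolean buildAndRefreshSketch(HHDFSTableStats &tableStats,
                                       struct hive_tbl_desc *htd)
{
  if (! tableStats.populate(htd))
    return FALSE; // populate() failed; details are recorded in diags_

  // later: force a revalidation of all partitions against HDFS
  return tableStats.validateAndRefresh(-1, TRUE);
}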