ErrorStack TpccLoadTask::load_customers() {
  Epoch ep;
  MasstreeStorage storage(engine_, kName);
  const uint32_t kCommitBatch = 100;
  char key_be[sizeof(Key)];
  char payload[4096U];
  std::memset(payload, 0, sizeof(payload));
  Key key_ho;
  for (uint32_t i = 0; i < kRecords;) {
    uint32_t cur_batch_size = std::min<uint32_t>(kCommitBatch, kRecords - i);
    WRAP_ERROR_CODE(xct_manager_->begin_xct(context_, xct::kSerializable));
    for (uint32_t j = 0; j < cur_batch_size; ++j) {
      key_ho.first_slice_ = rnd_.next_uint32() % kFirstDisinct;
      key_ho.second_slice_ = rnd_.next_uint32() % kSecondDisinct;
      key_ho.uniquefier_ = i + j + kRecords * thread_ordinal_;
      key_ho.to_be(key_be);
      WRAP_ERROR_CODE(
        storage.insert_record(
          context_,
          key_be,
          sizeof(key_be),
          payload,
          kPayload));
    }
    WRAP_ERROR_CODE(xct_manager_->precommit_xct(context_, &ep));
    i += cur_batch_size;
    if ((i % 1000) == 0) {
      std::cout << "Thread-" << thread_ordinal_ << " " << i << "/" << kRecords << std::endl;
    }
  }
  return kRetOk;
}
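
Throughout these examples, WRAP_ERROR_CODE(...) guards engine calls that return a plain ErrorCode and returns early on failure, while CHECK_ERROR(...) does the same for calls that return an ErrorStack. The snippet below is a minimal standalone approximation of that pattern; the Toy* names and macro bodies are invented for illustration and are not FOEDUS's actual definitions.

#include <iostream>

// Toy stand-ins for the real types (assumptions for illustration only).
enum ToyErrorCode { kToyOk = 0, kToyOutOfMemory = 1 };
struct ToyErrorStack {
  ToyErrorCode code_;
  bool is_error() const { return code_ != kToyOk; }
};
const ToyErrorStack kToyRetOk = {kToyOk};

// WRAP_ERROR_CODE-style: convert a failing ErrorCode into an early ErrorStack return.
#define TOY_WRAP_ERROR_CODE(x)                               \
  do {                                                       \
    ToyErrorCode wrap_tmp = (x);                             \
    if (wrap_tmp != kToyOk) return ToyErrorStack{wrap_tmp};  \
  } while (0)

// CHECK_ERROR-style: propagate a failing ErrorStack from a callee.
#define TOY_CHECK_ERROR(x)                                   \
  do {                                                       \
    ToyErrorStack check_tmp = (x);                           \
    if (check_tmp.is_error()) return check_tmp;              \
  } while (0)

ToyErrorCode might_fail(bool fail) { return fail ? kToyOutOfMemory : kToyOk; }

ToyErrorStack do_work(bool fail) {
  TOY_WRAP_ERROR_CODE(might_fail(fail));  // early-returns on error
  return kToyRetOk;
}

int main() {
  ToyErrorStack result = do_work(true);
  std::cout << (result.is_error() ? "propagated error" : "ok") << std::endl;
  return 0;
}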
Example #2
ErrorStack MetaLogger::initialize_once() {
  ASSERT_ND(engine_->is_master());
  control_block_ = engine_->get_soc_manager()->get_shared_memory_repo()
        ->get_global_memory_anchors()->meta_logger_memory_;
  control_block_->initialize();
  std::memset(control_block_->buffer_, 0, sizeof(control_block_->buffer_));

  fs::Path path(engine_->get_options().log_.construct_meta_log_path());
  if (!fs::exists(path.parent_path())) {
    fs::create_directories(path.parent_path());
  }

  engine_->get_savepoint_manager()->get_meta_logger_offsets(
    &control_block_->oldest_offset_,
    &control_block_->durable_offset_);

  // Open log file
  current_file_ = new fs::DirectIoFile(path, engine_->get_options().log_.emulation_);
  WRAP_ERROR_CODE(current_file_->open(true, true, true, true));
  if (control_block_->durable_offset_ < current_file_->get_current_offset()) {
    LOG(ERROR) << "Meta log file has a non-durable region. Probably there"
      << " was a crash. Will truncate it to " << control_block_->durable_offset_
      << " from " << current_file_->get_current_offset();
    WRAP_ERROR_CODE(current_file_->truncate(control_block_->durable_offset_, true));
  }
  ASSERT_ND(control_block_->durable_offset_ == current_file_->get_current_offset());

  stop_requested_ = false;
  logger_thread_ = std::thread(&MetaLogger::meta_logger_main, this);
  return kRetOk;
}
Example #3
ErrorStack inserts_varlen_task(const proc::ProcArguments& args) {
  EXPECT_EQ(sizeof(uint32_t), args.input_len_);
  uint32_t id = *reinterpret_cast<const uint32_t*>(args.input_buffer_);
  EXPECT_NE(id, 2U);

  thread::Thread* context = args.context_;
  storage::hash::HashStorage hash(args.engine_, kName);
  ASSERT_ND(hash.exists());
  xct::XctManager* xct_manager = args.engine_->get_xct_manager();
  Epoch commit_epoch;

  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  char buffer[16];
  std::memset(buffer, 0, sizeof(buffer));
  for (uint32_t i = 0; i < kRecords / 2U; ++i) {
    uint64_t rec = id * kRecords / 2U + i;
    // first 8 bytes, mod 17 to have next layers.
    assorted::write_bigendian<uint64_t>(static_cast<uint64_t>(rec % 17U), buffer);
    // and 1-4 bytes of decimal representation in text
    std::string str = std::to_string(rec);
    std::memcpy(buffer + sizeof(uint64_t), str.data(), str.size());
    uint16_t len = sizeof(uint64_t) + str.size();
    uint64_t data = rec + kDataAddendum;
    ErrorCode ret = hash.insert_record(context, buffer, len, &data, sizeof(data));
    EXPECT_EQ(kErrorCodeOk, ret) << i;
  }

  // CHECK_ERROR(hash.debugout_single_thread(args.engine_));
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  // CHECK_ERROR(hash.debugout_single_thread(args.engine_));
  WRAP_ERROR_CODE(xct_manager->wait_for_commit(commit_epoch));
  return kRetOk;
}
ErrorStack verify_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  Engine* engine = args.engine_;
  MasstreeStorage storage(engine, kName);
  WRAP_ERROR_CODE(engine->get_xct_manager()->begin_xct(context, xct::kSnapshot));
  CHECK_ERROR(storage.verify_single_thread(context));
  WRAP_ERROR_CODE(engine->get_xct_manager()->abort_xct(context));
  CHECK_ERROR(storage.debugout_single_thread(engine, true, 0xFFFFFF));
  return kRetOk;
}
ErrorStack verify_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  storage::masstree::MasstreeStorage masstree(args.engine_, kName);
  ASSERT_ND(masstree.exists());
  CHECK_ERROR(masstree.verify_single_thread(context));
  xct::XctManager* xct_manager = args.engine_->get_xct_manager();
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  for (uint32_t i = 0; i < kRecords; ++i) {
    uint64_t rec = i;
    storage::masstree::KeySlice slice = storage::masstree::normalize_primitive<uint64_t>(rec);
    uint64_t data;
    uint16_t capacity = sizeof(data);
    ErrorCode ret = masstree.get_record_normalized(context, slice, &data, &capacity, true);
/*
    if (ret != kErrorCodeOk || rec != data) {
      CHECK_ERROR(masstree.verify_single_thread(context));
      CHECK_ERROR(masstree.debugout_single_thread(args.engine_));
      std::cout << "mismatch at rec=" << rec << ", data=" << data << std::endl;
    }
*/
    EXPECT_EQ(kErrorCodeOk, ret) << i;
    EXPECT_EQ(rec, data) << i;
    EXPECT_EQ(sizeof(data), capacity) << i;
  }

  Epoch commit_epoch;
  ErrorCode committed = xct_manager->precommit_xct(context, &commit_epoch);
  EXPECT_EQ(kErrorCodeOk, committed);
  return kRetOk;
}
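
Masstree keys in these examples are normalized so that their comparison order matches numeric order: multi-byte keys are written big-endian (assorted::write_bigendian) so byte-wise comparison agrees with the integer they encode, and integer keys go through normalize_primitive for the same reason. The program below illustrates the big-endian half of that; to_bigendian is a hypothetical stand-in, not the FOEDUS helper.

#include <cstdint>
#include <cstring>
#include <iostream>

// Hypothetical helper: write a uint64_t as 8 big-endian bytes.
void to_bigendian(uint64_t value, unsigned char* out) {
  for (int i = 7; i >= 0; --i) {
    out[i] = static_cast<unsigned char>(value & 0xFF);
    value >>= 8;
  }
}

int main() {
  unsigned char a[8];
  unsigned char b[8];
  to_bigendian(255, a);   // 00 00 00 00 00 00 00 FF
  to_bigendian(256, b);   // 00 00 00 00 00 00 01 00
  // With big-endian layout, memcmp order matches numeric order: 255 < 256.
  std::cout << (std::memcmp(a, b, 8) < 0 ? "memcmp agrees with numeric order"
                                         : "mismatch") << std::endl;
  return 0;
}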
ErrorStack MasstreeStoragePimpl::verify_single_thread_border(
  thread::Thread* context,
  KeySlice low_fence,
  HighFence high_fence,
  MasstreeBorderPage* page) {
  CHECK_ERROR(verify_page_basic(context, page, kMasstreeBorderPageType, low_fence, high_fence));
  // check consecutive_inserts_. this should be consistent whether it's moved or not.
  bool sorted = true;
  for (SlotIndex i = 1; i < page->get_key_count(); ++i) {
    KeySlice prev = page->get_slice(i - 1);
    KeySlice slice = page->get_slice(i);
    KeyLength prev_len = page->get_remainder_length(i - 1);
    KeyLength len = page->get_remainder_length(i);
    if (prev > slice || (prev == slice && prev_len > len)) {
      sorted = false;
      break;
    }
  }
  CHECK_AND_ASSERT(page->is_consecutive_inserts() == sorted);

  if (page->is_moved()) {
    CHECK_ERROR(verify_single_thread_border(
      context,
      low_fence,
      HighFence(page->get_foster_fence(), false),
      context->resolve_cast<MasstreeBorderPage>(page->get_foster_minor())));
    CHECK_ERROR(verify_single_thread_border(
      context,
      page->get_foster_fence(),
      high_fence,
      context->resolve_cast<MasstreeBorderPage>(page->get_foster_major())));
    return kRetOk;
  }

  CHECK_AND_ASSERT(!page->is_moved());
  CHECK_AND_ASSERT(page->get_key_count() <= kBorderPageMaxSlots);
  for (SlotIndex i = 0; i < page->get_key_count(); ++i) {
    CHECK_AND_ASSERT(!page->get_owner_id(i)->lock_.is_keylocked());
    CHECK_AND_ASSERT(!page->get_owner_id(i)->lock_.is_rangelocked());
    CHECK_AND_ASSERT(!page->get_owner_id(i)->xct_id_.is_being_written());
    CHECK_AND_ASSERT(page->get_owner_id(i)->xct_id_.get_epoch().is_valid());
    CHECK_AND_ASSERT(page->verify_slot_lengthes(i));
    KeySlice slice = page->get_slice(i);
    CHECK_AND_ASSERT(slice >= low_fence);
    CHECK_AND_ASSERT(slice < high_fence.slice_ || page->is_high_fence_supremum());
    if (page->does_point_to_layer(i)) {
      CHECK_AND_ASSERT(page->get_owner_id(i)->xct_id_.is_next_layer());
      CHECK_AND_ASSERT(!page->get_next_layer(i)->is_both_null());
      MasstreePage* next;
      // TASK(Hideaki) probably two versions: always follow volatile vs snapshot
      // so far check volatile only
      WRAP_ERROR_CODE(follow_page(context, true, page->get_next_layer(i), &next));
      CHECK_ERROR(verify_single_thread_layer(context, page->get_layer() + 1, next));
    } else {
      CHECK_AND_ASSERT(!page->get_owner_id(i)->xct_id_.is_next_layer());
    }
  }

  return kRetOk;
}
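
The verify_single_thread_* functions pass key ranges around as a HighFence value built from a slice and a supremum flag. Its real declaration is not part of this listing; inferred from the usage here, a plausible shape is sketched below (an assumption, not the actual FOEDUS type).

#include <cstdint>
#include <iostream>

using KeySlice = uint64_t;  // assumption: slices are 64-bit integers, as used above

// Sketch of a high fence: an exclusive upper-bound slice plus a flag marking the
// logical supremum (an unbounded upper fence), mirroring HighFence(slice, supremum).
struct HighFenceSketch {
  HighFenceSketch(KeySlice slice, bool supremum) : slice_(slice), supremum_(supremum) {}
  KeySlice slice_;
  bool supremum_;
};

int main() {
  HighFenceSketch fence(1000, false);
  KeySlice candidate = 999;
  // The border-page check above: a slice is in range if it is below the fence
  // slice, or if the fence is the supremum.
  std::cout << ((candidate < fence.slice_ || fence.supremum_) ? "in range" : "out of range")
            << std::endl;
  return 0;
}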
ErrorStack MasstreeStoragePimpl::verify_single_thread(thread::Thread* context) {
  MasstreeIntermediatePage* layer_root;
  WRAP_ERROR_CODE(get_first_root(context, false, &layer_root));
  CHECK_AND_ASSERT(!layer_root->is_border());  // root of first layer is always intermediate page
  CHECK_ERROR(verify_single_thread_layer(context, 0, layer_root));
  return kRetOk;
}
ErrorStack MasstreeStoragePimpl::debugout_single_thread_follow(
  Engine* engine,
  cache::SnapshotFileSet* files,
  const DualPagePointer& pointer,
  bool follow_volatile,
  uint32_t* remaining_pages) {
  if ((*remaining_pages) == 0) {
    return kRetOk;
  }
  if (follow_volatile) {
    if (!pointer.volatile_pointer_.is_null()) {
      MasstreePage* page = reinterpret_cast<MasstreePage*>(
        engine->get_memory_manager()->get_global_volatile_page_resolver().resolve_offset(
          pointer.volatile_pointer_));
      CHECK_ERROR(debugout_single_thread_recurse(engine, files, page, true, remaining_pages));
    }
  } else {
    if (pointer.snapshot_pointer_ != 0) {
      memory::AlignedMemory buf;
      buf.alloc(1 << 12U, 1 << 12U, memory::AlignedMemory::kNumaAllocOnnode, 0);
      MasstreePage* page = reinterpret_cast<MasstreePage*>(buf.get_block());
      WRAP_ERROR_CODE(files->read_page(pointer.snapshot_pointer_, page));
      CHECK_ERROR(debugout_single_thread_recurse(engine, files, page, false, remaining_pages));
    }
  }
  return kRetOk;
}
Example #9
ErrorStack verify_varlen_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  storage::hash::HashStorage hash(args.engine_, kName);
  ASSERT_ND(hash.exists());
  CHECK_ERROR(hash.verify_single_thread(context));
  // CHECK_ERROR(hash.debugout_single_thread(args.engine_));
  xct::XctManager* xct_manager = args.engine_->get_xct_manager();
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  char buffer[16];
  std::memset(buffer, 0, sizeof(buffer));
  for (uint32_t i = 0; i < kRecords; ++i) {
    uint64_t rec = i;
    assorted::write_bigendian<uint64_t>(static_cast<uint64_t>(rec % 17U), buffer);
    std::string str = std::to_string(rec);
    std::memcpy(buffer + sizeof(uint64_t), str.data(), str.size());
    uint16_t len = sizeof(uint64_t) + str.size();

    uint64_t data;
    uint16_t capacity = sizeof(data);
    ErrorCode ret = hash.get_record(context, buffer, len, &data, &capacity);
    EXPECT_EQ(kErrorCodeOk, ret) << i;
    EXPECT_EQ(i + kDataAddendum, data) << i;
    EXPECT_EQ(sizeof(data), capacity) << i;
  }

  Epoch commit_epoch;
  ErrorCode committed = xct_manager->precommit_xct(context, &commit_epoch);
  EXPECT_EQ(kErrorCodeOk, committed);
  return kRetOk;
}
Example #10
ErrorStack inserts_fixed_len_task(const proc::ProcArguments& args) {
  EXPECT_EQ(sizeof(uint32_t), args.input_len_);
  uint32_t id = *reinterpret_cast<const uint32_t*>(args.input_buffer_);
  EXPECT_NE(id, 2U);

  thread::Thread* context = args.context_;
  storage::hash::HashStorage hash(args.engine_, kName);
  ASSERT_ND(hash.exists());
  xct::XctManager* xct_manager = args.engine_->get_xct_manager();
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  for (uint32_t i = 0; i < kRecords / 2U; ++i) {
    uint64_t key = id * kRecords / 2U + i;
    uint64_t data = key + kDataAddendum;
    WRAP_ERROR_CODE(hash.insert_record(context, &key, sizeof(key), &data, sizeof(data)));
  }

  Epoch commit_epoch;
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  WRAP_ERROR_CODE(xct_manager->wait_for_commit(commit_epoch));
  return kRetOk;
}
ErrorStack inserts_normalized_task(const proc::ProcArguments& args) {
  EXPECT_EQ(sizeof(uint32_t), args.input_len_);
  uint32_t id = *reinterpret_cast<const uint32_t*>(args.input_buffer_);
  EXPECT_NE(id, 2U);

  thread::Thread* context = args.context_;
  storage::masstree::MasstreeStorage masstree(args.engine_, kName);
  ASSERT_ND(masstree.exists());
  xct::XctManager* xct_manager = args.engine_->get_xct_manager();
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  for (uint32_t i = 0; i < kRecords / 2U; ++i) {
    uint64_t rec = id * kRecords / 2U + i;
    storage::masstree::KeySlice slice = storage::masstree::normalize_primitive<uint64_t>(rec);
    WRAP_ERROR_CODE(masstree.insert_record_normalized(context, slice, &rec, sizeof(rec)));
  }

  Epoch commit_epoch;
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  WRAP_ERROR_CODE(xct_manager->wait_for_commit(commit_epoch));
  return kRetOk;
}
Example #12
ErrorStack EngineMemory::load_one_volatile_page(
  cache::SnapshotFileSet* fileset,
  storage::SnapshotPagePointer snapshot_pointer,
  storage::VolatilePagePointer* pointer,
  storage::Page** page) {
  ASSERT_ND(snapshot_pointer != 0);
  thread::ThreadGroupId node = storage::extract_numa_node_from_snapshot_pointer(snapshot_pointer);
  CHECK_ERROR(grab_one_volatile_page(node, pointer, page));
  WRAP_ERROR_CODE(fileset->read_page(snapshot_pointer, *page));
  (*page)->get_header().snapshot_ = false;
  (*page)->get_header().page_id_ = pointer->word;
  return kRetOk;
}
Example #13
ErrorStack EngineMemory::grab_one_volatile_page(
  thread::ThreadGroupId node,
  storage::VolatilePagePointer* pointer,
  storage::Page** page) {
  PagePool* pool = get_node_memory(node)->get_volatile_pool();
  PagePoolOffset offset;
  WRAP_ERROR_CODE(pool->grab_one(&offset));
  *page = pool->get_resolver().resolve_offset_newpage(offset);
  pointer->components.numa_node = node;
  pointer->components.flags = 0;
  pointer->components.mod_count = 0;
  pointer->components.offset = offset;
  return kRetOk;
}
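
grab_one_volatile_page fills a VolatilePagePointer field by field through its components view, while other code in this listing (load_one_volatile_page, fatify_first_root_double) reads or publishes the same value as a single word. Inferred from that usage, a plausible shape is a union over one 64-bit word; the field widths below are illustrative guesses, not FOEDUS's actual layout.

#include <cstdint>
#include <iostream>

// Sketch only: a page pointer viewable either as one 64-bit word or as packed
// components (NUMA node, flags, modification counter, page-pool offset).
union PagePointerSketch {
  uint64_t word;
  struct Components {
    uint8_t  numa_node;
    uint8_t  flags;
    uint16_t mod_count;
    uint32_t offset;
  } components;
};

int main() {
  PagePointerSketch pointer;
  pointer.word = 0;
  pointer.components.numa_node = 1;
  pointer.components.mod_count = 0;
  pointer.components.offset = 42;
  // Once assembled, the whole pointer can be compared or published as one word.
  std::cout << std::hex << pointer.word << std::endl;
  return 0;
}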
Example #14
ErrorStack split_in_next_layer_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  MasstreeStorage masstree = context->get_engine()->get_storage_manager()->get_masstree("ggg");
  xct::XctManager* xct_manager = context->get_engine()->get_xct_manager();
  assorted::UniformRandom uniform_random(123456);
  std::string keys[32];
  std::string answers[32];
  Epoch commit_epoch;
  for (uint32_t rep = 0; rep < 32; ++rep) {
    WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
    uint64_t key_int = uniform_random.next_uint64();
    char key_string[16];
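    // All 32 keys share the same first 8-byte slice (0x2A repeated), forcing them
    // into a next layer; only the random second slice differs there.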
    std::memset(key_string, 42, 8);
    reinterpret_cast<uint64_t*>(key_string)[1] = key_int;
    keys[rep] = std::string(key_string, 16);
    char data[200];
    std::memset(data, 0, 200);
    std::memcpy(data + 123, &key_int, sizeof(key_int));
    answers[rep] = std::string(data, 200);
    WRAP_ERROR_CODE(masstree.insert_record(context, key_string, 16, data, sizeof(data)));
    WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  }

  // now read
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
  for (uint32_t rep = 0; rep < 32; ++rep) {
    char data[500];
    uint16_t capacity = 500;
    WRAP_ERROR_CODE(masstree.get_record(context, keys[rep].data(), 16, data, &capacity));
    EXPECT_EQ(200, capacity);
    EXPECT_EQ(answers[rep], std::string(data, 200));
  }
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));

  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
  CHECK_ERROR(masstree.verify_single_thread(context));
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  WRAP_ERROR_CODE(xct_manager->wait_for_commit(commit_epoch));
  return foedus::kRetOk;
}
Example #15
ErrorStack split_border_normalized_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  MasstreeStorage masstree = context->get_engine()->get_storage_manager()->get_masstree("ggg");
  xct::XctManager* xct_manager = context->get_engine()->get_xct_manager();
  assorted::UniformRandom uniform_random(123456);
  KeySlice keys[32];
  std::string answers[32];
  Epoch commit_epoch;
  for (uint32_t rep = 0; rep < 32; ++rep) {
    WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
    KeySlice key = normalize_primitive<uint64_t>(uniform_random.next_uint64());
    keys[rep] = key;
    char data[200];
    std::memset(data, 0, 200);
    std::memcpy(data + 123, &key, sizeof(key));
    answers[rep] = std::string(data, 200);
    WRAP_ERROR_CODE(masstree.insert_record_normalized(context, key, data, sizeof(data)));
    WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  }

  // now read
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
  for (uint32_t rep = 0; rep < 32; ++rep) {
    KeySlice key = keys[rep];
    char data[500];
    uint16_t capacity = 500;
    WRAP_ERROR_CODE(masstree.get_record_normalized(context, key, data, &capacity));
    EXPECT_EQ(200, capacity);
    EXPECT_EQ(answers[rep], std::string(data, 200));
  }
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));

  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
  CHECK_ERROR(masstree.verify_single_thread(context));
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  WRAP_ERROR_CODE(xct_manager->wait_for_commit(commit_epoch));
  return foedus::kRetOk;
}
Example #16
ErrorStack verify_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  storage::hash::HashStorage hash(args.engine_, kName);
  ASSERT_ND(hash.exists());
  CHECK_ERROR(hash.verify_single_thread(context));
  xct::XctManager* xct_manager = args.engine_->get_xct_manager();
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  for (uint32_t i = 0; i < kRecords; ++i) {
    uint64_t key = i;
    uint64_t data;
    uint16_t capacity = sizeof(data);
    ErrorCode ret = hash.get_record(context, &key, sizeof(key), &data, &capacity);
    EXPECT_EQ(kErrorCodeOk, ret) << i;
    EXPECT_EQ(i, key) << i;
    EXPECT_EQ(i + kDataAddendum, data) << i;
    EXPECT_EQ(sizeof(data), capacity) << i;
  }

  Epoch commit_epoch;
  ErrorCode committed = xct_manager->precommit_xct(context, &commit_epoch);
  EXPECT_EQ(kErrorCodeOk, committed);
  return kRetOk;
}
ErrorStack MasstreeStoragePimpl::verify_single_thread_intermediate(
  thread::Thread* context,
  KeySlice low_fence,
  HighFence high_fence,
  MasstreeIntermediatePage* page) {
  CHECK_ERROR(
    verify_page_basic(context, page, kMasstreeIntermediatePageType, low_fence, high_fence));

  if (page->is_moved()) {
    CHECK_ERROR(verify_single_thread_intermediate(
      context,
      low_fence,
      HighFence(page->get_foster_fence(), false),
      context->resolve_cast<MasstreeIntermediatePage>(page->get_foster_minor())));
    CHECK_ERROR(verify_single_thread_intermediate(
      context,
      page->get_foster_fence(),
      high_fence,
      context->resolve_cast<MasstreeIntermediatePage>(page->get_foster_major())));
    return kRetOk;
  }

  uint8_t key_count = page->get_key_count();
  CHECK_AND_ASSERT(key_count <= kMaxIntermediateSeparators);
  KeySlice previous_low = low_fence;
  for (uint8_t i = 0; i <= key_count; ++i) {
    HighFence mini_high(0, false);
    if (i < key_count) {
      mini_high.slice_ = page->get_separator(i);
      mini_high.supremum_ = false;
      CHECK_AND_ASSERT(high_fence.supremum_ || mini_high.slice_ < high_fence.slice_);
      if (i == 0) {
        CHECK_AND_ASSERT(mini_high.slice_ > low_fence);
      } else {
        CHECK_AND_ASSERT(mini_high.slice_ > page->get_separator(i - 1));
      }
    } else {
      mini_high = high_fence;
    }

    MasstreeIntermediatePage::MiniPage& minipage = page->get_minipage(i);
    uint8_t mini_count = minipage.key_count_;
    CHECK_AND_ASSERT(mini_count <= kMaxIntermediateMiniSeparators);
    KeySlice page_low = previous_low;
    for (uint8_t j = 0; j <= mini_count; ++j) {
      HighFence page_high(0, false);
      if (j < mini_count) {
        page_high.slice_ = minipage.separators_[j];
        page_high.supremum_ = false;
        CHECK_AND_ASSERT(page_high.slice_ < mini_high.slice_ || mini_high.supremum_);
        if (j == 0) {
          CHECK_AND_ASSERT(page_high.slice_ > previous_low);
        } else {
          CHECK_AND_ASSERT(page_high.slice_ > minipage.separators_[j - 1]);
        }
      } else {
        page_high = mini_high;
      }
      CHECK_AND_ASSERT(!minipage.pointers_[j].is_both_null());
      MasstreePage* next;
      // TASK(Hideaki) probably two versions: always follow volatile vs snapshot
      // so far check volatile only
      WRAP_ERROR_CODE(follow_page(context, true, &minipage.pointers_[j], &next));
      CHECK_AND_ASSERT(next->get_layer() == page->get_layer());
      CHECK_AND_ASSERT(next->get_btree_level() + 1U == page->get_btree_level());
      if (next->is_border()) {
        CHECK_ERROR(verify_single_thread_border(
          context,
          page_low,
          page_high,
          reinterpret_cast<MasstreeBorderPage*>(next)));
      } else {
        CHECK_ERROR(verify_single_thread_intermediate(
          context,
          page_low,
          page_high,
          reinterpret_cast<MasstreeIntermediatePage*>(next)));
      }

      page_low = page_high.slice_;
    }

    previous_low = mini_high.slice_;
  }

  return kRetOk;
}
ErrorStack MasstreeStoragePimpl::fatify_first_root_double(thread::Thread* context) {
  MasstreeIntermediatePage* root;
  WRAP_ERROR_CODE(get_first_root(context, true, &root));
  ASSERT_ND(root->is_locked());
  ASSERT_ND(!root->is_moved());

  // assure that all children have volatile version
  for (MasstreeIntermediatePointerIterator it(root); it.is_valid(); it.next()) {
    if (it.get_pointer().volatile_pointer_.is_null()) {
      MasstreePage* child;
      WRAP_ERROR_CODE(follow_page(
        context,
        true,
        const_cast<DualPagePointer*>(&it.get_pointer()),
        &child));
    }
    ASSERT_ND(!it.get_pointer().volatile_pointer_.is_null());
  }

  std::vector<Child> original_children = list_children(root);
  ASSERT_ND(original_children.size() * 2U <= kMaxIntermediatePointers);
  std::vector<Child> new_children;
  for (const Child& child : original_children) {
    CHECK_ERROR(split_a_child(context, root, child, &new_children));
  }
  ASSERT_ND(new_children.size() >= original_children.size());

  memory::NumaCoreMemory* memory = context->get_thread_memory();
  memory::PagePoolOffset new_offset = memory->grab_free_volatile_page();
  if (new_offset == 0) {
    return ERROR_STACK(kErrorCodeMemoryNoFreePages);
  }
  // from now on no failure (we grabbed a free page).

  VolatilePagePointer new_pointer = combine_volatile_page_pointer(
    context->get_numa_node(),
    kVolatilePointerFlagSwappable,  // pointer to root page might be swapped!
    get_first_root_pointer().volatile_pointer_.components.mod_count + 1,
    new_offset);
  MasstreeIntermediatePage* new_root
    = context->resolve_newpage_cast<MasstreeIntermediatePage>(new_pointer);
  new_root->initialize_volatile_page(
    get_id(),
    new_pointer,
    0,
    root->get_btree_level(),  // same as current root. this is not grow_root
    kInfimumSlice,
    kSupremumSlice);
  // no concurrent access to the new page, but just for the sake of assertion in the func.
  PageVersionLockScope new_scope(context, new_root->get_version_address());
  new_root->split_foster_migrate_records_new_first_root(&new_children);
  ASSERT_ND(count_children(new_root) == new_children.size());
  verify_new_root(context, new_root, new_children);

  // set the new first-root pointer.
  assorted::memory_fence_release();
  get_first_root_pointer().volatile_pointer_.word = new_pointer.word;
  // first-root snapshot pointer is unchanged.

  // old root page and the direct children are now retired
  assorted::memory_fence_acq_rel();
  root->set_moved();  // not quite moved, but assertions assume that.
  root->set_retired();
  context->collect_retired_volatile_page(
    construct_volatile_page_pointer(root->header().page_id_));
  for (const Child& child : original_children) {
    MasstreePage* original_page = context->resolve_cast<MasstreePage>(child.pointer_);
    if (original_page->is_moved()) {
      PageVersionLockScope scope(context, original_page->get_version_address());
      original_page->set_retired();
      context->collect_retired_volatile_page(child.pointer_);
    } else {
      // This means, the page had too small records to split. We must keep it.
    }
  }
  assorted::memory_fence_acq_rel();

  LOG(INFO) << "Split done. " << original_children.size() << " -> " << new_children.size();

  return kRetOk;
}
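
fatify_first_root_double initializes the new root completely, issues a release fence, and only then installs it by overwriting the root pointer word, so any reader that sees the new pointer also sees a fully built page. The standalone snippet below shows the same publish idiom with std::atomic; FOEDUS uses its own fence helpers, so this is only the underlying idea.

#include <atomic>
#include <iostream>
#include <thread>

struct Page { int payload; };

std::atomic<Page*> root_pointer{nullptr};

int main() {
  std::thread writer([] {
    Page* new_root = new Page;
    new_root->payload = 42;                                    // build the page first
    root_pointer.store(new_root, std::memory_order_release);   // then publish it
  });
  std::thread reader([] {
    Page* page = nullptr;
    while ((page = root_pointer.load(std::memory_order_acquire)) == nullptr) {
      // spin until the writer publishes
    }
    std::cout << page->payload << std::endl;  // release/acquire pairing guarantees 42
  });
  writer.join();
  reader.join();
  delete root_pointer.load();
  return 0;
}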
Example #19
/** Verify TPC-B results. */
ErrorStack verify_tpcb_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  xct::XctManager* xct_manager = context->get_engine()->get_xct_manager();
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));

  int64_t expected_branch[kBranches];
  int64_t expected_teller[kBranches * kTellers];
  int64_t expected_account[kBranches * kAccounts];
  for (int i = 0; i < kBranches; ++i) {
    expected_branch[i] = kInitialAccountBalance * kAccounts;
  }
  for (int i = 0; i < kBranches * kTellers; ++i) {
    expected_teller[i] = kInitialAccountBalance * kAccountsPerTellers;
  }
  for (int i = 0; i < kBranches * kAccounts; ++i) {
    expected_account[i] = kInitialAccountBalance;
  }

  // we don't have scanning API yet, so manually do it.
  std::set<uint64_t> observed_history_ids;
  WRAP_ERROR_CODE(sequential::SequentialStoragePimpl(
    context->get_engine(), histories.get_control_block()).for_every_page(
    [&](SequentialPage* page){
      uint16_t record_count = page->get_record_count();
      const char* record_pointers[kMaxSlots];
      uint16_t payload_lengthes[kMaxSlots];
      page->get_all_records_nosync(&record_count, record_pointers, payload_lengthes);

      for (uint16_t rec = 0; rec < record_count; ++rec) {
        EXPECT_EQ(payload_lengthes[rec], sizeof(HistoryData));
        const HistoryData& history = *reinterpret_cast<const HistoryData*>(
          record_pointers[rec] + kRecordOverhead);
        EXPECT_GE(history.amount_, kAmountRangeFrom);
        EXPECT_LE(history.amount_, kAmountRangeTo);

        EXPECT_LT(history.branch_id_, kBranches);
        EXPECT_LT(history.teller_id_, kBranches * kTellers);
        EXPECT_LT(history.account_id_, kBranches * kAccounts);

        EXPECT_EQ(history.branch_id_, history.teller_id_ / kTellers);
        EXPECT_EQ(history.branch_id_, history.account_id_ / kAccounts);
        EXPECT_EQ(history.teller_id_, history.account_id_ / kAccountsPerTellers);

        expected_branch[history.branch_id_] += history.amount_;
        expected_teller[history.teller_id_] += history.amount_;
        expected_account[history.account_id_] += history.amount_;
        EXPECT_EQ(observed_history_ids.end(), observed_history_ids.find(history.history_id_))
          << history.history_id_;
        observed_history_ids.insert(history.history_id_);
      }
      return kErrorCodeOk;
  }));
  EXPECT_EQ(kXctsPerThread * thread_count, observed_history_ids.size());
  for (int i = 0; i < kXctsPerThread * thread_count; ++i) {
    EXPECT_NE(observed_history_ids.end(), observed_history_ids.find(i)) << i;
  }

  for (int i = 0; i < kBranches; ++i) {
    BranchData data;
    CHECK_ERROR(branches.get_record(context, i, &data));
    EXPECT_EQ(expected_branch[i], data.branch_balance_) << "branch-" << i;
  }
  for (int i = 0; i < kBranches * kTellers; ++i) {
    TellerData data;
    CHECK_ERROR(tellers.get_record(context, i, &data));
    EXPECT_EQ(i / kTellers, data.branch_id_) << i;
    EXPECT_EQ(expected_teller[i], data.teller_balance_) << "teller-" << i;
  }
  for (int i = 0; i < kBranches * kAccounts; ++i) {
    AccountData data;
    CHECK_ERROR(accounts.get_record(context, i, &data));
    EXPECT_EQ(i / kAccounts, data.branch_id_) << i;
    EXPECT_EQ(expected_account[i], data.account_balance_) << "account-" << i;
  }
  for (uint32_t i = 0; i < context->get_current_xct().get_read_set_size(); ++i) {
    xct::ReadXctAccess& access = context->get_current_xct().get_read_set()[i];
    EXPECT_FALSE(access.observed_owner_id_.is_being_written()) << i;
    EXPECT_FALSE(access.observed_owner_id_.is_deleted()) << i;
    EXPECT_FALSE(access.observed_owner_id_.is_moved()) << i;
  }

  WRAP_ERROR_CODE(xct_manager->abort_xct(context));
  return foedus::kRetOk;
}
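
The ID checks above rely on dense per-branch numbering: with kTellers tellers and kAccounts accounts per branch, integer division recovers the owning branch and teller. Below is a tiny standalone check of that arithmetic under placeholder constants (the values are not the test's real kTellers/kAccounts).

#include <cassert>
#include <cstdint>

int main() {
  // Placeholder constants for illustration only.
  const uint32_t kTellers = 10;
  const uint32_t kAccounts = 100;
  const uint32_t kAccountsPerTellers = kAccounts / kTellers;

  for (uint32_t account_id = 0; account_id < 3U * kAccounts; ++account_id) {
    const uint32_t branch_id = account_id / kAccounts;
    const uint32_t teller_id = account_id / kAccountsPerTellers;
    // The same relationship verify_tpcb_task asserts for every history record.
    assert(branch_id == teller_id / kTellers);
  }
  return 0;
}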
Example #20
ErrorStack split_intermediate_sequential_task(const proc::ProcArguments& args) {
  thread::Thread* context = args.context_;
  MasstreeStorage masstree = context->get_engine()->get_storage_manager()->get_masstree("ggg");
  xct::XctManager* xct_manager = context->get_engine()->get_xct_manager();
  Epoch commit_epoch;
  // 1000 bytes payload -> only 2 tuples per page.
  // one intermediate page can point to about 150 pages.
  // inserting 400 tuples surely causes 3 levels
  for (uint32_t rep = 0; rep < 400; ++rep) {
    WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
    CHECK_ERROR(masstree.verify_single_thread(context));
    KeySlice key = normalize_primitive<uint64_t>(rep);
    char data[1000];
    std::memset(data, 0, 1000);
    std::memcpy(data + 123, &key, sizeof(key));
    WRAP_ERROR_CODE(masstree.insert_record_normalized(context, key, data, sizeof(data)));
    WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
    WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
    CHECK_ERROR(masstree.verify_single_thread(context));
    WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  }

  // now read
  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
  for (uint32_t rep = 0; rep < 400; ++rep) {
    KeySlice key = normalize_primitive<uint64_t>(rep);
    char data[1000];
    uint16_t capacity = 1000;
    WRAP_ERROR_CODE(masstree.get_record_normalized(context, key, data, &capacity));
    EXPECT_EQ(1000, capacity);
    char correct_data[1000];
    std::memset(correct_data, 0, 1000);
    std::memcpy(correct_data + 123, &key, sizeof(key));
    EXPECT_EQ(std::string(correct_data, 1000), std::string(data, capacity)) << rep;
  }
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));

  WRAP_ERROR_CODE(xct_manager->begin_xct(context, xct::kSerializable));
  CHECK_ERROR(masstree.verify_single_thread(context));
  WRAP_ERROR_CODE(xct_manager->precommit_xct(context, &commit_epoch));
  WRAP_ERROR_CODE(xct_manager->wait_for_commit(commit_epoch));
  return foedus::kRetOk;
}
ErrorStack split_a_child(
  thread::Thread* context,
  MasstreeIntermediatePage* root,
  Child original,
  std::vector<Child>* out) {
  ASSERT_ND(!original.pointer_.is_null());
  MasstreeIntermediatePage::MiniPage& minipage = root->get_minipage(original.index_);
  ASSERT_ND(
    minipage.pointers_[original.index_mini_].volatile_pointer_.is_equivalent(original.pointer_));
  MasstreePage* original_page = context->resolve_cast<MasstreePage>(original.pointer_);
  ASSERT_ND(original_page->get_low_fence() == original.low_);
  ASSERT_ND(original_page->get_high_fence() == original.high_);

  // lock it first.
  PageVersionLockScope scope(context, original_page->get_version_address());
  ASSERT_ND(original_page->is_locked());

  // if it already has a foster child, nothing to do.
  if (!original_page->is_moved()) {
    if (original_page->is_border()) {
      MasstreeBorderPage* casted = reinterpret_cast<MasstreeBorderPage*>(original_page);
      if (casted->get_key_count() < 2U) {
        // Then, no split possible.
        LOG(INFO) << "This border page can't be split anymore";
        out->emplace_back(original);
        return kRetOk;
      }
      // trigger doesn't matter. just make sure it doesn't cause no-record-split. so, use low_fence.
      // also, specify disable_nrs
      KeySlice trigger = casted->get_low_fence();
      MasstreeBorderPage* after = casted;
      xct::McsLockScope after_lock;
      casted->split_foster(context, trigger, true, &after, &after_lock);
      ASSERT_ND(after->is_locked());
      ASSERT_ND(after_lock.is_locked());
      ASSERT_ND(casted->is_moved());
    } else {
      MasstreeIntermediatePage* casted = reinterpret_cast<MasstreeIntermediatePage*>(original_page);
      uint32_t pointers = count_children(casted);
      if (pointers < 2U) {
        LOG(INFO) << "This intermediate page can't be split anymore";
        out->emplace_back(original);
        return kRetOk;
      }
      WRAP_ERROR_CODE(casted->split_foster_no_adopt(context));
    }
  } else {
    LOG(INFO) << "lucky, already split. just adopt";
  }

  ASSERT_ND(original_page->is_moved());

  VolatilePagePointer minor_pointer = original_page->get_foster_minor();
  VolatilePagePointer major_pointer = original_page->get_foster_major();
  ASSERT_ND(!minor_pointer.is_null());
  ASSERT_ND(!major_pointer.is_null());
  MasstreePage* minor = context->resolve_cast<MasstreePage>(minor_pointer);
  MasstreePage* major = context->resolve_cast<MasstreePage>(major_pointer);
  KeySlice middle = original_page->get_foster_fence();
  ASSERT_ND(minor->get_low_fence() == original.low_);
  ASSERT_ND(minor->get_high_fence() == middle);
  ASSERT_ND(major->get_low_fence() == middle);
  ASSERT_ND(major->get_high_fence() == original.high_);

  Child minor_out = {minor_pointer, original.low_, middle, 0, 0};
  out->emplace_back(minor_out);
  Child major_out = {major_pointer, middle, original.high_, 0, 0};
  out->emplace_back(major_out);
  return kRetOk;
}
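
split_a_child and fatify_first_root_double exchange children through a small Child value built as {pointer, low, high, index, index_mini}. Its declaration is not part of this listing; inferred from that usage, a plausible shape is sketched below (the field types are guesses).

#include <cstdint>
#include <vector>

using KeySlice = uint64_t;  // assumption, consistent with the slices used above

// Simplified stand-in for the volatile pointer; the real type is FOEDUS's
// VolatilePagePointer.
struct PointerSketch { uint64_t word; };

// Sketch of the Child record: where the child lives, its fence range, and where
// its pointer sits in the root (minipage index and index within the minipage).
struct ChildSketch {
  PointerSketch pointer_;
  KeySlice low_;
  KeySlice high_;
  uint16_t index_;
  uint16_t index_mini_;
};

int main() {
  std::vector<ChildSketch> children;
  // Mirrors the aggregate initialization used in split_a_child.
  children.push_back(ChildSketch{PointerSketch{1}, 0, 100, 0, 0});
  return children.size() == 1 ? 0 : 1;
}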
ErrorStack MasstreeStoragePimpl::fatify_first_root(
  thread::Thread* context,
  uint32_t desired_count) {
  LOG(INFO) << "Masstree-" << get_name() << " being fatified for " << desired_count;

  if (desired_count > kMaxIntermediatePointers) {
    LOG(INFO) << "desired_count too large. adjusted to the max";
    desired_count = kMaxIntermediatePointers;
  }

  // Check if the volatile page is moved. If so, grow it.
  while (true) {
    MasstreeIntermediatePage* root;
    WRAP_ERROR_CODE(get_first_root(context, true, &root));

    if (root->has_foster_child()) {
      // oh, the root page needs to grow
      LOG(INFO) << "oh, the root page needs to grow";
      WRAP_ERROR_CODE(grow_root(
        context,
        &get_first_root_pointer(),
        &get_first_root_owner(),
        &root));
      // then retry
    } else {
      break;
    }
  }

  while (true) {
    // lock the first root.
    xct::McsLockScope owner_scope(context, &get_first_root_owner());
    LOG(INFO) << "Locked the root page owner address.";
    MasstreeIntermediatePage* root;
    WRAP_ERROR_CODE(get_first_root(context, true, &root));
    PageVersionLockScope scope(context, root->get_version_address());
    LOG(INFO) << "Locked the root page itself.";
    if (root->has_foster_child()) {
      LOG(WARNING) << "Mm, I thought I grew the root, but concurrent xct again moved it. "
        << " Gave up fatifying. Should be super-rare.";
      return kRetOk;
    }

    ASSERT_ND(root->is_locked());
    ASSERT_ND(!root->is_moved());
    uint32_t current_count = count_children(root);
    LOG(INFO) << "Masstree-" << get_name() << " currently has " << current_count << " children";

    if (current_count >= desired_count || current_count >= (kMaxIntermediatePointers / 2U)) {
      LOG(INFO) << "Already enough fat. Done";
      break;
    }

    LOG(INFO) << "Splitting...";
    CHECK_ERROR(fatify_first_root_double(context));

    WRAP_ERROR_CODE(get_first_root(context, true, &root));
    uint32_t new_count = count_children(root);
    if (new_count == current_count) {
      LOG(INFO) << "Seems like we can't split any more.";
      break;
    }
  }

  return kRetOk;
}
Example #23
ErrorStack LogReducer::dump_buffer_sort_storage_write(
  const LogBuffer &buffer,
  storage::StorageId storage_id,
  const BufferPosition* sorted_logs,
  uint32_t shortest_key_length,
  uint32_t longest_key_length,
  uint32_t log_count,
  fs::DirectIoFile *dump_file) {
  debugging::StopWatch write_watch;
  char* io_buffer = reinterpret_cast<char*>(dump_io_buffer_.get_block());
  // we flush the IO buffer when we wrote out this number of bytes.
  // to keep it aligned, the bytes after this threshold have to be retained and copied over to
  // the beginning of the buffer.
  const uint64_t flush_threshold = dump_io_buffer_.get_size() - (1 << 16);
  uint64_t total_bytes = dump_block_header(
    buffer,
    storage_id,
    sorted_logs,
    shortest_key_length,
    longest_key_length,
    log_count,
    io_buffer);
  uint64_t total_written = 0;
  uint64_t current_pos = sizeof(FullBlockHeader);
  for (uint32_t i = 0; i < log_count; ++i) {
    const log::RecordLogType* record = buffer.resolve(sorted_logs[i]);
    ASSERT_ND(current_pos % 8 == 0);
    ASSERT_ND(record->header_.storage_id_ == storage_id);
    ASSERT_ND(record->header_.log_length_ > 0);
    ASSERT_ND(record->header_.log_length_ % 8 == 0);
    std::memcpy(io_buffer + current_pos, record, record->header_.log_length_);
    current_pos += record->header_.log_length_;
    if (current_pos >= flush_threshold) {
      WRAP_ERROR_CODE(dump_file->write(flush_threshold, dump_io_buffer_));

      // move the fragment to beginning
      if (current_pos > flush_threshold) {
        std::memcpy(io_buffer, io_buffer + flush_threshold, current_pos - flush_threshold);
      }
      current_pos -= flush_threshold;
      total_written += flush_threshold;
    }
  }

  ASSERT_ND(total_bytes == total_written + current_pos);  // now we went over all logs again

  if (current_pos > 0) {
    ASSERT_ND(current_pos < flush_threshold);
    // for aligned write, add a dummy storage block at the end.
    if (current_pos % (log::FillerLogType::kLogWriteUnitSize) != 0) {
      uint64_t upto = assorted::align<uint64_t, log::FillerLogType::kLogWriteUnitSize>(current_pos);
      ASSERT_ND(upto > current_pos);
      ASSERT_ND(upto < current_pos + log::FillerLogType::kLogWriteUnitSize);
      ASSERT_ND(upto % log::FillerLogType::kLogWriteUnitSize == 0);
      FillerBlockHeader* filler = reinterpret_cast<FillerBlockHeader*>(io_buffer + current_pos);
      filler->block_length_ = to_buffer_position(upto - current_pos);
      ASSERT_ND(filler->block_length_ < to_buffer_position(log::FillerLogType::kLogWriteUnitSize));
      filler->magic_word_ = BlockHeaderBase::kFillerBlockHeaderMagicWord;
      if (upto - current_pos > sizeof(FillerBlockHeader)) {
        // fill it with zeros. not mandatory, but wouldn't hurt. it's just 4kb.
        std::memset(
          io_buffer + current_pos + sizeof(FillerBlockHeader),
          0,
          upto - current_pos - sizeof(FillerBlockHeader));
      }
      current_pos = upto;
    }

    ASSERT_ND(current_pos % log::FillerLogType::kLogWriteUnitSize == 0);
    WRAP_ERROR_CODE(dump_file->write(current_pos, dump_io_buffer_));
    total_written += current_pos;
  }

  ASSERT_ND(total_written % log::FillerLogType::kLogWriteUnitSize == 0);
  write_watch.stop();
  LOG(INFO) << to_string() << " Wrote out storage-" << storage_id << " which had " << log_count
    << " log records (" << total_written << " bytes) in "<< write_watch.elapsed_ms() << "ms";
  return kRetOk;
}
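
The write loop above flushes a fixed, aligned flush_threshold worth of bytes whenever the fill position crosses it, then copies the unflushed tail back to the start of the buffer so the next write stays aligned. The toy program below reproduces just that carry-over pattern; the buffer sizes and the in-memory sink are made up.

#include <cstddef>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // Toy setup: a 64-byte buffer flushed in 48-byte "aligned" chunks.
  const std::size_t kBufferSize = 64;
  const std::size_t kFlushThreshold = 48;
  std::vector<char> buffer(kBufferSize);
  std::vector<char> sink;  // stands in for the dump file

  const char record[10] = "123456789";  // 10 bytes including the terminator
  std::size_t current_pos = 0;
  std::size_t total_written = 0;
  for (int i = 0; i < 20; ++i) {
    std::memcpy(buffer.data() + current_pos, record, sizeof(record));
    current_pos += sizeof(record);
    if (current_pos >= kFlushThreshold) {
      // Flush exactly kFlushThreshold bytes...
      sink.insert(sink.end(), buffer.data(), buffer.data() + kFlushThreshold);
      // ...then move the leftover fragment to the beginning of the buffer.
      if (current_pos > kFlushThreshold) {
        std::memmove(buffer.data(), buffer.data() + kFlushThreshold,
                     current_pos - kFlushThreshold);
      }
      current_pos -= kFlushThreshold;
      total_written += kFlushThreshold;
    }
  }
  std::cout << "written=" << total_written << " pending=" << current_pos << std::endl;
  return 0;
}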
Example #24
ErrorStack LogMapper::handle_process() {
  const Epoch base_epoch = parent_.get_base_epoch();
  const Epoch until_epoch = parent_.get_valid_until_epoch();
  log::LoggerRef logger = engine_->get_log_manager()->get_logger(id_);
  const log::LogRange log_range = logger.get_log_range(base_epoch, until_epoch);
  // uint64_t cur_offset = log_range.begin_offset;
  if (log_range.is_empty()) {
    LOG(INFO) << to_string() << " has no logs to process";
    report_completion(0);
    return kRetOk;
  }

  // open the file and seek to there. be careful on page boundary.
  // as we use direct I/O, all I/O must be 4kb-aligned. when the read range is not
  // a multiple of 4kb, we read a little bit more (at most 4kb per read, so negligible).
  // to clarify, here we use the following suffixes
  //   "infile"/"inbuf" : the offset is an offset in entire file/IO buffer
  //   "aligned" : the offset is 4kb-aligned (careful on floor vs ceil)
  // Lengthy, but otherwise it's so confusing.
  processed_log_count_ = 0;
  IoBufStatus status;
  status.size_inbuf_aligned_ = io_buffer_.get_size();
  status.cur_file_ordinal_ = log_range.begin_file_ordinal;
  status.ended_ = false;
  status.first_read_ = true;
  debugging::StopWatch watch;
  while (!status.ended_) {  // loop for log file switch
    fs::Path path(engine_->get_options().log_.construct_suffixed_log_path(
      numa_node_,
      id_,
      status.cur_file_ordinal_));
    uint64_t file_size = fs::file_size(path);
    if (file_size % kIoAlignment != 0) {
      LOG(WARNING) << to_string() << " Interesting, non-aligned file size, which probably means"
        << " previous writes didn't flush. file path=" << path << ", file size=" << file_size;
      file_size = align_io_floor(file_size);
    }
    ASSERT_ND(file_size % kIoAlignment == 0);
    status.size_infile_aligned_ = file_size;

    // If this is the first file to read, we might be reading from non-zero position.
    // In that case, be careful on alignment.
    if (status.cur_file_ordinal_ == log_range.begin_file_ordinal) {
      status.next_infile_ = log_range.begin_offset;
    } else {
      status.next_infile_ = 0;
    }

    if (status.cur_file_ordinal_ == log_range.end_file_ordinal) {
      ASSERT_ND(log_range.end_offset <= file_size);
      status.end_infile_ = log_range.end_offset;
    } else {
      status.end_infile_ = file_size;
    }

    DVLOG(1) << to_string() << " file path=" << path << ", file size=" << assorted::Hex(file_size)
      << ", read_end=" << assorted::Hex(status.end_infile_);
    fs::DirectIoFile file(path, engine_->get_options().snapshot_.emulation_);
    WRAP_ERROR_CODE(file.open(true, false, false, false));
    DVLOG(1) << to_string() << "opened log file " << file;

    while (true) {
      WRAP_ERROR_CODE(check_cancelled());  // check per each read
      status.buf_infile_aligned_ = align_io_floor(status.next_infile_);
      WRAP_ERROR_CODE(file.seek(status.buf_infile_aligned_, fs::DirectIoFile::kDirectIoSeekSet));
      DVLOG(1) << to_string() << " seeked to: " << assorted::Hex(status.buf_infile_aligned_);
      status.end_inbuf_aligned_ = std::min(
        io_buffer_.get_size(),
        align_io_ceil(status.end_infile_ - status.buf_infile_aligned_));
      ASSERT_ND(status.end_inbuf_aligned_ % kIoAlignment == 0);
      WRAP_ERROR_CODE(file.read(status.end_inbuf_aligned_, &io_buffer_));

      status.cur_inbuf_ = 0;
      if (status.next_infile_ != status.buf_infile_aligned_) {
        ASSERT_ND(status.next_infile_ > status.buf_infile_aligned_);
        status.cur_inbuf_ = status.next_infile_ - status.buf_infile_aligned_;
        DVLOG(1) << to_string() << " skipped " << status.cur_inbuf_ << " bytes for aligned read";
      }

      CHECK_ERROR(handle_process_buffer(file, &status));
      if (status.more_in_the_file_) {
        ASSERT_ND(status.next_infile_ > status.buf_infile_aligned_);
      } else {
        if (log_range.end_file_ordinal == status.cur_file_ordinal_) {
          status.ended_ = true;
          break;
        } else {
          ++status.cur_file_ordinal_;
          status.next_infile_ = 0;
          LOG(INFO) << to_string()
            << " moved on to next log file ordinal " << status.cur_file_ordinal_;
        }
      }
    }
    file.close();
  }
  watch.stop();
  LOG(INFO) << to_string() << " processed " << processed_log_count_ << " log entries in "
    << watch.elapsed_sec() << "s";
  report_completion(watch.elapsed_sec());
  return kRetOk;
}
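
handle_process keeps every direct-I/O read 4KB-aligned by rounding offsets down (align_io_floor) or up (align_io_ceil). A minimal version of those helpers, assuming the alignment is a power of two, is shown below; the names and bodies here are illustrative, not the FOEDUS utilities.

#include <cstdint>
#include <iostream>

const uint64_t kIoAlignmentSketch = 1ULL << 12;  // 4KB, assumed power of two

uint64_t align_floor_sketch(uint64_t offset) {
  return offset & ~(kIoAlignmentSketch - 1);
}
uint64_t align_ceil_sketch(uint64_t offset) {
  return align_floor_sketch(offset + kIoAlignmentSketch - 1);
}

int main() {
  // A read that logically starts at byte 5000 must begin at 4096 and end at 8192.
  std::cout << align_floor_sketch(5000) << " " << align_ceil_sketch(5000) << std::endl;
  return 0;
}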
Example #25
ErrorStack MetaLogger::truncate_non_durable(Epoch saved_durable_epoch) {
  ASSERT_ND(saved_durable_epoch.is_valid());
  const uint64_t from_offset = control_block_->oldest_offset_;
  const uint64_t to_offset = control_block_->durable_offset_;
  ASSERT_ND(from_offset <= to_offset);
  LOG(INFO) << "Truncating non-durable meta logs, if any. Right now meta logger's"
    << " oldest_offset_=" << from_offset
    << ", (meta logger's local) durable_offset_=" << to_offset
    << ", global saved_durable_epoch=" << saved_durable_epoch;
  ASSERT_ND(current_file_->is_opened());

  // Currently, we need to read everything from oldest_offset_ to see from where
  // we might have non-durable logs.
  // TASK(Hideaki) We should change SavepointManager to emit globally_durable_offset_. later.
  const uint64_t read_size = to_offset - from_offset;
  if (read_size > 0) {
    memory::AlignedMemory buffer;
    buffer.alloc(read_size, 1U << 12, memory::AlignedMemory::kNumaAllocOnnode, 0);
    WRAP_ERROR_CODE(current_file_->seek(from_offset, fs::DirectIoFile::kDirectIoSeekSet));
    WRAP_ERROR_CODE(current_file_->read_raw(read_size, buffer.get_block()));

    char* buf = reinterpret_cast<char*>(buffer.get_block());
    uint64_t cur = 0;
    uint64_t first_non_durable_at = read_size;
    while (cur < read_size) {
      log::BaseLogType* entry = reinterpret_cast<log::BaseLogType*>(buf + cur);
      ASSERT_ND(entry->header_.get_kind() != log::kRecordLogs);
      const uint32_t log_length = entry->header_.log_length_;
      log::LogCode type = entry->header_.get_type();
      ASSERT_ND(type != log::kLogCodeInvalid);
      if (type == log::kLogCodeFiller || type == log::kLogCodeEpochMarker) {
        // Skip filler/marker. These don't have XID
      } else {
        Epoch epoch = entry->header_.xct_id_.get_epoch();
        if (epoch <= saved_durable_epoch) {
          // Mostly this case.
        } else {
          // Ok, found a non-durable entry!
          const uint64_t raw_offset = from_offset + cur;
          on_non_durable_meta_log_found(&entry->header_, saved_durable_epoch, raw_offset);
          ASSERT_ND(first_non_durable_at == read_size || first_non_durable_at < cur);
          first_non_durable_at = std::min(first_non_durable_at, cur);
          // We can break here, but let's read all and warn all of them. meta log should be tiny
        }
      }
      cur += log_length;
    }

    if (first_non_durable_at < read_size) {
      // NOTE: This happens. Although the meta logger itself immediately flushes all logs
      // to durable storages, the global durable_epoch is min(all_logger_durable_epoch).
      // Thus, when the user didn't invoke wait_on_commit, we might have to discard
      // some meta logs that are "durable by itself" but "non-durable regarding the whole database"
      LOG(WARNING) << "Found some meta logs that are not in durable epoch (" << saved_durable_epoch
        << "). We will truncate non-durable regions. new durable_offset=" << first_non_durable_at;
      control_block_->durable_offset_ = first_non_durable_at;
      engine_->get_savepoint_manager()->change_meta_logger_durable_offset(first_non_durable_at);
    }
  } else {
    // Even if all locally-durable regions are globally durable,
    // there still could be locally-non-durable regions (=not yet fsynced).
    // Will truncate such regions.
    LOG(ERROR) << "Meta log file has a non-durable region. Probably there"
      << " was a crash. Will truncate";
  }

  const uint64_t new_offset = control_block_->durable_offset_;
  if (new_offset < current_file_->get_current_offset()) {
    LOG(WARNING) << "Truncating meta log file to " << new_offset
      << " from " << current_file_->get_current_offset();
    WRAP_ERROR_CODE(current_file_->truncate(new_offset, true));
  }
  WRAP_ERROR_CODE(current_file_->seek(new_offset, fs::DirectIoFile::kDirectIoSeekSet));
  return kRetOk;
}
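
The scan in truncate_non_durable relies only on the log being a sequence of length-prefixed entries laid end to end: read the header, act on it, advance by log_length_. The toy walk below shows that traversal over a made-up record format (ToyHeader is not FOEDUS's log header).

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Made-up header: just the total entry length, including the header itself.
struct ToyHeader { uint32_t log_length_; };

int main() {
  // Lay out three back-to-back entries of 8, 16, and 12 bytes.
  const uint32_t lengths[3] = {8, 16, 12};
  std::vector<char> buf(8 + 16 + 12, 0);
  uint64_t pos = 0;
  for (uint32_t len : lengths) {
    ToyHeader header{len};
    std::memcpy(buf.data() + pos, &header, sizeof(header));
    pos += len;
  }

  // Walk the buffer the same way the truncation scan does: advance by each length.
  uint64_t cur = 0;
  while (cur < buf.size()) {
    ToyHeader header;
    std::memcpy(&header, buf.data() + cur, sizeof(header));
    std::cout << "entry at " << cur << " length " << header.log_length_ << std::endl;
    cur += header.log_length_;
  }
  return 0;
}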