// Load every message buffer of this inner node (the first child's plus one
// per pivot) that is not already resident, reading each from its recorded
// offset via `reader`.  Updates msgcnt_/msgbufsz_ accounting and marks the
// node kFullLoaded on success.  Returns false on the first read error;
// msgbufs loaded before the failure are kept and remain accounted for.
bool InnerNode::load_all_msgbuf(BlockReader& reader) {
    Slice buffer;
    if (tree_->compressor_) {
        // Compression enabled: allocate one scratch buffer sized for the
        // largest uncompressed msgbuf, reused across all reads below.
        size_t buffer_length = first_msgbuf_uncompressed_length_;
        for (size_t i = 0; i < pivots_.size(); i++) {
            if (buffer_length < pivots_[i].uncompressed_length) {
                buffer_length = pivots_[i].uncompressed_length;
            }
        }
        buffer = Slice::alloc(buffer_length);
    }
    // The first msgbuf (for the leftmost child) is stored separately from
    // the pivot array.
    if (first_msgbuf_ == NULL) {
        reader.seek(first_msgbuf_offset_);
        first_msgbuf_ = new MsgBuf(tree_->options_.comparator);
        if (!read_msgbuf(reader, first_msgbuf_length_,
                         first_msgbuf_uncompressed_length_,
                         first_msgbuf_, buffer)) {
            // Release the scratch buffer before bailing out.
            if (buffer.size()) {
                buffer.destroy();
            }
            return false;
        }
        msgcnt_ += first_msgbuf_->count();
        msgbufsz_ += first_msgbuf_->size();
    }
    // Load each pivot's msgbuf that is not yet in memory.
    for (size_t i = 0; i < pivots_.size(); i++) {
        if (pivots_[i].msgbuf == NULL) {
            reader.seek(pivots_[i].offset);
            pivots_[i].msgbuf = new MsgBuf(tree_->options_.comparator);
            if (!read_msgbuf(reader, pivots_[i].length,
                             pivots_[i].uncompressed_length,
                             pivots_[i].msgbuf, buffer)) {
                if (buffer.size()) {
                    buffer.destroy();
                }
                return false;
            }
            msgcnt_ += pivots_[i].msgbuf->count();
            msgbufsz_ += pivots_[i].msgbuf->size();
        }
    }
    if (buffer.size()) {
        buffer.destroy();
    }
    status_ = kFullLoaded;
    return true;
}
bool LeafNode::write_to(BlockWriter& writer, size_t& skeleton_size) { assert(status_ == kNew || status_ == kFullLoaded); size_t skeleton_pos = writer.pos(); skeleton_size = 8 + 8 + buckets_info_size_; if (!writer.skip(skeleton_size)) return false; Slice buffer; if (tree_->compressor_) { size_t buffer_length = 0; for (size_t i = 0; i < records_.buckets_number(); i++) { if (buffer_length < records_.bucket_length(i)) { buffer_length = records_.bucket_length(i); } } buffer = Slice::alloc(buffer_length); } assert(records_.buckets_number() == buckets_info_.size()); for (size_t i = 0; i < records_.buckets_number(); i++ ) { RecordBucket* bucket = records_.bucket(i); buckets_info_[i].offset = writer.pos(); if (!write_bucket(writer, bucket, buffer)) { if (buffer.size()) { buffer.destroy(); } return false; } buckets_info_[i].length = writer.pos() - buckets_info_[i].offset; buckets_info_[i].uncompressed_length = records_.bucket_length(i); } size_t last_pos = writer.pos(); if (buffer.size()) { buffer.destroy(); } writer.seek(skeleton_pos); if (!writer.writeUInt64(left_sibling_)) return false; if (!writer.writeUInt64(right_sibling_)) return false; if (!write_buckets_info(writer)) { LOG_ERROR("write buckets info error, nid " << nid_); return false; } writer.seek(last_pos); return true; }
// Round-trip test: write 100k numeric keys, flush, then read them back
// and verify each value equals the decimal rendering of its key.
TEST(DB, read_and_write) {
    Options opts;
    opts.dir = create_ram_directory();
    opts.comparator = new NumericComparator<uint64_t>();
    opts.inner_node_page_size = 4 * 1024;
    opts.inner_node_children_number = 64;
    opts.leaf_node_page_size = 4 * 1024;
    opts.cache_limit = 32 * 1024;

    DB *db = DB::open("test_db", opts);
    EXPECT_TRUE(db != NULL);

    for (uint64_t i = 0; i < 100000; i++ ) {
        char buf[16] = {0};
        // Bug fix: "%ld" with a uint64_t argument is a format/argument
        // mismatch (undefined behavior; wrong on platforms where long is
        // 32 bits).  Cast to unsigned long long and use "%llu".
        sprintf(buf, "%llu", (unsigned long long)i);
        Slice key = Slice((char*)&i, sizeof(uint64_t));
        Slice value = Slice(buf, strlen(buf));
        ASSERT_TRUE(db->put(key, value)) << "put key " << i << " error";
        if (i % 10000 == 0) {
            cout << "write " << i << " records" << endl;
        }
    }
    db->flush();
    db->debug_print(cout);

    for (uint64_t i = 0; i < 100000; i++ ) {
        Slice key = Slice((char*)&i, sizeof(uint64_t));
        Slice value;
        ASSERT_TRUE(db->get(key, value)) << "get key " << i << " error";
        char buf[16] = {0};
        // Same fix as the write loop above.
        sprintf(buf, "%llu", (unsigned long long)i);
        ASSERT_EQ(value.size(), strlen(buf)) << "get key " << i << " value size unequal" ;
        ASSERT_TRUE(strncmp(buf, value.data(), value.size()) == 0)
            << "get key " << i << " value data unequal";
        value.destroy();
        if (i % 10000 == 0) {
            cout << "read " << i << " records" << endl;
        }
    }

    delete db;
    delete opts.dir;
    delete opts.comparator;
}
// Load every record bucket of this leaf from `reader` at the offsets
// recorded in buckets_info_.  Marks the node kFullLoaded only when every
// bucket loaded successfully; returns false on the first failure (buckets
// loaded before the failure are kept in records_).
bool LeafNode::load_all_buckets(BlockReader& reader) {
    Slice buffer;
    if (tree_->compressor_) {
        // One scratch buffer sized for the largest uncompressed bucket,
        // reused by every read_bucket call.
        size_t buffer_length = 0;
        for (size_t i = 0; i < buckets_info_.size(); i++ ) {
            if (buffer_length < buckets_info_[i].uncompressed_length) {
                buffer_length = buckets_info_[i].uncompressed_length;
            }
        }
        buffer = Slice::alloc(buffer_length);
    }

    bool ret = true;
    for (size_t i = 0; i < buckets_info_.size(); i++) {
        reader.seek(buckets_info_[i].offset);
        // Note: plain `new` throws std::bad_alloc on failure rather than
        // returning NULL, so the old `bucket == NULL` check was dead code.
        RecordBucket *bucket = new RecordBucket();
        if (!read_bucket(reader, buckets_info_[i].length,
                         buckets_info_[i].uncompressed_length,
                         bucket, buffer)) {
            ret = false;
            delete bucket;
            break;
        }
        records_.set_bucket(i, bucket);
    }

    if (buffer.size()) {
        buffer.destroy();
    }
    // Bug fix: status_ was previously set to kFullLoaded unconditionally,
    // so a node whose bucket load failed was still reported as fully
    // loaded.  Only promote the status when every bucket is resident.
    if (ret) {
        status_ = kFullLoaded;
    }
    return ret;
}
// Merge the messages buffered in `mb` (owned by `parent`) into this leaf,
// then trigger a merge or split if the leaf became empty or oversized.
// Lock order: this->write_lock -> mb->write_lock; parent's lock is released
// before this node's ("crab walk"), and this node's lock is released by
// merge()/split() or the explicit unlock() below.
// NOTE(review): assumes mb is non-empty — mb->begin() is dereferenced for
// the anchor key without a check; confirm callers guarantee this.
bool LeafNode::cascade(MsgBuf *mb, InnerNode* parent) {
    write_lock();
    if (status_ == kSkeletonLoaded) {
        // Merging needs every bucket in memory.
        load_all_buckets();
    }
    // lock message buffer from parent
    mb->write_lock();
    // Snapshot the buffer's size/count so the parent's accounting can be
    // adjusted after the buffer is cleared.
    size_t oldcnt = mb->count();
    size_t oldsz = mb->size();
    // Anchor key used to locate this leaf again for merge()/split().
    Slice anchor = mb->begin()->key.clone();

    // merge message buffer into leaf
    RecordBuckets res(tree_->options_.leaf_node_bucket_size);
    MsgBuf::Iterator it = mb->begin();
    RecordBuckets::Iterator jt = records_.get_iterator();
    // Classic sorted two-way merge of messages and existing records.
    while (it != mb->end() && jt.valid()) {
        int n = tree_->options_.comparator->compare(it->key, jt.record().key);
        if (n < 0) {
            // Message key precedes the next record.
            if (it->type == Put) {
                res.push_back(to_record(*it));
            } else {
                // just throw deletion to non-exist record
                it->destroy();
            }
            it ++;
        } else if (n > 0) {
            // Record precedes the next message: keep it unchanged.
            res.push_back(jt.record());
            jt.next();
        } else {
            // Same key: the message wins (Put replaces, Del drops).
            if (it->type == Put) {
                res.push_back(to_record(*it));
            }
            // old record is deleted
            it ++;
            jt.record().key.destroy();
            jt.record().value.destroy();
            jt.next();
        }
    }
    // Drain whichever side is left over.
    for (; it != mb->end(); it++) {
        if (it->type == Put) {
            res.push_back(to_record(*it));
        }
    }
    while(jt.valid()) {
        res.push_back(jt.record());
        jt.next();
    }
    records_.swap(res);
    refresh_buckets_info();
    set_dirty(true);

    // clear message buffer
    mb->clear();
    // mb has just been cleared, so mb->count()/mb->size() here are the
    // post-clear values; the net effect subtracts the drained messages
    // from the parent's counters.
    parent->msgcnt_ = parent->msgcnt_ + mb->count() - oldcnt;
    parent->msgbufsz_ = parent->msgbufsz_ + mb->size() - oldsz;
    // unlock message buffer
    mb->unlock();
    // crab walk
    parent->unlock();

    // merge()/split() take over this node's lock; the plain path must
    // release it explicitly.
    if (records_.size() == 0) {
        merge(anchor);
    } else if (records_.size() > 1 &&
               (records_.size() > tree_->options_.leaf_node_record_count ||
                size() > tree_->options_.leaf_node_page_size)) {
        split(anchor);
    } else {
        unlock();
    }
    anchor.destroy();
    return true;
}
bool InnerNode::write_to(BlockWriter& writer, size_t& skeleton_size) { // get length of skeleton and reserve space for skeleton size_t skeleton_offset = writer.pos(); size_t skeleton_length = 1 + 4 + 8 + 4 + 4 + 4; for (size_t i = 0; i < pivots_.size(); i++) { skeleton_length += pivot_size(pivots_[i].key); } if (!writer.skip(skeleton_length)) return false; // prepare buffer if compression is enabled Slice buffer; if (tree_->compressor_) { // get buffer length to serialize msgbuf size_t buffer_length = first_msgbuf_->size(); for (size_t i = 0; i < pivots_.size(); i++) { if (pivots_[i].msgbuf->size() > buffer_length) buffer_length = pivots_[i].msgbuf->size(); } buffer = Slice::alloc(buffer_length); } // write the first msgbuf first_msgbuf_offset_ = writer.pos(); if (!write_msgbuf(writer, first_msgbuf_, buffer)) return false; first_msgbuf_length_ = writer.pos() - first_msgbuf_offset_; first_msgbuf_uncompressed_length_ = first_msgbuf_->size(); // write rest msgbufs for (size_t i = 0; i < pivots_.size(); i++) { pivots_[i].offset = writer.pos(); if (!write_msgbuf(writer, pivots_[i].msgbuf, buffer)) return false; pivots_[i].length = writer.pos() - pivots_[i].offset; pivots_[i].uncompressed_length = pivots_[i].msgbuf->size(); } if (buffer.size()) { buffer.destroy(); } size_t last_offset = writer.pos(); // seek to the head and write index writer.seek(skeleton_offset); if (!writer.writeBool(bottom_)) return false; if (!writer.writeUInt32(pivots_.size())) return false; if (!writer.writeUInt64(first_child_)) return false; if (!writer.writeUInt32(first_msgbuf_offset_)) return false; if (!writer.writeUInt32(first_msgbuf_length_)) return false; if (!writer.writeUInt32(first_msgbuf_uncompressed_length_)) return false; for (size_t i = 0; i < pivots_.size(); i++) { if (!writer.writeSlice(pivots_[i].key)) return false; if (!writer.writeUInt64(pivots_[i].child)) return false; if (!writer.writeUInt32(pivots_[i].offset)) return false; if (!writer.writeUInt32(pivots_[i].length)) return 
false; if (!writer.writeUInt32(pivots_[i].uncompressed_length)) return false; } writer.seek(last_offset); skeleton_size = skeleton_length; return true; }
// Lazily load one msgbuf by index (idx == 0 is the first child's msgbuf,
// idx >= 1 maps to pivots_[idx-1]).  Entered holding the read lock; the
// lock is temporarily dropped and re-taken as a write lock to publish the
// loaded buffer, then downgraded back to a read lock before returning.
// Returns false on any read/deserialize error.
bool InnerNode::load_msgbuf(int idx) {
    uint32_t offset;
    uint32_t length;
    uint32_t uncompressed_length;
    if (idx == 0) {
        offset = first_msgbuf_offset_;
        length = first_msgbuf_length_;
        uncompressed_length = first_msgbuf_uncompressed_length_;
    } else {
        offset = pivots_[idx-1].offset;
        length = pivots_[idx-1].length;
        uncompressed_length = pivots_[idx-1].uncompressed_length;
    }

    // Fetch the raw (possibly compressed) block from the on-disk layout.
    Block* block = tree_->layout_->read(nid_, offset, length);
    if (block == NULL) {
        LOG_ERROR("read msgbuf from layout error " << " nid " << nid_
            << ", idx " << idx
            << ", offset " << offset
            << ", length " << length);
        return false;
    }
    BlockReader reader(block);

    // Scratch buffer for decompression, only needed with a compressor.
    Slice buffer;
    if (tree_->compressor_) {
        buffer = Slice::alloc(uncompressed_length);
    }

    // Deserialize into a fresh MsgBuf before touching shared state.
    MsgBuf *b = new MsgBuf(tree_->options_.comparator);
    assert(b);
    if (!read_msgbuf(reader, length, uncompressed_length, b, buffer)) {
        LOG_ERROR("read_msgbuf error " << " nid " << nid_ << ", idx " << idx);
        delete b;
        if (buffer.size()) {
            buffer.destroy();
        }
        tree_->layout_->destroy(block);
        return false;
    }
    if (buffer.size()) {
        buffer.destroy();
    }

    // lazy load, upgrade lock to write lock
    // TODO: write a upgradable rwlock
    // NOTE: the node is unlocked for a moment here, so another thread may
    // load the same msgbuf first — hence the NULL re-check below.
    unlock();
    write_lock();

    MsgBuf **pb = (idx == 0) ? &first_msgbuf_ : &(pivots_[idx-1].msgbuf);
    if (*pb == NULL) {
        // We won the race: publish our buffer and update accounting.
        *pb = b;
        msgcnt_ += b->count();
        msgbufsz_ += b->size();
    } else {
        // Another thread already loaded it; drop our copy.
        delete b;
    }
    unlock();
    read_lock();

    tree_->layout_->destroy(block);
    return true;
}
// Lazily load a single record bucket by index.  Entered holding the read
// lock; the lock is temporarily dropped and re-taken as a write lock to
// publish the loaded bucket, then downgraded back to a read lock before
// returning.  Returns false on any read/deserialize error.
bool LeafNode::load_bucket(size_t idx) {
    assert(status_ != kFullLoaded);
    assert(idx < buckets_info_.size());
    assert(records_.bucket(idx) == NULL);

    uint32_t offset = buckets_info_[idx].offset;
    uint32_t length = buckets_info_[idx].length;
    uint32_t uncompressed_length = buckets_info_[idx].uncompressed_length;

    // Fetch the raw (possibly compressed) block from the on-disk layout.
    Block* block = tree_->layout_->read(nid_, offset, length);
    if (block == NULL) {
        LOG_ERROR("read bucket error " << " nid " << nid_
            << ", idx " << idx
            << ", offset " << offset
            << ", length " << length);
        return false;
    }
    BlockReader reader(block);

    RecordBucket *bucket = new RecordBucket();
    if (bucket == NULL) {
        tree_->layout_->destroy(block);
        return false;
    }

    // Scratch buffer for decompression, only needed with a compressor.
    Slice buffer;
    if (tree_->compressor_) {
        buffer = Slice::alloc(uncompressed_length);
    }

    if (!read_bucket(reader, length, uncompressed_length, bucket, buffer)) {
        if (buffer.size()) {
            buffer.destroy();
        }
        delete bucket;
        tree_->layout_->destroy(block);
        return false;
    }
    if (buffer.size()) {
        buffer.destroy();
    }

    // this operation must be inside read lock
    // lazy load, upgrade lock to write lock
    // TODO: write a upgradable rwlock
    // NOTE: the node is unlocked for a moment here, so another thread may
    // load the same bucket first — hence the NULL re-check below.
    unlock();
    write_lock();
    if (records_.bucket(idx) == NULL) {
        // We won the race: publish our bucket.
        records_.set_bucket(idx, bucket);
    } else {
        // it's possible another read thread loading
        // the same block at the same time
        delete bucket;
    }
    unlock();
    read_lock();

    tree_->layout_->destroy(block);
    return true;
}