Example #1
bool InnerNode::load_all_msgbuf(BlockReader& reader)
{
    Slice buffer;
    if (tree_->compressor_) {
        // size the shared decompression buffer to fit the largest uncompressed msgbuf
        size_t buffer_length = first_msgbuf_uncompressed_length_;
        for (size_t i = 0; i < pivots_.size(); i++) {
            if (buffer_length < pivots_[i].uncompressed_length) {
                buffer_length = pivots_[i].uncompressed_length;
            }
        }

        buffer = Slice::alloc(buffer_length);
    }

    // load the first msgbuf if it is not resident yet
    if (first_msgbuf_ == NULL) {
        reader.seek(first_msgbuf_offset_);
        first_msgbuf_ = new MsgBuf(tree_->options_.comparator);
        if (!read_msgbuf(reader, first_msgbuf_length_,
                         first_msgbuf_uncompressed_length_, 
                         first_msgbuf_, buffer)) {
            if (buffer.size()) {
                buffer.destroy();
            }
            return false;
        }
        msgcnt_ += first_msgbuf_->count();
        msgbufsz_ += first_msgbuf_->size();
    }

    // load the remaining msgbufs
    for (size_t i = 0; i < pivots_.size(); i++) {
        if (pivots_[i].msgbuf == NULL) {
            reader.seek(pivots_[i].offset);
            pivots_[i].msgbuf = new MsgBuf(tree_->options_.comparator);
            if (!read_msgbuf(reader, pivots_[i].length,
                             pivots_[i].uncompressed_length,
                             pivots_[i].msgbuf, buffer)) {
                if (buffer.size()) {
                    buffer.destroy();
                }
                return false;
            }
            msgcnt_ += pivots_[i].msgbuf->count();
            msgbufsz_ += pivots_[i].msgbuf->size();
        }
    }

    if (buffer.size()) {
        buffer.destroy();
    }

    status_ = kFullLoaded;
    return true;
}
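
This example (and the ones that follow) repeats the same cleanup on every error path: `if (buffer.size()) { buffer.destroy(); }`. A small scope guard could centralize that. The sketch below is not part of the original source; ScratchBuffer is a hypothetical stand-in for Slice, assuming only the alloc(), size() and destroy() calls visible above.

// Sketch of a scope guard for the scratch buffer (not in the original source).
#include <cstddef>
#include <cstdlib>

struct ScratchBuffer {
    char*  data = nullptr;
    size_t len  = 0;

    static ScratchBuffer alloc(size_t n) {
        ScratchBuffer b;
        b.data = static_cast<char*>(std::malloc(n));
        b.len  = b.data ? n : 0;
        return b;
    }
    size_t size() const { return len; }
    void destroy() { std::free(data); data = nullptr; len = 0; }
};

// Frees the buffer on every exit path, success or failure, so the repeated
// "if (buffer.size()) buffer.destroy();" blocks become unnecessary.
struct ScopedScratch {
    ScratchBuffer buf;

    explicit ScopedScratch(size_t n = 0) { if (n) buf = ScratchBuffer::alloc(n); }
    ~ScopedScratch() { if (buf.size()) buf.destroy(); }

    ScopedScratch(const ScopedScratch&) = delete;
    ScopedScratch& operator=(const ScopedScratch&) = delete;
};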
Example #2
bool LeafNode::write_to(BlockWriter& writer, size_t& skeleton_size)
{
    assert(status_ == kNew || status_ == kFullLoaded);

    size_t skeleton_pos = writer.pos();
    // skeleton: left sibling (8) + right sibling (8) + buckets info
    skeleton_size = 8 + 8 + buckets_info_size_;
    if (!writer.skip(skeleton_size)) return false;

    Slice buffer;
    if (tree_->compressor_) {
        // size the shared compression buffer to fit the largest bucket
        size_t buffer_length = 0;
        for (size_t i = 0; i < records_.buckets_number(); i++) {
            if (buffer_length < records_.bucket_length(i)) {
                buffer_length = records_.bucket_length(i);
            }
        }
        buffer = Slice::alloc(buffer_length);
    }

    assert(records_.buckets_number() == buckets_info_.size());
    for (size_t i = 0; i < records_.buckets_number(); i++) {
        RecordBucket* bucket = records_.bucket(i);

        buckets_info_[i].offset = writer.pos();
        if (!write_bucket(writer, bucket, buffer)) {
            if (buffer.size()) {
                buffer.destroy();
            }
            return false;
        }
        buckets_info_[i].length = writer.pos() - buckets_info_[i].offset;
        buckets_info_[i].uncompressed_length = records_.bucket_length(i);
    }
    size_t last_pos = writer.pos();

    if (buffer.size()) {
        buffer.destroy();
    }

    // seek back to the reserved skeleton and fill it in
    writer.seek(skeleton_pos);
    if (!writer.writeUInt64(left_sibling_)) return false;
    if (!writer.writeUInt64(right_sibling_)) return false;

    if (!write_buckets_info(writer)) {
        LOG_ERROR("write buckets info error, nid " << nid_);
        return false;
    }
    writer.seek(last_pos);
    return true;
}
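
The function above uses a reserve-then-backpatch layout: it skips past the skeleton, writes the buckets while recording their offsets and lengths, then seeks back to fill the skeleton in. Below is a minimal, self-contained illustration of that pattern over a plain byte vector; all names are hypothetical, not cascadb's API.

// Minimal sketch of the "reserve skeleton, write body, backpatch" pattern.
#include <cstdint>
#include <cstring>
#include <vector>

struct ByteWriter {
    std::vector<uint8_t> buf;
    size_t off = 0;

    size_t pos() const { return off; }
    void seek(size_t p) { off = p; }
    void skip(size_t n) { ensure(off + n); off += n; }
    void write_u64(uint64_t v) {
        ensure(off + sizeof(v));
        std::memcpy(&buf[off], &v, sizeof(v));  // host byte order, for brevity
        off += sizeof(v);
    }
    void ensure(size_t n) { if (buf.size() < n) buf.resize(n); }
};

void write_block(ByteWriter& w, const std::vector<uint64_t>& payloads)
{
    size_t skeleton_pos = w.pos();
    size_t skeleton_size = 8 + 8 * payloads.size();  // count + one offset per payload
    w.skip(skeleton_size);                           // reserve, fill in later

    std::vector<uint64_t> offsets;                   // write payloads, remember offsets
    for (uint64_t p : payloads) {
        offsets.push_back(w.pos());
        w.write_u64(p);
    }
    size_t last_pos = w.pos();

    w.seek(skeleton_pos);                            // backpatch the skeleton
    w.write_u64(payloads.size());
    for (uint64_t o : offsets) w.write_u64(o);
    w.seek(last_pos);                                // leave the cursor at the end
}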
Example #3
TEST(DB, read_and_write) {
    Options opts;
    opts.dir = create_ram_directory();
    opts.comparator = new NumericComparator<uint64_t>();
    opts.inner_node_page_size = 4 * 1024;
    opts.inner_node_children_number = 64;
    opts.leaf_node_page_size = 4 * 1024;
    opts.cache_limit = 32 * 1024;

    DB *db = DB::open("test_db", opts);
    ASSERT_TRUE(db != NULL);

    for (uint64_t i = 0; i < 100000; i++) {
        char buf[16] = {0};
        snprintf(buf, sizeof(buf), "%llu", (unsigned long long) i);
        Slice key = Slice((char*)&i, sizeof(uint64_t));
        Slice value = Slice(buf, strlen(buf));
        ASSERT_TRUE(db->put(key, value)) << "put key " << i << " error";
        if (i % 10000 == 0) {
            cout << "write " << i << " records" << endl;
        }
    }

    db->flush();
    db->debug_print(cout);

    for (uint64_t i = 0; i < 100000; i++) {
        Slice key = Slice((char*)&i, sizeof(uint64_t));
        Slice value;
        ASSERT_TRUE(db->get(key, value)) << "get key " << i << " error";

        char buf[16] = {0};
        snprintf(buf, sizeof(buf), "%llu", (unsigned long long) i);
        ASSERT_EQ(value.size(), strlen(buf)) << "get key " << i << " value size unequal";
        ASSERT_TRUE(strncmp(buf, value.data(), value.size()) == 0) << "get key " << i << " value data unequal";
        value.destroy();

        if (i % 10000 == 0) {
            cout << "read " << i << " records" << endl;
        }
    }

    delete db;
    delete opts.dir;
    delete opts.comparator;
}
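
The test stores each key as the raw bytes of a uint64_t, so it needs NumericComparator<uint64_t> for keys to sort numerically rather than bytewise. The real comparator is not shown in this excerpt; the sketch below is only an assumption of what such a comparator has to do, not cascadb's implementation.

// Hypothetical sketch of a numeric comparator over raw uint64_t key bytes.
#include <cstddef>
#include <cstdint>
#include <cstring>

struct NumericComparatorSketch {
    // Reads both keys back as host-endian uint64_t and orders them numerically,
    // which is why raw integer bytes are acceptable keys in the test above.
    int compare(const char* a, size_t alen, const char* b, size_t blen) const {
        uint64_t x = 0, y = 0;
        std::memcpy(&x, a, alen < sizeof(x) ? alen : sizeof(x));
        std::memcpy(&y, b, blen < sizeof(y) ? blen : sizeof(y));
        if (x < y) return -1;
        if (x > y) return 1;
        return 0;
    }
};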
Example #4
bool LeafNode::load_all_buckets(BlockReader& reader)
{
    Slice buffer;
    if (tree_->compressor_) {
        // size the shared decompression buffer to fit the largest uncompressed bucket
        size_t buffer_length = 0;
        for (size_t i = 0; i < buckets_info_.size(); i++) {
            if (buffer_length < buckets_info_[i].uncompressed_length) {
                buffer_length = buckets_info_[i].uncompressed_length;
            }
        }
        buffer = Slice::alloc(buffer_length);
    }

    bool ret = true;
    for (size_t i = 0; i < buckets_info_.size(); i++) {
        reader.seek(buckets_info_[i].offset);

        RecordBucket *bucket = new RecordBucket();
        if (bucket == NULL) {
            ret = false;
            break;
        }

        if (!read_bucket(reader, buckets_info_[i].length, 
                         buckets_info_[i].uncompressed_length,
                         bucket, buffer)) {
            ret = false;
            delete bucket;
            break;
        }

        records_.set_bucket(i, bucket);
    }

    if (buffer.size()) {
        buffer.destroy();
    }

    status_ = kFullLoaded;
    return ret;
}
Example #5
0
bool LeafNode::cascade(MsgBuf *mb, InnerNode* parent)
{
    write_lock();

    if (status_ == kSkeletonLoaded) {
        load_all_buckets();
    }

    // lock message buffer from parent
    mb->write_lock();
    size_t oldcnt = mb->count();
    size_t oldsz = mb->size();

    Slice anchor = mb->begin()->key.clone();

    // merge message buffer into leaf
    RecordBuckets res(tree_->options_.leaf_node_bucket_size);

    MsgBuf::Iterator it = mb->begin();
    RecordBuckets::Iterator jt = records_.get_iterator();
    while (it != mb->end() && jt.valid()) {
        int n = tree_->options_.comparator->compare(it->key, jt.record().key);
        if (n < 0) {
            if (it->type == Put) {
                res.push_back(to_record(*it));
            } else {
                // a deletion for a key that does not exist here is simply dropped
                it->destroy();
            }
            it++;
        } else if (n > 0) {
            res.push_back(jt.record());
            jt.next();
        } else {
            if (it->type == Put) {
                res.push_back(to_record(*it));
            }
            // the old record is dropped either way
            it++;
            jt.record().key.destroy();
            jt.record().value.destroy();
            jt.next();
        }
    }
    for (; it != mb->end(); it++) {
        if (it->type == Put) {
            res.push_back(to_record(*it));
        }
    }
    while (jt.valid()) {
        res.push_back(jt.record());
        jt.next();
    }
    records_.swap(res);

    refresh_buckets_info();
    set_dirty(true);

    // clear the message buffer; count()/size() now return the post-clear
    // values, so the parent's totals drop by what was drained
    mb->clear();
    parent->msgcnt_ = parent->msgcnt_ + mb->count() - oldcnt;
    parent->msgbufsz_ = parent->msgbufsz_ + mb->size() - oldsz;

    // unlock message buffer
    mb->unlock();
    // crab walk
    parent->unlock();

    if (records_.size() == 0) {
        merge(anchor);
    } else if (records_.size() > 1 &&
               (records_.size() > tree_->options_.leaf_node_record_count ||
                size() > tree_->options_.leaf_node_page_size)) {
        split(anchor);
    } else {
        unlock();
    }
    
    anchor.destroy();
    return true;
}
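
At its core, cascade() is a two-way merge of a sorted message run into a sorted record run: Put inserts or overwrites, and a deletion drops the matching record (or is a no-op if the key is absent). The sketch below shows that merge with simplified stand-in types; the names are not cascadb's real ones.

// Self-contained sketch of the merge performed by cascade() above.
#include <cstdint>
#include <string>
#include <vector>

enum MsgType { Put, Del };

struct Msg    { MsgType type; uint64_t key; std::string value; };
struct Record { uint64_t key; std::string value; };

std::vector<Record> merge_messages(const std::vector<Msg>& msgs,
                                   const std::vector<Record>& recs)
{
    std::vector<Record> out;
    size_t i = 0, j = 0;
    while (i < msgs.size() && j < recs.size()) {
        if (msgs[i].key < recs[j].key) {
            // message for a key not present: Put inserts, a deletion is a no-op
            if (msgs[i].type == Put) out.push_back({msgs[i].key, msgs[i].value});
            i++;
        } else if (msgs[i].key > recs[j].key) {
            out.push_back(recs[j]);          // keep the untouched record
            j++;
        } else {
            // same key: the message wins; Put overwrites, a deletion drops the record
            if (msgs[i].type == Put) out.push_back({msgs[i].key, msgs[i].value});
            i++;
            j++;
        }
    }
    for (; i < msgs.size(); i++) {
        if (msgs[i].type == Put) out.push_back({msgs[i].key, msgs[i].value});
    }
    for (; j < recs.size(); j++) out.push_back(recs[j]);
    return out;
}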
Example #6
bool InnerNode::write_to(BlockWriter& writer, size_t& skeleton_size)
{
    // get length of skeleton and reserve space for skeleton
    size_t skeleton_offset = writer.pos();
    size_t skeleton_length = 1 + 4 + 8 + 4 + 4 + 4;
    for (size_t i = 0; i < pivots_.size(); i++) {
        skeleton_length += pivot_size(pivots_[i].key);
    }
    if (!writer.skip(skeleton_length)) return false;

    // prepare buffer if compression is enabled
    Slice buffer;
    if (tree_->compressor_) {
        // get buffer length to serialize msgbuf
        size_t buffer_length = first_msgbuf_->size();
        for (size_t i = 0; i < pivots_.size(); i++) {
            if (pivots_[i].msgbuf->size() > buffer_length)
                buffer_length = pivots_[i].msgbuf->size();
        }

        buffer = Slice::alloc(buffer_length);
    }

    // write the first msgbuf
    first_msgbuf_offset_ = writer.pos();
    if (!write_msgbuf(writer, first_msgbuf_, buffer)) {
        if (buffer.size()) {
            buffer.destroy();
        }
        return false;
    }
    first_msgbuf_length_ = writer.pos() - first_msgbuf_offset_;
    first_msgbuf_uncompressed_length_ = first_msgbuf_->size();

    // write the remaining msgbufs
    for (size_t i = 0; i < pivots_.size(); i++) {
        pivots_[i].offset = writer.pos();
        if (!write_msgbuf(writer, pivots_[i].msgbuf, buffer)) {
            if (buffer.size()) {
                buffer.destroy();
            }
            return false;
        }
        pivots_[i].length = writer.pos() - pivots_[i].offset;
        pivots_[i].uncompressed_length = pivots_[i].msgbuf->size();
    }

    if (buffer.size()) {
        buffer.destroy();
    }

    size_t last_offset = writer.pos();

    // seek to the head and write index
    writer.seek(skeleton_offset);
    if (!writer.writeBool(bottom_)) return false;
    if (!writer.writeUInt32(pivots_.size())) return false;

    if (!writer.writeUInt64(first_child_)) return false;
    if (!writer.writeUInt32(first_msgbuf_offset_)) return false;
    if (!writer.writeUInt32(first_msgbuf_length_)) return false;
    if (!writer.writeUInt32(first_msgbuf_uncompressed_length_)) return false;

    for (size_t i = 0; i < pivots_.size(); i++) {
        if (!writer.writeSlice(pivots_[i].key)) return false;
        if (!writer.writeUInt64(pivots_[i].child)) return false;
        if (!writer.writeUInt32(pivots_[i].offset)) return false;
        if (!writer.writeUInt32(pivots_[i].length)) return false;
        if (!writer.writeUInt32(pivots_[i].uncompressed_length)) return false;
    }

    writer.seek(last_offset);
    skeleton_size = skeleton_length;
    return true;
}
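
The skeleton written above is a fixed header (bottom_ flag, pivot count, first child, and the first msgbuf's offset/length/uncompressed length) plus one entry per pivot. The sketch below spells out that arithmetic; it assumes pivot_size() counts a 4-byte length prefix plus the key bytes for writeSlice(), which is not confirmed by this excerpt.

// Sketch of the inner-node skeleton size computed in write_to() above.
#include <cstddef>
#include <vector>

struct PivotKey { size_t key_size; };

size_t inner_skeleton_length(const std::vector<PivotKey>& pivots)
{
    // bottom_ flag (1) + pivot count (4) + first child nid (8)
    // + first msgbuf offset/length/uncompressed length (4 + 4 + 4)
    size_t length = 1 + 4 + 8 + 4 + 4 + 4;
    for (const PivotKey& p : pivots) {
        // per pivot: key (assumed 4-byte length prefix + key bytes)
        // + child nid (8) + msgbuf offset/length/uncompressed length (4 + 4 + 4)
        length += 4 + p.key_size + 8 + 4 + 4 + 4;
    }
    return length;
}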
Example #7
bool InnerNode::load_msgbuf(int idx)
{
    uint32_t offset;
    uint32_t length;
    uint32_t uncompressed_length;
    if (idx == 0) {
        offset = first_msgbuf_offset_;
        length = first_msgbuf_length_;
        uncompressed_length = first_msgbuf_uncompressed_length_;
    } else {
        offset = pivots_[idx-1].offset;
        length = pivots_[idx-1].length;
        uncompressed_length = pivots_[idx-1].uncompressed_length;
    }

    Block* block = tree_->layout_->read(nid_, offset, length);
    if (block == NULL) {
        LOG_ERROR("read msgbuf from layout error " << " nid " << nid_ << ", idx " << idx
            << ", offset " << offset << ", length " << length);
        return false;
    }

    BlockReader reader(block);

    Slice buffer;
    if (tree_->compressor_) {
        buffer = Slice::alloc(uncompressed_length);
    }

    MsgBuf *b = new MsgBuf(tree_->options_.comparator);
    assert(b);

    if (!read_msgbuf(reader, length, uncompressed_length, b, buffer)) {
        LOG_ERROR("read_msgbuf error " << " nid " << nid_ << ", idx " << idx);
        delete b;
        if (buffer.size()) {
            buffer.destroy();
        }
        tree_->layout_->destroy(block);
        return false;
    }

    if (buffer.size()) {
        buffer.destroy();
    }

    // lazy load: upgrade the read lock to a write lock
    // TODO: write an upgradable rwlock
    unlock();
    write_lock();

    MsgBuf **pb = (idx == 0) ? &first_msgbuf_ : &(pivots_[idx-1].msgbuf);
    if (*pb == NULL) {
        *pb = b;
        msgcnt_ += b->count();
        msgbufsz_ += b->size();
    } else {
        // another reader loaded the same msgbuf first
        delete b;
    }

    unlock();
    read_lock();

    tree_->layout_->destroy(block);
    return true;
}
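
The lock "upgrade" at the end of load_msgbuf() is really unlock-then-write-lock, so the function has to re-check whether another reader already installed the msgbuf before publishing its own copy. Below is a generic, self-contained sketch of that drop/re-acquire/re-check pattern using std::shared_mutex; it is not cascadb's lock implementation.

// Sketch of the "drop read lock, take write lock, re-check, publish" pattern.
#include <memory>
#include <mutex>
#include <shared_mutex>

template <typename T>
struct LazySlot {
    std::shared_mutex mu;      // callers hold a shared lock while reading
    std::unique_ptr<T> value;  // lazily loaded value, empty until first load

    // Called with the shared lock held; returns with the shared lock held.
    template <typename LoadFn>
    T* load(LoadFn load_fn) {
        std::unique_ptr<T> fresh = load_fn();  // expensive load done before the write lock

        mu.unlock_shared();                    // "upgrade": release shared, take exclusive
        {
            std::unique_lock<std::shared_mutex> w(mu);
            if (!value) {
                value = std::move(fresh);      // we won the race: publish our copy
            }
            // else: another reader loaded it first and `fresh` is discarded
        }
        mu.lock_shared();                      // downgrade back to a shared lock
        return value.get();
    }
};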
Example #8
bool LeafNode::load_bucket(size_t idx)
{
    assert(status_ != kFullLoaded);
    assert(idx < buckets_info_.size());
    assert(records_.bucket(idx) == NULL);

    uint32_t offset = buckets_info_[idx].offset;
    uint32_t length = buckets_info_[idx].length;
    uint32_t uncompressed_length = buckets_info_[idx].uncompressed_length;

    Block* block = tree_->layout_->read(nid_, offset, length);
    if (block == NULL) {
        LOG_ERROR("read bucket error " << " nid " << nid_ << ", idx " << idx
            << ", offset " << offset << ", length " << length);
        return false;
    }

    BlockReader reader(block);

    RecordBucket *bucket = new RecordBucket();
    if (bucket == NULL) {
        tree_->layout_->destroy(block);
        return false;
    }

    Slice buffer;
    if (tree_->compressor_) {
        buffer = Slice::alloc(uncompressed_length);
    }

    if (!read_bucket(reader, length, uncompressed_length, bucket, buffer)) {
        if (buffer.size()) {
            buffer.destroy();
        }
        delete bucket;
        tree_->layout_->destroy(block);
        return false;
    }

    if (buffer.size()) {
        buffer.destroy();
    }

    // this operation must be done while holding the read lock

    // lazy load: upgrade the read lock to a write lock
    // TODO: write an upgradable rwlock
    unlock();
    write_lock();
    if (records_.bucket(idx) == NULL) {
        records_.set_bucket(idx, bucket);
    } else {
        // it's possible another read thread loaded
        // the same block at the same time
        delete bucket;
    }
    unlock();
    read_lock();

    tree_->layout_->destroy(block);
    return true;
}