rc_t btree_impl::_ux_assure_fence_low_entry(btree_page_h &leaf)
{
    w_assert1(leaf.is_fixed());
    w_assert1(leaf.latch_mode() == LATCH_EX);
    if (!leaf.is_leaf()) {
        // locks are taken only for leaf-page entries. this case isn't an issue
        return RCOK;
    }
    w_keystr_t fence_low;
    leaf.copy_fence_low_key(fence_low);
    bool needs_to_create = false;
    if (leaf.nrecs() == 0) {
        if (leaf.compare_with_fence_high(fence_low) == 0) {
            // low==high happens only during page split. In that case, no one can have a lock
            // in the page being created. No need to assure the record.
            return RCOK;
        }
        needs_to_create = true;
    } else {
        w_keystr_t first_key;
        leaf.get_key(0, first_key);
        w_assert1(fence_low.compare(first_key) <= 0); // can't be fence_low>first_key
        if (fence_low.compare(first_key) < 0) {
            // fence-low doesn't exist as an entry!
            needs_to_create = true;
        }
    }
    if (needs_to_create) {
        W_DO(_sx_reserve_ghost(leaf, fence_low, 0)); // no data is needed
    }
    return RCOK;
}
w_rc_t bf_tree_m::_grab_free_block(bf_idx& ret, bool evict)
{
    ret = 0;
    while (true) {
        // Once the buffer pool becomes full, taking _freelist_lock every time is
        // too costly, so we check _freelist_len without the lock first.
        //   false positive: fine; we do the real check under the lock below.
        //   false negative: fine; we will eventually get some free block anyway.
        if (_freelist_len > 0) {
            CRITICAL_SECTION(cs, &_freelist_lock);
            if (_freelist_len > 0) { // here, we do the real check
                bf_idx idx = FREELIST_HEAD;
                DBG5(<< "Grabbing idx " << idx);
                w_assert1(_is_valid_idx(idx));
                w_assert1(!get_cb(idx)._used);
                ret = idx;

                --_freelist_len;
                if (_freelist_len == 0) {
                    FREELIST_HEAD = 0;
                } else {
                    FREELIST_HEAD = _freelist[idx];
                    w_assert1(FREELIST_HEAD > 0 && FREELIST_HEAD < _block_cnt);
                }
                DBG5(<< "New head " << FREELIST_HEAD);
                w_assert1(ret != FREELIST_HEAD);
                return RCOK;
            }
        }
        // exit the scope to do the following out of the critical section
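// _grab_free_block() uses a double-checked pattern: a racy read of _freelist_len
// filters out the common "no free blocks" case before paying for the lock, and
// the length is re-checked once the critical section is entered. A minimal,
// self-contained sketch of the same idea (hypothetical FreeList type, plain
// std::mutex instead of CRITICAL_SECTION):
#include <atomic>
#include <mutex>
#include <vector>

struct FreeList {
    std::mutex lock;
    std::vector<int> free_blocks;        // protected by 'lock'
    std::atomic<size_t> approx_len{0};   // racy hint, maintained under 'lock'

    // Returns true and sets 'out' if a block was grabbed.
    bool try_grab(int& out) {
        if (approx_len.load(std::memory_order_relaxed) == 0) {
            return false;                        // cheap, possibly stale check
        }
        std::lock_guard<std::mutex> guard(lock); // now do the real check
        if (free_blocks.empty()) return false;
        out = free_blocks.back();
        free_blocks.pop_back();
        approx_len.store(free_blocks.size(), std::memory_order_relaxed);
        return true;
    }
};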
rc_t btree_impl::_sx_adopt_foster_all_core(
    btree_page_h &parent, bool is_root, bool recursive)
{
    // TODO this should use the improved tree-walk-through
    // See jira ticket:60 "Tree walk-through without more than 2 pages latched" (originally trac ticket:62)
    w_assert1(xct()->is_sys_xct());
    w_assert1(parent.is_fixed());
    w_assert1(parent.latch_mode() == LATCH_EX);
    if (parent.is_node()) {
        w_assert1(parent.pid0());
        W_DO(_sx_adopt_foster_sweep(parent));
        if (recursive) {
            // also adopt at all children recursively
            for (int i = -1; i < parent.nrecs(); ++i) {
                btree_page_h child;
                PageID shpid_opaqueptr = i == -1
                    ? parent.get_foster_opaqueptr() : parent.child_opaqueptr(i);
                W_DO(child.fix_nonroot(parent, shpid_opaqueptr, LATCH_EX));
                W_DO(_sx_adopt_foster_all_core(child, false, true));
            }
        }
    }
    // after all adopts, if this parent is the root and has foster,
    // let's grow the tree
    if (is_root && parent.get_foster()) {
        W_DO(_sx_grow_tree(parent));
        W_DO(_sx_adopt_foster_sweep(parent));
    }
    w_assert3(parent.is_consistent(true, true));
    return RCOK;
}
void sthread_t::align_bufsize(size_t size, long W_IFDEBUG1(system_page_size),
                              long max_page_size)
{
    // ***********************************************************
    //
    //  PROPERLY ALIGN ARGUMENTS TO MMAP
    //
    // The max page size should be a multiple of the system page size -
    // that should be a given.

    w_assert1(alignon(max_page_size, system_page_size) == max_page_size);

    //
    // The size requested must be multiples of
    // the page size to be used as well as of the system page size,
    // and while it doesn't have to be a multiple of the SM page
    // size, it must at least accommodate the size requested, which
    // is a multiple of the SM page size.
    // ***********************************************************
    _disk_buffer_size = alignon(size, max_page_size);
    w_assert1(_disk_buffer_size >= size); // goes without saying

    // should now be aligned on both page sizes
    w_assert1(size_t(alignon(_disk_buffer_size, max_page_size))
              == _disk_buffer_size);
    w_assert1(size_t(alignon(_disk_buffer_size, system_page_size))
              == _disk_buffer_size);
}
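// alignon(size, unit) above rounds a size up to the next multiple of 'unit'.
// A small standalone sketch of that arithmetic (hypothetical round_up_to
// helper, valid for any positive unit):
#include <cassert>
#include <cstddef>

inline size_t round_up_to(size_t size, size_t unit) {
    return ((size + unit - 1) / unit) * unit;
}

static void round_up_to_example() {
    assert(round_up_to(10000, 4096) == 12288); // next multiple of a 4 KB page size
    assert(round_up_to(8192, 4096) == 8192);   // already aligned: unchanged
}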
// allows reuse rather than free/malloc of the structure
xct_lock_info_t* xct_lock_info_t::reset_for_reuse()
{
    // make sure the lock lists are empty
    w_assert1(_head == NULL);
    w_assert1(_tail == NULL);
    new (this) xct_lock_info_t;
    return this;
}
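// reset_for_reuse() re-runs the constructor on an existing object with
// placement new instead of freeing and reallocating it. A minimal sketch of
// the idiom with a hypothetical struct (not part of the storage manager):
#include <new>

struct Counter {
    int value;
    Counter() : value(0) {}

    // Re-initialize in place; valid only because the current contents need no
    // destructor cleanup (the caller checks the object is "empty" first).
    Counter* reset_for_reuse() {
        new (this) Counter;
        return this;
    }
};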
void btree_page_data::remove_items(
    const int            item_count,  // In: Number of records to remove
    const w_keystr_t&    high)        // In: high fence after record removal
{
    // Use this function with caution.
    // A special helper function to remove the 'item_count' largest items from the storage.
    // It is only used by the full-logging page-rebalance restart operation to recover
    // the source page after a system crash: the caller resets the fence keys on the
    // source page, which eliminates some of the records, and this function removes the
    // largest 'item_count' items from the page because they belong to the destination
    // page after the rebalance.
    // After the removal the item count changes, but the ghost count does not.

    w_assert1(btree_level >= 1);
    w_assert1(nitems > item_count); // Must have at least one record, which is the fence-key record
    w_assert3(_items_are_consistent());

    if ((0 == item_count) || (1 == nitems)) // If 1 == nitems, we only have a fence key record
        return;

    DBGOUT3( << "btree_page_data::reset_item_count - before deletion item count: " << nitems
             << ", new high fence key: " << high);

    int remaining = item_count;
    char* high_key_p = (char *)high.buffer_as_keystr();
    size_t high_key_length = (size_t)high.get_length_as_keystr();
    while (0 < remaining) {
        w_assert1(1 < nitems);
        // Find the records with key >= new high fence key and delete them
        int item_index = 1;  // Start with index 1 since 0 is for the fence key record
        uint16_t* key_length;
        size_t item_len;
        int cmp;
        const int data_offset = sizeof(uint16_t); // To skip over the portion which contains the size of variable data
        for (int i = item_index; i < nitems; ++i) {
            key_length = (uint16_t*)item_data(i);
            item_len = *key_length++;

            cmp = ::memcmp(high_key_p, item_data(i)+data_offset,
                           (high_key_length <= item_len) ? high_key_length : item_len);
            if ((0 > cmp) || ((0 == cmp) && (high_key_length <= item_len))) {
                // The item is larger than the new high fence key, or the same as the
                // high fence key (the high fence is a ghost).
                DBGOUT3( << "btree_page_data::reset_item_count - delete record index: " << i);
                // Delete the item, which changes nitems but not nghosts; therefore break
                // out of the loop and start again if we have more items to remove.
                delete_item(i);
                break;
            }
        }
        --remaining;
    }
}
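// The removal loop above compares keys by memcmp over the shorter length and
// then breaks ties on length: an item sorts at or above the high fence when its
// prefix compares greater, or when the prefixes match and the fence is no longer
// than the key. A standalone sketch of that comparison (hypothetical helper):
#include <cstring>
#include <cstddef>

// Returns true if (key, key_len) sorts at or above (fence, fence_len).
inline bool key_at_or_above_fence(const char* fence, size_t fence_len,
                                  const char* key, size_t key_len) {
    size_t n = fence_len <= key_len ? fence_len : key_len;
    int cmp = std::memcmp(fence, key, n);
    return (cmp < 0) || (cmp == 0 && fence_len <= key_len);
}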
void vol_t::build_caches(bool truncate)
{
    _stnode_cache = new stnode_cache_t(truncate);
    w_assert1(_stnode_cache);
    _stnode_cache->dump(cerr);

    _alloc_cache = new alloc_cache_t(*_stnode_cache, truncate);
    w_assert1(_alloc_cache);
}
w_rc_t fixable_page_h::refix_direct(bf_idx idx, latch_mode_t mode, bool conditional)
{
    w_assert1(idx != 0);
    w_assert1(mode != LATCH_NL);

    unfix();
    W_DO(smlevel_0::bf->refix_direct(_pp, idx, mode, conditional));
    _bufferpool_managed = true;
    _mode = mode;
    return RCOK;
}
rc_t btree_impl::_ux_norec_alloc_core(btree_page_h &page, PageID &new_page_id)
{
    // This is called only in REDO-only SSX, so no compensation logging. Just apply.
    w_assert1(xct()->is_single_log_sys_xct());
    w_assert1(page.latch_mode() == LATCH_EX);

    W_DO(smlevel_0::vol->alloc_a_page(new_page_id));
    btree_page_h new_page;
    w_rc_t rc;
    rc = new_page.fix_nonroot(page, new_page_id, LATCH_EX, false, true);

    if (rc.is_error()) {
        // if failed for any reason, we release the allocated page.
        W_DO(smlevel_0::vol->deallocate_page(new_page_id));
        return rc;
    }

    // The new page has an empty key range; parent's high to high.
    w_keystr_t fence, chain_high;
    page.copy_fence_high_key(fence);
    bool was_right_most = (page.get_chain_fence_high_length() == 0);
    page.copy_chain_fence_high_key(chain_high);
    if (was_right_most) {
        // this means there was no chain or the page was the right-most of it.
        // (so its high=high of chain)
        // upon the first foster split, we start setting the chain-high.
        page.copy_fence_high_key(chain_high);
    }

#if W_DEBUG_LEVEL >= 3
    lsn_t old_lsn = page.get_page_lsn();
#endif //W_DEBUG_LEVEL

    W_DO(log_btree_norec_alloc(page, new_page, new_page_id, fence, chain_high));
    DBGOUT3(<< "btree_impl::_ux_norec_alloc_core, fence=" << fence
            << ", old-LSN=" << old_lsn << ", new-LSN=" << page.get_page_lsn()
            << ", PID=" << new_page_id);

    // initialize as an empty child:
    new_page.format_steal(page.get_page_lsn(), new_page_id, page.store(), page.root(),
                          page.level(), 0, lsn_t::null,
                          page.get_foster_opaqueptr(), page.get_foster_emlsn(),
                          fence, fence, chain_high, false);
    page.accept_empty_child(page.get_page_lsn(), new_page_id, false /*not from redo*/);

    // in this operation, the log contains everything we need to recover without any
    // write-order-dependency. So, no registration for WOD.
    w_assert3(new_page.is_consistent(true, true));
    w_assert1(new_page.is_fixed());
    w_assert1(new_page.latch_mode() == LATCH_EX);
    w_assert3(page.is_consistent(true, true));
    w_assert1(page.is_fixed());
    return RCOK;
}
rc_t lg_tag_indirect_h::append(uint4_t num_pages, const lpid_t new_pages[])
{
    FUNC(lg_tag_indirect_h::append);
    const uint max_pages = 64;
    shpid_t page_list[max_pages];
    w_assert9(num_pages <= max_pages);
    for (uint i = 0; i < num_pages; i++) page_list[i] = new_pages[i].page;

    if (_iref.indirect_root == 0) {
        // allocate a root indirect page, near last page in store
        lpid_t root_pid;
        W_DO(smlevel_0::io->alloc_a_page(stid(),
                                         lpid_t::eof, // near hint
                                         root_pid,    // output pid
                                         false,       // not may_realloc
                                         EX,          // lock on the allocated pages
                                         false        // do not search file for free pages
                                         ));
        _iref.indirect_root = root_pid.page;
        lgindex_p root;
        W_DO( root.fix(root_pid, LATCH_EX, root.t_virgin) );
        // perform fake read of the new page
    }

    // calculate the number of pages to append to last index page
    uint space_on_last = lgindex_p::max_pids - _pages_on_last_indirect();
    uint4_t pages_on_last = MIN(num_pages, space_on_last);

    // number of pages to place on a new indirect_page
    uint4_t pages_on_new = num_pages - pages_on_last;

    // append pages to the last index page
    lpid_t last_index_pid(stid(), _last_indirect());
    lgindex_p last_index;
    W_DO( last_index.fix(last_index_pid, LATCH_EX) );
    w_assert1(last_index.is_fixed());
    W_DO(last_index.append(pages_on_last, page_list));

    if (pages_on_new) {
        lpid_t new_pid;
        W_DO(_add_new_indirect(new_pid));
        lgindex_p last_index2;
        W_DO( last_index2.fix(new_pid, LATCH_EX) );
        w_assert1(last_index2.is_fixed());
        W_DO(last_index2.append(pages_on_new, page_list+pages_on_last));
    }
    return RCOK;
}
void vol_t::build_caches(bool truncate, chkpt_t* chkpt_info)
{
    _stnode_cache = new stnode_cache_t(truncate);
    w_assert1(_stnode_cache);
    _stnode_cache->dump(cerr);

    _alloc_cache = new alloc_cache_t(*_stnode_cache, truncate, _cluster_stores);
    w_assert1(_alloc_cache);

    if (chkpt_info && !chkpt_info->bkp_path.empty()) {
        sx_add_backup(chkpt_info->bkp_path, chkpt_info->bkp_lsn, true);
        ERROUT(<< "Added backup: " << chkpt_info->bkp_path);
    }
}
void bt_cursor_t::_set_current_page(btree_page_h &page)
{
    if (_pid != 0) {
        _release_current_page();
    }
    w_assert1(_pid == 0);
    w_assert1(_pid_bfidx.idx() == 0);
    _pid = page.pid();

    // pin this page for subsequent refix()
    _pid_bfidx.set(page.pin_for_refix());
    _lsn = page.get_page_lsn();

#ifndef USE_ATOMIC_COMMIT
    w_assert1(_lsn.valid()); // must have a valid LSN for _check_page_update to work
#endif
}
vol_t::~vol_t()
{
    if (_alloc_cache) {
        delete _alloc_cache;
        _alloc_cache = nullptr;
    }
    if (_stnode_cache) {
        delete _stnode_cache;
        _stnode_cache = nullptr;
    }

    w_assert1(_fd == -1);
    w_assert1(_backup_fd == -1);
}
PoorMansOldestLsnTracker::PoorMansOldestLsnTracker(uint32_t buckets)
{
    // same logic as in lock_core(). yes, stupid prime hashing. but see the name of this class.
    int b = 0; // count bits shifted
    for (_buckets = 1; _buckets < buckets; _buckets <<= 1) {
        b++;
    }
    w_assert1(b >= 6 && b <= 23);
    b -= 6;
    _buckets = primes[b];

    _low_water_marks = new lsndata_t[_buckets];
    w_assert1(_low_water_marks);
    ::memset(_low_water_marks, 0, sizeof(lsndata_t) * _buckets);
}
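// The constructor above rounds the requested bucket count up to a power of two
// only to recover the exponent b, which then (shifted by 6) indexes a table of
// primes. A small self-contained sketch of that "count the shifts" step
// (hypothetical helper; the primes table itself is assumed):
#include <cstdint>
#include <cassert>

inline int ceil_log2(uint32_t n) {
    int bits = 0;
    for (uint32_t v = 1; v < n; v <<= 1) {
        ++bits; // number of doublings needed to reach or exceed n
    }
    return bits;
}

static void ceil_log2_example() {
    assert(ceil_log2(1) == 0);
    assert(ceil_log2(1000) == 10);     // 2^10 = 1024 >= 1000
    assert(ceil_log2(1u << 16) == 16);
}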
CArraySlot* ConsolidationArray::join_slot(int32_t size, carray_status_t &old_count)
{
    w_assert1(size > 0);
    carray_slotid_t idx = (carray_slotid_t) ::pthread_self();
    while (true) {
        // probe phase
        CArraySlot* info = nullptr;
        while (true) {
            idx = (idx + 1) % _active_slot_count;
            info = _active_slots[idx];
            old_count = info->vthis()->count;
            if (old_count >= SLOT_AVAILABLE) {
                // this slot is available for join!
                break;
            }
        }

        // join phase
        while (true) {
            // set to 'available' and add our size to the slot
            carray_status_t new_count = join_carray_status(old_count, size);
            carray_status_t old_count_cas_tmp = old_count;
            if (lintel::unsafe::atomic_compare_exchange_strong<carray_status_t>(
                    &info->count, &old_count_cas_tmp, new_count)) {
                // CAS succeeded. All done.
                // The assertion below doesn't necessarily hold because of the
                // ABA problem -- someone else might have grabbed the same slot
                // and gone through a whole join-release cycle, so that info is
                // now on a different array position. In general, this second
                // while loop must not use idx at all.
                // w_assert1(old_count != 0 || _active_slots[idx] == info);
                return info;
            } else {
                // the status has been changed.
                w_assert1(old_count != old_count_cas_tmp);
                old_count = old_count_cas_tmp;
                if (old_count < SLOT_AVAILABLE) {
                    // it's no longer available. retry from probe
                    break;
                } else {
                    // someone else has joined, but still able to join.
                    continue;
                }
            }
        }
    }
}
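// join_slot() retries a compare-and-swap in a loop, reusing the value that a
// failed CAS wrote back into its "expected" argument instead of re-reading the
// slot. A minimal sketch of that retry pattern with std::atomic (hypothetical
// join_value() combining function, not the storage manager's types):
#include <atomic>
#include <cstdint>

using status_t = int64_t;

// Combine the current status with the size this thread wants to add.
static status_t join_value(status_t current, int32_t size) {
    return current + size;
}

static status_t join(std::atomic<status_t>& slot_status, int32_t size) {
    status_t expected = slot_status.load();
    while (true) {
        status_t desired = join_value(expected, size);
        // On failure, compare_exchange_strong stores the observed value into
        // 'expected', so the next iteration retries against fresh state.
        if (slot_status.compare_exchange_strong(expected, desired)) {
            return desired;
        }
    }
}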
void sthread_t::align_for_sm(size_t W_IFDEBUG1(requested_size))
{
    char * _disk_buffer2 = (char *)alignon(_disk_buffer, SM_PAGESIZE);
    if (_disk_buffer2 != _disk_buffer) {
        // We made the size big enough that we can align it here
        _disk_buffer_disalignment = (_disk_buffer2 - _disk_buffer);
        w_assert1(_disk_buffer_disalignment < SM_PAGESIZE);
        w_assert1(_disk_buffer_size - _disk_buffer_disalignment >= requested_size);

        _disk_buffer = _disk_buffer2;
    }
}
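// align_for_sm() shifts the start of an over-allocated buffer forward to the
// next page boundary and remembers the offset so the original pointer can
// still be used for freeing. A self-contained sketch of that pointer-alignment
// step (hypothetical names; assumes page_size is a power of two):
#include <cstdint>
#include <cstddef>

struct AlignedView {
    char*  aligned;       // first page-aligned byte inside the buffer
    size_t disalignment;  // bytes skipped at the front
};

inline AlignedView align_buffer(char* raw, size_t page_size) {
    uintptr_t p = reinterpret_cast<uintptr_t>(raw);
    uintptr_t aligned = (p + page_size - 1) & ~(uintptr_t)(page_size - 1);
    return AlignedView{ reinterpret_cast<char*>(aligned),
                        static_cast<size_t>(aligned - p) };
}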
w_rc_t latch_t::latch_acquire(latch_mode_t mode, sthread_t::timeout_in_ms timeout)
{
    w_assert1(mode != LATCH_NL);
    holder_search me(this);
    return _acquire(mode, timeout, me.value());
}
w_rc_t fixable_page_h::fix_nonroot(const fixable_page_h &parent, PageID shpid,
                                   latch_mode_t mode, bool conditional, bool virgin_page)
{
    w_assert1(parent.is_fixed());
    w_assert1(mode != LATCH_NL);

    unfix();
    W_DO(smlevel_0::bf->fix_nonroot(_pp, parent._pp, shpid, mode, conditional, virgin_page));
    w_assert1(bf_tree_m::is_swizzled_pointer(shpid)
              || smlevel_0::bf->get_cb(_pp)->_pid == shpid);

    _bufferpool_managed = true;
    _mode = mode;
    return RCOK;
}
bool sthread_t::isStackFrameOK(size_t size)
{
    bool ok;
    void *stack_top  = &ok;
    void *_stack_top = &ok - size;

    w_assert1(this->_danger < this->_start_frame);
    void *absolute_bottom = (void *)((char *)_start_frame - _stack_size);

    if (stack_top < _danger) {
        if (stack_top <= absolute_bottom) {
            fprintf(stderr,
                // In order of values:
                "STACK OVERFLOW frame (offset -%ld) %p bottom %p danger %p top %p stack_size %ld \n",
                // cast so it works for -m32 and -m64
                (long int) size, _stack_top, absolute_bottom, _danger, _start_frame,
                (long int) _stack_size);
        } else {
            fprintf(stderr,
                // In order of values:
                "STACK IN GUARD AREA bottom %p frame (offset -%ld) %p danger %p top %p stack_size %ld \n",
                // cast so it works for -m32 and -m64
                absolute_bottom, (long int) size, _stack_top, _danger, _start_frame,
                (long int) _stack_size);
        }
        return false;
    }
    return true;
}
/*********************************************************************
 *
 *  logrec_t::fill(pid, stid, tag, len)
 *
 *  Fill the "pid", "tag", and "length" fields of the log record.
 *
 *********************************************************************/
void
logrec_t::fill(PageID p, StoreID store, uint16_t tag, smsize_t l)
{
    w_assert9(w_base_t::is_aligned(_data));

    /* adjust _cat */
    xct_t *x = xct();
    if (x && (x->rolling_back() || x->state() == smlevel_0::xct_aborting)) {
        header._cat |= t_rollback;
    }
    set_pid(0);
    if (!is_single_sys_xct()) { // prv does not exist in single-log system transaction
        set_xid_prev(lsn_t::null);
    }
    header._page_tag = tag;
    header._pid = p;
    header._stid = store;
    char *dat = is_single_sys_xct() ? data_ssx() : data();
    if (l != ALIGN_BYTE(l)) {
        // zero out extra space to keep purify happy
        memset(dat+l, 0, ALIGN_BYTE(l)-l);
    }
    unsigned int tmp = ALIGN_BYTE(l)
        + (is_single_sys_xct() ? hdr_single_sys_xct_sz : hdr_non_ssx_sz)
        + sizeof(lsn_t);
    tmp = (tmp + 7) & unsigned(-8); // force 8-byte alignment
    w_assert1(tmp <= sizeof(*this));
    header._len = tmp;
    if (type() != t_skip) {
        DBG( << "Creat log rec: " << *this
             << " size: " << header._len
             << " xid_prevlsn: " << (is_single_sys_xct() ? lsn_t::null : xid_prev()) );
    }
}
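// The length computation above forces 8-byte alignment with the classic
// "(x + 7) & ~7" trick; & unsigned(-8) clears the low three bits the same way.
// A tiny worked example of that arithmetic, standalone:
#include <cassert>

static void align8_example() {
    auto align8 = [](unsigned int x) { return (x + 7u) & ~7u; };
    assert(align8(1)  == 8);
    assert(align8(8)  == 8);
    assert(align8(29) == 32); // 29 + 7 = 36; clearing the low 3 bits gives 32
}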
void rep_row_t::set(const unsigned nsz)
{
    if ((!_dest) || (_bufsz < nsz)) {
        char* tmp = _dest;

        // Using the trash stack
        assert (_pts);
        //_dest = new(*_pts) char(nsz);
        w_assert1(nsz <= _pts->nbytes());
        _dest = (char*)_pts->acquire();
        assert (_dest); // Failed to allocate such a big buffer

        if (tmp) {
            // delete [] tmp;
            _pts->destroy(tmp);
            tmp = nullptr;
        }
        _bufsz = _pts->nbytes();
    }

    // in any case, clean up the buffer
    memset (_dest, 0, nsz);
}
bool htab_remove(bf_core_m *core, bfpid_t const &pid, bf_core_m::Tstats &s)
{
    bool ret(false);
    bfcb_t *cb = core->_htab->lookup(pid);

    if (cb) {
        // find the bucket so we can acquire the lock,
        // necessary for removal.
        // also ensure pin count is zero.
        int idx = core->_htab->hash(cb->hash_func(), pid);
        bf_core_m::htab::bucket &b = core->_htab->_table[idx];
        cb->zero_pin_cnt();

        CRITICAL_SECTION(cs, b._lock);
        bool bull = core->_htab->remove(cb);
        w_assert0(bull);
        w_assert1(cb->pin_cnt() == 0);
        ret = bull; // record whether the removal actually happened
    }

    // It's possible that it couldn't remove the item
    // because the lock is not held or the pin count is > 0
    if (ret) {
        w_assert2(cb->hash_func() == bf_core_m::htab::HASH_COUNT);
    }

    s = me()->TL_stats().bfht;
    return ret;
}
void sortorder::Ibyteorder(int permutation[4])
{
    /* The following magic constant has the representation
     * 0x3f404142 on a BIGLONG machine.
     */
    int magic = 0x3f404142;
    u_char *p = (u_char *)&magic;
    int i;
    for (i = 0; i < 4; i++)
        permutation[i] = p[i] - 0x3f;

#ifdef BIGLONG
    /* verify that the BIGLONG assertion is correct */
    for (i = 0; i < 4; i++)
        w_assert1(permutation[i] == i);
    w_assert3(w_base_t::is_big_endian());
#else
#if W_DEBUG_LEVEL > 2
    // Make sure lexify agrees with w_base_t
    if (permutation[1] == 1) {
        w_assert3(w_base_t::is_big_endian());
    } else {
        w_assert3(w_base_t::is_little_endian());
    }
#endif
#endif
}
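// Ibyteorder() detects the host byte order by reading a known 4-byte constant
// through a byte pointer. A minimal standalone sketch of the same probe
// (hypothetical helper, not part of the sortorder class):
#include <cstdint>
#include <cstring>

inline bool host_is_big_endian() {
    uint32_t magic = 0x01020304;
    unsigned char bytes[4];
    std::memcpy(bytes, &magic, 4);
    // On a big-endian host the most significant byte comes first in memory.
    return bytes[0] == 0x01;
}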
void bt_cursor_t::_release_current_page()
{
    if (_pid != 0) {
        w_assert1(_pid_bfidx.idx() != 0);
        _pid_bfidx.release();
        _pid = 0;
    }
}
void mcs_rwlock::downgrade()
{
    membar_exit();  // this is, for all intents and purposes, a release
    w_assert1(*&_holders == WRITER);
    *&_holders = READER;
    membar_enter(); // but it's also an acquire
}
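// downgrade() publishes the change from WRITER to READER between two memory
// barriers. A rough sketch of the same publication step with std::atomic
// (hypothetical lock skeleton; only the release/acquire pairing is illustrated,
// not the MCS queue logic):
#include <atomic>

struct TinyRWState {
    static constexpr unsigned WRITER = 1u << 31;
    static constexpr unsigned READER = 1;
    std::atomic<unsigned> holders{WRITER};

    void downgrade() {
        // Release ordering makes the writer's updates visible to any thread
        // that later observes the READER value with an acquire load.
        holders.store(READER, std::memory_order_release);
    }

    bool observe_reader() const {
        return holders.load(std::memory_order_acquire) == READER;
    }
};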
int main(int argc, char **argv)
{
    int i;
    int threads;

    if (parse_args(argc, argv) == -1)
        return 1;

    if (mix_it_up)
        threads = NumFloatThreads + NumIntThreads;
    else
        threads = NumFloatThreads > NumIntThreads ? NumFloatThreads : NumIntThreads;

    ack = new int[threads];
    if (!ack)
        W_FATAL(fcOUTOFMEMORY);
    worker = new sthread_t *[threads];
    if (!worker)
        W_FATAL(fcOUTOFMEMORY);

    for (i = 0; i < NumIntThreads; ++i) {
        ack[i] = 0;
        worker[i] = new int_thread_t(i);
        w_assert1(worker[i]);
        W_COERCE( worker[i]->fork() );
    }
    if (!mix_it_up)
        harvest(NumIntThreads);

    int base = mix_it_up ? NumIntThreads : 0;

    for (i = base; i < base + NumFloatThreads; ++i) {
        ack[i] = 0;
        worker[i] = new float_thread_t(i);
        w_assert1(worker[i]);
        W_COERCE( worker[i]->fork() );
    }
    harvest(mix_it_up ? threads : NumFloatThreads);

    delete [] worker;
    delete [] ack;

    return 0;
}
void fixable_page_h::update_page_lsn(const lsn_t & lsn) const
{
    if (_bufferpool_managed) {
        w_assert1(_pp);
        smlevel_0::bf->set_page_lsn(_pp, lsn);
    }
}
vol_t::~vol_t()
{
    if (_alloc_cache) {
        delete _alloc_cache;
        _alloc_cache = NULL;
    }
    if (_stnode_cache) {
        delete _stnode_cache;
        _stnode_cache = NULL;
    }

    w_assert1(_unix_fd == -1);
    w_assert1(_backup_fd == -1);

    if (_restore_mgr) {
        delete _restore_mgr;
    }
}
void fixable_page_h::unset_to_be_deleted()
{
    w_assert1(is_latched());
    if ((_pp->page_flags & t_to_be_deleted) != 0) {
        _pp->page_flags ^= t_to_be_deleted;
        // we don't need set_dirty() as it's always dirty if this is ever called
        // (UNDOing this means the page wasn't deleted yet by bufferpool, so it's dirty)
    }
}
void fixable_page_h::fix_nonbufferpool_page(generic_page* s)
{
    w_assert1(s != NULL);
    unfix();
    _pp                 = s;
    _bufferpool_managed = false;
    _mode               = LATCH_EX;
}