int gu_log (gu_log_severity_t severity,
            const char*       file,
            const char*       function,
            const int         line,
            ...)
{
    va_list ap;
    int   max_string = 2048;
    char  string[max_string];
    /** @note: this can cause stack overflow
     *  in kernel mode (both Linux and Windows). */
    char* str = string;
    int   len;

    if (gu_log_self_tstamp) {
        len = log_tstamp (str, max_string);
        str += len;
        max_string -= len;
    }

    if (gu_likely(max_string > 0)) {
        const char* log_level_str =
            gu_log_cb_default == gu_log_cb ? gu_log_level_str[severity] : "";

        /* provide file:func():line info only if debug logging is on */
        if (gu_likely(!gu_log_debug && severity > GU_LOG_ERROR)) {
            len = snprintf (str, max_string, "%s", log_level_str);
        }
        else {
            len = snprintf (str, max_string, "%s%s:%s():%d: ",
                            log_level_str, file, function, line);
        }

        str += len;
        max_string -= len;

        va_start (ap, line);
        {
            const char* format = va_arg (ap, const char*);

            if (gu_likely(max_string > 0 && NULL != format)) {
                vsnprintf (str, max_string, format, ap);
            }
        }
        va_end (ap);
    }

    /* actual logging */
    gu_log_cb (severity, string);

    return 0;
}
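/* Illustrative sketch (not part of the library): the same pattern gu_log()
 * uses, reduced to a standalone function. A fixed prefix is built with
 * snprintf(), then the user format string is pulled out of the va_list and the
 * message is appended with vsnprintf(). All names below are hypothetical. */
#include <stdarg.h>
#include <stdio.h>

static void sketch_log (const char* file, int line, ...)
{
    char  buf[256];
    char* p    = buf;
    int   left = (int)sizeof(buf);

    int len = snprintf (p, left, "%s:%d: ", file, line);
    if (len > 0 && len < left) { p += len; left -= len; }

    va_list ap;
    va_start (ap, line);
    const char* fmt = va_arg (ap, const char*); // format is the 1st vararg
    if (fmt) vsnprintf (p, left, fmt, ap);
    va_end (ap);

    fputs (buf, stderr);
    fputc ('\n', stderr);
}

// usage: sketch_log (__FILE__, __LINE__, "connected to %s", "node1");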
static GCS_BACKEND_RECV_FN(dummy_recv)
{
    long     ret  = 0;
    dummy_t* conn = backend->conn;

    msg->sender_idx = GCS_SENDER_NONE;
    msg->type       = GCS_MSG_ERROR;

    assert (conn);

    /* skip it if we already have popped a message from the queue
     * in the previous call */
    if (gu_likely(DUMMY_CLOSED <= conn->state)) {

        int err;
        dummy_msg_t** ptr = gu_fifo_get_head (conn->gc_q, &err);

        if (gu_likely(ptr != NULL)) {

            dummy_msg_t* dmsg = *ptr;

            assert (NULL != dmsg);

            msg->type       = dmsg->type;
            msg->sender_idx = dmsg->sender_idx;
            ret             = dmsg->len;
            msg->size       = ret;

            if (gu_likely(dmsg->len <= msg->buf_len)) {
                gu_fifo_pop_head (conn->gc_q);
                memcpy (msg->buf, dmsg->buf, dmsg->len);
                dummy_msg_destroy (dmsg);
            }
            else {
                // supplied recv buffer too short, leave the message in queue
                memcpy (msg->buf, dmsg->buf, msg->buf_len);
                gu_fifo_release (conn->gc_q);
            }
        }
        else {
            ret = -EBADFD; // closing
            gu_debug ("Returning %ld: %s", ret, strerror(-ret));
        }
    }
    else {
        ret = -EBADFD;
    }

    return ret;
}
void* malloc (ssize_t size)
{
    if (size > max_size_ || have_free_space(size) == false) return 0;

    assert (size_ + size <= max_size_);

    BufferHeader* bh (BH_cast (::malloc (size)));

    if (gu_likely(0 != bh)) {
        allocd_.insert(bh);

        bh->size    = size;
        bh->seqno_g = SEQNO_NONE;
        bh->seqno_d = SEQNO_ILL;
        bh->flags   = 0;
        bh->store   = BUFFER_IN_MEM;
        bh->ctx     = this;

        size_ += size;

        return (bh + 1);
    }

    return 0;
}
/*! Injects a message in the message queue to produce a desired msg sequence. */
long gcs_dummy_inject_msg (gcs_backend_t* backend,
                           const void*    buf,
                           size_t         buf_len,
                           gcs_msg_type_t type,
                           long           sender_idx)
{
    long   ret;
    size_t send_size = buf_len < backend->conn->max_send_size ?
                       buf_len : backend->conn->max_send_size;
    dummy_msg_t* msg = dummy_msg_create (type, send_size, sender_idx, buf);

    if (msg) {
        dummy_msg_t** ptr = gu_fifo_get_tail (backend->conn->gc_q);

        if (gu_likely(ptr != NULL)) {
            *ptr = msg;
            gu_fifo_push_tail (backend->conn->gc_q);
            ret = send_size;
        }
        else {
            dummy_msg_destroy (msg);
            ret = -EBADFD; // closed
        }
    }
    else {
        ret = -ENOMEM;
    }

    return ret;
}
bool GCache::discard_seqno (int64_t seqno)
{
    // seqno = std::min(seqno, seqno_released);

    for (seqno2ptr_t::iterator i = seqno2ptr.begin();
         i != seqno2ptr.end() && i->first <= seqno;)
    {
        seqno2ptr_t::iterator j(i);
        ++i;

        BufferHeader* bh(ptr2BH (j->second));

        if (gu_likely(BH_is_released(bh)))
        {
            assert (bh->seqno_g <= seqno);

            seqno2ptr.erase (j);
            bh->seqno_g = SEQNO_ILL; // will never be reused

            switch (bh->store)
            {
            case BUFFER_IN_MEM:  mem.discard (bh); break;
            case BUFFER_IN_RB:   rb.discard  (bh); break;
            case BUFFER_IN_PAGE: ps.discard  (bh); break;
            default:
                log_fatal << "Corrupt buffer header: " << bh;
                abort();
            }
        }
        else
        {
            return false;
        }
    }

    return true;
}
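/* Illustrative sketch (not part of the library) of the erase-while-iterating
 * pattern used above: copy the iterator, advance the live one, then erase the
 * copy, so the loop iterator is never invalidated. Standalone example on a
 * plain std::map with hypothetical value type. */
#include <cstdint>
#include <map>
#include <string>

static void sketch_discard_up_to (std::map<std::int64_t, std::string>& m,
                                  std::int64_t seqno)
{
    for (std::map<std::int64_t, std::string>::iterator i = m.begin();
         i != m.end() && i->first <= seqno;)
    {
        std::map<std::int64_t, std::string>::iterator j(i);
        ++i;          // advance before erasing
        m.erase (j);  // j stays valid until here; i is unaffected
    }
}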
void gcs_sm_stats_get (gcs_sm_t*  sm,
                       int*       q_len,
                       double*    q_len_avg,
                       long long* paused_ns,
                       double*    paused_avg)
{
    gcs_sm_stats_t tmp;
    long long      now;
    bool           paused;

    if (gu_unlikely(gu_mutex_lock (&sm->lock))) abort();

    *q_len = sm->users;
    tmp    = sm->stats;
    now    = gu_time_monotonic();
    paused = sm->pause;

    gu_mutex_unlock (&sm->lock);

    if (paused) { // taking sample in the middle of a pause
        tmp.paused_ns += now - tmp.pause_start;
    }

    *paused_ns = tmp.paused_ns;

    if (gu_likely(tmp.paused_ns >= 0)) {
        *paused_avg = ((double)(tmp.paused_ns - tmp.paused_sample)) /
                      (now - tmp.sample_start);
    }
    else {
        *paused_avg = -1.0;
    }

    if (gu_likely(tmp.send_q_len >= 0 && tmp.send_q_samples >= 0)) {
        if (gu_likely(tmp.send_q_samples > 0)) {
            *q_len_avg = ((double)tmp.send_q_len) / tmp.send_q_samples;
        }
        else {
            *q_len_avg = 0.0;
        }
    }
    else {
        *q_len_avg = -1.0;
    }
}
template <typename UI>
inline size_t uleb128_decode(const byte_t* buf,
                             size_t        buflen,
                             size_t        offset,
                             UI&           value)
{
    // initial bounds check: at least one byte must be readable
#ifdef GU_VLQ_CHECKS
    if (gu_unlikely(offset >= buflen)) gu_throw_fatal;
#endif

#ifdef GU_VLQ_ALEX
    value = buf[offset] & 0x7f;
    size_t shift(0);

    while (buf[offset] & 0x80)
    {
        ++offset;
        shift += 7;
#ifdef GU_VLQ_CHECKS
        ssize_t left_bits((sizeof(UI) << 3) - shift);
        if (gu_unlikely(offset >= buflen || left_bits < 7))
            uleb128_decode_checks (buf, buflen, offset, left_bits);
#endif
        value |= (UI(buf[offset] & 0x7f) << shift);
    }

    return offset + 1;
#else /* GU_VLQ_ALEX */
    value = 0;
    size_t shift(0);

    while (true)
    {
        value |= (UI(buf[offset] & 0x7f) << shift);

        if (gu_likely((buf[offset] & 0x80) == 0))
        {
            // last byte
            ++offset;
            break;
        }

        ++offset;
        shift += 7;
#ifdef GU_VLQ_CHECKS
        ssize_t left_bits((sizeof(UI) << 3) - shift);
        if (gu_unlikely(offset >= buflen || left_bits < 7))
            uleb128_decode_checks (buf, buflen, offset, left_bits);
#endif
    }

    return offset;
#endif /* GU_VLQ_ALEX */
}
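/* Illustrative sketch (not part of the library) of the ULEB128/VLQ wire format
 * decoded above: each byte carries 7 payload bits, a set high bit means "more
 * bytes follow". Bounds and overflow checks are omitted for brevity; the
 * helper names are hypothetical. */
#include <stddef.h>
#include <stdint.h>

static size_t sketch_uleb128_encode (uint64_t value, uint8_t* buf)
{
    size_t n = 0;
    do {
        uint8_t byte = value & 0x7f;
        value >>= 7;
        if (value) byte |= 0x80;   // continuation bit
        buf[n++] = byte;
    } while (value);
    return n;                      // bytes written
}

static size_t sketch_uleb128_decode (const uint8_t* buf, uint64_t& value)
{
    value = 0;
    size_t   n     = 0;
    unsigned shift = 0;
    do {
        value |= uint64_t(buf[n] & 0x7f) << shift;
        shift += 7;
    } while (buf[n++] & 0x80);
    return n;                      // bytes consumed
}

// e.g. 300 (0x12C) encodes as 0xAC 0x02 and decodes back to 300.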
byte_t* alloc (size_t size)
{
    byte_t* ret = NULL;

    if (gu_likely(size <= left_))
    {
        ret    = ptr_;
        ptr_  += size;
        left_ -= size;
    }

    return ret;
}
void RecordSet::init (const byte_t* const ptr, ssize_t const size)
{
    assert (EMPTY == version_);
    assert (size >= 0);
    assert (NULL != ptr || 0 == size);
    assert (NULL == ptr || 0 != size);

    if (gu_likely ((ptr && size)))
    {
        version_    = header_version    (ptr, size);
        check_type_ = header_check_type (version_, ptr, size);
    }
}
static inline RecordSet::Version
header_version (const byte_t* buf, ssize_t const size)
{
    assert (NULL != buf);
    assert (size > 0);

    uint const ver((buf[0] & 0xf0) >> 4);

    assert (ver > 0);

    if (gu_likely(ver <= RecordSet::MAX_VERSION))
        return static_cast<RecordSet::Version>(ver);

    gu_throw_error (EPROTO) << "Unsupported RecordSet version: " << ver;
}
void GCache::free (void* ptr)
{
    if (gu_likely(0 != ptr))
    {
        BufferHeader* const bh(ptr2BH(ptr));
        gu::Lock lock(mtx);

        free_common (bh);
    }
    else {
        log_warn << "Attempt to free a null pointer";
        assert(0);
    }
}
void enter(C& obj)
{
    const wsrep_seqno_t obj_seqno(obj.seqno());
    const size_t        idx(indexof(obj_seqno));
    gu::Lock            lock(mutex_);

    assert(obj_seqno > last_left_);

    pre_enter(obj, lock);

    if (gu_likely(process_[idx].state_ != Process::S_CANCELED))
    {
        assert(process_[idx].state_ == Process::S_IDLE);

        process_[idx].state_ = Process::S_WAITING;
        process_[idx].obj_   = &obj;

#ifdef GU_DBUG_ON
        obj.debug_sync(mutex_);
#endif // GU_DBUG_ON

        while (may_enter(obj) == false &&
               process_[idx].state_ == Process::S_WAITING)
        {
            obj.unlock();
            lock.wait(process_[idx].cond_);
            obj.lock();
        }

        if (process_[idx].state_ != Process::S_CANCELED)
        {
            assert(process_[idx].state_ == Process::S_WAITING ||
                   process_[idx].state_ == Process::S_APPLYING);

            process_[idx].state_ = Process::S_APPLYING;

            ++entered_;
            oooe_     += ((last_left_ + 1) < obj_seqno);
            win_size_ += (last_entered_ - last_left_);

            return;
        }
    }

    assert(process_[idx].state_ == Process::S_CANCELED);

    process_[idx].state_ = Process::S_IDLE;

    gu_throw_error(EINTR);
}
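/* Illustrative sketch (not part of the library) of the wait loop pattern used
 * in enter(): block on a condition variable until the caller may proceed or
 * the slot is canceled. It uses std::mutex/std::condition_variable and omits
 * all ordering bookkeeping; every name below is hypothetical. */
#include <condition_variable>
#include <mutex>

struct SlotSketch
{
    enum State { IDLE, WAITING, APPLYING, CANCELED } state;
    std::condition_variable cond;
};

static bool sketch_wait_to_apply (SlotSketch& slot, std::mutex& mtx,
                                  bool (*may_enter)())
{
    std::unique_lock<std::mutex> lock(mtx);

    slot.state = SlotSketch::WAITING;

    while (!may_enter() && slot.state == SlotSketch::WAITING)
        slot.cond.wait(lock);                 // releases mtx while sleeping

    if (slot.state == SlotSketch::CANCELED) return false; // caller bails out

    slot.state = SlotSketch::APPLYING;
    return true;
}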
/* Find node with the smallest last_applied */
static inline void
group_redo_last_applied (gcs_group_t* group)
{
    long       n;
    long       last_node    = -1;
    gu_seqno_t last_applied = GU_LONG_LONG_MAX;

    for (n = 0; n < group->num; n++) {
        const gcs_node_t* const node  = &group->nodes[n];
        gcs_seqno_t       const seqno = node->last_applied;
        bool count = node->count_last_applied;

        if (gu_unlikely (0 == group->last_applied_proto_ver)) {
            /* @note: this may be removed after quorum v1 is phased out */
            count = (GCS_NODE_STATE_SYNCED == node->status ||
                     GCS_NODE_STATE_DONOR  == node->status);
        }

        // gu_debug ("last_applied[%ld]: %lld", n, seqno);

        /* NOTE: It is crucial for consistency that the last_applied algorithm
         *       is absolutely identical on all nodes. Therefore, for the sake
         *       of generality and future compatibility we have to assume a
         *       non-blocking donor.
         *       GCS_BLOCKING_DONOR should never be defined unless in some
         *       very custom builds. Commenting it out for safety's sake. */
//#ifndef GCS_BLOCKING_DONOR
        if (count
//#else
//        if ((GCS_NODE_STATE_SYNCED == node->status) /* ignore donor */
//#endif
            && (seqno < last_applied)) {
            assert (seqno >= 0);
            last_applied = seqno;
            last_node    = n;
        }
        // extra diagnostic, ignore
        //else if (!count) { gu_warn("not counting %d", n); }
    }

    if (gu_likely (last_node >= 0)) {
        group->last_applied = last_applied;
        group->last_node    = last_node;
    }
}
/*!
 * Handles action message. Is called often - therefore, inlined.
 *
 * @return 0 or the size of the whole action on success,
 *         negative error code otherwise (see gcs_defrag_handle_frag()).
 */
static inline ssize_t
gcs_node_handle_act_frag (gcs_node_t*           node,
                          const gcs_act_frag_t* frg,
                          struct gcs_act*       act,
                          bool                  local)
{
    if (gu_likely(GCS_ACT_SERVICE != frg->act_type)) {
        return gcs_defrag_handle_frag (&node->app, frg, act, local);
    }
    else if (GCS_ACT_SERVICE == frg->act_type) {
        return gcs_defrag_handle_frag (&node->oob, frg, act, local);
    }
    else {
        gu_warn ("Unrecognised action type: %d", frg->act_type);
        assert(0);
        return -EPROTO;
    }
}
/* discard all seqnos preceding and including seqno */
bool RingBuffer::discard_seqno (int64_t seqno)
{
    for (seqno2ptr_t::iterator i = seqno2ptr_.begin();
         i != seqno2ptr_.end() && i->first <= seqno;)
    {
        seqno2ptr_t::iterator j(i);
        ++i;

        BufferHeader* const bh (ptr2BH (j->second));

        if (gu_likely (BH_is_released(bh)))
        {
            seqno2ptr_.erase (j);
            bh->seqno_g = SEQNO_ILL; // will never be accessed by seqno

            switch (bh->store)
            {
            case BUFFER_IN_RB:
                discard(bh);
                break;
            case BUFFER_IN_MEM:
            {
                MemStore* const ms(static_cast<MemStore*>(bh->ctx));
                ms->discard(bh);
                break;
            }
            case BUFFER_IN_PAGE:
            {
                Page*      const page (static_cast<Page*>(bh->ctx));
                PageStore* const ps   (PageStore::page_store(page));
                ps->discard(bh);
                break;
            }
            default:
                log_fatal << "Corrupt buffer header: " << bh;
                abort();
            }
        }
        else
        {
            return false;
        }
    }

    return true;
}
ssize_t repl(gcs_action& act, bool scheduled)
{
    act.seqno_g = GCS_SEQNO_ILL;
    act.seqno_l = GCS_SEQNO_ILL;

    ssize_t ret(-EBADFD);

    {
        gu::Lock lock(mtx_);

        switch (state_)
        {
        case S_CONNECTED:
        case S_SYNCED:
        {
            ++global_seqno_;
            act.seqno_g = global_seqno_;
            ++local_seqno_;
            act.seqno_l = local_seqno_;
            ret = act.size;
            break;
        }
        case S_CLOSED:
            ret = -EBADFD;
            break;
        case S_OPEN:
            ret = -ENOTCONN;
            break;
        }
    }

    if (gu_likely(0 != gcache_ && ret > 0))
    {
        assert (ret == act.size);

        void* ptr = gcache_->malloc(act.size);
        memcpy (ptr, act.buf, act.size);
        act.buf = ptr;
    }

    return ret;
}
void* RingBuffer::malloc (ssize_t size)
{
    void* ret(0);

    // We can reliably allocate a contiguous buffer of at most 1/2
    // of the total cache space. So compare to half the space.
    if (size <= (size_cache_ / 2) && size <= (size_cache_ - size_used_))
    {
        BufferHeader* const bh (get_new_buffer (size));

        BH_assert_clear(BH_cast(next_));
        // mallocs_++;

        if (gu_likely (0 != bh)) ret = bh + 1;
    }

    assert_sizes();

    return ret; // "out of memory"
}
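/* Illustrative sketch (not part of the library) of the geometry behind the
 * size_cache_/2 check above: in a ring buffer the free space may be split by
 * the wrap point into a tail segment [next, end) and a head segment
 * [start, first), so only the larger of the two can hold a contiguous
 * allocation. A simplified, hypothetical layout with raw offsets and no
 * header overhead. */
#include <algorithm>
#include <cstddef>

struct RingSketch
{
    size_t start, end;    // [start, end) is the whole arena
    size_t first, next;   // oldest live byte / next write position

    // largest contiguous block that could be handed out right now
    size_t max_contiguous() const
    {
        if (next >= first)                   // used region does not wrap
            return std::max(end - next, first - start);
        return first - next;                 // used region wraps around
    }
};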
static GCS_BACKEND_SEND_FN(dummy_send)
{
    int      err   = 0;
    dummy_t* dummy = backend->conn;

    if (gu_unlikely(NULL == dummy)) return -EBADFD;

    if (gu_likely(DUMMY_PRIM == dummy->state))
    {
        err = gcs_dummy_inject_msg (backend, buf, len, msg_type,
                                    backend->conn->my_idx);
    }
    else {
        static long send_error[DUMMY_PRIM] =
            { -EBADFD, -EBADFD, -ENOTCONN, -EAGAIN };
        err = send_error[dummy->state];
    }

    return err;
}
void GCache::free_common (BufferHeader* const bh)
{
    assert(bh->seqno_g != SEQNO_ILL);
    BH_release(bh);

#ifndef NDEBUG
    void* const ptr(bh + 1);
    std::set<const void*>::iterator it = buf_tracker.find(ptr);
    if (it == buf_tracker.end())
    {
        log_fatal << "Have not allocated this ptr: " << ptr;
        abort();
    }
    buf_tracker.erase(it);
#endif
    frees++;

    switch (bh->store)
    {
    case BUFFER_IN_MEM: mem.free (bh); break;
    case BUFFER_IN_RB:  rb.free  (bh); break;
    case BUFFER_IN_PAGE:
        if (gu_likely(bh->seqno_g > 0))
        {
            discard_seqno (bh->seqno_g);
        }
        else
        {
            assert(bh->seqno_g != SEQNO_ILL);
            bh->seqno_g = SEQNO_ILL;
            ps.discard (bh);
        }
        break;
    }

    rb.assert_size_free();
}
void RingBuffer::seqno_reset()
{
    if (size_cache_ == size_free_) return;

    /* Find the last seqno'd RB buffer. It is likely to be close to the
     * end of released buffers chain. */
    BufferHeader* bh(0);

    for (seqno2ptr_t::reverse_iterator r(seqno2ptr_.rbegin());
         r != seqno2ptr_.rend(); ++r)
    {
        BufferHeader* const b(ptr2BH(r->second));
        if (BUFFER_IN_RB == b->store)
        {
#ifndef NDEBUG
            if (!BH_is_released(b))
            {
                log_fatal << "Buffer "
                          << reinterpret_cast<const void*>(r->second)
                          << ", seqno_g " << b->seqno_g
                          << ", seqno_d " << b->seqno_d
                          << " is not released.";
                assert(0);
            }
#endif
            bh = b;
            break;
        }
    }

    if (!bh) return;

    assert(bh->size > 0);
    assert(BH_is_released(bh));

    /* Seek the first unreleased buffer.
     * This should be called in isolation, when all seqno'd buffers are
     * freed, and the only unreleased buffers should come only from new
     * configuration. There should be no seqno'd buffers after it. */

    ssize_t const old(size_free_);

    assert (0 == size_trail_ || first_ > next_);
    first_ = reinterpret_cast<uint8_t*>(bh);

    while (BH_is_released(bh)) // next_ is never released - no endless loop
    {
        first_ = reinterpret_cast<uint8_t*>(BH_next(bh));

        if (gu_unlikely (0 == bh->size && first_ != next_))
        {
            // rollover
            assert (first_ > next_);
            first_ = start_;
        }

        bh = BH_cast(first_);
    }

    BH_assert_clear(BH_cast(next_));

    if (first_ == next_)
    {
        log_info << "GCache DEBUG: RingBuffer::seqno_reset(): full reset";
        /* empty RB, reset it completely */
        reset();
        return;
    }

    assert ((BH_cast(first_))->size > 0);
    assert (first_ != next_);
    assert ((BH_cast(first_))->seqno_g == SEQNO_NONE);
    assert (!BH_is_released(BH_cast(first_)));

    /* Estimate how much space remains */
    if (first_ < next_)
    {
        /* start_    first_        next_       end_
         *   |         |#############|           |  */
        size_used_  = next_ - first_;
        size_free_  = size_cache_ - size_used_;
        size_trail_ = 0;
    }
    else
    {
        /* start_    next_         first_      end_
         *   |#########|              |#####|    |
         *                                  ^size_trail_ */
        assert(size_trail_ > 0);
        size_free_ = first_ - next_ + size_trail_ - sizeof(BufferHeader);
        size_used_ = size_cache_ - size_free_;
    }

    assert_sizes();
    assert(size_free_ < size_cache_);

    log_info << "GCache DEBUG: RingBuffer::seqno_reset(): discarded "
             << (size_free_ - old) << " bytes";

    /* There is a small but non-0 probability that some released buffers
     * are locked within yet unreleased aborted local actions.
     * Seek all the way to next_, invalidate seqnos and update size_free_ */

    assert(first_ != next_);
    assert(bh == BH_cast(first_));

    long total(1);
    long locked(0);

    bh = BH_next(bh);

    while (bh != BH_cast(next_))
    {
        if (gu_likely (bh->size > 0))
        {
            total++;

            if (bh->seqno_g != SEQNO_NONE)
            {
                // either released or already discarded buffer
                assert (BH_is_released(bh));
                bh->seqno_g = SEQNO_ILL;
                discard (bh);
                locked++;
            }
            else
            {
                assert(!BH_is_released(bh));
            }

            bh = BH_next(bh);
        }
        else // rollover
        {
            assert (BH_cast(next_) < bh);
            bh = BH_cast(start_);
        }
    }

    log_info << "GCache DEBUG: RingBuffer::seqno_reset(): found "
             << locked << '/' << total << " locked buffers";

    assert_sizes();
}
/*! Processes a new action added to a slave queue.
 *  @return length of sleep in nanoseconds or negative error code
 *          or GU_TIME_ETERNITY for complete stop */
long long
gcs_fc_process (gcs_fc_t* fc, ssize_t act_size)
{
    fc->size += act_size;
    fc->act_count++;

    if (fc->size <= fc->soft_limit) {
        /* normal operation */
        if (gu_unlikely(fc->debug > 0 && !(fc->act_count % fc->debug))) {
            gu_info ("FC: queue size: %zdb (%4.1f%% of soft limit)",
                     fc->size, ((double)fc->size)/fc->soft_limit*100.0);
        }
        return 0;
    }
    else if (fc->size >= fc->hard_limit) {
        if (0.0 == fc->max_throttle) {
            /* we can accept total service outage */
            return GU_TIME_ETERNITY;
        }
        else {
            gu_error ("Recv queue hard limit exceeded. Can't continue.");
            return -ENOMEM;
        }
    }
//    else if (!(fc->act_count & 7)) { // do this for every 8th action
    else {
        long long end      = gu_time_monotonic();
        double    interval = ((end - fc->start) * 1.0e-9);

        if (gu_unlikely (0 == fc->last_sleep)) {
            /* just tripped the soft limit, preparing constants for throttle */
            fc->max_rate = (double)(fc->size - fc->init_size) / interval;

            double s = (1.0 - fc->max_throttle)/(fc->soft_limit-fc->hard_limit);
            assert (s < 0.0);

            fc->scale  = s * fc->max_rate;
            fc->offset = (1.0 - s*fc->soft_limit) * fc->max_rate;

            // calculate time interval from the soft limit
            interval = interval * (double)(fc->size - fc->soft_limit) /
                       (fc->size - fc->init_size);
            assert (interval >= 0.0);

            // Move reference point to soft limit
            fc->last_sleep = fc->soft_limit;
            fc->start      = end - interval;

            gu_warn("Soft recv queue limit exceeded, starting replication "
                    "throttle. Measured avg. rate: %f bytes/sec; "
                    "Throttle parameters: scale=%f, offset=%f",
                    fc->max_rate, fc->scale, fc->offset);
        }

        /* throttling operation */
        double desired_rate = fc->size * fc->scale + fc->offset; // linear decay
        //double desired_rate = fc->max_rate * fc->max_throttle; // square wave
        assert (desired_rate <= fc->max_rate);

        double sleep = (double)(fc->size - fc->last_sleep) / desired_rate
                       - interval;

        if (gu_unlikely(fc->debug > 0 && !(fc->act_count % fc->debug))) {
            gu_info ("FC: queue size: %zdb, length: %zd, "
                     "measured rate: %fb/s, desired rate: %fb/s, "
                     "interval: %5.3fs, sleep: %5.4fs. "
                     "Sleeps initiated: %zd, for a total of %6.3fs",
                     fc->size, fc->act_count,
                     ((double)(fc->size - fc->last_sleep))/interval,
                     desired_rate, interval, sleep,
                     fc->sleep_count, fc->sleeps);
            fc->sleep_count = 0;
            fc->sleeps      = 0.0;
        }

        if (gu_likely(sleep < min_sleep)) {
#if 0
            gu_info ("Skipping sleep: desired_rate = %f, sleep = %f (%f), "
                     "interval = %f, fc->scale = %f, fc->offset = %f, "
                     "fc->size = %zd",
                     desired_rate, sleep, min_sleep, interval,
                     fc->scale, fc->offset, fc->size);
#endif
            return 0;
        }

        fc->last_sleep = fc->size;
        fc->start      = end;
        fc->sleep_count++;
        fc->sleeps += sleep;

        return (1000000000LL * sleep);
    }

    return 0;
}
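/* Illustrative sketch (not part of the library) of the linear-decay throttle
 * above: once the queue passes the soft limit, the desired replication rate
 * falls linearly from max_rate at the soft limit down to
 * max_throttle * max_rate at the hard limit, i.e.
 * desired_rate = size * scale + offset with the same scale/offset formulas as
 * in gcs_fc_process(). All numbers below are made up for the example. */
#include <stdio.h>

static void sketch_throttle_decay ()
{
    const double soft_limit   = 1.0e6;   // 1 MB   (hypothetical)
    const double hard_limit   = 4.0e6;   // 4 MB   (hypothetical)
    const double max_throttle = 0.25;    // keep at least 25% of max_rate
    const double max_rate     = 8.0e6;   // measured 8 MB/s (hypothetical)

    const double s      = (1.0 - max_throttle) / (soft_limit - hard_limit);
    const double scale  = s * max_rate;
    const double offset = (1.0 - s * soft_limit) * max_rate;

    for (double size = soft_limit; size <= hard_limit; size += 1.0e6) {
        const double desired_rate = size * scale + offset;
        printf ("queue %3.0f%% of hard limit -> desired rate %4.2f MB/s\n",
                100.0 * size / hard_limit, desired_rate / 1.0e6);
    }
    // prints 8, 6, 4 and 2 MB/s as the queue grows from soft to hard limit
}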
/*!
 * Handle action fragment
 *
 * Unless a whole action is returned, contents of act is undefined
 *
 * In order to optimize branch prediction, gu_likely macros are used and the
 * if/else blocks are ordered and nested according to branch probability.
 *
 * @return 0              - success,
 *         size of action - success, full action received,
 *         negative       - error.
 *
 * TODO: this function is too long, figure out a way to factor it into several
 *       smaller ones. Note that it is called for every GCS_MSG_ACTION message
 *       so it should be optimal.
 */
ssize_t
gcs_defrag_handle_frag (gcs_defrag_t*         df,
                        const gcs_act_frag_t* frg,
                        struct gcs_act*       act,
                        bool                  local)
{
    if (df->received) {
        /* another fragment of existing action */

        df->frag_no++;

        /* detect possible error condition */
        if (gu_unlikely((df->sent_id != frg->act_id) ||
                        (df->frag_no != frg->frag_no))) {
            if (local && df->reset &&
                (df->sent_id == frg->act_id) && (0 == frg->frag_no)) {
                /* df->sent_id was aborted halfway and is being taken care of
                 * by the sender thread. Forget about it.
                 * Reinit counters and continue with the new action.
                 * Note that for local actions no memory allocation is made.*/
                gu_debug ("Local action %lld reset.", frg->act_id);
                df->frag_no  = 0;
                df->received = 0;
                df->tail     = df->head;
                df->reset    = false;

                if (df->size != frg->act_size) {

                    df->size = frg->act_size;

#ifndef GCS_FOR_GARB
                    if (df->cache != NULL) {
                        gcache_free (df->cache, df->head);
                    }
                    else {
                        free ((void*)df->head);
                    }

                    DF_ALLOC();
#endif /* GCS_FOR_GARB */
                }
            }
            else {
                gu_error ("Unordered fragment received. Protocol error.");
                gu_error ("Expected: %llu:%ld, received: %llu:%ld",
                          df->sent_id, df->frag_no, frg->act_id, frg->frag_no);
                gu_error ("Contents: '%.*s'", frg->frag_len, (char*)frg->frag);
                df->frag_no--; // revert counter in hope that we get good frag
                assert(0);
                return -EPROTO;
            }
        }
    }
    else {
        /* new action */
        if (gu_likely(0 == frg->frag_no)) {

            df->size    = frg->act_size;
            df->sent_id = frg->act_id;
            df->reset   = false;

#ifndef GCS_FOR_GARB
            DF_ALLOC();
#else
            /* we don't store actions locally at all */
            df->head = NULL;
            df->tail = df->head;
#endif
        }
        else {
            /* not a first fragment */
            if (!local && df->reset) {
                /* can happen after configuration change,
                 * just ignore this message calmly */
                gu_debug ("Ignoring fragment %lld:%ld after action reset",
                          frg->act_id, frg->frag_no);
                return 0;
            }
            else {
                ((char*)frg->frag)[frg->frag_len - 1] = '\0';
                gu_error ("Unordered fragment received. Protocol error.");
                gu_error ("Expected: any:0(first), received: %lld:%ld",
                          frg->act_id, frg->frag_no);
                gu_error ("Contents: '%s', local: %s, reset: %s",
                          (char*)frg->frag, local ? "yes" : "no",
                          df->reset ? "yes" : "no");
                assert(0);
                return -EPROTO;
            }
        }
    }

    df->received += frg->frag_len;
    assert (df->received <= df->size);

#ifndef GCS_FOR_GARB
    assert (df->tail);
    memcpy (df->tail, frg->frag, frg->frag_len);
    df->tail += frg->frag_len;
#else
    /* we skip memcpy since we have not allocated any buffer */
    assert (NULL == df->tail);
    assert (NULL == df->head);
#endif

    if (df->received == df->size) {
        act->buf     = df->head;
        act->buf_len = df->received;
        gcs_defrag_init (df, df->cache);
        return act->buf_len;
    }
    else {
        return 0;
    }
}
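/* Illustrative sketch (not part of the library) of the core reassembly the
 * defrag code performs: in-order fragments of a known total size are appended
 * at a tail pointer until received == size. A hypothetical, minimal version
 * without the reset/error handling or the gcache-backed allocation above. */
#include <stdlib.h>
#include <string.h>

struct DefragSketch
{
    char*  head;      // start of the reassembly buffer
    char*  tail;      // append position
    size_t size;      // expected total action size
    size_t received;  // bytes collected so far
};

/* returns total size when the action is complete, 0 while still collecting */
static size_t sketch_handle_frag (DefragSketch& df, const void* frag,
                                  size_t frag_len, size_t act_size, bool first)
{
    if (first) {
        df.head     = static_cast<char*>(malloc (act_size));
        df.tail     = df.head;
        df.size     = act_size;
        df.received = 0;
    }

    memcpy (df.tail, frag, frag_len);
    df.tail     += frag_len;
    df.received += frag_len;

    return (df.received == df.size) ? df.size : 0;
}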