Example #1
/*! Reads header data from the actual header of the message.
 *  The remaining fragment buffer and its length are returned in
 *  frag->frag and frag->frag_len.
 *
 * @return 0 on success */
long
gcs_act_proto_read (gcs_act_frag_t* frag, const void* buf, size_t buf_len)
{
    /* validate the length before dereferencing the buffer */
    if (gu_unlikely(buf_len < PROTO_DATA_OFFSET)) {
        gu_error ("Action message too short: %zu, expected at least %d",
                  buf_len, PROTO_DATA_OFFSET);
        return -EBADMSG;
    }

    frag->proto_ver = ((uint8_t*)buf)[PROTO_PV_OFFSET];

    if (gu_unlikely(frag->proto_ver > PROTO_VERSION)) {
        gu_error ("Bad protocol version %d, expected %d",
                  frag->proto_ver, PROTO_VERSION);
        return -EPROTO; // this fragment should be dropped
    }

    /* zero the version byte (const is cast away on purpose) so that the
     * first 8 bytes below can be read as the big-endian action ID */
    ((uint8_t*)buf)[PROTO_PV_OFFSET] = 0x0;
    frag->act_id   = gu_be64(*(uint64_t*)buf);
    frag->act_size = gtohl  (((uint32_t*)buf)[2]);
    frag->frag_no  = gtohl  (((uint32_t*)buf)[3]);
    frag->act_type = ((uint8_t*)buf)[PROTO_AT_OFFSET];
    frag->frag     = ((uint8_t*)buf) + PROTO_DATA_OFFSET;
    frag->frag_len = buf_len - PROTO_DATA_OFFSET;

    /* return 0 or -EMSGSIZE */
    return ((frag->act_size > GCS_MAX_ACT_SIZE) * -EMSGSIZE);
}
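
A minimal usage sketch, not from the source tree: handle_message() and the
commented-out append_to_action() are hypothetical names; only
gcs_act_proto_read() and gcs_act_frag_t come from the example above.

static long handle_message (const void* msg, size_t msg_len)
{
    gcs_act_frag_t frag;
    long ret = gcs_act_proto_read (&frag, msg, msg_len);

    if (ret != 0) return ret; /* -EBADMSG, -EPROTO or -EMSGSIZE */

    /* frag.frag/frag.frag_len point into msg: consume or copy the payload
     * before the receive buffer is reused */
    /* append_to_action (frag.act_id, frag.frag, frag.frag_len); */

    return 0;
}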
Example #2
/*! Returns protocol header size */
long
gcs_act_proto_hdr_size (long version)
{
    if (gu_unlikely(GCS_ACT_PROTO_MAX < version)) return -EPROTONOSUPPORT;

    if (gu_unlikely(version < 0)) return PROTO_MAX_HDR_SIZE; // unknown version: max size is a safe upper bound

    return PROTO_DATA_OFFSET;
}
Example #3
    template <typename UI> // UI: unsigned integer type (declaration restored from context)
    inline size_t uleb128_decode(const byte_t* buf,
                                 size_t        buflen,
                                 size_t        offset,
                                 UI&           value)
    {
        // initial check for overflow, at least one byte must be readable
#ifdef GU_VLQ_CHECKS
        if (gu_unlikely(offset >= buflen)) gu_throw_fatal;
#endif

#ifdef GU_VLQ_ALEX
        value = buf[offset] & 0x7f;
        size_t shift(0);

        while (buf[offset] & 0x80)
        {
            ++offset;
            shift +=7;

#ifdef GU_VLQ_CHECKS
            ssize_t left_bits((sizeof(UI) << 3) - shift);
            if (gu_unlikely(offset >= buflen || left_bits < 7))
                uleb128_decode_checks (buf, buflen, offset, left_bits);
#endif
            value |= (UI(buf[offset] & 0x7f) << shift);
        }

        return offset + 1;
#else /* GU_VLQ_ALEX */
        value = 0;
        size_t shift(0);

        while (true)
        {
            value |= (UI(buf[offset] & 0x7f) << shift);
            if (gu_likely((buf[offset] & 0x80) == 0))
            {
                // last byte
                ++offset;
                break;
            }
            ++offset;
            shift += 7;

#ifdef GU_VLQ_CHECKS
            ssize_t left_bits((sizeof(UI) << 3) - shift);
            if (gu_unlikely(offset >= buflen || left_bits < 7))
                uleb128_decode_checks (buf, buflen, offset, left_bits);
#endif
        }

        return offset;
#endif /* GU_VLQ_ALEX */
    }
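
For intuition, a worked decode, illustrative and not from the source: ULEB128
stores an integer 7 bits per byte, least-significant group first, with the top
bit of each byte flagging continuation (<cassert> assumed for assert).

void uleb128_decode_worked_example ()
{
    const byte_t bytes[2] = { 0xAC, 0x02 }; /* encoding of 300 == 0x12C */
    uint32_t     value;

    /* byte 0: 0xAC -> payload 0x2C, continuation bit set
     * byte 1: 0x02 -> payload 0x02, top bit clear: last byte
     * value  = 0x2C | (0x02 << 7) = 44 + 256 = 300 */
    size_t const next(uleb128_decode (bytes, sizeof(bytes), 0, value));

    assert (value == 300 && next == 2);
}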
Example #4
/*! Verifies the stored checksum; throws gu::Exception(EINVAL) on mismatch */
void
RecordSetInBase::checksum() const
{
    int const cs(check_size(check_type_));

    if (cs > 0) /* checksum records */
    {
        Hash check;

        check.append (head_ + begin_, size_ - begin_); /* records */
        check.append (head_, begin_ - cs);             /* header  */

        assert(cs <= MAX_CHECKSUM_SIZE);
        byte_t result[MAX_CHECKSUM_SIZE];
        check.gather<sizeof(result)>(result);

        const byte_t* const stored_checksum(head_ + begin_ - cs);

        if (gu_unlikely(memcmp (result, stored_checksum, cs)))
        {
            gu_throw_error(EINVAL)
                << "RecordSet checksum does not match:"
                << "\ncomputed: " << gu::Hexdump(result, cs)
                << "\nfound:    " << gu::Hexdump(stored_checksum, cs);
        }
    }
}
Example #5
    // this will crash if ptr == 0
    void*
    GCache::realloc (void* ptr, ssize_t size)
    {
        size += sizeof(BufferHeader);

        void*               new_ptr(0);
        BufferHeader* const bh(ptr2BH(ptr));

        if (gu_unlikely(bh->seqno_g > 0)) // sanity check
        {
            log_fatal << "Internal program error: changing size of an ordered"
                      << " buffer, seqno: " << bh->seqno_g << ". Aborting.";
            abort();
        }

        gu::Lock      lock(mtx);

        reallocs++;

        MemOps* store(0);

        switch (bh->store)
        {
        case BUFFER_IN_MEM:  store = &mem; break;
        case BUFFER_IN_RB:   store = &rb;  break;
        case BUFFER_IN_PAGE: store = &ps;  break;
        default:
            log_fatal << "Memory corruption: unrecognized store: "
                      << bh->store;
            abort();
        }

        new_ptr = store->realloc (ptr, size);

        if (0 == new_ptr)
        {
            new_ptr = malloc (size);

            if (0 != new_ptr)
            {
                /* copy the old payload (assumes growth: the new size is
                 * expected to be at least bh->size) */
                memcpy (new_ptr, ptr, bh->size - sizeof(BufferHeader));
                store->free (bh);
            }
        }

#ifndef NDEBUG
        if (ptr != new_ptr && 0 != new_ptr)
        {
            std::set<const void*>::iterator it = buf_tracker.find(ptr);

            if (it != buf_tracker.end()) buf_tracker.erase(it);

            it = buf_tracker.find(new_ptr);
            assert(it == buf_tracker.end()); // new address must not be tracked yet
            buf_tracker.insert(new_ptr);     // assumed continuation: track new_ptr

        }
#endif

        return new_ptr;
    }
Example #6
 void report_last_committed()
 {
     if (gu_unlikely(cert_.index_purge_required()))
     {
         wsrep_seqno_t const purge_seqno(cert_.get_safe_to_discard_seqno());
         service_thd_.report_last_committed(purge_seqno);
     }
 }
Example #7
 inline void broadcast () const
 {
     if (ref_count > 0) {
         int ret = gu_cond_broadcast (&cond);
         if (gu_unlikely(ret != 0))
             throw Exception("gu_cond_broadcast() failed", ret);
     }
 }
Example #8
 inline void signal () const
 {
     if (ref_count > 0) {
         int ret = gu_cond_signal (&cond);
         if (gu_unlikely(ret != 0))
             throw Exception("gu_cond_signal() failed", ret);
     }
 }
Example #9
void
RecordSetInBase::parse_header_v1 (size_t const size)
{
    assert (size > 1);

    int off = 1;

    off += uleb128_decode (head_ + off, size - off, size_);

    if (gu_unlikely(static_cast<size_t>(size_) > static_cast<size_t>(size)))
    {
        gu_throw_error (EPROTO) << "RecordSet size " << size_
                                << " exceeds buffer size " << size
                                << "\nfirst 4 bytes: " << gu::Hexdump(head_, 4);
    }

    off += uleb128_decode (head_ + off, size - off, count_);

    if (gu_unlikely(static_cast<size_t>(size_) < static_cast<size_t>(count_)))
    {
        gu_throw_error (EPROTO) << "Corrupted RecordSet header: count "
                                << count_ << " exceeds size " << size_;
    }

    /* verify header CRC */
    uint32_t const crc_comp(gu_fast_hash32(head_, off));
    uint32_t const crc_orig(
        gtoh(*(reinterpret_cast<const uint32_t*>(head_ + off))));

    if (gu_unlikely(crc_comp != crc_orig))
    {
        gu_throw_error (EPROTO)
            << "RecordSet header CRC mismatch: "
            << std::showbase << std::internal << std::hex
            << std::setfill('0') << std::setw(10)
            << "\ncomputed: " << crc_comp
            << "\nfound:    " << crc_orig << std::dec;
    }
    off += VER1_CRC_SIZE;

    /* checksum is between header and records */
    begin_ = off + check_size(check_type_);
}
Example #10
    template <typename UI> // UI: unsigned integer type (declaration restored from context)
    inline size_t uleb128_encode(UI       value,
                                 byte_t*  buf,
                                 size_t   buflen,
                                 size_t   offset)
    {
#ifdef GU_VLQ_ALEX
        assert (offset < buflen);
        buf[offset] = value & 0x7f;

        while (value >>= 7)
        {
            buf[offset] |= 0x80;
            ++offset;
#ifdef GU_VLQ_CHECKS
            if (gu_unlikely(offset >= buflen)) gu_throw_fatal;
#else
            assert(offset < buflen);
#endif /* GU_VLQ_CHECKS */
            buf[offset] = value & 0x7f;
        }

        return offset + 1;
#else /* GU_VLQ_ALEX */
        do
        {
#ifdef GU_VLQ_CHECKS
            if (gu_unlikely(offset >= buflen)) gu_throw_fatal;
#else
            assert(offset < buflen);
#endif /* GU_VLQ_CHECKS */
            buf[offset] = value & 0x7f;
            value >>= 7;
            if (gu_unlikely(value != 0))
            {
                buf[offset] |= 0x80;
            }
            ++offset;
        }
        while (value != 0);

        return offset;
#endif /* GU_VLQ_ALEX */
    }
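
A round-trip sketch of the two helpers, assuming the template declarations
restored above (<cassert> assumed for assert):

void uleb128_round_trip ()
{
    byte_t   buf[10];
    uint64_t out;

    size_t const len(uleb128_encode<uint64_t> (300, buf, sizeof(buf), 0));
    size_t const end(uleb128_decode<uint64_t> (buf, len, 0, out));

    assert (300 == out && end == len && 2 == len);
}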
Example #11
static inline void
gcs_node_set_last_applied (gcs_node_t* node, gcs_seqno_t seqno)
{
    if (gu_unlikely(seqno < node->last_applied)) {
        gu_warn ("Received bogus LAST message: %lld, from node %s, "
                 "expected >= %lld. Ignoring.",
                 seqno, node->id, node->last_applied);
    } else {
        node->last_applied = seqno;
    }
}
Example #12
 template <typename FROM, typename TO> // declarations restored from context
 inline size_t
 __private_unserialize(const byte_t* const buf, size_t const buflen, size_t const offset,
                       TO& t)
 {
     BOOST_STATIC_ASSERT(std::numeric_limits<TO>::is_integer);
     BOOST_STATIC_ASSERT(std::numeric_limits<FROM>::is_integer);
     BOOST_STATIC_ASSERT(sizeof(FROM) == sizeof(TO));
     size_t const ret = offset + sizeof(t);
     if (gu_unlikely(ret > buflen)) gu_throw_error(EMSGSIZE) << ret << " > " << buflen;
     t = gtoh<FROM>(*reinterpret_cast<const FROM*>(buf + offset));
     return ret;
 }
Example #13
        inline void wait (const Cond& cond, const datetime::Date& date)
            throw (Exception)
        {
            timespec ts;

            date._timespec(ts);
            cond.ref_count++;
            int ret = pthread_cond_timedwait (&(cond.cond), value, &ts);
            cond.ref_count--;

            if (gu_unlikely(ret)) gu_throw_error(ret);
        }
Example #14
 ~Cond ()
 {
     int ret;
     while (EBUSY == (ret = gu_cond_destroy(&cond)))
         { usleep (100); }
     if (gu_unlikely(ret != 0))
     {
         log_fatal << "gu_cond_destroy() failed: " << ret
                   << " (" << strerror(ret) << "). Aborting.";
         ::abort();
     }
 }
Example #15
RecordSet::RecordSet (Version ver, CheckType const ct)
    :
    size_      (0),
    count_     (0),
    version_   (ver),
    check_type_(ct)
{
    if (gu_unlikely(uint(version_) > MAX_VERSION))
    {
        gu_throw_error (EPROTO) << "Unsupported header version: " << version_;
    }
}
Example #16
 template <typename FROM, typename TO> // declarations restored from context
 inline size_t
 __private_serialize(const FROM& f,
                        byte_t* const buf, size_t const buflen, size_t const offset)
     throw (Exception)
 {
     BOOST_STATIC_ASSERT(std::numeric_limits<TO>::is_integer);
     BOOST_STATIC_ASSERT(std::numeric_limits<FROM>::is_integer);
     BOOST_STATIC_ASSERT(sizeof(FROM) == sizeof(TO));
     size_t const ret = offset + sizeof(TO);
     if (gu_unlikely(ret > buflen)) gu_throw_error(EMSGSIZE) << ret << " > " << buflen;
     *reinterpret_cast<TO*>(buf + offset) = htog<TO>(f);
     return ret;
 }
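
A round-trip usage sketch. gu::unserialize4() appears in the next example;
gu::serialize4() is assumed to be its writing counterpart wrapping the helper
above (<cassert> assumed for assert).

void serialize4_round_trip ()
{
    gu::byte_t buf[4];
    uint32_t   out;

    size_t const off(gu::serialize4 (uint32_t(0xdeadbeef), buf, sizeof(buf), 0));
    gu::unserialize4 (buf, sizeof(buf), 0, out);

    assert (4 == off && 0xdeadbeef == out);
}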
Example #17
std::pair<size_t, size_t>
galera::WriteSet::segment(const gu::byte_t* buf, size_t buf_len, size_t offset)
{
    uint32_t data_len;
    offset = gu::unserialize4(buf, buf_len, offset, data_len);
    if (gu_unlikely(offset + data_len > buf_len))
    {
#ifdef NDEBUG
        gu_throw_error(EMSGSIZE);
#else
        gu_throw_error(EMSGSIZE) << "offset: " << offset << ", data_len: "
                                 << data_len << ", buf_len: " << buf_len;
#endif /* NDEBUG */
    }
    return std::pair<size_t, size_t>(offset, data_len);
}
Example #18
inline galera::TrxHandle*
galera::Wsdb::create_trx(const TrxHandle::Params& params,
                         const wsrep_uuid_t&  source_id,
                         wsrep_trx_id_t const trx_id)
{
    TrxHandle* trx(TrxHandle::New(trx_pool_, params, source_id, -1, trx_id));

    gu::Lock lock(trx_mutex_);

    std::pair<TrxMap::iterator, bool> i
        (trx_map_.insert(std::make_pair(trx_id, trx)));

    if (gu_unlikely(i.second == false)) gu_throw_fatal;

    return i.first->second;
}
Example #19
/* Find node with the smallest last_applied */
static inline void
group_redo_last_applied (gcs_group_t* group)
{
    long       n;
    long       last_node    = -1;
    gu_seqno_t last_applied = GU_LONG_LONG_MAX;

    for (n = 0; n < group->num; n++) {
        const gcs_node_t* const node = &group->nodes[n];
        gcs_seqno_t const seqno = node->last_applied;
        bool count = node->count_last_applied;

        if (gu_unlikely (0 == group->last_applied_proto_ver)) {
            /* @note: this may be removed after quorum v1 is phased out */
            count = (GCS_NODE_STATE_SYNCED == node->status ||
                     GCS_NODE_STATE_DONOR  == node->status);
        }

//        gu_debug ("last_applied[%ld]: %lld", n, seqno);

        /* NOTE: It is crucial for consistency that last_applied algorithm
         *       is absolutely identical on all nodes. Therefore for the
         *       generality sake and future compatibility we have to assume
         *       non-blocking donor.
         *       GCS_BLOCKING_DONOR should never be defined unless in some
         *       very custom builds. Commenting it out for safety sake. */
//#ifndef GCS_BLOCKING_DONOR
        if (count
//#else
//        if ((GCS_NODE_STATE_SYNCED == node->status) /* ignore donor */
//#endif
            && (seqno < last_applied)) {
            assert (seqno >= 0);
            last_applied = seqno;
            last_node    = n;
        }
        // extra diagnostic, ignore
        //else if (!count) { gu_warn("not counting %d", n); }
    }

    if (gu_likely (last_node >= 0)) {
        group->last_applied = last_applied;
        group->last_node    = last_node;
    }
}
Example #20
void
gcs_sm_stats_get (gcs_sm_t*  sm,
                  int*       q_len,
                  double*    q_len_avg,
                  long long* paused_ns,
                  double*    paused_avg)
{
    gcs_sm_stats_t tmp;
    long long      now;
    bool           paused;

    if (gu_unlikely(gu_mutex_lock (&sm->lock))) abort();

    *q_len = sm->users;
    tmp    = sm->stats;
    now    = gu_time_monotonic();
    paused = sm->pause;

    gu_mutex_unlock (&sm->lock);

    if (paused) { // taking a sample in the middle of a pause
        tmp.paused_ns += now - tmp.pause_start;
    }
    *paused_ns = tmp.paused_ns;

    if (gu_likely(tmp.paused_ns >= 0)) {
        *paused_avg = ((double)(tmp.paused_ns - tmp.paused_sample)) /
                       (now - tmp.sample_start);
    }
    else {
        *paused_avg = -1.0;
    }

    if (gu_likely(tmp.send_q_len >= 0 && tmp.send_q_samples >= 0)){
        if (gu_likely(tmp.send_q_samples > 0)) {
            *q_len_avg = ((double)tmp.send_q_len) / tmp.send_q_samples;
        }
        else {
            *q_len_avg = 0.0;
        }
    }
    else {
        *q_len_avg = -1.0;
    }
}
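
A worked illustration of the averaging math above, with invented numbers:
paused_avg is the fraction of the current sampling interval spent paused
(<assert.h> assumed for assert).

void paused_avg_worked_example ()
{
    long long const sample_start  = 0LL;           /* ns, interval start        */
    long long const now           = 10000000000LL; /* 10 s later                */
    long long const paused_ns     = 2000000000LL;  /* total pause time so far   */
    long long const paused_sample = 0LL;           /* pause total at last flush */

    double const paused_avg = ((double)(paused_ns - paused_sample)) /
                               (now - sample_start);

    assert (0.2 == paused_avg); /* paused for 20% of the interval */
}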
Example #21
long
gcs_sm_open (gcs_sm_t* sm)
{
    long ret = -1;

    if (gu_unlikely(gu_mutex_lock (&sm->lock))) abort();

    if (-EBADFD == sm->ret)  /* closed */
    {
        sm->ret = 0;
    }
    ret = sm->ret;

    gu_mutex_unlock (&sm->lock);

    if (ret) { gu_error ("Can't open send monitor: wrong state %d", ret); }

    return ret;
}
Example #22
void
gcs_sm_stats_flush(gcs_sm_t* sm)
{
    if (gu_unlikely(gu_mutex_lock (&sm->lock))) abort();

    long long const now = gu_time_monotonic();

    sm->stats.sample_start = now;

    sm->stats.paused_sample = sm->stats.paused_ns;

    if (sm->pause) // append elapsed pause time
    {
        sm->stats.paused_sample  += now - sm->stats.pause_start;
    }

    sm->stats.send_q_len     = 0;
    sm->stats.send_q_samples = 0;

    gu_mutex_unlock (&sm->lock);
}
Example #23
static
GCS_BACKEND_SEND_FN(dummy_send)
{
    int err = 0;
    dummy_t* dummy = backend->conn;

    if (gu_unlikely(NULL == dummy)) return -EBADFD;

    if (gu_likely(DUMMY_PRIM == dummy->state))
    {
        err = gcs_dummy_inject_msg (backend, buf, len, msg_type,
                                    backend->conn->my_idx);
    }
    else {
        static long send_error[DUMMY_PRIM] =
            { -EBADFD, -EBADFD, -ENOTCONN, -EAGAIN };
        err = send_error[dummy->state];
    }

    return err;
}
Example #24
long
gcs_sm_close (gcs_sm_t* sm)
{
    gu_info ("Closing send monitor...");

    if (gu_unlikely(gu_mutex_lock (&sm->lock))) abort();

    sm->ret = -EBADFD;

    if (sm->pause) _gcs_sm_continue_common (sm);

    gu_cond_t cond;
    gu_cond_init (&cond, NULL);

    // in case the queue is full
    while (sm->users >= (long)sm->wait_q_len) {
        gu_mutex_unlock (&sm->lock);
        usleep(1000);
        gu_mutex_lock (&sm->lock);
    }

    while (sm->users > 0) { // wait for cleared queue
        sm->users++;
        GCS_SM_INCREMENT(sm->wait_q_tail);
        _gcs_sm_enqueue_common (sm, &cond);
        sm->users--;
        GCS_SM_INCREMENT(sm->wait_q_head);
    }

    gu_cond_destroy (&cond);

    gu_mutex_unlock (&sm->lock);

    gu_info ("Closed send monitor.");

    return 0;
}
Example #25
galera::Wsdb::Conn*
galera::Wsdb::get_conn(wsrep_conn_id_t const conn_id, bool const create)
{
    gu::Lock lock(conn_mutex_);

    ConnMap::iterator i(conn_map_.find(conn_id));

    if (conn_map_.end() == i)
    {
        if (create == true)
        {
            std::pair<ConnMap::iterator, bool> p
                (conn_map_.insert(std::make_pair(conn_id, Conn(conn_id))));

            if (gu_unlikely(p.second == false)) gu_throw_fatal;

            return &p.first->second;
        }

        return 0;
    }

    return &(i->second);
}
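
The insert-result idiom used here and in create_trx() above, in miniature:
std::map::insert returns an (iterator, inserted) pair, so lookup and
insertion cost a single tree probe (<map> and <cassert> assumed).

void insert_idiom_demo ()
{
    std::map<int, int> m;

    std::pair<std::map<int, int>::iterator, bool> p
        (m.insert(std::make_pair(1, 42)));

    assert (p.second);              /* true: freshly inserted       */
    assert (p.first->second == 42); /* iterator points at the value */
}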
Example #26
void
RecvLoop::loop()
{
    while (1)
    {
        gcs_action act;

        gcs_.recv (act);

        switch (act.type)
        {
        case GCS_ACT_TORDERED:
            if (gu_unlikely(!(act.seqno_g & 127)))
                /* == report_interval_ of 128 */
            {
                gcs_.set_last_applied (act.seqno_g);
            }
            break;
        case GCS_ACT_COMMIT_CUT:
            break;
        case GCS_ACT_STATE_REQ:
            gcs_.join (-ENOSYS); /* we can't donate state */
            break;
        case GCS_ACT_CONF:
        {
            const gcs_act_conf_t* const cc
                (reinterpret_cast<const gcs_act_conf_t*>(act.buf));

            if (cc->conf_id > 0) /* PC */
            {
                if (GCS_NODE_STATE_PRIM == cc->my_state)
                {
                    gcs_.request_state_transfer (config_.sst(),config_.donor());
                    gcs_.join(cc->seqno);
                }
            }
            else if (cc->memb_num == 0) // SELF-LEAVE after closing connection
            {
                log_info << "Exiting main loop";
                return;
            }

            if (config_.sst() != Config::DEFAULT_SST)
            {
                // we requested custom SST, so we're done here
                gcs_.close();
            }

            break;
        }
        case GCS_ACT_JOIN:
        case GCS_ACT_SYNC:
        case GCS_ACT_FLOW:
        case GCS_ACT_SERVICE:
        case GCS_ACT_ERROR:
        case GCS_ACT_UNKNOWN:
            break;
        }

        if (act.buf)
        {
            free (const_cast<void*>(act.buf));
        }
    }
}
Example #27
/*! Processes a new action added to a slave queue.
 *  @return length of sleep in nanoseconds or negative error code
 *          or GU_TIME_ETERNITY for complete stop */
long long
gcs_fc_process (gcs_fc_t* fc, ssize_t act_size)
{
    fc->size += act_size;
    fc->act_count++;

    if (fc->size <= fc->soft_limit) {
        /* normal operation */
        if (gu_unlikely(fc->debug > 0 && !(fc->act_count % fc->debug))) {
            gu_info ("FC: queue size: %zdb (%4.1f%% of soft limit)",
                     fc->size, ((double)fc->size)/fc->soft_limit*100.0);
        }
        return 0;
    }
    else if (fc->size >= fc->hard_limit) {
        if (0.0 == fc->max_throttle) {
            /* we can accept total service outage */
            return GU_TIME_ETERNITY;
        }
        else {
            gu_error ("Recv queue hard limit exceded. Can't continue.");
            return -ENOMEM;
        }
    }
//    else if (!(fc->act_count & 7)) { // do this for every 8th action
    else {
        long long end   = gu_time_monotonic();
        double interval = ((end - fc->start) * 1.0e-9);

        if (gu_unlikely (0 == fc->last_sleep)) {
            /* just tripped the soft limit, preparing constants for throttle */

            fc->max_rate = (double)(fc->size - fc->init_size) / interval;

            double s = (1.0 - fc->max_throttle)/(fc->soft_limit-fc->hard_limit);
            assert (s < 0.0);

            fc->scale  = s * fc->max_rate;
            fc->offset = (1.0 - s*fc->soft_limit) * fc->max_rate;

            // calculate time interval from the soft limit
            interval = interval * (double)(fc->size - fc->soft_limit) /
                (fc->size - fc->init_size);
            assert (interval >= 0.0);

            // Move reference point to soft limit
            fc->last_sleep = fc->soft_limit;
            fc->start      = end - interval;

            gu_warn("Soft recv queue limit exceeded, starting replication "
                    "throttle. Measured avg. rate: %f bytes/sec; "
                    "Throttle parameters: scale=%f, offset=%f",
                    fc->max_rate, fc->scale, fc->offset);
        }

        /* throttling operation */
        double desired_rate = fc->size * fc->scale + fc->offset; // linear decay
        //double desired_rate = fc->max_rate * fc->max_throttle; // square wave
        assert (desired_rate <= fc->max_rate);

        double sleep = (double)(fc->size - fc->last_sleep) / desired_rate
            - interval;

        if (gu_unlikely(fc->debug > 0 && !(fc->act_count % fc->debug))) {
            gu_info ("FC: queue size: %zdb, length: %zd, "
                     "measured rate: %fb/s, desired rate: %fb/s, "
                     "interval: %5.3fs, sleep: %5.4fs. "
                     "Sleeps initiated: %zd, for a total of %6.3fs",
                     fc->size, fc->act_count,
                     ((double)(fc->size - fc->last_sleep))/interval,
                     desired_rate, interval, sleep, fc->sleep_count,
                     fc->sleeps);
            fc->sleep_count = 0;
            fc->sleeps = 0.0;
        }

        if (gu_likely(sleep < min_sleep)) {
#if 0
            gu_info ("Skipping sleep: desired_rate = %f, sleep = %f (%f), "
                     "interval = %f, fc->scale = %f, fc->offset = %f, "
                     "fc->size = %zd",
                     desired_rate, sleep, min_sleep, interval,
                     fc->scale, fc->offset, fc->size);
#endif
            return 0;
        }

        fc->last_sleep = fc->size;
        fc->start      = end;
        fc->sleep_count++;
        fc->sleeps += sleep;

        return (1000000000LL * sleep);
    }

    return 0;
}
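
A worked illustration of the linear-decay throttle constants, with invented
parameters: once the queue passes the soft limit, the desired rate falls
linearly from the measured max_rate down to max_throttle * max_rate at the
hard limit.

void throttle_worked_example ()
{
    double const soft_limit   = 1048576.0;  /* 1 MiB            (invented) */
    double const hard_limit   = 4194304.0;  /* 4 MiB            (invented) */
    double const max_throttle = 0.25;       /*                  (invented) */
    double const max_rate     = 10485760.0; /* 10 MiB/s measured at the
                                             * moment the soft limit trips */

    double const s      = (1.0 - max_throttle) / (soft_limit - hard_limit); /* < 0 */
    double const scale  = s * max_rate;
    double const offset = (1.0 - s * soft_limit) * max_rate;

    /* desired_rate = size * scale + offset:
     *   at soft_limit: max_rate        (~10.0 MiB/s, no throttling yet)
     *   at hard_limit: 0.25 * max_rate (~ 2.5 MiB/s)                    */
    double const at_soft = soft_limit * scale + offset;
    double const at_hard = hard_limit * scale + offset;

    (void)at_soft; (void)at_hard;
}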
Example #28
    // returns pointer to buffer data area or 0 if no space found
    BufferHeader*
    RingBuffer::get_new_buffer (ssize_t const size)
    {
        assert_size_free();
        assert (size > 0);

        BH_assert_clear(BH_cast(next_));

        uint8_t* ret(next_);

        ssize_t const size_next (size + sizeof(BufferHeader));

        if (ret >= first_) {
            assert (0 == size_trail_);
            // try to find space at the end
            ssize_t const end_size(end_ - ret);

            if (end_size >= size_next) {
                assert(size_free_ >= size);
                goto found_space;
            }
            else {
                // no space at the end, go from the start
                size_trail_ = end_size;
                ret = start_;
            }
        }

        assert (ret <= first_);
        if ((first_ - ret) >= size_next) { assert(size_free_ >= size); }

        while ((first_ - ret) < size_next) {
            // try to discard first buffer to get more space
            BufferHeader* bh = BH_cast(first_);

            if (!BH_is_released(bh) /* true also when first_ == next_ */ ||
                (bh->seqno_g > 0 && !discard_seqno (bh->seqno_g)))
            {
                // can't free any more space, so no buffer, next_ is unchanged
                // and revert size_trail_ if it was set above
                if (next_ >= first_) size_trail_ = 0;
                assert_sizes();
                return 0;
            }

            assert (first_ != next_);
            /* buffer is either discarded already, or it must have seqno */
            assert (SEQNO_ILL == bh->seqno_g);

            first_ += bh->size;
            assert_size_free();

            if (gu_unlikely(0 == (BH_cast(first_))->size))
            {
                // empty header: check if we fit at the end and roll over if not
                assert(first_ >= next_);
                assert(first_ >= ret);

                first_ = start_;
// WRONG               if (first_ != ret) size_trail_ = 0; // we're now contiguous: first_ < next_
                assert_size_free();

                if ((end_ - ret) >= size_next)
                {
                    assert(size_free_ >= size);
                    size_trail_ = 0;
                    goto found_space;
                }
                else
                {
                    size_trail_ = end_ - ret;
                    ret = start_;
                }
            }
        }

#ifndef NDEBUG
        if ((first_ - ret) < size_next) {
            log_fatal << "Assertion ((first - ret) >= size_next) failed: "
                      << std::endl
                      << "first offt = " << (first_ - start_) << std::endl
                      << "next  offt = " << (next_  - start_) << std::endl
                      << "end   offt = " << (end_   - start_) << std::endl
                      << "ret   offt = " << (ret    - start_) << std::endl
                      << "size_next  = " << size_next         << std::endl;
            abort();
        }
#endif

    found_space:
        size_used_ += size;
        assert (size_used_ <= size_cache_);
        size_free_ -= size;
        assert (size_free_ >= 0);

        BufferHeader* const bh(BH_cast(ret));
        bh->size    = size;
        bh->seqno_g = SEQNO_NONE;
        bh->seqno_d = SEQNO_ILL;
        bh->flags   = 0;
        bh->store   = BUFFER_IN_RB;
        bh->ctx     = this;

        next_ = ret + size;
        assert (next_ + sizeof(BufferHeader) <= end_);
        BH_clear (BH_cast(next_));
        assert_sizes();

        return bh;
    }
Example #29
    void
    RingBuffer::seqno_reset()
    {
        if (size_cache_ == size_free_) return;

        /* Find the last seqno'd RB buffer. It is likely to be close to the
         * end of released buffers chain. */
        BufferHeader* bh(0);

        for (seqno2ptr_t::reverse_iterator r(seqno2ptr_.rbegin());
             r != seqno2ptr_.rend(); ++r)
        {
            BufferHeader* const b(ptr2BH(r->second));
            if (BUFFER_IN_RB == b->store)
            {
#ifndef NDEBUG
                if (!BH_is_released(b))
                {
                    log_fatal << "Buffer "
                              << reinterpret_cast<const void*>(r->second)
                              << ", seqno_g " << b->seqno_g << ", seqno_d "
                              << b->seqno_d << " is not released.";
                    assert(0);
                }
#endif
                bh = b;
                break;
            }
        }

        if (!bh) return;

        assert(bh->size > 0);
        assert(BH_is_released(bh));

        /* Seek the first unreleased buffer.
         * This should be called in isolation, when all seqno'd buffers are
         * freed, and the only unreleased buffers should come only from new
         * configuration. There should be no seqno'd buffers after it. */

        ssize_t const old(size_free_);

        assert (0 == size_trail_ || first_ > next_);
        first_ = reinterpret_cast<uint8_t*>(bh);

        while (BH_is_released(bh)) // next_ is never released - no endless loop
        {
             first_ = reinterpret_cast<uint8_t*>(BH_next(bh));

             if (gu_unlikely (0 == bh->size && first_ != next_))
             {
                 // rollover
                 assert (first_ > next_);
                 first_ = start_;
             }

             bh = BH_cast(first_);
        }

        BH_assert_clear(BH_cast(next_));

        if (first_ == next_)
        {
            log_info << "GCache DEBUG: RingBuffer::seqno_reset(): full reset";
            /* empty RB, reset it completely */
            reset();
            return;
        }

        assert ((BH_cast(first_))->size > 0);
        assert (first_ != next_);
        assert ((BH_cast(first_))->seqno_g == SEQNO_NONE);
        assert (!BH_is_released(BH_cast(first_)));

        /* Estimate how much space remains */
        if (first_ < next_)
        {
            /* start_  first_      next_    end_
             *   |       |###########|       |
             */
            size_used_ = next_ - first_;
            size_free_ = size_cache_ - size_used_;
            size_trail_ = 0;
        }
        else
        {
            /* start_  next_       first_   end_
             *   |#######|           |#####| |
             *                              ^size_trail_ */
            assert(size_trail_ > 0);
            size_free_ = first_ - next_ + size_trail_ - sizeof(BufferHeader);
            size_used_ = size_cache_ - size_free_;
        }

        assert_sizes();
        assert(size_free_ < size_cache_);

        log_info << "GCache DEBUG: RingBuffer::seqno_reset(): discarded "
                 << (size_free_ - old) << " bytes";

        /* There is a small but non-0 probability that some released buffers
         * are locked within yet unreleased aborted local actions.
         * Seek all the way to next_, invalidate seqnos and update size_free_ */

        assert(first_ != next_);
        assert(bh == BH_cast(first_));

        long total(1);
        long locked(0);

        bh = BH_next(bh);

        while (bh != BH_cast(next_))
        {
            if (gu_likely (bh->size > 0))
            {
                total++;

                if (bh->seqno_g != SEQNO_NONE)
                {
                    // either released or already discarded buffer
                    assert (BH_is_released(bh));
                    bh->seqno_g = SEQNO_ILL;
                    discard (bh);
                    locked++;
                }
                else
                {
                    assert(!BH_is_released(bh));
                }

                bh = BH_next(bh);
            }
            else // rollover
            {
                assert (BH_cast(next_) < bh);
                bh = BH_cast(start_);
            }
        }

        log_info << "GCache DEBUG: RingBuffer::seqno_reset(): found "
                 << locked << '/' << total << " locked buffers";

        assert_sizes();
    }
Example #30
/*!
 * Handle action fragment
 *
 * Unless a whole action is returned, the contents of act are undefined.
 *
 * To optimize branch prediction, gu_likely() macros are used and the if/else
 * blocks are ordered and nested according to branch probability.
 *
 * @return 0              - success,
 *         size of action - success, full action received,
 *         negative       - error.
 *
 * TODO: this function is too long, figure out a way to factor it into several
 *       smaller ones. Note that it is called for every GCS_MSG_ACTION message
 *       so it should be optimal.
 */
ssize_t
gcs_defrag_handle_frag (gcs_defrag_t*         df,
                        const gcs_act_frag_t* frg,
                        struct gcs_act*       act,
                        bool                  local)
{
    if (df->received) {
        /* another fragment of existing action */

        df->frag_no++;

        /* detect possible error condition */
        if (gu_unlikely((df->sent_id != frg->act_id) ||
                        (df->frag_no != frg->frag_no))) {
            if (local && df->reset &&
                (df->sent_id == frg->act_id) && (0 == frg->frag_no)) {
                /* df->sent_id was aborted halfway and is being taken care of
                 * by the sender thread. Forget about it.
                 * Reinit counters and continue with the new action.
                 * Note that for local actions no memory allocation is made.*/
                gu_debug ("Local action %lld reset.", frg->act_id);
                df->frag_no  = 0;
                df->received = 0;
                df->tail     = df->head;
                df->reset    = false;

                if (df->size != frg->act_size) {

                    df->size = frg->act_size;

#ifndef GCS_FOR_GARB
                    if (df->cache != NULL) {
                        gcache_free (df->cache, df->head);
                    }
                    else {
                        free ((void*)df->head);
                    }

                    DF_ALLOC();
#endif /* GCS_FOR_GARB */
                }
            }
            else {
                gu_error ("Unordered fragment received. Protocol error.");
                gu_error ("Expected: %llu:%ld, received: %llu:%ld",
                          df->sent_id, df->frag_no, frg->act_id, frg->frag_no);
                gu_error ("Contents: '%.*s'", frg->frag_len, (char*)frg->frag);
                df->frag_no--; // revert counter in hope that we get good frag
                assert(0);
                return -EPROTO;
            }
        }
    }
    else {
        /* new action */
        if (gu_likely(0 == frg->frag_no)) {

            df->size    = frg->act_size;
            df->sent_id = frg->act_id;
            df->reset   = false;

#ifndef GCS_FOR_GARB
            DF_ALLOC();
#else
            /* we don't store actions locally at all */
            df->head = NULL;
            df->tail = df->head;
#endif
        }
        else {
            /* not a first fragment */
            if (!local && df->reset) {
                /* can happen after configuration change,
                   just ignore this message calmly */
                gu_debug ("Ignoring fragment %lld:%ld after action reset",
                          frg->act_id, frg->frag_no);
                return 0;
            }
            else {
                /* NUL-terminate in place so the contents can be logged below */
                ((char*)frg->frag)[frg->frag_len - 1] = '\0';
                gu_error ("Unordered fragment received. Protocol error.");
                gu_error ("Expected: any:0(first), received: %lld:%ld",
                          frg->act_id, frg->frag_no);
                gu_error ("Contents: '%s', local: %s, reset: %s",
                          (char*)frg->frag, local ? "yes" : "no",
                          df->reset ? "yes" : "no");
                assert(0);
                return -EPROTO;
            }
        }
    }

    df->received += frg->frag_len;
    assert (df->received <= df->size);

#ifndef GCS_FOR_GARB
    assert (df->tail);
    memcpy (df->tail, frg->frag, frg->frag_len);
    df->tail += frg->frag_len;
#else
    /* we skip memcpy since have not allocated any buffer */
    assert (NULL == df->tail);
    assert (NULL == df->head);
#endif

    if (df->received == df->size) {
        act->buf     = df->head;
        act->buf_len = df->received;
        gcs_defrag_init (df, df->cache);
        return act->buf_len;
    }
    else {
        return 0;
    }
}
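
A standalone miniature of the defragmentation idea above (hypothetical type:
the real gcs_defrag_t also tracks act_id/frag_no, handles resets, and can
allocate from gcache):

#include <cstring>
#include <vector>

struct MiniDefrag
{
    std::vector<char> buf;
    size_t            received;

    MiniDefrag() : buf(), received(0) {}

    /* returns the action size once it is complete, 0 while fragments remain */
    size_t add (size_t act_size, const void* frag, size_t frag_len)
    {
        if (buf.size() != act_size) { buf.resize(act_size); received = 0; }

        std::memcpy (&buf[received], frag, frag_len);
        received += frag_len;

        return (received == buf.size()) ? received : 0;
    }
};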