void BaseMemcachedStore::delete_without_tombstone(const std::string& fqkey, const std::vector<memcached_st*>& replicas, SAS::TrailId trail) { if (trail != 0) { SAS::Event event(trail, SASEvent::MEMCACHED_DELETE, 0); event.add_var_param(fqkey); SAS::report_event(event); } const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); for (size_t ii = 0; ii < replicas.size(); ++ii) { TRC_DEBUG("Attempt delete to replica %d (connection %p)", ii, replicas[ii]); memcached_return_t rc = memcached_delete(replicas[ii], key_ptr, key_len, 0); if (!memcached_success(rc)) { log_delete_failure(fqkey, ii, replicas.size(), trail, 0); } } }
/// Retrieve the AoR data for a given SIP URI, creating it if there isn't /// any already, and returning NULL if we can't get a connection. AoR* MemcachedStore::get_aor_data(const std::string& aor_id) ///< the SIP URI { memcached_return_t rc; MemcachedAoR* aor_data = NULL; // Try to get a connection struct timespec wait_time; wait_time.tv_sec = 0; wait_time.tv_nsec = 100 * 1000 * 1000; memcached_st* st = memcached_pool_fetch(_pool, &wait_time, &rc); if (st != NULL) { // Got one: use it. const char* key_ptr = aor_id.data(); const size_t key_len = aor_id.length(); rc = memcached_mget(st, &key_ptr, &key_len, 1); if (memcached_success(rc)) { memcached_result_st result; memcached_result_create(st, &result); memcached_fetch_result(st, &result, &rc); if (memcached_success(rc)) { aor_data = deserialize_aor(std::string(memcached_result_value(&result), memcached_result_length(&result))); aor_data->set_cas(memcached_result_cas(&result)); int now = time(NULL); expire_bindings(aor_data, now); } else { // AoR does not exist, so create it. aor_data = new MemcachedAoR(); } } memcached_pool_release(_pool, st); } return (AoR*)aor_data; }
/// Reads a single record, together with its CAS value, from one replica.
///
/// @param replica  Connection to read from.
/// @param key_ptr  Pointer to the key bytes.
/// @param key_len  Length of the key in bytes.
/// @param data     Receives a copy of the record's value on success; left
///                 untouched otherwise.
/// @param cas      Reset to 0 on entry, then set to the record's CAS value on
///                 success.
/// @return  The return code of the memcached_mget request if that failed,
///          otherwise the return code reported by memcached_fetch_result.
memcached_return_t BaseMemcachedStore::get_from_replica(memcached_st* replica,
                                                        const char* key_ptr,
                                                        const size_t key_len,
                                                        std::string& data,
                                                        uint64_t& cas)
{
  cas = 0;

  // memcached_get does not retrieve CAS values, so memcached_mget has to be
  // used even though only a single key is requested.
  memcached_return_t rc = memcached_mget(replica, &key_ptr, &key_len, 1);

  if (!memcached_success(rc))
  {
    return rc;
  }

  // The request was accepted, so pull back the result.
  TRC_DEBUG("Fetch result");
  memcached_result_st fetched;
  memcached_result_create(replica, &fetched);
  memcached_fetch_result(replica, &fetched, &rc);

  if (memcached_success(rc))
  {
    TRC_DEBUG("Found record on replica");

    // assign() takes its own copy of the bytes, which is what makes it safe
    // to free the result structure below.
    const char* value = memcached_result_value(&fetched);
    const size_t value_len = memcached_result_length(&fetched);
    data.assign(value, value_len);
    cas = memcached_result_cas(&fetched);
  }

  memcached_result_free(&fetched);
  return rc;
}
/// Delete the data for the specified namespace and key. Writes the data /// unconditionally, so CAS is not needed. Store::Status MemcachedStore::delete_data(const std::string& table, const std::string& key, SAS::TrailId trail) { Store::Status status = Store::Status::OK; LOG_DEBUG("Deleting key %s from table %s", key.c_str(), table.c_str()); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); // Delete from the read replicas - read replicas are a superset of the write replicas const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::READ); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_DELETE, 0); start.add_var_param(fqkey); SAS::report_event(start); } LOG_DEBUG("Deleting from the %d read replicas for key %s", replicas.size(), fqkey.c_str()); // First try to write the primary data record to the first responding // server. memcached_return_t rc = MEMCACHED_ERROR; size_t ii; for (ii = 0; ii < replicas.size(); ++ii) { LOG_DEBUG("Attempt delete to replica %d (connection %p)", ii, replicas[ii]); rc = memcached_delete(replicas[ii], key_ptr, key_len, 0); if (!memcached_success(rc)) { // Deletes are unconditional so this should never happen LOG_ERROR("Delete failed to replica %d", ii); } } return status; }
bool MemcachedSessionManager::load_session(std::string const &sessionid, Session &session) { memcached_return_t rc; size_t value_length; char *value = memcached_get(memcached_conn, sessionid.data(), sessionid.length(), &value_length, 0, &rc); if (memcached_success(rc)) { std::stringstream valuestream(std::string(value, value_length)); parse_pairs(valuestream, '\n', session.data()); uint64_t userid = 0; try { userid = std::stoi(session.get("userid")); } catch (std::exception const &ex) { LOG_MESSAGE_WARN("Invalid userid in session data"); return false; } session.set(User::find(database(), userid), sessionid); } if (value != NULL) free(value); return memcached_success(rc); }
void BaseMemcachedStore::delete_with_tombstone(const std::string& fqkey, const std::vector<memcached_st*>& replicas, SAS::TrailId trail) { if (trail != 0) { SAS::Event event(trail, SASEvent::MEMCACHED_DELETE, 0); event.add_var_param(fqkey); event.add_static_param(_tombstone_lifetime); SAS::report_event(event); } const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); // Calculate a timestamp (least-significant 32 bits of milliseconds since the // epoch) for the current time. We store this in the flags field to allow us // to resolve conflicts when resynchronizing between memcached servers. struct timespec ts; (void)clock_gettime(CLOCK_REALTIME, &ts); uint32_t flags = (uint32_t)((ts.tv_sec * 1000) + (ts.tv_nsec / 1000000)); // Calculate the vbucket for this key. int vbucket = vbucket_for_key(fqkey); for (size_t ii = 0; ii < replicas.size(); ++ii) { TRC_DEBUG("Attempt write tombstone to replica %d (connection %p)", ii, replicas[ii]); memcached_return_t rc = memcached_set_vb(replicas[ii], key_ptr, key_len, _binary ? vbucket : 0, TOMBSTONE.data(), TOMBSTONE.length(), _tombstone_lifetime, flags); if (!memcached_success(rc)) { log_delete_failure(fqkey, ii, replicas.size(), trail, 1); } } }
/// Update the data for a particular address of record. Writes the data /// atomically. If the underlying data has changed since it was last /// read, the update is rejected and this returns false; if the update /// succeeds, this returns true. /// /// If a connection cannot be obtained, returns a random boolean based on /// data found on the call stack at the point of entry. bool MemcachedStore::set_aor_data(const std::string& aor_id, ///< the SIP URI AoR* data) ///< the data to store { memcached_return_t rc; MemcachedAoR* aor_data = (MemcachedAoR*)data; // Try to get a connection. struct timespec wait_time; wait_time.tv_sec = 0; wait_time.tv_nsec = 100 * 1000 * 1000; memcached_st* st = memcached_pool_fetch(_pool, &wait_time, &rc); if (st != NULL) { // Got one: use it. // // Expire any old bindings before writing to the server. In theory, // if there are no bindings left we could delete the entry, but this // may cause concurrency problems because memcached does not support // cas on delete operations. In this case we do a memcached_cas with // an effectively immediate expiry time. int now = time(NULL); int max_expires = expire_bindings(aor_data, now); std::string value = serialize_aor(aor_data); if (aor_data->get_cas() == 0) { // New record, so attempt to add. This will fail if someone else // gets there first. rc = memcached_add(st, aor_id.data(), aor_id.length(), value.data(), value.length(), max_expires, 0); } else { // This is an update to an existing record, so use memcached_cas // to make sure it is atomic. rc = memcached_cas(st, aor_id.data(), aor_id.length(), value.data(), value.length(), max_expires, 0, aor_data->get_cas()); } memcached_pool_release(_pool, st); } return memcached_success(rc); }
/// Update the data for the specified namespace and key. Writes the data /// atomically, so if the underlying data has changed since it was last /// read, the update is rejected and this returns Store::Status::CONTENTION. Store::Status MemcachedStore::set_data(const std::string& table, const std::string& key, const std::string& data, uint64_t cas, int expiry, SAS::TrailId trail) { Store::Status status = Store::Status::OK; LOG_DEBUG("Writing %d bytes to table %s key %s, CAS = %ld, expiry = %d", data.length(), table.c_str(), key.c_str(), cas, expiry); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::WRITE); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_SET_START, 0); start.add_var_param(fqkey); start.add_var_param(data); start.add_static_param(cas); start.add_static_param(expiry); SAS::report_event(start); } LOG_DEBUG("%d write replicas for key %s", replicas.size(), fqkey.c_str()); // Calculate the rough expected expiry time. We store this in the flags // as it may be useful in future for read repair function. uint32_t now = time(NULL); uint32_t exptime = now + expiry; // Memcached uses a flexible mechanism for specifying expiration. // - 0 indicates never expire. // - <= MEMCACHED_EXPIRATION_MAXDELTA indicates a relative (delta) time. // - > MEMCACHED_EXPIRATION_MAXDELTA indicates an absolute time. // Absolute time is the only way to force immediate expiry. Unfortunately, // it's not reliable - see https://github.com/Metaswitch/cpp-common/issues/160 // for details. Instead, we use relative time for future times (expiry > 0) // and the earliest absolute time for immediate expiry (expiry == 0). time_t memcached_expiration = (time_t)((expiry > 0) ? expiry : MEMCACHED_EXPIRATION_MAXDELTA + 1); // First try to write the primary data record to the first responding // server. 
memcached_return_t rc = MEMCACHED_ERROR; size_t ii; size_t replica_idx; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2: replicas.size(); for (ii = 0; ii < attempts; ++ii) { if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a transient server failure, so we // shouldn't retry. break; } replica_idx = 0; LOG_WARNING("Failed to write to sole memcached replica: retrying once"); } else { replica_idx = ii; } LOG_DEBUG("Attempt conditional write to replica %d (connection %p), CAS = %ld", replica_idx, replicas[replica_idx], cas); if (cas == 0) { // New record, so attempt to add. This will fail if someone else // gets there first. rc = memcached_add(replicas[replica_idx], key_ptr, key_len, data.data(), data.length(), memcached_expiration, exptime); } else { // This is an update to an existing record, so use memcached_cas // to make sure it is atomic. rc = memcached_cas(replicas[replica_idx], key_ptr, key_len, data.data(), data.length(), memcached_expiration, exptime, cas); } if (memcached_success(rc)) { LOG_DEBUG("Conditional write succeeded to replica %d", replica_idx); break; } else { LOG_DEBUG("memcached_%s command for %s failed on replica %d, rc = %d (%s), expiry = %d\n%s", (cas == 0) ? 
"add" : "cas", fqkey.c_str(), replica_idx, rc, memcached_strerror(replicas[replica_idx], rc), expiry, memcached_last_error_message(replicas[replica_idx])); if ((rc == MEMCACHED_NOTSTORED) || (rc == MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_CONTENTION, 0); err.add_var_param(fqkey); SAS::report_event(err); } // A NOT_STORED or EXISTS response indicates a concurrent write failure, // so return this to the application immediately - don't go on to // other replicas. LOG_INFO("Contention writing data for %s to store", fqkey.c_str()); status = Store::Status::DATA_CONTENTION; break; } } } if ((rc == MEMCACHED_SUCCESS) && (replica_idx < replicas.size())) { // Write has succeeded, so write unconditionally (and asynchronously) // to the replicas. for (size_t jj = replica_idx + 1; jj < replicas.size(); ++jj) { LOG_DEBUG("Attempt unconditional write to replica %d", jj); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 1); memcached_set(replicas[jj], key_ptr, key_len, data.data(), data.length(), memcached_expiration, exptime); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 0); } } if ((!memcached_success(rc)) && (rc != MEMCACHED_NOTSTORED) && (rc != MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_FAILED, 0); err.add_var_param(fqkey); SAS::report_event(err); } LOG_ERROR("Failed to write data for %s to %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; } return status; }
/// Retrieve the data for a given namespace and key. Store::Status MemcachedStore::get_data(const std::string& table, const std::string& key, std::string& data, uint64_t& cas, SAS::TrailId trail) { Store::Status status = Store::Status::OK; // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::READ); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_GET_START, 0); start.add_var_param(fqkey); SAS::report_event(start); } LOG_DEBUG("%d read replicas for key %s", replicas.size(), fqkey.c_str()); // Read from all replicas until we get a positive result. memcached_return_t rc = MEMCACHED_ERROR; bool active_not_found = false; size_t failed_replicas = 0; size_t ii; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2 : replicas.size(); for (ii = 0; ii < attempts; ++ii) { size_t replica_idx; if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a server failure, so we // shouldn't retry. break; } replica_idx = 0; LOG_WARNING("Failed to read from sole memcached replica: retrying once"); } else { replica_idx = ii; } // We must use memcached_mget because memcached_get does not retrieve CAS // values. LOG_DEBUG("Attempt to read from replica %d (connection %p)", replica_idx, replicas[replica_idx]); rc = memcached_mget(replicas[replica_idx], &key_ptr, &key_len, 1); if (memcached_success(rc)) { // memcached_mget command was successful, so retrieve the result. 
LOG_DEBUG("Fetch result"); memcached_result_st result; memcached_result_create(replicas[replica_idx], &result); memcached_fetch_result(replicas[replica_idx], &result, &rc); if (memcached_success(rc)) { // Found a record, so exit the read loop. LOG_DEBUG("Found record on replica %d", replica_idx); data.assign(memcached_result_value(&result), memcached_result_length(&result)); cas = (active_not_found) ? 0 : memcached_result_cas(&result); // std::string::assign copies its arguments when used with a // char*, so this is safe. memcached_result_free(&result); break; } else { // Free the result and continue the read loop. memcached_result_free(&result); } } if (rc == MEMCACHED_NOTFOUND) { // Failed to find a record on an active replica. Flag this so if we do // find data on a later replica we can reset the cas value returned to // zero to ensure a subsequent write will succeed. LOG_DEBUG("Read for %s on replica %d returned NOTFOUND", fqkey.c_str(), replica_idx); active_not_found = true; } else { // Error from this node, so consider it inactive. LOG_DEBUG("Read for %s on replica %d returned error %d (%s)", fqkey.c_str(), replica_idx, rc, memcached_strerror(replicas[replica_idx], rc)); ++failed_replicas; } } if (memcached_success(rc)) { if (trail != 0) { SAS::Event got_data(trail, SASEvent::MEMCACHED_GET_SUCCESS, 0); got_data.add_var_param(fqkey); got_data.add_var_param(data); got_data.add_static_param(cas); SAS::report_event(got_data); } // Return the data and CAS value. The CAS value is either set to the CAS // value from the result, or zero if an earlier active replica returned // NOT_FOUND. This ensures that a subsequent set operation will succeed // on the earlier active replica. LOG_DEBUG("Read %d bytes from table %s key %s, CAS = %ld", data.length(), table.c_str(), key.c_str(), cas); } else if (failed_replicas < replicas.size()) { // At least one replica returned NOT_FOUND. 
if (trail != 0) { SAS::Event not_found(trail, SASEvent::MEMCACHED_GET_NOT_FOUND, 0); not_found.add_var_param(fqkey); SAS::report_event(not_found); } LOG_DEBUG("At least one replica returned not found, so return NOT_FOUND"); status = Store::Status::NOT_FOUND; } else { // All replicas returned an error, so log the error and return the // failure. if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_GET_ERROR, 0); err.add_var_param(fqkey); SAS::report_event(err); } LOG_ERROR("Failed to read data for %s from %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; } return status; }
/// Adds a record to a replica, replacing any tombstone that is found in the
/// way but never replacing real data.
///
/// The write starts as an ADD.  Whenever the write is blocked by an existing
/// record, that record is read back: a tombstone is overwritten with a CAS
/// using the tombstone's CAS value, a record that has vanished in the
/// meantime causes the write to be retried, and anything else ends the
/// attempt.
///
/// @param replica               Connection to write to.
/// @param key_ptr               Pointer to the key bytes.
/// @param key_len               Length of the key in bytes.
/// @param vbucket               vbucket for the key (only passed through when
///                              the binary protocol is in use).
/// @param data                  Value to store.
/// @param memcached_expiration  Expiration value to pass to memcached.
/// @param flags                 Flags value to store with the record.
/// @param trail                 SAS trail to log against. No SAS events are
///                              raised when 0.
/// @return  The return code of the last ADD/CAS that was issued.
memcached_return_t BaseMemcachedStore::add_overwriting_tombstone(memcached_st* replica,
                                                                 const char* key_ptr,
                                                                 const size_t key_len,
                                                                 const uint32_t vbucket,
                                                                 const std::string& data,
                                                                 time_t memcached_expiration,
                                                                 uint32_t flags,
                                                                 SAS::TrailId trail)
{
  memcached_return_t rc;
  uint64_t cas = 0;

  TRC_DEBUG("Attempting to add data for key %.*s", key_len, key_ptr);

  // sas-client does not like that key_{ptr,len} are constant, so keep a
  // std::string copy of the key for SAS logging.
  const std::string key(key_ptr, key_len);

  for (;;)
  {
    if (cas != 0)
    {
      TRC_DEBUG("Attempting memcached CAS command (cas = %d)", cas);
      rc = memcached_cas_vb(replica,
                            key_ptr,
                            key_len,
                            _binary ? vbucket : 0,
                            data.data(),
                            data.length(),
                            memcached_expiration,
                            flags,
                            cas);
    }
    else
    {
      TRC_DEBUG("Attempting memcached ADD command");
      rc = memcached_add_vb(replica,
                            key_ptr,
                            key_len,
                            _binary ? vbucket : 0,
                            data.data(),
                            data.length(),
                            memcached_expiration,
                            flags);
    }

    const bool blocked =
      (rc == MEMCACHED_DATA_EXISTS) || (rc == MEMCACHED_NOTSTORED);

    if (!blocked)
    {
      // Either the write went in or it failed for some other reason; in both
      // cases there is nothing more to do here.
      TRC_DEBUG("ADD/CAS returned rc = %d (%s)\n%s",
                rc,
                memcached_strerror(replica, rc),
                memcached_last_error_message(replica));
      break;
    }

    // Something is already stored under this key.  Read it back to find out
    // whether it is a tombstone that should be overwritten.
    memcached_return_t get_rc;
    std::string existing_data;

    TRC_DEBUG("Existing data prevented the ADD/CAS."
              "Issue GET to see if we need to overwrite a tombstone");
    get_rc = get_from_replica(replica, key_ptr, key_len, existing_data, cas);

    if (memcached_success(get_rc))
    {
      if (existing_data != TOMBSTONE)
      {
        // Real data must not be overwritten, so stop and hand back the
        // return code from the ADD/CAS.
        TRC_DEBUG("Found real data. Give up");
        break;
      }

      // It is a tombstone.  `cas` now holds the tombstone's CAS value, so the
      // next pass round the loop will overwrite it.
      TRC_DEBUG("Found a tombstone. Attempt to overwrite");

      if (trail != 0)
      {
        SAS::Event event(trail, SASEvent::MEMCACHED_SET_BLOCKED_BY_TOMBSTONE, 0);
        event.add_var_param(key);
        event.add_static_param(cas);
        SAS::report_event(event);
      }

      continue;
    }

    if (get_rc != MEMCACHED_NOTFOUND)
    {
      // The replica failed.  Hand back the return code from the ADD/CAS.
      TRC_DEBUG("GET failed, rc = %d (%s)\n%s",
                get_rc,
                memcached_strerror(replica, get_rc),
                memcached_last_error_message(replica));
      break;
    }

    // The record has gone, which can happen if it expired between the write
    // and the read.  It may have been a tombstone, which should not block
    // adds, so go round again.
    TRC_DEBUG("GET failed with NOT_FOUND");

    if (trail != 0)
    {
      SAS::Event event(trail, SASEvent::MEMCACHED_SET_BLOCKED_BY_EXPIRED, 0);
      event.add_var_param(key);
      SAS::report_event(event);
    }
  }

  return rc;
}
/// Update the data for the specified namespace and key. Writes the data /// atomically, so if the underlying data has changed since it was last /// read, the update is rejected and this returns Store::Status::CONTENTION. Store::Status BaseMemcachedStore::set_data(const std::string& table, const std::string& key, const std::string& data, uint64_t cas, int expiry, SAS::TrailId trail) { Store::Status status = Store::Status::OK; TRC_DEBUG("Writing %d bytes to table %s key %s, CAS = %ld, expiry = %d", data.length(), table.c_str(), key.c_str(), cas, expiry); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); int vbucket = vbucket_for_key(fqkey); const std::vector<memcached_st*>& replicas = get_replicas(vbucket, Op::WRITE); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_SET_START, 0); start.add_var_param(fqkey); start.add_var_param(data); start.add_static_param(cas); start.add_static_param(expiry); SAS::report_event(start); } TRC_DEBUG("%d write replicas for key %s", replicas.size(), fqkey.c_str()); // Calculate a timestamp (least-significant 32 bits of milliseconds since the // epoch) for the current time. We store this in the flags field to allow us // to resolve conflicts when resynchronizing between memcached servers. struct timespec ts; (void)clock_gettime(CLOCK_REALTIME, &ts); uint32_t flags = (uint32_t)((ts.tv_sec * 1000) + (ts.tv_nsec / 1000000)); // Memcached uses a flexible mechanism for specifying expiration. // - 0 indicates never expire. // - <= MEMCACHED_EXPIRATION_MAXDELTA indicates a relative (delta) time. // - > MEMCACHED_EXPIRATION_MAXDELTA indicates an absolute time. // Absolute time is the only way to force immediate expiry. Unfortunately, // it's not reliable - see https://github.com/Metaswitch/cpp-common/issues/160 // for details. 
Instead, we use relative time for future times (expiry > 0) // and the earliest absolute time for immediate expiry (expiry == 0). time_t memcached_expiration = (time_t)((expiry > 0) ? expiry : MEMCACHED_EXPIRATION_MAXDELTA + 1); // First try to write the primary data record to the first responding // server. memcached_return_t rc = MEMCACHED_ERROR; size_t ii; size_t replica_idx; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2: replicas.size(); for (ii = 0; ii < attempts; ++ii) { if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a transient server failure, so we // shouldn't retry. break; } replica_idx = 0; TRC_WARNING("Failed to write to sole memcached replica: retrying once"); } else { replica_idx = ii; } TRC_DEBUG("Attempt conditional write to vbucket %d on replica %d (connection %p), CAS = %ld, expiry = %d", vbucket, replica_idx, replicas[replica_idx], cas, expiry); if (cas == 0) { // New record, so attempt to add (but overwrite any tombstones we // encounter). This will fail if someone else got there first and some // data already exists in memcached for this key. rc = add_overwriting_tombstone(replicas[replica_idx], key_ptr, key_len, vbucket, data, memcached_expiration, flags, trail); } else { // This is an update to an existing record, so use memcached_cas // to make sure it is atomic. rc = memcached_cas_vb(replicas[replica_idx], key_ptr, key_len, _binary ? 
vbucket : 0, data.data(), data.length(), memcached_expiration, flags, cas); if (!memcached_success(rc)) { TRC_DEBUG("memcached_cas command failed, rc = %d (%s)\n%s", rc, memcached_strerror(replicas[replica_idx], rc), memcached_last_error_message(replicas[replica_idx])); } } if (memcached_success(rc)) { TRC_DEBUG("Conditional write succeeded to replica %d", replica_idx); break; } else if ((rc == MEMCACHED_NOTSTORED) || (rc == MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_CONTENTION, 0); err.add_var_param(fqkey); SAS::report_event(err); } // A NOT_STORED or EXISTS response indicates a concurrent write failure, // so return this to the application immediately - don't go on to // other replicas. TRC_INFO("Contention writing data for %s to store", fqkey.c_str()); status = Store::Status::DATA_CONTENTION; break; } } if ((rc == MEMCACHED_SUCCESS) && (replica_idx < replicas.size())) { // Write has succeeded, so write unconditionally (and asynchronously) // to the replicas. for (size_t jj = replica_idx + 1; jj < replicas.size(); ++jj) { TRC_DEBUG("Attempt unconditional write to replica %d", jj); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 1); memcached_set_vb(replicas[jj], key_ptr, key_len, _binary ? vbucket : 0, data.data(), data.length(), memcached_expiration, flags); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 0); } } if ((!memcached_success(rc)) && (rc != MEMCACHED_NOTSTORED) && (rc != MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_FAILED, 0); err.add_var_param(fqkey); SAS::report_event(err); } update_vbucket_comm_state(vbucket, FAILED); if (_comm_monitor) { _comm_monitor->inform_failure(); } TRC_ERROR("Failed to write data for %s to %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; } else { update_vbucket_comm_state(vbucket, OK); if (_comm_monitor) { _comm_monitor->inform_success(); } } return status; }
/// Retrieve the data for a given namespace and key. Store::Status BaseMemcachedStore::get_data(const std::string& table, const std::string& key, std::string& data, uint64_t& cas, SAS::TrailId trail) { Store::Status status; // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); int vbucket = vbucket_for_key(fqkey); const std::vector<memcached_st*>& replicas = get_replicas(vbucket, Op::READ); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_GET_START, 0); start.add_var_param(fqkey); SAS::report_event(start); } TRC_DEBUG("%d read replicas for key %s", replicas.size(), fqkey.c_str()); // Read from all replicas until we get a positive result. memcached_return_t rc = MEMCACHED_ERROR; bool active_not_found = false; size_t failed_replicas = 0; size_t ii; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2 : replicas.size(); for (ii = 0; ii < attempts; ++ii) { size_t replica_idx; if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a server failure, so we // shouldn't retry. break; } replica_idx = 0; TRC_WARNING("Failed to read from sole memcached replica: retrying once"); } else { replica_idx = ii; } TRC_DEBUG("Attempt to read from replica %d (connection %p)", replica_idx, replicas[replica_idx]); rc = get_from_replica(replicas[replica_idx], key_ptr, key_len, data, cas); if (memcached_success(rc)) { // Got data back from this replica. Don't try any more. TRC_DEBUG("Read for %s on replica %d returned SUCCESS", fqkey.c_str(), replica_idx); break; } else if (rc == MEMCACHED_NOTFOUND) { // Failed to find a record on an active replica. 
Flag this so if we do // find data on a later replica we can reset the cas value returned to // zero to ensure a subsequent write will succeed. TRC_DEBUG("Read for %s on replica %d returned NOTFOUND", fqkey.c_str(), replica_idx); active_not_found = true; } else { // Error from this node, so consider it inactive. TRC_DEBUG("Read for %s on replica %d returned error %d (%s)", fqkey.c_str(), replica_idx, rc, memcached_strerror(replicas[replica_idx], rc)); ++failed_replicas; } } if (memcached_success(rc)) { if (data != TOMBSTONE) { if (trail != 0) { SAS::Event got_data(trail, SASEvent::MEMCACHED_GET_SUCCESS, 0); got_data.add_var_param(fqkey); got_data.add_var_param(data); got_data.add_static_param(cas); SAS::report_event(got_data); } // Return the data and CAS value. The CAS value is either set to the CAS // value from the result, or zero if an earlier active replica returned // NOT_FOUND. This ensures that a subsequent set operation will succeed // on the earlier active replica. if (active_not_found) { cas = 0; } TRC_DEBUG("Read %d bytes from table %s key %s, CAS = %ld", data.length(), table.c_str(), key.c_str(), cas); status = Store::OK; } else { if (trail != 0) { SAS::Event got_tombstone(trail, SASEvent::MEMCACHED_GET_TOMBSTONE, 0); got_tombstone.add_var_param(fqkey); got_tombstone.add_static_param(cas); SAS::report_event(got_tombstone); } // We have read a tombstone. Return NOT_FOUND to the caller, and also // zero out the CAS (returning a zero CAS makes the interface cleaner). TRC_DEBUG("Read tombstone from table %s key %s, CAS = %ld", table.c_str(), key.c_str(), cas); cas = 0; status = Store::NOT_FOUND; } // Regardless of whether we got a tombstone, the vbucket is alive. update_vbucket_comm_state(vbucket, OK); if (_comm_monitor) { _comm_monitor->inform_success(); } } else if (failed_replicas < replicas.size()) { // At least one replica returned NOT_FOUND. 
if (trail != 0) { SAS::Event not_found(trail, SASEvent::MEMCACHED_GET_NOT_FOUND, 0); not_found.add_var_param(fqkey); SAS::report_event(not_found); } TRC_DEBUG("At least one replica returned not found, so return NOT_FOUND"); status = Store::Status::NOT_FOUND; update_vbucket_comm_state(vbucket, OK); if (_comm_monitor) { _comm_monitor->inform_success(); } } else { // All replicas returned an error, so log the error and return the // failure. if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_GET_ERROR, 0); err.add_var_param(fqkey); SAS::report_event(err); } TRC_ERROR("Failed to read data for %s from %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; update_vbucket_comm_state(vbucket, FAILED); if (_comm_monitor) { _comm_monitor->inform_failure(); } } return status; }