/// Delete the data for the specified namespace and key. Writes the data /// unconditionally, so CAS is not needed. Store::Status MemcachedStore::delete_data(const std::string& table, const std::string& key, SAS::TrailId trail) { Store::Status status = Store::Status::OK; LOG_DEBUG("Deleting key %s from table %s", key.c_str(), table.c_str()); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); // Delete from the read replicas - read replicas are a superset of the write replicas const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::READ); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_DELETE, 0); start.add_var_param(fqkey); SAS::report_event(start); } LOG_DEBUG("Deleting from the %d read replicas for key %s", replicas.size(), fqkey.c_str()); // First try to write the primary data record to the first responding // server. memcached_return_t rc = MEMCACHED_ERROR; size_t ii; for (ii = 0; ii < replicas.size(); ++ii) { LOG_DEBUG("Attempt delete to replica %d (connection %p)", ii, replicas[ii]); rc = memcached_delete(replicas[ii], key_ptr, key_len, 0); if (!memcached_success(rc)) { // Deletes are unconditional so this should never happen LOG_ERROR("Delete failed to replica %d", ii); } } return status; }
/// Delete the data for the specified namespace and key. Writes the data /// unconditionally, so CAS is not needed. Store::Status BaseMemcachedStore::delete_data(const std::string& table, const std::string& key, SAS::TrailId trail) { TRC_DEBUG("Deleting key %s from table %s", key.c_str(), table.c_str()); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; // Delete from the read replicas - read replicas are a superset of the write replicas const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::READ); TRC_DEBUG("Deleting from the %d read replicas for key %s", replicas.size(), fqkey.c_str()); if (_tombstone_lifetime == 0) { delete_without_tombstone(fqkey, replicas, trail); } else { delete_with_tombstone(fqkey, replicas, trail); } return Status::OK; }
/// Update the data for the specified namespace and key. Writes the data /// atomically, so if the underlying data has changed since it was last /// read, the update is rejected and this returns Store::Status::CONTENTION. Store::Status MemcachedStore::set_data(const std::string& table, const std::string& key, const std::string& data, uint64_t cas, int expiry, SAS::TrailId trail) { Store::Status status = Store::Status::OK; LOG_DEBUG("Writing %d bytes to table %s key %s, CAS = %ld, expiry = %d", data.length(), table.c_str(), key.c_str(), cas, expiry); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::WRITE); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_SET_START, 0); start.add_var_param(fqkey); start.add_var_param(data); start.add_static_param(cas); start.add_static_param(expiry); SAS::report_event(start); } LOG_DEBUG("%d write replicas for key %s", replicas.size(), fqkey.c_str()); // Calculate the rough expected expiry time. We store this in the flags // as it may be useful in future for read repair function. uint32_t now = time(NULL); uint32_t exptime = now + expiry; // Memcached uses a flexible mechanism for specifying expiration. // - 0 indicates never expire. // - <= MEMCACHED_EXPIRATION_MAXDELTA indicates a relative (delta) time. // - > MEMCACHED_EXPIRATION_MAXDELTA indicates an absolute time. // Absolute time is the only way to force immediate expiry. Unfortunately, // it's not reliable - see https://github.com/Metaswitch/cpp-common/issues/160 // for details. Instead, we use relative time for future times (expiry > 0) // and the earliest absolute time for immediate expiry (expiry == 0). time_t memcached_expiration = (time_t)((expiry > 0) ? expiry : MEMCACHED_EXPIRATION_MAXDELTA + 1); // First try to write the primary data record to the first responding // server. memcached_return_t rc = MEMCACHED_ERROR; size_t ii; size_t replica_idx; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2: replicas.size(); for (ii = 0; ii < attempts; ++ii) { if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a transient server failure, so we // shouldn't retry. break; } replica_idx = 0; LOG_WARNING("Failed to write to sole memcached replica: retrying once"); } else { replica_idx = ii; } LOG_DEBUG("Attempt conditional write to replica %d (connection %p), CAS = %ld", replica_idx, replicas[replica_idx], cas); if (cas == 0) { // New record, so attempt to add. This will fail if someone else // gets there first. rc = memcached_add(replicas[replica_idx], key_ptr, key_len, data.data(), data.length(), memcached_expiration, exptime); } else { // This is an update to an existing record, so use memcached_cas // to make sure it is atomic. rc = memcached_cas(replicas[replica_idx], key_ptr, key_len, data.data(), data.length(), memcached_expiration, exptime, cas); } if (memcached_success(rc)) { LOG_DEBUG("Conditional write succeeded to replica %d", replica_idx); break; } else { LOG_DEBUG("memcached_%s command for %s failed on replica %d, rc = %d (%s), expiry = %d\n%s", (cas == 0) ? "add" : "cas", fqkey.c_str(), replica_idx, rc, memcached_strerror(replicas[replica_idx], rc), expiry, memcached_last_error_message(replicas[replica_idx])); if ((rc == MEMCACHED_NOTSTORED) || (rc == MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_CONTENTION, 0); err.add_var_param(fqkey); SAS::report_event(err); } // A NOT_STORED or EXISTS response indicates a concurrent write failure, // so return this to the application immediately - don't go on to // other replicas. LOG_INFO("Contention writing data for %s to store", fqkey.c_str()); status = Store::Status::DATA_CONTENTION; break; } } } if ((rc == MEMCACHED_SUCCESS) && (replica_idx < replicas.size())) { // Write has succeeded, so write unconditionally (and asynchronously) // to the replicas. for (size_t jj = replica_idx + 1; jj < replicas.size(); ++jj) { LOG_DEBUG("Attempt unconditional write to replica %d", jj); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 1); memcached_set(replicas[jj], key_ptr, key_len, data.data(), data.length(), memcached_expiration, exptime); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 0); } } if ((!memcached_success(rc)) && (rc != MEMCACHED_NOTSTORED) && (rc != MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_FAILED, 0); err.add_var_param(fqkey); SAS::report_event(err); } LOG_ERROR("Failed to write data for %s to %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; } return status; }
/// Retrieve the data for a given namespace and key. Store::Status MemcachedStore::get_data(const std::string& table, const std::string& key, std::string& data, uint64_t& cas, SAS::TrailId trail) { Store::Status status = Store::Status::OK; // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); const std::vector<memcached_st*>& replicas = get_replicas(fqkey, Op::READ); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_GET_START, 0); start.add_var_param(fqkey); SAS::report_event(start); } LOG_DEBUG("%d read replicas for key %s", replicas.size(), fqkey.c_str()); // Read from all replicas until we get a positive result. memcached_return_t rc = MEMCACHED_ERROR; bool active_not_found = false; size_t failed_replicas = 0; size_t ii; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2 : replicas.size(); for (ii = 0; ii < attempts; ++ii) { size_t replica_idx; if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a server failure, so we // shouldn't retry. break; } replica_idx = 0; LOG_WARNING("Failed to read from sole memcached replica: retrying once"); } else { replica_idx = ii; } // We must use memcached_mget because memcached_get does not retrieve CAS // values. LOG_DEBUG("Attempt to read from replica %d (connection %p)", replica_idx, replicas[replica_idx]); rc = memcached_mget(replicas[replica_idx], &key_ptr, &key_len, 1); if (memcached_success(rc)) { // memcached_mget command was successful, so retrieve the result. LOG_DEBUG("Fetch result"); memcached_result_st result; memcached_result_create(replicas[replica_idx], &result); memcached_fetch_result(replicas[replica_idx], &result, &rc); if (memcached_success(rc)) { // Found a record, so exit the read loop. LOG_DEBUG("Found record on replica %d", replica_idx); data.assign(memcached_result_value(&result), memcached_result_length(&result)); cas = (active_not_found) ? 0 : memcached_result_cas(&result); // std::string::assign copies its arguments when used with a // char*, so this is safe. memcached_result_free(&result); break; } else { // Free the result and continue the read loop. memcached_result_free(&result); } } if (rc == MEMCACHED_NOTFOUND) { // Failed to find a record on an active replica. Flag this so if we do // find data on a later replica we can reset the cas value returned to // zero to ensure a subsequent write will succeed. LOG_DEBUG("Read for %s on replica %d returned NOTFOUND", fqkey.c_str(), replica_idx); active_not_found = true; } else { // Error from this node, so consider it inactive. LOG_DEBUG("Read for %s on replica %d returned error %d (%s)", fqkey.c_str(), replica_idx, rc, memcached_strerror(replicas[replica_idx], rc)); ++failed_replicas; } } if (memcached_success(rc)) { if (trail != 0) { SAS::Event got_data(trail, SASEvent::MEMCACHED_GET_SUCCESS, 0); got_data.add_var_param(fqkey); got_data.add_var_param(data); got_data.add_static_param(cas); SAS::report_event(got_data); } // Return the data and CAS value. The CAS value is either set to the CAS // value from the result, or zero if an earlier active replica returned // NOT_FOUND. This ensures that a subsequent set operation will succeed // on the earlier active replica. LOG_DEBUG("Read %d bytes from table %s key %s, CAS = %ld", data.length(), table.c_str(), key.c_str(), cas); } else if (failed_replicas < replicas.size()) { // At least one replica returned NOT_FOUND. if (trail != 0) { SAS::Event not_found(trail, SASEvent::MEMCACHED_GET_NOT_FOUND, 0); not_found.add_var_param(fqkey); SAS::report_event(not_found); } LOG_DEBUG("At least one replica returned not found, so return NOT_FOUND"); status = Store::Status::NOT_FOUND; } else { // All replicas returned an error, so log the error and return the // failure. if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_GET_ERROR, 0); err.add_var_param(fqkey); SAS::report_event(err); } LOG_ERROR("Failed to read data for %s from %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; } return status; }
/// Update the data for the specified namespace and key. Writes the data /// atomically, so if the underlying data has changed since it was last /// read, the update is rejected and this returns Store::Status::CONTENTION. Store::Status BaseMemcachedStore::set_data(const std::string& table, const std::string& key, const std::string& data, uint64_t cas, int expiry, SAS::TrailId trail) { Store::Status status = Store::Status::OK; TRC_DEBUG("Writing %d bytes to table %s key %s, CAS = %ld, expiry = %d", data.length(), table.c_str(), key.c_str(), cas, expiry); // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); int vbucket = vbucket_for_key(fqkey); const std::vector<memcached_st*>& replicas = get_replicas(vbucket, Op::WRITE); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_SET_START, 0); start.add_var_param(fqkey); start.add_var_param(data); start.add_static_param(cas); start.add_static_param(expiry); SAS::report_event(start); } TRC_DEBUG("%d write replicas for key %s", replicas.size(), fqkey.c_str()); // Calculate a timestamp (least-significant 32 bits of milliseconds since the // epoch) for the current time. We store this in the flags field to allow us // to resolve conflicts when resynchronizing between memcached servers. struct timespec ts; (void)clock_gettime(CLOCK_REALTIME, &ts); uint32_t flags = (uint32_t)((ts.tv_sec * 1000) + (ts.tv_nsec / 1000000)); // Memcached uses a flexible mechanism for specifying expiration. // - 0 indicates never expire. // - <= MEMCACHED_EXPIRATION_MAXDELTA indicates a relative (delta) time. // - > MEMCACHED_EXPIRATION_MAXDELTA indicates an absolute time. // Absolute time is the only way to force immediate expiry. Unfortunately, // it's not reliable - see https://github.com/Metaswitch/cpp-common/issues/160 // for details. Instead, we use relative time for future times (expiry > 0) // and the earliest absolute time for immediate expiry (expiry == 0). time_t memcached_expiration = (time_t)((expiry > 0) ? expiry : MEMCACHED_EXPIRATION_MAXDELTA + 1); // First try to write the primary data record to the first responding // server. memcached_return_t rc = MEMCACHED_ERROR; size_t ii; size_t replica_idx; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2: replicas.size(); for (ii = 0; ii < attempts; ++ii) { if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a transient server failure, so we // shouldn't retry. break; } replica_idx = 0; TRC_WARNING("Failed to write to sole memcached replica: retrying once"); } else { replica_idx = ii; } TRC_DEBUG("Attempt conditional write to vbucket %d on replica %d (connection %p), CAS = %ld, expiry = %d", vbucket, replica_idx, replicas[replica_idx], cas, expiry); if (cas == 0) { // New record, so attempt to add (but overwrite any tombstones we // encounter). This will fail if someone else got there first and some // data already exists in memcached for this key. rc = add_overwriting_tombstone(replicas[replica_idx], key_ptr, key_len, vbucket, data, memcached_expiration, flags, trail); } else { // This is an update to an existing record, so use memcached_cas // to make sure it is atomic. rc = memcached_cas_vb(replicas[replica_idx], key_ptr, key_len, _binary ? vbucket : 0, data.data(), data.length(), memcached_expiration, flags, cas); if (!memcached_success(rc)) { TRC_DEBUG("memcached_cas command failed, rc = %d (%s)\n%s", rc, memcached_strerror(replicas[replica_idx], rc), memcached_last_error_message(replicas[replica_idx])); } } if (memcached_success(rc)) { TRC_DEBUG("Conditional write succeeded to replica %d", replica_idx); break; } else if ((rc == MEMCACHED_NOTSTORED) || (rc == MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_CONTENTION, 0); err.add_var_param(fqkey); SAS::report_event(err); } // A NOT_STORED or EXISTS response indicates a concurrent write failure, // so return this to the application immediately - don't go on to // other replicas. TRC_INFO("Contention writing data for %s to store", fqkey.c_str()); status = Store::Status::DATA_CONTENTION; break; } } if ((rc == MEMCACHED_SUCCESS) && (replica_idx < replicas.size())) { // Write has succeeded, so write unconditionally (and asynchronously) // to the replicas. for (size_t jj = replica_idx + 1; jj < replicas.size(); ++jj) { TRC_DEBUG("Attempt unconditional write to replica %d", jj); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 1); memcached_set_vb(replicas[jj], key_ptr, key_len, _binary ? vbucket : 0, data.data(), data.length(), memcached_expiration, flags); memcached_behavior_set(replicas[jj], MEMCACHED_BEHAVIOR_NOREPLY, 0); } } if ((!memcached_success(rc)) && (rc != MEMCACHED_NOTSTORED) && (rc != MEMCACHED_DATA_EXISTS)) { if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_SET_FAILED, 0); err.add_var_param(fqkey); SAS::report_event(err); } update_vbucket_comm_state(vbucket, FAILED); if (_comm_monitor) { _comm_monitor->inform_failure(); } TRC_ERROR("Failed to write data for %s to %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; } else { update_vbucket_comm_state(vbucket, OK); if (_comm_monitor) { _comm_monitor->inform_success(); } } return status; }
/// Retrieve the data for a given namespace and key. Store::Status BaseMemcachedStore::get_data(const std::string& table, const std::string& key, std::string& data, uint64_t& cas, SAS::TrailId trail) { Store::Status status; // Construct the fully qualified key. std::string fqkey = table + "\\\\" + key; const char* key_ptr = fqkey.data(); const size_t key_len = fqkey.length(); int vbucket = vbucket_for_key(fqkey); const std::vector<memcached_st*>& replicas = get_replicas(vbucket, Op::READ); if (trail != 0) { SAS::Event start(trail, SASEvent::MEMCACHED_GET_START, 0); start.add_var_param(fqkey); SAS::report_event(start); } TRC_DEBUG("%d read replicas for key %s", replicas.size(), fqkey.c_str()); // Read from all replicas until we get a positive result. memcached_return_t rc = MEMCACHED_ERROR; bool active_not_found = false; size_t failed_replicas = 0; size_t ii; // If we only have one replica, we should try it twice - // libmemcached won't notice a dropped TCP connection until it tries // to make a request on it, and will fail the request then // reconnect, so the second attempt could still work. size_t attempts = (replicas.size() == 1) ? 2 : replicas.size(); for (ii = 0; ii < attempts; ++ii) { size_t replica_idx; if ((replicas.size() == 1) && (ii == 1)) { if (rc != MEMCACHED_CONNECTION_FAILURE) { // This is a legitimate error, not a server failure, so we // shouldn't retry. break; } replica_idx = 0; TRC_WARNING("Failed to read from sole memcached replica: retrying once"); } else { replica_idx = ii; } TRC_DEBUG("Attempt to read from replica %d (connection %p)", replica_idx, replicas[replica_idx]); rc = get_from_replica(replicas[replica_idx], key_ptr, key_len, data, cas); if (memcached_success(rc)) { // Got data back from this replica. Don't try any more. TRC_DEBUG("Read for %s on replica %d returned SUCCESS", fqkey.c_str(), replica_idx); break; } else if (rc == MEMCACHED_NOTFOUND) { // Failed to find a record on an active replica. Flag this so if we do // find data on a later replica we can reset the cas value returned to // zero to ensure a subsequent write will succeed. TRC_DEBUG("Read for %s on replica %d returned NOTFOUND", fqkey.c_str(), replica_idx); active_not_found = true; } else { // Error from this node, so consider it inactive. TRC_DEBUG("Read for %s on replica %d returned error %d (%s)", fqkey.c_str(), replica_idx, rc, memcached_strerror(replicas[replica_idx], rc)); ++failed_replicas; } } if (memcached_success(rc)) { if (data != TOMBSTONE) { if (trail != 0) { SAS::Event got_data(trail, SASEvent::MEMCACHED_GET_SUCCESS, 0); got_data.add_var_param(fqkey); got_data.add_var_param(data); got_data.add_static_param(cas); SAS::report_event(got_data); } // Return the data and CAS value. The CAS value is either set to the CAS // value from the result, or zero if an earlier active replica returned // NOT_FOUND. This ensures that a subsequent set operation will succeed // on the earlier active replica. if (active_not_found) { cas = 0; } TRC_DEBUG("Read %d bytes from table %s key %s, CAS = %ld", data.length(), table.c_str(), key.c_str(), cas); status = Store::OK; } else { if (trail != 0) { SAS::Event got_tombstone(trail, SASEvent::MEMCACHED_GET_TOMBSTONE, 0); got_tombstone.add_var_param(fqkey); got_tombstone.add_static_param(cas); SAS::report_event(got_tombstone); } // We have read a tombstone. Return NOT_FOUND to the caller, and also // zero out the CAS (returning a zero CAS makes the interface cleaner). TRC_DEBUG("Read tombstone from table %s key %s, CAS = %ld", table.c_str(), key.c_str(), cas); cas = 0; status = Store::NOT_FOUND; } // Regardless of whether we got a tombstone, the vbucket is alive. update_vbucket_comm_state(vbucket, OK); if (_comm_monitor) { _comm_monitor->inform_success(); } } else if (failed_replicas < replicas.size()) { // At least one replica returned NOT_FOUND. if (trail != 0) { SAS::Event not_found(trail, SASEvent::MEMCACHED_GET_NOT_FOUND, 0); not_found.add_var_param(fqkey); SAS::report_event(not_found); } TRC_DEBUG("At least one replica returned not found, so return NOT_FOUND"); status = Store::Status::NOT_FOUND; update_vbucket_comm_state(vbucket, OK); if (_comm_monitor) { _comm_monitor->inform_success(); } } else { // All replicas returned an error, so log the error and return the // failure. if (trail != 0) { SAS::Event err(trail, SASEvent::MEMCACHED_GET_ERROR, 0); err.add_var_param(fqkey); SAS::report_event(err); } TRC_ERROR("Failed to read data for %s from %d replicas", fqkey.c_str(), replicas.size()); status = Store::Status::ERROR; update_vbucket_comm_state(vbucket, FAILED); if (_comm_monitor) { _comm_monitor->inform_failure(); } } return status; }
/// Gets the set of replicas to use for a read or write operation for the /// specified key. const std::vector<memcached_st*>& BaseMemcachedStore::get_replicas(const std::string& key, Op operation) { return get_replicas(vbucket_for_key(key), operation); }