store_t* open_mmap_store(const char* base_dir, const char* name, int flags) { int dir_fd = open(base_dir, O_DIRECTORY, (mode_t)0600); if (dir_fd == -1) return NULL; int real_fd = openat(dir_fd, name, O_RDWR, (mode_t)0600); ensure(real_fd > 0, "Failed to open mmap store file"); close(dir_fd); struct stat sb; int ret = fstat(real_fd, &sb); ensure(ret != -1, "Failed to fstat file"); int size = sb.st_size; // This is nearly identical to the create_mmap_store. Maybe should make an "init mmap store" or // something? struct mmap_store *store = (struct mmap_store*) calloc(1, sizeof(struct mmap_store)); if (store == NULL) return NULL; void *mapping = mmap(NULL, (size_t) size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE | MAP_NONBLOCK , real_fd, 0); if (mapping == NULL) return NULL; madvise(mapping, size, MADV_SEQUENTIAL); uint32_t off = sizeof(uint32_t) * 2; ensure(((uint32_t *)mapping)[0] == 0xDEADBEEF, "Magic number does not match. Bad file format"); ensure(((uint32_t *)mapping)[1] == size, "Size recorded does not match file size. Bad file format"); ensure(asprintf(&(store->filename), "%s/%s", base_dir, name) > 0, "Failed to allocate store filename"); store->fd = real_fd; store->capacity = size; store->flags = flags; store->mapping = mapping; // These don't really matter because writers aren't allowed... ck_pr_store_32(&store->write_cursor, off); ck_pr_store_32(&store->last_sync, 0); ck_pr_store_32(&store->read_cursor, -1); // We infer that this store has been synced... 
ck_pr_store_32(&store->syncing_and_writers, 0x80000000U); ck_pr_store_32(&store->synced, 1); ck_pr_fence_atomic(); ensure(msync(mapping, off, MS_SYNC) == 0, "Unable to sync"); ensure(store->write_cursor != 0, "Cursor incorrect"); ((store_t *)store)->write = &_mmap_write; ((store_t *)store)->open_cursor = &_mmap_open_cursor; ((store_t *)store)->pop_cursor = &_mmap_pop_cursor; ((store_t *)store)->capacity = &_mmap_capacity; ((store_t *)store)->cursor = &_mmap_cursor; ((store_t *)store)->start_cursor = &_mmap_start_cursor; ((store_t *)store)->sync = &_mmap_sync; ((store_t *)store)->close = &_mmap_close; ((store_t *)store)->destroy = &_mmap_destroy; return (store_t *)store; }
store_t* create_mmap_store(uint32_t size, const char* base_dir, const char* name, int flags) { //TODO : Enforce a max size //TODO : Check flags //TODO : check thread sanity //TODO : check size is near a page int dir_fd = open(base_dir, O_DIRECTORY, (mode_t)0600); if (dir_fd == -1) return NULL; int real_fd = openat(dir_fd, name, O_RDWR | O_CREAT, (mode_t)0600); close(dir_fd); // TODO - Check for the race condition if two people attempt to create // the same segment if (real_fd == -1) return NULL; if (posix_fallocate(real_fd, 0, size) != 0) { close(real_fd); return NULL; } struct mmap_store *store = (struct mmap_store*) calloc(1, sizeof(struct mmap_store)); if (store == NULL) return NULL; void *mapping = mmap(NULL, (size_t) size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE | MAP_NONBLOCK , real_fd, 0); if (mapping == NULL) return NULL; madvise(mapping, size, MADV_SEQUENTIAL); uint32_t off = sizeof(uint32_t) * 2; ((uint32_t *)mapping)[0] = 0xDEADBEEF; ((uint32_t *)mapping)[1] = size; store->fd = real_fd; store->capacity = size; store->flags = flags; store->mapping = mapping; ck_pr_store_32(&store->write_cursor, off); ck_pr_store_32(&store->sync_cursor, off); ck_pr_fence_atomic(); ensure(msync(mapping, off, MS_SYNC) == 0, "Unable to sync"); ensure(store->write_cursor != 0, "Cursor incorrect"); ensure(store->sync_cursor != 0, "Cursor incorrect"); ((store_t *)store)->write = &_mmap_write; ((store_t *)store)->open_cursor = &_mmap_open_cursor; ((store_t *)store)->capacity = &_mmap_capacity; ((store_t *)store)->cursor = &_mmap_cursor; ((store_t *)store)->sync = &_mmap_sync; ((store_t *)store)->close = &_mmap_close; ((store_t *)store)->destroy = &_mmap_destroy; return (store_t *)store; }
/**
 * Create a non-blocking socket for `node` and start connecting it, trying the
 * node's current address first and then every other known address.
 *
 * On success the result of as_node_authenticate_connection() is returned.
 * When no connection could be started the socket is closed, *fd is set to -1
 * and CITRUSLEAF_FAIL_UNAVAILABLE is returned. When the socket itself cannot
 * be created CITRUSLEAF_FAIL_CLIENT is returned.
 */
static int as_node_create_connection(as_node* node, int* fd) {
    *fd = cf_socket_create_nb();

    if (*fd == -1) {
        // Socket creation is a local failure, not a server one.
        cf_debug("Socket create failed for %s", node->name);
        return CITRUSLEAF_FAIL_CLIENT;
    }

    as_vector* addrs = &node->addresses;
    as_address* preferred = as_vector_get(addrs, node->address_index);

    // First choice: the address the node currently points at.
    if (cf_socket_start_connect_nb(*fd, &preferred->addr) == 0) {
        return as_node_authenticate_connection(node, fd);
    }

    // Fall back to the remaining addresses in order.
    for (uint32_t idx = 0; idx < addrs->size; idx++) {
        as_address* candidate = as_vector_get(addrs, idx);

        // Entries live in the alias array, so comparing pointers identifies
        // the address that was already tried.
        if (candidate == preferred) {
            continue;
        }

        if (cf_socket_start_connect_nb(*fd, &candidate->addr) != 0) {
            continue;
        }

        // Remember the working alias as the new preferred address. This is
        // only a hint; other threads may observe the change later.
        cf_debug("Change node address %s %s:%d", node->name, candidate->name,
                 (int)cf_swap_from_be16(candidate->addr.sin_port));
        ck_pr_store_32(&node->address_index, idx);
        return as_node_authenticate_connection(node, fd);
    }

    // No address accepted the connection attempt: give the socket back.
    cf_info("Failed to connect: %s %s:%d", node->name, preferred->name,
            (int)cf_swap_from_be16(preferred->addr.sin_port));
    cf_close(*fd);
    *fd = -1;
    return CITRUSLEAF_FAIL_UNAVAILABLE;
}
store_t* create_mmap_store(uint32_t size, const char* base_dir, const char* name, int flags) { //TODO : Enforce a max size //TODO : Check flags //TODO : check thread sanity //TODO : check size is near a page int dir_fd = open(base_dir, O_DIRECTORY, (mode_t)0600); if (dir_fd == -1) return NULL; int openat_flags = O_RDWR | O_CREAT | O_SYNC; if (flags & DELETE_IF_EXISTS) { openat_flags = openat_flags | O_TRUNC; } else { openat_flags = openat_flags | O_EXCL; } int real_fd = openat(dir_fd, name, openat_flags, (mode_t)0600); close(dir_fd); // TODO - Check for the race condition if two people attempt to create // the same segment if (real_fd == -1) { // TODO: This is a terrible hack. We need to fix the error handling, but for now, actually // warn us if we are failing because of loading a garbage file. ensure(errno != EEXIST, "Failed to create mmap store because file already exists"); return NULL; } if (posix_fallocate(real_fd, 0, size) != 0) { close(real_fd); return NULL; } struct mmap_store *store = (struct mmap_store*) calloc(1, sizeof(struct mmap_store)); if (store == NULL) return NULL; void *mapping = mmap(NULL, (size_t) size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE | MAP_NONBLOCK , real_fd, 0); if (mapping == NULL) return NULL; madvise(mapping, size, MADV_SEQUENTIAL); uint32_t off = sizeof(uint32_t) * 2; ((uint32_t *)mapping)[0] = 0xDEADBEEF; ((uint32_t *)mapping)[1] = size; ensure(asprintf(&(store->filename), "%s/%s", base_dir, name) > 0, "Failed to allocate store filename"); store->fd = real_fd; store->capacity = size; store->flags = flags; store->mapping = mapping; ck_pr_store_32(&store->write_cursor, off); ck_pr_store_32(&store->last_sync, 0); ck_pr_store_32(&store->read_cursor, -1); ck_pr_store_32(&store->syncing_and_writers, 0); ck_pr_store_32(&store->synced, 0); ck_pr_fence_atomic(); ensure(msync(mapping, off, MS_SYNC) == 0, "Unable to sync"); ensure(store->write_cursor != 0, "Cursor incorrect"); ((store_t *)store)->write = &_mmap_write; ((store_t 
*)store)->open_cursor = &_mmap_open_cursor; ((store_t *)store)->pop_cursor = &_mmap_pop_cursor; ((store_t *)store)->capacity = &_mmap_capacity; ((store_t *)store)->cursor = &_mmap_cursor; ((store_t *)store)->start_cursor = &_mmap_start_cursor; ((store_t *)store)->sync = &_mmap_sync; ((store_t *)store)->close = &_mmap_close; ((store_t *)store)->destroy = &_mmap_destroy; return (store_t *)store; }
/**
 * Flush this store to disk.
 *
 * Marks the store as syncing, waits until no writers remain, then msync()s the
 * written portion of the mapping and fsync()s the backing file.
 *
 * return
 *  0 - success (failures are reported through ensure(); this code returns no
 *      other value)
 */
uint32_t _mmap_sync(store_t *store) {
    struct mmap_store *mstore = (struct mmap_store*) store;

    // How far the store has been written so far.
    uint32_t cursor = ck_pr_load_32(&mstore->write_cursor);
    ensure(cursor > sizeof(uint32_t) * 2, "Attempted to sync an empty store");

    // No write may be in flight while we sync. The "syncing" bit and the
    // writer count share one 32-bit word, so the protocol is:
    //   1. load the combined word
    //   2. compute the word with the syncing bit set
    //   3. publish it with compare-and-swap (retry from 1 on contention)
    //   4. keep polling until the writer count reads zero
    for (;;) {
        // 1.
        uint32_t state = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t is_syncing = EXTRACT_SYNCING(state);
        uint32_t active_writers = EXTRACT_WRITERS(state);

        // Sanity check on the writer count so the writer bits cannot overflow.
        // NOTE(review): 0xEFFFFFFFU does not look like a 31-bit maximum
        // (0x7FFFFFFF) - confirm the intended bound.
        ensure(active_writers < 0xEFFFFFFFU, "Too many writers");

        // 2. + 3. Only attempt the swap when the bit is not already set; a
        // lost race means the word changed, so start over.
        if (is_syncing == 0 &&
            !ck_pr_cas_32(&mstore->syncing_and_writers, state, SET_SYNCING(state))) {
            continue;
        }

        // 4.
        if (active_writers == 0) {
            break;
        }
    }

    // Re-read the cursor now that writers have drained.
    cursor = ck_pr_load_32(&mstore->write_cursor);

    // Perform the actual flush of everything written so far.
    //TODO: Protect the nearest page once sunk
    //mprotect(mapping, off, PROT_READ);
    ensure(msync(mstore->mapping, cursor, MS_SYNC) == 0, "Unable to msync");
    ensure(fsync(mstore->fd) == 0, "Unable to fsync");

    // Publish the successful sync so that readers may proceed.
    ck_pr_store_32(&mstore->synced, 1);

    // Verify the invariants we relied on still hold.
    uint32_t final_state = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t final_syncing = EXTRACT_SYNCING(final_state);
    uint32_t final_writers = EXTRACT_WRITERS(final_state);

    ensure(final_writers == 0, "We should not have synced the store when there are still writers");
    ensure(final_syncing == 1, "We should not have synced the store when we did not mark it as syncing");

    return 0;
}
/**
 * Start a non-blocking connection for an async command.
 *
 * A socket is created for `cmd` and connect() is attempted on the node's
 * current address, then on each remaining address. A connect() that succeeds
 * immediately or reports EINPROGRESS counts as started, and the watcher is
 * initialised on that socket. If every address fails, the command is failed
 * with AEROSPIKE_ERR_ASYNC_CONNECTION.
 */
static void as_ev_connect(as_event_command* cmd) {
    int sock = as_event_create_socket(cmd);

    if (sock < 0) {
        return;
    }

    as_node* node = cmd->node;
    as_vector* addrs = &node->addresses;
    as_address* preferred = as_vector_get(addrs, node->address_index);

    // First choice: the node's current address. Either an immediate success
    // or an in-progress connection lets us hand the socket to the watcher.
    if (connect(sock, (struct sockaddr*)&preferred->addr, sizeof(struct sockaddr)) == 0 ||
        errno == EINPROGRESS) {
        as_ev_watcher_init(cmd, sock);
        return;
    }

    // Fall back to the remaining addresses in order.
    for (uint32_t idx = 0; idx < addrs->size; idx++) {
        as_address* candidate = as_vector_get(addrs, idx);

        // Entries live in the alias array, so comparing pointers identifies
        // the address that was already tried.
        if (candidate == preferred) {
            continue;
        }

        int rc = connect(sock, (struct sockaddr*)&candidate->addr, sizeof(struct sockaddr));

        if (rc != 0 && errno != EINPROGRESS) {
            continue;
        }

        // Connected, or connection still in progress: remember the working
        // alias as the new preferred address. This is only a hint; other
        // threads may observe the change later.
        as_log_debug("Change node address %s %s:%d", node->name, candidate->name,
                     (int)cf_swap_from_be16(candidate->addr.sin_port));
        ck_pr_store_32(&node->address_index, idx);
        as_ev_watcher_init(cmd, sock);
        return;
    }

    // No address let us start a connection.
    as_error err;
    as_error_update(&err, AEROSPIKE_ERR_ASYNC_CONNECTION, "Failed to connect: %s %s:%d",
                    node->name, preferred->name,
                    (int)cf_swap_from_be16(preferred->addr.sin_port));
    as_event_connect_error(cmd, &err, sock);
}
/**
 * Atomically replace the value of a persistent counter and persist it.
 *
 * Under the counter's write lock, the in-memory value is changed from
 * old_value to new_value and then written to disk via a temporary file that
 * is linked over the original file name. If persisting fails, the in-memory
 * value is restored to old_value.
 *
 * return
 *   0 - success
 *  -1 - the current value was not old_value; nothing was changed
 *  -2 - the temporary file could not be created, fully written or fsync'ed
 *  -3 - the original file could not be unlinked
 *  -4 - the temporary file could not be linked to the original name
 *  -5 - the update succeeded but the temporary file could not be removed
 *  (non-zero persistence failures also trip the final ensure())
 */
int _compare_and_swap(persistent_atomic_value_t *pav, uint32_t old_value, uint32_t new_value) {
    // First lock this counter
    ck_rwlock_write_lock(pav->_lock);

    // Then, check to see if someone changed this value before we got here
    if (ck_pr_load_32(&pav->_current_value) != old_value) {
        ck_rwlock_write_unlock(pav->_lock);
        return -1;
    }

    // We got here first. Set the new value.
    ck_pr_store_32(&pav->_current_value, new_value);

    // Now, persist the value
    // 1. Write it to a temporary file
    // 2. Delete the original file
    // 3. Link the temporary file to the original file
    // 4. Unlink the temporary file
    int fail = 0;

    // 1.
    int open_flags = O_RDWR | O_CREAT | O_EXCL | O_SYNC;
    int fd = open(pav->_temporary_filename, open_flags, (mode_t)0600);
    if (fd < 0) {
        fail = -2;
        goto end;
    }

    ssize_t nwritten = write(fd, &pav->_current_value, sizeof(pav->_current_value));
    // A short write would persist a truncated value, so treat anything other
    // than a complete write as a failure.
    if (nwritten != (ssize_t) sizeof(pav->_current_value)) {
        fail = -2;
        close(fd);
        goto end;
    }

    if (fsync(fd) != 0) {
        fail = -2;
        close(fd);
        goto end;
    }
    close(fd);

    // 2.
    // NOTE(review): between steps 2 and 3 the original file does not exist; a
    // failure in step 3 leaves no persisted value on disk.
    if (unlink(pav->_filename) != 0) {
        fail = -3;
        goto end;
    }

    // 3.
    if (link(pav->_temporary_filename, pav->_filename) != 0) fail = -4;

end:
    // 4. Always try to remove the temporary file, but keep the first error
    // code: a cleanup failure must not mask the real cause.
    if (unlink(pav->_temporary_filename) != 0 && fail == 0) fail = -5;

    if (fail != 0) {
        // Roll the in-memory value back so it matches what callers expect.
        ck_pr_store_32(&pav->_current_value, old_value);
    }

    ck_rwlock_write_unlock(pav->_lock);

    // For now
    ensure(fail == 0, "Failed during persistent update");
    return fail;
}