static inline void
fq_push_free_message_stack(struct free_message_stack *stack, fq_msg *m) {
  if (stack == NULL) {
    return;
  }
  // Trim the stack down to max_size before pushing, freeing surplus messages.
  // (The popped message gets its own name here; the original shadowed `m`.)
  while (ck_pr_load_32(&stack->size) > stack->max_size) {
    ck_stack_entry_t *ce = ck_stack_pop_mpmc(&stack->stack);
    if (ce != NULL) {
      fq_msg *old = container_of(ce, fq_msg, cleanup_stack_entry);
      free(old);
      ck_pr_dec_32(&stack->size);
    } else {
      break;
    }
  }
  uint32_t c = ck_pr_load_32(&stack->size);
  if (c >= stack->max_size) {
    free(m);
    return;
  }
  ck_pr_inc_32(&stack->size);
  ck_stack_push_mpmc(&stack->stack, &m->cleanup_stack_entry);
}
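/*
 * A minimal sketch of the types the function above assumes; the real
 * definitions live elsewhere in fq. `container_of` is the usual
 * pointer-arithmetic macro for recovering the enclosing struct from an
 * embedded member, shown here only for context.
 */
#include <stddef.h>  /* offsetof */

struct free_message_stack {
    ck_stack_t stack;     /* lock-free MPMC stack of reusable messages */
    uint32_t   size;      /* current depth, maintained with ck_pr_* ops */
    uint32_t   max_size;  /* soft cap; surplus messages are free()d */
};

#ifndef container_of
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))
#endif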
/*
 * Free a segment. We are assuming the list is either locked or being accessed
 * from a single-threaded context.
 */
int _free_segment_inlock(segment_list_t *segment_list, uint32_t segment_number,
                         bool destroy_store) {
    segment_t *segment = __segment_number_to_segment(segment_list, segment_number);

    ensure(__is_segment_number_in_segment_list_inlock(segment_list, segment_number),
           "Attempted to destroy a segment not in the list");
    ensure(segment->state != FREE,
           "Attempted to destroy segment already in the FREE state");
    ensure(segment->state != CLOSED,
           "Attempted to destroy segment already in the CLOSED state");
    ensure(segment->segment_number == segment_number,
           "Attempted to destroy uninitialized segment");
    ensure(segment->store != NULL,
           "Attempted to destroy segment with null store");
    ensure(ck_pr_load_32(&segment->refcount) == 0,
           "Attempted to destroy segment with non-zero refcount");

    if (destroy_store) {
        segment->store->destroy(segment->store);
        segment->state = FREE;
    }
    else {
        segment->store->close(segment->store, 1);
        segment->state = CLOSED;
    }

    // Zero out the segment we just freed for debugging
    segment->store = NULL;
    segment->segment_number = 0;

    return 0;
}
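/*
 * A sketch of the `ensure` macro these excerpts lean on; the project's real
 * definition may differ. The assumption, based on how it is used (including as
 * a statement wrapping calls with side effects), is assert-like semantics that
 * stay active in release builds: abort with a message when the invariant fails.
 */
#include <stdio.h>
#include <stdlib.h>

#define ensure(cond, msg)                                 \
    do {                                                  \
        if (!(cond)) {                                    \
            fprintf(stderr, "fatal: %s (%s:%d)\n",        \
                    (msg), __FILE__, __LINE__);           \
            abort();                                      \
        }                                                 \
    } while (0)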
void __mmap_schedule_store_sync(struct mmap_store *mstore, uint32_t write_cursor) {
    // Lock free but not wait free
    uint32_t *sync_cursor = &mstore->sync_cursor;
    uint32_t sync_pos = ck_pr_load_32(sync_cursor);

    // TODO - Add in the sync flags for allowing things like dirty reads
    //TODO: Protect the nearest page once synced
    //mprotect(mapping, off, PROT_READ);

    // Once we are more than 4 KiB ahead of the sync cursor, schedule an async flush
    if (write_cursor - sync_pos > (4 * 1024)) {
        int sync_distance = write_cursor - sync_pos;
        msync(mstore->mapping + sync_pos, sync_distance, MS_ASYNC);
    }

    // Once we are more than 64 MiB ahead, force an fsync and advance the cursor
    if (write_cursor - sync_pos > (64 * 1024 * 1024)) {
        fsync(mstore->fd);

        // Try to write the new cursor; give up if we lose the race
        ck_pr_cas_32(sync_cursor, sync_pos, write_cursor);
    }
}
enum store_read_status __mmap_cursor_position(struct mmap_store_cursor *cursor,
                                              uint32_t offset) {
    ensure(cursor->store != NULL, "Broken cursor");

    // If a user calls this before any thread has called sync, that is a
    // programming error.
    // TODO: Make this a real error. An assert for now, just for debugging.
    ensure(EXTRACT_SYNCING(ck_pr_load_32(&cursor->store->syncing_and_writers)) == 1,
           "Attempted to seek a cursor on a store before sync has been called");

    // Calling read before a store has finished syncing, however, may be more of
    // a race condition, so be nicer about it and just tell the caller to try
    // again.
    if (ck_pr_load_32(&cursor->store->synced) != 1) {
        // We are trying to read from a store that has not yet been synced,
        // which is not (yet?) supported.
        return UNSYNCED_STORE;
    }

    // The read is clearly out of bounds for this store. Note that we need at
    // least sizeof(uint32_t) bytes to continue, since for each write we write
    // at least a uint32_t to indicate the size of the block.
    if (offset + sizeof(uint32_t) > cursor->store->capacity) return OUT_OF_BOUNDS;

    void *src = (cursor->store->mapping + offset);
    uint32_t size = ((uint32_t*)src)[0];
    if (size == 0) return END; // We have reached the synthetic end of the data

    ensure(offset + size + sizeof(uint32_t) < cursor->store->capacity,
           "Found a block that runs over the end of our store");

    cursor->next_offset = (offset + sizeof(uint32_t) + size);
    ((store_cursor_t*)cursor)->offset = offset;

    // For now mmap cursors are read-forward only (sequential madvise), because
    // we want to encourage people to use the cursor sequentially
    if (cursor->next_offset <= offset) return INVALID_SEEK_DIRECTION;

    ((store_cursor_t*)cursor)->size = size;
    ((store_cursor_t*)cursor)->data = src + sizeof(uint32_t);
    return SUCCESS;
}
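/*
 * A sketch of the on-disk framing the cursor above parses, inferred from the
 * read and write paths in these excerpts rather than from a spec. Each write
 * is a length-prefixed block, and a zero length prefix marks the synthetic
 * end of data (the mapping is zero-filled past the last write):
 *
 *   [uint32_t size][size bytes] [uint32_t size][size bytes] ... [uint32_t 0]
 *
 * The helper below is purely illustrative; its name is not from the project.
 */
#include <string.h>  /* memcpy */

static inline uint32_t frame_next_offset(const void *mapping, uint32_t offset) {
    uint32_t size;
    memcpy(&size, (const char *)mapping + offset, sizeof(size));
    if (size == 0) return 0;  /* END sentinel */
    return offset + (uint32_t)sizeof(uint32_t) + size;
}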
int _segment_list_close_segment(struct segment_list *segment_list, uint32_t segment_number) {
    // Take out a write lock so we are mutually exclusive with get_segment
    ck_rwlock_write_lock(segment_list->lock);

    segment_t *segment = __segment_number_to_segment(segment_list, segment_number);

    // Check the refcount, and fail to close the segment if the refcount is not zero
    if (ck_pr_load_32(&segment->refcount) != 0) {
        // TODO: More specific error
        ck_rwlock_write_unlock(segment_list->lock);
        return -1;
    }

    // Do not close a segment twice
    if (segment->state == CLOSED) {
        // TODO: More specific error
        ck_rwlock_write_unlock(segment_list->lock);
        return -1;
    }

    // This function may be called when a segment is in the FREE state or the
    // READING state. This can happen in the lock-free synchronization built on
    // top of this structure, since a slow thread with an old segment number
    // might get here after other threads have advanced past this segment.
    // Since this is valid, just return an error so that the slow thread can
    // recover.
    if (segment->state != WRITING) {
        // TODO: More specific error
        ck_rwlock_write_unlock(segment_list->lock);
        return -1;
    }

    // Free the segment, but close the store rather than destroying it, because
    // we don't want to delete the on-disk store files
    ensure(_free_segment_inlock(segment_list, segment_number, false/*destroy_store*/) == 0,
           "Failed to internally destroy segment in close segment function");
    segment->state = CLOSED;

    ck_rwlock_write_unlock(segment_list->lock);
    return 0;
}
static as_status as_scan_parse_records(uint8_t* buf, size_t size, as_scan_task* task, as_error* err) {
    uint8_t* p = buf;
    uint8_t* end = buf + size;
    as_status status;

    while (p < end) {
        as_msg* msg = (as_msg*)p;
        as_msg_swap_header_from_be(msg);

        if (msg->result_code) {
            // Special case - if we scan a set name that doesn't exist on a
            // node, it will return "not found" - we unify this with the
            // case where OK is returned and no callbacks were made. [AKG]
            //
            // We are sending "no more records back" to the caller, which will
            // send OK to the main worker thread.
            if (msg->result_code == AEROSPIKE_ERR_RECORD_NOT_FOUND) {
                return AEROSPIKE_NO_MORE_RECORDS;
            }
            return as_error_set_message(err, msg->result_code, as_error_string(msg->result_code));
        }
        p += sizeof(as_msg);

        if (msg->info3 & AS_MSG_INFO3_LAST) {
            return AEROSPIKE_NO_MORE_RECORDS;
        }

        status = as_scan_parse_record(&p, msg, task);
        if (status != AEROSPIKE_OK) {
            return status;
        }

        if (ck_pr_load_32(task->error_mutex)) {
            err->code = AEROSPIKE_ERR_SCAN_ABORTED;
            return err->code;
        }
    }
    return AEROSPIKE_OK;
}
/*
 * This function attempts to free segments, and returns the number of the segment up to which we
 * have freed. These semantics are a little strange, because segment numbers are uint32_t and our
 * first segment is zero. We only want this function to return zero when we have not freed any
 * segments, but if we returned the last segment we freed, we would have to return zero after
 * we've freed segment 0. As it is now, we return 1 in that case, because having freed segment 0
 * means we've freed up to segment 1. A short illustration follows the function.
 */
uint32_t _segment_list_free_segments(struct segment_list *segment_list,
                                     uint32_t segment_number, bool destroy_store) {
    ck_rwlock_write_lock(segment_list->lock);

    // TODO: Think more carefully about what this function can return
    uint32_t freed_up_to = segment_list->tail;

    // Try to free as many segments as we can, up to the provided segment number
    while (segment_list->tail <= segment_number && segment_list->head != segment_list->tail) {
        segment_t *segment = __segment_number_to_segment(segment_list, segment_list->tail);

        // We should not be freeing a segment in the WRITING or CLOSED state
        ensure(segment->state == READING, "Attempted to free segment not in the READING state");

        // Do not free this segment if the refcount is not zero
        if (ck_pr_load_32(&segment->refcount) != 0) {
            // Do not try to free any more segments
            break;
        }

        // NOTE: the destroy_store parameter is currently shadowed by the hard-coded true below
        ensure(_free_segment_inlock(segment_list, segment->segment_number, true/*destroy_store*/) == 0,
               "Failed to internally destroy segment in free segments function");
        segment->state = FREE;

        // Move the tail up
        segment_list->tail++;

        // Record the segment we have freed up to
        freed_up_to = segment_list->tail;
    }

    ck_rwlock_write_unlock(segment_list->lock);

    return freed_up_to;
}
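/*
 * A hypothetical walk-through of the return semantics described above,
 * assuming a fresh list whose tail is 0 and whose segment 0 is in the READING
 * state with a zero refcount. The function name is illustrative only.
 */
void example_free_segments(segment_list_t *list) {
    // Frees segment 0 and advances the tail to 1, so this returns 1
    // ("freed up to segment 1"), never 0, even though the freed segment's
    // number was 0.
    uint32_t up_to = _segment_list_free_segments(list, 0, true);

    // Calling again with nothing left to free returns the unchanged tail (1).
    up_to = _segment_list_free_segments(list, 0, true);
    (void)up_to;
}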
/*
 * Write data into the store implementation.
 *
 * params
 *  *data - data to write
 *  size  - amount to write
 *
 * return
 *  the offset written to, or 0 if the store's capacity is exceeded
 */
uint32_t _mmap_write(store_t *store, void *data, uint32_t size) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    void *mapping = mstore->mapping;
    ensure(mapping != NULL, "Bad mapping");

    // Each record is framed as [uint32_t size][BYTES]
    uint32_t *write_cursor = &mstore->write_cursor;
    uint32_t required_size = (sizeof(uint32_t) + size);

    uint32_t cursor_pos = 0;
    uint32_t new_pos = 0;

    // Reserve space by advancing the write cursor with a CAS loop
    while (true) {
        cursor_pos = ck_pr_load_32(write_cursor);
        ensure(cursor_pos != 0, "Incorrect cursor pos");

        uint32_t remaining = mstore->capacity - cursor_pos;
        if (remaining <= required_size) {
            return 0;
        }

        new_pos = cursor_pos + required_size;
        if (ck_pr_cas_32(write_cursor, cursor_pos, new_pos)) {
            break;
        }
    }

    ensure(new_pos != 0, "Invalid write position");
    ensure(cursor_pos != 0, "Invalid cursor position");

    // Fill in the reserved region: size prefix first, then the payload
    void *dest = (mapping + cursor_pos);
    ((uint32_t*)dest)[0] = (uint32_t) size;
    dest += sizeof(uint32_t);
    memcpy(dest, data, size);

    __mmap_schedule_store_sync(mstore, cursor_pos);
    return cursor_pos;
}
/*
 * Write data into the store implementation (variant that coordinates with sync).
 *
 * params
 *  *data - data to write
 *  size  - amount to write
 *
 * return
 *  the offset written to, or 0 if the store is full or already syncing
 */
uint32_t _mmap_write(store_t *store, void *data, uint32_t size) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    void *mapping = mstore->mapping;
    ensure(mapping != NULL, "Bad mapping");

    // We must ensure that no writes are happening during a sync. To do this, we pack both the
    // "syncing" bit and the number of writers into the same 32 bit value.
    // 1. Load the "syncing_and_writers" value
    // 2. Check if "syncing" and abort if so
    // 3. Increment the number of writers
    // 4. Try to compare and swap this value
    // 5. Repeat if the CAS fails
    bool writers_incremented = false;
    while (!writers_incremented) {
        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Make sure we aren't already at the maximum number of writers. If we incremented past
        // that, we would overflow the 31 bits we are using to store the writer count.
        ensure(writers < 0xEFFFFFFFU, "Too many writers");

        // 2.
        if (syncing == 1) {
            return 0;
        }

        // 3.
        // 4.
        if (ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers,
                         syncing_and_writers + 1)) {
            writers_incremented = true;
        }
    }
    ensure(ck_pr_load_32(&mstore->synced) == 0,
           "A writer should not get here when the store is synced");

    // Each record is framed as [uint32_t size][BYTES]
    uint32_t *write_cursor = &mstore->write_cursor;
    uint32_t required_size = (sizeof(uint32_t) + size);

    uint32_t cursor_pos = 0;
    uint32_t new_pos = 0;
    uint32_t ret = 0;

    // Assert if we are trying to write a block larger than the capacity of this store and the
    // store is empty. This is to die fast in the case where we have a block that we could never
    // write to any store of this size.
    // TODO: Actually handle this case gracefully
    ensure(((mstore->capacity - store->start_cursor(store)) >= required_size) ||
           (ck_pr_load_32(write_cursor) != store->start_cursor(store)),
           "Attempting to write a block of data larger than the total capacity of our store");

    // Reserve space by advancing the write cursor with a CAS loop
    while (true) {
        cursor_pos = ck_pr_load_32(write_cursor);
        ensure(cursor_pos != 0, "Incorrect cursor pos");

        uint32_t remaining = mstore->capacity - cursor_pos;
        if (remaining <= required_size) {
            // TODO: Structure this code better. Right now this works because "ret" is still
            // zero, and we return zero in the case where our data couldn't be written because
            // the store was full.
            goto decrement_writers;
        }

        new_pos = cursor_pos + required_size;
        if (ck_pr_cas_32(write_cursor, cursor_pos, new_pos)) {
            break;
        }
    }

    ensure(new_pos != 0, "Invalid write position");
    ensure(cursor_pos != 0, "Invalid cursor position");

    // Fill in the reserved region: size prefix first, then the payload
    void *dest = (mapping + cursor_pos);
    ((uint32_t*)dest)[0] = (uint32_t) size;
    dest += sizeof(uint32_t);
    memcpy(dest, data, size);

    // If our new cursor is 1024 pages past where we last synced, try to schedule a sync
    // TODO: Make this tunable
    long page_size = sysconf(_SC_PAGESIZE);
    uint32_t last_sync = ck_pr_load_32(&mstore->last_sync);
    if (new_pos > last_sync + page_size * 1024) {
        ensure(last_sync % page_size == 0,
               "Last sync offset is not a multiple of page size, which is needed for msync");
        uint32_t page_aligned_new_pos = (new_pos - (new_pos % page_size));
        if (ck_pr_cas_32(&mstore->last_sync, last_sync, page_aligned_new_pos)) {
            // TODO: Sync the previous page too, since it may have gotten dirtied
            ensure(msync(mapping + last_sync, page_size * 1024, MS_ASYNC) == 0,
                   "Unable to sync");
        }
    }

    ensure(ck_pr_load_32(&mstore->synced) == 0,
           "A writer should not be here when the store is synced");

    // Return the position in the store that we wrote to
    // TODO: Clean up the error handling and return values for this function
    ret = cursor_pos;

    bool writers_decremented = false;
decrement_writers:
    // TODO: We need to initialize here, because jumping to this label skips the initializer
    // above and would leave writers_decremented uninitialized. Structure this function better.
    writers_decremented = false;

    // Decrement the number of writers to indicate that we are finished writing
    // 1. Load the "syncing_and_writers" value
    // 2. Decrement the number of writers
    // 3. Try to compare and swap this value
    // 4. Repeat if the CAS fails
    while (!writers_decremented) {
        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Invariants
        ensure(writers > 0, "Would decrement the number of writers below zero");
        ensure(ck_pr_load_32(&mstore->synced) == 0,
               "The sync should not have gone through since we are not done writing");

        // 2.
        // 3.
        if (ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers,
                         syncing_and_writers - 1)) {
            writers_decremented = true;
        }
    }

    return ret;
}
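/*
 * A minimal sketch of the bit-packing helpers the store code above assumes.
 * The real macros are defined elsewhere in the project; the only property the
 * algorithms rely on is a single "syncing" flag packed alongside a 31 bit
 * writer count in one 32 bit word, so one CAS can update both atomically.
 */
#define SYNCING_BIT          0x80000000U
#define EXTRACT_SYNCING(v)   (((v) & SYNCING_BIT) ? 1U : 0U)
#define EXTRACT_WRITERS(v)   ((v) & ~SYNCING_BIT)
#define SET_SYNCING(v)       ((v) | SYNCING_BIT)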
/**
 * Force this store to sync if needed
 *
 * return
 *  0 - success
 *  1 - failure
 */
uint32_t _mmap_sync(store_t *store) {
    struct mmap_store *mstore = (struct mmap_store*) store;

    // The point we have written up to
    uint32_t write_cursor = ck_pr_load_32(&mstore->write_cursor);
    ensure(write_cursor > sizeof(uint32_t) * 2, "Attempted to sync an empty store");

    // We must ensure that no writes are happening during a sync. To do this, we pack both the
    // "syncing" bit and the number of writers into the same 32 bit value.
    // 1. Load the "syncing_and_writers" value
    // 2. Set that we are syncing
    // 3. Try to compare and swap this value
    // 4. Repeat until "writers" == 0
    while (1) {
        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Sanity-check the writer count against the 31 bits we use to store it
        ensure(writers < 0xEFFFFFFFU, "Too many writers");

        // 2.
        // 3.
        if (syncing == 0) {
            if (!ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers,
                              SET_SYNCING(syncing_and_writers))) {
                continue;
            }
        }

        // 4.
        if (writers == 0) {
            break;
        }
    }

    // The point we have written up to
    write_cursor = ck_pr_load_32(&mstore->write_cursor);

    // Actually sync. At this point we are guaranteed there are no writers, so sync the entire
    // store.
    //TODO: Protect the nearest page once synced
    //mprotect(mapping, off, PROT_READ);
    ensure(msync(mstore->mapping, write_cursor, MS_SYNC) == 0, "Unable to msync");
    ensure(fsync(mstore->fd) == 0, "Unable to fsync");

    // Record that we synced successfully. This will allow readers to progress.
    ck_pr_store_32(&mstore->synced, 1);

    uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
    uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);
    ensure(writers == 0, "We should not have synced the store when there are still writers");
    ensure(syncing == 1, "We should not have synced the store when we did not mark it as syncing");

    return 0;
}
/**
 * Return the cursor of where the store has been
 * written up to
 */
uint32_t _mmap_cursor(store_t *store) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    return ck_pr_load_32(&mstore->write_cursor);
}
store_cursor_t* _mmap_pop_cursor(store_t *store) {
    // This is really an mmap store
    struct mmap_store *mstore = (struct mmap_store*) store;

    // Assert invariants
    uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
    uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);
    ensure(writers == 0, "We should not be reading the store when there are still writers");
    ensure(syncing == 1, "We should not be reading the store before it has started syncing");
    ensure(ck_pr_load_32(&mstore->synced) == 1,
           "We should not be reading the store before it has been synced");

    // Open a blank cursor
    struct mmap_store_cursor* cursor = (struct mmap_store_cursor*) _mmap_open_cursor(store);

    // Save the current offset so we can try to CAS later
    uint32_t current_offset = ck_pr_load_32(&mstore->read_cursor);

    // If the first cursor has not been returned yet (read_cursor still holds
    // the (uint32_t)-1 sentinel), don't advance. Instead, seek to the beginning.
    if (current_offset == (uint32_t)-1) {
        uint32_t next_offset = store->start_cursor(store);

        // Seek to the read offset
        enum store_read_status ret = _mmap_cursor_seek((store_cursor_t*) cursor, next_offset);
        ensure(ret != END, "Failed to seek due to empty store");
        ensure(ret != UNSYNCED_STORE, "Failed to seek due to unsynced store");
        ensure(ret == SUCCESS, "Failed to seek");

        // Set the read cursor. Note we are setting it to the offset of the thing we are
        // reading, because of the logic below
        if (ck_pr_cas_32(&mstore->read_cursor, current_offset, next_offset)) {
            return (store_cursor_t*) cursor;
        }

        // If we failed to CAS, reload the current offset and drop down to the normal logic below
        current_offset = ck_pr_load_32(&mstore->read_cursor);
    }

    // Seek to the current read offset
    enum store_read_status ret = _mmap_cursor_seek((store_cursor_t*) cursor, current_offset);
    ensure(ret != UNSYNCED_STORE, "Failed to seek due to unsynced store");
    ensure(ret == SUCCESS, "Failed to seek");

    // Save our offset so we can try to CAS
    uint32_t next_offset = cursor->next_offset;

    // This is our only way to advance, so we have to do this
    ret = _mmap_cursor_advance((store_cursor_t*) cursor);
    ensure(ret == SUCCESS || ret == END, "Failed to advance");

    // If we advanced successfully, try to CAS the read cursor
    while (ret != END) {
        // If we succeed, return the cursor we made
        if (ck_pr_cas_32(&mstore->read_cursor, current_offset, next_offset)) {
            return (store_cursor_t*) cursor;
        }
        // Otherwise, try again

        // Save the current offset so we can try to CAS later
        current_offset = ck_pr_load_32(&mstore->read_cursor);

        // Seek to the current read offset
        ret = _mmap_cursor_seek((store_cursor_t*) cursor, current_offset);
        ensure(ret == SUCCESS, "Failed to seek");

        // Save our offset so we can try to CAS
        next_offset = cursor->next_offset;

        // This is our only way to advance, so we have to do this
        ret = _mmap_cursor_advance((store_cursor_t*) cursor);
        ensure(ret == SUCCESS || ret == END, "Failed to advance");
    }

    // We hit END: the store is exhausted, so clean up and signal the caller
    ((store_cursor_t*) cursor)->destroy((store_cursor_t*) cursor);
    return NULL;
}
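/*
 * A hypothetical end-to-end sketch of the store lifecycle implied by the
 * functions above: writers append, a coordinator syncs, then readers pop
 * cursors until the store is exhausted. Assumes a store_t created elsewhere
 * (the constructor is not shown in these excerpts); the function name here is
 * illustrative only.
 */
void example_store_lifecycle(store_t *store, void *blob, uint32_t blob_len) {
    // Phase 1: write. Returns the offset written to, or 0 when full or syncing.
    uint32_t off = _mmap_write(store, blob, blob_len);
    ensure(off != 0, "store full or already syncing");

    // Phase 2: sync. Spins until in-flight writers drain, then msync + fsync.
    ensure(_mmap_sync(store) == 0, "sync failed");

    // Phase 3: read. Each pop atomically advances the shared read cursor;
    // NULL means the store is exhausted.
    store_cursor_t *c;
    while ((c = _mmap_pop_cursor(store)) != NULL) {
        // c->data points just past the uint32_t size prefix; c->size is the length
        c->destroy(c);
    }
}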
uint32_t _get_value(persistent_atomic_value_t *pav) {
    ck_rwlock_read_lock(pav->_lock);
    uint32_t current_value = ck_pr_load_32(&pav->_current_value);
    ck_rwlock_read_unlock(pav->_lock);
    return current_value;
}
int _compare_and_swap(persistent_atomic_value_t *pav, uint32_t old_value, uint32_t new_value) {
    // First lock this counter
    ck_rwlock_write_lock(pav->_lock);

    // Then check to see if someone changed this value before we got here
    if (ck_pr_load_32(&pav->_current_value) != old_value) {
        ck_rwlock_write_unlock(pav->_lock);
        return -1;
    }

    // We got here first. Set the new value.
    ck_pr_store_32(&pav->_current_value, new_value);

    // Now persist the value:
    // 1. Write it to a temporary file
    // 2. Delete the original file
    // 3. Link the temporary file to the original file
    // 4. Unlink the temporary file
    int fail = 0;

    // 1.
    int open_flags = O_RDWR | O_CREAT | O_EXCL | O_SYNC;
    int fd = open(pav->_temporary_filename, open_flags, (mode_t)0600);
    if (fd < 0) {
        fail = -2;
        goto end;
    }
    ssize_t nwritten = write(fd, &pav->_current_value, sizeof(pav->_current_value));
    if (fsync(fd) != 0) {
        fail = -2;
        close(fd);
        goto end;
    }
    close(fd);
    if (nwritten < 0) {
        fail = -2;
        goto end;
    }

    // 2.
    if (unlink(pav->_filename) != 0) {
        fail = -3;
        goto end;
    }

    // 3.
    if (link(pav->_temporary_filename, pav->_filename) != 0) fail = -4;

end:
    // 4. Note: this can overwrite an earlier failure code with -5, and the
    // unlink is attempted even when the temporary file was never created
    if (unlink(pav->_temporary_filename) != 0) fail = -5;

    // Roll back the in-memory value if persisting failed
    if (fail != 0) {
        ck_pr_store_32(&pav->_current_value, old_value);
    }
    ck_rwlock_write_unlock(pav->_lock);

    // For now
    ensure(fail == 0, "Failed during persistent update");
    return fail;
}
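/*
 * A minimal sketch of the type the two functions above operate on, plus a
 * usage example. The real struct and its constructor live elsewhere in the
 * project; the field names follow the accesses above, everything else is an
 * assumption.
 */
typedef struct persistent_atomic_value {
    ck_rwlock_t *_lock;                // guards the value and the file swap
    uint32_t     _current_value;       // in-memory copy, ck_pr_* accessed
    char        *_filename;            // durable copy of the value
    char        *_temporary_filename;  // scratch file for atomic replacement
} persistent_atomic_value_t;

/* Hypothetical usage: a classic CAS retry loop for a persistent increment. */
void example_persistent_increment(persistent_atomic_value_t *pav) {
    for (;;) {
        uint32_t v = _get_value(pav);
        if (_compare_and_swap(pav, v, v + 1) == 0) break;  // persisted
        // -1 means another thread won the race; reload and retry
    }
}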