void __mmap_schedule_store_sync(struct mmap_store *mstore, uint32_t write_cursor) {
    // Lock free but not wait free
    uint32_t *sync_cursor = &mstore->sync_cursor;
    uint32_t sync_pos = ck_pr_load_32(sync_cursor);

    // TODO - Add in the sync flags for allowing things like Dirty read
    //TODO: Protect the nearest page once sunk
    //mprotect(mapping, off, PROT_READ);

    // Once we are more than 4KB past the last sync point, schedule an asynchronous flush
    if (write_cursor - sync_pos > (4 * 1024)) {
        int sync_distance = write_cursor - sync_pos;
        msync(mstore->mapping + sync_pos, sync_distance, MS_ASYNC);
    }

    // Once we are more than 64MB past the last sync point, force the data to disk and
    // advance the sync cursor
    if (write_cursor - sync_pos > (64 * 1024 * 1024)) {
        fsync(mstore->fd);

        // Try to write the new cursor, give up if you miss the race
        ck_pr_cas_32(sync_cursor, sync_pos, write_cursor);
    }
}
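/*
 * The definition of struct mmap_store is not part of this excerpt, and the ck_pr_* primitives
 * (ck_pr_load_32, ck_pr_cas_32, ck_pr_store_32) come from Concurrency Kit. As a rough sketch,
 * based only on the fields touched in this file, the store state assumed by these functions
 * looks something like the following; the real struct almost certainly has more members and
 * a different layout.
 */
#if 0 /* illustrative only, not the project's actual definition */
struct mmap_store {
    store_t store;                 // embedded generic store interface (vtable), assumed first member
    void *mapping;                 // the mmap'd region backing the store
    int fd;                        // file descriptor backing the mapping
    uint32_t capacity;             // total size of the mapping in bytes
    uint32_t write_cursor;         // next byte offset to write at
    uint32_t read_cursor;          // offset of the record most recently handed to a reader
    uint32_t sync_cursor;          // offset we have scheduled syncs up to
    uint32_t last_sync;            // page-aligned offset of the last msync
    uint32_t synced;               // 1 once _mmap_sync has completed
    uint32_t syncing_and_writers;  // packed: syncing flag plus active writer count
};
#endif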
/*
 * Write data into the store implementation
 *
 * params
 *  *data - data to write
 *  size - amount to write
 *
 * return
 *  0 - Capacity exceeded
 *  otherwise - the offset in the store at which the data was written
 */
uint32_t _mmap_write(store_t *store, void *data, uint32_t size) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    void * mapping = mstore->mapping;
    ensure(mapping != NULL, "Bad mapping");

    // Each record is laid out as [uint32_t size, BYTES]
    uint32_t *write_cursor = &mstore->write_cursor;
    uint32_t required_size = (sizeof(uint32_t) + size);

    uint32_t cursor_pos = 0;
    uint32_t new_pos = 0;

    // Reserve space by advancing the write cursor with a CAS loop
    while (true) {
        cursor_pos = ck_pr_load_32(write_cursor);
        ensure(cursor_pos != 0, "Incorrect cursor pos");

        uint32_t remaining = mstore->capacity - cursor_pos;
        if (remaining <= required_size) {
            return 0;
        }

        new_pos = cursor_pos + required_size;
        if (ck_pr_cas_32(write_cursor, cursor_pos, new_pos)) {
            break;
        }
    }

    ensure(new_pos != 0, "Invalid write position");
    ensure(cursor_pos != 0, "Invalid cursor position");

    // Write the size header followed by the payload into the reserved region
    void *dest = (mapping + cursor_pos);
    ((uint32_t*)dest)[0] = (uint32_t) size;
    dest += sizeof(uint32_t);
    memcpy(dest, data, size);

    __mmap_schedule_store_sync(mstore, cursor_pos);
    return cursor_pos;
}
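/*
 * The version of _mmap_write below relies on EXTRACT_SYNCING, EXTRACT_WRITERS and SET_SYNCING,
 * which are not defined in this excerpt. A minimal sketch of plausible definitions, assuming
 * the top bit of "syncing_and_writers" carries the syncing flag and the low 31 bits carry the
 * writer count (the project's real macros may differ):
 */
#ifndef EXTRACT_SYNCING
#define MMAP_SYNCING_BIT     (1U << 31)                       /* assumed bit layout */
#define EXTRACT_SYNCING(v)   (((v) & MMAP_SYNCING_BIT) >> 31) /* 1 while a sync is in progress */
#define EXTRACT_WRITERS(v)   ((v) & ~MMAP_SYNCING_BIT)        /* number of in-flight writers */
#define SET_SYNCING(v)       ((v) | MMAP_SYNCING_BIT)         /* mark the store as syncing */
#endif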
/*
 * Write data into the store implementation
 *
 * params
 *  *data - data to write
 *  size - amount to write
 *
 * return
 *  0 - Capacity exceeded, or a sync is in progress
 *  otherwise - the offset in the store at which the data was written
 */
uint32_t _mmap_write(store_t *store, void *data, uint32_t size) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    void * mapping = mstore->mapping;
    ensure(mapping != NULL, "Bad mapping");

    // We must ensure that no writes are happening during a sync. To do this, we pack both the
    // "syncing" bit and the number of writers in the same 32 bit value.
    // 1. Load the "syncing_and_writers" value
    // 2. Check if "syncing" and abort if so
    // 3. Increment the number of writers
    // 4. Try to Compare and Swap this value
    // 5. Repeat if CAS fails
    bool writers_incremented = false;
    while (!writers_incremented) {
        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Make sure we aren't already at 2^31 - 1 writers. If we try to increment when we
        // already have that many we will overflow the 31 bits we are using to store the writers.
        ensure(writers < 0x7FFFFFFFU, "Too many writers");

        // 2.
        if (syncing == 1) {
            return 0;
        }

        // 3.
        // 4.
        if (ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers,
                         syncing_and_writers + 1)) {
            writers_incremented = true;
        }
    }

    ensure(ck_pr_load_32(&mstore->synced) == 0,
           "A writer should not get here when the store is synced");

    // [uint32_t,BYTES]
    uint32_t *write_cursor = &mstore->write_cursor;
    uint32_t required_size = (sizeof(uint32_t) + size);

    uint32_t cursor_pos = 0;
    uint32_t new_pos = 0;
    uint32_t ret = 0;

    // Assert if we are trying to write a block larger than the capacity of this store, and the
    // store is empty. This is to die fast on the case where we have a block that we can never
    // write to any store of this size.
    // TODO: Actually handle this case gracefully
    ensure(((mstore->capacity - store->start_cursor(store)) >= required_size) ||
           (ck_pr_load_32(write_cursor) != store->start_cursor(store)),
           "Attempting to write a block of data larger than the total capacity of our store");

    while (true) {
        cursor_pos = ck_pr_load_32(write_cursor);
        ensure(cursor_pos != 0, "Incorrect cursor pos");

        uint32_t remaining = mstore->capacity - cursor_pos;
        if (remaining <= required_size) {
            // TODO: Structure this code better. Right now, this works because "ret" is still
            // zero, and we return zero in the case where our data couldn't be written because
            // the store was full.
            goto decrement_writers;
        }

        new_pos = cursor_pos + required_size;
        if (ck_pr_cas_32(write_cursor, cursor_pos, new_pos)) {
            break;
        }
    }

    ensure(new_pos != 0, "Invalid write position");
    ensure(cursor_pos != 0, "Invalid cursor position");

    void *dest = (mapping + cursor_pos);
    ((uint32_t*)dest)[0] = (uint32_t) size;
    dest += sizeof(uint32_t);
    memcpy(dest, data, size);

    // If our new cursor is 1024 pages past where we last synced, try to sync
    // TODO: Make this tunable
    long page_size = sysconf(_SC_PAGESIZE);
    uint32_t last_sync = ck_pr_load_32(&mstore->last_sync);
    if (new_pos > last_sync + page_size * 1024) {
        ensure(last_sync % page_size == 0,
               "Last sync offset is not a multiple of page size, which is needed for msync");

        uint32_t page_aligned_new_pos = (new_pos - (new_pos % page_size));
        if (ck_pr_cas_32(&mstore->last_sync, last_sync, page_aligned_new_pos)) {
            // TODO: Sync the previous page too, since it may have gotten dirtied
            ensure(msync(mapping + last_sync, page_size * 1024, MS_ASYNC) == 0, "Unable to sync");
        }
    }

    ensure(ck_pr_load_32(&mstore->synced) == 0,
           "A writer should not be here when the store is synced");

    // Return the position in the store that we wrote to
    // TODO: Clean up the error handling and return values for this function
    ret = cursor_pos;

    bool writers_decremented = false;
decrement_writers:
    // TODO: Need to initialize here, otherwise writers_decremented would be uninitialized in
    // the case where we jump to this label. Structure this function better.
    writers_decremented = false;

    // Decrement the number of writers to indicate that we are finished writing
    // 1. Load the "syncing_and_writers" value
    // 2. Decrement the number of writers
    // 3. Try to Compare and Swap this value
    // 4. Repeat if CAS fails
    while (!writers_decremented) {
        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Invariants
        ensure(writers > 0, "Would decrement the number of writers below zero");
        ensure(ck_pr_load_32(&mstore->synced) == 0,
               "The sync should not have gone through since we are not done writing");

        // 2.
        // 3.
        if (ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers,
                         syncing_and_writers - 1)) {
            writers_decremented = true;
        }
    }

    return ret;
}
/**
 * Force this store to sync if needed
 *
 * return
 *  0 - success
 *  1 - failure
 */
uint32_t _mmap_sync(store_t *store) {
    struct mmap_store *mstore = (struct mmap_store*) store;

    // The point we have written up to
    uint32_t write_cursor = ck_pr_load_32(&mstore->write_cursor);
    ensure(write_cursor > sizeof(uint32_t) * 2, "Attempted to sync an empty store");

    // We must ensure that no writes are happening during a sync. To do this, we pack both the
    // "syncing" bit and the number of writers in the same 32 bit value.
    // 1. Load the "syncing_and_writers" value
    // 2. Set that we are syncing
    // 3. Try to Compare and Swap this value
    // 4. Repeat until "writers" == 0
    while (1) {
        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Make sure we aren't already at 2^31 - 1 writers. If we try to increment when we
        // already have that many we will overflow the 31 bits we are using to store the writers.
        ensure(writers < 0x7FFFFFFFU, "Too many writers");

        // 2.
        // 3.
        if (syncing == 0) {
            if (!ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers,
                              SET_SYNCING(syncing_and_writers))) {
                continue;
            }
        }

        // 4.
        if (writers == 0) {
            break;
        }
    }

    // The point we have written up to
    write_cursor = ck_pr_load_32(&mstore->write_cursor);

    // Actually sync. At this point we are guaranteed there are no writers, so sync the entire
    // store.
    //TODO: Protect the nearest page once sunk
    //mprotect(mapping, off, PROT_READ);
    ensure(msync(mstore->mapping, write_cursor, MS_SYNC) == 0, "Unable to msync");
    ensure(fsync(mstore->fd) == 0, "Unable to fsync");

    // Record that we synced successfully. This will allow readers to progress.
    ck_pr_store_32(&mstore->synced, 1);

    uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
    uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);
    ensure(writers == 0, "We should not have synced the store when there are still writers");
    ensure(syncing == 1, "We should not have synced the store when we did not mark it as syncing");

    return 0;
}
store_cursor_t* _mmap_pop_cursor(store_t *store) {
    // This is really an mmap store
    struct mmap_store *mstore = (struct mmap_store*) store;

    // Assert invariants
    uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
    uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);
    ensure(writers == 0, "We should not be reading the store when there are still writers");
    ensure(syncing == 1, "We should not be reading the store before it has started syncing");
    ensure(ck_pr_load_32(&mstore->synced) == 1,
           "We should not be reading the store before it has been synced");

    // Open a blank cursor
    struct mmap_store_cursor* cursor = (struct mmap_store_cursor*) _mmap_open_cursor(store);

    // Save the current offset so we can try to CAS later
    uint32_t current_offset = ck_pr_load_32(&mstore->read_cursor);

    // If the first cursor has not been returned, don't advance. Instead seek to the beginning.
    if (current_offset == (uint32_t) -1) {
        uint32_t next_offset = store->start_cursor(store);

        // Seek to the read offset
        enum store_read_status ret = _mmap_cursor_seek((store_cursor_t*) cursor, next_offset);
        ensure(ret != END, "Failed to seek due to empty store");
        ensure(ret != UNSYNCED_STORE, "Failed to seek due to unsynced store");
        ensure(ret == SUCCESS, "Failed to seek");

        // Set the read cursor. Note we are setting it to the offset of the thing we are reading,
        // because of the logic below
        if (ck_pr_cas_32(&mstore->read_cursor, current_offset, next_offset)) {
            return (store_cursor_t*) cursor;
        }

        // If we failed to CAS, reload the current offset and drop down to the normal logic below
        current_offset = ck_pr_load_32(&mstore->read_cursor);
    }

    // Seek to the current read offset
    enum store_read_status ret = _mmap_cursor_seek((store_cursor_t*) cursor, current_offset);
    ensure(ret != UNSYNCED_STORE, "Failed to seek due to unsynced store");
    ensure(ret == SUCCESS, "Failed to seek");

    // Save our offset so we can try to CAS
    uint32_t next_offset = cursor->next_offset;

    // This is our only way to advance, so we have to do this
    ret = _mmap_cursor_advance((store_cursor_t*) cursor);
    ensure(ret == SUCCESS || ret == END, "Failed to advance");

    // If we advanced successfully, try to CAS the read cursor
    while (ret != END) {
        // If we succeed, return the cursor we made
        if (ck_pr_cas_32(&mstore->read_cursor, current_offset, next_offset)) {
            return (store_cursor_t*) cursor;
        }

        // Otherwise, try again

        // Save the current offset so we can try to CAS later
        current_offset = ck_pr_load_32(&mstore->read_cursor);

        // Seek to the current read offset
        ret = _mmap_cursor_seek((store_cursor_t*) cursor, current_offset);
        ensure(ret == SUCCESS, "Failed to seek");

        // Save our offset so we can try to CAS
        next_offset = cursor->next_offset;

        // This is our only way to advance, so we have to do this
        ret = _mmap_cursor_advance((store_cursor_t*) cursor);
        ensure(ret == SUCCESS || ret == END, "Failed to advance");
    }

    ((store_cursor_t*) cursor)->destroy((store_cursor_t*) cursor);
    return NULL;
}
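/*
 * For illustration, a rough sketch of how a single store is intended to move through its
 * lifecycle under this scheme: writers append until the store reports it is full, some thread
 * forces a sync, and only then do readers pop cursors. The store_t member names used here
 * (write, sync, pop_cursor) are assumptions based on the functions above, and how a store is
 * created is not shown in this excerpt, so the caller is expected to supply one.
 */
static void __attribute__((unused)) _mmap_store_lifecycle_sketch(store_t *store,
                                                                 void *data, uint32_t size) {
    // Append records until the store runs out of capacity (a write returning 0 means "full")
    while (store->write(store, data, size) != 0) { }

    // With no writers left, force everything to disk so readers are allowed to progress
    ensure(store->sync(store) == 0, "Sync failed");

    // Drain the store; pop_cursor returns NULL once every record has been handed out
    store_cursor_t *cursor = NULL;
    while ((cursor = store->pop_cursor(store)) != NULL) {
        // ... consume the record the cursor points at ...
        cursor->destroy(cursor);
    }
}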