Example no. 1
static inline void 
fq_push_free_message_stack(struct free_message_stack *stack, fq_msg *m) 
{
  if (stack == NULL) {
    return;
  }

  /* Trim the stack down if it has grown beyond its maximum size. */
  while (ck_pr_load_32(&stack->size) > stack->max_size) {
    ck_stack_entry_t *ce = ck_stack_pop_mpmc(&stack->stack);
    if (ce != NULL) {
      fq_msg *old_msg = container_of(ce, fq_msg, cleanup_stack_entry);
      free(old_msg);
      ck_pr_dec_32(&stack->size);
    }
    else break;
  }
  /* If the stack is still at capacity, free the message rather than caching it. */
  uint32_t c = ck_pr_load_32(&stack->size);
  if (c >= stack->max_size) {
    free(m);
    return;
  }

  ck_pr_inc_32(&stack->size);
  ck_stack_push_mpmc(&stack->stack, &m->cleanup_stack_entry);
}
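For context, here is a minimal sketch of the free-list structure and a pop-side counterpart as they might look. The struct layout and the fq_pop_free_message_stack helper are assumptions inferred from the push function above, not the project's actual definitions; only the Concurrency Kit calls and the container_of idiom are standard.

/* Hypothetical sketch only: a bounded free-list of fq_msg buffers built on
 * Concurrency Kit's MPMC stack.  Field names and the pop helper are inferred
 * from the push function above and may not match the real source. */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <ck_pr.h>
#include <ck_stack.h>
#include "fq.h"   /* assumed header providing fq_msg and cleanup_stack_entry */

#ifndef container_of
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))
#endif

struct free_message_stack {
    ck_stack_t stack;     /* lock-free MPMC stack of reusable messages */
    uint32_t   size;      /* approximate current depth */
    uint32_t   max_size;  /* cap beyond which messages are freed outright */
};

/* Pop a recycled message, or return NULL if the free list is empty. */
static inline fq_msg *
fq_pop_free_message_stack(struct free_message_stack *stack)
{
    ck_stack_entry_t *ce = ck_stack_pop_mpmc(&stack->stack);
    if (ce == NULL)
        return NULL;
    ck_pr_dec_32(&stack->size);
    return container_of(ce, fq_msg, cleanup_stack_entry);
}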
Example no. 2
/*
 * Free a segment.  We assume the list is either locked or being accessed from a
 * single-threaded context.
 */
int _free_segment_inlock(segment_list_t *segment_list, uint32_t segment_number, bool destroy_store) {
    segment_t *segment = __segment_number_to_segment(segment_list, segment_number);

    ensure(__is_segment_number_in_segment_list_inlock(segment_list, segment_number),
           "Attempted to destroy a segment not in the list");
    ensure(segment->state != FREE, "Attempted to destroy segment already in the FREE state");
    ensure(segment->state != CLOSED, "Attempted to destroy segment already in the CLOSED state");
    ensure(segment->segment_number == segment_number, "Attempted to destroy uninitialized segment");
    ensure(segment->store != NULL, "Attempted to destroy segment with null store");
    ensure(ck_pr_load_32(&segment->refcount) == 0, "Attempted to destroy segment with non zero refcount");

    if (destroy_store) {
        segment->store->destroy(segment->store);
        segment->state = FREE;
    }
    else {
        segment->store->close(segment->store, 1);
        segment->state = CLOSED;
    }

    // Zero out the segment we just freed for debugging
    segment->store = NULL;
    segment->segment_number = 0;

    return 0;
}
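The checks above imply a small segment lifecycle (FREE, WRITING, READING, CLOSED) and a handful of bookkeeping fields. A rough sketch of that bookkeeping follows; the enum and struct are assumptions reconstructed from usage, not the project's real headers.

/* Hypothetical sketch: segment bookkeeping implied by the ensure() checks
 * above.  Names and layout are assumptions for illustration only. */
#include <stdint.h>

typedef enum segment_state {
    FREE = 0,  /* slot unused; backing store destroyed */
    WRITING,   /* store open and accepting writes */
    READING,   /* store synced; readers may hold references */
    CLOSED     /* store closed, but its on-disk files are kept */
} segment_state_t;

typedef struct segment {
    uint32_t        segment_number; /* identity, validated before freeing */
    uint32_t        refcount;       /* readers currently holding this segment */
    segment_state_t state;          /* lifecycle state, see enum above */
    store_t        *store;          /* backing store (as used in these examples);
                                       NULLed out after freeing */
} segment_t;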
Example no. 3
void __mmap_schedule_store_sync(struct mmap_store *mstore, uint32_t write_cursor) {
    // Lock free but not wait free
    uint32_t *sync_cursor = &mstore->sync_cursor;
    uint32_t sync_pos = ck_pr_load_32(sync_cursor);

    // TODO - Add in the sync flags for allowing things like Dirty read
    //TODO: Protect the nearest page once sunk
    //mprotect(mapping, off, PROT_READ);
    if (write_cursor - sync_pos > (4 * 1024)) {
        int sync_distance = write_cursor - sync_pos;
        msync(mstore->mapping + sync_pos, sync_distance, MS_ASYNC);
    }

    if (write_cursor - sync_pos > (64 * 1024 * 1024)) {
        fsync(mstore->fd);
        // Try to write the new cursor, give up if you miss the race
        ck_pr_cas_32(sync_cursor, sync_pos, write_cursor);
    }
}
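Several of the mmap-store functions in these examples operate on the same shared state. As a reading aid, here is a hypothetical sketch of the fields they appear to rely on, inferred purely from usage; the real struct mmap_store (which presumably embeds the generic store_t as its first member) is defined in the store implementation and may differ.

/* Hypothetical sketch of struct mmap_store, inferred from how the functions
 * in these examples use it.  Not the project's actual definition. */
#include <stdint.h>

struct mmap_store {
    /* store_t base;                  generic vtable base, omitted here */
    void    *mapping;              /* base address of the mmap'd file */
    int      fd;                   /* backing file descriptor */
    uint32_t capacity;             /* total mapped size in bytes */
    uint32_t write_cursor;         /* next write offset, advanced via CAS */
    uint32_t read_cursor;          /* offset handed out to readers */
    uint32_t sync_cursor;          /* offset scheduled/synced up to */
    uint32_t last_sync;            /* page-aligned msync watermark */
    uint32_t syncing_and_writers;  /* packed syncing bit + writer count */
    uint32_t synced;               /* set to 1 once the store has been synced */
};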
Example no. 4
enum store_read_status __mmap_cursor_position(struct mmap_store_cursor *cursor,
                                              uint32_t offset) {
    ensure(cursor->store != NULL, "Broken cursor");

    // If a user calls this store before any thread has called sync, that is a programming error.
    // TODO: Make this a real error.  An assert now just for debugging.
    ensure(EXTRACT_SYNCING(ck_pr_load_32(&cursor->store->syncing_and_writers)) == 1,
           "Attempted to seek a cursor on a store before sync has been called");

    // Calling read before a store has finished syncing, however, may be more of a race condition,
    // so be nicer about it and just tell the caller to try again.
    if (ck_pr_load_32(&cursor->store->synced) != 1) {
        // We are trying to read from a store that has not yet been synced, which is not (yet?)
        // supported.
        return UNSYNCED_STORE;
    }

    // The read is clearly out of bounds for this store
    // Note that we need at least sizeof(uint32_t) bytes to continue, since for each write we write
    // at least a uint32_t to indicate the size of the block
    if (offset + sizeof(uint32_t) > cursor->store->capacity) return OUT_OF_BOUNDS;

    void *src = (cursor->store->mapping + offset);
    uint32_t size = ((uint32_t*)src)[0];
    if (size == 0) return END; // We have reached the synthetic end of the data
    ensure(offset + size + sizeof(uint32_t) < cursor->store->capacity, "Found a block that runs over the end of our store");

    cursor->next_offset = (offset + sizeof(uint32_t) + size);
    ((store_cursor_t*)cursor)->offset = offset;

    // For now mmap cursors are read-forward only (sequential madvise), so a
    // seek that does not move the cursor forward is rejected.
    if (cursor->next_offset <= offset) return INVALID_SEEK_DIRECTION;

    ((store_cursor_t*)cursor)->size = size;
    ((store_cursor_t*)cursor)->data = src + sizeof(uint32_t);
    return SUCCESS;
}
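The cursor logic above decodes a simple framing: each block is a uint32_t length followed by the payload, with a zero length acting as the synthetic end marker. A small, illustrative walker over that layout (not part of the store API) is shown below.

/* Illustrative only: walk the [uint32_t size][payload] framing that
 * __mmap_cursor_position decodes.  A zero size marks the end of data. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void walk_blocks(const void *mapping, uint32_t start, uint32_t capacity)
{
    uint32_t offset = start;
    while (offset + sizeof(uint32_t) <= capacity) {
        uint32_t size;
        memcpy(&size, (const char *)mapping + offset, sizeof(size));
        if (size == 0)
            break;  /* synthetic end-of-data marker */
        const void *payload = (const char *)mapping + offset + sizeof(uint32_t);
        printf("block at offset %u: %u bytes\n", offset, size);
        (void)payload;  /* a real consumer would read `size` bytes from here */
        offset += (uint32_t)sizeof(uint32_t) + size;
    }
}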
Example no. 5
int _segment_list_close_segment(struct segment_list *segment_list, uint32_t segment_number) {
    // Take out a write lock so we are mutually exclusive with get_segment
    ck_rwlock_write_lock(segment_list->lock);

    segment_t *segment = __segment_number_to_segment(segment_list, segment_number);

    // Check the refcount and fail to close the segment if the refcount is not zero
    if (ck_pr_load_32(&segment->refcount) != 0) {
        // TODO: More specific error
        ck_rwlock_write_unlock(segment_list->lock);
        return -1;
    }

    // Do not close a segment twice
    if (segment->state == CLOSED) {
        // TODO: More specific error
        ck_rwlock_write_unlock(segment_list->lock);
        return -1;
    }

    // This function may be called when a segment is in the FREE state or the READING state.  This
    // can happen in the lock free synchronization built on top of this structure, since a slow
    // thread with an old segment number might get here after other threads have advanced past this
    // segment.  Since this is valid, just return an error so that the slow thread can recover.
    if (segment->state != WRITING) {
        // TODO: More specific error
        ck_rwlock_write_unlock(segment_list->lock);
        return -1;
    }

    // Destroy the segment, but close the store rather than destroying it because we don't want to
    // delete the on disk store files
    ensure(_free_segment_inlock(segment_list, segment_number, false/*destroy_store*/) == 0,
           "Failed to internally destroy segment in close segment function");

    segment->state = CLOSED;

    ck_rwlock_write_unlock(segment_list->lock);

    return 0;
}
Example no. 6
static as_status
as_scan_parse_records(uint8_t* buf, size_t size, as_scan_task* task, as_error* err)
{
	uint8_t* p = buf;
	uint8_t* end = buf + size;
	as_status status;
	
	while (p < end) {
		as_msg* msg = (as_msg*)p;
		as_msg_swap_header_from_be(msg);
		
		if (msg->result_code) {
			// Special case - if we scan a set name that doesn't exist on a
			// node, it will return "not found" - we unify this with the
			// case where OK is returned and no callbacks were made. [AKG]
			// We are sending "no more records back" to the caller which will
			// send OK to the main worker thread.
			if (msg->result_code == AEROSPIKE_ERR_RECORD_NOT_FOUND) {
				return AEROSPIKE_NO_MORE_RECORDS;
			}
			return as_error_set_message(err, msg->result_code, as_error_string(msg->result_code));
		}
		p += sizeof(as_msg);
		
		if (msg->info3 & AS_MSG_INFO3_LAST) {
			return AEROSPIKE_NO_MORE_RECORDS;
		}
		
		status = as_scan_parse_record(&p, msg, task);
		
		if (status != AEROSPIKE_OK) {
			return status;
		}
		
		if (ck_pr_load_32(task->error_mutex)) {
			err->code = AEROSPIKE_ERR_SCAN_ABORTED;
			return err->code;
		}
	}
	return AEROSPIKE_OK;
}
Example no. 7
/*
 * This function attempts to free segments, and returns the number of the segment up to which we
 * have freed.  These semantics are a little strange, because segment numbers are uint32_t and our
 * first segment is zero.  We only want this function to return zero when we have not freed any
 * segments, but if we returned the last segment we freed we would have to return zero after we've
 * freed segment 0.  As it is now, we will return 1 in that case, because having freed segment 0
 * means we've freed up to segment 1.
 */
uint32_t _segment_list_free_segments(struct segment_list *segment_list, uint32_t segment_number, bool destroy_store) {
    ck_rwlock_write_lock(segment_list->lock);

    // TODO: Think more carefully about what this function can return
    uint32_t freed_up_to = segment_list->tail;

    // Try to free as many segments as we can up to the provided segment number
    while (segment_list->tail <= segment_number &&
           segment_list->head != segment_list->tail) {

        segment_t *segment = __segment_number_to_segment(segment_list, segment_list->tail);

        // We should not be freeing a segment in the WRITING or CLOSED state
        ensure(segment->state == READING, "Attempted to free segment not in the READING state");

        // Do not free this segment if the refcount is not zero
        if (ck_pr_load_32(&segment->refcount) != 0) {

            // Do not try to free any more segments
            break;
        }

        ensure(_free_segment_inlock(segment_list, segment->segment_number, true/*destroy_store*/) == 0,
               "Failed to internally destroy segment in free segments function");

        segment->state = FREE;

        // Move the tail up
        segment_list->tail++;

        // Record the segment we have freed up to
        freed_up_to = segment_list->tail;
    }

    ck_rwlock_write_unlock(segment_list->lock);

    return freed_up_to;
}
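A hypothetical caller, just to make the return convention above concrete: per the function's own comment, a return of 0 means no segments have been freed, while freeing only segment 0 yields 1 because the tail has advanced past it.

/* Hypothetical caller of _segment_list_free_segments, illustrating the
 * "freed up to" convention described in the comment above.  Assumes the
 * segments up to segment_number are already in the READING state. */
#include <stdbool.h>
#include <stdint.h>

static void reclaim_up_to(struct segment_list *list, uint32_t segment_number)
{
    uint32_t freed_up_to = _segment_list_free_segments(list, segment_number, true);
    if (freed_up_to == 0) {
        /* no segments have been freed (segment 0 is still live or referenced) */
    } else {
        /* every segment numbered below freed_up_to has been freed;
         * freeing only segment 0 therefore yields freed_up_to == 1 */
    }
}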
Example no. 8
/*
 * Write data into the store implementation
 *
 * params
 *  *data - data to write
 *  size - amount to write
 *
 * return
 *  0 - capacity exceeded, nothing was written
 *  otherwise - offset in the store where the data was written
 */
uint32_t _mmap_write(store_t *store, void *data, uint32_t size) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    void * mapping = mstore->mapping;
    ensure(mapping != NULL, "Bad mapping");

    // [uint32_t,BYTES]
    uint32_t *write_cursor = &mstore->write_cursor;
    uint32_t required_size = (sizeof(uint32_t) + size);

    uint32_t cursor_pos = 0;
    uint32_t new_pos = 0;

    while (true) {
        cursor_pos = ck_pr_load_32(write_cursor);
        ensure(cursor_pos != 0, "Incorrect cursor pos");
        uint32_t remaining = mstore->capacity - cursor_pos;

        if (remaining <= required_size) {
            return 0;
        }

        new_pos = cursor_pos + required_size;
        if (ck_pr_cas_32(write_cursor, cursor_pos, new_pos)) {
            break;
        }
    }
    ensure(new_pos != 0, "Invalid write position");
    ensure(cursor_pos != 0, "Invalid cursor position");

    void *dest = (mapping + cursor_pos);
    ((uint32_t*)dest)[0] = (uint32_t) size;
    dest += sizeof(uint32_t);
    memcpy(dest, data, size);

    __mmap_schedule_store_sync(mstore, cursor_pos);
    return cursor_pos;
}
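A hypothetical call site, to make the return convention concrete: 0 means the store was full and nothing was written; any other value is the offset of the written block.

/* Hypothetical caller of _mmap_write: append a record and handle a full store.
 * The rollover policy mentioned in the comment is illustrative only. */
#include <stdint.h>

static uint32_t append_record(store_t *store, void *record, uint32_t len)
{
    uint32_t off = _mmap_write(store, record, len);
    if (off == 0) {
        /* store is full: a real caller would sync/close this store and
         * retry the write against a fresh one */
    }
    return off;  /* offset of the [size][payload] block, or 0 if not written */
}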
Example no. 9
/*
 * Write data into the store implementation
 *
 * params
 *  *data - data to write
 *  size - amount to write
 *
 * return
 *  0 - nothing was written (store full, or a sync is in progress)
 *  otherwise - offset in the store where the data was written
 */
uint32_t _mmap_write(store_t *store, void *data, uint32_t size) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    void * mapping = mstore->mapping;
    ensure(mapping != NULL, "Bad mapping");

    // We must ensure that no writes are happening during a sync.  To do this, we pack both the
    // "syncing" bit and the number of writers in the same 32 bit value.
    // 1. Load the "syncing_and_writers" value
    // 2. Check if "syncing" and abort if so
    // 3. Increment the number of writers
    // 4. Try to Compare and Swap this value
    // 5. Repeat if CAS fails
    bool writers_incremented = false;
    while (!writers_incremented) {

        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Make sure we aren't already at 2^31 - 1 writers.  If we try to increment when we
        // already have that many we will overflow the 31 bits we are using to store the writers.
        ensure(writers < 0x7FFFFFFFU, "Too many writers");

        // 2.
        if (syncing == 1) {
            return 0;
        }

        // 3.
        // 4.
        if (ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers, syncing_and_writers + 1)) {
            writers_incremented = true;
        }
    }

    ensure(ck_pr_load_32(&mstore->synced) == 0, "A writer should not get here when the store is synced");

    // [uint32_t,BYTES]
    uint32_t *write_cursor = &mstore->write_cursor;
    uint32_t required_size = (sizeof(uint32_t) + size);

    uint32_t cursor_pos = 0;
    uint32_t new_pos = 0;
    uint32_t ret = 0;

    // Assert if we are trying to write a block larger than the capacity of this store, and the
    // store is empty.  This is to die fast on the case where we have a block that we can never
    // write to any store of this size.
    // TODO: Actually handle this case gracefully
    ensure(((mstore->capacity - store->start_cursor(store)) >= required_size) ||
           (ck_pr_load_32(write_cursor) != store->start_cursor(store)),
           "Attempting to write a block of data larger than the total capacity of our store");

    while (true) {
        cursor_pos = ck_pr_load_32(write_cursor);
        ensure(cursor_pos != 0, "Incorrect cursor pos");
        uint32_t remaining = mstore->capacity - cursor_pos;

        if (remaining <= required_size) {
            // TODO: Structure this code better.  Right now, this works because "ret" is still zero,
            // and we return zero in the case where our data couldn't be written because the store
            // was full.
            goto decrement_writers;
        }

        new_pos = cursor_pos + required_size;
        if (ck_pr_cas_32(write_cursor, cursor_pos, new_pos)) {
            break;
        }
    }
    ensure(new_pos != 0, "Invalid write position");
    ensure(cursor_pos != 0, "Invalid cursor position");

    void *dest = (mapping + cursor_pos);
    ((uint32_t*)dest)[0] = (uint32_t) size;
    dest += sizeof(uint32_t);
    memcpy(dest, data, size);

    // If our new cursor is 1024 pages past where we have last synced, try to sync
    // TODO: Make this tunable
    long page_size = sysconf(_SC_PAGESIZE);
    uint32_t last_sync = ck_pr_load_32(&mstore->last_sync);
    if (new_pos > last_sync + page_size * 1024) {
        ensure(last_sync % page_size == 0,
               "Last sync offset is not a multiple of page size, which is needed for msync");
        uint32_t page_aligned_new_pos = (new_pos - (new_pos % page_size));
        if (ck_pr_cas_32(&mstore->last_sync, last_sync, page_aligned_new_pos)) {
            // TODO: Sync the previous page too, since it may have gotten dirtied
            ensure(msync(mapping + last_sync, page_size * 1024, MS_ASYNC) == 0, "Unable to sync");
        }
    }

    ensure(ck_pr_load_32(&mstore->synced) == 0, "A writer should not be here when the store is synced");

    // Return the position in the store that we wrote to
    // TODO: Clean up the error handling and return values for this function
    ret = cursor_pos;

    bool writers_decremented = false;
decrement_writers:
    // Re-initialize here: when we jump to this label the initializer above is skipped,
    // so writers_decremented would otherwise be read uninitialized.
    // TODO: Structure this function better.
    writers_decremented = false;

    // Decrement the number of writers to indicate that we are finished writing
    // 1. Load the "syncing_and_writers" value
    // 2. Decrement the number of writers
    // 3. Try to Compare and Swap this value
    // 4. Repeat if CAS fails
    while (!writers_decremented) {

        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Invariants
        ensure(writers > 0, "Would decrement the number of writers below zero");
        ensure(ck_pr_load_32(&mstore->synced) == 0,
               "The sync should not have gone through since we are not done writing");

        // 2.
        // 3.
        if (ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers, syncing_and_writers - 1)) {
            writers_decremented = true;
        }
    }

    return ret;
}
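The EXTRACT_SYNCING / EXTRACT_WRITERS / SET_SYNCING macros are not shown in these excerpts. One packing that is consistent with the comments above (a single syncing bit plus a 31-bit writer count, where adding 1 to the packed value increments the writer count) is sketched below; this is an assumption, not the project's actual header.

/* Hypothetical packing for syncing_and_writers: the top bit is the syncing
 * flag and the low 31 bits count active writers.  Assumed for illustration;
 * the real macros may differ. */
#define SYNCING_BIT         (1U << 31)
#define EXTRACT_SYNCING(v)  (((v) & SYNCING_BIT) >> 31)
#define EXTRACT_WRITERS(v)  ((v) & ~SYNCING_BIT)
#define SET_SYNCING(v)      ((v) | SYNCING_BIT)

With this layout, the syncing_and_writers + 1 and - 1 CAS updates in _mmap_write touch only the writer count, and SET_SYNCING in _mmap_sync leaves the count intact.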
Example no. 10
/**
 * Force this store to sync if needed
 *
 * return
 *  0 - success
 *  1 - failure 
 */
uint32_t _mmap_sync(store_t *store) {
    struct mmap_store *mstore = (struct mmap_store*) store;

    // The point we have written up to
    uint32_t write_cursor = ck_pr_load_32(&mstore->write_cursor);

    ensure(write_cursor > sizeof(uint32_t) * 2, "Attempted to sync an empty store");

    // We must ensure that no writes are happening during a sync.  To do this, we pack both the
    // "syncing" bit and the number of writers in the same 32 bit value.
    // 1. Load the "syncing_and_writers" value
    // 2. Set that we are syncing
    // 3. Try to Compare and Swap this value
    // 4. Repeat until "writers" == 0
    while (1) {

        // 1.
        uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
        uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
        uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

        // Sanity check: the writer count lives in the low 31 bits, so it should never
        // be anywhere near 2^31 - 1.
        ensure(writers < 0x7FFFFFFFU, "Too many writers");

        // 2.
        // 3.
        if (syncing == 0) {
            if (!ck_pr_cas_32(&mstore->syncing_and_writers, syncing_and_writers, SET_SYNCING(syncing_and_writers))) {
                continue;
            }
        }

        // 4.
        if (writers == 0) {
            break;
        }
    }

    // The point we have written up to
    write_cursor = ck_pr_load_32(&mstore->write_cursor);

    // Actually sync.  At this point we are guaranteed there are no writers, so sync the entire
    // store.
    //TODO: Protect the nearest page once sunk
    //mprotect(mapping, off, PROT_READ);
    ensure(msync(mstore->mapping, write_cursor, MS_SYNC) == 0, "Unable to msync");
    ensure(fsync(mstore->fd) == 0, "Unable to fsync");

    // Record that we synced successfully.  This will allow readers to progress.
    ck_pr_store_32(&mstore->synced, 1);

    uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
    uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);

    ensure(writers == 0, "We should not have synced the store when there are still writers");
    ensure(syncing == 1, "We should not have synced the store when we did not mark it as syncing");

    return 0;
}
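Taken together with the writer path above, the intended ordering is: writers drain, one thread syncs, and only then do readers touch the store. A hypothetical driver for that hand-off, under the packing assumption sketched earlier:

/* Hypothetical ordering sketch for the writer/sync hand-off.  _mmap_sync
 * spins until the packed writer count drains to zero, msyncs and fsyncs the
 * whole store, then publishes synced = 1 so cursor operations stop
 * returning UNSYNCED_STORE. */
static void seal_store_for_reading(store_t *store)
{
    struct mmap_store *mstore = (struct mmap_store *)store;

    ensure(_mmap_sync(store) == 0, "Unable to seal store");

    /* After a successful sync the store is read-only from our point of view. */
    ensure(ck_pr_load_32(&mstore->synced) == 1, "Store did not mark itself synced");
}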
Example no. 11
/**
 * Return the offset that the store has been
 * written up to
 */
uint32_t _mmap_cursor(store_t *store) {
    struct mmap_store *mstore = (struct mmap_store*) store;
    return ck_pr_load_32(&mstore->write_cursor);
}
Example no. 12
store_cursor_t* _mmap_pop_cursor(store_t *store) {

    // This is really an mmap store
    struct mmap_store *mstore = (struct mmap_store*) store;

    // Assert invariants
    uint32_t syncing_and_writers = ck_pr_load_32(&mstore->syncing_and_writers);
    uint32_t syncing = EXTRACT_SYNCING(syncing_and_writers);
    uint32_t writers = EXTRACT_WRITERS(syncing_and_writers);
    ensure(writers == 0, "We should not be reading the store when there are still writers");
    ensure(syncing == 1, "We should not be reading the store before it has started syncing");
    ensure(ck_pr_load_32(&mstore->synced) == 1, "We should not be reading the store before it has been synced");

    // Open a blank cursor
    struct mmap_store_cursor* cursor = (struct mmap_store_cursor*) _mmap_open_cursor(store);

    // Save the current offset so we can try to CAS later
    uint32_t current_offset = ck_pr_load_32(&mstore->read_cursor);

    // If the first cursor has not been returned, don't advance.  Instead seek to the beginning.
    if (current_offset == -1) {

        uint32_t next_offset = store->start_cursor(store);

        // Seek to the read offset
        enum store_read_status ret = _mmap_cursor_seek((store_cursor_t*) cursor, next_offset);
        ensure(ret != END, "Failed to seek due to empty store");
        ensure(ret != UNSYNCED_STORE, "Failed to seek due to unsynced store");
        ensure(ret == SUCCESS, "Failed to seek");

        // Set the read cursor.  Note we are setting it to the offset of the thing we are reading,
        // because of the logic below
        if (ck_pr_cas_32(&mstore->read_cursor, current_offset, next_offset)) {
            return (store_cursor_t*) cursor;
        }

        // If we failed to CAS, reload the current offset and drop down to the normal logic below
        current_offset = ck_pr_load_32(&mstore->read_cursor);
    }

    // Seek to the current read offset
    enum store_read_status ret = _mmap_cursor_seek((store_cursor_t*) cursor, current_offset);
    ensure(ret != UNSYNCED_STORE, "Failed to seek due to unsynced store");
    ensure(ret == SUCCESS, "Failed to seek");

    // Save our offset so we can try to CAS
    uint32_t next_offset = cursor->next_offset;

    // This is our only way to advance, so we have to do this
    ret = _mmap_cursor_advance((store_cursor_t*) cursor);
    ensure(ret == SUCCESS || ret == END, "Failed to advance");

    // If we advanced successfully, try to CAS the read cursor
    while (ret != END) {

        // If we succeed, return the cursor we made
        if (ck_pr_cas_32(&mstore->read_cursor, current_offset, next_offset)) {
            return (store_cursor_t*) cursor;
        }

        // Otherwise, try again

        // Save the current offset so we can try to CAS later
        current_offset = ck_pr_load_32(&mstore->read_cursor);

        // Seek to the current read offset
        ret = _mmap_cursor_seek((store_cursor_t*) cursor, current_offset);
        ensure(ret == SUCCESS, "Failed to seek");

        // Save our offset so we can try to CAS
        next_offset = cursor->next_offset;

        // This is our only way to advance, so we have to do this
        ret = _mmap_cursor_advance((store_cursor_t*) cursor);
        ensure(ret == SUCCESS || ret == END, "Failed to advance");
    }

    ((store_cursor_t*) cursor)->destroy((store_cursor_t*) cursor);
    return NULL;
}
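A hypothetical drain loop over the pop-cursor API above: each returned cursor exposes the block's offset, size and data pointer (as set by __mmap_cursor_position), and NULL signals the synthetic end of data.

/* Hypothetical consumer: pop cursors until the store is exhausted. */
static void drain_store(store_t *store)
{
    store_cursor_t *cursor;
    while ((cursor = _mmap_pop_cursor(store)) != NULL) {
        /* cursor->data points at cursor->size payload bytes at cursor->offset */
        cursor->destroy(cursor);
    }
}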
Example no. 13
uint32_t _get_value(persistent_atomic_value_t *pav) {
    ck_rwlock_read_lock(pav->_lock);
    uint32_t current_value = ck_pr_load_32(&pav->_current_value);
    ck_rwlock_read_unlock(pav->_lock);
    return current_value;
}
int _compare_and_swap(persistent_atomic_value_t *pav, uint32_t old_value, uint32_t new_value) {
    // First lock this counter
    ck_rwlock_write_lock(pav->_lock);

    // Then, check to see if someone changed this value before we got here
    if (ck_pr_load_32(&pav->_current_value) != old_value) {
        ck_rwlock_write_unlock(pav->_lock);
        return -1;
    }

    // We got here first.  Set the new value.
    ck_pr_store_32(&pav->_current_value, new_value);

    // Now, persist the value
    // 1. Write it to a temporary file
    // 2. Delete the original file
    // 3. Link the temporary file to the original file
    // 4. Unlink the temporary file
    int fail = 0;

    // 1.
    int open_flags = O_RDWR | O_CREAT | O_EXCL | O_SYNC;
    int fd = open(pav->_temporary_filename, open_flags, (mode_t)0600);
    if (fd < 0) {
        fail = -2;
        goto end;
    }

    ssize_t nwritten = write(fd, &pav->_current_value, sizeof(pav->_current_value));
    if(fsync(fd) != 0) {
        fail = -2;
        close(fd);
        goto end;
    }
    close(fd);

    if (nwritten != (ssize_t)sizeof(pav->_current_value)) {
        fail = -2;
        goto end;
    }

    // 2.
    if(unlink(pav->_filename) != 0) {
        fail = -3;
        goto end;
    }

    // 3.
    if (link(pav->_temporary_filename, pav->_filename) != 0) fail = -4;

end:
    if (unlink(pav->_temporary_filename) != 0) fail = -5;

    if (fail != 0) {
        ck_pr_store_32(&pav->_current_value, old_value);
    }

    ck_rwlock_write_unlock(pav->_lock);
    // For now
    ensure(fail == 0, "Failed during persistent update");
    return fail;
}
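A hypothetical retry loop around the two functions above, to show the intended usage: reload the current value and retry while another thread wins the race (-1); any other non-zero return is a persistence failure (which, for now, also trips the ensure inside _compare_and_swap).

/* Hypothetical caller: atomically and durably increment the value. */
#include <stdint.h>

static int increment_persistent_value(persistent_atomic_value_t *pav)
{
    int rc;
    do {
        uint32_t current = _get_value(pav);
        rc = _compare_and_swap(pav, current, current + 1);
    } while (rc == -1);   /* lost the race; reload and retry */
    return rc;            /* 0 on success, < -1 on persistence failure */
}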