//----------------------------------------------------------------------- void PagingLandScapePageManager::processUnloadQueues() { // Check for pages that need to be unloaded. // if touched, that means they didn't have been touch by any cameras // for several frames and thus need to be unloaded. // LIST CHECKS for (PagingLandScapePageList::iterator itl = mLoadedPages.begin(); itl != mLoadedPages.end();) { if ((*itl)->unloadUntouched()) { releasePage(*itl); itl = mLoadedPages.erase(itl); } else { ++itl; } } // QUEUES CHECKS // check queues for page that need to be excluded from queues PagingLandScapePage* p = 0; for (PagingLandScapeQueue<PagingLandScapePage>::MsgQueType::iterator itq = mPageLoadQueue.begin(); itq != mPageLoadQueue.end();) { assert(!(*itq)->isLoaded()); assert((*itq)->isInLoadQueue()); if ((*itq)->unloadUntouched()) { p = *itq; // remove from queue p->setInQueue(PagingLandScapePage::QUEUE_NONE); itq = mPageLoadQueue.erase(itq); // remove from active pages //(must be removed from queue first) releasePage(p); } else { ++itq; } } }
// Write up to `remaining` bytes from `buf` at the current write offset.
// Writes at most one chunk (a chunk never crosses a page boundary), and
// returns the number of bytes actually written; 0 means no space left.
// If `latch_p` is non-NULL, the page that was written is handed back
// through *latch_p, write-latched and still pinned — the caller must
// unlock and releasePage() it. Otherwise the page is released here.
size_t dataPage::write_bytes(const byte * buf, ssize_t remaining, Page ** latch_p) {
  if(latch_p) {
    *latch_p = NULL;
  }
  // Translate the logical write offset into (page id, in-page slot, max bytes).
  recordid chunk = calc_chunk_from_offset(write_offset_);
  if(chunk.size > remaining) {
    // Only write as much as the caller actually has.
    chunk.size = remaining;
  }
  if(chunk.page >= first_page_ + page_count_) {
    chunk.size = 0; // no space (should not happen)
  } else {
    Page *p = alloc_ ? alloc_->load_page(xid_, chunk.page) : loadPage(xid_, chunk.page);
    assert(chunk.size);
    memcpy(data_at_offset_ptr(p, chunk.slot), buf, chunk.size);
    // Bump the page LSN so recovery knows this page changed.
    stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));
    if(latch_p && !*latch_p) {
      // Caller wants the page back latched; keep it pinned.
      writelock(p->rwlatch,0);
      *latch_p = p;
    } else {
      releasePage(p);
    }
    write_offset_ += chunk.size;
  }
  return chunk.size;
}
// Initialize `pageid` as a fresh, empty DATA_PAGE: zero the payload area,
// mark it as the (current) last page of the datapage, and write a
// zero-length record terminator at the current write offset.
// FIX: the CHECK_FOR_SCRIBBLING branch referenced `alloc` instead of the
// member `alloc_` (every other method uses `alloc_`), so it did not compile
// when that diagnostic macro was enabled.
void dataPage::initialize_page(pageid_t pageid) {
  //load the first page
  Page *p;
#ifdef CHECK_FOR_SCRIBBLING
  // Diagnostic build: detect two datapages being allocated on the same page.
  p = alloc_ ? alloc_->load_page(xid_, pageid) : loadPage(xid_, pageid);
  if(*stasis_page_type_ptr(p) == DATA_PAGE) {
    printf("Collision on page %lld\n", (long long)pageid);
    fflush(stdout);
    assert(*stasis_page_type_ptr(p) != DATA_PAGE);
  }
#else
  p = loadUninitializedPage(xid_, pageid);
#endif
  DEBUG("\t\t\t\t\t\t->%lld\n", pageid);

  //initialize header
  p->pageType = DATA_PAGE;

  //clear page (arranges for null-padding)  XXX null pad more carefully and use sentinel value instead?
  memset(p->memAddr, 0, PAGE_SIZE);

  //we're the last page for now.
  *is_another_page_ptr(p) = 0;

  //write 0 to first data size (the end-of-data sentinel read by getnext())
  *length_at_offset_ptr(p, calc_chunk_from_offset(write_offset_).slot) = 0;

  //set the page dirty
  stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));

  releasePage(p);
}
int main(int argc, char * argv[]) { if(argc != 3) { printf(usage, argv[0]); abort(); } char * endptr; numthreads = strtoul(argv[1], &endptr, 10); if(*endptr != 0) { printf(usage, argv[0]); abort(); } numops= strtoul(argv[2], &endptr, 10) / numthreads; if(*endptr != 0) { printf(usage, argv[0]); abort(); } pthread_t workers[numthreads]; Page * p; Tinit(); dpt = stasis_runtime_dirty_page_table(); p = loadPage(-1,0); for(int i = 0; i < numthreads; i++) { pthread_create(&workers[i], 0, worker, p); } for(int i = 0; i < numthreads; i++) { pthread_join(workers[i], 0); } releasePage(p); Tdeinit(); }
// Walk the boundary tags of all existing regions and, for every region owned
// by the TALLOC storage manager, register each of its slotted pages (and its
// current freespace) with the allocation policy. Called at startup so Talloc
// can reuse space allocated in previous runs.
static void stasis_alloc_register_old_regions(stasis_alloc_t* alloc) {
  pageid_t boundary = REGION_FIRST_TAG;
  boundary_tag t;
  DEBUG("registering old regions\n");
  int succ = TregionReadBoundaryTag(-1, boundary, &t);
  if(succ) {
    // Iterate over all regions via their boundary tags.
    do {
      DEBUG("boundary tag %lld type %d\n", boundary, t.allocation_manager);
      if(t.allocation_manager == STORAGE_MANAGER_TALLOC) {
        // Register every page in this TALLOC region.
        for(pageid_t i = 0; i < t.size; i++) {
          Page * p = loadPage(-1, boundary + i);
          readlock(p->rwlatch,0);
          if(p->pageType == SLOTTED_PAGE) {
            stasis_allocation_policy_register_new_page(alloc->allocPolicy, p->id, stasis_record_freespace(-1, p));
            DEBUG("registered page %lld\n", boundary+i);
          } else {
            // TALLOC regions are expected to contain only slotted pages;
            // anything else indicates corruption.
            abort();
          }
          unlock(p->rwlatch);
          releasePage(p);
        }
      }
    } while(TregionNextBoundaryTag(-1, &boundary, &t, 0)); //STORAGE_MANAGER_TALLOC)) {
  }
}
/* Return the slot type of record `rid` (e.g. BLOB_SLOT or a normal slot),
 * read under a shared latch on the record's page. */
int TrecordType(int xid, recordid rid) {
  Page * p = loadPage(xid, rid.page);
  readlock(p->rwlatch, 0);
  const int type = stasis_record_type_read(xid, p, rid);
  unlock(p->rwlatch);
  releasePage(p);
  return type;
}
// Return the next tuple stored in the datapage, advancing read_offset_ past
// it, or NULL at end-of-data (len == 0 sentinel) or on a truncated record.
// The caller owns the returned tuple (free with dataTuple::freetuple()).
dataTuple* dataPage::iterator::getnext() {
  len_t len;
  bool succ;
  if(dp == NULL) { return NULL; }

  // XXX hack: read latch the page that the record will live on.
  // This should be handled by a read_data_in_latch function, or something...
  Page * p = loadPage(dp->xid_, dp->calc_chunk_from_offset(read_offset_).page);
  readlock(p->rwlatch, 0);

  // Read the length prefix; len == 0 is the end-of-data sentinel written by
  // initialize_page().
  succ = dp->read_data((byte*)&len, read_offset_, sizeof(len));
  if((!succ) || (len == 0)) {
    unlock(p->rwlatch);
    releasePage(p);
    return NULL;
  }
  read_offset_ += sizeof(len);

  byte * buf = (byte*)malloc(len);
  succ = dp->read_data(buf, read_offset_, len);

  // release hacky latch
  unlock(p->rwlatch);
  releasePage(p);

  if(!succ) {
    // Payload truncated: rewind so a later retry re-reads the length prefix.
    read_offset_ -= sizeof(len);
    free(buf);
    return NULL;
  }

  read_offset_ += len;
  dataTuple *ret = dataTuple::from_bytes(buf);
  free(buf);
  return ret;
}
// Free the blob slot `index` in the bucket page `pageRef`: return the slot to
// the bucket's free list, move the bucket between the full/free bucket sets
// as its occupancy changes, and release the page entirely once empty.
// FIX: the bucket-set erase()/insert() calls were wrapped in assert(), so a
// build with NDEBUG would compile the bookkeeping away entirely. The mutating
// calls now run unconditionally; only their results are asserted.
void freeIndex(NativeNaturalType index, PageRefType pageRef) {
    assert(getSize(index) > 0);
    if(isFull()) {
        // Bucket transitions full -> has-free-slot: move it from the full set
        // to the per-type free set.
        bool erased = superPage->fullBlobBuckets.erase<Key>(pageRef);
        assert(erased);
        (void)erased;
        bool inserted = superPage->freeBlobBuckets[header.type].insert(pageRef);
        assert(inserted);
        (void)inserted;
    }
    --header.count;
    if(isEmpty()) {
        // Last slot freed: drop the bucket from the free set and give the
        // page back to the allocator.
        bool erased = superPage->freeBlobBuckets[header.type].erase<Key>(pageRef);
        assert(erased);
        (void)erased;
        releasePage(pageRef);
    } else {
        // Push the slot onto the bucket-local free list.
        setSize(index, 0);
        setSymbol(index, header.freeIndex);
        header.freeIndex = index;
    }
}
/* Return the logical size of record `rid` in bytes. For blob records the
 * on-page slot holds a blob_record_t descriptor, so the real payload size is
 * read from that descriptor; otherwise the slot length itself is the size. */
int TrecordSize(int xid, recordid rid) {
  Page * p = loadPage(xid, rid.page);
  readlock(p->rwlatch, 0);

  rid.size = stasis_record_length_read(xid, p, rid);

  int result;
  if(stasis_record_type_read(xid, p, rid) == BLOB_SLOT) {
    blob_record_t rec;
    stasis_record_read(xid, p, rid, (byte*)&rec);
    result = rec.size;
  } else {
    result = rid.size;
  }

  unlock(p->rwlatch);
  releasePage(p);
  return result;
}
// Read-only constructor: open an existing datapage whose first page is `pid`.
// page_count_ starts at 1 and grows lazily as read_bytes() discovers
// continuation pages; write_offset_ of -1 marks the datapage as non-writable.
dataPage::dataPage(int xid, regionAllocator * alloc, pageid_t pid): // XXX Hack!! The read-only constructor signature is too close to the other's
  xid_(xid),
  page_count_(1), // will be opportunistically incremented as we scan the datapage.
  initial_page_count_(-1), // used by append.
  alloc_(alloc), // read-only, and we don't free data pages one at a time.
  first_page_(pid),
  write_offset_(-1)
{
  assert(pid!=0);
  Page *p = alloc_ ? alloc_->load_page(xid, first_page_) : loadPage(xid, first_page_);
  // Sanity check the continuation marker: 0 = single-page datapage,
  // 2 = first page of a multi-page datapage. A value of 1 means `pid`
  // points into the middle of a datapage, which is a caller bug.
  if(!(*is_another_page_ptr(p) == 0 || *is_another_page_ptr(p) == 2)) {
    printf("Page %lld is not the start of a datapage\n", first_page_);
    fflush(stdout);
    abort();
  }
  assert(*is_another_page_ptr(p) == 0 || *is_another_page_ptr(p) == 2); // would be 1 for page in the middle of a datapage
  releasePage(p);
}
// Allocate a record of `size` bytes on a specific page, if the allocation
// policy permits this xid to allocate there. Returns NULLRID if the policy
// refuses; returns a rid with size == INVALID_SLOT if the page lacks space.
recordid TallocFromPage(int xid, pageid_t page, unsigned long size) {
  stasis_alloc_t* alloc = stasis_runtime_alloc_state();
  short type;
  if(size >= BLOB_THRESHOLD_SIZE) {
    // Large records are stored out-of-line as blobs.
    type = BLOB_SLOT;
  } else {
    assert(size > 0);
    // Small records encode their length directly as the slot type.
    type = size;
  }
  pthread_mutex_lock(&alloc->mut);
  if(!stasis_allocation_policy_can_xid_alloc_from_page(alloc->allocPolicy, xid, page)) {
    pthread_mutex_unlock(&alloc->mut);
    return NULLRID;
  }
  Page * p = loadPage(xid, page);
  writelock(p->rwlatch,0);
  recordid rid = stasis_record_alloc_begin(xid, p, type);
  if(rid.size != INVALID_SLOT) {
    stasis_record_alloc_done(xid,p,rid);
    stasis_allocation_policy_alloced_from_page(alloc->allocPolicy, xid, page);
    // The page latch must be dropped before logging via Tupdate; the page
    // stays pinned until releasePage() below.
    unlock(p->rwlatch);

    alloc_arg a = { rid.slot, type };

    Tupdate(xid, rid.page, &a, sizeof(a), OPERATION_ALLOC);

    if(type == BLOB_SLOT) {
      // Report the blob's logical size to the caller and allocate its storage.
      rid.size = size;
      stasis_blob_alloc(xid,rid);
    }
  } else {
    unlock(p->rwlatch);
  }
  releasePage(p);
  pthread_mutex_unlock(&alloc->mut);
  // Arrange for the allocation policy to be notified at commit time.
  stasis_transaction_table_set_argument(alloc->xact_table, xid, alloc->callback_id, AT_COMMIT, alloc);
  return rid;
}
// Allocate a fresh TALLOC region, initialize each of its pages as an empty
// slotted page, and register them all with the allocation policy. Runs
// inside a nested top action so the region setup commits independently of
// the caller's transaction outcome.
static void stasis_alloc_reserve_new_region(stasis_alloc_t* alloc, int xid) {
   void* nta = TbeginNestedTopAction(xid, OPERATION_NOOP, 0,0);

   pageid_t firstPage = TregionAlloc(xid, TALLOC_REGION_SIZE, STORAGE_MANAGER_TALLOC);
   int initialFreespace = -1;

   for(pageid_t i = 0; i < TALLOC_REGION_SIZE; i++) {
     TinitializeSlottedPage(xid, firstPage + i);
     if(initialFreespace == -1) {
       // Every freshly-initialized slotted page has identical freespace, so
       // measure it once on the first page (i == 0, hence firstPage) and
       // reuse the value for the rest of the region.
       Page * p = loadPage(xid, firstPage);
       readlock(p->rwlatch,0);
       initialFreespace = stasis_record_freespace(xid, p);
       unlock(p->rwlatch);
       releasePage(p);
     }
     stasis_allocation_policy_register_new_page(alloc->allocPolicy, firstPage + i, initialFreespace);
   }

   TendNestedTopAction(xid, nta);
}
// Read up to `remaining` bytes starting at logical `offset` into `buf`.
// Reads at most one chunk (a chunk never crosses a page boundary); returns
// the number of bytes copied, or 0 at end-of-file.
size_t dataPage::read_bytes(byte * buf, off_t offset, ssize_t remaining) {
  recordid chunk = calc_chunk_from_offset(offset);
  if(chunk.size > remaining) {
    // Only return as many bytes as the caller asked for.
    chunk.size = remaining;
  }
  if(chunk.page >= first_page_ + page_count_) {
    chunk.size = 0; // eof
  } else {
    Page *p = alloc_ ? alloc_->load_page(xid_, chunk.page) : loadPage(xid_, chunk.page);
    if(p->pageType != DATA_PAGE) {
      fprintf(stderr, "Page type %d, id %lld lsn %lld\n", (int)p->pageType, (long long)p->id, (long long)p->LSN);
      assert(p->pageType == DATA_PAGE);
    }
    // Opportunistic discovery (see read-only constructor): if this is the
    // last page we know about and its header says another page follows,
    // extend page_count_ so subsequent reads can continue.
    if((chunk.page + 1 == page_count_ + first_page_) && (*is_another_page_ptr(p))) {
      page_count_++;
    }
    memcpy(buf, data_at_offset_ptr(p, chunk.slot), chunk.size);
    releasePage(p);
  }
  return chunk.size;
}
// Write `len` bytes, spanning pages as needed (growing the datapage when
// `init_next` is set). If `latch` is true, returns the page that received
// the FIRST chunk, write-latched and pinned (caller must unlock + release);
// if `latch` is false, returns the non-NULL sentinel (Page*)1 on success.
// Returns 0 on failure (out of space), having released any held latch.
Page * dataPage::write_data_and_latch(const byte * buf, size_t len, bool init_next, bool latch) {
  bool first = true;
  Page * p = 0;
  while(1) {
    assert(len > 0);
    size_t written;
    if(latch && first ) {
      // Only the first chunk's page is handed back to the caller latched.
      written = write_bytes(buf, len, &p);
    } else {
      written = write_bytes(buf, len);
    }
    if(written == 0) {
      assert(!p);
      return 0; // fail
    }
    if(written == len) {
      if(latch) {
        return p;
      } else {
        return (Page*)1;
      }
    }
    if(len > PAGE_SIZE && ! first) {
      // After the first (possibly partial) chunk, writes are page-aligned,
      // so each subsequent chunk should be close to a full page.
      assert(written > 4000);
    }
    buf += written;
    len -= written;
    if(init_next) {
      if(!initialize_next_page()) {
        // Could not grow the datapage; drop the latch we may be holding.
        if(p) {
          unlock(p->rwlatch);
          releasePage(p);
        }
        return 0; // fail
      }
    }
    first = false;
  }
}
// Release whole pages from the front of the active list until up to
// `maxBytes` bytes have been freed. Stops early rather than split a page.
// Returns the number of bytes actually released.
size_t PageCache::releaseFromStart(size_t maxBytes) {
    size_t freed = 0;

    while (!mActivePages.empty() && maxBytes > 0) {
        List<Page *>::iterator head = mActivePages.begin();
        Page *victim = *head;

        // Never release a page larger than the remaining budget.
        if (maxBytes < victim->mSize) {
            break;
        }

        mActivePages.erase(head);
        maxBytes -= victim->mSize;
        freed += victim->mSize;
        releasePage(victim);
    }

    mTotalSize -= freed;
    return freed;
}
// Grow the datapage by one page. Extends the region by a page, marks the
// previous page's header as "continued", and initializes the new page.
// Returns false if the region is full.
bool dataPage::initialize_next_page() {
  recordid rid = calc_chunk_from_offset(write_offset_);
  // Growth only happens on a page boundary.
  assert(rid.slot == 0);
  DEBUG("\t\t%lld\n", (long long)rid.page);

  if(rid.page >= first_page_ + page_count_) {
    assert(rid.page == first_page_ + page_count_);
    if(alloc_->grow_extent(1)) {
      page_count_++;
    } else {
      return false; // The region is full
    }
  } else {
    // write_offset_ points inside the already-allocated range; that should
    // be impossible when this is called.
    abort();
  }

  // Update the previous page's continuation marker: 2 if it is the
  // datapage's first page, 1 for a middle page (cf. the read-only ctor).
  Page *p = alloc_ ? alloc_->load_page(xid_, rid.page-1) : loadPage(xid_, rid.page-1);
  *is_another_page_ptr(p) = (rid.page-1 == first_page_) ? 2 : 1;
  stasis_page_lsn_write(xid_, p, alloc_->get_lsn(xid_));
  releasePage(p);

  initialize_page(rid.page);
  return true;
}
int main (int argc, char * argv[]) { double MB = 1024 * 1024; uint64_t mb = 20000; // size of run, in megabytes. enum run_type mode = ALL; const uint64_t num_pages = mb * (MB / PAGE_SIZE); stasis_buffer_manager_size = (512 * MB) / PAGE_SIZE; // stasis_buffer_manager_hint_writes_are_sequential = 1; // stasis_dirty_page_table_flush_quantum = (8 * MB) / PAGE_SIZE; // XXX if set to high-> segfault // stasis_dirty_page_count_hard_limit = (16 * MB) / PAGE_SIZE; // stasis_dirty_page_count_soft_limit = (10 * MB) / PAGE_SIZE; // stasis_dirty_page_low_water_mark = (8 * MB) / PAGE_SIZE; // Hard disk preferred. /* stasis_dirty_page_table_flush_quantum = (4 * MB) / PAGE_SIZE; // XXX if set to high-> segfault stasis_dirty_page_count_hard_limit = (12 * MB) / PAGE_SIZE; stasis_dirty_page_count_soft_limit = (8 * MB) / PAGE_SIZE; stasis_dirty_page_low_water_mark = (4 * MB) / PAGE_SIZE;*/ // SSD preferred. stasis_dirty_page_table_flush_quantum = (4 * MB) / PAGE_SIZE; // XXX if set to high-> segfault stasis_dirty_page_count_hard_limit = (40 * MB) / PAGE_SIZE; stasis_dirty_page_count_soft_limit = (32 * MB) / PAGE_SIZE; stasis_dirty_page_low_water_mark = (16 * MB) / PAGE_SIZE; stasis_dirty_page_table_flush_quantum = (4 * MB) / PAGE_SIZE; // XXX if set to high-> segfault stasis_dirty_page_count_hard_limit = (48 * MB) / PAGE_SIZE; stasis_dirty_page_count_soft_limit = (40 * MB) / PAGE_SIZE; stasis_dirty_page_low_water_mark = (32 * MB) / PAGE_SIZE; printf("stasis_buffer_manager_size=%lld\n", (long long)stasis_buffer_manager_size * PAGE_SIZE); printf("Hard limit=%lld\n", (long long)((stasis_dirty_page_count_hard_limit*PAGE_SIZE)/MB)); printf("Hard limit is %f pct.\n", 100.0 * ((double)stasis_dirty_page_count_hard_limit)/((double)stasis_buffer_manager_size)); bLSM::init_stasis(); regionAllocator * readableAlloc = NULL; if(!mode) { int xid = Tbegin(); regionAllocator * alloc = new regionAllocator(xid, num_pages); printf("Starting first write of %lld mb\n", (long long)mb); struct timeval start, 
start_sync, stop; double elapsed; gettimeofday(&start, 0); pageid_t extent = alloc->alloc_extent(xid, num_pages); for(uint64_t i = 0; i < num_pages; i++) { Page * p = loadUninitializedPage(xid, i+extent); stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); releasePage(p); } gettimeofday(&start_sync,0); alloc->force_regions(xid); readableAlloc = alloc; Tcommit(xid); // alloc = new RegionAllocator(xid, num_pages); gettimeofday(&stop, 0); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)mb)/elapsed); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, start_sync))); } if(!mode) { int xid = Tbegin(); regionAllocator * alloc = new regionAllocator(xid, num_pages); printf("Starting write with parallel read of %lld mb\n", (long long)mb); struct timeval start, start_sync, stop; double elapsed; gettimeofday(&start, 0); pageid_t region_length; pageid_t region_count; pageid_t * old_extents = readableAlloc->list_regions(xid, ®ion_length, ®ion_count); pageid_t extent = alloc->alloc_extent(xid, num_pages); assert(region_count == 1); for(uint64_t i = 0; i < num_pages/2; i++) { Page * p = loadUninitializedPage(xid, i+extent); stasis_dirty_page_table_set_dirty((stasis_dirty_page_table_t*)stasis_runtime_dirty_page_table(), p); releasePage(p); p = loadPage(xid, i+old_extents[0]); releasePage(p); } gettimeofday(&start_sync,0); alloc->force_regions(xid); delete alloc; Tcommit(xid); // alloc = new RegionAllocator(xid, num_pages); gettimeofday(&stop, 0); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)mb)/elapsed); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, start_sync))); } if(!mode) { int xid = Tbegin(); struct timeval start, start_sync, stop; double elapsed; printf("Starting write 
of giant datapage\n"); gettimeofday(&start, 0); regionAllocator * alloc = new regionAllocator(xid, num_pages); dataPage * dp = new DataPage(xid, num_pages-1, alloc); byte * key = (byte*)calloc(100, 1); byte * val = (byte*)calloc(900, 1); dataTuple * tup = dataTuple::create(key, 100, val, 900); free(key); free(val); while(1) { if(!dp->append(tup)) { break; } } gettimeofday(&start_sync,0); alloc->force_regions(xid); gettimeofday(&stop, 0); Tcommit(xid); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)mb)/elapsed); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, start_sync))); } if(!mode) { int xid = Tbegin(); struct timeval start, start_sync, stop; double elapsed; printf("Starting write of many small datapages\n"); gettimeofday(&start, 0); regionAllocator * alloc = new regionAllocator(xid, num_pages); byte * key = (byte*)calloc(100, 1); byte * val = (byte*)calloc(900, 1); dataTuple * tup = dataTuple::create(key, 100, val, 900); free(key); free(val); dataPage * dp = 0; uint64_t this_count = 0; uint64_t count = 0; uint64_t dp_count = 0; while((count * 1000) < (mb * 1024*1024)) { if((!dp) || !dp->append(tup)) { dp = new DataPage(xid, 2, alloc); dp_count++; } count++; this_count++; // if(((this_count * 1000) > (1024 * 1024 * 16))) { // alloc->force_regions(xid); // this_count = 0; // gettimeofday(&stop, 0); // elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); // printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(1024*1024*elapsed)); // } } gettimeofday(&start_sync,0); alloc->force_regions(xid); gettimeofday(&stop, 0); Tcommit(xid); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(elapsed*1024*1024)); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, 
start_sync))); } if(!mode) { int xid = Tbegin(); struct timeval start, start_sync, stop; double elapsed; printf("Starting two parallel writes of many small datapages\n"); gettimeofday(&start, 0); regionAllocator * alloc = new regionAllocator(xid, num_pages/2); regionAllocator * alloc2 = new regionAllocator(xid, num_pages/2); byte * key = (byte*)calloc(100, 1); byte * val = (byte*)calloc(900, 1); dataTuple * tup = dataTuple::create(key, 100, val, 900); free(key); free(val); dataPage * dp = 0; dataPage * dp2 = 0; uint64_t this_count = 0; uint64_t count = 0; uint64_t dp_count = 0; while((count * 1000) < (mb * 1024*1024)) { if((!dp) || !dp->append(tup)) { dp = new DataPage(xid, 2, alloc); dp_count++; } if((!dp2) || !dp2->append(tup)) { dp2 = new DataPage(xid, 2, alloc2); //dp_count++; } count += 2; this_count++; // if(((this_count * 1000) > (1024 * 1024 * 16))) { // alloc->force_regions(xid); // this_count = 0; // gettimeofday(&stop, 0); // elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); // printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(1024*1024*elapsed)); // } } gettimeofday(&start_sync,0); alloc->force_regions(xid); alloc2->force_regions(xid); gettimeofday(&stop, 0); Tcommit(xid); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(elapsed*1024*1024)); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, start_sync))); } regionAllocator * read_alloc = NULL; regionAllocator * read_alloc2 = NULL; regionAllocator * read_alloc3 = NULL; regionAllocator * read_alloc4 = NULL; if(!mode) { int xid = Tbegin(); struct timeval start, start_sync, stop; double elapsed; printf("Starting four parallel writes of many small datapages\n"); gettimeofday(&start, 0); regionAllocator * alloc = new regionAllocator(xid, num_pages/4); regionAllocator * alloc2 = new regionAllocator(xid, num_pages/4); 
regionAllocator * alloc3 = new regionAllocator(xid, num_pages/4); regionAllocator * alloc4 = new regionAllocator(xid, num_pages/4); byte * key = (byte*)calloc(100, 1); byte * val = (byte*)calloc(900, 1); dataTuple * tup = dataTuple::create(key, 100, val, 900); free(key); free(val); dataPage * dp = 0; dataPage * dp2 = 0; dataPage * dp3 = 0; dataPage * dp4 = 0; uint64_t this_count = 0; uint64_t count = 0; uint64_t dp_count = 0; while((count * 1000) < (mb * 1024*1024)) { if((!dp) || !dp->append(tup)) { dp = new DataPage(xid, 2, alloc); dp_count++; } if((!dp2) || !dp2->append(tup)) { dp2 = new DataPage(xid, 2, alloc2); //dp_count++; } if((!dp3) || !dp3->append(tup)) { dp3 = new DataPage(xid, 2, alloc3); //dp_count++; } if((!dp4) || !dp4->append(tup)) { dp4 = new DataPage(xid, 2, alloc4); //dp_count++; } count += 4; this_count++; // if(((this_count * 1000) > (1024 * 1024 * 16))) { // alloc->force_regions(xid); // this_count = 0; // gettimeofday(&stop, 0); // elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); // printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(1024*1024*elapsed)); // } } gettimeofday(&start_sync,0); alloc->force_regions(xid); alloc2->force_regions(xid); alloc3->force_regions(xid); alloc4->force_regions(xid); gettimeofday(&stop, 0); Tcommit(xid); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(elapsed*1024*1024)); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, start_sync))); read_alloc = alloc; read_alloc2 = alloc2; read_alloc3 = alloc3; read_alloc4 = alloc4; } if(!mode) { int xid = Tbegin(); struct timeval start, start_sync, stop; double elapsed; printf("Starting four parallel writes of many small datapages\n"); gettimeofday(&start, 0); regionAllocator * alloc = new regionAllocator(xid, num_pages/4); regionAllocator * alloc2 = new regionAllocator(xid, 
num_pages/4); regionAllocator * alloc3 = new regionAllocator(xid, num_pages/4); regionAllocator * alloc4 = new regionAllocator(xid, num_pages/4); byte * key = (byte*)calloc(100, 1); byte * val = (byte*)calloc(900, 1); dataTuple * tup = dataTuple::create(key, 100, val, 900); free(key); free(val); dataPage * dp = 0; dataPage * dp2 = 0; dataPage * dp3 = 0; dataPage * dp4 = 0; uint64_t this_count = 0; uint64_t count = 0; uint64_t dp_count = 0; pageid_t n1, n2, n3, n4; pageid_t l1, l2, l3, l4; pageid_t * regions1, * regions2, * regions3, * regions4; regions1 = read_alloc->list_regions(xid, &l1, &n1); regions2 = read_alloc2->list_regions(xid, &l2, &n2); regions3 = read_alloc3->list_regions(xid, &l3, &n3); regions4 = read_alloc4->list_regions(xid, &l4, &n4); pageid_t i1 = regions1[0]; pageid_t i2 = regions2[0]; pageid_t i3 = regions3[0]; pageid_t i4 = regions4[0]; dataPage * rdp = new DataPage(xid, 0, i1); dataPage * rdp2 = new DataPage(xid, 0, i2); dataPage * rdp3 = new DataPage(xid, 0, i3); dataPage * rdp4 = new DataPage(xid, 0, i4); dataPage::iterator it1 = rdp->begin(); dataPage::iterator it2 = rdp2->begin(); dataPage::iterator it3 = rdp3->begin(); dataPage::iterator it4 = rdp4->begin(); while((count * 1000) < (mb * 1024*1024)) { if((!dp) || !dp->append(tup)) { dp = new DataPage(xid, 2, alloc); dp_count++; } if((!dp2) || !dp2->append(tup)) { dp2 = new DataPage(xid, 2, alloc2); //dp_count++; } if((!dp3) || !dp3->append(tup)) { dp3 = new DataPage(xid, 2, alloc3); //dp_count++; } if((!dp4) || !dp4->append(tup)) { dp4 = new DataPage(xid, 2, alloc4); //dp_count++; } dataTuple * t; if((!rdp) || !(t = it1.getnext())) { i1+= rdp->get_page_count(); if(rdp) delete rdp; rdp = new DataPage(xid, 0, i1); // i1++; it1 = rdp->begin(); t = it1.getnext(); } if(t) dataTuple::freetuple(t); if((!rdp2) || !(t = it2.getnext())) { i2+= rdp2->get_page_count(); if(rdp2) delete rdp2; rdp2 = new DataPage(xid, 0, i2); // i2++; it2 = rdp2->begin(); t = it2.getnext(); } if(t) 
dataTuple::freetuple(t); if((!rdp3) || !(t = it3.getnext())) { i3+= rdp3->get_page_count(); if(rdp3) delete rdp3; rdp3 = new DataPage(xid, 0, i3); // i3++; it3 = rdp3->begin(); t = it3.getnext(); } if(t) dataTuple::freetuple(t); if((!rdp4) || !(t = it4.getnext())) { i4+= rdp4->get_page_count(); if(rdp4) delete rdp4; rdp4 = new DataPage(xid, 0, i4); // i4++; it4 = rdp4->begin(); t = it4.getnext(); } if(t) dataTuple::freetuple(t); count += 8; this_count++; // if(((this_count * 1000) > (1024 * 1024 * 16))) { // alloc->force_regions(xid); // this_count = 0; // gettimeofday(&stop, 0); // elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); // printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(1024*1024*elapsed)); // } } gettimeofday(&start_sync,0); alloc->force_regions(xid); alloc2->force_regions(xid); alloc3->force_regions(xid); alloc4->force_regions(xid); gettimeofday(&stop, 0); Tcommit(xid); elapsed = stasis_timeval_to_double(stasis_subtract_timeval(stop, start)); printf("Write took %f seconds (%f mb/sec)\n", elapsed, ((double)(count*1000))/(elapsed*1024*1024)); printf("Sync took %f seconds.\n", stasis_timeval_to_double(stasis_subtract_timeval(stop, start_sync))); read_alloc = alloc; read_alloc2 = alloc2; read_alloc3 = alloc3; read_alloc4 = alloc4; } bLSM::deinit_stasis(); }
// Allocate a record of `size` bytes, letting the allocation policy choose a
// page (reserving a fresh region when no page qualifies). Compacts and, if
// necessary, abandons pages that turn out to be too full. Records of
// BLOB_THRESHOLD_SIZE bytes or more are allocated as blobs.
recordid Talloc(int xid, unsigned long size) {
  stasis_alloc_t* alloc = stasis_runtime_alloc_state();
  short type;
  if(size >= BLOB_THRESHOLD_SIZE) {
    type = BLOB_SLOT;
  } else {
    // NOTE(review): `size` is unsigned, so this assert is vacuous; cf.
    // TallocFromPage, which asserts size > 0 — confirm which is intended.
    assert(size >= 0);
    // Small records encode their length directly as the slot type.
    type = size;
  }

  recordid rid;

  pthread_mutex_lock(&alloc->mut);

  pageid_t pageid = stasis_allocation_policy_pick_suitable_page(alloc->allocPolicy, xid, stasis_record_type_to_size(type));

  if(pageid == INVALID_PAGE) {
    // No existing page can hold the record; carve out a fresh region.
    stasis_alloc_reserve_new_region(alloc, xid);
    pageid = stasis_allocation_policy_pick_suitable_page(alloc->allocPolicy, xid, stasis_record_type_to_size(type));
  }
  alloc->lastFreepage = pageid;

  Page * p = loadPage(xid, alloc->lastFreepage);

  writelock(p->rwlatch, 0);

  int rec_size = stasis_record_type_to_size(type);
  if(rec_size < 4) { rec_size = 4; }

  // The policy's freespace estimate may be stale; verify, compact, and move
  // on to another page if this one really is too full.
  while(stasis_record_freespace(xid, p) < rec_size) {
    stasis_record_compact(p);
    int newFreespace = stasis_record_freespace(xid, p);

    if(newFreespace >= rec_size) {
      break;
    }

    // Give up on this page: publish its real freespace and pick another.
    unlock(p->rwlatch);
    stasis_allocation_policy_update_freespace(alloc->allocPolicy, pageid, newFreespace);
    releasePage(p);

    pageid = stasis_allocation_policy_pick_suitable_page(alloc->allocPolicy, xid, rec_size);

    if(pageid == INVALID_PAGE) {
      stasis_alloc_reserve_new_region(alloc, xid);
      pageid = stasis_allocation_policy_pick_suitable_page(alloc->allocPolicy, xid, rec_size);
    }

    alloc->lastFreepage = pageid;

    p = loadPage(xid, alloc->lastFreepage);
    writelock(p->rwlatch, 0);
  }

  rid = stasis_record_alloc_begin(xid, p, type);

  assert(rid.size != INVALID_SLOT);

  stasis_record_alloc_done(xid, p, rid);
  int newFreespace = stasis_record_freespace(xid, p);
  stasis_allocation_policy_alloced_from_page(alloc->allocPolicy, xid, pageid);
  stasis_allocation_policy_update_freespace(alloc->allocPolicy, pageid, newFreespace);
  // The page latch must be dropped before logging via Tupdate; the page
  // stays pinned until releasePage() below.
  unlock(p->rwlatch);

  alloc_arg a = { rid.slot, type };

  Tupdate(xid, rid.page, &a, sizeof(a), OPERATION_ALLOC);

  if(type == BLOB_SLOT) {
    // Report the blob's logical size to the caller and allocate its storage.
    rid.size = size;
    stasis_blob_alloc(xid, rid);
  }

  releasePage(p);
  pthread_mutex_unlock(&alloc->mut);
  // Arrange for the allocation policy to be notified at commit time.
  stasis_transaction_table_set_argument(alloc->xact_table, xid, alloc->callback_id, AT_COMMIT, alloc);
  return rid; // TODO return NULLRID on error
}
// Deallocate record `rid`. Captures a preimage of the record (so the
// deallocation can be rolled back / replayed), logs it via Tupdate, and
// frees blob storage when the record was a blob.
void Tdealloc(int xid, recordid rid) {
  stasis_alloc_t* alloc = stasis_runtime_alloc_state();

  // @todo this needs to garbage collect empty storage regions.

  pthread_mutex_lock(&alloc->mut);

  Page * p = loadPage(xid, rid.page);

  readlock(p->rwlatch,0);

  recordid newrid = stasis_record_dereference(xid, p, rid);
  stasis_allocation_policy_dealloced_from_page(alloc->allocPolicy, xid, newrid.page);

  int64_t size = stasis_record_length_read(xid,p,rid);
  int64_t type = stasis_record_type_read(xid,p,rid);

  if(type == NORMAL_SLOT) { type = size; }

  // Build the log payload: {slot, type} header followed by the record bytes.
  byte * preimage = malloc(sizeof(alloc_arg)+size);

  ((alloc_arg*)preimage)->slot = rid.slot;
  ((alloc_arg*)preimage)->type = type;

  // stasis_record_read() wants rid to have its raw size to prevent
  // code that doesn't know about record types from introducing memory
  // bugs.
  rid.size = size;
  stasis_record_read(xid, p, rid, preimage+sizeof(alloc_arg));
  // restore rid to valid state.
  rid.size = type;

  // Ok to release latch; page is still pinned (so no WAL problems).
  // allocationPolicy protects us from running out of space due to concurrent
  // xacts.

  // Also, there can be no reordering of allocations / deallocations ,
  // since we're holding alloc->mutex.  However, we might reorder a Tset()
  // to and a Tdealloc() or Talloc() on the same page.  If this happens,
  // it's an unsafe race in the application, and not technically our problem.

  // @todo Tupdate forces allocation to release a latch, leading to potentially nasty application bugs.  Perhaps this is the wrong API!

  // @todo application-level allocation races can lead to unrecoverable logs.
  unlock(p->rwlatch);

  Tupdate(xid, rid.page, preimage, sizeof(alloc_arg)+size, OPERATION_DEALLOC);

  releasePage(p);

  pthread_mutex_unlock(&alloc->mut);

  if(type==BLOB_SLOT) {
    // The preimage holds the blob descriptor; free the blob's pages too.
    stasis_blob_dealloc(xid,(blob_record_t*)(preimage+sizeof(alloc_arg)));
  }

  free(preimage);

  // Arrange for the allocation policy to be notified at commit time.
  stasis_transaction_table_set_argument(alloc->xact_table, xid, alloc->callback_id, AT_COMMIT, alloc);
}
// Append tuple `dat` to this datapage. Returns false when the tuple is
// rejected (the caller should close this datapage and open a new one) or
// when the underlying write fails.
bool dataPage::append(dataTuple const * dat) {
  // First, decide if we should append to this datapage, based on whether
  // appending will waste more or less space than starting a new datapage
  bool accept_tuple;
  len_t tup_len = dat->byte_length();

  // Decision tree
  if(write_offset_ > (initial_page_count_ * PAGE_SIZE)) {
    // we already exceeded the page budget
    if(write_offset_ > (2 * initial_page_count_ * PAGE_SIZE)) {
      // ... by a lot.  Reject regardless.  This prevents small tuples from
      // being stuck behind giant ones without sacrificing much space
      // (as a percentage of the whole index), because this path only
      // can happen once per giant object.
      accept_tuple = false;
    } else {
      // ... by a little bit.
      accept_tuple = true;
      //Accept tuple if it fits on this page, or if it's big..
      //accept_tuple = (((write_offset_-1) & ~(PAGE_SIZE-1)) == (((write_offset_ + tup_len)-1) & ~(PAGE_SIZE-1)));
    }
  } else {
    if(write_offset_ + tup_len < (initial_page_count_ * PAGE_SIZE)) {
      // tuple fits.  contractually obligated to accept it.
      accept_tuple = true;
    } else if(write_offset_ == 0) {
      // datapage is empty.  contractually obligated to accept tuple.
      accept_tuple = true;
    } else {
      if(tup_len > initial_page_count_ * PAGE_SIZE) {
        // this is a "big tuple": compare the padding wasted by rejecting it
        // now vs. the padding left after accepting it, and pick the smaller.
        len_t reject_padding = PAGE_SIZE - (write_offset_ & (PAGE_SIZE-1));
        len_t accept_padding = PAGE_SIZE - ((write_offset_ + tup_len) & (PAGE_SIZE-1));
        accept_tuple = accept_padding < reject_padding;
      } else {
        // this is a "small tuple"; only exceed budget if doing so leads to < 33% overhead for this data.
        len_t accept_padding = PAGE_SIZE - (write_offset_ & (PAGE_SIZE-1));
        accept_tuple = (3*accept_padding) < tup_len;
      }
    }
  }
  if(!accept_tuple) {
    DEBUG("offset %lld closing datapage\n", write_offset_);
    return false;
  }

  DEBUG("offset %lld continuing datapage\n", write_offset_);

  // TODO could be more efficient; this does a malloc and memcpy.
  // The alternative couples us more strongly to datatuple, but simplifies
  // datapage.
  byte * buf = dat->to_bytes();
  len_t dat_len = dat->byte_length();

  // Write the length prefix and hold its page write-latched until the
  // payload has been written too — presumably so a concurrent reader never
  // observes a length without its payload (cf. iterator::getnext's read
  // latch); confirm against write_data_and_latch's defaults.
  Page * p = write_data_and_latch((const byte*)&dat_len, sizeof(dat_len));
  bool succ = false;
  if(p) {
    succ = write_data(buf, dat_len);
    unlock(p->rwlatch);
    releasePage(p);
  }

  free(buf);

  return succ;
}