/*
 * Insert record number i of the "bigfile" data set into the table B-Tree
 * rooted at page 1.
 *
 * The payload is the record's index key repeated as 4-byte words; how much
 * of it is stored (64, 128 or 192 bytes) is derived from the primary key.
 */
void insert_bigfile(chidb *db, int i)
{
    uint8_t payload[192];
    const int payload_len = ((bigfile_pkeys[i] % 3) + 1) * 64;
    int word = 0;

    /* Always fill all 48 words; only the first payload_len bytes are inserted. */
    while (word < 48)
    {
        put4byte(payload + (4 * word), bigfile_ikeys[i]);
        word++;
    }

    int rc = chidb_Btree_insertInTable(db->bt, 1, bigfile_pkeys[i], payload, payload_len);
    CU_ASSERT(rc == CHIDB_OK);
}
/*
 * Verify that every record of the "bigfile" data set can be found in the
 * table B-Tree rooted at page 1 and carries the payload that
 * insert_bigfile() builds: the record's index key repeated as 4-byte words,
 * 64, 128 or 192 bytes long depending on the primary key.
 *
 * CU_ASSERT does not abort the test, so checks that depend on an earlier
 * result are skipped when that result is unusable; otherwise a failed lookup
 * would lead to reading an uninitialised pointer and size.
 */
void test_bigfile(chidb *db)
{
    for (int i = 0; i < bigfile_nvalues; i++)
    {
        uint8_t *buf = NULL;
        uint16_t size = 0;
        uint8_t data[192];
        int datalen = ((bigfile_pkeys[i] % 3) + 1) * 64;
        int rc;

        /* Build the expected payload. */
        for (int j = 0; j < 48; j++)
            put4byte(data + (4 * j), bigfile_ikeys[i]);

        rc = chidb_Btree_find(db->bt, 1, bigfile_pkeys[i], &buf, &size);
        CU_ASSERT(rc == CHIDB_OK);
        if (rc != CHIDB_OK)
            continue; /* buf and size are not valid after a failed lookup */

        CU_ASSERT(size == datalen);
        /* Only compare contents when the lengths agree, so memcmp never
         * reads past the end of the returned record. */
        if (size == datalen)
        {
            CU_ASSERT(!memcmp(buf, data, datalen));
        }
        /* NOTE(review): if chidb_Btree_find hands back a heap-allocated copy
         * in buf, it is never released here - confirm ownership. */
    }
}
/*
 * Verify the index B-Tree rooted at index_nroot against the table B-Tree
 * rooted at page 1 for every record of the "bigfile" data set.
 *
 * For each index key the primary key is looked up in the index, then the
 * record is fetched from the table by that primary key and compared with
 * the expected payload (the index key repeated as 4-byte words, 64, 128 or
 * 192 bytes long depending on the primary key).
 *
 * CU_ASSERT does not abort the test, so checks that depend on an earlier
 * result are skipped when that result is unusable; otherwise a failed lookup
 * would lead to using an uninitialised key, pointer or size.
 */
void test_index_bigfile(chidb *db, npage_t index_nroot)
{
    for (int i = 0; i < bigfile_nvalues; i++)
    {
        uint8_t *buf = NULL;
        uint16_t size = 0;
        uint8_t data[192];
        key_t pkey;
        int datalen;
        int rc;

        rc = chidb_Btree_findInIndex(db->bt, index_nroot, bigfile_ikeys[i], &pkey);
        CU_ASSERT(rc == CHIDB_OK);
        if (rc != CHIDB_OK)
            continue; /* pkey is not valid after a failed index lookup */

        datalen = ((pkey % 3) + 1) * 64;

        /* Build the expected payload. */
        for (int j = 0; j < 48; j++)
            put4byte(data + (4 * j), bigfile_ikeys[i]);

        rc = chidb_Btree_find(db->bt, 1, pkey, &buf, &size);
        CU_ASSERT(rc == CHIDB_OK);
        if (rc != CHIDB_OK)
            continue; /* buf and size are not valid after a failed lookup */

        CU_ASSERT(size == datalen);
        /* Only compare contents when the lengths agree, so memcmp never
         * reads past the end of the returned record. */
        if (size == datalen)
        {
            CU_ASSERT(!memcmp(buf, data, datalen));
        }
        /* NOTE(review): if chidb_Btree_find hands back a heap-allocated copy
         * in buf, it is never released here - confirm ownership. */
    }
}
// return number of bytes written static int wal_iterate(Wal *pWal, iterate_resource *iter, u8 *buf, int bufsize, u8 *hdr, u32 *done) { // db_thread* const thr = enif_tsd_get(g_tsd_thread); #if ATOMIC db_thread *thr = g_tsd_thread; #else db_thread* thr = enif_tsd_get(g_tsd_thread); #endif mdbinf* const mdb = &thr->mdb; u32 mxPage; u64 readSafeEvnum, readSafeTerm; #ifndef _TESTAPP_ enif_mutex_lock(pWal->mtx); #endif readSafeEvnum = pWal->lastCompleteEvnum; readSafeTerm = pWal->lastCompleteTerm; mxPage = pWal->mxPage; #ifndef _TESTAPP_ enif_mutex_unlock(pWal->mtx); #endif if (!iter->started) { if (iter->evnum + iter->evterm == 0) { // If any writes come after iterator started, we must ignore those pages. iter->evnum = readSafeEvnum; iter->evterm = readSafeTerm; iter->pgnoPos = 1; iter->entiredb = 1; iter->mxPage = mxPage; if (pWal->mxPage == 0) { DBG("ERROR: Iterate on empty DB %llu",pWal->lastCompleteEvnum); *done = 1; return 0; } } else { // set mxPage to highest pgno we find. iter->pgnoPos = iter->mxPage = 0; DBG("Iterate rsterm=%llu rsevnum=%llu",readSafeTerm,readSafeEvnum); } iter->started = 1; } // send entire db (without history) if (iter->entiredb) { u32 iRead = 0; findframe(thr, pWal, iter->pgnoPos, &iRead, iter->evterm, iter->evnum, NULL, NULL); if (!iRead) { DBG("Iterate did not find page"); *done = iter->mxPage; return 0; } DBG("Iter pos=%u, mx=%u, safemx=%u",iter->pgnoPos, iter->mxPage, mxPage); if (iter->pgnoPos == iter->mxPage) *done = iter->mxPage; put8byte(hdr, iter->evterm); put8byte(hdr+sizeof(u64), iter->evnum); put4byte(hdr+sizeof(u64)*2, iter->pgnoPos); put4byte(hdr+sizeof(u64)*2+sizeof(u32), *done); iter->pgnoPos++; return fillbuff(thr, pWal, iter, buf, bufsize); } else { MDB_val logKey, logVal; int logop; u8 logKeyBuf[sizeof(u64)*3]; int rc; // ** - Log DB: {<<ActorIndex:64, Evterm:64, Evnum:64>>, <<Pgno:32/unsigned>>} memcpy(logKeyBuf, &pWal->index, sizeof(u64)); memcpy(logKeyBuf + sizeof(u64), &iter->evterm, sizeof(u64)); memcpy(logKeyBuf + 
sizeof(u64)*2, &iter->evnum, sizeof(u64)); logKey.mv_data = logKeyBuf; logKey.mv_size = sizeof(logKeyBuf); DBG("iterate looking for, matchterm=%llu matchevnum=%llu",iter->evterm,iter->evnum); if (mdb_cursor_get(mdb->cursorLog,&logKey,&logVal,MDB_SET) != MDB_SUCCESS) { // Evterm/evnum combination not found. Check if evnum is there. // If so return evterm. It will mean a node is in conflict. DBG("Key not found in log"); if (readSafeEvnum == iter->evnum) { iter->evterm = readSafeTerm; iter->termMismatch = 1; } else { memcpy(logKeyBuf, &pWal->index, sizeof(u64)); memcpy(logKeyBuf + sizeof(u64), &readSafeTerm, sizeof(u64)); memcpy(logKeyBuf + sizeof(u64)*2, &readSafeEvnum,sizeof(u64)); if (mdb_cursor_get(mdb->cursorLog,&logKey,&logVal,MDB_SET) != MDB_SUCCESS) { DBG("Key not found in log for undo"); *done = 1; return 0; } while (mdb_cursor_get(mdb->cursorLog,&logKey,&logVal,MDB_PREV_NODUP) == MDB_SUCCESS) { u64 aindex, term, evnum; mdb_cursor_get(mdb->cursorLog,&logKey, &logVal, MDB_GET_CURRENT); memcpy(&aindex, logKey.mv_data, sizeof(u64)); memcpy(&term, (u8*)logKey.mv_data+sizeof(u64), sizeof(u64)); memcpy(&evnum, (u8*)logKey.mv_data+sizeof(u64)*2,sizeof(u64)); DBG("Iterate on term=%llu, evnum=%llu, looking for=%llu",term,evnum,iter->evnum); if (aindex != pWal->index) break; if (iter->evnum == evnum) { iter->evterm = term; iter->termMismatch = 1; break; } } } *done = 1; return 0; } // We start iterate from next evnum not current. Input evterm/evnum is match_index and match_term. // It needs next. 
if (iter->started == 1 && (rc = mdb_cursor_get(mdb->cursorLog,&logKey, &logVal, MDB_NEXT_NODUP)) != MDB_SUCCESS) { *done = 1; return 0; } else { u64 aindex; rc = mdb_cursor_get(mdb->cursorLog,&logKey, &logVal, MDB_GET_CURRENT); if (rc != MDB_SUCCESS) { *done = 1; return 0; } memcpy(&aindex, (u8*)logKey.mv_data, sizeof(u64)); memcpy(&iter->evterm, (u8*)logKey.mv_data+sizeof(u64), sizeof(u64)); memcpy(&iter->evnum, (u8*)logKey.mv_data+sizeof(u64)*2,sizeof(u64)); if (aindex != pWal->index) { *done = 1; return 0; } // To keep from moving iter->evterm/iter->evnum forward more than once. iter->started = 2; } logop = MDB_FIRST_DUP; while ((rc = mdb_cursor_get(mdb->cursorLog,&logKey,&logVal,logop)) == MDB_SUCCESS) { u64 evnum,evterm; u32 pgno; u32 iRead; logop = MDB_NEXT_DUP; mdb_cursor_get(mdb->cursorLog,&logKey, &logVal, MDB_GET_CURRENT); memcpy(&pgno,logVal.mv_data,sizeof(u32)); DBG("iterate at pgno=%u, pgnopos=%u",pgno,iter->pgnoPos); if (pgno <= iter->pgnoPos) continue; findframe(thr, pWal, pgno, &iRead, iter->evterm, iter->evnum, &evterm, &evnum); if (iRead == 0) { DBG("ERROR: Did not find frame for pgno=%u, evterm=%llu, evnum=%llu", pgno, iter->evterm, iter->evnum); *done = 1; return 0; } if (evterm != iter->evterm || evnum != iter->evnum) { DBG("ERROR: Evterm/evnum does not match,looking for: evterm=%llu, evnum=%llu, " "got: evterm=%llu, evnum=%llu", iter->evterm, iter->evnum, evterm, evnum); *done = 1; return 0; } iter->pgnoPos = pgno; if ((rc = mdb_cursor_get(mdb->cursorLog,&logKey,&logVal,logop)) == MDB_SUCCESS) *done = 0; else *done = iter->pgnoPos; DBG( "logcursor get next %d, done=%u",rc,*done); put8byte(hdr, iter->evterm); put8byte(hdr+sizeof(u64), iter->evnum); put4byte(hdr+sizeof(u64)*2, iter->pgnoPos); put4byte(hdr+sizeof(u64)*2+sizeof(u32), *done); return fillbuff(thr, pWal, iter, buf, bufsize); } *done = 1; return 0; } }
/*
 * Write a frame or frames to the log.
 *
 * For every page in pList (linked through pDirty):
 *  1. compress the page with LZ4 and, when the connection replicates, hand
 *     the compressed page plus a <<term, evnum, pgno, truncate>> header to
 *     wal_page_hook;
 *  2. remove versions of the page already stored in the pages DB whose
 *     evterm is higher than, or evnum is higher than or equal to, the write
 *     in progress;
 *  3. store the page in the pages DB, split into fragments when it does not
 *     fit into one value of thr->maxvalsize bytes. Every stored value starts
 *     with <<Evterm:64, Evnum:64, FragmentIndex:8>>.
 * Then one log DB entry per page is written (only when inProgressTerm > 0),
 * the Wal bookkeeping is updated on commit, and storeinfo() persists it.
 *
 * Returns SQLITE_OK on success, SQLITE_ERROR when no write transaction is
 * available, term/evnum would go backwards, compression fails or an LMDB put
 * fails, or the error returned by storeinfo().
 *
 * sync_flags is not used by this function.
 */
int sqlite3WalFrames(Wal *pWal, int szPage, PgHdr *pList, Pgno nTruncate, int isCommit, int sync_flags)
{
	PgHdr *p;
	MDB_val key, data;
	int rc;
	mdbinf *mdb;
	MDB_txn *txn;
#if ATOMIC
	db_thread *thr = g_tsd_thread;
	db_connection *pCon = g_tsd_conn;
#else
	db_thread *thr = enif_tsd_get(g_tsd_thread);
	db_connection *pCon = enif_tsd_get(g_tsd_conn);
#endif

#if ATOMIC
	if (!g_tsd_wmdb)
		lock_wtxn(thr->nEnv);
	mdb = g_tsd_wmdb;
#else
	mdb = enif_tsd_get(g_tsd_wmdb);
	if (!mdb)
		lock_wtxn(thr->nEnv);
	mdb = enif_tsd_get(g_tsd_wmdb);
#endif
	// Check before touching mdb: the transaction is read from it.
	if (!mdb)
		return SQLITE_ERROR;
	txn = mdb->txn;

	key.mv_size = sizeof(u64);
	key.mv_data = (void*)&pWal->index;

	// Term/evnum must always be increasing
	if ((pWal->inProgressTerm > 0 && pWal->inProgressTerm < pWal->lastCompleteTerm) ||
		(pWal->inProgressEvnum > 0 && pWal->inProgressEvnum < pWal->lastCompleteEvnum))
		return SQLITE_ERROR;

	track_time(2,thr);
	// ** - Pages DB: {<<ActorIndex:64, Pgno:32/unsigned>>, <<Evterm:64,Evnum:64,Fragment,CompressedPage/binary>>}
	for (p = pList; p; p = p->pDirty)
	{
		u8 pagesKeyBuf[sizeof(u64)+sizeof(u32)];
		u8 pagesBuf[PAGE_BUFF_SIZE];
		int full_size = 0;
		// The compressed page is written after the value header, so only the
		// remainder of pagesBuf is available to LZ4.
		int page_size = LZ4_compress_default((char*)p->pData,
			(char*)pagesBuf+sizeof(u64)*2+1,
			szPage,
			(int)(sizeof(pagesBuf) - (sizeof(u64)*2+1)));
		// int rather than char: the fragment loop below counts down past zero,
		// which never terminates where plain char is unsigned.
		int fragment_index = 0;
		int skipped = 0;

		if (page_size <= 0)
		{
			DBG("ERROR: page compression failed, pgno=%u, rc=%d",p->pgno,page_size);
			return SQLITE_ERROR;
		}

		track_time(3,thr);
		DBG("Insert frame, actor=%lld, pgno=%u, "
			"term=%lld, evnum=%lld, commit=%d, truncate=%d, compressedsize=%d",
		pWal->index,p->pgno,pWal->inProgressTerm,pWal->inProgressEvnum,
		isCommit,nTruncate,page_size);

		if (pCon->doReplicate)
		{
			u8 hdr[sizeof(u64)*2+sizeof(u32)*2];
			put8byte(hdr, pWal->inProgressTerm);
			put8byte(hdr+sizeof(u64), pWal->inProgressEvnum);
			put4byte(hdr+sizeof(u64)*2, p->pgno);
			// Only the last page of the list carries the truncate value.
			if (p->pDirty)
				put4byte(hdr+sizeof(u64)*2+sizeof(u32), 0);
			else
				put4byte(hdr+sizeof(u64)*2+sizeof(u32), nTruncate);
#ifndef _TESTAPP_
			wal_page_hook(thr,pagesBuf+sizeof(u64)*2+1, page_size, hdr, sizeof(hdr));
#endif
		}

		memcpy(pagesKeyBuf, &pWal->index,sizeof(u64));
		memcpy(pagesKeyBuf + sizeof(u64), &p->pgno, sizeof(u32));
		key.mv_size = sizeof(pagesKeyBuf);
		key.mv_data = pagesKeyBuf;

		// Check if there are pages with the same or higher evnum/evterm. If there are, delete them.
		// This can happen if sqlite flushed some page to disk before commiting, because there were
		// so many pages that they could not be held in memory. Or it could happen if pages need to be
		// overwritten because there was a write that did not pass raft consensus.
		rc = mdb_cursor_get(mdb->cursorPages,&key,&data,MDB_SET_KEY);
		if (rc == MDB_SUCCESS)
		{
			size_t ndupl;
			mdb_cursor_count(mdb->cursorPages,&ndupl);

			rc = mdb_cursor_get(mdb->cursorPages,&key,&data,MDB_LAST_DUP);
			if (rc == MDB_SUCCESS)
			{
				MDB_val pgDelKey = {0,NULL}, pgDelVal = {0,NULL};
				u64 evnum, evterm;
				u8 frag = *((u8*)data.mv_data+sizeof(u64)*2);
				memcpy(&evterm, data.mv_data, sizeof(u64));
				memcpy(&evnum, (u8*)data.mv_data + sizeof(u64), sizeof(u64));

				// Walk the duplicates from newest to oldest. Deletion is deferred
				// by one step: the entry remembered in pgDelKey/pgDelVal is
				// removed with mdb_del before the next one is remembered.
				while ((evterm > pWal->inProgressTerm || evnum >= pWal->inProgressEvnum))
				{
					DBG("Deleting pages higher or equal to current. "
					"Evterm=%llu, evnum=%llu, curterm=%llu, curevn=%llu, dupl=%ld",
					evterm,evnum,pWal->inProgressTerm,pWal->inProgressEvnum,ndupl);

					if (pgDelKey.mv_data != NULL)
					{
						if ((rc = mdb_del(txn,mdb->pagesdb,&pgDelKey,&pgDelVal)) != MDB_SUCCESS)
						{
							DBG("Unable to cleanup page from pagedb %d",rc);
							break;
						}
						pgDelKey.mv_data = NULL;
					}
					mdb_cursor_get(mdb->cursorPages,&pgDelKey,&pgDelVal,MDB_GET_CURRENT);

					// Only fragment 0 counts as a page in allPages.
					if (frag == 0)
						pWal->allPages--;
					ndupl--;
					if (!ndupl)
						break;
					rc = mdb_cursor_get(mdb->cursorPages,&key,&data,MDB_PREV_DUP);
					if (rc != MDB_SUCCESS)
						break;
					memcpy(&evterm, data.mv_data, sizeof(u64));
					memcpy(&evnum, (u8*)data.mv_data + sizeof(u64), sizeof(u64));
					frag = *((u8*)data.mv_data+sizeof(u64)*2);
				}
				if (pgDelKey.mv_data != NULL)
				{
					if ((rc = mdb_del(txn,mdb->pagesdb,&pgDelKey,&pgDelVal)) != MDB_SUCCESS)
					{
						DBG("Unable to cleanup page from pagedb %d",rc);
						// NOTE(review): this break leaves the enclosing page loop, so
						// this page and all remaining pages are not stored while the
						// function carries on to the log section. Behaviour kept
						// as-is; confirm whether an error return is intended.
						break;
					}
					pgDelKey.mv_data = NULL;
				}
			}
			memcpy(pagesKeyBuf, &pWal->index,sizeof(u64));
			memcpy(pagesKeyBuf + sizeof(u64), &p->pgno, sizeof(u32));
			key.mv_size = sizeof(pagesKeyBuf);
			key.mv_data = pagesKeyBuf;
		}
		track_time(4,thr);

		// Value header: <<Evterm:64, Evnum:64, FragmentIndex:8>>
		memcpy(pagesBuf, &pWal->inProgressTerm, sizeof(u64));
		memcpy(pagesBuf + sizeof(u64), &pWal->inProgressEvnum, sizeof(u64));
		full_size = page_size + sizeof(u64)*2 + 1;
		if (full_size < thr->maxvalsize)
			fragment_index = 0;
		else
		{
			// Count how many additional fragments are needed after the first
			// full-sized value; each carries maxvalsize minus the header.
			full_size = page_size;
			skipped = thr->maxvalsize - sizeof(u64)*2 - 1;
			full_size -= skipped;
			while (full_size > 0)
			{
				full_size -= (thr->maxvalsize - sizeof(u64)*2 - 1);
				fragment_index++;
			}
			full_size = page_size + sizeof(u64)*2 +1;
		}

		// The first value stored carries the highest fragment index.
		pagesBuf[sizeof(u64)*2] = (u8)fragment_index;
		data.mv_size = fragment_index == 0 ? full_size : thr->maxvalsize;
		data.mv_data = pagesBuf;

		if ((rc = mdb_cursor_put(mdb->cursorPages,&key,&data,0)) != MDB_SUCCESS)
		{
			DBG("ERROR: cursor put failed=%d, datasize=%d",rc,full_size);
			return SQLITE_ERROR;
		}

		// Remaining fragments: the header is written in place just in front of
		// the next chunk, overwriting the tail of the chunk already stored.
		fragment_index--;
		skipped = data.mv_size;
		while (fragment_index >= 0)
		{
			DBG("Insert fragment %d",(int)fragment_index);
			if (fragment_index == 0)
				data.mv_size = full_size - skipped + sizeof(u64)*2 + 1;
			else
				data.mv_size = thr->maxvalsize;
			data.mv_data = pagesBuf + skipped - (sizeof(u64)*2+1);
			memcpy(pagesBuf + skipped - (sizeof(u64)*2+1), &pWal->inProgressTerm, sizeof(u64));
			memcpy(pagesBuf + skipped - (sizeof(u64)+1), &pWal->inProgressEvnum, sizeof(u64));
			pagesBuf[skipped-1] = (u8)fragment_index;

			if ((rc = mdb_cursor_put(mdb->cursorPages,&key,&data,0)) != MDB_SUCCESS)
			{
				DBG("ERROR: cursor secondary put failed: err=%d, datasize=%d, skipped=%d, frag=%d",
				rc,full_size, skipped, (int)fragment_index);
				return SQLITE_ERROR;
			}
			skipped += data.mv_size - sizeof(u64)*2 - 1;
			fragment_index--;
		}

		thr->pagesChanged++;
	}

	// ** - Log DB: {<<ActorIndex:64, Evterm:64, Evnum:64>>, <<Pgno:32/unsigned>>}
	if (pWal->inProgressTerm > 0)
	{
		for (p = pList; p; p = p->pDirty)
		{
			u8 logKeyBuf[sizeof(u64)*3];

			DBG("Inserting to log");

			memcpy(logKeyBuf, &pWal->index, sizeof(u64));
			memcpy(logKeyBuf + sizeof(u64), &pWal->inProgressTerm, sizeof(u64));
			memcpy(logKeyBuf + sizeof(u64)*2, &pWal->inProgressEvnum, sizeof(u64));
			key.mv_size = sizeof(logKeyBuf);
			key.mv_data = logKeyBuf;

			data.mv_size = sizeof(u32);
			data.mv_data = &p->pgno;

			// Keep the result so the message reports this call's error code.
			if ((rc = mdb_cursor_put(mdb->cursorLog,&key,&data,0)) != MDB_SUCCESS)
			{
				DBG("ERROR: cursor put to log failed: %d",rc);
				return SQLITE_ERROR;
			}

			pWal->allPages++;
		}
	}
	else
	{
		DBG("Skipping log");
		for (p = pList; p; p = p->pDirty)
			pWal->allPages++;
	}

	/** - Info DB: {<<ActorIndex:64>>, <<V,FirstCompleteTerm:64,FirstCompleteEvnum:64,
										LastCompleteTerm:64,LastCompleteEvnum:64,
										InprogressTerm:64,InProgressEvnum:64>>} */
	{
		if (isCommit)
		{
			DBG("Commit actor=%llu fct=%llu, fcev=%llu, lct=%llu, lcev=%llu, int=%llu, inev=%llu",
				pWal->index,
				pWal->firstCompleteTerm, pWal->firstCompleteEvnum, pWal->lastCompleteTerm,
				pWal->lastCompleteEvnum, pWal->inProgressTerm,pWal->inProgressEvnum);

			// The fields updated here are the ones wal_iterate snapshots
			// under the same mutex.
#ifndef _TESTAPP_
			enif_mutex_lock(pWal->mtx);
#endif
			pWal->lastCompleteTerm = pWal->inProgressTerm > 0 ?
				pWal->inProgressTerm : pWal->lastCompleteTerm;
			pWal->lastCompleteEvnum = pWal->inProgressEvnum > 0 ?
				pWal->inProgressEvnum : pWal->lastCompleteEvnum;
			if (pWal->firstCompleteTerm == 0)
			{
				pWal->firstCompleteTerm = pWal->inProgressTerm;
				pWal->firstCompleteEvnum = pWal->inProgressEvnum;
			}
			pWal->inProgressTerm = pWal->inProgressEvnum = 0;
			pWal->mxPage = pWal->mxPage > nTruncate ? pWal->mxPage : nTruncate;
			thr->forceCommit = 1;
			pCon->dirty = 0;
#ifndef _TESTAPP_
			enif_mutex_unlock(pWal->mtx);
#endif
			DBG("cur mxpage=%u",pWal->mxPage);
		}
		else
		{
			pCon->dirty = 1;
		}
		thr->pagesChanged++;

		rc = storeinfo(pWal,0,0,NULL);
		if (rc != SQLITE_OK)
			return rc;

		track_time(5,thr);
	}

	return SQLITE_OK;
}