static struct BloomContainer *compaction_update_bc( struct DB *const db, struct BloomContainer *const old_bc, struct BloomTable *const bloomtable) { const double sec0 = debug_time_sec(); const uint64_t off_bc = db_cmap_safe_alloc(db, db->cm_bc); assert(off_bc < db->cm_bc->total_cap); const uint64_t mtid_bc = db_aquire_mtid(db); const int raw_fd = db->cm_bc->raw_fd; struct BloomContainer *const new_bc = (!old_bc) ? bloomcontainer_build(bloomtable, raw_fd, off_bc, &(db->stat)) : bloomcontainer_update(old_bc, bloomtable, raw_fd, off_bc, &(db->stat)); assert(new_bc); new_bc->mtid = mtid_bc; const uint64_t count = new_bc->nr_bf_per_box; assert(count > 0); const bool r = db_dump_bloomcontainer_meta(db, mtid_bc, new_bc); assert(r); db_log_diff(db, sec0, "BC *%1" PRIu64 " [%8" PRIx64 " #%08" PRIx64 "] {%4" PRIu32 "}", count, mtid_bc, off_bc / TABLE_ALIGN, new_bc->nr_index); return new_bc; }
// backup db metadata static bool db_dump_meta(struct DB *const db) { char path_meta[256]; char path_sym[256]; const double sec0 = debug_time_sec(); // prepare files sprintf(path_meta, "%s/%s/%s-%018.6lf", db->persist_dir, DB_META_BACKUP_DIR, DB_META_MAIN, sec0); FILE *const meta_out = fopen(path_meta, "w"); assert(meta_out); const uint64_t ticket = rwlock_reader_lock(&(db->rwlock)); // dump meta // write vc const bool r_meta = recursive_dump(db->vcroot, meta_out); assert(r_meta); // write mtid const uint64_t db_next_mtid = db->next_mtid; fprintf(meta_out, "%" PRIu64 "\n", db_next_mtid); fclose(meta_out); // create symlink for newest meta sprintf(path_sym, "%s/%s", db->persist_dir, DB_META_MAIN); if (access(path_sym, F_OK) == 0) { const int ru = unlink(path_sym); assert(ru == 0); } const int rsm = symlink(path_meta, path_sym); assert(rsm == 0); // dump container-maps for (int i = 0; db->cms_dump[i]; i++) { char path_cm_dump[256]; sprintf(path_cm_dump, "%s/%s/%s-%01d-%018.6lf", db->persist_dir, DB_META_BACKUP_DIR, DB_META_CMAP_PREFIX, i, sec0); containermap_dump(db->cms_dump[i], path_cm_dump); // create symlink for newest meta sprintf(path_sym, "%s/%s-%01d", db->persist_dir, DB_META_CMAP_PREFIX, i); if (access(path_sym, F_OK) == 0) { const int ru = unlink(path_sym); assert(ru == 0); } const int rs = symlink(path_cm_dump, path_sym); assert(rs == 0); } // done rwlock_reader_unlock(&(db->rwlock), ticket); db_log_diff(db, sec0, "Dumping Metadata Finished (%06" PRIx64 ")", db_next_mtid); fflush(db->log); return true; }
static struct DB *db_load(const char *const meta_dir, struct ContainerMapConf *const cm_conf) { const double sec0 = debug_time_sec(); char path_meta[2048]; // test files sprintf(path_meta, "%s/%s", meta_dir, DB_META_MAIN); if (0 != access(path_meta, F_OK)) return NULL; assert(cm_conf); // test cmaps for (int i = 0; (i < 6) && cm_conf->raw_fn[i]; i++) { char path_cm[2048]; sprintf(path_cm, "%s/%s-%01d", meta_dir, DB_META_CMAP_PREFIX, i); if (0 != access(path_cm, F_OK)) return NULL; } // alloc db and load ContainerMap struct DB *const db = (typeof(db))malloc(sizeof(*db)); assert(db); bzero(db, sizeof(*db)); // load ContainerMaps for (int i = 0; (i < 6) && cm_conf->raw_fn[i]; i++) { char path_cm[2048]; sprintf(path_cm, "%s/%s-%01d", meta_dir, DB_META_CMAP_PREFIX, i); struct ContainerMap *const cm = containermap_load(path_cm, cm_conf->raw_fn[i]); assert(cm); db->cms_dump[i] = cm; } db_initial(db, meta_dir, cm_conf); //// LOAD META // parse vc FILE *const meta_in = fopen(path_meta, "r"); struct VirtualContainer *const vcroot = recursive_parse(meta_in, 0, db); assert(vcroot); db->vcroot = vcroot; // read mtid char buf_mtid[32]; fgets(buf_mtid, 30, meta_in); const uint64_t mtid = strtoull(buf_mtid, NULL, 10); assert(mtid > 0); db->next_mtid = mtid; fclose(meta_in); // initial anything db_log_diff(db, sec0, "Loaded Metadata Done"); return db; }
static void compaction_feed_all(struct Compaction *const comp) { for (uint64_t i = 0; i < comp->nr_feed; i++) { const double sec0 = debug_time_sec(); comp->feed_id = i; comp->feed_token = 0; // parallel feed threads conc_fork_reduce(DB_FEED_NR, thread_compaction_feed, comp); db_log_diff(comp->db, sec0, "FEED @%" PRIu64 " [%8" PRIx64 " #%08" PRIx64 "]", comp->start_bit / 3, comp->mts_old[i]->mtid, comp->mts_old[i]->mfh.off / TABLE_ALIGN); } // free feed arenas huge_free(comp->arena, TABLE_ALIGN); }
static void compaction_main(struct DB *const db, struct VirtualContainer *const vc, const uint64_t nr_feed) { struct Compaction comp; const double sec0 = debug_time_sec(); compaction_initial(&comp, db, vc, nr_feed); // feed (must sequential) compaction_feed_all(&comp); compaction_build_bt_all(&comp); compaction_dump_and_bc_all(&comp); compaction_update_vc(&comp); compaction_free_old(&comp); db_log_diff(db, sec0, "COMP @%" PRIu64 " %2" PRIu64, vc->start_bit / 3u, nr_feed); stat_inc(&(db->stat.nr_compaction)); }
static void db_initial(struct DB *const db, const char *const meta_dir, struct ContainerMapConf *const cm_conf) { // Load Meta // dir (for dump) db->persist_dir = strdup(meta_dir); // set cms assert(cm_conf); for (uint64_t i = 0; i < DB_NR_LEVELS; i++) { db->cms[i] = db->cms_dump[cm_conf->data_id[i]]; assert(db->cms[i]); } db->cm_bc = db->cms_dump[cm_conf->bc_id]; // hi? assert(db->cm_bc); // active tables db->active_table[0] = table_alloc_default(15.0); db->active_table[1] = NULL; // threading vars pthread_mutex_init(&(db->mutex_active), NULL); pthread_mutex_init(&(db->mutex_current), NULL); pthread_mutex_init(&(db->mutex_root), NULL); for (uint64_t i = 0; i < DB_COMPACTION_NR; i++) { pthread_mutex_init(&(db->mutex_token[i]), NULL); } // rwlock rwlock_initial(&(db->rwlock)); // cond var pthread_cond_init(&(db->cond_root_consumer), NULL); pthread_cond_init(&(db->cond_root_producer), NULL); pthread_cond_init(&(db->cond_active), NULL); pthread_cond_init(&(db->cond_writer), NULL); db->compaction_token = 0; // log char path[4096]; sprintf(path, "%s/%s", db->persist_dir, DB_META_LOG); FILE *const log = fopen(path, "a"); // NULL is OK db->log = log; // running db->sec_start = debug_time_sec(); db->closing = false; }
// Append one printf-style message to the db log.
// The line is prefixed with the calling thread's name and the seconds elapsed
// since db->sec_start. The formatted message is cut at 1023 characters.
// Does nothing when the db has no log file.
static void
db_log(struct DB *const db, const char *const msg, ...)
{
  if (db->log == NULL) {
    return;
  }
  const double now = debug_time_sec();

  char thread_name[16] = {0};
  pthread_getname_np(pthread_self(), thread_name, sizeof(thread_name));

  // the two empty columns keep the prefix layout fixed-width
  char prefix[1024];
  sprintf(prefix, "[%-15s|%10s->%10.3lf|%9s] ", thread_name, "", now - db->sec_start, "");

  char body[1024];
  va_list ap;
  va_start(ap, msg);
  vsnprintf(body, sizeof(body), msg, ap);
  va_end(ap);

  // one fprintf call per log line
  fprintf(db->log, "%s%s\n", prefix, body);
}
// takes 0.5s on average // assume table has been detached from db (like memtable => imm) static uint64_t db_table_dump(struct DB *const db, struct Table *const table, uint64_t start_bit) { const double sec0 = debug_time_sec(); const uint64_t mtid = db_aquire_mtid(db); // post process table // must has bloom-filter assert(table->bt); const bool rr = table_retain(table); // logging on failed retaining if (!rr) { // char buffer[4096]; // table_analysis_verbose(table, buffer); db_log(db, "DUMP @%" PRIu64 " [%8" PRIx64 " FAILED!!]\n%s", start_bit / 3, mtid, ""); assert(false); } // analysis and log char buffer[1024]; table_analysis_short(table, buffer); // alloc data area from containermap for items struct ContainerMap *const cm = db->cms[start_bit / 3]; const uint64_t off_main = db_cmap_safe_alloc(db, cm); assert(off_main < cm->total_cap); // dump table data const uint64_t nr_items = table_dump_barrels(table, cm->raw_fd, off_main); // dump meta char metafn[2048]; db_generate_meta_fn(db, mtid, metafn); const bool rdm = table_dump_meta(table, metafn, off_main); assert(rdm); db_log_diff(db, sec0, "DUMP @%" PRIu64 " [%8" PRIx64 " #%08" PRIx64 "] [%08" PRIu64 "] %s", start_bit / 3, mtid, off_main / TABLE_ALIGN, nr_items, buffer); return mtid; }
// create empty db static struct DB *db_create(const char *const meta_dir, struct ContainerMapConf *const cm_conf) { const double sec0 = debug_time_sec(); if (!db_touch_dir(meta_dir, "")) return NULL; if (!db_touch_dir(meta_dir, DB_META_BACKUP_DIR)) return NULL; // pre make 256 sub-dirs char sub_dir[16]; for (uint64_t i = 0; i < 256; i++) { sprintf(sub_dir, "%02" PRIx64, i); if (!db_touch_dir(meta_dir, sub_dir)) return NULL; } struct DB *const db = (typeof(db))malloc(sizeof(*db)); bzero(db, sizeof(*db)); for (int i = 0; (i < 6) && cm_conf->raw_fn[i]; i++) { struct ContainerMap *const cm = containermap_create(cm_conf->raw_fn[i], cm_conf->hints[i]); assert(cm); db->cms_dump[i] = cm; } db_initial(db, meta_dir, cm_conf); // empty vc db->vcroot = vc_create(0); assert(db->vcroot); // mtid start from 1 db->next_mtid = 1; // initial anything db_log_diff(db, sec0, "Initialized Metadata"); return db; }
// Seconds elapsed between `last` (an earlier debug_time_sec() reading) and now.
double
debug_diff_sec(const double last)
{
  const double now = debug_time_sec();
  return now - last;
}