void * page_aligned_alloc (void **ptr, size_t size) { size_t alignment = getpagesize (); size_t size1 = size + alignment; if (size1 < size) xalloc_die (); *ptr = xmalloc (size1); return ptr_align (*ptr, alignment); }
void * page_aligned_alloc (void **ptr, size_t size) { #ifdef HAVE_GETPAGESIZE size_t alignment = getpagesize (); #else size_t alignment = 8192; #endif size_t size1 = size + alignment; if (size1 < size) xalloc_die (); *ptr = xmalloc (size1); return ptr_align (*ptr, alignment); }
int ark_create_verbose(char *path, ARK **arkret, uint64_t size, uint64_t bsize, uint64_t hcount, int nthrds, int nqueue, int basyncs, uint64_t flags) { int rc = 0; int p_rc = 0; uint64_t bcount = 0; uint64_t x = 0; int i = 0; int tnum = 0; int rnum = 0; scb_t *scbp = NULL; KV_TRC_OPEN(pAT, "arkdb"); if (NULL == arkret) { KV_TRC_FFDC(pAT, "Incorrect value for ARK control block: rc=EINVAL"); rc = EINVAL; goto ark_create_ark_err; } if ( (flags & (ARK_KV_PERSIST_LOAD|ARK_KV_PERSIST_STORE)) && (flags & ARK_KV_VIRTUAL_LUN) ) { KV_TRC_FFDC(pAT, "Invalid persistence combination with ARK flags: %016lx", flags); rc = EINVAL; goto ark_create_ark_err; } if (nthrds <= 0) { KV_TRC_FFDC(pAT, "invalid nthrds:%d", nthrds); rc = EINVAL; goto ark_create_ark_err; } _ARK *ark = am_malloc(sizeof(_ARK)); if (ark == NULL) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocating ARK control structure for %ld", sizeof(_ARK)); goto ark_create_ark_err; } KV_TRC(pAT, "%p path(%s) size %ld bsize %ld hcount %ld " "nthrds %d nqueue %d basyncs %d flags:%08lx", ark, path, size, bsize, hcount, nthrds, nqueue, basyncs, flags); ark->bsize = bsize; ark->rthread = 0; ark->persload = 0; ark->nasyncs = ((nqueue <= 0) ? ARK_MAX_ASYNC_OPS : nqueue); ark->basyncs = basyncs; ark->ntasks = ARK_MAX_TASK_OPS; ark->nthrds = ARK_VERBOSE_NTHRDS_DEF; // hardcode, perf requirement // Create the KV storage, whether that will be memory based // or flash ark->ea = ea_new(path, ark->bsize, basyncs, &size, &bcount, (flags & ARK_KV_VIRTUAL_LUN)); if (ark->ea == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "KV storage initialization failed: rc/errno:%d", rc); goto ark_create_ea_err; } // Now that the "connection" to the store has been established // we need to check to see if data was persisted from a previous // instantiation of the KV store. p_rc = ark_check_persistence(ark, flags); if (p_rc > 0) { // We ran into an error while trying to read from // the store. rc = p_rc; KV_TRC_FFDC(pAT, "Persistence check failed: %d", rc); goto ark_create_persist_err; } else if (p_rc == -1) { KV_TRC(pAT, "NO PERSIST LOAD FLAG"); // There was no persistence data, so we just build off // of what was passed into the API. ark->size = size; ark->bcount = bcount; ark->hcount = hcount; ark->vlimit = ARK_VERBOSE_VLIMIT_DEF; ark->blkbits = ARK_VERBOSE_BLKBITS_DEF; ark->grow = ARK_VERBOSE_GROW_DEF; ark->rthread = 0; ark->flags = flags; ark->astart = 0; ark->blkused = 1; ark->ark_exit = 0; ark->nactive = 0; ark->pers_stats.kv_cnt = 0; ark->pers_stats.blk_cnt = 0; ark->pers_stats.byte_cnt = 0; ark->pcmd = PT_IDLE; // Create the requests and tag control blocks and queues. x = ark->hcount / ark->nthrds; ark->npart = x + (ark->hcount % ark->nthrds ? 1 : 0); // Create the hash table ark->ht = hash_new(ark->hcount); if (ark->ht == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "Hash initialization failed: %d", rc); goto ark_create_ht_err; } // Create the block list ark->bl = bl_new(ark->bcount, ark->blkbits); if (ark->bl == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "Block list initialization failed: %d", rc); goto ark_create_bl_err; } if (flags & ARK_KV_PERSIST_STORE) { ark_persistence_calc(ark); if (bl_reserve(ark->bl, ark->pers_max_blocks)) {goto ark_create_bl_err;} } } else { KV_TRC(pAT, "PERSIST: %p path(%s) size %ld bsize %ld hcount %ld " "nthrds %d nqueue %ld basyncs %d bcount %ld blkbits %ld", ark, path, ark->size, ark->bsize, ark->hcount, ark->nthrds, ark->nasyncs, ark->basyncs, ark->bcount, ark->blkbits); } rc = pthread_mutex_init(&ark->mainmutex,NULL); if (rc != 0) { KV_TRC_FFDC(pAT, "pthread_mutex_init for main mutex failed: %d", rc); goto ark_create_pth_mutex_err; } ark->rtags = tag_new(ark->nasyncs); if ( NULL == ark->rtags ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Tag initialization for requests failed: %d", rc); goto ark_create_rtag_err; } ark->ttags = tag_new(ark->ntasks); if ( NULL == ark->ttags ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Tag initialization for tasks failed: %d", rc); goto ark_create_ttag_err; } ark->rcbs = am_malloc(ark->nasyncs * sizeof(rcb_t)); if ( NULL == ark->rcbs ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for request control blocks", (ark->nasyncs * sizeof(rcb_t))); goto ark_create_rcbs_err; } ark->tcbs = am_malloc(ark->ntasks * sizeof(tcb_t)); if ( NULL == ark->tcbs ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for task control blocks", (ark->ntasks * sizeof(rcb_t))); goto ark_create_tcbs_err; } ark->iocbs = am_malloc(ark->ntasks * sizeof(iocb_t)); if ( NULL == ark->iocbs ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for io control blocks", (ark->ntasks * sizeof(iocb_t))); goto ark_create_iocbs_err; } ark->poolthreads = am_malloc(ark->nthrds * sizeof(scb_t)); if ( NULL == ark->poolthreads ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for server thread control blocks", (ark->nthrds * sizeof(scb_t))); goto ark_create_poolthreads_err; } for ( rnum = 0; rnum < ark->nasyncs ; rnum++ ) { ark->rcbs[rnum].stat = A_NULL; pthread_cond_init(&(ark->rcbs[rnum].acond), NULL); pthread_mutex_init(&(ark->rcbs[rnum].alock), NULL); } for ( tnum = 0; tnum < ark->ntasks; tnum++ ) { ark->tcbs[tnum].inb = bt_new(0, ark->vlimit, sizeof(uint64_t), &(ark->tcbs[tnum].inblen), &(ark->tcbs[tnum].inb_orig)); if (ark->tcbs[tnum].inb == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "Bucket allocation for inbuffer failed: %d", rc); goto ark_create_taskloop_err; } ark->tcbs[tnum].oub = bt_new(0, ark->vlimit, sizeof(uint64_t), &(ark->tcbs[tnum].oublen), &(ark->tcbs[tnum].oub_orig)); if (ark->tcbs[tnum].oub == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "Bucket allocation for outbuffer failed: %d", rc); goto ark_create_taskloop_err; } //ark->tcbs[tnum].vbsize = bsize * 1024; ark->tcbs[tnum].vbsize = bsize * 256; ark->tcbs[tnum].vb_orig = am_malloc(ark->tcbs[tnum].vbsize); if (ark->tcbs[tnum].vb_orig == NULL) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocation for %"PRIu64" bytes for variable size buffer", (bsize * 1024)); goto ark_create_taskloop_err; } ark->tcbs[tnum].vb = ptr_align(ark->tcbs[tnum].vb_orig); } *arkret = (void *)ark; ark->pts = (PT *)am_malloc(sizeof(PT) * ark->nthrds); if ( ark->pts == NULL ) { rc = ENOMEM; KV_TRC_FFDC(pAT, "Out of memory allocation for %"PRIu64" bytes for server thread data", (sizeof(PT) * ark->nthrds)); goto ark_create_taskloop_err; } for (i = 0; i < ark->nthrds; i++) { PT *pt = &(ark->pts[i]); scbp = &(ark->poolthreads[i]); memset(scbp, 0, sizeof(scb_t)); // Start off the random start point for this thread // at -1, to show that it has not been part of a // ark_random call. scbp->rlast = -1; scbp->holds = 0; scbp->poolstate = PT_RUN; scbp->poolstats.io_cnt = 0; scbp->poolstats.ops_cnt = 0; scbp->poolstats.kv_cnt = 0; scbp->poolstats.blk_cnt = 0; scbp->poolstats.byte_cnt = 0; pthread_mutex_init(&(scbp->poolmutex), NULL); pthread_cond_init(&(scbp->poolcond), NULL); scbp->rqueue = queue_new(ark->nasyncs); scbp->tqueue = queue_new(ark->ntasks); scbp->ioqueue = queue_new(ark->ntasks); pt->id = i; pt->ark = ark; rc = pthread_create(&(scbp->pooltid), NULL, pool_function, pt); if (rc != 0) { KV_TRC_FFDC(pAT, "pthread_create of server thread failed: %d", rc); goto ark_create_poolloop_err; } } #if 0 while (ark->nactive < ark->nthrds) { usleep(1); //printf("Create waiting %d/%d\n", ark->nactive, ark->nthrds); } #endif ark->pcmd = PT_RUN; goto ark_create_return; ark_create_poolloop_err: for (; i >= 0; i--) { scbp = &(ark->poolthreads[i]); if (scbp->pooltid != 0) { queue_lock(scbp->rqueue); queue_wakeup(scbp->rqueue); queue_unlock(scbp->rqueue); pthread_join(scbp->pooltid, NULL); pthread_mutex_destroy(&(scbp->poolmutex)); pthread_cond_destroy(&(scbp->poolcond)); if ( scbp->rqueue != NULL ) { queue_free(scbp->rqueue); } if ( scbp->tqueue != NULL ) { queue_free(scbp->tqueue); } if ( scbp->ioqueue != NULL ) { queue_free(scbp->ioqueue); } } } if ( ark->pts != NULL ) { am_free(ark->pts); } ark_create_taskloop_err: for ( tnum = 0; tnum < ark->ntasks; tnum++ ) { if (ark->tcbs[tnum].inb) { bt_delete(ark->tcbs[tnum].inb); } if (ark->tcbs[tnum].oub) { bt_delete(ark->tcbs[tnum].oub); } if (ark->tcbs[tnum].vb_orig) { am_free(ark->tcbs[tnum].vb_orig); } } for (rnum = 0; rnum < ark->nasyncs; rnum++) { pthread_cond_destroy(&(ark->rcbs[rnum].acond)); pthread_mutex_destroy(&(ark->rcbs[rnum].alock)); } if ( ark->poolthreads != NULL ) { am_free(ark->poolthreads); } ark_create_poolthreads_err: if (ark->iocbs) { am_free(ark->iocbs); } ark_create_iocbs_err: if (ark->tcbs) { am_free(ark->tcbs); } ark_create_tcbs_err: if (ark->rcbs) { am_free(ark->rcbs); } ark_create_rcbs_err: if (ark->ttags) { tag_free(ark->ttags); } ark_create_ttag_err: if (ark->rtags) { tag_free(ark->rtags); } ark_create_rtag_err: pthread_mutex_destroy(&ark->mainmutex); ark_create_pth_mutex_err: bl_delete(ark->bl); ark_create_bl_err: hash_free(ark->ht); ark_create_ht_err: ark_create_persist_err: ea_delete(ark->ea); ark_create_ea_err: am_free(ark); *arkret = NULL; ark_create_ark_err: KV_TRC_CLOSE(pAT); ark_create_return: return rc; }
int ark_check_persistence(_ARK *_arkp, uint64_t flags) { int32_t rc = -1; char *p_data_orig = NULL; char *p_data = NULL; ark_io_list_t *bl_array = NULL; p_cntr_t *pptr = NULL; P_ARK_t *pcfg = NULL; hash_t *htp = NULL; BL *blp = NULL; uint64_t rdblks = 0; if (flags & ARK_KV_PERSIST_LOAD) {KV_TRC(pAT, "PERSIST_LOAD");} // Ignore the persistence data and load from scratch if ( (!(flags & ARK_KV_PERSIST_LOAD)) || (flags & ARK_KV_VIRTUAL_LUN) ) { return -1; } p_data_orig = am_malloc(_arkp->bsize); if (p_data_orig == NULL) { KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for the first " "persistence block", _arkp->bsize); rc = ENOMEM; } else { p_data = ptr_align(p_data_orig); bl_array = bl_chain_no_bl(0, 1); rc = ea_async_io(_arkp->ea, ARK_EA_READ, (void *)p_data, bl_array, 1, 1); am_free(bl_array); } if (rc == 0) { // We've read the first block. We check to see if // persistence data is present and if so, then // read the rest of the data from the flash. pptr = (p_cntr_t *)p_data; _arkp->persdata = p_data_orig; if ( memcmp(pptr->p_cntr_magic, ARK_P_MAGIC, sizeof(pptr->p_cntr_magic) != 0)) { KV_TRC_FFDC(pAT, "No magic number found in persistence data: %d", EINVAL); // The magic number does not match so data is either // not present or is corrupted. rc = -1; } else { // Now we check version and the first persistence data // needs to be the ARK_PERSIST_CONFIG block if (pptr->p_cntr_version != ARK_P_VERSION_1 && pptr->p_cntr_version != ARK_P_VERSION_2) { KV_TRC_FFDC(pAT, "Invalid / unsupported version: %"PRIu64"", pptr->p_cntr_version); rc = EINVAL; } else { // Read in the rest of the persistence data pcfg = (P_ARK_t *)(pptr->p_cntr_data + pptr->p_cntr_cfg_offset); rdblks = pcfg->pblocks; if (rdblks > 1) { p_data_orig = am_realloc(p_data_orig, (rdblks * _arkp->bsize)); if (p_data_orig == NULL) { KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for " "full persistence block", (rdblks * _arkp->bsize)); rc = ENOMEM; } else { p_data = ptr_align(p_data_orig); bl_array = bl_chain_no_bl(0, rdblks); if (bl_array == NULL) { KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" blocks for " "full persistence data", rdblks); rc = ENOMEM; } } // We are still good to read the rest of the data // from the flash if (rc == 0) { KV_TRC(pAT, "PERSIST_RD rdblks:%ld", rdblks); rc = ea_async_io(_arkp->ea, ARK_EA_READ, (void *)p_data, bl_array, rdblks, 1); am_free(bl_array); pptr = (p_cntr_t *)p_data; pcfg = (P_ARK_t *)(pptr->p_cntr_data + pptr->p_cntr_cfg_offset); _arkp->persdata = p_data_orig; } } } } } // If rc == 0, that means we have persistence data if (rc == 0) { KV_TRC(pAT, "PERSIST_META size %ld bsize %ld hcount %ld bcount %ld " "nthrds %d nasyncs %d basyncs %d blkbits %ld version:%ld", pcfg->size, pcfg->bsize, pcfg->hcount, pcfg->bcount, pcfg->nthrds, pcfg->nasyncs, pcfg->basyncs, pcfg->blkbits, pptr->p_cntr_version); _arkp->persload = 1; _arkp->size = pcfg->size; _arkp->flags = flags; _arkp->bsize = pcfg->bsize; _arkp->bcount = pcfg->bcount; _arkp->blkbits = pcfg->blkbits; _arkp->grow = pcfg->grow; _arkp->hcount = pcfg->hcount; _arkp->vlimit = pcfg->vlimit; _arkp->blkused = pcfg->blkused; _arkp->pers_stats.kv_cnt = pcfg->pstats.kv_cnt; _arkp->pers_stats.blk_cnt = pcfg->pstats.blk_cnt; _arkp->pers_stats.byte_cnt = pcfg->pstats.byte_cnt; KV_TRC(pAT, "ARK_META size %ld bsize %ld hcount %ld bcount %ld " "nthrds %d nasyncs %ld basyncs %d blkbits %ld", _arkp->size, _arkp->bsize, _arkp->hcount, _arkp->bcount, _arkp->nthrds, _arkp->nasyncs, _arkp->basyncs, _arkp->blkbits); htp = (hash_t *)(pptr->p_cntr_data + pptr->p_cntr_ht_offset); _arkp->ht = hash_new(htp->n); if (_arkp->ht == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "ht_new failed: n:%ld rc:%d", htp->n, rc); goto error_exit; } memcpy(_arkp->ht, htp, pptr->p_cntr_ht_size); blp = (BL *)(pptr->p_cntr_data + pptr->p_cntr_bl_offset); _arkp->bl = bl_new(blp->n, blp->w); if (_arkp->bl == NULL) { if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;} rc = errno; KV_TRC_FFDC(pAT, "bl_new failed: n:%ld w:%ld rc:%d", blp->n, blp->w, rc); goto error_exit; } _arkp->bl->count = blp->count; _arkp->bl->head = blp->head; _arkp->bl->hold = blp->hold; _arkp->bl->top = blp->top; if (pptr->p_cntr_version == ARK_P_VERSION_1) { IV *piv = (IV *)(pptr->p_cntr_data + pptr->p_cntr_bliv_offset); KV_TRC(pAT, "PERSIST_VERSION_1 LOADED"); _arkp->bl->top = _arkp->bl->n; // copy IV->data from piv->data memcpy(_arkp->bl->list->data, piv->data, pptr->p_cntr_bliv_size); } else if (pptr->p_cntr_version == ARK_P_VERSION_2) { KV_TRC(pAT, "PERSIST_VERSION_2 LOADED"); // copy IV->data from bliv_offset memcpy(_arkp->bl->list->data, pptr->p_cntr_data + pptr->p_cntr_bliv_offset, pptr->p_cntr_bliv_size); } else { rc = EINVAL; KV_TRC_FFDC(pAT, "bad persistent version number: ver:%ld", pptr->p_cntr_version); goto error_exit; } KV_TRC(pAT, "BL_META: n:%ld count:%ld head:%ld hold:%ld top:%ld", _arkp->bl->n, _arkp->bl->count, _arkp->bl->head, _arkp->bl->hold, _arkp->bl->top); } error_exit: am_free(p_data_orig); return rc; }
int ark_persist(_ARK *_arkp) { int32_t rc = 0; uint64_t tot_bytes = 0; uint64_t wrblks = 0; char *p_data_orig = NULL; char *p_data = NULL; p_cntr_t *pptr = NULL; char *dptr = NULL; P_ARK_t *pcfg = NULL; ark_io_list_t *bl_array = NULL; if ( (_arkp->ea->st_type == EA_STORE_TYPE_MEMORY) || !(_arkp->flags & ARK_KV_PERSIST_STORE) ) { return 0; } ark_persistence_calc(_arkp); // allocate write buffer tot_bytes = _arkp->pers_max_blocks * _arkp->bsize; p_data_orig = am_malloc(tot_bytes); if (p_data_orig == NULL) { KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for " "persistence data", tot_bytes); return ENOMEM; } memset(p_data_orig, 0, tot_bytes); p_data = ptr_align(p_data_orig); // Record cntr data pptr = (p_cntr_t *)p_data; memcpy(pptr->p_cntr_magic, ARK_P_MAGIC, sizeof(pptr->p_cntr_magic)); pptr->p_cntr_version = ARK_P_VERSION_2; pptr->p_cntr_size = sizeof(p_cntr_t); // Record configuration info pcfg = (P_ARK_t*)pptr->p_cntr_data; pcfg->flags = _arkp->flags; pcfg->size = _arkp->ea->size; pcfg->bsize = _arkp->bsize; pcfg->bcount = _arkp->bcount; pcfg->blkbits = _arkp->blkbits; pcfg->grow = _arkp->blkbits; pcfg->hcount = _arkp->hcount; pcfg->vlimit = _arkp->vlimit; pcfg->blkused = _arkp->blkused; pcfg->nasyncs = _arkp->nasyncs; pcfg->basyncs = _arkp->basyncs; pcfg->ntasks = _arkp->ntasks; pcfg->nthrds = _arkp->nthrds; ark_persist_stats(_arkp, &(pcfg->pstats)); pptr->p_cntr_cfg_offset = 0; pptr->p_cntr_cfg_size = sizeof(P_ARK_t); dptr = pptr->p_cntr_data; // Record hash info dptr += pptr->p_cntr_cfg_size; pptr->p_cntr_ht_offset = dptr - pptr->p_cntr_data; pptr->p_cntr_ht_size = sizeof(hash_t) + (_arkp->ht->n * sizeof(uint64_t)); memcpy(dptr, _arkp->ht, pptr->p_cntr_ht_size); // Record block list info dptr += pptr->p_cntr_ht_size; pptr->p_cntr_bl_offset = dptr - pptr->p_cntr_data; pptr->p_cntr_bl_size = sizeof(BL); memcpy(dptr, _arkp->bl, pptr->p_cntr_bl_size); // Record IV list info dptr += pptr->p_cntr_bl_size; pptr->p_cntr_bliv_offset = dptr - pptr->p_cntr_data; // bliv_size = bytes in bl->list->data[cs_blocks + kvdata_blocks] // add 2 to top because of how IV->data chaining works pptr->p_cntr_bliv_size = divup((_arkp->bl->top+2) * _arkp->bl->w, 8); memcpy(dptr, _arkp->bl->list->data, pptr->p_cntr_bliv_size); // Calculate wrblks: number of persist metadata blocks to write tot_bytes = _arkp->pers_cs_bytes + pptr->p_cntr_bliv_size; wrblks = pcfg->pblocks = divup(tot_bytes, _arkp->bsize); KV_TRC(pAT, "PERSIST_WR dev:%s top:%ld wrblks:%ld vs pers_max_blocks:%ld", _arkp->ea->st_device, _arkp->bl->top, pcfg->pblocks, _arkp->pers_max_blocks); bl_array = bl_chain_blocks(_arkp->bl, 0, wrblks); if ( NULL == bl_array ) { KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" blocks for block list", wrblks); rc = ENOMEM; } else { rc = ea_async_io(_arkp->ea, ARK_EA_WRITE, (void *)p_data, bl_array, wrblks, _arkp->nthrds); am_free(bl_array); } KV_TRC(pAT, "PERSIST_DATA_STORED rc:%d", rc); am_free(p_data_orig); return rc; }