/**
 *******************************************************************************
 * \brief
 *  Read the i-th m-bit element out of the packed integer vector iv.
 *
 *  \param iv  packed vector handle (iv->m bits per element, iv->n elements)
 *  \param i   element index
 *  \return    the element value, or -1 on bad arguments
 *
 *  NOTE(review): this block appears truncated in this chunk -- the path for
 *  an element that straddles a 64-bit word boundary (b > iv->bar) and the
 *  "exception:" label targeted by the gotos are not visible here.
 ******************************************************************************/
int64_t iv_get(IV *iv, uint64_t i)
{
    uint64_t pos   = 0;
    uint64_t w     = 0;
    uint64_t b     = 0;
    uint64_t shift = 0;
    uint64_t val0  = 0;   /* unused in the visible portion */
    uint64_t val1  = 0;   /* unused in the visible portion */
    uint64_t msk0  = 0;   /* unused in the visible portion */
    uint64_t msk1  = 0;   /* unused in the visible portion */
    int64_t  val   = -1;  /* -1 is the error sentinel returned to callers */

    if (!iv)
    {
        KV_TRC_FFDC(pAT, "iv NULL i:%ld", i);
        goto exception;
    }
    if (i >= iv->n)
    {
        KV_TRC_FFDC(pAT, "i:%ld is invalid n:%ld", i, iv->n);
        goto exception;
    }

    pos = i * iv->m;   /* absolute bit offset of element i           */
    w   = pos >> 6;    /* 64-bit word holding the element's first bit */
    b   = pos & 63;    /* bit offset within that word                 */

    /* Element lies entirely inside word w when it starts at or above the
     * bar (bar = 64 - m, see iv_new): shift down and mask off m bits. */
    if (b <= iv->bar)
    {
        shift = iv->bar - b;
        val   = (int64_t)(iv->mask & (iv->data[w] >> shift));
    }
void *si_new(uint64_t nh, uint64_t ne, uint64_t nb) { SI *si = am_malloc(sizeof(SI)); if (si == NULL) { errno = ENOMEM; KV_TRC_FFDC(pAT, "FFDC1: nh %ld ne %ld nb %ld, rc = %d", nh, ne, nb, errno); return NULL; } si->nh = nh; si->ne = ne; si->nb = nb; si->ent_next = 0; si->gid_next = 0; si->dat_next = 0; si->tbl = am_malloc(nh * sizeof(uint64_t)); if ( si->tbl == NULL ) { errno = ENOMEM; KV_TRC_FFDC(pAT, "FFDC2: nh %ld ne %ld nb %ld, rc = %d", nh, ne, nb, errno); } else { memset(si->tbl, 0xFF, nh * sizeof(uint64_t)); si->dat = am_malloc(nb); if (si->dat == NULL) { errno = ENOMEM; KV_TRC_FFDC(pAT, "FFDC3: nh %ld ne %ld nb %ld, rc = %d", nh, ne, nb, errno); am_free(si->tbl); am_free(si); si = NULL; } else { si->ent = am_malloc(ne * sizeof(SIE)); if (si->ent == NULL) { errno = ENOMEM; KV_TRC_FFDC(pAT, "FFDC4: nh %ld ne %ld nb %ld, rc = %d", nh, ne, nb, errno); am_free(si->tbl); am_free(si->dat); am_free(si); si = NULL; } } } memset(si->tbl, 0xFF, nh * sizeof(uint64_t)); return si; }
/**
 *******************************************************************************
 * \brief
 *  Write value v into the i-th m-bit element of the packed vector iv.
 *  Handles both an element contained in a single 64-bit word and one that
 *  straddles two adjacent words.
 *
 *  \param iv  packed vector handle
 *  \param i   element index
 *  \param v   value to store (truncated to iv->m bits)
 *  \return    0 on success, -1 on bad arguments
 ******************************************************************************/
int iv_set(IV *iv, uint64_t i, uint64_t v)
{
    int      rc    = -1;
    uint64_t pos   = 0;
    uint64_t w     = 0;
    uint64_t b     = 0;
    uint64_t shift = 0;
    uint64_t msk0  = 0;  /* bits of the word to preserve */
    uint64_t msk1  = 0;  /* bits of the word being written */
    uint64_t val   = -1;

    if (!iv)
    {
        KV_TRC_FFDC(pAT, "iv NULL i:%ld", i);
        goto exception;
    }
    if (i >= iv->n)
    {
        KV_TRC_FFDC(pAT, "i:%ld is invalid n:%ld", i, iv->n);
        goto exception;
    }

    pos = i * iv->m;   /* absolute bit offset of element i           */
    w   = pos >> 6;    /* 64-bit word holding the element's first bit */
    b   = pos & 63;    /* bit offset within that word                 */
    v  &= iv->mask;    /* truncate the value to m bits                */

    if (b <= iv->bar)
    {
        /* Element fits in word w (bar = 64 - m): position the value and
         * merge it with the preserved bits of the word. */
        shift = iv->bar - b;
        msk1  = iv->mask << shift;
        msk0  = ~msk1;
        val   = v << shift;
        val  |= (iv->data[w] & msk0);
        iv->data[w] = val;
    }
    else
    {
        /* Element straddles words w and w+1: the high (m - shift) bits of
         * v land in the low bits of word w ... */
        shift = b - iv->bar;
        msk1  = iv->mask >> shift;
        msk0  = ~msk1;
        val   = v >> shift;
        val  |= (iv->data[w] & msk0);
        iv->data[w] = val;

        /* ... and the remaining low bits of v land in the high bits of
         * word w+1. */
        shift = 64 - (b - iv->bar);
        msk1  = iv->mask << shift;
        msk0  = ~msk1;
        val   = v << shift;
        val  |= (iv->data[w+1] & msk0);
        iv->data[w+1] = val;
    }

    rc=0;

exception:
    return rc;
}
/**
 *******************************************************************************
 * \brief
 *  Resize the packed vector piv to hold n elements of m bits each.
 *
 *  \param piv  existing vector (may be moved by realloc)
 *  \param n    new element count
 *  \param m    bits per element (expected 1..64)
 *  \return     the resized vector, or NULL with errno=ENOMEM (piv remains
 *              valid on failure, per realloc semantics)
 ******************************************************************************/
IV *iv_resize(IV *piv, uint64_t n, uint64_t m)
{
    uint64_t bits  = n * m;
    uint64_t words = divup(bits, 64);
    uint64_t bytes = sizeof(IV) + words * sizeof(uint64_t);
    IV      *iv    = am_realloc(piv,bytes);

    if (iv == NULL)
    {
        errno = ENOMEM;
        KV_TRC_FFDC(pAT, "FFDC: iv %p n %"PRIu64" m %"PRIu64", errno = %d",
                    piv, n, m, errno);
    }
    else
    {
        iv->n     = n;
        iv->m     = m;
        iv->bits  = bits;
        iv->words = words;
        /* BUGFIX: the old "mask = 1; mask <<= m; mask -= 1" sequence is
         * undefined behavior when m == 64 (shift by the full type width);
         * build the m-bit mask with a guarded shift instead. */
        iv->mask  = (m < 64) ? (((uint64_t)1 << m) - 1) : ~(uint64_t)0;
        iv->bar   = 64 - m;  /* bit offset at/below which an element fits in one word */
    }
    KV_TRC_DBG(pAT, "iv %p n %"PRIu64" m %"PRIu64"", piv, n, m);
    return iv;
}
/**
 *******************************************************************************
 * \brief
 *  Allocate a new zero-filled packed vector of n elements, m bits each.
 *
 *  \param n  element count
 *  \param m  bits per element (expected 1..64)
 *  \return   new IV*, or NULL with errno=ENOMEM
 ******************************************************************************/
IV *iv_new(uint64_t n, uint64_t m)
{
    uint64_t bits  = n * m;
    uint64_t words = divup(bits, 64);
    uint64_t bytes = sizeof(IV) + words * sizeof(uint64_t);
    IV      *iv    = am_malloc(bytes);

    if (iv == NULL)
    {
        errno = ENOMEM;
        KV_TRC_FFDC(pAT, "FFDC: n %"PRIu64" m %"PRIu64", errno = %d",
                    n, m, errno);
    }
    else
    {
        memset(iv,0x00, bytes);  /* header and all data words start at 0 */
        iv->n     = n;
        iv->m     = m;
        iv->bits  = bits;
        iv->words = words;
        /* BUGFIX: the old "mask = 1; mask <<= m; mask -= 1" sequence is
         * undefined behavior when m == 64 (shift by the full type width);
         * build the m-bit mask with a guarded shift instead. */
        iv->mask  = (m < 64) ? (((uint64_t)1 << m) - 1) : ~(uint64_t)0;
        iv->bar   = 64 - m;  /* bit offset at/below which an element fits in one word */
    }
    KV_TRC(pAT, "iv:%p n:%ld m:%ld", iv, n, m);
    return iv;
}
/** ******************************************************************************* * \brief ******************************************************************************/ uint32_t kv_async_dispatch_jobs(uint32_t ctxt) { async_context_t *pCT = pCTs+ctxt; async_CB_t *pCB = NULL; uint32_t jobs_running = 0; if (ctxt < 0 || ctxt > KV_ASYNC_MAX_CONTEXTS) { KV_TRC_FFDC(pFT, "FFDC %x", ctxt); return FALSE; } for (pCB=pCT->pCBs; pCB<pCT->pCBs+KV_ASYNC_JOB_Q; pCB++) { if (pCB->flags & KV_ASYNC_CB_QUEUED) { kv_async_dispatch(pCB); jobs_running = 1; usleep(1000); } else if (pCB->flags & KV_ASYNC_CB_RUNNING) { jobs_running = 1; } } return jobs_running; }
/**
 * Resize an entity store to bcount blocks of bsize bytes each.
 *
 * Memory-backed stores are realloc'ed in place; flash-backed stores are
 * resized through the block layer. The store is write-locked for the
 * duration of the operation.
 *
 * Returns 0 on success, non-zero on failure (errno set to ENOMEM/ENOSPC).
 */
int ea_resize(EA *ea, uint64_t bsize, uint64_t bcount)
{
    int      rc        = 0;
    uint64_t new_bytes = bcount * bsize;

    ARK_SYNC_EA_WRITE(ea);

    if (EA_STORE_TYPE_MEMORY == ea->st_type)
    {
        /* In-memory store: a plain realloc does the whole job. */
        uint8_t *new_store = realloc(ea->st_memory, new_bytes);

        if (new_store == NULL)
        {
            errno = ENOMEM;
            KV_TRC_FFDC(pAT, "ENOMEM, resize ea %p bsize %lu bcount %lu, errno = %d",
                        ea, bsize, bcount, errno);
            rc = 1;
        }
        else
        {
            ea->bcount    = bcount;
            ea->size      = new_bytes;
            ea->st_memory = new_store;
        }
    }
    else
    {
        /* Flash store: ask the block layer to change the chunk size. */
        rc = cblk_set_size(ea->st_flash, bcount, 0);
        if (rc != 0)
        {
            errno = ENOSPC;
            KV_TRC_FFDC(pAT, "cblk_set_size failed ea %p bsize %lu bcount %lu, "
                        "errno = %d", ea, bsize, bcount, errno);
        }
        else
        {
            ea->bcount = bcount;
            ea->size   = new_bytes;
        }
    }

    ARK_SYNC_EA_UNLOCK(ea);
    return rc;
}
/** ******************************************************************************* * \brief ******************************************************************************/ ARK* kv_async_get_ark(uint32_t ctxt) { async_context_t *pCT = pCTs+ctxt; if (ctxt < 0 || ctxt > KV_ASYNC_MAX_CONTEXTS) { KV_TRC_FFDC(pFT, "FFDC %x", ctxt); return FALSE; } return pCT->ark; }
/**
 * Completion step for an async EXIST command: search the bucket image that
 * was read into tcbp->inb for the request's key and record the outcome in
 * the request control block (res = value length on hit, res = -1 and
 * rc = ENOENT on miss). Always returns ARK_CMD_DONE.
 */
int ark_exist_finish(_ARK *_arkp, int tid, tcb_t *tcbp)
{
    rcb_t *rcbp = &(_arkp->rcbs[tcbp->rtag]);

    /* Find the key position in the read in bucket */
    rcbp->res = bt_exists(tcbp->inb, rcbp->klen, rcbp->key);

    if (BT_FAIL == rcbp->res)
    {
        KV_TRC_FFDC(pAT, "rc = ENOENT key %p, klen %"PRIu64"",
                    rcbp->key, rcbp->klen);
        rcbp->rc  = ENOENT;
        rcbp->res = -1;
    }

    return ARK_CMD_DONE;
}
/**
 *******************************************************************************
 * \brief
 *  Issue the IOs described by iocbp, starting at block index iocbp->start.
 *  return TRUE if all IOs for the iocb are successfully completed, else FALSE
 *
 *  Memory-backed stores complete synchronously via memcpy; flash-backed
 *  stores are issued asynchronously and normally still need harvesting
 *  (so rc becomes FALSE when an IO is merely queued). On EAGAIN from the
 *  block layer the loop stops and iocbp->start records the resume point.
 *
 *  \param _arkp   ARK instance (unused here beyond tracing context)
 *  \param tid     calling thread id (tracing only)
 *  \param iotcbp  task control block; its state is advanced to ARK_IO_HARVEST
 *  \param iocbp   IO control block describing op, buffer, and block list
 ******************************************************************************/
int ea_async_io_schedule(_ARK *_arkp, int32_t tid, tcb_t *iotcbp, iocb_t *iocbp)
{
    EA      *ea     = iocbp->ea;
    int32_t  rc     = TRUE;  /* stays TRUE only if every block completed here */
    int32_t  arc    = 0;
    void    *prc    = 0;
    int64_t  i      = 0;
    uint8_t *p_addr = NULL;
    uint8_t *m_addr = NULL;
    char    *ot     = NULL;

    KV_TRC_IO(pAT, "IO_BEG: SCHEDULE_START: tid:%d ttag:%d start:%"PRIu64" "
              "nblks:%"PRIu64" issT:%d cmpT:%d",
              tid, iocbp->tag, iocbp->start, iocbp->nblks,
              iocbp->issT, iocbp->cmpT);

    ARK_SYNC_EA_READ(iocbp->ea);

    if (iocbp->op == ARK_EA_READ) {ot="IO_RD";}
    else                          {ot="IO_WR";}

    for (i=iocbp->start; i<iocbp->nblks; i++)
    {
        if (ea->st_type == EA_STORE_TYPE_MEMORY)
        {
            /* memory-backed store: copy synchronously, one block at a time */
            p_addr = ((uint8_t *)(iocbp->addr)) + (i * ea->bsize);
            m_addr = ea->st_memory + (iocbp->blist[i].blkno * ea->bsize);

            if (ARK_EA_READ == iocbp->op) {prc = memcpy(p_addr,m_addr,ea->bsize);}
            else                          {prc = memcpy(m_addr,p_addr,ea->bsize);}

            if (check_sched_error_injects(iocbp->op)) {prc=NULL;}

            // if memcpy failed, fail the iocb
            if (prc == NULL)
            {
                rc=FALSE;
                KV_TRC_FFDC(pAT,"IO_ERR: tid:%d ttag:%d blkno:%"PRIi64""
                            " errno:%d",
                            tid, iocbp->tag, iocbp->blist[i].blkno, errno);
                if (!errno) {KV_TRC_FFDC(pAT, "IO: UNSET_ERRNO"); errno=EIO;}
                iocbp->io_error = errno;
                break;
            }
            ++iocbp->issT;
            iocbp->blist[i].a_tag = i;
        }
        else // r/w to hw
        {
            p_addr = ((uint8_t *)iocbp->addr) + (i * ea->bsize);

            if (check_sched_error_injects(iocbp->op))
            {
                arc=-1;
            }
            else if ( iocbp->op == ARK_EA_READ )
            {
                arc = cblk_aread(ea->st_flash, p_addr, iocbp->blist[i].blkno,
                                 1, &(iocbp->blist[i].a_tag), NULL, 0);
            }
            else
            {
                arc = cblk_awrite(ea->st_flash, p_addr, iocbp->blist[i].blkno,
                                  1, &(iocbp->blist[i].a_tag), NULL, 0);
            }

            if (arc == 0) // good status
            {
                /* IO was queued but not yet finished: it must still be
                 * harvested, so the iocb is not fully complete */
                ++iocbp->issT;
                rc=FALSE;
            }
            else if (arc < 0)
            {
                rc=FALSE;
                if (errno == EAGAIN)
                {
                    // return, and an ark thread will re-schedule this iocb
                    KV_TRC_DBG(pAT,"IO: RW_EAGAIN: tid:%d ttag:%d "
                               "blkno:%"PRIi64"",
                               tid, iocbp->tag, iocbp->blist[i].blkno);
                    break;
                }
                // Something bad went wrong, fail the iocb
                KV_TRC_FFDC(pAT,"IO_ERR: tid:%d ttag:%d blkno:%"PRIi64""
                            " errno:%d",
                            tid, iocbp->tag, iocbp->blist[i].blkno, errno);
                if (!errno) {KV_TRC_FFDC(pAT, "IO: UNSET_ERRNO"); errno=EIO;}
                iocbp->io_error = errno;
                break;
            }
            else if (arc > 0)
            {
                /* block layer completed the IO immediately; no harvest needed */
                KV_TRC_IO(pAT,"IO_CMP: IMMEDIATE: tid:%d ttag:%d a_tag:%d "
                          "blkno:%"PRIi64"",
                          tid, iocbp->tag, iocbp->blist[i].a_tag,
                          iocbp->blist[i].blkno);
                ++iocbp->issT;
                ++iocbp->cmpT;
                iocbp->blist[i].a_tag = -1; // mark as harvested
            }
        }
        KV_TRC_IO(pAT, "%s: tid:%2d ttag:%4d a_tag:%4d blkno:%5"PRIi64"",
                  ot,tid, iocbp->tag, iocbp->blist[i].a_tag,
                  iocbp->blist[i].blkno);
    }

    iotcbp->state = ARK_IO_HARVEST;
    iocbp->start = i;  /* resume point if the loop broke out early (e.g. EAGAIN) */

    ARK_SYNC_EA_UNLOCK(iocbp->ea);
    return rc;
}
/**
 *******************************************************************************
 * \brief
 *  Write the ARK's persistence metadata (config header, hash table, block
 *  list, and block-list IV data) to the reserved blocks at the start of the
 *  flash store. No-op for in-memory stores or when persistence is disabled.
 *
 *  \param _arkp  ARK instance to persist
 *  \return       0 on success, ENOMEM on allocation failure, else the rc of
 *                the block-layer write
 ******************************************************************************/
int ark_persist(_ARK *_arkp)
{
    int32_t        rc          = 0;
    uint64_t       tot_bytes   = 0;
    uint64_t       wrblks      = 0;
    char          *p_data_orig = NULL;
    char          *p_data      = NULL;
    p_cntr_t      *pptr        = NULL;
    char          *dptr        = NULL;
    P_ARK_t       *pcfg        = NULL;
    ark_io_list_t *bl_array    = NULL;

    /* nothing to persist for memory stores or when the flag is off */
    if ( (_arkp->ea->st_type == EA_STORE_TYPE_MEMORY) ||
         !(_arkp->flags & ARK_KV_PERSIST_STORE) )
    {
        return 0;
    }

    ark_persistence_calc(_arkp);

    // allocate write buffer
    tot_bytes = _arkp->pers_max_blocks * _arkp->bsize;
    p_data_orig = am_malloc(tot_bytes);
    if (p_data_orig == NULL)
    {
        KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for "
                    "persistence data", tot_bytes);
        return ENOMEM;
    }
    memset(p_data_orig, 0, tot_bytes);
    p_data = ptr_align(p_data_orig);

    // Record cntr data
    pptr = (p_cntr_t *)p_data;
    memcpy(pptr->p_cntr_magic, ARK_P_MAGIC, sizeof(pptr->p_cntr_magic));
    pptr->p_cntr_version = ARK_P_VERSION_2;
    pptr->p_cntr_size    = sizeof(p_cntr_t);

    // Record configuration info
    pcfg = (P_ARK_t*)pptr->p_cntr_data;
    pcfg->flags   = _arkp->flags;
    pcfg->size    = _arkp->ea->size;
    pcfg->bsize   = _arkp->bsize;
    pcfg->bcount  = _arkp->bcount;
    pcfg->blkbits = _arkp->blkbits;
    /* BUGFIX: grow was persisted as a copy-paste of blkbits; the restore
     * path (ark_check_persistence) reads pcfg->grow into _arkp->grow, so
     * the wrong value round-tripped. */
    pcfg->grow    = _arkp->grow;
    pcfg->hcount  = _arkp->hcount;
    pcfg->vlimit  = _arkp->vlimit;
    pcfg->blkused = _arkp->blkused;
    pcfg->nasyncs = _arkp->nasyncs;
    pcfg->basyncs = _arkp->basyncs;
    pcfg->ntasks  = _arkp->ntasks;
    pcfg->nthrds  = _arkp->nthrds;
    ark_persist_stats(_arkp, &(pcfg->pstats));
    pptr->p_cntr_cfg_offset = 0;
    pptr->p_cntr_cfg_size   = sizeof(P_ARK_t);
    dptr = pptr->p_cntr_data;

    // Record hash info
    dptr += pptr->p_cntr_cfg_size;
    pptr->p_cntr_ht_offset = dptr - pptr->p_cntr_data;
    pptr->p_cntr_ht_size   = sizeof(hash_t) + (_arkp->ht->n * sizeof(uint64_t));
    memcpy(dptr, _arkp->ht, pptr->p_cntr_ht_size);

    // Record block list info
    dptr += pptr->p_cntr_ht_size;
    pptr->p_cntr_bl_offset = dptr - pptr->p_cntr_data;
    pptr->p_cntr_bl_size   = sizeof(BL);
    memcpy(dptr, _arkp->bl, pptr->p_cntr_bl_size);

    // Record IV list info
    dptr += pptr->p_cntr_bl_size;
    pptr->p_cntr_bliv_offset = dptr - pptr->p_cntr_data;
    // bliv_size = bytes in bl->list->data[cs_blocks + kvdata_blocks]
    // add 2 to top because of how IV->data chaining works
    pptr->p_cntr_bliv_size = divup((_arkp->bl->top+2) * _arkp->bl->w, 8);
    memcpy(dptr, _arkp->bl->list->data, pptr->p_cntr_bliv_size);

    // Calculate wrblks: number of persist metadata blocks to write
    tot_bytes = _arkp->pers_cs_bytes + pptr->p_cntr_bliv_size;
    wrblks = pcfg->pblocks = divup(tot_bytes, _arkp->bsize);

    KV_TRC(pAT, "PERSIST_WR dev:%s top:%ld wrblks:%ld vs pers_max_blocks:%ld",
           _arkp->ea->st_device, _arkp->bl->top, pcfg->pblocks,
           _arkp->pers_max_blocks);

    bl_array = bl_chain_blocks(_arkp->bl, 0, wrblks);
    if ( NULL == bl_array )
    {
        KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" blocks for block list",
                    wrblks);
        rc = ENOMEM;
    }
    else
    {
        rc = ea_async_io(_arkp->ea, ARK_EA_WRITE, (void *)p_data,
                         bl_array, wrblks, _arkp->nthrds);
        am_free(bl_array);
    }

    KV_TRC(pAT, "PERSIST_DATA_STORED rc:%d", rc);
    am_free(p_data_orig);
    return rc;
}
/**
 * Create a new entity store (EA) backed either by host memory (NULL/empty
 * path) or by a file/CAPI device opened through the block layer.
 *
 * \param path    device/file path, or NULL/"" for an in-memory store
 * \param bsize   block size in bytes
 * \param basyncs max async commands passed to cblk_open
 * \param size    in: requested store bytes; out: actual bytes
 * \param bcount  out: block count (in for virtual LUNs via *size/bsize)
 * \param vlun    non-zero to open a virtual LUN, 0 for a physical LUN
 * \return        new EA* on success; NULL on failure (errno set; forced to
 *                ENOSPC if a callee failed without setting it)
 */
EA *ea_new(const char *path, uint64_t bsize, int basyncs,
           uint64_t *size, uint64_t *bcount, uint64_t vlun)
{
    int             rc    = 0;
    size_t          plen  = 0;
    uint8_t        *store = NULL;
    EA             *ea    = NULL;
    chunk_id_t      chkid = NULL_CHUNK_ID;
    chunk_ext_arg_t ext   = 0;

    /* one-time, race-guarded initialization of the block library */
    if (!(fetch_and_or(&cflsh_blk_lib_init,1)))
    {
        // We need to call cblk_init once before
        // we use any other cblk_ interfaces
        rc = cblk_init(NULL,0);
        if (rc)
        {
            KV_TRC_FFDC(pAT, "cblk_init failed path %s bsize %"PRIu64" "
                        "size %"PRIu64" bcount %"PRIu64", errno = %d",
                        path, bsize, *size, *bcount, errno);
            goto error_exit;
        }
    }

    ea = am_malloc(sizeof(EA));
    if (NULL == ea)
    {
        KV_TRC_FFDC(pAT, "Out of memory path %s bsize %"PRIu64" size %"PRIu64" "
                    "bcount %"PRIu64", errno = %d",
                    path, bsize, *size, *bcount, errno);
        goto error_exit;
    }

    // We need to check the path parameter to see if
    // we are going to use memory or a file/capi
    // device (to be determined by the block layer)
    if ( (NULL == path) || (strlen(path) == 0) )
    {
        KV_TRC(pAT, "EA_STORE_TYPE_MEMORY");
        // Using memory for store
        ea->st_type = EA_STORE_TYPE_MEMORY;
        store = malloc(*size);
        if (NULL == store)
        {
            errno = ENOMEM;
            KV_TRC_FFDC(pAT, "Out of memory for store path %s bsize %"PRIu64" "
                        "size %"PRIu64" bcount %"PRIu64", errno = %d",
                        path, bsize, *size, *bcount, errno);
            goto error_exit;
        }
        *bcount = ((*size) / bsize);
        ea->st_memory = store;
    }
    else
    {
        KV_TRC(pAT, "EA_STORE_TYPE_FILE(%s)", path);
        // Using a file. We don't care if it's an actual
        // file or a CAPI device, we let block layer
        // decide and we just use the chunk ID that is
        // passed back from the cblk_open call.
        ea->st_type = EA_STORE_TYPE_FILE;

        // Check to see if we need to create the store on a
        // physical or virtual LUN. Previously, in GA1,
        // we keyed off the size and if it was 0, then we
        // asked for the LUN to be physical. Now, the user
        // can specify with a flag.
        if ( vlun == 0 )
        {
            KV_TRC(pAT, "cblk_open PHYSICAL LUN: %s", path);
            chkid = cblk_open(path, basyncs, O_RDWR, ext,
                              CBLK_OPN_NO_INTRP_THREADS);
            if (NULL_CHUNK_ID == chkid)
            {
                printf("cblk_open physical lun failed\n");
                KV_TRC_FFDC(pAT, "cblk_open phys lun failed path:%s bsize:%ld "
                            "size:%ld bcount:%ld, errno:%d",
                            path, bsize, *size, *bcount, errno);
                goto error_exit;
            }
            /* physical LUN: the device dictates the block count */
            rc = cblk_get_size(chkid, (size_t *)bcount, 0);
            if ( (rc != 0) || (*bcount == 0) )
            {
                // An error was encountered, close the chunk
                cblk_close(chkid, 0);
                chkid = NULL_CHUNK_ID;
                KV_TRC_FFDC(pAT, "cblk_get_size failed path %s bsize %"PRIu64" "
                            "size %"PRIu64" bcount %"PRIu64", errno = %d",
                            path, bsize, *size, *bcount, errno);
                goto error_exit;
            }
            // Set the size to be returned
            *size = *bcount * bsize;
        }
        else
        {
            KV_TRC(pAT, "cblk_open VIRTUAL LUN: %s", path);
            chkid = cblk_open(path, basyncs, O_RDWR, ext,
                              CBLK_OPN_VIRT_LUN|CBLK_OPN_NO_INTRP_THREADS);
            if (NULL_CHUNK_ID == chkid)
            {
                printf("cblk_open virtual lun failed\n");
                KV_TRC_FFDC(pAT, "cblk_open virt lun failed path:%s bsize:%ld "
                            "size:%ld bcount:%ld, errno:%d",
                            path, bsize, *size, *bcount, errno);
                goto error_exit;
            }
            // A specific size was passed in so we try to set the
            // size of the chunk.
            *bcount = *size / bsize;
            rc = cblk_set_size(chkid, (size_t)*bcount, 0);
            if ( rc != 0 )
            {
                printf("cblk_set_size failed for %ld\n", *bcount);
                // An error was encountered, close the chunk
                cblk_close(chkid, 0);
                chkid = NULL_CHUNK_ID;
                KV_TRC_FFDC(pAT, "cblk_set_size failed path %s bsize %"PRIu64" "
                            "size %"PRIu64" bcount %"PRIu64", errno = %d",
                            path, bsize, *size, *bcount, errno);
                goto error_exit;
            }
        }

        // Save off the chunk ID and the device name
        ea->st_flash = chkid;
        plen = strlen(path) + 1;
        ea->st_device = (char *)am_malloc(plen);
        if (!ea->st_device)
        {
            cblk_close(chkid, 0);
            KV_TRC_FFDC(pAT, "MALLOC st_device failed (%s) plen=%ld errno:%d",
                        path, plen, errno);
            goto error_exit;
        }
        /* plen includes the NUL, so this copy is always terminated */
        memset(ea->st_device, 0, plen);
        strncpy(ea->st_device, path, plen);
    }

    // Fill in the EA struct
    pthread_rwlock_init(&(ea->ea_rwlock), NULL);
    ea->bsize  = bsize;
    ea->bcount = *bcount;
    ea->size   = *size;

    KV_TRC(pAT, "path %s bsize %"PRIu64" size %"PRIu64" bcount %"PRIu64"",
           path, bsize, *size, *bcount);
    goto done;

error_exit:
    /* NOTE(review): only ea itself is freed here; the memory-store branch
     * has no failure path after store is allocated, so store cannot leak */
    am_free(ea);
    ea = NULL;
    if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOSPC;}

done:
    return ea;
}
/**
 * Synchronously perform len block IOs described by blist against the store.
 *
 * Memory-backed stores are serviced with memcpy. Flash-backed stores issue
 * up to max_ops async commands at a time (the command budget is divided
 * among nthrs threads) and then harvest every issued command with blocking
 * cblk_aresult calls before issuing the next batch.
 *
 * \param ea     entity store
 * \param op     ARK_EA_READ or ARK_EA_WRITE
 * \param addr   host buffer (len * ea->bsize bytes)
 * \param blist  per-block LBA/tag list of length len
 * \param len    number of blocks to transfer
 * \param nthrs  thread count used to size the per-call command budget
 * \return       0 on success, else an errno value from the first failure
 */
int ea_async_io(EA *ea, int op, void *addr, ark_io_list_t *blist,
                int64_t len, int nthrs)
{
    int64_t   i       = 0;
    int64_t   j       = 0;
    int64_t   comps   = 0;   /* blocks fully completed so far */
    int       num     = 0;
    int       max_ops = 0;
    void     *m_rc    = NULL;
    int       rc      = 0;
    int       a_rc    = 0;
    uint64_t  status  = 0;
    uint8_t  *p_addr  = NULL;
    uint8_t  *m_addr  = NULL;
    char     *ot      = NULL;

    ARK_SYNC_EA_READ(ea);

    if (op == ARK_EA_READ) {ot="IO_RD";}
    else                   {ot="IO_WR";}

    if ( ea->st_type == EA_STORE_TYPE_MEMORY)
    {
        // Loop through the block list to issue the IO
        for(i = 0; i < len; i++)
        {
            p_addr = ((uint8_t*)addr) + (i * ea->bsize);

            // For in-memory Store, we issue the memcpy
            // and wait for the return, no async here.
            // Read out the value from the in-memory block
            m_addr = ea->st_memory + (blist[i].blkno * ea->bsize);
            if (op == ARK_EA_READ) {m_rc = memcpy(p_addr, m_addr, ea->bsize);}
            else                   {m_rc = memcpy(m_addr, p_addr, ea->bsize);}

            if (check_sched_error_injects(op)) {m_rc=NULL;}
            if (check_harv_error_injects(op)) {m_rc=NULL;}

            if (m_rc == NULL)
            {
                rc = errno;
                break;
            }
        }
    }
    else
    {
        // divide up the cmd slots among
        // the threads and go 3 less
        max_ops = (ARK_EA_BLK_ASYNC_CMDS / nthrs) - 3;

        // Loop through the block list to issue the IO
        while ((comps < len) && (rc == 0))
        {
            /* issue phase: queue at most max_ops async commands */
            for(i = comps, num = 0; (i < len) && (num < max_ops); i++, num++)
            {
                p_addr = ((uint8_t*)addr) + (i * ea->bsize);

                // Call out to the block layer and retrieve a block.
                // Do an async op for a single block and tell the block
                // layer to wait if there are no available command
                // blocks. Upon return, we can either get an error
                // (rc == -1), the data will be available (rc == number
                // of blocks read), or IO has been scheduled (rc == 0).
                if (op == ARK_EA_READ)
                {
                    rc = cblk_aread(ea->st_flash, p_addr, blist[i].blkno, 1,
                                    &(blist[i].a_tag), NULL,CBLK_ARW_WAIT_CMD_FLAGS);
                }
                else
                {
                    rc = cblk_awrite(ea->st_flash, p_addr, blist[i].blkno, 1,
                                     &(blist[i].a_tag), NULL,CBLK_ARW_WAIT_CMD_FLAGS);
                }

                if (check_sched_error_injects(op)) {rc=-1;}

                KV_TRC_IO(pAT, "%s: id:%d blkno:%"PRIi64" rc:%d",
                          ot, ea->st_flash, blist[i].blkno, rc);

                if ( rc == -1 )
                {
                    // Error was encountered. Don't issue any more IO
                    rc = errno;
                    KV_TRC_FFDC(pAT, "IO_ERR: cblk_aread/awrite failed, "
                                "blkno:%"PRIi64" tag:%d, errno = %d",
                                blist[i].blkno, blist[i].a_tag, errno);
                    break;
                }

                // Data has already been returned so we don't need to
                // wait for the response below
                if ( rc > 0 )
                {
                    blist[i].a_tag = -1;
                    rc = 0;
                }
                //_arkp->stats.io_cnt++;
            }

            // For as many IOs that were performed, we loop to
            // see if we need to wait for the response or the
            // data has already been returned.
            for (j = comps; j < i; j++)
            {
                // Data has already been read
                if (blist[j].a_tag == -1)
                {
                    continue;
                }

                do
                {
                    a_rc = cblk_aresult(ea->st_flash, &(blist[j].a_tag),
                                        &status, CBLK_ARESULT_BLOCKING);
                    if (check_harv_error_injects(op)) {a_rc=-1;}

                    // There was an error, check to see if we haven't
                    // encountered an error previously and if not, then
                    // set rc. Continue processing so that we harvest
                    // all outstanding responses
                    if (a_rc == -1)
                    {
                        if (rc == 0)
                        {
                            rc = errno;
                        }
                        KV_TRC_IO(pAT, "IO_ERR: id:%d blkno:%ld status:%ld a_rc:%d",
                                  ea->st_flash, blist[j].blkno, status, a_rc);
                    }
                    else
                    {
                        KV_TRC_IO(pAT, "IO_CMP: id:%d blkno:%ld status:%ld a_rc:%d",
                                  ea->st_flash, blist[j].blkno, status, a_rc);
                    }
                    // If a_rc is 0, that means we got interrupted somehow
                    // so we need to retry the operation.
                } while (a_rc == 0);
            }

            // If we start another loop, start off where we finished
            // in this loop.
            comps = i;
        }
    }

    ARK_SYNC_EA_UNLOCK(ea);
    return rc;
}
// if successful returns length of value void ark_exist_start(_ARK *_arkp, int tid, tcb_t *tcbp) { scb_t *scbp = &(_arkp->poolthreads[tid]); rcb_t *rcbp = &(_arkp->rcbs[tcbp->rtag]); tcb_t *iotcbp = &(_arkp->tcbs[rcbp->ttag]); iocb_t *iocbp = &(_arkp->iocbs[rcbp->ttag]); ark_io_list_t *bl_array = NULL; int32_t rc = 0; // Now that we have the hash entry, get the block // that holds the control information for the entry. tcbp->hblk = HASH_LBA(HASH_GET(_arkp->ht, rcbp->pos)); // If there is no control block for this hash // entry, then the key is not present in the hash. // Set the error if ( tcbp->hblk == 0 ) { KV_TRC_FFDC(pAT, "rc = ENOENT key %p, klen %"PRIu64" ttag:%d", rcbp->key, rcbp->klen, tcbp->ttag); rcbp->res = -1; rcbp->rc = ENOENT; tcbp->state = ARK_CMD_DONE; goto ark_exist_start_err; } // Set up the in-buffer to read in the hash bucket // that contains the key tcbp->blen = bl_len(_arkp->bl, tcbp->hblk); rc = bt_growif(&(tcbp->inb), &(tcbp->inb_orig), &(tcbp->inblen), (tcbp->blen * _arkp->bsize)); if (rc != 0) { KV_TRC_FFDC(pAT, "bt_growif failed tcbp:%p ttag:%d", tcbp, tcbp->ttag); rcbp->res = -1; rcbp->rc = rc; tcbp->state = ARK_CMD_DONE; goto ark_exist_start_err; } // Create a chain of blocks to be passed to be read bl_array = bl_chain(_arkp->bl, tcbp->hblk, tcbp->blen); if (bl_array == NULL) { KV_TRC_FFDC(pAT, "bl_chain failed tcbp:%p ttag:%d", tcbp, tcbp->ttag); rcbp->rc = ENOMEM; rcbp->res = -1; tcbp->state = ARK_CMD_DONE; goto ark_exist_start_err; } scbp->poolstats.io_cnt += tcbp->blen; KV_TRC_IO(pAT, "read hash entry ttag:%d", tcbp->ttag); ea_async_io_init(_arkp, ARK_EA_READ, (void *)tcbp->inb, bl_array, tcbp->blen, 0, tcbp->ttag, ARK_EXIST_FINISH); if (ea_async_io_schedule(_arkp, tid, iotcbp, iocbp) && ea_async_io_harvest (_arkp, tid, iotcbp, iocbp, rcbp)) { ark_exist_finish(_arkp, tid, tcbp); } ark_exist_start_err: return; }
/**
 *******************************************************************************
 * \brief
 *  Look for persisted ARK metadata at the start of the store and, when the
 *  magic and version checks pass, load the configuration, hash table, and
 *  block-list state into _arkp.
 *
 *  \param _arkp  ARK instance under construction
 *  \param flags  ARK_KV_* flags from the caller
 *  \return       0 when persistence data was found and loaded, -1 when no
 *                load was requested or no data was found, else an errno code
 ******************************************************************************/
int ark_check_persistence(_ARK *_arkp, uint64_t flags)
{
    int32_t        rc          = -1;
    char          *p_data_orig = NULL;
    char          *p_data      = NULL;
    ark_io_list_t *bl_array    = NULL;
    p_cntr_t      *pptr        = NULL;
    P_ARK_t       *pcfg        = NULL;
    hash_t        *htp         = NULL;
    BL            *blp         = NULL;
    uint64_t       rdblks      = 0;

    if (flags & ARK_KV_PERSIST_LOAD) {KV_TRC(pAT, "PERSIST_LOAD");}

    // Ignore the persistence data and load from scratch
    if ( (!(flags & ARK_KV_PERSIST_LOAD)) || (flags & ARK_KV_VIRTUAL_LUN) )
    {
        return -1;
    }

    /* read block 0, which holds the persistence header */
    p_data_orig = am_malloc(_arkp->bsize);
    if (p_data_orig == NULL)
    {
        KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for the first "
                    "persistence block", _arkp->bsize);
        rc = ENOMEM;
    }
    else
    {
        p_data = ptr_align(p_data_orig);
        bl_array = bl_chain_no_bl(0, 1);
        rc = ea_async_io(_arkp->ea, ARK_EA_READ, (void *)p_data, bl_array, 1, 1);
        am_free(bl_array);
    }

    if (rc == 0)
    {
        // We've read the first block. We check to see if
        // persistence data is present and if so, then
        // read the rest of the data from the flash.
        pptr = (p_cntr_t *)p_data;
        _arkp->persdata = p_data_orig;

        /* BUGFIX: the "!= 0" previously sat INSIDE the sizeof() argument,
         * so memcmp was called with length (sizeof(...) != 0) == 1 and
         * only the first byte of the magic was ever compared. */
        if ( memcmp(pptr->p_cntr_magic, ARK_P_MAGIC,
                    sizeof(pptr->p_cntr_magic)) != 0 )
        {
            KV_TRC_FFDC(pAT, "No magic number found in persistence data: %d",
                        EINVAL);
            // The magic number does not match so data is either
            // not present or is corrupted.
            rc = -1;
        }
        else
        {
            // Now we check version and the first persistence data
            // needs to be the ARK_PERSIST_CONFIG block
            if (pptr->p_cntr_version != ARK_P_VERSION_1 &&
                pptr->p_cntr_version != ARK_P_VERSION_2)
            {
                KV_TRC_FFDC(pAT, "Invalid / unsupported version: %"PRIu64"",
                            pptr->p_cntr_version);
                rc = EINVAL;
            }
            else
            {
                // Read in the rest of the persistence data
                pcfg = (P_ARK_t *)(pptr->p_cntr_data + pptr->p_cntr_cfg_offset);
                rdblks = pcfg->pblocks;
                if (rdblks > 1)
                {
                    /* NOTE(review): realloc-overwrite pattern; on failure
                     * the old buffer remains reachable only through
                     * _arkp->persdata -- confirm intended ownership */
                    p_data_orig = am_realloc(p_data_orig, (rdblks * _arkp->bsize));
                    if (p_data_orig == NULL)
                    {
                        KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for "
                                    "full persistence block",
                                    (rdblks * _arkp->bsize));
                        rc = ENOMEM;
                    }
                    else
                    {
                        p_data = ptr_align(p_data_orig);
                        bl_array = bl_chain_no_bl(0, rdblks);
                        if (bl_array == NULL)
                        {
                            KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" blocks for "
                                        "full persistence data", rdblks);
                            rc = ENOMEM;
                        }
                    }

                    // We are still good to read the rest of the data
                    // from the flash
                    if (rc == 0)
                    {
                        KV_TRC(pAT, "PERSIST_RD rdblks:%ld", rdblks);
                        rc = ea_async_io(_arkp->ea, ARK_EA_READ, (void *)p_data,
                                         bl_array, rdblks, 1);
                        am_free(bl_array);
                        /* buffer may have moved: refresh header pointers */
                        pptr = (p_cntr_t *)p_data;
                        pcfg = (P_ARK_t *)(pptr->p_cntr_data +
                                           pptr->p_cntr_cfg_offset);
                        _arkp->persdata = p_data_orig;
                    }
                }
            }
        }
    }

    // If rc == 0, that means we have persistence data
    if (rc == 0)
    {
        KV_TRC(pAT, "PERSIST_META size %ld bsize %ld hcount %ld bcount %ld "
               "nthrds %d nasyncs %d basyncs %d blkbits %ld version:%ld",
               pcfg->size, pcfg->bsize, pcfg->hcount, pcfg->bcount,
               pcfg->nthrds, pcfg->nasyncs, pcfg->basyncs, pcfg->blkbits,
               pptr->p_cntr_version);
        _arkp->persload = 1;
        _arkp->size     = pcfg->size;
        _arkp->flags    = flags;
        _arkp->bsize    = pcfg->bsize;
        _arkp->bcount   = pcfg->bcount;
        _arkp->blkbits  = pcfg->blkbits;
        _arkp->grow     = pcfg->grow;
        _arkp->hcount   = pcfg->hcount;
        _arkp->vlimit   = pcfg->vlimit;
        _arkp->blkused  = pcfg->blkused;
        _arkp->pers_stats.kv_cnt   = pcfg->pstats.kv_cnt;
        _arkp->pers_stats.blk_cnt  = pcfg->pstats.blk_cnt;
        _arkp->pers_stats.byte_cnt = pcfg->pstats.byte_cnt;
        KV_TRC(pAT, "ARK_META size %ld bsize %ld hcount %ld bcount %ld "
               "nthrds %d nasyncs %ld basyncs %d blkbits %ld",
               _arkp->size, _arkp->bsize, _arkp->hcount, _arkp->bcount,
               _arkp->nthrds, _arkp->nasyncs, _arkp->basyncs, _arkp->blkbits);

        /* rebuild the hash table from the persisted image */
        htp = (hash_t *)(pptr->p_cntr_data + pptr->p_cntr_ht_offset);
        _arkp->ht = hash_new(htp->n);
        if (_arkp->ht == NULL)
        {
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
            rc = errno;
            KV_TRC_FFDC(pAT, "ht_new failed: n:%ld rc:%d", htp->n, rc);
            goto error_exit;
        }
        memcpy(_arkp->ht, htp, pptr->p_cntr_ht_size);

        /* rebuild the block list from the persisted image */
        blp = (BL *)(pptr->p_cntr_data + pptr->p_cntr_bl_offset);
        _arkp->bl = bl_new(blp->n, blp->w);
        if (_arkp->bl == NULL)
        {
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
            rc = errno;
            KV_TRC_FFDC(pAT, "bl_new failed: n:%ld w:%ld rc:%d",
                        blp->n, blp->w, rc);
            goto error_exit;
        }
        _arkp->bl->count = blp->count;
        _arkp->bl->head  = blp->head;
        _arkp->bl->hold  = blp->hold;
        _arkp->bl->top   = blp->top;

        if (pptr->p_cntr_version == ARK_P_VERSION_1)
        {
            IV *piv = (IV *)(pptr->p_cntr_data + pptr->p_cntr_bliv_offset);
            KV_TRC(pAT, "PERSIST_VERSION_1 LOADED");
            _arkp->bl->top = _arkp->bl->n;
            // copy IV->data from piv->data
            memcpy(_arkp->bl->list->data, piv->data, pptr->p_cntr_bliv_size);
        }
        else if (pptr->p_cntr_version == ARK_P_VERSION_2)
        {
            KV_TRC(pAT, "PERSIST_VERSION_2 LOADED");
            // copy IV->data from bliv_offset
            memcpy(_arkp->bl->list->data,
                   pptr->p_cntr_data + pptr->p_cntr_bliv_offset,
                   pptr->p_cntr_bliv_size);
        }
        else
        {
            rc = EINVAL;
            KV_TRC_FFDC(pAT, "bad persistent version number: ver:%ld",
                        pptr->p_cntr_version);
            goto error_exit;
        }
        KV_TRC(pAT, "BL_META: n:%ld count:%ld head:%ld hold:%ld top:%ld",
               _arkp->bl->n, _arkp->bl->count, _arkp->bl->head,
               _arkp->bl->hold, _arkp->bl->top);
    }

    /* NOTE: the success path intentionally falls through and frees the
     * staging buffer as well (preserved from the original control flow) */
error_exit:
    am_free(p_data_orig);
    return rc;
}
// if successful returns length of value int ark_exist_start(_ARK *_arkp, int tid, tcb_t *tcbp) { scb_t *scbp = &(_arkp->poolthreads[tid]); rcb_t *rcbp = &(_arkp->rcbs[tcbp->rtag]); ark_io_list_t *bl_array = NULL; int32_t rc = 0; int32_t state = ARK_CMD_DONE; // Now that we have the hash entry, get the block // that holds the control information for the entry. tcbp->hblk = HASH_LBA(HASH_GET(_arkp->ht, rcbp->pos)); // If there is no control block for this hash // entry, then the key is not present in the hash. // Set the error if ( tcbp->hblk == 0 ) { KV_TRC_FFDC(pAT, "rc = ENOENT key %p, klen %"PRIu64"", rcbp->key, rcbp->klen); rcbp->res = -1; rcbp->rc = ENOENT; state = ARK_CMD_DONE; goto ark_exist_start_err; } // Set up the in-buffer to read in the hash bucket // that contains the key tcbp->blen = bl_len(_arkp->bl, tcbp->hblk); rc = bt_growif(&(tcbp->inb), &(tcbp->inb_orig), &(tcbp->inblen), (tcbp->blen * _arkp->bsize)); if (rc != 0) { rcbp->res = -1; rcbp->rc = rc; state = ARK_CMD_DONE; goto ark_exist_start_err; } // Create a chain of blocks to be passed to be read bl_array = bl_chain(_arkp->bl, tcbp->hblk, tcbp->blen); if (bl_array == NULL) { rcbp->rc = ENOMEM; rcbp->res = -1; state = ARK_CMD_DONE; goto ark_exist_start_err; } scbp->poolstats.io_cnt += tcbp->blen; rc = ea_async_io_mod(_arkp, ARK_EA_READ, (void *)tcbp->inb, bl_array, tcbp->blen, 0, tcbp->ttag, ARK_EXIST_FINISH); if (rc < 0) { rcbp->rc = -rc; rcbp->res = -1; state = ARK_CMD_DONE; goto ark_exist_start_err; } else if (rc == 0) { state = ARK_IO_HARVEST; } else { state = ark_exist_finish(_arkp, tid, tcbp); } ark_exist_start_err: return state; }
/**
 *******************************************************************************
 * \brief
 *   Create and initialize a new ARK instance.
 *
 *   Opens the backing store (memory or flash), optionally reloads persisted
 *   metadata, allocates the hash table, block list, tag pools, request/task/io
 *   control blocks, and spawns the server thread pool.
 *
 * \param  path    backing device/file path (NULL-able per ea_new semantics)
 * \param  arkret  out: opaque ARK handle on success, NULL on failure
 * \param  size    requested store size in bytes
 * \param  bsize   block size in bytes
 * \param  hcount  hash bucket count
 * \param  nthrds  requested server threads (must be > 0; actual count is
 *                 forced to ARK_VERBOSE_NTHRDS_DEF for perf reasons)
 * \param  nqueue  async op queue depth (<=0 selects ARK_MAX_ASYNC_OPS)
 * \param  basyncs backend async depth
 * \param  flags   ARK_KV_* flags; persistence and virtual LUN are exclusive
 * \return 0 on success, else errno-style code; on failure *arkret is NULL
 ******************************************************************************/
int ark_create_verbose(char *path, ARK **arkret,
                       uint64_t size,
                       uint64_t bsize,
                       uint64_t hcount,
                       int nthrds, int nqueue, int basyncs,
                       uint64_t flags)
{
    int       rc     = 0;
    int       p_rc   = 0;
    uint64_t  bcount = 0;
    uint64_t  x      = 0;
    int       i      = 0;
    int       tnum   = 0;
    int       rnum   = 0;
    scb_t    *scbp   = NULL;

    KV_TRC_OPEN(pAT, "arkdb");

    if (NULL == arkret)
    {
        KV_TRC_FFDC(pAT, "Incorrect value for ARK control block: rc=EINVAL");
        rc = EINVAL;
        goto ark_create_ark_err;
    }

    /* Persistence and a virtual LUN are mutually exclusive. */
    if ( (flags & (ARK_KV_PERSIST_LOAD|ARK_KV_PERSIST_STORE)) &&
         (flags & ARK_KV_VIRTUAL_LUN) )
    {
        KV_TRC_FFDC(pAT, "Invalid persistence combination with ARK flags: %016lx",
                    flags);
        rc = EINVAL;
        goto ark_create_ark_err;
    }

    if (nthrds <= 0)
    {
        KV_TRC_FFDC(pAT, "invalid nthrds:%d", nthrds);
        rc = EINVAL;
        goto ark_create_ark_err;
    }

    _ARK *ark = am_malloc(sizeof(_ARK));
    if (ark == NULL)
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Out of memory allocating ARK control structure for %ld",
                    sizeof(_ARK));
        goto ark_create_ark_err;
    }

    KV_TRC(pAT, "%p path(%s) size %ld bsize %ld hcount %ld "
           "nthrds %d nqueue %d basyncs %d flags:%08lx",
           ark, path, size, bsize, hcount,
           nthrds, nqueue, basyncs, flags);

    ark->bsize    = bsize;
    ark->rthread  = 0;
    ark->persload = 0;
    ark->nasyncs  = ((nqueue <= 0) ? ARK_MAX_ASYNC_OPS : nqueue);
    ark->basyncs  = basyncs;
    ark->ntasks   = ARK_MAX_TASK_OPS;
    ark->nthrds   = ARK_VERBOSE_NTHRDS_DEF; // hardcode, perf requirement

    /* Create the KV storage, whether that will be memory based or flash. */
    ark->ea = ea_new(path, ark->bsize, basyncs, &size, &bcount,
                     (flags & ARK_KV_VIRTUAL_LUN));
    if (ark->ea == NULL)
    {
        if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
        rc = errno;
        KV_TRC_FFDC(pAT, "KV storage initialization failed: rc/errno:%d", rc);
        goto ark_create_ea_err;
    }

    /* Check whether data was persisted by a previous instantiation. */
    p_rc = ark_check_persistence(ark, flags);
    if (p_rc > 0)
    {
        /* Error while trying to read from the store. */
        rc = p_rc;
        KV_TRC_FFDC(pAT, "Persistence check failed: %d", rc);
        goto ark_create_persist_err;
    }
    else if (p_rc == -1)
    {
        KV_TRC(pAT, "NO PERSIST LOAD FLAG");

        /* No persisted data: build off of what was passed into the API. */
        ark->size     = size;
        ark->bcount   = bcount;
        ark->hcount   = hcount;
        ark->vlimit   = ARK_VERBOSE_VLIMIT_DEF;
        ark->blkbits  = ARK_VERBOSE_BLKBITS_DEF;
        ark->grow     = ARK_VERBOSE_GROW_DEF;
        ark->rthread  = 0;
        ark->flags    = flags;
        ark->astart   = 0;
        ark->blkused  = 1;
        ark->ark_exit = 0;
        ark->nactive  = 0;
        ark->pers_stats.kv_cnt   = 0;
        ark->pers_stats.blk_cnt  = 0;
        ark->pers_stats.byte_cnt = 0;
        ark->pcmd = PT_IDLE;

        /* Partition the hash space across the server threads. */
        x = ark->hcount / ark->nthrds;
        ark->npart = x + (ark->hcount % ark->nthrds ? 1 : 0);

        /* Create the hash table. */
        ark->ht = hash_new(ark->hcount);
        if (ark->ht == NULL)
        {
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
            rc = errno;
            KV_TRC_FFDC(pAT, "Hash initialization failed: %d", rc);
            goto ark_create_ht_err;
        }

        /* Create the block list. */
        ark->bl = bl_new(ark->bcount, ark->blkbits);
        if (ark->bl == NULL)
        {
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
            rc = errno;
            KV_TRC_FFDC(pAT, "Block list initialization failed: %d", rc);
            goto ark_create_bl_err;
        }

        if (flags & ARK_KV_PERSIST_STORE)
        {
            ark_persistence_calc(ark);
            if (bl_reserve(ark->bl, ark->pers_max_blocks))
            {
                /* BUGFIX: previously jumped to ark_create_bl_err with rc
                 * still 0, silently returning "success" with *arkret NULL
                 * and leaking ark->bl. Set rc and free the block list via
                 * the pth_mutex_err label (mutex not yet initialized). */
                if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
                rc = errno;
                KV_TRC_FFDC(pAT, "bl_reserve failed: %d", rc);
                goto ark_create_pth_mutex_err;
            }
        }
    }
    else
    {
        KV_TRC(pAT, "PERSIST: %p path(%s) size %ld bsize %ld hcount %ld "
               "nthrds %d nqueue %ld basyncs %d bcount %ld blkbits %ld",
               ark, path, ark->size, ark->bsize, ark->hcount,
               ark->nthrds, ark->nasyncs, ark->basyncs, ark->bcount,
               ark->blkbits);
    }

    rc = pthread_mutex_init(&ark->mainmutex,NULL);
    if (rc != 0)
    {
        KV_TRC_FFDC(pAT, "pthread_mutex_init for main mutex failed: %d", rc);
        goto ark_create_pth_mutex_err;
    }

    ark->rtags = tag_new(ark->nasyncs);
    if ( NULL == ark->rtags )
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Tag initialization for requests failed: %d", rc);
        goto ark_create_rtag_err;
    }

    ark->ttags = tag_new(ark->ntasks);
    if ( NULL == ark->ttags )
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Tag initialization for tasks failed: %d", rc);
        goto ark_create_ttag_err;
    }

    ark->rcbs = am_malloc(ark->nasyncs * sizeof(rcb_t));
    if ( NULL == ark->rcbs )
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for request control blocks",
                    (ark->nasyncs * sizeof(rcb_t)));
        goto ark_create_rcbs_err;
    }

    ark->tcbs = am_malloc(ark->ntasks * sizeof(tcb_t));
    if ( NULL == ark->tcbs )
    {
        rc = ENOMEM;
        /* BUGFIX: message previously reported sizeof(rcb_t) */
        KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for task control blocks",
                    (ark->ntasks * sizeof(tcb_t)));
        goto ark_create_tcbs_err;
    }
    /* BUGFIX: zero the task control blocks; the ark_create_taskloop_err
     * cleanup walks ALL ntasks entries and frees .inb/.oub/.vb_orig, which
     * were previously uninitialized garbage on early failure paths. */
    memset(ark->tcbs, 0, ark->ntasks * sizeof(tcb_t));

    ark->iocbs = am_malloc(ark->ntasks * sizeof(iocb_t));
    if ( NULL == ark->iocbs )
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for io control blocks",
                    (ark->ntasks * sizeof(iocb_t)));
        goto ark_create_iocbs_err;
    }

    ark->poolthreads = am_malloc(ark->nthrds * sizeof(scb_t));
    if ( NULL == ark->poolthreads )
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for server thread control blocks",
                    (ark->nthrds * sizeof(scb_t)));
        goto ark_create_poolthreads_err;
    }

    for ( rnum = 0; rnum < ark->nasyncs ; rnum++ )
    {
        ark->rcbs[rnum].stat = A_NULL;
        pthread_cond_init(&(ark->rcbs[rnum].acond), NULL);
        pthread_mutex_init(&(ark->rcbs[rnum].alock), NULL);
    }

    for ( tnum = 0; tnum < ark->ntasks; tnum++ )
    {
        ark->tcbs[tnum].inb = bt_new(0, ark->vlimit, sizeof(uint64_t),
                                     &(ark->tcbs[tnum].inblen),
                                     &(ark->tcbs[tnum].inb_orig));
        if (ark->tcbs[tnum].inb == NULL)
        {
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
            rc = errno;
            KV_TRC_FFDC(pAT, "Bucket allocation for inbuffer failed: %d", rc);
            goto ark_create_taskloop_err;
        }

        ark->tcbs[tnum].oub = bt_new(0, ark->vlimit, sizeof(uint64_t),
                                     &(ark->tcbs[tnum].oublen),
                                     &(ark->tcbs[tnum].oub_orig));
        if (ark->tcbs[tnum].oub == NULL)
        {
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
            rc = errno;
            KV_TRC_FFDC(pAT, "Bucket allocation for outbuffer failed: %d", rc);
            goto ark_create_taskloop_err;
        }

        ark->tcbs[tnum].vbsize  = bsize * 256;
        ark->tcbs[tnum].vb_orig = am_malloc(ark->tcbs[tnum].vbsize);
        if (ark->tcbs[tnum].vb_orig == NULL)
        {
            rc = ENOMEM;
            /* BUGFIX: message previously reported bsize*1024, but the
             * allocation is vbsize (bsize*256). */
            KV_TRC_FFDC(pAT, "Out of memory allocation for %"PRIu64" bytes for variable size buffer",
                        (ark->tcbs[tnum].vbsize));
            goto ark_create_taskloop_err;
        }
        ark->tcbs[tnum].vb = ptr_align(ark->tcbs[tnum].vb_orig);
    }

    *arkret = (void *)ark;

    ark->pts = (PT *)am_malloc(sizeof(PT) * ark->nthrds);
    if ( ark->pts == NULL )
    {
        rc = ENOMEM;
        KV_TRC_FFDC(pAT, "Out of memory allocation for %"PRIu64" bytes for server thread data",
                    (sizeof(PT) * ark->nthrds));
        goto ark_create_taskloop_err;
    }

    for (i = 0; i < ark->nthrds; i++)
    {
        PT *pt = &(ark->pts[i]);

        scbp = &(ark->poolthreads[i]);
        memset(scbp, 0, sizeof(scb_t));

        /* rlast of -1 means this thread has not yet been part of an
         * ark_random call. */
        scbp->rlast     = -1;
        scbp->holds     = 0;
        scbp->poolstate = PT_RUN;

        scbp->poolstats.io_cnt   = 0;
        scbp->poolstats.ops_cnt  = 0;
        scbp->poolstats.kv_cnt   = 0;
        scbp->poolstats.blk_cnt  = 0;
        scbp->poolstats.byte_cnt = 0;

        pthread_mutex_init(&(scbp->poolmutex), NULL);
        pthread_cond_init(&(scbp->poolcond), NULL);

        scbp->rqueue  = queue_new(ark->nasyncs);
        scbp->tqueue  = queue_new(ark->ntasks);
        scbp->ioqueue = queue_new(ark->ntasks);
        /* BUGFIX: queue_new results were previously unchecked; a NULL
         * queue would crash the pool thread on first use. */
        if (!scbp->rqueue || !scbp->tqueue || !scbp->ioqueue)
        {
            rc = ENOMEM;
            KV_TRC_FFDC(pAT, "queue allocation for server thread failed: %d", rc);
            goto ark_create_poolloop_err;
        }

        pt->id  = i;
        pt->ark = ark;
        rc = pthread_create(&(scbp->pooltid), NULL, pool_function, pt);
        if (rc != 0)
        {
            KV_TRC_FFDC(pAT, "pthread_create of server thread failed: %d", rc);
            goto ark_create_poolloop_err;
        }
    }

    ark->pcmd = PT_RUN;
    goto ark_create_return;

ark_create_poolloop_err:
    /* BUGFIX: cleanup previously skipped queue_free and mutex/cond destroy
     * for the slot whose thread was never created (pooltid == 0). Each scb
     * reached by the loop was memset and had its mutex/cond initialized, so
     * destroy unconditionally and free whatever queues exist. */
    for (; i >= 0; i--)
    {
        scbp = &(ark->poolthreads[i]);
        if (scbp->pooltid != 0)
        {
            queue_lock(scbp->rqueue);
            queue_wakeup(scbp->rqueue);
            queue_unlock(scbp->rqueue);
            pthread_join(scbp->pooltid, NULL);
        }
        pthread_mutex_destroy(&(scbp->poolmutex));
        pthread_cond_destroy(&(scbp->poolcond));
        if ( scbp->rqueue  != NULL ) { queue_free(scbp->rqueue); }
        if ( scbp->tqueue  != NULL ) { queue_free(scbp->tqueue); }
        if ( scbp->ioqueue != NULL ) { queue_free(scbp->ioqueue); }
    }
    if ( ark->pts != NULL ) { am_free(ark->pts); }

ark_create_taskloop_err:
    for ( tnum = 0; tnum < ark->ntasks; tnum++ )
    {
        if (ark->tcbs[tnum].inb)     { bt_delete(ark->tcbs[tnum].inb); }
        if (ark->tcbs[tnum].oub)     { bt_delete(ark->tcbs[tnum].oub); }
        if (ark->tcbs[tnum].vb_orig) { am_free(ark->tcbs[tnum].vb_orig); }
    }
    for (rnum = 0; rnum < ark->nasyncs; rnum++)
    {
        pthread_cond_destroy(&(ark->rcbs[rnum].acond));
        pthread_mutex_destroy(&(ark->rcbs[rnum].alock));
    }
    if ( ark->poolthreads != NULL ) { am_free(ark->poolthreads); }

ark_create_poolthreads_err:
    if (ark->iocbs) { am_free(ark->iocbs); }
ark_create_iocbs_err:
    if (ark->tcbs)  { am_free(ark->tcbs); }
ark_create_tcbs_err:
    if (ark->rcbs)  { am_free(ark->rcbs); }
ark_create_rcbs_err:
    if (ark->ttags) { tag_free(ark->ttags); }
ark_create_ttag_err:
    if (ark->rtags) { tag_free(ark->rtags); }
ark_create_rtag_err:
    pthread_mutex_destroy(&ark->mainmutex);
ark_create_pth_mutex_err:
    bl_delete(ark->bl);
ark_create_bl_err:
    hash_free(ark->ht);
ark_create_ht_err:
ark_create_persist_err:
    ea_delete(ark->ea);
ark_create_ea_err:
    am_free(ark);
    *arkret = NULL;
ark_create_ark_err:
    KV_TRC_CLOSE(pAT);
ark_create_return:
    return rc;
}
/** ******************************************************************************* * \brief ******************************************************************************/ void kv_async_run_jobs(void) { async_CB_t *pCB = NULL; uint32_t ctxt_running = 0; uint32_t jobs_running = 0; uint32_t i = 0; uint32_t next = 0; uint32_t elapse = 0; uint32_t inject = 0; uint32_t secs = 0; uint32_t log_interval = 600; uint64_t ops = 0; uint64_t ios = 0; uint32_t tops = 0; uint32_t tios = 0; uint32_t perf = 0; KV_TRC(pFT, "ASYNC START: 0 minutes"); if (!(pCTs->pCBs->flags & KV_ASYNC_CB_RUNNING)) start = time(0); next = log_interval; do { ctxt_running = FALSE; if (elapse > next) { KV_TRC(pFT, "ASYNC RUNNING: %d elapsed minutes", elapse/60); next += log_interval; } for (i=0; i<KV_ASYNC_MAX_CONTEXTS; i++) { if (! (pCTs[i].flags & KV_ASYNC_CT_RUNNING)) continue; jobs_running = kv_async_dispatch_jobs(i); if (!jobs_running) { pCTs[i].flags &= ~KV_ASYNC_CT_RUNNING; pCTs[i].flags |= KV_ASYNC_CT_DONE; KV_TRC(pFT, "ASYNC DONE ctxt %d %x", i, pCTs[i].flags); continue; } else { ctxt_running = TRUE; } elapse = time(0) - start; if (elapse >= inject && pCTs[i].flags & KV_ASYNC_CT_ERROR_INJECT) { KV_TRC_FFDC(pFT, "FFDC: INJECT ERRORS"); FVT_KV_INJECT_READ_ERROR; FVT_KV_INJECT_WRITE_ERROR; FVT_KV_INJECT_ALLOC_ERROR; ++inject; } if (elapse >= pCTs[i].secs) { for (pCB=pCTs[i].pCBs;pCB<pCTs[i].pCBs+KV_ASYNC_JOB_Q;pCB++) { if ((pCB->flags & KV_ASYNC_CB_RUNNING || pCB->flags & KV_ASYNC_CB_QUEUED) && (!(pCB->flags & KV_ASYNC_CB_SHUTDOWN)) ) { pCB->flags |= KV_ASYNC_CB_SHUTDOWN; KV_TRC_IO(pFT, "SHUTDOWN pCB %p (%d >= %d)", pCB, elapse, pCTs[i].secs); } } } usleep(100); } } while (ctxt_running); stop = time(0); secs = stop - start; KV_TRC(pFT, "ASYNC RUNNING DONE: %d minutes", elapse/60); /* log cleanup, since the first ark_delete closes the log file */ for (i=0; i<KV_ASYNC_MAX_CONTEXTS; i++) { if (pCTs[i].flags & KV_ASYNC_CT_DONE) KV_TRC(pFT, "ASYNC CLEANUP: ctxt:%d ark:%p", i, pCTs[i].ark); } /* check 
for MULTI_CTXT_IO, destroy common kv dbs */ for (pCB=pCTs->pCBs;pCB<pCTs->pCBs+KV_ASYNC_JOB_Q;pCB++) { if (pCB->flags & KV_ASYNC_CB_MULTI_CTXT_IO) { kv_db_destroy(pCB->db, pCB->len); } } for (i=0; i<KV_ASYNC_MAX_CONTEXTS; i++) { /* if this context didn't run any I/O */ if (! (pCTs[i].flags & KV_ASYNC_CT_DONE)) continue; pCTs[i].flags &= ~KV_ASYNC_CT_DONE; /* if perf then don't delete the ark here */ if (pCTs[i].flags & KV_ASYNC_CT_PERF) { perf = TRUE; continue; } (void)ark_stats(pCTs[i].ark, &ops, &ios); tops += (uint32_t)ops; tios += (uint32_t)ios; KV_TRC(pFT, "PERF ark%p ops:%"PRIu64" ios:%"PRIu64"", pCTs[i].ark, ops, ios); EXPECT_EQ(0, ark_delete(pCTs[i].ark)); } if (!perf) { tops = tops / secs; tios = tios / secs; printf("op/s:%d io/s:%d secs:%d\n", tops, tios, secs); KV_TRC(pFT, "PERF op/s:%d io/s:%d secs:%d", tops, tios, secs); } }
/**
 *******************************************************************************
 * \brief
 *   Tear down an ARK instance created by ark_create_verbose.
 *
 *   Stops and joins every pool thread, releases per-thread queues and sync
 *   objects, frees all control-block arrays and tag pools, persists metadata
 *   for non-virtual LUNs, and finally deletes the store, hash table and block
 *   list before freeing the ARK itself.
 *
 * \param  ark  opaque ARK handle; NULL yields EINVAL
 * \return 0 on success; EINVAL for a NULL handle; otherwise the ark_persist
 *         return code (teardown continues even if persist fails)
 ******************************************************************************/
int ark_delete(ARK *ark)
{
  int    rc    = 0;
  int    i     = 0;
  _ARK  *_arkp = (_ARK *)ark;
  scb_t *scbp  = NULL;

  if (NULL == ark)
  {
    rc = EINVAL;
    KV_TRC_FFDC(pAT, "Invalid ARK control block parameter: %d", rc);
    goto ark_delete_ark_err;
  }

  // Wait for all active threads to exit: flag PT_EXIT, wake each thread via
  // its request queue, then join before freeing its queues and sync objects.
  for (i = 0; i < _arkp->nthrds; i++)
  {
    scbp = &(_arkp->poolthreads[i]);
    scbp->poolstate = PT_EXIT;
    queue_lock(scbp->rqueue);
    queue_wakeup(scbp->rqueue);
    queue_unlock(scbp->rqueue);
    pthread_join(scbp->pooltid, NULL);
    queue_free(scbp->rqueue);
    queue_free(scbp->tqueue);
    queue_free(scbp->ioqueue);
    pthread_mutex_destroy(&(scbp->poolmutex));
    pthread_cond_destroy(&(scbp->poolcond));
    KV_TRC(pAT, "thread %d joined", i);
  }

  if (_arkp->poolthreads) am_free(_arkp->poolthreads);
  if (_arkp->pts)         am_free(_arkp->pts);

  // Destroy per-request sync objects (all initialized during create).
  for ( i = 0; i < _arkp->nasyncs ; i++ )
  {
    pthread_cond_destroy(&(_arkp->rcbs[i].acond));
    pthread_mutex_destroy(&(_arkp->rcbs[i].alock));
  }

  // Release per-task buffers: in/out bucket buffers and the variable buffer.
  for ( i = 0; i < _arkp->ntasks; i++ )
  {
    bt_delete(_arkp->tcbs[i].inb);
    bt_delete(_arkp->tcbs[i].oub);
    am_free(_arkp->tcbs[i].vb_orig);
  }

  if (_arkp->iocbs) { am_free(_arkp->iocbs); }
  if (_arkp->tcbs)  { am_free(_arkp->tcbs); }
  if (_arkp->rcbs)  { am_free(_arkp->rcbs); }
  if (_arkp->ttags) { tag_free(_arkp->ttags); }
  if (_arkp->rtags) { tag_free(_arkp->rtags); }

  // Physical LUNs persist their metadata on shutdown; a persist failure is
  // logged and reported via rc but does not abort the teardown.
  if (!(_arkp->flags & ARK_KV_VIRTUAL_LUN))
  {
    rc = ark_persist(_arkp);
    if ( rc != 0 )
    {
      KV_TRC_FFDC(pAT, "FFDC: ark_persist failed: %d", rc);
    }
  }

  pthread_mutex_destroy(&_arkp->mainmutex);
  // Store, hash table and block list go last: ark_persist above needs them.
  (void)ea_delete(_arkp->ea);
  hash_free(_arkp->ht);
  bl_delete(_arkp->bl);
  KV_TRC(pAT, "ark_delete done %p", _arkp);
  am_free(_arkp);

ark_delete_ark_err:
  KV_TRC_CLOSE(pAT);
  return rc;
}
/**
 *******************************************************************************
 * \brief
 *   Callback function for async set/get/exists/del completions.
 *
 *   Each job CB cycles through the states SET -> GET -> EXISTS -> DEL (flag
 *   bits in pCB->flags), running pCB->len keys per state. On completion of a
 *   whole cycle it either restarts (NEXT_LOOP) or, when SHUTDOWN is flagged,
 *   destroys its database and clears the CB.
 *
 * \param errcode  completion status of the async op (EBUSY means re-queue)
 * \param dt       caller tag: pointer to the owning async_CB_t
 * \param res      op result; for these ops the value length on success
 ******************************************************************************/
static void kv_async_cb(int errcode, uint64_t dt, int64_t res)
{
    async_CB_t *pCB  = (async_CB_t*)dt;
    kv_t       *p_kv = NULL;
    uint64_t    tag  = (uint64_t)pCB;

    if (pCB == NULL) { KV_TRC_FFDC(pFT, "FFDC: pCB NULL"); return; }

    // Guard markers at both ends of the CB detect memory corruption.
    if (pCB->b_mark != B_MARK)
    {
        KV_TRC_FFDC(pFT, "FFDC: B_MARK FAILURE %p: %"PRIx64"", pCB, pCB->b_mark);
        return;
    }
    if (pCB->e_mark != E_MARK)
    {
        KV_TRC_FFDC(pFT, "FFDC: E_MARK FAILURE %p: %"PRIx64"", pCB, pCB->e_mark);
        return;
    }

    // EBUSY: the ark queue was full; re-queue this op and try again later.
    if (EBUSY == errcode) {kv_async_q_retry(pCB); goto done;}

    if (IS_GTEST) { EXPECT_EQ(0, errcode); EXPECT_EQ(tag, pCB->tag); }

    // p_kv is the key/value pair this completion refers to; len_i then
    // advances to the next key in the db sequence.
    p_kv = pCB->db + pCB->len_i;
    ++pCB->len_i;

    if (pCB->flags & KV_ASYNC_CB_SET)
    {
        KV_TRC_IO(pFT, "KV_ASYNC_CB_SET, %p %d %d", pCB, pCB->len_i, pCB->len);
        if (0 != errcode)      printf("ark_set failed, errcode=%d\n", errcode);
        if (tag != pCB->tag)   printf("ark_set bad tag\n");
        if (res != p_kv->vlen) printf("ark_set bad vlen\n");
        if (IS_GTEST) { EXPECT_EQ(res, p_kv->vlen);}

        /* end of db len sequence, move to next step */
        if (pCB->len_i == pCB->len)
        {
            // write-perf jobs stop after the SET pass
            if (pCB->flags & KV_ASYNC_CB_WRITE_PERF)
            {
                pCB->len_i = 0;
                kv_async_perf_done(pCB);
                goto done;
            }
            // transition SET -> GET
            pCB->len_i  = 0;
            pCB->flags &= ~KV_ASYNC_CB_SET;
            pCB->flags |= KV_ASYNC_CB_GET;
            kv_async_GET_KEY(pCB);
            goto done;
        }
        kv_async_SET_KEY(pCB);
        goto done;
    }
    else if (pCB->flags & KV_ASYNC_CB_GET)
    {
        // verify the fetched value matches the stored one
        uint32_t miscompare = memcmp(p_kv->value, pCB->gvalue, p_kv->vlen);

        KV_TRC_IO(pFT, "KV_ASYNC_CB_GET, %p %d %d", pCB, pCB->len_i, pCB->len);
        if (0 != errcode)      printf("ark_get failed, errcode=%d\n", errcode);
        if (tag != pCB->tag)   printf("ark_get bad tag\n");
        if (res != p_kv->vlen) printf("ark_get bad vlen\n");
        if (IS_GTEST) { EXPECT_EQ(0, miscompare);}

        /* end of db len sequence, move to next step */
        if (pCB->len_i == pCB->len)
        {
            // read-perf jobs stop after the GET pass
            if (pCB->flags & KV_ASYNC_CB_READ_PERF)
            {
                pCB->len_i = 0;
                kv_async_perf_done(pCB);
                goto done;
            }
            // transition GET -> EXISTS
            pCB->len_i  = 0;
            pCB->flags &= ~KV_ASYNC_CB_GET;
            pCB->flags |= KV_ASYNC_CB_EXISTS;
            kv_async_EXISTS_KEY(pCB);
            goto done;
        }
        kv_async_GET_KEY(pCB);
        goto done;
    }
    else if (pCB->flags & KV_ASYNC_CB_EXISTS)
    {
        KV_TRC_IO(pFT, "KV_ASYNC_CB_EXISTS, %p %d %d", pCB, pCB->len_i, pCB->len);
        if (0 != errcode)      printf("ark_exists failed,errcode=%d\n",errcode);
        if (tag != pCB->tag)   printf("ark_exists bad tag\n");
        if (res != p_kv->vlen) printf("ark_exists bad vlen\n");
        if (IS_GTEST) { EXPECT_EQ(res, p_kv->vlen);}

        /* if end of db len sequence, move to next step */
        if (pCB->len_i == pCB->len)
        {
            pCB->len_i  = 0;
            pCB->flags &= ~KV_ASYNC_CB_EXISTS;

            // SGD jobs go straight to the DEL pass
            if (pCB->flags & KV_ASYNC_CB_SGD)
            {
                pCB->flags |= KV_ASYNC_CB_DEL;
                kv_async_DEL_KEY(pCB);
                goto done;
            }
            else if (pCB->flags & KV_ASYNC_CB_REPLACE)
            {
                /* make sure we don't shutdown before we have replaced once */
                if (pCB->replace && pCB->flags & KV_ASYNC_CB_SHUTDOWN)
                {
                    pCB->flags |= KV_ASYNC_CB_DEL;
                    kv_async_DEL_KEY(pCB);
                    goto done;
                }
                pCB->replace = TRUE;
                // regenerate the db values, then re-run the SET pass
                if (0 != pCB->regen(pCB->db, pCB->len, pCB->regen_len))
                {
                    printf("regen failure, fatal\n");
                    KV_TRC_FFDC(pFT, "FFDC: regen failure");
                    memset(pCB, 0, sizeof(async_CB_t));
                    goto done;
                }
                pCB->flags |= KV_ASYNC_CB_SET;
                kv_async_SET_KEY(pCB);
                goto done;
            }
            else
            {
                /* should not be here */
                EXPECT_TRUE(0);
            }
        }
        kv_async_EXISTS_KEY(pCB);
        goto done;
    }
    else if (pCB->flags & KV_ASYNC_CB_DEL)
    {
        KV_TRC_IO(pFT, "KV_ASYNC_CB_DEL, %p i:%d len:%d", pCB, pCB->len_i,pCB->len);
        if (0 != errcode)      printf("ark_del failed, errcode=%d\n",errcode);
        if (tag != pCB->tag)   printf("ark_del bad tag\n");
        if (res != p_kv->vlen) printf("ark_del bad vlen\n");
        if (IS_GTEST) { EXPECT_EQ(res, p_kv->vlen);}

        /* end of db len sequence, move to next step */
        if (pCB->len_i == pCB->len)
        {
            // SHUTDOWN: the job is over; release the db (unless shared
            // across contexts) and wipe the CB so it can be reused.
            if (pCB->flags & KV_ASYNC_CB_SHUTDOWN)
            {
                if (!(pCB->flags & KV_ASYNC_CB_MULTI_CTXT_IO))
                {
                    kv_db_destroy(pCB->db, pCB->len);
                }
                if (pCB->gvalue) free(pCB->gvalue);
                memset(pCB, 0, sizeof(async_CB_t));
                KV_TRC_IO(pFT, "LOOP_DONE: %p", pCB);
                goto done;
            }
            // otherwise restart the whole SET->GET->EXISTS->DEL cycle
            KV_TRC_IO(pFT, "NEXT_LOOP, %p", pCB);
            pCB->flags &= ~KV_ASYNC_CB_DEL;
            pCB->flags |= KV_ASYNC_CB_SET;
            pCB->len_i  = 0;
            kv_async_SET_KEY(pCB);
            goto done;
        }
        kv_async_DEL_KEY(pCB);
        goto done;
    }
    else
    {
        /* should not be here */
        EXPECT_TRUE(0);
    }

done:
    return;
}
/**
 *******************************************************************************
 * \brief
 *   return TRUE if the IOs for the iocb are successfully completed, else FALSE
 *
 *   Walks the issued commands in iocbp->blist, harvesting completions from
 *   the flash layer (memory-store IOs completed at schedule time). Based on
 *   the harvest results it advances the owning tcb's state machine:
 *   ARK_CMD_DONE on error, ARK_IO_DONE when all blocks completed,
 *   ARK_IO_SCHEDULE when more blocks still need IOs issued, or stays in
 *   harvest when issued IOs are still in flight.
 *
 * \param _arkp   ARK instance
 * \param tid     pool thread id (indexes poolthreads)
 * \param iotcbp  task control block whose state is advanced
 * \param iocbp   io control block tracking issued/completed counts
 * \param iorcbp  request control block that receives error results
 * \return TRUE only when every block's IO completed without error
 ******************************************************************************/
int ea_async_io_harvest(_ARK *_arkp, int32_t tid, tcb_t *iotcbp,
                        iocb_t *iocbp, rcb_t *iorcbp)
{
    EA       *ea     = iocbp->ea;
    int32_t   i      = 0;
    int32_t   arc    = 0;
    int32_t   rc     = FALSE;
    uint64_t  status = 0;
    scb_t    *scbp   = &(_arkp->poolthreads[tid]);
    queue_t  *rq     = scbp->rqueue;
    queue_t  *tq     = scbp->tqueue;
    queue_t  *ioq    = scbp->ioqueue;

    // Harvest each issued command; issT is the number issued so far.
    for (i=0; i<iocbp->issT; i++)
    {
        if (EA_STORE_TYPE_MEMORY == ea->st_type)
        {
            // the IO has already been done in the schedule function,
            // so mark it completed
            arc = 1;
        }
        else
        {
            // skip previously harvested cmd
            if (iocbp->blist[i].a_tag == -1) {continue;}
            arc = cblk_aresult(ea->st_flash, &(iocbp->blist[i].a_tag),
                               &status,0);
        }

        // test hook: force a harvest failure when error injection is armed
        if (check_harv_error_injects(iocbp->op)) {arc=-1;}

        if (arc == 0)
        {
            // arc==0: this IO is still in flight; stop harvesting this pass
            KV_TRC_DBG(pAT,"IO: WAIT_NOT_CMP: tid:%d ttag:%d a_tag:%d "
                       "blkno:%"PRIi64"",
                       tid, iocbp->tag, iocbp->blist[i].a_tag,
                       iocbp->blist[i].blkno);
            ++iocbp->hmissN;

            // if nothing to do and the first harvest missed, usleep
            if (queue_empty(rq) && queue_empty(tq) && queue_count(ioq)<=8 &&
                iocbp->hmissN==1 &&
                _arkp->ea->st_type != EA_STORE_TYPE_MEMORY)
            {
                usleep(50);
                KV_TRC_DBG(pAT,"IO: USLEEP");
            }
            break;
        }

        if (arc < 0)
        {
            // IO failed: record the errno; remaining IOs are still harvested
            KV_TRC_FFDC(pAT, "IO_ERR: tid:%d ttag:%d errno=%d",
                        tid, iocbp->tag, errno);
            if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=EIO;}
            iocbp->io_error = errno;
        }
        else
        {
            KV_TRC_IO(pAT,"IO_CMP: tid:%2d ttag:%4d a_tag:%4d blkno:%5"PRIi64"",
                      tid, iocbp->tag, iocbp->blist[i].a_tag,
                      iocbp->blist[i].blkno);
        }
        ++iocbp->cmpT;
        iocbp->blist[i].a_tag = -1; // mark as harvested
    }

    if (iocbp->io_error)
    {
        // if all cmds that were issued (success or fail) have been
        // completed for this iocb, then fail this iocb
        if (iocbp->issT == iocbp->cmpT)
        {
            iorcbp->res    = -1;
            iorcbp->rc     = iocbp->io_error;
            iotcbp->state  = ARK_CMD_DONE;
            am_free(iocbp->blist);
            KV_TRC_FFDC(pAT, "IO: ERROR_DONE: tid:%d ttag:%d rc:%d",
                        tid, iocbp->tag, iorcbp->rc);
        }
        else
        {
            // IOs outstanding, harvest the remaining IOs for this iocb
            KV_TRC_FFDC(pAT,"IO: ERROR_RE_HARVEST: tid:%d ttag:%d "
                        "iocbp->issT:%d iocbp->cmpT:%d",
                        tid, iocbp->tag, iocbp->issT, iocbp->cmpT);
        }
    }
    // if all IO has completed successfully for this iocb, done
    else if (iocbp->cmpT == iocbp->nblks)
    {
        rc=TRUE;
        am_free(iocbp->blist);
        iotcbp->state = ARK_IO_DONE;
        KV_TRC_IO(pAT, "IO_END: SUCCESS tid:%d ttag:%d cmpT:%d",
                  tid, iocbp->tag, iocbp->cmpT);
    }
    // if more blks need an IO, schedule
    else if (iocbp->issT < iocbp->nblks)
    {
        iotcbp->state = ARK_IO_SCHEDULE;
        KV_TRC_IO(pAT,"IO: RE_SCHEDULE: tid:%d ttag:%d "
                  "iocbp->issT:%d iocbp->nblks:%"PRIi64" ",
                  tid, iocbp->tag, iocbp->issT, iocbp->nblks);
    }
    else
    {
        // all IOs have been issued but not all are completed, do harvest
        KV_TRC_IO(pAT,"IO: RE_HARVEST: tid:%d ttag:%d "
                  "iocbp->cmpT:%d iocbp->issT:%d",
                  tid, iocbp->tag, iocbp->cmpT, iocbp->issT);
    }

    return rc;
}