Example #1
0
/**
 *******************************************************************************
 * \brief
 ******************************************************************************/
int64_t iv_get(IV *iv, uint64_t i)
{
  uint64_t  pos   = 0;
  uint64_t  w     = 0;
  uint64_t  b     = 0;
  uint64_t  shift = 0;
  uint64_t  val0  = 0;
  uint64_t  val1  = 0;
  uint64_t  msk0  = 0;
  uint64_t  msk1  = 0;
  int64_t   val   = -1;

  if (!iv)
  {
      KV_TRC_FFDC(pAT, "iv NULL i:%ld", i);
      goto exception;
  }
  if (i >= iv->n)
  {
      KV_TRC_FFDC(pAT, "i:%ld is invalid n:%ld", i, iv->n);
      goto exception;
  }

  pos = i * iv->m;
  w   = pos >> 6;
  b   = pos & 63;

  if (b <= iv->bar)
  {
    shift = iv->bar - b;
    val   = (int64_t)(iv->mask & (iv->data[w] >> shift));
  }
Example #2
0
/**
 *******************************************************************************
 * \brief
 *  Allocate an SI together with its hash table, data area and entry array.
 *
 * \param nh  number of uint64_t slots in the table (every byte preset to 0xFF)
 * \param ne  number of SIE entries to allocate
 * \param nb  number of bytes in the data area
 *
 * \return    the new SI, or NULL with errno = ENOMEM when any allocation
 *            fails; everything allocated up to that point is released first
 ******************************************************************************/
void *si_new(uint64_t nh, uint64_t ne, uint64_t nb) {

  SI *si = am_malloc(sizeof(SI));
  if (si == NULL)
  {
    errno = ENOMEM;
    KV_TRC_FFDC(pAT, "FFDC1: nh %ld ne %ld nb %ld, rc = %d",
            nh, ne, nb, errno);
    return NULL;
  }

  si->nh = nh;
  si->ne = ne;
  si->nb = nb;

  si->ent_next = 0;
  si->gid_next = 0;
  si->dat_next = 0;

  si->tbl = am_malloc(nh * sizeof(uint64_t));
  if (si->tbl == NULL)
  {
    errno = ENOMEM;
    KV_TRC_FFDC(pAT, "FFDC2: nh %ld ne %ld nb %ld, rc = %d",
            nh, ne, nb, errno);
    // Nothing but the SI itself exists yet; release it and report failure
    // instead of falling through with a NULL table.
    am_free(si);
    return NULL;
  }

  // Mark every table slot as unused (all bits set)
  memset(si->tbl, 0xFF, nh * sizeof(uint64_t));

  si->dat = am_malloc(nb);
  if (si->dat == NULL)
  {
    errno = ENOMEM;
    KV_TRC_FFDC(pAT, "FFDC3: nh %ld ne %ld nb %ld, rc = %d",
            nh, ne, nb, errno);
    am_free(si->tbl);
    am_free(si);
    return NULL;
  }

  si->ent = am_malloc(ne * sizeof(SIE));
  if (si->ent == NULL)
  {
    errno = ENOMEM;
    KV_TRC_FFDC(pAT, "FFDC4: nh %ld ne %ld nb %ld, rc = %d",
            nh, ne, nb, errno);
    am_free(si->tbl);
    am_free(si->dat);
    am_free(si);
    return NULL;
  }

  return si;
}
Example #3
0
/**
 *******************************************************************************
 * \brief
 *  Store the low iv->m bits of v as element i of the packed vector.
 *
 *  Element i starts at bit position i * iv->m, counted from the most
 *  significant bit of iv->data[0]. When the field does not fit in what is
 *  left of its 64-bit word it is split across that word and the next one.
 *
 * \return 0 on success, -1 when iv is NULL or i is not below iv->n
 ******************************************************************************/
int iv_set(IV *iv, uint64_t i, uint64_t v)
{
  uint64_t  bitpos   = 0;
  uint64_t  word     = 0;
  uint64_t  offset   = 0;
  uint64_t  field    = 0;
  uint64_t  lshift   = 0;
  uint64_t  rshift   = 0;
  uint64_t  keep     = 0;

  if (iv == NULL)
  {
      KV_TRC_FFDC(pAT, "iv NULL i:%ld", i);
      return -1;
  }
  if (i >= iv->n)
  {
      KV_TRC_FFDC(pAT, "i:%ld is invalid n:%ld", i, iv->n);
      return -1;
  }

  bitpos = i * iv->m;
  word   = bitpos >> 6;      // index of the 64-bit word holding the first bit
  offset = bitpos & 63;      // bit offset of the field inside that word
  field  = v & iv->mask;     // drop any bits above the element width

  if (offset <= iv->bar)
  {
    // The whole field fits inside data[word]
    lshift = iv->bar - offset;
    keep   = iv->mask << lshift;
    keep   = ~keep;
    iv->data[word] = (field << lshift) | (iv->data[word] & keep);
  }
  else
  {
    // The field straddles two words: high part goes at the tail of
    // data[word], low part at the head of data[word+1]
    rshift = offset - iv->bar;
    keep   = iv->mask >> rshift;
    keep   = ~keep;
    iv->data[word] = (field >> rshift) | (iv->data[word] & keep);

    lshift = 64 - (offset - iv->bar);
    keep   = iv->mask << lshift;
    keep   = ~keep;
    iv->data[word+1] = (field << lshift) | (iv->data[word+1] & keep);
  }

  return 0;
}
Example #4
0
/**
 *******************************************************************************
 * \brief
 *  Resize a packed vector to hold n elements of m bits each.
 *
 *  The IV header and its data words live in one allocation, so the whole
 *  object is reallocated and the returned pointer replaces piv.
 *
 * \param piv  vector to resize (passed straight to am_realloc)
 * \param n    new element count
 * \param m    element width in bits
 *
 * \return     the resized vector, or NULL with errno = ENOMEM when
 *             am_realloc fails
 ******************************************************************************/
IV *iv_resize(IV *piv, uint64_t n, uint64_t m) {

  uint64_t bits  = n * m;
  uint64_t words = divup(bits, 64);
  uint64_t bytes = sizeof(IV) + words * sizeof(uint64_t);

  IV *iv = am_realloc(piv,bytes);
  if (iv == NULL)
  {
    errno = ENOMEM;
    KV_TRC_FFDC(pAT, "FFDC: iv %p n %"PRIu64" m %"PRIu64", errno = %d", piv, n, m, errno);
  }
  else
  {
    iv->n      = n;
    iv->m      = m;
    iv->bits   = bits;
    iv->words  = words;
    if (m >= 64)
    {
      // Shifting a 64-bit value by 64 or more is undefined behaviour, so a
      // full-width element gets an all-ones mask directly.
      // NOTE(review): m > 64 is not rejected here - confirm callers never
      // pass it.
      iv->mask = ~(uint64_t)0;
    }
    else
    {
      // mask = (1 << m) - 1, i.e. the low m bits set
      iv->mask   = 1;
      iv->mask <<= m;
      iv->mask  -= 1;
    }
    // bar = highest in-word bit offset at which an element still fits
    iv->bar    = 64 - m;
  }

  KV_TRC_DBG(pAT, "iv %p n %"PRIu64" m %"PRIu64"", piv, n, m);
  return iv;
}
Example #5
0
/**
 *******************************************************************************
 * \brief
 *  Allocate a zero-filled packed vector of n elements, m bits each.
 *
 *  The IV header and its data words are carved out of a single allocation of
 *  sizeof(IV) plus enough 64-bit words to hold n * m bits.
 *
 * \param n  element count
 * \param m  element width in bits
 *
 * \return   the new vector, or NULL with errno = ENOMEM when am_malloc fails
 ******************************************************************************/
IV *iv_new(uint64_t n, uint64_t m) {

  uint64_t bits  = n * m;
  uint64_t words = divup(bits, 64);
  uint64_t bytes = sizeof(IV) + words * sizeof(uint64_t);

  IV *iv = am_malloc(bytes);
  if (iv == NULL)
  {
    errno = ENOMEM;
    KV_TRC_FFDC(pAT, "FFDC: n %"PRIu64" m %"PRIu64", errno = %d", n, m, errno);
  }
  else
  {
    memset(iv,0x00, bytes);
    iv->n      =  n;
    iv->m      = m;
    iv->bits   = bits;
    iv->words  = words;
    if (m >= 64)
    {
      // Shifting a 64-bit value by 64 or more is undefined behaviour, so a
      // full-width element gets an all-ones mask directly.
      // NOTE(review): m > 64 is not rejected here - confirm callers never
      // pass it.
      iv->mask = ~(uint64_t)0;
    }
    else
    {
      // mask = (1 << m) - 1, i.e. the low m bits set
      iv->mask   = 1;
      iv->mask <<= m;
      iv->mask  -= 1;
    }
    // bar = highest in-word bit offset at which an element still fits
    iv->bar    = 64 - m;
  }
  KV_TRC(pAT, "iv:%p n:%ld m:%ld", iv, n, m);
  return iv;
}
/**
 *******************************************************************************
 * \brief
 *  Walk every job slot of one async context, dispatching the queued ones.
 *
 *  Each slot flagged KV_ASYNC_CB_QUEUED is handed to kv_async_dispatch(),
 *  followed by a 1ms sleep.
 *
 * \param ctxt  index into pCTs
 *
 * \return 1 if at least one job was dispatched or is already running,
 *         0 if no job is queued or running, FALSE if ctxt is out of range
 ******************************************************************************/
uint32_t kv_async_dispatch_jobs(uint32_t ctxt)
{
    async_context_t *pCT          = NULL;
    async_CB_t      *pCB          = NULL;
    uint32_t         jobs_running = 0;

    // ctxt is unsigned, so only the upper bound needs checking.
    // NOTE(review): if pCTs holds exactly KV_ASYNC_MAX_CONTEXTS elements this
    // bound should be ">=" - confirm against the pCTs declaration.
    if (ctxt > KV_ASYNC_MAX_CONTEXTS)
    {
        KV_TRC_FFDC(pFT, "FFDC %x", ctxt);
        return FALSE;
    }

    // Form the context pointer only after ctxt has been validated
    pCT = pCTs+ctxt;

    for (pCB=pCT->pCBs; pCB<pCT->pCBs+KV_ASYNC_JOB_Q; pCB++)
    {
        if (pCB->flags & KV_ASYNC_CB_QUEUED)
        {
            kv_async_dispatch(pCB);
            jobs_running = 1;
            usleep(1000);
        }
        else if (pCB->flags & KV_ASYNC_CB_RUNNING)
        {
            jobs_running = 1;
        }
    }
    return jobs_running;
}
Example #7
0
/**
 *******************************************************************************
 * \brief
 *  Change the store behind an EA to bcount blocks of bsize bytes.
 *
 *  A memory store is realloc'ed; any other store is resized through
 *  cblk_set_size(). ea->bcount and ea->size are updated only on success.
 *  The work is done between ARK_SYNC_EA_WRITE and ARK_SYNC_EA_UNLOCK.
 *
 * \return 0 on success; 1 with errno = ENOMEM when realloc fails; the
 *         non-zero cblk_set_size() result with errno = ENOSPC otherwise
 ******************************************************************************/
int ea_resize(EA *ea, uint64_t bsize, uint64_t bcount)
{
  int      rc        = 0;
  uint64_t new_bytes = bcount * bsize;

  ARK_SYNC_EA_WRITE(ea);

  if ( ea->st_type != EA_STORE_TYPE_MEMORY )
  {
    // Backed by the block layer: ask it to set the new size
    rc = cblk_set_size(ea->st_flash, bcount, 0);
    if (rc != 0)
    {
        errno = ENOSPC;
        KV_TRC_FFDC(pAT, "cblk_set_size failed ea %p bsize %lu bcount %lu, "
                         "errno = %d",
                         ea, bsize, bcount, errno);
    }
    else
    {
      ea->bcount = bcount;
      ea->size   = new_bytes;
    }
  }
  else
  {
    // In-memory store: grow or shrink the buffer in place
    uint8_t *resized = realloc(ea->st_memory, new_bytes);
    if (resized == NULL)
    {
      errno = ENOMEM;
      KV_TRC_FFDC(pAT, "ENOMEM, resize ea %p bsize %lu bcount %lu, errno = %d",
              ea, bsize, bcount, errno);
      rc = 1;
    }
    else
    {
      ea->bcount    = bcount;
      ea->size      = new_bytes;
      ea->st_memory = resized;
    }
  }

  ARK_SYNC_EA_UNLOCK(ea);

  return rc;
}
/**
 *******************************************************************************
 * \brief
 *  Return the ARK handle stored in async context ctxt.
 *
 * \param ctxt  index into pCTs
 *
 * \return the context's ark pointer, or NULL if ctxt is out of range
 ******************************************************************************/
ARK* kv_async_get_ark(uint32_t ctxt)
{
    async_context_t *pCT = NULL;

    // ctxt is unsigned, so only the upper bound needs checking.
    // NOTE(review): if pCTs holds exactly KV_ASYNC_MAX_CONTEXTS elements this
    // bound should be ">=" - confirm against the pCTs declaration.
    if (ctxt > KV_ASYNC_MAX_CONTEXTS)
    {
        KV_TRC_FFDC(pFT, "FFDC %x", ctxt);
        return NULL;
    }

    // Form the context pointer only after ctxt has been validated
    pCT = pCTs+ctxt;

    return pCT->ark;
}
Example #9
0
/**
 *******************************************************************************
 * \brief
 *  Finish an EXIST request once its hash bucket has been read into tcbp->inb.
 *
 *  The bt_exists() result is stored in the request's res field. When the key
 *  is not in the bucket, res becomes -1 and rc becomes ENOENT.
 *
 * \return ARK_CMD_DONE in every case
 ******************************************************************************/
int ark_exist_finish(_ARK *_arkp, int tid, tcb_t *tcbp)
{
  rcb_t *req = &(_arkp->rcbs[tcbp->rtag]);

  // Look the key up in the bucket that was just read
  req->res = bt_exists(tcbp->inb, req->klen, req->key);

  if (req->res != BT_FAIL)
  {
    return ARK_CMD_DONE;
  }

  KV_TRC_FFDC(pAT, "rc = ENOENT key %p, klen %"PRIu64"",
                req->key, req->klen);
  req->rc  = ENOENT;
  req->res = -1;

  return ARK_CMD_DONE;
}
Example #10
0
/**
 *******************************************************************************
 * \brief
 *  Issue the outstanding block IOs of an iocb, resuming at iocbp->start.
 *
 *  For a memory store each block is copied synchronously with memcpy. For any
 *  other store each block is issued through cblk_aread/cblk_awrite; an IO that
 *  is only queued must still be harvested later.
 *
 *  On exit iotcbp->state is always ARK_IO_HARVEST and iocbp->start is the
 *  index at which the loop stopped, so a later call resumes from there.
 *
 * \param _arkp   ark handle (not referenced in this function body)
 * \param tid     thread id, used only in trace output
 * \param iotcbp  task control block whose state is advanced
 * \param iocbp   IO control block describing op, buffer, block list, counters
 *
 *  return TRUE if all IOs for the iocb are successfully completed, else FALSE
 *  (FALSE is also returned when an IO was merely queued, when the block layer
 *  reported EAGAIN, and on error, in which case iocbp->io_error is set)
 ******************************************************************************/
int ea_async_io_schedule(_ARK   *_arkp,
                         int32_t tid,
                         tcb_t  *iotcbp,
                         iocb_t *iocbp)
{
  EA       *ea     = iocbp->ea;
  int32_t   rc     = TRUE;
  int32_t   arc    = 0;
  void     *prc    = 0;
  int64_t   i      = 0;
  uint8_t  *p_addr = NULL;
  uint8_t  *m_addr = NULL;
  char     *ot     = NULL;

  KV_TRC_IO(pAT, "IO_BEG: SCHEDULE_START: tid:%d ttag:%d start:%"PRIu64" "
                 "nblks:%"PRIu64" issT:%d cmpT:%d",
                 tid, iocbp->tag, iocbp->start, iocbp->nblks,
                 iocbp->issT, iocbp->cmpT);

  ARK_SYNC_EA_READ(iocbp->ea);

  // Trace prefix for the per-block trace at the bottom of the loop
  if (iocbp->op == ARK_EA_READ) {ot="IO_RD";}
  else                          {ot="IO_WR";}

  // Resume at the first block not yet issued by an earlier call
  for (i=iocbp->start; i<iocbp->nblks; i++)
  {
      if (ea->st_type == EA_STORE_TYPE_MEMORY)
      {
          // p_addr: caller buffer slot for block i
          // m_addr: location of the target block inside the memory store
          p_addr = ((uint8_t *)(iocbp->addr)) + (i * ea->bsize);
          m_addr = ea->st_memory + (iocbp->blist[i].blkno * ea->bsize);

          if (ARK_EA_READ == iocbp->op) {prc = memcpy(p_addr,m_addr,ea->bsize);}
          else                          {prc = memcpy(m_addr,p_addr,ea->bsize);}

          // memcpy returns its destination argument, so in practice prc only
          // becomes NULL here through the error-injection hook
          if (check_sched_error_injects(iocbp->op)) {prc=NULL;}

          // if memcpy failed, fail the iocb
          if (prc == NULL)
          {
              rc=FALSE;
              KV_TRC_FFDC(pAT,"IO_ERR: tid:%d ttag:%d blkno:%"PRIi64""
                              " errno:%d", tid, iocbp->tag,
                              iocbp->blist[i].blkno, errno);
              // Make sure a non-zero error code is recorded
              if (!errno) {KV_TRC_FFDC(pAT, "IO:     UNSET_ERRNO"); errno=EIO;}
              iocbp->io_error = errno;
              break;
          }
          ++iocbp->issT;
          iocbp->blist[i].a_tag = i;
      }
      else // r/w to hw
      {
          p_addr = ((uint8_t *)iocbp->addr) + (i * ea->bsize);

          if (check_sched_error_injects(iocbp->op))
          {
              arc=-1;
          }
          else if ( iocbp->op == ARK_EA_READ )
          {
              arc = cblk_aread(ea->st_flash, p_addr, iocbp->blist[i].blkno, 1,
                              &(iocbp->blist[i].a_tag), NULL, 0);
          }
          else
          {
              arc = cblk_awrite(ea->st_flash, p_addr, iocbp->blist[i].blkno, 1,
                               &(iocbp->blist[i].a_tag), NULL, 0);
          }

          // arc == 0: the IO was accepted but has not completed yet, so the
          // iocb as a whole cannot be reported complete (rc=FALSE)
          if (arc == 0)    // good status
          {
              ++iocbp->issT; rc=FALSE;
          }
          else if (arc < 0)
          {
              rc=FALSE;
              if (errno == EAGAIN)
              {
                  // return, and an ark thread will re-schedule this iocb
                  KV_TRC_DBG(pAT,"IO:    RW_EAGAIN: tid:%d ttag:%d "
                                 "blkno:%"PRIi64"",
                                 tid, iocbp->tag, iocbp->blist[i].blkno);
                  break;
              }
              // Something bad went wrong, fail the iocb
              KV_TRC_FFDC(pAT,"IO_ERR: tid:%d ttag:%d blkno:%"PRIi64""
                              " errno:%d", tid, iocbp->tag,
                              iocbp->blist[i].blkno, errno);
              // Make sure a non-zero error code is recorded
              if (!errno) {KV_TRC_FFDC(pAT, "IO:     UNSET_ERRNO"); errno=EIO;}
              iocbp->io_error = errno;
              break;
          }
          // arc > 0: the IO completed immediately, count it as both issued
          // and completed
          else if (arc > 0)
          {
              KV_TRC_IO(pAT,"IO_CMP: IMMEDIATE: tid:%d ttag:%d a_tag:%d "
                            "blkno:%"PRIi64"",
                            tid, iocbp->tag,
                            iocbp->blist[i].a_tag, iocbp->blist[i].blkno);
              ++iocbp->issT;
              ++iocbp->cmpT;
              iocbp->blist[i].a_tag = -1; // mark as harvested
          }
      }

      KV_TRC_IO(pAT, "%s:  tid:%2d ttag:%4d a_tag:%4d blkno:%5"PRIi64"", ot,tid,
                iocbp->tag, iocbp->blist[i].a_tag, iocbp->blist[i].blkno);

  }

  // The state moves to harvest on every exit path, including error and
  // EAGAIN; i is remembered so the next call picks up where this one stopped
  iotcbp->state = ARK_IO_HARVEST;
  iocbp->start  = i;
  ARK_SYNC_EA_UNLOCK(iocbp->ea);

  return rc;
}
Example #11
0
/**
 *******************************************************************************
 * \brief
 *  Serialize the ark metadata and write it to the start of the store.
 *
 *  Nothing is written for a memory store or when ARK_KV_PERSIST_STORE is not
 *  set in _arkp->flags. Otherwise a buffer of pers_max_blocks blocks is
 *  filled with, in order: the p_cntr_t header, the P_ARK_t configuration,
 *  the hash table, the BL struct and the BL's IV data; the used part is then
 *  written with ea_async_io().
 *
 * \return 0 when nothing needs persisting or the write succeeded, ENOMEM when
 *         a buffer or block list cannot be allocated, else the ea_async_io()
 *         result
 ******************************************************************************/
int ark_persist(_ARK *_arkp)
{
  int32_t        rc          = 0;
  uint64_t       tot_bytes   = 0;
  uint64_t       wrblks      = 0;
  char          *p_data_orig = NULL;
  char          *p_data      = NULL;
  p_cntr_t      *pptr        = NULL;
  char          *dptr        = NULL;
  P_ARK_t       *pcfg        = NULL;
  ark_io_list_t *bl_array    = NULL;

  if ( (_arkp->ea->st_type == EA_STORE_TYPE_MEMORY) ||
      !(_arkp->flags & ARK_KV_PERSIST_STORE) )
  {
    return 0;
  }

  ark_persistence_calc(_arkp);

  // allocate write buffer
  tot_bytes   = _arkp->pers_max_blocks * _arkp->bsize;
  p_data_orig = am_malloc(tot_bytes);
  if (p_data_orig == NULL)
  {
    KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for "
                     "persistence data", tot_bytes);
    return ENOMEM;
  }
  memset(p_data_orig, 0, tot_bytes);
  // p_data_orig is what gets freed; p_data is the aligned address written to
  p_data = ptr_align(p_data_orig);

  // Record cntr data
  pptr   = (p_cntr_t *)p_data;
  memcpy(pptr->p_cntr_magic, ARK_P_MAGIC, sizeof(pptr->p_cntr_magic));
  pptr->p_cntr_version = ARK_P_VERSION_2;
  pptr->p_cntr_size    = sizeof(p_cntr_t);

  // Record configuration info
  pcfg = (P_ARK_t*)pptr->p_cntr_data;
  pcfg->flags   = _arkp->flags;
  pcfg->size    = _arkp->ea->size;
  pcfg->bsize   = _arkp->bsize;
  pcfg->bcount  = _arkp->bcount;
  pcfg->blkbits = _arkp->blkbits;
  // Save the real grow value: the loader restores _arkp->grow from
  // pcfg->grow, so storing blkbits here corrupted it after a reload
  pcfg->grow    = _arkp->grow;
  pcfg->hcount  = _arkp->hcount;
  pcfg->vlimit  = _arkp->vlimit;
  pcfg->blkused = _arkp->blkused;
  pcfg->nasyncs = _arkp->nasyncs;
  pcfg->basyncs = _arkp->basyncs;
  pcfg->ntasks  = _arkp->ntasks;
  pcfg->nthrds  = _arkp->nthrds;
  ark_persist_stats(_arkp, &(pcfg->pstats));
  pptr->p_cntr_cfg_offset = 0;
  pptr->p_cntr_cfg_size   = sizeof(P_ARK_t);

  // dptr walks through p_cntr_data; every *_offset is relative to it
  dptr = pptr->p_cntr_data;

  // Record hash info
  dptr                  += pptr->p_cntr_cfg_size;
  pptr->p_cntr_ht_offset = dptr - pptr->p_cntr_data;
  pptr->p_cntr_ht_size   = sizeof(hash_t) + (_arkp->ht->n * sizeof(uint64_t));
  memcpy(dptr, _arkp->ht, pptr->p_cntr_ht_size);

  // Record block list info
  dptr                  += pptr->p_cntr_ht_size;
  pptr->p_cntr_bl_offset = dptr - pptr->p_cntr_data;
  pptr->p_cntr_bl_size   = sizeof(BL);
  memcpy(dptr, _arkp->bl, pptr->p_cntr_bl_size);

  // Record IV list info
  dptr                    += pptr->p_cntr_bl_size;
  pptr->p_cntr_bliv_offset = dptr - pptr->p_cntr_data;

  // bliv_size = bytes in bl->list->data[cs_blocks + kvdata_blocks]
  // add 2 to top because of how IV->data chaining works
  pptr->p_cntr_bliv_size = divup((_arkp->bl->top+2) * _arkp->bl->w, 8);
  memcpy(dptr, _arkp->bl->list->data, pptr->p_cntr_bliv_size);

  // Calculate wrblks: number of persist metadata blocks to write
  tot_bytes = _arkp->pers_cs_bytes + pptr->p_cntr_bliv_size;
  wrblks    = pcfg->pblocks = divup(tot_bytes, _arkp->bsize);

  KV_TRC(pAT, "PERSIST_WR dev:%s top:%ld wrblks:%ld vs pers_max_blocks:%ld",
         _arkp->ea->st_device, _arkp->bl->top, pcfg->pblocks,
         _arkp->pers_max_blocks);

  bl_array = bl_chain_blocks(_arkp->bl, 0, wrblks);
  if ( NULL == bl_array )
  {
    KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" blocks for block list",
                wrblks);
    rc = ENOMEM;
  }
  else
  {
    rc = ea_async_io(_arkp->ea, ARK_EA_WRITE, (void *)p_data,
                     bl_array, wrblks, _arkp->nthrds);
    am_free(bl_array);
  }

  KV_TRC(pAT, "PERSIST_DATA_STORED rc:%d", rc);

  am_free(p_data_orig);
  return rc;
}
Example #12
0
/**
 * Create an EA (store handle) backed either by memory or by the block layer.
 *
 * path    NULL or "" selects a malloc'ed memory store of *size bytes; any
 *         other value is handed to cblk_open()
 * bsize   block size in bytes
 * basyncs passed to cblk_open() as its second argument
 * size    in/out: store size in bytes; overwritten with *bcount * bsize
 *         when a physical LUN is opened
 * bcount  out: number of blocks (queried from the device for a physical LUN,
 *         otherwise computed as *size / bsize)
 * vlun    0 opens path as a physical LUN, non-zero as a virtual LUN that is
 *         then sized with cblk_set_size()
 *
 * Returns the new EA, or NULL on failure. On failure errno is left non-zero
 * (ENOSPC is substituted when nothing else set it).
 *
 * NOTE(review): several traces pass path to a %s conversion; in the memory
 * store case path may be NULL - confirm the trace macros tolerate that.
 * NOTE(review): the memory-store branch does not assign ea->st_flash or
 * ea->st_device - confirm am_malloc zeroes the struct or that they are never
 * read for memory stores.
 */
EA *ea_new(const char *path, uint64_t bsize, int basyncs,
           uint64_t *size, uint64_t *bcount, uint64_t vlun)
{
    int             rc    = 0;
    size_t          plen  = 0;
    uint8_t        *store = NULL;
    EA             *ea    = NULL;
    chunk_id_t      chkid = NULL_CHUNK_ID;
    chunk_ext_arg_t ext   = 0;

    // presumably fetch_and_or returns the previous flag value, making this a
    // once-only initialisation - verify against its definition
    if (!(fetch_and_or(&cflsh_blk_lib_init,1)))
    {
        // We need to call cblk_init once before
        // we use any other cblk_ interfaces
        rc = cblk_init(NULL,0);
        if (rc)
        {
            KV_TRC_FFDC(pAT, "cblk_init failed path %s bsize %"PRIu64" "
                             "size %"PRIu64" bcount %"PRIu64", errno = %d",
                             path, bsize, *size, *bcount, errno);
            goto error_exit;
        }
    }

    ea = am_malloc(sizeof(EA));
    if (NULL == ea)
    {
        KV_TRC_FFDC(pAT, "Out of memory path %s bsize %"PRIu64" size %"PRIu64" "
                         "bcount %"PRIu64", errno = %d",
                         path, bsize, *size, *bcount, errno);
        goto error_exit;
    }

    // We need to check the path parameter to see if
    // we are going to use memory or a file/capi
    // device (to be determined by the block layer)
    if ( (NULL == path) || (strlen(path) == 0) )
    {
        KV_TRC(pAT, "EA_STORE_TYPE_MEMORY");
        // Using memory for store
        ea->st_type = EA_STORE_TYPE_MEMORY;

        store = malloc(*size);
        if (NULL == store)
        {
            errno = ENOMEM;
            KV_TRC_FFDC(pAT, "Out of memory for store path %s bsize %"PRIu64" "
                             "size %"PRIu64" bcount %"PRIu64", errno = %d",
                             path, bsize, *size, *bcount, errno);
            goto error_exit;
        }

        *bcount = ((*size) / bsize);
        ea->st_memory = store;
    }
    else
    {
        KV_TRC(pAT, "EA_STORE_TYPE_FILE(%s)", path);

        // Using a file.  We don't care if it's an actual
        // file or a CAPI device, we let block layer
        // decide and we just use the chunk ID that is
        // passed back from the cblk_open call.
        ea->st_type = EA_STORE_TYPE_FILE;

        // Check to see if we need to create the store on a
        // physical or virtual LUN.  Previously, in GA1,
        // we keyed off the size and if it was 0, then we
        // asked for the LUN to be physical.  Now, the user
        // can specify with a flag.
        if ( vlun == 0 )
        {
            KV_TRC(pAT, "cblk_open PHYSICAL LUN: %s", path);
            chkid = cblk_open(path, basyncs, O_RDWR, ext,
                              CBLK_OPN_NO_INTRP_THREADS);

            if (NULL_CHUNK_ID == chkid)
            {
                printf("cblk_open physical lun failed\n");
                KV_TRC_FFDC(pAT, "cblk_open phys lun failed path:%s bsize:%ld "
                                 "size:%ld bcount:%ld, errno:%d",
                                 path, bsize, *size, *bcount, errno);
                goto error_exit;
            }

            // The cast assumes size_t and uint64_t have the same width
            rc = cblk_get_size(chkid, (size_t *)bcount, 0);
            if ( (rc != 0) || (*bcount == 0) )
            {
                // An error was encountered, close the chunk
                cblk_close(chkid, 0);
                chkid = NULL_CHUNK_ID;
                KV_TRC_FFDC(pAT, "cblk_get_size failed path %s bsize %"PRIu64" "
                                 "size %"PRIu64" bcount %"PRIu64", errno = %d",
                                 path, bsize, *size, *bcount, errno);
                goto error_exit;
            }

            // Set the size to be returned
            *size = *bcount * bsize;
        }
        else
        {
            KV_TRC(pAT, "cblk_open VIRTUAL LUN: %s", path);
            chkid = cblk_open(path, basyncs, O_RDWR, ext,
                              CBLK_OPN_VIRT_LUN|CBLK_OPN_NO_INTRP_THREADS);

            if (NULL_CHUNK_ID == chkid)
            {
                printf("cblk_open virtual lun failed\n");
                KV_TRC_FFDC(pAT, "cblk_open virt lun failed path:%s bsize:%ld "
                                 "size:%ld bcount:%ld, errno:%d",
                                 path, bsize, *size, *bcount, errno);
                goto error_exit;
            }

            // A specific size was passed in so we try to set the
            // size of the chunk.
            *bcount = *size / bsize;
            rc = cblk_set_size(chkid, (size_t)*bcount, 0);
            if ( rc != 0 )
            {
                printf("cblk_set_size failed for %ld\n", *bcount);
                // An error was encountered, close the chunk
                cblk_close(chkid, 0);
                chkid = NULL_CHUNK_ID;
                KV_TRC_FFDC(pAT, "cblk_set_size failed path %s bsize %"PRIu64" "
                                 "size %"PRIu64" bcount %"PRIu64", errno = %d",
                                 path, bsize, *size, *bcount, errno);
                goto error_exit;
            }
        }

        // Save off the chunk ID and the device name
        ea->st_flash  = chkid;
        plen          = strlen(path) + 1;
        ea->st_device = (char *)am_malloc(plen);
        if (!ea->st_device)
        {
            cblk_close(chkid, 0);
            KV_TRC_FFDC(pAT, "MALLOC st_device failed (%s) plen=%ld errno:%d",
                        path, plen, errno);
            goto error_exit;
        }

        // plen includes the terminator, so the copy is always NUL-terminated
        memset(ea->st_device, 0, plen);
        strncpy(ea->st_device, path, plen);
    }

    // Fill in the EA struct
    pthread_rwlock_init(&(ea->ea_rwlock), NULL);
    ea->bsize  = bsize;
    ea->bcount = *bcount;
    ea->size   = *size;

    KV_TRC(pAT, "path %s bsize %"PRIu64" size %"PRIu64" bcount %"PRIu64"",
           path, bsize, *size, *bcount);
    goto done;

error_exit:
    // ea may still be NULL here (cblk_init or am_malloc failure)
    am_free(ea);
    ea = NULL;
    // Guarantee the caller sees a non-zero errno on failure
    if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOSPC;}

done:
    return ea;
}
Example #13
0
/**
 *******************************************************************************
 * \brief
 *  Read or write len blocks and wait until every one of them has completed.
 *
 *  Block i of the caller buffer (addr + i * ea->bsize) is transferred to or
 *  from blist[i].blkno. A memory store is served with memcpy. Any other store
 *  is served in batches: up to max_ops async commands are issued, then every
 *  outstanding one is harvested with a blocking cblk_aresult() before the
 *  next batch starts.
 *
 * \param ea     store to operate on
 * \param op     ARK_EA_READ for a read, anything else is treated as a write
 * \param addr   caller buffer holding len blocks
 * \param blist  block list; a_tag of each entry is overwritten
 * \param len    number of blocks
 * \param nthrs  number of threads sharing the async command slots; values
 *               below 1 are treated as "issue one command per batch"
 *
 * \return 0 on success, else the errno captured at the first failure
 *
 * NOTE(review): rc is taken from errno on failure; if errno happens to be 0
 * (e.g. an injected error) the failure is reported as success - confirm.
 ******************************************************************************/
int ea_async_io(EA *ea, int op, void *addr, ark_io_list_t *blist, int64_t len, int nthrs)
{
  int64_t  i       = 0;
  int64_t  j       = 0;
  int64_t  comps   = 0;
  int      num     = 0;
  int      max_ops = 0;
  void    *m_rc    = NULL;
  int      rc      = 0;
  int      a_rc    = 0;
  uint64_t status  = 0;
  uint8_t *p_addr  = NULL;
  uint8_t *m_addr  = NULL;
  char     *ot     = NULL;

  ARK_SYNC_EA_READ(ea);

  if (op == ARK_EA_READ) {ot="IO_RD";}
  else                   {ot="IO_WR";}

  if ( ea->st_type == EA_STORE_TYPE_MEMORY)
  {
    // Loop through the block list to issue the IO
    for(i = 0; i < len; i++)
    {

      p_addr = ((uint8_t*)addr) + (i * ea->bsize);

      // For in-memory Store, we issue the memcpy
      // and wait for the return, no async here.
      // Read out the value from the in-memory block
      m_addr = ea->st_memory + (blist[i].blkno * ea->bsize);

      if (op == ARK_EA_READ) {m_rc = memcpy(p_addr, m_addr, ea->bsize);}
      else                   {m_rc = memcpy(m_addr, p_addr, ea->bsize);}

      if (check_sched_error_injects(op)) {m_rc=NULL;}
      if (check_harv_error_injects(op))  {m_rc=NULL;}

      if (m_rc == NULL)
      {
          rc = errno;
          break;
      }
    }
  }
  else
  {
    // divide up the cmd slots among
    // the threads and go 3 less
    if (nthrs > 0) {max_ops = (ARK_EA_BLK_ASYNC_CMDS / nthrs) - 3;}

    // Always issue at least one command per batch. With max_ops <= 0 the
    // issue loop below would never advance and the while loop would spin
    // forever; nthrs == 0 would also have divided by zero above.
    if (max_ops < 1) {max_ops = 1;}

    // Loop through the block list to issue the IO
    while ((comps < len) && (rc == 0))
    {
      for(i = comps, num = 0;
             (i < len) && (num < max_ops);
              i++, num++)
      {
        p_addr = ((uint8_t*)addr) + (i * ea->bsize);

        // Call out to the block layer and retrieve a block
        // Do an async op for a single block and tell the block
        // layer to wait if there are no available command
        // blocks.  Upon return, we can either get an error
        // (rc == -1), the data will be available (rc == number
        // of blocks read), or IO has been scheduled (rc == 0).
        if (op == ARK_EA_READ)
        {
            rc = cblk_aread(ea->st_flash, p_addr, blist[i].blkno, 1,
                    &(blist[i].a_tag), NULL,CBLK_ARW_WAIT_CMD_FLAGS);
        }
        else
        {
            rc = cblk_awrite(ea->st_flash, p_addr, blist[i].blkno, 1,
                    &(blist[i].a_tag), NULL,CBLK_ARW_WAIT_CMD_FLAGS);
        }

        if (check_sched_error_injects(op)) {rc=-1;}

        KV_TRC_IO(pAT, "%s:  id:%d blkno:%"PRIi64" rc:%d",
                ot, ea->st_flash, blist[i].blkno, rc);

        if ( rc == -1 )
        {
          // Error was encountered.  Don't issue any more IO
          rc = errno;
          KV_TRC_FFDC(pAT, "IO_ERR: cblk_aread/awrite failed, "
                           "blkno:%"PRIi64" tag:%d, errno = %d",
                           blist[i].blkno, blist[i].a_tag, errno);
          break;
        }

        // Data has already been returned so we don't need to
        // wait for the response below
        if ( rc > 0 )
        {
          blist[i].a_tag = -1;
          rc = 0;
        }
        //_arkp->stats.io_cnt++;
      }

      // For as many IOs that were performed, we loop to
      // see if we need to wait for the response or the
      // data has already been returned.
      for (j = comps; j < i; j++)
      {

        // Data has already been read
        if (blist[j].a_tag == -1)
        {
          continue;
        }

        do
        {
          a_rc = cblk_aresult(ea->st_flash, &(blist[j].a_tag),
                            &status, CBLK_ARESULT_BLOCKING);

          if (check_harv_error_injects(op))  {a_rc=-1;}

          // There was an error, check to see if we haven't
          // encountered an error previously and if not, then
          // set rc.  Continue processing so that we harvest
          // all outstanding responses
          if (a_rc == -1)
          {
            if (rc == 0)
            {
              rc = errno;
            }
            KV_TRC_IO(pAT, "IO_ERR: id:%d blkno:%ld status:%ld a_rc:%d",
                    ea->st_flash, blist[j].blkno, status, a_rc);
          }
          else
          {
              KV_TRC_IO(pAT, "IO_CMP: id:%d blkno:%ld status:%ld a_rc:%d",
                      ea->st_flash, blist[j].blkno, status, a_rc);
          }

          // If a_rc is 0, that means we got interrupted somehow
          // so we need to retry the operation.
        } while (a_rc == 0);
      }

      // If we start another loop, start off where we finished
      // in this loop.
      comps = i;
    }
  }

  ARK_SYNC_EA_UNLOCK(ea);

  return rc;
}
Example #14
0
/**
 *******************************************************************************
 * \brief
 *  Begin an EXIST request: locate the key's hash bucket and start reading it.
 *
 *  Nothing is returned. On any failure the request's res is set to -1, its rc
 *  to the error code, and tcbp->state to ARK_CMD_DONE. Otherwise the bucket
 *  read is initialised and scheduled; if both scheduling and harvesting report
 *  completion, ark_exist_finish() runs right away.
 ******************************************************************************/
void ark_exist_start(_ARK *_arkp, int tid, tcb_t *tcbp)
{
  scb_t         *pool    = &(_arkp->poolthreads[tid]);
  rcb_t         *req     = &(_arkp->rcbs[tcbp->rtag]);
  tcb_t         *io_tcb  = &(_arkp->tcbs[req->ttag]);
  iocb_t        *io_cb   = &(_arkp->iocbs[req->ttag]);
  ark_io_list_t *chain   = NULL;
  int32_t        grow_rc = 0;

  // Block holding the control information for this hash entry
  tcbp->hblk = HASH_LBA(HASH_GET(_arkp->ht, req->pos));

  // No control block means the key is not in the hash
  if ( tcbp->hblk == 0 )
  {
    KV_TRC_FFDC(pAT, "rc = ENOENT key %p, klen %"PRIu64" ttag:%d",
                 req->key, req->klen, tcbp->ttag);
    req->res    = -1;
    req->rc     = ENOENT;
    tcbp->state = ARK_CMD_DONE;
    return;
  }

  // Make sure the in-buffer can hold the whole bucket
  tcbp->blen = bl_len(_arkp->bl, tcbp->hblk);
  grow_rc = bt_growif(&(tcbp->inb), &(tcbp->inb_orig), &(tcbp->inblen),
                      (tcbp->blen * _arkp->bsize));
  if (grow_rc != 0)
  {
    KV_TRC_FFDC(pAT, "bt_growif failed tcbp:%p ttag:%d", tcbp, tcbp->ttag);
    req->res    = -1;
    req->rc     = grow_rc;
    tcbp->state = ARK_CMD_DONE;
    return;
  }

  // Build the list of blocks to read
  chain = bl_chain(_arkp->bl, tcbp->hblk, tcbp->blen);
  if (chain == NULL)
  {
    KV_TRC_FFDC(pAT, "bl_chain failed tcbp:%p ttag:%d", tcbp, tcbp->ttag);
    req->rc     = ENOMEM;
    req->res    = -1;
    tcbp->state = ARK_CMD_DONE;
    return;
  }

  pool->poolstats.io_cnt += tcbp->blen;

  KV_TRC_IO(pAT, "read hash entry ttag:%d", tcbp->ttag);
  ea_async_io_init(_arkp, ARK_EA_READ, (void *)tcbp->inb, chain,
                   tcbp->blen, 0, tcbp->ttag, ARK_EXIST_FINISH);

  // Harvest is attempted only if scheduling reported completion, and the
  // finish step only if harvesting did too
  if (!ea_async_io_schedule(_arkp, tid, io_tcb, io_cb))
  {
    return;
  }
  if (!ea_async_io_harvest (_arkp, tid, io_tcb, io_cb, req))
  {
    return;
  }

  ark_exist_finish(_arkp, tid, tcbp);
}
Example #15
0
/**
 *******************************************************************************
 * \brief
 *  Look for persisted ark metadata on the store and, if found, load it.
 *
 *  Block 0 is read first and checked for the magic string and a supported
 *  version. If the header says the metadata spans more than one block, the
 *  buffer is grown and all blocks are re-read. The configuration, hash table,
 *  block list and its IV data are then restored into _arkp.
 *
 * \param _arkp  ark being created; its ht, bl and config fields are filled in
 * \param flags  creation flags; loading is attempted only when
 *               ARK_KV_PERSIST_LOAD is set and ARK_KV_VIRTUAL_LUN is not
 *
 * \return 0 when persisted data was loaded; -1 when loading is not requested
 *         or no magic string is present; ENOMEM, EINVAL or an errno /
 *         ea_async_io() result on failure
 *
 * NOTE(review): _arkp->persdata is pointed at the read buffer, but that
 * buffer is freed on every exit path below - confirm persdata is not used
 * after this function returns.
 ******************************************************************************/
int ark_check_persistence(_ARK *_arkp, uint64_t flags)
{
  int32_t  rc = -1;
  char    *p_data_orig = NULL;
  char    *p_data_new = NULL;
  char    *p_data = NULL;
  ark_io_list_t *bl_array = NULL;
  p_cntr_t *pptr = NULL;
  P_ARK_t  *pcfg = NULL;
  hash_t   *htp = NULL;
  BL       *blp = NULL;
  uint64_t  rdblks = 0;

  if (flags & ARK_KV_PERSIST_LOAD) {KV_TRC(pAT, "PERSIST_LOAD");}

  // Ignore the persistence data and load from scratch
  if ( (!(flags & ARK_KV_PERSIST_LOAD)) || (flags & ARK_KV_VIRTUAL_LUN) )
  {
    return -1;
  }

  p_data_orig = am_malloc(_arkp->bsize);
  if (p_data_orig == NULL)
  {
    KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for the first "
                     "persistence block", _arkp->bsize);
    rc = ENOMEM;
  }
  else
  {
    p_data = ptr_align(p_data_orig);
    bl_array = bl_chain_no_bl(0, 1);
    if (bl_array == NULL)
    {
      // Do not hand a NULL block list to ea_async_io
      KV_TRC_FFDC(pAT, "Out of memory allocating block list for the first "
                       "persistence block");
      rc = ENOMEM;
    }
    else
    {
      rc = ea_async_io(_arkp->ea, ARK_EA_READ, (void *)p_data,
                        bl_array, 1, 1);
      am_free(bl_array);
      bl_array = NULL;
    }
  }

  if (rc == 0)
  {
    // We've read the first block.  We check to see if
    // persistence data is present and if so, then
    // read the rest of the data from the flash.
    pptr = (p_cntr_t *)p_data;
    _arkp->persdata = p_data_orig;
    // Compare the whole magic field; the length argument must be the field
    // size, with the "!= 0" applied to the memcmp result
    if ( memcmp(pptr->p_cntr_magic, ARK_P_MAGIC,
                 sizeof(pptr->p_cntr_magic)) != 0 )
    {
      KV_TRC_FFDC(pAT, "No magic number found in persistence data: %d", EINVAL);
      // The magic number does not match so data is either
      // not present or is corrupted.
      rc = -1;
    }
    else
    {
      // Now we check version and the first persistence data
      // needs to be the ARK_PERSIST_CONFIG block
      if (pptr->p_cntr_version != ARK_P_VERSION_1 &&
          pptr->p_cntr_version != ARK_P_VERSION_2)
      {
        KV_TRC_FFDC(pAT, "Invalid / unsupported version: %"PRIu64"",
                    pptr->p_cntr_version);
        rc = EINVAL;
      }
      else
      {
        // Read in the rest of the persistence data
        pcfg   = (P_ARK_t *)(pptr->p_cntr_data + pptr->p_cntr_cfg_offset);
        rdblks = pcfg->pblocks;
        if (rdblks > 1)
        {
          // Grow through a temporary so the original buffer is still freed
          // at error_exit when the reallocation fails
          // (assumes am_realloc leaves its input intact on failure, like
          // realloc - TODO confirm)
          p_data_new = am_realloc(p_data_orig, (rdblks * _arkp->bsize));
          if (p_data_new == NULL)
          {
            KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" bytes for "
                             "full persistence block",
                             (rdblks * _arkp->bsize));
            rc = ENOMEM;
          }
          else
          {
            p_data_orig = p_data_new;
            p_data = ptr_align(p_data_orig);
            bl_array = bl_chain_no_bl(0, rdblks);
            if (bl_array == NULL)
            {
              KV_TRC_FFDC(pAT, "Out of memory allocating %"PRIu64" blocks for "
                               "full persistence data", rdblks);
              rc = ENOMEM;
            }
          }

          // We are still good to read the rest of the data
          // from the flash
          if (rc == 0)
          {
            KV_TRC(pAT, "PERSIST_RD rdblks:%ld", rdblks);
            rc = ea_async_io(_arkp->ea, ARK_EA_READ, (void *)p_data,
                           bl_array, rdblks, 1);
            am_free(bl_array);
            // The buffer may have moved; re-derive the header pointers
            pptr = (p_cntr_t *)p_data;
            pcfg   = (P_ARK_t *)(pptr->p_cntr_data + pptr->p_cntr_cfg_offset);
            _arkp->persdata = p_data_orig;
          }
        }
      }
    }
  }

  // If rc == 0, that means we have persistence data
  if (rc == 0)
  {
      KV_TRC(pAT, "PERSIST_META size %ld bsize %ld hcount %ld bcount %ld "
                  "nthrds %d nasyncs %d basyncs %d blkbits %ld version:%ld",
                  pcfg->size, pcfg->bsize, pcfg->hcount, pcfg->bcount,
                  pcfg->nthrds, pcfg->nasyncs, pcfg->basyncs, pcfg->blkbits,
                  pptr->p_cntr_version);

    _arkp->persload            = 1;
    _arkp->size                = pcfg->size;
    _arkp->flags               = flags;
    _arkp->bsize               = pcfg->bsize;
    _arkp->bcount              = pcfg->bcount;
    _arkp->blkbits             = pcfg->blkbits;
    _arkp->grow                = pcfg->grow;
    _arkp->hcount              = pcfg->hcount;
    _arkp->vlimit              = pcfg->vlimit;
    _arkp->blkused             = pcfg->blkused;
    _arkp->pers_stats.kv_cnt   = pcfg->pstats.kv_cnt;
    _arkp->pers_stats.blk_cnt  = pcfg->pstats.blk_cnt;
    _arkp->pers_stats.byte_cnt = pcfg->pstats.byte_cnt;

    KV_TRC(pAT, "ARK_META size %ld bsize %ld hcount %ld bcount %ld "
                "nthrds %d nasyncs %ld basyncs %d blkbits %ld",
                _arkp->size, _arkp->bsize, _arkp->hcount, _arkp->bcount,
                _arkp->nthrds, _arkp->nasyncs, _arkp->basyncs, _arkp->blkbits);

    // Rebuild the hash table from the persisted copy
    htp = (hash_t *)(pptr->p_cntr_data + pptr->p_cntr_ht_offset);
    _arkp->ht = hash_new(htp->n);
    if (_arkp->ht == NULL)
    {
        if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
        rc = errno;
        KV_TRC_FFDC(pAT, "ht_new failed: n:%ld rc:%d", htp->n, rc);
        goto error_exit;
    }
    memcpy(_arkp->ht, htp, pptr->p_cntr_ht_size);

    // Rebuild the block list, then restore its bookkeeping fields
    blp = (BL *)(pptr->p_cntr_data + pptr->p_cntr_bl_offset);
    _arkp->bl = bl_new(blp->n, blp->w);
    if (_arkp->bl == NULL)
    {
        if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
        rc = errno;
        KV_TRC_FFDC(pAT, "bl_new failed: n:%ld w:%ld rc:%d",
                    blp->n, blp->w, rc);
        goto error_exit;
    }
    _arkp->bl->count = blp->count;
    _arkp->bl->head  = blp->head;
    _arkp->bl->hold  = blp->hold;
    _arkp->bl->top   = blp->top;

    if (pptr->p_cntr_version == ARK_P_VERSION_1)
    {
        // Version 1 persisted a whole IV struct; only its data is copied
        IV *piv = (IV *)(pptr->p_cntr_data + pptr->p_cntr_bliv_offset);

        KV_TRC(pAT, "PERSIST_VERSION_1 LOADED");
        _arkp->bl->top = _arkp->bl->n;

        // copy IV->data from piv->data
        memcpy(_arkp->bl->list->data,
               piv->data,
               pptr->p_cntr_bliv_size);
    }
    else if (pptr->p_cntr_version == ARK_P_VERSION_2)
    {
        KV_TRC(pAT, "PERSIST_VERSION_2 LOADED");
        // copy IV->data from bliv_offset
        memcpy(_arkp->bl->list->data,
               pptr->p_cntr_data + pptr->p_cntr_bliv_offset,
               pptr->p_cntr_bliv_size);
    }
    else
    {
        rc = EINVAL;
        KV_TRC_FFDC(pAT, "bad persistent version number: ver:%ld",
                    pptr->p_cntr_version);
        goto error_exit;
    }

    KV_TRC(pAT, "BL_META: n:%ld count:%ld head:%ld hold:%ld top:%ld",
            _arkp->bl->n, _arkp->bl->count, _arkp->bl->head, _arkp->bl->hold,
            _arkp->bl->top);
  }

error_exit:
  // Reached on success as well: the read buffer is always released here
  am_free(p_data_orig);
  return rc;
}
Example #16
0
/**
 * Start an "exist" request for the task described by tcbp.
 *
 * Looks up the block holding the hash bucket for the request's hash
 * position, sizes the task's in-buffer for it, builds the block chain and
 * issues the read through ea_async_io_mod().
 *
 * On any failure the request control block gets res = -1 and rc set to the
 * error (ENOENT when the hash entry has no block, the bt_growif() rc,
 * ENOMEM when bl_chain() returns NULL, or the negated ea_async_io_mod() rc).
 *
 * \return the next task state: ARK_CMD_DONE on failure, ARK_IO_HARVEST when
 *         ea_async_io_mod() returns 0, otherwise whatever
 *         ark_exist_finish() returns.
 */
int ark_exist_start(_ARK *_arkp, int tid, tcb_t *tcbp)
{
  scb_t         *pool   = &(_arkp->poolthreads[tid]);
  rcb_t         *req    = &(_arkp->rcbs[tcbp->rtag]);
  ark_io_list_t *chain  = NULL;
  int32_t        io_rc  = 0;
  int32_t        next   = ARK_CMD_DONE;

  // Resolve the block that carries the control data for this hash entry;
  // a zero block means the key is not in the hash.
  tcbp->hblk = HASH_LBA(HASH_GET(_arkp->ht, req->pos));
  if (0 == tcbp->hblk)
  {
    KV_TRC_FFDC(pAT, "rc = ENOENT key %p, klen %"PRIu64"",
                 req->key, req->klen);
    req->res = -1;
    req->rc  = ENOENT;
    return ARK_CMD_DONE;
  }

  // Make sure the in-buffer can hold every block of the hash bucket
  tcbp->blen = bl_len(_arkp->bl, tcbp->hblk);
  io_rc = bt_growif(&(tcbp->inb), &(tcbp->inb_orig), &(tcbp->inblen),
                    (tcbp->blen * _arkp->bsize));
  if (0 != io_rc)
  {
    req->res = -1;
    req->rc  = io_rc;
    return ARK_CMD_DONE;
  }

  // Build the list of blocks to read
  chain = bl_chain(_arkp->bl, tcbp->hblk, tcbp->blen);
  if (NULL == chain)
  {
    req->rc  = ENOMEM;
    req->res = -1;
    return ARK_CMD_DONE;
  }

  pool->poolstats.io_cnt += tcbp->blen;

  io_rc = ea_async_io_mod(_arkp, ARK_EA_READ, (void *)tcbp->inb, chain,
                          tcbp->blen, 0, tcbp->ttag, ARK_EXIST_FINISH);
  if (io_rc < 0)
  {
    req->rc  = -io_rc;
    req->res = -1;
    return ARK_CMD_DONE;
  }

  if (0 == io_rc)
  {
    next = ARK_IO_HARVEST;
  }
  else
  {
    next = ark_exist_finish(_arkp, tid, tcbp);
  }

  return next;
}
Example #17
0
/**
 * Create an ARK key/value store instance and start its server threads.
 *
 * \param path     passed through to ea_new() to open the backing store
 * \param arkret   out: receives the new ARK handle; must not be NULL. It is
 *                 set to NULL when creation fails after the control
 *                 structure has been allocated.
 * \param size     requested store size, passed to ea_new() by reference
 * \param bsize    block size
 * \param hcount   number of hash entries (used when no persisted data is
 *                 loaded)
 * \param nthrds   must be > 0; note the instance always runs
 *                 ARK_VERBOSE_NTHRDS_DEF threads regardless of the value
 * \param nqueue   number of async request slots; <= 0 selects
 *                 ARK_MAX_ASYNC_OPS
 * \param basyncs  passed through to ea_new() and stored in the instance
 * \param flags    ARK_KV_* flags; persistence flags cannot be combined with
 *                 ARK_KV_VIRTUAL_LUN
 *
 * \return 0 on success, otherwise a non-zero errno-style code.
 */
int ark_create_verbose(char *path, ARK **arkret,
                       uint64_t size, uint64_t bsize, uint64_t hcount,
                       int nthrds, int nqueue, int basyncs, uint64_t flags)
{
  int          rc = 0;
  int        p_rc = 0;
  uint64_t bcount = 0;
  uint64_t      x = 0;
  int           i = 0;
  int        tnum = 0;
  int        rnum = 0;
  int  failed_thr = 0;
  scb_t     *scbp = NULL;

  KV_TRC_OPEN(pAT, "arkdb");

  if (NULL == arkret)
  {
    KV_TRC_FFDC(pAT, "Incorrect value for ARK control block: rc=EINVAL");
    rc = EINVAL;
    goto ark_create_ark_err;
  }

  if ( (flags & (ARK_KV_PERSIST_LOAD|ARK_KV_PERSIST_STORE)) && 
         (flags & ARK_KV_VIRTUAL_LUN) )
  {
    KV_TRC_FFDC(pAT, "Invalid persistence combination with ARK flags: %016lx",
                flags);
    rc = EINVAL;
    goto ark_create_ark_err;
  }

  if (nthrds <= 0)
  {
      KV_TRC_FFDC(pAT, "invalid nthrds:%d", nthrds);
      rc = EINVAL;
      goto ark_create_ark_err;
  }

  _ARK *ark = am_malloc(sizeof(_ARK));
  if (ark == NULL) {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Out of memory allocating ARK control structure for %ld",
                sizeof(_ARK));
    goto ark_create_ark_err;
  }

  KV_TRC(pAT, "%p path(%s) size %ld bsize %ld hcount %ld "
              "nthrds %d nqueue %d basyncs %d flags:%08lx",
              ark, path, size, bsize, hcount, 
              nthrds, nqueue, basyncs, flags);

  ark->bsize    = bsize;
  ark->rthread  = 0;
  ark->persload = 0;
  ark->nasyncs  = ((nqueue <= 0) ? ARK_MAX_ASYNC_OPS : nqueue);
  ark->basyncs  = basyncs;
  ark->ntasks   = ARK_MAX_TASK_OPS;
  ark->nthrds   = ARK_VERBOSE_NTHRDS_DEF; // hardcode, perf requirement

  // Create the KV storage, whether that will be memory based
  // or flash
  ark->ea = ea_new(path, ark->bsize, basyncs, &size, &bcount,
                    (flags & ARK_KV_VIRTUAL_LUN));
  if (ark->ea == NULL)
  {
    if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
    rc = errno;
    KV_TRC_FFDC(pAT, "KV storage initialization failed: rc/errno:%d", rc);
    goto ark_create_ea_err;
  }

  // Now that the "connection" to the store has been established
  // we need to check to see if data was persisted from a previous
  // instantiation of the KV store.
  p_rc = ark_check_persistence(ark, flags);
  if (p_rc > 0)
  {
    // We ran into an error while trying to read from
    // the store.
    rc = p_rc;
    KV_TRC_FFDC(pAT, "Persistence check failed: %d", rc);
    goto ark_create_persist_err;
  }
  else if (p_rc == -1)
  {
    KV_TRC(pAT, "NO PERSIST LOAD FLAG");
    // There was no persistence data, so we just build off
    // of what was passed into the API.

    ark->size = size;
    ark->bcount = bcount;
    ark->hcount = hcount;
    ark->vlimit = ARK_VERBOSE_VLIMIT_DEF;
    ark->blkbits = ARK_VERBOSE_BLKBITS_DEF;
    ark->grow = ARK_VERBOSE_GROW_DEF;
    ark->rthread = 0;
    ark->flags = flags;
    ark->astart = 0;
    ark->blkused = 1;
    ark->ark_exit = 0;
    ark->nactive = 0;
    ark->pers_stats.kv_cnt = 0;
    ark->pers_stats.blk_cnt = 0;
    ark->pers_stats.byte_cnt = 0;
    ark->pcmd = PT_IDLE;

    // Create the requests and tag control blocks and queues.
    x = ark->hcount / ark->nthrds;
    ark->npart  = x + (ark->hcount % ark->nthrds ? 1 : 0);

    // Create the hash table
    ark->ht = hash_new(ark->hcount);
    if (ark->ht == NULL)
    {
      if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
      rc = errno;
      KV_TRC_FFDC(pAT, "Hash initialization failed: %d", rc);
      goto ark_create_ht_err;
    }

    // Create the block list
    ark->bl = bl_new(ark->bcount, ark->blkbits);
    if (ark->bl == NULL)
    {
      if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
      rc = errno;
      KV_TRC_FFDC(pAT, "Block list initialization failed: %d", rc);
      goto ark_create_bl_err;
    }
    if (flags & ARK_KV_PERSIST_STORE)
    {
      ark_persistence_calc(ark);
      if (bl_reserve(ark->bl, ark->pers_max_blocks))
      {
        // The block list exists at this point, so it must be released
        // (ark_create_pth_mutex_err does the bl_delete), and rc must be
        // non-zero so the caller is not told creation succeeded while
        // *arkret is NULL.
        // NOTE(review): bl_reserve's return-code meaning is not visible
        // here; ENOSPC is assumed to be the closest errno - confirm.
        rc = ENOSPC;
        KV_TRC_FFDC(pAT, "Block reservation for persistence failed: %d", rc);
        goto ark_create_pth_mutex_err;
      }
    }
  }
  else
  {
      KV_TRC(pAT, "PERSIST: %p path(%s) size %ld bsize %ld hcount %ld "
                  "nthrds %d nqueue %ld basyncs %d bcount %ld blkbits %ld",
                  ark, path, ark->size, ark->bsize, ark->hcount,
                  ark->nthrds, ark->nasyncs, ark->basyncs,
                  ark->bcount, ark->blkbits);
  }

  rc = pthread_mutex_init(&ark->mainmutex,NULL);
  if (rc != 0)
  {
    KV_TRC_FFDC(pAT, "pthread_mutex_init for main mutex failed: %d", rc);
    goto ark_create_pth_mutex_err;
  }

  ark->rtags = tag_new(ark->nasyncs);
  if ( NULL == ark->rtags )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Tag initialization for requests failed: %d", rc);
    goto ark_create_rtag_err;
  }

  ark->ttags = tag_new(ark->ntasks);
  if ( NULL == ark->ttags )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Tag initialization for tasks failed: %d", rc);
    goto ark_create_ttag_err;
  }

  ark->rcbs = am_malloc(ark->nasyncs * sizeof(rcb_t));
  if ( NULL == ark->rcbs )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for request control blocks", (ark->nasyncs * sizeof(rcb_t)));
    goto ark_create_rcbs_err;
  }

  ark->tcbs = am_malloc(ark->ntasks * sizeof(tcb_t));
  if ( NULL == ark->tcbs )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for task control blocks", (ark->ntasks * sizeof(tcb_t)));
    goto ark_create_tcbs_err;
  }
  // The ark_create_taskloop_err cleanup walks every tcb and tests
  // inb/oub/vb_orig, so entries the setup loop never reached must read
  // as NULL rather than whatever the allocator handed back.
  memset(ark->tcbs, 0, ark->ntasks * sizeof(tcb_t));

  ark->iocbs = am_malloc(ark->ntasks * sizeof(iocb_t));
  if ( NULL == ark->iocbs )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for io control blocks", (ark->ntasks * sizeof(iocb_t)));
    goto ark_create_iocbs_err;
  }

  ark->poolthreads = am_malloc(ark->nthrds * sizeof(scb_t));
  if ( NULL == ark->poolthreads )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Out of memory allocation of %"PRIu64" bytes for server thread control blocks", (ark->nthrds * sizeof(scb_t)));
    goto ark_create_poolthreads_err;
  }

  for ( rnum = 0; rnum < ark->nasyncs ; rnum++ )
  {
    ark->rcbs[rnum].stat = A_NULL;
    pthread_cond_init(&(ark->rcbs[rnum].acond), NULL);
    pthread_mutex_init(&(ark->rcbs[rnum].alock), NULL);
  }

  for ( tnum = 0; tnum < ark->ntasks; tnum++ )
  {
    ark->tcbs[tnum].inb = bt_new(0, ark->vlimit, sizeof(uint64_t), 
                                       &(ark->tcbs[tnum].inblen),
                                       &(ark->tcbs[tnum].inb_orig));
    if (ark->tcbs[tnum].inb == NULL)
    {
      if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
      rc = errno;
      KV_TRC_FFDC(pAT, "Bucket allocation for inbuffer failed: %d", rc);
      goto ark_create_taskloop_err;
    }

    ark->tcbs[tnum].oub = bt_new(0, ark->vlimit, sizeof(uint64_t), 
                                       &(ark->tcbs[tnum].oublen),
                                       &(ark->tcbs[tnum].oub_orig));
    if (ark->tcbs[tnum].oub == NULL)
    {
      if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=ENOMEM;}
      rc = errno;
      KV_TRC_FFDC(pAT, "Bucket allocation for outbuffer failed: %d", rc);
      goto ark_create_taskloop_err;
    }

    //ark->tcbs[tnum].vbsize = bsize * 1024;
    ark->tcbs[tnum].vbsize = bsize * 256;
    ark->tcbs[tnum].vb_orig = am_malloc(ark->tcbs[tnum].vbsize);
    if (ark->tcbs[tnum].vb_orig == NULL)
    {
      rc = ENOMEM;
      // report the size that was actually requested
      KV_TRC_FFDC(pAT, "Out of memory allocation for %"PRIu64" bytes for variable size buffer", (uint64_t)ark->tcbs[tnum].vbsize);
      goto ark_create_taskloop_err;
    }
    ark->tcbs[tnum].vb = ptr_align(ark->tcbs[tnum].vb_orig);
  }

  *arkret = (void *)ark;

  ark->pts = (PT *)am_malloc(sizeof(PT) * ark->nthrds);
  if ( ark->pts == NULL )
  {
    rc = ENOMEM;
    KV_TRC_FFDC(pAT, "Out of memory allocation for %"PRIu64" bytes for server thread data", (sizeof(PT) * ark->nthrds));
    goto ark_create_taskloop_err;
  }

  for (i = 0; i < ark->nthrds; i++) {
    PT *pt = &(ark->pts[i]);
    scbp = &(ark->poolthreads[i]);

    memset(scbp, 0, sizeof(scb_t));

    // Start off the random start point for this thread
    // at -1, to show that it has not been part of a
    // ark_random call.
    scbp->rlast = -1;
    scbp->holds = 0;
    scbp->poolstate = PT_RUN;

    scbp->poolstats.io_cnt = 0;
    scbp->poolstats.ops_cnt = 0;
    scbp->poolstats.kv_cnt = 0;
    scbp->poolstats.blk_cnt = 0;
    scbp->poolstats.byte_cnt = 0;

    pthread_mutex_init(&(scbp->poolmutex), NULL);
    pthread_cond_init(&(scbp->poolcond), NULL);

    scbp->rqueue = queue_new(ark->nasyncs);
    scbp->tqueue = queue_new(ark->ntasks);
    scbp->ioqueue = queue_new(ark->ntasks);

    pt->id = i;
    pt->ark = ark;
    rc = pthread_create(&(scbp->pooltid), NULL, pool_function, pt);
    if (rc != 0)
    {
      KV_TRC_FFDC(pAT, "pthread_create of server thread failed: %d", rc);
      goto ark_create_poolloop_err;
    }
  }

#if 0
  while (ark->nactive < ark->nthrds) {
    usleep(1);
    //printf("Create waiting %d/%d\n", ark->nactive, ark->nthrds);
  }
#endif

  ark->pcmd = PT_RUN;

  goto ark_create_return;

ark_create_poolloop_err:
  // i is the slot whose pthread_create failed: its mutex, cond and queues
  // were set up but no thread exists, so it is released without a join.
  // Slots below it have running threads that must be told to exit first.
  failed_thr = i;

  for (; i >= 0; i--)
  {
    scbp = &(ark->poolthreads[i]);

    if (i < failed_thr)
    {
      // same shutdown handshake ark_delete uses: flag the exit, wake the
      // thread, then wait for it
      scbp->poolstate = PT_EXIT;

      queue_lock(scbp->rqueue);
      queue_wakeup(scbp->rqueue);
      queue_unlock(scbp->rqueue);
      pthread_join(scbp->pooltid, NULL);
    }

    pthread_mutex_destroy(&(scbp->poolmutex));
    pthread_cond_destroy(&(scbp->poolcond));

    if ( scbp->rqueue != NULL )
    {
      queue_free(scbp->rqueue);
    }

    if ( scbp->tqueue != NULL )
    {
      queue_free(scbp->tqueue);
    }

    if ( scbp->ioqueue != NULL )
    {
      queue_free(scbp->ioqueue);
    }
  }

  if ( ark->pts != NULL )
  {
    am_free(ark->pts);
  }

ark_create_taskloop_err:
  for ( tnum = 0; tnum < ark->ntasks; tnum++ )
  {
    if (ark->tcbs[tnum].inb)
    {
      bt_delete(ark->tcbs[tnum].inb);
    }

    if (ark->tcbs[tnum].oub)
    {
      bt_delete(ark->tcbs[tnum].oub);
    }

    if (ark->tcbs[tnum].vb_orig)
    {
      am_free(ark->tcbs[tnum].vb_orig);
    }
  }

  for (rnum = 0; rnum < ark->nasyncs; rnum++)
  {
    pthread_cond_destroy(&(ark->rcbs[rnum].acond));
    pthread_mutex_destroy(&(ark->rcbs[rnum].alock));
  }

  if ( ark->poolthreads != NULL )
  {
    am_free(ark->poolthreads);
  }

ark_create_poolthreads_err:
  if (ark->iocbs)
  {
    am_free(ark->iocbs);
  }

ark_create_iocbs_err:
  if (ark->tcbs)
  {
    am_free(ark->tcbs);
  }

ark_create_tcbs_err:
  if (ark->rcbs)
  {
    am_free(ark->rcbs);
  }

ark_create_rcbs_err:
  if (ark->ttags)
  {
    tag_free(ark->ttags);
  }

ark_create_ttag_err:
  if (ark->rtags)
  {
    tag_free(ark->rtags);
  }

ark_create_rtag_err:
  pthread_mutex_destroy(&ark->mainmutex);

ark_create_pth_mutex_err:
  // reached whenever the block list exists and must be released
  bl_delete(ark->bl);

ark_create_bl_err:
  hash_free(ark->ht);

ark_create_ht_err:
ark_create_persist_err:
  ea_delete(ark->ea);

ark_create_ea_err:
  am_free(ark);
  *arkret = NULL;

ark_create_ark_err:
  KV_TRC_CLOSE(pAT);

ark_create_return:
  return rc;
}
/**
 *******************************************************************************
 * \brief
 *  dispatch the jobs of every running context until none is left running,
 *  then destroy the shared kv dbs, delete the arks of the finished contexts
 *  (unless a context is flagged KV_ASYNC_CT_PERF) and report op/s and io/s
 *
 *  Each pass over the contexts:
 *   - marks a context DONE when kv_async_dispatch_jobs() reports no jobs
 *   - fires the error injects for contexts flagged KV_ASYNC_CT_ERROR_INJECT
 *     once elapse has reached the inject counter, then bumps the counter
 *   - flags running/queued jobs SHUTDOWN once the context's secs have elapsed
 ******************************************************************************/
void kv_async_run_jobs(void)
{
    async_CB_t *pCB          = NULL;
    uint32_t    ctxt_running = 0;
    uint32_t    jobs_running = 0;
    uint32_t    i            = 0;
    uint32_t    next         = 0;
    uint32_t    elapse       = 0;
    uint32_t    inject       = 0;
    uint32_t    secs         = 0;
    uint32_t    rate_secs    = 0;
    uint32_t    log_interval = 600;
    uint64_t    ops          = 0;
    uint64_t    ios          = 0;
    uint32_t    tops         = 0;
    uint32_t    tios         = 0;
    uint32_t    perf         = 0;

    KV_TRC(pFT, "ASYNC START: 0 minutes");

    /* only restart the clock if the first job is not already running */
    if (!(pCTs->pCBs->flags & KV_ASYNC_CB_RUNNING)) start = time(0);
    next = log_interval;

    do
    {
        ctxt_running = FALSE;

        if (elapse > next)
        {
            KV_TRC(pFT, "ASYNC RUNNING: %d elapsed minutes", elapse/60);
            next += log_interval;
        }

        for (i=0; i<KV_ASYNC_MAX_CONTEXTS; i++)
        {
            if (! (pCTs[i].flags & KV_ASYNC_CT_RUNNING)) continue;

            jobs_running = kv_async_dispatch_jobs(i);

            if (!jobs_running)
            {
                pCTs[i].flags &= ~KV_ASYNC_CT_RUNNING;
                pCTs[i].flags |=  KV_ASYNC_CT_DONE;
                KV_TRC(pFT, "ASYNC DONE ctxt %d %x", i, pCTs[i].flags);
                continue;
            }
            else
            {
                ctxt_running = TRUE;
            }

            elapse = time(0) - start;

            if (elapse >= inject &&
                pCTs[i].flags & KV_ASYNC_CT_ERROR_INJECT)
            {
                KV_TRC_FFDC(pFT, "FFDC: INJECT ERRORS");
                FVT_KV_INJECT_READ_ERROR;
                FVT_KV_INJECT_WRITE_ERROR;
                FVT_KV_INJECT_ALLOC_ERROR;
                ++inject;
            }

            /* time is up for this context, ask its active jobs to wind down */
            if (elapse >= pCTs[i].secs)
            {
                for (pCB=pCTs[i].pCBs;pCB<pCTs[i].pCBs+KV_ASYNC_JOB_Q;pCB++)
                {
                    if ((pCB->flags & KV_ASYNC_CB_RUNNING ||
                         pCB->flags & KV_ASYNC_CB_QUEUED)
                        &&
                        (!(pCB->flags & KV_ASYNC_CB_SHUTDOWN)) )
                    {
                        pCB->flags |=  KV_ASYNC_CB_SHUTDOWN;
                        KV_TRC_IO(pFT, "SHUTDOWN pCB %p (%d >= %d)", pCB, elapse, pCTs[i].secs);
                    }
                }
            }
            usleep(100);
        }
    }
    while (ctxt_running);

    stop = time(0);
    secs = stop - start;

    KV_TRC(pFT, "ASYNC RUNNING DONE: %d minutes", elapse/60);

    /* log cleanup, since the first ark_delete closes the log file */
    for (i=0; i<KV_ASYNC_MAX_CONTEXTS; i++)
    {
        if (pCTs[i].flags & KV_ASYNC_CT_DONE)
            KV_TRC(pFT, "ASYNC CLEANUP: ctxt:%d ark:%p", i, pCTs[i].ark);
    }

    /* check for MULTI_CTXT_IO, destroy common kv dbs */
    for (pCB=pCTs->pCBs;pCB<pCTs->pCBs+KV_ASYNC_JOB_Q;pCB++)
    {
        if (pCB->flags & KV_ASYNC_CB_MULTI_CTXT_IO)
        {
            kv_db_destroy(pCB->db, pCB->len);
        }
    }

    for (i=0; i<KV_ASYNC_MAX_CONTEXTS; i++)
    {
        /* if this context didn't run any I/O */
        if (! (pCTs[i].flags & KV_ASYNC_CT_DONE)) continue;

        pCTs[i].flags &= ~KV_ASYNC_CT_DONE;

        /* if perf then don't delete the ark here */
        if (pCTs[i].flags & KV_ASYNC_CT_PERF)
        {
            perf = TRUE;
            continue;
        }

        (void)ark_stats(pCTs[i].ark, &ops, &ios);
        tops += (uint32_t)ops;
        tios += (uint32_t)ios;
        KV_TRC(pFT, "PERF ark%p ops:%"PRIu64" ios:%"PRIu64"", pCTs[i].ark, ops, ios);

        EXPECT_EQ(0, ark_delete(pCTs[i].ark));
    }

    if (!perf)
    {
        /* a run that starts and stops within the same wall-clock second
         * gives secs == 0; divide by at least 1 so the rate calculation
         * cannot divide by zero, while still reporting the measured secs */
        rate_secs = secs ? secs : 1;
        tops = tops / rate_secs;
        tios = tios / rate_secs;
        printf("op/s:%d io/s:%d secs:%d\n", tops, tios, secs);
        KV_TRC(pFT, "PERF op/s:%d io/s:%d secs:%d",
                tops, tios, secs);
    }
}
Example #19
0
/**
 * Tear down an ARK instance.
 *
 * Tells every server thread to exit and joins it, releases the per-thread
 * queues and sync objects, the request/task/io control blocks and tags,
 * calls ark_persist() unless the store is flagged ARK_KV_VIRTUAL_LUN, and
 * finally frees the store, hash table, block list and the control
 * structure itself. The trace is closed on every path.
 *
 * \return EINVAL when ark is NULL; otherwise the ark_persist() result
 *         (0 when persistence was skipped).
 */
int ark_delete(ARK *ark) {
  _ARK  *store  = (_ARK *)ark;
  scb_t *worker = NULL;
  int    status = 0;
  int    idx    = 0;

  if (ark == NULL)
  {
    status = EINVAL;
    KV_TRC_FFDC(pAT, "Invalid ARK control block parameter: %d", status);
    KV_TRC_CLOSE(pAT);
    return status;
  }

  // Stop each server thread in turn and release what it owned
  idx = 0;
  while (idx < store->nthrds)
  {
      worker = &(store->poolthreads[idx]);
      worker->poolstate = PT_EXIT;

      // wake the thread so it notices the exit request
      queue_lock(worker->rqueue);
      queue_wakeup(worker->rqueue);
      queue_unlock(worker->rqueue);

      pthread_join(worker->pooltid, NULL);

      queue_free(worker->rqueue);
      queue_free(worker->tqueue);
      queue_free(worker->ioqueue);

      pthread_mutex_destroy(&(worker->poolmutex));
      pthread_cond_destroy(&(worker->poolcond));
      KV_TRC(pAT, "thread %d joined", idx);

      ++idx;
  }

  if (store->poolthreads != NULL) { am_free(store->poolthreads); }

  if (store->pts != NULL) { am_free(store->pts); }

  // Sync objects of the request control blocks
  for (idx = 0; idx < store->nasyncs; ++idx)
  {
    pthread_cond_destroy(&(store->rcbs[idx].acond));
    pthread_mutex_destroy(&(store->rcbs[idx].alock));
  }

  // Buffers owned by the task control blocks
  for (idx = 0; idx < store->ntasks; ++idx)
  {
    bt_delete(store->tcbs[idx].inb);
    bt_delete(store->tcbs[idx].oub);
    am_free(store->tcbs[idx].vb_orig);
  }

  if (store->iocbs != NULL) { am_free(store->iocbs); }
  if (store->tcbs  != NULL) { am_free(store->tcbs);  }
  if (store->rcbs  != NULL) { am_free(store->rcbs);  }
  if (store->ttags != NULL) { tag_free(store->ttags); }
  if (store->rtags != NULL) { tag_free(store->rtags); }

  // Persist unless this is a virtual LUN; a failure is traced and
  // returned to the caller but does not stop the teardown
  if ((store->flags & ARK_KV_VIRTUAL_LUN) == 0)
  {
    status = ark_persist(store);
    if (status != 0)
    {
      KV_TRC_FFDC(pAT, "FFDC: ark_persist failed: %d", status);
    }
  }

  pthread_mutex_destroy(&store->mainmutex);

  (void)ea_delete(store->ea);
  hash_free(store->ht);
  bl_delete(store->bl);
  KV_TRC(pAT, "ark_delete done %p", store);
  am_free(store);

  KV_TRC_CLOSE(pAT);
  return status;
}
/**
 *******************************************************************************
 * \brief
 *  completion callback shared by the async set/get/exists/del operations
 *
 *  dt carries the async_CB_t job pointer. After validating the job (non-NULL,
 *  begin/end markers intact) and re-queueing on EBUSY, the callback checks the
 *  result of the key just completed and then advances the job: next key in
 *  the current phase, or at the end of the db the next phase
 *  (SET -> GET -> EXISTS -> DEL or replace/SET -> ...), or completion.
 ******************************************************************************/
static void kv_async_cb(int errcode, uint64_t dt, int64_t res)
{
    async_CB_t *pCB  = (async_CB_t*)dt;
    kv_t       *p_kv = NULL;
    uint64_t    tag  = (uint64_t)pCB;

    /* reject anything that is not an intact job control block */
    if (NULL == pCB)
    {
        KV_TRC_FFDC(pFT, "FFDC: pCB NULL");
        return;
    }
    if (B_MARK != pCB->b_mark)
    {
        KV_TRC_FFDC(pFT, "FFDC: B_MARK FAILURE %p: %"PRIx64"", pCB, pCB->b_mark);
        return;
    }
    if (E_MARK != pCB->e_mark)
    {
        KV_TRC_FFDC(pFT, "FFDC: E_MARK FAILURE %p: %"PRIx64"", pCB, pCB->e_mark);
        return;
    }

    /* busy: put the job back on the retry queue and wait for the next call */
    if (EBUSY == errcode)
    {
        kv_async_q_retry(pCB);
        return;
    }

    if (IS_GTEST)
    {
        EXPECT_EQ(0,   errcode);
        EXPECT_EQ(tag, pCB->tag);
    }

    /* the key that just completed; the index then moves to the next key */
    p_kv = pCB->db + pCB->len_i;
    ++pCB->len_i;

    /* ---- SET phase ---- */
    if (pCB->flags & KV_ASYNC_CB_SET)
    {
        KV_TRC_IO(pFT, "KV_ASYNC_CB_SET, %p %d %d", pCB, pCB->len_i, pCB->len);
        if (0   != errcode)    printf("ark_set failed, errcode=%d\n", errcode);
        if (tag != pCB->tag)   printf("ark_set bad tag\n");
        if (res != p_kv->vlen) printf("ark_set bad vlen\n");
        if (IS_GTEST) {        EXPECT_EQ(res, p_kv->vlen);}

        /* more keys to set */
        if (pCB->len_i != pCB->len)
        {
            kv_async_SET_KEY(pCB);
            return;
        }

        /* whole db written */
        pCB->len_i = 0;
        if (pCB->flags & KV_ASYNC_CB_WRITE_PERF)
        {
            kv_async_perf_done(pCB);
            return;
        }
        pCB->flags &= ~KV_ASYNC_CB_SET;
        pCB->flags |= KV_ASYNC_CB_GET;
        kv_async_GET_KEY(pCB);
        return;
    }

    /* ---- GET phase ---- */
    if (pCB->flags & KV_ASYNC_CB_GET)
    {
        uint32_t miscompare = memcmp(p_kv->value, pCB->gvalue, p_kv->vlen);

        KV_TRC_IO(pFT, "KV_ASYNC_CB_GET, %p %d %d", pCB, pCB->len_i, pCB->len);
        if (0   != errcode)    printf("ark_get failed, errcode=%d\n", errcode);
        if (tag != pCB->tag)   printf("ark_get bad tag\n");
        if (res != p_kv->vlen) printf("ark_get bad vlen\n");
        if (IS_GTEST) {        EXPECT_EQ(0, miscompare);}

        /* more keys to get */
        if (pCB->len_i != pCB->len)
        {
            kv_async_GET_KEY(pCB);
            return;
        }

        /* whole db read back */
        pCB->len_i = 0;
        if (pCB->flags & KV_ASYNC_CB_READ_PERF)
        {
            kv_async_perf_done(pCB);
            return;
        }
        pCB->flags &= ~KV_ASYNC_CB_GET;
        pCB->flags |= KV_ASYNC_CB_EXISTS;
        kv_async_EXISTS_KEY(pCB);
        return;
    }

    /* ---- EXISTS phase ---- */
    if (pCB->flags & KV_ASYNC_CB_EXISTS)
    {
        KV_TRC_IO(pFT, "KV_ASYNC_CB_EXISTS, %p %d %d", pCB, pCB->len_i, pCB->len);
        if (0   != errcode)    printf("ark_exists failed,errcode=%d\n",errcode);
        if (tag != pCB->tag)   printf("ark_exists bad tag\n");
        if (res != p_kv->vlen) printf("ark_exists bad vlen\n");
        if (IS_GTEST) {        EXPECT_EQ(res, p_kv->vlen);}

        /* more keys to check */
        if (pCB->len_i != pCB->len)
        {
            kv_async_EXISTS_KEY(pCB);
            return;
        }

        /* whole db checked, pick the follow-on phase */
        pCB->len_i  = 0;
        pCB->flags &= ~KV_ASYNC_CB_EXISTS;

        if (pCB->flags & KV_ASYNC_CB_SGD)
        {
            pCB->flags |= KV_ASYNC_CB_DEL;
            kv_async_DEL_KEY(pCB);
            return;
        }

        if (pCB->flags & KV_ASYNC_CB_REPLACE)
        {
            /* only honor a shutdown once at least one replace has run */
            if (pCB->replace &&
                pCB->flags & KV_ASYNC_CB_SHUTDOWN)
            {
                pCB->flags |= KV_ASYNC_CB_DEL;
                kv_async_DEL_KEY(pCB);
                return;
            }
            pCB->replace = TRUE;
            if (0 != pCB->regen(pCB->db, pCB->len, pCB->regen_len))
            {
                printf("regen failure, fatal\n");
                KV_TRC_FFDC(pFT, "FFDC: regen failure");
                memset(pCB, 0, sizeof(async_CB_t));
                return;
            }
            pCB->flags |= KV_ASYNC_CB_SET;
            kv_async_SET_KEY(pCB);
            return;
        }

        /* neither SGD nor REPLACE: unexpected; flag it and, as before,
         * issue another EXISTS */
        EXPECT_TRUE(0);
        kv_async_EXISTS_KEY(pCB);
        return;
    }

    /* ---- DEL phase ---- */
    if (pCB->flags & KV_ASYNC_CB_DEL)
    {
        KV_TRC_IO(pFT, "KV_ASYNC_CB_DEL, %p i:%d len:%d", pCB, pCB->len_i,pCB->len);
        if (0   != errcode)    printf("ark_del failed, errcode=%d\n",errcode);
        if (tag != pCB->tag)   printf("ark_del bad tag\n");
        if (res != p_kv->vlen) printf("ark_del bad vlen\n");
        if (IS_GTEST) {        EXPECT_EQ(res, p_kv->vlen);}

        /* more keys to delete */
        if (pCB->len_i != pCB->len)
        {
            kv_async_DEL_KEY(pCB);
            return;
        }

        /* whole db deleted: either finish the job or start the next loop */
        if (pCB->flags & KV_ASYNC_CB_SHUTDOWN)
        {
            /* a db shared across contexts is not destroyed here */
            if (!(pCB->flags & KV_ASYNC_CB_MULTI_CTXT_IO))
            {
                kv_db_destroy(pCB->db, pCB->len);
            }
            if (pCB->gvalue) free(pCB->gvalue);
            memset(pCB, 0, sizeof(async_CB_t));
            KV_TRC_IO(pFT, "LOOP_DONE: %p", pCB);
            return;
        }

        KV_TRC_IO(pFT, "NEXT_LOOP, %p", pCB);
        pCB->flags &= ~KV_ASYNC_CB_DEL;
        pCB->flags |= KV_ASYNC_CB_SET;
        pCB->len_i  = 0;
        kv_async_SET_KEY(pCB);
        return;
    }

    /* no phase flag set: should not be here */
    EXPECT_TRUE(0);
}
Example #21
0
/**
 *******************************************************************************
 * \brief
 *  poll the commands issued for an iocb and update the task state
 *
 *  Walks the issued commands in order and stops at the first one that has not
 *  completed yet. Completed commands (good or bad) are counted in cmpT and
 *  their a_tag set to -1 so a later call skips them. When every issued command
 *  of a failed iocb has completed, the request is failed with the recorded
 *  error; when all nblks completed cleanly the task moves to ARK_IO_DONE;
 *  when blocks remain unissued the task moves to ARK_IO_SCHEDULE.
 *
 * \return TRUE if the IOs for the iocb are successfully completed, else FALSE
 ******************************************************************************/
int ea_async_io_harvest(_ARK   *_arkp,
                        int32_t tid,
                        tcb_t  *iotcbp,
                        iocb_t *iocbp,
                        rcb_t  *iorcbp)
{
  EA       *store  = iocbp->ea;
  scb_t    *pool   = &(_arkp->poolthreads[tid]);
  queue_t  *reqq   = pool->rqueue;
  queue_t  *taskq  = pool->tqueue;
  queue_t  *ioq    = pool->ioqueue;
  uint64_t  status = 0;
  int32_t   result = 0;
  int32_t   idx    = 0;

  for (idx = 0; idx < iocbp->issT; idx++)
  {
      if (store->st_type != EA_STORE_TYPE_MEMORY)
      {
          // already harvested on an earlier pass
          if (iocbp->blist[idx].a_tag == -1) {continue;}

          result = cblk_aresult(store->st_flash,
                                &(iocbp->blist[idx].a_tag), &status, 0);
      }
      else
      {
          // memory store: the IO was performed by the schedule function,
          // so it counts as completed
          result = 1;
      }

      if (check_harv_error_injects(iocbp->op)) {result = -1;}

      if (result == 0)
      {
          // not complete yet: stop here and try again on a later call
          KV_TRC_DBG(pAT,"IO:     WAIT_NOT_CMP: tid:%d ttag:%d a_tag:%d "
                         "blkno:%"PRIi64"",
                         tid, iocbp->tag, iocbp->blist[idx].a_tag,
                         iocbp->blist[idx].blkno);
          ++iocbp->hmissN;

          // back off briefly only on the first miss and only when the
          // thread has nothing else queued
          if (queue_empty(reqq) && queue_empty(taskq) && queue_count(ioq)<=8 &&
              iocbp->hmissN==1 &&
              _arkp->ea->st_type != EA_STORE_TYPE_MEMORY)
          {
              usleep(50);
              KV_TRC_DBG(pAT,"IO:     USLEEP");
          }
          break;
      }

      if (result > 0)
      {
          KV_TRC_IO(pAT,"IO_CMP: tid:%2d ttag:%4d a_tag:%4d blkno:%5"PRIi64"",
                    tid, iocbp->tag,
                    iocbp->blist[idx].a_tag, iocbp->blist[idx].blkno);
      }
      else
      {
          // failed command: remember the error, defaulting to EIO when
          // errno was left unset
          KV_TRC_FFDC(pAT, "IO_ERR: tid:%d ttag:%d errno=%d",
                           tid, iocbp->tag, errno);
          if (!errno) {KV_TRC_FFDC(pAT, "UNSET_ERRNO"); errno=EIO;}
          iocbp->io_error = errno;
      }

      ++iocbp->cmpT;
      iocbp->blist[idx].a_tag = -1; // mark as harvested
  }

  if (iocbp->io_error)
  {
      if (iocbp->issT != iocbp->cmpT)
      {
          // issued commands are still outstanding; keep harvesting before
          // failing the iocb
          KV_TRC_FFDC(pAT,"IO:    ERROR_RE_HARVEST: tid:%d ttag:%d "
                          "iocbp->issT:%d iocbp->cmpT:%d",
                          tid, iocbp->tag, iocbp->issT, iocbp->cmpT);
          return FALSE;
      }

      // every issued command has come back, fail the request
      iorcbp->res    = -1;
      iorcbp->rc     = iocbp->io_error;
      iotcbp->state  = ARK_CMD_DONE;
      am_free(iocbp->blist);
      KV_TRC_FFDC(pAT, "IO:     ERROR_DONE: tid:%d ttag:%d rc:%d",
                  tid, iocbp->tag, iorcbp->rc);
      return FALSE;
  }

  if (iocbp->cmpT == iocbp->nblks)
  {
      // all blocks completed without error
      am_free(iocbp->blist);
      iotcbp->state = ARK_IO_DONE;
      KV_TRC_IO(pAT, "IO_END: SUCCESS tid:%d ttag:%d cmpT:%d",
                tid, iocbp->tag, iocbp->cmpT);
      return TRUE;
  }

  if (iocbp->issT < iocbp->nblks)
  {
      // blocks remain that have not been issued yet
      iotcbp->state = ARK_IO_SCHEDULE;
      KV_TRC_IO(pAT,"IO:     RE_SCHEDULE: tid:%d ttag:%d "
                    "iocbp->issT:%d iocbp->nblks:%"PRIi64" ",
                    tid, iocbp->tag, iocbp->issT, iocbp->nblks);
  }
  else
  {
      // everything is issued but some commands are still in flight
      KV_TRC_IO(pAT,"IO:     RE_HARVEST: tid:%d ttag:%d "
                    "iocbp->cmpT:%d iocbp->issT:%d",
                    tid, iocbp->tag, iocbp->cmpT, iocbp->issT);
  }
  return FALSE;
}