int m_pi_create_wc_q(struct mcm_qp *m_qp, int entries) { /* RDMA proxy WC pool, register with SCIF and IB, set pool and segm size with parameters */ m_qp->wrc.wc_sz = ALIGN_64(sizeof(struct mcm_wc_rx)); m_qp->wrc.wc_len = m_qp->wrc.wc_sz * entries; /* 64 byte aligned for signal_fence */ m_qp->wrc.wc_end = entries - 1; m_qp->wc_hd_rem = 0; m_qp->wc_tl_rem = 0; if (posix_memalign((void **)&m_qp->wrc.wc_addr, 4096, ALIGN_PAGE(m_qp->wrc.wc_len))) { mlog(0, "failed to allocate wc_rbuf, m_qp=%p, wc_len=%d, entries=%d\n", m_qp, m_qp->wrc.wc_len, entries); return -1; } memset((void*)m_qp->wrc.wc_addr, 0, ALIGN_PAGE(m_qp->wrc.wc_len)); mlog(4, " WC rbuf pool %p, LEN req=%d, act=%d\n", m_qp->wrc.wc_addr, m_qp->wrc.wc_len, ALIGN_PAGE(m_qp->wrc.wc_len)); m_qp->wc_rbuf_mr = ibv_reg_mr(m_qp->smd->md->pd, (void*)m_qp->wrc.wc_addr, m_qp->wrc.wc_len, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); if (!m_qp->wc_rbuf_mr) { mlog(0, " IB_register addr=%p,%d failed %s\n", m_qp->wrc.wc_addr, ALIGN_PAGE(m_qp->wrc.wc_len), strerror(errno)); return -1; } m_qp->wrc.wc_addr = (uint64_t)(uintptr_t)m_qp->wc_rbuf_mr->addr; m_qp->wrc.wc_rkey = m_qp->wc_rbuf_mr->rkey; mlog(4, " IB_mr for wc_buf addr %p, mr 0x%llx, len %d, entries %d rkey %x lkey %x\n", m_qp->wrc.wc_addr, m_qp->wc_rbuf_mr->addr, ALIGN_PAGE(m_qp->wrc.wc_len), entries, m_qp->wc_rbuf_mr->rkey, m_qp->wc_rbuf_mr->lkey); return 0; }
int FINALIZE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, void *mh_sha1_digest, void *murmur3_x64_128_digest) { uint8_t *partial_block_buffer, *murmur_tail_data; uint64_t partial_block_len, total_len; uint32_t(*mh_sha1_segs_digests)[HASH_SEGS]; uint8_t *aligned_frame_buffer; if (ctx == NULL) return MH_SHA1_MURMUR3_CTX_ERROR_NULL; total_len = ctx->total_length; partial_block_len = total_len % MH_SHA1_BLOCK_SIZE; partial_block_buffer = ctx->partial_block_buffer; // Calculate murmur3 firstly // because mh_sha1 will change the partial_block_buffer // ( partial_block_buffer = n murmur3 blocks and 1 murmur3 tail) murmur_tail_data = partial_block_buffer + partial_block_len - partial_block_len % MUR_BLOCK_SIZE; MURMUR_BLOCK_FUNCTION(partial_block_buffer, partial_block_len / MUR_BLOCK_SIZE, ctx->murmur3_x64_128_digest); MURMUR_TAIL_FUNCTION(murmur_tail_data, total_len, ctx->murmur3_x64_128_digest); /* mh_sha1 final */ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer); mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests; MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests, aligned_frame_buffer, ctx->mh_sha1_digest); /* Output the digests of murmur3 and mh_sha1 */ if (mh_sha1_digest != NULL) { ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0]; ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1]; ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2]; ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3]; ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4]; } if (murmur3_x64_128_digest != NULL) { ((uint32_t *) murmur3_x64_128_digest)[0] = ctx->murmur3_x64_128_digest[0]; ((uint32_t *) murmur3_x64_128_digest)[1] = ctx->murmur3_x64_128_digest[1]; ((uint32_t *) murmur3_x64_128_digest)[2] = ctx->murmur3_x64_128_digest[2]; ((uint32_t *) murmur3_x64_128_digest)[3] = ctx->murmur3_x64_128_digest[3]; } return MH_SHA1_MURMUR3_CTX_ERROR_NONE; }
int m_pi_create_wr_q(struct mcm_qp *m_qp, int entries) { /* RDMA proxy WR pool, register with SCIF and IB, set pool and segm size with parameters */ m_qp->wrc.wr_sz = ALIGN_64(sizeof(struct mcm_wr_rx)); m_qp->wrc.wr_len = m_qp->wrc.wr_sz * entries; /* 64 byte aligned for signal_fence */ m_qp->wrc.wr_end = entries - 1; m_qp->wr_hd_r = 0; m_qp->wr_tl_r = 0; m_qp->wr_tl_r_wt = 1; /* start at tl+1 */ if (posix_memalign((void **)&m_qp->wrc.wr_addr, 4096, ALIGN_PAGE(m_qp->wrc.wr_len))) { mlog(0, "failed to allocate wr_rbuf, m_qp=%p, wr_len=%d, entries=%d\n", m_qp, m_qp->wrc.wr_len, entries); return -1; } memset((void*)m_qp->wrc.wr_addr, 0, ALIGN_PAGE(m_qp->wrc.wr_len)); mlog(4, " WR rbuf pool %p, LEN req=%d, act=%d\n", m_qp->wrc.wr_addr, m_qp->wrc.wr_len, ALIGN_PAGE(m_qp->wrc.wr_len) ); m_qp->wr_rbuf_mr = ibv_reg_mr(m_qp->smd->md->pd, (void*)m_qp->wrc.wr_addr, m_qp->wrc.wr_len, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); if (!m_qp->wr_rbuf_mr) { mlog(0, " IB_register addr=%p,%d failed %s\n", m_qp->wrc.wr_addr, ALIGN_PAGE(m_qp->wrc.wr_len), strerror(errno)); return -1;; } m_qp->wrc.wr_addr = (uint64_t)(uintptr_t)m_qp->wr_rbuf_mr->addr; m_qp->wrc.wr_rkey = m_qp->wr_rbuf_mr->rkey; mlog(4, " IB_mr for wr_buf addr %p, off 0x%llx, len %d, entries %d, rkey %x lkey %x\n", m_qp->wrc.wr_addr, m_qp->wr_rbuf_mr->addr, ALIGN_PAGE(m_qp->wrc.wr_len), entries, m_qp->wr_rbuf_mr->rkey, m_qp->wr_rbuf_mr->rkey); m_qp->wr_off_r = scif_register(m_qp->smd->scif_tx_ep, (void*)m_qp->wrc.wr_addr, ALIGN_PAGE(m_qp->wrc.wr_len), (off_t)0, SCIF_PROT_READ | SCIF_PROT_WRITE, 0); if (m_qp->wr_off_r == (off_t)(-1)) { mlog(0, " SCIF_register addr=%p,%d failed %s\n", m_qp->wrc.wr_addr, ALIGN_PAGE(m_qp->wrc.wr_len), strerror(errno)); return -1; } mlog(4, " WR rbuf pool %p, LEN req=%d, act=%d\n", m_qp->wr_buf, m_qp->wr_len, ALIGN_PAGE(m_qp->wrc.wr_len)); mlog(4, " SCIF_mr for wr_rbuf addr %p, off 0x%llx, len %d, entries %d\n", m_qp->wrc.wr_addr, m_qp->wr_off_r, ALIGN_PAGE(m_qp->wrc.wr_len), entries); 
return 0; }
/*
 * m_pi_buf_ordered - report whether the proxy-in buffer slot for 'next'
 * would follow the most recently issued slot in order, or whether that
 * previous slot has already completed.
 *
 * called with smd->rblock held.  Returns 1 if ordered/complete, else 0.
 */
static int m_pi_buf_ordered(mcm_scif_dev_t *smd, int next)
{
	/* index of the previously issued slot (previous m_idx), with wrap */
	int prev = smd->m_buf_hd_r ? smd->m_buf_hd_r - 1 : smd->m_buf_end_r;

	mlog(8," smd %p - m_buf_wc_r %p: tl %d hd %d buf_wc_hd[%d].m_idx=0x%x next=0x%x\n",
	     smd, smd->m_buf_wc_r, smd->m_buf_tl_r, smd->m_buf_hd_r, prev,
	     smd->m_buf_wc_r[prev].m_idx, next);

	return (smd->m_buf_wc_r[prev].done ||
		ALIGN_64(smd->m_buf_wc_r[prev].m_idx + 1) == next) ? 1 : 0;
}
/*
 * get - HTTP GET handler: look up the request path (minus the leading '/')
 * as a key in the btree store and return the stored GVariant serialized as
 * JSON.  Responds 200 with application/json on a hit, 404 on a miss.
 */
void get(SoupServer *server, SoupMessage *msg, const char *path)
{
	struct btval key, val;
	key.data = (void*)path+1; /* skip the leading '/' */
	key.size = strlen(path)-1;
	key.free_data = FALSE;
	key.mp = NULL;
	g_debug ("GET\n Path=%s\n Fetching key %s", path, (char*)key.data);
	const struct btree_stat *stat = btree_stat(btree);
	show_dbstats("",stat);
	if (0 == btree_get(btree,&key,&val)) {
		/* fix: g_compute_checksum_for_data() returns a newly allocated
		 * string; both checksum strings were leaked on every request */
		gchar *sum = g_compute_checksum_for_data(G_CHECKSUM_MD5, val.data, val.size);
		g_debug ("data checksum is %s", sum);
		g_free (sum);
		/* we need to make the data 64 bits aligned for gvariant. Best
		   plan is to make all data aligned in the store, but for now,
		   lets just copy it to somewhere aligned. TODO: think about
		   fragmentation. it may just be ok, as so far we delete
		   everything we malloc in this function, despite the lifetimes
		   being interleaved. */
		char *buf = g_slice_alloc(val.size + 8);
		char *data = ALIGN_64(buf);
		memcpy (data, val.data, val.size);
		sum = g_compute_checksum_for_data(G_CHECKSUM_MD5, data, val.size);
		g_debug ("aligned data checksum is %s", sum);
		g_free (sum);
		GVariant *gv = g_variant_new_from_data(G_VARIANT_TYPE_VARIANT,
						       data, val.size, TRUE, NULL, NULL);
		char *debug = g_variant_print (gv, TRUE);
		g_debug ("converted to %s", debug);
		g_free (debug);
		/* fix: json_gvariant_serialize_data() takes a gsize*, not int* —
		   passing &int corrupts adjacent stack on LP64 */
		gsize length;
		char* ret = json_gvariant_serialize_data(gv, &length);
		g_variant_unref(gv);
		g_slice_free1 (val.size + 8, buf);
		soup_message_set_status (msg, SOUP_STATUS_OK);
		/* TODO: does soup do anything sensible with its memory management
		   of responses to reduce the risk of fragmentation? probably not.. */
		soup_message_set_response (msg, "application/json",
					   SOUP_MEMORY_TAKE, ret, length);
	} else {
		soup_message_set_status (msg, SOUP_STATUS_NOT_FOUND);
	}
}
/*
 * m_pi_post_read - issue an IB RDMA READ (RR) that pulls one WR segment from
 * the remote proxy-out buffer into this device's shared proxy-in buffer.
 *
 * called with m_qp->rxlock; takes/releases smd->rblock internally to
 * serialize proxy-in buffer carving at the device level.
 *
 * The RR is deferred (wr_rx flagged M_READ_PAUSED, function returns without
 * posting) when:
 *  - the carved segment would cross the ring tail (insufficient free space),
 *  - the outstanding-RR limit mcm_rr_max is already reached on this QP, or
 *  - earlier WRs on this QP are stalled (stall_cnt_rr != 0), to keep order.
 */
static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx)
{
	mcm_scif_dev_t *smd = m_qp->smd;
	struct ibv_qp *ib_qp;
	char *rbuf;
	int l_start, l_end, ret = 0;
	int l_len = wr_rx->sg[0].length;	/* segment length from proxy-out side */
	struct ibv_send_wr ib_wr;
	struct ibv_send_wr *bad_wr;

	mlog(4, " [%d:%d:%d] WR_rx[%d] %p RR init: po-addr=%p ln=%d, key=%x ctx=%Lx\n",
	     m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
	     wr_rx->w_idx, wr_rx, wr_rx->sg[0].addr, wr_rx->sg[0].length,
	     wr_rx->sg[0].lkey, wr_rx->context);

	/* shared proxy-in buffer, device level serialization */
	mpxy_lock(&smd->rblock);

	/* slice out proxy buffer for this segment; wrap to offset 64 (not 0)
	 * when the segment would run past the end of the ring */
	l_start = ALIGN_64(smd->m_hd_r);
	if ((l_start + l_len) > smd->m_len_r)
		l_start = 64;
	l_end = l_start + l_len;

	/* segment would overrun the tail: no room, stall this WR (once) */
	if (l_start < smd->m_tl_r && l_end > smd->m_tl_r) {
		if (!(wr_rx->flags & M_READ_PAUSED)) {
			wr_rx->flags |= M_READ_PAUSED;
			m_qp->stall_cnt_rr++;
			MCNTR(smd->md, MCM_MX_RR_STALL);
			mlog(0, " WARN[%d:%d:%d] WR_rx[%d] org_id %Lx RR stall (%d)"
				" low mem (%p-%p) hd 0x%x tl 0x%x ln %x,%d\n",
			     smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
			     wr_rx->w_idx, wr_rx->org_id, m_qp->stall_cnt_rr,
			     smd->m_buf_r, smd->m_buf_r + smd->m_len_r,
			     smd->m_hd_r, smd->m_tl_r, l_len, l_len);
			mlog(0, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n",
			     wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
			     m_qp->pi_rr_cnt, wr_rx->flags, m_qp->wr_tl_r,
			     m_qp->wr_tl_r_wt, m_qp->wr_hd_r);
		}
		mpxy_unlock(&smd->rblock);
		return;
	}
	rbuf = (char*)(smd->m_buf_r + l_start);

	/* too many RRs already in flight on this QP: stall this WR (once) */
	if ((m_qp->pi_rr_cnt >= mcm_rr_max) && !(wr_rx->flags & M_READ_PAUSED)) {
		wr_rx->flags |= M_READ_PAUSED;
		m_qp->stall_cnt_rr++;
		mlog(0x1, "WARN[%d:%d:%d] WR_rx[%d] max RR's, stalling (%d)"
			" memory (%p-%p) hd 0x%x tl 0x%x %x,%d\n",
		     smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
		     wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r,
		     smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r,
		     l_len, l_len);
		mlog(0x1, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n",
		     wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
		     m_qp->pi_rr_cnt, wr_rx->flags, m_qp->wr_tl_r,
		     m_qp->wr_tl_r_wt, m_qp->wr_hd_r);
		mpxy_unlock(&smd->rblock);
		return;
	}

	/* rbuf available, progress if paused, no progress if any prior IO waiting */
	if (wr_rx->flags & M_READ_PAUSED) {
		/* this WR was stalled and can now proceed */
		m_qp->stall_cnt_rr--;
		wr_rx->flags &= ~M_READ_PAUSED;
		mlog(0x1, "[%d:%d:%d] WR_rx[%d] RR released (%d) got memory (%p-%p)"
			" hd 0x%x tl 0x%x ln %x,%d\n",
		     smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
		     wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r,
		     smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r,
		     l_len, l_len);
	} else if (m_qp->stall_cnt_rr) {
		/* earlier WRs are still stalled: stall this one too to keep order */
		wr_rx->flags |= M_READ_PAUSED;
		m_qp->stall_cnt_rr++;
		mlog(0x1, "WARN[%d:%d:%d] WR_rx[%d] previous RR stall (%d)"
			" memory (%p-%p) hd 0x%x tl 0x%x %x,%d\n",
		     smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid,
		     wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r,
		     smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r,
		     l_len, l_len);
		mlog(0x1, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n",
		     wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
		     m_qp->pi_rr_cnt, wr_rx->flags, m_qp->wr_tl_r,
		     m_qp->wr_tl_r_wt, m_qp->wr_hd_r);
		mpxy_unlock(&smd->rblock);
		return;
	}

	/* sg[0] entry == proxy-out buffer, src for IB RR */
	/* sg[1] entry == proxy-in buffer, dst for IB RR */
	/* sg[2] entry == proxy-in buffer src for scif_sendto */
	/* wr.rdma.remote_addr, wr.rdma.rkey, dst for scif_sento */
	wr_rx->sg[1].addr = (uint64_t)(rbuf);
	wr_rx->sg[1].lkey = smd->m_mr_r->lkey;
	wr_rx->sg[1].length = l_len;
	wr_rx->sg[2].addr = (uint64_t)smd->m_offset_r + l_start;
	wr_rx->sg[2].lkey = 0;
	wr_rx->sg[2].length = l_len;

	/* initiate RR from remote po proxy buf to local pi buffer, signal all */
	wr_rx->wr.wr_id = 0; /* indication of wr_rx type */
	wr_rx->m_idx = 0;

	/* build an ib_wr from wr_rx */
	const_ib_rr(&ib_wr, &wr_rx->wr, (struct ibv_sge*)wr_rx->sg);
	ib_wr.wr_id = WRID_SET(wr_rx, WRID_RX_RR);

	/* signal and mark rbuf idx; if m_idx out of order must mark and signal.
	 * Signal when: last segment (M_SEND_LS), out-of-order buffer slot,
	 * RR queue about to fill, or every mcm_rr_signal-th post. */
	if ((wr_rx->flags & M_SEND_LS) ||
	    (!m_pi_buf_ordered(smd, rbuf - smd->m_buf_r)) ||
	    (m_qp->pi_rr_cnt == mcm_rr_max-1) ||
	    (!((m_qp->post_cnt_rr+1) % mcm_rr_signal))) {
		ib_wr.send_flags = IBV_SEND_SIGNALED;
		wr_rx->m_idx = ((rbuf + (l_len - 1)) - smd->m_buf_r);
		if (m_pi_buf_hd(smd, wr_rx->m_idx, wr_rx))
			goto buf_err;
	}

	/*
	 * update shared proxy-in buffer hd, save end of buffer idx
	 * and save ref m_idx for out of order completions across QP's
	 */
	smd->m_hd_r = l_end;
	mpxy_unlock(&smd->rblock);

	/* MXS -> MSS or HST, PI service will be on QP1 */
	if (MXS_EP(&m_qp->smd->md->addr) &&
	    (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
		ib_qp = m_qp->ib_qp1;
	else
		ib_qp = m_qp->ib_qp2;

#if MCM_PROFILE
	wr_rx->time = mcm_ts_us();
	wr_rx->qcnt = m_qp->pi_rr_cnt;
#endif
	wr_rx->flags |= M_READ_POSTED;
	errno = 0;
	ret = ibv_post_send(ib_qp, &ib_wr, &bad_wr);
	if (ret)
		goto bail;

	m_qp->pi_rr_cnt++;
	m_qp->post_cnt_rr++;
	MCNTR(smd->md, MCM_QP_READ);

	mlog(0x10, "[%d:%d:%d] WR[%d] %p RR(%d,%d,%d): wr_id %Lx qn %x flgs %x,%x ln %d "
		"r_addr,key %Lx %x to l_addr,key %Lx %x tl %d hd %d, m_idx %x\n",
	     smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, wr_rx->w_idx,
	     wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr, m_qp->pi_rr_cnt,
	     ib_wr.wr_id, ib_qp->qp_num, ib_wr.send_flags, wr_rx->flags, l_len,
	     ib_wr.wr.rdma.remote_addr, ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr,
	     ib_wr.sg_list->lkey, m_qp->wr_tl_r, m_qp->wr_hd_r, wr_rx->m_idx);

	/* wake the TX thread to service the posted RR */
	write(smd->md->mc->tx_pipe[1], "w", sizeof "w");
	return;
bail:
	mpxy_lock(&smd->rblock);
	m_pi_buf_tl(smd, wr_rx->m_idx, wr_rx); /* return buffer slot */
	mpxy_unlock(&smd->rblock);
buf_err:
	/* post (or buffer-head update) failed: mark the WR stalled for retry */
	m_qp->stall_cnt_rr++;
	wr_rx->flags |= M_READ_PAUSED;
	wr_rx->flags &= ~M_READ_POSTED;
	mlog(0, " WARN[%d] (%d,%d): wr[%d] %p RR ibv_post/pi_buf ERR stall (%d,%d,%d,%d):"
		" flgs 0x%x ln %d r_addr,key %Lx %x to l_addr,key %Lx %x"
		" tl %d w_tl %d hd %d\n",
	     smd->entry.tid, ret, errno, wr_rx->w_idx, wr_rx, m_qp->pi_rr_cnt,
	     m_qp->pi_rw_cnt, m_qp->post_sig_cnt, m_qp->stall_cnt_rr,
	     ib_wr.send_flags, l_len, ib_wr.wr.rdma.remote_addr, ib_wr.wr.rdma.rkey,
	     ib_wr.sg_list->addr, ib_wr.sg_list->lkey, m_qp->wr_tl_r,
	     m_qp->wr_tl_r_wt, m_qp->wr_hd_r);
}