コード例 #1
0
int main(int argc, char *argv[])
{
    NNTI_result_t rc;
    selfsend_args *ssa;
    char server_url[NNTI_URL_LEN];

    logger_init(LOG_ERROR, NULL);

    pthread_barrier_init(&barrier2, NULL, 2);
    pthread_barrier_init(&barrier3, NULL, 3);

    rc=NNTI_init(NNTI_DEFAULT_TRANSPORT, NULL, &trans_hdl);
    rc=NNTI_get_url(&trans_hdl, server_url, NNTI_URL_LEN);

    launch_wait_threads();

    pthread_barrier_wait(&barrier3);

    rc=NNTI_connect(
            &trans_hdl,
            server_url,
            5000,
            &server_hdl);

    pthread_barrier_wait(&barrier2);

    rc=NNTI_alloc(&trans_hdl, NNTI_REQUEST_BUFFER_SIZE, 1, NNTI_SEND_SRC, &send_mr);

    ssa=(selfsend_args *)NNTI_BUFFER_C_POINTER(&send_mr);
    ssa->data.int_val   =10;
    ssa->data.float_val =10.0;
    ssa->data.double_val=10.0;
    ssa->chksum=calc_checksum((const char *)&ssa->data, sizeof(data_t));

    rc=NNTI_send(&server_hdl, &send_mr, &recv_mr, &send_wr);
    rc=NNTI_wait(&send_wr, 5000, &send_status);

    rc=NNTI_send(&server_hdl, &send_mr, NULL, &send_wr);
    rc=NNTI_wait(&send_wr, 5000, &send_status);

    pthread_barrier_wait(&barrier3);

    NNTI_free(&send_mr);

    join_wait_threads();

    if (success)
        std::cout << "\nEnd Result: TEST PASSED" << std::endl;
    else
        std::cout << "\nEnd Result: TEST FAILED" << std::endl;

    return (success ? 0 : 1 );
}
コード例 #2
0
static void *do_request_wait(void *args)
{
    NNTI_result_t rc;
    selfsend_args *ssa;

    double wait_time=0.0;

    NNTI_alloc(&trans_hdl, NNTI_REQUEST_BUFFER_SIZE, 10, NNTI_RECV_QUEUE, &queue_mr);

    NNTI_create_work_request(&queue_mr, &queue_wr);

    /* client is waiting for us to initialize */
    pthread_barrier_wait(&barrier3);

    /* the client sends a message here */
    NNTI_wait(&queue_wr, -1, &queue_status);

    ssa=(selfsend_args *)queue_status.start;
    if (ssa->chksum != calc_checksum((const char *)&ssa->data, sizeof(data_t))) {
        fprintf(stdout, "checksum failure in the request thread");
        success=false;
    }

    pthread_barrier_wait(&barrier3);

    NNTI_destroy_work_request(&queue_wr);

    NNTI_free(&queue_mr);

    return(NULL);
}
コード例 #3
0
/*
 * Thread body: allocates a single receive buffer and waits on it twice.
 *
 * The first wait uses RECV_TIMEOUT and its elapsed time must land within
 * 100ms of RECV_TIMEOUT, otherwise the test is marked failed.  The second
 * wait uses a timeout of -1 and the received message's checksum is verified.
 *
 * args is unused.  Always returns NULL; failures are reported by clearing
 * the global `success` flag.
 */
static void *do_recv_wait(void *args)
{
    selfsend_args *ssa;

    double wait_time=0.0;

    NNTI_alloc(&trans_hdl, NNTI_REQUEST_BUFFER_SIZE, 1, NNTI_RECV_DST, &recv_mr);

    NNTI_create_work_request(&recv_mr, &recv_wr);

    /* client is waiting for us to initialize */
    pthread_barrier_wait(&barrier3);

    /* Time the first wait; the check below requires it to take ~RECV_TIMEOUT. */
    wait_time=trios_get_time_ms();
    NNTI_wait(&recv_wr, RECV_TIMEOUT, &recv_status);
    wait_time=trios_get_time_ms()-wait_time;

    // if wait time varies from the timeout by more than 100ms, then fail.
    if ((wait_time < (RECV_TIMEOUT-100)) || (wait_time > (RECV_TIMEOUT+100))) {
        /*
         * %llu with unsigned long long casts is correct on every data model;
         * the previous %lu/uint64_t pairing mismatched wherever uint64_t is
         * not unsigned long.
         */
        fprintf(stdout, "Time to complete NNTI_wait: expected=%llums ; actual=%llums\n",
                (unsigned long long)RECV_TIMEOUT, (unsigned long long)wait_time);
        success=false;
    }

    /* Rendezvous on barrier2 before the second wait. */
    pthread_barrier_wait(&barrier2);

    wait_time=trios_get_time_ms();
    /* the client sends a message here */
    NNTI_wait(&recv_wr, -1, &recv_status);
    wait_time=trios_get_time_ms()-wait_time;

    ssa=(selfsend_args *)recv_status.start;
    if (ssa->chksum != calc_checksum((const char *)&ssa->data, sizeof(data_t))) {
        fprintf(stdout, "checksum failure in the receive thread");
        success=false;
    }

    /* Final rendezvous on barrier3 before releasing the buffer. */
    pthread_barrier_wait(&barrier3);

    NNTI_destroy_work_request(&recv_wr);

    NNTI_free(&recv_mr);

    return(NULL);
}
コード例 #4
0
static void *do_wait(void *args)
{
    NNTI_result_t rc;
    selfsend_args *ssa;

    double wait_time=0.0;

    NNTI_alloc(&copy_trans_hdl, NNTI_REQUEST_BUFFER_SIZE, 10, NNTI_RECV_QUEUE, &queue_mr);

    NNTI_dt_sizeof(&copy_trans_hdl, &queue_mr, &packed_size);
    NNTI_dt_pack(&copy_trans_hdl, &queue_mr, packed_buf, packed_size);
    NNTI_dt_unpack(&copy_trans_hdl, &copy_queue_mr, packed_buf, packed_size);

    NNTI_create_work_request(&copy_queue_mr, &queue_wr);

    NNTI_dt_sizeof(&copy_trans_hdl, &queue_wr, &packed_size);
    NNTI_dt_pack(&copy_trans_hdl, &queue_wr, packed_buf, packed_size);
    NNTI_dt_unpack(&copy_trans_hdl, &copy_queue_wr, packed_buf, packed_size);

    /* client is waiting for us to initialize */
    pthread_barrier_wait(&barrier);

    /* the client sends a message here */
    NNTI_wait(&copy_queue_wr, -1, &queue_status);

    NNTI_dt_sizeof(&copy_trans_hdl, &queue_status, &packed_size);
    NNTI_dt_pack(&copy_trans_hdl, &queue_status, packed_buf, packed_size);
    NNTI_dt_unpack(&copy_trans_hdl, &copy_queue_status, packed_buf, packed_size);

    ssa=(selfsend_args *)copy_queue_status.start;
    if (ssa->chksum != calc_checksum((const char *)&ssa->data, sizeof(data_t))) {
        success=false;
    }

    NNTI_dt_free(&copy_trans_hdl, &copy_queue_status);

    pthread_barrier_wait(&barrier);

    NNTI_dt_free(&copy_trans_hdl, &copy_queue_wr);
    NNTI_destroy_work_request(&queue_wr);

    NNTI_dt_free(&copy_trans_hdl, &copy_queue_mr);
    NNTI_free(&queue_mr);

    return(NULL);
}
コード例 #5
0
ファイル: NntiPerfTest.cpp プロジェクト: eisenhauer/nnti1
void server(void)
{
    NNTI_result_t rc=NNTI_OK;
    NNTI_status_t queue_status;
    NNTI_status_t send_status;
    char *c_ptr;
    void    *packed=NULL;
    int32_t  packed_size=0;


    int num_elements=nclients+(4*nclients*num_sends);
    char *queue_buf=(char *)malloc(num_elements*NNTI_REQUEST_BUFFER_SIZE);
    memset(queue_buf, 0, num_elements*NNTI_REQUEST_BUFFER_SIZE);
    NNTI_register_memory(&trans_hdl, queue_buf, NNTI_REQUEST_BUFFER_SIZE, num_elements, NNTI_RECV_QUEUE, NULL, &queue_mr);

    char *send_buf=(char *)malloc(NNTI_REQUEST_BUFFER_SIZE);
    memset(send_buf, 0, NNTI_REQUEST_BUFFER_SIZE);
    NNTI_register_memory(&trans_hdl, send_buf, NNTI_REQUEST_BUFFER_SIZE, 1, NNTI_SEND_SRC, NULL, &send_mr);

    char *server_ack_buf=(char *)malloc(NNTI_REQUEST_BUFFER_SIZE);
    memset(server_ack_buf, 0, NNTI_REQUEST_BUFFER_SIZE);
    NNTI_register_memory(&trans_hdl, server_ack_buf, NNTI_REQUEST_BUFFER_SIZE, 1, NNTI_RECV_DST, NULL, &server_ack_mr);

    char *get_src_buf=(char *)malloc(nclients*get_size);
    memset(get_src_buf, 0, nclients*get_size);
    NNTI_register_memory(&trans_hdl, get_src_buf, nclients*get_size, 1, NNTI_GET_SRC, NULL, &get_src_mr);

    char *put_dst_buf=(char *)malloc(nclients*put_size);
    memset(put_dst_buf, 0, nclients*put_size);
    NNTI_register_memory(&trans_hdl, put_dst_buf, nclients*put_size, 1, NNTI_PUT_DST, NULL, &put_dst_mr);


    /*
     * Phase 1 - exchange buffers handles
     */
    // wait for the client to send it's recv_mr
    NNTI_wait(&queue_mr, NNTI_RECV_QUEUE, -1, &queue_status);

    c_ptr=(char*)queue_status.start+queue_status.offset;
    buffer_unpack(c_ptr, queue_status.length, &client_ack_mr, (xdrproc_t)&xdr_NNTI_buffer_t);

//    fprint_NNTI_buffer(logger_get_file(), "client_ack_mr",
//            "received client ack hdl", &client_ack_mr);

    // send our server_ack_mr, get_src_mr and put_dst_mr back to the client
    buffer_pack(&server_ack_mr, &packed, &packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);
    if (packed_size > NNTI_REQUEST_BUFFER_SIZE) {
        log_error(nntiperf_debug_level, "buffer_pack() says encoded NNTI_buffer_t is larger than NNTI_REQUEST_BUFFER_SIZE");
    	MPI_Abort(MPI_COMM_WORLD, -10);
    }

    char *ptr=send_buf;
    memcpy(ptr, &packed_size, sizeof(packed_size));
    ptr += sizeof(packed_size);
    memcpy(ptr, packed, packed_size);
    ptr += packed_size;

    buffer_pack_free(packed, packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);

    buffer_pack(&get_src_mr, &packed, &packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);
    if (packed_size > NNTI_REQUEST_BUFFER_SIZE) {
        log_error(nntiperf_debug_level, "buffer_pack() says encoded NNTI_buffer_t is larger than NNTI_REQUEST_BUFFER_SIZE");
    	MPI_Abort(MPI_COMM_WORLD, -10);
    }

    memcpy(ptr, &packed_size, sizeof(packed_size));
    ptr += sizeof(packed_size);
    memcpy(ptr, packed, packed_size);
    ptr += packed_size;

    buffer_pack_free(packed, packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);

    buffer_pack(&put_dst_mr, &packed, &packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);
    if (packed_size > NNTI_REQUEST_BUFFER_SIZE) {
        log_error(nntiperf_debug_level, "buffer_pack() says encoded NNTI_buffer_t is larger than NNTI_REQUEST_BUFFER_SIZE");
    	MPI_Abort(MPI_COMM_WORLD, -10);
    }

    memcpy(ptr, &packed_size, sizeof(packed_size));
    ptr += sizeof(packed_size);
    memcpy(ptr, packed, packed_size);
    ptr += packed_size;

    buffer_pack_free(packed, packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);

    rc=NNTI_send(&queue_status.src, &send_mr, &client_ack_mr);
    if (rc != NNTI_OK) {
        log_error(nntiperf_debug_level, "NNTI_send() returned an error: %d", rc);
        MPI_Abort(MPI_COMM_WORLD, rc);
    }
    rc=NNTI_wait(&send_mr, NNTI_SEND_SRC, 5000, &send_status);
    if (rc != NNTI_OK) {
        log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
        MPI_Abort(MPI_COMM_WORLD, rc);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 2 - client sends sync requests
     */
    for (int i=0;i<nclients*num_sends;i++) {
        rc=NNTI_wait(&queue_mr, NNTI_RECV_QUEUE, 1000, &queue_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 3 - client sends async requests
     */
    for (int i=0;i<nclients*num_sends;i++) {
        rc=NNTI_wait(&queue_mr, NNTI_RECV_QUEUE, 1000, &queue_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 4 - client does sync gets
     */
    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 5 - client does async gets
     */
    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 6 - client does sync puts
     */
    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 7 - client does async puts
     */
    MPI_Barrier(MPI_COMM_WORLD);


    NNTI_unregister_memory(&queue_mr);
    free(queue_buf);

    NNTI_unregister_memory(&send_mr);
    free(send_buf);

    NNTI_unregister_memory(&server_ack_mr);
    free(server_ack_buf);

    NNTI_unregister_memory(&get_src_mr);
    free(get_src_buf);

    NNTI_unregister_memory(&put_dst_mr);
    free(put_dst_buf);

    return;
}
コード例 #6
0
ファイル: NntiPerfTest.cpp プロジェクト: eisenhauer/nnti1
/*
 * Client side of the NNTI performance test.
 *
 * Connects to the server, registers its send, ack, GET-destination and
 * PUT-source buffers, then runs seven phases, each ended by an MPI barrier:
 *   1. send the client's ack handle and receive the server's ack, GET-src
 *      and PUT-dst handles;
 *   2. timed sync requests (send + wait per iteration);
 *   3. timed async requests (all sends issued, then all waits);
 *   4. timed sync GETs, preceded by an untimed warm-up pass;
 *   5. timed async GETs, preceded by an untimed warm-up pass;
 *   6. timed sync PUTs, preceded by an untimed warm-up pass;
 *   7. timed async PUTs, preceded by an untimed warm-up pass.
 *
 * Rates are printed to std::cout.  Any checked NNTI error aborts the MPI job.
 * All buffers are unregistered and freed before returning.
 */
void client(void) {
    NNTI_result_t rc=NNTI_OK;
    NNTI_status_t rdma_status;
    NNTI_status_t send_status;
    NNTI_status_t client_ack_status;
    void    *packed=NULL;
    int32_t  packed_size=0;

    double op_timer;

    //    Teuchos::oblackholestream blackhole;
    //    std::ostream &out = ( rank == 1 ? std::cout : blackhole );
    std::ostream &out = std::cout;

    NNTI_connect(&trans_hdl, url, 5000, &server_hdl);

    char *send_buf=(char *)malloc(NNTI_REQUEST_BUFFER_SIZE);
    memset(send_buf, 0, NNTI_REQUEST_BUFFER_SIZE);
    NNTI_register_memory(&trans_hdl, send_buf, NNTI_REQUEST_BUFFER_SIZE, 1, NNTI_SEND_SRC, NULL, &send_mr);

    char *client_ack_buf=(char *)malloc(NNTI_REQUEST_BUFFER_SIZE);
    memset(client_ack_buf, 0, NNTI_REQUEST_BUFFER_SIZE);
    NNTI_register_memory(&trans_hdl, client_ack_buf, NNTI_REQUEST_BUFFER_SIZE, 1, NNTI_RECV_DST, NULL, &client_ack_mr);

    char *get_dst_buf=(char *)malloc(get_size);
    memset(get_dst_buf, 0, get_size);
    NNTI_register_memory(&trans_hdl, get_dst_buf, get_size, 1, NNTI_GET_DST, NULL, &get_dst_mr);

    char *put_src_buf=(char *)malloc(put_size);
    memset(put_src_buf, 0, put_size);
    NNTI_register_memory(&trans_hdl, put_src_buf, put_size, 1, NNTI_PUT_SRC, NULL, &put_src_mr);

    /*
     * Phase 1 - exchange buffer handles
     */
    buffer_pack(&client_ack_mr, &packed, &packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);
    if (packed_size > NNTI_REQUEST_BUFFER_SIZE) {
        log_error(nntiperf_debug_level, "buffer_pack() says encoded NNTI_buffer_t is larger than NNTI_REQUEST_BUFFER_SIZE");
        MPI_Abort(MPI_COMM_WORLD, -10);
    }

    // send the server the recv_mr so it can send back its ack_mr
    memcpy(send_buf, packed, packed_size);

    // From here on `packed` has been handed to buffer_pack_free() and must not
    // be reused as a scratch buffer.
    buffer_pack_free(packed, packed_size, (xdrproc_t)&xdr_NNTI_buffer_t);

    rc=NNTI_send(&server_hdl, &send_mr, NULL);
    if (rc != NNTI_OK) {
        log_error(nntiperf_debug_level, "NNTI_send() returned an error: %d", rc);
        MPI_Abort(MPI_COMM_WORLD, rc);
    }
    rc=NNTI_wait(&send_mr, NNTI_SEND_SRC, 5000, &send_status);
    if (rc != NNTI_OK) {
        log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
        MPI_Abort(MPI_COMM_WORLD, rc);
    }

    // wait for the server to send back its recv_mr
    rc=NNTI_wait(&client_ack_mr, NNTI_RECV_DST, -1, &client_ack_status);
    if (rc != NNTI_OK) {
        // client_ack_status is only meaningful after a successful wait
        log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
        MPI_Abort(MPI_COMM_WORLD, rc);
    }

    // The reply holds three records laid out as [int32 length][encoded bytes].
    // Each record is decoded straight out of the receive buffer, the same way
    // server() decodes the client's handle.
    char *ptr=(char*)client_ack_status.start+client_ack_status.offset;

    memcpy(&packed_size, ptr, sizeof(packed_size));
    ptr += sizeof(packed_size);
    buffer_unpack(ptr, packed_size, &server_ack_mr, (xdrproc_t)&xdr_NNTI_buffer_t);
    ptr += packed_size;

    memcpy(&packed_size, ptr, sizeof(packed_size));
    ptr += sizeof(packed_size);
    buffer_unpack(ptr, packed_size, &get_src_mr, (xdrproc_t)&xdr_NNTI_buffer_t);
    ptr += packed_size;

    memcpy(&packed_size, ptr, sizeof(packed_size));
    ptr += sizeof(packed_size);
    buffer_unpack(ptr, packed_size, &put_dst_mr, (xdrproc_t)&xdr_NNTI_buffer_t);
    ptr += packed_size;

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 2 - test sync request performance
     */
    op_timer=trios_get_time();
    for (int i=0;i<num_sends;i++) {
        rc=NNTI_send(&server_hdl, &send_mr, NULL);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_send() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
        rc=NNTI_wait(&send_mr, NNTI_SEND_SRC, 1000, &send_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    op_timer=trios_get_time()-op_timer;
    if (num_sends > 0) {
        out << " sync requests per second == " << num_sends/op_timer << std::endl;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 3 - test async request performance
     */
    op_timer=trios_get_time();
    // issue every send first ...
    for (int i=0;i<num_sends;i++) {
        rc=NNTI_send(&server_hdl, &send_mr, NULL);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_send() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    // ... then collect one completion per send
    for (int i=0;i<num_sends;i++) {
        rc=NNTI_wait(&send_mr, NNTI_SEND_SRC, 1000, &send_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    op_timer=trios_get_time()-op_timer;
    if (num_sends > 0) {
        out << "async requests per second == " << num_sends/op_timer << std::endl;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 4 - test sync get performance
     */
    // warm up the pipes
    for (int i=0;i<num_gets;i++) {
        // each client reads its own rank-sized slice of the server's region
        rc=NNTI_get(&get_src_mr, client_rank*get_size, get_size, &get_dst_mr, 0);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_get() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
        rc=NNTI_wait(&get_dst_mr, NNTI_GET_DST, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }

    op_timer=trios_get_time();
    for (int i=0;i<num_gets;i++) {
        rc=NNTI_get(&get_src_mr, client_rank*get_size, get_size, &get_dst_mr, 0);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_get() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
        rc=NNTI_wait(&get_dst_mr, NNTI_GET_DST, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    op_timer=trios_get_time()-op_timer;
    if (num_gets > 0) {
        out << " sync get (" << get_size << " byte transfer) == " << (double)(num_gets*get_size)/one_mb/op_timer << " MBps" << std::endl;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 5 - test async get performance
     */
    // warm up the pipes
    for (int i=0;i<num_gets;i++) {
        rc=NNTI_get(&get_src_mr, client_rank*get_size, get_size, &get_dst_mr, 0);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_get() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    for (int i=0;i<num_gets;i++) {
        rc=NNTI_wait(&get_dst_mr, NNTI_GET_DST, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }

    op_timer=trios_get_time();
    for (int i=0;i<num_gets;i++) {
        rc=NNTI_get(&get_src_mr, client_rank*get_size, get_size, &get_dst_mr, 0);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_get() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    for (int i=0;i<num_gets;i++) {
        rc=NNTI_wait(&get_dst_mr, NNTI_GET_DST, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    op_timer=trios_get_time()-op_timer;
    if (num_gets > 0) {
        out << "async get (" << get_size << " byte transfer) == " << (double)(num_gets*get_size)/one_mb/op_timer << " MBps" << std::endl;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 6 - test sync put performance
     */
    // warm up the pipes
    for (int i=0;i<num_puts;i++) {
        // each client writes its own rank-sized slice of the server's region
        rc=NNTI_put(&put_src_mr, 0, put_size, &put_dst_mr, client_rank*put_size);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_put() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
        rc=NNTI_wait(&put_src_mr, NNTI_PUT_SRC, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }

    op_timer=trios_get_time();
    for (int i=0;i<num_puts;i++) {
        rc=NNTI_put(&put_src_mr, 0, put_size, &put_dst_mr, client_rank*put_size);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_put() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
        rc=NNTI_wait(&put_src_mr, NNTI_PUT_SRC, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    op_timer=trios_get_time()-op_timer;
    if (num_puts > 0) {
        out << " sync put (" << put_size << " byte transfer) == " << (double)(num_puts*put_size)/one_mb/op_timer << " MBps" << std::endl;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Phase 7 - test async put performance
     */
    // warm up the pipes
    for (int i=0;i<num_puts;i++) {
        rc=NNTI_put(&put_src_mr, 0, put_size, &put_dst_mr, client_rank*put_size);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_put() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    for (int i=0;i<num_puts;i++) {
        rc=NNTI_wait(&put_src_mr, NNTI_PUT_SRC, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }

    op_timer=trios_get_time();
    for (int i=0;i<num_puts;i++) {
        rc=NNTI_put(&put_src_mr, 0, put_size, &put_dst_mr, client_rank*put_size);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_put() returned an error: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    for (int i=0;i<num_puts;i++) {
        rc=NNTI_wait(&put_src_mr, NNTI_PUT_SRC, 1000, &rdma_status);
        if (rc != NNTI_OK) {
            log_error(nntiperf_debug_level, "NNTI_wait() did not return NNTI_OK: %d", rc);
            MPI_Abort(MPI_COMM_WORLD, rc);
        }
    }
    op_timer=trios_get_time()-op_timer;
    if (num_puts > 0) {
        out << "async put (" << put_size << " byte transfer) == " << (double)(num_puts*put_size)/one_mb/op_timer << " MBps" << std::endl;
    }

    MPI_Barrier(MPI_COMM_WORLD);


    /* Unregister each region before releasing its backing memory. */
    NNTI_unregister_memory(&send_mr);
    free(send_buf);

    NNTI_unregister_memory(&client_ack_mr);
    free(client_ack_buf);

    NNTI_unregister_memory(&get_dst_mr);
    free(get_dst_buf);

    NNTI_unregister_memory(&put_src_mr);
    free(put_src_buf);

    return;
}