Exemple #1
0
/*
 * OSP_NbPutS bandwidth benchmark.
 *
 * Rank 0 issues ITERATIONS + SKIP non-blocking strided puts to rank 1 for
 * every power-of-two xdim x ydim sub-block of a MAX_XDIM x MAX_YDIM array of
 * doubles and prints one result line per block shape.  The first SKIP puts
 * of each shape are warm-up and are excluded from the timed interval.  All
 * other ranks only take part in the collective calls.
 *
 * The benchmark addresses rank 1 directly, so it needs at least two
 * processes.
 *
 * Returns 0.
 */
int main()
{

    int i, rank, nranks, dest;
    int xdim, ydim;
    long bufsize;
    double **buffer;
    double t_start = 0.0, t_stop, t_total, d_total, bw;
    int count[2], src_stride, trg_stride, stride_level;
    OSP_handle_t osp_handle;

    OSP_Initialize(OSP_THREAD_SINGLE);

    rank = OSP_Process_id(OSP_GROUP_WORLD);
    nranks = OSP_Process_total(OSP_GROUP_WORLD);

    OSP_Barrier_group(OSP_GROUP_WORLD);

    /* One segment per rank; buffer[] holds every rank's segment address */
    bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
    buffer = (double **) malloc(sizeof(double *) * nranks);
    OSP_Alloc_segment((void **) &(buffer[rank]), bufsize);
    OSP_Exchange_segments(OSP_GROUP_WORLD, (void **) buffer);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    OSP_Allocate_handle(&osp_handle);

    OSP_Barrier_group(OSP_GROUP_WORLD);

    if (rank == 0)
    {
        printf("OSP_PutS Bandwidth in MBPS \n");
        printf("%30s %22s \n", "Dimensions(array of doubles)", "Latency");
        fflush(stdout);

        dest = 1;

        /* Both sides use the full row length of the array as the stride */
        src_stride = MAX_YDIM * sizeof(double);
        trg_stride = MAX_YDIM * sizeof(double);
        stride_level = 1;

        for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2)
        {

            count[1] = xdim;

            for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2)
            {
                /* Large enough for two full-width ints, 'X' and the NUL */
                char temp[32];

                count[0] = ydim * sizeof(double);

                for (i = 0; i < ITERATIONS + SKIP; i++)
                {

                    if (i == SKIP)
                          t_start = OSP_Time_seconds();

                    /*
                     * A put reads from the local segment and writes to the
                     * remote one, matching the argument order of the other
                     * PutS benchmarks in this file.
                     */
                    OSP_NbPutS(dest,
                              stride_level,
                              count,
                              (void *) buffer[rank],
                              &src_stride,
                              (void *) buffer[dest],
                              &trg_stride,
                              osp_handle);

                }
                OSP_Wait_handle(osp_handle);
                t_stop = OSP_Time_seconds();
                OSP_Flush(dest);

                snprintf(temp, sizeof temp, "%dX%d", xdim, ydim);
                t_total = t_stop - t_start;
                /*
                 * Compute in floating point: integer division truncated the
                 * transferred volume to whole MiB (0 for small blocks).
                 */
                d_total = ((double) xdim * ydim * sizeof(double) * ITERATIONS)
                        / (1024.0 * 1024.0);
                bw = d_total / t_total;
                printf("%30s %20.2f \n", temp, bw);
                fflush(stdout);

            }

        }

    }

    OSP_Barrier_group(OSP_GROUP_WORLD);

    OSP_Release_segments(OSP_GROUP_WORLD, (void *) buffer[rank]);
    OSP_Free_segment((void *) buffer[rank]);

    /* The address table itself came from malloc */
    free(buffer);

    OSP_Finalize();

    return 0;
}
Exemple #2
0
/*
 * OSP_PutS latency benchmark with data validation.
 *
 * For every power-of-two xdim x ydim sub-block of a MAX_XDIM x MAX_YDIM
 * array of doubles, rank 0 times two series of ITERATIONS + SKIP strided
 * puts to rank 1 (the first SKIP are warm-up): one series with a single
 * flush after the timed loop (local completion) and one with a flush after
 * every put (remote completion).  Every other rank checks that the block it
 * owns holds rank 0's fill value after each series and then restores its
 * own fill value.
 *
 * Rank 0 performs four barriers per block shape; the validating branch must
 * perform the same four.
 *
 * Returns 0 on success, -1 from a rank whose validation fails.
 */
int main() {

   int i, j, rank, nranks;
   int xdim, ydim;
   long bufsize;
   double **buffer;
   double t_start = 0.0, t_stop;
   int count[2], src_stride, trg_stride, stride_level, peer;
   double expected, actual;
   
   OSP_Initialize(OSP_THREAD_SINGLE); 

   rank = OSP_Process_id(OSP_GROUP_WORLD);
   nranks = OSP_Process_total(OSP_GROUP_WORLD);

   buffer = (double **) malloc (sizeof(double *) * nranks); 

   OSP_Barrier_group(OSP_GROUP_WORLD);

   bufsize = MAX_XDIM * MAX_YDIM * sizeof(double);
   OSP_Alloc_segment((void **) &(buffer[rank]), bufsize);
   OSP_Exchange_segments(OSP_GROUP_WORLD, (void **) buffer);

   for(i=0; i< bufsize/sizeof(double); i++) {
       *(buffer[rank] + i) = 1.0 + rank;
   }

   /*
    * Every rank must have finished filling its segment before rank 0 starts
    * writing into it; otherwise a late fill can overwrite put data and make
    * the first validation fail.
    */
   OSP_Barrier_group(OSP_GROUP_WORLD);

   if(rank == 0) {
     printf("OSP_PutS Latency - local and remote completions - in usec \n");
     /* Three column titles need three conversions */
     printf("%30s %22s %22s \n", "Dimensions(array of doubles)", "Latency-LocalCompeltion", "Latency-RemoteCompletion");
     fflush(stdout);
   }

   src_stride = MAX_YDIM*sizeof(double);
   trg_stride = MAX_YDIM*sizeof(double);
   stride_level = 1;

   for(xdim=1; xdim<=MAX_XDIM; xdim*=2) {

      count[1] = xdim;

      for(ydim=1; ydim<=MAX_YDIM; ydim*=2) {

        count[0] = ydim*sizeof(double); 
      
        if(rank == 0) 
        {
          /* Large enough for two full-width ints, 'X' and the NUL */
          char temp[32];

          peer = 1;          
 
          /* Series 1: flush once, after the timed loop */
          for(i=0; i<ITERATIONS+SKIP; i++) { 

             if(i == SKIP)
                 t_start = OSP_Time_seconds();              

             OSP_PutS(peer, stride_level, count, (void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride); 
 
          }
          t_stop = OSP_Time_seconds();
          OSP_Flush(peer);
          snprintf(temp, sizeof temp, "%dX%d", xdim, ydim);
          printf("%30s %20.2f", temp, ((t_stop-t_start)*1000000)/ITERATIONS);
          fflush(stdout);

          /* Barrier 1 lets the peer validate, barrier 2 waits for its reset */
          OSP_Barrier_group(OSP_GROUP_WORLD);

          OSP_Barrier_group(OSP_GROUP_WORLD);

          /* Series 2: flush after every put */
          for(i=0; i<ITERATIONS+SKIP; i++) {
  
             if(i == SKIP)
                t_start = OSP_Time_seconds();

             OSP_PutS(peer, stride_level, count, (void *) buffer[rank], &src_stride, (void *) buffer[peer], &trg_stride);
             OSP_Flush(peer);

          }
          t_stop = OSP_Time_seconds();
          printf("%20.2f \n", ((t_stop-t_start)*1000000)/ITERATIONS);
          fflush(stdout);

          OSP_Barrier_group(OSP_GROUP_WORLD);

          OSP_Barrier_group(OSP_GROUP_WORLD);
        }
        else
        {
            peer = 0;

            /* Rank 0 filled its segment with 1.0 + its own rank */
            expected = (1.0 + (double) peer);

            OSP_Barrier_group(OSP_GROUP_WORLD);

            /* Validate series 1 */
            for(i=0; i<xdim; i++)
            {
               for(j=0; j<ydim; j++)
               {
                   actual = *(buffer[rank] + i*MAX_YDIM + j);
                   if(actual != expected)
                   {
                      printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                              i, j, expected, actual);
                      fflush(stdout);
                      return -1;
                    }
                }
            }

            /* Restore the local fill value for the next series */
            for(i=0; i< bufsize/sizeof(double); i++) {
                *(buffer[rank] + i) = 1.0 + rank;
            }

            OSP_Barrier_group(OSP_GROUP_WORLD);

            OSP_Barrier_group(OSP_GROUP_WORLD);

            /* Validate series 2 */
            for(i=0; i<xdim; i++)
            {
               for(j=0; j<ydim; j++)
               {
                   actual = *(buffer[rank] + i*MAX_YDIM + j);
                   if(actual != expected)
                   {
                      printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                              i, j, expected, actual);
                      fflush(stdout);
                      return -1;
                    }
                }
            }

            for(i=0; i< bufsize/sizeof(double); i++) {
                *(buffer[rank] + i) = 1.0 + rank;
            }

            OSP_Barrier_group(OSP_GROUP_WORLD);

        }
        
      }

   }

   OSP_Release_segments(OSP_GROUP_WORLD, (void *) buffer[rank]);
   OSP_Free_segment((void *) buffer[rank]);

   /* The address table itself came from malloc */
   free(buffer);

   OSP_Finalize();

   return 0;
}
Exemple #3
0
/*********************************************************************
 * @fn      AlgorithmTask
 *          Task entry point for the sensor algorithms. It initializes
 *          the OSP library, registers the accelerometer, magnetometer
 *          and gyroscope as input sensors, then loops forever handing
 *          each received sensor message to the library and running its
 *          foreground and background processing.
 *
 * @param   none
 *
 * @return  none
 *
 **********************************************************************/
ASF_TASK  void AlgorithmTask (ASF_TASK_ARG)
{
    MessageBuffer *pInMsg = NULLP;
    OSP_STATUS_t OSP_Status;
    int fgPasses;

    /* Report the library version first */
    OSP_GetLibraryVersion(&version);
    D1_printf("OSP Version: %s\r\n", version->VersionString);

    /* Create the critical-section mutex */
    mutex_id = osMutexCreate(osMutex(mutexCritSection));

    OSP_Status = OSP_Initialize(&gSystemDesc);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "OSP_Initialize Failed");
    OSP_SetCalibrationConfig( 0x1);     // disable rotational cal.

    D0_printf("--Alg Task %i\r\n", __LINE__);

    /* Input sensors: accelerometer, magnetometer, gyroscope (in that order) */
    OSP_Status = OSP_RegisterInputSensor(&_AccSensDesc, &_AccHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "OSP_RegisterInputSensor (accel) Failed");

    OSP_Status = OSP_RegisterInputSensor(&_MagSensDesc, &_MagHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "OSP_RegisterInputSensor (mag) Failed");

    OSP_Status = OSP_RegisterInputSensor(&_GyroSensDesc, &_GyroHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "OSP_RegisterInputSensor (gyro) Failed");


#if 0

    SENSOR_SUBSCRIBE(SENSOR_STEP_COUNTER);

    SENSOR_SUBSCRIBE(SENSOR_STEP_DETECTOR);
    SENSOR_SUBSCRIBE(SENSOR_SIGNIFICANT_MOTION);


    SENSOR_SUBSCRIBE(SENSOR_GYROSCOPE_UNCALIBRATED);
    SENSOR_SUBSCRIBE(SENSOR_MAGNETIC_FIELD_UNCALIBRATED);

    SENSOR_SUBSCRIBE(SENSOR_GYROSCOPE);
    SENSOR_SUBSCRIBE(SENSOR_ACCELEROMETER);
    SENSOR_SUBSCRIBE(SENSOR_MAGNETIC_FIELD);
    SENSOR_SUBSCRIBE(SENSOR_ORIENTATION);
    SENSOR_SUBSCRIBE(SENSOR_GRAVITY);
    SENSOR_SUBSCRIBE(SENSOR_LINEAR_ACCELERATION);
    SENSOR_SUBSCRIBE(SENSOR_ROTATION_VECTOR);
    SENSOR_SUBSCRIBE(SENSOR_GAME_ROTATION_VECTOR);
    SENSOR_SUBSCRIBE(SENSOR_GEOMAGNETIC_ROTATION_VECTOR);

    // Subscribing private sensor results
    PRIVATE_SENSOR_SUBSCRIBE(AP_PSENSOR_ACCELEROMETER_UNCALIBRATED);
#endif

    D0_printf("%s: --Alg Task init done\r\n", __func__);

    for (;;) {
        ASFReceiveMessage(ALGORITHM_TASK_ID, &pInMsg);

        /* Toggle the LED whenever the sample counter is a multiple of 64 */
        if ((mycount % 64) == 0) {
            LED_Toggle(LED_GREEN);
        }

        switch (pInMsg->msgId) {
        case MSG_MAG_DATA:
        //    SendBgTrigger();
        /* fallthrough: magnetometer data is handled like the other sensors */
        case MSG_ACC_DATA:
        case MSG_GYRO_DATA:
            mycount++;
            HandleSensorData(pInMsg);

            /* Bump clock speed while processing? HY-DBG */
            /*
             * Run foreground processing until the library reports idle,
             * giving up (with a debug message) once more than five passes
             * have been made.
             */
            for (fgPasses = 1; ; fgPasses++) {
                OSP_Status = OSP_DoForegroundProcessing();

                ASF_assert(OSP_Status != OSP_STATUS_UNSPECIFIED_ERROR);
                if (fgPasses > 5) {
                    D0_printf("%s:%i Taking too long\r\n", __func__, __LINE__);
                    break;
                }
                if (OSP_Status == OSP_STATUS_IDLE) {
                    break;
                }
            }

            /* DBG:
             * Background processing is drained here as well, because the
             * background task does not seem to run often enough. */
            while (OSP_DoBackgroundProcessing() != OSP_STATUS_IDLE) {
                /* keep going until idle */
            }

            break;
        case MSG_PRESS_DATA:
            PressureDataResultCallback(&pInMsg->msg.msgPressData);
            break;
        default:
            /* Unhandled messages */
            D1_printf("Alg-FG:!!!UNHANDLED MESSAGE:%d!!!\r\n", pInMsg->msgId);
            break;
        }

        ASFDeleteMessage( ALGORITHM_TASK_ID, &pInMsg );
#ifdef DEBUG_TEST_SENSOR_SUBSCRIPTION
        // Testing subscribe and unsubscribe sensors
        DebugTestSensorSubscription();
#endif
    }
}
/****************************************************************************************************
 * @fn      AlgorithmTask
 *          Task entry point for the sensor algorithms. It initializes the OSP library, registers
 *          the three input sensors, subscribes to five output results, then loops forever feeding
 *          received sensor messages to the library and running foreground processing until idle.
 *
 * @param   none
 *
 * @return  none
 *
 ***************************************************************************************************/
ASF_TASK  void AlgorithmTask ( ASF_TASK_ARG )
{
    MessageBuffer *pInMsg = NULLP;
    osp_status_t OSP_Status;

    /* Report the library version first */
    OSP_GetVersion(&version);
    D1_printf("OSP Version: %s\r\n", version->VersionString);

    OSP_Status = OSP_Initialize(&gSystemDesc);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_Initialize Failed");

    /* Input sensors: accelerometer, magnetometer, gyroscope (in that order) */
    OSP_Status = OSP_RegisterInputSensor(&_AccSensDesc, &_AccHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_RegisterSensor (accel) Failed");

    OSP_Status = OSP_RegisterInputSensor(&_MagSensDesc, &_MagHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_RegisterSensor (mag) Failed");

    OSP_Status = OSP_RegisterInputSensor(&_GyroSensDesc, &_GyroHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_RegisterSensor (gyro) Failed");

    /* Output results: step counter, significant motion, uncalibrated accel/mag/gyro */
    OSP_Status =  OSP_SubscribeOutputSensor(&stepCounterRequest, &_stepCounterHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_SubscribeResult (SENSOR_STEP_COUNTER) Failed");

    OSP_Status =  OSP_SubscribeOutputSensor(&sigMotionRequest, &_sigMotionHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_SubscribeResult (SENSOR_CONTEXT_DEVICE_MOTION) Failed");

    OSP_Status =  OSP_SubscribeOutputSensor(&UnCalAccelRequest, &_unCalAccelHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_SubscribeResult (SENSOR_ACCELEROMETER) Failed");

    OSP_Status =  OSP_SubscribeOutputSensor(&UnCalMagRequest, &_unCalMagHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_SubscribeResult (SENSOR_MAGNETIC_FIELD) Failed");

    OSP_Status =  OSP_SubscribeOutputSensor(&UnCalGyroRequest, &_unCalGyroHandle);
    ASF_assert_msg(OSP_STATUS_OK == OSP_Status, "SensorManager: OSP_SubscribeResult (SENSOR_GYROSCOPE) Failed");

    for (;;)
    {
        ASFReceiveMessage( ALGORITHM_TASK_ID, &pInMsg );

        switch (pInMsg->msgId)
        {
        case MSG_MAG_DATA:
            /* Magnetometer samples additionally trigger the background work */
            SendBgTrigger();
            /* fallthrough */
        case MSG_ACC_DATA:
        case MSG_GYRO_DATA:
            HandleSensorData(pInMsg);

            /* Keep running foreground computation until the library is idle */
            for (;;)
            {
                OSP_Status = OSP_DoForegroundProcessing();
                ASF_assert(OSP_Status != OSP_STATUS_ERROR);
                if (OSP_Status == OSP_STATUS_IDLE)
                {
                    break;
                }
            }
            break;

        default:
            /* Unhandled messages */
            D1_printf("Alg-FG:!!!UNHANDLED MESSAGE:%d!!!\r\n", pInMsg->msgId);
            break;
        }
    }
}
/*
 * OSP_PutAcc latency benchmark with data validation.
 *
 * For every power-of-two message size from sizeof(double) up to (but not
 * including) MAX_MSG_SIZE, rank 0 times two series of ITERATIONS + SKIP
 * accumulate operations onto rank 1 (the first SKIP are warm-up): one with a
 * single flush after the timed loop (local completion) and one with a flush
 * after every operation (remote completion).  Each operation uses its own
 * msgsize-byte slot of the segment.  Every other rank checks after each
 * series that its elements equal its own fill value plus scaling times
 * rank 0's fill value, then restores its fill value.
 *
 * Rank 0 performs four barriers per message size; the validating branch
 * must perform the same four.
 *
 * Returns 0 on success, -1 from a rank whose validation fails.
 */
int main()
{

    size_t i, rank, nranks, msgsize, peer;
    long bufsize;
    double **buffer;
    double scaling;
    double expected;
    double t_start = 0.0, t_stop;

    OSP_Initialize(OSP_THREAD_SINGLE);

    rank = OSP_Process_id(OSP_GROUP_WORLD);
    nranks = OSP_Process_total(OSP_GROUP_WORLD);

    bufsize = MAX_MSG_SIZE * (ITERATIONS + SKIP);
    buffer = (double **) malloc(sizeof(double *) * nranks);
    OSP_Alloc_segment((void **) &(buffer[rank]), bufsize);
    OSP_Exchange_segments(OSP_GROUP_WORLD, (void **) buffer);

    if (rank == 0)
    {
        printf("OSP_PutAcc Latency in usec \n");
        printf("%20s %22s %22s\n",
               "Message Size",
               "Local Completion",
               "Remote Completion");
        fflush(stdout);
    }

    for (i = 0; i < (((ITERATIONS + SKIP) * MAX_MSG_SIZE) / sizeof(double)); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }
    scaling = 2.0;

    OSP_Barrier_group(OSP_GROUP_WORLD);

    for (msgsize = sizeof(double); msgsize < MAX_MSG_SIZE; msgsize *= 2)
    {

        if (rank == 0)
        {

            peer = 1;

            /** Local Completion **/
            for (i = 0; i < ITERATIONS + SKIP; i++)
            {

                if (i == SKIP) t_start = OSP_Time_seconds();

                OSP_PutAcc(peer,
                          (void *) ((size_t) buffer[rank] + (size_t)(i
                                  * msgsize)),
                          (void *) ((size_t) buffer[peer] + (size_t)(i
                                  * msgsize)),
                          msgsize,
                          OSP_DOUBLE,
                          (void *) &scaling);

            }
            t_stop = OSP_Time_seconds();
            OSP_Flush(1);
            /* msgsize is a size_t, so it needs %zu rather than %d */
            printf("%20zu %20.2f ", msgsize, ((t_stop - t_start) * 1000000)
                    / ITERATIONS);
            fflush(stdout);

            /* Barrier 1 lets the peer validate, barrier 2 waits for its reset */
            OSP_Barrier_group(OSP_GROUP_WORLD);

            OSP_Barrier_group(OSP_GROUP_WORLD);

            /** Remote Completion **/
            for (i = 0; i < ITERATIONS + SKIP; i++)
            {

                if (i == SKIP) t_start = OSP_Time_seconds();

                OSP_PutAcc(1, (void *) ((size_t) buffer[0] + (size_t)(i
                        * msgsize)), (void *) ((size_t) buffer[1] + (size_t)(i
                        * msgsize)), msgsize, OSP_DOUBLE, (void *) &scaling);
                OSP_Flush(1);

            }
            t_stop = OSP_Time_seconds();
            printf("%20.2f \n", ((t_stop - t_start) * 1000000) / ITERATIONS);
            fflush(stdout);

            OSP_Barrier_group(OSP_GROUP_WORLD);

            OSP_Barrier_group(OSP_GROUP_WORLD);

        }
        else
        {

            peer = 0;

            /* Own fill value plus the scaled fill value of rank 0 */
            expected = (1.0 + rank) + scaling * (1.0 + peer);

            OSP_Barrier_group(OSP_GROUP_WORLD);

            /** Data Validation **/
            for (i = 0; i < (((ITERATIONS + SKIP) * msgsize) / sizeof(double)); i++)
            {
                if (*(buffer[rank] + i) != expected)
                {
                    printf("Data validation failed At displacement : %zu Expected : %f Actual : %f \n",
                           i,
                           expected,
                           *(buffer[rank] + i));
                    fflush(stdout);
                    return -1;
                }
            }

            /* Restore the local fill value for the next series */
            for (i = 0; i < (((ITERATIONS + SKIP) * MAX_MSG_SIZE)
                    / sizeof(double)); i++)
            {
                *(buffer[rank] + i) = 1.0 + rank;
            }

            OSP_Barrier_group(OSP_GROUP_WORLD);

            OSP_Barrier_group(OSP_GROUP_WORLD);

            /** Data Validation **/
            for (i = 0; i < (((ITERATIONS + SKIP) * msgsize) / sizeof(double)); i++)
            {
                if (*(buffer[rank] + i) != expected)
                {
                    printf("Data validation failed At displacement : %zu Expected : %f Actual : %f \n",
                           i,
                           expected,
                           *(buffer[rank] + i));
                    fflush(stdout);
                    return -1;
                }
            }

            for (i = 0; i < (((ITERATIONS + SKIP) * MAX_MSG_SIZE)
                    / sizeof(double)); i++)
            {
                *(buffer[rank] + i) = 1.0 + rank;
            }

            OSP_Barrier_group(OSP_GROUP_WORLD);

        }

    }

    OSP_Release_segments(OSP_GROUP_WORLD, buffer[rank]);

    /* The address table itself came from malloc */
    free(buffer);

    OSP_Finalize();

    return 0;
}
Exemple #6
0
/*
 * OSP_NbGetS bandwidth benchmark with data validation.
 *
 * For every power-of-two dim, rank 0 issues (MAX_DIM * MAX_DIM) / (dim * dim)
 * non-blocking strided gets of a dim x dim block of doubles from rank 1,
 * waits on the shared handle, prints one result line, checks that the
 * fetched block holds rank 1's fill value and restores its own fill value.
 * All other ranks only take part in the collective calls.
 *
 * The benchmark addresses rank 1 directly, so it needs at least two
 * processes.
 *
 * Returns 0 on success, -1 if validation fails on rank 0.
 */
int main()
{

    int i, j, rank, nranks, dest;
    int dim;
    long bufsize;
    double **buffer;
    int iterations;
    double t_start, t_stop, t_total, d_total, bw;
    int count[2], src_stride, trg_stride, stride_level;
    OSP_handle_t osp_handle;

    OSP_Initialize(OSP_THREAD_SINGLE);

    rank = OSP_Process_id(OSP_GROUP_WORLD);
    nranks = OSP_Process_total(OSP_GROUP_WORLD);

    OSP_Barrier_group(OSP_GROUP_WORLD);

    /* One segment per rank; buffer[] holds every rank's segment address */
    bufsize = MAX_DIM * MAX_DIM * sizeof(double);
    buffer = (double **) malloc(sizeof(double *) * nranks);
    OSP_Alloc_segment((void **) &(buffer[rank]), bufsize);
    OSP_Exchange_segments(OSP_GROUP_WORLD, (void **) buffer);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    OSP_Allocate_handle(&osp_handle);

    OSP_Barrier_group(OSP_GROUP_WORLD);

    if (rank == 0)
    {
        printf("OSP_GetS Bandwidth in MBPS \n");
        printf("%30s %30s %22s \n", "MsgSize", "Dimensions(array of doubles)", "Latency");
        fflush(stdout);

        dest = 1;

        /* Both sides use the full row length of the array as the stride */
        src_stride = MAX_DIM * sizeof(double);
        trg_stride = MAX_DIM * sizeof(double);
        stride_level = 1;

        for (dim = 1; dim <= MAX_DIM; dim *= 2)
        {
            /* Large enough for two full-width ints, 'X' and the NUL */
            char temp[32];

            count[0] = dim*sizeof(double);
            count[1] = dim;

            /* Fewer repetitions for larger blocks: constant total volume */
            iterations = (MAX_DIM * MAX_DIM)/(dim*dim);

            t_start = OSP_Time_seconds();
            for (i = 0; i < iterations; i++)
            {

                OSP_NbGetS(1,
                          stride_level,
                          count,
                          (void *) buffer[dest],
                          &src_stride,
                          (void *) buffer[rank],
                          &trg_stride,
                          osp_handle);

            }
            OSP_Wait_handle(osp_handle);
            t_stop = OSP_Time_seconds();

            snprintf(temp, sizeof temp, "%dX%d", dim, dim);
            t_total = t_stop - t_start;
            /*
             * Compute in floating point: integer division truncated the
             * transferred volume to whole MiB.
             */
            d_total = ((double) dim * dim * sizeof(double) * iterations)
                    / (1024.0 * 1024.0);
            bw = d_total/t_total;
            /* The message size is a size_t expression, so it needs %zu */
            printf("%30zu %30s %20.2f \n", dim*dim*sizeof(double), temp, bw);
            fflush(stdout);

            for (i = 0; i < dim; i++)
            {
                for (j = 0; j < dim; j++)
                {
                    if (*(buffer[rank] + i * MAX_DIM + j) != (1.0 + dest))
                    {
                        printf("Data validation failed at X: %d Y: %d Expected : %f Actual : %f \n",
                               i,
                               j,
                               (1.0 + dest),
                               *(buffer[rank] + i * MAX_DIM + j));
                        fflush(stdout);
                        return -1;
                    }
                }
            }

            /* Restore the local fill value before the next block size */
            for (i = 0; i < bufsize / sizeof(double); i++)
            {
                *(buffer[rank] + i) = 1.0 + rank;
            }
        }

    }

    OSP_Barrier_group(OSP_GROUP_WORLD);

    OSP_Release_segments(OSP_GROUP_WORLD, (void *) buffer[rank]);
    OSP_Free_segment((void *) buffer[rank]);

    /* The address table itself came from malloc */
    free(buffer);

    OSP_Finalize();

    return 0;
}
Exemple #7
0
/*
 * OSP_Get latency benchmark with data validation.
 *
 * For every power-of-two message size from sizeof(double) up to
 * MAX_MSG_SIZE, rank 0 performs ITERATIONS + SKIP gets from rank 1 (the
 * first SKIP are warm-up and are not timed), each into its own msgsize-byte
 * slot of the local segment.  It prints the average time per timed get in
 * microseconds, checks that the fetched region holds rank 1's fill value
 * and restores its own fill value.  All other ranks only take part in the
 * collective calls.
 *
 * The benchmark addresses rank 1 directly, so it needs at least two
 * processes.
 *
 * Returns 0 on success, -1 if validation fails on rank 0.
 */
int main()
{

    size_t i, rank, nranks, msgsize, dest;
    long bufsize;
    double **buffer;
    double t_start = 0.0, t_stop;

    OSP_Initialize(OSP_THREAD_SINGLE);

    rank = OSP_Process_id(OSP_GROUP_WORLD);
    nranks = OSP_Process_total(OSP_GROUP_WORLD);

    /* Room for one slot of the largest message per iteration */
    bufsize = MAX_MSG_SIZE * (ITERATIONS + SKIP);
    buffer = (double **) malloc(sizeof(double *) * nranks);
    OSP_Alloc_segment((void **) &(buffer[rank]), bufsize);
    OSP_Exchange_segments(OSP_GROUP_WORLD, (void **) buffer);

    for (i = 0; i < bufsize / sizeof(double); i++)
    {
        *(buffer[rank] + i) = 1.0 + rank;
    }

    OSP_Barrier_group(OSP_GROUP_WORLD);

    if (rank == 0)
    {

        printf("OSP_Get Latency in usec \n");
        printf("%20s %22s \n", "Message Size", "Latency");
        fflush(stdout);

        dest = 1;

        for (msgsize = sizeof(double); msgsize <= MAX_MSG_SIZE; msgsize *= 2)
        {

            for (i = 0; i < ITERATIONS + SKIP; i++)
            {

                if (i == SKIP) t_start = OSP_Time_seconds();

                OSP_Get(1, (void *) ((size_t) buffer[dest] + (size_t)(i
                        * msgsize)), (void *) ((size_t) buffer[rank]
                        + (size_t)(i * msgsize)), msgsize);

            }
            t_stop = OSP_Time_seconds();
            /* msgsize is a size_t, so it needs %zu rather than %d */
            printf("%20zu %20.2f \n", msgsize, ((t_stop - t_start) * 1000000)
                    / ITERATIONS);
            fflush(stdout);

            for (i = 0; i < (((ITERATIONS + SKIP) * msgsize) / sizeof(double)); i++)
            {
                if (*(buffer[rank] + i) != (1.0 + dest))
                {
                    printf("Data validation failed At displacement : %zu Expected : %f Actual : %f \n",
                           i,
                           (1.0 + dest),
                           *(buffer[rank] + i));
                    fflush(stdout);
                    return -1;
                }
            }

            /* Restore the local fill value before the next message size */
            for (i = 0; i < bufsize / sizeof(double); i++)
            {
                *(buffer[rank] + i) = 1.0 + rank;
            }
        }

    }

    OSP_Barrier_group(OSP_GROUP_WORLD);

    OSP_Release_segments(OSP_GROUP_WORLD, buffer[rank]);

    /* The address table itself came from malloc */
    free(buffer);

    OSP_Finalize();

    return 0;
}
/*
 * OSP_NbPutS communication/computation overlap benchmark.
 *
 * For every power-of-two row length dim, rank 0 first measures the base
 * latency in cycles of WINDOW non-blocking strided puts (512 rows of dim
 * doubles each) followed by a wait, averaged over ITERATIONS timed rounds
 * after SKIP warm-up rounds.  It then repeats the puts but, before waiting,
 * runs a matrix-multiply loop that stops as soon as it has consumed more
 * cycles than the base latency.  The average cycles per overlapped round
 * are printed next to the base latency.  All other ranks only take part in
 * the collective calls.
 *
 * The benchmark addresses rank 1 directly, so it needs at least two
 * processes.
 *
 * Returns 0.
 */
int main() {

   /*
    * Static storage: three 1024x1024 double matrices (24 MiB) do not fit on
    * a typical stack as automatic variables, and static objects start out
    * zeroed, so the multiply below no longer reads indeterminate values.
    */
   static double A[1024][1024], B[1024][1024], C[1024][1024];

   int i, k, rank, nranks;
   int dim;
   long bufsize;
   double **buffer;
   unsigned long long t_start = 0, t_stop, t_latency, t_overlap;
   unsigned long long wait_start, wait_stop;
   int count[2], src_stride, trg_stride, stride_level, peer;
   int m1,m2,m3;
   OSP_handle_t osp_handle;
   
   OSP_Initialize(OSP_THREAD_SINGLE); 

   rank = OSP_Process_id(OSP_GROUP_WORLD);
   nranks = OSP_Process_total(OSP_GROUP_WORLD);

   buffer = (double **) malloc (sizeof(double *) * nranks); 

   OSP_Allocate_handle(&osp_handle);

   OSP_Barrier_group(OSP_GROUP_WORLD);

   bufsize = MAX_DIM * MAX_DIM * sizeof(double);
   OSP_Alloc_segment((void **) &(buffer[rank]), bufsize);
   OSP_Exchange_segments(OSP_GROUP_WORLD, (void **) buffer);

   for(i=0; i< bufsize/sizeof(double); i++) {
       *(buffer[rank] + i) = 1.0 + rank;
   }

   if(rank == 0) {

      printf("OSP_PutS Overlap - NbPutS + DGEMM + Wait. Time in cycles\n");
      printf("%30s %30s %22s %22s\n", "Msg Size", "Dimensions(array of doubles)", "Base Latency", "Overlaped Latency");
      fflush(stdout);

      src_stride = MAX_DIM*sizeof(double);
      trg_stride = MAX_DIM*sizeof(double);
      stride_level = 1;
 
      for(dim=1; dim<=MAX_DIM; dim*=2) {
         /* Large enough for two full-width ints, 'X' and the NUL */
         char temp[32];
 
         count[0] = dim*sizeof(double);
         /* NOTE(review): fixed row count; assumes MAX_DIM >= 512 - confirm */
         count[1] = 512;
 
         peer = 1;          
  
         /* Phase 1: base latency of WINDOW puts + wait, no computation */
         for(i=0; i<ITERATIONS+SKIP; i++) { 
 
            if(i == SKIP)
                t_start = OSP_Time_cycles();              

             for(k=0; k<WINDOW; k++)      
             { 
                OSP_NbPutS(peer, stride_level, count, (void *) buffer[rank],
                      &src_stride, (void *) buffer[peer], &trg_stride, osp_handle);
             }

             OSP_Wait_handle(osp_handle);
 
         }
         t_stop = OSP_Time_cycles();
         OSP_Flush(peer);
         
         t_latency = (t_stop-t_start)/ITERATIONS;
  
         snprintf(temp, sizeof temp, "%dX%d", count[1], dim);
         /* Cycle counts are unsigned long long, so they need %llu */
         printf("%30d %30s %20llu", count[1]*count[0], temp, t_latency);
         fflush(stdout);
 
         /* Phase 2: same puts, with computation inserted before the wait */
         t_start = OSP_Time_cycles();
         for(i=0; i<ITERATIONS; i++) {

            for(k=0; k<WINDOW; k++)      
            {
               OSP_NbPutS(peer, stride_level, count, (void *) buffer[rank],
                      &src_stride, (void *) buffer[peer], &trg_stride, osp_handle);
            } 
  
            /*
             * Compute until more than t_latency cycles have elapsed; the
             * same test is repeated at each loop level to unwind all three.
             */
            wait_start = OSP_Time_cycles();
            for(m1=0; m1<1024; m1++)
            {
               for(m2=0; m2<1024; m2++)
               {
                 for(m3=0; m3<1024; m3++)
                 {
                    C[m1][m2] +=  A[m1][m3] * B[m3][m2];
                    wait_stop = OSP_Time_cycles();
                    if((wait_stop - wait_start) > t_latency)
                         break;
                 }
                 if((wait_stop - wait_start) > t_latency)
                      break;
               }
               if((wait_stop - wait_start) > t_latency)
                    break;
            } 
 
            OSP_Wait_handle(osp_handle);
 
         }
         t_stop = OSP_Time_cycles();
         OSP_Flush(peer);
         t_overlap = (t_stop - t_start)/ITERATIONS;
 
         printf("%20llu \n", t_overlap);          
 
      }

   }

   OSP_Barrier_group(OSP_GROUP_WORLD);

   OSP_Release_handle(osp_handle);

   OSP_Release_segments(OSP_GROUP_WORLD, (void *) buffer[rank]);

   OSP_Free_segment((void *) buffer[rank]);

   /* The address table itself came from malloc */
   free(buffer);

   OSP_Finalize();

   return 0;
}