Example #1
0
int
main(int argc, char *argv[])
{
    FILE        *out;           /* Output data file                          */
    char        s[255];         /* Generic string                            */
    char        *memtmp;
    char        *memtmp1;

    int         c,              /* option index                              */
                i, j, n, nq,    /* Loop indices                              */
		asyncReceive=0, /* Pre-post a receive buffer?                */
                bufoffset=0,    /* Align buffer to this                      */
                bufalign=16*1024,/* Boundary to align buffer to              */
		errFlag,        /* Error occurred in inner testing loop      */
                nrepeat,        /* Number of time to do the transmission     */
                len,            /* Number of bytes to be transmitted         */
                inc=0,          /* Increment value                           */
                trans=-1,       /* Transmitter flag. 1 if transmitting.      */
                detailflag=0,   /* Set to examine the signature curve detail */
                bufszflag=0,    /* Set to change the TCP socket buffer size  */
                pert,           /* Perturbation value                        */
                start=1,        /* Starting value for signature curve        */
                end=MAXINT,     /* Ending value for signature curve          */
                streamopt=0,    /* Streaming mode flag                       */
                printopt=0;     /* Debug print statements flag               */
   
    ArgStruct   args;           /* Argumentsfor all the calls                */

    double      t, t0, t1, t2,  /* Time variables                            */
                tlast,          /* Time for the last transmission            */
                latency;        /* Network message latency                   */

    Data        bwdata[NSAMP];  /* Bandwidth curve data                      */

    short       port=DEFPORT;   /* Port number for connection                */
#ifdef HAVE_GETRUSAGE
    struct rusage prev_rusage, curr_rusage; /* Resource usage                */
    double	user_time, sys_time;	/* User & system time used                   */
    double	best_user_time, best_sys_time; /* Total user & system time used     */
    double      ut1, ut2, st1, st2; /* User & system time ctrs for variance  */
    double	ut_var, st_var;	/* Variance in user & system time            */
#endif

#ifdef MPI
    MPI_Init(&argc, &argv);
#endif

    strcpy(s, "NetPIPE.out");
#ifndef MPI
    if(argc < 2)
     PrintUsage();
#endif

    /* Parse the arguments. See Usage for description */
    while ((c = getopt(argc, argv, "Pstrh:p:o:A:O:l:u:i:b:a")) != -1)
    {
        switch(c)
        {
            case 'o': strcpy(s,optarg);
                      break;

            case 't': trans = 1;
                      break;
            
            case 'r': trans = 0;
                      break;

            case 's': streamopt = 1;
                      break;

            case 'l': /*detailflag = 1;*/
                      start = atoi(optarg);
                      if (start < 1)
                      {
                        fprintf(stderr,"Need a starting value >= 1\n");
                        exit(743);
                      }
                      break;

            case 'u': /*detailflag = 1;*/
                      end = atoi(optarg);
                      break;

            case 'i': detailflag = 1;
                      inc = atoi(optarg);
                      break;

            case 'b': bufszflag = 1;
#ifdef TCP
                      args.prot.rcvbufsz=atoi(optarg);
                      args.prot.sndbufsz=args.prot.rcvbufsz;
#endif
                      break;

            case 'P': printopt = 1;
                      break;
            
            case 'A': bufalign = atoi(optarg);
                      break;

            case 'O': bufoffset = atoi(optarg);
                      break;

            case 'p': port = atoi(optarg);
                      break;
            
            case 'h': if (trans == 1)
                      {
                          args.host = (char *)malloc(strlen(optarg)+1);
                          strcpy(args.host, optarg);
                      }
		      else
		      {
			  fprintf(stderr, "Error: -t must be specified before -h\n");
			  exit(-11);
		      }
                      break;

	    case 'a': asyncReceive = 1;
		      break;

            default:  PrintUsage(); 
                      exit(-12);
        }
    }
    if (start > end)
    {
        fprintf(stderr, "Start MUST be LESS than end\n");
        exit(420132);
    }
#if defined(TCP) || defined(PVM)
    /*
      It should be explicitly specified whether this is the transmitter
      or the receiver.
    */
    if (trans < 0)
    {
	fprintf(stderr, "Error: either -t or -r must be specified\n");
	exit(-11);
    }
#endif

    args.nbuff = TRIALS;
    args.tr = trans;
    args.port = port;

#if defined(TCP)
    if (!bufszflag)
    {
        args.prot.sndbufsz = 0;
        args.prot.rcvbufsz = 0;
    }
    else
        fprintf(stderr,"Send and Recv Buffers are %d bytes\n",
                                                   args.prot.sndbufsz);
#endif

    Setup(&args);
    Establish(&args);

    if (args.tr)
    {
        if ((out = fopen(s, "w")) == NULL)
        {
            fprintf(stderr,"Can't open %s for output\n", s);
            exit(1);
        }
    }
    else
        out = stdout;

    args.bufflen = 1;
    args.buff = (char *)malloc(args.bufflen);
    args.buff1 = (char *)malloc(args.bufflen);
    if (asyncReceive)
	PrepareToReceive(&args);
    Sync(&args);
    t0 = When();
    t0 = When();
    t0 = When();
#ifdef HAVE_GETRUSAGE
    getrusage(RUSAGE_SELF, &prev_rusage);
#endif
    t0 = When();
    for (i = 0; i < LATENCYREPS; i++)
    {
        if (args.tr)
        {
            SendData(&args);
            RecvData(&args);
	    if (asyncReceive && (i < LATENCYREPS - 1))
	    {
		PrepareToReceive(&args);
	    }
        }
        else
        {
            RecvData(&args);
	    if (asyncReceive && (i < LATENCYREPS - 1))
	    {
		PrepareToReceive(&args);
	    }
            SendData(&args);
        }
    }
    latency = (When() - t0)/(2 * LATENCYREPS);
#ifdef HAVE_GETRUSAGE
    getrusage(RUSAGE_SELF, &curr_rusage);
#endif
    free(args.buff);
    free(args.buff1);


    if (args.tr)
    {
        SendTime(&args, &latency);
    }
    else
    {
        RecvTime(&args, &latency);
    }
    if (args.tr && printopt)
    {
        fprintf(stderr,"Latency: %.7f\n", latency);
        fprintf(stderr,"Now starting main loop\n");
    }
    tlast = latency;
    if (inc == 0)
    {
	/* Set a starting value for the message size increment. */
	inc = (start > 1) ? start / 2 : 1;
    }

    /* Main loop of benchmark */
    for (nq = n = 0, len = start, errFlag = 0; 
         n < NSAMP - 3 && tlast < STOPTM && len <= end && !errFlag; 
         len = len + inc, nq++ )
    {
        if (nq > 2 && !detailflag)
	  {
	    /*
	      This has the effect of exponentially increasing the block
	      size.  If detailflag is false, then the block size is
	      linearly increased (the increment is not adjusted).
	    */
            inc = ((nq % 2))? inc + inc: inc;
	  }
        
        /* This is a perturbation loop to test nearby values */
        for (pert = (!detailflag && inc > PERT+1)? -PERT: 0;
             pert <= PERT; 
             n++, pert += (!detailflag && inc > PERT+1)? PERT: PERT+1)
        {

            /* Calculate how many times to repeat the experiment. */
            if (args.tr)
            {
                nrepeat = MAX((RUNTM / ((double)args.bufflen /
                                 (args.bufflen - inc + 1.0) * tlast)), TRIALS);
                SendRepeat(&args, nrepeat);
            }
            else
            {
                RecvRepeat(&args, &nrepeat);
            }

            /* Allocate the buffer */
            args.bufflen = len + pert;
            if((args.buff=(char *)malloc(args.bufflen+bufalign))==(char *)NULL)
            {
                fprintf(stderr,"Couldn't allocate memory\n");
		errFlag = -1;
                break;
            }
            if((args.buff1=(char *)malloc(args.bufflen+bufalign))==(char *)NULL)
            {
                fprintf(stderr,"Couldn't allocate memory\n");
		errFlag = -1;
                break;
            }
            /*
	      Possibly align the data buffer: make memtmp and memtmp1
	      point to the original blocks (so they can be freed later),
	      then adjust args.buff and args.buff1 if the user requested it.
	    */
            memtmp = args.buff;
            memtmp1 = args.buff1;
            if (bufalign != 0)
                args.buff +=(bufalign - 
                        ((int)(*args.buff) % bufalign) + bufoffset) % bufalign;

            if (bufalign != 0)
                args.buff1 +=(bufalign - 
                        ((int)(*args.buff1) % bufalign) + bufoffset) % bufalign;


            if (args.tr && printopt)
                fprintf(stderr,"%3d: %9d bytes %4d times --> ",
                                 n,args.bufflen,nrepeat);

            /* Finally, we get to transmit or receive and time */
            if (args.tr)
            {
		/*
		   This is the transmitter: send the block TRIALS times, and
		   if we are not streaming, expect the receiver to return each
		   block.
		*/
                bwdata[n].t = LONGTIME;
                t2 = t1 = 0;
#ifdef HAVE_GETRUSAGE
		ut1 = ut2 = st1 = st2 = 0.0;
		best_user_time = best_sys_time = LONGTIME;
#endif
                for (i = 0; i < TRIALS; i++)
                {
                    Sync(&args);
#ifdef HAVE_GETRUSAGE
		    getrusage(RUSAGE_SELF, &prev_rusage);
#endif
                    t0 = When();
                    for (j = 0; j < nrepeat; j++)
                    {
			if (asyncReceive && !streamopt)
			{
			    PrepareToReceive(&args);
			}
                        SendData(&args);
                        if (!streamopt)
			{
                            RecvData(&args);
			}
                    }
                    t = (When() - t0)/((1 + !streamopt) * nrepeat);
#ifdef HAVE_GETRUSAGE
		    getrusage(RUSAGE_SELF, &curr_rusage);
		    user_time = ((curr_rusage.ru_utime.tv_sec -
			      prev_rusage.ru_utime.tv_sec) + (double)
			     (curr_rusage.ru_utime.tv_usec -
			      prev_rusage.ru_utime.tv_usec) * 1.0E-6) /
		      ((1 + !streamopt) * nrepeat);
		    sys_time = ((curr_rusage.ru_stime.tv_sec -
			      prev_rusage.ru_stime.tv_sec) + (double)
			     (curr_rusage.ru_stime.tv_usec -
			      prev_rusage.ru_stime.tv_usec) * 1.0E-6) /
		      ((1 + !streamopt) * nrepeat);
		    ut2 += user_time * user_time;
		    st2 += sys_time * sys_time;
		    ut1 += user_time;
		    st1 += sys_time;
		    if ((user_time + sys_time) < (best_user_time + best_sys_time))
		    {
			best_user_time = user_time;
			best_sys_time = sys_time;
		    }
#endif

                    if (!streamopt)
                    {
                        t2 += t*t;
                        t1 += t;
                        bwdata[n].t = MIN(bwdata[n].t, t);
                    }
                }
                if (!streamopt)
                    SendTime(&args, &bwdata[n].t);
                else
                    RecvTime(&args, &bwdata[n].t);

                if (!streamopt)
                    bwdata[n].variance = t2/TRIALS - t1/TRIALS * t1/TRIALS;

#ifdef HAVE_GETRUSAGE
		ut_var = ut2/TRIALS - (ut1/TRIALS) * (ut1/TRIALS);
		st_var = st2/TRIALS - (st1/TRIALS) * (st1/TRIALS);
#endif

            }
            else
            {
		/*
		   This is the receiver: receive the block TRIALS times, and
		   if we are not streaming, send the block back to the
		   sender.
		*/
                bwdata[n].t = LONGTIME;
                t2 = t1 = 0;
                for (i = 0; i < TRIALS; i++)
                {
		    if (asyncReceive)
		    {
		    	PrepareToReceive(&args);
		    }
                    Sync(&args);
                    t0 = When();
                    for (j = 0; j < nrepeat; j++)
                    {
                        RecvData(&args);
			if (asyncReceive && (j < nrepeat - 1))
			{
			    PrepareToReceive(&args);
			}
                        if (!streamopt)
                            SendData(&args);
                    }
                    t = (When() - t0)/((1 + !streamopt) * nrepeat);

                    if (streamopt)
                    {
                        t2 += t*t;
                        t1 += t;
                        bwdata[n].t = MIN(bwdata[n].t, t);
                    }
                }
                if (streamopt)
                    SendTime(&args, &bwdata[n].t);
                else
                    RecvTime(&args, &bwdata[n].t);

                if (streamopt)
                    bwdata[n].variance = t2/TRIALS - t1/TRIALS * t1/TRIALS;

            }
            tlast = bwdata[n].t;
            bwdata[n].bits = args.bufflen * CHARSIZE;
            bwdata[n].bps = bwdata[n].bits / (bwdata[n].t * 1024 * 1024);
            bwdata[n].repeat = nrepeat;
            
            if (args.tr)
	    {
		fprintf(out, "%.7f %.7f %d %d %.7f", bwdata[n].t, bwdata[n].bps,
			bwdata[n].bits, bwdata[n].bits / 8,
			bwdata[n].variance);
#ifdef HAVE_GETRUSAGE
		fprintf(out, " %.7f %.7f %.7f %.7f", ut1 / (double) TRIALS,
			st1 / (double) TRIALS, ut_var, st_var);
#endif
		fprintf(out, "\n");
	    }
            fflush(out);

            free(memtmp);
            free(memtmp1);

            if (args.tr && printopt)
	    {
        	fprintf(stderr," %6.2f Mbps in %.7f sec", bwdata[n].bps,
			tlast);
#ifdef HAVE_GETRUSAGE
		fprintf(stderr, ", avg utime=%.7f avg stime=%.7f, ",
			ut1 / (double) TRIALS,
			st1 / (double) TRIALS);
		fprintf(stderr, "min utime=%.7f stime=%.7f, ", best_user_time,
			best_sys_time);
		fprintf(stderr, "utime var=%.7f stime var=%.7f", ut_var, st_var);
#endif
		fprintf(stderr, "\n");
	    }
        } /* End of perturbation loop */

    } /* End of main loop  */
        
    if (args.tr)
       fclose(out);
         
    CleanUp(&args);
    return(0);
}
int main(int argc, char **argv)
{
    FILE        *out;           /* Output data file                          */
    char        s[255],s2[255],delim[255],*pstr; /* Generic strings          */
    int         *memcache;      /* used to flush cache                       */

    int         len_buf_align,  /* meaningful when args.cache is 0. buflen   */
                                /* rounded up to be divisible by 8           */
                num_buf_align;  /* meaningful when args.cache is 0. number   */
                                /* of aligned buffers in memtmp              */

    int         c,              /* option index                              */
                i, j, n, nq,    /* Loop indices                              */
                asyncReceive=0, /* Pre-post a receive buffer?                */
                bufalign=16*1024,/* Boundary to align buffer to              */
                errFlag,        /* Error occurred in inner testing loop      */
                nrepeat,        /* Number of time to do the transmission     */
                nrepeat_const=0,/* Set if we are using a constant nrepeat    */
                len,            /* Number of bytes to be transmitted         */
                inc=0,          /* Increment value                           */
                perturbation=DEFPERT, /* Perturbation value                  */
                pert,
                start= 1,       /* Starting value for signature curve        */
                end=MAXINT,     /* Ending value for signature curve          */
                streamopt=0,    /* Streaming mode flag                       */
                reset_connection;/* Reset the connection between trials      */
   
    ArgStruct   args;           /* Arguments for all the calls               */

    double      t, t0, t1, t2,  /* Time variables                            */
                tlast,          /* Time for the last transmission            */
                latency;        /* Network message latency                   */

    Data        bwdata[NSAMP];  /* Bandwidth curve data                      */

    int         integCheck=0;   /* Integrity check                           */

    /* Initialize vars that may change from default due to arguments */

    strcpy(s, "np.out");   /* Default output file */

    /* Let modules initialize related vars, and possibly call a library init
       function that requires argc and argv */


    Init(&args, &argc, &argv);   /* This will set args.tr and args.rcv */

    args.preburst = 0; /* Default to not bursting preposted receives */
    args.bidir = 0; /* Turn bi-directional mode off initially */
    args.cache = 1; /* Default to use cache */
    args.upper = end;
    args.host  = NULL;
    args.soffset=0; /* default to no offsets */
    args.roffset=0; 
    args.syncflag=0; /* use normal mpi_send */
    args.port = DEFPORT; /* just in case the user doesn't set this. */

    /* TCGMSG launches NPtcgmsg with a -master master_hostname
     * argument, so ignore all arguments and set them manually 
     * in netpipe.c instead.
     */

#if ! defined(TCGMSG)

    /* Parse the arguments. See Usage for description */
    while ((c = getopt(argc, argv, "SO:rIiszgfaB2h:p:o:l:u:b:m:n:t:c:d:D:P:")) != -1)
    {
        switch(c)
        {
            case 'O':
                      strcpy(s2,optarg);
                      strcpy(delim,",");
                      if((pstr=strtok(s2,delim))!=NULL) {
                         args.soffset=atoi(pstr);
                         if((pstr=strtok((char *)NULL,delim))!=NULL)
                            args.roffset=atoi(pstr);
                         else /* only got one token */
                            args.roffset=args.soffset;
                      } else {
                         args.soffset=0; args.roffset=0;
                      }
                      printf("Transmit buffer offset: %d\nReceive buffer offset: %d\n",args.soffset,args.roffset);
                      break;
            case 'p': perturbation = atoi(optarg);
                      if( perturbation > 0 ) {
                         printf("Using a perturbation value of %d\n\n", perturbation);
                      } else {
                         perturbation = 0;
                         printf("Using no perturbations\n\n");
                      }
                      break;

            case 'B': if(integCheck == 1) {
                        fprintf(stderr, "Integrity check not supported with prepost burst\n");
                        exit(-1);
                      }
                      args.preburst = 1;
                      asyncReceive = 1;
                      printf("Preposting all receives before a timed run.\n");
                      printf("Some would consider this cheating,\n");
                      printf("but it is needed to match some vendor tests.\n"); fflush(stdout);
                      break;

            case 'I': args.cache = 0;
                      printf("Performance measured without cache effects\n\n"); fflush(stdout);
                      break;

            case 'o': strcpy(s,optarg);
                      printf("Sending output to %s\n", s); fflush(stdout);
                      break;

            case 's': streamopt = 1;
                      printf("Streaming in one direction only.\n\n");
#if defined(TCP) && ! defined(INFINIBAND) 
                      printf("Sockets are reset between trials to avoid\n");
                      printf("degradation from a collapsing window size.\n\n");
#endif
                      args.reset_conn = 1;
                      printf("Streaming does not provide an accurate\n");
                      printf("measurement of the latency since small\n");
                      printf("messages may get bundled together.\n\n");
                      if( args.bidir == 1 ) {
                        printf("You can't use -s and -2 together\n");
                        exit(0);
                      }
                      fflush(stdout);
                      break;

            case 'l': start = atoi(optarg);
                      if (start < 1)
                      {
                        fprintf(stderr,"Need a starting value >= 1\n");
                        exit(0);
                      }
                      break;

            case 'u': end = atoi(optarg);
                      break;

#if defined(TCP) && ! defined(INFINIBAND)
            case 'b': /* -b # resets the buffer size, -b 0 keeps system defs */
                      args.prot.sndbufsz = args.prot.rcvbufsz = atoi(optarg);
                      break;
#endif

            case '2': args.bidir = 1;    /* Both procs are transmitters */
                         /* end will be maxed at sndbufsz+rcvbufsz */
                      printf("Passing data in both directions simultaneously.\n");
                      printf("Output is for the combined bandwidth.\n");
#if defined(TCP) && ! defined(INFINIBAND)
                      printf("The socket buffer size limits the maximum test size.\n\n");
#endif
                      if( streamopt ) {
                        printf("You can't use -s and -2 together\n");
                        exit(0);
                      }
                      break;

            case 'h': args.tr = 1;       /* -h implies transmit node */
                      args.rcv = 0;
                      args.host = (char *)malloc(strlen(optarg)+1);
                      strcpy(args.host, optarg);
                      break;

#ifdef DISK
            case 'd': args.tr = 1;      /* -d to specify input/output file */
                      args.rcv = 0;
                      args.prot.read = 0;
                      args.prot.read_type = 'c';
                      args.prot.dfile_name = (char *)malloc(strlen(optarg)+1);
                      strcpy(args.prot.dfile_name, optarg);
                      break;

            case 'D': if( optarg[0] == 'r' )
                         args.prot.read = 1;
                      else
                         args.prot.read = 0;
                      args.prot.read_type = optarg[1];
                      break;
#endif

            case 'i': if(args.preburst == 1) {
                        fprintf(stderr, "Integrity check not supported with prepost burst\n");
                        exit(-1);
                      }
                      integCheck = 1;
                      perturbation = 0;
                      start = sizeof(int)+1; /* Start with integer size */
                      printf("Doing an integrity check instead of measuring performance\n"); fflush(stdout);
                      break;

#if defined(MPI)
            case 'z': args.source_node = -1;
                      printf("Receive using the ANY_SOURCE flag\n"); fflush(stdout);
                      break;

            case 'a': asyncReceive = 1;
                      printf("Preposting asynchronous receives\n"); fflush(stdout);
                      break;

            case 'S': args.syncflag=1;
                      fprintf(stderr,"Using synchronous sends\n");
                      break;
#endif
#if defined(MPI2)
            case 'g': if(args.prot.no_fence == 1) {
                        fprintf(stderr, "-f cannot be used with -g\n");
                        exit(-1);
                      } 
                      args.prot.use_get = 1;
                      printf("Using MPI-2 Get instead of Put\n");
                      break;

            case 'f': if(args.prot.use_get == 1) {
                         fprintf(stderr, "-f cannot be used with -g\n");
                         exit(-1);
                      }
                      args.prot.no_fence = 1;
                      bufalign = 0;
                      printf("Buffer alignment off (Required for no fence)\n");
                      break;
#endif /* MPI2 */

#if defined(INFINIBAND)
            case 'm': switch(atoi(optarg)) {
                        case 256: args.prot.ib_mtu = MTU256;
                          break;
                        case 512: args.prot.ib_mtu = MTU512;
                          break;
                        case 1024: args.prot.ib_mtu = MTU1024;
                          break;
                        case 2048: args.prot.ib_mtu = MTU2048;
                          break;
                        case 4096: args.prot.ib_mtu = MTU4096;
                          break;
                        default: 
                          fprintf(stderr, "Invalid MTU size, must be one of "
                                          "256, 512, 1024, 2048, 4096\n");
                          exit(-1);
                      }
                      break;

            case 't': if( !strcmp(optarg, "send_recv") ) {
                         printf("Using Send/Receive communications\n");
                         args.prot.commtype = NP_COMM_SENDRECV;
                      } else if( !strcmp(optarg, "send_recv_with_imm") ) {
                         printf("Using Send/Receive communications with immediate data\n");
                         args.prot.commtype = NP_COMM_SENDRECV_WITH_IMM;
                      } else if( !strcmp(optarg, "rdma_write") ) {
                         printf("Using RDMA Write communications\n");
                         args.prot.commtype = NP_COMM_RDMAWRITE;
                      } else if( !strcmp(optarg, "rdma_write_with_imm") ) {
                         printf("Using RDMA Write communications with immediate data\n");
                         args.prot.commtype = NP_COMM_RDMAWRITE_WITH_IMM;
                      } else {
                         fprintf(stderr, "Invalid transfer type "
                                 "specified, please choose one of:\n\n"
                                 "\tsend_recv\t\tUse Send/Receive communications\t(default)\n"
                                 "\tsend_recv_with_imm\tSame as above with immediate data\n"
                                 "\trdma_write\t\tUse RDMA Write communications\n"
                                 "\trdma_write_with_imm\tSame as above with immediate data\n\n");
                         exit(-1);
                      }
                      break;

            case 'c': if( !strcmp(optarg, "local_poll") ) {
                         printf("Using local polling completion\n");
                         args.prot.comptype = NP_COMP_LOCALPOLL;
                      } else if( !strcmp(optarg, "vapi_poll") ) {
                         printf("Using VAPI polling completion\n");
                         args.prot.comptype = NP_COMP_VAPIPOLL;
                      } else if( !strcmp(optarg, "event") ) {
                         printf("Using VAPI event completion\n");
                         args.prot.comptype = NP_COMP_EVENT;
                      } else {
                         fprintf(stderr, "Invalid completion type specified, "
                                 "please choose one of:\n\n"
                                 "\tlocal_poll\tWait for last byte of data\t(default)\n"
                                 "\tvapi_poll\tUse VAPI polling function\n"
                                 "\tevent\t\tUse VAPI event handling function\n\n");
                         exit(-1);
                      }
                      break;
#endif

            case 'n': nrepeat_const = atoi(optarg);
                      break;

#if defined(TCP) && ! defined(INFINIBAND)
            case 'r': args.reset_conn = 1;
                      printf("Resetting connection after every trial\n");
                      break;
#endif
	    case 'P': 
		      args.port = atoi(optarg);
		      break;

            default: 
                     PrintUsage(); 
                     exit(-12);
       }
   }

#endif /* ! defined TCGMSG */

#if defined(INFINIBAND)
   asyncReceive = 1;
   fprintf(stderr, "Preposting asynchronous receives (required for Infiniband)\n");
   if(args.bidir && (
          (args.cache && args.prot.commtype == NP_COMM_RDMAWRITE) || /* rdma_write only works with no-cache mode */
          (!args.preburst && args.prot.commtype != NP_COMM_RDMAWRITE) || /* anything besides rdma_write requires prepost burst */
          (args.preburst && args.prot.comptype == NP_COMP_LOCALPOLL && args.cache) || /* preburst with local polling in cache mode doesn't work */
          0)) {

      fprintf(stderr, 
         "\n"
         "Bi-directional mode currently only works with a subset of the\n"
         "Infiniband options. Restrictions are:\n"
         "\n"
         "  RDMA write (-t rdma_write) requires no-cache mode (-I).\n"
         "\n"
         "  Local polling (-c local_poll, default if no -c given) requires\n"
         "    no-cache mode (-I), and if not using RDMA write communication,\n"
         "    burst mode (-B).\n"
         "\n"
         "  Any other communication type and any other completion type\n"
         "    require burst mode (-B). No-cache mode (-I) may be used\n"
         "    optionally.\n"
         "\n"
         "  All other option combinations will fail.\n"
         "\n");
               
      exit(-1);      

   }
#endif

   if (start > end)
   {
       fprintf(stderr, "Start MUST be LESS than end\n");
       exit(420132);
   }
   args.nbuff = TRIALS;

   Setup(&args);

   if( args.bidir && end > args.upper ) {
      end = args.upper;
      if( args.tr ) {
         printf("The upper limit is being set to %d Bytes\n", end);
#if defined(TCP) && ! defined(INFINIBAND)
         printf("due to socket buffer size limitations\n\n");
#endif
   }  }

#if defined(GM)

   if(streamopt && (!nrepeat_const || nrepeat_const > args.prot.num_stokens)) {
     printf("\nGM is currently limited by the driver software to %d\n", 
            args.prot.num_stokens);
     printf("outstanding sends. The number of repeats will be set\n");
     printf("to this limit for every trial in streaming mode.  You\n");
     printf("may use the -n switch to set a smaller number of repeats\n\n");

     nrepeat_const = args.prot.num_stokens;
   }

#endif

   if( args.tr )                     /* Primary transmitter */
   {
       if ((out = fopen(s, "w")) == NULL)
       {
           fprintf(stderr,"Can't open %s for output\n", s);
           exit(1);
       }
   }
   else out = stdout;

      /* Set a starting value for the message size increment. */

   inc = (start > 1) ? start / 2 : 1;
   nq = (start > 1) ? 1 : 0;

      /* Test the timing to set tlast for the first test */

   args.bufflen = start;
   MyMalloc(&args, args.bufflen, 0, 0);
   InitBufferData(&args, args.bufflen, 0, 0);

   if(args.cache) args.s_buff = args.r_buff;
   
   args.r_ptr = args.r_buff_orig = args.r_buff;
   args.s_ptr = args.s_buff_orig = args.s_buff;
      
   AfterAlignmentInit(&args);  /* MPI-2 needs this to create a window */

   /* Infiniband requires use of asynchronous communications, so we need
    * the PrepareToReceive calls below
    */
   if( asyncReceive )
      PrepareToReceive(&args);
   
   Sync(&args);    /* Sync to prevent race condition in armci module */

   /* For simplicity's sake, even if the real test below will be done in
    * bi-directional mode, we still do the ping-pong one-way-at-a-time test
    * here to estimate the one-way latency. Unless it takes significantly
    * longer to send data in both directions at once than it does to send data
    * one way at a time, this shouldn't be too far off anyway.
    */
   t0 = When();
      for( n=0; n<100; n++) {
         if( args.tr) {
            SendData(&args);
            RecvData(&args);
            if( asyncReceive && n<99 )
               PrepareToReceive(&args);
         } else if( args.rcv) {
            RecvData(&args);
            if( asyncReceive && n<99 )
               PrepareToReceive(&args);
            SendData(&args);
         }
      }
   tlast = (When() - t0)/200;

   /* Sync up and Reset before freeing the buffers */

   Sync(&args); 

   Reset(&args);
   
   /* Free the buffers and any other module-specific resources. */
   if(args.cache)
      FreeBuff(args.r_buff_orig, NULL);
   else
      FreeBuff(args.r_buff_orig, args.s_buff_orig);

      /* Do setup for no-cache mode, using two distinct buffers. */

   if (!args.cache)
   {

       /* Allocate dummy pool of memory to flush cache with */

       if ( (memcache = (int *)malloc(MEMSIZE)) == NULL)
       {
           perror("malloc");
           exit(1);
       }
       mymemset(memcache, 0, MEMSIZE/sizeof(int)); 

       /* Allocate large memory pools */

       MyMalloc(&args, MEMSIZE+bufalign, args.soffset, args.roffset); 

       /* Save buffer addresses */
       
       args.s_buff_orig = args.s_buff;
       args.r_buff_orig = args.r_buff;

       /* Align buffers */

       args.s_buff = AlignBuffer(args.s_buff, bufalign);
       args.r_buff = AlignBuffer(args.r_buff, bufalign);

       /* Post alignment initialization */

       AfterAlignmentInit(&args);

       /* Initialize send buffer pointer */
       
/* both soffset and roffset should be zero if we don't have any offset stuff, so this should be fine */
       args.s_ptr = args.s_buff+args.soffset;
       args.r_ptr = args.r_buff+args.roffset;
   }

       /**************************
        * Main loop of benchmark *
        **************************/

   if( args.tr ) fprintf(stderr,"Now starting the main loop\n");

   for ( n = 0, len = start, errFlag = 0; 
        n < NSAMP - 3 && tlast < STOPTM && len <= end && !errFlag; 
        len = len + inc, nq++ )
   {

           /* Exponentially increase the block size.  */

       if (nq > 2) inc = ((nq % 2))? inc + inc: inc;
       
          /* This is a perturbation loop to test nearby values */

       for (pert = ((perturbation > 0) && (inc > perturbation+1)) ? -perturbation : 0;
            pert <= perturbation; 
            n++, pert += ((perturbation > 0) && (inc > perturbation+1)) ? perturbation : perturbation+1)
       {

           Sync(&args);    /* Sync to prevent race condition in armci module */

               /* Calculate how many times to repeat the experiment. */

           if( args.tr )
           {
               if (nrepeat_const) {
                   nrepeat = nrepeat_const;
/*               } else if (len == start) {*/
/*                   nrepeat = MAX( RUNTM/( 0.000020 + start/(8*1000) ), TRIALS);*/
               } else {
                   nrepeat = MAX((RUNTM / ((double)args.bufflen /
                                  (args.bufflen - inc + 1.0) * tlast)),TRIALS);
               }
               SendRepeat(&args, nrepeat);
           }
           else if( args.rcv )
           {
               RecvRepeat(&args, &nrepeat);
           }

           args.bufflen = len + pert;

           if( args.tr )
               fprintf(stderr,"%3d: %7d bytes %6d times --> ",
                       n,args.bufflen,nrepeat);

           if (args.cache) /* Allow cache effects.  We use only one buffer */
           {
               /* Allocate the buffer with room for alignment*/

               MyMalloc(&args, args.bufflen+bufalign, args.soffset, args.roffset); 

               /* Save buffer address */

               args.r_buff_orig = args.r_buff;
               args.s_buff_orig = args.r_buff;

               /* Align buffer */

               args.r_buff = AlignBuffer(args.r_buff, bufalign);
               args.s_buff = args.r_buff;
               
               /* Initialize buffer with data
                *
                * NOTE: The buffers should be initialized with some sort of
                * valid data, whether it is actually used for anything else,
                * to get accurate results.  Performance increases noticeably
                * if the buffers are left uninitialized, but this does not
                * give very useful results as realworld apps tend to actually
                * have data stored in memory.  We are not sure what causes
                * the difference in performance at this time.
                */

               InitBufferData(&args, args.bufflen, args.soffset, args.roffset);


               /* Post-alignment initialization */

               AfterAlignmentInit(&args);

               /* Initialize buffer pointers (We use r_ptr and s_ptr for
                * compatibility with no-cache mode, as this makes the code
                * simpler) 
                */
               /* offsets are zero by default so this saves an #ifdef */
               args.r_ptr = args.r_buff+args.roffset;
               args.s_ptr = args.r_buff+args.soffset;

           }
           else /* Eliminate cache effects.  We use two distinct buffers */
           {

               /* this isn't truly set up for offsets yet */
               /* Size of an aligned memory block including trailing padding */

               len_buf_align = args.bufflen;
               if(bufalign != 0)
                 len_buf_align += bufalign - args.bufflen % bufalign;
 
               /* Initialize the buffers with data
                *
                * See NOTE above.
                */
               InitBufferData(&args, MEMSIZE, args.soffset, args.roffset); 
               

               /* Reset buffer pointers to beginning of pools */
               args.r_ptr = args.r_buff+args.roffset;
               args.s_ptr = args.s_buff+args.soffset;
            }

            bwdata[n].t = LONGTIME;
/*            t2 = t1 = 0;*/

            /* Finally, we get to transmit or receive and time */

            /* NOTE: If a module is running that uses only one process (e.g.
             * memcpy), we assume that it will always have the args.tr flag
             * set.  Thus we make some special allowances in the transmit 
             * section that are not in the receive section.
             */

            if( args.tr || args.bidir )
            {
                /*
                   This is the transmitter: send the block TRIALS times, and
                   if we are not streaming, expect the receiver to return each
                   block.
                */

                for (i = 0; i < (integCheck ? 1 : TRIALS); i++)
                {                    
                    if(args.preburst && asyncReceive && !streamopt)
                    {

                      /* We need to save the value of the recv ptr so
                       * we can reset it after we do the preposts, in case
                       * the module needs to use the same ptr values again
                       * so it can wait on the last byte to change to indicate
                       * the recv is finished.
                       */

                      SaveRecvPtr(&args);

                      for(j=0; j<nrepeat; j++)
                      {
                        PrepareToReceive(&args);
                        if(!args.cache)
                          AdvanceRecvPtr(&args, len_buf_align);
                      }

                      ResetRecvPtr(&args);
                    }

                    /* Flush the cache using the dummy buffer */
                    if (!args.cache)
                      flushcache(memcache, MEMSIZE/sizeof(int));

                    Sync(&args);

                    t0 = When();

                    for (j = 0; j < nrepeat; j++)
                    {
                        if (!args.preburst && asyncReceive && !streamopt)
                        {
                            PrepareToReceive(&args);
                        }

                        if (integCheck) SetIntegrityData(&args);

                        SendData(&args);

                        if (!streamopt)
                        {
                            RecvData(&args);

                            if (integCheck) VerifyIntegrity(&args);

                            if(!args.cache)
                              AdvanceRecvPtr(&args, len_buf_align);

                        }
                        
                        /* Wait to advance send pointer in case RecvData uses
                         * it (e.g. memcpy module).
                         */
                        if (!args.cache)
                          AdvanceSendPtr(&args, len_buf_align);

                    }

                       /* t is the 1-directional trasmission time */

                    t = (When() - t0)/ nrepeat;

                    if( !streamopt && !args.bidir) t /= 2; /* Normal ping-pong */

                    Reset(&args);

/* NOTE: NetPIPE does each data point TRIALS times, bouncing the message
 * nrepeats times for each trial, then reports the lowest of the TRIALS
 * times.  -Dave Turner
 */
                    bwdata[n].t = MIN(bwdata[n].t, t);
/*                    t1 += t;*/
/*                    t2 += t*t;*/
                }

                if (streamopt){  /* Get time info from Recv node */
                    RecvTime(&args, &bwdata[n].t);
/*                    RecvTime(&args, &t1);*/
/*                    RecvTime(&args, &t2);*/
                }

                   /* Calculate variance after completing this set of trials */

/*                bwdata[n].variance = t2/TRIALS - t1/TRIALS * t1/TRIALS;*/

            }
            else if( args.rcv )
            {
                /*
                   This is the receiver: receive the block TRIALS times, and
                   if we are not streaming, send the block back to the
                   sender.
                */
                for (i = 0; i < (integCheck ? 1 : TRIALS); i++)
                {
                    if (asyncReceive)
                    {
                       if (args.preburst)
                       {

                         /* We need to save the value of the recv ptr so
                          * we can reset it after we do the preposts, in case
                          * the module needs to use the same ptr values again
                          * so it can wait on the last byte to change to 
                          * indicate the recv is finished.
                          */

                         SaveRecvPtr(&args);

                         for (j=0; j < nrepeat; j++)
                         {
                              PrepareToReceive(&args);
                              if (!args.cache)
                                 AdvanceRecvPtr(&args, len_buf_align);
                         }
                         
                         ResetRecvPtr(&args);
                         
                       }
                       else
                       {
                           PrepareToReceive(&args);
                       }
                      
                    }
                    
                    /* Flush the cache using the dummy buffer */
                    if (!args.cache)
                      flushcache(memcache, MEMSIZE/sizeof(int));

                    Sync(&args);

                    t0 = When();
                    for (j = 0; j < nrepeat; j++)
                    {
                        RecvData(&args);

                        if (integCheck) VerifyIntegrity(&args);

                        if (!args.cache)
                        { 
                            AdvanceRecvPtr(&args, len_buf_align);
                        }
                        
                        if (!args.preburst && asyncReceive && (j < nrepeat-1))
                        {
                            PrepareToReceive(&args);
                        }

                        if (!streamopt)
                        {
                            if (integCheck) SetIntegrityData(&args);
                            
                            SendData(&args);

                            if(!args.cache) 
                              AdvanceSendPtr(&args, len_buf_align);
                        }

                    }
                    t = (When() - t0)/ nrepeat;

                    if( !streamopt && !args.bidir) t /= 2; /* Normal ping-pong */

                    Reset(&args);
                    
                    bwdata[n].t = MIN(bwdata[n].t, t);
/*                    t1 += t;*/
/*                    t2 += t*t;*/
                }
                if (streamopt){  /* Recv proc calcs time and sends to Trans */
                    SendTime(&args, &bwdata[n].t);
/*                    SendTime(&args, &t1);*/
/*                    SendTime(&args, &t2);*/
                }
            }
            else  /* Just going along for the ride */
            {
                for (i = 0; i < (integCheck ? 1 : TRIALS); i++)
                {
                    Sync(&args);
                }
            }

            /* Streaming mode doesn't really calculate correct latencies
             * for small message sizes, and on some nics we can get
             * zero second latency after doing the math.  Protect against
             * this.
             */
            if(bwdata[n].t == 0.0) {
              bwdata[n].t = 0.000001;
            }
            
            tlast = bwdata[n].t;
            bwdata[n].bits = args.bufflen * CHARSIZE * (1+args.bidir);
            bwdata[n].bps = bwdata[n].bits / (bwdata[n].t * 1024 * 1024);
            bwdata[n].repeat = nrepeat;
            
            if (args.tr)
            {
                if(integCheck) {
                  fprintf(out,"%8d %d", bwdata[n].bits / 8, nrepeat);

                } else {
                  fprintf(out,"%8d %lf %12.8lf",
                        bwdata[n].bits / 8, bwdata[n].bps, bwdata[n].t);

                }
                fprintf(out, "\n");
                fflush(out);
            }
    
            /* Free using original buffer addresses since we may have aligned
               r_buff and s_buff */

            if (args.cache)
                FreeBuff(args.r_buff_orig, NULL);
            
            if ( args.tr ) {
               if(integCheck) {
                 fprintf(stderr, " Integrity check passed\n");

               } else {
                 fprintf(stderr," %8.2lf Mbps in %10.2lf usec\n", 
                         bwdata[n].bps, tlast*1.0e6);
               }
            }


        } /* End of perturbation loop */

    } /* End of main loop  */
 
   /* Free using original buffer addresses since we may have aligned
      r_buff and s_buff */

   if (!args.cache) {
        FreeBuff(args.s_buff_orig, args.r_buff_orig);
   }
    if (args.tr) fclose(out);
         
    CleanUp(&args);
    return 0;
}