/*
 * Max-pools one IWIDTH x IHEIGHT float image using a
 * NUM_MASK_ROWS x NUM_MASK_COLS window that advances by STRIDE in both
 * directions.
 *
 * in  : source image, IWIDTH*IHEIGHT floats (m_axi port, bundle pool_gmem).
 * out : destination, OWIDTH*OHEIGHT floats (m_axi port, bundle pool_gmem).
 *
 * The input is staged into a local buffer, the window maxima are collected in
 * a second local buffer in the order they are produced (row loop outermost,
 * column loop innermost), and that buffer is finally copied to `out`.
 */
void pool3_max_float(float *in, float *out) {
  #pragma HLS INTERFACE m_axi port=in offset=slave bundle=pool_gmem
  #pragma HLS INTERFACE m_axi port=out offset=slave bundle=pool_gmem
  #pragma HLS INTERFACE s_axilite port=in bundle=pool_control
  #pragma HLS INTERFACE s_axilite port=out bundle=pool_control
  #pragma HLS INTERFACE s_axilite port=return bundle=pool_control

  float inbuf[IWIDTH*IHEIGHT], obuf[OWIDTH*OHEIGHT];

  int n;                   /* linear copy index                        */
  int top, left;           /* upper-left corner of the current window  */
  int dy, dx;              /* offsets inside the window                */
  int cell;                /* linear position of the inspected element */
  int produced = 0;        /* number of maxima written to obuf so far  */
  float best;              /* running maximum of the current window    */

  /* Element-wise staging copy (used here in place of memcpy). */
  for (n = 0; n < IWIDTH*IHEIGHT; n++)
      inbuf[n] = in[n];

OUTER_MAX_DATA_LOOP:for (top = 0; top < IHEIGHT - NUM_MASK_ROWS + 1; top += STRIDE) {

    #pragma HLS PIPELINE
    #pragma HLS loop_tripcount min=26 max=26
    INNER_MAX_DATA_LOOP:for (left = 0; left < IWIDTH - NUM_MASK_COLS + 1 ; left += STRIDE) {
        #pragma HLS loop_tripcount min=26 max=26
        /* Start below every representable value so the first element wins. */
        best = -FLT_MAX;
        OUTER_MAX_MASK_LOOP:for (dy = 0; (dy < NUM_MASK_ROWS && top + dy < IHEIGHT); ++dy) {
            #pragma HLS loop_tripcount min=3 max=3
            INNER_MAX_MASK_LOOP:for (dx = 0; (dx < NUM_MASK_COLS && left + dx < IWIDTH); ++dx) {
                #pragma HLS loop_tripcount min=3 max=3
                cell = IDX2C(left + dx, top + dy, IWIDTH);
                if (inbuf[cell] > best)
                    best = inbuf[cell];
            }
        }
        obuf[produced] = best;
        produced++;
    }
  }

  /* Element-wise write-back of the pooled image. */
  for (n = 0; n < OWIDTH*OHEIGHT; n++)
      out[n] = obuf[n];
}
/*
 * Max-pools one tile of data, one pooling window at a time.
 *
 * ptr_data   : tile values, addressed as IDX2C(row, col, DATA_DIMS[0]).
 * DATA_DIMS  : tile extent; [0] bounds the row loops, [1] the column loops.
 * ptr_pool   : window extent; [0] is the row step, [1] the column step.
 * ptr_out    : receives the maximum of every window, in visiting order.
 * ptr_idx    : receives, per window, the linear index of that maximum
 *              plus 1 (indices are consumed by Matlab) plus tile_start.
 * tile_start : offset added to every stored index.
 *
 * Windows are visited with the column loop outermost and the row loop
 * innermost; a window that overhangs the tile edge is clipped to the tile.
 * With `debug` set, progress is traced on stderr and the function returns
 * early if a window yields no maximum.
 */
inline void compute_map_pooling(T *ptr_data, const mwSize *DATA_DIMS, T *ptr_pool,
    T *ptr_out, T *ptr_idx, int tile_start)
{
  int written = 0;

  for (int c0 = 0; c0 < DATA_DIMS[1]; c0 += ptr_pool[1]) {
      for (int r0 = 0; r0 < DATA_DIMS[0]; r0 += ptr_pool[0]) {
          if (debug)
            fprintf(stderr, "r = %i, c = %i \n", r0, c0);

          /* Start below every finite value so any element can become the max. */
          T best = -std::numeric_limits<T>::max();
          int best_at = -1;

          for (int dc = 0; (dc < ptr_pool[1] && c0 + dc < DATA_DIMS[1]); ++dc) {
              for (int dr = 0; (dr < ptr_pool[0] && r0 + dr < DATA_DIMS[0]); ++dr) {
                  if (debug) {
                      fprintf(stderr, "m = %f, data = %f \n", best, ptr_data[IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0])]);
                      fprintf(stderr, "rr = %i, cc = %i \n --> idx = %i \n", r0 + dr, c0 + dc, best_at);
                  }

                  /* Only a strictly larger element replaces the current maximum. */
                  if (!(ptr_data[IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0])] > best))
                      continue;

                  best_at = IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0]);
                  best = ptr_data[best_at];
              }
          }

          if (debug) {
              if (best_at == -1) {
                  fprintf(stderr, "dioschifoso\n");
                  return;
              }
              fprintf(stderr, "count = %i\n",written);
          }

          /* Stored indices are 1-based because Matlab consumes them. */
          ptr_idx[written] = best_at + 1 + tile_start;
          ptr_out[written] = best;
          written++;
      }
  }
}
Beispiel #3
0
/*
 * Finite-difference check of an analytic gradient stored on the device.
 *
 * For every element of d_mat (rows x cols, addressed as IDX2C(i,j,rows)) the
 * element is set to value+epsilon and value-epsilon, model->getError(true) is
 * evaluated at both points, and the central difference
 *     (E(value+eps) - E(value-eps)) / (2*eps)
 * is compared with the matching entry of d_grad. The absolute difference is
 * printed for every element; a detailed report follows when the absolute
 * difference exceeds 1/1000 or the relative difference
 *     |g - n| / (|g| + |n|)
 * exceeds 1/1000.
 *
 * epsilon    : perturbation size.
 * d_mat      : device pointer to the parameters being perturbed.
 * d_grad     : device pointer to the gradient under test.
 * rows, cols : extent of both matrices.
 */
void base_layer<dType>::check_gradient_GPU(dType epsilon,dType *d_mat,dType *d_grad,int rows,int cols) {
	cudaDeviceSynchronize();
	thrust::device_ptr<dType> d_thrust_mat = thrust::device_pointer_cast(d_mat);
	thrust::device_ptr<dType> d_thrust_grad = thrust::device_pointer_cast(d_grad);
	for(int i=0; i<rows; i++) {
		for(int j=0; j<cols; j++) {
			const int pos = IDX2C(i,j,rows);

			// Keep the exact value: adding +eps, -2*eps, +eps does not round-trip
			// in floating point, so the parameter is restored by assignment instead.
			const dType original = d_thrust_mat[pos];

			d_thrust_mat[pos] = original + epsilon;
			dType loss = model->getError(true);
			cudaDeviceSynchronize();

			d_thrust_mat[pos] = original - epsilon;
			loss -= model->getError(true);
			cudaDeviceSynchronize();

			d_thrust_mat[pos] = original;

			// Every access through a thrust::device_ptr is a device-to-host transfer,
			// so the gradient entry is fetched once and reused below.
			const dType analytic = d_thrust_grad[pos];
			const dType numeric = loss/(2*epsilon);
			const dType abs_diff = std::abs(analytic - numeric);
			// Both gradients being zero yields NaN here, which compares false below.
			const dType rel_diff = abs_diff/(std::abs(analytic) + std::abs(numeric));

			std::cout << "Gradient difference: " << abs_diff << "\n";
			if( abs_diff > 1/(dType)1000.0 || rel_diff > 1/1000.0 ) {
				std::cout << "Gradient for gradient check: " << numeric << "\n";
				std::cout << "My gradient: " << analytic << "\n";
				std::cout << "Gradient difference: " << abs_diff << "\n";
				std::cout << "Gradient difference (Equation 2): " << rel_diff << "\n\n";
			}
		}
	}
}
Beispiel #4
0
int main( int argc, char **argv )
{
	double *A, *B, *C;
	double *cu_A, *cu_B, *cu_C;

	cudaError_t    cuError;
	cublasStatus_t cuStatus;
	cublasHandle_t cuHandle;

	// seed rand()
	srand(time(NULL));

	// allocate memory on CPU
	A = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_P);
	B = (double*)malloc(sizeof(double)*MATRIX_P*MATRIX_M);
	C = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_M);

	if( !A || !B || !C )
	{
		perror("Can't allocate CPU matrices");
		exit(EXIT_FAILURE);
	}

	// generate matrices
	for( int i = 0; i < MATRIX_N*MATRIX_P; i++ )
		A[i] = 10.0*((double)rand())/RAND_MAX;

	for( int i = 0; i < MATRIX_P*MATRIX_M; i++ )
		B[i] = 10.0*((double)rand())/RAND_MAX;

	// allocate memory on GPU
	cuError = cudaMalloc( &cu_A, sizeof(double)*MATRIX_N*MATRIX_P );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	cuError = cudaMalloc( &cu_B, sizeof(double)*MATRIX_P*MATRIX_M );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	cuError = cudaMalloc( &cu_C, sizeof(double)*MATRIX_N*MATRIX_M );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	// setup cuBlas
	cuStatus = cublasCreate( &cuHandle );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error initializing cuBlas\n");
		exit(EXIT_FAILURE);
	}

	// setup matrices
	cuStatus = cublasSetMatrix( MATRIX_N, MATRIX_P, sizeof(double), A, MATRIX_N, cu_A, MATRIX_N );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring matrix A\n");
		exit(EXIT_FAILURE);
	}

	cuStatus = cublasSetMatrix( MATRIX_P, MATRIX_M, sizeof(double), B, MATRIX_P, cu_B, MATRIX_P );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring matrix B\n");
		exit(EXIT_FAILURE);
	}

	// multiply
	double one  = 1.0;
	double zero = 0.0;
	cuStatus = cublasDgemm( cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, MATRIX_N, MATRIX_M, MATRIX_P, &one, cu_A, MATRIX_N, cu_B, MATRIX_P, &zero, cu_C, MATRIX_N );

	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error executing matrix mult\n");
		exit(EXIT_FAILURE);
	}

	// get results
	cuStatus = cublasGetMatrix( MATRIX_N, MATRIX_M, sizeof(double), cu_C, MATRIX_N, C, MATRIX_N );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring results\n");
		exit(EXIT_FAILURE);
	}
	
	// check results
	bool good = true;
	for( int i = 0; i < MATRIX_N; i++ )
	{
		for( int j = 0; j < MATRIX_M; j++ )
		{
			double sum = 0.0;
			for( int k = 0; k < MATRIX_P; k++ )
			{
				sum += A[IDX2C(i, k, MATRIX_N)]*B[IDX2C(k, j, MATRIX_P)];
			}
			// check
			if( fabs(sum - C[IDX2C(i,j,MATRIX_N)]) > 0.00001 )
			{
				good = false;
				printf("(%i, %i) sum = %f\tcu_C = %f\tMISMATCH\n", i, j, sum, C[IDX2C(i,j,MATRIX_N)]);
			}
		}
	}

	if( good )
		printf("Results Match\n");
	else
		printf("Results DO NOT Match\n");

	// cleanup
	free( A ); free( B ); free( C );
	cudaFree( cu_A ); cudaFree( cu_B ); cudaFree( cu_C );
	cublasDestroy( cuHandle );

	return 0;
}
/*
 * Stochastic pooling over one tile: inside every pooling window one element is
 * picked by comparing a random threshold against the running sum of the
 * window's values.
 *
 * ptr_data   : tile values, addressed as IDX2C(row, col, DATA_DIMS[0]).
 * DATA_DIMS  : tile extent; [0] bounds the row loops, [1] the column loops.
 * ptr_pool   : window extent; [0] is the row step, [1] the column step.
 * ptr_out    : receives the picked value of every window, in visiting order.
 * ptr_idx    : receives, per window, the linear index of the picked element
 *              plus 1 (indices are consumed by Matlab) plus tile_start.
 * tile_start : offset added to every stored index.
 *
 * Per window: the values are summed, a threshold (rand()%1000) * sum/1000 is
 * drawn, and the window is walked again accumulating a running sum. The first
 * element whose running sum exceeds the threshold is picked; an element is
 * also picked as soon as the running sum is within 0.0001 of the total.
 * With `debug` set, progress is traced on stderr and the function returns
 * early if no element was picked.
 */
inline void compute_stochastic_pooling(T *ptr_data, const mwSize *DATA_DIMS, T *ptr_pool,
    T *ptr_out, T *ptr_idx, int tile_start)
{
  int written = 0;

  for (int c0 = 0; c0 < DATA_DIMS[1]; c0 += ptr_pool[1]) {
      for (int r0 = 0; r0 < DATA_DIMS[0]; r0 += ptr_pool[0]) {
          if (debug)
            fprintf(stderr, "r = %i, c = %i \n", r0, c0);

          T picked = 0;
          T total = 0;
          T running = 0;
          int picked_at = -1;

          /* First pass: total of the (clipped) window. */
          for (int dc = 0; (dc < ptr_pool[1] && c0 + dc < DATA_DIMS[1]); ++dc) {
              for (int dr = 0; (dr < ptr_pool[0] && r0 + dr < DATA_DIMS[0]); ++dr) {
                  total += ptr_data[IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0])];
              }
          }

          /* Random threshold, scaled by the window total. */
          float threshold = rand()%1000;
          threshold *= total/1000;

          /* Second pass: stop at the first element that satisfies either rule. */
          for (int dc = 0; picked_at == -1 && (dc < ptr_pool[1] && c0 + dc < DATA_DIMS[1]); ++dc) {
              for (int dr = 0; (dr < ptr_pool[0] && r0 + dr < DATA_DIMS[0]); ++dr) {
                  if (debug) {
                      fprintf(stderr, "num = %f, rsum = %f, this = %f \n", threshold, running, ptr_data[IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0])]);
                  }
                  running += ptr_data[IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0])];

                  /* Threshold crossed, or (almost) the whole total consumed. */
                  if (threshold<running || total-running<0.0001) {
                      picked_at = IDX2C(r0 + dr, c0 + dc, DATA_DIMS[0]);
                      picked = ptr_data[picked_at];
                      break;
                  }
              }
          }

          if (debug) {
              if (picked_at == -1) {
                  fprintf(stderr, "dioschifoso\n");
                  return;
              }
              fprintf(stderr, "count = %i\n",written);
          }

          /* Stored indices are 1-based because Matlab consumes them. */
          ptr_idx[written] = picked_at + 1 + tile_start;
          ptr_out[written] = picked;
          written++;
      }
  }
}
Beispiel #6
0
int main ( void ){
	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context
	int i,j; // i-row index , j-col. index
	double * a; // mxm matrix a on the host
	double * b; // mxn matrix b on the host
	a=( double *) malloc (m*m* sizeof ( double )); // host memory for a

	b=( double *) malloc (m*n* sizeof ( double )); // host memory for b

	int ind =11; // a:
	for(j=0;j<m;j ++){ // 11
		for(i=0;i<m;i ++){ // 12 ,17
			if(i >=j){ // 13 ,18 ,22
				a[ IDX2C(i,j,m)]=( double )ind ++; // 14 ,19 ,23 ,26
			} // 15 ,20 ,24 ,27 ,29
		} // 16 ,21 ,25 ,28 ,30 ,31
	}
	 printf (" lower triangle of a:\n");
/*	for (i=0;i<m;i ++){
		for (j=0;j<m;j ++){
			if(i >=j)
				printf (" %5.0f",a[ IDX2C(i,j,m)]);
		}
		printf ("\n");
	} */

	ind =11; // b:
	for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35
		for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36
			if(i == j) b[IDX2C(i,i,m)] =  1.0;
			else b[IDX2C(i,j,m)] = 0.0;
		}
	}
		
		//	b[ IDX2C(i,j,m)] = ind++;
			/*if(i == j)
			b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37
			else 
			b[ IDX2C(i, j, m)] = 0.0;*/
			//ind ++; // 14 ,20 ,26 ,32 ,38
	 printf ("b:\n");
/*	for (i=0;i<m;i ++){
		for (j=0;j<n;j ++){
			printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row
		}
		printf ("\n");
	} */

	double * d_a; // d_a - a on the device
	double * d_b; // d_b - b on the device
	cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device
	// memory alloc for a
	cudaStat = cudaMalloc (( void **)& d_b ,m*n* sizeof (*b)); // device
	// // memory alloc for b
	stat = cublasCreate (& handle ); // initialize CUBLAS context

	stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a
	stat = cublasSetMatrix (m,n, sizeof (*b) ,b,m,d_b ,m); //b -> d_b
	double al =1.0f;

	double startime = CycleTimer::currentSeconds();
	(cublasDtrsm(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_LOWER,
			CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&al,d_a,m,d_b,m));
	stat = cublasGetMatrix (m,n, sizeof (*b) ,d_b ,m,b,m); // d_b -> b
	double endtime = CycleTimer::currentSeconds();
	printf (" solution x from Strsm :\n");
/*	for(i=0;i<m;i ++){
		for(j=0;j<n;j ++){
			printf (" %11.5f",b[IDX2C(i,j,m )]); // print b after Strsm
		}
		printf ("\n");
	} */
	cudaFree (d_a ); // free device memory
	cudaFree (d_b ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context
	free (a); // free host memory
	free (b); // free host memory

	printf("Time taken: %lf\n", endtime - startime);
	return EXIT_SUCCESS ;
}
int main(){

	int m = 10;
	int n = m;
	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context
	int i,j; // i-row index , j-col. index
	double * a; // mxm matrix a on the host
	double * b; // mxm matrix b on the host
	double * c; // mxm matrix c on the host
	a=( double *) malloc (m*m* sizeof ( double )); // host memory for a

	b=( double *) malloc (m*m* sizeof ( double )); // host memory for b
	c=( double *) malloc (m*m* sizeof ( double )); // host memory for b

	int ind =1; // a:
	for(j=0;j<m;j ++){ // 11
		for(i=0;i<m;i ++){ // 12 ,17
	//		if(i >=j){ // 13 ,18 ,22
				a[ IDX2C(i,j,m)]=( float )ind ++; // 14 ,19 ,23 ,26
	//		} // 15 ,20 ,24 ,27 ,29
		} // 16 ,21 ,25 ,28 ,30 ,31
	}
	 printf (" lower triangle of a:\n");
	for (i=0;i<m;i ++){
		for (j=0;j<m;j ++){
	//		if(i >=j)
				printf (" %5.0f",a[ IDX2C(i,j,m)]);
		}
		printf ("\n");
	} 

	ind =11; // b:
	for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35
		for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36
			if(i == j)
			b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37
			else 
			b[ IDX2C(i, j, m)] = 2.0;
			//ind ++; // 14 ,20 ,26 ,32 ,38
		} // 15 ,21 ,27 ,33 ,39
	} // 16 ,22 ,28 ,34 ,40
	 printf ("b:\n");
	for (i=0;i<m;i ++){
		for (j=0;j<n;j ++){
			printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row
		}
		printf ("\n");
	} 

	double * d_a; // d_a - a on the device
	double * d_b; // d_b - b on the device
	double * d_c; // d_c - c on the devicde
	cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device memory alloc for a
	cudaStat = cudaMalloc (( void **)& d_b ,m*m* sizeof (*b)); // device memory alloc for b
	cudaStat = cudaMalloc (( void **)& d_c ,m*m* sizeof (*c)); // device memory alloc for c

	stat = cublasCreate (& handle ); // initialize CUBLAS context

	stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a
	stat = cublasSetMatrix (m,m, sizeof (*b) ,b,m,d_b ,m); //b -> d_b

	double startime = CycleTimer::currentSeconds();
	gpu_blas_mmul(d_a, d_b, d_c, m, m, m);
	double endtime = CycleTimer::currentSeconds();
	
	stat = cublasGetMatrix (m,n, sizeof (*c) ,d_c ,m,c,m); // d_b -> b
	 printf (" solution x from Strsm :\n");
	for(i=0;i<m;i ++){
		for(j=0;j<n;j ++){
			printf (" %11.5f",c[IDX2C(i,j,m )]); // print b after Strsm
		}
		printf ("\n");
	} 
	cudaFree (d_a ); // free device memory
	cudaFree (d_b ); // free device memory
	cudaFree (d_c ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context
	free (a); // free host memory
	free (b); // free host memory
	free (c); // free host memory

	printf("Time taken: %lf\n", endtime - startime);
	return EXIT_SUCCESS ;
}
int validateResults(double *luMatrix, double *origMatrix, int matrixSize) {
  int count = 0;

  // Multiply lower triangle with upper triangle
//  double *result = new double[matrixSize *matrixSize];

//  std::cout<< "orig matrix:" << std::endl;

  double *lMatrix = new double[matrixSize * matrixSize];
  double *uMatrix = new double[matrixSize * matrixSize];

  for (int c = 0; c < matrixSize; c++)
  {
    for (int r = 0; r < matrixSize; r++)
    {
      // below diag
      if (r > c)
      {
        lMatrix[IDX2C(r, c, matrixSize)] = luMatrix[IDX2C(r, c, matrixSize)];
        uMatrix[IDX2C(r, c, matrixSize)] = 0.0;
      }
        // above diag
      else if (c > r)
      {
        lMatrix[IDX2C(r, c, matrixSize)] = 0.0;
        uMatrix[IDX2C(r, c, matrixSize)] = luMatrix[IDX2C(r, c, matrixSize)];
      }
        // on diag
      else if (r == c)
      {
        lMatrix[IDX2C(r, c, matrixSize)] = 1.0;
        uMatrix[IDX2C(r, c, matrixSize)] = luMatrix[IDX2C(r, c, matrixSize)];
      }
    }
  }

  double *result = new double[matrixSize*matrixSize];

  mkl_set_num_threads(40);

  mkl_blas_
//  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, matrixSize, matrixSize, matrixSize, 1.0, lMatrix, matrixSize, uMatrix, matrixSize, 0.0, result, matrixSize);

  for (int r = 0; r < matrixSize; r++)
  {
    for (int c = 0; c < matrixSize; c++)
    {

      double difference = fabs(result[IDX2C(r, c, matrixSize)] - origMatrix[IDX2C(r, c, matrixSize)]);
      if (difference > 1.0e-8)
      {
        count++;
        if (count < 20)
        {
          std::cout << "Incorrect value: " << result[IDX2C(r, c, matrixSize)] << " != " << origMatrix[IDX2C(r, c, matrixSize)] << " difference: " << difference << std::endl;
        }
      }
    }
  }

  if (count > 0)
    std::cout << "Total incorrect = " << count << std::endl;

  if (count > 0)
    return 1;
  return 0;
}
/*
 * Benchmark driver for LU factorization: runs either runSequentialLU or an
 * HTGS task graph over a diagonally dominant matrix, prints timing and
 * GFLOPS, appends the same figures to a runtime file, and optionally
 * validates the factorization.
 *
 * Options (see --help): --size, --block-size, --num-retry, --runtime-file,
 * --validate-results, --run-sequential, --num-threads-factor-l,
 * --num-threads-factor-u, --num-threads-gaus, --num-threads-gemm,
 * --num-threads-blas.
 *
 * Every option that takes a value is only honoured when a value actually
 * follows it on the command line; otherwise the default is kept. The four
 * --num-threads-{factor-l,factor-u,gaus,gemm} options are also accepted
 * without the leading "--", which is the only spelling earlier versions
 * recognised.
 */
int main(int argc, char *argv[]) {
  long matrixSize= 16384;
  int blockSize = 128;
  bool runSequential = false;
  bool validate = false;

  int numBlasThreads = 40;

  int numGausElimThreads = 2;
  int numFactorLowerThreads = 4;
  int numFactorUpperThreads = 4;
  int numMatrixMulThreads = 30;

  std::string runtimeFileStr("runtimes");

  int numRetry = 1;

  for (int arg = 1; arg < argc; arg++) {
    std::string argvs(argv[arg]);

    // True when another argument follows that can serve as this option's value.
    const bool hasValue = arg + 1 < argc;

    if (argvs == "--size" && hasValue) {
      arg++;
      matrixSize = atoi(argv[arg]);
    }

    if (argvs == "--num-threads-blas" && hasValue) {
      arg++;
      numBlasThreads = atoi(argv[arg]);
    }

    if ((argvs == "--num-threads-factor-l" || argvs == "num-threads-factor-l") && hasValue) {
      arg++;
      numFactorLowerThreads = atoi(argv[arg]);
    }

    if ((argvs == "--num-threads-factor-u" || argvs == "num-threads-factor-u") && hasValue) {
      arg++;
      numFactorUpperThreads = atoi(argv[arg]);
    }

    if ((argvs == "--num-threads-gaus" || argvs == "num-threads-gaus") && hasValue) {
      arg++;
      numGausElimThreads = atoi(argv[arg]);
    }

    if ((argvs == "--num-threads-gemm" || argvs == "num-threads-gemm") && hasValue) {
      arg++;
      numMatrixMulThreads = atoi(argv[arg]);
    }

    if (argvs == "--run-sequential") {
      runSequential = true;
    }

    if (argvs == "--num-retry" && hasValue) {
      arg++;
      numRetry = atoi(argv[arg]);
    }

    if (argvs == "--block-size" && hasValue) {
      arg++;
      blockSize = atoi(argv[arg]);
    }

    if (argvs == "--runtime-file" && hasValue) {
      runtimeFileStr = argv[arg + 1];
      arg++;
    }

    if (argvs == "--validate-results") {
      validate = true;
    }

    if (argvs == "--help") {
      std::cout << argv[0]
                << " args: [--size <#>] [--block-size <#>] [--num-retry <#>] [--runtime-file <filename>] [--validate-results] [--run-sequential] [--num-threads-factor-l <#>] [--num-threads-factor-u <#>] [--num-threads-gaus <#>] [--num-threads-gemm <#>] [--num-threads-blas <#>] [--help]"
                << std::endl;
      exit(0);

    }
  }

  std::ofstream runtimeFile(runtimeFileStr, std::ios::app);
  double *matrix = new double[matrixSize * matrixSize];
  double *matrixTest = nullptr;

  // TODO: Ensure diagonally dominant
  initMatrixDiagDom(matrix, matrixSize, matrixSize, true);

  if (validate) {
    // Keep an untouched copy of the input for validateResults.
    matrixTest = new double[matrixSize * matrixSize];
    // long index: matrixSize * matrixSize can exceed the range of int.
    for (long i = 0; i < matrixSize * matrixSize; i++)
      matrixTest[i] = matrix[i];
  }

  for (int numTry = 0; numTry < numRetry; numTry++) {
    SimpleClock clk;
    SimpleClock endToEnd;

    if (runSequential) {
      endToEnd.start();
      mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL);

      clk.start();
      runSequentialLU(matrix, matrixSize);
      clk.stopAndIncrement();
      endToEnd.stopAndIncrement();
    }
    else {
      endToEnd.start();
      mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL);

      // Integer division: only whole blocks are scheduled.
      int gridHeight = (int) matrixSize / blockSize;
      int gridWidth = (int) matrixSize / blockSize;

      // TODO: Build graph and runtime
      htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>> *matrixBlocks = new htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>>(gridHeight, gridWidth, nullptr);

      for (int r = 0; r < gridHeight; r++)
      {
        for (int c = 0; c < gridWidth; c++)
        {
          // Store pointer locations for all blocks
          double *ptr = &matrix[IDX2C(r * blockSize, c *blockSize, matrixSize)];

          std::shared_ptr<MatrixRequestData> request(new MatrixRequestData(r, c, MatrixType::MatrixA));
          std::shared_ptr<MatrixBlockData<double *>> data(new MatrixBlockData<double *>(request, ptr, blockSize, blockSize));

          matrixBlocks->set(r, c, data);
        }
      }

      GausElimTask *gausElimTask = new GausElimTask(numGausElimThreads, matrixSize, matrixSize);

      auto gausElimBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      GausElimRuleUpper *gausElimRuleUpper = new GausElimRuleUpper(matrixBlocks, gridHeight, gridWidth);
      GausElimRuleLower *gausElimRuleLower = new GausElimRuleLower(matrixBlocks, gridHeight, gridWidth);

      FactorUpperTask *factorUpperTask = new FactorUpperTask(numFactorUpperThreads, matrixSize, matrixSize);
      FactorLowerTask *factorLowerTask = new FactorLowerTask(numFactorLowerThreads, matrixSize, matrixSize);

      auto matrixMulBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();
      MatrixMulRule *matrixMulRule = new MatrixMulRule(matrixBlocks, gridHeight, gridWidth);

      MatrixMulBlkTask *matrixMulTask = new MatrixMulBlkTask(numMatrixMulThreads, matrixSize, matrixSize, matrixSize, matrixSize, blockSize);


      auto matrixMulResultBk = new htgs::Bookkeeper<MatrixBlockData<double *>>();

      int numDiagonals = gridWidth - 1;
      GausElimRule *gausElimRule = new GausElimRule(numDiagonals, gridHeight, gridWidth);

      // Number of updates excluding the diagonal and the top/left row/column
      int numUpdates = (1.0/6.0) * (double)gridWidth * (2.0 * ((double)gridWidth * (double)gridWidth) - 3.0 * (double)gridWidth + 1.0);

      UpdateRule *updateRule = new UpdateRule(numUpdates);
      UpdateRule *updateRule2 = new UpdateRule(numUpdates);

      auto taskGraph = new htgs::TaskGraph<MatrixBlockData<double *>, htgs::VoidData>();
      taskGraph->addGraphInputConsumer(gausElimTask);

      taskGraph->addEdge(gausElimTask, gausElimBk);
      taskGraph->addRule(gausElimBk, factorUpperTask, gausElimRuleUpper);
      taskGraph->addRule(gausElimBk, factorLowerTask, gausElimRuleLower);

      taskGraph->addEdge(factorUpperTask, matrixMulBk);
      taskGraph->addEdge(factorLowerTask, matrixMulBk);

      taskGraph->addRule(matrixMulBk, matrixMulTask, matrixMulRule);
      taskGraph->addEdge(matrixMulTask, matrixMulResultBk);

      if (numDiagonals > 0)
        taskGraph->addRule(matrixMulResultBk, gausElimTask, gausElimRule);

      if (numUpdates > 0)
        taskGraph->addRule(matrixMulResultBk, matrixMulBk, updateRule);

      if (numUpdates > 0)
        taskGraph->addRule(matrixMulResultBk, gausElimBk, updateRule2);

      taskGraph->incrementGraphInputProducer();

      taskGraph->writeDotToFile("lud-graph.dot");

      htgs::Runtime *runtime = new htgs::Runtime(taskGraph);

      clk.start();

      runtime->executeRuntime();

      // The top-left block is the only input fed into the graph.
      taskGraph->produceData(matrixBlocks->get(0, 0));
      taskGraph->finishedProducingData();

      runtime->waitForRuntime();

      clk.stopAndIncrement();


      delete runtime;
      endToEnd.stopAndIncrement();
    }

    // Multiply in double so the cube of matrixSize cannot overflow an integer type.
    double operations = (2.0 * ((double) matrixSize * (double) matrixSize * (double) matrixSize)) / 3.0;
    double flops = operations / clk.getAverageTime(TimeVal::SEC);
    double gflops = flops / 1073741824.0;

    std::cout << (runSequential ? "sequential" : "htgs")
              << ", matrix-size: " << matrixSize
              << ", " << "blockSize: " << (runSequential ? 0 : blockSize)
              << ", blasThreads: " << numBlasThreads
              << ", gausThreads: " << numGausElimThreads
              << ", factorUpperThreads: " << numFactorUpperThreads
              << ", factorLowerThreads: " << numFactorLowerThreads
              << ", gemmThreads: " << numMatrixMulThreads
              << ", time:" << clk.getAverageTime(TimeVal::MILLI)
              << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI)
              << ", gflops: " << gflops
        << std::endl;

    runtimeFile << (runSequential ? "sequential" : "htgs")
                << ", " << matrixSize
                << ", " << blockSize
                << ", " << numBlasThreads
                << ", " << numGausElimThreads
                << ", " << numFactorUpperThreads
                << ", " << numFactorLowerThreads
                << ", " << numMatrixMulThreads
                << ", " << clk.getAverageTime(TimeVal::MILLI)
                << ", " << endToEnd.getAverageTime(TimeVal::MILLI)
                << ", " << gflops
                << std::endl;



    if (validate)
    {
      int res = validateResults(matrix, matrixTest, matrixSize);
      std::cout << (res == 0 ? "PASSED" : "FAILED") << std::endl;
    }


  }

  delete[] matrix;
  delete[] matrixTest;

  return 0;
}
/*
 * Merges the per-model predictions into one ensemble prediction.
 *
 * 1. Every entry of outputdist becomes the mean of the matching entries of
 *    models[k].outputdist over all models.
 * 2. `normalization` is reset, accumulates every row of outputdist, and each
 *    row of outputdist is then divided element-wise by it.
 * 3. When BZ_CUDA::unk_replacement is set, the per-model
 *    viterbi_alignments_scores are summed into BZ_CUDA::alignment_scores, and
 *    for every beam entry the position with the largest summed score is stored
 *    in BZ_CUDA::viterbi_alignments. The entry stays -1 when no score is
 *    greater than zero.
 */
void ensemble_factory<dType>::ensembles_models() {

	int num_models = models.size();

	// Step 1: element-wise mean across the models.
	for(int r=0; r<outputdist.rows(); r++) {
		for(int c=0; c< outputdist.cols(); c++) {
			double acc = 0;
			for(int mdl=0; mdl<models.size(); mdl++) {
				acc+=models[mdl].outputdist(r,c);
			}
			outputdist(r,c) = acc/num_models;
		}
	}

	// Step 2: renormalize using the sum of all rows.
	normalization.setZero();

	for(int r=0; r<outputdist.rows(); r++) {
		normalization+=outputdist.row(r);
	}
	for(int r=0; r<outputdist.rows(); r++) {
		outputdist.row(r) = (outputdist.row(r).array()/normalization.array()).matrix();
	}

	// Step 3 only applies when unknown-word replacement is enabled.
	if(!BZ_CUDA::unk_replacement) {
		return;
	}

	// Sum the alignment scores of all models, position by position.
	for(int pos=0; pos<models[0].longest_sent;pos++) {
		for(int beam=0; beam<models[0].beam_size; beam++) {
			dType score_sum = 0;
			for(int mdl=0; mdl<models.size(); mdl++) {
				score_sum+=models[mdl].viterbi_alignments_scores[IDX2C(pos,beam,models[0].longest_sent)];
			}
			BZ_CUDA::alignment_scores[IDX2C(pos,beam,models[0].longest_sent)] = score_sum;
		}
	}

	// Per beam entry, keep the position with the strictly largest summed score.
	for(int beam=0; beam<models[0].beam_size; beam++) {
		dType best_score = 0;
		int best_pos = -1;
		for(int pos=0; pos<models[0].longest_sent; pos++) {
			dType candidate = BZ_CUDA::alignment_scores[IDX2C(pos,beam,models[0].longest_sent)];
			if(!(candidate > best_score)) {
				continue;
			}
			best_score = candidate;
			best_pos = pos;
		}
		BZ_CUDA::viterbi_alignments[beam] = best_pos;
	}
}