void pool3_max_float(float *in, float *out) { #pragma HLS INTERFACE m_axi port=in offset=slave bundle=pool_gmem #pragma HLS INTERFACE m_axi port=out offset=slave bundle=pool_gmem #pragma HLS INTERFACE s_axilite port=in bundle=pool_control #pragma HLS INTERFACE s_axilite port=out bundle=pool_control #pragma HLS INTERFACE s_axilite port=return bundle=pool_control float inbuf[IWIDTH*IHEIGHT], obuf[OWIDTH*OHEIGHT]; //memcpy(inbuf, (float *)in, IWIDTH*IHEIGHT); int i; for (i=0; i<IWIDTH*IHEIGHT; i++) inbuf[i]=in[i]; float m; int idx; int count = 0; int col, row, pcol, prow; OUTER_MAX_DATA_LOOP:for (row = 0; row < IHEIGHT - NUM_MASK_ROWS + 1; row += STRIDE) { #pragma HLS PIPELINE #pragma HLS loop_tripcount min=26 max=26 INNER_MAX_DATA_LOOP:for (col = 0; col < IWIDTH - NUM_MASK_COLS + 1 ; col += STRIDE) { #pragma HLS loop_tripcount min=26 max=26 m = -FLT_MAX; idx = -1; OUTER_MAX_MASK_LOOP:for (prow = 0; (prow < NUM_MASK_ROWS && row + prow < IHEIGHT); ++prow) { #pragma HLS loop_tripcount min=3 max=3 INNER_MAX_MASK_LOOP:for (pcol = 0; (pcol < NUM_MASK_COLS && col + pcol < IWIDTH); ++pcol) { #pragma HLS loop_tripcount min=3 max=3 //printf("INBUF: %f\n", inbuf[IDX2C(row+prow, col+pcol, IHEIGHT)]); if (inbuf[IDX2C(col + pcol, row + prow, IWIDTH)] > m) { idx = IDX2C(col + pcol, row + prow, IWIDTH); m = inbuf[idx]; } } } obuf[count] = m; count++; } } //memcpy((float*)out, obuf, OWIDTH*OHEIGHT); for (i=0; i<OWIDTH*OHEIGHT; i++) out[i]=obuf[i]; }
inline void compute_map_pooling(T *ptr_data, const mwSize *DATA_DIMS, T *ptr_pool, T *ptr_out, T *ptr_idx, int tile_start) { T m; int idx; int count = 0; for (int col = 0; col < DATA_DIMS[1]; col += ptr_pool[1]) { for (int row = 0; row < DATA_DIMS[0]; row += ptr_pool[0]) { if (debug) fprintf(stderr, "r = %i, c = %i \n", row, col); m = -std::numeric_limits<T>::max(); idx = -1; for (int pcol = 0; (pcol < ptr_pool[1] && col + pcol < DATA_DIMS[1]); ++pcol) { for (int prow = 0; (prow < ptr_pool[0] && row + prow < DATA_DIMS[0]); ++prow) { if (debug) { fprintf(stderr, "m = %f, data = %f \n", m, ptr_data[IDX2C(row + prow, col + pcol, DATA_DIMS[0])]); fprintf(stderr, "rr = %i, cc = %i \n --> idx = %i \n", row + prow, col + pcol, idx); } if (ptr_data[IDX2C(row + prow, col + pcol, DATA_DIMS[0])] > m) { idx = IDX2C(row + prow, col + pcol, DATA_DIMS[0]); m = ptr_data[idx]; } } } if (debug && idx == -1) { fprintf(stderr, "dioschifoso\n"); return; } if (debug) fprintf(stderr, "count = %i\n",count); /* idxs are to be used in Matlab and hence a +1 is needed */ ptr_idx[count] = idx + 1 + tile_start; ptr_out[count] = m; count++; } } }
/*
 * Finite-difference gradient check for a matrix that lives on the device.
 *
 * For every element (i, j) of the rows x cols matrix d_mat (addressed with
 * IDX2C(i, j, rows)) the element is shifted by +epsilon and -epsilon, the
 * loss is evaluated through model->getError(true) at both points, and the
 * element is put back.  The central difference
 *     (loss(+eps) - loss(-eps)) / (2 * epsilon)
 * is compared with the matching entry of d_grad.
 *
 * The absolute difference is printed for every element.  A detailed report
 * is printed in addition when the absolute difference exceeds 1/1000 or the
 * relative difference |a - n| / (|a| + |n|) exceeds 1/1000.
 *
 * epsilon - perturbation size
 * d_mat   - device pointer to the parameters being checked (modified
 *           temporarily, restored by adding the perturbations back)
 * d_grad  - device pointer to the gradient to verify
 * rows    - number of rows (also the leading dimension)
 * cols    - number of columns
 */
void base_layer<dType>::check_gradient_GPU(dType epsilon,dType *d_mat,dType *d_grad,int rows,int cols)
{
    cudaDeviceSynchronize();

    thrust::device_ptr<dType> params = thrust::device_pointer_cast(d_mat);
    thrust::device_ptr<dType> grads = thrust::device_pointer_cast(d_grad);

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            const int pos = IDX2C(i,j,rows);

            /* loss(+eps) */
            params[pos] += epsilon;
            dType loss = model->getError(true);
            cudaDeviceSynchronize();

            /* loss(+eps) - loss(-eps) */
            params[pos] += -2*epsilon;
            loss -= model->getError(true);
            cudaDeviceSynchronize();

            /* undo the perturbation */
            params[pos] += epsilon;

            const dType numeric = loss/(2*epsilon);
            const dType analytic = grads[pos];
            const dType abs_diff = std::abs(analytic - numeric);
            const dType rel_diff = abs_diff/(std::abs(analytic) + std::abs(numeric));

            std::cout << "Gradient difference: " << abs_diff << "\n";

            const bool abs_too_large = abs_diff > 1/(dType)1000.0;
            const bool rel_too_large = rel_diff > 1/1000.0;
            if (abs_too_large || rel_too_large) {
                std::cout << "Gradient for gradient check: " << numeric << "\n";
                std::cout << "My gradient: " << analytic << "\n";
                std::cout << "Gradient difference: " << abs_diff << "\n";
                std::cout << "Gradient difference (Equation 2): " << rel_diff << "\n\n";
            }
        }
    }
}
int main( int argc, char **argv ) { double *A, *B, *C; double *cu_A, *cu_B, *cu_C; cudaError_t cuError; cublasStatus_t cuStatus; cublasHandle_t cuHandle; // seed rand() srand(time(NULL)); // allocate memory on CPU A = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_P); B = (double*)malloc(sizeof(double)*MATRIX_P*MATRIX_M); C = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_M); if( !A || !B || !C ) { perror("Can't allocate CPU matrices"); exit(EXIT_FAILURE); } // generate matrices for( int i = 0; i < MATRIX_N*MATRIX_P; i++ ) A[i] = 10.0*((double)rand())/RAND_MAX; for( int i = 0; i < MATRIX_P*MATRIX_M; i++ ) B[i] = 10.0*((double)rand())/RAND_MAX; // allocate memory on GPU cuError = cudaMalloc( &cu_A, sizeof(double)*MATRIX_N*MATRIX_P ); if( cuError != cudaSuccess ) { fprintf(stderr, "Can't allocate GPU matrices\n"); exit(EXIT_FAILURE); } cuError = cudaMalloc( &cu_B, sizeof(double)*MATRIX_P*MATRIX_M ); if( cuError != cudaSuccess ) { fprintf(stderr, "Can't allocate GPU matrices\n"); exit(EXIT_FAILURE); } cuError = cudaMalloc( &cu_C, sizeof(double)*MATRIX_N*MATRIX_M ); if( cuError != cudaSuccess ) { fprintf(stderr, "Can't allocate GPU matrices\n"); exit(EXIT_FAILURE); } // setup cuBlas cuStatus = cublasCreate( &cuHandle ); if( cuStatus != CUBLAS_STATUS_SUCCESS ) { fprintf(stderr, "Error initializing cuBlas\n"); exit(EXIT_FAILURE); } // setup matrices cuStatus = cublasSetMatrix( MATRIX_N, MATRIX_P, sizeof(double), A, MATRIX_N, cu_A, MATRIX_N ); if( cuStatus != CUBLAS_STATUS_SUCCESS ) { fprintf(stderr, "Error transferring matrix A\n"); exit(EXIT_FAILURE); } cuStatus = cublasSetMatrix( MATRIX_P, MATRIX_M, sizeof(double), B, MATRIX_P, cu_B, MATRIX_P ); if( cuStatus != CUBLAS_STATUS_SUCCESS ) { fprintf(stderr, "Error transferring matrix B\n"); exit(EXIT_FAILURE); } // multiply double one = 1.0; double zero = 0.0; cuStatus = cublasDgemm( cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, MATRIX_N, MATRIX_M, MATRIX_P, &one, cu_A, MATRIX_N, cu_B, MATRIX_P, &zero, cu_C, MATRIX_N ); if( cuStatus 
!= CUBLAS_STATUS_SUCCESS ) { fprintf(stderr, "Error executing matrix mult\n"); exit(EXIT_FAILURE); } // get results cuStatus = cublasGetMatrix( MATRIX_N, MATRIX_M, sizeof(double), cu_C, MATRIX_N, C, MATRIX_N ); if( cuStatus != CUBLAS_STATUS_SUCCESS ) { fprintf(stderr, "Error transferring results\n"); exit(EXIT_FAILURE); } // check results bool good = true; for( int i = 0; i < MATRIX_N; i++ ) { for( int j = 0; j < MATRIX_M; j++ ) { double sum = 0.0; for( int k = 0; k < MATRIX_P; k++ ) { sum += A[IDX2C(i, k, MATRIX_N)]*B[IDX2C(k, j, MATRIX_P)]; } // check if( fabs(sum - C[IDX2C(i,j,MATRIX_N)]) > 0.00001 ) { good = false; printf("(%i, %i) sum = %f\tcu_C = %f\tMISMATCH\n", i, j, sum, C[IDX2C(i,j,MATRIX_N)]); } } } if( good ) printf("Results Match\n"); else printf("Results DO NOT Match\n"); // cleanup free( A ); free( B ); free( C ); cudaFree( cu_A ); cudaFree( cu_B ); cudaFree( cu_C ); cublasDestroy( cuHandle ); return 0; }
inline void compute_stochastic_pooling(T *ptr_data, const mwSize *DATA_DIMS, T *ptr_pool, T *ptr_out, T *ptr_idx, int tile_start) { T m; T sum,rsum; int idx; int count = 0; for (int col = 0; col < DATA_DIMS[1]; col += ptr_pool[1]) { for (int row = 0; row < DATA_DIMS[0]; row += ptr_pool[0]) { if (debug) fprintf(stderr, "r = %i, c = %i \n", row, col); m = 0; sum = 0; rsum = 0; idx = -1; for (int pcol = 0; (pcol < ptr_pool[1] && col + pcol < DATA_DIMS[1]); ++pcol) { for (int prow = 0; (prow < ptr_pool[0] && row + prow < DATA_DIMS[0]); ++prow) { sum += ptr_data[IDX2C(row + prow, col + pcol, DATA_DIMS[0])]; } } float num = rand()%1000; num *= sum/1000; for (int pcol = 0; (pcol < ptr_pool[1] && col + pcol < DATA_DIMS[1]); ++pcol) { for (int prow = 0; (prow < ptr_pool[0] && row + prow < DATA_DIMS[0]); ++prow) { if (debug) { fprintf(stderr, "num = %f, rsum = %f, this = %f \n", num, rsum, ptr_data[IDX2C(row + prow, col + pcol, DATA_DIMS[0])]); } rsum += ptr_data[IDX2C(row + prow, col + pcol, DATA_DIMS[0])]; if(num<rsum) { idx = IDX2C(row + prow, col + pcol, DATA_DIMS[0]); m = ptr_data[idx]; break; } /*if((pcol == ptr_pool[1] - 1 || col + pcol == DATA_DIMS[1] - 1) && (prow == ptr_pool[0]-1 || row + prow == DATA_DIMS[0] -1))*/ if(sum-rsum<0.0001) { idx = IDX2C(row + prow, col + pcol, DATA_DIMS[0]); m = ptr_data[idx]; break; } } if(idx != -1) break; } if (debug && idx == -1) { fprintf(stderr, "dioschifoso\n"); return; } if (debug) fprintf(stderr, "count = %i\n",count); /* idxs are to be used in Matlab and hence a +1 is needed */ ptr_idx[count] = idx + 1 + tile_start; ptr_out[count] = m; count++; } } }
int main ( void ){ cudaError_t cudaStat ; // cudaMalloc status cublasStatus_t stat ; // CUBLAS functions status cublasHandle_t handle ; // CUBLAS context int i,j; // i-row index , j-col. index double * a; // mxm matrix a on the host double * b; // mxn matrix b on the host a=( double *) malloc (m*m* sizeof ( double )); // host memory for a b=( double *) malloc (m*n* sizeof ( double )); // host memory for b int ind =11; // a: for(j=0;j<m;j ++){ // 11 for(i=0;i<m;i ++){ // 12 ,17 if(i >=j){ // 13 ,18 ,22 a[ IDX2C(i,j,m)]=( double )ind ++; // 14 ,19 ,23 ,26 } // 15 ,20 ,24 ,27 ,29 } // 16 ,21 ,25 ,28 ,30 ,31 } printf (" lower triangle of a:\n"); /* for (i=0;i<m;i ++){ for (j=0;j<m;j ++){ if(i >=j) printf (" %5.0f",a[ IDX2C(i,j,m)]); } printf ("\n"); } */ ind =11; // b: for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35 for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36 if(i == j) b[IDX2C(i,i,m)] = 1.0; else b[IDX2C(i,j,m)] = 0.0; } } // b[ IDX2C(i,j,m)] = ind++; /*if(i == j) b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37 else b[ IDX2C(i, j, m)] = 0.0;*/ //ind ++; // 14 ,20 ,26 ,32 ,38 printf ("b:\n"); /* for (i=0;i<m;i ++){ for (j=0;j<n;j ++){ printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row } printf ("\n"); } */ double * d_a; // d_a - a on the device double * d_b; // d_b - b on the device cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device // memory alloc for a cudaStat = cudaMalloc (( void **)& d_b ,m*n* sizeof (*b)); // device // // memory alloc for b stat = cublasCreate (& handle ); // initialize CUBLAS context stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a stat = cublasSetMatrix (m,n, sizeof (*b) ,b,m,d_b ,m); //b -> d_b double al =1.0f; double startime = CycleTimer::currentSeconds(); (cublasDtrsm(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&al,d_a,m,d_b,m)); stat = cublasGetMatrix (m,n, sizeof (*b) ,d_b ,m,b,m); // d_b -> b double endtime = CycleTimer::currentSeconds(); printf (" solution x from 
Strsm :\n"); /* for(i=0;i<m;i ++){ for(j=0;j<n;j ++){ printf (" %11.5f",b[IDX2C(i,j,m )]); // print b after Strsm } printf ("\n"); } */ cudaFree (d_a ); // free device memory cudaFree (d_b ); // free device memory cublasDestroy ( handle ); // destroy CUBLAS context free (a); // free host memory free (b); // free host memory printf("Time taken: %lf\n", endtime - startime); return EXIT_SUCCESS ; }
int main(){ int m = 10; int n = m; cudaError_t cudaStat ; // cudaMalloc status cublasStatus_t stat ; // CUBLAS functions status cublasHandle_t handle ; // CUBLAS context int i,j; // i-row index , j-col. index double * a; // mxm matrix a on the host double * b; // mxm matrix b on the host double * c; // mxm matrix c on the host a=( double *) malloc (m*m* sizeof ( double )); // host memory for a b=( double *) malloc (m*m* sizeof ( double )); // host memory for b c=( double *) malloc (m*m* sizeof ( double )); // host memory for b int ind =1; // a: for(j=0;j<m;j ++){ // 11 for(i=0;i<m;i ++){ // 12 ,17 // if(i >=j){ // 13 ,18 ,22 a[ IDX2C(i,j,m)]=( float )ind ++; // 14 ,19 ,23 ,26 // } // 15 ,20 ,24 ,27 ,29 } // 16 ,21 ,25 ,28 ,30 ,31 } printf (" lower triangle of a:\n"); for (i=0;i<m;i ++){ for (j=0;j<m;j ++){ // if(i >=j) printf (" %5.0f",a[ IDX2C(i,j,m)]); } printf ("\n"); } ind =11; // b: for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35 for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36 if(i == j) b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37 else b[ IDX2C(i, j, m)] = 2.0; //ind ++; // 14 ,20 ,26 ,32 ,38 } // 15 ,21 ,27 ,33 ,39 } // 16 ,22 ,28 ,34 ,40 printf ("b:\n"); for (i=0;i<m;i ++){ for (j=0;j<n;j ++){ printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row } printf ("\n"); } double * d_a; // d_a - a on the device double * d_b; // d_b - b on the device double * d_c; // d_c - c on the devicde cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device memory alloc for a cudaStat = cudaMalloc (( void **)& d_b ,m*m* sizeof (*b)); // device memory alloc for b cudaStat = cudaMalloc (( void **)& d_c ,m*m* sizeof (*c)); // device memory alloc for c stat = cublasCreate (& handle ); // initialize CUBLAS context stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a stat = cublasSetMatrix (m,m, sizeof (*b) ,b,m,d_b ,m); //b -> d_b double startime = CycleTimer::currentSeconds(); gpu_blas_mmul(d_a, d_b, d_c, m, m, m); double endtime = CycleTimer::currentSeconds(); stat 
= cublasGetMatrix (m,n, sizeof (*c) ,d_c ,m,c,m); // d_b -> b printf (" solution x from Strsm :\n"); for(i=0;i<m;i ++){ for(j=0;j<n;j ++){ printf (" %11.5f",c[IDX2C(i,j,m )]); // print b after Strsm } printf ("\n"); } cudaFree (d_a ); // free device memory cudaFree (d_b ); // free device memory cudaFree (d_c ); // free device memory cublasDestroy ( handle ); // destroy CUBLAS context free (a); // free host memory free (b); // free host memory free (c); // free host memory printf("Time taken: %lf\n", endtime - startime); return EXIT_SUCCESS ; }
int validateResults(double *luMatrix, double *origMatrix, int matrixSize) { int count = 0; // Multiply lower triangle with upper triangle // double *result = new double[matrixSize *matrixSize]; // std::cout<< "orig matrix:" << std::endl; double *lMatrix = new double[matrixSize * matrixSize]; double *uMatrix = new double[matrixSize * matrixSize]; for (int c = 0; c < matrixSize; c++) { for (int r = 0; r < matrixSize; r++) { // below diag if (r > c) { lMatrix[IDX2C(r, c, matrixSize)] = luMatrix[IDX2C(r, c, matrixSize)]; uMatrix[IDX2C(r, c, matrixSize)] = 0.0; } // above diag else if (c > r) { lMatrix[IDX2C(r, c, matrixSize)] = 0.0; uMatrix[IDX2C(r, c, matrixSize)] = luMatrix[IDX2C(r, c, matrixSize)]; } // on diag else if (r == c) { lMatrix[IDX2C(r, c, matrixSize)] = 1.0; uMatrix[IDX2C(r, c, matrixSize)] = luMatrix[IDX2C(r, c, matrixSize)]; } } } double *result = new double[matrixSize*matrixSize]; mkl_set_num_threads(40); mkl_blas_ // cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, matrixSize, matrixSize, matrixSize, 1.0, lMatrix, matrixSize, uMatrix, matrixSize, 0.0, result, matrixSize); for (int r = 0; r < matrixSize; r++) { for (int c = 0; c < matrixSize; c++) { double difference = fabs(result[IDX2C(r, c, matrixSize)] - origMatrix[IDX2C(r, c, matrixSize)]); if (difference > 1.0e-8) { count++; if (count < 20) { std::cout << "Incorrect value: " << result[IDX2C(r, c, matrixSize)] << " != " << origMatrix[IDX2C(r, c, matrixSize)] << " difference: " << difference << std::endl; } } } } if (count > 0) std::cout << "Total incorrect = " << count << std::endl; if (count > 0) return 1; return 0; }
int main(int argc, char *argv[]) { long matrixSize= 16384; int blockSize = 128; bool runSequential = false; bool validate = false; int numBlasThreads = 40; int numGausElimThreads = 2; int numFactorLowerThreads = 4; int numFactorUpperThreads = 4; int numMatrixMulThreads = 30; std::string runtimeFileStr("runtimes"); int numRetry = 1; if (argc > 1) { for (int arg = 1; arg < argc; arg++) { std::string argvs(argv[arg]); if (argvs == "--size") { arg++; matrixSize = atoi(argv[arg]); } if (argvs == "--num-threads-blas") { arg++; numBlasThreads = atoi(argv[arg]); } if (argvs == "num-threads-factor-l") { arg++; numFactorLowerThreads = atoi(argv[arg]); } if (argvs == "num-threads-factor-u") { arg++; numFactorUpperThreads = atoi(argv[arg]); } if (argvs == "num-threads-gaus") { arg++; numGausElimThreads = atoi(argv[arg]); } if (argvs == "num-threads-gemm") { arg++; numMatrixMulThreads = atoi(argv[arg]); } if (argvs == "--run-sequential") { runSequential = true; } if (argvs == "--num-retry" && arg + 1 < argc) { arg++; numRetry = atoi(argv[arg]); } if (argvs == "--block-size") { arg++; blockSize = atoi(argv[arg]); } if (argvs == "--runtime-file" && arg + 1 < argc) { runtimeFileStr = argv[arg + 1]; arg++; } if (argvs == "--validate-results") { validate = true; } if (argvs == "--help") { std::cout << argv[0] << " args: [--size <#>] [--block-size <#>] [--num-retry <#>] [--runtime-file <filename>] [--validate-results] [--run-sequential] [--num-threads-factor-l <#>] [--num-threads-factor-u <#>] [--num-threads-gaus <#>] [--num-threads-gemm <#>] [--num-threads-blas <#>] [--help]" << std::endl; exit(0); } } } std::ofstream runtimeFile(runtimeFileStr, std::ios::app); double *matrix = new double[matrixSize * matrixSize]; double *matrixTest = nullptr; // TODO: Ensure diagonally dominant initMatrixDiagDom(matrix, matrixSize, matrixSize, true); if (validate) { matrixTest = new double[matrixSize * matrixSize]; for (int i = 0; i < matrixSize * matrixSize; i++) matrixTest[i] = matrix[i]; } for 
(int numTry = 0; numTry < numRetry; numTry++) { SimpleClock clk; SimpleClock endToEnd; if (runSequential) { endToEnd.start(); mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL); // mkl_set_num_threads(40); clk.start(); runSequentialLU(matrix, matrixSize); // computeSequentialMatMul(matrixA, matrixB, matrixC, matrixAHeight, sharedDim, matrixBWidth); clk.stopAndIncrement(); endToEnd.stopAndIncrement(); } else { endToEnd.start(); mkl_domain_set_num_threads(numBlasThreads, MKL_DOMAIN_ALL); int gridHeight = (int) matrixSize / blockSize; int gridWidth = (int) matrixSize / blockSize; // TODO: Build graph and runtime htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>> *matrixBlocks = new htgs::StateContainer<std::shared_ptr<MatrixBlockData<double *>>>(gridHeight, gridWidth, nullptr); for (int r = 0; r < gridHeight; r++) { for (int c = 0; c < gridWidth; c++) { // Store pointer locations for all blocks double *ptr = &matrix[IDX2C(r * blockSize, c *blockSize, matrixSize)]; std::shared_ptr<MatrixRequestData> request(new MatrixRequestData(r, c, MatrixType::MatrixA)); std::shared_ptr<MatrixBlockData<double *>> data(new MatrixBlockData<double *>(request, ptr, blockSize, blockSize)); matrixBlocks->set(r, c, data); } } GausElimTask *gausElimTask = new GausElimTask(numGausElimThreads, matrixSize, matrixSize); auto gausElimBk = new htgs::Bookkeeper<MatrixBlockData<double *>>(); GausElimRuleUpper *gausElimRuleUpper = new GausElimRuleUpper(matrixBlocks, gridHeight, gridWidth); GausElimRuleLower *gausElimRuleLower = new GausElimRuleLower(matrixBlocks, gridHeight, gridWidth); FactorUpperTask *factorUpperTask = new FactorUpperTask(numFactorUpperThreads, matrixSize, matrixSize); FactorLowerTask *factorLowerTask = new FactorLowerTask(numFactorLowerThreads, matrixSize, matrixSize); auto matrixMulBk = new htgs::Bookkeeper<MatrixBlockData<double *>>(); MatrixMulRule *matrixMulRule = new MatrixMulRule(matrixBlocks, gridHeight, gridWidth); MatrixMulBlkTask *matrixMulTask = 
new MatrixMulBlkTask(numMatrixMulThreads, matrixSize, matrixSize, matrixSize, matrixSize, blockSize); auto matrixMulResultBk = new htgs::Bookkeeper<MatrixBlockData<double *>>(); int numDiagonals = gridWidth - 1; GausElimRule *gausElimRule = new GausElimRule(numDiagonals, gridHeight, gridWidth); // Number of updates excluding the diagonal and the top/left row/column int numUpdates = (1.0/6.0) * (double)gridWidth * (2.0 * ((double)gridWidth * (double)gridWidth) - 3.0 * (double)gridWidth + 1.0); UpdateRule *updateRule = new UpdateRule(numUpdates); UpdateRule *updateRule2 = new UpdateRule(numUpdates); auto taskGraph = new htgs::TaskGraph<MatrixBlockData<double *>, htgs::VoidData>(); taskGraph->addGraphInputConsumer(gausElimTask); taskGraph->addEdge(gausElimTask, gausElimBk); taskGraph->addRule(gausElimBk, factorUpperTask, gausElimRuleUpper); taskGraph->addRule(gausElimBk, factorLowerTask, gausElimRuleLower); taskGraph->addEdge(factorUpperTask, matrixMulBk); taskGraph->addEdge(factorLowerTask, matrixMulBk); taskGraph->addRule(matrixMulBk, matrixMulTask, matrixMulRule); taskGraph->addEdge(matrixMulTask, matrixMulResultBk); if (numDiagonals > 0) taskGraph->addRule(matrixMulResultBk, gausElimTask, gausElimRule); if (numUpdates > 0) taskGraph->addRule(matrixMulResultBk, matrixMulBk, updateRule); if (numUpdates > 0) taskGraph->addRule(matrixMulResultBk, gausElimBk, updateRule2); taskGraph->incrementGraphInputProducer(); taskGraph->writeDotToFile("lud-graph.dot"); htgs::Runtime *runtime = new htgs::Runtime(taskGraph); clk.start(); runtime->executeRuntime(); taskGraph->produceData(matrixBlocks->get(0, 0)); taskGraph->finishedProducingData(); runtime->waitForRuntime(); clk.stopAndIncrement(); delete runtime; endToEnd.stopAndIncrement(); } double operations = (2.0 * (matrixSize * matrixSize * matrixSize)) / 3.0; double flops = operations / clk.getAverageTime(TimeVal::SEC); double gflops = flops / 1073741824.0; std::cout << (runSequential ? 
"sequential" : "htgs") << ", matrix-size: " << matrixSize << ", " << "blockSize: " << (runSequential ? 0 : blockSize) << ", blasThreads: " << numBlasThreads << ", gausThreads: " << numGausElimThreads << ", factorUpperThreads: " << numFactorUpperThreads << ", factorLowerThreads: " << numFactorLowerThreads << ", gemmThreads: " << numMatrixMulThreads << ", time:" << clk.getAverageTime(TimeVal::MILLI) << ", end-to-end:" << endToEnd.getAverageTime(TimeVal::MILLI) << ", gflops: " << gflops << std::endl; runtimeFile << (runSequential ? "sequential" : "htgs") << ", " << matrixSize << ", " << blockSize << ", " << numBlasThreads << ", " << numGausElimThreads << ", " << numFactorUpperThreads << ", " << numFactorLowerThreads << ", " << numMatrixMulThreads << ", " << clk.getAverageTime(TimeVal::MILLI) << ", " << endToEnd.getAverageTime(TimeVal::MILLI) << ", " << gflops << std::endl; if (validate) { int res = validateResults(matrix, matrixTest, matrixSize); std::cout << (res == 0 ? "PASSED" : "FAILED") << std::endl; } } delete[] matrix; delete[] matrixTest; }
/*
 * Combines the predictions of all models in the ensemble.
 *
 * 1. outputdist(i, j) becomes the mean of models[k].outputdist(i, j) over
 *    all models.
 * 2. The rows of outputdist are summed into `normalization`, and every row
 *    is then divided element-wise by that sum.
 * 3. When BZ_CUDA::unk_replacement is set, the per-model
 *    viterbi_alignments_scores are summed into BZ_CUDA::alignment_scores
 *    (entries addressed with IDX2C(position, beam, longest_sent), sizes
 *    taken from models[0]), and for each beam entry the position with the
 *    largest summed score is written to BZ_CUDA::viterbi_alignments.
 *    The search starts from a best score of 0, so a beam entry whose scores
 *    are all <= 0 is left with index -1.
 */
void ensemble_factory<dType>::ensembles_models() {

	int num_models = models.size();

	// average the output distributions element by element
	for(int r=0; r<outputdist.rows(); r++) {
		for(int c=0; c< outputdist.cols(); c++) {
			double total = 0;
			for(int m=0; m<models.size(); m++) {
				total+=models[m].outputdist(r,c);
			}
			outputdist(r,c) = total/num_models;
		}
	}

	// normalize now: accumulate the sum of all rows ...
	normalization.setZero();
	for(int r=0; r<outputdist.rows(); r++) {
		normalization+=outputdist.row(r);
	}

	// ... and divide every row by it
	for(int r=0; r<outputdist.rows(); r++) {
		outputdist.row(r) = (outputdist.row(r).array()/normalization.array()).matrix();
	}

	// alignment scores are only needed for unk replacement
	if(!BZ_CUDA::unk_replacement) {
		return;
	}

	// accumulate the alignment scores of all models
	for(int pos=0; pos<models[0].longest_sent; pos++) {
		for(int beam=0; beam<models[0].beam_size; beam++) {
			dType total = 0;
			for(int m=0; m<models.size(); m++) {
				total+=models[m].viterbi_alignments_scores[IDX2C(pos,beam,models[0].longest_sent)];
			}
			BZ_CUDA::alignment_scores[IDX2C(pos,beam,models[0].longest_sent)] = total;
		}
	}

	// choose the max and fill in BZ_CUDA::viterbi_alignments
	for(int beam=0; beam<models[0].beam_size; beam++) {
		dType best_score = 0;
		int best_pos = -1; // stays -1 when no score is greater than zero
		for(int pos=0; pos<models[0].longest_sent; pos++) {
			dType score = BZ_CUDA::alignment_scores[IDX2C(pos,beam,models[0].longest_sent)];
			if(score > best_score) {
				best_score = score;
				best_pos = pos;
			}
		}
		BZ_CUDA::viterbi_alignments[beam] = best_pos;
	}
}