static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *_extra)
{
    DUMMY_ARG_USAGE(subdims);
    const CLBLASKernExtra *extra = (const CLBLASKernExtra *)_extra;
    CLBlasKargs *kargs = (CLBlasKargs *)args;
    SolutionStep *step = container_of(kargs, args, SolutionStep);
    TargetDevice *kDevice = &(step->device);
    cl_int err;
    unsigned int numComputeUnits = deviceComputeUnits(kDevice->id, &err);

    if (err != CL_SUCCESS) {
        numComputeUnits = 1;
    }

    unsigned int vecLen = extra->vecLenA;
    unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1];

    /* Enough work-groups to cover N elements at vecLen elements per
       work-item, capped by the device's occupancy limit. */
    unsigned int wgToSpawn = ((kargs->N - 1) / (blockSize * vecLen)) + 1;
    wgToSpawn = min(wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU));

    threads[0] = wgToSpawn * blockSize;
    threads[1] = 1;
}
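/*
 * Worked example (a standalone sketch, not part of clBLAS) of the launch-size
 * arithmetic above.  WORKGROUPS_PER_CU_EXAMPLE and the sample problem/device
 * sizes are assumptions chosen purely for illustration.
 */
#include <stdio.h>

#define WORKGROUPS_PER_CU_EXAMPLE 8   /* assumed value, mirrors WORKGROUPS_PER_CU */

int main(void)
{
    size_t N = 1000000;               /* vector length */
    unsigned int blockSize = 64;      /* wgSize[0] * wgSize[1] */
    unsigned int vecLen = 4;          /* elements handled per work-item */
    unsigned int numComputeUnits = 16;

    /* Work-groups needed to cover N, then capped by device occupancy. */
    unsigned int wgToSpawn = (unsigned int)((N - 1) / (blockSize * vecLen)) + 1;
    unsigned int cap = numComputeUnits * WORKGROUPS_PER_CU_EXAMPLE;
    if (wgToSpawn > cap) {
        wgToSpawn = cap;
    }

    /* 1000000 elements / (64 * 4) per group -> 3907 groups, capped to 128,
       giving a global size of 128 * 64 = 8192 work-items. */
    printf("global size: %u, local size: %u\n", wgToSpawn * blockSize, blockSize);
    return 0;
}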
// unrolling generator for the f4zero function
static int
f4zeroSingle(struct KgenContext *ctx, void *priv)
{
    DUMMY_ARG_USAGE(priv);

    return kgenAddStmt(ctx, "*data++ = 0;\n");
}
static bool
checkCalcDecompDedicated(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    bool ret = true;

    DUMMY_ARG_USAGE(subdimsNum);

    if (check == PGRAN_CHECK) {
        unsigned int minSize, maxSize;

        maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8;
        minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 1 : 2;

        ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true);
        ret = ret && (subdims[0].bwidth == subdims[1].bwidth);
        ret = ret && (pgran->wgSize[0] == 64);
    }
    else {
        calcPgranDedicated(pgran, subdims, -1, 3);
    }

    return ret;
}
static int
trmmGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    DUMMY_ARG_USAGE(subdimsNum);

    if (NULL == pArgs) {
        return -EINVAL;
    }

    subdims[1].bwidth = 2;
    subdims[1].x = subdims[1].itemX = 8;
    subdims[1].y = subdims[1].itemY = 8;

    subdims[0].bwidth = 2;
    subdims[0].x = subdims[0].itemX = 32;
    subdims[0].y = 128;
    subdims[0].itemY = -1;

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    return 0;
}
/**
 * The purpose of this function is to add a work-group size indicator to
 * kernelKey, so that a different kernel is generated when the work-group
 * size changes.  The reduction loop is unrolled in kprintf based on the
 * work-group size.  The bwidth member of SubproblemDim is used to store the
 * work-group size of the current kernel; since SubproblemDim is part of
 * kernelKey, the kernel cache is managed accordingly.
 */
static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    DUMMY_ARG_USAGE(extra);
    CLBlasKargs *kargs = (CLBlasKargs *)args;
    SolutionStep *step = container_of(kargs, args, SolutionStep);

    subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]);
}
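/*
 * Minimal sketch of the caching idea described above, using simplified
 * stand-in structs (the real SubproblemDim/PGranularity/SolutionStep live in
 * the clBLAS headers).  It only demonstrates that two different work-group
 * sizes yield two different bwidth values, and therefore two different
 * kernel cache keys.
 */
#include <stdio.h>

struct pgran_sketch  { unsigned int wgSize[2]; };
struct subdim_sketch { size_t bwidth; };

static void
fixup_sketch(struct subdim_sketch *subdim, const struct pgran_sketch *pgran)
{
    /* Same rule as fixupArgs(): fold the work-group size into the dimension
       that participates in the kernel cache key. */
    subdim->bwidth = (size_t)pgran->wgSize[0] * pgran->wgSize[1];
}

int main(void)
{
    struct pgran_sketch p64 = { {64, 1} }, p128 = { {128, 1} };
    struct subdim_sketch a, b;

    fixup_sketch(&a, &p64);
    fixup_sketch(&b, &p128);

    /* Different work-group sizes -> different keys -> separate cached kernels. */
    printf("bwidth for wg=64: %zu, for wg=128: %zu\n", a.bwidth, b.bwidth);
    return 0;
}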
static int
copyMemPreUnroll(struct KgenContext *ctx, void *priv)
{
    DUMMY_ARG_USAGE(priv);

    kgenAddStmt(ctx, "src1 = src;\n");
    return kgenAddStmt(ctx, "dst1 = dst;\n\n");
}
//
// FIXME: Report correct return value - needs a change in KPRINTF
//
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    DUMMY_ARG_USAGE(subdims);
    size_t BLOCKSIZE = pgran->wgSize[0];
    char tempTemplate[32*1024];

    if (buf == NULL) {   // return buffer size
        buflen = (32 * 1024 * sizeof(char));
        return (ssize_t)buflen;
    }

    CLBLASKernExtra *extraFlags = (CLBLASKernExtra *)extra;

#ifdef DEBUG_ASUM
    printf("ASUM GENERATOR called....\n");
    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
#endif

    unsigned int vecLenA = extraFlags->vecLenA;

#ifdef DEBUG_ASUM
    printf("Vector length used : %d\n\n", vecLenA);
#endif

    bool doVLOAD = false;
    if (extraFlags->flags & KEXTRA_NO_COPY_VEC_A) {
        doVLOAD = true;
#ifdef DEBUG_ASUM
        printf("Doing VLOAD as aligned data pointer is not available\n");
#endif
    }
    else {
#ifdef DEBUG_ASUM
        printf("Using aligned data pointer\n");
#endif
    }

    strcpy(tempTemplate, (char*)asum_kernel);
    kprintf kobj(Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE);
    kobj.spit((char*)buf, tempTemplate);

    return (32 * 1024 * sizeof(char));
}
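/*
 * Usage sketch for the size-query convention used above (buf == NULL means
 * "report how big a buffer to allocate", a second call fills the buffer).
 * toy_generator is a stand-in for the real generator(), which also takes
 * subdims/pgran/extra; the kernel string here is just a placeholder.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

static ssize_t
toy_generator(char *buf, size_t buflen)
{
    const char *src = "__kernel void k(void) { }\n";
    size_t need = strlen(src) + 1;

    if (buf == NULL) {          /* phase 1: report the required buffer size */
        return (ssize_t)need;
    }
    if (buflen < need) {
        return -1;
    }
    memcpy(buf, src, need);     /* phase 2: emit the kernel source */
    return (ssize_t)need;
}

int main(void)
{
    ssize_t need = toy_generator(NULL, 0);
    char *buf = malloc((size_t)need);

    if (buf == NULL) {
        return 1;
    }
    toy_generator(buf, (size_t)need);
    printf("%s", buf);
    free(buf);
    return 0;
}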
static int
subgGetPerf(unsigned int kflags, const void *args)
{
    DUMMY_ARG_USAGE(args);

    if (!isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A) &&
        !isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) {

        return PPERF_GOOD;
    }

    return PPERF_NOT_SUPPORTED;
}
static int
trmmSubgGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    int itemsPerSubg = 4;
    int subgA = 8;
    int subgB = 2;
    int bw1 = 8;
    int x1 = 4;
    int y1 = 4;
    CLBlasKargs *kargs;

    DUMMY_ARG_USAGE(subdimsNum);

    if (NULL == pArgs) {
        return -EINVAL;
    }

    kargs = (CLBlasKargs *)pArgs;
    if (isComplexType(kargs->dtype)) {
        bw1 /= 2;
    }
    if (isDoubleBasedType(kargs->dtype)) {
        bw1 /= 2;
    }

    subdims[1].bwidth = bw1;
    subdims[1].x = subdims[1].itemX = x1;
    subdims[1].y = subdims[1].itemY = y1;

    subdims[0].bwidth = bw1 * itemsPerSubg;
    subdims[0].itemX = x1 * subgB;
    subdims[0].x = x1 * subgB;
    subdims[0].itemY = y1 * subgA;
    subdims[0].y = y1 * subgA;

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    return 0;
}
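/*
 * Quick check of the block-width scaling above (a standalone sketch;
 * isComplexType/isDoubleBasedType are emulated with plain flags here).
 * Starting from bw1 = 8, the subgroup block width is halved once for complex
 * types and once for double-based types: float -> 8, double and complex
 * float -> 4, complex double -> 2.
 */
#include <stdio.h>

static int
default_bw1(int isComplex, int isDouble)
{
    int bw1 = 8;

    if (isComplex) {
        bw1 /= 2;
    }
    if (isDouble) {
        bw1 /= 2;
    }
    return bw1;
}

int main(void)
{
    printf("float: %d, double: %d, complex float: %d, complex double: %d\n",
           default_bw1(0, 0), default_bw1(0, 1), default_bw1(1, 0), default_bw1(1, 1));
    return 0;
}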
static int
symvSubgGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    (void)subdimsNum;
    DUMMY_ARG_USAGE(pArgs);

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    subdims[1].bwidth = 4;
    subdims[1].itemX = subdims[1].x = 1;
    subdims[1].itemY = subdims[1].y = 4;

    subdims[0].bwidth = 8 * subdims[1].bwidth;
    subdims[0].itemX = subdims[0].x = 1;
    subdims[0].itemY = subdims[0].y = 8 * subdims[1].y;

    return 0;
}
void initialize_scalars(double alpha, double beta)
{
    DUMMY_ARG_USAGE(beta);
    buffer_.alpha_ = makeScalar<T>(alpha);
}
//-----------------------------------------------------------------------------
// TODO: reimplement via new validation API
static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    unsigned int subgA = 0;
    unsigned int subgB = 0;
    unsigned int regUse = 0;
    unsigned int itemsPerSubg = 0;

    DUMMY_ARG_USAGE(subdimsNum);

    if (0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth) {

        return false;
    }

    subgA = subdims[0].y / subdims[1].y;
    subgB = subdims[0].x / subdims[1].x;
    itemsPerSubg = subdims[0].bwidth / subdims[1].bwidth;

    if (itemsPerSubg < 4) {
        return false;
    }

    if (subdims[1].y < 4 ||
        subdims[1].x < 4 ||
        subdims[1].bwidth < 4) {

        return false;
    }

    if (subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY) {

        return false;
    }

    // the group block must consist of an integer number of subgroup blocks
    if (subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth) {

        return false;
    }

    // check that the block width fits common vector sizes
    if (isComplexType(dtype)) {
        if (2 * subdims[1].bwidth > 16) {
            return false;
        }
    }

    // check dimensions
    if (subdims[1].bwidth > 16 ||
        subdims[1].x > 16 ||
        subdims[1].y > 16) {

        return false;
    }

    // estimate register usage and drop decompositions that would
    // inevitably be slow
    regUse = (subdims[1].bwidth * subdims[1].x +
              subdims[1].bwidth * subdims[1].y +
              subdims[1].x * subdims[1].y) *
             dtypeSize(dtype);
    regUse /= 16;   // 16 bytes per register

    if (regUse >= 64) {
        return false;
    }

    // the passed PGranularity should be checked
    if (PGRAN_CHECK == check) {
        if (pgran->wgDim != 1) {
            return false;
        }
        if (pgran->wgSize[0] != 64) {
            return false;
        }
        if (pgran->wgSize[0] != subgA * subgB * itemsPerSubg) {
            return false;
        }
    }
    // otherwise, PGranularity should be calculated
    else {
        pgran->wgDim = 1;
        pgran->wgSize[0] = subgA * subgB * itemsPerSubg;
    }

    return true;
}
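/*
 * Worked example of the register-use filter above (a standalone sketch; the
 * element sizes are hard-coded here instead of calling dtypeSize()).  The
 * per-work-item tiles of A, B and C are assumed to live in registers, at
 * 16 bytes per register.
 */
#include <stdio.h>

static unsigned int
reg_estimate(unsigned int bw, unsigned int x, unsigned int y, unsigned int elemSize)
{
    return (bw * x + bw * y + x * y) * elemSize / 16;
}

int main(void)
{
    /* Single-precision tile with bwidth=8, x=4, y=4: (32 + 32 + 16) * 4 / 16
       = 20 registers, accepted by the regUse < 64 check. */
    printf("float 8x4x4 tile: %u regs\n", reg_estimate(8, 4, 4, 4));

    /* Double-complex 16x16x16 tile: (256 + 256 + 256) * 16 / 16
       = 768 registers, rejected. */
    printf("complex double 16x16x16 tile: %u regs\n", reg_estimate(16, 16, 16, 16));
    return 0;
}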