/**
 * Decide whether a Python object can be converted by this converter.
 *
 * Returns obj_ptr when the object is a numpy array whose element type is
 * accepted by TypeToNumPy<scalar_t>::canConvert and whose size checks pass;
 * returns 0 (without raising) for a null pointer, a non-array object, or an
 * unaccepted element type. An array with a rank other than 1 or 2 is reported
 * through THROW_TYPE_ERROR.
 */
static void* convertible(PyObject *obj_ptr)
{
    // Null pointers and non-array objects are rejected quietly.
    if (obj_ptr == 0 || !PyArray_Check(obj_ptr))
    {
        return 0;
    }

    // The element type has to be one this scalar type can be built from.
    const int typeCode = PyArray_ObjectType(obj_ptr, 0);
    if (!TypeToNumPy<scalar_t>::canConvert(typeCode))
    {
        return 0;
    }

    // Only vectors (rank 1) and matrices (rank 2) are supported.
    const int rank = PyArray_NDIM(obj_ptr);
    if (!(rank == 1 || rank == 2))
    {
        THROW_TYPE_ERROR("Conversion is only valid for arrays with 1 or 2 dimensions. Argument has " << rank << " dimensions");
    }

    if (rank == 1)
    {
        checkVectorSizes(obj_ptr);
    }
    else
    {
        // Rank 2: treat as a matrix.
        checkMatrixSizes(obj_ptr);
    }

    return obj_ptr;
}
clblasStatus doRot( CLBlasKargs *kargs, size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_ROT printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_ROT printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_ROT printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy kargs->pigFuncID = CLBLAS_ROT; // Using ROTM kernel for ROT. Both are similar listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doGer( CLBlasKargs *kargs, clblasOrder order, size_t M, size_t N, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, int doConj, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_GER printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_GER printf("Invalid Size for A %d\n",retCode ); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_GER printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_GER printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } /* * ASSUMPTION: * doTRMV assumes "commandQueue" of 0. The same is reflected in * "makeSolutionSeq" as well. If either of them changes in future, * this code needs to be revisited. 
*/ kargs->order = order; kargs->M = M; kargs->N = N; kargs->A = A; kargs->offa = offa; kargs->offA = offa; kargs->lda.matrix = lda; kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->C = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; kargs->scimage[1] = 0; kargs->K = (size_t)doConj; // Will be using K as doConj parameter #ifdef DEBUG_GER printf("Calling makeSolutionSeq from DoGer: GER\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GER, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
static clblasStatus doHemv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq1, seq2; cl_event first_event; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) { return retCode; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->A = A; kargs->offA = offA; kargs->offa = offA; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.Vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.Vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; listInitHead(&seq1); err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq1); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq1); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasConjTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } 
freeSolutionSeq(&seq1); return (clblasStatus)err; //printf("doHemv called\n"); //return 0; }
static clblasStatus doSymv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; #ifdef USE_SYMV ListHead seq2; ListNode *listNodePtr; cl_event first_event; #endif if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET ))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET ))) { return retCode; } kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->K = N; //store original N kargs->A = A; kargs->offA = offA; kargs->offa = offA; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; #ifndef USE_SYMV listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } #else // version of SYMV using kprintf numCommandQueues = 1; listInitHead(&seq); kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, 
&seq2); if (err == CL_SUCCESS) { // Adding node from seq2 to main seq listNodePtr = listNodeFirst(&seq2); listAddToTail(&seq, listNodePtr); err = executeSolutionSeq(&seq); // Executes both kernels in the seq one after other } } #endif freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doHer2( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } #ifdef DEBUG_HER2 printf("doHer2 called\n"); #endif /* Validate arguments */ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid mem object..\n"); #endif return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid Size for Y\n"); #endif return retCode; } if ((commandQueue == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->order = order; if(order == clblasRowMajor) // Handling row-major. Invert X, Y and uplo { kargs->uplo = (uplo == clblasUpper) ? 
clblasLower : clblasUpper; kargs->B = Y; kargs->ldb.vector = incy; kargs->offBX = offy; kargs->C = X; kargs->ldc.vector = incx; kargs->offCY = offx; } else { kargs->uplo = uplo; kargs->B = X; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->C = Y; kargs->ldc.vector = incy; kargs->offCY = offy; } kargs->N = N; kargs->A = A; kargs->lda.matrix = lda; kargs->offa = offa; kargs->offA = offa; #ifdef DEBUG_HER2 printf("Calling makeSolutionSeq : HER2\n"); #endif /* * Always use CommandQueue (0) * PENDING: * 1. No Multi-GPU / Multi-command queue support * 2. This can be optimized to use the commandQ with the higher * memmory bandwidth that supports the data-type and the LDA */ numCommandQueues = 1; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_HER2, kargs, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
static clblasStatus doHpmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq1, seq2; cl_event first_event; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP, offa, 0, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { return retCode; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->A = AP; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = 0; // Set lda as zero for packed matrices kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = Y; kargs->offCY = offy; kargs->ldc.vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; kargs->pigFuncID = CLBLAS_HPMV; listInitHead(&seq1); err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq1); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq1); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasConjTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, &seq2); if (err == CL_SUCCESS) { err = 
executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq1); return (clblasStatus)err; }
clblasStatus doScal( CLBlasKargs *kargs, size_t N, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, X, X, false, X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { printf("Invalid mem object..\n"); return retCode; } // Check wheather enough memory was allocated if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) { printf("Invalid Size for X\n"); return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx < 0) { // According to Netlib - return for negative incx return clblasSuccess; } #ifdef DEBUG_SCAL printf("Calling makeSolutionSeq from DoScal: SCAL\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SCAL, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
static clblasStatus doSHbmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) { return retCode; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->transA = clblasNoTrans; kargs->N = N; kargs->M = N; kargs->KL = K; kargs->KU = K; kargs->A = A; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
static clblasStatus doGemv( CLBlasKargs *kargs, clblasOrder order, clblasTranspose transA, size_t M, size_t N, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t sizev; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects( A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } sizev = (transA == clblasNoTrans) ? N : M; if ((retCode = checkVectorSizes(kargs->dtype, sizev, x, offx, incx, X_VEC_ERRSET ))) { return retCode; } sizev = (transA == clblasNoTrans) ? M : N; if ((retCode = checkVectorSizes(kargs->dtype, sizev, y, offy, incy, Y_VEC_ERRSET))) { return retCode; } kargs->order = order; kargs->transA = transA; kargs->M = M; kargs->N = N; /* * store original height of the matrix A * FIXME: store it to a dedicated field */ kargs->K = (transA == clblasNoTrans) ? M : N; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GEMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doRotg( CLBlasKargs *kargs, cl_mem A, size_t offA, cl_mem B, size_t offB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; // C is of real type even for complex numbers DataType cType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT : ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype)); if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(A, B, A, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects A, B printf("Invalid mem object..\n"); return retCode; } retCode = checkMemObjects(C, S, C, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects C, S printf("Invalid mem object..\n"); return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET))) { printf("Invalid Size for A\n"); return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET))) { printf("Invalid Size for B\n"); return retCode; } if ((retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET))) { printf("Invalid Size for C\n"); return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET))) { printf("Invalid Size for S\n"); return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->A = A; kargs->B = B; kargs->C = C; kargs->D = S; kargs->offa = offA; kargs->offb = offB; kargs->offc = offC; kargs->offd = offS; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTG, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doRotmg( CLBlasKargs *kargs, cl_mem D1, size_t offD1, cl_mem D2, size_t offD2, cl_mem X1, size_t offX1, cl_mem Y1, size_t offY1, cl_mem param, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(D1, D2, X1, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects A, B #ifdef DEBUG_ROTMG printf("Invalid mem object..\n"); #endif return retCode; } retCode = checkMemObjects(Y1, param, Y1, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects C, S #ifdef DEBUG_ROTMG printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for D1\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for D2\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for X1\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for Y1\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for PARAM\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->A = D1; kargs->B = D2; kargs->C = X1; kargs->D = Y1; kargs->E = param; kargs->offa = offD1; kargs->offb = offD2; kargs->offc = offX1; kargs->offd = offY1; kargs->offe = offParam; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTMG, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doAsum( CLBlasKargs *kargs, size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; clblasStatus retCode = clblasSuccess; cl_event firstAsumCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; DataType asumType = (kargs->dtype == TYPE_COMPLEX_FLOAT) ? TYPE_FLOAT: ((kargs->dtype == TYPE_COMPLEX_DOUBLE) ? TYPE_DOUBLE: kargs->dtype); if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(scratchBuff, asum, X, true, X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { printf("Invalid mem object..\n"); return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET ))) { printf("Invalid Size for X\n"); return retCode; } // Minimum size of scratchBuff is N if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET ))) { printf("Insufficient ScratchBuff\n"); return retCode; } if ((retCode = checkVectorSizes(asumType, 1, asum, offAsum, 1, X_VEC_ERRSET ))) { printf("Invalid Size for asum\n"); return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = asum; kargs->offA = offAsum; kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx <1){ kargs->N = 1; } kargs->D = scratchBuff; kargs->redctnType = REDUCE_BY_SUM; memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); redctnArgs.dtype = asumType; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ASUM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstAsumCall, &seq); if (err == CL_SUCCESS) { /** The second kernel call needs to know the number of work-groups used in the first kernel call. This number of work-groups is calculated here and passed as N to second reduction kernel **/ err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstAsumCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doiAmax( CLBlasKargs *kargs, size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; clblasStatus retCode = clblasSuccess; cl_event firstiAmaxCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, scratchBuf, iMax, true, X_VEC_ERRSET, A_MAT_ERRSET, X_VEC_ERRSET ); if (retCode) { printf("Invalid mem object..\n"); return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET ))) { printf("Invalid Size for X\n"); return retCode; } // Minimum size of scratchBuff is 2 * N if ((retCode = checkVectorSizes(kargs->dtype, (2 * N), scratchBuf, 0, 1, A_MAT_ERRSET ))) { printf("Insufficient ScratchBuff A\n"); return retCode; } if ((retCode = checkVectorSizes(TYPE_UNSIGNED_INT, 1, iMax, offiMax, 1, X_VEC_ERRSET ))) { printf("Invalid Size for iX\n"); return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } // cl_mem D is scratch buffer // cl_mem A is the output Buffer i.e. 
iMAX, offA for offiMax // cl_mem B is the input Buffer containing N Values kargs->N = N; kargs->B = X; kargs->offb = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx < 1) { // According to netlib, if incx<1, NRM2 will be zero kargs->N = 1; // Makeing it launch only 1 work-group } kargs->D = scratchBuf; kargs->A = iMax; kargs->offA = offiMax; #ifdef IAMAX_USE_ATOMIC_MIN kargs->redctnType = REDUCE_MAX_WITH_INDEX_ATOMICS; #else kargs->redctnType = REDUCE_MAX_WITH_INDEX; #endif memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); listInitHead(&seq); err = makeSolutionSeq(CLBLAS_iAMAX, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstiAmaxCall, &seq); if (err == CL_SUCCESS) { // The second kernel call needs to know the number of work-groups used // in the first kernel call. This number of work-groups is calculated here // and passed as N to second reduction kernel err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used redctnArgs.dtype = (redctnArgs.dtype == TYPE_COMPLEX_FLOAT) ? TYPE_FLOAT : ((redctnArgs.dtype == TYPE_COMPLEX_DOUBLE) ? TYPE_DOUBLE : redctnArgs.dtype); listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstiAmaxCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; }
clblasStatus doTbsv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem x, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err = clblasNotImplemented; ListHead seq; CLBlasKargs gbmvKargs; ListHead gbmvSeq; //cl_context c; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET); if (retCode != clblasSuccess) { #ifdef DEBUG_TBSV printf("Invalid mem object..\n"); #endif return retCode; } /* * PENDING: * checkMatrixSizes() does not account for "offa" argument. * Need to pass "offa" when "checkMatrixSizes()" is changed. */ retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET ); if (retCode != clblasSuccess) { #ifdef DEBUG_TBSV printf("Invalid Size for A\n"); #endif return retCode; } retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET ); if (retCode != clblasSuccess) { #ifdef DEBUG_TBSV printf("Invalid Size for X\n"); #endif return retCode; } #ifdef DEBUG_TBSV printf("DoTbsv being called...\n"); #endif if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } numCommandQueues = 1; // NOTE: Hard-coding the number of command queues to 1i kargs->order = order; kargs->uplo = uplo; kargs->transA = trans; kargs->diag = diag; kargs->M = N; // store Original N kargs->N = N; kargs->K = K; kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; kargs->C = x; 
kargs->offCY = offx; kargs->ldc.vector = incx; kargs->startRow = 0; if(trans == clblasNoTrans) { kargs->endRow = (order == clblasRowMajor) ? N-1 : N; } else { kargs->endRow = (order == clblasRowMajor) ? N : N-1; } memcpy(&gbmvKargs, kargs, sizeof(CLBlasKargs)); gbmvKargs.pigFuncID = CLBLAS_GBMV; listInitHead(&seq); listInitHead(&gbmvSeq); err = makeSolutionSeq(CLBLAS_TRSV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = makeSolutionSeq(CLBLAS_GBMV, &gbmvKargs, numCommandQueues, commandQueues, 0, NULL, NULL, &gbmvSeq); if (err == CL_SUCCESS) { err = orchestrateTBSV(kargs, &seq, &gbmvSeq, numEventsInWaitList, eventWaitList, events); } } freeSolutionSeq(&seq); freeSolutionSeq(&gbmvSeq); return (clblasStatus)err; }