void THClBlas_gemm2(THClState *state, char orderchar, char transa, char transb, long m, long n, long k, float alpha, THClTensor *a, long lda, THClTensor *b, long ldb, float beta, THClTensor *c, long ldc) { StatefulTimer::timeCheck("THClBlas_gemm START"); CLWrapper *aWrapper = THClTensor_wrapper(state, a); CLWrapper *bWrapper = THClTensor_wrapper(state, b); CLWrapper *cWrapper = THClTensor_wrapper(state, c); long offseta = THClTensor_storageOffset(state, a); long offsetb = THClTensor_storageOffset(state, b); long offsetc = THClTensor_storageOffset(state, c); // adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); clblasTranspose opa = convertTransToClblasOperation(transa); clblasTranspose opb = convertTransToClblasOperation(transb); clblasOrder order = orderchar == 'c' ? clblasColumnMajor : clblasRowMajor; if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) { int i_m = (int)m; int i_n = (int)n; int i_k = (int)k; int i_lda = (int)lda; int i_ldb = (int)ldb; int i_ldc = (int)ldc; cl_int err; if( !aWrapper->isOnDevice() ) { aWrapper->createOnDevice(); } if( !bWrapper->isOnDevice() ) { bWrapper->createOnDevice(); } if( !cWrapper->isOnDevice() ) { cWrapper->createOnDevice(); } EasyCL *cl = cWrapper->getCl(); cl_event *event = 0; if(state->addFinish) { event = new cl_event(); } err = clblasSgemm(order, opa, opb, i_m, i_n, i_k, alpha, aWrapper->getBuffer(), offseta, i_lda, bWrapper->getBuffer(), offsetb, i_ldb, beta, cWrapper->getBuffer(), offsetc, i_ldc, 1, cl->queue, 0, NULL, event); if (err != CL_SUCCESS) { THError("clblasSgemm() failed with %d", err); } else { if(state->addFinish) { err = clWaitForEvents(1, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger: wait for event failed with %d", err); } clReleaseEvent(*event); delete event; } } cWrapper->markDeviceDirty(); StatefulTimer::timeCheck("THClBlas_gemm END"); return; } THError("Clblas_gemm only supports m, n, k, lda, ldb, ldc" "with the bound [val] <= %d", INT_MAX); }
void THClBlas_ger(THClState *state, long m, long n, float alpha, THClTensor *x, long incx, THClTensor *y, long incy, THClTensor *a, long lda) { StatefulTimer::timeCheck("THClBlas_ger START"); if(n == 1) lda = m; if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { int i_m = (int)m; int i_n = (int)n; int i_lda = (int)lda; int i_incx = (int)incx; int i_incy = (int)incy; cl_int err; CLWrapper *xwrap = THClTensor_wrapper(state, x); CLWrapper *ywrap = THClTensor_wrapper(state, y); CLWrapper *awrap = THClTensor_wrapper(state, a); long x_offset = THClTensor_storageOffset(state, x); long y_offset = THClTensor_storageOffset(state, y); long a_offset = THClTensor_storageOffset(state, a); if(!awrap->isOnDevice()) { awrap->createOnDevice(); } EasyCL *cl = ywrap->getCl(); cl_event *event = 0; if(state->addFinish) { event = new cl_event(); } err = clblasSger(clblasColumnMajor, i_m, i_n, alpha, xwrap->getBuffer(), x_offset, i_incx, ywrap->getBuffer(), y_offset, i_incy, awrap->getBuffer(), a_offset, i_lda, 1, (cl->queue), 0, NULL, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger() failed with %d", err); } else { if(state->addFinish) { err = clWaitForEvents(1, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger: wait for event failed with %d", err); } clReleaseEvent(*event); delete event; } } awrap->markDeviceDirty(); StatefulTimer::timeCheck("THClBlas_ger END"); return; } THError("Cublas_ger only supports m, n, lda, incx, incy" "with the bound [val] <= %d", INT_MAX); }