void DecoderBinaural::process(const float* const* inputs, float** outputs) { unsigned int i; for(i = 0; i < m_number_of_harmonics; i++) { cblas_scopy(m_vector_size, inputs[i], 1, m_input_matrix+i*m_vector_size, 1); } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, (m_impulses_size * 2), m_vector_size, m_number_of_harmonics, 1., m_impulses_matrix, m_number_of_harmonics, m_input_matrix, m_vector_size, 0., m_result_matrix, m_vector_size); for(i = 0; i < m_vector_size; i++) { cblas_saxpy(m_impulses_size, 1.f, m_result_matrix + i, m_vector_size, m_linear_vector_left + i, 1); outputs[0][i] = m_linear_vector_left[i]; } for(i = 0; i < m_vector_size; i++) { cblas_saxpy(m_impulses_size, 1.f, m_result_matrix + i + m_vector_size * m_impulses_size, m_vector_size, m_linear_vector_right + i, 1); outputs[1][i] = m_linear_vector_right[i]; } cblas_scopy(m_impulses_size-1, m_linear_vector_left+m_vector_size, 1, m_linear_vector_left, 1); cblas_scopy(m_impulses_size-1, m_linear_vector_right+m_vector_size, 1, m_linear_vector_right, 1); #ifdef __APPLE__ vDSP_vclr(m_linear_vector_left + m_impulses_size - 1, 1, m_vector_size); vDSP_vclr(m_linear_vector_right + m_impulses_size - 1, 1, m_vector_size); #else memset(m_linear_vector_left + m_impulses_size - 1, 0, m_vector_size * sizeof(float)); memset(m_linear_vector_right + m_impulses_size - 1, 0, m_vector_size * sizeof(float)); #endif }
/**
 * Computes y <- alpha * x + y with unit strides.
 *
 * @param y      Vector updated in place.
 * @param alpha  Scalar applied to x.
 * @param x      Vector that is scaled and added to y.
 * @return true when the update was performed; false (y untouched) when the
 *         two vectors have different sizes.
 */
bool axpy(RealVector &y, const REAL alpha, const RealVector &x)
{
    // The previous "NULL == &x || NULL == &y" test was removed: x.size was
    // already read before the test ran, so it could never protect against a
    // null reference, and a reference cannot legally be null in the first place.
    if (x.size != y.size)
    {
        return false;
    }

    const UINT n = x.size;
    const UINT incX = 1;
    const UINT incY = 1;
    cblas_saxpy(n, alpha, x.M, incX, y.M, incY);
    return true;
}
int main() { const int N = 10; float x[N], y[N]; float alpha = 10; // initialize for ( int i=0; i<N; i++ ) { x[i] = i; y[i] = 2*i; } // y = alpha * x + y cblas_saxpy( N, alpha, x, 1, y, 1 ); for ( int i=0; i<N; i++ ) std::cout << y[i] << " should equal " << alpha*i + 2*i << std::endl; return 0; }
void Map::process(const float* inputs, float* outputs) { if(m_first_source > -1) { m_encoders[m_first_source]->process(inputs[m_first_source] * m_gains[m_first_source], m_harmonics_float); m_widers[m_first_source]->process(m_harmonics_float, outputs); for(unsigned int i = m_first_source+1; i < m_number_of_sources; i++) { if (!m_muted[i]) { m_encoders[i]->process(inputs[i] * m_gains[i], m_harmonics_float); m_widers[i]->process(m_harmonics_float, m_harmonics_float); cblas_saxpy(m_number_of_harmonics, 1.f, m_harmonics_float, 1, outputs, 1); } } } else { for(unsigned int i = 0; i < m_number_of_harmonics; i++) outputs[i] = 0.f; } }
/**
 * Computes cScaler * (aScaler * x + y) element-wise, using yImg's buffer as
 * the working storage, and hands the result back through OutImg.
 *
 * @param OutImg   On return refers to the image holding the result.
 * @param aScaler  Scale applied to every element of xImg.
 * @param xImg     First operand; its element count defines the work size.
 * @param yImg     Second operand; its buffer is overwritten. If it was not
 *                 already the same image as OutImg, it is swapped with OutImg.
 * @param cScaler  Final scale applied to the sum; skipped when equal to 1.
 */
void AddAllElements(ImagePointerType &OutImg, const PrecisionType aScaler, ImagePointerType &xImg, ImagePointerType &yImg, const PrecisionType cScaler= 1.0F)
{
  const size_t N = xImg->GetLargestPossibleRegion().GetNumberOfPixels()*xImg->GetNumberOfComponentsPerPixel();

  PrecisionType * xData = GetFirstPointer(xImg);
  PrecisionType * yData = GetFirstPointer(yImg);

  // yData <- aScaler * xData + yData
  cblas_saxpy(N,aScaler,xData,1,yData,1);

  // Fetched exactly as before; the pointer itself is not used afterwards.
  PrecisionType * Out= GetFirstPointer(OutImg);

  const bool needsFinalScale = (cScaler != 1.0F);
  if(needsFinalScale)
    {
    cblas_sscal(N,cScaler,yData,1);
    }

  // The result lives in yImg's buffer: exchange the two handles so that the
  // caller finds it in OutImg.
  if( OutImg.GetPointer() != yImg.GetPointer())
    {
    ImagePointerType previousOut = OutImg;
    OutImg = yImg;
    yImg = previousOut;
    // Sanity check to induce failures if the variable is needed in future:
    // yImg has been corrupted by the processing above.
    }
}
// Matrix multiply-accumulate built from level-1 saxpy operations:
// for every column j of C,  C(:,j) += sum_k B[k*n + j] * A(:,k).
//
// A is m x p, B is p x n, C is m x n (dimensions as passed to get_col/set_col).
// Exits the process with status 1 if the scratch vectors cannot be allocated.
void mat_saxpy(float* A, float* B, int m, int p, int n, float* C)
{
  float* x = (float*) calloc(m,sizeof(float));
  float* y = (float*) calloc(m,sizeof(float));

  // If memory allocation fails then exit with error
  if(x == NULL || y == NULL)
  {
    free(x);
    free(y);
    exit(1);
  }

  int j,k;

  // With no columns in A there is nothing to accumulate; C is left untouched.
  if(p > 0)
  {
    // Loop through columns of B (and of C)
    for(j=0; j < n; j++)
    {
      // Load column j of C once; it used to be copied out and written back
      // for every k even though only y changes between iterations.
      get_col(C,m,n,j,y);

      // Loop through columns of A
      for(k=0; k < p; k++)
      {
        // Get column k of A and load into x
        get_col(A,m,p,k,x);

        // Saxpy level 1 operation y <- alpha*x + y with alpha = B(k,j)
        cblas_saxpy(m,B[k*n+j],x,1,y,1);
      }

      // Store the fully accumulated column back into C
      set_col(C,m,n,j,y);
    }
  }

  // Free intermediate values
  free(x);
  free(y);
}
/*
 * JNI entry point: for every column i in [0, N) and every stored entry j of
 * that column (jc[i]..jc[i+1], offset by jc[0]), performs
 *     C(:, i) += B[j] * A(:, ir[j] - jc[0])
 * where columns of A and C are M long and lda / ldc elements apart.
 *
 * All five Java arrays are pinned with GetPrimitiveArrayCritical. If any pin
 * fails (the call returns NULL) nothing is computed and the arrays
 * already pinned are released. A, B, ir and jc are only read, so they are
 * released with JNI_ABORT (no copy-back); C is released with mode 0 so that
 * the result is committed.
 */
JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_smcscm
(JNIEnv * env, jobject calling_obj, jint M, jint N, jfloatArray j_A, jint lda,
 jfloatArray j_B, jintArray j_ir, jintArray j_jc, jfloatArray j_C, jint ldc){
  jfloat * A = NULL;
  jfloat * B = NULL;
  jint * ir = NULL;
  jint * jc = NULL;
  jfloat * C = NULL;

  /* Pin the arrays one after another; stop at the first failure. */
  A = (*env)->GetPrimitiveArrayCritical(env, j_A, NULL);
  if (A != NULL) {
    B = (*env)->GetPrimitiveArrayCritical(env, j_B, NULL);
  }
  if (B != NULL) {
    ir = (*env)->GetPrimitiveArrayCritical(env, j_ir, NULL);
  }
  if (ir != NULL) {
    jc = (*env)->GetPrimitiveArrayCritical(env, j_jc, NULL);
  }
  if (jc != NULL) {
    C = (*env)->GetPrimitiveArrayCritical(env, j_C, NULL);
  }

  /* C is only non-NULL when every earlier pin succeeded. */
  if (C != NULL) {
    int ioff = jc[0];
    int i, j, ir0;

    for (i = 0; i < N; i++) {
      for (j = jc[i]-ioff; j < jc[i+1]-ioff; j++) {
        ir0 = ir[j]-ioff;
        cblas_saxpy(M, B[j], A+(ir0*lda), 1, C+(i*ldc), 1);
      }
    }
  }

  /* Release in reverse order of acquisition. */
  if (C != NULL) {
    (*env)->ReleasePrimitiveArrayCritical(env, j_C, C, 0);
  }
  if (jc != NULL) {
    (*env)->ReleasePrimitiveArrayCritical(env, j_jc, jc, JNI_ABORT);
  }
  if (ir != NULL) {
    (*env)->ReleasePrimitiveArrayCritical(env, j_ir, ir, JNI_ABORT);
  }
  if (B != NULL) {
    (*env)->ReleasePrimitiveArrayCritical(env, j_B, B, JNI_ABORT);
  }
  if (A != NULL) {
    (*env)->ReleasePrimitiveArrayCritical(env, j_A, A, JNI_ABORT);
  }
}
// In addition, MKL comes with an additional function axpby that is not present // in standard blas. We will simply use a two-step (inefficient, of course) way // to mimic that. inline void cblas_saxpby(const int N, const float alpha, const float* X, const int incX, const float beta, float* Y, const int incY) { cblas_sscal(N, beta, Y, incY); cblas_saxpy(N, alpha, X, incX, Y, incY); }
void MaxPooling_bprop( unsigned long long gradOutput, //input, N*outC*outH*outW unsigned long long gradInput, //output result unsigned long long dnnprimitives, int initOK, const float beta) { dnnError_t err; long long* primitives = (long long*)dnnprimitives; if (initOK == 0) { Init_b((long long *)gradInput, (long long *)gradOutput, primitives); } //get resource float* resPool[dnnResourceNumber] = {0}; float* OutPtr= GetPtr(gradOutput); resPool[dnnResourceDiffSrc] = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT]; resPool[dnnResourceDiffDst] = OutPtr; resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE]; //make conversion for gradeOut if necessary dnnPrimitive_t cv_out_b = (dnnPrimitive_t)(primitives[CV_POOLING_BACKWARD_OUTPUT]); if (cv_out_b) { float* buf_out_b = (float*)primitives[BUFFER_POOLING_BACKWARD_OUTPUT]; CHECK_ERR( dnnConversionExecute_F32(cv_out_b, OutPtr, buf_out_b), err ); resPool[dnnResourceDiffDst] = buf_out_b; } long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)primitives[POOL_L_B_I]) ; float * tempPtr = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT]; #pragma omp parallel for for (long long i = 0; i < grad_in_len/4; ++i) { tempPtr[i] = 0; } CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_BACKWARD], (void**)resPool), err ); if(beta != 0.0) { //require to add previous delta long long* ptr_gradInput = (long long*)gradInput; float* pFirstBuf = GetPtr(gradInput); dnnLayout_t layout_pre_delta = (dnnLayout_t)ptr_gradInput[MKLLayout]; if(layout_pre_delta == NULL) layout_pre_delta = (dnnLayout_t)primitives[POOL_L_I]; dnnLayout_t layout_add_delta = (dnnLayout_t)primitives[POOL_L_B_I]; float* temp_memory = NULL; if (!dnnLayoutCompare_F32(layout_add_delta, layout_pre_delta)) { CHECK_ERR( dnnAllocateBuffer_F32((void**)&temp_memory, layout_add_delta) , err ); dnnPrimitive_t cv = NULL; CHECK_ERR( dnnConversionCreate_F32(&cv, layout_pre_delta, layout_add_delta), err ); CHECK_ERR( 
dnnConversionExecute_F32(cv, pFirstBuf, temp_memory), err ); pFirstBuf = temp_memory; } long len = (long long)dnnLayoutGetMemorySize_F32(layout_add_delta) / 4 ; cblas_saxpy(len, 1.0, pFirstBuf, 1, (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1); if (temp_memory != NULL) dnnReleaseBuffer_F32(temp_memory); } ((long long *)gradInput)[MKLLayout] = primitives[POOL_L_B_I]; ((long long *)gradInput)[MKLPtr] = primitives[BUFFER_POOLING_BACKWARD_INPUT]; ERR_RETURN: return; }
int GaussNewton( void (*func)(T *x, T *r, int m, int n, void *adata), void (*jacf)(T *x, T *J, int m, int n, void *adata), T *x, T *r, T* J, int m, int n, int itmax, T *opts, /* delta, r_threshold, diff_threshold */ void *adata) { PhGUtils::debug("m", m, "n", n); float delta, R_THRES, DIFF_THRES; if( opts == NULL ) { // use default values delta = 1.0; // step size, default to use standard Newton-Ralphson R_THRES = 1e-6; DIFF_THRES = 1e-6; } else { delta = opts[0]; R_THRES = opts[1]; DIFF_THRES = opts[2]; } bool allocateR = false, allocateJ = false; // residue if( r == NULL ) { // allocate space for residue allocateR = true; r = new T[n]; memset(r, 0, sizeof(T)*n); } T* x0 = new T[m]; memset(x0, 0, sizeof(T)*m); T* deltaX = new T[m]; // also for Jtr memset(deltaX, 0, sizeof(T)*m); cblas_scopy(m, x, 1, deltaX, 1); T* JtJ = new T[m * m]; memset(JtJ, 0, sizeof(T)*m*m); // Jacobian if( J == NULL ) { allocateJ = true; J = new T[m * n]; memset(J, 0, sizeof(T)*m*n); } // compute initial residue func(x, r, m, n, adata); //ofstream fout0("r.txt"); //print2DArray(r, n, 1, fout0); //fout0.close(); int iters = 0; //::system("pause"); //printArray(x, m); //printArray(r, n); // do iteration while( (cblas_snrm2(m, deltaX, 1) > DIFF_THRES && cblas_snrm2(n, r, 1) > R_THRES && iters < itmax) || iters < 1 ) { // compute Jacobian jacf(x, J, m, n, adata); // store old value cblas_scopy(m, x, 1, x0, 1); //ofstream fout1("J.txt"); //print2DArray(J, n, m, fout1); //fout1.close(); //::system("pause"); // compute JtJ cblas_ssyrk (CblasColMajor, CblasUpper, CblasNoTrans, m, n, 1.0, J, m, 0, JtJ, m); //ofstream fout("JtJ.txt"); //print2DArray(JtJ, m, m, fout); //fout.close(); // compute Jtr cblas_sgemv (CblasColMajor, CblasNoTrans, m, n, 1.0, J, m, r, 1, 0, deltaX, 1); // compute deltaX LAPACKE_spotrf( LAPACK_COL_MAJOR, 'U', m, JtJ, m ); LAPACKE_spotrs( LAPACK_COL_MAJOR, 'U', m, 1, JtJ, m, deltaX, m ); //ofstream fout2("deltaX.txt"); //printArray(deltaX, m, fout2); //fout2.close(); // update x 
cblas_saxpy(m, -delta, deltaX, 1, x, 1); // update residue func(x, r, m, n, adata); //printArray(x, m); //system("pause"); iters++; } //::system("pause"); // delete workspace delete[] x0; delete[] deltaX; delete[] JtJ; if( allocateR ){ delete[] r;} if( allocateJ ){ delete[] J;} return iters; }
/*
 * Builds I = Identity - X*Z and sets W = B.
 *
 * X      - D x panelSz, row-major (read as X[i*panelSz + j])
 * Z      - panelSz x D, row-major (read as Z[j*D + m]); every D consecutive
 *          elements form one z vector
 * B      - D entries
 * panelSz, D - dimensions as above
 * lamda  - unused
 * W      - D entries; receives a copy of B
 * I      - D x D output, re-initialised on every call
 *
 * NOTE(review): the original header comment described
 * W = (I - sum_j Z[j]X[j]) W[0] + B, but the code only ever produced W = B;
 * that behaviour is kept. Confirm whether I should be applied to W.
 *
 * Fixes over the previous version:
 *  - the I -= XZ loop ran j up to chunkSz*D inside each row and wrote past
 *    the end of I; it now stays within the D columns of the row;
 *  - the OpenMP loops shared m, temp and j between threads (data races) and
 *    every chunk overwrote XZ with its own partial sum; the product is now
 *    parallelised over rows and sums over the whole panel, which no longer
 *    depends on the thread count or on D / panelSz being divisible by it;
 *  - memset was given NULL as the fill value.
 */
void gd(float *X, float* Z, float*B, int panelSz, int D, float lamda, float * W, float *I){
  int i, j, m;
  float temp;

  float *Wtmp = (float*)malloc(D*sizeof(float));
  memset(Wtmp, 0, D*sizeof(float));

  // every iteration should re-initialise I to the identity
  for(i=0;i<D*D;i++){
    I[i]= 0.0;
  }
  for(i=0;i<D;i++){
    I[i*D+i]= 1.0;
  }

  float *XZ = (float*) malloc(D*D*sizeof(float)); // holder of matrix XZ
  memset(XZ, 0, D*D*sizeof(float));

  // XZ = X * Z; each row is independent, so rows are shared out to threads.
  #pragma omp parallel for schedule(static) private(j, m, temp)
  for(i=0;i<D;i++){
    for(m=0;m<D;m++){
      temp = 0;
      for(j=0;j<panelSz;j++){
        temp += X[i*panelSz+j]*Z[j*D+m];
      }
      XZ[i*D+m] = temp;
    }
  }

  // I = I - XZ, row by row.
  #pragma omp parallel for schedule(static) private(m)
  for(i=0;i<D;i++){
    for(m=0;m<D;m++){
      I[i*D+m] += -XZ[i*D+m];
    }
  }

  // Wtmp starts at zero, so this leaves W equal to B.
  cblas_saxpy(D,1,B, 1, Wtmp, 1);
  cblas_scopy(D, Wtmp, 1, W, 1);

  free(XZ);
  free(Wtmp);
}
/*
 * Squared error of a linear prediction.
 *
 * data  - M x D row-major matrix
 * Ypred - M entries; receives data * W
 * Ytmp  - M entries; receives Ypred - Y
 * Y     - M reference values
 * W     - D weights
 * Returns the dot product of Ytmp with itself.
 */
float calErr(float *data, float *Ypred, float *Ytmp, float* Y, float* W, int M, int D){
  float squaredError;

  // Ypred <- data * W
  cblas_sgemv(CblasRowMajor, CblasNoTrans, M, D, 1, data, D, W, 1, 0, Ypred, 1);

  // Ytmp <- Ypred - Y
  cblas_scopy(M,Ypred,1,Ytmp,1);
  cblas_saxpy(M,-1,Y,1,Ytmp,1);

  squaredError = cblas_sdot(M,Ytmp,1,Ytmp,1);
  return squaredError;
}
static void slave(int myrank, char* parameterFile) { int i, j, dummyInt, target_ldA, my_ldA, rdA, interceptFlag, error; MPI_Status status; float *A, *xvalue, *resultVector, *tempHolder, *dummyFloat; char matrixfilename[MAX_FILENAME_SIZE]; //GET PARAMETERS FROM THE TEXT FILE getSlaveParams(parameterFile, &target_ldA, &rdA, &interceptFlag, matrixfilename); //ALLOCATE A, TEMPHOLDER, RESULTVECTOR and XVALUE A = malloc(target_ldA*(rdA+1)*sizeof(float)); if(A==NULL) fprintf(stdout,"Unable to allocate memory!"); xvalue = malloc( (target_ldA+rdA)*sizeof(float) ); tempHolder = malloc( (target_ldA+rdA)*sizeof(float) ); //place holder for intermediate calculations resultVector = malloc( (target_ldA+rdA)*sizeof(float) ); if(xvalue==NULL || tempHolder==NULL || resultVector==NULL) fprintf(stdout,"Unable to allocate memory!"); //FILL A WITH DESIRED VALUES AND SEND NUMBER OF FILLED ROWS TO MASTER my_ldA=get_dat_matrix(A, target_ldA, rdA, myrank, matrixfilename, interceptFlag); MPI_Gather(&my_ldA, 1, MPI_INT, &dummyInt, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&error, 1, MPI_INT, 0, MPI_COMM_WORLD); if(error==0) { //if there were file open errors, end program free(A); free(xvalue); free(tempHolder); free(resultVector); return; } fprintf(stdout,"Slave %d found %d valid rows: A[0] is %f \n", myrank, my_ldA, A[0] ); //CENTER FEATURES float* shifts = malloc((rdA+1)*sizeof(float)); float* ones = malloc(my_ldA*sizeof(float)); for(i=0; i<my_ldA; i++) ones[i] = 1.0; cblas_sgemv(CblasRowMajor, CblasTrans, my_ldA, rdA+1, 1.0, A, rdA+1, ones, 1, 0.0, shifts, 1); //shifts now holds the sums of the columns of A MPI_Reduce(shifts, dummyFloat, rdA, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Bcast(shifts, rdA, MPI_FLOAT, 0, MPI_COMM_WORLD); //shifts now holds the total means of the columns of A for(i=0; i<my_ldA; i++) { //Now we substract shifts from each row of A cblas_saxpy(rdA, -1.0, shifts, 1, &A[i*(rdA+1)], 1); } //SCALE FEATURES float* norms = calloc(rdA, sizeof(float)); for(i=0; i<my_ldA; 
i++) { for(j=0; j<rdA; j++) { norms[j] += pow( A[i*(rdA+1) + j], 2); } } MPI_Reduce(norms, dummyFloat, rdA, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Bcast(norms, rdA, MPI_FLOAT, 0, MPI_COMM_WORLD); //norms now holds the 2-norms of the total columns of A for(j=0; j<rdA; j++) { if(norms[j] > 0.0001) cblas_sscal(my_ldA, 1.0 / norms[j], A + j, rdA + 1); } //COMPUTATION LOOP while(1) { MPI_Recv(&dummyInt, 0, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status); //Check the tag to determine what to do next if (status.MPI_TAG == TAG_DIE) { break; } else if (status.MPI_TAG == TAG_AX) { //Multiply A * x //Get xvalue MPI_Bcast(xvalue, rdA+1, MPI_FLOAT, 0, MPI_COMM_WORLD); //Multiply: resultVector = A*xvalue cblas_sgemv(CblasRowMajor, CblasNoTrans, my_ldA, rdA+1, 1.0, A, rdA+1, xvalue, 1, 0.0, resultVector, 1); //Gather xvalues MPI_Gatherv(resultVector, my_ldA, MPI_FLOAT, dummyFloat, &dummyInt, &dummyInt, MPI_FLOAT, 0, MPI_COMM_WORLD); } else if (status.MPI_TAG == TAG_ATX) { //Multiply A^t * x //Get xvalue MPI_Scatterv(dummyFloat, &dummyInt, &dummyInt, MPI_FLOAT, xvalue, my_ldA, MPI_FLOAT, 0, MPI_COMM_WORLD); //Multiply: resultVector = A'*xvalue cblas_sgemv(CblasRowMajor, CblasTrans, my_ldA, rdA+1, 1.0, A, rdA+1, xvalue, 1, 0.0, resultVector, 1); //Sum resultVectors to get final result MPI_Reduce(resultVector, dummyFloat, rdA+1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); } else if (status.MPI_TAG == TAG_ATAX) { //Multiply A^t * A * x //Get xvalue MPI_Bcast(xvalue, rdA+1, MPI_FLOAT, 0, MPI_COMM_WORLD); //Multiply: tempHolder = A*xvalue cblas_sgemv(CblasRowMajor, CblasNoTrans, my_ldA, rdA+1, 1.0, A, rdA+1, xvalue, 1, 0.0, tempHolder, 1); //Multiply: resultVector = A^t * tempHolder cblas_sgemv(CblasRowMajor, CblasTrans, my_ldA, rdA+1, 1.0, A, rdA+1, tempHolder, 1, 0.0, resultVector, 1); //Gather and sum results MPI_Reduce(resultVector, dummyFloat, rdA+1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); } } free(A); free(xvalue); free(tempHolder); free(resultVector); free(shifts); 
free(ones); free(norms); return; }
// B <- A - B void StrassenSingleProblem::matrix_subtract_inplace(int m, int n, float *A, int lda, float *B, int ldb) { for (int i = 0; i < n; i++) { cblas_saxpy(m, -1, A + i*lda, 1, B + i*ldb, 1); } }
/**
 * y <- alpha * x + beta * y, done as a scale of y followed by an axpy.
 *
 * The scale runs over y->width elements and the axpy over x.width elements.
 */
void CpuDevice<float>::Axpby(float alpha, const Values<float> &x, float beta, Values<float> *y) {
  // TODO(robertsdionne): Figure out why clang thinks cblas_saxpby is an undefined symbol.

  // y <- beta * y
  cblas_sscal(y->width, beta, y->values.data(), 1);

  // y <- alpha * x + y
  cblas_saxpy(x.width, alpha, x.values.data(), 1, y->values.data(), 1);
}
void LLC::Encode_with_max_pooling(const float* const data, const uint32_t dim, const uint32_t num_frame, float* const code) const { if (data == NULL || dim != dim_ || num_frame <= 0) { cerr << "ERROR in input data" << endl; exit(-1); } if (!has_setup_) { cerr << "ERROR: Must call SetUp() before." << endl; exit(-1); } vl_uint32* index = (vl_uint32*) vl_malloc( sizeof(vl_uint32) * num_knn_ * num_frame); memset(index, 0, num_knn_ * num_frame); float* dist(NULL); vl_kdforest_query_with_array(kdforest_model_, index, num_knn_, num_frame, dist, data); // start to encode const uint32_t len_code = num_base_; memset(code, 0, sizeof(float) * len_code); const uint32_t len_z = dim_ * num_knn_; const uint32_t len_C = num_knn_ * num_knn_; const uint32_t len_b = num_knn_; float* z = (float*) malloc(sizeof(float) * len_z); float* C = (float*) malloc(sizeof(float) * len_C); float* b = (float*) malloc(sizeof(float) * len_b); memset(z, 0, sizeof(float) * len_z); memset(C, 0, sizeof(float) * len_C); memset(b, 0, sizeof(float) * len_b); double sum(0); const float* base = base_.get(); for (uint32_t i = 0; i < num_frame; i++) { uint32_t tmp_ind; // z = B_i - 1 * x_i' for (uint32_t n = 0; n < num_knn_; n++) { tmp_ind = (uint32_t) index[i * num_knn_ + n]; memcpy(z + n * dim_, base + tmp_ind * dim_, sizeof(float) * dim_); cblas_saxpy(dim_, -1.0f, data + i * dim_, 1, z + n * dim_, 1); } // C = z * z', i.e. 
covariance matrix for (uint32_t m = 0; m < num_knn_; ++m) for (uint32_t n = m; n < num_knn_; ++n) { float sum = cblas_sdot(dim_, z + m * dim_, 1, z + n * dim_, 1); C[m * num_knn_ + n] = sum; C[n * num_knn_ + m] = sum; } sum = 0; for (uint32_t m = 0; m < num_knn_; m++) sum += C[m * num_knn_ + m]; sum = sum * beta_; for (uint32_t m = 0; m < num_knn_; m++) C[m * num_knn_ + m] += sum; for (uint32_t m = 0; m < num_knn_; m++) b[m] = 1; // solve { char upper_triangle = 'U'; int INFO; int int_one = 1; const int num_knn = (int) num_knn_; sposv(&upper_triangle, &num_knn, &int_one, C, &num_knn, b, &num_knn, &INFO); } sum = 0; for (uint32_t m = 0; m < num_knn_; m++) sum += b[m]; cblas_sscal(num_knn_, 1.0 / sum, b, 1); for (uint32_t m = 0; m < num_knn_; m++) { tmp_ind = (uint32_t) index[i * num_knn_ + m]; if (code[tmp_ind] < b[m]) code[tmp_ind] = b[m]; } } free(index); free(z); free(C); free(b); }
/*
 * Updates the tile pair (A1, A2) using V, the triangular factor T and the
 * workspace WORK. All matrices are addressed column-major.
 *
 * In every supported case the work matrix B (held in WORK) is first set to
 * A1 plus a product of A2 with V or V', then multiplied by op(T) with `trans`
 * selecting the transposition, and finally used to update A2 (gemm) and A1
 * (column-wise saxpy, A1 = A1 - B).
 *
 * side    - PlasmaLeft or PlasmaRight
 * trans   - passed to cblas_strmm as the transposition of T
 * direct  - only PlasmaForward is implemented
 * storev  - PlasmaColumnwise, otherwise the row-wise formulas are used
 * M1, N1  - dimensions of A1; M2, N2 - dimensions of A2 (M2 == M1 is required
 *           for PlasmaRight, N2 == N1 for PlasmaLeft)
 * K       - order of T and number of rows/columns of B
 * LDA1, LDA2, LDV, LDT, LDWORK - leading dimensions
 *
 * Returns PLASMA_SUCCESS, a negative value -i when a dimension argument is
 * illegal (-5 M1, -6 N1, -7 M2, -8 N2, -9 K), or PLASMA_ERR_NOT_SUPPORTED for
 * a non-forward direction.
 */
int CORE_stsrfb(int side, int trans, int direct, int storev,
                int M1, int N1, int M2, int N2, int K,
                float *A1, int LDA1,
                float *A2, int LDA2,
                float *V, int LDV,
                float *T, int LDT,
                float *WORK, int LDWORK)
{
    /* Scalars +1 and -1 used as alpha/beta in the BLAS calls below */
    static float zone = 1.0;
    static float mzone = -1.0;

    int j;

    /* Check input arguments */
    if (M1 < 0) {
        coreblas_error(5, "Illegal value of M1");
        return -5;
    }
    if (N1 < 0) {
        coreblas_error(6, "Illegal value of N1");
        return -6;
    }
    if ( (M2 < 0) ||
         ( (M2 != M1) && (side == PlasmaRight) ) ){
        coreblas_error(7, "Illegal value of M2");
        return -7;
    }
    if ( (N2 < 0) ||
         ( (N2 != N1) && (side == PlasmaLeft) ) ){
        coreblas_error(8, "Illegal value of N2");
        return -8;
    }
    if (K < 0) {
        coreblas_error(9, "Illegal value of K");
        return -9;
    }

    /* Quick return */
    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
        return PLASMA_SUCCESS;

    if (storev == PlasmaColumnwise) {
        if (direct == PlasmaForward) {
            /*
             * Columnwise / Forward / Left
             */
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V' * A2
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N1,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasTrans, CblasNoTrans,
                    K, N2, M2,
                    (zone), V, LDV,
                    A2, LDA2,
                    (zone), WORK, LDWORK);

                /*
                 * A2 = A2 - V*T*B -> B = T*B, A2 = A2 - V*B
                 */
                cblas_strmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), V, LDV,
                    WORK, LDWORK,
                    (zone), A2, LDA2);

                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < N1; j++) {
                    cblas_saxpy(
                        K, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
            /*
             * Columnwise / Forward / Right
             */
            else {
                /*
                 * B = A1 + A2 * V
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M1, K,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, K, N2,
                    (zone), A2, LDA2,
                    V, LDV,
                    (zone), WORK, LDWORK);

                /*
                 * A2 = A2 - B*T*V' -> B = B*T, A2 = A2 - B*V'
                 */
                cblas_strmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasTrans,
                    M2, N2, K,
                    (mzone), WORK, LDWORK,
                    V, LDV,
                    (zone), A2, LDA2);

                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < K; j++) {
                    cblas_saxpy(
                        M1, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (ColMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    else {
        if (direct == PlasmaForward) {
            /*
             * Rowwise / Forward / Left
             */
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V * A2
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N1,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    K, N2, M2,
                    (zone), V, LDV,
                    A2, LDA2,
                    (zone), WORK, LDWORK);

                /*
                 * A2 = A2 - V'*T*B -> B = T*B, A2 = A2 - V'*B
                 */
                cblas_strmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), V, LDV,
                    WORK, LDWORK,
                    (zone), A2, LDA2);

                /*
                 * A1 = A1 - B
                 */
                for(j=0; j<N1; j++) {
                    cblas_saxpy(
                        K, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
            /*
             * Rowwise / Forward / Right
             */
            else {
                /*
                 * B = A1 + A2 * V'
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M1, K,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasTrans,
                    M2, K, N2,
                    (zone), A2, LDA2,
                    V, LDV,
                    (zone), WORK, LDWORK);

                /*
                 * A2 = A2 - B*T*V -> B = B*T, A2 = A2 - B*V
                 * (V is applied untransposed by the gemm below)
                 */
                cblas_strmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), WORK, LDWORK,
                    V, LDV,
                    (zone), A2, LDA2);

                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < K; j++) {
                    cblas_saxpy(
                        M1, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (RowMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    return PLASMA_SUCCESS;
}
/*
 * JNI entry point: applies the m transforms in `jts` repeatedly, n times,
 * starting from the single point (0, 0), and returns a new Points object
 * holding the resulting ipow(m, n) x and y coordinates.
 *
 * Each Transform contributes its float fields a..f: a, b, c, d become the
 * srotm parameter block (with flag -1) and e / f are added to the x / y
 * coordinates afterwards through a stride-0 saxpy.
 *
 * Fix over the previous version: the local reference returned by
 * GetObjectArrayElement is deleted inside the loop, so the number of live
 * local references no longer grows with the number of transforms.
 */
JNIEXPORT jobject JNICALL Java_MKL_apply(JNIEnv *env, jclass cl, jint n, jobjectArray jts)
{
	int m = (*env)->GetArrayLength(env, jts);
	float **p = (float **)malloc(m * sizeof (float *));

	jclass classTransform = (*env)->FindClass(env, "Transform");
	jfieldID idA = (*env)->GetFieldID(env, classTransform, "a", "F");
	jfieldID idB = (*env)->GetFieldID(env, classTransform, "b", "F");
	jfieldID idC = (*env)->GetFieldID(env, classTransform, "c", "F");
	jfieldID idD = (*env)->GetFieldID(env, classTransform, "d", "F");
	jfieldID idE = (*env)->GetFieldID(env, classTransform, "e", "F");
	jfieldID idF = (*env)->GetFieldID(env, classTransform, "f", "F");

	int i;
	jobject t;

	/* Copy every transform into a 7-float row: { -1, a, b, c, d, e, f }. */
	for (i = 0; i < m; ++i)
	{
		t = (*env)->GetObjectArrayElement(env, jts, i);
		p[i] = (float *)malloc(7 * sizeof (float));
		p[i][0] = -1;
		p[i][1] = (*env)->GetFloatField(env, t, idA);
		p[i][2] = (*env)->GetFloatField(env, t, idB);
		p[i][3] = (*env)->GetFloatField(env, t, idC);
		p[i][4] = (*env)->GetFloatField(env, t, idD);
		p[i][5] = (*env)->GetFloatField(env, t, idE);
		p[i][6] = (*env)->GetFloatField(env, t, idF);
		/* Done with this element: drop its local reference right away. */
		(*env)->DeleteLocalRef(env, t);
	}

	int k = ipow(m, n);
	jfloatArray jxs = (*env)->NewFloatArray(env, k);
	jfloatArray jys = (*env)->NewFloatArray(env, k);
	jfloat *xs = (*env)->GetFloatArrayElements(env, jxs, NULL);
	jfloat *ys = (*env)->GetFloatArrayElements(env, jys, NULL);
	xs[0] = 0;
	ys[0] = 0;

	int j;
	int w;
	for (i = 0; i < n; ++i)
	{
		/* The first w points are valid; replicate them into m blocks of w. */
		w = ipow(m, i);
		for (j = 1; j < m; ++j)
		{
			cblas_scopy(w, xs, 1, xs + w * j, 1);
			cblas_scopy(w, ys, 1, ys + w * j, 1);
		}
		/* Block j gets transform j: srotm for the 2x2 part, then the offsets. */
		for (j = 0; j < m; ++j)
		{
			cblas_srotm(w, xs + w * j, 1, ys + w * j, 1, p[j]);
			cblas_saxpy(w, 1, &(p[j][5]), 0, xs + w * j, 1);
			cblas_saxpy(w, 1, &(p[j][6]), 0, ys + w * j, 1);
		}
	}

	for (i = 0; i < m; ++i)
	{
		free(p[i]);
	}
	free(p);

	/* Mode 0: copy back (if needed) and free the native buffers. */
	(*env)->ReleaseFloatArrayElements(env, jxs, xs, 0);
	(*env)->ReleaseFloatArrayElements(env, jys, ys, 0);

	jclass classPoints = (*env)->FindClass(env, "Points");
	jmethodID idConstructorPoints = (*env)->GetMethodID(env, classPoints, "<init>", "([F[F)V");
	jobject ps = (*env)->NewObject(env, classPoints, idConstructorPoints, jxs, jys);

	return ps;
}
/*
 * Wrapper around cblas_saxpy: Y <- alpha * X + Y over n elements, reading X
 * with stride incX and updating Y with stride incY. All arguments are
 * forwarded unchanged.
 */
void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX,
                  float *Y, const int incY)
{
	cblas_saxpy(n, alpha, X, incX, Y, incY);
}
static PyObject * dotblas_matrixproduct(PyObject *dummy, PyObject *args) { PyObject *op1, *op2; PyArrayObject *ap1=NULL, *ap2=NULL, *ret=NULL; int j, l, lda, ldb, ldc; int typenum, nd; intp ap1stride=0; intp dimensions[MAX_DIMS]; intp numbytes; static const float oneF[2] = {1.0, 0.0}; static const float zeroF[2] = {0.0, 0.0}; static const double oneD[2] = {1.0, 0.0}; static const double zeroD[2] = {0.0, 0.0}; double prior1, prior2; PyTypeObject *subtype; PyArray_Descr *dtype; MatrixShape ap1shape, ap2shape; if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL; /* * "Matrix product" using the BLAS. * Only works for float double and complex types. */ typenum = PyArray_ObjectType(op1, 0); typenum = PyArray_ObjectType(op2, typenum); /* This function doesn't handle other types */ if ((typenum != PyArray_DOUBLE && typenum != PyArray_CDOUBLE && typenum != PyArray_FLOAT && typenum != PyArray_CFLOAT)) { return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct(op1, op2)); } dtype = PyArray_DescrFromType(typenum); ap1 = (PyArrayObject *)PyArray_FromAny(op1, dtype, 0, 0, ALIGNED, NULL); if (ap1 == NULL) return NULL; Py_INCREF(dtype); ap2 = (PyArrayObject *)PyArray_FromAny(op2, dtype, 0, 0, ALIGNED, NULL); if (ap2 == NULL) goto fail; if ((ap1->nd > 2) || (ap2->nd > 2)) { /* This function doesn't handle dimensions greater than 2 (or negative striding) -- other than to ensure the dot function is altered */ if (!altered) { /* need to alter dot product */ PyObject *tmp1, *tmp2; tmp1 = PyTuple_New(0); tmp2 = dotblas_alterdot(NULL, tmp1); Py_DECREF(tmp1); Py_DECREF(tmp2); } ret = (PyArrayObject *)PyArray_MatrixProduct((PyObject *)ap1, (PyObject *)ap2); Py_DECREF(ap1); Py_DECREF(ap2); return PyArray_Return(ret); } if (_bad_strides(ap1)) { op1 = PyArray_NewCopy(ap1, PyArray_ANYORDER); Py_DECREF(ap1); ap1 = (PyArrayObject *)op1; if (ap1 == NULL) goto fail; } if (_bad_strides(ap2)) { op2 = PyArray_NewCopy(ap2, PyArray_ANYORDER); Py_DECREF(ap2); ap2 = (PyArrayObject *)op2; if 
(ap2 == NULL) goto fail; } ap1shape = _select_matrix_shape(ap1); ap2shape = _select_matrix_shape(ap2); if (ap1shape == _scalar || ap2shape == _scalar) { PyArrayObject *oap1, *oap2; oap1 = ap1; oap2 = ap2; /* One of ap1 or ap2 is a scalar */ if (ap1shape == _scalar) { /* Make ap2 the scalar */ PyArrayObject *t = ap1; ap1 = ap2; ap2 = t; ap1shape = ap2shape; ap2shape = _scalar; } if (ap1shape == _row) ap1stride = ap1->strides[1]; else if (ap1->nd > 0) ap1stride = ap1->strides[0]; if (ap1->nd == 0 || ap2->nd == 0) { intp *thisdims; if (ap1->nd == 0) { nd = ap2->nd; thisdims = ap2->dimensions; } else { nd = ap1->nd; thisdims = ap1->dimensions; } l = 1; for (j=0; j<nd; j++) { dimensions[j] = thisdims[j]; l *= dimensions[j]; } } else { l = oap1->dimensions[oap1->nd-1]; if (oap2->dimensions[0] != l) { PyErr_SetString(PyExc_ValueError, "matrices are not aligned"); goto fail; } nd = ap1->nd + ap2->nd - 2; /* nd = 0 or 1 or 2 */ /* If nd == 0 do nothing ... */ if (nd == 1) { /* Either ap1->nd is 1 dim or ap2->nd is 1 dim and the other is 2-dim */ dimensions[0] = (oap1->nd == 2) ? oap1->dimensions[0] : oap2->dimensions[1]; l = dimensions[0]; /* Fix it so that dot(shape=(N,1), shape=(1,)) and dot(shape=(1,), shape=(1,N)) both return an (N,) array (but use the fast scalar code) */ } else if (nd == 2) { dimensions[0] = oap1->dimensions[0]; dimensions[1] = oap2->dimensions[1]; /* We need to make sure that dot(shape=(1,1), shape=(1,N)) and dot(shape=(N,1),shape=(1,1)) uses scalar multiplication appropriately */ if (ap1shape == _row) l = dimensions[1]; else l = dimensions[0]; } } } else { /* (ap1->nd <= 2 && ap2->nd <= 2) */ /* Both ap1 and ap2 are vectors or matrices */ l = ap1->dimensions[ap1->nd-1]; if (ap2->dimensions[0] != l) { PyErr_SetString(PyExc_ValueError, "matrices are not aligned"); goto fail; } nd = ap1->nd+ap2->nd-2; if (nd == 1) dimensions[0] = (ap1->nd == 2) ? 
ap1->dimensions[0] : ap2->dimensions[1]; else if (nd == 2) { dimensions[0] = ap1->dimensions[0]; dimensions[1] = ap2->dimensions[1]; } } /* Choose which subtype to return */ if (ap1->ob_type != ap2->ob_type) { prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0); prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0); subtype = (prior2 > prior1 ? ap2->ob_type : ap1->ob_type); } else { prior1 = prior2 = 0.0; subtype = ap1->ob_type; } ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions, typenum, NULL, NULL, 0, 0, (PyObject *) (prior2 > prior1 ? ap2 : ap1)); if (ret == NULL) goto fail; numbytes = PyArray_NBYTES(ret); memset(ret->data, 0, numbytes); if (numbytes==0 || l == 0) { Py_DECREF(ap1); Py_DECREF(ap2); return PyArray_Return(ret); } if (ap2shape == _scalar) { /* Multiplication by a scalar -- Level 1 BLAS */ /* if ap1shape is a matrix and we are not contiguous, then we can't just blast through the entire array using a single striding factor */ NPY_BEGIN_ALLOW_THREADS if (typenum == PyArray_DOUBLE) { if (l == 1) { *((double *)ret->data) = *((double *)ap2->data) * \ *((double *)ap1->data); } else if (ap1shape != _matrix) { cblas_daxpy(l, *((double *)ap2->data), (double *)ap1->data, ap1stride/sizeof(double), (double *)ret->data, 1); } else { int maxind, oind, i, a1s, rets; char *ptr, *rptr; double val; maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 
0 : 1); oind = 1-maxind; ptr = ap1->data; rptr = ret->data; l = ap1->dimensions[maxind]; val = *((double *)ap2->data); a1s = ap1->strides[maxind] / sizeof(double); rets = ret->strides[maxind] / sizeof(double); for (i=0; i < ap1->dimensions[oind]; i++) { cblas_daxpy(l, val, (double *)ptr, a1s, (double *)rptr, rets); ptr += ap1->strides[oind]; rptr += ret->strides[oind]; } } } else if (typenum == PyArray_CDOUBLE) { if (l == 1) { cdouble *ptr1, *ptr2, *res; ptr1 = (cdouble *)ap2->data; ptr2 = (cdouble *)ap1->data; res = (cdouble *)ret->data; res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag; res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real; } else if (ap1shape != _matrix) { cblas_zaxpy(l, (double *)ap2->data, (double *)ap1->data, ap1stride/sizeof(cdouble), (double *)ret->data, 1); } else { int maxind, oind, i, a1s, rets; char *ptr, *rptr; double *pval; maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1); oind = 1-maxind; ptr = ap1->data; rptr = ret->data; l = ap1->dimensions[maxind]; pval = (double *)ap2->data; a1s = ap1->strides[maxind] / sizeof(cdouble); rets = ret->strides[maxind] / sizeof(cdouble); for (i=0; i < ap1->dimensions[oind]; i++) { cblas_zaxpy(l, pval, (double *)ptr, a1s, (double *)rptr, rets); ptr += ap1->strides[oind]; rptr += ret->strides[oind]; } } } else if (typenum == PyArray_FLOAT) { if (l == 1) { *((float *)ret->data) = *((float *)ap2->data) * \ *((float *)ap1->data); } else if (ap1shape != _matrix) { cblas_saxpy(l, *((float *)ap2->data), (float *)ap1->data, ap1stride/sizeof(float), (float *)ret->data, 1); } else { int maxind, oind, i, a1s, rets; char *ptr, *rptr; float val; maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 
0 : 1); oind = 1-maxind; ptr = ap1->data; rptr = ret->data; l = ap1->dimensions[maxind]; val = *((float *)ap2->data); a1s = ap1->strides[maxind] / sizeof(float); rets = ret->strides[maxind] / sizeof(float); for (i=0; i < ap1->dimensions[oind]; i++) { cblas_saxpy(l, val, (float *)ptr, a1s, (float *)rptr, rets); ptr += ap1->strides[oind]; rptr += ret->strides[oind]; } } } else if (typenum == PyArray_CFLOAT) { if (l == 1) { cfloat *ptr1, *ptr2, *res; ptr1 = (cfloat *)ap2->data; ptr2 = (cfloat *)ap1->data; res = (cfloat *)ret->data; res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag; res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real; } else if (ap1shape != _matrix) { cblas_caxpy(l, (float *)ap2->data, (float *)ap1->data, ap1stride/sizeof(cfloat), (float *)ret->data, 1); } else { int maxind, oind, i, a1s, rets; char *ptr, *rptr; float *pval; maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1); oind = 1-maxind; ptr = ap1->data; rptr = ret->data; l = ap1->dimensions[maxind]; pval = (float *)ap2->data; a1s = ap1->strides[maxind] / sizeof(cfloat); rets = ret->strides[maxind] / sizeof(cfloat); for (i=0; i < ap1->dimensions[oind]; i++) { cblas_caxpy(l, pval, (float *)ptr, a1s, (float *)rptr, rets); ptr += ap1->strides[oind]; rptr += ret->strides[oind]; } } } NPY_END_ALLOW_THREADS }
static int check_solution(PLASMA_enum transA, PLASMA_enum transB, int M, int N, int K, float alpha, float *A, int LDA, float *B, int LDB, float beta, float *Cref, float *Cplasma, int LDC) { int info_solution; float Anorm, Bnorm, Cinitnorm, Cplasmanorm, Clapacknorm, Rnorm, result; float eps; float beta_const; float *work = (float *)malloc(max(K,max(M, N))* sizeof(float)); int Am, An, Bm, Bn; beta_const = -1.0; if (transA == PlasmaNoTrans) { Am = M; An = K; } else { Am = K; An = M; } if (transB == PlasmaNoTrans) { Bm = K; Bn = N; } else { Bm = N; Bn = K; } Anorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), Am, An, A, LDA, work); Bnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), Bm, Bn, B, LDB, work); Cinitnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Cref, LDC, work); Cplasmanorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Cplasma, LDC, work); cblas_sgemm(CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, (alpha), A, LDA, B, LDB, (beta), Cref, LDC); Clapacknorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Cref, LDC, work); cblas_saxpy(LDC * N, (beta_const), Cplasma, 1, Cref, 1); Rnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Cref, LDC, work); eps = LAPACKE_slamch_work('e'); printf("Rnorm %e, Anorm %e, Bnorm %e, Cinitnorm %e, Cplasmanorm %e, Clapacknorm %e\n", Rnorm, Anorm, Bnorm, Cinitnorm, Cplasmanorm, Clapacknorm); result = Rnorm / ((Anorm + Bnorm + Cinitnorm) * N * eps); printf("============\n"); printf("Checking the norm of the difference against reference SGEMM \n"); printf("-- ||Cplasma - Clapack||_oo/((||A||_oo+||B||_oo+||C||_oo).N.eps) = %e \n", result); if ( isnan(Rnorm) || isinf(Rnorm) || isnan(result) || isinf(result) || (result > 10.0) ) { printf("-- The solution is suspicious ! \n"); info_solution = 1; } else { printf("-- The solution is CORRECT ! 
\n"); info_solution= 0 ; } free(work); return info_solution; }
// Single-precision AXPY with caller-supplied strides.
// Forwards to cblas_saxpy, i.e. Y = alpha * X + Y over N elements, reading X
// with increment ldx and updating Y with increment ldy.
void caffe_axpy<float>(const int N, const float alpha, const float* X,
                       float* Y, const int ldx, const int ldy) {
  const int x_stride = ldx;
  const int y_stride = ldy;
  cblas_saxpy(N, alpha, X, x_stride, Y, y_stride);
}
// // Overloaded function for dispatching to // * CBLAS backend, and // * float value-type. // inline void axpy( const int n, const float a, const float* x, const int incx, float* y, const int incy ) { cblas_saxpy( n, a, x, incx, y, incy ); }
// For each of the M consecutive length-N segments of X, accumulates a scaled
// copy of b:  X[r*N .. r*N+N-1] += (alpha * a[r]) * b[0 .. N-1].
// a supplies one scale factor per segment; b is reused for every segment.
void caffe_cpu_xpasv<float>(const int M, const int N, const float alpha,
                            float* X, const float* a, const float* b) {
  for (int row = 0; row < M; ++row) {
    const float row_scale = alpha * a[row];
    float* const row_begin = X + row * N;
    cblas_saxpy(N, row_scale, b, 1, row_begin, 1);
  }
}
// Contiguous single-precision AXPY: b = alpha * a + b over n1 elements,
// delegated to cblas_saxpy with unit increments for both vectors.
// NOTE(review): n1 is a size_t that is converted implicitly to the integer
// type cblas_saxpy takes for its length; confirm callers never exceed it.
inline void blas_axpy(size_t n1, float alpha, float* a, float* b)
{
    const int unit_stride = 1;
    cblas_saxpy(n1, alpha, a, unit_stride, b, unit_stride);
}
int testing_strsm(int argc, char **argv) { /* Check for number of arguments*/ if ( argc != 5 ) { USAGE("TRSM", "alpha M N LDA LDB", " - alpha : alpha coefficient\n" " - M : number of rows of matrices B\n" " - N : number of columns of matrices B\n" " - LDA : leading dimension of matrix A\n" " - LDB : leading dimension of matrix B\n"); return -1; } float alpha = (float) atol(argv[0]); int M = atoi(argv[1]); int N = atoi(argv[2]); int LDA = atoi(argv[3]); int LDB = atoi(argv[4]); float eps; int info_solution; int s, u, t, d, i; int LDAxM = LDA*max(M,N); int LDBxN = LDB*max(M,N); float *A = (float *)malloc(LDAxM*sizeof(float)); float *B = (float *)malloc(LDBxN*sizeof(float)); float *Binit = (float *)malloc(LDBxN*sizeof(float)); float *Bfinal = (float *)malloc(LDBxN*sizeof(float)); /* Check if unable to allocate memory */ if ( (!A) || (!B) || (!Binit) || (!Bfinal)){ printf("Out of Memory \n "); return -2; } eps = LAPACKE_slamch_work('e'); printf("\n"); printf("------ TESTS FOR PLASMA STRSM ROUTINE ------- \n"); printf(" Size of the Matrix B : %d by %d\n", M, N); printf("\n"); printf(" The matrix A is randomly generated for each test.\n"); printf("============\n"); printf(" The relative machine precision (eps) is to be %e \n",eps); printf(" Computational tests pass if scaled residuals are less than 10.\n"); /*---------------------------------------------------------- * TESTING STRSM */ /* Initialize A, B, C */ LAPACKE_slarnv_work(IONE, ISEED, LDAxM, A); LAPACKE_slarnv_work(IONE, ISEED, LDBxN, B); for(i=0;i<max(M,N);i++) A[LDA*i+i] = A[LDA*i+i] + 2.0; for (s=0; s<2; s++) { for (u=0; u<2; u++) { #ifdef COMPLEX for (t=0; t<3; t++) { #else for (t=0; t<2; t++) { #endif for (d=0; d<2; d++) { memcpy(Binit, B, LDBxN*sizeof(float)); memcpy(Bfinal, B, LDBxN*sizeof(float)); /* PLASMA STRSM */ PLASMA_strsm(side[s], uplo[u], trans[t], diag[d], M, N, alpha, A, LDA, Bfinal, LDB); /* Check the solution */ info_solution = check_solution(side[s], uplo[u], trans[t], diag[d], M, N, alpha, 
A, LDA, Binit, Bfinal, LDB); printf("***************************************************\n"); if (info_solution == 0) { printf(" ---- TESTING STRSM (%s, %s, %s, %s) ...... PASSED !\n", sidestr[s], uplostr[u], transstr[t], diagstr[d]); } else { printf(" ---- TESTING STRSM (%s, %s, %s, %s) ... FAILED !\n", sidestr[s], uplostr[u], transstr[t], diagstr[d]); } printf("***************************************************\n"); } } } } free(A); free(B); free(Binit); free(Bfinal); return 0; } /*-------------------------------------------------------------- * Check the solution */ static int check_solution(PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum trans, PLASMA_enum diag, int M, int N, float alpha, float *A, int LDA, float *Bref, float *Bplasma, int LDB) { int info_solution; float Anorm, Binitnorm, Bplasmanorm, Blapacknorm, Rnorm, result; float eps; float mzone = (float)-1.0; float *work = (float *)malloc(max(M, N)* sizeof(float)); int Am, An; if (side == PlasmaLeft) { Am = M; An = M; } else { Am = N; An = N; } Anorm = LAPACKE_slantr_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), lapack_const(uplo), lapack_const(diag), Am, An, A, LDA, work); Binitnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bref, LDB, work); Bplasmanorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bplasma, LDB, work); cblas_strsm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag, M, N, (alpha), A, LDA, Bref, LDB); Blapacknorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bref, LDB, work); cblas_saxpy(LDB * N, (mzone), Bplasma, 1, Bref, 1); Rnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bref, LDB, work); eps = LAPACKE_slamch_work('e'); printf("Rnorm %e, Anorm %e, Binitnorm %e, Bplasmanorm %e, Blapacknorm %e\n", Rnorm, Anorm, Binitnorm, Bplasmanorm, Blapacknorm); result = Rnorm / ((Anorm + Blapacknorm) * max(M,N) * eps); 
printf("============\n"); printf("Checking the norm of the difference against reference STRSM \n"); printf("-- ||Cplasma - Clapack||_oo/((||A||_oo+||B||_oo).N.eps) = %e \n", result); if ( isinf(Blapacknorm) || isinf(Bplasmanorm) || isnan(result) || isinf(result) || (result > 10.0) ) { printf("-- The solution is suspicious ! \n"); info_solution = 1; } else { printf("-- The solution is CORRECT ! \n"); info_solution= 0 ; } free(work); return info_solution; }
/* Exported single-precision AXPY over contiguous arrays:
 * y = alpha * x + y for n elements, delegated to cblas_saxpy with an
 * increment of 1 for both x and y. */
DLLEXPORT void s_axpy(const blasint n, const float alpha, const float x[], float y[])
{
    const int unit_stride = 1;
    cblas_saxpy(n, alpha, x, unit_stride, y, unit_stride);
}
// float variant of caffe_axpy: Y = alpha * X + Y over N contiguous
// elements, carried out by cblas_saxpy with unit increments.
void caffe_axpy<float>(const int N, const float alpha, const float* X,
                       float* Y) {
  const int unit_stride = 1;
  cblas_saxpy(N, alpha, X, unit_stride, Y, unit_stride);
}
// Wrapper function: calls the BLAS routine cblas_saxpy to perform the linear
// operation Y = alpha * X + Y on N contiguous float elements (both
// increments are 1).
void caffe_axpy<float>(const int N, const float alpha, const float* X,
                       float* Y) {
  const int x_inc = 1;
  const int y_inc = 1;
  cblas_saxpy(N, alpha, X, x_inc, Y, y_inc);
}