Example #1
0
    /**
     * Decodes one block of harmonic signals to two output channels.
     *
     * The harmonic inputs are packed row by row into m_input_matrix, multiplied
     * by m_impulses_matrix in a single sgemm, and the product is accumulated
     * into the two running buffers m_linear_vector_left / m_linear_vector_right.
     * The first m_vector_size entries of each buffer are written to outputs[0]
     * and outputs[1]; the buffers are then shifted by m_vector_size samples and
     * the vacated part is cleared for the next call.
     *
     * @param inputs  one pointer per harmonic, each read for m_vector_size samples.
     * @param outputs outputs[0] and outputs[1] each receive m_vector_size samples.
     */
    void DecoderBinaural::process(const float* const* inputs, float** outputs)
    {
        // Row `harmonic` of the input matrix holds the block of that harmonic.
        for(unsigned int harmonic = 0; harmonic < m_number_of_harmonics; ++harmonic)
        {
            cblas_scopy(m_vector_size, inputs[harmonic], 1, m_input_matrix+harmonic*m_vector_size, 1);
        }

        // result (2*impulses_size x vector_size) = impulses (2*impulses_size x harmonics) * input (harmonics x vector_size)
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    (m_impulses_size * 2), m_vector_size, m_number_of_harmonics,
                    1.,
                    m_impulses_matrix, m_number_of_harmonics,
                    m_input_matrix,  m_vector_size,
                    0.,
                    m_result_matrix,  m_vector_size);

        // First m_impulses_size rows of the result: column `sample` is added
        // into the left buffer starting at position `sample`.
        for(unsigned int sample = 0; sample < m_vector_size; ++sample)
        {
            cblas_saxpy(m_impulses_size, 1.f, m_result_matrix + sample, m_vector_size, m_linear_vector_left + sample, 1);
            outputs[0][sample] = m_linear_vector_left[sample];
        }

        // Remaining m_impulses_size rows: same accumulation for the right buffer.
        for(unsigned int sample = 0; sample < m_vector_size; ++sample)
        {
            cblas_saxpy(m_impulses_size, 1.f, m_result_matrix + sample + m_vector_size * m_impulses_size, m_vector_size, m_linear_vector_right + sample, 1);
            outputs[1][sample] = m_linear_vector_right[sample];
        }

        // Move the not-yet-emitted tail to the front of each buffer...
        cblas_scopy(m_impulses_size-1, m_linear_vector_left+m_vector_size, 1, m_linear_vector_left, 1);
        cblas_scopy(m_impulses_size-1, m_linear_vector_right+m_vector_size, 1, m_linear_vector_right, 1);

        // ...and zero the m_vector_size entries that follow it.
#ifdef __APPLE__
        vDSP_vclr(m_linear_vector_left + m_impulses_size - 1, 1, m_vector_size);
        vDSP_vclr(m_linear_vector_right + m_impulses_size - 1, 1, m_vector_size);
#else
        memset(m_linear_vector_left + m_impulses_size - 1, 0, m_vector_size * sizeof(float));
        memset(m_linear_vector_right + m_impulses_size - 1, 0, m_vector_size * sizeof(float));
#endif
    }
Example #2
0
/**
 * Computes y <- alpha * x + y with cblas_saxpy (unit stride on both vectors).
 *
 * @param y     vector updated in place; its `M` member is passed as the
 *              destination buffer.
 * @param alpha scale factor applied to x.
 * @param x     vector that is scaled and added; its `size` gives the length.
 * @return true when the update was performed, false when x.size != y.size
 *         (in which case y is left untouched).
 */
bool axpy(RealVector &y, const REAL alpha, const RealVector &x) {
  // The previous `NULL == &x || NULL == &y` test was removed: a reference can
  // never be null in well-formed C++, so that branch was unreachable and
  // compilers are free to drop it.
  if (x.size != y.size) {
    return false;
  }

  const UINT n = x.size;
  const UINT incX = 1;
  const UINT incY = 1;
  cblas_saxpy(n, alpha, x.M, incX, y.M, incY);
  return true;
}
Example #3
0
// Small self-check for cblas_saxpy: fills x[i] = i and y[i] = 2*i, runs
// y = alpha * x + y, and prints each result next to the value it should have.
int main() {

  const int N = 10;
  const float alpha = 10;
  float x[N], y[N];

  // Set up the operands.
  int idx = 0;
  while ( idx < N ) {
    x[idx] = idx;
    y[idx] = 2*idx;
    ++idx;
  }

  // y = alpha * x + y
  cblas_saxpy( N, alpha, x, 1, y, 1 );

  for ( int k = 0; k < N; ++k ) {
    const float expected = alpha*k + 2*k;
    std::cout << y[k] << " should equal " << expected << std::endl;
  }

  return 0;

}
Example #4
0
 /**
  * Encodes every non-muted source into the harmonics domain and sums them.
  *
  * When m_first_source is not greater than -1 the output is simply zeroed
  * (presumably: no source is currently active - confirm against the code
  * that maintains m_first_source). Otherwise source m_first_source is
  * encoded, widened directly into `outputs`, and every later non-muted source
  * is encoded and widened in m_harmonics_float and then accumulated into
  * `outputs` with cblas_saxpy.
  *
  * @param inputs  one sample per source, indexed by source number.
  * @param outputs m_number_of_harmonics values, overwritten.
  */
 void Map::process(const float* inputs, float* outputs)
 {
     if(!(m_first_source > -1))
     {
         for(unsigned int harmonic = 0; harmonic < m_number_of_harmonics; ++harmonic)
             outputs[harmonic] = 0.f;
         return;
     }

     // The first source writes straight into the output, which initialises it.
     m_encoders[m_first_source]->process(inputs[m_first_source] * m_gains[m_first_source], m_harmonics_float);
     m_widers[m_first_source]->process(m_harmonics_float, outputs);

     for(unsigned int source = m_first_source+1; source < m_number_of_sources; ++source)
     {
         if (m_muted[source])
             continue;

         // Encode and widen in the scratch buffer, then add to the output.
         m_encoders[source]->process(inputs[source] * m_gains[source], m_harmonics_float);
         m_widers[source]->process(m_harmonics_float, m_harmonics_float);
         cblas_saxpy(m_number_of_harmonics, 1.f, m_harmonics_float, 1, outputs, 1);
     }
 }
Example #5
0
/**
 * Computes cScaler * (aScaler * x + y) over every component of every pixel
 * and leaves the result reachable through OutImg.
 *
 * The arithmetic is done in place in yImg's buffer (cblas_saxpy followed by
 * an optional cblas_sscal). If OutImg and yImg do not already refer to the
 * same image, the two smart pointers are swapped afterwards, so OutImg ends
 * up referring to the result and yImg to whatever OutImg referred to before.
 *
 * @param OutImg  receives the result (by pointer swap, see above).
 * @param aScaler factor applied to xImg.
 * @param xImg    first operand; its region and component count define the
 *                element count.
 * @param yImg    second operand; its buffer is overwritten.
 * @param cScaler final scale; the scaling step is skipped when it equals 1.
 */
void AddAllElements(ImagePointerType &OutImg,
                           const PrecisionType aScaler,
                           ImagePointerType &xImg, ImagePointerType &yImg,
                           const PrecisionType cScaler= 1.0F)
{
  const size_t N = xImg->GetLargestPossibleRegion().GetNumberOfPixels()*xImg->GetNumberOfComponentsPerPixel();

  PrecisionType * const xBuffer = GetFirstPointer(xImg);
  PrecisionType * const yBuffer = GetFirstPointer(yImg);

  // y <- aScaler * x + y
  cblas_saxpy(N,aScaler,xBuffer,1,yBuffer,1);

  // Kept from the original: the pointer is fetched but not used afterwards.
  PrecisionType * const outBuffer = GetFirstPointer(OutImg);
  (void)outBuffer;

  if(cScaler != 1.0F)
  {
    // y <- cScaler * y
    cblas_sscal(N,cScaler,yBuffer,1);
  }

  if( OutImg.GetPointer() == yImg.GetPointer())
  {
    return;
  }

  // Hand the result to OutImg; yImg now refers to the former output image so
  // that accidental later use of yImg is likely to be noticed.
  ImagePointerType previousOut = OutImg;
  OutImg = yImg;
  yImg = previousOut;
}
Example #6
0
// Matrix product built from level-1 saxpy updates.
//
// For every column j of C and every k in [0, p), column k of A is scaled by
// B[k*n + j] and added to column j of C, i.e. C is accumulated into rather
// than overwritten. Columns are moved in and out of two scratch vectors of
// length m with get_col / set_col. The process exits with status 1 if either
// scratch vector cannot be allocated.
void mat_saxpy(float* A, float* B, int m, int p, int n, float* C)
{
  float* a_column = (float*) calloc(m,sizeof(float));
  float* c_column = (float*) calloc(m,sizeof(float));

  // Bail out if either scratch vector is missing
  if(a_column == NULL || c_column == NULL) { exit(1); }

  // Walk the columns of B (and therefore of C)
  for(int col = 0; col < n; ++col)
  {
    // Walk the columns of A
    for(int inner = 0; inner < p; ++inner)
    {
      // Fetch column `inner` of A and the current column `col` of C
      get_col(A,m,p,inner,a_column);
      get_col(C,m,n,col,c_column);

      // c_column <- B[inner][col] * a_column + c_column
      cblas_saxpy(m,B[inner*n+col],a_column,1,c_column,1);

      // Store the updated column back into C
      set_col(C,m,n,col,c_column);
    }
  }

  // Release the scratch vectors
  free(a_column);
  free(c_column);
}
Example #7
0
/*
 * JNI entry point: accumulates (dense A) * (sparse B) into C.
 *
 * B is described by a value array (j_B), an index array (j_ir) and a column
 * pointer array (j_jc); both index arrays are shifted by the base jc[0].
 * For every column i in [0, N) and every stored entry j of that column,
 * B[j] times the M-element column of A at offset (ir[j]-base)*lda is added
 * to the M-element column of C at offset i*ldc.
 *
 * All five Java arrays are pinned with GetPrimitiveArrayCritical for the
 * duration of the computation and released in reverse order.
 */
JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_smcscm 
(JNIEnv * env, jobject calling_obj, jint M, jint N, jfloatArray j_A, jint lda, 
 jfloatArray j_B, jintArray j_ir, jintArray j_jc, jfloatArray j_C, jint ldc){
	jfloat * dense = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
	jfloat * values = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
	jint * rowIndex = (*env)->GetPrimitiveArrayCritical(env, j_ir, JNI_FALSE);
	jint * colStart = (*env)->GetPrimitiveArrayCritical(env, j_jc, JNI_FALSE);
	jfloat * result = (*env)->GetPrimitiveArrayCritical(env, j_C, JNI_FALSE);

	/* Index base shared by the column pointers and the row indices. */
	const int base = colStart[0];
	int col;
	for (col = 0; col < N; col++) {
		const int first = colStart[col]-base;
		const int last = colStart[col+1]-base;
		int entry;
		for (entry = first; entry < last; entry++) {
			const int row = rowIndex[entry]-base;
			cblas_saxpy(M, values[entry], dense+(row*lda), 1, result+(col*ldc), 1);
		}
	}

	(*env)->ReleasePrimitiveArrayCritical(env, j_C, result, 0);
	(*env)->ReleasePrimitiveArrayCritical(env, j_jc, colStart, 0);
	(*env)->ReleasePrimitiveArrayCritical(env, j_ir, rowIndex, 0);
	(*env)->ReleasePrimitiveArrayCritical(env, j_B, values, 0);
	(*env)->ReleasePrimitiveArrayCritical(env, j_A, dense, 0);
}
Example #8
0
// In addition, MKL comes with an additional function axpby that is not present
// in standard blas. We will simply use a two-step (inefficient, of course) way
// to mimic that.
//
// Computes Y <- alpha * X + beta * Y over N elements: Y is first scaled by
// beta with cblas_sscal, then alpha * X is added to it with cblas_saxpy.
// incX and incY are forwarded unchanged to the two BLAS calls.
inline void cblas_saxpby(const int N, const float alpha, const float* X,
                         const int incX, const float beta, float* Y,
                         const int incY) {
  cblas_sscal(N, beta, Y, incY);
  cblas_saxpy(N, alpha, X, incX, Y, incY);
}
Example #9
0
/*
 * Backward pass of max pooling using the MKL DNN primitives stored in the
 * table addressed by `dnnprimitives`.
 *
 * Steps, in order:
 *   1. On the first call (initOK == 0) the backward resources are set up via
 *      Init_b.
 *   2. The incoming gradient (gradOutput) is converted into the primitive's
 *      layout when a conversion primitive is registered.
 *   3. The backward-input buffer is zeroed and the pooling backward primitive
 *      is executed into it.
 *   4. When beta != 0 the previous content of gradInput is added to the new
 *      gradient (converted to the backward-input layout first if its layout
 *      differs). Note that beta only acts as a switch here: the addition uses
 *      a factor of 1.0.
 *   5. gradInput's layout and data slots are pointed at the backward-input
 *      layout and buffer.
 *
 * gradOutput / gradInput / dnnprimitives are addresses passed as integers;
 * gradInput and dnnprimitives are indexed as arrays of long long.
 * CHECK_ERR is defined elsewhere; it presumably jumps to ERR_RETURN on
 * failure - confirm against its definition.
 */
void MaxPooling_bprop(
    unsigned long long gradOutput,  //input, N*outC*outH*outW
    unsigned long long gradInput,   //output result
    unsigned long long dnnprimitives,
    int initOK, const float beta)
{
    dnnError_t err;
    long long* primitives = (long long*)dnnprimitives;
    if (initOK == 0)
    {
        Init_b((long long *)gradInput, (long long *)gradOutput, primitives);
    }

    //get resource
    float* resPool[dnnResourceNumber] = {0};
    float* OutPtr= GetPtr(gradOutput);

    resPool[dnnResourceDiffSrc]   = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    resPool[dnnResourceDiffDst]   = OutPtr;
    resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE];

    //make conversion for gradOutput if necessary
    dnnPrimitive_t cv_out_b = (dnnPrimitive_t)(primitives[CV_POOLING_BACKWARD_OUTPUT]);
    if (cv_out_b)
    {
        float* buf_out_b = (float*)primitives[BUFFER_POOLING_BACKWARD_OUTPUT];
        CHECK_ERR( dnnConversionExecute_F32(cv_out_b, OutPtr, buf_out_b), err );
        resPool[dnnResourceDiffDst] = buf_out_b;
    }

    //clear the destination buffer before the primitive writes into it
    //(the layout size is in bytes, hence the division by 4 per float)
    long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)primitives[POOL_L_B_I]) ;
    float * tempPtr = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    #pragma omp parallel for
    for (long long i = 0; i < grad_in_len/4; ++i)
    {
        tempPtr[i] = 0;
    }

    CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_BACKWARD], (void**)resPool), err );

    if(beta != 0.0)
    {
        //require to add previous delta
        long long* ptr_gradInput = (long long*)gradInput;
        float* pFirstBuf = GetPtr(gradInput);
        dnnLayout_t layout_pre_delta = (dnnLayout_t)ptr_gradInput[MKLLayout];
        if(layout_pre_delta == NULL) layout_pre_delta = (dnnLayout_t)primitives[POOL_L_I];
        dnnLayout_t layout_add_delta = (dnnLayout_t)primitives[POOL_L_B_I];
        float* temp_memory = NULL;
        if (!dnnLayoutCompare_F32(layout_add_delta, layout_pre_delta))
        {
            //layouts differ: convert the previous delta into a temporary buffer
            CHECK_ERR( dnnAllocateBuffer_F32((void**)&temp_memory, layout_add_delta) , err );
            dnnPrimitive_t cv = NULL;
            CHECK_ERR( dnnConversionCreate_F32(&cv, layout_pre_delta, layout_add_delta), err );
            CHECK_ERR( dnnConversionExecute_F32(cv, pFirstBuf, temp_memory), err );
            //the conversion primitive is single-use; it was previously never
            //released, leaking one primitive per call
            dnnDelete_F32(cv);
            pFirstBuf = temp_memory;
        }
        long len = (long long)dnnLayoutGetMemorySize_F32(layout_add_delta) / 4 ;
        cblas_saxpy(len, 1.0, pFirstBuf, 1, (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1);
        if (temp_memory != NULL)
            dnnReleaseBuffer_F32(temp_memory);
    }

    //publish the result: gradInput now refers to the backward-input buffer
    ((long long *)gradInput)[MKLLayout] = primitives[POOL_L_B_I];
    ((long long *)gradInput)[MKLPtr]    = primitives[BUFFER_POOLING_BACKWARD_INPUT];

ERR_RETURN:
    return;
}
Example #10
0
	/**
	 * Iteratively refines x with damped Gauss-Newton steps.
	 *
	 * Each iteration evaluates the Jacobian with jacf, forms the m x m normal
	 * matrix with cblas_ssyrk and the right-hand side with cblas_sgemv, solves
	 * the system by Cholesky factorisation (LAPACKE_spotrf / LAPACKE_spotrs),
	 * applies x <- x - delta * deltaX and re-evaluates the residual with func.
	 *
	 * All BLAS/LAPACK calls are the single-precision variants, so T is
	 * presumably float (the template header is outside this excerpt - confirm).
	 *
	 * @param func   fills r (n entries) with the residual at x (m entries).
	 * @param jacf   fills J (m*n entries) with the Jacobian at x; the BLAS calls
	 *               treat J as an m x n column-major matrix with leading
	 *               dimension m.
	 * @param x      in/out: initial guess, overwritten with the refined estimate.
	 * @param r      residual buffer of n entries, or NULL to use a temporary one.
	 * @param J      Jacobian buffer of m*n entries, or NULL to use a temporary one.
	 * @param m      number of unknowns.
	 * @param n      number of residual entries.
	 * @param itmax  iteration limit (at least one iteration always runs).
	 * @param opts   {delta, r_threshold, diff_threshold}, or NULL for
	 *               {1.0, 1e-6, 1e-6}.
	 * @param adata  opaque pointer forwarded to func and jacf.
	 * @return number of iterations performed.
	 */
	int GaussNewton(
		void (*func)(T *x, T *r, int m, int n, void *adata),
		void (*jacf)(T *x, T *J, int m, int n, void *adata),
		T *x, T *r, T* J, int m, int n, int itmax, 
		T *opts,	/* delta,  r_threshold, diff_threshold */
		void *adata)
	{
		PhGUtils::debug("m", m, "n", n);

		// Step size and stopping thresholds are held as float regardless of T.
		float delta, R_THRES, DIFF_THRES;
		if( opts == NULL ) {
			// use default values
			delta = 1.0;	// step size, default to use standard Newton-Raphson
			R_THRES = 1e-6;	DIFF_THRES = 1e-6;
		}
		else {
			delta = opts[0]; R_THRES = opts[1]; DIFF_THRES = opts[2];
		}

		// Track which buffers are owned here so that only those are freed.
		bool allocateR = false, allocateJ = false;
		// residue
		if( r == NULL ) {
			// allocate space for residue
			allocateR = true;
			r = new T[n];
			memset(r, 0, sizeof(T)*n);
		}

		// Copy of x taken at the start of each iteration.
		// NOTE(review): x0 is written below but never read in this function.
		T* x0 = new T[m];
		memset(x0, 0, sizeof(T)*m);

		// Seeded with x so the first evaluation of the loop condition has a
		// defined value to take the norm of.
		T* deltaX = new T[m];	// also for Jtr
		memset(deltaX, 0, sizeof(T)*m);
		cblas_scopy(m, x, 1, deltaX, 1);

		T* JtJ = new T[m * m];
		memset(JtJ, 0, sizeof(T)*m*m);

		// Jacobian
		if( J == NULL ) {
			allocateJ = true;
			J = new T[m * n];
			memset(J, 0, sizeof(T)*m*n);
		}

		// compute initial residue
		func(x, r, m, n, adata);

		//ofstream fout0("r.txt");
		//print2DArray(r, n, 1, fout0);
		//fout0.close();

		int iters = 0;

		//::system("pause");

		//printArray(x, m);
		//printArray(r, n);

		// do iteration
		// Continue while the last step and the residual are both above their
		// thresholds and the iteration limit is not reached; "|| iters < 1"
		// forces the first iteration unconditionally.
		while( (cblas_snrm2(m, deltaX, 1) > DIFF_THRES && cblas_snrm2(n, r, 1) > R_THRES && iters < itmax) || iters < 1 ) {
			// compute Jacobian
			jacf(x, J, m, n, adata);

			// store old value
			cblas_scopy(m, x, 1, x0, 1);

			//ofstream fout1("J.txt");
			//print2DArray(J, n, m, fout1);
			//fout1.close();

			//::system("pause");

			// compute JtJ
			// (upper triangle of J * J^T for the m x n column-major J, m x m result)
			cblas_ssyrk (CblasColMajor, CblasUpper, CblasNoTrans, m, n, 1.0, J, m, 0, JtJ, m);

			//ofstream fout("JtJ.txt");
			//print2DArray(JtJ, m, m, fout);
			//fout.close();

			// compute Jtr
			// (deltaX <- J * r, an m-vector; it becomes the right-hand side below)
			cblas_sgemv (CblasColMajor, CblasNoTrans, m, n, 1.0, J, m, r, 1, 0, deltaX, 1);
			
			// compute deltaX
			// Cholesky factorisation followed by the solve; deltaX is
			// overwritten with the solution.
			// NOTE(review): the LAPACKE return codes are not checked, so a
			// factorisation failure goes unnoticed.
			LAPACKE_spotrf( LAPACK_COL_MAJOR, 'U', m, JtJ, m );
			LAPACKE_spotrs( LAPACK_COL_MAJOR, 'U', m, 1, JtJ, m, deltaX, m );

			//ofstream fout2("deltaX.txt");
			//printArray(deltaX, m, fout2);
			//fout2.close();

			// update x
			cblas_saxpy(m, -delta, deltaX, 1, x, 1);

			// update residue
			func(x, r, m, n, adata);

			//printArray(x, m);
			//system("pause");
			iters++;
		}

		//::system("pause");

		// delete workspace
		delete[] x0;
		delete[] deltaX;
		delete[] JtJ;

		// Caller-provided r / J are left alone.
		if( allocateR ){ delete[] r;}
		if( allocateJ ){ delete[] J;}

		return iters;
	}
Example #11
0
/*
 * Builds the matrix I = Identity - X * Z and copies B into W.
 *
 *   X      D x panelSz matrix, row-major (element (i, j) at X[i*panelSz + j]).
 *   Z      panelSz x D matrix, row-major (element (j, m) at Z[j*D + m]).
 *   B      vector of D floats, copied to W.
 *   lamda  unused; kept so existing callers keep compiling.
 *   W      output vector of D floats (may overlap B).
 *   I      output D x D matrix, row-major, fully overwritten.
 *
 * Compared with the previous version this fixes several defects:
 *   - the loop counters and the accumulator were shared between OpenMP
 *     threads (data race); they are now private / loop-local;
 *   - each thread overwrote XZ with the partial sum of its own chunk instead
 *     of contributing to the full product; the sum now runs over the whole
 *     panel;
 *   - the subtraction loop ran j up to chunkSz*D and therefore wrote past
 *     the end of I (and subtracted some rows more than once);
 *   - memset was given NULL as its fill value;
 *   - the two temporary buffers were used without checking malloc's result;
 *     they are no longer needed.
 * The global thread count `nt` is no longer consulted: rows are distributed
 * by the OpenMP runtime.
 */
void gd(float *X, float* Z, float*B, int panelSz, int D, float lamda, float * W, float *I){
    int i, j, m;

    (void)lamda;

    if (D <= 0) {
        return;
    }

    /* every call re-initialises I to the identity */
    for(i=0;i<D*D;i++){
        I[i] = 0.0f;
    }
    for(i=0;i<D;i++){
        I[i*D+i] = 1.0f;
    }

    /* I <- I - X*Z; each row of I is owned by exactly one iteration of i */
#pragma omp parallel for schedule(static) private(j,m)
    for(i=0;i<D;i++){
        for(m=0;m<D;m++){
            float acc = 0.0f;
            for(j=0;j<panelSz;j++){
                acc += X[i*panelSz+j]*Z[j*D+m];
            }
            I[i*D+m] -= acc;
        }
    }

    /* W <- B (the old code went through a zeroed temporary, which is the
       same thing); memmove keeps this well-defined if W and B overlap */
    memmove(W, B, (size_t)D*sizeof(float));
}
Example #12
0
/*
 * Returns the squared Euclidean norm of (data * W - Y).
 *
 * data is used as an M x D row-major matrix, W as a vector of D entries and
 * Y as a vector of M entries. Ypred receives data * W and Ytmp receives the
 * difference Ypred - Y; both must have room for M floats.
 */
float calErr(float *data, float *Ypred, float *Ytmp, float* Y, float* W, int M, int D){
    /* Ypred <- data * W */
    cblas_sgemv(CblasRowMajor, CblasNoTrans, M, D, 1.0f, data, D, W, 1, 0.0f, Ypred, 1);

    /* Ytmp <- Ypred - Y */
    cblas_scopy(M, Ypred, 1, Ytmp, 1);
    cblas_saxpy(M, -1.0f, Y, 1, Ytmp, 1);

    /* <Ytmp, Ytmp> */
    const float squaredError = cblas_sdot(M, Ytmp, 1, Ytmp, 1);
    return squaredError;
}
Example #13
0
/*
 * Worker-side main routine of the distributed matrix computation.
 *
 * The worker reads its parameters, loads its share of the matrix A (rows of
 * rdA+1 floats each, my_ldA rows actually loaded), takes part in the
 * collective centring and scaling of the first rdA columns, and then serves
 * requests from rank 0 until it receives TAG_DIE:
 *   TAG_AX    resultVector = A  * xvalue, gathered on rank 0
 *   TAG_ATX   resultVector = A' * xvalue, summed on rank 0
 *   TAG_ATAX  resultVector = A' * (A * xvalue), summed on rank 0
 *
 * Every MPI collective here must be matched by the corresponding call on
 * rank 0, so the order of the calls must not be changed in isolation.
 *
 * NOTE(review): dummyFloat is never initialised; it is only passed as the
 * receive/send buffer argument that MPI ignores on non-root ranks.
 * NOTE(review): allocation failures are only reported on stdout; execution
 * continues and will dereference the NULL pointer. shifts and ones are not
 * checked at all.
 */
static void slave(int myrank, char* parameterFile)
{
  int i, j, dummyInt, target_ldA, my_ldA, rdA, interceptFlag, error;
  MPI_Status status;
  float *A, *xvalue, *resultVector, *tempHolder, *dummyFloat;
  char matrixfilename[MAX_FILENAME_SIZE];

  //GET PARAMETERS FROM THE TEXT FILE
  getSlaveParams(parameterFile, &target_ldA, &rdA, &interceptFlag, matrixfilename);

  //ALLOCATE A, TEMPHOLDER, RESULTVECTOR and XVALUE
  A = malloc(target_ldA*(rdA+1)*sizeof(float));
  if(A==NULL)
    fprintf(stdout,"Unable to allocate memory!");

  xvalue = malloc( (target_ldA+rdA)*sizeof(float) );
  tempHolder = malloc( (target_ldA+rdA)*sizeof(float) ); //place holder for intermediate calculations
  resultVector = malloc( (target_ldA+rdA)*sizeof(float) );
  if(xvalue==NULL || tempHolder==NULL || resultVector==NULL)
    fprintf(stdout,"Unable to allocate memory!");


  //FILL A WITH DESIRED VALUES AND SEND NUMBER OF FILLED ROWS TO MASTER
  my_ldA=get_dat_matrix(A, target_ldA, rdA, myrank, matrixfilename, interceptFlag);
  MPI_Gather(&my_ldA, 1, MPI_INT, &dummyInt, 1, MPI_INT, 0, MPI_COMM_WORLD);
  //Rank 0 broadcasts a status flag; a value of 0 makes this worker stop
  MPI_Bcast(&error, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if(error==0) { //if there were file open errors, end program
    free(A);
    free(xvalue);
    free(tempHolder);
    free(resultVector);
    return;
  }
  fprintf(stdout,"Slave %d found %d valid rows: A[0] is %f \n", myrank, my_ldA, A[0] );
  

  //CENTER FEATURES
  float* shifts = malloc((rdA+1)*sizeof(float));
  float* ones = malloc(my_ldA*sizeof(float));
  for(i=0; i<my_ldA; i++)
    ones[i] = 1.0;
  //A' * ones yields the per-column sums of the local rows
  cblas_sgemv(CblasRowMajor, CblasTrans, my_ldA, rdA+1, 1.0, A, rdA+1, 
	      ones, 1, 0.0, shifts, 1); //shifts now holds the sums of the columns of A
  //Only the first rdA columns are reduced and shifted; the last column of
  //each row is left as loaded (presumably the intercept/response column - confirm)
  MPI_Reduce(shifts, dummyFloat, rdA, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
 
  MPI_Bcast(shifts, rdA, MPI_FLOAT, 0, MPI_COMM_WORLD); //shifts now holds the total means of the columns of A
  for(i=0; i<my_ldA; i++) { //Now we substract shifts from each row of A
    cblas_saxpy(rdA, -1.0, shifts, 1, &A[i*(rdA+1)], 1);
  }

  //SCALE FEATURES
  //Local sum of squares of each of the first rdA columns
  float* norms = calloc(rdA, sizeof(float));
  for(i=0; i<my_ldA; i++) {
    for(j=0; j<rdA; j++) {
	norms[j] += pow( A[i*(rdA+1) + j], 2);
    }
  }
  MPI_Reduce(norms, dummyFloat, rdA, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);

  MPI_Bcast(norms, rdA, MPI_FLOAT, 0, MPI_COMM_WORLD); //norms now holds the 2-norms of the total columns of A
  //Columns whose norm is at most 0.0001 are left unscaled to avoid dividing
  //by (nearly) zero; column j is addressed with stride rdA+1
  for(j=0; j<rdA; j++) {
    if(norms[j] > 0.0001)
      cblas_sscal(my_ldA, 1.0 / norms[j], A + j, rdA + 1);
  }


  //COMPUTATION LOOP
  while(1)
    {
      //Zero-length message: only the tag carries information
      MPI_Recv(&dummyInt, 0, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);

      //Check the tag to determine what to do next
      if (status.MPI_TAG == TAG_DIE)
	{
	  break;
	}

      else if (status.MPI_TAG == TAG_AX)
	{
	  //Multiply A * x

	  //Get xvalue
	  MPI_Bcast(xvalue, rdA+1, MPI_FLOAT, 0, MPI_COMM_WORLD);

	  //Multiply: resultVector = A*xvalue
	  cblas_sgemv(CblasRowMajor, CblasNoTrans, my_ldA, rdA+1, 1.0, A, rdA+1, 
		      xvalue, 1, 0.0, resultVector, 1);

	  //Gather the my_ldA local results on rank 0
	  MPI_Gatherv(resultVector, my_ldA, MPI_FLOAT, dummyFloat, &dummyInt, &dummyInt, MPI_FLOAT, 0, MPI_COMM_WORLD);

	  
	}

      else if (status.MPI_TAG == TAG_ATX)
	{
	  //Multiply A^t * x
	  
	  //Get xvalue (this worker receives my_ldA entries)
	  MPI_Scatterv(dummyFloat, &dummyInt, &dummyInt, MPI_FLOAT, 
		       xvalue, my_ldA, MPI_FLOAT, 0, MPI_COMM_WORLD);

	  //Multiply: resultVector = A'*xvalue
	  cblas_sgemv(CblasRowMajor, CblasTrans, my_ldA, rdA+1, 1.0, A, rdA+1, 
		      xvalue, 1, 0.0, resultVector, 1);

	  //Sum resultVectors to get final result
	  MPI_Reduce(resultVector, dummyFloat, rdA+1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);


	}
      else if (status.MPI_TAG == TAG_ATAX)
	{
	  //Multiply A^t * A * x

	  //Get xvalue
	  MPI_Bcast(xvalue, rdA+1, MPI_FLOAT, 0, MPI_COMM_WORLD);

	  //Multiply: tempHolder = A*xvalue
	  cblas_sgemv(CblasRowMajor, CblasNoTrans, my_ldA, rdA+1, 1.0, A, rdA+1, 
		      xvalue, 1, 0.0, tempHolder, 1);
	  //Multiply: resultVector = A^t * tempHolder
	  cblas_sgemv(CblasRowMajor, CblasTrans, my_ldA, rdA+1, 1.0, A, rdA+1,
		      tempHolder, 1, 0.0, resultVector, 1);

	  //Gather and sum results
	  MPI_Reduce(resultVector, dummyFloat, rdA+1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
	  
	}

    }

  free(A); free(xvalue); free(tempHolder); free(resultVector); free(shifts); free(ones); free(norms);
  return;
}
Example #14
0
// B <- B - A
// (cblas_saxpy with alpha = -1 adds -A to B, so the result is B minus A, not
// A minus B.)
// The matrices are processed as n contiguous runs of m floats each; run i of
// A starts at A + i*lda and run i of B at B + i*ldb. A is only read.
void StrassenSingleProblem::matrix_subtract_inplace(int m, int n, float *A, int lda, float *B, int ldb) {
    for (int i = 0; i < n; i++) {
        cblas_saxpy(m, -1, A + i*lda, 1, B + i*ldb, 1);
    }
}
Example #15
0
// Computes y <- alpha * x + beta * y on the CPU in two BLAS calls: y is
// scaled by beta over y->width elements, then alpha * x is added over
// x.width elements. Both vectors are accessed with unit stride.
// NOTE(review): the two calls use different lengths (y->width vs x.width);
// the caller presumably guarantees they are equal - confirm.
void CpuDevice<float>::Axpby(float alpha, const Values<float> &x, float beta, Values<float> *y) {
    // TODO(robertsdionne): Figure out why clang thinks cblas_saxpby is an undefined symbol.
    cblas_sscal(y->width, beta, y->values.data(), 1);
    cblas_saxpy(x.width, alpha, x.values.data(), 1, y->values.data(), 1);
}
Example #16
0
  void LLC::Encode_with_max_pooling(const float* const data, const uint32_t dim,
                                    const uint32_t num_frame,
                                    float* const code) const
  {
    if (data == NULL || dim != dim_ || num_frame <= 0)
    {
      cerr << "ERROR in input data" << endl;
      exit(-1);
    }

    if (!has_setup_)
    {
      cerr << "ERROR: Must call SetUp() before." << endl;
      exit(-1);
    }

    vl_uint32* index = (vl_uint32*) vl_malloc(
        sizeof(vl_uint32) * num_knn_ * num_frame);
    memset(index, 0, num_knn_ * num_frame);
    float* dist(NULL);

    vl_kdforest_query_with_array(kdforest_model_, index, num_knn_, num_frame,
                                 dist, data);

    // start to encode
    const uint32_t len_code = num_base_;
    memset(code, 0, sizeof(float) * len_code);

    const uint32_t len_z = dim_ * num_knn_;
    const uint32_t len_C = num_knn_ * num_knn_;
    const uint32_t len_b = num_knn_;
    float* z = (float*) malloc(sizeof(float) * len_z);
    float* C = (float*) malloc(sizeof(float) * len_C);
    float* b = (float*) malloc(sizeof(float) * len_b);
    memset(z, 0, sizeof(float) * len_z);
    memset(C, 0, sizeof(float) * len_C);
    memset(b, 0, sizeof(float) * len_b);

    double sum(0);
    const float* base = base_.get();

    for (uint32_t i = 0; i < num_frame; i++)
    {

      uint32_t tmp_ind;

      // z = B_i - 1 * x_i'
      for (uint32_t n = 0; n < num_knn_; n++)
      {
        tmp_ind = (uint32_t) index[i * num_knn_ + n];
        memcpy(z + n * dim_, base + tmp_ind * dim_, sizeof(float) * dim_);

        cblas_saxpy(dim_, -1.0f, data + i * dim_, 1, z + n * dim_, 1);
      }

      // C = z * z', i.e. covariance matrix
      for (uint32_t m = 0; m < num_knn_; ++m)
        for (uint32_t n = m; n < num_knn_; ++n)
        {
          float sum = cblas_sdot(dim_, z + m * dim_, 1, z + n * dim_, 1);
          C[m * num_knn_ + n] = sum;
          C[n * num_knn_ + m] = sum;
        }

      sum = 0;
      for (uint32_t m = 0; m < num_knn_; m++)
        sum += C[m * num_knn_ + m];
      sum = sum * beta_;
      for (uint32_t m = 0; m < num_knn_; m++)
        C[m * num_knn_ + m] += sum;

      for (uint32_t m = 0; m < num_knn_; m++)
        b[m] = 1;

      // solve
      {
        char upper_triangle = 'U';
        int INFO;
        int int_one = 1;
        const int num_knn = (int) num_knn_;
        sposv(&upper_triangle, &num_knn, &int_one, C, &num_knn, b, &num_knn,
              &INFO);
      }

      sum = 0;

      for (uint32_t m = 0; m < num_knn_; m++)
        sum += b[m];
      cblas_sscal(num_knn_, 1.0 / sum, b, 1);

      for (uint32_t m = 0; m < num_knn_; m++)
      {
        tmp_ind = (uint32_t) index[i * num_knn_ + m];

        if (code[tmp_ind] < b[m])
          code[tmp_ind] = b[m];
      }
    }

    free(index);
    free(z);
    free(C);
    free(b);
  }
Example #17
0
/*
 * Updates the pair of tiles (A1, A2) using the matrices V and T and the
 * workspace WORK. In every supported case the same three steps are carried
 * out (see the per-branch comments for the exact products):
 *   1. WORK <- A1 (+ the product of V and A2),
 *   2. WORK is multiplied by T (cblas_strmm, upper triangular, op = trans)
 *      and the product of V and WORK is subtracted from A2,
 *   3. WORK is subtracted from A1.
 *
 * Arguments, as used by the code below:
 *   side    PlasmaLeft selects the "left" variants; any other value takes
 *           the "right" branch.
 *   trans   forwarded to cblas_strmm as the transposition applied to T.
 *   direct  only PlasmaForward is implemented.
 *   storev  PlasmaColumnwise selects the columnwise variants; any other
 *           value takes the rowwise branch.
 *   M1, N1  must be >= 0.
 *   M2, N2  must be >= 0; M2 == M1 is required for side == PlasmaRight and
 *           N2 == N1 for side == PlasmaLeft.
 *   K       must be >= 0.
 *   A1/LDA1, A2/LDA2, V/LDV, T/LDT, WORK/LDWORK
 *           column-major matrices with their leading dimensions. WORK holds
 *           a K x N1 block on the left and an M1 x K block on the right.
 *
 * Returns PLASMA_SUCCESS on success (including the quick return when any
 * dimension is zero), -5 .. -9 when M1, N1, M2, N2 or K respectively is
 * illegal, and PLASMA_ERR_NOT_SUPPORTED for direct != PlasmaForward.
 */
int CORE_stsrfb(int side, int trans, int direct, int storev,
                int M1, int N1, int M2, int N2, int K,
                float *A1, int LDA1,
                float *A2, int LDA2,
                float *V, int LDV,
                float *T, int LDT,
                float *WORK, int LDWORK)
{
    static float zone  =  1.0;
    static float mzone = -1.0;

    int j;

    /* Check input arguments */
    if (M1 < 0) {
        coreblas_error(5, "Illegal value of M1");
        return -5;
    }
    if (N1 < 0) {
        coreblas_error(6, "Illegal value of N1");
        return -6;
    }
    if ( (M2 < 0) || 
         ( (M2 != M1) && (side == PlasmaRight) ) ){
        coreblas_error(7, "Illegal value of M2");
        return -7;
    }
    if ( (N2 < 0) || 
         ( (N2 != N1) && (side == PlasmaLeft) ) ){
        coreblas_error(8, "Illegal value of N2");
        return -8;
    }
    if (K < 0) {
        coreblas_error(9, "Illegal value of K");
        return -9;
    }

    /* Quick return */
    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
        return PLASMA_SUCCESS;

    if (storev == PlasmaColumnwise) {
        if (direct == PlasmaForward) {
            /*
             * Columnwise / Forward / Left
             */
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V' * A2
                 * (B is held in WORK: copy the K x N1 block of A1, then
                 * accumulate V' * A2 into it)
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N1,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasTrans, CblasNoTrans,
                    K, N2, M2,
                    (zone), V, LDV,
                    A2, LDA2,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - V*T*B -> B = T*B, A2 = A2 - V*B
                 */
                cblas_strmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), V, LDV,
                    WORK, LDWORK,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 * (one column of K entries at a time)
                 */
                for(j = 0; j < N1; j++) {
                    cblas_saxpy(
                        K, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
            /*
             * Columnwise / Forward / Right
             */
            else {
                /*
                 * B = A1 + A2 * V
                 * (B is held in WORK: copy the M1 x K block of A1, then
                 * accumulate A2 * V into it)
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M1, K,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, K, N2,
                    (zone), A2, LDA2,
                    V, LDV,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - B*T*V' -> B = B*T, A2 = A2 - B*V'
                 */
                cblas_strmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasTrans,
                    M2, N2, K,
                    (mzone), WORK, LDWORK,
                    V, LDV,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 * (one column of M1 entries at a time)
                 */
                for(j = 0; j < K; j++) {
                    cblas_saxpy(
                        M1, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (ColMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    else {
        if (direct == PlasmaForward) {
            /*
             * Rowwise / Forward / Left
             */
            if (side == PlasmaLeft) {
                /*
                 * B = A1 + V * A2
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    K, N1,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    K, N2, M2,
                    (zone), V, LDV,
                    A2, LDA2,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - V'*T*B -> B = T*B, A2 = A2 - V'*B
                 */
                cblas_strmm(
                    CblasColMajor, CblasLeft, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, K, N2,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), V, LDV,
                    WORK, LDWORK,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 */
                for(j=0; j<N1; j++) {
                    cblas_saxpy(
                        K, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
            /*
             * Rowwise / Forward / Right
             */
            else {
                /*
                 * B = A1 + A2 * V'
                 */
                LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M1, K,
                    A1, LDA1, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasTrans,
                    M2, K, N2,
                    (zone), A2, LDA2,
                    V, LDV,
                    (zone), WORK, LDWORK);
                /*
                 * A2 = A2 - B*T*V -> B = B*T, A2 = A2 - B*V
                 * (V is applied untransposed here, matching the NoTrans
                 * sgemm below)
                 */
                cblas_strmm(
                    CblasColMajor, CblasRight, CblasUpper,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M1, K,
                    (zone), T, LDT, WORK, LDWORK);

                cblas_sgemm(
                    CblasColMajor, CblasNoTrans, CblasNoTrans,
                    M2, N2, K,
                    (mzone), WORK, LDWORK,
                    V, LDV,
                    (zone), A2, LDA2);
                /*
                 * A1 = A1 - B
                 */
                for(j = 0; j < K; j++) {
                    cblas_saxpy(
                        M1, (mzone),
                        &WORK[LDWORK*j], 1,
                        &A1[LDA1*j], 1);
                }
            }
        }
        else {
            coreblas_error(3, "Not implemented (RowMajor / Backward / Left or Right)");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }
    return PLASMA_SUCCESS;
}
Example #18
0
/*
 * JNI entry point: applies every length-n sequence of the given transforms
 * to the origin and returns the resulting points.
 *
 * Each element of jts is a Java `Transform` with float fields a..f. The
 * fields a..d are handed to cblas_srotm as a full 2x2 matrix (flag -1), and
 * e / f are added to the x / y coordinates afterwards with a zero-stride
 * cblas_saxpy. Starting from the single point (0, 0), each of the n rounds
 * copies the current block of points once per transform and applies
 * transform j to copy j, so the arrays end up holding ipow(m, n) points,
 * where m is the number of transforms.
 *
 * Returns a new Java `Points` object built from the two float arrays.
 */
JNIEXPORT jobject JNICALL Java_MKL_apply(JNIEnv *env, jclass cl, jint n, jobjectArray jts) {
	static const char *const fieldNames[6] = {"a", "b", "c", "d", "e", "f"};

	int count = (*env)->GetArrayLength(env, jts);
	float **params = (float **)malloc(count * sizeof (float *));

	/* Resolve the six float fields of Transform once, in order a..f. */
	jclass classTransform = (*env)->FindClass(env, "Transform");
	jfieldID fieldIds[6];
	int f;
	for (f = 0; f < 6; ++f) {
		fieldIds[f] = (*env)->GetFieldID(env, classTransform, fieldNames[f], "F");
	}

	/* One 7-float record per transform: srotm flag followed by a..f. */
	int t;
	for (t = 0; t < count; ++t) {
		jobject transform = (*env)->GetObjectArrayElement(env, jts, t);
		params[t] = (float *)malloc(7 * sizeof (float));
		params[t][0] = -1;
		for (f = 0; f < 6; ++f) {
			params[t][f + 1] = (*env)->GetFloatField(env, transform, fieldIds[f]);
		}
	}

	int total = ipow(count, n);
	jfloatArray jxs = (*env)->NewFloatArray(env, total);
	jfloatArray jys = (*env)->NewFloatArray(env, total);

	jfloat *xs = (*env)->GetFloatArrayElements(env, jxs, NULL);
	jfloat *ys = (*env)->GetFloatArrayElements(env, jys, NULL);
	/* Seed point. */
	xs[0] = 0;
	ys[0] = 0;

	int level;
	for (level = 0; level < n; ++level) {
		/* Number of points produced so far. */
		int width = ipow(count, level);
		int j;

		/* Replicate the current block into slots 1..count-1. */
		for (j = 1; j < count; ++j) {
			cblas_scopy(width, xs, 1, xs + width * j, 1);
			cblas_scopy(width, ys, 1, ys + width * j, 1);
		}

		/* Apply transform j to slot j: linear part, then translation. */
		for (j = 0; j < count; ++j) {
			cblas_srotm(width, xs + width * j, 1, ys + width * j, 1, params[j]);
			cblas_saxpy(width, 1, &(params[j][5]), 0, xs + width * j, 1);
			cblas_saxpy(width, 1, &(params[j][6]), 0, ys + width * j, 1);
		}
	}

	for (t = 0; t < count; ++t) {
		free(params[t]);
	}
	free(params);

	/* Copy the native data back into the Java arrays. */
	(*env)->ReleaseFloatArrayElements(env, jxs, xs, 0);
	(*env)->ReleaseFloatArrayElements(env, jys, ys, 0);

	jclass classPoints = (*env)->FindClass(env, "Points");
	jmethodID idConstructorPoints = (*env)->GetMethodID(env, classPoints, "<init>", "([F[F)V");
	return (*env)->NewObject(env, classPoints, idConstructorPoints, jxs, jys);
}
/*
 * Thin wrapper around cblas_saxpy: Y <- alpha * X + Y over n elements, with
 * strides incX and incY. All arguments are forwarded unchanged.
 */
void STARPU_SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
{
	cblas_saxpy(n, alpha, X, incX, Y, incY);
}
Example #20
0
/*
 * BLAS-accelerated matrix product of two Python objects.
 *
 * `args` is parsed as two objects ("OO"). Both are converted to arrays of a
 * common type; if that type is not float, double, complex float or complex
 * double, or if either array has more than two dimensions, the work is
 * delegated to PyArray_MatrixProduct().
 *
 * The part of the function visible here handles:
 *   - shape/stride normalisation and output-dimension computation,
 *   - allocation of the (zero-filled) result array,
 *   - the "one operand is a scalar" case, implemented with level-1 BLAS
 *     ?axpy calls.
 *
 * NOTE(review): this listing is truncated -- the remaining (non-scalar)
 * branches, the final return and the `fail:` label targeted by the gotos
 * below are not visible here.
 */
static PyObject *
dotblas_matrixproduct(PyObject *dummy, PyObject *args)
{
    PyObject *op1, *op2;
    PyArrayObject *ap1=NULL, *ap2=NULL, *ret=NULL;
    int j, l, lda, ldb, ldc;
    int typenum, nd;
    intp ap1stride=0;
    intp dimensions[MAX_DIMS];
    intp numbytes;
    static const float oneF[2] = {1.0, 0.0};
    static const float zeroF[2] = {0.0, 0.0};
    static const double oneD[2] = {1.0, 0.0};
    static const double zeroD[2] = {0.0, 0.0};
    double prior1, prior2;
    PyTypeObject *subtype;
    PyArray_Descr *dtype;
    MatrixShape ap1shape, ap2shape;

    if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;

    /*
     * "Matrix product" using the BLAS.
     * Only works for float double and complex types.
     */

    /* Determine a common type number covering both operands. */
    typenum = PyArray_ObjectType(op1, 0);
    typenum = PyArray_ObjectType(op2, typenum);

    /* This function doesn't handle other types */
    if ((typenum != PyArray_DOUBLE && typenum != PyArray_CDOUBLE &&
            typenum != PyArray_FLOAT && typenum != PyArray_CFLOAT)) {
        return PyArray_Return((PyArrayObject *)PyArray_MatrixProduct(op1, op2));
    }

    /*
     * NOTE(review): the same dtype is handed to PyArray_FromAny() twice. The
     * Py_INCREF below presumably compensates for PyArray_FromAny() consuming
     * a reference, but it happens only after the first call has already
     * returned -- confirm the ordering is safe for this descriptor.
     */
    dtype = PyArray_DescrFromType(typenum);
    ap1 = (PyArrayObject *)PyArray_FromAny(op1, dtype, 0, 0, ALIGNED, NULL);
    if (ap1 == NULL) return NULL;
    Py_INCREF(dtype);
    ap2 = (PyArrayObject *)PyArray_FromAny(op2, dtype, 0, 0, ALIGNED, NULL);
    if (ap2 == NULL) goto fail;


    if ((ap1->nd > 2) || (ap2->nd > 2)) {
        /* This function doesn't handle dimensions greater than 2
           (or negative striding)  -- other
           than to ensure the dot function is altered
        */
        if (!altered) {
            /* need to alter dot product */
            /* NOTE(review): tmp1/tmp2 are DECREF'd without NULL checks; if
               either call can fail this dereferences NULL -- confirm. */
            PyObject *tmp1, *tmp2;
            tmp1 = PyTuple_New(0);
            tmp2 = dotblas_alterdot(NULL, tmp1);
            Py_DECREF(tmp1);
            Py_DECREF(tmp2);
        }
        ret = (PyArrayObject *)PyArray_MatrixProduct((PyObject *)ap1,
                (PyObject *)ap2);
        Py_DECREF(ap1);
        Py_DECREF(ap2);
        return PyArray_Return(ret);
    }

    /* Replace operands whose strides are flagged by _bad_strides() with
       fresh copies; the old array reference is released either way. */
    if (_bad_strides(ap1)) {
        op1 = PyArray_NewCopy(ap1, PyArray_ANYORDER);
        Py_DECREF(ap1);
        ap1 = (PyArrayObject *)op1;
        if (ap1 == NULL) goto fail;
    }
    if (_bad_strides(ap2)) {
        op2 = PyArray_NewCopy(ap2, PyArray_ANYORDER);
        Py_DECREF(ap2);
        ap2 = (PyArrayObject *)op2;
        if (ap2 == NULL) goto fail;
    }
    ap1shape = _select_matrix_shape(ap1);
    ap2shape = _select_matrix_shape(ap2);

    if (ap1shape == _scalar || ap2shape == _scalar) {
        /* oap1/oap2 keep the original operand order for the dimension
           checks below, because ap1/ap2 may be swapped next. */
        PyArrayObject *oap1, *oap2;
        oap1 = ap1;
        oap2 = ap2;
        /* One of ap1 or ap2 is a scalar */
        if (ap1shape == _scalar) { 		/* Make ap2 the scalar */
            PyArrayObject *t = ap1;
            ap1 = ap2;
            ap2 = t;
            ap1shape = ap2shape;
            ap2shape = _scalar;
        }

        /* Byte stride used later to walk the non-scalar operand with axpy;
           stays 0 when ap1 is 0-dimensional. */
        if (ap1shape == _row) ap1stride = ap1->strides[1];
        else if (ap1->nd > 0) ap1stride = ap1->strides[0];

        if (ap1->nd == 0 || ap2->nd == 0) {
            /* A true 0-d operand: the result has the shape of the other
               operand and l is its total element count. */
            intp *thisdims;
            if (ap1->nd == 0) {
                nd = ap2->nd;
                thisdims = ap2->dimensions;
            }
            else {
                nd = ap1->nd;
                thisdims = ap1->dimensions;
            }
            l = 1;
            for (j=0; j<nd; j++) {
                dimensions[j] = thisdims[j];
                l *= dimensions[j];
            }
        }
        else {
            /* Neither operand is 0-d: check that the inner dimensions of
               the original operands agree. */
            l = oap1->dimensions[oap1->nd-1];

            if (oap2->dimensions[0] != l) {
                PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
                goto fail;
            }
            nd = ap1->nd + ap2->nd - 2;
            /* nd = 0 or 1 or 2 */
            /* If nd == 0 do nothing ... */
            if (nd == 1) {
                /* Either ap1->nd is 1 dim or ap2->nd is 1 dim
                   and the other is 2-dim */
                dimensions[0] = (oap1->nd == 2) ? oap1->dimensions[0] : oap2->dimensions[1];
                l = dimensions[0];
                /* Fix it so that dot(shape=(N,1), shape=(1,))
                   and dot(shape=(1,), shape=(1,N)) both return
                   an (N,) array (but use the fast scalar code)
                */
            }
            else if (nd == 2) {
                dimensions[0] = oap1->dimensions[0];
                dimensions[1] = oap2->dimensions[1];
                /* We need to make sure that dot(shape=(1,1), shape=(1,N))
                   and dot(shape=(N,1),shape=(1,1)) uses
                   scalar multiplication appropriately
                */
                if (ap1shape == _row) l = dimensions[1];
                else l = dimensions[0];
            }
        }
    }
    else { /* (ap1->nd <= 2 && ap2->nd <= 2) */
        /*  Both ap1 and ap2 are vectors or matrices */
        l = ap1->dimensions[ap1->nd-1];

        if (ap2->dimensions[0] != l) {
            PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
            goto fail;
        }
        nd = ap1->nd+ap2->nd-2;

        if (nd == 1)
            dimensions[0] = (ap1->nd == 2) ? ap1->dimensions[0] : ap2->dimensions[1];
        else if (nd == 2) {
            dimensions[0] = ap1->dimensions[0];
            dimensions[1] = ap2->dimensions[1];
        }
    }

    /* Choose which subtype to return */
    if (ap1->ob_type != ap2->ob_type) {
        prior2 = PyArray_GetPriority((PyObject *)ap2, 0.0);
        prior1 = PyArray_GetPriority((PyObject *)ap1, 0.0);
        subtype = (prior2 > prior1 ? ap2->ob_type : ap1->ob_type);
    }
    else {
        prior1 = prior2 = 0.0;
        subtype = ap1->ob_type;
    }

    ret = (PyArrayObject *)PyArray_New(subtype, nd, dimensions,
                                       typenum, NULL, NULL, 0, 0,
                                       (PyObject *)
                                       (prior2 > prior1 ? ap2 : ap1));

    if (ret == NULL) goto fail;
    /* The result is zero-filled first: the axpy calls below accumulate into
       it, and an empty product is returned as all zeros. */
    numbytes = PyArray_NBYTES(ret);
    memset(ret->data, 0, numbytes);
    if (numbytes==0 || l == 0) {
        Py_DECREF(ap1);
        Py_DECREF(ap2);
        return PyArray_Return(ret);
    }


    if (ap2shape == _scalar) {
        /* Multiplication by a scalar -- Level 1 BLAS */
        /* if ap1shape is a matrix and we are not contiguous, then we can't
           just blast through the entire array using a single
           striding factor */
        /*
         * Each type branch below has the same three cases:
         *   l == 1            : a single element, multiplied directly;
         *   ap1 not a matrix  : one axpy over l elements using ap1stride;
         *   ap1 is a matrix   : one axpy per slice along the shorter axis
         *                       (oind), each running along the longer axis
         *                       (maxind) with that axis' element stride.
         */
        NPY_BEGIN_ALLOW_THREADS

        if (typenum == PyArray_DOUBLE) {
            if (l == 1) {
                *((double *)ret->data) = *((double *)ap2->data) * \
                                         *((double *)ap1->data);
            }
            else if (ap1shape != _matrix) {
                cblas_daxpy(l, *((double *)ap2->data), (double *)ap1->data,
                            ap1stride/sizeof(double), (double *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                double val;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                val = *((double *)ap2->data);
                /* Byte strides converted to element strides for BLAS. */
                a1s = ap1->strides[maxind] / sizeof(double);
                rets = ret->strides[maxind] / sizeof(double);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_daxpy(l, val, (double *)ptr, a1s,
                                (double *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        else if (typenum == PyArray_CDOUBLE) {
            if (l == 1) {
                /* Complex multiply written out by hand. */
                cdouble *ptr1, *ptr2, *res;
                ptr1 = (cdouble *)ap2->data;
                ptr2 = (cdouble *)ap1->data;
                res = (cdouble *)ret->data;
                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
            }
            else if (ap1shape != _matrix) {
                /* The complex scalar is passed by pointer, not by value. */
                cblas_zaxpy(l, (double *)ap2->data, (double *)ap1->data,
                            ap1stride/sizeof(cdouble), (double *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                double *pval;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                pval = (double *)ap2->data;
                a1s = ap1->strides[maxind] / sizeof(cdouble);
                rets = ret->strides[maxind] / sizeof(cdouble);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_zaxpy(l, pval, (double *)ptr, a1s,
                                (double *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        else if (typenum == PyArray_FLOAT) {
            if (l == 1) {
                *((float *)ret->data) = *((float *)ap2->data) * \
                                        *((float *)ap1->data);
            }
            else if (ap1shape != _matrix) {
                cblas_saxpy(l, *((float *)ap2->data), (float *)ap1->data,
                            ap1stride/sizeof(float), (float *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                float val;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                val = *((float *)ap2->data);
                a1s = ap1->strides[maxind] / sizeof(float);
                rets = ret->strides[maxind] / sizeof(float);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_saxpy(l, val, (float *)ptr, a1s,
                                (float *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        else if (typenum == PyArray_CFLOAT) {
            if (l == 1) {
                /* Complex multiply written out by hand. */
                cfloat *ptr1, *ptr2, *res;
                ptr1 = (cfloat *)ap2->data;
                ptr2 = (cfloat *)ap1->data;
                res = (cfloat *)ret->data;
                res->real = ptr1->real * ptr2->real - ptr1->imag * ptr2->imag;
                res->imag = ptr1->real * ptr2->imag + ptr1->imag * ptr2->real;
            }
            else if (ap1shape != _matrix) {
                /* The complex scalar is passed by pointer, not by value. */
                cblas_caxpy(l, (float *)ap2->data, (float *)ap1->data,
                            ap1stride/sizeof(cfloat), (float *)ret->data, 1);
            }
            else {
                int maxind, oind, i, a1s, rets;
                char *ptr, *rptr;
                float *pval;
                maxind = (ap1->dimensions[0] >= ap1->dimensions[1] ? 0 : 1);
                oind = 1-maxind;
                ptr = ap1->data;
                rptr = ret->data;
                l = ap1->dimensions[maxind];
                pval = (float *)ap2->data;
                a1s = ap1->strides[maxind] / sizeof(cfloat);
                rets = ret->strides[maxind] / sizeof(cfloat);
                for (i=0; i < ap1->dimensions[oind]; i++) {
                    cblas_caxpy(l, pval, (float *)ptr, a1s,
                                (float *)rptr, rets);
                    ptr += ap1->strides[oind];
                    rptr += ret->strides[oind];
                }
            }
        }
        NPY_END_ALLOW_THREADS
    }
Example #21
0
/*
 * Validate a PLASMA SGEMM result against a reference SGEMM computed by CBLAS.
 *
 * transA, transB, M, N, K, alpha, A/LDA, B/LDB, beta
 *            : the GEMM operands, forwarded unchanged to cblas_sgemm()
 *              (column-major).
 * Cref       : on entry the initial C matrix. It is overwritten, first with
 *              the reference product and then with (reference - Cplasma).
 * Cplasma    : the result to validate; only read here.
 * LDC        : leading dimension shared by Cref and Cplasma.
 *
 * The infinity norms of A, B, the initial C, the PLASMA result, the reference
 * result and the difference are printed. The check passes when
 *     ||diff|| / ((||A|| + ||B|| + ||Cinit||) * N * eps) <= 10
 * and neither the residual norm nor the scaled value is NaN or infinite.
 *
 * Returns 0 when the solution is accepted, 1 when it is suspicious or when
 * the LAPACKE workspace cannot be allocated.
 */
static int check_solution(PLASMA_enum transA, PLASMA_enum transB, int M, int N, int K,
                          float alpha, float *A, int LDA,
                          float *B, int LDB,
                          float beta, float *Cref, float *Cplasma, int LDC)
{
    int info_solution;
    float Anorm, Bnorm, Cinitnorm, Cplasmanorm, Clapacknorm, Rnorm, result;
    float eps;
    float beta_const;
    int Am, An, Bm, Bn;

    /* Scratch buffer handed to every LAPACKE_slange_work() call below. */
    float *work = (float *)malloc(max(K,max(M, N))* sizeof(float));

    /* Never hand a NULL workspace to LAPACKE; report the check as failed. */
    if (work == NULL) {
        printf("Out of Memory \n ");
        return 1;
    }

    beta_const  = -1.0;

    /* Stored dimensions of A and B depend on whether they are transposed. */
    if (transA == PlasmaNoTrans) {
        Am = M; An = K;
    } else {
        Am = K; An = M;
    }
    if (transB == PlasmaNoTrans) {
        Bm = K; Bn = N;
    } else {
        Bm = N; Bn = K;
    }

    Anorm       = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), Am, An, A,       LDA, work);
    Bnorm       = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), Bm, Bn, B,       LDB, work);
    Cinitnorm   = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M,  N,  Cref,    LDC, work);
    Cplasmanorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M,  N,  Cplasma, LDC, work);

    /* Reference product, computed in place in Cref. */
    cblas_sgemm(CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB, M, N, K, 
                (alpha), A, LDA, B, LDB, (beta), Cref, LDC);

    Clapacknorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Cref, LDC, work);

    /* Cref := Cref - Cplasma over the whole LDC x N storage. */
    cblas_saxpy(LDC * N, (beta_const), Cplasma, 1, Cref, 1);

    Rnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Cref, LDC, work);

    eps = LAPACKE_slamch_work('e');

    printf("Rnorm %e, Anorm %e, Bnorm %e, Cinitnorm %e, Cplasmanorm %e, Clapacknorm %e\n", 
           Rnorm, Anorm, Bnorm, Cinitnorm, Cplasmanorm, Clapacknorm);

    result = Rnorm / ((Anorm + Bnorm + Cinitnorm) * N * eps);
    printf("============\n");
    printf("Checking the norm of the difference against reference SGEMM \n");
    printf("-- ||Cplasma - Clapack||_oo/((||A||_oo+||B||_oo+||C||_oo).N.eps) = %e \n", 
           result);

    if (  isnan(Rnorm) || isinf(Rnorm) || isnan(result) || isinf(result) || (result > 10.0) ) {
         printf("-- The solution is suspicious ! \n");
         info_solution = 1;
    }
    else {
         printf("-- The solution is CORRECT ! \n");
         info_solution= 0 ;
    }

    free(work);

    return info_solution;
}
Example #22
0
// float specialization of caffe_axpy with explicit strides.
// Forwards to cblas_saxpy, passing ldx and ldy in the stride positions for
// X and Y respectively (BLAS SAXPY, conventionally Y := alpha * X + Y over N
// elements).
void caffe_axpy<float>(const int N, const float alpha, const float* X, float* Y,
                       const int ldx, const int ldy) {
  cblas_saxpy(N, alpha, X, ldx, Y, ldy);
}
Example #23
0
//
// Overloaded function for dispatching to
// * CBLAS backend, and
// * float value-type.
//
// Forwards n, a, x, incx, y and incy unchanged to cblas_saxpy (BLAS SAXPY,
// conventionally y := a * x + y over n elements with strides incx / incy).
//
inline void axpy( const int n, const float a, const float* x, const int incx,
        float* y, const int incy ) {
    cblas_saxpy( n, a, x, incx, y, incy );
}
Example #24
0
// float specialization: for each of the M consecutive N-element rows of X,
// issues one unit-stride cblas_saxpy call with vector b and the per-row
// coefficient alpha * a[row].
void caffe_cpu_xpasv<float>(const int M, const int N, const float alpha,
    float* X, const float* a, const float* b) {
  float* row_begin = X;
  for (int row = 0; row < M; ++row, row_begin += N) {
    const float row_scale = alpha * a[row];
    cblas_saxpy(N, row_scale, b, 1, row_begin, 1);
  }
}
Example #25
0
// Unit-stride convenience wrapper: forwards to cblas_saxpy with both strides
// fixed to 1, so `a` is the scaled input vector and `b` the vector updated
// in place (BLAS SAXPY, conventionally b := alpha * a + b).
// NOTE(review): n1 is a size_t but the other wrappers in this listing pass a
// plain int length to cblas_saxpy -- a very large n1 would presumably be
// narrowed; confirm against the CBLAS prototype in use.
inline void blas_axpy(size_t n1, float alpha, float* a, float* b) {
    cblas_saxpy(n1, alpha, a, 1, b, 1);
}
Example #26
0
/*
 * Test driver for PLASMA_strsm.
 *
 * Expects exactly five arguments: argv[0] = alpha, argv[1] = M, argv[2] = N,
 * argv[3] = LDA, argv[4] = LDB. A random A (with its diagonal shifted by 2.0)
 * and a random B are generated once; then, for every combination of
 * side / uplo / trans / diag, PLASMA_strsm is run on a fresh copy of B and
 * the result is compared with the reference by check_solution(), printing
 * PASSED or FAILED for each combination.
 *
 * Returns -1 on a wrong argument count (after printing the usage text),
 * -2 if a buffer cannot be allocated, and 0 otherwise -- including when
 * individual combinations fail; those are only reported on stdout.
 */
int testing_strsm(int argc, char **argv)
{
    /* Check for number of arguments*/
    if ( argc != 5 ) {
        USAGE("TRSM", "alpha M N LDA LDB",
              "   - alpha  : alpha coefficient\n"
              "   - M      : number of rows of matrices B\n"
              "   - N      : number of columns of matrices B\n"
              "   - LDA    : leading dimension of matrix A\n"
              "   - LDB    : leading dimension of matrix B\n");
        return -1;
    }

    /* alpha is a real coefficient: parse it as floating point so that a
       fractional value such as "0.5" is not truncated to 0. */
    float alpha = (float) atof(argv[0]);
    int M     = atoi(argv[1]);
    int N     = atoi(argv[2]);
    int LDA   = atoi(argv[3]);
    int LDB   = atoi(argv[4]);

    float eps;
    int info_solution;
    int s, u, t, d, i;
    /* Buffers are sized for max(M,N) columns so that A can be used on
       either side of the solve. */
    int LDAxM = LDA*max(M,N);
    int LDBxN = LDB*max(M,N);

    float *A      = (float *)malloc(LDAxM*sizeof(float));
    float *B      = (float *)malloc(LDBxN*sizeof(float));
    float *Binit  = (float *)malloc(LDBxN*sizeof(float));
    float *Bfinal = (float *)malloc(LDBxN*sizeof(float));

    /* Check if unable to allocate memory */
    if ( (!A) || (!B) || (!Binit) || (!Bfinal)){
        printf("Out of Memory \n ");
        /* Release whichever buffers did get allocated; free(NULL) is a no-op. */
        free(A); free(B);
        free(Binit); free(Bfinal);
        return -2;
    }

    eps = LAPACKE_slamch_work('e');

    printf("\n");
    printf("------ TESTS FOR PLASMA STRSM ROUTINE -------  \n");
    printf("            Size of the Matrix B : %d by %d\n", M, N);
    printf("\n");
    printf(" The matrix A is randomly generated for each test.\n");
    printf("============\n");
    printf(" The relative machine precision (eps) is to be %e \n",eps);
    printf(" Computational tests pass if scaled residuals are less than 10.\n");

    /*----------------------------------------------------------
     *  TESTING STRSM
     */

    /* Initialize A, B, C */
    LAPACKE_slarnv_work(IONE, ISEED, LDAxM, A);
    LAPACKE_slarnv_work(IONE, ISEED, LDBxN, B);
    /* Shift the diagonal of A by 2.0. */
    for(i=0;i<max(M,N);i++)
      A[LDA*i+i] = A[LDA*i+i] + 2.0;

    for (s=0; s<2; s++) {
        for (u=0; u<2; u++) {
#ifdef COMPLEX
            for (t=0; t<3; t++) {
#else
            for (t=0; t<2; t++) {
#endif
                for (d=0; d<2; d++) {

                    /* Every combination starts from the same pristine B. */
                    memcpy(Binit,  B, LDBxN*sizeof(float));
                    memcpy(Bfinal, B, LDBxN*sizeof(float));

                    /* PLASMA STRSM */
                    PLASMA_strsm(side[s], uplo[u], trans[t], diag[d],
                                 M, N, alpha, A, LDA, Bfinal, LDB);

                    /* Check the solution */
                    info_solution = check_solution(side[s], uplo[u], trans[t], diag[d],
                                                   M, N, alpha, A, LDA, Binit, Bfinal, LDB);

                    printf("***************************************************\n");
                    if (info_solution == 0) {
                        printf(" ---- TESTING STRSM (%s, %s, %s, %s) ...... PASSED !\n",
                               sidestr[s], uplostr[u], transstr[t], diagstr[d]);
                    }
                    else {
                        printf(" ---- TESTING STRSM (%s, %s, %s, %s) ... FAILED !\n",
                               sidestr[s], uplostr[u], transstr[t], diagstr[d]);
                    }
                    printf("***************************************************\n");
                }
            }
        }
    }

    free(A); free(B);
    free(Binit); free(Bfinal);

    return 0;
}

/*--------------------------------------------------------------
 * Check the solution
 *
 * Validates a PLASMA STRSM result against a reference STRSM computed by
 * CBLAS.
 *
 * side, uplo, trans, diag, M, N, alpha, A/LDA
 *            : the TRSM operands, forwarded unchanged to cblas_strsm()
 *              (column-major). A is treated as M x M when side is
 *              PlasmaLeft and N x N otherwise.
 * Bref       : on entry the initial right-hand side. It is overwritten,
 *              first with the reference solution and then with
 *              (reference - Bplasma).
 * Bplasma    : the result to validate; only read here.
 * LDB        : leading dimension shared by Bref and Bplasma.
 *
 * The check passes when
 *     ||diff|| / ((||A|| + ||Breference||) * max(M,N) * eps) <= 10
 * with no NaN/infinite scaled value and no infinite solution norm.
 *
 * Returns 0 when the solution is accepted, 1 when it is suspicious or when
 * the LAPACKE workspace cannot be allocated.
 */
static int check_solution(PLASMA_enum side, PLASMA_enum uplo, PLASMA_enum trans, PLASMA_enum diag,
                          int M, int N, float alpha,
                          float *A, int LDA,
                          float *Bref, float *Bplasma, int LDB)
{
    int info_solution;
    float Anorm, Binitnorm, Bplasmanorm, Blapacknorm, Rnorm, result;
    float eps;
    float mzone = (float)-1.0;
    int Am, An;

    /* Scratch buffer handed to every LAPACKE norm call below. */
    float *work = (float *)malloc(max(M, N)* sizeof(float));

    /* Never hand a NULL workspace to LAPACKE; report the check as failed. */
    if (work == NULL) {
        printf("Out of Memory \n ");
        return 1;
    }

    /* A is square; its order follows the side it is applied from. */
    if (side == PlasmaLeft) {
        Am = M; An = M;
    } else {
        Am = N; An = N;
    }

    Anorm       = LAPACKE_slantr_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), lapack_const(uplo), lapack_const(diag),
                                Am, An, A, LDA, work);
    Binitnorm   = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bref,    LDB, work);
    Bplasmanorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bplasma, LDB, work);

    /* Reference solve, computed in place in Bref. */
    cblas_strsm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans,
                (CBLAS_DIAG)diag, M, N, (alpha), A, LDA, Bref, LDB);

    Blapacknorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bref, LDB, work);

    /* Bref := Bref - Bplasma over the whole LDB x N storage. */
    cblas_saxpy(LDB * N, (mzone), Bplasma, 1, Bref, 1);

    Rnorm = LAPACKE_slange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Bref, LDB, work);

    eps = LAPACKE_slamch_work('e');

    printf("Rnorm %e, Anorm %e, Binitnorm %e, Bplasmanorm %e, Blapacknorm %e\n",
           Rnorm, Anorm, Binitnorm, Bplasmanorm, Blapacknorm);

    result = Rnorm / ((Anorm + Blapacknorm) * max(M,N) * eps);

    printf("============\n");
    printf("Checking the norm of the difference against reference STRSM \n");
    printf("-- ||Cplasma - Clapack||_oo/((||A||_oo+||B||_oo).N.eps) = %e \n", result);

    if ( isinf(Blapacknorm) || isinf(Bplasmanorm) || isnan(result) || isinf(result) || (result > 10.0) ) {
        printf("-- The solution is suspicious ! \n");
        info_solution = 1;
    }
    else {
        printf("-- The solution is CORRECT ! \n");
        info_solution= 0 ;
    }
    free(work);

    return info_solution;
}
Example #27
0
 /* Exported unit-stride SAXPY wrapper: forwards to cblas_saxpy with both
    strides fixed to 1, so x is the input vector and y is updated in place
    (BLAS SAXPY, conventionally y := alpha * x + y over n elements). */
 DLLEXPORT void s_axpy(const blasint n, const float alpha, const float x[], float y[]) {
     cblas_saxpy(n, alpha, x, 1, y, 1);
 }
Example #28
0
// float specialization of caffe_axpy: forwards to cblas_saxpy with both
// strides fixed to 1 (BLAS SAXPY, conventionally Y := alpha * X + Y over N
// elements).
void caffe_axpy<float>(const int N, const float alpha, const float* X,
                       float* Y) {
    cblas_saxpy(N, alpha, X, 1, Y, 1);
}
Example #29
0
// float specialization of caffe_axpy: forwards to cblas_saxpy with both
// strides fixed to 1 (BLAS SAXPY, conventionally Y := alpha * X + Y over N
// elements).
void caffe_axpy<float>(const int N, const float alpha, const float* X,
    float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } // Wrapper function: calls the BLAS routine to perform a linear matrix operation