C++ (Cpp) amat示例 - HotExamples

示例#1

0

显示文件

文件： KokkosBatched_Test_Gemm_Host.hpp 项目： giorgiobornia/trilinos

      /// Host-side driver for the batched GEMM test (only the set-up part of
      /// the body is visible in this excerpt).
      ///
      /// Visible steps: report the SIMD configuration, allocate two batches of
      /// BlkSize x BlkSize matrices, fill them with random values, and pack
      /// them into SIMD-vector views.
      ///
      /// \param NN  Divided by VectorLength (integer division) to obtain N;
      ///            the scalar views below then hold N*VectorLength matrices.
      void Gemm(const int NN) {
        typedef Kokkos::Schedule<Kokkos::Static> ScheduleType;

        constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value;
        // Integer division: any remainder of NN/VectorLength is dropped.
        const int N = NN/VectorLength;

        {
          // Print which instruction-set macro is defined at compile time,
          // together with the value type name and the vector length.
          // value_type_name stays empty for any type other than the two below.
          std::string value_type_name;
          if (std::is_same<value_type,double>::value)                   value_type_name = "double";
          if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>";
#if   defined(__AVX512F__)
          std::cout << "AVX512 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#elif defined(__AVX__) || defined(__AVX2__)
          std::cout << "AVX or AVX2 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#else
          std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#endif
        }

        // Total flop estimate: per-matrix FlopCount times the number of
        // matrices. FlopCount is defined elsewhere -- presumably the cost of
        // one BlkSize^3 GEMM; TODO confirm.
        const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize);
        const double tmax = 1.0e15;

        // Iteration bounds and timer; not used in the visible part of the body.
        const int iter_begin = -10, iter_end = 100;
        Kokkos::Impl::Timer timer;

        // cref is declared without extents here; amat/bmat are the scalar
        // input batches (one BlkSize x BlkSize matrix per leading index).
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> cref;
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> 
          amat("amat", N*VectorLength, BlkSize, BlkSize),
          bmat("bmat", N*VectorLength, BlkSize, BlkSize);

        // Fixed seed (13718) so the random inputs are reproducible.
        Kokkos::Random_XorShift64_Pool<HostSpaceType> random(13718);
        Kokkos::fill_random(amat, random, value_type(1.0));
        Kokkos::fill_random(bmat, random, value_type(1.0));

        // SIMD-packed copies: each entry holds VectorLength scalar values.
        typedef Vector<SIMD<value_type>,VectorLength> VectorType;
        Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> 
          amat_simd("amat_simd", N, BlkSize, BlkSize),
          bmat_simd("bmat_simd", N, BlkSize, BlkSize);

        // Pack: scalar matrix k goes to lane k1 = k % VectorLength of packed
        // matrix k0 = k / VectorLength.
        Kokkos::parallel_for
          (Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
           KOKKOS_LAMBDA(const int k) {
            const int k0 = k/VectorLength, k1 = k%VectorLength;
            for (int i=0;i<BlkSize;++i)
              for (int j=0;j<BlkSize;++j) {
                amat_simd(k0, i, j)[k1] = amat(k, i, j);
                bmat_simd(k0, i, j)[k1] = bmat(k, i, j);                  
              }
          });

示例#2

0

显示文件

文件： prob2b.c 项目： thunderc90/cs_4234

/*
 * Times a naive n x n matrix product C = C + A * B parallelised with OpenMP.
 *
 * Usage: <program> nthreads
 *
 * argv[1] is the number of OpenMP threads to use (must be >= 1).
 * Prints "nthreads, time: <nthreads> <seconds>" and returns 0 on success,
 * or prints a message to stderr and returns 1 on bad arguments or
 * allocation failure.
 */
int main (int argc, char *argv[])
{
/* Row-major element access; arguments are parenthesised so that
   expressions such as amat(i+1,j) expand correctly. */
#define amat(I,J) a[(I)*n + (J)]
#define bmat(I,J) b[(I)*n + (J)]
#define cmat(I,J) c[(I)*n + (J)]

  int n, nthreads, i, j, k;
  double *a, *b, *c;
  double t0, t1;

  /* The thread count is mandatory; do not dereference a missing argv[1]. */
  if (argc < 2) {
    fprintf(stderr, "usage: %s nthreads\n", argc > 0 ? argv[0] : "prog");
    return 1;
  }

  n = 1000;
  nthreads = atoi(argv[1]);
  if (nthreads < 1) {
    fprintf(stderr, "nthreads must be a positive integer\n");
    return 1;
  }
  omp_set_num_threads (nthreads);

  a = (double *) malloc ((size_t) n * n * sizeof (double));
  b = (double *) malloc ((size_t) n * n * sizeof (double));
  c = (double *) malloc ((size_t) n * n * sizeof (double));
  if (a == NULL || b == NULL || c == NULL) {
    fprintf(stderr, "out of memory\n");
    free(a);
    free(b);
    free(c);
    return 1;
  }

  /* Give every element a defined value: the product below reads all three
     arrays, and malloc'd memory is indeterminate. */
  for (i = 0; i < n * n; i++) {
    a[i] = 1.0;
    b[i] = 1.0;
    c[i] = 0.0;
  }

  t0 = omp_get_wtime();

  /* The outer j loop is split across threads; each thread writes a disjoint
     set of columns of c, so no synchronisation is needed. */
#pragma omp parallel for private (i, j, k)
  for (j=0; j<n; j++)
    for (i=0; i<n; i++)
      for (k=0; k<n; k++)
         cmat(i,j) = cmat(i,j) + amat(i,k) * bmat(k,j);

  t1 = omp_get_wtime();
  printf("nthreads, time: %d %6.2f\n", nthreads, t1-t0);

  free(a);
  free(b);
  free(c);
  return 0;
}

示例#3

0

显示文件

文件： l2.cpp 项目： fengyinyang/clusterpath

// Follows the regularization path: for each lambda, iterate gradient steps
// (optionally with a line search) until the gradient norm falls below
// opt_thresh, joining clusters whose distance passes the join_thresh check
// along the way. lambda is multiplied by lambda_factor after each solution
// until a single cluster remains or target_cluster clusters are found.
// If maxit iterations are exceeded, one restart from x is attempted before
// giving up.
//
// Data layout: x, alpha, xbar and dir are N*Px arrays indexed as [i + k*N]
// (point i, dimension k), where N = W->N.
//
// Returns a Results object allocated with new; the caller is presumably
// responsible for deleting it. All scratch arrays are released with
// delete[] on every exit path (they are allocated with new[]).
Results* join_clusters2_restart
(double *x,//array/matrix of data
 SymNoDiag *W,//lower triangle of weight matrix
 unsigned int Px,//problem size
 double lambda,//starting point in regularization path
 double join_thresh, //tolerance for equality of points
 double opt_thresh, //tolerance for optimality
 double lambda_factor,//increase of lambda after optimality
 double smooth,//smoothing parameter
 int maxit,
 int linesearch_freq,//how often to do a linesearch? if 0, never. if
                     //n>0, do n-1 linesearch steps for every
                     //decreasing step size step. set this to 2 if
                     //unsure.
 int linesearch_points,//how many points to check along the gradient
                       //direction. set to 10 if unsure.
 int check_splits,
 int target_cluster,
 int verbose
 ){
  unsigned int N = W->N;
  //W->print();
  double old_lambda=0;
  std::vector<int> rows,rowsj;
  std::vector<int>::iterator rowit,ri,rj;
  std::list< std::vector<int> > clusters,tocheck;
  std::list< std::vector<int> >::iterator it,cj;
  unsigned int i,k,j;
  int tried_restart;
  //start with every point in its own cluster
  for(i=0;i<N;i++){
    rows.assign(1,i);
    clusters.push_back(rows);
  }
  //scratch arrays; allocated with new[] so they must be freed with delete[]
  double *old_alpha = new double[N*Px];
  double *alpha = new double[N*Px];
  double *xbar = new double[N*Px];
  double *dir = new double[N*Px];
  for(i=0;i<N*Px;i++){
    alpha[i]=xbar[i]=x[i];
  }
  Matrix amat(alpha,N,Px),xmat(x,N,Px);
  SymNoDiag diffs(N);
  diffs.calc_diffs(clusters,amat,nrm2);
  //store initial trivial solution
  Results *results = new Results(N,Px,opt_thresh);
  if(target_cluster==0)results->add(alpha,0,0);
  double weight,diff,step;
  while(clusters.size()>1){
    double grad=opt_thresh;
    int iteration=1;
    tried_restart=0;
    //if we use the general (slower) algorithm for any weights, then
    //split the clusters to individual points
    if(check_splits){
      clusters.clear();
      //reassign original clusters
      for(i=0;i<N;i++){
        rows.assign(1,i);
        clusters.push_back(rows);
      }
      //recopy original xbar
      for(i=0;i<N*Px;i++){
        xbar[i]=x[i];
      }
    }
    while(grad>=opt_thresh){
      //first calc gradients
      grad = 0;
      for(it=clusters.begin();it!=clusters.end();it++){
        rows = *it;
        //each cluster is represented by the row of its first point
        i = rows[0];
        for(k=0;k<Px;k++){
          dir[i+k*N] = xbar[i+k*N] - alpha[i+k*N];
        }
        for(cj=clusters.begin();cj!=clusters.end();cj++){
          if(it!=cj){
            rowsj = *cj;
            j=rowsj[0];
            weight=0;
            diff = *diffs(i,j);
            if(diff!=0){
              if(smooth!=0){
                diff *= diff; //now squared l2 norm
                diff += smooth; //add smoothing parameter under sqrt
                diff = sqrt(diff);//put sqrt back
              }
              //total weight between all point pairs of the two clusters
              for(ri=rows.begin();ri!=rows.end();ri++){
                for(rj=rowsj.begin();rj!=rowsj.end();rj++){
                  weight += W->getval(*ri,*rj);
                }
              }
              //weight *= lambda / diff / ((double)(N-1)) / ((double)rows.size());
              weight *= lambda / diff / ((double)rows.size());
              for(k=0;k<Px;k++){
                dir[i+k*N] += weight * (alpha[j+k*N]-alpha[i+k*N]);
              }
            }
          }
        }
        grad += nrm2(Array(dir+i,N,Px));
      }
      //store this iteration
      //results->add(alpha,lambda,grad);
      //then take a step
      if(linesearch_freq==0 || (iteration % linesearch_freq)==0 ){
        //Decreasing step size
        //TDH and pierre 18 jan 2011 try sqrt dec step size
        step=1/((double)iteration);
        //step=1/sqrt((double)iteration);
        if(verbose>=2)printf("grad %f step %f it %d\n",grad,step,iteration);
        take_step(clusters,alpha,dir,N,Px,step);
      }else{
        double cost_here,cost_step;
        //maps cost -> step size; ordered by cost, so begin() is the
        //cheapest step seen so far
        std::map<double,double> cost_steps;
        std::map<double,double>::iterator step1,step2;
        for(i=0;i<N*Px;i++)old_alpha[i]=alpha[i];//copy alpha
        //compare current cost to cost after stepping in gradient direction
        cost_here=cost_step=calc_cost(clusters,amat,xmat,W,diffs,lambda);
        step = 0;
        cost_steps.insert(std::pair<double,double>(cost_here,0));
        //keep taking unit steps until the cost exceeds the starting cost
        while(cost_step<=cost_here){
          take_step(clusters,alpha,dir,N,Px,1);
          step += 1;
          diffs.calc_diffs(clusters,amat,nrm2);
          cost_step=calc_cost(clusters,amat,xmat,W,diffs,lambda);
          if(verbose>=2)
            printf("cost %.10f step %f cost_here %f\n",cost_step,step,cost_here);
          cost_steps.insert(std::pair<double,double>(cost_step,step));
        }
        //repeatedly try the midpoint of the two lowest-cost steps
        for(int cuts=0;cuts<linesearch_points;cuts++){
          step1=step2=cost_steps.begin();
          step2++;
          step = (step1->second + step2->second)/2;
          for(i=0;i<N*Px;i++){
            alpha[i]=old_alpha[i];
          }
          take_step(clusters,alpha,dir,N,Px,step);
          diffs.calc_diffs(clusters,amat,nrm2);
          cost_step=calc_cost(clusters,amat,xmat,W,diffs,lambda);
          if(verbose>=2)printf("cost %.10f step %f %d\n",cost_step,step,cuts);
          cost_steps.insert(std::pair<double,double>(cost_step,step));
        }
        cost_steps.clear();
      }
      if(iteration++ > maxit){
        if(tried_restart){
          printf("max iteration %d exit\n",maxit);
          delete[] old_alpha;
          delete[] alpha;
          delete[] xbar;
          delete[] dir;
          return results;
        }else{
          if(verbose>=1)printf("max iterations, trying restart from x\n");
          tried_restart=1;
          iteration=1;
          for(i=0;i<N*Px;i++)alpha[i]=x[i];
        }
      }
      //calculate differences
      diffs.calc_diffs(clusters,amat,nrm2);
      //check for joins
      JoinPair tojoin;
      while(dojoin(tojoin=check_clusters_thresh(&clusters,diffs,join_thresh))){
        //if(verbose>=1)
        //  printf("join: %d %d\n",tojoin.first->front(),tojoin.second->front());
        int ni=tojoin.first->size();
        int nj=tojoin.second->size();
        i=tojoin.first->front();
        j=tojoin.second->front();
        tojoin.first->insert(tojoin.first->end(),
                             tojoin.second->begin(),
                             tojoin.second->end());
        //merged cluster gets the size-weighted mean of both clusters
        for(k=0;k<Px;k++){
          alpha[i+k*N] = (alpha[i+k*N]*ni + alpha[j+k*N]*nj)/(ni+nj);
          xbar[i+k*N] = (xbar[i+k*N]*ni + xbar[j+k*N]*nj)/(ni+nj);
        }
        clusters.erase(tojoin.second);
        iteration=1;
        if(clusters.size()>1){
          diffs.calc_diffs(clusters,amat,nrm2);//inefficient
        }else{
          grad=0;//so we can escape from the last optimization loop
        }
      }
    }//while(grad>=opt_thresh)
    if(verbose>=1)
      printf("solution iteration %d lambda %f nclusters %d\n",
             iteration,lambda,(int)clusters.size());

    if(target_cluster == 0){
      //for each cluster, there may be several points. we store the
      //alpha value just in the row of the first point. thus here we
      //copy this value to the other rows before copying the optimal
      //alpha to results.
      for(it=clusters.begin();it!=clusters.end();it++){
        rows = *it;
        if(rows.size()>1){
          for(i=1;i<rows.size();i++){
            for(k=0;k<Px;k++){
              alpha[rows[i]+k*N] = alpha[rows[0]+k*N];
            }
          }
        }
      }
      results->add(alpha,lambda,grad);
    }
    //haven't yet reached the target number of clusters, multiply
    //lambda by lambda_factor and continue along the path
    if((int)clusters.size()>target_cluster){
      old_lambda=lambda;
      lambda *= lambda_factor;
    }
    //if we have passed the target cluster number then decrease
    //lambda and go look for it!
    if((int)clusters.size()<target_cluster){
      if(verbose>=1){
        printf("missed target %d, going back for it\n",target_cluster);
      }
      lambda = (lambda+old_lambda)/2;
      clusters.clear();
      //reassign original clusters
      for(i=0;i<N;i++){
        rows.assign(1,i);
        clusters.push_back(rows);
      }
      //recopy original xbar
      for(i=0;i<N*Px;i++){
        xbar[i]=x[i];
      }
    }
    //this is the number of clusters that we were looking for,
    //save and quit!
    if((int)clusters.size()==target_cluster){
      for(it=clusters.begin();it!=clusters.end();it++){
        rows = *it;
        if(rows.size()>1){
          for(i=1;i<rows.size();i++){
            for(k=0;k<Px;k++){
              alpha[rows[i]+k*N] = alpha[rows[0]+k*N];
            }
          }
        }
      }
      results->add(alpha,lambda,grad);
      if(verbose>=1)printf("got target cluster %d exit\n",target_cluster);
      delete[] old_alpha;
      delete[] alpha;
      delete[] xbar;
      delete[] dir;
      return results;
    }
  }
  //TODO: consolidate cleanup... just use data structures that
  //automatically clean themselves up when the function exits.
  delete[] old_alpha;
  delete[] alpha;
  delete[] xbar;
  delete[] dir;
  return results;
}

示例#4

0

显示文件

文件： ring_matrix_part_c.c 项目： patlewis/matrix-matrix-multiplication

int main(int argc, char* argv[])
{
/* Preprocessor Definitions */
#define atempmat(I,J) Atemp[I + n*J]
#define amat(I,J) A[I + n*J]
#define bmat(I,J) B[I + n*J]
#define cmat(I,J) C[I + n*J]
//#define DEBUG 1

   /* Variables */
   int     p;                          //total number of processors    
   int     k;                          //this processor's rank
   int     i,j;                        //counter variables
   double  start, finish;              //used for timing matrix computations
   char    hostname[20];               //hostname of this machine (for debug)
   double *Amatrix, *Bmatrix, *Cmatrix;//actual matrices (used by process 0)
   /* Initializations */
   MPI_Init(NULL, NULL);
   MPI_Comm_size(MPI_COMM_WORLD, &p);
   MPI_Comm_rank(MPI_COMM_WORLD, &k);
   hostname[19] = '\0';
   gethostname(hostname, 19);
#   ifdef DEBUG
   printf("Process %d of %d running on host %s\n", k, p, hostname);
#   endif
   //handle command line input
   
   num_threads = 1;
   if(argc == 1)           n = 1024;
   else if (argc > 1)     n = atoi(argv[1]);
   if (argc == 3)
   {
       num_threads = atoi(argv[2]);
       
   }
   if(argc > 3 || argc < 1)
   {
       printf(usage);
       return 1;
   }
   nc = n/p;
   A     = (double *)malloc(n * nc * sizeof(double));
   Atemp = (double *)malloc(n * nc * sizeof(double));
   B     = (double *)malloc(n * nc * sizeof(double));
   C     = (double *)malloc(n * nc * sizeof(double));
    
   srand(0);

   for(i = 0; i < n*nc; i++)
   {
       C[i]=0;
   }
   /* Root process: generate and distribute data */
   if(k == 0)
   {
       Amatrix = (double *)malloc(n * n * sizeof(double));
       Bmatrix = (double *)malloc(n * n * sizeof(double));
       Cmatrix = (double *)malloc(n * n * sizeof(double));
       
       //Initialize data
       for(i = 0; i < n*n; i++)
       {
           Amatrix[i] = 0;
           Bmatrix[i] = 0;
           Cmatrix[i] = 0;
           //C[i] = 0;
       }
#ifdef VALUES //for correctness testing
       printf("\n%d\n", n);
       if(k==0)
       {
           printf("A = B = \n");
           for(i = 0; i < n; i++)
           {
               for(j = 0; j < n; j++)
               {
                   Amatrix[i+n*j] = i+j;
                   Bmatrix[i+n*j] = i+j;
                   printf("%f ", Amatrix[i+n*j]);
               }
               printf("\n");
           }
       }
#endif
   }
   //Now distribute.  Can do outside the root block
   MPI_Scatter(Amatrix, n*nc, MPI_DOUBLE, A, n*nc, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   MPI_Scatter(Bmatrix, n*nc, MPI_DOUBLE, B, n*nc, MPI_DOUBLE, 0, MPI_COMM_WORLD);

   //pseudocode
   //Ck = Ck + Ak*Bkk
   //Atemp = Ak
   //j = k
   //for i = 1 to p-1 do
   //  j=(j+1) mod p
   //  send Atemp to left
   //  receive Atemp from right
   //  Ck = Ck + Atemp*Bjk
   //end

   /* Matrix Calculations */
   MPI_Barrier(MPI_COMM_WORLD); //do barrier so that we synchronize for better time
   GET_TIME(start);
   
   //real code
   memcpy(Atemp,A,(n * nc * sizeof(double)));

#ifdef VALUES
   if(k == 0)
   {
       printf("\n\n Ak = \n");
       for(i = 0; i < n; i++)
       {
           for(j = 0; j < nc; j++)
           {
               printf("%f ", amat(i,j));
           }
           printf("\n");
       }
   }
#endif
   calc_c(k,k);
   j = k;
   int send_to = k-1;
   int receive_from = k+1;
   if (receive_from == p) receive_from = 0;
   if (send_to == -1) {
       send_to = p-1;
   }
   MPI_Request rec_request;
   MPI_Request send_request;
   MPI_Status rec_status;
   MPI_Isend(Atemp, n*nc, MPI_DOUBLE, send_to,
              0, MPI_COMM_WORLD, &send_request);
   MPI_Irecv(A, n*nc, MPI_DOUBLE,
             receive_from, 0, MPI_COMM_WORLD, &rec_request);
   for(i = 0; i < p-1; i++)
   {
       j = (j+1) % p;
       MPI_Wait(&rec_request, &rec_status);
        memcpy(Atemp,A,(n * nc * sizeof(double)));
       MPI_Isend(Atemp, n*nc, MPI_DOUBLE, send_to,
                 0, MPI_COMM_WORLD, &send_request);
       MPI_Irecv(A, n*nc, MPI_DOUBLE,
                 receive_from, 0, MPI_COMM_WORLD, &rec_request);
       calc_c(j,k);
   }
   GET_TIME(finish);
   MPI_Barrier(MPI_COMM_WORLD);
   //output
   if(k == 0) printf("%f\n", finish-start);

   MPI_Gather(C, n*nc, MPI_DOUBLE, Cmatrix, n*nc, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#ifdef VALUES
   if(k == 0)
   {
       printf("\n\n C = \n");
       for(i = 0; i < n; i++)
       {
           for(j = 0; j < n; j++)
           {
               printf("%f ", Cmatrix[i+n*j]);
           }
           printf("\n");
       }
   }
#endif
   MPI_Finalize();
   
   return 0;
} /* main */

示例#5

0

显示文件

文件： depthMapCmd.cpp 项目： esotericDisciple/makoto

// Command entry point: computes a 1024x1024 depth map of an XML scene as
// seen from a named camera and saves it to an EXR file next to the scene.
//
// Required flags (MS::kFailure is returned if any is missing):
//   -n   title, used to build the output file name "<title>.1.exr"
//   -sc  scene file name
//   -ca  camera name
//
// Returns the parseArgs() status if parsing fails, MS::kFailure if a flag
// is missing or the scene cannot be loaded, MS::kSuccess otherwise.
MStatus depthMap::doIt( const MArgList& args ) 
{

	MStatus status = parseArgs( args );
	
	if( status != MS::kSuccess ) return status;
	
	MArgDatabase argData(syntax(), args);
	
	MAnimControl timeControl;
	MTime time = timeControl.currentTime();
	int frame =int(time.value());
	
	MString scene_name, camera_name, title;
	if (argData.isFlagSet("-n")) argData.getFlagArgument("-n", 0, title);
	else return MS::kFailure;
	if (argData.isFlagSet("-sc")) argData.getFlagArgument("-sc", 0, scene_name);
	else return MS::kFailure;
	if (argData.isFlagSet("-ca")) argData.getFlagArgument("-ca", 0, camera_name);
	else return MS::kFailure;
	
	// default eye space: identity matrix
	m_eye[0][0]=1; m_eye[0][1]=0; m_eye[0][2]=0; m_eye[0][3]=0;
	m_eye[1][0]=0; m_eye[1][1]=1; m_eye[1][2]=0; m_eye[1][3]=0;
	m_eye[2][0]=0; m_eye[2][1]=0; m_eye[2][2]=1; m_eye[2][3]=0;
	m_eye[3][0]=0; m_eye[3][1]=0; m_eye[3][2]=0; m_eye[3][3]=1;
	
// get eye space
	zWorks::getTypedPathByName(MFn::kTransform, camera_name, p_eye);
	MObject o_eye = p_eye.transform();
	if(o_eye.isNull()) MGlobal::displayWarning("Cannot find eye camera, use default space.");
	else zWorks::getTransformWorldNoScale(p_eye.partialPathName(), m_eye);
	
	// negate the first three components of rows 0 and 2
	m_eye[0][0] *=-1; m_eye[0][1] *=-1; m_eye[0][2] *=-1;
	m_eye[2][0] *=-1; m_eye[2][1] *=-1; m_eye[2][2] *=-1;
	
	p_eye.extendToShape();
	MFnCamera feye(p_eye);
	double fov = feye.horizontalFieldOfView();
	int map_w = 1024, map_h = 1024;
	float* data = new float[map_w * map_h];
	
	// start every pixel at a large depth value
	for(int i=0; i<map_w * map_h; i++) data[i] = 10e6;

	string sscene = scene_name.asChar();
	zGlobal::changeFrameNumber(sscene, frame);
	MGlobal::displayInfo ( MString(" calculating ") + sscene.c_str());
	
	FXMLScene* fscene = new FXMLScene();
	if(fscene->load(sscene.c_str()) != 1) {
		MGlobal::displayWarning(" cannot open scene, do nothing.");
		// release both allocations before bailing out
		delete[] data;
		delete fscene;
		return MS::kFailure;
	}
	
	fscene->depthMap(data, map_w, map_h, fov, m_eye);
	
	// build the output path: <scene directory>/<title>.<frame>.exr
	zGlobal::cutByLastSlash(sscene);
	sscene = sscene + "/" + title.asChar() + ".1.exr";
	zGlobal::changeFrameNumber(sscene, frame);
	//zGlobal::changeFilenameExtension(sscene, "exr");
	MGlobal::displayInfo ( MString(" saving ") + sscene.c_str());
	
	M44f amat(m_eye[0][0], m_eye[0][1], m_eye[0][2], m_eye[0][3],
			m_eye[1][0], m_eye[1][1], m_eye[1][2], m_eye[1][3],
			m_eye[2][0], m_eye[2][1], m_eye[2][2], m_eye[2][3],
			m_eye[3][0], m_eye[3][1], m_eye[3][2], m_eye[3][3] );
	ZFnEXR::saveCameraNZ(data, amat, fov, sscene.c_str(), map_w, map_h);
	delete[] data;
	delete fscene;

 return MS::kSuccess;
 }

示例#6

0

显示文件

文件： KokkosBatched_Test_LU_Host.hpp 项目： jwillenbring/Trilinos

      /// Host-side driver for the batched LU test (only the set-up part of
      /// the body is visible in this excerpt).
      ///
      /// Visible steps: report the SIMD configuration, build a batch of
      /// tridiagonal BlkSize x BlkSize matrices, and pack them into a
      /// SIMD-vector view.
      ///
      /// \param NN  Divided by VectorLength (integer division) to obtain N;
      ///            the scalar view below then holds N*VectorLength matrices.
      void LU(const int NN) {
        typedef Kokkos::Schedule<Kokkos::Static> ScheduleType;
        //typedef Kokkos::Schedule<Kokkos::Dynamic> ScheduleType;

        constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value;
        // Integer division: any remainder of NN/VectorLength is dropped.
        const int N = NN/VectorLength;

        {
          // Print which instruction-set macro is defined at compile time,
          // together with the value type name and the vector length.
          // value_type_name stays empty for any type other than the two below.
          std::string value_type_name;
          if (std::is_same<value_type,double>::value)                   value_type_name = "double";
          if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>";

#if   defined(__AVX512F__)
          std::cout << "AVX512 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#elif defined(__AVX__) || defined(__AVX2__)
          std::cout << "AVX or AVX2 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#else
          std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#endif
        }

        // Total flop estimate: per-matrix FlopCount times the number of
        // matrices. FlopCount is defined elsewhere -- presumably the cost of
        // one BlkSize x BlkSize LU factorization; TODO confirm.
        const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize);
        const double tmax = 1.0e15;

        // Iteration bounds and timer; not used in the visible part of the body.
        const int iter_begin = -10, iter_end = 100;
        Kokkos::Impl::Timer timer;

        ///
        /// Reference version using MKL DGETRF
        ///
        // aref is declared without extents here; amat is the scalar input
        // batch (one BlkSize x BlkSize matrix per leading index).
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> aref;
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType>
          amat("amat", N*VectorLength, BlkSize, BlkSize);

        Random<value_type> random;

        for (int k=0;k<N*VectorLength;++k) {
          // use tridiagonal matrices; for now we just check elementwise l/u factors
          // do not allow pivots
          // Only the diagonal and the first sub-/super-diagonal are written
          // here; the diagonal gets a larger offset (+10.0) than the
          // off-diagonals (+1.0).
          for (int i=0;i<BlkSize;++i) {
            amat(k, i, i) = random.value() + 10.0;
            if ((i+1) < BlkSize) {
              amat(k, i, i+1) = random.value() + 1.0;
              amat(k, i+1, i) = random.value() + 1.0;
            }
          }
        }

        // SIMD-packed copy: each entry holds VectorLength scalar values.
        typedef Vector<SIMD<value_type>,VectorLength> VectorType;
        Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType>
          amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize);
      
        // Pack: scalar matrix k goes to lane k1 = k % VectorLength of packed
        // matrix k0 = k / VectorLength (k0*VectorLength+k1 equals k).
        Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", 
           Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
           KOKKOS_LAMBDA(const int k) {
            const int k0 = k/VectorLength, k1 = k%VectorLength;
            for (int i=0;i<BlkSize;++i)
              for (int j=0;j<BlkSize;++j) {
                amat_simd(k0, i, j)[k1] = amat(k0*VectorLength+k1, i, j);
              }
          });