Beispiel #1
0
    // Copies a CPU outer-product field into the GPU FullOprod structure,
    // one direction at a time, staging through pinned host buffers
    // (pinned memory gives faster, async-capable H2D transfers).
    //
    // cudaOprod      : destination GPU field (even/odd halves, 4 directions)
    // cpuOprod       : source host field in CPU layout
    // bytes_per_dir  : byte size of one parity's data for one direction
    // Vh             : half-lattice volume used by the packing routine
    static void
      copyOprodFromCPUArrayQuda(FullOprod cudaOprod, void *cpuOprod,
          size_t bytes_per_dir, int Vh)
      {
        // Use pinned memory
        float2 *packedEven, *packedOdd;
        // Compare against cudaSuccess: the original only tested for
        // cudaErrorMemoryAllocation, silently ignoring every other failure
        // (invalid value, prior sticky error, ...).
        if(cudaMallocHost(&packedEven, bytes_per_dir) != cudaSuccess) {
	  errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
	}
        if (cudaMallocHost(&packedOdd, bytes_per_dir) != cudaSuccess) {
	  errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
	}

        for(int dir=0; dir<4; dir++){
          // Repack the host field for this direction into even/odd buffers.
          packOprodFieldDir(packedEven, (float*)cpuOprod, dir, 0, Vh);
          packOprodFieldDir(packedOdd,  (float*)cpuOprod, dir, 1, Vh);

          // Clear destination before the copy.
          cudaMemset(cudaOprod.even.data[dir], 0, bytes_per_dir);
          cudaMemset(cudaOprod.odd.data[dir],  0, bytes_per_dir);
          checkCudaError();

          cudaMemcpy(cudaOprod.even.data[dir], packedEven, bytes_per_dir, cudaMemcpyHostToDevice);
          cudaMemcpy(cudaOprod.odd.data[dir], packedOdd, bytes_per_dir, cudaMemcpyHostToDevice);
          checkCudaError();
        }
        cudaFreeHost(packedEven);
        cudaFreeHost(packedOdd);
      }
Beispiel #2
0
  // Creates linear-memory texture objects for a clover field.
  //
  // tex     : receives the texture bound to the field data
  // texNorm : receives the texture bound to the norm array (half precision only)
  // field   : device pointer to the field data
  // norm    : device pointer to the norm data (used only for half precision)
  //
  // Only done for FLOAT2/FLOAT4 orders; other orders get no texture objects.
  void cudaCloverField::createTexObject(cudaTextureObject_t &tex, cudaTextureObject_t &texNorm,
					void *field, void *norm) {

    if (order == QUDA_FLOAT2_CLOVER_ORDER || order == QUDA_FLOAT4_CLOVER_ORDER) {
      // create the texture for the field components
      
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
      else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2
      
      // always four components regardless of precision
      // NOTE(review): 'precision' appears to encode bytes per element here
      // (8*precision = bits per channel; double is stored as int2, so 8*sizeof(int)).
      desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
      desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
      desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
      desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
      
      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = field;
      resDesc.res.linear.desc = desc;
      // bytes/2: each parity (even/odd) texture covers half the full field.
      resDesc.res.linear.sizeInBytes = bytes/2;
      
      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      // Half precision is stored as short; read it back normalized to float.
      if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
      else texDesc.readMode = cudaReadModeElementType;
      
      cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
      checkCudaError();
      
      // create the texture for the norm components
      if (precision == QUDA_HALF_PRECISION) {
	cudaChannelFormatDesc desc;
	memset(&desc, 0, sizeof(cudaChannelFormatDesc));
	// Norms are single-precision floats, one component per element.
	desc.f = cudaChannelFormatKindFloat;
	desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;
	
	cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));
	resDesc.resType = cudaResourceTypeLinear;
	resDesc.res.linear.devPtr = norm;
	resDesc.res.linear.desc = desc;
	// Again half the total: one parity's worth of norm data.
	resDesc.res.linear.sizeInBytes = norm_bytes/2;
	
	cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(texDesc));
	texDesc.readMode = cudaReadModeElementType;
	
	cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
	checkCudaError();
      }
    }

  }
Beispiel #3
0
    // Uploads a CPU outer-product field to the GPU (even/odd halves).
    //
    // cudaOprodEven / cudaOprodOdd : destination device buffers
    // oprod                        : source host field
    // vol                          : half-lattice volume
    void loadOprodToGPU(void *cudaOprodEven, 
        void *cudaOprodOdd,
        void  *oprod,
        int vol){
        checkCudaError();
        // size_t (the callee's parameter type) so the byte count cannot
        // overflow int: 4 directions x 18 floats per site x vol sites.
        size_t bytes = 4*(size_t)vol*18*sizeof(float);

        std::cout << "vol = " << vol << std::endl;
        std::cout << "bytes = " << bytes << std::endl;
        checkCudaError();
        loadOprodFromCPUArrayQuda(cudaOprodEven, cudaOprodOdd, oprod, bytes, vol);
    }
Beispiel #4
0
// Constructs a wrapper for the CUDA device with the given ordinal:
// initialises the driver API, resolves the device handle, makes it the
// active device, and caches its properties and name.
device::device(int device_id) {
	cuInit(0);
	checkCudaError("device::device Init");
	cuDeviceGet(&cu_device, device_id);
	checkCudaError("device::device Get device");
	// Context creation intentionally disabled; presumably the runtime API
	// call below creates the primary context implicitly — TODO confirm.
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");
	this->set_device(device_id);

	// Populate 'props' so device_name below is valid.
	cudaGetDeviceProperties(&props, device_id);
	checkCudaError("device::device Get properties ");

	this->device_name = props.name;
}
Beispiel #5
0
// Copies a column-major matrix from device memory to the host and prints it.
//
// dA     : device pointer to nrRows*nrCols floats
// nrRows : number of rows
// nrCols : number of columns
void printDeviceColumnMajorMatrix(float *dA, int nrRows, int nrCols) {
    int   size = nrRows * nrCols;
    // Heap allocation replaces the original variable-length array (a
    // non-standard C++ extension) which could overflow the stack for
    // large matrices.
    float *hA = new float[size];

    checkCudaError(__LINE__, cudaMemcpy(hA, dA, size * sizeof(float), cudaMemcpyDeviceToHost));
    printColumnMajorMatrix(hA, nrRows, nrCols);
    delete[] hA;
}
Beispiel #6
0
  // Copies the contents of 'src' into this field.
  //
  // src     : source clover field (GPU or CPU resident)
  // inverse : when true, also copy the inverse clover term from a CPU source
  //
  // GPU->GPU sources are copied directly via copyGenericClover; CPU sources
  // are first packed into a pinned host buffer, then transferred.
  void cudaCloverField::copy(const CloverField &src, bool inverse) {

    checkFie ld(src);
    
    if (typeid(src) == typeid(cudaCloverField)) {
      // Device-resident source: direct field copies (direct term, then inverse).
      if (src.V(false))	copyGenericClover(*this, src, false, QUDA_CUDA_FIELD_LOCATION);
      if (src.V(true)) copyGenericClover(*this, src, true, QUDA_CUDA_FIELD_LOCATION);
    } else if (typeid(src) == typeid(cpuCloverField)) {
      // Host-resident source: stage through a pinned buffer large enough for
      // the field data plus (for half precision) the norm array.
      resizeBufferPinned(bytes + norm_bytes);
      void *packClover = bufferPinned;
      // Norm data lives directly after the field data in the pinned buffer.
      void *packCloverNorm = (precision == QUDA_HALF_PRECISION) ? (char*)bufferPinned + bytes : 0;
      
      if (src.V(false)) {
	copyGenericClover(*this, src, false, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
	cudaMemcpy(clover, packClover, bytes, cudaMemcpyHostToDevice);
	if (precision == QUDA_HALF_PRECISION) 
	  cudaMemcpy(norm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
      }
      
      // Inverse term is only transferred when requested by the caller.
      if (src.V(true) && inverse) {
	copyGenericClover(*this, src, true, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
	cudaMemcpy(cloverInv, packClover, bytes, cudaMemcpyHostToDevice);
	if (precision == QUDA_HALF_PRECISION) 
	  cudaMemcpy(invNorm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
      }
    } else {
      errorQuda("Invalid clover field type");
    }

    checkCudaError();
  }
// Returns the channel format descriptor of a CUDA array.
// Fix: zero-initialize 'desc' so a well-defined (all-zero) descriptor is
// returned when no CUDA context exists; the original returned an
// uninitialized struct in that case.
cudaChannelFormatDesc getChannelDesc(cudaArray_const_t array) {
    cudaChannelFormatDesc desc = {};
    if (Ctx->isCreated()) {
        checkCudaError(cudaGetChannelDesc(&desc, array));
    }
    return desc;
}
// Returns the byte offset applied when 'tex' was bound, or 0 when no CUDA
// context has been created.
size_t getTextureAlignmentOffset(const textureReference* tex) {
    size_t alignmentOffset = 0;
    if (!Ctx->isCreated())
        return alignmentOffset;
    checkCudaError(cudaGetTextureAlignmentOffset(&alignmentOffset, tex));
    return alignmentOffset;
}
// Looks up the texture reference associated with a device symbol; yields
// nullptr when no CUDA context has been created.
const textureReference* getTextureReference(const void* symbol) {
    const textureReference* reference = nullptr;
    if (!Ctx->isCreated())
        return reference;
    checkCudaError(cudaGetTextureReference(&reference, symbol));
    return reference;
}
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
// Maps the OpenGL-shared histogram buffer and color array into CUDA,
// runs the histogram kernel, and unmaps the resources again.
void runCuda()
{
    cudaStream_t    stream = 0;
    const int nbResources = 2;
    cudaGraphicsResource *ppResources[nbResources] =
    {
        g_histogram.cudaResource,
        g_color.cudaResource,
    };
    // Map resources for Cuda
    checkCudaErrors(cudaGraphicsMapResources(nbResources, ppResources, stream));
    getLastCudaError("cudaGraphicsMapResources(2) failed");
    // Get pointers
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&g_histogram.cudaBuffer, &g_histogram.size, g_histogram.cudaResource));
    // Fix: the message previously named g_color.pBuffer although this call
    // maps the histogram buffer.
    getLastCudaError("cudaGraphicsResourceGetMappedPointer (g_histogram.cudaBuffer) failed");
    // Fix: this call's return status was previously unchecked, unlike the
    // sibling interop calls above and below.
    checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&g_color.pCudaArray, g_color.cudaResource, 0, 0));
    getLastCudaError("cudaGraphicsSubResourceGetMappedArray (g_color.pBuffer) failed");

    // Execute kernel
    createHistogramTex(g_histogram.cudaBuffer, g_WindowWidth, g_WindowHeight, g_color.pCudaArray);
    checkCudaError();
    //
    // unmap the resources
    //
    checkCudaErrors(cudaGraphicsUnmapResources(nbResources, ppResources, stream));
    getLastCudaError("cudaGraphicsUnmapResources(2) failed");
}
// Binds a texture reference to a mipmapped CUDA array. The bind is only
// attempted when a CUDA context exists and all arguments are non-null.
void bindTextureToMipmappedArray(const textureReference* tex,
                                 cudaMipmappedArray_const_t mipmappedArray,
                                 const cudaChannelFormatDesc* desc) {
    const bool ready = Ctx->isCreated() && tex && mipmappedArray && desc;
    if (!ready)
        return;
    checkCudaError(cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc));
}
// Tears down the CUDA device context: resets the device and clears the
// cached device/stream state. No-op when the context is not valid.
void CudaDeviceContextPrivate::destroy() {
    if (!isValid())
        return;
    std::cout << "Reset cuda device" << std::endl;
    checkCudaError(cudaDeviceReset());
    activeDevice = -1;
    activeStream = nullptr;
}
Beispiel #13
0
// Default constructor: wraps CUDA device 0.
device::device() {
	cuInit(0);
	cuDeviceGet(&cu_device, 0);
	checkCudaError("device::device Init");
	//cuCtxCreate(&cu_context, 0, cu_device);
	//checkCudaError("device::device Create context");

	// Fix: populate 'props' before reading it. The original read
	// props.name without ever querying the device, so device_name was
	// undefined (compare the device_id constructor, which does query).
	cudaGetDeviceProperties(&props, 0);
	checkCudaError("device::device Get properties");

	device_name = props.name;
}
Beispiel #14
0
// Allocates a page-locked staging buffer of 'bufferSize' bytes and sizes
// the per-device event containers, one slot per visible CUDA device.
AsyncCopier::AsyncCopier(size_t bufferSize)
  : bufferSize_(bufferSize),
    buffer_(allocPageLocked(bufferSize)) {
  int deviceCount;
  checkCudaError(cudaGetDeviceCount(&deviceCount), "cudaGetDeviceCount");
  events_.resize(deviceCount);
  freeEvents_.resize(deviceCount);
}
// Makes 'device' the active CUDA device and records it in the private
// state. Returns true on success, false for a negative (invalid) ordinal.
bool CudaDeviceContext::setDevice(int device) {
    // Fix: accept ordinal 0 as well — CUDA device numbering starts at 0,
    // so the original 'device > 0' test wrongly rejected the first device.
    if (device >= 0) {
        D(CudaDeviceContext);
        checkCudaError(cudaSetDevice(device));
        d->activeDevice = device;
        return true;
    }
    return false;
}
// Binds a texture reference to linear device memory. The bind is only
// attempted when a CUDA context exists and all pointer arguments are
// non-null; 'offset' receives the applied byte offset when non-null.
void bindTexture(const textureReference* tex,
                 const void* devPtr,
                 const cudaChannelFormatDesc* desc,
                 size_t* offset,
                 size_t size) {
    const bool ready = Ctx->isCreated() && tex && devPtr && desc;
    if (!ready)
        return;
    checkCudaError(cudaBindTexture(offset, tex, devPtr, desc, size));
}
Beispiel #17
0
// Per-device event wrapper. Creates a CUDA event with timing disabled
// (cheaper to record) and blocking sync (a waiting host thread sleeps
// instead of spinning).
AsyncCopier::Event::Event(int d)
  : device(d),
    refCount(0) {
  event.emplace();
  checkCudaError(
      cudaEventCreateWithFlags(
          get_pointer(event), cudaEventDisableTiming | cudaEventBlockingSync),
      "cudaEventCreateWithFlags");
}
Beispiel #18
0
    // Allocates and zero-fills the even/odd outer-product device buffers.
    //
    // cudaOprodEven / cudaOprodOdd : receive the allocated device pointers
    // vol                          : half-lattice volume
    void allocateOprodFields(void **cudaOprodEven, void **cudaOprodOdd, int vol){
      // 4 directions x 18 floats per site; size_t avoids int overflow for
      // large volumes.
      size_t bytes = 4*(size_t)vol*18*sizeof(float);

      // Compare against cudaSuccess: the original only tested for
      // cudaErrorMemoryAllocation and silently ignored other failures.
      if (cudaMalloc((void **)cudaOprodEven, bytes) != cudaSuccess) {
        errorQuda("Error allocating even outer product field");
      }

      cudaMemset((*cudaOprodEven), 0, bytes);
      checkCudaError();

      if (cudaMalloc((void **)cudaOprodOdd, bytes) != cudaSuccess) {
        errorQuda("Error allocating odd outer product field");
      }

      cudaMemset((*cudaOprodOdd), 0, bytes);

      checkCudaError();
    }
Beispiel #19
0
// Computes the Kendall correlation between every pair of rows from x and y
// on the GPU, writing an nx-by-ny result matrix into 'results'.
//
// x, nx      : first set of vectors (nx vectors of 'sampleSize' floats)
// y, ny      : second set of vectors (ny vectors of 'sampleSize' floats)
// sampleSize : number of samples per vector
// results    : host output buffer of nx*ny doubles
void masterKendall(const float * x, size_t nx, 
                   const float * y, size_t ny,
                   size_t sampleSize, double * results)
{
	size_t 
		outputLength = nx * ny, outputBytes = outputLength*sizeof(double),
		xBytes = nx*sampleSize*sizeof(float), 
		yBytes = ny*sampleSize*sizeof(float); 
	float
		* gpux, * gpuy; 
	double
		* gpuResults;
	// One block per (x-vector, y-vector) pair.
	dim3
		grid(nx, ny), block(NUMTHREADS, NUMTHREADS);

	cudaMalloc((void **)&gpux, xBytes);
	cudaMalloc((void **)&gpuy, yBytes);
	checkCudaError("input vector space allocation");

	cudaMemcpy(gpux, x, xBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpuy, y, yBytes, cudaMemcpyHostToDevice);
	checkCudaError("copying input vectors to gpu");

	cudaMalloc((void **)&gpuResults, outputBytes);
	checkCudaError("allocation of space for result matrix");

  // Launch via the project's by-name launcher; arguments are passed as an
  // array of pointers to the kernel parameters.
  void *args[] =
    { &gpux
    , &nx
    , &gpuy
    , &ny
    , &sampleSize
    , &gpuResults
    };
  cudaLaunch("gpuKendall", args,
      grid, block);

  // NOTE(review): the input buffers are freed before the result copy;
  // presumably cudaFree/cudaMemcpy order the work correctly here — the
  // blocking cudaMemcpy below is what guarantees the kernel has finished
  // before 'results' is read.
  cudaFree(gpux);
  cudaFree(gpuy);
  cudaMemcpy(results, gpuResults, outputBytes, cudaMemcpyDeviceToHost);
  cudaFree(gpuResults);
  checkCudaError("copying results from gpu and cleaning up");
}
// Binds a texture reference to pitched 2D device memory. The bind is only
// attempted when a CUDA context exists and all pointer arguments are
// non-null; 'offset' receives the applied byte offset when non-null.
void bindTexture2D(const textureReference* tex,
                   const void* devPtr,
                   const cudaChannelFormatDesc* desc,
                   size_t* offset,
                   size_t width,
                   size_t height,
                   size_t pitch) {
    const bool ready = Ctx->isCreated() && tex && devPtr && desc;
    if (!ready)
        return;
    checkCudaError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
}
Beispiel #21
0
  // Releases the device buffers owned by this field. When the direct and
  // inverse terms share storage (clover == cloverInv), the direct-term
  // pointers are skipped so nothing is freed twice.
  cudaCloverField::~cudaCloverField() {
    const bool sharedStorage = (clover == cloverInv);
    if (!sharedStorage) {
      if (clover) cudaFree(clover);
      if (norm) cudaFree(norm);
    }

    if (cloverInv) cudaFree(cloverInv);
    if (invNorm) cudaFree(invNorm);

    checkCudaError();
  }
Beispiel #22
0
    // Uploads a CPU outer-product field to the even/odd GPU buffers,
    // staging through pinned host memory for faster H2D transfers.
    //
    // cudaOprodEven / cudaOprodOdd : destination device buffers
    // cpuOprod                     : source host field
    // bytes                        : byte size of one parity's data
    // Vh                           : half-lattice volume for the packer
    static void 
      loadOprodFromCPUArrayQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod,
          size_t bytes, int Vh)
      {
        // Use pinned memory
	float2 *packedEven, *packedOdd;
        checkCudaError();

        // Compare against cudaSuccess: the original only caught
        // cudaErrorMemoryAllocation, silently ignoring other failures.
        if (cudaMallocHost(&packedEven, bytes) != cudaSuccess) {
	  errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
	}
        if (cudaMallocHost(&packedOdd, bytes) != cudaSuccess) {
	  // Fix: message previously said "packedEven" (copy-paste error).
	  errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
	}
        checkCudaError();


        // Repack the host field into even/odd parity buffers.
        packOprodField(packedEven, (float*)cpuOprod, 0, Vh);
        packOprodField(packedOdd,  (float*)cpuOprod, 1, Vh);
        checkCudaError();


        cudaMemset(cudaOprodEven, 0, bytes);
        cudaMemset(cudaOprodOdd, 0, bytes);
        checkCudaError();

        cudaMemcpy(cudaOprodEven, packedEven, bytes, cudaMemcpyHostToDevice);
        checkCudaError();
        cudaMemcpy(cudaOprodOdd, packedOdd, bytes, cudaMemcpyHostToDevice);
        checkCudaError();

        cudaFreeHost(packedEven);
        cudaFreeHost(packedOdd);
      }
Beispiel #23
0
// Initialises GPU-side working storage for spatial prediction:
// creates the cuBLAS handle, uploads the known-to-predicted (dist12) and
// predicted-to-predicted (dist22) distance sub-matrices, and allocates the
// covariance/temporary buffers used later.
void init_GPU_data(GlobalParams &p) {
    
    // n_known regions have observations; the remaining rows of pred_dist
    // correspond to prediction locations.
    int n_known = p.nRegions;
    int n_total = p.pred_dist.n_rows;
    int n_pred = n_total-n_known;
    
    scatr_magma_init();

    // Distance sub-blocks: known x pred and pred x pred.
    arma::mat dist12 = p.pred_dist(arma::span(0,n_known-1),        arma::span(n_known, n_total-1));
    arma::mat dist22 = p.pred_dist(arma::span(n_known, n_total-1), arma::span(n_known, n_total-1));
    
    checkCublasError( cublasCreate_v2(&p.handle), "handle (Create)" );
    
    // Armadillo stores column-major, matching cublasSetMatrix's layout.
    cudaMalloc((void**) &p.d_dist12, n_known*n_pred*sizeof(double));  checkCudaError("dist12 (Malloc)");
    checkCublasError( 
        cublasSetMatrix(n_known, n_pred, sizeof(double), dist12.memptr(), n_known, p.d_dist12, n_known), 
        "dist12 (Set)" 
    );
    
    cudaMalloc((void**) &p.d_dist22, n_pred*n_pred * sizeof(double));   checkCudaError("dist22 (Malloc)");
    checkCublasError( 
        cublasSetMatrix(n_pred, n_pred, sizeof(double), dist22.memptr(), n_pred, p.d_dist22, n_pred), 
        "dist22 (Set)"
    );
    
    // Scratch buffers filled by later computation; left uninitialised here.
    cudaMalloc((void**) &p.d_cov12,    n_known * n_pred  * sizeof(double)); checkCudaError("cov12 (Malloc)");
    cudaMalloc((void**) &p.d_cov22,    n_pred  * n_pred  * sizeof(double)); checkCudaError("cov22 (Malloc)");
    cudaMalloc((void**) &p.d_invcov11, n_known * n_known * sizeof(double)); checkCudaError("invcov11 (Malloc)");
    cudaMalloc((void**) &p.d_tmp,      n_pred  * n_known * sizeof(double)); checkCudaError("tmp (Malloc)");
}
Beispiel #24
0
// Prints a human-readable summary of this device's CUDA properties
// (memory sizes, launch limits, clocks, compute capability) to stdout.
void device::get_cuda_info() {
	const int kb = 1024;
	const int mb = kb * kb;

	cudaDeviceProp props;
	cudaError_t error =
	cudaGetDeviceProperties(&props, this->device_id);
	checkCudaError("device::device::get_cuda_info Get properties ");
	// Report an out-of-range device ordinal explicitly.
	if (error == cudaErrorInvalidDevice) {
		std::cout << "Device does not exist" << std::endl;
	}

	std::cout << props.name << std::endl;
	std::cout << "   Global memory:   " << props.totalGlobalMem / mb << "mb"
			<< std::endl;
	std::cout << "   Shared memory:   " << props.sharedMemPerBlock / kb << "kb"
			<< std::endl;
	std::cout << "   Constant memory: " << props.totalConstMem / kb << "kb"
			<< std::endl;
	std::cout << "   Block registers: " << props.regsPerBlock << std::endl
			<< std::endl;
	std::cout << "   Warp size:         " << props.warpSize << std::endl;
	std::cout << "   Threads per block: " << props.maxThreadsPerBlock
			<< std::endl;
	std::cout << "   Max block dimensions: [ " << props.maxThreadsDim[0] << ", "
			<< props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]"
			<< std::endl;
	std::cout << "   Max grid dimensions:  [ " << props.maxGridSize[0] << ", "
			<< props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]"
			<< std::endl;
	std::cout << "   Multiprocessor Count: " << props.multiProcessorCount
			<< std::endl;
	std::cout << std::endl;
	std::cout << "   Unified addressing: " << props.unifiedAddressing
			<< std::endl;
	std::cout << "   Concurrent kernels: " << props.concurrentKernels
			<< std::endl;
	std::cout << "   Diver Overlap: " << props.deviceOverlap << std::endl;
	std::cout << "   Memory Clock Rate: " << props.memoryClockRate << std::endl;
	std::cout << "   Memory Bus Width: " << props.memoryBusWidth << std::endl;
	std::cout << "   l2 Cache Size: " << props.l2CacheSize << std::endl;
	std::cout << "   Clock Rate: " << props.clockRate << std::endl;
	std::cout << "   Exec Time Out: " << props.kernelExecTimeoutEnabled
			<< std::endl << std::endl;

	std::cout << "   Compute Capability: " << props.major << "." << props.minor
			<< std::endl;
	std::cout << "   Compute Modes: " << props.computeMode << std::endl
			<< std::endl;

	//}
}
Beispiel #25
0
 // Destroys all texture objects created for this field.
 void cudaCloverField::destroyTexObject() {
   cudaDestroyTextureObject(evenTex);
   cudaDestroyTextureObject(oddTex);
   cudaDestroyTextureObject(evenInvTex);
   cudaDestroyTextureObject(oddInvTex);
   if (precision == QUDA_HALF_PRECISION) {
     // Fix: the original destroyed evenNormTex and evenInvNormTex twice
     // each and never destroyed the odd-parity norm textures (compare the
     // guarded variant of this method elsewhere in the codebase).
     cudaDestroyTextureObject(evenNormTex);
     cudaDestroyTextureObject(oddNormTex);
     cudaDestroyTextureObject(evenInvNormTex);
     cudaDestroyTextureObject(oddInvNormTex);
   }
   checkCudaError();
 }
Beispiel #26
0
// Solves a Poisson problem on a 513x513 grid: allocates and initialises
// the device arrays, times the solver with CUDA events, and writes the
// result to "data.dat".
int main(int argc, char **argv)
{
    // device memory
    real *psi_d, *z_d;

    size_t fSize = sizeof(real);

    /* grid dimensions */
    unsigned int Nx = 513, Ny = 513;
    // omitting boundaries
    unsigned int nGridPoints = (Nx-2)*(Ny-2);

    cudaMalloc((void **) &psi_d, (nGridPoints+1)*fSize);
    cudaMalloc((void **) &z_d,   (nGridPoints+1)*fSize);

    /* initialization */
    fillArray(psi_d, 0.0, nGridPoints+1);
    fillArray(z_d,   1.0, nGridPoints+1);
    checkCudaError("Initialization of grid");

    // for timing purposes
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // start timer
    cudaEventRecord(start,0);

    /* Call the poisson solver, right hand side
     * is stored on the device in z_d (make sure the data
     * is copied from CPU to GPU!), result is stored in
     * psi_d (on the GPU/device).
     * Here NX-2 is the width of the grid's interior
     * (without the boundaries).
     */
    cuPoisson((Nx-2), psi_d, z_d);

    // stop timer
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float computationTime;
    cudaEventElapsedTime(&computationTime, start, stop);

    // elapsed time is reported in milliseconds
    printf("Computation time was %.5f seconds.\n\n", computationTime/1000.0);

    printf("Writing result to disk...\n");
    // write result to file
    writeBinaryFile(Nx, Ny, psi_d, "data.dat");
    printf("done\n");

    // Fix: release the timing events and device buffers; the original
    // leaked both (harmless at process exit, but flagged by leak checkers
    // and wrong if main() is ever reused as a library entry point).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(psi_d);
    cudaFree(z_d);
    checkCudaError("cleanup");

    return EXIT_SUCCESS;
}
Beispiel #27
0
    // Downloads the even/odd outer-product device buffers into the CPU
    // field, staging through pinned host memory.
    //
    // cudaOprodEven / cudaOprodOdd : source device buffers
    // cpuOprod                     : destination host field
    // bytes                        : byte size of one parity's data
    // Vh                           : half-lattice volume for the unpacker
    static void 
      fetchOprodFromGPUArraysQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod, size_t bytes, int Vh)
      {
        float2 *packedEven, *packedOdd;
        // Compare against cudaSuccess: the original only tested for
        // cudaErrorMemoryAllocation, silently ignoring other failures.
        if(cudaMallocHost(&packedEven,bytes) != cudaSuccess) {
	  errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
	}
        if (cudaMallocHost(&packedOdd, bytes) != cudaSuccess) {
	  errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
	}


        cudaMemcpy(packedEven, cudaOprodEven, bytes, cudaMemcpyDeviceToHost);
        checkCudaError();
        cudaMemcpy(packedOdd, cudaOprodOdd, bytes, cudaMemcpyDeviceToHost);
        checkCudaError();

        // Unpack both parities back into the CPU layout.
        unpackOprodField((float*)cpuOprod, packedEven, 0, Vh);
        unpackOprodField((float*)cpuOprod, packedOdd,  1, Vh);

        cudaFreeHost(packedEven);
        cudaFreeHost(packedOdd);
      }
Beispiel #28
0
  // Destroys the texture objects created for this field. Texture objects
  // only exist for FLOAT2/FLOAT4 orders, so other orders are a no-op; the
  // norm textures only exist for half precision.
  void cudaCloverField::destroyTexObject() {
    const bool hasTexObjects =
        (order == QUDA_FLOAT2_CLOVER_ORDER || order == QUDA_FLOAT4_CLOVER_ORDER);
    if (!hasTexObjects) return;

    cudaDestroyTextureObject(evenTex);
    cudaDestroyTextureObject(oddTex);
    cudaDestroyTextureObject(evenInvTex);
    cudaDestroyTextureObject(oddInvTex);

    if (precision == QUDA_HALF_PRECISION) {
      cudaDestroyTextureObject(evenNormTex);
      cudaDestroyTextureObject(oddNormTex);
      cudaDestroyTextureObject(evenInvNormTex);
      cudaDestroyTextureObject(oddInvNormTex);
    }

    checkCudaError();
  }
Beispiel #29
0
  // Destructor: tears down texture objects (when built with texture-object
  // support) and frees the device buffers. When the direct and inverse
  // terms alias the same storage (clover == cloverInv), the direct-term
  // pointers are skipped so nothing is freed twice.
  cudaCloverField::~cudaCloverField()
  {
#ifdef USE_TEXTURE_OBJECTS
    destroyTexObject();
#endif

    if (clover != cloverInv) {
      if (clover) device_free(clover);
      if (norm) device_free(norm);
    }
    if (cloverInv) device_free(cloverInv);
    if (invNorm) device_free(invNorm);
    
    checkCudaError();
  }
Beispiel #30
0
  // Destructor: tears down texture objects (when built with texture-object
  // support) and frees the device buffers — but only when this field owns
  // them (reference-created fields wrap storage owned elsewhere). When the
  // direct and inverse terms alias the same storage (clover == cloverInv),
  // the direct-term pointers are skipped so nothing is freed twice.
  cudaCloverField::~cudaCloverField()
  {
#ifdef USE_TEXTURE_OBJECTS
    destroyTexObject();
#endif

    if (create != QUDA_REFERENCE_FIELD_CREATE) {
      if (clover != cloverInv) {
	if (clover) device_free(clover);
	if (norm) device_free(norm);
      }
      if (cloverInv) device_free(cloverInv);
      if (invNorm) device_free(invNorm);
    }
    
    checkCudaError();
  }