Example #1
void Mesh::NEListToArray() {
    std::vector< std::set<size_t> >::const_iterator vec_it;
    std::set<size_t>::const_iterator set_it;
    size_t offset = 0;
    size_t index = 0;

    for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
      offset += vec_it->size();

    NEListArray_size = offset;
    
    cuda_check(cudaHostAlloc((void **)&NEListIndex_pinned, sizeof(size_t) * (NNodes+1), cudaHostAllocPortable));
    cuda_check(cudaHostAlloc((void **)&NEListArray_pinned, sizeof(size_t) * NEListArray_size, cudaHostAllocPortable));

    offset = 0;

    for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
    {
      NEListIndex_pinned[index++] = offset;

      for(set_it = vec_it->begin(); set_it != vec_it->end(); set_it++)
        NEListArray_pinned[offset++] = *set_it;
    }

    assert(index == NEList.size());
    NEListIndex_pinned[index] = offset;
}
Example #2
void GRTRegenerateChains::AllocatePerGPUMemory(GRTRegenerateThreadRunData *data) {
    // Malloc space on the GPU for everything.
    cudaError_t err;
    
    // Default flags are for memory on device and copying things.
    unsigned int flags = 0;

    // If we are using zero copy, set the zero copy flag.
    if (this->CommandLineData->GetUseZeroCopy()) {
        flags = cudaHostAllocMapped;
    }
    // Malloc device hash space.
    CH_CUDA_SAFE_CALL(cudaMalloc((void **)&this->DEVICE_Hashes[data->threadID],
        this->NumberOfHashes * this->HashLengthBytes * sizeof(unsigned char)));
    CH_CUDA_SAFE_CALL(cudaMemcpy(this->DEVICE_Hashes[data->threadID], this->HashList,
        this->NumberOfHashes * this->HashLengthBytes * sizeof(unsigned char), cudaMemcpyHostToDevice));

    //this->HOST_Success[data->threadID] = new unsigned char [this->NumberOfHashes * sizeof(unsigned char)];
    cudaHostAlloc((void **)&this->HOST_Success[data->threadID],
        this->NumberOfHashes * sizeof(unsigned char), flags);
    memset(this->HOST_Success[data->threadID], 0, this->NumberOfHashes * sizeof(unsigned char));
    this->HOST_Success_Reported[data->threadID] = new unsigned char [this->NumberOfHashes * sizeof(unsigned char)];
    memset(this->HOST_Success_Reported[data->threadID], 0, this->NumberOfHashes * sizeof(unsigned char));

    // If zero copy is in use, get the device pointer
    if (this->CommandLineData->GetUseZeroCopy()) {
        cudaHostGetDevicePointer((void **)&this->DEVICE_Success[data->threadID],
            this->HOST_Success[data->threadID], 0);
    } else {
        CH_CUDA_SAFE_CALL(cudaMalloc((void **)&this->DEVICE_Success[data->threadID],
            this->NumberOfHashes * sizeof(unsigned char)));
        CH_CUDA_SAFE_CALL(cudaMemset(this->DEVICE_Success[data->threadID], 0,
            this->NumberOfHashes * sizeof(unsigned char)));
    }

    //this->HOST_Passwords[data->threadID] = new unsigned char[MAX_PASSWORD_LEN * this->NumberOfHashes * sizeof(unsigned char)];
    cudaHostAlloc((void **)&this->HOST_Passwords[data->threadID],
        MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char), flags);
    memset(this->HOST_Passwords[data->threadID], 0, MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char));

    if (this->CommandLineData->GetUseZeroCopy()) {
        cudaHostGetDevicePointer((void **)&this->DEVICE_Passwords[data->threadID],
            this->HOST_Passwords[data->threadID], 0);
    } else {
        CH_CUDA_SAFE_CALL(cudaMalloc((void **)&this->DEVICE_Passwords[data->threadID],
            MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char)));
        CH_CUDA_SAFE_CALL(cudaMemset(this->DEVICE_Passwords[data->threadID], 0,
            MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char)));
    }

    
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Thread %d: CUDA error 5: %s. Exiting.\n",
                data->threadID, cudaGetErrorString( err));
        return;
    }
    //printf("Memory for thread %d allocated.\n", data->threadID);
}
Example #3
TEST(HostAlloc, NullArguments) {
    cudaError_t ret;

    ret = cudaHostAlloc(NULL, 0, 0);
    EXPECT_EQ(cudaErrorInvalidValue, ret);

    ret = cudaHostAlloc(NULL, 4, 0);
    EXPECT_EQ(cudaErrorInvalidValue, ret);

    ret = cudaFreeHost(NULL);
    EXPECT_EQ(cudaSuccess, ret);
}
Example #4
void alloc_data_host(DataArray* data_arr) {
    // when allocating data we don't know which pointer will be returned, so we pass a pointer to a pointer
    // data_arr->data_r has type double complex**
    // dereferencing it, *(data_arr->data_r), gives what is stored at that address: a double complex*
    cudaHostAlloc((void**) data_arr->data_r, sizeof(double complex)*N, cudaHostAllocDefault); // pinned memory <- check here for cudaMallocHost (could be faster)
    cudaHostAlloc((void**) data_arr->data_k, sizeof(double complex)*N, cudaHostAllocDefault); // pinned memory

    // in case of pageable memory (slower/not asynchronous):
    //*(data_arr->data_r) = (double complex*) malloc( (size_t) sizeof(double complex)*data_arr->size );
    //*(data_arr->data_k) = (double complex*) malloc( (size_t) sizeof(double complex)*data_arr->size );

}
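The example stops at allocation. A minimal release sketch, assuming the DataArray layout described in the comments above (data_r and data_k are double complex**); the name free_data_host is illustrative:

// Hypothetical counterpart to alloc_data_host(): the pointers written by cudaHostAlloc
// live at *(data_arr->data_r) and *(data_arr->data_k), and pinned memory is released
// with cudaFreeHost rather than free().
void free_data_host(DataArray* data_arr) {
    cudaFreeHost(*(data_arr->data_r));
    cudaFreeHost(*(data_arr->data_k));
}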
Example #5
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
Example #6
Args3D<T>::Args3D(int _width, int _height, int _depth){
	width = _width;
	height = _height;
	depth = _depth;
	gpuErrchk( cudaHostAlloc((void **) &hostArray, width*height*depth*sizeof(T), cudaHostAllocWriteCombined | cudaHostAllocMapped) );
	gpuErrchk( cudaHostGetDevicePointer(&deviceArray, hostArray, 0) );
}
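No destructor appears in this excerpt. A hedged cleanup sketch, assuming hostArray is the only owned allocation (deviceArray is just the mapped alias from cudaHostGetDevicePointer and needs no separate cudaFree):

// Hypothetical destructor for Args3D: release the pinned, mapped host buffer.
template <typename T>
Args3D<T>::~Args3D(){
	gpuErrchk( cudaFreeHost(hostArray) );
}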
Example #7
Waifu2x::eWaifu2xError Waifu2x::ProcessNet(std::shared_ptr<cNet> net, const int crop_w, const int crop_h, const bool use_tta, const int batch_size, cv::Mat &im)
{
	Waifu2x::eWaifu2xError ret;

	CudaDeviceSet devset(mProcess, mGPUNo);

	const auto OutputMemorySize = net->GetOutputMemorySize(crop_w, crop_h, OuterPadding, batch_size);
	if (OutputMemorySize > mOutputBlockSize)
	{
		if (mIsCuda)
		{
			CUDA_HOST_SAFE_FREE(mOutputBlock);
			CUDA_CHECK_WAIFU2X(cudaHostAlloc(&mOutputBlock, OutputMemorySize, cudaHostAllocDefault));
		}
		else
		{
			SAFE_DELETE_WAIFU2X(mOutputBlock);
			mOutputBlock = new float[OutputMemorySize];
		}

		mOutputBlockSize = OutputMemorySize;
	}

	ret = net->ReconstructImage(use_tta, crop_w, crop_h, OuterPadding, batch_size, mOutputBlock, im, im);
	if (ret != Waifu2x::eWaifu2xError_OK)
		return ret;

	return Waifu2x::eWaifu2xError_OK;
}
Example #8
Args<T>::Args(int _width){
	width = _width;
	gpuErrchk( cudaDeviceReset() );
	gpuErrchk( cudaSetDeviceFlags(cudaDeviceMapHost) );
	gpuErrchk( cudaHostAlloc((void **) &hostArray, width*sizeof(T), cudaHostAllocWriteCombined | cudaHostAllocMapped) );
	gpuErrchk( cudaHostGetDevicePointer(&deviceArray, hostArray, 0) );
}
Example #9
TEST(HostAlloc, MappedPointer) {
    cudaError_t ret;
    int device;

    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    void * ptr;
    ret = cudaHostAlloc(&ptr, 4, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    /*
     * Try to retrieve the device pointer, expecting a result according to
     * prop.canMapHostMemory.
     */
    void * device_ptr;
    ret = cudaHostGetDevicePointer(&device_ptr, ptr, 0);
    if (prop.canMapHostMemory) {
        EXPECT_EQ(cudaSuccess, ret);
        EXPECT_FALSE(device_ptr == NULL);
    } else {
        EXPECT_EQ(cudaErrorMemoryAllocation, ret);
    }

    ret = cudaFreeHost(ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
Example #10
void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
{
  void * ptr = NULL;

  CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) );

  return ptr ;
}
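The matching deallocation is not shown. A plausible counterpart under the same conventions (signature assumed, not taken from the source), releasing the pinned block with cudaFreeHost:

void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
  // memory obtained via cudaHostAlloc must be returned with cudaFreeHost
  CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
}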
Example #11
static void *alloc_pinned_mem(int size)
{
	void *ret;

	checkCudaErrors(cudaHostAlloc(&ret, size, cudaHostAllocPortable));

	return ret;
}
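A hedged counterpart for releasing the buffer, using the same checkCudaErrors helper:

static void free_pinned_mem(void *ptr)
{
	// pinned (page-locked) memory is released with cudaFreeHost
	checkCudaErrors(cudaFreeHost(ptr));
}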
Example #12
void* cuda_hostalloc(long N)
{
	void* ptr;
	if (cudaSuccess != cudaHostAlloc(&ptr, N, cudaHostAllocDefault))
		abort();

	insert(ptr, N, false);
	return ptr;
}
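The release path is not included in this excerpt. A minimal sketch; the bookkeeping that would undo insert() is left as a hypothetical helper in a comment:

void cuda_hostfree(void* ptr)
{
	// remove_entry(ptr);  // hypothetical: undo the insert() bookkeeping done at allocation
	if (cudaSuccess != cudaFreeHost(ptr))
		abort();
}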
Example #13
void* CUDAPageLockedMemAllocator::Malloc(size_t size, int deviceId)
{
    void* p;
    cudaSetDevice(deviceId);

    // Note: I ask for cudaHostAllocDefault but cudaHostGetFlags() shows that it is allocated as 'cudaHostAllocMapped'
    cudaHostAlloc(&p, size, cudaHostAllocDefault) || "Malloc in CUDAPageLockedMemAllocator failed";

    return p;
}
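A plausible Free counterpart in the same style, assuming the operator|| error idiom used above is available (the signature is illustrative, not confirmed from the source):

void CUDAPageLockedMemAllocator::Free(void* p, int deviceId)
{
    cudaSetDevice(deviceId);

    // page-locked memory from cudaHostAlloc is released with cudaFreeHost
    cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
}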
Example #14
void Mesh::pin_data() {
  NNListToArray();
  NEListToArray();

  size_t ENList_bytes = sizeof(size_t) * ENList.size();
  size_t coords_bytes = sizeof(float) * coords.size();
  size_t metric_bytes = sizeof(float) * metric.size();
  size_t normal_bytes = sizeof(float) * normals.size();
  
  cuda_check(cudaHostAlloc((void **)&ENList_pinned, ENList_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&coords_pinned, coords_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&metric_pinned, metric_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&normals_pinned, normal_bytes, cudaHostAllocPortable));
  
  memcpy(ENList_pinned, &ENList[0], ENList_bytes);
  memcpy(coords_pinned, &coords[0], coords_bytes);
  memcpy(metric_pinned, &metric[0], metric_bytes);
  memcpy(normals_pinned, &normals[0], normal_bytes);
}
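pin_data() has no visible counterpart in this excerpt. A hedged unpin sketch using the same cuda_check wrapper:

// Hypothetical cleanup for the pinned staging copies created in pin_data().
void Mesh::unpin_data() {
  cuda_check(cudaFreeHost(ENList_pinned));
  cuda_check(cudaFreeHost(coords_pinned));
  cuda_check(cudaFreeHost(metric_pinned));
  cuda_check(cudaFreeHost(normals_pinned));
}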
Example #15
    /** constructor
     *
     * @param size extent for each dimension (in elements)
     */
    MappedBufferIntern(DataSpace<DIM> size):
    DeviceBuffer<TYPE, DIM>(size, size),
    pointer(nullptr), ownPointer(true)
    {
#if( PMACC_CUDA_ENABLED == 1 )
        CUDA_CHECK((cuplaError_t)cudaHostAlloc(&pointer, size.productOfComponents() * sizeof (TYPE), cudaHostAllocMapped));
#else
        pointer = new TYPE[size.productOfComponents()];
#endif
        reset(false);
    }
Example #16
    void *hostAlloc(size_t size, unsigned int flags )
    {
        void *ptr;
        cudaError_t cudaError = cudaHostAlloc(&ptr, size, flags);
        
        if(cudaError != cudaSuccess) {
            throw cudaError;
        }

        return ptr;
    }
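A hedged companion that frees what hostAlloc() returned, mirroring its exception-on-error style (the name freeHost is illustrative):

    void freeHost(void *ptr)
    {
        cudaError_t cudaError = cudaFreeHost(ptr);

        if(cudaError != cudaSuccess) {
            throw cudaError;
        }
    }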
Example #17
TEST(HostAlloc, MallocFree) {
    cudaError_t ret;
    int * ptr;

    ret = cudaHostAlloc((void **) &ptr, sizeof(*ptr), 0);
    ASSERT_EQ(cudaSuccess, ret);

    ASSERT_FALSE(NULL == ptr);
    *ptr = 0;

    ret = cudaFreeHost(ptr);
    EXPECT_EQ(cudaSuccess, ret);
}
Example #18
   inline void* allocate(size_t num_bytes)
   {
     void* result = nullptr;
 
     cudaError_t error = cudaHostAlloc(&result, num_bytes, cudaHostAllocPortable);
 
     if(error != cudaSuccess)
     {
       throw thrust::system_error(error, thrust::cuda_category(), "pinned_resource::allocate(): cudaHostAlloc");
     }
 
     return result;
   }
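The matching deallocate is not shown. A sketch in the same style, assuming cudaFreeHost is the right release call for memory handed out by this resource:

   inline void deallocate(void* ptr, size_t /* num_bytes */)
   {
     cudaError_t error = cudaFreeHost(ptr);

     if(error != cudaSuccess)
     {
       throw thrust::system_error(error, thrust::cuda_category(), "pinned_resource::deallocate(): cudaFreeHost");
     }
   }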
Example #19
void init()
{
	
	// Allocate memory - local (host)
	//a_h = (float *)malloc(N*sizeof(float));
	//b_h = (float *)malloc(N*sizeof(float));
	//r_h = (float *)malloc(N*sizeof(float));
	
	int size = N*sizeof(float);

	cudaHostAlloc((void **)&a_h, size, cudaHostAllocMapped);
	checkCUDAError("cudaHostAllocMapped");
	cudaHostAlloc((void **)&b_h, size, cudaHostAllocMapped);
	checkCUDAError("cudaHostAllocMapped");
	cudaHostAlloc((void **)&r_h, size, cudaHostAllocMapped);
	checkCUDAError("cudaHostAllocMapped");

	// Allocate memory - CUDA
	//cutilSafeCall(cudaMalloc((void **) &a_d, N*sizeof(float)));
	//cutilSafeCall(cudaMalloc((void **) &b_d, N*sizeof(float)));
	//cutilSafeCall(cudaMalloc((void **) &r_d, N*sizeof(float)));
	
	cudaHostGetDevicePointer((void **)&a_d, (void *)a_h, 0);
	checkCUDAError("cudaHostGetDevicePointer");
	cudaHostGetDevicePointer((void **)&b_d, (void *)b_h, 0);
	checkCUDAError("cudaHostGetDevicePointer");
	cudaHostGetDevicePointer((void **)&r_d, (void *)r_h, 0);
	checkCUDAError("cudaHostGetDevicePointer");
	
	control = (float *)malloc(N*sizeof(float));

	// Initialize the vectors
	for(int i=0;i<N;i++)
	{
		a_h[i] = (float)(i % 13)+1;
		b_h[i] = (float)(i % 3)+1;
	}
}
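init() leaves cleanup to the caller. A hedged sketch; the device pointers a_d, b_d and r_d are only mapped aliases of the host buffers, so nothing is cudaFree'd separately:

void cleanup()
{
	cudaFreeHost(a_h);
	checkCUDAError("cudaFreeHost");
	cudaFreeHost(b_h);
	checkCUDAError("cudaFreeHost");
	cudaFreeHost(r_h);
	checkCUDAError("cudaFreeHost");

	free(control);
}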
Example #20
TEST(HostAlloc, FlagRetrieval) {
    cudaError_t ret;
    void * ptrs[8];
    unsigned int flags[8];

    int device;

    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        unsigned int flag = cudaHostAllocDefault;

        if (i & 0x1) {
            flag |= cudaHostAllocPortable;
        }

        if (i & 0x2) {
            flag |= cudaHostAllocMapped;
        }

        if (i & 0x4) {
            flag |= cudaHostAllocWriteCombined;
        }

        ret = cudaHostAlloc(&ptrs[i], 4, flag);
        ASSERT_EQ(cudaSuccess, ret);

        flags[i] = flag;
    }

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        unsigned int flag;
        ret = cudaHostGetFlags(&flag, ptrs[i]);
        ASSERT_EQ(cudaSuccess, ret);

        const unsigned int expected = flags[i] |
                                      (prop.canMapHostMemory ? cudaHostAllocMapped : 0);
        EXPECT_EQ(expected, flag);
    }

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        ret = cudaFreeHost(ptrs[i]);
        EXPECT_EQ(cudaSuccess, ret);
    }
}
Example #21
// Allocate ZeroCopy mapped memory, shared between CUDA and CPU.
bool GIEFeatExtractor::cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
	if( !cpuPtr || !gpuPtr || size == 0 )
		return false;

	//CUDA(cudaSetDeviceFlags(cudaDeviceMapHost));

	if( CUDA_FAILED(cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped)) )
		return false;

	if( CUDA_FAILED(cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0)) )
		return false;

	memset(*cpuPtr, 0, size);
	std::cout << "cudaAllocMapped : " << size << " bytes" << std::endl;
	return true;
}
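Releasing such a buffer only touches the host side, since the device pointer aliases the same pinned allocation. A hedged helper sketch (the method name is illustrative and not part of GIEFeatExtractor):

bool GIEFeatExtractor::cudaFreeMapped( void* cpuPtr )
{
	if( !cpuPtr )
		return false;

	// frees both views: the GPU pointer from cudaHostGetDevicePointer maps the same memory
	if( CUDA_FAILED(cudaFreeHost(cpuPtr)) )
		return false;

	return true;
}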
Example #22
int main() {

// Height of the capillary interface for the grid
CpuPtr_2D height_distribution(nx, ny, 0, true);
computeRandomHeights(0, H, height_distribution);

// Density difference between brine and CO2
float delta_rho = 500;
// Gravitational acceleration
float g = 9.87;
// Non-dimensional constant that scales the strength of the capillary forces
float c_cap = 1.0/6.0;
// Permeability data (In real simulations this will be a table based on rock data, here we use a random distribution )
float k_data[10] = {0.9352, 1.0444, 0.9947, 0.9305, 0.9682, 1.0215, 0.9383, 1.0477, 0.9486, 1.0835};
float k_heights[10] = {10, 20, 25, 100, 155, 193, 205, 245, 267, 300};

//Inside Kernel
// Converting the permeability data into a table of even subintervals in the z-directions
//float k_values[n+1];
//kDistribution(dz, h, k_heights, k_data, k_values);

// MOBILITY
// The mobility is a function of the saturation, which is directly related to the capillary pressure
// Pressure at capillary interface, which is known
float p_ci = 1;
// Table of capillary pressure values for our subintervals along the z-axis ranging from 0 to h
float resolution = 0.01;
int size = 1/resolution + 1;
float p_cap_ref_table[size];
float s_b_ref_table[size];
createReferenceTable(g, H, delta_rho, c_cap, resolution, p_cap_ref_table, s_b_ref_table);

// Set block and grid sizes and initialize gpu pointer
dim3 grid;
dim3 block;
computeGridBlock(grid, block, nx, ny, block_x, block_y);

// Allocate and set data on the GPU
GpuPtr_2D Lambda_device(nx, ny, 0, NULL);
GpuPtr_1D k_data_device(10, k_data);
GpuPtr_1D k_heights_device(10, k_heights);
GpuPtr_1D p_cap_ref_table_device(size, p_cap_ref_table);
GpuPtr_1D s_b_ref_table_device(size, s_b_ref_table);

CoarsePermIntegrationKernelArgs* args;
cudaHostAlloc(&args, sizeof(CoarsePermIntegrationKernelArgs), cudaHostAllocWriteCombined);

// Set arguments and run coarse integration kernel
CoarsePermIntegrationArgs coarse_perm_int_args;
setCoarsePermIntegrationArgs(coarse_perm_int_args,\
							Lambda_device.getRawPtr(),\
							k_data_device.getRawPtr(),\
							k_heights_device.getRawPtr(),\
							p_cap_ref_table_device.getRawPtr(),\
							s_b_ref_table_device.getRawPtr(),\
							nx, ny, 0);
callCoarseIntegrationKernel(grid, block, coarse_perm_int_args);

float p_cap_values[n+1];
computeCapillaryPressure(p_ci, g, delta_rho, h, dz, n, p_cap_values);
float s_b_values[n+1];
inverseCapillaryPressure(n, g, h, delta_rho, c_cap, p_cap_values, s_b_values);
printArray(n+1, s_b_values);
// End point mobility lambda'_b, a known quantity
float lambda_end_point = 1;
float lambda_values[n+1];
computeMobility(n, s_b_values, lambda_end_point, lambda_values);

// Multiply permeability values with lambda values
float f_values[n+1];
multiply(n+1, lambda_values, k_values, f_values);

//Numerical integral with trapezoidal
float K = trapezoidal(dz, n, k_values);
float L = trapezoidal(dz, n, f_values)/K;
printf("Value of integral K. %.4f", K);
printf("Value of integral L. %.4f", L);

}
Example #23
magma_int_t
magma_d_initP2P ( magma_int_t *bw_bmark, magma_int_t *num_gpus ){


    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
     (cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("Two or more Tesla(s) with (SM 2.0)"
                        " class GPUs are required for P2P.\n");
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid_tesla[64]; // find the first two GPU's that can support P2P
    int gpu_count = 0;   // GPUs that meet the criteria

    for (int i=0; i < gpu_n; i++) {
         (cudaGetDeviceProperties(&prop[i], i));
        // Only Tesla boards based on Fermi can support P2P
        {
            // This is an array of P2P capable GPUs
            gpuid_tesla[gpu_count++] = i;
        }
    }
    *num_gpus=gpu_n;

     for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
  // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");
    int can_access_peer_0_1, can_access_peer_1_0;
    // In this case we just pick the first two that we can support
     (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], 
                                                        gpuid_tesla[j]));
     (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], 
                                                        gpuid_tesla[i]));


    // Output results from P2P capabilities
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],                                                                  
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j] ,
                             can_access_peer_0_1 ? "Yes" : "No");
    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", 
                        prop[gpuid_tesla[j]].name, gpuid_tesla[j],
                        prop[gpuid_tesla[i]].name, gpuid_tesla[i],
                            can_access_peer_1_0 ? "Yes" : "No");

    if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
    {
        printf("Two or more Tesla(s) with class"
                " GPUs are required for P2P to run.\n");
        printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n");
        printf("Peer to Peer access is not available between"
        " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]);
        printf("PASSED\n");
        //exit(EXIT_SUCCESS);
    }
     }    
    }

  // Enable peer access
     for(int i=0; i<gpu_n; i++)
     {
         for(int j=i+1; j<gpu_n; j++)
         {
             printf("Enabling peer access between GPU%d and GPU%d...\n",
                gpuid_tesla[i], gpuid_tesla[j]);
              (cudaSetDevice(gpuid_tesla[i]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0));
              (cudaSetDevice(gpuid_tesla[j]));
              (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0));
           magma_dcheckerr("P2P");
         }
     }

   magma_dcheckerr("P2P successful");


    // Enable peer access
    for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Check that we got UVA on both devices
    printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 
    gpuid_tesla[i], gpuid_tesla[j]);
    //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && 
    //                            prop[gpuid_tesla[j]].unifiedAddressing);

    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, 
    gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") );
    printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, 
    gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? "Yes" : "No") );

        }
    }



  if(*bw_bmark==1){


    // P2P memcopy() benchmark
   for(int i=0; i<gpu_n; i++)
    {
        for(int j=i+1; j<gpu_n; j++)
        {
    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", 
                int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]);
    (cudaSetDevice(gpuid_tesla[i]));
    float* g0;
    (cudaMalloc(&g0, buf_size));
    (cudaSetDevice(gpuid_tesla[j]));
    float* g1;
    (cudaMalloc(&g1, buf_size));
    float* h0;
    (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    (cudaEventCreateWithFlags(&start_event, eventflags));
    (cudaEventCreateWithFlags(&stop_event, eventflags));


    (cudaEventRecord(start_event, 0));
    for (int k=0; k<100; k++)
    {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
            
        // Ping-pong copy between GPUs
        if (k % 2 == 0)
            (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        else
            (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
    }
    (cudaEventRecord(stop_event, 0));
    (cudaEventSynchronize(stop_event));
    (cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between"
            "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j],
        (1.0f / (time_memcpy / 1000.0f)) 
            * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);


     // Cleanup and shutdown
    printf("Cleanup of P2P benchmark...\n");
    (cudaEventDestroy(start_event));
    (cudaEventDestroy(stop_event));
    (cudaSetDevice(gpuid_tesla[i]));
    (magma_free( g0) );
    (cudaSetDevice(gpuid_tesla[j]));
    (magma_free( g1) );
    (cudaFreeHost( h0 ));  // h0 was allocated with cudaMallocHost, so release it with cudaFreeHost

    }
    }

    // host-device memcopy() benchmark

        for(int j=0; j<gpu_n; j++)
        {
    cudaSetDevice(gpuid_tesla[j]);

    int *h_data_source;
    int *h_data_sink;

    int *h_data_in[STREAM_COUNT];
    int *d_data_in[STREAM_COUNT];

    int *h_data_out[STREAM_COUNT];
    int *d_data_out[STREAM_COUNT];


    cudaEvent_t cycleDone[STREAM_COUNT];
    cudaStream_t stream[STREAM_COUNT];

    cudaEvent_t start, stop;

    // Allocate resources
    int memsize;
    memsize = 1000000 * sizeof(int);

    h_data_source = (int*) malloc(memsize);
    h_data_sink = (int*) malloc(memsize);    

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        ( cudaHostAlloc(&h_data_in[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_in[i], memsize) );
        
        ( cudaHostAlloc(&h_data_out[i], memsize, 
            cudaHostAllocDefault) );
        ( cudaMalloc(&d_data_out[i], memsize) );

        
        ( cudaStreamCreate(&stream[i]) );
        ( cudaEventCreate(&cycleDone[i]) ); 
        
        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start); cudaEventCreate(&stop);

    

    // Time host-device copies
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, 
        cudaMemcpyHostToDevice,0) );
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_h2d_time;    
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    
    cudaEventRecord(start,0);
    ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, 
        cudaMemcpyDeviceToHost, 0) );        
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    
    float memcpy_d2h_time;    
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);
    
    cudaEventSynchronize(stop);
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j,
        memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time );
    printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j,
        memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time);

    // Free resources

    free( h_data_source );
    free( h_data_sink );

    for( int i =0; i<STREAM_COUNT; ++i ) {
        
        cudaFreeHost( h_data_in[i] );   // allocated with cudaHostAlloc, so release with cudaFreeHost
        magma_free( d_data_in[i] );

        cudaFreeHost( h_data_out[i] );  // allocated with cudaHostAlloc, so release with cudaFreeHost
        magma_free( d_data_out[i] );
        
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);        
    }
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    
   }

  }//end if-loop bandwidth_benchmark

    magma_dcheckerr("P2P established");

    return MAGMA_SUCCESS;

}
Example #24
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
	int Nx, Ny, Nz, max_iters;
	int blockX, blockY, blockZ;

	if (argc == 8) {
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", 
			argv[0]);
		exit(1);
	}

	// Get the number of GPUS
	int number_of_devices;
	checkCuda(cudaGetDeviceCount(&number_of_devices));
  
  if (number_of_devices < 2) {
  	printf("Less than two devices were found.\n");
  	printf("Exiting...\n");

  	return -1;
  }

	// Decompose along the Z-axis
	int _Nz = Nz/number_of_devices;

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Check if ECC is turned on
	ECCCheck(number_of_devices);

	// Set the number of OpenMP threads
	omp_set_num_threads(number_of_devices);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_num_threads();

		#pragma omp single
		{
			printf("Number of OpenMP threads: %d\n", tid);
		}
	}

  // CPU memory operations
  int dt_size = sizeof(_DOUBLE_);

	_DOUBLE_ *u_new, *u_old;

	u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(u_old, u_new, h, Nx, Ny, Nz);

	// Allocate and generate arrays on the host
	size_t pitch_bytes;
	size_t pitch_gc_bytes;

	_DOUBLE_ *h_Unew, *h_Uold;
	_DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
	_DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
	_DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

	h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(h_Uold, h_Unew, h, Nx, Ny, Nz);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

		init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// GPU memory operations
	_DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
	_DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
	_DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));

		CopyToConstantMemory(c0, c1);

		checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	}

	// Copy data from host to the device
	double HtD_timer = 0.;
	HtD_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
		checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
	}
	HtD_timer += omp_get_wtime();

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	double compute_timer = 0.;
  compute_timer -= omp_get_wtime();

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		for(int iterations = 0; iterations < max_iters; iterations++)
		{
			// Compute inner nodes
			checkCuda(cudaSetDevice(tid));
			ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

			// Copy right boundary data to host
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
				checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}
			// Copy left boundary data to host
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
				checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}

			#pragma omp barrier

			// Copy right boundary data to device 1
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				
				checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
			}

			// Copy left boundary data to device 0
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));

				checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
			}

			// Swap pointers on the host
			#pragma omp barrier
			checkCuda(cudaSetDevice(tid));
			checkCuda(cudaDeviceSynchronize());
			swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
		}
	}

	compute_timer += omp_get_wtime();

  // Copy data from device to host
	double DtH_timer = 0;
  DtH_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
	}
	DtH_timer += omp_get_wtime();

	// Merge sub-domains into a one big domain
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

   	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

    float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
    PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

    _DOUBLE_ t = max_iters * dt;
    CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaFree(d_s_Unews[tid]));
    checkCuda(cudaFree(d_s_Uolds[tid]));
    checkCuda(cudaFree(d_right_send_buffer[tid]));
    checkCuda(cudaFree(d_left_send_buffer[tid]));
    checkCuda(cudaFree(d_right_receive_buffer[tid]));
    checkCuda(cudaFree(d_left_receive_buffer[tid]));
    checkCuda(cudaFreeHost(h_s_Unews[tid]));
    checkCuda(cudaFreeHost(h_s_Uolds[tid]));
    checkCuda(cudaFreeHost(left_send_buffer[tid]));
    checkCuda(cudaFreeHost(right_send_buffer[tid]));
    checkCuda(cudaFreeHost(left_receive_buffer[tid]));
    checkCuda(cudaFreeHost(right_receive_buffer[tid]));
    checkCuda(cudaDeviceReset());
  }

  free(u_old);
  free(u_new);

	return 0;
}
Example #25
extern "C" double gsum(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) 
{
	DBUG_ENTER("udf_sum::gsum");

	double* h_in;
	double* h_out;
	double* d_in;
	double* d_out;
	unsigned int index = 0;

	cudaHostAlloc((void**)&h_in, MAXIMUM_ELEMENTS_IN_CACHE*sizeof(double), cudaHostAllocWriteCombined | cudaHostAllocMapped );
	CUDA_CHECK_ERRORS("cudaHostAlloc -> h_in");
	cudaHostAlloc((void**)&h_out, CUDA_BLOCK_SIZE*sizeof(double), cudaHostAllocWriteCombined | cudaHostAllocMapped );
	CUDA_CHECK_ERRORS("cudaHostAlloc -> h_out");

	cudaHostGetDevicePointer((void**)&d_in, h_in, 0);
	cudaHostGetDevicePointer((void**)&d_out, h_out, 0);


	char* column_name = (char*) args->args[0];
	char* table_name = (char*) args->args[1];
	char* schema_name = (char*) args->args[2];

	DBUG_PRINT("info", ("column_name [%s], table_name [%s], schema_name [%s]", column_name, table_name, schema_name));
	fprintf(stderr, "column_name [%s], table_name [%s], schema_name [%s]\n", column_name, table_name, schema_name);
	fflush(stderr);

	THD *thd = current_thd;

	TABLE_LIST* table_list = new TABLE_LIST;	
	memset((char*) table_list, 0, sizeof(TABLE_LIST));

	DBUG_PRINT("info", ("table_list->init_one_table"));
	table_list->init_one_table(schema_name, strlen(schema_name), table_name, strlen(table_name), table_name, TL_READ);
	DBUG_PRINT("info", ("open_and_lock_tables"));
	open_and_lock_tables(thd, table_list, FALSE, MYSQL_OPEN_IGNORE_FLUSH | MYSQL_LOCK_IGNORE_TIMEOUT);

	TABLE* table = table_list->table;

	clock_t cpu_clock;
	cpu_clock = clock();
	table->file->ha_rnd_init(true);

	while (table->file->ha_rnd_next(table->record[0]) == 0){
		h_in[index++] = table->field[1]->val_real();
	}
	table->file->ha_rnd_end();
	cpu_clock = clock() - cpu_clock;
	fprintf(stderr, "gsum -> index [%d]\n", index);
	fprintf(stderr, "gsum -> fill cache within [%f seconds]\n", ((float)cpu_clock)/CLOCKS_PER_SEC);
	fflush(stderr);
	

	cudaEvent_t start, stop;
	float elapsedTime;

	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	cudaEventRecord(start, 0);
	ReductionTask reduction_task(MAXIMUM_ELEMENTS_IN_CACHE, sizeof(double), CUDA_BLOCK_SIZE, CUDA_THREAD_PER_BLOCK_SIZE, R_SUM, R_DOUBLE);
	reductionWorkerUsingMappedMemory<double>(d_in, d_out, &reduction_task);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);

	double gpu_sum = 0;
	for (unsigned int i = 0; i < CUDA_BLOCK_SIZE; i++)
	{
		gpu_sum += ((double*)h_out)[i];
	}

	float bandwidthInMBs = (1e3f * MAXIMUM_ELEMENTS_IN_CACHE*sizeof(double)) / (elapsedTime * (float)(1 << 20));
	fprintf(stderr, "gpu result [%f], gpu time [%f seconds] bandwidth (mb) [%f]\n", gpu_sum, elapsedTime/1000.0, bandwidthInMBs);
	fflush(stderr);

	cudaFreeHost(h_in);
	CUDA_CHECK_ERRORS("cudaFreeHost -> h_in");
	cudaFreeHost(h_out);
	CUDA_CHECK_ERRORS("cudaFreeHost -> h_out");

	thd->cleanup_after_query();
	DBUG_PRINT("info", ("about to delete table_list"));
	delete table_list;

	DBUG_RETURN(gpu_sum);
}
Example #26
cudaError_t cudaMallocHost(void **pHost, size_t size)
{
	return cudaHostAlloc(pHost, size, cudaHostAllocDefault);
}
Example #27
void MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory()\n");
    /**
     * Error variable - stores the result of the various mallocs & such.
     */
    cudaError_t err, err2;

    /**
     * Flags for calling cudaHostAlloc - will be set to cudaHostAllocMapped
     * if we are mapping memory to the host with zero copy.
     */
    unsigned int cudaHostMallocFlags = 0;

    if (this->useZeroCopy) {
        cudaHostMallocFlags |= cudaHostAllocMapped;
    }


    /*
     * Malloc the device hashlist space.  This is the number of available hashes
     * times the hash length in bytes.  The data will be copied later.
     */
    err = cudaMalloc((void **)&this->DeviceHashlistAddress,
        this->activeHashesProcessed.size() * this->hashLengthBytes);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for device hashlist!  Exiting!\n",
                this->activeHashesProcessed.size() * this->hashLengthBytes);
        printf("return code: %d\n", err);
        exit(1);
    }

    /*
     * Allocate the host/device space for the success list (flags for found passwords).
     * This is a byte per password.  To avoid atomic write issues, each password
     * gets a full addressable byte, and the GPU handles the dependencies between
     * multiple threads trying to set a flag in the same segment of memory.
     *
     * On the host, it will be allocated as mapped memory if we are using zerocopy.
     *
     * As this region of memory is frequently copied back to the host, mapping it
     * improves performance.  In theory.
     */
    err = cudaHostAlloc((void **)&this->HostSuccessAddress,
        this->activeHashesProcessed.size(), cudaHostMallocFlags);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for success flags!  Exiting!\n",
                this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        exit(1);
    }

    // Clear host success flags region - if we are mapping the memory, the GPU
    // will directly write this.
    memset(this->HostSuccessAddress, 0, this->activeHashesProcessed.size());

    // Allocate memory for the reported flags.
    this->HostSuccessReportedAddress = new uint8_t [this->activeHashesProcessed.size()];
    memset(this->HostSuccessReportedAddress, 0, this->activeHashesProcessed.size());

    // If zero copy is in use, get the device pointer for the success data, else
    // malloc a region of memory on the device.
    if (this->useZeroCopy) {
        err = cudaHostGetDevicePointer((void **)&this->DeviceSuccessAddress,
            this->HostSuccessAddress, 0);
        err2 = cudaSuccess;
    } else {
        err = cudaMalloc((void **)&this->DeviceSuccessAddress,
            this->activeHashesProcessed.size());
        err2 = cudaMemset(this->DeviceSuccessAddress, 0,
            this->activeHashesProcessed.size());
    }
    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for device success list!  Exiting!\n",
                this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }

    /*
     * Allocate memory for the found passwords.  As this is commonly copied
     * back and forth, it will be made zero copy if requested.
     *
     * This requires (number hashes * passwordLength) bytes of data.
     */

    err = cudaHostAlloc((void **)&this->HostFoundPasswordsAddress,
        this->passwordLength * this->activeHashesProcessed.size() , cudaHostMallocFlags);
    if (err != cudaSuccess) {
        printf("Unable to allocate %d bytes for host password list!  Exiting!\n",
                this->passwordLength * this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        exit(1);
    }
    // Clear the host found password space.
    memset(this->HostFoundPasswordsAddress, 0,
            this->passwordLength * this->activeHashesProcessed.size());

    if (this->useZeroCopy) {
        err = cudaHostGetDevicePointer((void **)&this->DeviceFoundPasswordsAddress,
            this->HostFoundPasswordsAddress, 0);
        err2 = cudaSuccess;
    } else {
        err = cudaMalloc((void **)&this->DeviceFoundPasswordsAddress,
            this->passwordLength * this->activeHashesProcessed.size());
        err2 = cudaMemset(this->DeviceFoundPasswordsAddress, 0,
            this->passwordLength * this->activeHashesProcessed.size());
    }
    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for device password list!  Exiting!\n",
                this->passwordLength * this->activeHashesProcessed.size());
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }

    /**
     * Allocate space for host and device start positions.  To improve performance,
     * this space is now aligned for improved coalescing performance.  All the
     * position 0 elements are together, followed by all the position 1 elements,
     * etc.
     *
     * This memory can be allocated as write combined, as it is not read by
     * the host ever - only written.  Since it is regularly transferred to the
     * GPU, this should help improve performance.
     */
    err = cudaHostAlloc((void**)&this->HostStartPointAddress,
        this->TotalKernelWidth * this->passwordLength,
        cudaHostAllocWriteCombined | cudaHostMallocFlags);

    err2 = cudaMalloc((void **)&this->DeviceStartPointAddress,
        this->TotalKernelWidth * this->passwordLength);

    if ((err != cudaSuccess) || (err2 != cudaSuccess)) {
        printf("Unable to allocate %d bytes for host/device startpos list!  Exiting!\n",
                this->TotalKernelWidth * this->passwordLength);
        printf("return code: %d\n", err);
        printf("return code: %d\n", err2);
        exit(1);
    }
    
    /**
     * Allocate space for the device start password values.  This is a copy of
     * the MFNHashTypePlain::HostStartPasswords32 vector for the GPU.
     */
    err = cudaMalloc((void **)&this->DeviceStartPasswords32Address,
        this->TotalKernelWidth * this->passwordLengthWords);
    
    if ((err != cudaSuccess)) {
        printf("Unable to allocate %d bytes for host/device startpos list!  Exiting!\n",
                this->TotalKernelWidth * this->passwordLengthWords);
        printf("return code: %d\n", err);
        exit(1);
    }


    /**
     * Finally, attempt to allocate space for the giant device bitmaps.  There
     * are 4x128MB bitmaps, and any number can be allocated.  If they are not
     * fully allocated, their address is set to null as an indicator to the device
     * that there is no data present.  Attempt to allocate as many as possible.
     *
     * This will be accessed regularly, so should probably not be zero copy.
     * Also, I'm not sure how mapping host memory into multiple threads would
     * work.  Typically, if the GPU doesn't have enough RAM for the full
     * set of bitmaps, it's a laptop, and therefore may be short on host RAM
     * for the pinned access.
     *
     * If there is an error in allocation, call cudaGetLastError() to clear it -
     * we know there has been an error, and do not want it to persist.
     */
    err = cudaMalloc((void **)&this->DeviceBitmap128mb_a_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap A\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap A\n");
        this->DeviceBitmap128mb_a_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_b_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap B\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap B\n");
        this->DeviceBitmap128mb_b_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_c_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap C\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap C\n");
        this->DeviceBitmap128mb_c_Address = 0;
        cudaGetLastError();
    }

    err = cudaMalloc((void **)&this->DeviceBitmap128mb_d_Address,
        128 * 1024 * 1024);
    if (err == cudaSuccess) {
        memalloc_printf("Successfully allocated Bitmap D\n");
    } else {
        memalloc_printf("Unable to allocate 128MB bitmap D\n");
        this->DeviceBitmap128mb_d_Address = 0;
        cudaGetLastError();
    }
    //printf("Thread %d memory allocated successfully\n", this->threadId);
}
Example #28
Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &Mode, const int NoiseLevel, const std::string &ModelDir, const std::string &Process,
	const int CropSize, const int BatchSize)
{
	Waifu2x::eWaifu2xError ret;

	if (is_inited)
		return eWaifu2xError_OK;

	try
	{
		mode = Mode;
		noise_level = NoiseLevel;
		model_dir = ModelDir;
		process = Process;

		crop_size = CropSize;
		batch_size = BatchSize;

		inner_padding = layer_num;
		outer_padding = 1;

		output_size = crop_size - offset * 2;
		input_block_size = crop_size + (inner_padding + outer_padding) * 2;
		original_width_height = 128 + layer_num * 2;

		output_block_size = crop_size + (inner_padding + outer_padding - layer_num) * 2;

		std::call_once(waifu2x_once_flag, [argc, argv]()
		{
			assert(argc >= 1);

			int tmpargc = 1;
			char* tmpargvv[] = { argv[0] };
			char** tmpargv = tmpargvv;
			// initialize glog and related libraries
			caffe::GlobalInit(&tmpargc, &tmpargv);
		});

		const auto cuDNNCheckStartTime = std::chrono::system_clock::now();

		if (process == "gpu")
			process = "cudnn";

		const auto cuDNNCheckEndTime = std::chrono::system_clock::now();

		boost::filesystem::path mode_dir_path(model_dir);
		if (!mode_dir_path.is_absolute()) // if model_dir is a relative path, convert it to an absolute path
		{
			// first, check whether it exists under the current directory
			mode_dir_path = boost::filesystem::absolute(model_dir);
			if (!boost::filesystem::exists(mode_dir_path) && argc >= 1) // if not found, infer the executable's directory from argv[0] and look under it
			{
				boost::filesystem::path a0(argv[0]);
				if (a0.is_absolute())
					mode_dir_path = a0.branch_path() / model_dir;
			}
		}

		if (!boost::filesystem::exists(mode_dir_path))
			return eWaifu2xError_FailedOpenModelFile;

		if (process == "cpu")
		{
			caffe::Caffe::set_mode(caffe::Caffe::CPU);
			isCuda = false;
		}
		else
		{
			caffe::Caffe::set_mode(caffe::Caffe::GPU);
			isCuda = true;
		}

		if (mode == "noise" || mode == "noise_scale" || mode == "auto_scale")
		{
			const std::string model_path = (mode_dir_path / "srcnn.prototxt").string();
			const std::string param_path = (mode_dir_path / ("noise" + std::to_string(noise_level) + "_model.json")).string();

			ret = ConstractNet(net_noise, model_path, param_path, process);
			if (ret != eWaifu2xError_OK)
				return ret;
		}

		if (mode == "scale" || mode == "noise_scale" || mode == "auto_scale")
		{
			const std::string model_path = (mode_dir_path / "srcnn.prototxt").string();
			const std::string param_path = (mode_dir_path / "scale2.0x_model.json").string();

			ret = ConstractNet(net_scale, model_path, param_path, process);
			if (ret != eWaifu2xError_OK)
				return ret;
		}

		const int input_block_plane_size = input_block_size * input_block_size * input_plane;
		const int output_block_plane_size = output_block_size * output_block_size * input_plane;

		if (isCuda)
		{
			CUDA_CHECK_WAIFU2X(cudaHostAlloc(&input_block, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
			CUDA_CHECK_WAIFU2X(cudaHostAlloc(&dummy_data, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
			CUDA_CHECK_WAIFU2X(cudaHostAlloc(&output_block, sizeof(float) * output_block_plane_size * batch_size, cudaHostAllocDefault));
		}
		else
		{
			input_block = new float[input_block_plane_size * batch_size];
			dummy_data = new float[input_block_plane_size * batch_size];
			output_block = new float[output_block_plane_size * batch_size];
		}

		for (size_t i = 0; i < input_block_plane_size * batch_size; i++)
			dummy_data[i] = 0.0f;

		is_inited = true;
	}
	catch (...)
	{
		return eWaifu2xError_InvalidParameter;
	}

	return eWaifu2xError_OK;
}
Example #29
uint8_t* allocPageLocked(size_t size) {
  void* ptr;
  checkCudaError(cudaHostAlloc(&ptr, size, cudaHostAllocPortable),
                 "cudaHostAlloc");
  return static_cast<uint8_t*>(ptr);
}
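A hedged counterpart for releasing pages obtained from allocPageLocked(), using the same checkCudaError wrapper:

void freePageLocked(uint8_t* ptr) {
  checkCudaError(cudaFreeHost(ptr), "cudaFreeHost");
}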
Example #30
int main(int argc, char *argv[]) {
  int i,j,k;
  machineInformation currentMachine;
  counterSessionInfo session;

  initializeCUDA();

  // Set machine information from CounterHomeBrew.h
  currentMachine.cpu_model = CPU_MODEL;
  currentMachine.num_sockets = NUM_SOCKETS;
  currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET;
  currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET;
  currentMachine.num_cores = NUM_CORES;
  currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS???
  currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX;
  currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX;

  // Set session events, umasks and counters used
  //  int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80};
  // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01};

  session.core_gen_counter_num_used = 5;
  int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1};
  int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07};

  session.cbo_counter_num_used = 1;
  int32 cbo_event_numbers[] = {0x37};
  int32 cbo_umasks[] = {0xf};
  session.cbo_filter = 0x1f;

  for (i = 0; i < session.core_gen_counter_num_used; i++) {
    session.core_event_numbers[i] = core_event_numbers[i];
    session.core_umasks[i] = core_umasks[i];
  }
  for (i = 0; i < session.cbo_counter_num_used; i++) {
    session.cbo_event_numbers[i] = cbo_event_numbers[i];
    session.cbo_umasks[i] = cbo_umasks[i];
  }

  int fd[NUM_CORES];

  // Arrays to hold counter data...
  counterData before;
  counterData after;

  // some data for doing a naive matmul to test flop counting...
  // initloop(N);
  

  // M,N,K are multiples of the block size....
  int gpuOuter = atoi(argv[1]);
  int gpuInner = atoi(argv[2]);
  int cpuInner = atoi(argv[3]);
  double minRuntime = atoi(argv[4]);
  int Md = atoi(argv[5])*block_size;
  int Nd = atoi(argv[6])*block_size;
  int Kd = atoi(argv[7])*block_size;
  int Mh = atoi(argv[8]);
  int Nh = atoi(argv[9]);
  int Kh = atoi(argv[10]);

  char *ts1,*ts2,*ts3,*ts4;
  char *ts5,*ts6,*ts7,*ts8;
  double fineTimeStamps[8];
  double gTime = 0.0;
  double cTime = 0.0;
  double seconds = 0.0;
  int num_iters;

  uint64 *coreSums;
  coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64));

  uint64 *sums;
  sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64));

  float *Atmp = NULL;
  float *Btmp = NULL;
  float *Ctmp = NULL;
  Atmp = (float*) malloc( Mh * Nh * sizeof(float) );
  Btmp = (float*) malloc( Nh * sizeof(float) );
  Ctmp = (float*) malloc( Mh * sizeof(float) );
  randomInit(Atmp,Mh*Nh);
  randomInit(Btmp,Nh);

  for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) {
    seconds = read_timer();
    for (i =0; i < num_iters; i++)
      BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 );
    seconds = read_timer()-seconds;
  }
  //  num_iters /= 2;

  free(Atmp);
  free(Btmp);
  free(Ctmp);

  int readyThreads = 0;
  #pragma omp parallel
  {
    int threadNum = omp_get_thread_num();
    int numThreads = omp_get_num_threads();
    assert(numThreads==2);
    if (threadNum == 0) {
      cudaError_t error;
      int memSizeA = sizeof(float)*Md*Nd;
      int memSizeB = sizeof(float)*Nd;
      int memSizeC = sizeof(float)*Md;
      
      float *Ahost,*Bhost,*Chost;
      // use pinned memory on the host for BW and asynch memory transfers..
      int flags = cudaHostAllocDefault;
      ts5 = getTimeStamp();
      fineTimeStamps[0] = read_timer();
      error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      // set local arrays
      randomInit(Ahost,Md*Nd);
      randomInit(Bhost,Nd);

      // allocate device memory
      float *Adevice,*Bdevice,*Cdevice;
      error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
      fineTimeStamps[1] = read_timer();
      ts6 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //     fprintf(stderr,"Incremented ready GPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);};

      //#pragma omp single 
      //{
      cudaStream_t stream1;
      cudaStreamCreate ( &stream1) ;
      ts3 = getTimeStamp();
      fineTimeStamps[2] = read_timer();
      gTime = read_timer();
      for (int i = 0; i < gpuOuter; i++) 
	GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1);
      cudaStreamSynchronize(stream1);
      gTime = read_timer() - gTime;
      fineTimeStamps[3] = read_timer();
      ts4 = getTimeStamp();
      cudaFreeHost(Ahost);
      cudaFreeHost(Bhost);
      cudaFreeHost(Chost);

    } else {
      //  uint64 min_iters = strtoull(argv[4],NULL,0);
      float *A = NULL;
      float *B = NULL;
      float *C = NULL;
      ts7 = getTimeStamp();
      fineTimeStamps[4] = read_timer();
      A = (float*) malloc( Mh * Nh * sizeof(float) );
      B = (float*) malloc( Nh * sizeof(float) );
      C = (float*) malloc( Mh * sizeof(float) );
      randomInit(A,Mh*Nh);
      randomInit(B,Nh);
      fineTimeStamps[5] = read_timer();
      ts8 = getTimeStamp();
#pragma omp critical
      {
	readyThreads += 1;
      }
      //   fprintf(stderr,"Incremented ready CPU\n");
      while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);};
                  
      // open the msr files for each core on the machine
      for (i = 0; i < currentMachine.num_cores; i++)
	open_msr_file(i,&fd[i]);
      
      
      int socketsProgrammed = 0;
      for (i = 0; i < currentMachine.num_cores; i++) {
	int currentCoreFD = fd[i];
	
	stopCounters(i, currentCoreFD, &currentMachine, &session);
	programCoreFixedCounters(currentCoreFD);    
	programGeneralPurposeRegisters(currentCoreFD, &currentMachine, &session);
	
	/* Program the Uncore as desired...*/
	// Only program the first physical core on each socket. 
	// NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm.
	if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) {
	  programUncoreCounters( currentCoreFD, &currentMachine, &session);
	  socketsProgrammed++;
	}
      }
      
      seconds = 0.0;
      
      // start the programmed counters...
      for (i = 0; i < currentMachine.num_cores; i++)
	startCounters( i, fd[i], &currentMachine, &session);
      
      /* READ COUNTERS BEFORE STUFF */
      readCounters(fd,&currentMachine,&session, &before);
      ts1 = getTimeStamp();
      fineTimeStamps[6] = read_timer();
      seconds = read_timer();
      
      /* DO STUFF */    
      for (i =0; i < num_iters; i++)
	BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 );
      
      /* END DOING STUFF */
      
      seconds = read_timer()-seconds;
      fineTimeStamps[7] = read_timer();
      ts2 = getTimeStamp();
      
      /* READ COUNTERS AFTER STUFF */    
      for (i = 0; i < currentMachine.num_cores; i++)
	stopCounters(i,fd[i],&currentMachine, &session);
      
      //  printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds);
      
      readCounters(fd,&currentMachine,&session,&after);
      diffCounterData(&currentMachine, &session, &after, &before, &after);
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//    printf("Socket %d\n",i);
	for (j = 0; j < currentMachine.num_cores_per_socket; j++) {
	  //   printf("%d,",j);
	  for (k = 0; k < session.core_gen_counter_num_used; k++){
	    //	printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	    coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k];
	  }
	  //	printf("\n");
	}
      }
      
      for (i = 0; i < currentMachine.num_sockets; i++) {
	//	printf("%d,",i);
	for (j = 0; j < currentMachine.num_cbos; j++) {
	  //	  printf("%d,",j);
	  for (k = 0; k < session.cbo_counter_num_used; k++) {
	    //	    printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]);
	    // bug in the indexing of the core sums???
	    //        sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	    sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k];
	  }
	}
      }
      //      printf("\n");
            
      // Stop counters, reset PMU, close msr files
      cleanup(fd,&currentMachine,&session);
      
      
      free(A);
      free(B);
      free(C);
    }
  } // end parallel region

  printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0);
  for (int i = 0; i < 8; i++)
    printf("%f,",fineTimeStamps[i]);

  for (j = 0; j < session.core_gen_counter_num_used; j++)
    printf("%llu,",coreSums[j]);
  for (j = 0; j < session.cbo_counter_num_used; j++)
    if (j == session.cbo_counter_num_used-1)
      printf("%llu",sums[j]);
    else
      printf("%llu,",sums[j]);
  printf("\n");
  
  free(sums);
  free(coreSums);
  
  return 0;
}