Example #1
void WaterPlaneCUDA::update()
{
	glBindBuffer(GL_ARRAY_BUFFER, oldVertexBuffer);
	float3* verticesTest = (float3*)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
	
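	// Apply each queued disturbance to the mapped vertex buffer on the CPU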
	for (int i = 0; i < disturbances.size(); i++)
	{
		Disturbances *dist = disturbances.at(i);
		for(int x = dist->xminW; x <= dist->xmaxW; x++)
		{
			for (int y = dist->zminW; y <= dist->zmaxW; y++)
			{
				float insideCircle = ((x-dist->centerX)*(x-dist->centerX))+((y-dist->centerZ)*(y-dist->centerZ))-dist->radiusSQ;

				if (insideCircle <= 0) {
					int vIndex = (y * pointsX) + x;
					if (vIndex < (pointsX*pointsY)) {
						verticesTest[vIndex].y = (insideCircle/dist->radiusSQ)*dist->height;
					}
				}
			} 
		}
	}
	glUnmapBufferARB(GL_ARRAY_BUFFER);
	disturbances.clear();

	
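	// Map the CUDA-registered VBOs, then run the wave-propagation and normal-update kernels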
	size_t num_bytes; 

	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_newVertex_resource, 0));
	cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_newVertices, &num_bytes, cuda_newVertex_resource));
	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_oldVertex_resource, 0));
	cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_oldVertices, &num_bytes, cuda_oldVertex_resource));
	cutilSafeCall(cudaMemcpyToSymbol("DIM",&pointsX,sizeof(int)));
	updateWaveMapGPU1(gpu_newVertices,gpu_oldVertices);
	cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_newVertex_resource, 0));
	cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_oldVertex_resource, 0));
	
	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_oldVertex_resource, 0));
	cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_oldVertices, &num_bytes, cuda_oldVertex_resource));
	cutilSafeCall(cudaGraphicsMapResources(1, &cuda_normalsVB_resource, 0));
	cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_normals, &num_bytes, cuda_normalsVB_resource));
	cutilSafeCall(cudaMemcpyToSymbol("DIM",&pointsX,sizeof(int)));
	updateNormalsGPU1(gpu_oldVertices,gpu_normals);
	cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_normalsVB_resource, 0));
	cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_oldVertex_resource, 0));

	//swap between old and new wave map
	struct cudaGraphicsResource *temp = cuda_oldVertex_resource;
	cuda_oldVertex_resource = cuda_newVertex_resource;
	cuda_newVertex_resource = temp;
}
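
Note: this example (and several below) passes the symbol by its string name ("DIM"), which only the pre-5.0 CUDA runtime accepted; current toolkits take the __constant__ variable itself. A minimal standalone sketch of the modern form (the setGridDim helper and file layout are illustrative assumptions, not part of the original project):

#include <cuda_runtime.h>
#include <cstdio>

__constant__ int DIM;   // grid width read by the update kernels

static void setGridDim(int pointsX)
{
	// Pass the symbol itself rather than the string "DIM"
	cudaError_t err = cudaMemcpyToSymbol(DIM, &pointsX, sizeof(int));
	if (err != cudaSuccess)
		fprintf(stderr, "cudaMemcpyToSymbol(DIM) failed: %s\n", cudaGetErrorString(err));
}
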
Example #2
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            dim_type nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const dim_type ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            const dim_type blocksXPerImage = blocks.x;
            if(nimages > TI) {
                dim_type tile_images = divup(nimages, TI);
                nimages = TI;
                blocks.x = blocks.x * tile_images;
            }

            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                transform_kernel<T, true, method><<<blocks, threads>>>
                                (out, in, nimages, ntransforms, blocksXPerImage);
            } else {
                // Forward transform: same launch with the inverse template flag off
                transform_kernel<T, false, method><<<blocks, threads>>>
                                (out, in, nimages, ntransforms, blocksXPerImage);
            }
        }
Example #3
static void
params_1vb_set(struct psc *psc,
	       struct psc_mparticles *mprts, struct psc_mfields *mflds)
{
  struct params_1vb params;

  params.dt = psc->dt;
  for (int d = 0; d < 3; d++) {
    params.dxi[d] = 1.f / ppsc->patch[0].dx[d];
  }

  params.fnqs   = sqr(psc->coeff.alpha) * psc->coeff.cori / psc->coeff.eta;
#if CALC_J == CALC_J_1VB_2D
#if !(DIM == DIM_YZ)
#error inc_params.c: CALC_J_1VB_2D only works for DIM_YZ
#endif
  params.fnqxs = params.fnqs;
#else
  params.fnqxs = ppsc->patch[0].dx[0] * params.fnqs / params.dt;
#endif
  params.fnqys = ppsc->patch[0].dx[1] * params.fnqs / params.dt;
  params.fnqzs = ppsc->patch[0].dx[2] * params.fnqs / params.dt;

  assert(psc->nr_kinds <= MAX_NR_KINDS);
  for (int k = 0; k < ppsc->nr_kinds; k++) {
    params.dq_kind[k] = .5f * ppsc->coeff.eta * params.dt * ppsc->kinds[k].q / ppsc->kinds[k].m;
  }

  if (mprts && mprts->nr_patches > 0) {
#if PSC_PARTICLES_AS_CUDA2
    struct psc_mparticles_cuda2 *mprts_sub = psc_mparticles_cuda2(mprts);
    for (int d = 0; d < 3; d++) {
      params.b_mx[d] = mprts_sub->b_mx[d];
    }
#else
    assert(0);
#endif
  }

  if (mflds) {
#if PSC_FIELDS_AS_CUDA2
    struct psc_mfields_cuda2 * mflds_sub = psc_mfields_cuda2(mflds);
    for (int d = 0; d < 3; d++) {
      params.mx[d] = mflds_sub->im[d];
      params.ilg[d] = mflds_sub->ib[d];
      assert(mflds_sub->ib[d] == -2 || mflds_sub->im[d] == 1); // assumes BND == 2
    }
#else
    assert(0);
#endif
  }

#ifndef __CUDACC__
  prm = params;
#else
  check(cudaMemcpyToSymbol(prm, &params, sizeof(prm)));
#endif
}
Example #4
void CudaMem::cudaMemCpyToSymbolReport(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind)
{
	cudaError_t cudaStatus;
	cudaStatus = cudaMemcpyToSymbol(dst, src, count, kind);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
	}

}
Example #5
// create the bin boundaries
void initBinB( struct pb_TimerSet *timers )
{
  REAL *binb = (REAL*)malloc((NUM_BINS+1)*sizeof(REAL));
  for (int k = 0; k < NUM_BINS+1; k++)
    {
      binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec))
		    / 60.0*D2R);
    }
  pb_SwitchToTimer( timers, pb_TimerID_COPY );
  cudaMemcpyToSymbol(dev_binb, binb, (NUM_BINS+1)*sizeof(REAL));
  pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
  free(binb);
}
Example #6
const char *para(const char *in)
{
  static char sym[256] = "para_";
  sym[5] = '\0'; // necessary because sym is static

  while(*in == '-') ++in; // skip leading dashes
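  // "in" is expected to look like "name=value"; n is the length of the name part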
  const Z n = strchr(in, '=') - in;
  strncat(sym, in, n);

  const R val = atof(in + n + 1);
  if(cudaMemcpyToSymbol(sym, &val, sizeof(R)) == cudaSuccess) {
    sprintf(sym + 5 + n, " = %g", val);
    return sym + 5;
  } else
    return NULL;
}
Example #7
SEXP R_auto_cudaMemcpyToSymbol(SEXP r_symbol, SEXP r_src, SEXP r_count, SEXP r_offset, SEXP r_kind)
{
    SEXP r_ans = R_NilValue;
    const void * symbol = GET_REF(r_symbol, const void );
    const void * src = GET_REF(r_src, const void );
    size_t count = REAL(r_count)[0];
    size_t offset = REAL(r_offset)[0];
    enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0];
    
    cudaError_t ans;
    ans = cudaMemcpyToSymbol(symbol, src, count, offset, kind);
    
    r_ans = Renum_convert_cudaError_t(ans) ;
    
    return(r_ans);
}
Example #8
  inline
  CudaParallelLaunch( const DriverType & driver ,
                      const dim3       & grid ,
                      const dim3       & block ,
                      const int          shmem )
  {
    if ( sizeof( KokkosArray::Impl::CudaTraits::ConstantGlobalBufferType ) <
         sizeof( DriverType ) ) {
      KokkosArray::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
    }

    if ( CudaTraits::SharedMemoryCapacity < shmem ) {
      KokkosArray::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
    }
    else if ( shmem ) {
      cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
    }

    // Copy functor to constant memory on the device
    cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );

    // Invoke the driver function on the device
    cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>();
  }
Example #9
int tiramisu_cuda_memcpy_to_symbol(void * to, void * from, uint64_t size)
{
    handle_cuda_error(cudaMemcpyToSymbol(to, from, size), __FUNCTION__);
    return 0;
}
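
A possible call site for this wrapper (hypothetical names, shown only to illustrate that the __constant__ symbol is identified by its address even when it travels through a void * parameter):

__constant__ float c_coeffs[4];   // hypothetical device constant read by the generated kernels

void upload_coeffs(float host_coeffs[4])
{
    // Equivalent to cudaMemcpyToSymbol(c_coeffs, host_coeffs, 4 * sizeof(float))
    tiramisu_cuda_memcpy_to_symbol((void *) c_coeffs, (void *) host_coeffs, 4 * sizeof(float));
}
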
Example #10
/** Documented at declaration */
int
gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;
    
    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Read JPEG image data
    if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
        return -1;
    }
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Perform huffman decoding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
            return -1;
        }
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy quantized data to device memory from cpu memory
        cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);
    }
    // Perform huffman decoding on GPU (when restart interval is set)
    else {
    #ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
        // Copy huffman tables to constant memory
        for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
            for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
                int index = (comp_type * GPUJPEG_HUFFMAN_TYPE_COUNT + huff_type);
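                // Copy this table to its byte offset inside the constant-memory array (4th argument)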
                cudaMemcpyToSymbol(
                    (char*)gpujpeg_decoder_table_huffman, 
                    &decoder->table_huffman[comp_type][huff_type], 
                    sizeof(struct gpujpeg_table_huffman_decoder), 
                    index * sizeof(struct gpujpeg_table_huffman_decoder), 
                    cudaMemcpyHostToDevice
                );
            }
        }
        gpujpeg_cuda_check_error("Decoder copy huffman tables to constant memory");
    #endif
    
        // Reset huffman output
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        // Copy scan data to device memory
        cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Copy segments to device memory
        cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy segments");
        
        // Zero output memory
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);

        // Perform huffman decoding
        if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    }
    
#ifdef GPUJPEG_DCT_FROM_NPP
    // Perform IDCT and dequantization (implementation from NPP)
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];
                
        // Determine table type
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
        
        //gpujpeg_component_print16(component, component->d_data_quantized);
        
        cudaMemset(component->d_data, 0, component->data_size * sizeof(uint8_t));
        
        //Perform inverse DCT
        NppiSize inv_roi;
        inv_roi.width = component->data_width * GPUJPEG_BLOCK_SIZE;
        inv_roi.height = component->data_height / GPUJPEG_BLOCK_SIZE;
        assert(GPUJPEG_BLOCK_SIZE == 8);
        NppStatus status = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
            component->d_data_quantized, 
            component->data_width * GPUJPEG_BLOCK_SIZE * sizeof(int16_t), 
            component->d_data, 
            component->data_width * sizeof(uint8_t), 
            decoder->table_quantization[type].d_table, 
            inv_roi
        );
        if ( status != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Inverse DCT failed (error %d)!\n", status);
        }
            
        //gpujpeg_component_print8(component, component->d_data);
    }
#else
    // Perform IDCT and dequantization (own CUDA implementation)
    gpujpeg_idct_gpu(decoder);
    
    // Perform IDCT and dequantization (own CPU implementation)
    // gpujpeg_idct_cpu(decoder);
#endif
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Preprocessing
    if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu);
    coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu);

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    
    // Set decompressed image size
    output->data_size = coder->data_raw_size * sizeof(uint8_t);
    
    // Set decompressed image
    if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to host memory
        cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        // Set output to internal buffer
        output->data = coder->data_raw;
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        
        assert(output->data != NULL);

        // Copy decompressed image to host memory
        cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Map OpenGL texture
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
        assert(data_size == coder->data_raw_size);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Copy decompressed image to texture pixel buffer object device data
        cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Unmap OpenGL texture
        gpujpeg_opengl_texture_unmap(output->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    return 0;
}
Example #11
cudaError_t WINAPI wine_cudaMemcpyToSymbol( const char *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) {
    WINE_TRACE("\n");
    return cudaMemcpyToSymbol( symbol, src, count, offset, kind);
}
Example #12
 template<class T> static inline void uploadConstant(const char* name, const T& value) 
 { 
     cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); 
 }
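
A hedged variant for CUDA 5.0 and later, where the __constant__ variable itself is passed instead of its name (assumes the same cudaSafeCall macro as the snippet above; not from the original source):

 template<class S, class T> static inline void uploadConstant(const S& symbol, const T& value)
 {
     // The symbol is resolved from the variable's address, so no string lookup is involved
     cudaSafeCall( cudaMemcpyToSymbol(symbol, &value, sizeof(T)) );
 }
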
Example #13
int main(int argc, char **argv) {
    char *output;
    int x;
    int y;
    struct cuda_device device;
    int available_words = 1;
    int current_words = 0;
    struct wordlist_file file;
    char input_hash[4][9];

    print_info();

    if (argc != ARG_COUNT) {
        printf("Usage: %s WORDLIST_FILE MD5_HASH\n", argv[0]);
        return -1;
    }

    if (process_wordlist(argv[ARG_WORDLIST], &file) == -1) {
        printf("Error Opening Wordlist File: %s\n", argv[ARG_WORDLIST]);
        return -1;
    }

    if (read_wordlist(&file) == 0) {
        printf("No valid passwords in the wordlist file: %s\n", argv[ARG_WORDLIST]);
        return -1;
    }

    // first things first, we need to select our CUDA device

    if (get_cuda_device(&device) == -1) {
        printf("No Cuda Device Installed\n");
        return -1;
    }

    // we now need to calculate the optimal amount of threads to use for this card

    calculate_cuda_params(&device);

    // now we input our target hash

    if (strlen(argv[ARG_MD5]) != 32) {
        printf("Not a valid MD5 Hash (should be 32 hex characters)\n");
        return -1;
    }

    // we split the input hash into 4 blocks

    memset(input_hash, 0, sizeof(input_hash));

    for(x=0; x < 4; x++) {
        strncpy(input_hash[x], argv[ARG_MD5] + (x * 8), 8);
        device.target_hash[x] = htonl(_httoi(input_hash[x]));
    }

    // allocate global memory for use on device
    if (cudaMalloc(&device.device_global_memory, device.device_global_memory_len) != CUDA_SUCCESS) {
        printf("Error allocating memory on device (global memory)\n");
        return -1;
    }

    // allocate the 'stats' that will indicate if we are successful in cracking
    if (cudaMalloc(&device.device_stats_memory, sizeof(struct device_stats)) != CUDA_SUCCESS) {
        printf("Error allocating memory on device (stats memory)\n");
        return -1;
    }

    // allocate debug memory if required
    if (cudaMalloc(&device.device_debug_memory, device.device_global_memory_len) != CUDA_SUCCESS) {
        printf("Error allocating memory on device (debug memory)\n");
        return -1;
    }

    // make sure the stats are clear on the device
    if (cudaMemset(device.device_stats_memory, 0, sizeof(struct device_stats)) != CUDA_SUCCESS) {
        printf("Error Clearing Stats on device\n");
        return -1;
    }

    // this is our host memory that we will copy to the graphics card
    if ((device.host_memory = malloc(device.device_global_memory_len)) == NULL) {
        printf("Error allocating memory on host\n");
        return -1;
    }

    // put our target hash into the GPU constant memory as this will not change (and we can't spare shared memory for speed)
    if (cudaMemcpyToSymbol("target_hash", device.target_hash, 16, 0, cudaMemcpyHostToDevice) != CUDA_SUCCESS) {
        printf("Error initializing constants\n");
        return -1;
    }

#ifdef BENCHMARK
    // these will be used to benchmark
    int counter = 0;
    struct timeval start, end;

    gettimeofday(&start, NULL);
#endif

    int z;

    while(available_words) {
        memset(device.host_memory, 0, device.device_global_memory_len);

        for(x=0; x < (device.device_global_memory_len / 64) && file.words[current_words] != (char *)0; x++, current_words++) {
#ifdef BENCHMARK
            counter++;		// increment counter for this word
#endif
            output = md5_pad(file.words[current_words]);
            memcpy(device.host_memory + (x * 64), output, 64);
        }

        if (file.words[current_words] == (char *)0) {
            // read some more words !
            current_words = 0;
            if (!read_wordlist(&file)) {
                // no more words available
                available_words = 0;
                // we continue as we want to flush the cache !
            }
        }


        // now we need to transfer the MD5 hashes to the graphics card for preparation

        if (cudaMemcpy(device.device_global_memory, device.host_memory, device.device_global_memory_len, cudaMemcpyHostToDevice) != CUDA_SUCCESS) {
            printf("Error Copying Words to GPU\n");
            return -1;
        }

        md5_calculate(&device);		// launch the kernel of the CUDA device

        if (cudaMemcpy(&device.stats, device.device_stats_memory, sizeof(struct device_stats), cudaMemcpyDeviceToHost) != CUDA_SUCCESS) {
            printf("Error Copying STATS from the GPU\n");
            return -1;
        }


#ifdef DEBUG
        // For debug, we will receive the hashes for verification
        memset(device.host_memory, 0, device.device_global_memory_len);
        if (cudaMemcpy(device.host_memory, device.device_debug_memory, device.device_global_memory_len, cudaMemcpyDeviceToHost) != CUDA_SUCCESS) {
            printf("Error Copying words to GPU\n");
            return -1;
        }

        cudaThreadSynchronize();

        // prints out the debug hash'es
        printf("MD5 registers:\n\n");
        unsigned int *m = (unsigned int *)device.host_memory;
        for(y=0; y <= (device.max_blocks * device.max_threads); y++) {
            printf("------ [%d] -------\n", y);
            printf("A: %08x\n", m[(y * 4) + 0]);
            printf("B: %08x\n", m[(y * 4) + 1]);
            printf("C: %08x\n", m[(y * 4) + 2]);
            printf("D: %08x\n", m[(y * 4) + 3]);
            printf("-------------------\n\n");
        }
#endif

        if (device.stats.hash_found == 1) {
            printf("WORD FOUND: [%s]\n", md5_unpad(device.stats.word));
            break;
        }
    }

    if (device.stats.hash_found != 1) {
        printf("No word could be found for the provided MD5 hash\n");
    }

#ifdef BENCHMARK
    gettimeofday(&end, NULL);
    long long time = (end.tv_sec * 1000000LL + end.tv_usec) - (start.tv_sec * 1000000LL + start.tv_usec);
    printf("Time taken to check %d hashes: %f seconds\n", counter, (float)time / 1000000.0f);
    printf("Words per second: %lld\n", (long long)counter * 1000000 / time);
#endif
}