void WaterPlaneCUDA::update() { glBindBuffer(GL_ARRAY_BUFFER, oldVertexBuffer); float3* verticesTest = (float3*)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); for (int i = 0; i < disturbances.size();i++) { Disturbances *dist = disturbances.at(i); for(int x = dist->xminW; x <= dist->xmaxW; x++) { for (int y = dist->zminW; y <= dist->zmaxW; y++) { float insideCircle = ((x-dist->centerX)*(x-dist->centerX))+((y-dist->centerZ)*(y-dist->centerZ))-dist->radiusSQ; if (insideCircle <= 0) { int vIndex = (y * pointsX) + x; if (vIndex < (pointsX*pointsY)) { verticesTest[vIndex].y = (insideCircle/dist->radiusSQ)*dist->height; } } } } } glUnmapBufferARB(GL_ARRAY_BUFFER); disturbances.clear(); size_t num_bytes; cutilSafeCall(cudaGraphicsMapResources(1, &cuda_newVertex_resource, 0)); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_newVertices, &num_bytes, cuda_newVertex_resource)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_oldVertex_resource, 0)); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_oldVertices, &num_bytes, cuda_oldVertex_resource)); cutilSafeCall(cudaMemcpyToSymbol("DIM",&pointsX,sizeof(int))); updateWaveMapGPU1(gpu_newVertices,gpu_oldVertices); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_newVertex_resource, 0)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_oldVertex_resource, 0)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_oldVertex_resource, 0)); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_oldVertices, &num_bytes, cuda_oldVertex_resource)); cutilSafeCall(cudaGraphicsMapResources(1, &cuda_normalsVB_resource, 0)); cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void**)&gpu_normals, &num_bytes, cuda_normalsVB_resource)); cutilSafeCall(cudaMemcpyToSymbol("DIM",&pointsX,sizeof(int))); updateNormalsGPU1(gpu_oldVertices,gpu_normals); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_normalsVB_resource, 0)); cutilSafeCall(cudaGraphicsUnmapResources(1, &cuda_oldVertex_resource, 0)); //swap between old and new wave map struct cudaGraphicsResource *temp = cuda_oldVertex_resource; cuda_oldVertex_resource = cuda_newVertex_resource; cuda_newVertex_resource = temp; }
void transform(Param<T> out, CParam<T> in, CParam<float> tf, const bool inverse) { dim_type nimages = in.dims[2]; // Multiplied in src/backend/transform.cpp const dim_type ntransforms = out.dims[2] / in.dims[2]; // Copy transform to constant memory. CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0, cudaMemcpyDeviceToDevice)); dim3 threads(TX, TY, 1); dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y)); const dim_type blocksXPerImage = blocks.x; if(nimages > TI) { dim_type tile_images = divup(nimages, TI); nimages = TI; blocks.x = blocks.x * tile_images; } if (ntransforms > 1) { blocks.y *= ntransforms; } if(inverse) { transform_kernel<T, true, method><<<blocks, threads>>> (out, in, nimages, ntransforms, blocksXPerImage); } else {
static void params_1vb_set(struct psc *psc, struct psc_mparticles *mprts, struct psc_mfields *mflds) { struct params_1vb params; params.dt = psc->dt; for (int d = 0; d < 3; d++) { params.dxi[d] = 1.f / ppsc->patch[0].dx[d]; } params.fnqs = sqr(psc->coeff.alpha) * psc->coeff.cori / psc->coeff.eta; #if CALC_J == CALC_J_1VB_2D #if !(DIM == DIM_YZ) #error inc_params.c: CALC_J_1VB_2D only works for DIM_YZ #endif params.fnqxs = params.fnqs; #else params.fnqxs = ppsc->patch[0].dx[0] * params.fnqs / params.dt; #endif params.fnqys = ppsc->patch[0].dx[1] * params.fnqs / params.dt; params.fnqzs = ppsc->patch[0].dx[2] * params.fnqs / params.dt; assert(psc->nr_kinds <= MAX_NR_KINDS); for (int k = 0; k < ppsc->nr_kinds; k++) { params.dq_kind[k] = .5f * ppsc->coeff.eta * params.dt * ppsc->kinds[k].q / ppsc->kinds[k].m; } if (mprts && mprts->nr_patches > 0) { #if PSC_PARTICLES_AS_CUDA2 struct psc_mparticles_cuda2 *mprts_sub = psc_mparticles_cuda2(mprts); for (int d = 0; d < 3; d++) { params.b_mx[d] = mprts_sub->b_mx[d]; } #else assert(0); #endif } if (mflds) { #if PSC_FIELDS_AS_CUDA2 struct psc_mfields_cuda2 * mflds_sub = psc_mfields_cuda2(mflds); for (int d = 0; d < 3; d++) { params.mx[d] = mflds_sub->im[d]; params.ilg[d] = mflds_sub->ib[d]; assert(mflds_sub->ib[d] == -2 || mflds_sub->im[d] == 1); // assumes BND == 2 } #else assert(0); #endif } #ifndef __CUDACC__ prm = params; #else check(cudaMemcpyToSymbol(prm, ¶ms, sizeof(prm))); #endif }
void CudaMem::cudaMemCpyToSymbolReport(void* dst, const void * src, size_t count, enum cudaMemcpyKind kind) { cudaError_t cudaStatus; cudaStatus = cudaMemcpyToSymbol(dst, src, count, kind); if (cudaStatus != cudaSuccess) { fprintf(stderr,"error cudaMemCopy"); } }
// create the bin boundaries void initBinB( struct pb_TimerSet *timers ) { REAL *binb = (REAL*)malloc((NUM_BINS+1)*sizeof(REAL)); for (int k = 0; k < NUM_BINS+1; k++) { binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) / 60.0*D2R); } pb_SwitchToTimer( timers, pb_TimerID_COPY ); cudaMemcpyToSymbol(dev_binb, binb, (NUM_BINS+1)*sizeof(REAL)); pb_SwitchToTimer( timers, pb_TimerID_COMPUTE ); free(binb); }
const char *para(const char *in) { static char sym[256] = "para_"; sym[5] = '\0'; // necessary because sym is static while(*in == '-') ++in; // skip leading dashes const Z n = strchr(in, '=') - in; strncat(sym, in, n); const R val = atof(in + n + 1); if(cudaMemcpyToSymbol(sym, &val, sizeof(R)) == cudaSuccess) { sprintf(sym + 5 + n, " = %g", val); return sym + 5; } else return NULL; }
SEXP R_auto_cudaMemcpyToSymbol(SEXP r_symbol, SEXP r_src, SEXP r_count, SEXP r_offset, SEXP r_kind) { SEXP r_ans = R_NilValue; const void * symbol = GET_REF(r_symbol, const void ); const void * src = GET_REF(r_src, const void ); size_t count = REAL(r_count)[0]; size_t offset = REAL(r_offset)[0]; enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0]; cudaError_t ans; ans = cudaMemcpyToSymbol(symbol, src, count, offset, kind); r_ans = Renum_convert_cudaError_t(ans) ; return(r_ans); }
inline CudaParallelLaunch( const DriverType & driver , const dim3 & grid , const dim3 & block , const int shmem ) { if ( sizeof( KokkosArray::Impl::CudaTraits::ConstantGlobalBufferType ) < sizeof( DriverType ) ) { KokkosArray::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); } if ( CudaTraits::SharedMemoryCapacity < shmem ) { KokkosArray::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); } else if ( shmem ) { cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ); } // Copy functor to constant memory on the device cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); // Invoke the driver function on the device cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>(); }
int tiramisu_cuda_memcpy_to_symbol(void * to, void * from, uint64_t size) { handle_cuda_error(cudaMemcpyToSymbol(to, from, size), __FUNCTION__); return 0; }
/** Documented at declaration */ int gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output) { // Get coder struct gpujpeg_coder* coder = &decoder->coder; // Reset durations coder->duration_memory_to = 0.0; coder->duration_memory_from = 0.0; coder->duration_memory_map = 0.0; coder->duration_memory_unmap = 0.0; coder->duration_preprocessor = 0.0; coder->duration_dct_quantization = 0.0; coder->duration_huffman_coder = 0.0; coder->duration_stream = 0.0; coder->duration_in_gpu = 0.0; GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Read JPEG image data if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) { fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n"); return -1; } GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Perform huffman decoding on CPU (when restart interval is not set) if ( coder->param.restart_interval == 0 ) { if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) { fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n"); return -1; } GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Copy quantized data to device memory from cpu memory cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu); } // Perform huffman decoding on GPU (when restart interval is set) else { #ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT // Copy huffman tables to constant memory for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) { for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) { int index = (comp_type * GPUJPEG_HUFFMAN_TYPE_COUNT + huff_type); cudaMemcpyToSymbol( (char*)gpujpeg_decoder_table_huffman, &decoder->table_huffman[comp_type][huff_type], sizeof(struct gpujpeg_table_huffman_decoder), index * sizeof(struct gpujpeg_table_huffman_decoder), cudaMemcpyHostToDevice ); } } gpujpeg_cuda_check_error("Decoder copy huffman tables to constant memory"); #endif // Reset huffman output cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t)); // Copy scan data to device memory cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice); gpujpeg_cuda_check_error("Decoder copy compressed data"); // Copy segments to device memory cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice); gpujpeg_cuda_check_error("Decoder copy compressed data"); // Zero output memory cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t)); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu); // Perform huffman decoding if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) { fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n"); return -1; } GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); } #ifdef GPUJPEG_DCT_FROM_NPP // Perform IDCT and dequantization (implementation from NPP) for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) { // Get component struct gpujpeg_component* component = &coder->component[comp]; // Determine table type enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE; //gpujpeg_component_print16(component, component->d_data_quantized); cudaMemset(component->d_data, 0, component->data_size * sizeof(uint8_t)); //Perform inverse DCT NppiSize inv_roi; inv_roi.width = component->data_width * GPUJPEG_BLOCK_SIZE; inv_roi.height = component->data_height / GPUJPEG_BLOCK_SIZE; assert(GPUJPEG_BLOCK_SIZE == 8); NppStatus status = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R( component->d_data_quantized, component->data_width * GPUJPEG_BLOCK_SIZE * sizeof(int16_t), component->d_data, component->data_width * sizeof(uint8_t), decoder->table_quantization[type].d_table, inv_roi ); if ( status != 0 ) { fprintf(stderr, "[GPUJPEG] [Error] Inverse DCT failed (error %d)!\n", status); } //gpujpeg_component_print8(component, component->d_data); } #else // Perform IDCT and dequantization (own CUDA implementation) gpujpeg_idct_gpu(decoder); // Perform IDCT and dequantization (own CPU implementation) // gpujpeg_idct_cpu(decoder); #endif GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Preprocessing if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 ) return -1; GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu); coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); // Set decompressed image size output->data_size = coder->data_raw_size * sizeof(uint8_t); // Set decompressed image if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) { GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Copy decompressed image to host memory cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); // Set output to internal buffer output->data = coder->data_raw; } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) { GPUJPEG_CUSTOM_TIMER_START(decoder->def); assert(output->data != NULL); // Copy decompressed image to host memory cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) { GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Map OpenGL texture int data_size = 0; uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size); assert(data_size == coder->data_raw_size); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Copy decompressed image to texture pixel buffer object device data cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); GPUJPEG_CUSTOM_TIMER_START(decoder->def); // Unmap OpenGL texture gpujpeg_opengl_texture_unmap(output->texture); GPUJPEG_CUSTOM_TIMER_STOP(decoder->def); coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def); } else { // Unknown output type assert(0); } return 0; }
cudaError_t WINAPI wine_cudaMemcpyToSymbol( const char *symbol, const void *src, size_t count, size_t offset , enum cudaMemcpyKind kind ) { WINE_TRACE("\n"); return cudaMemcpyToSymbol( symbol, src, count, offset, kind); }
template<class T> static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
int main(int argc, char **argv) { char *output; int x; int y; struct cuda_device device; int available_words = 1; int current_words = 0; struct wordlist_file file; char input_hash[4][9]; print_info(); if (argc != ARG_COUNT) { printf("Usage: %s WORDLIST_FILE MD5_HASH\n", argv[0]); return -1; } if (process_wordlist(argv[ARG_WORDLIST], &file) == -1) { printf("Error Opening Wordlist File: %s\n", argv[ARG_WORDLIST]); return -1; } if (read_wordlist(&file) == 0) { printf("No valid passwords in the wordlist file: %s\n", argv[ARG_WORDLIST]); return -1; } // first things first, we need to select our CUDA device if (get_cuda_device(&device) == -1) { printf("No Cuda Device Installed\n"); return -1; } // we now need to calculate the optimal amount of threads to use for this card calculate_cuda_params(&device); // now we input our target hash if (strlen(argv[ARG_MD5]) != 32) { printf("Not a valid MD5 Hash (should be 32 bytes and only Hex Chars\n"); return -1; } // we split the input hash into 4 blocks memset(input_hash, 0, sizeof(input_hash)); for(x=0; x < 4; x++) { strncpy(input_hash[x], argv[ARG_MD5] + (x * 8), 8); device.target_hash[x] = htonl(_httoi(input_hash[x])); } // allocate global memory for use on device if (cudaMalloc(&device.device_global_memory, device.device_global_memory_len) != CUDA_SUCCESS) { printf("Error allocating memory on device (global memory)\n"); return -1; } // allocate the 'stats' that will indicate if we are successful in cracking if (cudaMalloc(&device.device_stats_memory, sizeof(struct device_stats)) != CUDA_SUCCESS) { printf("Error allocating memory on device (stats memory)\n"); return -1; } // allocate debug memory if required if (cudaMalloc(&device.device_debug_memory, device.device_global_memory_len) != CUDA_SUCCESS) { printf("Error allocating memory on device (debug memory)\n"); return -1; } // make sure the stats are clear on the device if (cudaMemset(device.device_stats_memory, 0, sizeof(struct device_stats)) != CUDA_SUCCESS) { printf("Error Clearing Stats on device\n"); return -1; } // this is our host memory that we will copy to the graphics card if ((device.host_memory = malloc(device.device_global_memory_len)) == NULL) { printf("Error allocating memory on host\n"); return -1; } // put our target hash into the GPU constant memory as this will not change (and we can't spare shared memory for speed) if (cudaMemcpyToSymbol("target_hash", device.target_hash, 16, 0, cudaMemcpyHostToDevice) != CUDA_SUCCESS) { printf("Error initalizing constants\n"); return -1; } #ifdef BENCHMARK // these will be used to benchmark int counter = 0; struct timeval start, end; gettimeofday(&start, NULL); #endif int z; while(available_words) { memset(device.host_memory, 0, device.device_global_memory_len); for(x=0; x < (device.device_global_memory_len / 64) && file.words[current_words] != (char *)0; x++, current_words++) { #ifdef BENCHMARK counter++; // increment counter for this word #endif output = md5_pad(file.words[current_words]); memcpy(device.host_memory + (x * 64), output, 64); } if (file.words[current_words] == (char *)0) { // read some more words ! current_words = 0; if (!read_wordlist(&file)) { // no more words available available_words = 0; // we continue as we want to flush the cache ! } } // now we need to transfer the MD5 hashes to the graphics card for preperation if (cudaMemcpy(device.device_global_memory, device.host_memory, device.device_global_memory_len, cudaMemcpyHostToDevice) != CUDA_SUCCESS) { printf("Error Copying Words to GPU\n"); return -1; } md5_calculate(&device); // launch the kernel of the CUDA device if (cudaMemcpy(&device.stats, device.device_stats_memory, sizeof(struct device_stats), cudaMemcpyDeviceToHost) != CUDA_SUCCESS) { printf("Error Copying STATS from the GPU\n"); return -1; } #ifdef DEBUG // For debug, we will receive the hashes for verification memset(device.host_memory, 0, device.device_global_memory_len); if (cudaMemcpy(device.host_memory, device.device_debug_memory, device.device_global_memory_len, cudaMemcpyDeviceToHost) != CUDA_SUCCESS) { printf("Error Copying words to GPU\n"); return; } cudaThreadSynchronize(); // prints out the debug hash'es printf("MD5 registers:\n\n"); unsigned int *m = (unsigned int *)device.host_memory; for(y=0; y <= (device.max_blocks * device.max_threads); y++) { printf("------ [%d] -------\n", y); printf("A: %08x\n", m[(y * 4) + 0]); printf("B: %08x\n", m[(y * 4) + 1]); printf("C: %08x\n", m[(y * 4) + 2]); printf("D: %08x\n", m[(y * 4) + 3]); printf("-------------------\n\n"); } #endif if (device.stats.hash_found == 1) { printf("WORD FOUND: [%s]\n", md5_unpad(device.stats.word)); break; } } if (device.stats.hash_found != 1) { printf("No word could be found for the provided MD5 hash\n"); } #ifdef BENCHMARK gettimeofday(&end, NULL); long long time = (end.tv_sec * (unsigned int)1e6 + end.tv_usec) - (start.tv_sec * (unsigned int)1e6 + start.tv_usec); printf("Time taken to check %d hashes: %f seconds\n", counter, (float)((float)time / 1000.0) / 1000.0); printf("Words per second: %d\n", counter / (time / 1000) * 1000); #endif }