// Get the bounds of the chunk area that n int4 GetComputeBounds(const int2 * size, const int * neighbors) { return make_int4(neighbors[DIR_LEFT] != MPI_PROC_NULL? 0 : 1, neighbors[DIR_TOP] != MPI_PROC_NULL? 0 : 1, neighbors[DIR_RIGHT] != MPI_PROC_NULL? size->x - 1 : size->x - 2, neighbors[DIR_BOTTOM] != MPI_PROC_NULL? size->y - 1 : size->y - 2); }
/* Denoise one render tile with the CPU implementations of the denoising steps.
 *
 * denoising: task to configure and run; its callback table, filter area and
 *            buffer settings are overwritten here.
 * tile:      tile to denoise; tile.sample is set to start_sample + num_samples
 *            before the task runs. */
void denoise(DenoisingTask &denoising, RenderTile &tile)
{
	ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);

	tile.sample = tile.start_sample + tile.num_samples;

	/* Per-tile parameters. */
	denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
	denoising.render_buffer.samples = tile.sample;
	denoising.buffer.gpu_temporary_mem = false;

	/* Bind every denoising step to the matching CPUDevice member function.
	 * Each binding carries a pointer to this task as its last argument. */
	auto &functions = denoising.functions;
	functions.construct_transform = function_bind(
	    &CPUDevice::denoising_construct_transform, this, &denoising);
	functions.accumulate = function_bind(
	    &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
	functions.solve = function_bind(
	    &CPUDevice::denoising_solve, this, _1, &denoising);
	functions.divide_shadow = function_bind(
	    &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
	functions.non_local_means = function_bind(
	    &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
	functions.combine_halves = function_bind(
	    &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
	functions.get_feature = function_bind(
	    &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
	functions.write_feature = function_bind(
	    &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
	functions.detect_outliers = function_bind(
	    &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);

	denoising.run_denoising(&tile);
}
void DenoisingTask::setup_denoising_buffer() { /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); rect = rect_expand(rect, radius); rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); buffer.passes = 14; buffer.width = rect.z - rect.x; buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats); buffer.mem.alloc_to_device(mem_size, false); /* CPUs process shifts sequentially while GPUs process them in parallel. */ int num_layers; if(buffer.gpu_temporary_mem) { /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ int max_radius = max(radius, 6); int num_shifts = (2*max_radius + 1) * (2*max_radius + 1); num_layers = 2*num_shifts + 1; } else { num_layers = 3; } /* Allocate two layers per shift as well as one for the weight accumulation. */ buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); }
// Writes the six quad faces of the box into idxOut[0..5].
// Each entry holds the four vertex indices of one face; idxOut must have room for 6 elements.
void BoxShape::get6FaceQuads(int4* idxOut)
{
	static const int kFaceQuads[6][4] =
	{
		{0,3,2,1},
		{0,1,5,4},
		{1,2,6,5},
		{2,3,7,6},
		{3,0,4,7},
		{4,5,6,7},
	};

	for(int face=0; face<6; face++)
	{
		const int* quad = kFaceQuads[face];
		idxOut[face] = make_int4(quad[0], quad[1], quad[2], quad[3]);
	}
}
// Times a fill of `size` int4 elements of `buf` with the value (12, 13, 1, 2)
// and appends one line with the throughput and elapsed time to the text buffer.
//	buf:  buffer to fill
//	size: number of int4 elements to fill
//	sw:   stopwatch used to time the MyFill::execute call only
void AdlPrimitivesDemo::testFill4( Buffer<int4>& buf, int size, Stopwatch& sw )
{
	MyFill::Data* fillData = MyFill::allocate( m_deviceData );

	sw.start();
	MyFill::execute( fillData, buf, make_int4(12, 13, 1, 2), size );
	sw.stop();

	MyFill::deallocate( fillData );

	//	16 bytes per int4 element; elapsed time is in milliseconds
	const float elapsedMs = sw.getMs();
	sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "Fill int4: %3.2fGB/s (%3.2fms)", size/elapsedMs/1000/1000*16, elapsedMs);
}
void DenoisingTask::reconstruct() { storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, target_buffer.stride, target_buffer.pass_stride, target_buffer.denoising_clean_offset); reconstruction_state.source_w = rect.z-rect.x; reconstruction_state.source_h = rect.w-rect.y; device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr); }
void Cloth::addVolume(const float4* vtx, int nVtx, float* vtxMass, const int2* edges, int nEdges, const int4* tris, int nTris, float dampingFactor, float volumeTarget, float volumeFactor) { int vtxOffset = m_vtx.getSize(); int triOffset = m_vTris.getSize(); m_vTris.setSize( triOffset+nTris ); for(int i=0; i<nTris; i++) { m_vTris[triOffset+i] = make_int4(tris[i].x+vtxOffset, tris[i].y+vtxOffset, tris[i].z+vtxOffset, 0); } m_vData.pushBack( VConstraintData(triOffset, nTris, ClothSimulation::calcVolume( vtx, tris, nTris )*volumeTarget, volumeFactor) ); add( vtx, nVtx, vtxMass, edges, nEdges, dampingFactor ); }
int sapporo::set_j_particle(int address, int id, double tj, double dtj, double mass, double k18[3], double j6[3], double a2[3], double v[3], double x[3], double snp[3], double crk[3], double eps) { #ifdef DEBUG_PRINT cerr << "set_j_particle (Addr: " << address << " Id: " << id << " )\n"; #endif //Prevent unused compiler warning k18 = k18; predJOnHost = false; //Reset the buffers on the device since they can be modified nj_updated = true; //There are particles that are updated //Check if the address does not fall outside the allocated memory range //if it falls outside that range increase j-memory by 10% if (address >= nj_max) { fprintf(stderr, "Increasing nj_max! Nj_max was: %d to be stored address: %d \n", nj_max, address); increase_jMemory(); //Extra check, if we are still outside nj_max, we quit since particles are not //nicely send in order if (address >= nj_max) { fprintf(stderr, "Increasing nj_max was not enough! Send particles in order to the library! Exit\n"); exit(-1); } } //Memory has been allocated, now we can store the particles //First calculate on which device this particle has to be stored //and on which physical address on that device. 
Note that the particles //are distributed to the different devices in a round-robin way (based on the addres) int dev = address % nCUDAdevices; int devAddr = address / nCUDAdevices; int storeLoc = jCopyInformation[dev].count; //Store this information, incase particles get overwritten map<int, int4>::iterator iterator = mappingFromIndexToDevIndex.find(address); map<int, int4>::iterator end = mappingFromIndexToDevIndex.end(); if(iterator != end) { //Particle with this address has been set before, retrieve previous //calculated indices and overwrite them with the new info int4 addrInfo = (*iterator).second; dev = addrInfo.x; storeLoc = addrInfo.y; devAddr = addrInfo.z; } else { //New particle not set before, save address info and increase particles //on that specific device by one mappingFromIndexToDevIndex[address] = make_int4(dev, storeLoc, devAddr, -1); jCopyInformation[dev].count++; } deviceList[dev]->pos_j_temp[storeLoc] = make_double4(x[0], x[1], x[2], mass); deviceList[dev]->address_j[storeLoc] = devAddr; if(integrationOrder > GRAPE5) { deviceList[dev]->t_j_temp[storeLoc] = make_double2(tj, dtj); deviceList[dev]->vel_j_temp[storeLoc] = make_double4(v[0], v[1], v[2], eps); deviceList[dev]->acc_j_temp[storeLoc] = make_double4(a2[0], a2[1], a2[2], 0.0); deviceList[dev]->jrk_j_temp[storeLoc] = make_double4(j6[0], j6[1], j6[2], 0.0); deviceList[dev]->id_j_temp[storeLoc] = id; //For 6th and 8 order we need more parameters if(integrationOrder > FOURTH) { deviceList[dev]->snp_j_temp[storeLoc] = make_double4(snp[0], snp[1], snp[2], 0.0); deviceList[dev]->crk_j_temp[storeLoc] = make_double4(crk[0], crk[1], crk[2], 0.0); } } #ifdef CPU_SUPPORT //Put the new j particles directly in the correct location on the host side. 
deviceList[dev]->pos_j[devAddr] = make_double4(x[0], x[1], x[2], mass); if(integrationOrder > GRAPE5) { deviceList[dev]->t_j[devAddr] = make_double2(tj, dtj); deviceList[dev]->vel_j[devAddr] = make_double4(v[0], v[1], v[2], eps); deviceList[dev]->acc_j[devAddr] = make_double4(a2[0], a2[1], a2[2], 0.0); deviceList[dev]->jrk_j[devAddr] = make_double4(j6[0], j6[1], j6[2], 0.0); deviceList[dev]->id_j[devAddr] = id; //For 6th and 8 order we need more parameters if(integrationOrder > FOURTH) { deviceList[dev]->snp_j[devAddr] = make_double4(snp[0], snp[1], snp[2], 0.0); deviceList[dev]->crk_j[devAddr] = make_double4(crk[0], crk[1], crk[2], 0.0); } } #endif #ifdef DEBUG_PRINT if(integrationOrder == GRAPE5) { fprintf(stderr, "Setj ad: %d\tid: %d storeLoc: %d \tpos: %f %f %f m: %f \n", address, id, storeLoc, x[0],x[1],x[2], mass); } else { fprintf(stderr, "Setj ad: %d\tid: %d storeLoc: %d \tpos: %f %f %f\t mass: %f \tvel: %f %f %f", address, id, storeLoc, x[0],x[1],x[2],mass, v[0],v[1],v[2]); fprintf(stderr, "\tacc: %f %f %f \n", a2[0],a2[1],a2[2]); if(integrationOrder > FOURTH) { fprintf(stderr, "\tsnp: %f %f %f ", snp[0],snp[1],snp[2]); fprintf(stderr, "\tcrk: %f %f %f \n", crk[0],crk[1],crk[2]); } } #endif return 0; };
// Builds an int4 from a Vector4i by passing its x, y, z and w members, in that
// order, to the four-component make_int4.
__host__ int4 make_int4( const Vector4i& v )
{
	const int4 converted = make_int4( v.x, v.y, v.z, v.w );
	return converted;
}
void stream_video(void) { char stream_buf[VIDEO_INBUF_SZ + 16]; // padded so libavcodec detects the end SOCKET videoSocket = INVALID_SOCKET; int keep_waiting = 0; if (g_ip != NULL) { videoSocket = connectDroidCam(g_ip, g_port); if (videoSocket == INVALID_SOCKET) { return; } } v_running =1; _wait: // We are the server if (videoSocket == INVALID_SOCKET) { videoSocket = accept_inet_connection(g_port); if (videoSocket == INVALID_SOCKET) { goto _out; } keep_waiting = 1; } { int L = sprintf(stream_buf, VIDEO_REQ, g_webcam_w, g_webcam_h); if ( SendRecv(1, stream_buf, L, videoSocket) <= 0 ){ MSG_ERROR("Connection lost!"); goto _out; } dbgprint("Sent request, "); } memset(stream_buf, 0, sizeof(stream_buf)); if ( SendRecv(0, stream_buf, 5, videoSocket) <= 0 ){ MSG_ERROR("Connection reset!\nDroidCam is probably busy with another client"); goto _out; } if (decoder_prepare_video(stream_buf) == FALSE) { goto _out; } while (1){ int frameLen; char *p = stream_buf; if (SendRecv(0, p, 4, videoSocket) == FALSE) goto _out; make_int4(frameLen, p[0], p[1], p[2], p[3]); SetImageFrameSize(frameLen); p = GetImageFrameBuf(); while (frameLen > 4096) { if (SendRecv(0, p, 4096, videoSocket) == FALSE) goto _out; frameLen -= 4096; p += 4096; } if (SendRecv(0, p, frameLen, videoSocket) == FALSE) goto _out; if (DecodeFrame() == FALSE) break; } _out: dbgprint("disconnect\n"); disconnect(videoSocket); decoder_cleanup(); if (keep_waiting){ videoSocket = INVALID_SOCKET; goto _wait; } v_running = 0; connection_cleanup(); }
// Builds an axis-aligned box centred on the origin from its half extents.
//	- m_vtx[0..3]: the four corners at y = -halfExtent.y
//	- m_vtx[4..7]: the four corners at y = +halfExtent.y
//	- m_tris:      12 triangles, two per face
//	- m_normals:   6 entries, each from createEquation on three corners of one face
BoxShape::BoxShape( const float4& halfExtent )
: ShapeBase( SHAPE_BOX )
{
	m_halfExtent = halfExtent;

	const float hx = halfExtent.x;
	const float hy = halfExtent.y;
	const float hz = halfExtent.z;

	m_vtx[0] = make_float4( -hx, -hy,  hz );
	m_vtx[1] = make_float4(  hx, -hy,  hz );
	m_vtx[2] = make_float4(  hx, -hy, -hz );
	m_vtx[3] = make_float4( -hx, -hy, -hz );
	m_vtx[4] = make_float4( -hx,  hy,  hz );
	m_vtx[5] = make_float4(  hx,  hy,  hz );
	m_vtx[6] = make_float4(  hx,  hy, -hz );
	m_vtx[7] = make_float4( -hx,  hy, -hz );

	//	Vertex indices of the 12 triangles
	static const int kTriIdx[12][3] =
	{
		{0,3,2}, {0,2,1},
		{4,0,1}, {4,1,5},
		{5,1,2}, {5,2,6},
		{6,2,3}, {6,3,7},
		{7,3,0}, {7,0,4},
		{7,4,5}, {7,5,6},
	};
	for(int i=0; i<12; i++)
	{
		m_tris[i] = make_int4( kTriIdx[i][0], kTriIdx[i][1], kTriIdx[i][2] );
	}

	//	Three corners per face, in the order they are handed to createEquation
	static const int kPlaneIdx[6][3] =
	{
		{0,3,1},
		{4,0,1},
		{5,1,2},
		{6,2,3},
		{7,3,4},
		{7,4,5},
	};
	for(int i=0; i<6; i++)
	{
		m_normals[i] = createEquation( m_vtx[kPlaneIdx[i][0]], m_vtx[kPlaneIdx[i][1]], m_vtx[kPlaneIdx[i][2]] );
	}
}
void AdlPrimitivesDemo::render() { int size = 1024*256; // int size = 1024*64; size = NEXTMULTIPLEOF( size, 512 ); int* host1 = new int[size]; int2* host2 = new int2[size]; int4* host4 = new int4[size]; for(int i=0; i<size; i++) { host1[i] = getRandom(0,0xffff); host2[i] = make_int2( host1[i], i ); host4[i] = make_int4( host2[i].x, host2[i].y, host2[i].x, host2[i].y ); } Buffer<int> buf1( m_deviceData, size ); Buffer<int2> buf2( m_deviceData, size ); Buffer<int4> buf4( m_deviceData, size ); buf1.write( host1, size ); buf2.write( host2, size ); buf4.write( host4, size ); Stopwatch sw( m_deviceData ); m_nTxtLines = 0; sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "%d elems", size); // testSort( (Buffer<SortData>&)buf2, size, sw ); testFill1( buf1, size, sw ); testFill2( buf2, size, sw ); testFill4( buf4, size, sw ); test( buf2, size, sw ); delete [] host1; delete [] host2; delete [] host4; }