예제 #1
0
파일: Host.c 프로젝트: Haider-BA/Matlab2CPP
// Get the index bounds of the chunk area, returned as (left, top, right, bottom).
// On a side whose entry in `neighbors` is not MPI_PROC_NULL the bound reaches the
// chunk edge (0, or size - 1); on a side whose entry is MPI_PROC_NULL the bound
// is moved one cell inwards (1, or size - 2).
int4 GetComputeBounds(const int2 * size, const int * neighbors)
{
	const int left   = (neighbors[DIR_LEFT]   == MPI_PROC_NULL) ? 1 : 0;
	const int top    = (neighbors[DIR_TOP]    == MPI_PROC_NULL) ? 1 : 0;
	const int right  = (neighbors[DIR_RIGHT]  == MPI_PROC_NULL) ? size->x - 2 : size->x - 1;
	const int bottom = (neighbors[DIR_BOTTOM] == MPI_PROC_NULL) ? size->y - 2 : size->y - 1;

	return make_int4(left, top, right, bottom);
}
예제 #2
0
  /* Set up `denoising` for `tile` and run it.
   *
   * Advances tile.sample to start_sample + num_samples, copies the tile rectangle and sample
   * count into the task, binds every task callback to the matching CPUDevice::denoising_*
   * member function (each bound call receives the task pointer as its last argument), and
   * finally calls run_denoising() on the tile. */
  void denoise(DenoisingTask &denoising, RenderTile &tile)
  {
    ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);

    tile.sample = tile.start_sample + tile.num_samples;

    /* Task parameters derived from the tile. */
    denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
    denoising.render_buffer.samples = tile.sample;
    denoising.buffer.gpu_temporary_mem = false;

    /* Callbacks: forward to this device, passing the task along. */
    DenoisingTask *task = &denoising;

    denoising.functions.construct_transform = function_bind(
        &CPUDevice::denoising_construct_transform, this, task);
    denoising.functions.solve = function_bind(
        &CPUDevice::denoising_solve, this, _1, task);
    denoising.functions.write_feature = function_bind(
        &CPUDevice::denoising_write_feature, this, _1, _2, _3, task);
    denoising.functions.accumulate = function_bind(
        &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, task);
    denoising.functions.non_local_means = function_bind(
        &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, task);
    denoising.functions.detect_outliers = function_bind(
        &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, task);
    denoising.functions.divide_shadow = function_bind(
        &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, task);
    denoising.functions.get_feature = function_bind(
        &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, task);
    denoising.functions.combine_halves = function_bind(
        &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, task);

    denoising.run_denoising(&tile);
  }
예제 #3
0
void DenoisingTask::setup_denoising_buffer()
{
	/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
	rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
	rect = rect_expand(rect, radius);
	rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));

	buffer.passes = 14;
	buffer.width = rect.z - rect.x;
	buffer.stride = align_up(buffer.width, 4);
	buffer.h = rect.w - rect.y;
	int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
	buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
	/* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
	int mem_size = align_up(buffer.pass_stride * buffer.passes + 4, alignment_floats);
	buffer.mem.alloc_to_device(mem_size, false);

	/* CPUs process shifts sequentially while GPUs process them in parallel. */
	int num_layers;
	if(buffer.gpu_temporary_mem) {
		/* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
		int max_radius = max(radius, 6);
		int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
		num_layers = 2*num_shifts + 1;
	}
	else {
		num_layers = 3;
	}
	/* Allocate two layers per shift as well as one for the weight accumulation. */
	buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
}
예제 #4
0
// Write the six quads of the box into idxOut[0..5], one int4 of four indices per face.
// The indices presumably refer to the eight corners stored in m_vtx -- confirm against callers.
void BoxShape::get6FaceQuads(int4* idxOut)
{
	static const int kFaceQuads[6][4] =
	{
		{0,3,2,1},
		{0,1,5,4},
		{1,2,6,5},
		{2,3,7,6},
		{3,0,4,7},
		{4,5,6,7},
	};

	for(int face=0; face<6; face++)
	{
		const int* quad = kFaceQuads[face];
		idxOut[face] = make_int4(quad[0], quad[1], quad[2], quad[3]);
	}
}
// Time one MyFill::execute call that fills `size` elements of `buf` with the int4 (12, 13, 1, 2),
// then append a line with the throughput and the elapsed milliseconds to the on-screen text buffer.
void AdlPrimitivesDemo::testFill4( Buffer<int4>& buf, int size, Stopwatch& sw )
{
	MyFill::Data* fillData = MyFill::allocate( m_deviceData );

	// Only the execute call sits between start() and stop().
	sw.start();
	MyFill::execute( fillData, buf, make_int4(12, 13, 1, 2), size );
	sw.stop();

	MyFill::deallocate( fillData );

	// The factor 16 is presumably sizeof(int4) in bytes -- confirm.
	float elapsedMs = sw.getMs();
	sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY,
		"Fill int4: %3.2fGB/s (%3.2fms)", size/elapsedMs/1000/1000*16, elapsedMs);
}
예제 #6
0
/* Allocate the XtWX / XtWY storage, fill in reconstruction_state from the filter area,
 * the working rect and the target buffer, and invoke the reconstruct callback. */
void DenoisingTask::reconstruct()
{
	/* One XTWX_SIZE / XTWY_SIZE block per pixel of the storage window. */
	storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
	storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);

	/* Size of the working rect. */
	reconstruction_state.source_w = rect.z-rect.x;
	reconstruction_state.source_h = rect.w-rect.y;

	/* Filter window, with its origin given relative to the working rect. */
	reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x,
	                                                     filter_area.y-rect.y,
	                                                     storage.w,
	                                                     storage.h);

	/* Offset of the filter area's (x, y) inside the target buffer, in elements of its stride. */
	int filter_origin_offset = filter_area.y*target_buffer.stride + filter_area.x;
	reconstruction_state.buffer_params = make_int4(target_buffer.offset + filter_origin_offset,
	                                               target_buffer.stride,
	                                               target_buffer.pass_stride,
	                                               target_buffer.denoising_clean_offset);

	/* Sub-ranges of buffer.mem: three pass strides each, starting at pass 8 and pass 11.
	 * NOTE(review): presumably the color passes and their variance -- confirm against the buffer layout. */
	device_sub_ptr color_ptr    (buffer.mem,  8*buffer.pass_stride, 3*buffer.pass_stride);
	device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);

	functions.reconstruct(*color_ptr, *color_var_ptr, target_buffer.ptr);
}
예제 #7
0
void Cloth::addVolume(const float4* vtx, int nVtx, float* vtxMass,
					  const int2* edges, int nEdges, 
					  const int4* tris, int nTris, float dampingFactor,
					  float volumeTarget, float volumeFactor)
{
	int vtxOffset = m_vtx.getSize();

	int triOffset = m_vTris.getSize();

	m_vTris.setSize( triOffset+nTris );
	for(int i=0; i<nTris; i++)
	{
		m_vTris[triOffset+i] = make_int4(tris[i].x+vtxOffset, 
			tris[i].y+vtxOffset, 
			tris[i].z+vtxOffset, 0);
	}

	m_vData.pushBack( VConstraintData(triOffset, nTris, 
		ClothSimulation::calcVolume( vtx, tris, nTris )*volumeTarget, volumeFactor) );

	add( vtx, nVtx, vtxMass, edges, nEdges, dampingFactor );
}
예제 #8
0
int sapporo::set_j_particle(int    address,
                            int    id,
                            double tj, double dtj,
                            double mass,
                            double k18[3],       double j6[3],
                            double a2[3],        double v[3],
                            double x[3],         double snp[3],
                            double crk[3],       double eps) {

  #ifdef DEBUG_PRINT
    cerr << "set_j_particle (Addr: " << address << "  Id: " << id << " )\n";
  #endif

  //Prevent unused compiler warning
  k18 = k18;
    
  predJOnHost  = false; //Reset the buffers on the device since they can be modified
  nj_updated   = true;  //There are particles that are updated

  //Check if the address does not fall outside the allocated memory range
  //if it falls outside that range increase j-memory by 10%
    
  if (address >= nj_max) {
    fprintf(stderr, "Increasing nj_max! Nj_max was: %d  to be stored address: %d \n",
            nj_max, address);
    increase_jMemory();

    //Extra check, if we are still outside nj_max, we quit since particles are not
    //nicely send in order
    if (address >= nj_max) {
      fprintf(stderr, "Increasing nj_max was not enough! Send particles in order to the library! Exit\n");
      exit(-1);
    }
  }

  //Memory has been allocated, now we can store the particles
  //First calculate on which device this particle has to be stored
  //and on which physical address on that device. Note that the particles
  //are distributed to the different devices in a round-robin way (based on the addres)
  int dev           = address % nCUDAdevices;
  int devAddr       = address / nCUDAdevices;
  int storeLoc      = jCopyInformation[dev].count;

  //Store this information, incase particles get overwritten
  map<int, int4>::iterator iterator = mappingFromIndexToDevIndex.find(address);
  map<int, int4>::iterator end      = mappingFromIndexToDevIndex.end();


  if(iterator != end)
  {
    //Particle with this address has been set before, retrieve previous
    //calculated indices and overwrite them with the new info
    int4 addrInfo = (*iterator).second;
    dev           = addrInfo.x;
    storeLoc      = addrInfo.y;
    devAddr       = addrInfo.z;
  }
  else
  {
    //New particle not set before, save address info and increase particles
    //on that specific device by one
    mappingFromIndexToDevIndex[address] = make_int4(dev, storeLoc, devAddr, -1);
    jCopyInformation[dev].count++;
  }


  deviceList[dev]->pos_j_temp[storeLoc] = make_double4(x[0], x[1], x[2], mass);
  deviceList[dev]->address_j[storeLoc]  = devAddr;
  
  if(integrationOrder > GRAPE5)
  {
    deviceList[dev]->t_j_temp[storeLoc]          = make_double2(tj, dtj);
    deviceList[dev]->vel_j_temp[storeLoc]        = make_double4(v[0], v[1], v[2], eps);
    deviceList[dev]->acc_j_temp[storeLoc]        = make_double4(a2[0], a2[1], a2[2], 0.0);
    deviceList[dev]->jrk_j_temp[storeLoc]        = make_double4(j6[0], j6[1], j6[2], 0.0);
    deviceList[dev]->id_j_temp[storeLoc]         = id;
    //For 6th and 8 order we need more parameters
    if(integrationOrder > FOURTH)
    {
      deviceList[dev]->snp_j_temp[storeLoc]        = make_double4(snp[0], snp[1], snp[2], 0.0);
      deviceList[dev]->crk_j_temp[storeLoc]        = make_double4(crk[0], crk[1], crk[2], 0.0);
    }
  }
  

  #ifdef CPU_SUPPORT
    //Put the new j particles directly in the correct location on the host side.
    deviceList[dev]->pos_j[devAddr] = make_double4(x[0], x[1], x[2], mass);
    
    if(integrationOrder > GRAPE5)
    {
      deviceList[dev]->t_j[devAddr]          = make_double2(tj, dtj);
      deviceList[dev]->vel_j[devAddr]        = make_double4(v[0], v[1], v[2], eps);
      deviceList[dev]->acc_j[devAddr]        = make_double4(a2[0], a2[1], a2[2], 0.0);
      deviceList[dev]->jrk_j[devAddr]        = make_double4(j6[0], j6[1], j6[2], 0.0);
      deviceList[dev]->id_j[devAddr]         = id;
      //For 6th and 8 order we need more parameters
      if(integrationOrder > FOURTH)
      {
        deviceList[dev]->snp_j[devAddr]        = make_double4(snp[0], snp[1], snp[2], 0.0);
        deviceList[dev]->crk_j[devAddr]        = make_double4(crk[0], crk[1], crk[2], 0.0);
      }
    }  
  #endif


  #ifdef DEBUG_PRINT
    if(integrationOrder == GRAPE5)
    {
      fprintf(stderr, "Setj ad: %d\tid: %d storeLoc: %d \tpos: %f %f %f m: %f \n", address, id, storeLoc, x[0],x[1],x[2], mass);
    }
    else
    {
      fprintf(stderr, "Setj ad: %d\tid: %d storeLoc: %d \tpos: %f %f %f\t mass: %f \tvel: %f %f %f", address, id, storeLoc, x[0],x[1],x[2],mass, v[0],v[1],v[2]);
      fprintf(stderr, "\tacc: %f %f %f \n", a2[0],a2[1],a2[2]);
      if(integrationOrder > FOURTH)
      {
        fprintf(stderr, "\tsnp: %f %f %f ", snp[0],snp[1],snp[2]);
        fprintf(stderr, "\tcrk: %f %f %f \n", crk[0],crk[1],crk[2]);
      }
    }
  #endif

  return 0;
};
예제 #9
0
// Convert a Vector4i to an int4 by copying its x, y, z and w components.
__host__
int4 make_int4( const Vector4i& v )
{
    const int4 converted = make_int4( v.x, v.y, v.z, v.w );
    return converted;
}
예제 #10
0
/*
 * Run the video streaming session(s).
 *
 * If g_ip is set, connect to that address once (client mode) and return
 * immediately if the connection fails. Otherwise accept incoming connections
 * on g_port (server mode) and, after each session ends, go back to waiting for
 * the next one.
 *
 * Per session: send the VIDEO_REQ request with the configured webcam size,
 * read a 5 byte reply and hand it to decoder_prepare_video(), then repeatedly
 * read a 4 byte frame length followed by that many bytes of frame data and
 * decode the frame. Any receive error, an invalid frame length or a decode
 * failure ends the session.
 *
 * v_running is set to 1 while streaming and cleared before returning.
 */
void stream_video(void) {
    char stream_buf[VIDEO_INBUF_SZ + 16]; // padded so libavcodec detects the end
    SOCKET videoSocket = INVALID_SOCKET;
    int keep_waiting = 0;

    if (g_ip != NULL) {
        videoSocket = connectDroidCam(g_ip, g_port);
        if (videoSocket == INVALID_SOCKET) {
            return;
        }
    }
    v_running  =1;
_wait:
    // We are the server
    if (videoSocket == INVALID_SOCKET) {
        videoSocket = accept_inet_connection(g_port);
        if (videoSocket == INVALID_SOCKET) { goto _out; }
        keep_waiting = 1;
    }
    {
        int L = sprintf(stream_buf, VIDEO_REQ, g_webcam_w, g_webcam_h);
        if ( SendRecv(1, stream_buf, L, videoSocket) <= 0 ){
            MSG_ERROR("Connection lost!");
            goto _out;
        }
        dbgprint("Sent request, ");
    }
    memset(stream_buf, 0, sizeof(stream_buf));
    if ( SendRecv(0, stream_buf, 5, videoSocket) <= 0 ){
        MSG_ERROR("Connection reset!\nDroidCam is probably busy with another client");
        goto _out;
    }

    if (decoder_prepare_video(stream_buf) == FALSE) { goto _out; }

    while (1){
        int frameLen;
        char *p = stream_buf;

        // Every receive below uses the same `<= 0` failure test as the calls
        // above, so a negative error return is not mistaken for success.
        if (SendRecv(0, p, 4, videoSocket) <= 0) goto _out;
        make_int4(frameLen, p[0], p[1], p[2], p[3]);

        // The length comes straight from the network; a negative value can
        // never describe a frame, so drop the connection instead of using it.
        // NOTE(review): there is no upper bound check against the capacity of
        // the buffer returned by GetImageFrameBuf() -- confirm that
        // SetImageFrameSize() guarantees enough room.
        if (frameLen < 0) goto _out;
        SetImageFrameSize(frameLen);

        // Read the frame in pieces of at most 4096 bytes.
        p = GetImageFrameBuf();
        while (frameLen > 4096) {
            if (SendRecv(0, p, 4096, videoSocket) <= 0) goto _out;
            frameLen -= 4096;
            p += 4096;
        }
        if (SendRecv(0, p, frameLen, videoSocket) <= 0) goto _out;
        if (DecodeFrame() == FALSE) break;
    }

_out:
    dbgprint("disconnect\n");
    disconnect(videoSocket);
    decoder_cleanup();

    // In server mode, go back and wait for the next client.
    if (keep_waiting){
        videoSocket = INVALID_SOCKET;
        goto _wait;
    }
    v_running = 0;
    connection_cleanup();
}
예제 #11
0
// Build a box shape from its half extents: stores the half extents, the eight corner
// vertices, twelve triangles (two per face) and six plane equations created from
// corner triples.
BoxShape::BoxShape( const float4& halfExtent )
: ShapeBase( SHAPE_BOX )
{
	m_halfExtent = halfExtent;

	const float4& he = halfExtent;

	// Corners 0-3 lie at y = -he.y, corners 4-7 at y = +he.y.
	m_vtx[0] = make_float4( -he.x, -he.y,  he.z );
	m_vtx[1] = make_float4(  he.x, -he.y,  he.z );
	m_vtx[2] = make_float4(  he.x, -he.y, -he.z );
	m_vtx[3] = make_float4( -he.x, -he.y, -he.z );
	m_vtx[4] = make_float4( -he.x,  he.y,  he.z );
	m_vtx[5] = make_float4(  he.x,  he.y,  he.z );
	m_vtx[6] = make_float4(  he.x,  he.y, -he.z );
	m_vtx[7] = make_float4( -he.x,  he.y, -he.z );

	// Corner indices of the triangles, two consecutive rows per face.
	static const int kTris[12][3] =
	{
		{0,3,2}, {0,2,1},
		{4,0,1}, {4,1,5},
		{5,1,2}, {5,2,6},
		{6,2,3}, {6,3,7},
		{7,3,0}, {7,0,4},
		{7,4,5}, {7,5,6},
	};
	for(int t=0; t<12; t++)
	{
		m_tris[t] = make_int4(kTris[t][0], kTris[t][1], kTris[t][2]);
	}

	// Corner triples handed to createEquation, one row per plane.
	static const int kPlaneCorners[6][3] =
	{
		{0,3,1},
		{4,0,1},
		{5,1,2},
		{6,2,3},
		{7,3,4},
		{7,4,5},
	};
	for(int f=0; f<6; f++)
	{
		const int* c = kPlaneCorners[f];
		m_normals[f] = createEquation( m_vtx[c[0]], m_vtx[c[1]], m_vtx[c[2]] );
	}
}
// Fill int, int2 and int4 device buffers with test data, reset the on-screen text
// and run the fill tests and test() on them.
void AdlPrimitivesDemo::render()
{
	int size = 1024*256;
//	int size = 1024*64;
	size = NEXTMULTIPLEOF( size, 512 );

	// Host-side source data: a random key, (key, index) and (key, index, key, index).
	int* host1 = new int[size];
	int2* host2 = new int2[size];
	int4* host4 = new int4[size];
	for(int i=0; i<size; i++)
	{
		const int key = getRandom(0,0xffff);
		host1[i] = key;
		host2[i] = make_int2( key, i );
		host4[i] = make_int4( key, i, key, i );
	}

	// Device buffers, initialized from the host arrays.
	Buffer<int> buf1( m_deviceData, size );
	Buffer<int2> buf2( m_deviceData, size );
	Buffer<int4> buf4( m_deviceData, size );
	buf1.write( host1, size );
	buf2.write( host2, size );
	buf4.write( host4, size );

	Stopwatch sw( m_deviceData );

	// Restart the text output with the element count as its first line.
	m_nTxtLines = 0;
	sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "%d elems", size);

//	testSort( (Buffer<SortData>&)buf2, size, sw );
	testFill1( buf1, size, sw );
	testFill2( buf2, size, sw );
	testFill4( buf4, size, sw );

	test( buf2, size, sw );

	delete [] host4;
	delete [] host2;
	delete [] host1;
}