Example #1
unsigned int *
madd(unsigned int *a, unsigned int *b)
{
	if (MSIGN(a) == MSIGN(b))
		return addf(a, b);	// same sign, add together
	else
		return subf(a, b);	// opposite sign, find difference
}
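The dispatch in madd is the whole point: operands with the same sign add their magnitudes, while opposite signs reduce to a magnitude difference. Below is a self-contained sketch of the same logic on a toy sign-magnitude type; the struct and helpers are illustrative stand-ins, not the library's actual addf/subf.

typedef struct { int sign; unsigned int mag; } sm;	/* toy sign-magnitude number */

static sm addf_toy(sm a, sm b)	/* same sign: add magnitudes, keep the common sign */
{
	sm r = { a.sign, a.mag + b.mag };
	return r;
}

static sm subf_toy(sm a, sm b)	/* opposite signs: result takes the sign of the larger magnitude */
{
	sm r;
	if (a.mag >= b.mag) { r.sign = a.sign; r.mag = a.mag - b.mag; }
	else                { r.sign = b.sign; r.mag = b.mag - a.mag; }
	return r;
}

static sm madd_toy(sm a, sm b)	/* mirrors the dispatch in madd above */
{
	return (a.sign == b.sign) ? addf_toy(a, b) : subf_toy(a, b);
}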
Example #2
void
pcl::gpu::PseudoConvexHull3D::reconstruct (const Cloud &cloud, DeviceArray2D<int>& vertexes)
{
    const device::Cloud& c = (const device::Cloud&)cloud;

    device::FacetStream& fs = impl_->fs;
    device::PointStream ps(c);

    ps.computeInitalSimplex();

    device::InitalSimplex simplex = ps.simplex;

    fs.setInitialFacets(ps.simplex);
    ps.initalClassify();

    for(;;)
    {
        // number of external points remaining
        ps.cloud_size = ps.searchFacetHeads(fs.facet_count, fs.head_points);
        if (ps.cloud_size == 0)
            break;

        fs.compactFacets();
        ps.classify(fs);

        if (!fs.canSplit())
            throw PCLException("Can't split facets, please enlarge default buffer", __FILE__, "", __LINE__);

        fs.splitFacets();
    }

    int ecount;
    int fcount = fs.facet_count;
    fs.empty_count.download(&ecount);

    vertexes.create(3, fcount + ecount);
    DeviceArray2D<int> subf(3, fcount, vertexes.ptr(),        vertexes.step());
    DeviceArray2D<int> sube(3, ecount, vertexes.ptr()+fcount, vertexes.step());

    DeviceArray2D<int>(3, fcount, fs.verts_inds.ptr(), fs.verts_inds.step()).copyTo(subf);
    DeviceArray2D<int>(3, ecount, fs.empty_facets.ptr(), fs.empty_facets.step()).copyTo(sube);
}
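A minimal calling sketch. It assumes Cloud is the class's DeviceArray<pcl::PointXYZ> typedef, that the constructor takes an internal buffer size, and that the header lives under pcl/gpu/surface/ — all assumptions about the PCL GPU surface API, not shown in the example above:

#include <vector>
#include <pcl/point_types.h>
#include <pcl/gpu/containers/device_array.h>
#include <pcl/gpu/surface/convex_hull.h>       // header path is an assumption

void hull_demo (const std::vector<pcl::PointXYZ>& host_points)
{
    pcl::gpu::PseudoConvexHull3D::Cloud cloud;
    cloud.upload(host_points);                 // copy the point set to the GPU

    pcl::gpu::PseudoConvexHull3D pch(1 << 20); // internal buffer size: assumption
    pcl::gpu::DeviceArray2D<int> vertexes;
    pch.reconstruct(cloud, vertexes);          // fills a 3 x (fcount + ecount) index array
}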
Example #3
void C1_MacroAssembler::initialize_body(Register obj, Register tmp1, Register tmp2,
                                        int obj_size_in_bytes, int hdr_size_in_bytes) {
  const int index = (obj_size_in_bytes - hdr_size_in_bytes) / HeapWordSize;

  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size>>3,
            cl_dw_addr_bits = exact_log2(cl_dwords);

  const Register tmp = R0,
                 base_ptr = tmp1,
                 cnt_dwords = tmp2;

  if (index <= 6) {
    // Use explicit NULL stores.
    if (index > 0) { li(tmp, 0); }
    for (int i = 0; i < index; ++i) { std(tmp, hdr_size_in_bytes + i * HeapWordSize, obj); }

  } else if (index < (2<<cl_dw_addr_bits)-1) {
    // simple loop
    Label loop;

    li(cnt_dwords, index);
    addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element.
    li(tmp, 0);
    mtctr(cnt_dwords);                      // Load counter.
  bind(loop);
    std(tmp, 0, base_ptr);                  // Clear 8-byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(loop);

  } else {
    // like clear_memory_doubleword
    Label startloop, fast, fastloop, restloop, done;

    addi(base_ptr, obj, hdr_size_in_bytes);           // Compute address of first element.
    load_const_optimized(cnt_dwords, index);
    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
    beq(CCR0, fast);                                  // Already 128-byte aligned.

    subfic(tmp, tmp, cl_dwords);
    mtctr(tmp);                        // Set ctr to hit 128-byte boundary (0<ctr<cl_dwords).
    subf(cnt_dwords, tmp, cnt_dwords); // rest.
    li(tmp, 0);

  bind(startloop);                     // Clear at the beginning to reach 128-byte boundary.
    std(tmp, 0, base_ptr);             // Clear 8-byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(startloop);

  bind(fast);                                  // Clear 128-byte blocks.
    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128-byte loop (>0).
    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
    mtctr(tmp);                                // Load counter.

  bind(fastloop);
    dcbz(base_ptr);                    // Clear 128-byte aligned block.
    addi(base_ptr, base_ptr, cl_size);
    bdnz(fastloop);

    cmpdi(CCR0, cnt_dwords, 0);        // size 0?
    beq(CCR0, done);                   // rest == 0
    li(tmp, 0);
    mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                      // Clear rest.
    std(tmp, 0, base_ptr);             // Clear 8-byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(restloop);

  bind(done);
  }
}
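The emitted code is easier to follow as plain C++: a prologue of 8-byte stores up to the next cache-line boundary, dcbz over whole cache lines, then an epilogue for the leftover dwords. A behavior-level sketch (not the generated PPC code; names are illustrative):

#include <cstddef>
#include <cstdint>

// Zero n_dwords 8-byte words at base (assumed 8-byte aligned); cl_dwords is the
// cache line size in dwords and, as in the asm above, a power of two.
void clear_dwords(uint64_t* base, size_t n_dwords, size_t cl_dwords) {
  size_t misalign = (reinterpret_cast<uintptr_t>(base) >> 3) & (cl_dwords - 1);
  if (misalign != 0) {                               // 'startloop': walk up to the next line boundary
    size_t head = cl_dwords - misalign;
    if (head > n_dwords) head = n_dwords;            // guard for tiny buffers (the asm path assumes a large n)
    for (size_t i = 0; i < head; ++i) *base++ = 0;
    n_dwords -= head;
  }
  size_t lines = n_dwords / cl_dwords;               // 'fastloop': the asm clears each full line with one dcbz
  size_t rest  = n_dwords % cl_dwords;               // 'restloop': leftover dwords after the last full line
  for (size_t i = 0; i < lines * cl_dwords; ++i) *base++ = 0;
  for (size_t i = 0; i < rest; ++i) *base++ = 0;
}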
Example #4
// Call an accessor method (assuming it is resolved; otherwise drop into the
// vanilla (slow path) entry).
address InterpreterGenerator::generate_accessor_entry(void) {
  if (!UseFastAccessorMethods && (!FLAG_IS_ERGO(UseFastAccessorMethods))) {
    return NULL;
  }

  Label Lslow_path, Lacquire;

  const Register
         Rclass_or_obj = R3_ARG1,
         Rconst_method = R4_ARG2,
         Rcodes        = Rconst_method,
         Rcpool_cache  = R5_ARG3,
         Rscratch      = R11_scratch1,
         Rjvmti_mode   = Rscratch,
         Roffset       = R12_scratch2,
         Rflags        = R6_ARG4,
         Rbtable       = R7_ARG5;

  static address branch_table[number_of_states];

  address entry = __ pc();

  // Check for safepoint:
  // Ditch this, real men don't need safepoint checks.

  // Also check for JVMTI mode
  // Check for null obj, take slow path if so.
  __ ld(Rclass_or_obj, Interpreter::stackElementSize, CC_INTERP_ONLY(R17_tos) NOT_CC_INTERP(R15_esp));
  __ lwz(Rjvmti_mode, thread_(interp_only_mode));
  __ cmpdi(CCR1, Rclass_or_obj, 0);
  __ cmpwi(CCR0, Rjvmti_mode, 0);
  __ crorc(/*CCR0 eq*/2, /*CCR1 eq*/4+2, /*CCR0 eq*/2);
  __ beq(CCR0, Lslow_path); // this==null or jvmti_mode!=0

  // Do 2 things in parallel:
  // 1. Load the index out of the first instruction word, which looks like this:
  //    <0x2a><0xb4><index (2 bytes, native endianness)>.
  // 2. Load constant pool cache base.
  __ ld(Rconst_method, in_bytes(Method::const_offset()), R19_method);
  __ ld(Rcpool_cache, in_bytes(ConstMethod::constants_offset()), Rconst_method);

  __ lhz(Rcodes, in_bytes(ConstMethod::codes_offset()) + 2, Rconst_method); // Lower half of 32 bit field.
  __ ld(Rcpool_cache, ConstantPool::cache_offset_in_bytes(), Rcpool_cache);

  // Get the const pool entry by means of <index>.
  const int codes_shift = exact_log2(in_words(ConstantPoolCacheEntry::size()) * BytesPerWord);
  __ slwi(Rscratch, Rcodes, codes_shift); // (codes&0xFFFF)<<codes_shift
  __ add(Rcpool_cache, Rscratch, Rcpool_cache);

  // Check if cpool cache entry is resolved.
  // We are resolved if the indices offset contains the current bytecode.
  ByteSize cp_base_offset = ConstantPoolCache::base_offset();
  // Big Endian:
  __ lbz(Rscratch, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::indices_offset()) + 7 - 2, Rcpool_cache);
  __ cmpwi(CCR0, Rscratch, Bytecodes::_getfield);
  __ bne(CCR0, Lslow_path);
  __ isync(); // Order succeeding loads wrt. load of _indices field from cpool_cache.

  // Finally, start loading the value: Get cp cache entry into regs.
  __ ld(Rflags, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::flags_offset()), Rcpool_cache);
  __ ld(Roffset, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::f2_offset()), Rcpool_cache);

  // Following code is from templateTable::getfield_or_static
  // Load pointer to branch table
  __ load_const_optimized(Rbtable, (address)branch_table, Rscratch);

  // Get volatile flag
  __ rldicl(Rscratch, Rflags, 64-ConstantPoolCacheEntry::is_volatile_shift, 63); // extract volatile bit
  // note: sync is needed before volatile load on PPC64

  // Check field type
  __ rldicl(Rflags, Rflags, 64-ConstantPoolCacheEntry::tos_state_shift, 64-ConstantPoolCacheEntry::tos_state_bits);

#ifdef ASSERT
  Label LFlagInvalid;
  __ cmpldi(CCR0, Rflags, number_of_states);
  __ bge(CCR0, LFlagInvalid);

  __ ld(R9_ARG7, 0, R1_SP);
  __ ld(R10_ARG8, 0, R21_sender_SP);
  __ cmpd(CCR0, R9_ARG7, R10_ARG8);
  __ asm_assert_eq("backlink", 0x543);
#endif // ASSERT
  __ mr(R1_SP, R21_sender_SP); // Cut the stack back to where the caller started.

  // Load from branch table and dispatch (volatile case: one instruction ahead)
  __ sldi(Rflags, Rflags, LogBytesPerWord);
  __ cmpwi(CCR6, Rscratch, 1); // volatile?
  if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
    __ sldi(Rscratch, Rscratch, exact_log2(BytesPerInstWord)); // volatile ? size of 1 instruction : 0
  }
  __ ldx(Rbtable, Rbtable, Rflags);

  if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
    __ subf(Rbtable, Rscratch, Rbtable); // point to volatile/non-volatile entry point
  }
  __ mtctr(Rbtable);
  __ bctr();

#ifdef ASSERT
  __ bind(LFlagInvalid);
  __ stop("got invalid flag", 0x6541);

  bool all_uninitialized = true,
       all_initialized   = true;
  for (int i = 0; i<number_of_states; ++i) {
    all_uninitialized = all_uninitialized && (branch_table[i] == NULL);
    all_initialized   = all_initialized   && (branch_table[i] != NULL);
  }
  assert(all_uninitialized != all_initialized, "consistency"); // either or

  __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
  if (branch_table[vtos] == 0) branch_table[vtos] = __ pc(); // non-volatile_entry point
  if (branch_table[dtos] == 0) branch_table[dtos] = __ pc(); // non-volatile_entry point
  if (branch_table[ftos] == 0) branch_table[ftos] = __ pc(); // non-volatile_entry point
  __ stop("unexpected type", 0x6551);
#endif

  if (branch_table[itos] == 0) { // generate only once
    __ align(32, 28, 28); // align load
    __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
    branch_table[itos] = __ pc(); // non-volatile_entry point
    __ lwax(R3_RET, Rclass_or_obj, Roffset);
    __ beq(CCR6, Lacquire);
    __ blr();
  }

  if (branch_table[ltos] == 0) { // generate only once
    __ align(32, 28, 28); // align load
    __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
    branch_table[ltos] = __ pc(); // non-volatile_entry point
    __ ldx(R3_RET, Rclass_or_obj, Roffset);
    __ beq(CCR6, Lacquire);
    __ blr();
  }

  if (branch_table[btos] == 0) { // generate only once
    __ align(32, 28, 28); // align load
    __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
    branch_table[btos] = __ pc(); // non-volatile_entry point
    __ lbzx(R3_RET, Rclass_or_obj, Roffset);
    __ extsb(R3_RET, R3_RET);
    __ beq(CCR6, Lacquire);
    __ blr();
  }

  if (branch_table[ctos] == 0) { // generate only once
    __ align(32, 28, 28); // align load
    __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
    branch_table[ctos] = __ pc(); // non-volatile_entry point
    __ lhzx(R3_RET, Rclass_or_obj, Roffset);
    __ beq(CCR6, Lacquire);
    __ blr();
  }

  if (branch_table[stos] == 0) { // generate only once
    __ align(32, 28, 28); // align load
    __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
    branch_table[stos] = __ pc(); // non-volatile_entry point
    __ lhax(R3_RET, Rclass_or_obj, Roffset);
    __ beq(CCR6, Lacquire);
    __ blr();
  }

  if (branch_table[atos] == 0) { // generate only once
    __ align(32, 28, 28); // align load
    __ fence(); // volatile entry point (one instruction before non-volatile_entry point)
    branch_table[atos] = __ pc(); // non-volatile_entry point
    __ load_heap_oop(R3_RET, (RegisterOrConstant)Roffset, Rclass_or_obj);
    __ verify_oop(R3_RET);
    //__ dcbt(R3_RET); // prefetch
    __ beq(CCR6, Lacquire);
    __ blr();
  }

  __ align(32, 12);
  __ bind(Lacquire);
  __ twi_0(R3_RET);
  __ isync(); // acquire
  __ blr();

#ifdef ASSERT
  for (int i = 0; i<number_of_states; ++i) {
    assert(branch_table[i], "accessor_entry initialization");
    //tty->print_cr("accessor_entry: branch_table[%d] = 0x%llx (opcode 0x%llx)", i, branch_table[i], *((unsigned int*)branch_table[i]));
  }
#endif

  __ bind(Lslow_path);
  __ branch_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals), Rscratch);
  __ flush();

  return entry;
}
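The heart of the entry is the lazily filled branch_table: one code stub per top-of-stack state, generated on first request and then jumped through via mtctr/bctr. A plain-C++ analogue using function pointers (states and stubs cut down to two; names are illustrative, not HotSpot's):

#include <cstdio>

enum TosState { itos, ltos, number_of_states };  // reduced to two states for the sketch

typedef void (*Stub)();
static Stub branch_table[number_of_states];      // zero-initialized, like the static array above

static void load_int_stub()  { std::puts("load 32-bit field"); }
static void load_long_stub() { std::puts("load 64-bit field"); }

static void dispatch(TosState tos) {
  if (branch_table[tos] == nullptr) {            // "generate only once"
    branch_table[tos] = (tos == itos) ? load_int_stub : load_long_stub;
  }
  branch_table[tos]();                           // the asm dispatches with ldx + mtctr + bctr instead
}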
Example #5
void HuygensOnCPU::calcFieldResponse(cuComplex *d_res, 
									 const unsigned int nObs, const float *coordObs, 
									 const unsigned int nSrc, const float *coordSrc, 
									 const float *fSrc, const float *apodSrc, 
									 const float *steerFocusDelaySrc, 
									 const float *srcTimeStamp, 
									 const unsigned int *srcPulseLength, 
									 const float timestampObs, 
									 const float refTime, 
									 const float c0,
									 const bool resultOnGPU) 
{
	// host-side response buffer: one complex sample per observation point
	cuComplex *resp = (cuComplex*) malloc(sizeof(cuComplex)*nObs);

   #pragma omp parallel for
	for (int index = 0; index < (int)nObs; index++)
	{

		if (index < nObs) // bounds guard inherited from the CUDA kernel version; always true in this loop
		{
			// current observation point
			float3 obs = make_float3(coordObs, index, nObs);

			// initialize the response accumulator
			cuComplex respTemp = make_cuComplex(0.0f, 0.0f);

			// loop over all source points
			for (int n = 0; n < nSrc; n++) 
			{
				// Optimization plans (from the original CUDA kernel):
				// all threads will read this value! TODO: check whether this read gets broadcast. Info: broadcasting only works for shared memory!
				// If not -> TODO: delegate one read into shared memory to each thread. If nSrc > blockDim.x, delegate multiple reads to each thread.
				// For CUDA 2.0: shared memory == user-managed L2 cache. Broadcasting might however help improve memory throughput.
				float3 src = make_float3(coordSrc, n, nSrc);  

				// fetch steer-focus delay, src timestamp, frequency and pulse length from global memory
				float tStFo			= steerFocusDelaySrc[n];	
				float timestampSrc	= srcTimeStamp[n];			
				float frequencySrc	= fSrc[n];					
				uint pulseL			= srcPulseLength[n];
				float apod			= apodSrc[n];

				// distance from source to current observation point
				float dist = absf(subf(obs,src));

				// clamp dist to prevent huge amplitude values
				float minDist = c0/(2*frequencySrc); 
				dist = (minDist < dist)? dist : minDist; 

				float timeOfFlight = dist / c0;

				float currentFlightTime = timestampObs - timestampSrc - tStFo;

				if (pulseL == 0) // cw (continuous wave)
				{
					if (currentFlightTime > timeOfFlight) // source is alive in this obs point
					{
						// find actual time for response computation
						float t = timeOfFlight - currentFlightTime;
						//float t = currentFlightTime - timeOfFlight;

						// calc attenuation
						float att = dist;

						float r = apod / att;				// complex modulus of Green's function
						float p = 2.0f * PI * frequencySrc * t;	// complex phase of Green's function

						// calc response of source in current obs point
						float cosptr = cos(p); 
						float sinptr = sin(p);  
						cuComplex newResp = make_cuComplex(r*cosptr, r*sinptr);

						respTemp = cuCaddf(respTemp, newResp); // add to respTemp
					}
				} 
				else // pw (pulsed wave)
				{
					float pulseLInSec = pulseL / frequencySrc;
					float halfPulseLInSec = pulseLInSec / 2.0f;

					if (currentFlightTime > timeOfFlight - halfPulseLInSec && currentFlightTime < timeOfFlight + halfPulseLInSec)
					{
						// pw source is alive in this observation point
						float t = timeOfFlight - currentFlightTime;

						// calc attenuation
						float att = dist;

						float cosWeightPuls = cosf(PI * t / pulseLInSec);
						cosWeightPuls *= cosWeightPuls;

						float r = cosWeightPuls * apod / att;	// complex modulus of Green's function
						float p = 2.0f * PI * frequencySrc * t;		// complex phase of Green's function

						// calc response of source in current obs point
						float cosptr = cos(p);
						float sinptr = sin(p);
						cuComplex newResp = make_cuComplex(r*cosptr, r*sinptr);

						respTemp = cuCaddf(respTemp, newResp); // add to respTemp

					} else {

						if (timeOfFlight <= 1/frequencySrc) // hack to get a relatively bright spot in pw mode; prevents depth normalization from happening
						{
							float cosWeightFac = cosf(PI * timeOfFlight / (2/frequencySrc));
							respTemp = cuCaddf(respTemp, make_cuComplex(cosWeightFac * cosWeightFac / dist, 0));
						}
					}
				}
			}
			resp[index] = respTemp; // save response for this observation point
		}
	}

	// copy calculated field to the GPU for presentation
	if (resultOnGPU) {
		cudaMemcpy(d_res, resp, sizeof(cuComplex)*nObs, cudaMemcpyHostToDevice);
	} else {
		memcpy(d_res, resp, sizeof(cuComplex)*nObs);
	}

	free(resp);

	//free((void *)apodSrc);
	//free((void *)coordObs); // this one is now cleaned up by the observation object
	//free((void *)coordSrc);
	//free((void *)fSrc);
	//free((void *)steerFocusDelaySrc);
	//free((void *)srcTimeStamp);
	//free((void *)srcPulseLength);
}
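The example leans on a few float3 helpers that are not standard CUDA: a make_float3 overload taking a pointer plus stride, subf, and absf. Plausible reconstructions, consistent with the call sites but not taken from the project itself, assuming the coordinate buffers are laid out structure-of-arrays (all x, then all y, then all z):

#include <cmath>
#include <vector_types.h>	// float3 from the CUDA headers

// Gather point i from an SoA buffer of n points: x at [i], y at [i+n], z at [i+2n].
static inline float3 make_float3(const float *coords, unsigned int i, unsigned int n)
{
	float3 v;
	v.x = coords[i];
	v.y = coords[i + n];
	v.z = coords[i + 2 * n];
	return v;
}

static inline float3 subf(float3 a, float3 b)	// componentwise difference
{
	float3 r;
	r.x = a.x - b.x;
	r.y = a.y - b.y;
	r.z = a.z - b.z;
	return r;
}

static inline float absf(float3 v)	// Euclidean length, used for source-to-observer distance
{
	return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
}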