unsigned int * madd(unsigned int *a, unsigned int *b) { if (MSIGN(a) == MSIGN(b)) return addf(a, b); // same sign, add together else return subf(a, b); // opposite sign, find difference }
void pcl::gpu::PseudoConvexHull3D::reconstruct (const Cloud &cloud, DeviceArray2D<int>& vertexes) { const device::Cloud& c = (const device::Cloud&)cloud; device::FacetStream& fs = impl_->fs; device::PointStream ps(c); ps.computeInitalSimplex(); device::InitalSimplex simplex = ps.simplex; fs.setInitialFacets(ps.simplex); ps.initalClassify(); for(;;) { //new external points number ps.cloud_size = ps.searchFacetHeads(fs.facet_count, fs.head_points); if (ps.cloud_size == 0) break; fs.compactFacets(); ps.classify(fs); if (!fs.canSplit()) throw PCLException("Can't split facets, please enlarge default buffer", __FILE__, "", __LINE__); fs.splitFacets(); } int ecount; int fcount = fs.facet_count; fs.empty_count.download(&ecount); vertexes.create(3, fcount + ecount); DeviceArray2D<int> subf(3, fcount, vertexes.ptr(), vertexes.step()); DeviceArray2D<int> sube(3, ecount, vertexes.ptr()+fcount, vertexes.step()); DeviceArray2D<int>(3, fcount, fs.verts_inds.ptr(), fs.verts_inds.step()).copyTo(subf); DeviceArray2D<int>(3, ecount, fs.empty_facets.ptr(), fs.empty_facets.step()).copyTo(sube); }
void C1_MacroAssembler::initialize_body(Register obj, Register tmp1, Register tmp2, int obj_size_in_bytes, int hdr_size_in_bytes) { const int index = (obj_size_in_bytes - hdr_size_in_bytes) / HeapWordSize; const int cl_size = VM_Version::L1_data_cache_line_size(), cl_dwords = cl_size>>3, cl_dw_addr_bits = exact_log2(cl_dwords); const Register tmp = R0, base_ptr = tmp1, cnt_dwords = tmp2; if (index <= 6) { // Use explicit NULL stores. if (index > 0) { li(tmp, 0); } for (int i = 0; i < index; ++i) { std(tmp, hdr_size_in_bytes + i * HeapWordSize, obj); } } else if (index < (2<<cl_dw_addr_bits)-1) { // simple loop Label loop; li(cnt_dwords, index); addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element. li(tmp, 0); mtctr(cnt_dwords); // Load counter. bind(loop); std(tmp, 0, base_ptr); // Clear 8byte aligned block. addi(base_ptr, base_ptr, 8); bdnz(loop); } else { // like clear_memory_doubleword Label startloop, fast, fastloop, restloop, done; addi(base_ptr, obj, hdr_size_in_bytes); // Compute address of first element. load_const_optimized(cnt_dwords, index); rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. beq(CCR0, fast); // Already 128byte aligned. subfic(tmp, tmp, cl_dwords); mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). subf(cnt_dwords, tmp, cnt_dwords); // rest. li(tmp, 0); bind(startloop); // Clear at the beginning to reach 128byte boundary. std(tmp, 0, base_ptr); // Clear 8byte aligned block. addi(base_ptr, base_ptr, 8); bdnz(startloop); bind(fast); // Clear 128byte blocks. srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. mtctr(tmp); // Load counter. bind(fastloop); dcbz(base_ptr); // Clear 128byte aligned block. addi(base_ptr, base_ptr, cl_size); bdnz(fastloop); cmpdi(CCR0, cnt_dwords, 0); // size 0? beq(CCR0, done); // rest == 0 li(tmp, 0); mtctr(cnt_dwords); // Load counter. bind(restloop); // Clear rest. std(tmp, 0, base_ptr); // Clear 8byte aligned block. addi(base_ptr, base_ptr, 8); bdnz(restloop); bind(done); } }
// Call an accessor method (assuming it is resolved, otherwise drop into // vanilla (slow path) entry. address InterpreterGenerator::generate_accessor_entry(void) { if (!UseFastAccessorMethods && (!FLAG_IS_ERGO(UseFastAccessorMethods))) { return NULL; } Label Lslow_path, Lacquire; const Register Rclass_or_obj = R3_ARG1, Rconst_method = R4_ARG2, Rcodes = Rconst_method, Rcpool_cache = R5_ARG3, Rscratch = R11_scratch1, Rjvmti_mode = Rscratch, Roffset = R12_scratch2, Rflags = R6_ARG4, Rbtable = R7_ARG5; static address branch_table[number_of_states]; address entry = __ pc(); // Check for safepoint: // Ditch this, real man don't need safepoint checks. // Also check for JVMTI mode // Check for null obj, take slow path if so. __ ld(Rclass_or_obj, Interpreter::stackElementSize, CC_INTERP_ONLY(R17_tos) NOT_CC_INTERP(R15_esp)); __ lwz(Rjvmti_mode, thread_(interp_only_mode)); __ cmpdi(CCR1, Rclass_or_obj, 0); __ cmpwi(CCR0, Rjvmti_mode, 0); __ crorc(/*CCR0 eq*/2, /*CCR1 eq*/4+2, /*CCR0 eq*/2); __ beq(CCR0, Lslow_path); // this==null or jvmti_mode!=0 // Do 2 things in parallel: // 1. Load the index out of the first instruction word, which looks like this: // <0x2a><0xb4><index (2 byte, native endianess)>. // 2. Load constant pool cache base. __ ld(Rconst_method, in_bytes(Method::const_offset()), R19_method); __ ld(Rcpool_cache, in_bytes(ConstMethod::constants_offset()), Rconst_method); __ lhz(Rcodes, in_bytes(ConstMethod::codes_offset()) + 2, Rconst_method); // Lower half of 32 bit field. __ ld(Rcpool_cache, ConstantPool::cache_offset_in_bytes(), Rcpool_cache); // Get the const pool entry by means of <index>. const int codes_shift = exact_log2(in_words(ConstantPoolCacheEntry::size()) * BytesPerWord); __ slwi(Rscratch, Rcodes, codes_shift); // (codes&0xFFFF)<<codes_shift __ add(Rcpool_cache, Rscratch, Rcpool_cache); // Check if cpool cache entry is resolved. // We are resolved if the indices offset contains the current bytecode. ByteSize cp_base_offset = ConstantPoolCache::base_offset(); // Big Endian: __ lbz(Rscratch, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::indices_offset()) + 7 - 2, Rcpool_cache); __ cmpwi(CCR0, Rscratch, Bytecodes::_getfield); __ bne(CCR0, Lslow_path); __ isync(); // Order succeeding loads wrt. load of _indices field from cpool_cache. // Finally, start loading the value: Get cp cache entry into regs. __ ld(Rflags, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::flags_offset()), Rcpool_cache); __ ld(Roffset, in_bytes(cp_base_offset) + in_bytes(ConstantPoolCacheEntry::f2_offset()), Rcpool_cache); // Following code is from templateTable::getfield_or_static // Load pointer to branch table __ load_const_optimized(Rbtable, (address)branch_table, Rscratch); // Get volatile flag __ rldicl(Rscratch, Rflags, 64-ConstantPoolCacheEntry::is_volatile_shift, 63); // extract volatile bit // note: sync is needed before volatile load on PPC64 // Check field type __ rldicl(Rflags, Rflags, 64-ConstantPoolCacheEntry::tos_state_shift, 64-ConstantPoolCacheEntry::tos_state_bits); #ifdef ASSERT Label LFlagInvalid; __ cmpldi(CCR0, Rflags, number_of_states); __ bge(CCR0, LFlagInvalid); __ ld(R9_ARG7, 0, R1_SP); __ ld(R10_ARG8, 0, R21_sender_SP); __ cmpd(CCR0, R9_ARG7, R10_ARG8); __ asm_assert_eq("backlink", 0x543); #endif // ASSERT __ mr(R1_SP, R21_sender_SP); // Cut the stack back to where the caller started. // Load from branch table and dispatch (volatile case: one instruction ahead) __ sldi(Rflags, Rflags, LogBytesPerWord); __ cmpwi(CCR6, Rscratch, 1); // volatile? if (support_IRIW_for_not_multiple_copy_atomic_cpu) { __ sldi(Rscratch, Rscratch, exact_log2(BytesPerInstWord)); // volatile ? size of 1 instruction : 0 } __ ldx(Rbtable, Rbtable, Rflags); if (support_IRIW_for_not_multiple_copy_atomic_cpu) { __ subf(Rbtable, Rscratch, Rbtable); // point to volatile/non-volatile entry point } __ mtctr(Rbtable); __ bctr(); #ifdef ASSERT __ bind(LFlagInvalid); __ stop("got invalid flag", 0x6541); bool all_uninitialized = true, all_initialized = true; for (int i = 0; i<number_of_states; ++i) { all_uninitialized = all_uninitialized && (branch_table[i] == NULL); all_initialized = all_initialized && (branch_table[i] != NULL); } assert(all_uninitialized != all_initialized, "consistency"); // either or __ fence(); // volatile entry point (one instruction before non-volatile_entry point) if (branch_table[vtos] == 0) branch_table[vtos] = __ pc(); // non-volatile_entry point if (branch_table[dtos] == 0) branch_table[dtos] = __ pc(); // non-volatile_entry point if (branch_table[ftos] == 0) branch_table[ftos] = __ pc(); // non-volatile_entry point __ stop("unexpected type", 0x6551); #endif if (branch_table[itos] == 0) { // generate only once __ align(32, 28, 28); // align load __ fence(); // volatile entry point (one instruction before non-volatile_entry point) branch_table[itos] = __ pc(); // non-volatile_entry point __ lwax(R3_RET, Rclass_or_obj, Roffset); __ beq(CCR6, Lacquire); __ blr(); } if (branch_table[ltos] == 0) { // generate only once __ align(32, 28, 28); // align load __ fence(); // volatile entry point (one instruction before non-volatile_entry point) branch_table[ltos] = __ pc(); // non-volatile_entry point __ ldx(R3_RET, Rclass_or_obj, Roffset); __ beq(CCR6, Lacquire); __ blr(); } if (branch_table[btos] == 0) { // generate only once __ align(32, 28, 28); // align load __ fence(); // volatile entry point (one instruction before non-volatile_entry point) branch_table[btos] = __ pc(); // non-volatile_entry point __ lbzx(R3_RET, Rclass_or_obj, Roffset); __ extsb(R3_RET, R3_RET); __ beq(CCR6, Lacquire); __ blr(); } if (branch_table[ctos] == 0) { // generate only once __ align(32, 28, 28); // align load __ fence(); // volatile entry point (one instruction before non-volatile_entry point) branch_table[ctos] = __ pc(); // non-volatile_entry point __ lhzx(R3_RET, Rclass_or_obj, Roffset); __ beq(CCR6, Lacquire); __ blr(); } if (branch_table[stos] == 0) { // generate only once __ align(32, 28, 28); // align load __ fence(); // volatile entry point (one instruction before non-volatile_entry point) branch_table[stos] = __ pc(); // non-volatile_entry point __ lhax(R3_RET, Rclass_or_obj, Roffset); __ beq(CCR6, Lacquire); __ blr(); } if (branch_table[atos] == 0) { // generate only once __ align(32, 28, 28); // align load __ fence(); // volatile entry point (one instruction before non-volatile_entry point) branch_table[atos] = __ pc(); // non-volatile_entry point __ load_heap_oop(R3_RET, (RegisterOrConstant)Roffset, Rclass_or_obj); __ verify_oop(R3_RET); //__ dcbt(R3_RET); // prefetch __ beq(CCR6, Lacquire); __ blr(); } __ align(32, 12); __ bind(Lacquire); __ twi_0(R3_RET); __ isync(); // acquire __ blr(); #ifdef ASSERT for (int i = 0; i<number_of_states; ++i) { assert(branch_table[i], "accessor_entry initialization"); //tty->print_cr("accessor_entry: branch_table[%d] = 0x%llx (opcode 0x%llx)", i, branch_table[i], *((unsigned int*)branch_table[i])); } #endif __ bind(Lslow_path); __ branch_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals), Rscratch); __ flush(); return entry; }
void HuygensOnCPU::calcFieldResponse(cuComplex *d_res, const unsigned int nObs, const float *coordObs, const unsigned int nSrc, const float *coordSrc, const float *fSrc, const float *apodSrc, const float *steerFocusDelaySrc, const float *srcTimeStamp, const unsigned int *srcPulseLength, const float timestampObs, const float refTime, const float c0, const bool resultOnGPU) { // calc linear index of observation point cuComplex *resp = (cuComplex*) malloc(sizeof(cuComplex)*nObs); #pragma omp parallel for for (int index = 0; index < nObs; index++) { if (index < nObs) { // current observation point float3 obs = make_float3(coordObs, index, nObs); // init respons cuComplex respTemp = make_cuComplex(0.0f, 0.0f); // loop over all source points for (int n = 0; n < nSrc; n++) { // Optimalization plans: // all threads will read this value! TODO: Check if this value gets broadcasted. Info: Broadcasting only works for shared memory! // If not -> TODO: Delegate one read into shared memory to each thread. If nSrc > blockDim.x, deligate multiple reads to each thread. // For CUDA 2.0: Shared memory == User-managed L2 cache. Broadcasting might however help improving the memory throughput. float3 src = make_float3(coordSrc, n, nSrc); // fetch steer-focus delay, src timestamp, frequency and pulse length from global memory float tStFo = steerFocusDelaySrc[n]; float timestampSrc = srcTimeStamp[n]; float frequencySrc = fSrc[n]; uint pulseL = srcPulseLength[n]; float apod = apodSrc[n]; // time of flight from source to current observation point float dist = absf(subf(obs,src)); // clamp dist to prevent huge amplitude values float minDist = c0/(2*frequencySrc); dist = (minDist < dist)? dist : minDist; float timeOFligth = dist / c0; float currentFlightTime = timestampObs - timestampSrc - tStFo; if (pulseL == 0) // cw { if (currentFlightTime > timeOFligth) // source is alive in this obs point { // find actual time for response computation float t = timeOFligth - currentFlightTime; //float t = currentFlightTime - timeOFligth; // calc attenuation float att = dist; float r = apod / att; // complex modulus of greens function float p = 2.0f * PI * frequencySrc * t; // complex phase of greens function // calc respons of source in current obs point float cosptr = cos(p); float sinptr = sin(p); cuComplex newResp = make_cuComplex(r*cosptr, r*sinptr); respTemp = cuCaddf(respTemp, newResp); // add to respTemp } } else // pw { float pulseLInSec = pulseL / frequencySrc; float halfPulseLInSec = pulseLInSec / 2.0f; if (currentFlightTime > timeOFligth - halfPulseLInSec && currentFlightTime < timeOFligth + halfPulseLInSec) { // pw source is alive in this observation point float t = timeOFligth - currentFlightTime; // calc attenuation float att = dist; float cosWeightPuls = cosf(PI * t / pulseLInSec); cosWeightPuls *= cosWeightPuls; float r = cosWeightPuls * apod / att; // complex modulus of greens function float p = 2.0f * PI * frequencySrc * t; // complex phase of greens function // calc respons of source in current obs point float cosptr = cos(p); float sinptr = sin(p); cuComplex newResp = make_cuComplex(r*cosptr, r*sinptr); respTemp = cuCaddf(respTemp, newResp); // add to respTemp } else { if (timeOFligth <= 1/frequencySrc) // hack to get a relative bright spot in pw-mode. Preventing depth normalization from happening. { float cosWeightFac = cosf(PI * timeOFligth / (2/frequencySrc)); respTemp = cuCaddf(respTemp, make_cuComplex(cosWeightFac * cosWeightFac / dist, 0)); } } } } resp[index] = respTemp; // save respons for this observation point } } // copy calculated field to the GPU for presentation if (resultOnGPU) { cudaMemcpy(d_res, resp, sizeof(cuComplex)*nObs, cudaMemcpyHostToDevice); } else { memcpy(d_res, resp, sizeof(cuComplex)*nObs); } free(resp); //free((void *)apodSrc); //free((void *)coordObs); // this one is now cleaned up by the observation object //free((void *)coordSrc); //free((void *)fSrc); //free((void *)steerFocusDelaySrc); //free((void *)srcTimeStamp); //free((void *)srcPulseLength); }