void AvxGatherFloat(void) { const int merge_no = 0; const int merge_yes = 0x80000000; const int n = 15; float x[n]; __declspec(align(32)) YmmVal des; __declspec(align(32)) YmmVal indices; __declspec(align(32)) YmmVal mask; // Initialize the test array srand(22); for (int i = 0; i < n; i++) x[i] = (float)(rand() % 1000); // Load des with initial values for (int i = 0; i < 8; i++) des.r32[i] = -1.0f; // Initialize the indices indices.i32[0] = 2; indices.i32[1] = 1; indices.i32[2] = 6; indices.i32[3] = 5; indices.i32[4] = 4; indices.i32[5] = 13; indices.i32[6] = 11; indices.i32[7] = 9; // Initialize the mask value mask.i32[0] = merge_yes; mask.i32[1] = merge_yes; mask.i32[2] = merge_no; mask.i32[3] = merge_yes; mask.i32[4] = merge_yes; mask.i32[5] = merge_no; mask.i32[6] = merge_yes; mask.i32[7] = merge_yes; printf("\nResults for AvxGatherFloat()\n"); printf("Test array\n"); for (int i = 0; i < n; i++) printf("x[%02d]: %6.1f\n", i, x[i]); printf("\n"); const char* s1 = "Values BEFORE call to AvxGatherFloat_()"; const char* s2 = "Values AFTER call to AvxGatherFloat_()"; AvxGatherFloatPrint(s1, des, indices, mask); AvxGatherFloat_(&des, &indices, &mask, x); AvxGatherFloatPrint(s2, des, indices, mask); }
void IP_SingleColour::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const { __declspec(align(16)) PROCDATA_SINGLECOLOUR temp[3]; copyUserDataSingleColour(temp, (PROCDATA_SINGLECOLOUR*)vertexUserData[0]); copyUserDataSingleColour(temp + 1, (PROCDATA_SINGLECOLOUR*)vertexUserData[1]); copyUserDataSingleColour(temp + 2, (PROCDATA_SINGLECOLOUR*)vertexUserData[2]); mcemaths_mul_3_4(temp[0].normal, correctedContributes[0]); mcemaths_mul_3_4(temp[0].worldPos, correctedContributes[0]); mcemaths_mul_3_4(temp[0].shadowcoord, correctedContributes[0]); mcemaths_mul_3_4(temp[1].normal, correctedContributes[1]); mcemaths_mul_3_4(temp[1].worldPos, correctedContributes[1]); mcemaths_mul_3_4(temp[1].shadowcoord, correctedContributes[1]); mcemaths_mul_3_4(temp[2].normal, correctedContributes[2]); mcemaths_mul_3_4(temp[2].worldPos, correctedContributes[2]); mcemaths_mul_3_4(temp[2].shadowcoord, correctedContributes[2]); PROCDATA_SINGLECOLOUR* output = (PROCDATA_SINGLECOLOUR*)interpolatedUserData; mcemaths_quatcpy(output->normal, temp[0].normal); mcemaths_quatcpy(output->worldPos, temp[0].worldPos); mcemaths_quatcpy(output->shadowcoord, temp[0].shadowcoord); mcemaths_add_3_4_ip(output->normal, temp[1].normal); mcemaths_add_3_4_ip(output->worldPos, temp[1].worldPos); mcemaths_add_3_4_ip(output->shadowcoord, temp[1].shadowcoord); mcemaths_add_3_4_ip(output->normal, temp[2].normal); mcemaths_add_3_4_ip(output->worldPos, temp[2].worldPos); mcemaths_add_3_4_ip(output->shadowcoord, temp[2].shadowcoord); }
// Program entry point: passes the shared structure pointer through the three
// stages send_inputs -> use_the_data -> receive_results and returns 0.
int main()
{
    // The pointer is declared with target(mic) as in the original. It is
    // initialised explicitly because it is passed by value to the stages
    // below; reading an uninitialised pointer is undefined behaviour.
    // NOTE(review): the stages are not visible here — confirm whether one of
    // them is expected to allocate the structure, or whether an allocation is
    // missing at this point.
    __declspec(target(mic)) nbwcs* struct1 = 0;

    send_inputs(struct1);
    use_the_data(struct1);
    receive_results(struct1);

    return 0;
}
// Builds two 4x4 source matrices (an identity matrix and a matrix of sample
// values) and hands them to SsePfpMatrix4x4Multiply_() with m_des as the
// destination argument.
void TestFig8_3(void)
{
    __declspec(align(16)) Mat4x4 m_src1;
    __declspec(align(16)) Mat4x4 m_src2;
    __declspec(align(16)) Mat4x4 m_des;

    // Row contents for the first operand: the 4x4 identity.
    static const int identity_rows[4][4] = {
        { 1, 0, 0, 0 },
        { 0, 1, 0, 0 },
        { 0, 0, 1, 0 },
        { 0, 0, 0, 1 }
    };

    // Row contents for the second operand.
    static const int sample_rows[4][4] = {
        {  2,  7,  8,  3 },
        { 11, 14, 16, 10 },
        { 24, 21, 27, 29 },
        { 31, 34, 38, 33 }
    };

    for (int row = 0; row < 4; row++)
        Mat4x4SetRow(m_src1, row, identity_rows[row][0], identity_rows[row][1], identity_rows[row][2], identity_rows[row][3]);

    for (int row = 0; row < 4; row++)
        Mat4x4SetRow(m_src2, row, sample_rows[row][0], sample_rows[row][1], sample_rows[row][2], sample_rows[row][3]);

    SsePfpMatrix4x4Multiply_(m_des, m_src1, m_src2);
}
void AvxGatherI64(void) { const Int64 merge_no = 0; const Int64 merge_yes = 0x8000000000000000LL; const int n = 15; Int64 x[n]; __declspec(align(32)) YmmVal des; __declspec(align(16)) XmmVal indices; __declspec(align(32)) YmmVal mask; // Initialize the test array srand(36); for (int i = 0; i < n; i++) x[i] = (Int64)(rand() % 1000); // Load des with initial values for (int i = 0; i < 4; i++) des.i64[i] = -1; // Initialize the indices and mask elements indices.i32[0] = 2; indices.i32[1] = 7; indices.i32[2] = 9; indices.i32[3] = 12; mask.i64[0] = merge_yes; mask.i64[1] = merge_yes; mask.i64[2] = merge_no; mask.i64[3] = merge_yes; printf("\nResults for AvxGatherI64()\n"); printf("Test array\n"); for (int i = 0; i < n; i++) printf("x[%02d]: %8lld\n", i, x[i]); printf("\n"); const char* s1 = "Values BEFORE call to AvxGatherI64_()"; const char* s2 = "Values AFTER call to AvxGatherI64_()"; AvxGatherI64Print(s1, des, indices, mask); AvxGatherI64_(&des, &indices, &mask, x); AvxGatherI64Print(s2, des, indices, mask); }
void CHeapAllocator::free(void * addr, size_t nbytes) { if ((m_hHeap != NULL)&&(addr)) { if (HeapFree(m_hHeap,0,addr) && m_bLogSize) { #pragma warning(suppress: 4267) __declspec(align(4)) LONG mysize = 0L - nbytes; ::InterlockedExchangeAdd(&m_lAllocatedSize,mysize); } } }
void read_luma_inter_pred_avg_8x16_intrinsic( BYTE *address1, BYTE *address2, INT stride_src, BYTE *dst, INT stride_dst ) { int i; int src_stride = stride_src; int dst_stride = stride_dst; const unsigned char* src1 = address1; const unsigned char* src2 = address2; for( i = 0; i < 16; i+=8) { __declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x; int stride2 = (src_stride<<1); int stride4 = (src_stride<<2); int dst_stride2 = (dst_stride<<1); int dst_stride4 = (dst_stride<<2); r0 = _mm_loadl_epi64((__m128i*)(src1)); r1 = _mm_loadl_epi64((__m128i*)(src1+src_stride)); r2 = _mm_loadl_epi64((__m128i*)(src1+stride2)); r3 = _mm_loadl_epi64((__m128i*)(src1+stride2+src_stride)); r4 = _mm_loadl_epi64((__m128i*)(src1+stride4)); r5 = _mm_loadl_epi64((__m128i*)(src1+stride4+src_stride)); r6 = _mm_loadl_epi64((__m128i*)(src1+stride4+stride2)); r7 = _mm_loadl_epi64((__m128i*)(src1+stride4+stride2+src_stride)); r0_x = _mm_loadl_epi64((__m128i*)(src2)); r1_x = _mm_loadl_epi64((__m128i*)(src2+src_stride)); r2_x = _mm_loadl_epi64((__m128i*)(src2+stride2)); r3_x = _mm_loadl_epi64((__m128i*)(src2+stride2+src_stride)); r4_x = _mm_loadl_epi64((__m128i*)(src2+stride4)); r5_x = _mm_loadl_epi64((__m128i*)(src2+stride4+src_stride)); r6_x = _mm_loadl_epi64((__m128i*)(src2+stride4+stride2)); r7_x = _mm_loadl_epi64((__m128i*)(src2+stride4+stride2+src_stride)); r0 = _mm_avg_epu8(r0, r0_x); r1 = _mm_avg_epu8(r1, r1_x); r2 = _mm_avg_epu8(r2, r2_x); r3 = _mm_avg_epu8(r3, r3_x); r4 = _mm_avg_epu8(r4, r4_x); r5 = _mm_avg_epu8(r5, r5_x); r6 = _mm_avg_epu8(r6, r6_x); r7 = _mm_avg_epu8(r7, r7_x); _mm_storel_epi64((__m128i*)(dst), r0); _mm_storel_epi64((__m128i*)(dst+dst_stride), r1); _mm_storel_epi64((__m128i*)(dst+dst_stride2), r2); _mm_storel_epi64((__m128i*)(dst+dst_stride2+dst_stride), r3); _mm_storel_epi64((__m128i*)(dst+dst_stride4), r4); _mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride), r5); 
_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride2), r6); _mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride2+dst_stride), r7); src1 += (stride4<<1); src2 += (stride4<<1); dst += (dst_stride4<<1); } }
/*
 * Fills in `details` (id, state and CPU context) for the thread `thread_id`.
 *
 * The calling thread is captured directly with RtlCaptureContext(). Any other
 * thread is opened, suspended while its control and integer registers are
 * read, and then resumed. Returns TRUE when a CPU context was obtained and
 * FALSE otherwise; details->id is always written.
 */
static gboolean
gum_windows_get_thread_details (DWORD thread_id, GumThreadDetails * details)
{
  __declspec (align (64)) CONTEXT context = { 0, };
  gboolean captured = FALSE;
  HANDLE thread;
  DWORD previous_suspend_count;

  details->id = thread_id;

  /* Fast path: the caller is asking about itself. */
  if (thread_id == GetCurrentThreadId ())
  {
    details->state = GUM_THREAD_RUNNING;

    RtlCaptureContext (&context);
    gum_windows_parse_context (&context, &details->cpu_context);

    return TRUE;
  }

  thread = OpenThread (THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME, FALSE,
      thread_id);
  if (thread == NULL)
    return FALSE;

  previous_suspend_count = SuspendThread (thread);
  if (previous_suspend_count != (DWORD) -1)
  {
    /* A zero previous count means the thread was not suspended before our call. */
    details->state = (previous_suspend_count == 0)
        ? GUM_THREAD_RUNNING
        : GUM_THREAD_STOPPED;

    context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
    if (GetThreadContext (thread, &context))
    {
      gum_windows_parse_context (&context, &details->cpu_context);
      captured = TRUE;
    }

    /* Undo our suspension whether or not the context could be read. */
    ResumeThread (thread);
  }

  CloseHandle (thread);

  return captured;
}
// Returns the current time in microseconds as a double.
//   Windows: performance-counter ticks scaled by instance->frequency.
//   Linux:   CLOCK_MONOTONIC seconds and nanoseconds combined.
//   Other:   0.0 (no time source is implemented).
double Timing::getTimeUsd()
{
#if defined(WIN32) || defined(_WIN32)
    __declspec(align(16)) LARGE_INTEGER counter;
    QueryPerformanceCounter(&counter);
    return ((1000000.0 * (double)counter.QuadPart) / (double)instance->frequency.QuadPart);
#elif defined(__linux__)
    timespec current;
    // Fixed: the address-of expression had been corrupted to a currency
    // symbol ("¤t"), which does not compile.
    clock_gettime(CLOCK_MONOTONIC, &current);
    return ((double)current.tv_sec * 1000000.0 + (double)current.tv_nsec / 1000.0);
#else
    return 0.0;
#endif
}
// Returns the current time in microseconds as an unsigned long.
//   Windows: performance-counter ticks scaled by instance->frequency.
//   Linux:   CLOCK_MONOTONIC seconds and nanoseconds combined.
//   Other:   0 (no time source is implemented).
// The value is converted to the width of unsigned long on return, so it wraps
// where that type is narrower than the microsecond count.
unsigned long Timing::getTimeUsul()
{
#if defined(WIN32) || defined(_WIN32)
    __declspec(align(16)) LARGE_INTEGER counter;
    QueryPerformanceCounter(&counter);

    // Scale whole seconds and the sub-second remainder separately. This gives
    // the same quotient as (1000000 * ticks) / frequency, but the intermediate
    // product can no longer overflow 64 bits once the tick count grows large.
    const LONGLONG ticksPerSecond = instance->frequency.QuadPart;
    const LONGLONG wholeSeconds = counter.QuadPart / ticksPerSecond;
    const LONGLONG leftoverTicks = counter.QuadPart % ticksPerSecond;
    return (unsigned long)(wholeSeconds * 1000000 + (leftoverTicks * 1000000) / ticksPerSecond);
#elif defined(__linux__)
    timespec current;
    // Fixed: the address-of expression had been corrupted to a currency
    // symbol ("¤t"), which does not compile.
    clock_gettime(CLOCK_MONOTONIC, &current);
    return ((unsigned long)current.tv_sec * 1000000 + (unsigned long)current.tv_nsec / 1000);
#else
    return 0;
#endif
}
void IP_Planet::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const { __declspec(align(16)) PROCDATA_PLANET temp[3]; copyUserData(temp, (PROCDATA_PLANET*)vertexUserData[0]); copyUserData(temp + 1, (PROCDATA_PLANET*)vertexUserData[1]); copyUserData(temp + 2, (PROCDATA_PLANET*)vertexUserData[2]); mcemaths_mul_3_4(temp[0].tangent, correctedContributes[0]); mcemaths_mul_3_4(temp[0].binormal, correctedContributes[0]); mcemaths_mul_3_4(temp[0].normal, correctedContributes[0]); mcemaths_mul_3_4(temp[0].worldPos, correctedContributes[0]); mcemaths_mul_3_4(temp[0].texcoord, correctedContributes[0]); mcemaths_mul_3_4(temp[0].shadowcoord, correctedContributes[0]); mcemaths_mul_3_4(temp[1].tangent, correctedContributes[1]); mcemaths_mul_3_4(temp[1].binormal, correctedContributes[1]); mcemaths_mul_3_4(temp[1].normal, correctedContributes[1]); mcemaths_mul_3_4(temp[1].worldPos, correctedContributes[1]); mcemaths_mul_3_4(temp[1].texcoord, correctedContributes[1]); mcemaths_mul_3_4(temp[1].shadowcoord, correctedContributes[1]); mcemaths_mul_3_4(temp[2].tangent, correctedContributes[2]); mcemaths_mul_3_4(temp[2].binormal, correctedContributes[2]); mcemaths_mul_3_4(temp[2].normal, correctedContributes[2]); mcemaths_mul_3_4(temp[2].worldPos, correctedContributes[2]); mcemaths_mul_3_4(temp[2].texcoord, correctedContributes[2]); mcemaths_mul_3_4(temp[2].shadowcoord, correctedContributes[2]); PROCDATA_PLANET* output = (PROCDATA_PLANET*)interpolatedUserData; mcemaths_quatcpy(output->tangent, temp[0].tangent); mcemaths_quatcpy(output->binormal, temp[0].binormal); mcemaths_quatcpy(output->normal, temp[0].normal); mcemaths_quatcpy(output->worldPos, temp[0].worldPos); mcemaths_quatcpy(output->texcoord, temp[0].texcoord); mcemaths_quatcpy(output->shadowcoord, temp[0].shadowcoord); mcemaths_add_3_4_ip(output->tangent, temp[1].tangent); mcemaths_add_3_4_ip(output->binormal, temp[1].binormal); 
mcemaths_add_3_4_ip(output->normal, temp[1].normal); mcemaths_add_3_4_ip(output->worldPos, temp[1].worldPos); mcemaths_add_3_4_ip(output->texcoord, temp[1].texcoord); mcemaths_add_3_4_ip(output->shadowcoord, temp[1].shadowcoord); mcemaths_add_3_4_ip(output->tangent, temp[2].tangent); mcemaths_add_3_4_ip(output->binormal, temp[2].binormal); mcemaths_add_3_4_ip(output->normal, temp[2].normal); mcemaths_add_3_4_ip(output->worldPos, temp[2].worldPos); mcemaths_add_3_4_ip(output->texcoord, temp[2].texcoord); mcemaths_add_3_4_ip(output->shadowcoord, temp[2].shadowcoord); }
// Test driver for SsePiMul32_(): loads two vectors of four 32-bit integers,
// calls the routine, and prints the operands alongside each of the two result
// vectors (c[0] formatted as 32-bit lanes, c[1] as 64-bit lanes).
void SsePiMul32(void)
{
    __declspec(align(16)) XmmVal a;
    __declspec(align(16)) XmmVal b;
    __declspec(align(16)) XmmVal c[2];
    char buff[256];

    // Lane values for the two operands.
    const int a_lanes[4] = { 10, 20, -30, -40 };
    const int b_lanes[4] = { 100, -200, 300, -400 };

    for (int lane = 0; lane < 4; lane++)
    {
        a.i32[lane] = a_lanes[lane];
        b.i32[lane] = b_lanes[lane];
    }

    SsePiMul32_(&a, &b, c);

    printf("\nResults for SsePiMul32_\n");

    // First result vector, shown as 32-bit lanes.
    printf("a: %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c[0]: %s\n", c[0].ToString_i32(buff, sizeof(buff)));
    printf("\n");

    // Second result vector, shown as 64-bit lanes.
    printf("a: %s\n", a.ToString_i32(buff, sizeof(buff)));
    printf("b: %s\n", b.ToString_i32(buff, sizeof(buff)));
    printf("c[1]: %s\n", c[1].ToString_i64(buff, sizeof(buff)));
}
void * CHeapAllocator::alloc(size_t nbytes) { if ((m_hHeap != NULL)&&(nbytes)) { void * pVoid = HeapAlloc(m_hHeap,0,nbytes); if (pVoid && m_bLogSize) { #pragma warning(suppress: 4267) __declspec(align(4)) LONG mysize = nbytes; ::InterlockedExchangeAdd(&m_lAllocatedSize,mysize); } return pVoid; } return NULL; }
void read_luma_inter_pred_avg_16x16_intrinsic( BYTE *address1, BYTE *address2, INT stride_src, BYTE *dst, INT stride_dst ) { for(int i = 0; i < 16; i+=8) { __declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x; int stride2 = (stride_src<<1); int stride4 = (stride_src<<2); int dst_stride2 = (stride_dst<<1); int dst_stride4 = (stride_dst<<2); r0 = _mm_loadu_si128((__m128i*)(address1)); r1 = _mm_loadu_si128((__m128i*)(address1+stride_dst)); r2 = _mm_loadu_si128((__m128i*)(address1+stride2)); r3 = _mm_loadu_si128((__m128i*)(address1+stride2+stride_src)); r4 = _mm_loadu_si128((__m128i*)(address1+stride4)); r5 = _mm_loadu_si128((__m128i*)(address1+stride4+stride_src)); r6 = _mm_loadu_si128((__m128i*)(address1+stride4+stride2)); r7 = _mm_loadu_si128((__m128i*)(address1+stride4+stride2+stride_src)); r0_x = _mm_loadu_si128((__m128i*)(address2)); r1_x = _mm_loadu_si128((__m128i*)(address2+stride_src)); r2_x = _mm_loadu_si128((__m128i*)(address2+stride2)); r3_x = _mm_loadu_si128((__m128i*)(address2+stride2+stride_src)); r4_x = _mm_loadu_si128((__m128i*)(address2+stride4)); r5_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride_dst)); r6_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride2)); r7_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride2+stride_dst)); r0 = _mm_avg_epu8(r0, r0_x); r1 = _mm_avg_epu8(r1, r1_x); r2 = _mm_avg_epu8(r2, r2_x); r3 = _mm_avg_epu8(r3, r3_x); r4 = _mm_avg_epu8(r4, r4_x); r5 = _mm_avg_epu8(r5, r5_x); r6 = _mm_avg_epu8(r6, r6_x); r7 = _mm_avg_epu8(r7, r7_x); _mm_storeu_si128((__m128i*)(dst), r0); _mm_storeu_si128((__m128i*)(dst+stride_dst), r1); _mm_storeu_si128((__m128i*)(dst+dst_stride2), r2); _mm_storeu_si128((__m128i*)(dst+dst_stride2+stride_dst), r3); _mm_storeu_si128((__m128i*)(dst+dst_stride4), r4); _mm_storeu_si128((__m128i*)(dst+dst_stride4+stride_dst), r5); _mm_storeu_si128((__m128i*)(dst+dst_stride4+dst_stride2), r6); 
_mm_storeu_si128((__m128i*)(dst+dst_stride4+dst_stride2+stride_dst), r7); address1 += (stride4<<1); address2 += (stride4<<1); dst += (dst_stride4<<1); } }
double vNormalIntegral(double b) { __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp; //NN/2-1 has to be the multiple of 8 //NN = (8*LV+1)*2, LV = 20 -> NN = 322 //const int NN = 322; const int vecsize = 8; const int nCal = (NN/2-1)/vecsize; //const int left = NN%vecsize; double a = 0.0f; double s, h, sum = 0.0f; h = (b-a)/NN; // add in the first few terms sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0); // and the last one sum += exp(-b*b/2.0); vec_cf0 = _mm512_set1_pd(a); vec_cf1 = _mm512_set1_pd(2*h); vec_cf2 = _mm512_set1_pd(-0.5); vec_s = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize vec_s = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h) vec_s = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h) vec_stp = _mm512_set1_pd(2*h*vecsize-h); vec_cf0 = _mm512_set1_pd(h); for (int i = 0; i < nCal; ++i){ vec_exp = _mm512_mul_pd(vec_s, vec_s); vec_exp = _mm512_mul_pd(vec_exp, vec_cf2); vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum sum += 2.0*_mm512_reduce_add_pd(vec_cf1); vec_s = _mm512_add_pd(vec_s, vec_cf0);//s+=h vec_exp = _mm512_mul_pd(vec_s, vec_s); vec_exp = _mm512_mul_pd(vec_exp, vec_cf2); vec_cf1 = _mm512_exp_pd(vec_exp); sum += 4.0*_mm512_reduce_add_pd(vec_cf1); vec_s = _mm512_add_pd(vec_s, vec_stp); } sum = 0.5*sqrt(2*PI) + h*sum/3.0; return sum; }
void IP_CloudShadow::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const { __declspec(align(16)) PROCDATA_CLOUDSHADOW temp[3]; copyUserData(temp, (PROCDATA_CLOUDSHADOW*)vertexUserData[0]); copyUserData(temp + 1, (PROCDATA_CLOUDSHADOW*)vertexUserData[1]); copyUserData(temp + 2, (PROCDATA_CLOUDSHADOW*)vertexUserData[2]); mcemaths_mul_3_4(temp[0].texcoord, correctedContributes[0]); mcemaths_mul_3_4(temp[1].texcoord, correctedContributes[1]); mcemaths_mul_3_4(temp[2].texcoord, correctedContributes[2]); PROCDATA_CLOUDSHADOW* output = (PROCDATA_CLOUDSHADOW*)interpolatedUserData; mcemaths_quatcpy(output->texcoord, temp[0].texcoord); mcemaths_add_3_4_ip(output->texcoord, temp[1].texcoord); mcemaths_add_3_4_ip(output->texcoord, temp[2].texcoord); }
// Evaluates the spline at parameter t.
//   resultPos - receives the position written by eVector3::cubicBezier()
//   resultRot - receives rot0 slerped towards rot1 by t, then (when the
//               correction axis is non-degenerate) pre-multiplied by a
//               rotation about that axis
// The correction axis is the cross product of the rotation's `ax` vector and
// the second vector returned by cubicBezier(); the angle is derived from the
// clamped dot product of the same two vectors.
void eBezierSpline::evaluate(eF32 t, eVector3& resultPos, eQuat& resultRot) const
{
    resultRot = rot0.slerp(t, rot1);

    // NOTE(review): presumably the normalised curve direction at t — confirm
    // against cubicBezier().
    __declspec(align(16)) eVector3 curveDir;
    eVector3::cubicBezier(t, control0, control1, control2, control3, resultPos, curveDir);

    eVector3 forward = resultRot.getVector(ax);
    eVector3 axis = (forward^curveDir);
    eF32 axisLenSqr = axis.sqrLength();

    // A (near-)zero axis means there is nothing to correct. The negated form
    // also bails out on NaN, exactly as the positive test would.
    if(!(axisLenSqr > eALMOST_ZERO))
        return;

    axis /= eSqrt(axisLenSqr);

    eF32 cosAngle = eClamp(-1.0f, forward * curveDir, 1.0f);
    eF32 turn = eACos(cosAngle) * (1.0f / (2.0f * ePI));

    eQuat correction(axis, turn);
    resultRot = correction * resultRot;
}
void conv_Short2ToShort1(void* dst, const void* s, s32 numSamples) { LSshort* d = reinterpret_cast<LSshort*>(dst); const LSshort* src = reinterpret_cast<const LSshort*>(s); s32 num = numSamples >> 2; //8個のshortをまとめて処理 s32 offset = num << 2; s32 rem = numSamples - offset; const __m128i izero = _mm_setzero_si128(); __declspec(align(16)) LSshort tmp[8]; const LSshort* p = src; LSshort* q = d; for(s32 i=0; i<num; ++i){ //32bit整数r0, r1に変換 __m128i t0 = _mm_loadu_si128((const __m128i*)p); __m128i t1 = _mm_cmpgt_epi16(izero, t0); __m128i r0 = _mm_unpackhi_epi16(t0, t1); __m128i r1 = _mm_unpacklo_epi16(t0, t1); __m128i r2 = _mm_add_epi32(r0, _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 3, 0, 1))); __m128i r3 = _mm_add_epi32(r1, _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 3, 0, 1))); r2 = _mm_srai_epi32(r2, 1); r3 = _mm_srai_epi32(r3, 1); __m128i r4 = _mm_packs_epi32(r3, r2); _mm_store_si128((__m128i*)tmp, r4); q[0] = tmp[0]; q[1] = tmp[2]; q[2] = tmp[4]; q[3] = tmp[6]; p += 8; q += 4; } for(s32 i=0; i<rem; ++i){ s32 j = i<<1; s32 t = (p[j+0] + p[j+1]) >> 1; q[i] = static_cast<LSshort>(t); } }
/*
 * 64-bit perform routine: multiplies one signal input by the scalar x->x_val
 * and writes the product to outs[0].
 *
 *   x            - object instance; x_val supplies the scalar
 *   ins / outs   - input and output signal vectors
 *   sampleframes - number of samples to process
 *   userparam    - small integer passed as a pointer: 1 selects the right
 *                  input as the signal, 0 the left
 *
 * dsp64, numins, numouts and flags are unused.
 */
void scale_perform64_method(t_times *x, t_object *dsp64, double **ins, long numins, double **outs, long numouts, long sampleframes, long flags, void *userparam)
{
	/* Go through a pointer-sized integer first: casting a pointer straight to
	 * int truncates on 64-bit targets and is rejected by some compilers. */
	int invec = (int)(size_t) userparam;	// used to signal which one is the signal input (1 for right, 0 for left)
	t_double *in = ins[invec];
	t_double *out = outs[0];

#if defined(WIN_VERSION) && defined(WIN_SSE_INTRINSICS)
	__m128d *mm_out = (__m128d *) out;
	__m128d mm_in1;
	__m128d mm_in2;
	__m128d mm_val;
	__declspec(align(16)) t_double aligned_val = x->x_val;
	int i;

	mm_val = _mm_set1_pd(aligned_val);

	// rbs fix: this version will break if the SVS is smaller than 4
	C74_ASSERT(sampleframes >= 4);

	/* Four samples per iteration, as two 2-wide multiplies. _mm_load_pd
	 * requires the input to be 16-byte aligned. */
	for (i=0; i < sampleframes; i+=4) {
		mm_in1 = _mm_load_pd(in+i);
		mm_out[i/2] = _mm_mul_pd(mm_in1, mm_val);

		mm_in2 = _mm_load_pd(in+i+2);
		mm_out[i/2 + 1] = _mm_mul_pd(mm_in2, mm_val);
	}
#else
	t_double val = x->x_val;
	t_double ftmp;

//	if (IS_DENORM_DOUBLE(*in)) {
//		static int counter = 0;
//		post("times~ (%p): saw denorm (%d)", x, counter++);
//	}

	/* Scalar path: multiply, then pass each product through the
	 * FIX_DENORM_NAN_DOUBLE macro before storing it. */
	while (sampleframes--) {
		ftmp = val * *in++;
		FIX_DENORM_NAN_DOUBLE(ftmp);
		*out++ = ftmp;
	}
#endif
}
/*
 * Suspends the thread `thread_id`, lets `func` edit a copy of its CPU
 * context, writes the edited context back and resumes the thread.
 *
 *   func      - called as func (thread_id, &cpu_context, user_data) while the
 *               target thread is suspended
 *   user_data - passed through to func unchanged
 *
 * Returns TRUE only when the context was written back and the thread was
 * resumed successfully.
 *
 * Fix: a GetThreadContext() failure used to jump straight to cleanup and left
 * the target thread suspended. Every path that follows a successful
 * SuspendThread() now calls ResumeThread().
 */
gboolean
gum_process_modify_thread (GumThreadId thread_id, GumModifyThreadFunc func, gpointer user_data)
{
  gboolean success = FALSE;
  gboolean context_applied = FALSE;
  HANDLE thread;
  __declspec (align (64)) CONTEXT context = { 0, };
  GumCpuContext cpu_context;

  thread = OpenThread (THREAD_GET_CONTEXT | THREAD_SET_CONTEXT | THREAD_SUSPEND_RESUME,
      FALSE, thread_id);
  if (thread == NULL)
    goto beach;

  if (SuspendThread (thread) == (DWORD) -1)
    goto beach;

  /* Only the control and integer register groups are read and written. */
  context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
  if (GetThreadContext (thread, &context))
  {
    gum_windows_parse_context (&context, &cpu_context);
    func (thread_id, &cpu_context, user_data);
    gum_windows_unparse_context (&cpu_context, &context);

    context_applied = SetThreadContext (thread, &context) != 0;
  }

  /* Always balance the SuspendThread() above, even on failure. */
  if (ResumeThread (thread) != (DWORD) -1)
    success = context_applied;

beach:
  if (thread != NULL)
    CloseHandle (thread);

  return success;
}
void IP_DiffuseOnly::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const { __declspec(align(16)) PROCDATA_DIFFUSEONLY temp[3]; copyUserDataDiffuseOnly(temp, (PROCDATA_DIFFUSEONLY*)vertexUserData[0]); copyUserDataDiffuseOnly(temp + 1, (PROCDATA_DIFFUSEONLY*)vertexUserData[1]); copyUserDataDiffuseOnly(temp + 2, (PROCDATA_DIFFUSEONLY*)vertexUserData[2]); mcemaths_mul_3_4(temp[0].normal, correctedContributes[0]); mcemaths_mul_3_4(temp[0].worldPos, correctedContributes[0]); mcemaths_mul_3_4(temp[0].texcoord, correctedContributes[0]); mcemaths_mul_3_4(temp[0].shadowcoord, correctedContributes[0]); mcemaths_mul_3_4(temp[1].normal, correctedContributes[1]); mcemaths_mul_3_4(temp[1].worldPos, correctedContributes[1]); mcemaths_mul_3_4(temp[1].texcoord, correctedContributes[1]); mcemaths_mul_3_4(temp[1].shadowcoord, correctedContributes[1]); mcemaths_mul_3_4(temp[2].normal, correctedContributes[2]); mcemaths_mul_3_4(temp[2].worldPos, correctedContributes[2]); mcemaths_mul_3_4(temp[2].texcoord, correctedContributes[2]); mcemaths_mul_3_4(temp[2].shadowcoord, correctedContributes[2]); PROCDATA_DIFFUSEONLY* output = (PROCDATA_DIFFUSEONLY*)interpolatedUserData; mcemaths_quatcpy(output->normal, temp[0].normal); mcemaths_quatcpy(output->worldPos, temp[0].worldPos); mcemaths_quatcpy(output->texcoord, temp[0].texcoord); mcemaths_quatcpy(output->shadowcoord, temp[0].shadowcoord); mcemaths_add_3_4_ip(output->normal, temp[1].normal); mcemaths_add_3_4_ip(output->worldPos, temp[1].worldPos); mcemaths_add_3_4_ip(output->texcoord, temp[1].texcoord); mcemaths_add_3_4_ip(output->shadowcoord, temp[1].shadowcoord); mcemaths_add_3_4_ip(output->normal, temp[2].normal); mcemaths_add_3_4_ip(output->worldPos, temp[2].worldPos); mcemaths_add_3_4_ip(output->texcoord, temp[2].texcoord); mcemaths_add_3_4_ip(output->shadowcoord, temp[2].shadowcoord); }
void conv_Float1ToShort2(void* dst, const void* s, s32 numSamples) { LSshort* d = reinterpret_cast<LSshort*>(dst); const LSfloat* src = reinterpret_cast<const LSfloat*>(s); s32 num = numSamples >> 2; //4個のfloatをまとめて処理 s32 offset = num << 2; s32 rem = numSamples - offset; const __m128 fcoff = _mm_set1_ps(32768.0f); __declspec(align(16)) LSshort tmp[8]; const LSfloat* p = src; LSshort* q = d; for(s32 i=0; i<num; ++i){ __m128 f32_0 = _mm_mul_ps(_mm_loadu_ps(p), fcoff); __m128i s32_0 = _mm_cvtps_epi32(f32_0); __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0); _mm_store_si128((__m128i*)tmp, s16_0); q[0] = tmp[0]; q[1] = tmp[0]; q[2] = tmp[1]; q[3] = tmp[1]; q[4] = tmp[2]; q[5] = tmp[2]; q[6] = tmp[3]; q[7] = tmp[3]; p += 4; q += 8; } for(s32 i=0; i<rem; ++i){ s32 j=i<<1; q[j+0] = q[j+1] = toShort(p[i]); } }
void conv_Float2ToShort1(void* dst, const void* s, s32 numSamples) { LSshort* d = reinterpret_cast<LSshort*>(dst); const LSfloat* src = reinterpret_cast<const LSfloat*>(s); s32 num = numSamples >> 2; //4個のfloatをまとめて処理 s32 offset = num << 2; s32 rem = numSamples - offset; const __m128 fcoff = _mm_set1_ps(32768.0f*0.5f); //half __declspec(align(16)) LSshort tmp[8]; const LSfloat* p = src; LSshort* q = d; for(s32 i=0; i<num; ++i){ __m128 f32_0 = _mm_loadu_ps(p); __m128 f32_1 = _mm_add_ps(f32_0, _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(2, 3, 0, 1))); __m128 f32_2 = _mm_mul_ps(f32_1, fcoff); __m128i s32_0 = _mm_cvtps_epi32(f32_2); __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0); _mm_store_si128((__m128i*)tmp, s16_0); q[0] = tmp[0]; q[1] = tmp[2]; p += 4; q += 2; } for(s32 i=0; i<rem; ++i){ s32 j = i<<1; f32 v = 0.5f*(src[j+0] + src[j+1]); q[i] = toShort(v); } }
* */ static char where_to_get_source[] = "http://www.nethack.org/"; static char author[] = "The NetHack Development Team"; #include "hack.h" #include "wintty.h" #include "win32api.h" extern HANDLE hConIn; extern INPUT_RECORD ir; char dllname[512]; char *shortdllname; int FDECL(__declspec(dllexport) __cdecl ProcessKeystroke, (HANDLE hConIn, INPUT_RECORD *ir, boolean *valid, BOOLEAN_P numberpad, int portdebug)); int WINAPI DllMain(HINSTANCE hInstance, DWORD fdwReason, PVOID pvReserved) { char dlltmpname[512]; char *tmp = dlltmpname, *tmp2; *(tmp + GetModuleFileNameA(hInstance, tmp, 511)) = '\0'; (void)strcpy(dllname, tmp); tmp2 = strrchr(dllname, '\\'); if (tmp2) { tmp2++; shortdllname = tmp2; } return TRUE;
// Draws one tube segment between two turtle states into destMesh.
//
// The segment follows a cubic bezier curve whose end points are the two turtle
// positions and whose inner control points lie a third of shapeLen along each
// turtle's axis-2 vector. The curve is split into numDraw slices; for each
// slice a ring of vertices is generated and connected to the previous ring
// with two triangles per edge.
//
//   destMesh         - mesh receiving the vertices and triangles
//   state            - ring buffers (lastVertices / curVertices /
//                      curTempVertices) carried from call to call
//   turtle0, turtle1 - start and end turtle states
//   shapeLen         - length of the segment
//   stexY0, stexY1   - texture Y coordinates at the start and end
//   forceDraw        - when false, segments with numToDrawF <= 1 are skipped
//   numParts         - scales the part length derived from m_sizePar
//
// returns whether the shape was drawn
eBool eLSystem::drawShapes(eMesh& destMesh, tDrawState& state, const tTurtleState& turtle0, const tTurtleState& turtle1, eF32 shapeLen, eF32 stexY0, eF32 stexY1, eBool forceDraw, eU32 numParts)
{
    // Length of one slice, interpolated by `detail`, and never zero or negative.
    eF32 partLen = eLerp(this->m_sizePar * (eF32)numParts, 0.0001f, detail);
//  eF32 partLen = eLerp(eF32_MAX, 0.0001f, detail);
    if(partLen <= 0.0f)
        partLen = eALMOST_ZERO;

    eF32 numToDrawF = (eF32)shapeLen / partLen;
    if(!forceDraw) {
        if(numToDrawF <= 1.0f)
            return false;
    }

    // Number of slices, clamped to [1, m_gen_rings].
    eU32 numDraw = eCeil(eClamp(1.0f, numToDrawF, (eF32)m_gen_rings));
    // NOTE(review): numFaces and faceNr are not used anywhere below.
    eU32 numFaces = numDraw * m_gen_edges * 2;
    eU32 faceNr = 0;

    ePROFILER_ZONE("L-System - Draw Shapes");

    // Bezier control points; 16-byte aligned because they are read with _mm_load_ps.
    __declspec(align(16)) const eVector3 control0 = turtle0.position;
    __declspec(align(16)) const eVector3 control1 = control0 + turtle0.rotation.getVector(2) * 0.333333f * shapeLen;
    __declspec(align(16)) const eVector3 control3 = turtle1.position;
    __declspec(align(16)) const eVector3 control2 = control3 - turtle1.rotation.getVector(2) * 0.333333f * shapeLen;

    // Ring radius scale at both ends.
    eF32 rscale0 = turtle0.size * turtle0.width;
    eF32 rscale1 = turtle1.size * turtle1.width;

    for(eU32 d = 0; d < numDraw; d++) {
        // Curve parameters at the start and end of this slice.
        eF32 t0 = ((eF32)d / (eF32)numDraw);
        eF32 t1 = ((eF32)(d + 1) / (eF32)numDraw);

        // r == 0 is the slice's start ring, r == 1 its end ring.
        for(eU32 r = 0; r <= 1; r++) {
            eF32 tt = (r == 0) ? t0 : t1;
            eF32 rscale = eLerp(rscale0, rscale1, tt);

            // The start ring is generated only when no previous ring exists.
            if((r != 0) || (state.lastVertices->size() == 0)) {
                // create ring vertices
                __declspec(align(16)) eVector3 position;
                __declspec(align(16)) eVector3 normal;

                // calculate bezier curve position (repeated linear interpolation)
                __m128 mt = _mm_set1_ps(tt);
                __m128 mtinv = _mm_set1_ps(1.0f - tt);
                __m128 mcp0 = _mm_load_ps(&control0.x);
                __m128 mcp1 = _mm_load_ps(&control1.x);
                __m128 m0 = _mm_add_ps(_mm_mul_ps(mcp0, mtinv), _mm_mul_ps(mcp1, mt));
                __m128 mcp2 = _mm_load_ps(&control2.x);
                __m128 m1 = _mm_add_ps(_mm_mul_ps(mcp1, mtinv), _mm_mul_ps(mcp2, mt));
                __m128 mm0 = _mm_add_ps(_mm_mul_ps(m0, mtinv), _mm_mul_ps(m1, mt));
                __m128 mcp3 = _mm_load_ps(&control3.x);
                __m128 m2 = _mm_add_ps(_mm_mul_ps(mcp2, mtinv), _mm_mul_ps(mcp3, mt));
                __m128 mm1 = _mm_add_ps(_mm_mul_ps(m1, mtinv), _mm_mul_ps(m2, mt));
                __m128 bezCurvePosition = _mm_add_ps(_mm_mul_ps(mm0, mtinv), _mm_mul_ps(mm1, mt));

                // calculate bezier tangent
                // NOTE(review): _mm_set_ps takes float arguments, so 0xFFFFFFFF is
                // converted to the float value 4294967296.0f rather than an all-ones
                // bit pattern; only the top lane (0x0) yields the intended mask bits.
                // The name and the _mm_and_ps uses suggest a "keep x,y,z, clear w"
                // bitmask was intended. Changing it alters the values feeding the
                // correction block below (which in turn decides whether that block
                // runs at all), so verify visually before fixing.
                __m128 vec3mask = _mm_set_ps(0x0,0xFFFFFFFF,0xFFFFFFFF, 0xFFFFFFFF);
                __m128 mrestangent = _mm_and_ps(_mm_sub_ps(mm1, mm0), vec3mask);
                __m128 mdot = _mm_mul_ps(mrestangent, mrestangent);
                __m128 mdotagg = _mm_hadd_ps(mdot, mdot);
                __m128 recipsqrt = _mm_rsqrt_ss( _mm_hadd_ps(mdotagg, mdotagg) );
                __m128 tangentnorm = _mm_mul_ps(mrestangent, _mm_shuffle_ps(recipsqrt, recipsqrt, _MM_SHUFFLE(0,0,0,0)));

                // get look vector on axis 2 (ringRot.getVector(2))
                eQuat ringRot = turtle0.rotation.slerp(tt, turtle1.rotation);
                __m128 mRingRot = _mm_loadu_ps((eF32*)&ringRot);
                __m128 rrmulparts = _mm_mul_ps(mRingRot, _mm_shuffle_ps(mRingRot, mRingRot, _MM_SHUFFLE(0,1,3,2)));
                __m128 ringRotSqr = _mm_mul_ps(mRingRot, mRingRot);
                __m128 mrdotagg = _mm_hadd_ps(rrmulparts, ringRotSqr);
                __m128 mrdotaggshuf = _mm_shuffle_ps(mrdotagg, mrdotagg, _MM_SHUFFLE(0,2,0,2));
                __m128 mrrotz = _mm_hsub_ps(rrmulparts, mrdotaggshuf);
                __m128 rrecipsqrt = _mm_rsqrt_ss( _mm_hadd_ps(mrdotagg, mrdotagg) );
                __m128 maxisparts = _mm_shuffle_ps(mrrotz,mrdotagg, _MM_SHUFFLE(0,2,0,2)); // -Y-X
                __m128 maxisparts2 = _mm_add_ps(maxisparts, maxisparts); // -Y*2-X*2
                __m128 maxispartsfinal = _mm_shuffle_ps(mrrotz,maxisparts2,_MM_SHUFFLE(0,0,2,0)); //ZZY*2X*2
                __m128 mlook = _mm_and_ps(_mm_mul_ps(maxispartsfinal, _mm_shuffle_ps(rrecipsqrt, rrecipsqrt, _MM_SHUFFLE(0,0,0,0))), vec3mask);

                // calculate side vector (look ^ tangent)
                __m128 mside = _mm_sub_ps( _mm_mul_ps(_mm_shuffle_ps(mlook, mlook, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(tangentnorm, tangentnorm, _MM_SHUFFLE(3, 1, 0, 2))),
                                           _mm_mul_ps(_mm_shuffle_ps(mlook, mlook, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(tangentnorm, tangentnorm, _MM_SHUFFLE(3, 0, 2, 1))) );

                // normalize side vector
                mdot = _mm_mul_ps(mside, mside);
                mdotagg = _mm_hadd_ps(mdot, mdot);
                __m128 dotsum = _mm_hadd_ps(mdotagg, mdotagg);
                const eF32 sideLenSqr = dotsum.m128_f32[0];

                // Apply the correction rotation only when the side vector is
                // non-degenerate; a NaN length also skips this block.
                if(sideLenSqr > eALMOST_ZERO) {
                    recipsqrt = _mm_rsqrt_ss( dotsum );
                    __m128 sidenorm = _mm_mul_ps(mside, _mm_shuffle_ps(recipsqrt, recipsqrt, _MM_SHUFFLE(0,0,0,0)));

                    // calc dot product (look * tangent)
                    // NOTE(review): the operands are mlook and sidenorm, not mlook and
                    // tangentnorm as the comment above says — confirm which is intended.
                    __m128 dotprod = _mm_mul_ps(mlook, sidenorm);
                    __m128 dph0 = _mm_hadd_ps(dotprod, dotprod);
                    __m128 dph1 = _mm_hadd_ps(dph0, dph0);
                    const eF32 dot = eClamp(-1.0f, dph1.m128_f32[0], 1.0f);

                    eF32 alpha = eACos(dot) * (1.0f / (2.0f * ePI));
                    eQuat rotation(sidenorm, alpha);
                    ringRot = rotation * ringRot;
                }

                // The ring is spanned by axes 0 and 1 of the ring rotation.
                eMatrix4x4 curveMat(ringRot);
                __declspec(align(16)) eVector3 ringX = curveMat.getVector(0);
                __declspec(align(16)) eVector3 ringY = curveMat.getVector(1);

                eF32 texY = eLerp(stexY0, stexY1, tt);
                const eF32 texXStep = 1.0f / m_gen_edges;
                eVector2 texPos(0, texY);

                __m128 mRingX = _mm_load_ps(&ringX.x);
                __m128 mRingY = _mm_load_ps(&ringY.x);
                __m128 mScale = _mm_set1_ps(rscale);

                // The table stores value pairs per edge (read here as sin, cos);
                // the loop emits m_gen_edges + 1 vertices.
                for(eU32 e = 0; e <= m_gen_edges * 2; e += 2) {
                    __m128 msin = _mm_set1_ps(m_gen_edge_sinCosTable[e]);
                    __m128 mcos = _mm_set1_ps(m_gen_edge_sinCosTable[e+1]);
                    __m128 mnormal = _mm_add_ps(_mm_mul_ps(mRingX, msin), _mm_mul_ps(mRingY, mcos));
                    // NOTE(review): _mm_store_ps writes four floats; presumably this
                    // relies on the 16-byte alignment padding of the eVector3 locals
                    // (eVector3's layout is not visible here) — confirm.
                    _mm_store_ps(&normal.x, mnormal);
                    __m128 mposition = _mm_add_ps(bezCurvePosition, _mm_mul_ps(mnormal, mScale));
                    _mm_store_ps(&position.x, mposition);
                    state.curVertices->append(destMesh.addVertex(position, normal, texPos));
                    texPos.x += texXStep;
                }

                // connect triangles
                if(r != 0) {
                    // NOTE(review): texY0 and texY1 are computed but not used below.
                    eF32 texY0 = eLerp(stexY0, stexY1, t0);
                    eF32 texY1 = eLerp(stexY0, stexY1, t1);
                    // Two triangles per edge between the current and the previous ring.
                    for(eU32 e = 0; e < m_gen_edges; e++) {
                        destMesh.addTriangleFast((*state.curVertices)[e], (*state.curVertices)[e + 1], (*state.lastVertices)[e + 1], m_gen_materials_dsIdx[turtle0.polyMatIdx]);
                        destMesh.addTriangleFast((*state.curVertices)[e], (*state.lastVertices)[e + 1], (*state.lastVertices)[e], m_gen_materials_dsIdx[turtle0.polyMatIdx]);
                    }
                }

                // The ring just built becomes the "last" ring; the spare buffer is
                // swapped in and cleared to receive the next ring.
                state.lastVertices = state.curVertices;
                eSwap(state.curVertices, state.curTempVertices);
                state.curVertices->clear();
            }
        }
    }
    return true;
}
* in defaults.nh */ static char where_to_get_source[] = "http://www.nethack.org/"; static char author[] = "The NetHack Development Team"; #include "hack.h" #include "wintty.h" #include "win32api.h" extern HANDLE hConIn; extern INPUT_RECORD ir; char dllname[512]; char *shortdllname; int FDECL(__declspec(dllexport) __stdcall ProcessKeystroke, (HANDLE hConIn, INPUT_RECORD *ir, boolean *valid, BOOLEAN_P numberpad, int portdebug)); int WINAPI DllMain(HINSTANCE hInstance, DWORD fdwReason, PVOID pvReserved) { char dlltmpname[512]; char *tmp = dlltmpname, *tmp2; *(tmp + GetModuleFileName(hInstance, tmp, 511)) = '\0'; (void)strcpy(dllname, tmp); tmp2 = strrchr(dllname, '\\'); if (tmp2) { tmp2++; shortdllname = tmp2; } return TRUE;
/**
 * Runs nIters iterations of a 9-point stencil over mtx on the coprocessor.
 *
 * Each interior output element is
 *     wCardinal * (left + right + top + bottom)
 *   + wDiagonal * (the four diagonal neighbours)
 *   + wCenter   * (the element itself)
 * computed from the previous iteration's values.
 *
 * The matrix is treated as square: the element count is derived from
 * GetNumRows() alone.  A border of LINESIZE / sizeof(T) elements on every
 * side is treated as halo and is never written by the stencil loop.
 *
 * @param mtx     Matrix whose flat storage is sent to the device, processed,
 *                and copied back into the same storage.
 * @param nIters  Number of stencil iterations to perform.
 *
 * NOTE(review): the two buffers are swapped after every iteration, so for an
 * odd nIters the most recent values appear to end up in the scratch buffer
 * (which is freed) rather than in pIn -- confirm callers only pass even counts.
 */
template <class T>
void
MICStencil<T>::operator()( Matrix2D<T>& mtx, unsigned int nIters )
{
    unsigned int uDimWithHalo    = mtx.GetNumRows();
    unsigned int uHaloWidth      = LINESIZE / sizeof(T);
    unsigned int uImgElements    = uDimWithHalo * uDimWithHalo;

    // Local copies of the data pointer and the weights so they can be named
    // in the offload clauses below.
    __declspec(target(mic), align(LINESIZE))  T* pIn = mtx.GetFlatData();
    __declspec(target(mic), align(sizeof(T))) T wcenter   = this->wCenter;
    __declspec(target(mic), align(sizeof(T))) T wdiag     = this->wDiagonal;
    __declspec(target(mic), align(sizeof(T))) T wcardinal = this->wCardinal;

    // First offload: transfers pIn only.
    // NOTE(review): ALLOC / RETAIN / REUSE / FREE are macros defined outside
    // this block; presumably they expand to alloc_if/free_if modifiers.
    #pragma offload target(mic) in(pIn:length(uImgElements) ALLOC RETAIN)
    {
        // Just copy pIn to compute the copy transfer time
    }

    // Second offload: the actual computation.
    #pragma offload target(mic) in(pIn:length(uImgElements) REUSE RETAIN)   \
                                in(uImgElements) in(uDimWithHalo)           \
                                in(wcenter) in(wdiag) in(wcardinal)
    {
        // One row partition per group of four online processors, minus one.
        // With fewer than eight processors online the original expression
        // yields 0 (division by zero below) or wraps around when converted to
        // unsigned, so fall back to a single row partition in that case.
        // For eight or more processors the value is unchanged.
        const long   nOnlineProcs   = sysconf(_SC_NPROCESSORS_ONLN);
        unsigned int uRowPartitions = (nOnlineProcs >= 8)
                                    ? (unsigned int)(nOnlineProcs / 4 - 1)
                                    : 1u;
        unsigned int uColPartitions = 4;    // Threads per core for KNC

        unsigned int uRowTileSize   = (uDimWithHalo - 2 * uHaloWidth) / uRowPartitions;
        unsigned int uColTileSize   = (uDimWithHalo - 2 * uHaloWidth) / uColPartitions;

        // Round the row tile size up so the row tiles cover a remainder.
        // NOTE(review): the column tile size is not rounded up; if the
        // interior width is not a multiple of uColPartitions the trailing
        // columns look like they are never written -- confirm.
        uRowTileSize = ((uDimWithHalo - 2 * uHaloWidth) % uRowPartitions > 0)
                     ? (uRowTileSize + 1)
                     : (uRowTileSize);

        // Should use the "Halo Val" when filling the memory space
        // pTmp is the buffer read in the current iteration; pCrnt is a
        // zero-filled scratch buffer of the same size that receives the output.
        T *pTmp  = (T*)pIn;
        T *pCrnt = (T*)memset((T*)_mm_malloc(uImgElements * sizeof(T), LINESIZE), 0, uImgElements * sizeof(T));

        // firstprivate: every thread gets its own copy of the two buffer
        // pointers, so the swap at the end of each iteration is done per
        // thread and the outer pTmp / pCrnt keep their initial values.
        #pragma omp parallel firstprivate(pTmp, pCrnt, uRowTileSize, uColTileSize, uHaloWidth, uDimWithHalo)
        {
            // Map the thread id onto a (row tile, column tile) pair.
            unsigned int uThreadId  = omp_get_thread_num();

            unsigned int uRowTileId = uThreadId / uColPartitions;
            unsigned int uColTileId = uThreadId % uColPartitions;

            // Tile bounds, offset past the halo and clamped so that a tile
            // never extends into the trailing halo.
            unsigned int uStartLine = uRowTileId * uRowTileSize + uHaloWidth;
            unsigned int uStartCol  = uColTileId * uColTileSize + uHaloWidth;

            unsigned int uEndLine   = uStartLine + uRowTileSize;
            uEndLine = (uEndLine > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndLine;

            unsigned int uEndCol    = uStartCol + uColTileSize;
            uEndCol  = (uEndCol > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndCol;

            T cardinal0 = 0.0;
            T diagonal0 = 0.0;
            T center0   = 0.0;

            unsigned int cntIterations, i, j;

            for (cntIterations = 0; cntIterations < nIters; cntIterations ++)
            {
                // Do Stencil Operation
                for (i = uStartLine; i < uEndLine; i++)
                {
                    // Row pointers into the input (current row and the rows
                    // directly above and below) and into the output buffer.
                    T * pCenter = &pTmp [ i * uDimWithHalo];
                    T * pTop    = pCenter - uDimWithHalo;
                    T * pBottom = pCenter + uDimWithHalo;
                    T * pOut    = &pCrnt[ i * uDimWithHalo];

                    // NOTE(review): these promises hold only if each row
                    // starts on a 64-byte boundary, i.e. the row length in
                    // bytes is a multiple of 64 -- confirm against callers.
                    __assume_aligned(pCenter, 64);
                    __assume_aligned(pTop,    64);
                    __assume_aligned(pBottom, 64);
                    __assume_aligned(pOut,    64);

                    #pragma simd vectorlengthfor(float)
                    for (j = uStartCol; j < uEndCol; j++)
                    {
                        cardinal0 = pCenter[j - 1] + pCenter[j + 1] + pTop[j] + pBottom[j];
                        diagonal0 = pTop[j - 1] + pTop[j + 1] + pBottom[j - 1] + pBottom[j + 1];
                        center0   = pCenter[j];

                        pOut[j]   = wcardinal * cardinal0 + wdiag * diagonal0 + wcenter * center0;
                    }
                }

                // All threads must finish writing this iteration's output
                // before any thread starts reading it as the next input.
                #pragma omp barrier
                ;

                // Switch pointers
                T* pAux = pTmp;
                pTmp    = pCrnt;
                pCrnt   = pAux;
            } // End For

        } // End Parallel

        // The outer pCrnt was not modified by the per-thread swaps, so this
        // releases the scratch buffer allocated above.
        _mm_free(pCrnt);
    } // End Offload

    // Third offload: transfers pIn back to the host.
    #pragma offload target(mic) out(pIn:length(uImgElements) REUSE FREE)
    {
        // Just copy back pIn
    }
}
* in defaults.nh */ static char where_to_get_source[] = "http://www.nethack.org/"; static char author[] = "The NetHack Development Team"; #include "hack.h" #include "wintty.h" #include "win32api.h" extern HANDLE hConIn; extern INPUT_RECORD ir; char dllname[512]; char *shortdllname; int FDECL(__declspec(dllexport) __stdcall ProcessKeystroke, (HANDLE hConIn, INPUT_RECORD *ir, boolean *valid, BOOLEAN_P numberpad, int portdebug)); int WINAPI DllMain(HINSTANCE hInstance, DWORD fdwReason, PVOID pvReserved) { char dlltmpname[512]; char *tmp = dlltmpname, *tmp2; *(tmp + GetModuleFileName(hInstance, tmp, 511)) = '\0'; (void) strcpy(dllname, tmp); tmp2 = strrchr(dllname, '\\'); if (tmp2) { tmp2++; shortdllname = tmp2; }
static void gum_dbghelp_backtracer_generate (GumBacktracer * backtracer, const GumCpuContext * cpu_context, GumReturnAddressArray * return_addresses) { GumDbghelpBacktracer * self = GUM_DBGHELP_BACKTRACER_CAST (backtracer); GumDbgHelpImpl * dbghelp = self->priv->dbghelp; guint i; guint skip_count = 0; STACKFRAME64 frame = { 0, }; __declspec (align (64)) CONTEXT context = { 0, }; BOOL success; /* Get the raw addresses */ RtlCaptureContext (&context); frame.AddrPC.Mode = AddrModeFlat; frame.AddrFrame.Mode = AddrModeFlat; frame.AddrStack.Mode = AddrModeFlat; if (cpu_context != NULL) { #if GLIB_SIZEOF_VOID_P == 4 context.Eip = cpu_context->eip; context.Edi = cpu_context->edi; context.Esi = cpu_context->esi; context.Ebp = cpu_context->ebp; context.Esp = cpu_context->esp; context.Ebx = cpu_context->ebx; context.Edx = cpu_context->edx; context.Ecx = cpu_context->ecx; context.Eax = cpu_context->eax; #else context.Rip = cpu_context->rip; context.R15 = cpu_context->r15; context.R14 = cpu_context->r14; context.R13 = cpu_context->r13; context.R12 = cpu_context->r12; context.R11 = cpu_context->r11; context.R10 = cpu_context->r10; context.R9 = cpu_context->r9; context.R8 = cpu_context->r8; context.Rdi = cpu_context->rdi; context.Rsi = cpu_context->rsi; context.Rbp = cpu_context->rbp; context.Rsp = cpu_context->rsp; context.Rbx = cpu_context->rbx; context.Rdx = cpu_context->rdx; context.Rcx = cpu_context->rcx; context.Rax = cpu_context->rax; #endif #if GLIB_SIZEOF_VOID_P == 8 frame.AddrPC.Offset = cpu_context->rip; frame.AddrFrame.Offset = cpu_context->rbp; frame.AddrStack.Offset = cpu_context->rsp; #else frame.AddrPC.Offset = cpu_context->eip; frame.AddrFrame.Offset = cpu_context->ebp; frame.AddrStack.Offset = cpu_context->esp; #endif } else { #if GLIB_SIZEOF_VOID_P == 4 frame.AddrPC.Offset = context.Eip; frame.AddrFrame.Offset = context.Ebp; frame.AddrStack.Offset = context.Esp; #else frame.AddrPC.Offset = context.Rip; frame.AddrFrame.Offset = context.Rbp; 
frame.AddrStack.Offset = context.Rsp; #endif #ifdef _DEBUG skip_count = 1; /* leave out this function */ #endif #if GLIB_SIZEOF_VOID_P == 8 skip_count++; #endif } return_addresses->len = 0; dbghelp->Lock (); for (i = 0; i < GUM_MAX_BACKTRACE_DEPTH + skip_count; i++) { success = dbghelp->StackWalk64 (GUM_BACKTRACER_MACHINE_TYPE, GetCurrentProcess (), GetCurrentThread (), &frame, &context, NULL, dbghelp->SymFunctionTableAccess64, dbghelp->SymGetModuleBase64, NULL); if (!success) break; else if (frame.AddrPC.Offset == frame.AddrReturn.Offset) break; else if (frame.AddrPC.Offset != 0) { if (i >= skip_count) { g_assert_cmpuint (return_addresses->len, <, G_N_ELEMENTS (return_addresses->items)); return_addresses->items[return_addresses->len++] = GSIZE_TO_POINTER (frame.AddrPC.Offset); } } }
// Reports whether every texel of a block-compressed image is opaque.
//
// Returns false when the image has no pixel data, when its format is not one
// of the BC1 / BC2 / BC3 / BC7 variants handled below, or as soon as a decoded
// texel inside the image bounds has a W component below 0.99.  Returns true
// when the whole image has been scanned without finding such a texel.
bool _IsAlphaAllOpaqueBC(_In_ const Image& cImage)
{
    if (!cImage.pixels)
        return false;

    // Promote "typeless" BC formats to their UNORM equivalents; any other
    // format is used unchanged.
    DXGI_FORMAT format = cImage.format;
    if (format == DXGI_FORMAT_BC1_TYPELESS)
        format = DXGI_FORMAT_BC1_UNORM;
    else if (format == DXGI_FORMAT_BC2_TYPELESS)
        format = DXGI_FORMAT_BC2_UNORM;
    else if (format == DXGI_FORMAT_BC3_TYPELESS)
        format = DXGI_FORMAT_BC3_UNORM;
    else if (format == DXGI_FORMAT_BC7_TYPELESS)
        format = DXGI_FORMAT_BC7_UNORM;

    // Pick the block decoder and the number of bytes one encoded block takes.
    BC_DECODE decodeBlock;
    size_t bytesPerBlock;
    if (format == DXGI_FORMAT_BC1_UNORM || format == DXGI_FORMAT_BC1_UNORM_SRGB)
    {
        decodeBlock = D3DXDecodeBC1;
        bytesPerBlock = 8;
    }
    else if (format == DXGI_FORMAT_BC2_UNORM || format == DXGI_FORMAT_BC2_UNORM_SRGB)
    {
        decodeBlock = D3DXDecodeBC2;
        bytesPerBlock = 16;
    }
    else if (format == DXGI_FORMAT_BC3_UNORM || format == DXGI_FORMAT_BC3_UNORM_SRGB)
    {
        decodeBlock = D3DXDecodeBC3;
        bytesPerBlock = 16;
    }
    else if (format == DXGI_FORMAT_BC7_UNORM || format == DXGI_FORMAT_BC7_UNORM_SRGB)
    {
        decodeBlock = D3DXDecodeBC7;
        bytesPerBlock = 16;
    }
    else
    {
        // BC4, BC5, and BC6 don't have alpha channels
        return false;
    }

    // Texels whose alpha is below this value count as non-opaque.
    static const XMVECTORF32 opaqueThreshold = { { { 0.99f, 0.99f, 0.99f, 0.99f } } };

    // Scratch space for one decoded 4x4 block.
    __declspec(align(16)) XMVECTOR texels[16];

    const uint8_t *blockRow = cImage.pixels;
    for (size_t top = 0; top < cImage.height; top += 4, blockRow += cImage.rowPitch)
    {
        // Rows of this block row that lie inside the image (fewer than 4 on
        // the bottom edge when the height is not a multiple of 4).
        const size_t rowsInside = std::min<size_t>(4, cImage.height - top);

        for (size_t offset = 0, left = 0;
             (offset < cImage.rowPitch) && (left < cImage.width);
             offset += bytesPerBlock, left += 4)
        {
            decodeBlock(texels, blockRow + offset);

            // Columns of this block that lie inside the image.
            const size_t colsInside = std::min<size_t>(4, cImage.width - left);
            assert(colsInside > 0 && rowsInside > 0);

            // Check only the texels inside the image; for a full block this
            // visits all 16 entries in index order.
            for (size_t row = 0; row < rowsInside; ++row)
            {
                for (size_t col = 0; col < colsInside; ++col)
                {
                    const XMVECTOR alpha = XMVectorSplatW(texels[row * 4 + col]);
                    if (XMVector4Less(alpha, opaqueThreshold))
                        return false;
                }
            }
        }
    }

    return true;
}