// Test driver for AvxGatherFloat_(): fills a 15-element float array with
// pseudo-random values, prepares the destination, index and mask operands,
// and prints the operands before and after the call.
void AvxGatherFloat(void)
{
    // The two mask settings differ only in the most significant bit.
    const int merge_no = 0;
    const int merge_yes = 0x80000000;
    const int n = 15;
    const int num_lanes = 8;

    // Per-lane index into x[] and the mask setting paired with it.
    const int lane_index[num_lanes] = { 2, 1, 6, 5, 4, 13, 11, 9 };
    const int lane_mask[num_lanes] = {
        merge_yes, merge_yes, merge_no, merge_yes,
        merge_yes, merge_no, merge_yes, merge_yes
    };

    float x[n];
    __declspec(align(32)) YmmVal des;
    __declspec(align(32)) YmmVal indices;
    __declspec(align(32)) YmmVal mask;

    // Fixed seed so the test array is the same on every run.
    srand(22);
    for (int k = 0; k < n; ++k)
        x[k] = (float)(rand() % 1000);

    // Destination starts at -1.0 in every lane; indices and mask come
    // from the tables above.
    for (int lane = 0; lane < num_lanes; ++lane)
    {
        des.r32[lane] = -1.0f;
        indices.i32[lane] = lane_index[lane];
        mask.i32[lane] = lane_mask[lane];
    }

    printf("\nResults for AvxGatherFloat()\n");
    printf("Test array\n");
    for (int k = 0; k < n; ++k)
        printf("x[%02d]: %6.1f\n", k, x[k]);
    printf("\n");

    const char* before_msg = "Values BEFORE call to AvxGatherFloat_()";
    const char* after_msg = "Values AFTER call to AvxGatherFloat_()";

    AvxGatherFloatPrint(before_msg, des, indices, mask);
    AvxGatherFloat_(&des, &indices, &mask, x);
    AvxGatherFloatPrint(after_msg, des, indices, mask);
}
Esempio n. 2
0
void IP_SingleColour::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const
{
	__declspec(align(16)) PROCDATA_SINGLECOLOUR temp[3];
	copyUserDataSingleColour(temp,     (PROCDATA_SINGLECOLOUR*)vertexUserData[0]);
	copyUserDataSingleColour(temp + 1, (PROCDATA_SINGLECOLOUR*)vertexUserData[1]);
	copyUserDataSingleColour(temp + 2, (PROCDATA_SINGLECOLOUR*)vertexUserData[2]);

	mcemaths_mul_3_4(temp[0].normal, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].worldPos, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].shadowcoord, correctedContributes[0]);

	mcemaths_mul_3_4(temp[1].normal, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].worldPos, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].shadowcoord, correctedContributes[1]);

	mcemaths_mul_3_4(temp[2].normal, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].worldPos, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].shadowcoord, correctedContributes[2]);

	PROCDATA_SINGLECOLOUR* output = (PROCDATA_SINGLECOLOUR*)interpolatedUserData;

	mcemaths_quatcpy(output->normal, temp[0].normal);
	mcemaths_quatcpy(output->worldPos, temp[0].worldPos);
	mcemaths_quatcpy(output->shadowcoord, temp[0].shadowcoord);

	mcemaths_add_3_4_ip(output->normal, temp[1].normal);
	mcemaths_add_3_4_ip(output->worldPos, temp[1].worldPos);
	mcemaths_add_3_4_ip(output->shadowcoord, temp[1].shadowcoord);

	mcemaths_add_3_4_ip(output->normal, temp[2].normal);
	mcemaths_add_3_4_ip(output->worldPos, temp[2].worldPos);
	mcemaths_add_3_4_ip(output->shadowcoord, temp[2].shadowcoord);
}
// Entry point: hands one nbwcs pointer to send_inputs(), use_the_data() and
// receive_results(), in that order, then exits with status 0.
// NOTE(review): struct1 is declared but never assigned before it is passed by
// value to send_inputs(); unless storage is attached somewhere not visible
// here, every call receives an indeterminate pointer - confirm.
int main()
{
	// Declared with the target(mic) attribute.
	__declspec(target(mic)) nbwcs* struct1;

	send_inputs(struct1);
	use_the_data(struct1);
	receive_results(struct1);
	return 0;
}
// Fills two 4x4 matrices row by row (the first with the identity pattern)
// and passes them to SsePfpMatrix4x4Multiply_ with m_des as the destination.
void TestFig8_3(void)
{
    const int src1_rows[4][4] = {
        { 1, 0, 0, 0 },
        { 0, 1, 0, 0 },
        { 0, 0, 1, 0 },
        { 0, 0, 0, 1 }
    };
    const int src2_rows[4][4] = {
        {  2,  7,  8,  3 },
        { 11, 14, 16, 10 },
        { 24, 21, 27, 29 },
        { 31, 34, 38, 33 }
    };

    __declspec(align(16)) Mat4x4 m_src1;
    __declspec(align(16)) Mat4x4 m_src2;
    __declspec(align(16)) Mat4x4 m_des;

    for (int row = 0; row < 4; ++row)
        Mat4x4SetRow(m_src1, row, src1_rows[row][0], src1_rows[row][1], src1_rows[row][2], src1_rows[row][3]);

    for (int row = 0; row < 4; ++row)
        Mat4x4SetRow(m_src2, row, src2_rows[row][0], src2_rows[row][1], src2_rows[row][2], src2_rows[row][3]);

    SsePfpMatrix4x4Multiply_(m_des, m_src1, m_src2);
}
void AvxGatherI64(void)
{
    const Int64 merge_no = 0;
    const Int64 merge_yes = 0x8000000000000000LL;
    const int n = 15;
    Int64 x[n];
    __declspec(align(32)) YmmVal des;
    __declspec(align(16)) XmmVal indices;
    __declspec(align(32)) YmmVal mask;

    // Initialize the test array
    srand(36);
    for (int i = 0; i < n; i++)
        x[i] = (Int64)(rand() % 1000);

    // Load des with initial values
    for (int i = 0; i < 4; i++)
        des.i64[i] = -1;

    // Initialize the indices and mask elements
    indices.i32[0] = 2;
    indices.i32[1] = 7;
    indices.i32[2] = 9;
    indices.i32[3] = 12;

    mask.i64[0] = merge_yes;
    mask.i64[1] = merge_yes;
    mask.i64[2] = merge_no;
    mask.i64[3] = merge_yes;

    printf("\nResults for AvxGatherI64()\n");
    printf("Test array\n");
    for (int i = 0; i < n; i++)
        printf("x[%02d]: %8lld\n", i, x[i]);
    printf("\n");

    const char* s1 = "Values BEFORE call to AvxGatherI64_()";
    const char* s2 = "Values AFTER call to AvxGatherI64_()";

    AvxGatherI64Print(s1, des, indices, mask);
    AvxGatherI64_(&des, &indices, &mask, x);
    AvxGatherI64Print(s2, des, indices, mask);
}
Esempio n. 6
0
void   CHeapAllocator::free(void * addr, size_t nbytes)
{
    if ((m_hHeap != NULL)&&(addr))
    {
        if (HeapFree(m_hHeap,0,addr) && m_bLogSize)
        {
#pragma warning(suppress: 4267)
            __declspec(align(4)) LONG mysize = 0L - nbytes;
            ::InterlockedExchangeAdd(&m_lAllocatedSize,mysize);
        }
    }
}
// Averages two byte blocks that are 8 bytes wide and 16 rows tall and writes
// the result to dst.
//   address1, address2 - top-left bytes of the two source blocks
//   stride_src         - byte distance between rows, used for both sources
//   dst                - top-left byte of the destination block
//   stride_dst         - byte distance between destination rows
// The block is handled as two passes of 8 rows. Within a pass all 16 source
// rows are loaded before any destination row is stored.
void read_luma_inter_pred_avg_8x16_intrinsic( BYTE *address1, BYTE *address2, INT stride_src, BYTE *dst, INT stride_dst )
{
	int i;
	int src_stride = stride_src;
	int dst_stride = stride_dst;
	const unsigned char* src1 = address1;
	const unsigned char* src2 = address2;

	for( i = 0; i < 16; i+=8)
	{
		__declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7,
			r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x;
		// Row offsets 2x and 4x the stride; other rows are sums of these.
		int stride2 = (src_stride<<1);
		int stride4 = (src_stride<<2);
		int dst_stride2 = (dst_stride<<1);
		int dst_stride4 = (dst_stride<<2);
		// Load 8 bytes (low 64 bits) from each of the 8 rows of the first source.
		r0 = _mm_loadl_epi64((__m128i*)(src1));
		r1 = _mm_loadl_epi64((__m128i*)(src1+src_stride));
		r2 = _mm_loadl_epi64((__m128i*)(src1+stride2));
		r3 = _mm_loadl_epi64((__m128i*)(src1+stride2+src_stride));
		r4 = _mm_loadl_epi64((__m128i*)(src1+stride4));
		r5 = _mm_loadl_epi64((__m128i*)(src1+stride4+src_stride));
		r6 = _mm_loadl_epi64((__m128i*)(src1+stride4+stride2));
		r7 = _mm_loadl_epi64((__m128i*)(src1+stride4+stride2+src_stride));
		// Same 8 rows from the second source.
		r0_x = _mm_loadl_epi64((__m128i*)(src2));
		r1_x = _mm_loadl_epi64((__m128i*)(src2+src_stride));
		r2_x = _mm_loadl_epi64((__m128i*)(src2+stride2));
		r3_x = _mm_loadl_epi64((__m128i*)(src2+stride2+src_stride));
		r4_x = _mm_loadl_epi64((__m128i*)(src2+stride4));
		r5_x = _mm_loadl_epi64((__m128i*)(src2+stride4+src_stride));
		r6_x = _mm_loadl_epi64((__m128i*)(src2+stride4+stride2));
		r7_x = _mm_loadl_epi64((__m128i*)(src2+stride4+stride2+src_stride));
		// Per-byte unsigned average of corresponding rows.
		r0 = _mm_avg_epu8(r0, r0_x);
		r1 = _mm_avg_epu8(r1, r1_x);
		r2 = _mm_avg_epu8(r2, r2_x);
		r3 = _mm_avg_epu8(r3, r3_x);
		r4 = _mm_avg_epu8(r4, r4_x);
		r5 = _mm_avg_epu8(r5, r5_x);
		r6 = _mm_avg_epu8(r6, r6_x);
		r7 = _mm_avg_epu8(r7, r7_x);
		// Store the low 8 bytes of each averaged row.
		_mm_storel_epi64((__m128i*)(dst), r0);
		_mm_storel_epi64((__m128i*)(dst+dst_stride), r1);
		_mm_storel_epi64((__m128i*)(dst+dst_stride2), r2);
		_mm_storel_epi64((__m128i*)(dst+dst_stride2+dst_stride), r3);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4), r4);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride), r5);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride2), r6);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride2+dst_stride), r7);
		// Advance all three pointers by 8 rows for the second pass.
		src1 += (stride4<<1);
		src2 += (stride4<<1);
		dst += (dst_stride4<<1);
	}
}
Esempio n. 8
0
/*
 * Fills in details for the thread identified by thread_id.
 *
 * For the calling thread the context is captured directly. For any other
 * thread, the thread is opened, suspended, its control and integer registers
 * are read, and it is resumed again. The reported state is derived from the
 * suspend count returned by SuspendThread().
 *
 * Returns TRUE when a CPU context was captured and parsed, FALSE otherwise.
 * details->id is always written.
 */
static gboolean
gum_windows_get_thread_details (DWORD thread_id,
                                GumThreadDetails * details)
{
  __declspec (align (64)) CONTEXT context = { 0, };
  HANDLE thread;
  DWORD previous_suspend_count;
  gboolean captured = FALSE;

  details->id = thread_id;

  if (thread_id == GetCurrentThreadId ())
  {
    /* The calling thread is running by definition. */
    details->state = GUM_THREAD_RUNNING;

    RtlCaptureContext (&context);
    gum_windows_parse_context (&context, &details->cpu_context);

    return TRUE;
  }

  thread = OpenThread (THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME, FALSE,
      thread_id);
  if (thread == NULL)
    return FALSE;

  previous_suspend_count = SuspendThread (thread);
  if (previous_suspend_count != (DWORD) -1)
  {
    /* A non-zero previous count means the thread was already suspended. */
    details->state = (previous_suspend_count == 0)
        ? GUM_THREAD_RUNNING
        : GUM_THREAD_STOPPED;

    context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
    if (GetThreadContext (thread, &context))
    {
      gum_windows_parse_context (&context, &details->cpu_context);
      captured = TRUE;
    }

    /* Undo our own suspension whether or not the context was read. */
    ResumeThread (thread);
  }

  CloseHandle (thread);

  return captured;
}
Esempio n. 9
0
// Returns the current time in microseconds as a double.
// Windows: performance counter value scaled by 1e6 and divided by
// instance->frequency. Linux: CLOCK_MONOTONIC seconds and nanoseconds
// converted to microseconds. Other platforms: 0.
double Timing::getTimeUsd()
{
#if defined(WIN32) || defined(_WIN32)
	__declspec(align(16)) LARGE_INTEGER ticks;
	QueryPerformanceCounter(&ticks);
	const double scaledTicks = 1000000.0 * (double)ticks.QuadPart;
	const double frequency = (double)instance->frequency.QuadPart;
	return (scaledTicks / frequency);
#elif defined(__linux__)
	timespec now;
	clock_gettime(CLOCK_MONOTONIC, &now);
	const double fromSeconds = (double)now.tv_sec * 1000000.0;
	const double fromNanoseconds = (double)now.tv_nsec / 1000.0;
	return (fromSeconds + fromNanoseconds);
#endif
	// Only reached when neither platform branch was compiled in.
	return 0.0f;
}
Esempio n. 10
0
// Returns the current time in microseconds as an unsigned long.
// Windows: performance counter value converted to microseconds using
// instance->frequency. Linux: CLOCK_MONOTONIC seconds and nanoseconds
// converted to microseconds. Other platforms: 0.
// The result is truncated to the width of unsigned long, so where that type
// is 32 bits the value wraps roughly every 71.6 minutes.
unsigned long Timing::getTimeUsul()
{
#if defined(WIN32) || defined(_WIN32)
	__declspec(align(16)) LARGE_INTEGER counter;
	QueryPerformanceCounter(&counter);
	// Convert whole seconds and the sub-second remainder separately.
	// Multiplying the raw counter by 1000000 first overflows a signed 64-bit
	// value after a few days of uptime on high-frequency counters; splitting
	// keeps every intermediate in range and yields the same quotient.
	const LONGLONG frequency = instance->frequency.QuadPart;
	const LONGLONG wholeSeconds = counter.QuadPart / frequency;
	const LONGLONG leftoverTicks = counter.QuadPart % frequency;
	return (unsigned long)(wholeSeconds * 1000000 + (leftoverTicks * 1000000) / frequency);
#elif defined(__linux__)
	timespec current;
	clock_gettime(CLOCK_MONOTONIC, &current);
	return ((unsigned long)current.tv_sec * 1000000 + (unsigned long)current.tv_nsec / 1000);
#endif
	// Only reached when neither platform branch was compiled in.
	return 0;
}
Esempio n. 11
0
void IP_Planet::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const
{
	__declspec(align(16)) PROCDATA_PLANET temp[3];
	copyUserData(temp,     (PROCDATA_PLANET*)vertexUserData[0]);
	copyUserData(temp + 1, (PROCDATA_PLANET*)vertexUserData[1]);
	copyUserData(temp + 2, (PROCDATA_PLANET*)vertexUserData[2]);

	mcemaths_mul_3_4(temp[0].tangent, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].binormal, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].normal, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].worldPos, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].texcoord, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].shadowcoord, correctedContributes[0]);

	mcemaths_mul_3_4(temp[1].tangent, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].binormal, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].normal, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].worldPos, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].texcoord, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].shadowcoord, correctedContributes[1]);

	mcemaths_mul_3_4(temp[2].tangent, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].binormal, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].normal, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].worldPos, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].texcoord, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].shadowcoord, correctedContributes[2]);

	PROCDATA_PLANET* output = (PROCDATA_PLANET*)interpolatedUserData;

	mcemaths_quatcpy(output->tangent, temp[0].tangent);
	mcemaths_quatcpy(output->binormal, temp[0].binormal);
	mcemaths_quatcpy(output->normal, temp[0].normal);
	mcemaths_quatcpy(output->worldPos, temp[0].worldPos);
	mcemaths_quatcpy(output->texcoord, temp[0].texcoord);
	mcemaths_quatcpy(output->shadowcoord, temp[0].shadowcoord);

	mcemaths_add_3_4_ip(output->tangent, temp[1].tangent);
	mcemaths_add_3_4_ip(output->binormal, temp[1].binormal);
	mcemaths_add_3_4_ip(output->normal, temp[1].normal);
	mcemaths_add_3_4_ip(output->worldPos, temp[1].worldPos);
	mcemaths_add_3_4_ip(output->texcoord, temp[1].texcoord);
	mcemaths_add_3_4_ip(output->shadowcoord, temp[1].shadowcoord);

	mcemaths_add_3_4_ip(output->tangent, temp[2].tangent);
	mcemaths_add_3_4_ip(output->binormal, temp[2].binormal);
	mcemaths_add_3_4_ip(output->normal, temp[2].normal);
	mcemaths_add_3_4_ip(output->worldPos, temp[2].worldPos);
	mcemaths_add_3_4_ip(output->texcoord, temp[2].texcoord);
	mcemaths_add_3_4_ip(output->shadowcoord, temp[2].shadowcoord);
}
// Test driver for SsePiMul32_(): loads two vectors of four 32-bit integers,
// calls the routine, and prints the inputs alongside both result vectors
// (c[0] formatted as 32-bit lanes, c[1] as 64-bit lanes).
void SsePiMul32(void)
{
    const int a_init[4] = { 10, 20, -30, -40 };
    const int b_init[4] = { 100, -200, 300, -400 };

    __declspec(align(16)) XmmVal a;
    __declspec(align(16)) XmmVal b;
    __declspec(align(16)) XmmVal c[2];
    char text[256];

    for (int lane = 0; lane < 4; ++lane)
    {
        a.i32[lane] = a_init[lane];
        b.i32[lane] = b_init[lane];
    }

    SsePiMul32_(&a, &b, c);

    printf("\nResults for SsePiMul32_\n");

    // First result vector.
    printf("a:    %s\n", a.ToString_i32(text, sizeof(text)));
    printf("b:    %s\n", b.ToString_i32(text, sizeof(text)));
    printf("c[0]: %s\n", c[0].ToString_i32(text, sizeof(text)));
    printf("\n");

    // Second result vector, shown with the same inputs.
    printf("a:    %s\n", a.ToString_i32(text, sizeof(text)));
    printf("b:    %s\n", b.ToString_i32(text, sizeof(text)));
    printf("c[1]: %s\n", c[1].ToString_i64(text, sizeof(text)));
}
Esempio n. 13
0
void * CHeapAllocator::alloc(size_t nbytes)
{
    if ((m_hHeap != NULL)&&(nbytes))
    {
        void * pVoid = HeapAlloc(m_hHeap,0,nbytes);
        if (pVoid && m_bLogSize)
        {
#pragma warning(suppress: 4267)
            __declspec(align(4)) LONG mysize = nbytes;
            ::InterlockedExchangeAdd(&m_lAllocatedSize,mysize);
        }
        return pVoid;
    }
    return NULL;
}
// Averages two 16x16 byte blocks and writes the result to dst.
//   address1, address2 - top-left bytes of the two source blocks
//   stride_src         - byte distance between rows, used for both sources
//   dst                - top-left byte of the destination block
//   stride_dst         - byte distance between destination rows
// The block is handled as two passes of 8 rows. Within a pass all 16 source
// rows are loaded before any destination row is stored. Loads and stores are
// unaligned, so no alignment is required of the pointers or strides.
void read_luma_inter_pred_avg_16x16_intrinsic( BYTE *address1, BYTE *address2, INT stride_src, BYTE *dst, INT stride_dst )
{
	for(int i = 0; i < 16; i+=8)
	{
		__declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7,
			r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x;
		// Row offsets 2x and 4x the stride; other rows are sums of these.
		int stride2 = (stride_src<<1);
		int stride4 = (stride_src<<2);
		int dst_stride2 = (stride_dst<<1);
		int dst_stride4 = (stride_dst<<2);
		// Every source row is addressed with stride_src. Rows 1, 5 and 7 were
		// previously offset with stride_dst, which fetched the wrong rows
		// whenever the source and destination strides differ.
		r0 = _mm_loadu_si128((__m128i*)(address1));
		r1 = _mm_loadu_si128((__m128i*)(address1+stride_src));
		r2 = _mm_loadu_si128((__m128i*)(address1+stride2));
		r3 = _mm_loadu_si128((__m128i*)(address1+stride2+stride_src));
		r4 = _mm_loadu_si128((__m128i*)(address1+stride4));
		r5 = _mm_loadu_si128((__m128i*)(address1+stride4+stride_src));
		r6 = _mm_loadu_si128((__m128i*)(address1+stride4+stride2));
		r7 = _mm_loadu_si128((__m128i*)(address1+stride4+stride2+stride_src));
		r0_x = _mm_loadu_si128((__m128i*)(address2));
		r1_x = _mm_loadu_si128((__m128i*)(address2+stride_src));
		r2_x = _mm_loadu_si128((__m128i*)(address2+stride2));
		r3_x = _mm_loadu_si128((__m128i*)(address2+stride2+stride_src));
		r4_x = _mm_loadu_si128((__m128i*)(address2+stride4));
		r5_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride_src));
		r6_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride2));
		r7_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride2+stride_src));
		// Per-byte unsigned average of corresponding rows.
		r0 = _mm_avg_epu8(r0, r0_x);
		r1 = _mm_avg_epu8(r1, r1_x);
		r2 = _mm_avg_epu8(r2, r2_x);
		r3 = _mm_avg_epu8(r3, r3_x);
		r4 = _mm_avg_epu8(r4, r4_x);
		r5 = _mm_avg_epu8(r5, r5_x);
		r6 = _mm_avg_epu8(r6, r6_x);
		r7 = _mm_avg_epu8(r7, r7_x);
		// Destination rows are addressed with stride_dst.
		_mm_storeu_si128((__m128i*)(dst), r0);
		_mm_storeu_si128((__m128i*)(dst+stride_dst), r1);
		_mm_storeu_si128((__m128i*)(dst+dst_stride2), r2);
		_mm_storeu_si128((__m128i*)(dst+dst_stride2+stride_dst), r3);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4), r4);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4+stride_dst), r5);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4+dst_stride2), r6);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4+dst_stride2+stride_dst), r7);
		// Advance all three pointers by 8 rows for the second pass.
		address1 += (stride4<<1);
		address2 += (stride4<<1);
		dst += (dst_stride4<<1);
	}
}
Esempio n. 15
0
// Approximates the integral of exp(-x*x/2) from 0 to b with composite
// Simpson's rule over NN sub-intervals, using 8-wide double vectors for the
// interior points, and returns that value plus 0.5*sqrt(2*PI) (the integral
// of the same function over (-inf, 0]).
// NN and PI are defined outside this function.
double vNormalIntegral(double b)
{
  __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp; 
  //NN/2-1 has to be the multiple of 8
  //NN = (8*LV+1)*2, LV = 20 -> NN = 322
  //const int NN = 322; 
  const int vecsize = 8; 
  // Number of vector passes; any remainder of (NN/2-1)/8 is dropped.
  const int nCal = (NN/2-1)/vecsize;
  //const int left = NN%vecsize;
  double a = 0.0f;
  double s, h, sum = 0.0f;
  h = (b-a)/NN;
  // add in the first few terms 
  // f(a) with weight 1 and f(a+h) with weight 4
  sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0);
  // and the last one
  sum += exp(-b*b/2.0);

  vec_cf0 = _mm512_set1_pd(a);
  vec_cf1 = _mm512_set1_pd(2*h);
  vec_cf2 = _mm512_set1_pd(-0.5);

  // _mm512_set_pd lists elements from highest to lowest, so lane 0 holds 1.
  vec_s   = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize
  vec_s   = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h)
  vec_s   = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h)
  
  // After the +h step inside the loop, this moves each lane on to the next
  // group of 8 even-indexed points (16h total per pass).
  vec_stp = _mm512_set1_pd(2*h*vecsize-h);
  // vec_cf0 is reused from here on as the constant h.
  vec_cf0 = _mm512_set1_pd(h);
  
  for (int i = 0; i < nCal; ++i){
    // Even-indexed points: weight 2.
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum
    sum    += 2.0*_mm512_reduce_add_pd(vec_cf1);

    // Odd-indexed points one step to the right: weight 4.
    vec_s   = _mm512_add_pd(vec_s, vec_cf0);//s+=h
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);
    sum    += 4.0*_mm512_reduce_add_pd(vec_cf1);
    
    vec_s   = _mm512_add_pd(vec_s, vec_stp);
  }

  // Simpson scaling (h/3) plus the constant half-line contribution.
  sum = 0.5*sqrt(2*PI) + h*sum/3.0;
  return sum;
}
Esempio n. 16
0
void IP_CloudShadow::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const
{
	__declspec(align(16)) PROCDATA_CLOUDSHADOW temp[3];
	copyUserData(temp,     (PROCDATA_CLOUDSHADOW*)vertexUserData[0]);
	copyUserData(temp + 1, (PROCDATA_CLOUDSHADOW*)vertexUserData[1]);
	copyUserData(temp + 2, (PROCDATA_CLOUDSHADOW*)vertexUserData[2]);

	mcemaths_mul_3_4(temp[0].texcoord, correctedContributes[0]);
	mcemaths_mul_3_4(temp[1].texcoord, correctedContributes[1]);
	mcemaths_mul_3_4(temp[2].texcoord, correctedContributes[2]);

	PROCDATA_CLOUDSHADOW* output = (PROCDATA_CLOUDSHADOW*)interpolatedUserData;

	mcemaths_quatcpy(output->texcoord, temp[0].texcoord);
	mcemaths_add_3_4_ip(output->texcoord, temp[1].texcoord);
	mcemaths_add_3_4_ip(output->texcoord, temp[2].texcoord);
}
Esempio n. 17
0
// Evaluates the spline at parameter t.
//   resultPos - receives the position written by eVector3::cubicBezier
//   resultRot - receives the slerp of rot0 and rot1; when the cross product of
//               its 'ax' vector and the second cubicBezier output is not
//               near zero, an extra rotation about that cross product is
//               applied on top.
void eBezierSpline::evaluate(eF32 t, eVector3& resultPos, eQuat& resultRot) const {
	resultRot = rot0.slerp(t, rot1);

	__declspec(align(16)) eVector3 curveDir;
	eVector3::cubicBezier(t, control0, control1, control2, control3, resultPos, curveDir);

	eVector3 forward = resultRot.getVector(ax);
	eVector3 rotAxis = (forward^curveDir);
	eF32 axisLenSqr = rotAxis.sqrLength();

	// A (near) zero-length axis gives nothing to rotate about.
	if(!(axisLenSqr > eALMOST_ZERO))
		return;

	rotAxis /= eSqrt(axisLenSqr);
	// Clamp before eACos so rounding cannot push the argument outside [-1, 1].
	eF32 cosAngle = eClamp(-1.0f, forward * curveDir, 1.0f);
	eF32 angle = eACos(cosAngle) * (1.0f / (2.0f * ePI));
	eQuat correction(rotAxis, angle);
	resultRot = correction * resultRot;
}
Esempio n. 18
0
File: dsp.cpp Progetto: taqu/opus
    // Converts interleaved pairs of 16-bit samples into single 16-bit samples
    // by averaging each adjacent pair: out[k] = (in[2k] + in[2k+1]) >> 1.
    //   dst        - destination buffer, receives numSamples shorts
    //   s          - source buffer, read as 2 * numSamples shorts
    //   numSamples - number of output samples
    // Blocks of 4 outputs (8 inputs) go through SSE; the rest is done one
    // output at a time.
    void conv_Short2ToShort1(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSshort* src = reinterpret_cast<const LSshort*>(s);

        s32 num = numSamples >> 2; // process 8 shorts at a time
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128i izero = _mm_setzero_si128();
        // Aligned scratch buffer for the packed result of each SIMD pass.
        __declspec(align(16)) LSshort tmp[8];

        const LSshort* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            // convert to 32-bit integers r0, r1
            __m128i t0 = _mm_loadu_si128((const __m128i*)p);
            // t1 is all-ones for negative lanes: the sign-extension word.
            __m128i t1 = _mm_cmpgt_epi16(izero, t0);
            __m128i r0 = _mm_unpackhi_epi16(t0, t1);
            __m128i r1 = _mm_unpacklo_epi16(t0, t1);

            // Swap neighbours within each pair and add, so every lane holds
            // the sum of its pair.
            __m128i r2 = _mm_add_epi32(r0, _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 3, 0, 1)));
            __m128i r3 = _mm_add_epi32(r1, _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 3, 0, 1)));

            // Halve the sums (arithmetic shift keeps the sign).
            r2 = _mm_srai_epi32(r2, 1);
            r3 = _mm_srai_epi32(r3, 1);
            __m128i r4 = _mm_packs_epi32(r3, r2);
            _mm_store_si128((__m128i*)tmp, r4);
            // Each average appears twice in tmp; take every other entry.
            q[0] = tmp[0];
            q[1] = tmp[2];
            q[2] = tmp[4];
            q[3] = tmp[6];
            p += 8;
            q += 4;
        }

        // Scalar tail for the outputs that did not fill a block of 4.
        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            s32 t = (p[j+0] + p[j+1]) >> 1;
            q[i] = static_cast<LSshort>(t);
        }
    }
Esempio n. 19
0
// 64-bit perform routine: multiplies one input signal vector by x->x_val and
// writes the result to outs[0].
//   userparam    - carries the index of the signal input as an integer value
//   sampleframes - number of samples to process
// NOTE(review): (int) userparam truncates the pointer where pointers are
// wider than int - confirm this is acceptable on 64-bit builds.
void scale_perform64_method(t_times *x, t_object *dsp64, double **ins, long numins, double **outs, long numouts, long sampleframes, long flags, void *userparam)
{
	int invec = (int) userparam;   // used to signal which one is the signal input (1 for right, 0 for left)
	t_double *in = ins[invec];
	t_double *out = outs[0];
	t_double val = x->x_val;
#if defined(WIN_VERSION) && defined(WIN_SSE_INTRINSICS)
	// SSE2 path: two doubles per register, four samples per loop pass.
	// mm_out1 and mm_out2 both view the same output buffer.
	__m128d mm_in1;
	__m128d *mm_out1 = (__m128d *) out;
	__m128d mm_in2;
	__m128d *mm_out2 = (__m128d *) out;
	__m128d mm_val;
	__declspec(align(16)) t_double aligned_val = x->x_val;
	int i;

	// Broadcast the scale factor to both lanes.
	mm_val = _mm_set1_pd(aligned_val);

	// rbs fix: this version will break if the SVS is smaller than 4
	C74_ASSERT(sampleframes >= 4);
	// _mm_load_pd requires 16-byte aligned addresses.
	// NOTE(review): the loop steps by 4, so a sampleframes value that is not
	// a multiple of 4 would run past the end - the assert only checks >= 4.
	for (i=0; i < sampleframes; i+=4) {
		mm_in1 = _mm_load_pd(in+i);
		mm_out1[i/2] = _mm_mul_pd(mm_in1, mm_val);
		mm_in2 = _mm_load_pd(in+i+2);
		mm_out2[i/2 + 1] = _mm_mul_pd(mm_in2, mm_val);
	}

#else
	// Scalar path: one sample at a time, with denormal/NaN fix-up.
	t_double ftmp;

//	if (IS_DENORM_DOUBLE(*in)) {
//		static int counter = 0;
//		post("times~ (%p): saw denorm (%d)", x, counter++);
//	}

	while (sampleframes--) {
		// val multiplied by the dereferenced input sample; in then advances.
		ftmp = val **in++;
		FIX_DENORM_NAN_DOUBLE(ftmp);
		*out++ = ftmp;
	}
#endif
}
Esempio n. 20
0
/*
 * Suspends the thread identified by thread_id, lets func inspect and modify
 * its CPU context, writes the modified context back, and resumes the thread.
 *
 *   thread_id - thread to modify
 *   func      - callback invoked with the parsed CPU context
 *   user_data - opaque pointer passed through to func
 *
 * Returns TRUE when the context was written back and the thread resumed,
 * FALSE on any failure. Once the thread has been suspended here, every exit
 * path resumes it again.
 */
gboolean
gum_process_modify_thread (GumThreadId thread_id,
                           GumModifyThreadFunc func,
                           gpointer user_data)
{
  gboolean success = FALSE;
  HANDLE thread;
  __declspec (align (64)) CONTEXT context = { 0, };
  GumCpuContext cpu_context;

  thread = OpenThread (THREAD_GET_CONTEXT | THREAD_SET_CONTEXT |
      THREAD_SUSPEND_RESUME, FALSE, thread_id);
  if (thread == NULL)
    goto beach;

  if (SuspendThread (thread) == (DWORD) -1)
    goto beach;

  context.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER;
  if (!GetThreadContext (thread, &context))
  {
    /* Do not leave the target suspended when its context cannot be read. */
    ResumeThread (thread);
    goto beach;
  }

  gum_windows_parse_context (&context, &cpu_context);
  func (thread_id, &cpu_context, user_data);
  gum_windows_unparse_context (&cpu_context, &context);

  if (!SetThreadContext (thread, &context))
  {
    ResumeThread (thread);
    goto beach;
  }

  success = ResumeThread (thread) != (DWORD) -1;

beach:
  if (thread != NULL)
    CloseHandle (thread);

  return success;
}
Esempio n. 21
0
void IP_DiffuseOnly::interpolateByContributes(void* interpolatedUserData, const void** vertexUserData, const float* correctedContributes) const
{
	__declspec(align(16)) PROCDATA_DIFFUSEONLY temp[3];
	copyUserDataDiffuseOnly(temp,     (PROCDATA_DIFFUSEONLY*)vertexUserData[0]);
	copyUserDataDiffuseOnly(temp + 1, (PROCDATA_DIFFUSEONLY*)vertexUserData[1]);
	copyUserDataDiffuseOnly(temp + 2, (PROCDATA_DIFFUSEONLY*)vertexUserData[2]);

	mcemaths_mul_3_4(temp[0].normal, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].worldPos, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].texcoord, correctedContributes[0]);
	mcemaths_mul_3_4(temp[0].shadowcoord, correctedContributes[0]);

	mcemaths_mul_3_4(temp[1].normal, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].worldPos, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].texcoord, correctedContributes[1]);
	mcemaths_mul_3_4(temp[1].shadowcoord, correctedContributes[1]);

	mcemaths_mul_3_4(temp[2].normal, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].worldPos, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].texcoord, correctedContributes[2]);
	mcemaths_mul_3_4(temp[2].shadowcoord, correctedContributes[2]);

	PROCDATA_DIFFUSEONLY* output = (PROCDATA_DIFFUSEONLY*)interpolatedUserData;

	mcemaths_quatcpy(output->normal, temp[0].normal);
	mcemaths_quatcpy(output->worldPos, temp[0].worldPos);
	mcemaths_quatcpy(output->texcoord, temp[0].texcoord);
	mcemaths_quatcpy(output->shadowcoord, temp[0].shadowcoord);

	mcemaths_add_3_4_ip(output->normal, temp[1].normal);
	mcemaths_add_3_4_ip(output->worldPos, temp[1].worldPos);
	mcemaths_add_3_4_ip(output->texcoord, temp[1].texcoord);
	mcemaths_add_3_4_ip(output->shadowcoord, temp[1].shadowcoord);

	mcemaths_add_3_4_ip(output->normal, temp[2].normal);
	mcemaths_add_3_4_ip(output->worldPos, temp[2].worldPos);
	mcemaths_add_3_4_ip(output->texcoord, temp[2].texcoord);
	mcemaths_add_3_4_ip(output->shadowcoord, temp[2].shadowcoord);
}
Esempio n. 22
0
File: dsp.cpp Progetto: taqu/opus
    // Converts single float samples into pairs of identical 16-bit samples:
    // every input sample is written twice in a row to the output.
    //   dst        - destination buffer, receives 2 * numSamples shorts
    //   s          - source buffer, read as numSamples floats
    //   numSamples - number of input samples
    // Blocks of 4 inputs go through SSE (scale by 32768, convert to int32,
    // pack to int16 with saturation); the rest goes through toShort().
    void conv_Float1ToShort2(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        s32 num = numSamples >> 2; // process 4 floats at a time
        s32 offset = num << 2;
        s32 rem = numSamples - offset;

        const __m128 fcoff = _mm_set1_ps(32768.0f);
        // Aligned scratch buffer for the packed result of each SIMD pass.
        __declspec(align(16)) LSshort tmp[8];

        const LSfloat* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_mul_ps(_mm_loadu_ps(p), fcoff);
            __m128i s32_0 = _mm_cvtps_epi32(f32_0);
            // Both pack operands are the same, so tmp[0..3] hold the four
            // converted samples (tmp[4..7] repeat them and are unused).
            __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0);

            _mm_store_si128((__m128i*)tmp, s16_0);

            // Write each converted sample twice.
            q[0] = tmp[0];
            q[1] = tmp[0];
            q[2] = tmp[1];
            q[3] = tmp[1];
            q[4] = tmp[2];
            q[5] = tmp[2];
            q[6] = tmp[3];
            q[7] = tmp[3];
            p += 4;
            q += 8;
        }

        // Scalar tail for the inputs that did not fill a block of 4.
        for(s32 i=0; i<rem; ++i){
            s32 j=i<<1;
            q[j+0] = q[j+1] = toShort(p[i]);
        }
    }
Esempio n. 23
0
File: dsp.cpp Progetto: taqu/opus
    // Converts interleaved pairs of float samples into single 16-bit samples
    // by averaging each adjacent pair.
    //   dst        - destination buffer, receives numSamples shorts
    //   s          - source buffer, read as 2 * numSamples floats
    //   numSamples - number of output samples (as in conv_Short2ToShort1)
    // Blocks of 2 outputs (4 inputs) go through SSE (pair sum scaled by
    // 32768 * 0.5, converted to int32, packed to int16 with saturation);
    // a remaining single output goes through toShort().
    void conv_Float2ToShort1(void* dst, const void* s, s32 numSamples)
    {
        LSshort* d = reinterpret_cast<LSshort*>(dst);
        const LSfloat* src = reinterpret_cast<const LSfloat*>(s);

        // Each SIMD pass consumes 4 floats and produces 2 outputs, so the
        // pass count is numSamples / 2. Counting passes as numSamples / 4
        // left half of the output unwritten.
        s32 num = numSamples >> 1;
        s32 offset = num << 1;
        s32 rem = numSamples - offset;

        const __m128 fcoff = _mm_set1_ps(32768.0f*0.5f); //half
        // Aligned scratch buffer for the packed result of each SIMD pass.
        __declspec(align(16)) LSshort tmp[8];

        const LSfloat* p = src;
        LSshort* q = d;
        for(s32 i=0; i<num; ++i){
            __m128 f32_0 = _mm_loadu_ps(p);
            // Swap neighbours within each pair and add, so every lane holds
            // the sum of its pair.
            __m128 f32_1 = _mm_add_ps(f32_0, _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(2, 3, 0, 1)));
            __m128 f32_2 = _mm_mul_ps(f32_1, fcoff);

            __m128i s32_0 = _mm_cvtps_epi32(f32_2);
            __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0);

            _mm_store_si128((__m128i*)tmp, s16_0);

            // Each pair result appears twice in tmp; take every other entry.
            q[0] = tmp[0];
            q[1] = tmp[2];
            p += 4;
            q += 2;
        }

        // Scalar tail: continue from the advanced read pointer p, not from
        // the start of the source buffer.
        for(s32 i=0; i<rem; ++i){
            s32 j = i<<1;
            f32 v = 0.5f*(p[j+0] + p[j+1]);
            q[i] = toShort(v);
        }
    }
Esempio n. 24
0
 *
 */

/* Informational strings stored in the module. */
static char where_to_get_source[] = "http://www.nethack.org/";
static char author[] = "The NetHack Development Team";

#include "hack.h"
#include "wintty.h"
#include "win32api.h"

/* Console input handle and input record, defined in another translation unit. */
extern HANDLE hConIn;
extern INPUT_RECORD ir;
/* Full module path, filled in by DllMain. */
char dllname[512];
/* Points just past the last backslash in dllname once DllMain has found one. */
char *shortdllname;

/* Exported prototype for ProcessKeystroke (__cdecl), declared through FDECL. */
int FDECL(__declspec(dllexport) __cdecl
ProcessKeystroke, (HANDLE hConIn, INPUT_RECORD *ir, 
    boolean *valid, BOOLEAN_P numberpad, int portdebug));

int WINAPI DllMain(HINSTANCE hInstance, DWORD fdwReason, PVOID pvReserved)
{
	char dlltmpname[512];
	char *tmp = dlltmpname, *tmp2;
	*(tmp + GetModuleFileNameA(hInstance, tmp, 511)) = '\0';
	(void)strcpy(dllname, tmp);
	tmp2 = strrchr(dllname, '\\');
	if (tmp2) {
		tmp2++;
		shortdllname = tmp2;
	}
	return TRUE;
Esempio n. 25
0
// Draws one tube segment between two turtle states as a series of vertex
// rings along a cubic Bezier curve, connecting consecutive rings with
// triangles in destMesh.
//   destMesh   - mesh receiving the new vertices and triangles
//   state      - ring buffers (lastVertices / curVertices / curTempVertices)
//                carried across calls
//   turtle0/1  - start and end turtle states (position, rotation, size, width)
//   shapeLen   - length used for the control-point offsets and the ring count
//   stexY0/1   - texture Y coordinate at the start and end of the segment
//   forceDraw  - when false, segments needing at most one ring are skipped
//   numParts   - scales the detail-dependent part length
// returns whether the shape was drawn
eBool eLSystem::drawShapes(eMesh& destMesh, tDrawState& state, const tTurtleState& turtle0, const tTurtleState& turtle1, eF32 shapeLen, eF32 stexY0, eF32 stexY1, eBool forceDraw, eU32 numParts) {
    eF32 partLen = eLerp(this->m_sizePar * (eF32)numParts, 0.0001f, detail);
//	eF32 partLen = eLerp(eF32_MAX, 0.0001f, detail);
	// Guard the division below against a non-positive part length.
	if(partLen <= 0.0f)
		partLen = eALMOST_ZERO;
	eF32 numToDrawF = (eF32)shapeLen / partLen;
	if(!forceDraw) {
		if(numToDrawF <= 1.0f)
			return false;
	}
	// Ring count, clamped to [1, m_gen_rings].
	eU32 numDraw = eCeil(eClamp(1.0f, numToDrawF, (eF32)m_gen_rings)); 


	// NOTE(review): numFaces and faceNr are not read anywhere in this function.
	eU32		numFaces = numDraw * m_gen_edges * 2;
	eU32		faceNr = 0;

    ePROFILER_ZONE("L-System - Draw Shapes");

    // Bezier control points: the end points plus points offset by a third of
    // shapeLen along each turtle's axis-2 vector.
    __declspec(align(16)) const eVector3  control0 = turtle0.position;
	__declspec(align(16)) const eVector3  control1 = control0   + turtle0.rotation.getVector(2) * 0.333333f * shapeLen;
    __declspec(align(16)) const eVector3  control3 = turtle1.position;
	__declspec(align(16)) const eVector3  control2 = control3   - turtle1.rotation.getVector(2) * 0.333333f * shapeLen;

    eF32 rscale0 = turtle0.size * turtle0.width;
	eF32 rscale1 = turtle1.size * turtle1.width;
	for(eU32 d = 0; d < numDraw; d++) {
		eF32 t0 = ((eF32)d / (eF32)numDraw);
		eF32 t1 = ((eF32)(d + 1) / (eF32)numDraw);
		// r == 0 is the ring at t0, r == 1 the ring at t1.
		for(eU32 r = 0; r <= 1; r++) {
			eF32 tt = (r == 0) ? t0 : t1;
			eF32 rscale = eLerp(rscale0, rscale1, tt);
			// The t0 ring is only built when no previous ring is available.
			if((r != 0) || (state.lastVertices->size() == 0)) {
                // create ring vertices
                __declspec(align(16)) eVector3 position;
                __declspec(align(16)) eVector3 normal;

                // calculate bezier curve position
                // (repeated linear interpolation between the control points)
                __m128 mt = _mm_set1_ps(tt);
                __m128 mtinv = _mm_set1_ps(1.0f - tt);
                __m128 mcp0 = _mm_load_ps(&control0.x);
                __m128 mcp1 = _mm_load_ps(&control1.x);
                __m128 m0 = _mm_add_ps(_mm_mul_ps(mcp0, mtinv), _mm_mul_ps(mcp1, mt));
                __m128 mcp2 = _mm_load_ps(&control2.x);
                __m128 m1 = _mm_add_ps(_mm_mul_ps(mcp1, mtinv), _mm_mul_ps(mcp2, mt));
                __m128 mm0 = _mm_add_ps(_mm_mul_ps(m0, mtinv), _mm_mul_ps(m1, mt));
                __m128 mcp3 = _mm_load_ps(&control3.x);
                __m128 m2 = _mm_add_ps(_mm_mul_ps(mcp2, mtinv), _mm_mul_ps(mcp3, mt));
                __m128 mm1 = _mm_add_ps(_mm_mul_ps(m1, mtinv), _mm_mul_ps(m2, mt));
                __m128 bezCurvePosition = _mm_add_ps(_mm_mul_ps(mm0, mtinv), _mm_mul_ps(mm1, mt));

                // calculate bezier tangent
                // NOTE(review): _mm_set_ps takes float arguments, so these
                // integer literals are converted to float values rather than
                // forming an all-ones bit pattern - confirm the AND below
                // masks the lanes as intended.
                __m128 vec3mask = _mm_set_ps(0x0,0xFFFFFFFF,0xFFFFFFFF, 0xFFFFFFFF);
                __m128 mrestangent = _mm_and_ps(_mm_sub_ps(mm1, mm0), vec3mask);

                // Normalize the tangent with an approximate reciprocal sqrt.
                __m128 mdot = _mm_mul_ps(mrestangent, mrestangent);
                __m128 mdotagg = _mm_hadd_ps(mdot, mdot);
                __m128 recipsqrt = _mm_rsqrt_ss( _mm_hadd_ps(mdotagg, mdotagg) );
                __m128 tangentnorm = _mm_mul_ps(mrestangent, _mm_shuffle_ps(recipsqrt, recipsqrt, _MM_SHUFFLE(0,0,0,0)));


                // get look vector on axis 2 (ringRot.getVector(2))
                eQuat       ringRot = turtle0.rotation.slerp(tt, turtle1.rotation);
                __m128 mRingRot = _mm_loadu_ps((eF32*)&ringRot);
                __m128 rrmulparts = _mm_mul_ps(mRingRot, _mm_shuffle_ps(mRingRot, mRingRot, _MM_SHUFFLE(0,1,3,2)));

                __m128 ringRotSqr = _mm_mul_ps(mRingRot, mRingRot);
                __m128 mrdotagg = _mm_hadd_ps(rrmulparts, ringRotSqr);
                __m128 mrdotaggshuf = _mm_shuffle_ps(mrdotagg, mrdotagg, _MM_SHUFFLE(0,2,0,2));
                __m128 mrrotz = _mm_hsub_ps(rrmulparts, mrdotaggshuf);
                __m128 rrecipsqrt = _mm_rsqrt_ss( _mm_hadd_ps(mrdotagg, mrdotagg) );

                __m128 maxisparts = _mm_shuffle_ps(mrrotz,mrdotagg, _MM_SHUFFLE(0,2,0,2)); // -Y-X
                __m128 maxisparts2 = _mm_add_ps(maxisparts, maxisparts); // -Y*2-X*2
                __m128 maxispartsfinal = _mm_shuffle_ps(mrrotz,maxisparts2,_MM_SHUFFLE(0,0,2,0)); //ZZY*2X*2
                __m128 mlook = _mm_and_ps(_mm_mul_ps(maxispartsfinal, _mm_shuffle_ps(rrecipsqrt, rrecipsqrt, _MM_SHUFFLE(0,0,0,0))), vec3mask);

                // calculate side vector (look ^ tangent)
                __m128 mside = _mm_sub_ps(
                    _mm_mul_ps(_mm_shuffle_ps(mlook, mlook, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(tangentnorm, tangentnorm, _MM_SHUFFLE(3, 1, 0, 2))),
                    _mm_mul_ps(_mm_shuffle_ps(mlook, mlook, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(tangentnorm, tangentnorm, _MM_SHUFFLE(3, 0, 2, 1)))
                  );

                // normalize side vector
                mdot = _mm_mul_ps(mside, mside);
                mdotagg = _mm_hadd_ps(mdot, mdot);
                __m128 dotsum = _mm_hadd_ps(mdotagg, mdotagg);
                const eF32 sideLenSqr = dotsum.m128_f32[0];
                // Skip the correction when look and tangent are (nearly) parallel.
                if(sideLenSqr > eALMOST_ZERO) {
                    recipsqrt = _mm_rsqrt_ss( dotsum );
                    __m128 sidenorm = _mm_mul_ps(mside, _mm_shuffle_ps(recipsqrt, recipsqrt, _MM_SHUFFLE(0,0,0,0)));

                    // calc dot product (look * tangent)
                    // NOTE(review): the operands below are mlook and
                    // sidenorm, not the tangent named above - confirm which
                    // is intended.
                    __m128 dotprod = _mm_mul_ps(mlook, sidenorm);
                    __m128 dph0 = _mm_hadd_ps(dotprod, dotprod);
                    __m128 dph1 = _mm_hadd_ps(dph0, dph0);
                    const eF32 dot = eClamp(-1.0f, dph1.m128_f32[0], 1.0f);
		            eF32 alpha = eACos(dot) * (1.0f / (2.0f * ePI));

                    eQuat rotation(sidenorm, alpha);
		            ringRot = rotation * ringRot;
	            }


				// Ring plane axes taken from the (possibly corrected) rotation.
				eMatrix4x4 curveMat(ringRot);
                __declspec(align(16)) eVector3 ringX = curveMat.getVector(0);
                __declspec(align(16)) eVector3 ringY = curveMat.getVector(1);

				eF32 texY = eLerp(stexY0, stexY1, tt);
                const eF32 texXStep = 1.0f / m_gen_edges;
                eVector2 texPos(0, texY);
                
                __m128 mRingX = _mm_load_ps(&ringX.x);
                __m128 mRingY = _mm_load_ps(&ringY.x);
                __m128 mScale = _mm_set1_ps(rscale);

				// The table holds sin/cos pairs; the loop emits
				// m_gen_edges + 1 vertices per ring.
				for(eU32 e = 0; e <= m_gen_edges * 2; e += 2) {
                    __m128 msin = _mm_set1_ps(m_gen_edge_sinCosTable[e]);
                    __m128 mcos = _mm_set1_ps(m_gen_edge_sinCosTable[e+1]);
                    __m128 mnormal = _mm_add_ps(_mm_mul_ps(mRingX, msin), _mm_mul_ps(mRingY, mcos));
                    _mm_store_ps(&normal.x, mnormal);
                    __m128 mposition = _mm_add_ps(bezCurvePosition, _mm_mul_ps(mnormal, mScale));
                    _mm_store_ps(&position.x, mposition);

                    state.curVertices->append(destMesh.addVertex(position, normal, texPos));
                    texPos.x += texXStep;
                }

				// connect triangles
				if(r != 0) {
					// NOTE(review): texY0 and texY1 are computed but not used below.
					eF32 texY0 = eLerp(stexY0, stexY1, t0);
					eF32 texY1 = eLerp(stexY0, stexY1, t1);
					// Two triangles per edge between the previous and current ring.
					for(eU32 e = 0; e < m_gen_edges; e++) {
						destMesh.addTriangleFast((*state.curVertices)[e], 
							                 (*state.curVertices)[e + 1], 
											 (*state.lastVertices)[e + 1],
											 m_gen_materials_dsIdx[turtle0.polyMatIdx]);
						destMesh.addTriangleFast((*state.curVertices)[e], 
							                 (*state.lastVertices)[e + 1], 
											 (*state.lastVertices)[e],
											 m_gen_materials_dsIdx[turtle0.polyMatIdx]);
					}
				}

				// The ring just built becomes the previous ring; the spare
				// buffer is swapped in and cleared for the next one.
				state.lastVertices = state.curVertices;
				eSwap(state.curVertices, state.curTempVertices);
				state.curVertices->clear();
			}
		}
	}
	return true;
}
Esempio n. 26
0
 * in defaults.nh
 */

/* Identification strings kept as file-local data. Nothing in the visible
 * part of this file reads them. */
static char where_to_get_source[] = "http://www.nethack.org/";
static char author[] = "The NetHack Development Team";

#include "hack.h"
#include "wintty.h"
#include "win32api.h"

/* Defined in another translation unit; presumably the console input handle
 * and the most recent input record -- confirm against the defining file. */
extern HANDLE hConIn;
extern INPUT_RECORD ir;
/* Full path of this module, filled in by DllMain(). */
char dllname[512];
/* Points into dllname just past its last backslash. DllMain() only assigns
 * it when a backslash is found. */
char *shortdllname;

/* Prototype of the exported, __stdcall ProcessKeystroke entry point.
 * FDECL and BOOLEAN_P are macros defined elsewhere (presumably the project's
 * portable prototype wrappers -- confirm in the included headers). */
int FDECL(__declspec(dllexport) __stdcall
ProcessKeystroke, (HANDLE hConIn, INPUT_RECORD *ir, 
    boolean *valid, BOOLEAN_P numberpad, int portdebug));

/*
 * DLL entry point: records the module's file name in the globals dllname
 * and shortdllname, then returns TRUE.
 *
 * hInstance is the module whose file name is queried. fdwReason and
 * pvReserved are not examined in the visible code, so the same work is done
 * on every invocation regardless of the notification reason.
 */
int WINAPI DllMain(HINSTANCE hInstance, DWORD fdwReason, PVOID pvReserved)
{
	char dlltmpname[512];
	char *tmp = dlltmpname, *tmp2;
	/* Ask for at most 511 characters and write the terminator explicitly at
	 * the returned length, so the 512-byte buffer is always NUL-terminated. */
	*(tmp + GetModuleFileName(hInstance, tmp, 511)) = '\0';
	/* dllname has the same size as dlltmpname, so the copy cannot overflow. */
	(void)strcpy(dllname, tmp);
	/* Locate the final path separator; what follows it is the short name. */
	tmp2 = strrchr(dllname, '\\');
	if (tmp2) {
		tmp2++;
		shortdllname = tmp2;
	}
	/* NOTE(review): when no backslash is present shortdllname is left
	 * untouched by this function. */
	return TRUE;
Esempio n. 27
0
/**
 * Apply the 9-point weighted stencil (centre, 4 cardinal and 4 diagonal
 * neighbours) to mtx for nIters iterations on the MIC coprocessor, leaving
 * the final result in mtx's flat data.
 *
 * The matrix is transferred to the device once, iterated on there using two
 * ping-pong buffers (the transferred data and a device-side scratch buffer),
 * and transferred back once at the end.
 *
 * @param mtx     Matrix including its halo. The element count is computed as
 *                GetNumRows() squared, so the matrix is treated as square.
 * @param nIters  Number of stencil iterations to perform.
 */
template <class T> void
MICStencil<T>::operator()( Matrix2D<T>& mtx, unsigned int nIters )
{
    unsigned int uDimWithHalo    = mtx.GetNumRows();
    // The halo is one LINESIZE-byte line wide, expressed in elements.
    unsigned int uHaloWidth      = LINESIZE / sizeof(T);
    unsigned int uImgElements    = uDimWithHalo * uDimWithHalo;

    __declspec(target(mic), align(LINESIZE)) T* pIn = mtx.GetFlatData();

    // Local copies of the weights so they can be named in the offload clauses.
    __declspec(target(mic), align(sizeof(T)))    T wcenter      = this->wCenter;
    __declspec(target(mic), align(sizeof(T)))    T wdiag        = this->wDiagonal;
    __declspec(target(mic), align(sizeof(T)))    T wcardinal    = this->wCardinal;

    #pragma offload target(mic) in(pIn:length(uImgElements) ALLOC RETAIN)
    {
        // Just copy pIn to compute the copy transfer time
    }

    #pragma offload target(mic) in(pIn:length(uImgElements) REUSE RETAIN)    \
                                in(uImgElements) in(uDimWithHalo)            \
                                in(wcenter) in(wdiag) in(wcardinal)
    {
        // Rows are split across (cores - 1) partitions and columns across the
        // hardware threads of a core. Clamp to at least one row partition so
        // that a device reporting fewer than 8 logical processors cannot
        // produce a zero (division by zero below) or negative count.
        long lRowPartitions = sysconf(_SC_NPROCESSORS_ONLN) / 4 - 1;
        unsigned int uRowPartitions = (lRowPartitions > 0) ? (unsigned int)lRowPartitions : 1;
        unsigned int uColPartitions = 4;    // Threads per core for KNC

        unsigned int uRowTileSize    = (uDimWithHalo - 2 * uHaloWidth) / uRowPartitions;
        unsigned int uColTileSize    = (uDimWithHalo - 2 * uHaloWidth) / uColPartitions;

        // Round the row tile up so the row partitions cover every interior row.
        uRowTileSize = ((uDimWithHalo - 2 * uHaloWidth) % uRowPartitions > 0) ? (uRowTileSize + 1) : (uRowTileSize);

        // The scratch buffer starts as a copy of the input so that its halo
        // cells hold the same values as the input's halo. Only interior cells
        // are ever written, so a zero-filled scratch buffer would feed zeros
        // into every second iteration's boundary reads.
        T *pTmp     = (T*)pIn;
        T *pCrnt = (T*)memcpy((T*)_mm_malloc(uImgElements * sizeof(T), LINESIZE), pTmp, uImgElements * sizeof(T));

        #pragma omp parallel firstprivate(pTmp, pCrnt, uRowTileSize, uColTileSize, uHaloWidth, uDimWithHalo)
        {
            unsigned int uThreadId = omp_get_thread_num();

            // Map the thread onto a (row tile, column tile) pair.
            unsigned int uRowTileId = uThreadId / uColPartitions;
            unsigned int uColTileId = uThreadId % uColPartitions;

            unsigned int uStartLine = uRowTileId * uRowTileSize + uHaloWidth;
            unsigned int uStartCol  = uColTileId * uColTileSize + uHaloWidth;

            // Clamp the tile to the interior; threads whose tile starts past
            // the interior end up with an empty range and do no work.
            unsigned int uEndLine = uStartLine + uRowTileSize;
            uEndLine = (uEndLine > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndLine;

            unsigned int uEndCol    = uStartCol  + uColTileSize;
            uEndCol  = (uEndCol  > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndCol;

            unsigned int cntIterations, i, j;

            for (cntIterations = 0; cntIterations < nIters; cntIterations ++)
            {
                // Do Stencil Operation
                for (i = uStartLine; i < uEndLine; i++)
                {
                    T * pCenter      = &pTmp [ i * uDimWithHalo];
                    T * pTop         = pCenter - uDimWithHalo;
                    T * pBottom      = pCenter + uDimWithHalo;
                    T * pOut         = &pCrnt[ i * uDimWithHalo];

                    __assume_aligned(pCenter, 64);
                    __assume_aligned(pTop,    64);
                    __assume_aligned(pBottom, 64);
                    __assume_aligned(pOut,    64);

                    #pragma simd vectorlengthfor(float)
                    for (j = uStartCol; j < uEndCol; j++)
                    {
                        // Per-element temporaries live inside the loop body so
                        // each vector lane has its own copy.
                        T cardinal0 = pCenter[j - 1] + pCenter[j + 1] + pTop[j] + pBottom[j];
                        T diagonal0 = pTop[j - 1] + pTop[j + 1] + pBottom[j - 1] + pBottom[j + 1];
                        T center0   = pCenter[j];

                        pOut[j]     = wcardinal * cardinal0 + wdiag * diagonal0 + wcenter * center0;
                    }
                }

                // Every thread must finish writing this iteration's output
                // before any thread starts reading it as the next input.
                #pragma omp barrier
                ;

                // Switch pointers
                T* pAux    = pTmp;
                pTmp     = pCrnt;
                pCrnt    = pAux;
            } // End For

        } // End Parallel

        // The swaps above act on each thread's private copies, so here pTmp
        // is still the transferred buffer and pCrnt is still the scratch
        // buffer. Iteration k writes into the scratch buffer when k is even,
        // so after an odd number of iterations the newest result lives in the
        // scratch buffer and has to be copied back before that buffer is
        // freed; otherwise the host would receive the previous iteration.
        if ((nIters % 2) != 0)
            memcpy(pTmp, pCrnt, uImgElements * sizeof(T));

        _mm_free(pCrnt);
    } // End Offload

    #pragma offload target(mic) out(pIn:length(uImgElements) REUSE FREE)
    {
        // Just copy back pIn
    }
}
Esempio n. 28
0
 * in defaults.nh
 */

/* Identification strings kept as file-local data. Nothing in the visible
 * part of this file reads them. */
static char where_to_get_source[] = "http://www.nethack.org/";
static char author[] = "The NetHack Development Team";

#include "hack.h"
#include "wintty.h"
#include "win32api.h"

/* Defined in another translation unit; presumably the console input handle
 * and the most recent input record -- confirm against the defining file. */
extern HANDLE hConIn;
extern INPUT_RECORD ir;
/* Full path of this module, filled in by DllMain(). */
char dllname[512];
/* Points into dllname just past its last backslash. DllMain() only assigns
 * it when a backslash is found. */
char *shortdllname;

/* Prototype of the exported, __stdcall ProcessKeystroke entry point.
 * FDECL and BOOLEAN_P are macros defined elsewhere (presumably the project's
 * portable prototype wrappers -- confirm in the included headers). */
int FDECL(__declspec(dllexport) __stdcall ProcessKeystroke,
          (HANDLE hConIn, INPUT_RECORD *ir, boolean *valid,
           BOOLEAN_P numberpad, int portdebug));

/*
 * DLL entry point: records the module's file name in the globals dllname
 * and shortdllname.
 *
 * hInstance is the module whose file name is queried. fdwReason and
 * pvReserved are not examined in the visible code. The remainder of the
 * function (including its return) is not visible in this excerpt.
 */
int WINAPI
DllMain(HINSTANCE hInstance, DWORD fdwReason, PVOID pvReserved)
{
    char dlltmpname[512];
    char *tmp = dlltmpname, *tmp2;
    /* Ask for at most 511 characters and write the terminator explicitly at
     * the returned length, so the 512-byte buffer is always NUL-terminated. */
    *(tmp + GetModuleFileName(hInstance, tmp, 511)) = '\0';
    /* dllname has the same size as dlltmpname, so the copy cannot overflow. */
    (void) strcpy(dllname, tmp);
    /* Locate the final path separator; what follows it is the short name. */
    tmp2 = strrchr(dllname, '\\');
    if (tmp2) {
        tmp2++;
        shortdllname = tmp2;
    }
Esempio n. 29
0
/*
 * Fill return_addresses with a backtrace produced by walking the stack with
 * dbghelp's StackWalk64.
 *
 * backtracer:       the GumDbghelpBacktracer instance; supplies the dbghelp
 *                   implementation used for locking and stack walking.
 * cpu_context:      optional register state to start from. When NULL, the
 *                   walk starts from the context captured inside this
 *                   function and the leading frame(s) are skipped.
 * return_addresses: output array; its len is reset to 0 and one entry is
 *                   appended per recorded frame.
 *
 * Only the part of the function up to the end of the walking loop is visible
 * in this excerpt.
 */
static void
gum_dbghelp_backtracer_generate (GumBacktracer * backtracer,
                                 const GumCpuContext * cpu_context,
                                 GumReturnAddressArray * return_addresses)
{
  GumDbghelpBacktracer * self = GUM_DBGHELP_BACKTRACER_CAST (backtracer);
  GumDbgHelpImpl * dbghelp = self->priv->dbghelp;
  guint i;
  /* Number of leading loop iterations whose frames are not recorded. */
  guint skip_count = 0;
  STACKFRAME64 frame = { 0, };
  /* NOTE(review): the 64-byte alignment is presumably there to satisfy
   * CONTEXT's alignment requirements -- confirm. */
  __declspec (align (64)) CONTEXT context = { 0, };
  BOOL success;

  /* Get the raw addresses */
  RtlCaptureContext (&context);

  frame.AddrPC.Mode = AddrModeFlat;
  frame.AddrFrame.Mode = AddrModeFlat;
  frame.AddrStack.Mode = AddrModeFlat;

  if (cpu_context != NULL)
  {
    /* Caller supplied a register state: overwrite the captured registers
     * with it, then seed the walk from the same values. */
#if GLIB_SIZEOF_VOID_P == 4
    context.Eip = cpu_context->eip;

    context.Edi = cpu_context->edi;
    context.Esi = cpu_context->esi;
    context.Ebp = cpu_context->ebp;
    context.Esp = cpu_context->esp;
    context.Ebx = cpu_context->ebx;
    context.Edx = cpu_context->edx;
    context.Ecx = cpu_context->ecx;
    context.Eax = cpu_context->eax;
#else
    context.Rip = cpu_context->rip;

    context.R15 = cpu_context->r15;
    context.R14 = cpu_context->r14;
    context.R13 = cpu_context->r13;
    context.R12 = cpu_context->r12;
    context.R11 = cpu_context->r11;
    context.R10 = cpu_context->r10;
    context.R9  = cpu_context->r9;
    context.R8  = cpu_context->r8;

    context.Rdi = cpu_context->rdi;
    context.Rsi = cpu_context->rsi;
    context.Rbp = cpu_context->rbp;
    context.Rsp = cpu_context->rsp;
    context.Rbx = cpu_context->rbx;
    context.Rdx = cpu_context->rdx;
    context.Rcx = cpu_context->rcx;
    context.Rax = cpu_context->rax;
#endif

    /* Note: this block tests for 8 where the one above tests for 4; both
     * select the same branch as long as the pointer size is 4 or 8. */
#if GLIB_SIZEOF_VOID_P == 8
    frame.AddrPC.Offset = cpu_context->rip;
    frame.AddrFrame.Offset = cpu_context->rbp;
    frame.AddrStack.Offset = cpu_context->rsp;
#else
    frame.AddrPC.Offset = cpu_context->eip;
    frame.AddrFrame.Offset = cpu_context->ebp;
    frame.AddrStack.Offset = cpu_context->esp;
#endif
  }
  else
  {
    /* No register state supplied: start from the context captured above. */
#if GLIB_SIZEOF_VOID_P == 4
    frame.AddrPC.Offset = context.Eip;
    frame.AddrFrame.Offset = context.Ebp;
    frame.AddrStack.Offset = context.Esp;
#else
    frame.AddrPC.Offset = context.Rip;
    frame.AddrFrame.Offset = context.Rbp;
    frame.AddrStack.Offset = context.Rsp;
#endif

    /* skip_count ends up as 0..2: one in debug builds, plus one more on
     * 64-bit builds. */
#ifdef _DEBUG
    skip_count = 1; /* leave out this function */
#endif
#if GLIB_SIZEOF_VOID_P == 8
    skip_count++;
#endif
  }

  return_addresses->len = 0;

  dbghelp->Lock ();

  /* Walk skip_count extra iterations so that up to GUM_MAX_BACKTRACE_DEPTH
   * iterations remain for recording after the skipped ones. */
  for (i = 0; i < GUM_MAX_BACKTRACE_DEPTH + skip_count; i++)
  {
    success = dbghelp->StackWalk64 (GUM_BACKTRACER_MACHINE_TYPE,
        GetCurrentProcess (), GetCurrentThread (), &frame, &context, NULL,
        dbghelp->SymFunctionTableAccess64, dbghelp->SymGetModuleBase64, NULL);
    if (!success)
      break;
    /* A frame that would return to itself ends the walk. */
    else if (frame.AddrPC.Offset == frame.AddrReturn.Offset)
      break;
    /* Frames with a zero PC are passed over without ending the walk. */
    else if (frame.AddrPC.Offset != 0)
    {
      /* Skipping is by loop index, so a zero-PC frame among the first
       * skip_count iterations still counts towards the skip. */
      if (i >= skip_count)
      {
        g_assert_cmpuint (return_addresses->len, <,
            G_N_ELEMENTS (return_addresses->items));
        return_addresses->items[return_addresses->len++] =
            GSIZE_TO_POINTER (frame.AddrPC.Offset);
      }
    }
  }
Esempio n. 30
0
    // Reports whether a block-compressed image is fully opaque.
    //
    // Every 4x4 block is decoded and the alpha of each texel that lies inside
    // the image is compared against 0.99. Returns true only when no such texel
    // falls below that threshold. Returns false when the image has no pixel
    // data or when its format is not BC1, BC2, BC3 or BC7 (typeless, UNORM or
    // UNORM_SRGB).
    bool _IsAlphaAllOpaqueBC(_In_ const Image& cImage)
    {
        if (cImage.pixels == nullptr)
            return false;

        // Pick the decoder and the compressed size of one 4x4 block. Typeless
        // formats are handled exactly like their UNORM counterparts.
        BC_DECODE decodeBlock;
        size_t bytesPerBlock;
        switch (cImage.format)
        {
        case DXGI_FORMAT_BC1_TYPELESS:
        case DXGI_FORMAT_BC1_UNORM:
        case DXGI_FORMAT_BC1_UNORM_SRGB:
            decodeBlock = D3DXDecodeBC1;
            bytesPerBlock = 8;
            break;

        case DXGI_FORMAT_BC2_TYPELESS:
        case DXGI_FORMAT_BC2_UNORM:
        case DXGI_FORMAT_BC2_UNORM_SRGB:
            decodeBlock = D3DXDecodeBC2;
            bytesPerBlock = 16;
            break;

        case DXGI_FORMAT_BC3_TYPELESS:
        case DXGI_FORMAT_BC3_UNORM:
        case DXGI_FORMAT_BC3_UNORM_SRGB:
            decodeBlock = D3DXDecodeBC3;
            bytesPerBlock = 16;
            break;

        case DXGI_FORMAT_BC7_TYPELESS:
        case DXGI_FORMAT_BC7_UNORM:
        case DXGI_FORMAT_BC7_UNORM_SRGB:
            decodeBlock = D3DXDecodeBC7;
            bytesPerBlock = 16;
            break;

        default:
            // BC4, BC5, and BC6 don't have alpha channels
            return false;
        }

        // Any alpha below this value counts as non-opaque.
        static const XMVECTORF32 threshold = { { { 0.99f, 0.99f, 0.99f, 0.99f } } };

        __declspec(align(16)) XMVECTOR texels[16];
        const uint8_t *rowOfBlocks = cImage.pixels;
        for (size_t top = 0; top < cImage.height; top += 4, rowOfBlocks += cImage.rowPitch)
        {
            // Blocks on the bottom edge may cover fewer than four image rows.
            const size_t rowsInside = std::min<size_t>(4, cImage.height - top);

            size_t byteOffset = 0;
            size_t left = 0;
            while ((byteOffset < cImage.rowPitch) && (left < cImage.width))
            {
                decodeBlock(texels, rowOfBlocks + byteOffset);

                // Blocks on the right edge may cover fewer than four columns.
                const size_t colsInside = std::min<size_t>(4, cImage.width - left);
                assert(colsInside > 0 && rowsInside > 0);

                // Decoded texels are stored row-major, four per row. For a
                // full block this visits all sixteen entries in index order.
                for (size_t by = 0; by < rowsInside; ++by)
                {
                    const XMVECTOR *texelRow = texels + by * 4;
                    for (size_t bx = 0; bx < colsInside; ++bx)
                    {
                        const XMVECTOR alpha = XMVectorSplatW(texelRow[bx]);
                        if (XMVector4Less(alpha, threshold))
                            return false;
                    }
                }

                byteOffset += bytesPerBlock;
                left += 4;
            }
        }

        return true;
    }