Beispiel #1
0
/* Compiler diagnostic test: every condition below calls a builtin with the
   wrong number of arguments (none/too few, or one too many).  The trailing
   dg-error directive on each line records the expected column ("7:") and a
   fragment of the expected message, so the code on those lines must not be
   re-indented or reflowed.  The identifiers `p` and `r` are not declared in
   this function; they have to come from the surrounding file.  The return
   values only make each branch distinct.  */
int
fn1 (void)
{
  if (__builtin_constant_p ()) /* { dg-error "7:not enough" } */
    return 0;
  if (__builtin_constant_p (1, 2)) /* { dg-error "7:too many" } */
    return 1;
  if (__builtin_isfinite ()) /* { dg-error "7:not enough" } */
    return 3;
  if (__builtin_isfinite (1, 2)) /* { dg-error "7:too many" } */
    return 4;
  if (__builtin_isless (0)) /* { dg-error "7:not enough" } */
    return 5;
  if (__builtin_isless (1, 2, 3)) /* { dg-error "7:too many" } */
    return 6;
  if (__builtin_fpclassify (1, 2, 3, 4, 5)) /* { dg-error "7:not enough" } */
    return 7;
  if (__builtin_fpclassify (1, 2, 3, 4, 5, r, 6)) /* { dg-error "7:too many" } */
    return 8;
  if (__builtin_assume_aligned (p)) /* { dg-error "7:too few" } */
    return 9;
  if (__builtin_assume_aligned (p, r, p, p)) /* { dg-error "7:too many" } */
    return 10;
  if (__builtin_add_overflow ()) /* { dg-error "7:not enough" } */
    return 11;
  if (__builtin_add_overflow (1, 2, 3, &r)) /* { dg-error "7:too many" } */
    return 12;
  return -1;
}
Beispiel #2
0
/* For every group of four doubles, adds p[] into q[] at offsets +2 and +3 of
   the group; offsets +0 and +1 are left untouched.  Both pointers are first
   declared to be aligned to 2 * sizeof (double) bytes.
   The loop bound `~0U - 3` stops the unsigned counter before `i += 4` or
   `i + 3` could wrap around.  With that bound the loop walks (almost) the
   whole unsigned range, so this looks like a compile-only vectorizer test
   rather than something meant to be executed -- TODO confirm.  */
void
f3 (double *p, double *q)
{
  /* The alignment promise is attached to the returned pointer, hence the
     re-assignment.  */
  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
  for (unsigned int i = 0; i < ~0U - 3; i += 4)
    {
      /* Both sums are formed before either store.  */
      double a = q[i + 2] + p[i + 2];
      double b = q[i + 3] + p[i + 3];
      q[i + 2] = a;
      q[i + 3] = b;
    }
}
Beispiel #3
0
void
f5 (double *p, double *q)
{
  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
  for (unsigned int i = 2; i < 1000; i += 4)
    {
      double a = q[i - 2] + p[i - 2];
      double b = q[i - 1] + p[i - 1];
      q[i - 2] = a;
      q[i - 1] = b;
    }
}
Beispiel #4
0
/* Starting at index 2 and stepping by four, adds p[] into q[] at i and i + 1;
   the two elements that follow each pair are left untouched.  Both pointers
   are first declared to be aligned to 2 * sizeof (double) bytes.
   The bound `~0U - 4` keeps `i += 4` and `i + 1` from wrapping the unsigned
   counter.  The iteration space spans (almost) the whole unsigned range, so
   this looks like a compile-only vectorizer test -- TODO confirm.  */
void
f2 (double *p, double *q)
{
  /* The alignment promise is attached to the returned pointer, hence the
     re-assignment.  */
  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
  for (unsigned int i = 2; i < ~0U - 4; i += 4)
    {
      /* Both sums are formed before either store.  */
      double a = q[i] + p[i];
      double b = q[i + 1] + p[i + 1];
      q[i] = a;
      q[i + 1] = b;
    }
}
Beispiel #5
0
void
f4 (double *p, double *q)
{
  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
  for (unsigned int i = 0; i < 500; i += 6)
    for (unsigned int j = 0; j < 500; j += 4)
      {
	double a = q[j] + p[i];
	double b = q[j + 1] + p[i + 1];
	q[i] = a;
	q[i + 1] = b;
      }
}
Beispiel #6
0
// Diagnostic test: passes a negative alignment (-32) to
// __builtin_assume_aligned.  The expected-error directive on the call line
// records the diagnostic the compiler is supposed to emit for it, so the
// directive has to stay attached to that line.
int test4(int *a) {
  a = __builtin_assume_aligned(a, -32); // expected-error {{requested alignment is not a power of 2}}
  // FIXME: The line below produces {{requested alignment is not a power of 2}}
  // on i386-freebsd, but not on x86_64-linux (for example).
  // a = __builtin_assume_aligned(a, 1ULL << 63);
  return a[0];
}
Beispiel #7
0
/* Compiler diagnostic test: each statement passes an argument of the wrong
   kind to a builtin (non-constant, non-floating-point, non-integer, or a
   mismatched operand to an overflow builtin).  The dg-error directive at the
   end of each line gives the expected column and a fragment of the expected
   message, so these lines must not be re-indented or reflowed.
   `p`, `r`, `d` and `b` are not declared in this function; they have to come
   from the surrounding file.  */
void
fn0 (int n)
{
  /* Alignment argument of __builtin_alloca_with_align.  */
  p = __builtin_alloca_with_align (n, 6); /* { dg-error "39:must be a constant integer" } */

  /* Single-argument classification builtins called with an integer.  */
  r += __builtin_isfinite (0); /* { dg-error "28:non-floating-point argument in call" } */
  r += __builtin_isinf (0); /* { dg-error "25:non-floating-point argument in call" } */
  r += __builtin_isinf_sign (0); /* { dg-error "30:non-floating-point argument in call" } */
  r += __builtin_isnan (0); /* { dg-error "25:non-floating-point argument in call" } */
  r += __builtin_isnormal (0); /* { dg-error "28:non-floating-point argument in call" } */
  r += __builtin_signbit (0); /* { dg-error "27:non-floating-point argument in call" } */

  /* Two-argument comparison builtins called with two integers.  */
  r += __builtin_isgreater (0, 0); /* { dg-error "8:non-floating-point arguments in call to function" } */
  r += __builtin_isgreaterequal (0, 0); /* { dg-error "8:non-floating-point arguments in call to function" } */
  r += __builtin_isless (0, 0); /* { dg-error "8:non-floating-point arguments in call to function" } */
  r += __builtin_islessequal (0, 0); /* { dg-error "8:non-floating-point arguments in call to function" } */
  r += __builtin_islessgreater (0, 0); /* { dg-error "8:non-floating-point arguments in call to function" } */
  r += __builtin_isunordered (0, 0); /* { dg-error "8:non-floating-point arguments in call to function" } */

  /* __builtin_fpclassify: non-constant third argument, then integer last
     argument.  */
  r += __builtin_fpclassify (1, 2, n, 4, 5, n); /* { dg-error "36:non-const integer argument 3 in call" } */
  r += __builtin_fpclassify (1, 2, 3, 4, 5, 6); /* { dg-error "45:non-floating-point argument in call" } */

  /* Third argument of __builtin_assume_aligned is a pointer here.  */
  d = __builtin_assume_aligned (p, n, p); /* { dg-error "39:non-integer argument 3 in call" } */

  /* Overflow builtins: bad second operand (*d) and bad result pointer (d).  */
  b = __builtin_add_overflow (n, *d, &r); /* { dg-error "34:argument 2 in call to function" } */
  b = __builtin_add_overflow (n, 5, d); /* { dg-error "37:argument 3 in call" } */
  b = __builtin_sub_overflow (n, *d, &r); /* { dg-error "34:argument 2 in call to function" } */
  b = __builtin_sub_overflow (n, 5, d); /* { dg-error "37:argument 3 in call" } */
  b = __builtin_mul_overflow (n, *d, &r); /* { dg-error "34:argument 2 in call to function" } */
  b = __builtin_mul_overflow (n, 5, d); /* { dg-error "37:argument 3 in call" } */
}
Beispiel #8
0
__attribute__((noinline, noclone)) void
foo (double *x, double *y)
{
  double *p = __builtin_assume_aligned (x, 16);
  double *q = __builtin_assume_aligned (y, 16);
  double z, h;
  int i;
  for (i = 0; i < 1024; i++)
    {
      if (p[i] < 0.0)
	z = q[i], h = q[i] * 7.0 + 3.0;
      else
	z = p[i] + 6.0, h = p[1024 + i];
      p[i] = z + 2.0 * h;
    }
}
// Produces `src.count / decimation_factor` output samples from `src`, writing
// each packed result through `dst.p`, and returns a buffer descriptor made of
// `dst.p`, that output count and `src.sampling_rate / decimation_factor`.
//
// Every output sample is built from one accumulator that is threaded through
// a fixed sequence of helper calls (4x mac_shift, 8x mac_shift_and_store,
// 4x mac_shift_and_store_new_c16_samples) operating on the member delay
// buffer `z_` and tap array `taps_`.  The helpers are defined elsewhere; the
// section comments below describe what each group is for.  The order of the
// calls matters because the store variants modify the delay buffer.
buffer_c16_t FIRC16xR16x32Decim8::execute(
	const buffer_c16_t& src,
	const buffer_c16_t& dst
) {
	// Delay buffer, taps and output are promised to be (at least) 4-byte aligned.
	vec2_s16* const z = static_cast<vec2_s16*>(__builtin_assume_aligned(z_.data(), 4));
	const vec2_s16* const t = static_cast<vec2_s16*>(__builtin_assume_aligned(taps_.data(), 4));
	// Output is written as one 32-bit word per sample.
	uint32_t* const d = static_cast<uint32_t*>(__builtin_assume_aligned(dst.p, 4));

	const auto k = output_scale;

	// Any remainder of src.count that does not fill a whole decimation step is ignored.
	const size_t count = src.count / decimation_factor;
	for(size_t i=0; i<count; i++) {
		// Start of the `decimation_factor` input samples consumed for output i.
		const vec2_s16* const in = static_cast<const vec2_s16*>(__builtin_assume_aligned(&src.p[i * decimation_factor], 4));

		// NOTE(review): accum is not explicitly initialised here; this relies on
		// complex32_t default-constructing to zero -- confirm against its definition.
		complex32_t accum;

		// Oldest samples are discarded.
		accum = mac_shift(z, t, 0, accum);
		accum = mac_shift(z, t, 1, accum);
		accum = mac_shift(z, t, 2, accum);
		accum = mac_shift(z, t, 3, accum);

		// Middle samples are shifted earlier in the "z" delay buffer.
		accum = mac_shift_and_store(z, t, decimation_factor, 0, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 1, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 2, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 3, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 4, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 5, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 6, accum);
		accum = mac_shift_and_store(z, t, decimation_factor, 7, accum);

		// Newest samples come from "in" buffer, are copied to "z" delay buffer.
		accum = mac_shift_and_store_new_c16_samples(z, t, in, decimation_factor, 0, taps_count, accum);
		accum = mac_shift_and_store_new_c16_samples(z, t, in, decimation_factor, 1, taps_count, accum);
		accum = mac_shift_and_store_new_c16_samples(z, t, in, decimation_factor, 2, taps_count, accum);
		accum = mac_shift_and_store_new_c16_samples(z, t, in, decimation_factor, 3, taps_count, accum);

		// Scale by output_scale, round and pack the accumulator into one output word.
		d[i] = scale_round_and_pack(accum, k);
	}

	// Describe the output: same storage as dst, fewer samples, lower sampling rate.
	return {
		dst.p,
		count,
		src.sampling_rate / decimation_factor
	};
}
Beispiel #10
0
__attribute__((noinline, noclone)) void
foo (float *p, float *q, float x)
{
    int i;
    p = (float *) __builtin_assume_aligned (p, 32);
    q = (float *) __builtin_assume_aligned (q, 32);
    float f = 1.0f, g = 2.0f;
    for (i = 0; i < 1024; i++)
    {
        *p++ = f;
        f += x;
    }
    for (i = 0; i < 1024; i++)
    {
        *q++ = g;
        g += 0.5f;
    }
}
Beispiel #11
0
double
foo (double *x, int n)
{
  double p = 0.0;
  int i;
  x = __builtin_assume_aligned (x, 128);
  if (n % 128)
    __builtin_unreachable ();
  for (i = 0; i < n; i++)
    p += x[i];
  return p;
}
// Sanitizer test driver.  Allocates 3 bytes and then claims, via
// __builtin_assume_aligned, that `ptr + 2` is 0x8000-byte aligned with a
// misalignment offset of `offset` (set to 1 just before the call).  The
// CHECK lines record the runtime report expected for that call.  The first
// CHECK locates the call with [[@LINE-1]], so no line may be inserted
// between the call and that CHECK.
// `offset` is not declared in this function; it has to come from the
// surrounding file.
int main(int argc, char* argv[]) {
  char *ptr = (char *)malloc(3);

  offset = 1;

  __builtin_assume_aligned(ptr + 2, 0x8000, offset);
  // CHECK: {{.*}}alignment-assumption-{{.*}}.cpp:[[@LINE-1]]:32: runtime error: assumption of 32768 byte alignment (with offset of 1 byte) for pointer of type 'char *' failed
  // CHECK: 0x{{.*}}: note: offset address is {{.*}} aligned, misalignment offset is {{.*}} byte

  free(ptr);

  return 0;
}
Beispiel #13
0
/*
 * Micro-benchmark kernel: runs the element-wise update
 *     z[i] = 0.25 * x[i] * y[i]        for i in [0, n)
 * r_max times and returns the wall-clock time of the whole repetition loop
 * in seconds, measured with clock_gettime(CLOCK_MONOTONIC_RAW).
 *
 * Parameters
 *   a, b, c  scalar coefficients; only `a` is used by the active kernel
 *            (forwarded to dummy()); b and c are used only by the alternative
 *            kernels listed below.
 *   x, y, z  arrays of at least n doubles, promised to be BYTEALIGN-aligned.
 *   n        number of elements per sweep.
 *   r_max    number of sweeps.
 *
 * Alternative kernels that can be swapped into the inner loop:
 *     z[i] = x[i] + y[i] + z[i];
 *     z[i] = x[i] * y[i] + z[i];
 *     z[i] = a * x[i] + y[i] + z[i];
 *     z[i] = a * x[i] + b * y[i] + z[i];
 *     z[i] = a * x[i] + b * y[i] + c * z[i];
 *     z[i] = a * x[i] + b * y[i] + c;
 *     z[i] = 0.25 + x[i] + y[i];
 *     z[i] = a * x[i] * y[i];
 *     z[i] = a + x[i] + y[i];
 */
double xyzp(double a, double b, double c, double *x, double *y, double *z,
            int n, int r_max)
{
    /* __builtin_assume_aligned attaches the alignment promise to the pointer
     * it RETURNS; calling it and discarding the result tells the compiler
     * nothing.  Assign the result back so the loops below actually see it. */
    x = (double *) __builtin_assume_aligned(x, BYTEALIGN);
    y = (double *) __builtin_assume_aligned(y, BYTEALIGN);
    z = (double *) __builtin_assume_aligned(z, BYTEALIGN);

    int i, r;
    struct timespec ts_start, ts_end;
    double runtime;

    int midpt = n / 2;

    clock_gettime(CLOCK_MONOTONIC_RAW, &ts_start);
    for (r = 0; r < r_max; r++) {
        //#pragma unroll(8)
        for (i = 0; i < n; i++) {
            z[i] = 0.25 * x[i] * y[i];
        }
        // To prevent outer loop removal during optimisation
        if (y[midpt] < 0.) dummy(a, x, y, z);
    }
    clock_gettime(CLOCK_MONOTONIC_RAW, &ts_end);

    /* Seconds plus nanoseconds, combined into fractional seconds. */
    runtime = (double) (ts_end.tv_sec - ts_start.tv_sec)
        + (double) (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9;

    return runtime;
}
void calibrate_energy(float* _x, float* _y, float* _z, float* _e, const size_t& _size){

  const size_t lenght = _size;
#if GCC_VERSION > 40701
  float *x = (float*)__builtin_assume_aligned(_x, 16);
  float *y = (float*)__builtin_assume_aligned(_y, 16);
  float *z = (float*)__builtin_assume_aligned(_z, 16);
  float *e = (float*)__builtin_assume_aligned(_e, 16);
#else
  float *x = _x;
  float *y = _y;
  float *z = _z;
  float *e = _e;
#endif

  for(size_t i = 0;i<lenght;++i){
    float r = std::sqrt(x[i]*x[i] + y[i]*y[i] + z[i]*z[i]);
    float scale = p0 + p1*r + p2*r*r + p3*r*r*r + p4*r*r*r*r + p5*r*r*r*r*r ;
    scale /= (2.f*1e7/3.f);
    e[i] = (1.f-scale)*e[i];
  }
  
}
Beispiel #15
0
inline size_t scanHaystackBlock(const StringPiece& haystack,
                                const StringPiece& needles,
                                int64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16);  // should handled by *needles16() method
  DCHECK(blockStartIdx + 16 <= haystack.size() ||
         (PAGE_FOR(haystack.data() + blockStartIdx) ==
          PAGE_FOR(haystack.data() + blockStartIdx + 15)));

  __v16qi arr1;
  if (HAYSTACK_ALIGNED) {
    void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16);
    arr1 = *reinterpret_cast<const __v16qi*>(ptr1);
  } else {
    arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx);
  }

  // This load is safe because needles.size() >= 16
  auto arr2 = __builtin_ia32_loaddqu(needles.data());
  size_t b = __builtin_ia32_pcmpestri128(
    arr2, 16, arr1, haystack.size() - blockStartIdx, 0);

  size_t j = nextAlignedIndex(needles.data());
  for (; j < needles.size(); j += 16) {
    void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16);
    arr2 = *reinterpret_cast<const __v16qi*>(ptr2);

    auto index = __builtin_ia32_pcmpestri128(
      arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0);
    b = std::min<size_t>(index, b);
  }

  if (b < 16) {
    return blockStartIdx + b;
  }
  return StringPiece::npos;
}
Beispiel #16
0
/* Writes the 12 bytes "Hello world" plus the terminating NUL into the
   buffer at `input`, one character store at a time.  `input` is promised to
   be 8-byte aligned and must have room for at least 12 bytes.

   The `void *` result of __builtin_assume_aligned is cast explicitly so the
   code is valid as C++ as well as C.  */
void
foo (char *input)
{
    input = (char *) __builtin_assume_aligned (input, 8);
    input[0] = 'H';
    input[1] = 'e';
    input[2] = 'l';
    input[3] = 'l';
    input[4] = 'o';
    input[5] = ' ';
    input[6] = 'w';
    input[7] = 'o';
    input[8] = 'r';
    input[9] = 'l';
    input[10] = 'd';
    input[11] = '\0';
}
Beispiel #17
0
// Diagnostic test: the alignment argument `j` is a run-time value.  The
// expected-error directive on the call line records the diagnostic expected
// for it and has to stay attached to that line.
int test8(int *a, int j) {
  a = __builtin_assume_aligned(a, j); // expected-error {{must be a constant integer}}
  return a[0];
}
Beispiel #18
0
// Benchmark driver for random-number generation and float summation.
//
//  1. Allocates NN (1024*1024) floats with 32-byte alignment.
//  2. Fills the buffer twice from a uniform [0,1) float distribution, timing
//     each fill with its own PerfStat counter (c11, c12) and printing both.
//  3. Sums the whole buffer KK+3 times into s[ok], timing every pass with c2,
//     printing c2 for every pass where ok % 1000 == 2, and flagging "a mess"
//     if two consecutive passes ever disagree.
//
// Returns 0 on success, 1 if the buffer could not be allocated.
int main(int argc, char**) {

  std::mt19937 eng;

  std::uniform_real_distribution<float> rgen(0.,1.);

  constexpr int NN = 1024*1024;

  // alignas(128) float r[NN];
  // memalign can fail; bail out before the buffer is touched rather than
  // dereferencing a null pointer in the fill loops below.
  void * raw = ::memalign(32,NN*sizeof(float));
  if (raw == nullptr) {
    std::cerr << "memalign failed" << std::endl;
    return 1;
  }
  float * r = (float*)__builtin_assume_aligned(raw,32);


  // Note: `r` is a pointer, so this prints the size and alignment of the
  // pointer itself, not of the buffer it points to.
  std::cout << sizeof(r) << " " << alignof(r) << std::endl;
  PerfStat c12, c2, c11, c22;
  c12.header(std::cout,true);


 std::cout << std::endl;


  // First fill (timed with c11).
  c11.startAll();
  for (int i=0;i!=NN;++i)
    r[i]=rgen(eng);
  c11.stopAll();
  std::cout << "|rgen  " << std::endl;
  c11.print(std::cout);

  // Second fill (timed with c12).
  c12.startAll();
  for (int i=0;i!=NN;++i)
    r[i]=rgen(eng);
  c12.stopAll();
  std::cout << "|rgen  " << std::endl;
  c12.print(std::cout);



  std::cout << std::endl;
  std::cout << std::endl;

  constexpr int KK=10000;


  // Repeated summation: every pass adds the same data in the same order, so
  // all entries of s[] are expected to be identical.
  bool err=false;
  float s[KK+3];
  for (int ok=0; ok!=KK+3; ++ok) {
    s[ok]=0;
    c2.start();
    for (int i=0;i!=NN;++i)
      s[ok]+=r[i];
    c2.stop();
    if (ok>0 && s[ok] != s[ok-1]) err=true;

    if ( (ok%1000)==2) {
      std::cout << "|sum " << ok << "  ";  c2.print(std::cout);
    }
  }

  if (err) std::cout << "a mess " << std::endl;

  std::cout << "end \n" << std::endl;

  c2.print(std::cout);

  ::free(r);

  return 0;

}
Beispiel #19
0
/* Zero-fills 0x100000001 bytes (2^32 + 1) starting at p, after declaring p
   to be 64-byte aligned.  The length does not fit in 32 bits, so this looks
   like a compile-only test of how a memset with such a length is expanded
   rather than code meant to be run -- TODO confirm.  */
void
foo (char *p)
{
  p = __builtin_assume_aligned (p, 64);
  __builtin_memset (p, 0, 0x100000001ULL);
}
Beispiel #20
0
//

#include <stddef.h>
#include <stdint.h>

void
copy_soa2aos(float *__restrict__ src, float *__restrict__ dest1,
             float *__restrict__ dest2, size_t len)
{
    src = (float*)__builtin_assume_aligned(src, 16);
    dest1 = (float*)__builtin_assume_aligned(dest1, 16);
    dest2 = (float*)__builtin_assume_aligned(dest2, 16);
    for (size_t i = 0;i < len;i++) {
        dest1[i] = src[i * 2];
        dest2[i] = src[i * 2 + 1];
    }
}

void
copy_aos2soa(float *__restrict__ dest, float *__restrict__ src1,
             float *__restrict__ src2, size_t len)
{
    dest = (float*)__builtin_assume_aligned(dest, 16);
    src1 = (float*)__builtin_assume_aligned(src1, 16);
    src2 = (float*)__builtin_assume_aligned(src2, 16);
    for (size_t i = 0;i < len;i++) {
        dest[i * 2] = src1[i];
        dest[i * 2 + 1] = src2[i];
    }
}
Beispiel #21
0
// -----------------------------------------------------------------------------
// Main routine
// -----------------------------------------------------------------------------
// Benchmark driver: computes b[i] = sqrt(sqrt(sqrt(a[i]))) over 10,000,000
// floats three times -- with a plain scalar loop, with SSE intrinsics
// (4 floats per step) and with AVX intrinsics (8 floats per step) -- and
// prints the elapsed time in milliseconds and b[42] after each variant.
// b is reset to zero between variants.
//
// Both arrays come from _mm_malloc with 32-byte alignment, which the aligned
// SSE/AVX loads and stores below rely on.  vector_size is a multiple of 8,
// so neither intrinsic loop needs a remainder pass.
//
// Returns 0 on success, 1 if either array could not be allocated.
int main() {

  int i;
  srand48(0);            // seed PRNG
  double e,s;            // timestamp variables
  float *a, *b;          // data pointers
  float *pA,*pB;         // work pointer
  __m128 rA,rB;          // variables for SSE
  __m256 rA_AVX, rB_AVX; // variables for AVX

  // define vector size
  const int vector_size = 10000000;

  // allocate memory
  a = (float*) _mm_malloc (vector_size*sizeof(float),32);
  b = (float*) _mm_malloc (vector_size*sizeof(float),32);

  // _mm_malloc can fail for buffers of this size; stop before the
  // initialisation loop would write through a null pointer.
  if (!a || !b) {
    fprintf(stderr, "allocation of %d floats failed\n", vector_size);
    _mm_free(a);
    _mm_free(b);
    return 1;
  }

  // initialize vectors //
  for(i=0;i<vector_size;i++) {
    a[i]=fabs(drand48());
    b[i]=0.0f;
  }

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Naive implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  s = getCurrentTime();
  // The builtin returns void*; cast explicitly so this also builds as C++.
  pA = (float*) __builtin_assume_aligned(a, 16);
  pB = (float*) __builtin_assume_aligned(b, 16);
  for (i=0; i<vector_size; i++){
//    b[i] = sqrtf(sqrtf(sqrtf(a[i])));
      pB[i] = sqrtf(sqrtf(sqrtf(pA[i])));
  }
  e = getCurrentTime();
  printf("%lf ms b[42] = %lf\n",(e-s)*1000,b[42]);
//  cout << (e-s)*1000 << " ms" << ", b[42] = " << b[42] << endl;

// -----------------------------------------------------------------------------
  for(i=0;i<vector_size;i++) {
    b[i]=0.0f;
  }
// -----------------------------------------------------------------------------

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// SSE2 implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  pA = a; pB = b;

  s = getCurrentTime();
  for (i=0; i<vector_size; i+=4){
    rA   = _mm_load_ps(pA);
    rB   = _mm_sqrt_ps(_mm_sqrt_ps(_mm_sqrt_ps(rA)));
    _mm_store_ps(pB,rB);
    pA += 4;
    pB += 4;
  }
  e = getCurrentTime();
  printf("%lf ms b[42] = %lf\n",(e-s)*1000,b[42]);

// -----------------------------------------------------------------------------
  for(i=0;i<vector_size;i++) {
    b[i]=0.0f;
  }
// -----------------------------------------------------------------------------

// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// AVX implementation
// +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  pA = a; pB = b;

  s = getCurrentTime();
  for (i=0; i<vector_size; i+=8){
    rA_AVX   = _mm256_load_ps(pA);
    rB_AVX   = _mm256_sqrt_ps(_mm256_sqrt_ps(_mm256_sqrt_ps(rA_AVX)));
    _mm256_store_ps(pB,rB_AVX);
    pA += 8;
    pB += 8;
  }
  e = getCurrentTime();
  printf("%lf ms b[42] = %lf\n",(e-s)*1000,b[42]);

  _mm_free(a);
  _mm_free(b);

  return 0;
}
Beispiel #22
0
// Three-argument form of __builtin_assume_aligned: alignment 32 with an
// explicit misalignment offset of 0.  No diagnostic directive is attached to
// this call.
int test2(int *a) {
  a = __builtin_assume_aligned(a, 32, 0);
  return a[0];
}
Beispiel #23
0
	float sqrtResult = sqrt(inSqrt);

	plus = (-b + sqrtResult) * divTwoA;
	minus = (-b - sqrtResult) * divTwoA;
}
*/


// Dot product of the 3-vectors (ax, ay, az) and (bx, by, bz), accumulated in
// x, y, z order.
float dotProduct(float ax, float ay, float az, float bx, float by, float bz) {
	float sum = ax*bx;
	sum += ay*by;
	sum += az*bz;
	return sum;
}

void raySphereForRelativeSphereVectorized(float *__restrict__ relativePositionX, float *__restrict__ relativePositionY, float *__restrict__ relativePositionZ, float *__restrict__ directionX, float *__restrict__ directionY, float *__restrict__ directionZ, float *__restrict__ r, float *__restrict__ intersect, float *__restrict__ plus, float *__restrict__ minus) {
    const unsigned BATCHSIZE = 16;

    float *alignedRelativePositionX = static_cast<float*>(__builtin_assume_aligned(relativePositionX, 32));
	float *alignedRelativePositionY = static_cast<float*>(__builtin_assume_aligned(relativePositionY, 32));
	float *alignedRelativePositionZ = static_cast<float*>(__builtin_assume_aligned(relativePositionZ, 32));

	float *alignedDirectionX = static_cast<float*>(__builtin_assume_aligned(directionX, 32));
	float *alignedDirectionY = static_cast<float*>(__builtin_assume_aligned(directionY, 32));
	float *alignedDirectionZ = static_cast<float*>(__builtin_assume_aligned(directionZ, 32));

	float *alignedIntersect = static_cast<float*>(__builtin_assume_aligned(intersect, 32));


	float *alignedPlus = static_cast<float*>(__builtin_assume_aligned(plus, 32));
	float *alignedMinus = static_cast<float*>(__builtin_assume_aligned(minus, 32));

	float *alignedR = static_cast<float*>(__builtin_assume_aligned(r, 32));
Beispiel #24
0
// Diagnostic test: the third (offset) argument is a pointer, `b`.  The
// expected-warning directive on the call line records the diagnostic
// expected for it and has to stay attached to that line.
int test5(int *a, unsigned *b) {
  a = __builtin_assume_aligned(a, 32, b); // expected-warning {{incompatible pointer to integer conversion passing 'unsigned int *' to parameter of type}}
  return a[0];
}
Beispiel #25
0
// Diagnostic test: the requested alignment, 31, is not a power of two.  The
// expected-error directive on the call line records the diagnostic expected
// for it and has to stay attached to that line.
int test7(int *a) {
  a = __builtin_assume_aligned(a, 31); // expected-error {{requested alignment is not a power of 2}}
  return a[0];
}
Beispiel #26
0
// Diagnostic test: __builtin_assume_aligned is called with four arguments,
// one more than the maximum of three named in the expected message.  The
// expected-error directive has to stay attached to the call line.
int test6(int *a) {
  a = __builtin_assume_aligned(a, 32, 0, 0); // expected-error {{too many arguments to function call, expected at most 3, have 4}}
  return a[0];
}
Beispiel #27
0
#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "AVX512_DJACV.h"

#define VSIZE_B 64u
#define DBLE_SZ 8u
#define NDBLE_V (VSIZE_B / DBLE_SZ)
#define NPAIR_V NDBLE_V

static inline __m256d avx512_ddots(const unsigned m, const double *const restrict Gp, const double *const restrict Gq)
{
  register const double *const Gp_i = (const double*)__builtin_assume_aligned(Gp, VSIZE_B);
  register const double *const Gq_i = (const double*)__builtin_assume_aligned(Gq, VSIZE_B);

  register __m512d Gpp = _mm512_setzero_pd();
  register __m512d Gqq = _mm512_setzero_pd();
  register __m512d Gpq = _mm512_setzero_pd();

  for (register unsigned i = 0u; i < m; i += NDBLE_V) {
    register const __m512d Gpi = _mm512_load_pd(Gp_i + i);
    register const __m512d Gqi = _mm512_load_pd(Gq_i + i);

    Gpp = _mm512_fmadd_pd(Gpi, Gpi, Gpp);
    Gqq = _mm512_fmadd_pd(Gqi, Gqi, Gqq);
    Gpq = _mm512_fmadd_pd(Gpi, Gqi, Gpq);
  }
// Catalogue of small loops, one per brace-enclosed scope, each labelled with
// an "exampleN" comment and the loop feature it is meant to illustrate
// (unknown bounds, aligned pointer access, reductions, strided access, SLP,
// ...).  The commented-out `void foo (...)` lines show the signature each
// example would have as a stand-alone function.
//
// Locals that stand in for those parameters (n, x, p, q, src, dst, ...) are
// declared but never initialised, so this function is presumably meant to be
// compiled and inspected (e.g. for vectorizer reports), not called --
// TODO confirm.
//
// N, M and K are the array extents shared by all examples.
void vectorcode ()
{
  const int N = 512;
  const int M = 1024;
  const int K = 32;

  {
    // example1:

    int a[N], b[N], c[N];
    // void foo ()
    {
      int i;

      for (i=0; i<N; i++) {
        a[i] = b[i] + c[i];
      }
    }
  }

  {
    // example2:

    int a[N], b[N], c[N];
    // void foo (int n, int x)
    int n;
    int x;
    {
      int i;

      /* feature: support for unknown loop bound  */
      /* feature: support for loop invariants  */
      for (i=0; i<n; i++) {
        b[i] = x;
      }

      /* feature: general loop exit condition  */
      /* feature: support for bitwise operations  */
      i = 0;
      while (n--) {
        a[i] = b[i]&c[i]; i++;
      }
    }
  }

  {
    // example3:

    typedef int aint __attribute__ ((__aligned__(16)));
    // void foo (int n, aint * __restrict__ p, aint * __restrict q)
    int n; aint * __restrict__ p; aint * __restrict q;
    {

      /* feature: support for (aligned) pointer accesses.  */
      while (n--) {
        *p++ = *q++;
      }
    }
  }

  {
    // example4:

    typedef int aint __attribute__ ((__aligned__(16)));
    int a[N], b[N], c[N];
    // void foo (int n, aint * __restrict__ p, aint * __restrict__ q)
    int n; aint * __restrict__ p; aint * __restrict__ q;
    {
      int i,j, MAX =  1234;

      /* feature: support for (aligned) pointer accesses  */
      /* feature: support for constants  */
      while (n--) {
        *p++ = *q++ + 5;
      }

      /* feature: support for read accesses with a compile time known misalignment  */
      for (i=0; i<n; i++) {
        a[i] = b[i+1] + c[i+3];
      }

      /* feature: support for if-conversion  */
      for (i=0; i<n; i++) {
        j = a[i];
        b[i] = (j > MAX ? MAX : 0);
      }
    }
  }

  {
    // example5:

    struct a
    {
      int ca[N];
    } s;
    // void foo (x)
    int x;
    {
      int i;

      for (i = 0; i < N; i++) {
        /* feature: support for alignable struct access  */
        s.ca[i] = 5;
      }
    }
  }

  {
    // example6 (gfortran):

    // DIMENSION A(1000000), B(1000000), C(1000000)
    // READ*, X, Y
    // A = LOG(X); B = LOG(Y); C = A + B
    // PRINT*, C(500000)
    // END
  }

  {
    // example7:

    int a[N], b[N];
    // void foo (int x)
    int x;
    {
      int i;

      /* feature: support for read accesses with an unknown misalignment  */
      for (i=0; i<N; i++) {
        a[i] = b[i+x];
      }
    }
  }

  {
    // example8:

    int a[M][N];
    // void foo (int x)
    int x;
    {
      int i,j;

      /* feature: support for multidimensional arrays  */
      for (i=0; i<M; i++) {
        for (j=0; j<N; j++) {
          a[i][j] = x;
        }
      }
    }
  }

  {
    // example9:

    unsigned int ub[N], uc[N];
    // void foo ()
    {
      int i;

      /* feature: support summation reduction.
         note: in case of floats use -funsafe-math-optimizations  */
      unsigned int udiff = 0;
      for (i = 0; i < N; i++) {
        udiff += (ub[i] - uc[i]);
      }
    }
  }

  {
    // example10:

    /* feature: support data-types of different sizes.
       Currently only a single vector-size per target is supported;
       it can accommodate n elements such that n = vector-size/element-size
       (e.g, 4 ints, 8 shorts, or 16 chars for a vector of size 16 bytes).
       A combination of data-types of different sizes in the same loop
       requires special handling. This support is now present in mainline,
       and also includes support for type conversions.  */

    short *sa, *sb, *sc;
    int *ia, *ib, *ic;
    int i;
    for (i = 0; i < N; i++) {
      ia[i] = ib[i] + ic[i];
      sa[i] = sb[i] + sc[i];
    }

    for (i = 0; i < N; i++) {
      ia[i] = (int) sb[i];
    }
  }

  {
    // example11:

    /* feature: support strided accesses - the data elements
       that are to be operated upon in parallel are not consecutive - they
       are accessed with a stride > 1 (in the example, the stride is 2):  */

    int i;
    float a[N/2], b[N], c[N], d[N/2];
    for (i = 0; i < N/2; i++) {
      a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i];
      d[i] = b[2*i] * c[2*i+1] + b[2*i+1] * c[2*i];
    }
  }

  {
    // example12: induction:

    int i;
    int a[N];
    for (i = 0; i < N; i++) {
      a[i] = i;
    }
  }

  {
    // void foo()
    {
      // example13: outer-loop:

      int i,j;
      int diff;
      int a[M][N];
      int b[M][N];
      int out[M];
      for (i = 0; i < M; i++) {
        diff = 0;
        for (j = 0; j < N; j+=8) {
          diff += (a[i][j] - b[i][j]);
        }
        out[i] = diff;
      }
    }
  }

  {
    // example14: double reduction:

    int i,j,k;
    // The first subscript is i+k (i < N, k < K) and the second is j (j < M),
    // so the arrays need N+K (resp. N) rows of M columns.  Declaring them
    // as [M][N] would let j run past the end of every row.
    int in[N+K][M];
    int coeff[N][M];
    int out[K];
    int sum;
    for (k = 0; k < K; k++) {
      sum = 0;
      for (j = 0; j < M; j++)
        for (i = 0; i < N; i++)
          sum += in[i+k][j] * coeff[i][j];

      out[k] = sum;
    }
  }

  {
    // example15: condition in nested loop:

    int i,j;
    int a[N+1];
    int c[N];
    int x_in[M];
    int x_out[M];
    for (j = 0; j < M; j++) {
      int x = x_in[j];
      int curr_a = a[0];

      for (i = 0; i < N; i++) {
        int next_a = a[i+1];
        curr_a = x > c[i] ? curr_a : next_a;
      }

      x_out[j] = curr_a;
    }
  }

  {
    // example16: load permutation in loop-aware SLP:
    int i;

    int  in[3*N];
    int  out[3*N];
    int* pInput  = in;
    int* pOutput = out;
    int  M00, M01, M02;
    int  M10, M11, M12;
    int  M20, M21, M22;
    for (i = 0; i < N; i++) {
      int a = *pInput++;
      int b = *pInput++;
      int c = *pInput++;

      *pOutput++ = M00 * a + M01 * b + M02 * c;
      *pOutput++ = M10 * a + M11 * b + M12 * c;
      *pOutput++ = M20 * a + M21 * b + M22 * c;
    }
  }

  {
    // example17: basic block SLP:

    // void foo ()
    {
      unsigned int in[N];
      unsigned int out[N];
      unsigned int *pin =  in;
      unsigned int *pout = out;

      *pout++ = *pin++;
      *pout++ = *pin++;
      *pout++ = *pin++;
      *pout++ = *pin++;
    }
  }

  {
    // example18: Simple reduction in SLP:

    int sum1 = 0;
    int sum2 = 0;
    int a[128];
    // void foo (void)
    {
      int i;

      for (i = 0; i < 64; i++) {
        sum1 += a[2*i];
        sum2 += a[2*i+1]; // max index = 63*2+1 = 127, need vector of 128
      }
    }
  }

  {
    // example19: Reduction chain in SLP:

    int sum = 0;
    int a[128];
    // void foo (void)
    {
      int i;

      for (i = 0; i < 64; i++) {
        sum += a[2*i];
        sum += a[2*i+1]; // max index = 63*2+1 = 127, need vector of 128
      }
    }
  }

  {
    // example20: Basic block SLP with multiple types, loads with different offsets, misaligned load, and not-affine accesses:

    // void foo (int * __restrict__ dst, short * __restrict__ src,
    // int h, int stride, short A, short B)
    int *   __restrict__   dst;
    short * __restrict__   src;
    int h;
    int stride;
    short A;
    short B;
    {
      int i;
      for (i = 0; i < h; i++) {
        dst[0] += A*src[0] + B*src[1];
        dst[1] += A*src[1] + B*src[2];
        dst[2] += A*src[2] + B*src[3];
        dst[3] += A*src[3] + B*src[4];
        dst[4] += A*src[4] + B*src[5];
        dst[5] += A*src[5] + B*src[6];
        dst[6] += A*src[6] + B*src[7];
        dst[7] += A*src[7] + B*src[8];
        dst += stride;
        src += stride;
      }
    }
  }

  {
    // example21: Backward access:

    // int foo (int *b, int n)
    int *b; int n;
    {
      int i, a = 0;

      for (i = n-1; i >= 0; i--)
        a += b[i];

      // return a;
    }
  }

  #if 0
  {
    // example22: Alignment hints:

    // void foo (int *out1, int *in1, int *in2, int n)
    int *out1; int *in1; int *in2; int n;
    {
      int i;

      // GB looks like __builtin_assume_aligned is a C99 gcc extension

      // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
      //  Built-in Function: void *__builtin_assume_aligned (const void *exp, size_t align, ...)

      // This function returns its first argument, and allows the compiler to assume that the
      // returned pointer is at least align bytes aligned. This built-in can have either two
      // or three arguments, if it has three, the third argument should have integer type,
      // and if it is nonzero means misalignment offset. For example:

      // void *x = __builtin_assume_aligned (arg, 16);

      // means that the compiler can assume x, set to arg, is at least 16-byte aligned, while:

      // void *x = __builtin_assume_aligned (arg, 32, 8);

      // means that the compiler can assume for x, set to arg, that (char *) x - 8 is 32-byte aligned.

      int* out1 = __builtin_assume_aligned (out1, 32, 16);
      int* in1  = __builtin_assume_aligned (in1,  32, 16);
      int* in2  = __builtin_assume_aligned (in2,  32,  0);

      for (i = 0; i < n; i++)
        out1[i] = in1[i] * in2[i];
    }
  }
  #endif

  {
    // example23: Widening shift:

    // void foo (unsigned short *src, unsigned int *dst)
    unsigned short *src; unsigned int *dst;
    {
      int i;

      for (i = 0; i < N; i++)
        *dst++ = *src++ << 7;
    }
  }

  {
    // example24: Condition with mixed types:

    float a[N], b[N];
    int c[N];

    //void foo (short x, short y)
    short x; short y;
    {
      int i;
      for (i = 0; i < N; i++)
        c[i] = a[i] < b[i] ? x : y;
    }
  }

  {
    // example25: Loop with bool:

    float a[N], b[N], c[N], d[N];
    int j[N];

    // void foo (void)
    {
      int i;
      bool x, y;
      for (i = 0; i < N; i++) {
        x = (a[i] < b[i]);
        y = (c[i] < d[i]);
        j[i] = x & y;
      }
    }
  }
} // void vectorcode ()