/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

/* Parameter names follow the argument order of the reference BLAS GEMM
   routine.  The two trailing ints are presumably the hidden Fortran
   character lengths of the first two arguments -- confirm against the
   caller.  */
typedef void (*blas_call)(const char *transa, const char *transb,
                          const int *m, const int *n, const int *k,
                          const GFC_INTEGER_8 *alpha,
                          const GFC_INTEGER_8 *a, const int *lda,
                          const GFC_INTEGER_8 *b, const int *ldb,
                          const GFC_INTEGER_8 *beta,
                          GFC_INTEGER_8 *c, const int *ldc,
                          int transa_len, int transb_len);

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
/* Forward declaration of the AVX128/FMA3 variant of the integer(8) matmul
   routine.  The target attribute is attached here, on the declaration, so
   the function is built with the "avx" and "fma" target options; the
   definition follows immediately below.  internal_proto is defined
   elsewhere -- presumably it restricts the symbol to library-internal
   use; confirm.  */
void
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray, 
	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i8_avx128_fma3);
void
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray, 
	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
	int blas_limit, blas_call gemm)
{
  const GFC_INTEGER_8 * restrict abase;
  const GFC_INTEGER_8 * restrict bbase;
  GFC_INTEGER_8 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);
/* PR target/36936 */
/* { dg-do compile } */
/* { dg-require-effective-target ilp32 } */
/* { dg-options "-O2 -march=i386" } */
/* { dg-final { scan-assembler "cmov" } } */

extern int foo (int) __attribute__((__target__("arch=i686")));

/* Return X unchanged when it is non-negative, otherwise return 1.  */
int
foo (int x)
{
  return (x < 0) ? 1 : x;
}
Beispiel #3
0
/* PR target/69969 */
/* { dg-do compile } */
/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
/* { dg-options "-mcpu=power8" } */

/* Identity function compiled with the command-line options only (no
   target attribute).  */
int
bar (int x)
{
  return x;
}
/* Identity function carrying the "no-vsx" target attribute.  The check at
   the end of the line asserts that the compiler does NOT emit the
   "-mallow-movmisalign requires -mvsx" diagnostic for this definition, so
   the directive must stay on the same line as the function.  */
__attribute__((__target__("no-vsx"))) int foo (int x) { return x; } /* { dg-bogus "-mallow-movmisalign requires -mvsx" } */
Beispiel #4
0
/* { dg-do run } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512f" } */
/* { dg-require-effective-target avx512f } */
/* { dg-skip-if "no M_PI" { vxworks_kernel } } */

#define __NO_MATH_INLINES
#include <math.h>
#include "avx512f-check.h"

#define NUM 64

static void
__attribute__((__target__("fpmath=sse")))
init_src (float *src)
{
  int i, sign = 1;
  float f = rand ();

  for (i = 0; i < NUM; i++)
    {
      src[i] = (i + 1) * f * M_PI * sign;
      if (i < (NUM / 2))
	{
          if ((i % 6) == 0)
	    f = f * src[i];
        }
      else if (i == (NUM / 2))
	f = rand ();
      else if ((i % 6) == 0)
	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
      sign = -sign;
/* Test whether all of the 32-bit function specific options are accepted
   without error.  */
/* { dg-do compile } */
/* { dg-require-effective-target ilp32 } */

extern void test_abm (void)			__attribute__((__target__("abm")));
extern void test_aes (void)			__attribute__((__target__("aes")));
extern void test_bmi (void)			__attribute__((__target__("bmi")));
extern void test_mmx (void)			__attribute__((__target__("mmx")));
extern void test_pclmul (void)			__attribute__((__target__("pclmul")));
extern void test_popcnt (void)			__attribute__((__target__("popcnt")));
extern void test_recip (void)			__attribute__((__target__("recip")));
extern void test_sse (void)			__attribute__((__target__("sse")));
extern void test_sse2 (void)			__attribute__((__target__("sse2")));
extern void test_sse3 (void)			__attribute__((__target__("sse3")));
extern void test_sse4 (void)			__attribute__((__target__("sse4")));
extern void test_sse4_1 (void)			__attribute__((__target__("sse4.1")));
extern void test_sse4_2 (void)			__attribute__((__target__("sse4.2")));
extern void test_sse4a (void)			__attribute__((__target__("sse4a")));
extern void test_fma4 (void)			__attribute__((__target__("fma4")));
extern void test_ssse3 (void)			__attribute__((__target__("ssse3")));
extern void test_tbm (void)			__attribute__((__target__("tbm")));

extern void test_no_abm (void)			__attribute__((__target__("no-abm")));
extern void test_no_aes (void)			__attribute__((__target__("no-aes")));
extern void test_no_bmi (void)			__attribute__((__target__("no-bmi")));
extern void test_no_mmx (void)			__attribute__((__target__("no-mmx")));
extern void test_no_pclmul (void)		__attribute__((__target__("no-pclmul")));
extern void test_no_popcnt (void)		__attribute__((__target__("no-popcnt")));
extern void test_no_recip (void)		__attribute__((__target__("no-recip")));
extern void test_no_sse (void)			__attribute__((__target__("no-sse")));
Beispiel #6
0
  for (size_t index = 0; index < haystack.size(); ++index) {
    if (s.contains(haystack[index])) {
      return index;
    }
  }
  return StringPiece::npos;
}

#if FOLLY_HAVE_EMMINTRIN_H && __GNUC_PREREQ(4, 6)

// Forward declaration whose only purpose is to attach the sse4.2 target
// attribute to scanHaystackBlock; the definition follows below.
// inline is okay because it's only called from other sse4.2 functions.
template <bool HAYSTACK_ALIGNED>
inline size_t scanHaystackBlock(const StringPiece& haystack,
                                const StringPiece& needles,
                                int64_t blockStartIdx)
  __attribute__ ((__target__("sse4.2")));

// Scans a 16-byte block of haystack (starting at blockStartIdx) to find the
// first needle. If HAYSTACK_ALIGNED, then haystack must be 16-byte aligned.
// If !HAYSTACK_ALIGNED, then the caller must ensure that it is safe to load
// the block.
template <bool HAYSTACK_ALIGNED>
inline size_t scanHaystackBlock(const StringPiece& haystack,
                                const StringPiece& needles,
                                int64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16);  // should handled by *needles16() method
  DCHECK(blockStartIdx + 16 <= haystack.size() ||
         (PAGE_FOR(haystack.data() + blockStartIdx) ==
          PAGE_FOR(haystack.data() + blockStartIdx + 15)));

  __v16qi arr1;
Beispiel #7
0
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

/*
 * \file
 */
#include "../strategyselector.h"
#include "../picture.h"
#include <immintrin.h>
#include <assert.h>

#ifdef __GNUC__
__attribute__ ((__target__ ("sse2,sse4.1")))
#endif
static unsigned reg_sad_sse41(const pixel * const data1, const pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  int y, x;
  unsigned sad = 0;
  __m128i sse_inc = _mm_setzero_si128 ();
  long long int sse_inc_array[2];
  
  for (y = 0; y < height; ++y) {
    for (x = 0; x <= width-16; x+=16) {
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
    }
/* Test whether all of the 64-bit function specific options are accepted
   without error.  */
/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */

extern void test_abm (void)			__attribute__((__target__("abm")));
extern void test_aes (void)			__attribute__((__target__("aes")));
extern void test_mmx (void)			__attribute__((__target__("mmx")));
extern void test_pclmul (void)			__attribute__((__target__("pclmul")));
extern void test_popcnt (void)			__attribute__((__target__("popcnt")));
extern void test_recip (void)			__attribute__((__target__("recip")));
extern void test_sse (void)			__attribute__((__target__("sse")));
extern void test_sse2 (void)			__attribute__((__target__("sse2")));
extern void test_sse3 (void)			__attribute__((__target__("sse3")));
extern void test_sse4 (void)			__attribute__((__target__("sse4")));
extern void test_sse4_1 (void)			__attribute__((__target__("sse4.1")));
extern void test_sse4_2 (void)			__attribute__((__target__("sse4.2")));
extern void test_sse4a (void)			__attribute__((__target__("sse4a")));
extern void test_fma4 (void)			__attribute__((__target__("fma4")));
extern void test_ssse3 (void)			__attribute__((__target__("ssse3")));

extern void test_no_abm (void)			__attribute__((__target__("no-abm")));
extern void test_no_aes (void)			__attribute__((__target__("no-aes")));
extern void test_no_mmx (void)			__attribute__((__target__("no-mmx")));
extern void test_no_pclmul (void)		__attribute__((__target__("no-pclmul")));
extern void test_no_popcnt (void)		__attribute__((__target__("no-popcnt")));
extern void test_no_recip (void)		__attribute__((__target__("no-recip")));
extern void test_no_sse (void)			__attribute__((__target__("no-sse")));
extern void test_no_sse2 (void)			__attribute__((__target__("no-sse2")));
extern void test_no_sse3 (void)			__attribute__((__target__("no-sse3")));
extern void test_no_sse4 (void)			__attribute__((__target__("no-sse4")));
Beispiel #9
0
// RUN: %clang_cc1 -fsyntax-only -ffreestanding %s -verify
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -fno-lax-vector-conversions %s -verify
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify
// expected-no-diagnostics

#if defined(i386) || defined(__x86_64__)

// Include the metaheader that includes all x86 intrinsic headers.
#include <x86intrin.h>

// Calls _mm_empty from a function that enables the "mmx" feature through
// the target attribute.
void __attribute__((__target__("mmx"))) mm_empty_wrap(void) {
  _mm_empty();
}

// Forwards both operands to _mm_add_ss and returns its result; the "sse"
// feature is enabled for this function through the target attribute.
__m128 __attribute__((__target__("sse"))) mm_add_ss_wrap(__m128 a, __m128 b) {
  return _mm_add_ss(a, b);
}

// Forwards both operands to _mm_sqrt_sd and returns its result; the "sse2"
// feature is enabled for this function through the target attribute.
__m128d __attribute__((__target__("sse2"))) mm_sqrt_sd_wrap(__m128d a, __m128d b) {
  return _mm_sqrt_sd(a, b);
}

// Calls _mm_mwait from a function that enables the "sse3" feature through
// the target attribute.  Note that the parameter `a` is not used: the
// intrinsic is invoked with the constants 0, 0.
void __attribute__((__target__("sse3"))) mm_mwait_wrap(int a) {
  _mm_mwait(0, 0);
}

// Forwards its operand to _mm_abs_pi8 and returns the result; the "ssse3"
// feature is enabled for this function through the target attribute.
__m64 __attribute__((__target__("ssse3"))) mm_abs_pi8_wrap(__m64 a) {
  return _mm_abs_pi8(a);
}

__m128i __attribute__((__target__("sse4.1"))) mm_minpos_epu16_wrap(__m128i v) {
Beispiel #10
0
/* Test whether all of the 64-bit function specific options are accepted
   without error.  */
/* { dg-do compile { target { ! ia32 } } } */

#include "funcspec-56.inc"

/* Negative case: "arch=foo" is given as the target string and the
   directive on the same line expects a "bad value" error for it.  */
extern void test_arch_foo (void)		__attribute__((__target__("arch=foo"))); /* { dg-error "bad value" } */

/* Negative case: likewise for "tune=foo".  */
extern void test_tune_foo (void)		__attribute__((__target__("tune=foo"))); /* { dg-error "bad value" } */
Beispiel #11
0
/* { dg-do compile { target { powerpc*-*-* && ilp32 } } } */
/* { dg-skip-if "" { powerpc*-*-darwin* } } */
/* { dg-require-effective-target powerpc_vsx_ok } */
/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power5" } } */
/* { dg-options "-O2 -ffast-math -mcpu=power5 -mabi=no-altivec" } */
/* { dg-final { scan-assembler-times "fabs" 3 } } */
/* { dg-final { scan-assembler-times "fnabs" 3 } } */
/* { dg-final { scan-assembler-times "fsel" 3 } } */
/* { dg-final { scan-assembler-times "fcpsgn" 4 } } */
/* { dg-final { scan-assembler-not "xscpsgndp" } } */

/* Like ppc-target-1.c, but do not enable the altivec abi on 32-bit, so the
   power7 code should generate fcpsgn and not xscpsgndp.  */

/* Prototypes for the functions under test.  normal1 and normal2 carry no
   target attribute; each of the others selects a specific cpu= value via
   the target attribute, and power7n additionally specifies "no-vsx".  */
double normal1 (double, double);
double power5  (double, double) __attribute__((__target__("cpu=power5")));
double power6  (double, double) __attribute__((__target__("cpu=power6")));
double power6x (double, double) __attribute__((__target__("cpu=power6x")));
double power7  (double, double) __attribute__((__target__("cpu=power7")));
double power7n (double, double) __attribute__((__target__("cpu=power7,no-vsx")));
double normal2 (double, double);

/* fabs/fnabs/fsel */
double normal1 (double a, double b)
{
  /* Magnitude of A combined with the sign of B.  */
  double signed_result = __builtin_copysign (a, b);

  return signed_result;
}

/* fabs/fnabs/fsel */
double power5  (double a, double b)
{
Beispiel #12
0
// RUN: %clang_cc1 -fsyntax-only -ffreestanding %s -verify
// RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify
// expected-no-diagnostics

#if defined(i386) || defined(__x86_64__)
#include <pmmintrin.h>

// Calls _mm_mwait from a function that enables the "sse3" feature through
// the target attribute (written here with an extra pair of parentheses
// around the string).  The parameter `a` is not used; the function always
// returns 4.
int __attribute__((__target__(("sse3")))) foo(int a) {
  _mm_mwait(0, 0);
  return 4;
}
#endif