Example #1
/* Initializes the GC ratio/size parameters and allocates the two semispaces */
void
GCInit_Help(double defaultMinRatio, double defaultMaxRatio, 
	int defaultMinRatioSize, int defaultMaxRatioSize)
{
	MinHeapByte = Min(MinHeapByte, MaxHeapByte);
	init_double(&MinRatio, defaultMinRatio);
	init_double(&MaxRatio, defaultMaxRatio);
	init_int(&MinRatioSize, defaultMinRatioSize);
	init_int(&MaxRatioSize, defaultMaxRatioSize);
	fromSpace = Heap_Alloc(MinHeapByte, MaxHeapByte);
	toSpace = Heap_Alloc(MinHeapByte, MaxHeapByte);
	Heap_Resize(fromSpace, (MinHeapByte + MaxHeapByte) / 2, 1);
	Heap_Resize(toSpace, (MinHeapByte + MaxHeapByte) / 2, 1);
}
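For context, a minimal sketch of what the defaulting helpers used above (init_double, init_int) might look like, assuming they simply install the compiled-in default when the parameter has not already been set elsewhere; the "zero means unset" convention is an assumption for illustration, not the runtime's actual rule:

/* Hypothetical sketch of the defaulting helpers; the "0 means unset"
 * convention is an assumption for illustration only. */
static void init_double(double *param, double defaultVal)
{
	if (*param == 0.0)
		*param = defaultVal;
}

static void init_int(int *param, int defaultVal)
{
	if (*param == 0)
		*param = defaultVal;
}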
Example #2
/* Allocates and initializes a new slab for the given cache */
static struct slab *new_slab(struct slab_cache *cache)
{
	struct slab *slab = navi_critical_malloc(SLAB_SIZE);

	if (cache->flags & NAVI_SLAB_DOUBLY_LINKED)
		init_double(cache, slab);
	else
		init_single(cache, slab);

	slab->in_use = 0;
	return slab;
}
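new_slab leaves the free-list setup to init_single or init_double. A minimal sketch of what a singly linked variant could look like, assuming hypothetical field names (obj_size on the cache, free on the slab), that objects start right after the slab header, and that at least one object fits; this is illustrative, not the allocator's actual code:

/* Hypothetical sketch: thread every object in the slab onto a singly linked
 * free list. Field names obj_size and free are assumptions; assumes the slab
 * holds at least one object. */
static void init_single(struct slab_cache *cache, struct slab *slab)
{
	size_t nr = (SLAB_SIZE - sizeof(struct slab)) / cache->obj_size;
	char *base = (char *)(slab + 1);	/* objects follow the slab header */
	size_t i;

	slab->free = base;
	for (i = 0; i + 1 < nr; i++)	/* each free object points at the next */
		*(void **)(base + i * cache->obj_size) = base + (i + 1) * cache->obj_size;
	*(void **)(base + (nr - 1) * cache->obj_size) = NULL;	/* terminate the list */
}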
Example #3
int main()
{
    int i;

    init_double(old_dst.d, 4, 3.14);	/* Initialize memory and the registers */

    vind.i[0] = 0;
    vind.i[1] = 2;
    vind.i[2] = 3;
    vind.i[3] = 1;
    vind.i[4] = 7;
    vind.i[5] = 11;
    vind.i[6] = 13;
    vind.i[7] = 10;

    mask.i[0] = 0x80ffffff;
    mask.i[1] = 0x00ffffff;
    mask.i[2] = 0x00ffffff;
    mask.i[3] = 0x80ffffff;
    mask.i[4] = 0x80ffffff;
    mask.i[5] = 0x00ffffff;
    mask.i[6] = 0x80ffffff;
    mask.i[7] = 0x80ffffff;

    for (i = 0; i < 8; i++) {
        expect.f[i] = (mask.i[i] & 0x80000000) ? *(float *)(ptr+(vind.i[i] * 4 + 0))
                                               : old_dst.f[i];
    }

    __asm {     /* VGATHERDPS ymm1, [rax + ymm2_vind*4], ymm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovups YMMa, [old_dst.ms];
        vmovups YMMb, [vind.ms];
        vmovups YMMc, [mask.ms];
        vmovups [YMMDestBefore.ms],  YMMa;
        vmovups [YMMIndexBefore.ms], YMMb;
        vmovups [YMMMaskBefore.ms],  YMMc;
        vgatherdps YMMa, [REG + YMMb*4], YMMc
        vmovups [d.ms], YMMa;
        vmovups [YMMIndexAfter.ms], YMMb;
        vmovups [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(1);

    for (i = 4; i < 8; i++) {
        expect.f[i] = 0;
    }
    __asm {     /* VGATHERDPS xmm1, [rax + xmm2_vind*4], xmm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovups YMMa, [old_dst.ms];
        vmovups YMMb, [vind.ms];
        vmovups YMMc, [mask.ms];
        vmovups [YMMDestBefore.ms],  YMMa;
        vmovups [YMMIndexBefore.ms], YMMb;
        vmovups [YMMMaskBefore.ms],  YMMc;
        vgatherdps XMMa, [REG + XMMb*4], XMMc
        vmovups [d.ms], YMMa;
        vmovups [YMMIndexAfter.ms], YMMb;
        vmovups [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(2);

    for (i = 0; i < 8; i++) {
        expect.f[i] = (mask.i[i] & 0x80000000) ? *(float *)(ptr+(vind.i[i] * 4 + 8))
                                               : old_dst.f[i];
    }
    __asm {     /* VGATHERDPS ymm1, [rax + ymm2_vind*4 + 8], ymm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovups YMMa, [old_dst.ms];
        vmovups YMMb, [vind.ms];
        vmovups YMMc, [mask.ms];
        vmovups [YMMDestBefore.ms],  YMMa;
        vmovups [YMMIndexBefore.ms], YMMb;
        vmovups [YMMMaskBefore.ms],  YMMc;
        vgatherdps YMMa, [REG + YMMb*4 + 8], YMMc
        vmovups [d.ms], YMMa;
        vmovups [YMMIndexAfter.ms], YMMb;
        vmovups [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(3);

    for (i = 4; i < 8; i++) {
        expect.f[i] = 0;
    }
    __asm {     /* VGATHERDPS xmm1, [rax + xmm2_vind*4 + 8], xmm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovups YMMa, [old_dst.ms];
        vmovups YMMb, [vind.ms];
        vmovups YMMc, [mask.ms];
        vmovups [YMMDestBefore.ms],  YMMa;
        vmovups [YMMIndexBefore.ms], YMMb;
        vmovups [YMMMaskBefore.ms],  YMMc;
        vgatherdps XMMa, [REG + XMMb*4 + 8], XMMc
        vmovups [d.ms], YMMa;
        vmovups [YMMIndexAfter.ms], YMMb;
        vmovups [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(4);

    if(!res) PRINTF("gatherdps passed\n");
    return res;
}
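The first YMM gather above can be cross-checked with the AVX2 intrinsic form of VGATHERDPS. A minimal sketch, assuming arr is the float array the inline asm indexes and reusing the test's old_dst/vind/mask unions; this helper is illustrative only and not part of the original test:

#include <immintrin.h>

/* Illustrative cross-check of the first YMM case: gather arr[vind[i]] into
 * lanes whose mask sign bit is set, keeping old_dst in the other lanes. */
static __m256 gather_ps_reference(const float *arr)
{
    __m256  src   = _mm256_loadu_ps(old_dst.f);
    __m256i index = _mm256_loadu_si256((const __m256i *)vind.i);
    __m256  m     = _mm256_castsi256_ps(_mm256_loadu_si256((const __m256i *)mask.i));

    return _mm256_mask_i32gather_ps(src, arr, index, m, 4);   /* scale = 4 bytes */
}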
Example #4
int main(int argc, char* argv []) {
  /* check argument count for a valid range */
  if ( argc != 14 ) {
    print_help();
    return -1;
  }

  char* l_arch = NULL;
  char* l_precision = NULL;
  int l_m = 0;
  int l_n = 0;
  int l_k = 0;
  int l_lda = 0;
  int l_ldb = 0;
  int l_ldc = 0;
  int l_aligned_a = 0;
  int l_aligned_c = 0;
  int l_alpha = 0;
  int l_beta = 0;
  int l_single_precision = 0;
  int l_prefetch = 0;
    
  /* xgemm sizes */
  l_m = atoi(argv[1]);
  l_n = atoi(argv[2]);
  l_k = atoi(argv[3]);
  l_lda = atoi(argv[4]);
  l_ldb = atoi(argv[5]);
  l_ldc = atoi(argv[6]);

  /* some sugar */
  l_alpha = atoi(argv[7]);
  l_beta = atoi(argv[8]);
  l_aligned_a = atoi(argv[9]);
  l_aligned_c = atoi(argv[10]);

  /* arch specific stuff */
  l_arch = argv[11];
  l_precision = argv[13];

  /* set value of prefetch flag */
  if (strcmp("nopf", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_NONE;
  }
  else if (strcmp("pfsigonly", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_SIGNATURE;
  }
  else if (strcmp("BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C;
  }
  else if (strcmp("curAL2", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD;
  }
  else if (strcmp("curAL2_BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD;
  }
  else if (strcmp("AL2", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2;
  }
  else if (strcmp("AL2_BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C;
  }
  else if (strcmp("AL2jpst", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2_JPST;
  }
  else if (strcmp("AL2jpst_BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST;
  }
  else {
    print_help();
    return -1;
  }  

  /* check value of arch flag */
  if ( (strcmp(l_arch, "snb") != 0)    && 
       (strcmp(l_arch, "hsw") != 0)    && 
       (strcmp(l_arch, "knl") != 0)    && 
       (strcmp(l_arch, "skx") != 0)       ) {
    print_help();
    return -1;
  }

  /* check and evaluate precision flag */
  if ( strcmp(l_precision, "SP") == 0 ) {
    l_single_precision = 1;
  } else if ( strcmp(l_precision, "DP") == 0 ) {
    l_single_precision = 0;
  } else {
    print_help();
    return -1;
  }

  /* check alpha */
  if ((l_alpha != -1) && (l_alpha != 1)) {
    print_help();
    return -1;
  }

  /* check beta */
  if ((l_beta != 0) && (l_beta != 1)) {
    print_help();
    return -1;
  }

  libxsmm_xgemm_descriptor l_xgemm_desc;
  if ( l_m < 0 ) { l_xgemm_desc.m = 0; } else {  l_xgemm_desc.m = l_m; }
  if ( l_n < 0 ) { l_xgemm_desc.n = 0; } else {  l_xgemm_desc.n = l_n; }
  if ( l_k < 0 ) { l_xgemm_desc.k = 0; } else {  l_xgemm_desc.k = l_k; }
  if ( l_lda < 0 ) { l_xgemm_desc.lda = 0; } else {  l_xgemm_desc.lda = l_lda; }
  if ( l_ldb < 0 ) { l_xgemm_desc.ldb = 0; } else {  l_xgemm_desc.ldb = l_ldb; }
  if ( l_ldc < 0 ) { l_xgemm_desc.ldc = 0; } else {  l_xgemm_desc.ldc = l_ldc; }
  l_xgemm_desc.alpha = l_alpha;
  l_xgemm_desc.beta = l_beta;
  l_xgemm_desc.trans_a = 'n';
  l_xgemm_desc.trans_b = 'n';
  if (l_aligned_a == 0) {
    l_xgemm_desc.aligned_a = 0;
  } else {
    l_xgemm_desc.aligned_a = 1;
  }
  if (l_aligned_c == 0) {
    l_xgemm_desc.aligned_c = 0;
  } else {
    l_xgemm_desc.aligned_c = 1;
  }
  l_xgemm_desc.single_precision = l_single_precision;
  l_xgemm_desc.prefetch = l_prefetch;

  /* init data structures */
  double* l_a_d; 
  double* l_b_d; 
  double* l_c_d; 
  double* l_c_gold_d;
  float* l_a_f;
  float* l_b_f; 
  float* l_c_f; 
  float* l_c_gold_f;

  if ( l_xgemm_desc.single_precision == 0 ) {
    l_a_d = (double*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(double), 64);
    l_b_d = (double*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(double), 64);
    l_c_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64);
    l_c_gold_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64);
    init_double(l_a_d, l_b_d, l_c_d, l_c_gold_d, &l_xgemm_desc);
  } else {
    l_a_f = (float*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(float), 64);
    l_b_f = (float*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(float), 64);
    l_c_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64);
    l_c_gold_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64);
    init_float(l_a_f, l_b_f, l_c_f, l_c_gold_f, &l_xgemm_desc);
  }

  /* print some output... */
  printf("------------------------------------------------\n");
  printf("RUNNING (%ix%i) X (%ix%i) = (%ix%i)", l_xgemm_desc.m, l_xgemm_desc.k, l_xgemm_desc.k, l_xgemm_desc.n, l_xgemm_desc.m, l_xgemm_desc.n);
  if ( l_xgemm_desc.single_precision == 0 ) {
    printf(", DP\n");
  } else {
    printf(", SP\n");
  }
  printf("------------------------------------------------\n");

  /* run C */
  if ( l_xgemm_desc.single_precision == 0 ) {
    run_gold_double( l_a_d, l_b_d, l_c_gold_d, &l_xgemm_desc );
  } else {
    run_gold_float( l_a_f, l_b_f, l_c_gold_f, &l_xgemm_desc );
  }  

  /* run jit */
  if ( l_xgemm_desc.single_precision == 0 ) {
    run_jit_double( l_a_d, l_b_d, l_c_d, &l_xgemm_desc, l_arch );
  } else {
    run_jit_float( l_a_f, l_b_f, l_c_f, &l_xgemm_desc, l_arch );
  }  
 
  /* test result */
  if ( l_xgemm_desc.single_precision == 0 ) {
    max_error_double( l_c_d, l_c_gold_d, &l_xgemm_desc );
  } else {
    max_error_float( l_c_f, l_c_gold_f, &l_xgemm_desc );
  }

  /* free */
  if ( l_xgemm_desc.single_precision == 0 ) {
    _mm_free(l_a_d);
    _mm_free(l_b_d);
    _mm_free(l_c_d);
    _mm_free(l_c_gold_d);
  } else {
    _mm_free(l_a_f);
    _mm_free(l_b_f);
    _mm_free(l_c_f);
    _mm_free(l_c_gold_f);
  }

  printf("------------------------------------------------\n");
  return 0;
}
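The driver relies on init_double/init_float to populate the column-major buffers (A is lda x k, B is ldb x n, C and C_gold are ldc x n). A minimal sketch of a double-precision initializer under those assumptions; the drand48 fill is illustrative, not the project's actual code:

/* Hypothetical initializer: random A and B, identical starting C and C_gold
 * so the JIT result can be compared against the reference run. */
static void init_double(double* a, double* b, double* c, double* c_gold,
                        const libxsmm_xgemm_descriptor* desc) {
  int i;
  for (i = 0; i < desc->lda * desc->k; i++) a[i] = drand48();
  for (i = 0; i < desc->ldb * desc->n; i++) b[i] = drand48();
  for (i = 0; i < desc->ldc * desc->n; i++) {
    c[i] = drand48();
    c_gold[i] = c[i];
  }
}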
Example #5
int main(int argc, char* argv []) {
  char* l_arch = NULL;
  char* l_precision = NULL;
  int l_m = 0;
  int l_n = 0;
  int l_k = 0;
  int l_lda = 0;
  int l_ldb = 0;
  int l_ldc = 0;
  int l_aligned_a = 0;
  int l_aligned_c = 0;
  int l_alpha = 0;
  int l_beta = 0;
  int l_single_precision = 0;
  libxsmm_prefetch_type l_prefetch = 0;

  libxsmm_gemm_descriptor l_xgemm_desc;
  /* init data structures */
  double* l_a_d; 
  double* l_b_d; 
  double* l_c_d; 
  double* l_c_gold_d;
  float* l_a_f;
  float* l_b_f; 
  float* l_c_f; 
  float* l_c_gold_f;

  /* check argument count for a valid range */
  if ( argc != 15 ) {
    print_help();
    return -1;
  }

  /* xgemm sizes */
  l_m = atoi(argv[1]);
  l_n = atoi(argv[2]);
  l_k = atoi(argv[3]);
  l_lda = atoi(argv[4]);
  l_ldb = atoi(argv[5]);
  l_ldc = atoi(argv[6]);

  /* some sugar */
  l_alpha = atoi(argv[7]);
  l_beta = atoi(argv[8]);
  l_aligned_a = atoi(argv[9]);
  l_aligned_c = atoi(argv[10]);

  /* arch specific stuff */
  l_arch = argv[11];
  l_precision = argv[13];
  g_jit_code_reps = atoi(argv[14]);

  /* set value of prefetch flag */
  if (strcmp("nopf", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_NONE;
  }
  else if (strcmp("pfsigonly", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_SIGONLY;
  }
  else if (strcmp("BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C;
  }
  else if (strcmp("curAL2", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD;
  }
  else if (strcmp("curAL2_BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD;
  }
  else if (strcmp("AL2", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2;
  }
  else if (strcmp("AL2_BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C;
  }
  else if (strcmp("AL2jpst", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2_JPST;
  }
  else if (strcmp("AL2jpst_BL2viaC", argv[12]) == 0) {
    l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST;
  }
  else {
    print_help();
    return -1;
  }  

  /* check value of arch flag */
  if ( (strcmp(l_arch, "snb") != 0)    && 
       (strcmp(l_arch, "hsw") != 0)    && 
       (strcmp(l_arch, "knl") != 0)    && 
       (strcmp(l_arch, "skx") != 0)       ) {
    print_help();
    return -1;
  }

  /* check and evaluate precision flag */
  if ( strcmp(l_precision, "SP") == 0 ) {
    l_single_precision = 1;
  } else if ( strcmp(l_precision, "DP") == 0 ) {
    l_single_precision = 0;
  } else {
    print_help();
    return -1;
  }

  /* check alpha */
  if (l_alpha != 1) {
    print_help();
    return -1;
  }

  /* check beta */
  if ((l_beta != 0) && (l_beta != 1)) {
    print_help();
    return -1;
  }

  LIBXSMM_GEMM_DESCRIPTOR(l_xgemm_desc, 1,
    (0 == l_single_precision ? 0 : LIBXSMM_GEMM_FLAG_F32PREC)
      | (0 != l_aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0)
      | (0 != l_aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0),
    l_m, l_n, l_k, l_lda, l_ldb, l_ldc,
    l_alpha, l_beta, l_prefetch);

  if ( l_single_precision == 0 ) {
    l_a_d = (double*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(double), 64);
    l_b_d = (double*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(double), 64);
    l_c_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64);
    l_c_gold_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64);
    init_double(l_a_d, l_b_d, l_c_d, l_c_gold_d, &l_xgemm_desc);
  } else {
    l_a_f = (float*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(float), 64);
    l_b_f = (float*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(float), 64);
    l_c_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64);
    l_c_gold_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64);
    init_float(l_a_f, l_b_f, l_c_f, l_c_gold_f, &l_xgemm_desc);
  }

  /* print some output... */
  printf("------------------------------------------------\n");
  printf("RUNNING (%ix%i) X (%ix%i) = (%ix%i)", l_xgemm_desc.m, l_xgemm_desc.k, l_xgemm_desc.k, l_xgemm_desc.n, l_xgemm_desc.m, l_xgemm_desc.n);
  if ( l_single_precision == 0 ) {
    printf(", DP\n");
  } else {
    printf(", SP\n");
  }
  printf("------------------------------------------------\n");

  /* run C */
  if ( l_single_precision == 0 ) {
    run_gold_double( l_a_d, l_b_d, l_c_gold_d, &l_xgemm_desc );
  } else {
    run_gold_float( l_a_f, l_b_f, l_c_gold_f, &l_xgemm_desc );
  }  

  /* run jit */
  if ( l_single_precision == 0 ) {
    run_jit_double( l_a_d, l_b_d, l_c_d, l_m, l_n, l_k, l_prefetch, l_arch );
  } else {
    run_jit_float( l_a_f, l_b_f, l_c_f, l_m, l_n, l_k, l_prefetch, l_arch );
  }  
 
  /* test result */
  if ( l_single_precision == 0 ) {
    max_error_double( l_c_d, l_c_gold_d, &l_xgemm_desc );
  } else {
    max_error_float( l_c_f, l_c_gold_f, &l_xgemm_desc );
  }

  /* free */
  if ( l_single_precision == 0 ) {
    _mm_free(l_a_d);
    _mm_free(l_b_d);
    _mm_free(l_c_d);
    _mm_free(l_c_gold_d);
  } else {
    _mm_free(l_a_f);
    _mm_free(l_b_f);
    _mm_free(l_c_f);
    _mm_free(l_c_gold_f);
  }

  printf("------------------------------------------------\n");
  return 0;
}
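For reference, the argument parsing above implies a 14-argument invocation of the form m n k lda ldb ldc alpha beta aligned_a aligned_c arch prefetch precision reps; a plausible example (binary name and values are illustrative):

./gemm_driver 16 16 16 16 16 16 1 1 1 1 hsw nopf DP 100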
Example #6
int main()
{
    int i;

    init_double(old_dst.d, 4, 3.14);	/* Initialize memory and the registers */

    vind.l[0] = 0;
    vind.l[1] = 6;
    vind.l[2] = 3;
    vind.l[3] = 1;

    mask.l[0] = 0x80ffffff00000000LL;
    mask.l[1] = 0x00ffffff10000000LL;
    mask.l[2] = 0x00ffffff00000000LL;
    mask.l[3] = 0x80ffffff10000000LL;

    init_long(&d, 0xdeadbeef);
    for (i = 0; i < 4; i++) {
        expect.d[i] = (mask.l[i] & 0x8000000000000000LL)
            ? *(double *)(ptr+(vind.l[i] * 8 + 0))
            : old_dst.d[i];
    }

    __asm {     /* VGATHERQPD ymm1, [rax + ymm2_vind*8], ymm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovupd YMMa, [old_dst.md];
        vmovupd YMMb, [vind.md];
        vmovupd YMMc, [mask.md];
        vmovupd [YMMDestBefore.ms],  YMMa;
        vmovupd [YMMIndexBefore.ms], YMMb;
        vmovupd [YMMMaskBefore.ms],  YMMc;
        vgatherqpd YMMa, [REG + YMMb*8], YMMc
        vmovupd [d.md], YMMa;
        vmovupd [YMMIndexAfter.ms], YMMb;
        vmovupd [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(1);

    init_long(&d, 0xdeadbeef);
    for (i = 2; i < 4; i++) {
        expect.d[i] = 0;
    }
    __asm {     /* VGATHERQPD xmm1, [rax + xmm2_vind*8], xmm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovupd YMMa, [old_dst.md];
        vmovupd YMMb, [vind.md];
        vmovupd YMMc, [mask.md];
        vmovupd [YMMDestBefore.ms],  YMMa;
        vmovupd [YMMIndexBefore.ms], YMMb;
        vmovupd [YMMMaskBefore.ms],  YMMc;
        vgatherqpd XMMa, [REG + XMMb*8], XMMc
        vmovupd [d.md], YMMa;
        vmovupd [YMMIndexAfter.ms], YMMb;
        vmovupd [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(2);

    init_long(&d, 0xdeadbeef);
    for (i = 0; i < 4; i++) {
        expect.d[i] = (mask.l[i] & 0x8000000000000000LL)
            ? *(double *)(ptr+(vind.l[i] * 8 + 8))
            : old_dst.d[i];
    }
    __asm {     /* VGATHERQPD ymm1, [rax + ymm2_vind*8 + 8], ymm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovupd YMMa, [old_dst.md];
        vmovupd YMMb, [vind.md];
        vmovupd YMMc, [mask.md];
        vmovupd [YMMDestBefore.ms],  YMMa;
        vmovupd [YMMIndexBefore.ms], YMMb;
        vmovupd [YMMMaskBefore.ms],  YMMc;
        vgatherqpd YMMa, [REG + YMMb*8 + 8], YMMc
        vmovupd [d.md], YMMa;
        vmovupd [YMMIndexAfter.ms], YMMb;
        vmovupd [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(3);

    init_long(&d, 0xdeadbeef);
    for (i = 2; i < 4; i++) {
        expect.d[i] = 0;
    }
    __asm {     /* VGATHERQPD xmm1, [rax + xmm2_vind*8 + 8], xmm3_mask */
        lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */
        vmovupd YMMa, [old_dst.md];
        vmovupd YMMb, [vind.md];
        vmovupd YMMc, [mask.md];
        vmovupd [YMMDestBefore.ms],  YMMa;
        vmovupd [YMMIndexBefore.ms], YMMb;
        vmovupd [YMMMaskBefore.ms],  YMMc;
        vgatherqpd XMMa, [REG + XMMb*8 + 8], XMMc
        vmovupd [d.md], YMMa;
        vmovupd [YMMIndexAfter.ms], YMMb;
        vmovupd [YMMMaskAfter.ms],  YMMc;
    }
    printVl("YMM dest  before:     ", YMMDestBefore.l);
    printVl("YMM dest  after:      ", d.l);
    printVl("YMM index before:     ", YMMIndexBefore.l);
    printVl("YMM index after:      ", YMMIndexAfter.l);
    printVl("YMM mask  before:     ", YMMMaskBefore.l);
    printVl("YMM mask  after:      ", YMMMaskAfter.l);
    check_res(4);

    if(!res) PRINTF("gatherqpd passed\n");
    return res;
}
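Like the single-precision test, the first YMM case here can be cross-checked with the AVX2 intrinsic form of VGATHERQPD. A minimal sketch, assuming arr is the double array the inline asm indexes and reusing the test's old_dst/vind/mask unions; illustrative only, not part of the original test:

#include <immintrin.h>

/* Illustrative cross-check of the first YMM case: gather arr[vind[i]] into
 * lanes whose mask sign bit is set, keeping old_dst in the other lanes. */
static __m256d gather_pd_reference(const double *arr)
{
    __m256d src   = _mm256_loadu_pd(old_dst.d);
    __m256i index = _mm256_loadu_si256((const __m256i *)vind.l);
    __m256d m     = _mm256_castsi256_pd(_mm256_loadu_si256((const __m256i *)mask.l));

    return _mm256_mask_i64gather_pd(src, arr, index, m, 8);   /* scale = 8 bytes */
}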