void GCInit_Help(double defaultMinRatio, double defaultMaxRatio, int defaultMinRatioSize, int defaultMaxRatioSize) { MinHeapByte = Min(MinHeapByte, MaxHeapByte); init_double(&MinRatio, defaultMinRatio); init_double(&MaxRatio, defaultMaxRatio); init_int(&MinRatioSize, defaultMinRatioSize); init_int(&MaxRatioSize, defaultMaxRatioSize); fromSpace = Heap_Alloc(MinHeapByte, MaxHeapByte); toSpace = Heap_Alloc(MinHeapByte, MaxHeapByte); Heap_Resize(fromSpace, (MinHeapByte + MaxHeapByte) / 2, 1); Heap_Resize(toSpace, (MinHeapByte + MaxHeapByte) / 2, 1); }
/*
 * Creates a fresh slab for `cache` and returns it.
 *
 * SLAB_SIZE bytes are obtained from navi_critical_malloc.  NOTE(review): the
 * result is used without a NULL check, so that allocator presumably never
 * returns NULL -- confirm.  The slab is then handed to init_double when the
 * cache carries NAVI_SLAB_DOUBLY_LINKED and to init_single otherwise, and its
 * in_use counter is reset to zero.
 */
static struct slab *new_slab(struct slab_cache *cache)
{
	struct slab *fresh = navi_critical_malloc(SLAB_SIZE);

	if (!(cache->flags & NAVI_SLAB_DOUBLY_LINKED)) {
		init_single(cache, fresh);
	} else {
		init_double(cache, fresh);
	}

	fresh->in_use = 0;
	return fresh;
}
int main() { int i; init_double(old_dst.d, 4, 3.14); /* Initialize memory and the registers */ vind.i[0] = 0; vind.i[1] = 2; vind.i[2] = 3; vind.i[3] = 1; vind.i[4] = 7; vind.i[5] = 11; vind.i[6] = 13; vind.i[7] = 10; mask.i[0] = 0x80ffffff; mask.i[1] = 0x00ffffff; mask.i[2] = 0x00ffffff; mask.i[3] = 0x80ffffff; mask.i[4] = 0x80ffffff; mask.i[5] = 0x00ffffff; mask.i[6] = 0x80ffffff; mask.i[7] = 0x80ffffff; for (i = 0; i < 8; i++) { expect.f[i] = (mask.i[i] & 0x80000000) ? *(float *)(ptr+(vind.i[i] * 4 + 0)) : old_dst.f[i]; } __asm { /* VGATHERDPS ymm1, [rax + ymm2_vind*4], ymm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovups YMMa, [old_dst.ms]; vmovups YMMb, [vind.ms]; vmovups YMMc, [mask.ms]; vmovups [YMMDestBefore.ms], YMMa; vmovups [YMMIndexBefore.ms], YMMb; vmovups [YMMMaskBefore.ms], YMMc; vgatherdps YMMa, [REG + YMMb*4], YMMc vmovups [d.ms], YMMa; vmovups [YMMIndexAfter.ms], YMMb; vmovups [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(1); for (i = 4; i < 8; i++) { expect.f[i] = 0; } __asm { /* VGATHERDPS xmm1, [rax + xmm2_vind*4], xmm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovups YMMa, [old_dst.ms]; vmovups YMMb, [vind.ms]; vmovups YMMc, [mask.ms]; vmovups [YMMDestBefore.ms], YMMa; vmovups [YMMIndexBefore.ms], YMMb; vmovups [YMMMaskBefore.ms], YMMc; vgatherdps XMMa, [REG + XMMb*4], XMMc vmovups [d.ms], YMMa; vmovups [YMMIndexAfter.ms], YMMb; vmovups [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: 
", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(2); for (i = 0; i < 8; i++) { expect.f[i] = (mask.i[i] & 0x80000000) ? *(float *)(ptr+(vind.i[i] * 4 + 8)) : old_dst.f[i]; } __asm { /* VGATHERDPS ymm1, [rax + ymm2_vind*4 + 8], ymm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovups YMMa, [old_dst.ms]; vmovups YMMb, [vind.ms]; vmovups YMMc, [mask.ms]; vmovups [YMMDestBefore.ms], YMMa; vmovups [YMMIndexBefore.ms], YMMb; vmovups [YMMMaskBefore.ms], YMMc; vgatherdps YMMa, [REG + YMMb*4 + 8], YMMc vmovups [d.ms], YMMa; vmovups [YMMIndexAfter.ms], YMMb; vmovups [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(3); for (i = 4; i < 8; i++) { expect.f[i] = 0; } __asm { /* VGATHERDPS xmm1, [rax + xmm2_vind*4 + 8], xmm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovups YMMa, [old_dst.ms]; vmovups YMMb, [vind.ms]; vmovups YMMc, [mask.ms]; vmovups [YMMDestBefore.ms], YMMa; vmovups [YMMIndexBefore.ms], YMMb; vmovups [YMMMaskBefore.ms], YMMc; vgatherdps XMMa, [REG + XMMb*4 + 8], XMMc vmovups [d.ms], YMMa; vmovups [YMMIndexAfter.ms], YMMb; vmovups [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(4); if(!res) PRINTF("gatherdps passed\n"); return res; }
int main(int argc, char* argv []) { /* check argument count for a valid range */ if ( argc != 14 ) { print_help(); return -1; } char* l_arch = NULL; char* l_precision = NULL; int l_m = 0; int l_n = 0; int l_k = 0; int l_lda = 0; int l_ldb = 0; int l_ldc = 0; int l_aligned_a = 0; int l_aligned_c = 0; int l_alpha = 0; int l_beta = 0; int l_single_precision = 0; int l_prefetch = 0; /* xgemm sizes */ l_m = atoi(argv[1]); l_n = atoi(argv[2]); l_k = atoi(argv[3]); l_lda = atoi(argv[4]); l_ldb = atoi(argv[5]); l_ldc = atoi(argv[6]); /* some sugar */ l_alpha = atoi(argv[7]); l_beta = atoi(argv[8]); l_aligned_a = atoi(argv[9]); l_aligned_c = atoi(argv[10]); /* arch specific stuff */ l_arch = argv[11]; l_precision = argv[13]; /* set value of prefetch flag */ if (strcmp("nopf", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_NONE; } else if (strcmp("pfsigonly", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_SIGNATURE; } else if (strcmp("BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; } else if (strcmp("curAL2", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; } else if (strcmp("curAL2_BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; } else if (strcmp("AL2", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2; } else if (strcmp("AL2_BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; } else if (strcmp("AL2jpst", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2_JPST; } else if (strcmp("AL2jpst_BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; } else { print_help(); return -1; } /* check value of arch flag */ if ( (strcmp(l_arch, "snb") != 0) && (strcmp(l_arch, "hsw") != 0) && (strcmp(l_arch, "knl") != 0) && (strcmp(l_arch, "skx") != 0) ) { print_help(); return -1; } /* check and evaluate precison flag */ if ( strcmp(l_precision, "SP") == 0 ) { l_single_precision = 1; } else if ( strcmp(l_precision, "DP") == 0 ) { l_single_precision = 0; } else { print_help(); return 
-1; } /* check alpha */ if ((l_alpha != -1) && (l_alpha != 1)) { print_help(); return -1; } /* check beta */ if ((l_beta != 0) && (l_beta != 1)) { print_help(); return -1; } libxsmm_xgemm_descriptor l_xgemm_desc; if ( l_m < 0 ) { l_xgemm_desc.m = 0; } else { l_xgemm_desc.m = l_m; } if ( l_n < 0 ) { l_xgemm_desc.n = 0; } else { l_xgemm_desc.n = l_n; } if ( l_k < 0 ) { l_xgemm_desc.k = 0; } else { l_xgemm_desc.k = l_k; } if ( l_lda < 0 ) { l_xgemm_desc.lda = 0; } else { l_xgemm_desc.lda = l_lda; } if ( l_ldb < 0 ) { l_xgemm_desc.ldb = 0; } else { l_xgemm_desc.ldb = l_ldb; } if ( l_ldc < 0 ) { l_xgemm_desc.ldc = 0; } else { l_xgemm_desc.ldc = l_ldc; } l_xgemm_desc.alpha = l_alpha; l_xgemm_desc.beta = l_beta; l_xgemm_desc.trans_a = 'n'; l_xgemm_desc.trans_b = 'n'; if (l_aligned_a == 0) { l_xgemm_desc.aligned_a = 0; } else { l_xgemm_desc.aligned_a = 1; } if (l_aligned_c == 0) { l_xgemm_desc.aligned_c = 0; } else { l_xgemm_desc.aligned_c = 1; } l_xgemm_desc.single_precision = l_single_precision; l_xgemm_desc.prefetch = l_prefetch; /* init data structures */ double* l_a_d; double* l_b_d; double* l_c_d; double* l_c_gold_d; float* l_a_f; float* l_b_f; float* l_c_f; float* l_c_gold_f; if ( l_xgemm_desc.single_precision == 0 ) { l_a_d = (double*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(double), 64); l_b_d = (double*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(double), 64); l_c_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64); l_c_gold_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64); init_double(l_a_d, l_b_d, l_c_d, l_c_gold_d, &l_xgemm_desc); } else { l_a_f = (float*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(float), 64); l_b_f = (float*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(float), 64); l_c_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64); l_c_gold_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64); 
init_float(l_a_f, l_b_f, l_c_f, l_c_gold_f, &l_xgemm_desc); } /* print some output... */ printf("------------------------------------------------\n"); printf("RUNNING (%ix%i) X (%ix%i) = (%ix%i)", l_xgemm_desc.m, l_xgemm_desc.k, l_xgemm_desc.k, l_xgemm_desc.n, l_xgemm_desc.m, l_xgemm_desc.n); if ( l_xgemm_desc.single_precision == 0 ) { printf(", DP\n"); } else { printf(", SP\n"); } printf("------------------------------------------------\n"); /* run C */ if ( l_xgemm_desc.single_precision == 0 ) { run_gold_double( l_a_d, l_b_d, l_c_gold_d, &l_xgemm_desc ); } else { run_gold_float( l_a_f, l_b_f, l_c_gold_f, &l_xgemm_desc ); } /* run jit */ if ( l_xgemm_desc.single_precision == 0 ) { run_jit_double( l_a_d, l_b_d, l_c_d, &l_xgemm_desc, l_arch ); } else { run_jit_float( l_a_f, l_b_f, l_c_f, &l_xgemm_desc, l_arch ); } /* test result */ if ( l_xgemm_desc.single_precision == 0 ) { max_error_double( l_c_d, l_c_gold_d, &l_xgemm_desc ); } else { max_error_float( l_c_f, l_c_gold_f, &l_xgemm_desc ); } /* free */ if ( l_xgemm_desc.single_precision == 0 ) { _mm_free(l_a_d); _mm_free(l_b_d); _mm_free(l_c_d); _mm_free(l_c_gold_d); } else { _mm_free(l_a_f); _mm_free(l_b_f); _mm_free(l_c_f); _mm_free(l_c_gold_f); } printf("------------------------------------------------\n"); return 0; }
int main(int argc, char* argv []) { char* l_arch = NULL; char* l_precision = NULL; int l_m = 0; int l_n = 0; int l_k = 0; int l_lda = 0; int l_ldb = 0; int l_ldc = 0; int l_aligned_a = 0; int l_aligned_c = 0; int l_alpha = 0; int l_beta = 0; int l_single_precision = 0; libxsmm_prefetch_type l_prefetch = 0; libxsmm_gemm_descriptor l_xgemm_desc; /* init data structures */ double* l_a_d; double* l_b_d; double* l_c_d; double* l_c_gold_d; float* l_a_f; float* l_b_f; float* l_c_f; float* l_c_gold_f; /* check argument count for a valid range */ if ( argc != 15 ) { print_help(); return -1; } /* xgemm sizes */ l_m = atoi(argv[1]); l_n = atoi(argv[2]); l_k = atoi(argv[3]); l_lda = atoi(argv[4]); l_ldb = atoi(argv[5]); l_ldc = atoi(argv[6]); /* some sugar */ l_alpha = atoi(argv[7]); l_beta = atoi(argv[8]); l_aligned_a = atoi(argv[9]); l_aligned_c = atoi(argv[10]); /* arch specific stuff */ l_arch = argv[11]; l_precision = argv[13]; g_jit_code_reps = atoi(argv[14]); /* set value of prefetch flag */ if (strcmp("nopf", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_NONE; } else if (strcmp("pfsigonly", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_SIGONLY; } else if (strcmp("BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; } else if (strcmp("curAL2", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; } else if (strcmp("curAL2_BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; } else if (strcmp("AL2", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2; } else if (strcmp("AL2_BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; } else if (strcmp("AL2jpst", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2_JPST; } else if (strcmp("AL2jpst_BL2viaC", argv[12]) == 0) { l_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; } else { print_help(); return -1; } /* check value of arch flag */ if ( (strcmp(l_arch, "snb") != 0) && (strcmp(l_arch, "hsw") != 0) && (strcmp(l_arch, "knl") != 0) && (strcmp(l_arch, "skx") != 
0) ) { print_help(); return -1; } /* check and evaluate precison flag */ if ( strcmp(l_precision, "SP") == 0 ) { l_single_precision = 1; } else if ( strcmp(l_precision, "DP") == 0 ) { l_single_precision = 0; } else { print_help(); return -1; } /* check alpha */ if ((l_alpha != 1)) { print_help(); return -1; } /* check beta */ if ((l_beta != 0) && (l_beta != 1)) { print_help(); return -1; } LIBXSMM_GEMM_DESCRIPTOR(l_xgemm_desc, 1, (0 == l_single_precision ? 0 : LIBXSMM_GEMM_FLAG_F32PREC) | (0 != l_aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0) | (0 != l_aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0), l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_prefetch); if ( l_single_precision == 0 ) { l_a_d = (double*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(double), 64); l_b_d = (double*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(double), 64); l_c_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64); l_c_gold_d = (double*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(double), 64); init_double(l_a_d, l_b_d, l_c_d, l_c_gold_d, &l_xgemm_desc); } else { l_a_f = (float*)_mm_malloc(l_xgemm_desc.lda * l_xgemm_desc.k * sizeof(float), 64); l_b_f = (float*)_mm_malloc(l_xgemm_desc.ldb * l_xgemm_desc.n * sizeof(float), 64); l_c_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64); l_c_gold_f = (float*)_mm_malloc(l_xgemm_desc.ldc * l_xgemm_desc.n * sizeof(float), 64); init_float(l_a_f, l_b_f, l_c_f, l_c_gold_f, &l_xgemm_desc); } /* print some output... 
*/ printf("------------------------------------------------\n"); printf("RUNNING (%ix%i) X (%ix%i) = (%ix%i)", l_xgemm_desc.m, l_xgemm_desc.k, l_xgemm_desc.k, l_xgemm_desc.n, l_xgemm_desc.m, l_xgemm_desc.n); if ( l_single_precision == 0 ) { printf(", DP\n"); } else { printf(", SP\n"); } printf("------------------------------------------------\n"); /* run C */ if ( l_single_precision == 0 ) { run_gold_double( l_a_d, l_b_d, l_c_gold_d, &l_xgemm_desc ); } else { run_gold_float( l_a_f, l_b_f, l_c_gold_f, &l_xgemm_desc ); } /* run jit */ if ( l_single_precision == 0 ) { run_jit_double( l_a_d, l_b_d, l_c_d, l_m, l_n, l_k, l_prefetch, l_arch ); } else { run_jit_float( l_a_f, l_b_f, l_c_f, l_m, l_n, l_k, l_prefetch, l_arch ); } /* test result */ if ( l_single_precision == 0 ) { max_error_double( l_c_d, l_c_gold_d, &l_xgemm_desc ); } else { max_error_float( l_c_f, l_c_gold_f, &l_xgemm_desc ); } /* free */ if ( l_single_precision == 0 ) { _mm_free(l_a_d); _mm_free(l_b_d); _mm_free(l_c_d); _mm_free(l_c_gold_d); } else { _mm_free(l_a_f); _mm_free(l_b_f); _mm_free(l_c_f); _mm_free(l_c_gold_f); } printf("------------------------------------------------\n"); return 0; }
int main() { int i; init_double(old_dst.d, 4, 3.14); /* Initialize memory and the registers */ vind.l[0] = 0; vind.l[1] = 6; vind.l[2] = 3; vind.l[3] = 1; mask.l[0] = 0x80ffffff00000000LL; mask.l[1] = 0x00ffffff10000000LL; mask.l[2] = 0x00ffffff00000000LL; mask.l[3] = 0x80ffffff10000000LL; init_long(&d, 0xdeadbeef); for (i = 0; i < 4; i++) { expect.d[i] = (mask.l[i] & 0x8000000000000000LL) ? *(double *)(ptr+(vind.l[i] * 8 + 0)) : old_dst.d[i]; } __asm { /* VGATHERDPD ymm1, [rax + ymm2_vind*8], ymm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovupd YMMa, [old_dst.md]; vmovupd YMMb, [vind.md]; vmovupd YMMc, [mask.md]; vmovupd [YMMDestBefore.ms], YMMa; vmovupd [YMMIndexBefore.ms], YMMb; vmovupd [YMMMaskBefore.ms], YMMc; vgatherqpd YMMa, [REG + YMMb*8], YMMc vmovupd [d.md], YMMa; vmovupd [YMMIndexAfter.ms], YMMb; vmovupd [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(1); init_long(&d, 0xdeadbeef); for (i = 2; i < 4; i++) { expect.d[i] = 0; } __asm { /* VGATHERDPD xmm1, [rax + xmm2_vind*8], xmm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovupd YMMa, [old_dst.md]; vmovupd YMMb, [vind.md]; vmovupd YMMc, [mask.md]; vmovupd [YMMDestBefore.ms], YMMa; vmovupd [YMMIndexBefore.ms], YMMb; vmovupd [YMMMaskBefore.ms], YMMc; vgatherqpd XMMa, [REG + XMMb*8], XMMc vmovupd [d.md], YMMa; vmovupd [YMMIndexAfter.ms], YMMb; vmovupd [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", 
YMMMaskAfter.l); check_res(2); init_long(&d, 0xdeadbeef); for (i = 0; i < 4; i++) { expect.d[i] = (mask.l[i] & 0x8000000000000000LL) ? *(double *)(ptr+(vind.l[i] * 8 + 8)) : old_dst.d[i]; } __asm { /* VGATHERDPD ymm1, [rax + ymm2_vind*8 + 8], ymm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovupd YMMa, [old_dst.md]; vmovupd YMMb, [vind.md]; vmovupd YMMc, [mask.md]; vmovupd [YMMDestBefore.ms], YMMa; vmovupd [YMMIndexBefore.ms], YMMb; vmovupd [YMMMaskBefore.ms], YMMc; vgatherqpd YMMa, [REG + YMMb*8 + 8], YMMc vmovupd [d.md], YMMa; vmovupd [YMMIndexAfter.ms], YMMb; vmovupd [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(3); init_long(&d, 0xdeadbeef); for (i = 2; i < 4; i++) { expect.d[i] = 0; } __asm { /* VGATHERDPD xmm1, [rax + xmm2_vind*8 + 8], xmm3_mask */ lea REG, ADDRPTR [arr-4] /* the memory rewrite of the vgather will add 4 back to the address */ vmovupd YMMa, [old_dst.md]; vmovupd YMMb, [vind.md]; vmovupd YMMc, [mask.md]; vmovupd [YMMDestBefore.ms], YMMa; vmovupd [YMMIndexBefore.ms], YMMb; vmovupd [YMMMaskBefore.ms], YMMc; vgatherqpd XMMa, [REG + XMMb*8 + 8], XMMc vmovupd [d.md], YMMa; vmovupd [YMMIndexAfter.ms], YMMb; vmovupd [YMMMaskAfter.ms], YMMc; } printVl("YMM dest before: ", YMMDestBefore.l); printVl("YMM dest after: ", d.l); printVl("YMM index before: ", YMMIndexBefore.l); printVl("YMM index after: ", YMMIndexAfter.l); printVl("YMM mask before: ", YMMMaskBefore.l); printVl("YMM mask after: ", YMMMaskAfter.l); check_res(4); if(!res) PRINTF("gatherqpd passed\n"); return res; }