/*
 * Compile-only exercise of the vmov_n_s16 intrinsic: passes an int16_t
 * scalar to it and assigns the returned int16x4_t to a local.
 *
 * The scalar is left uninitialized, exactly as in the original, and the
 * result is never read.
 * NOTE(review): presumably only the generated instruction matters here,
 * not the value - confirm before "fixing" the uninitialized read.
 */
void test_vmov_ns16 (void)
{
  int16_t scalar;
  int16x4_t lanes;

  lanes = vmov_n_s16 (scalar);
}
/*
 * Multiply mat1 by mat2 with NEON multiply-accumulate and store the result
 * rows in prod.
 *
 * mat1  Read linearly, one element per outer iteration
 *       (matrix_size * matrix_size elements in total).
 * mat2  Read four int16 lanes at a time.  After each pass the read index is
 *       advanced by (output_size - matrix_size), so consecutive passes start
 *       output_size (= 2 * matrix_size) elements apart when matrix_size is a
 *       multiple of 4.
 *       NOTE(review): this implies mat2 is stored with a row stride of
 *       2 * matrix_size - confirm against the caller.
 * prod  Output.  Row r receives the accumulators after the r-th group of
 *       matrix_size outer iterations.  Only the first 4 * (matrix_size / 4)
 *       columns of each row are written.
 *
 * The original version wrote the rows into a malloc'ed scratch buffer that
 * was never NULL-checked, never freed and never copied to prod, so every
 * call leaked memory and discarded the result.  The rows are now stored
 * straight into prod.
 * NOTE(review): confirm prod is the intended destination.
 *
 * MAC4 is defined elsewhere; it is called as
 * MAC4(&acc, &scalar, &data, &out) and its output becomes the new
 * accumulator.
 * NOTE(review): presumably out = acc + scalar * data (widening) - confirm.
 */
void matMult(int16_t mat1[], int16_t mat2[], int32_t prod[matrix_size][matrix_size])
{
    int output_size = 2 * matrix_size;
    int l, k;
    int row = 0;                      /* next row of prod to be written */
    int16x4_t data1;
    int32x4_t mac_output[output_size / 4];
    int32x4_t MAC_addvalue[output_size / 4];
    int16x4_t constant_value;
    unsigned int index_input = 0;

    /* Clear the accumulators for the first output row. */
    for (k = 0; k < matrix_size / 4; k++) {
        MAC_addvalue[k] = vmovq_n_s32(0);
    }

    /* Perform the multiplication. */
    for (l = 0; l < matrix_size * matrix_size; l++) {
        /* Put the current mat1 element in all four lanes. */
        constant_value = vmov_n_s16(mat1[l]);

        for (k = 0; k < matrix_size / 4; k++) {
            data1 = vld1_s16(&mat2[index_input]);
            MAC4(&MAC_addvalue[k], &constant_value, &data1, &mac_output[k]);
            MAC_addvalue[k] = mac_output[k];
            index_input += 4;
        }
        /* Skip the remainder of the output_size-wide row of mat2. */
        index_input += output_size - matrix_size;

        /* Every matrix_size iterations one output row is complete. */
        if ((l + 1) % matrix_size == 0) {
            index_input = 0;
            for (k = 0; k < matrix_size / 4; k++) {
                vst1q_s32(&prod[row][4 * k], MAC_addvalue[k]);
                /* Reset the accumulator for the next output row. */
                MAC_addvalue[k] = vmovq_n_s32(0);
            }
            row++;
        }
    }
}
/*
 * Codegen test for vmov_n_s16: returns the vector the intrinsic produces
 * from the scalar v1.
 *
 * The two directive comments inside the body are matched by FileCheck
 * against the compiler output, so their text must stay exactly as written.
 * Each one is kept on its own line: a line comment runs to the end of the
 * line and would otherwise swallow the return statement and the closing
 * brace.
 */
int16x4_t test_vmov_n_s16(int16_t v1) {
  // CHECK: test_vmov_n_s16
  return vmov_n_s16(v1);
  // CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
}