static void ult_nn_convolution_fixedpoint_comp_both_initialize_matrices( int16_t* input, int16_t* output, int32_t* biases, int16_t* kernel, int16_t* input_ref, int16_t* output_ref, int32_t* biases_ref, int16_t* kernel_ref, uint_least32_t num_output_feature_maps, uint_least32_t num_input_feature_maps, uint_least32_t output_feature_map_width, uint_least32_t output_feature_map_height, uint_least32_t input_feature_map_width, uint_least32_t input_feature_map_height, uint_least32_t kernel_width, uint_least32_t kernel_height, uint_least32_t center_x, uint_least32_t center_y ) { uint_least32_t input_size = input_feature_map_width * input_feature_map_height * num_input_feature_maps * sizeof(int16_t); int16_t * inputT = (int16_t*)_mm_malloc(input_size, 4096); uint_least32_t kernel_size = num_input_feature_maps * num_output_feature_maps * kernel_width * kernel_height * sizeof(int16_t); int16_t * weightT = (int16_t*)_mm_malloc(kernel_size, 4096); uint32_t IFMBlock = 16; if (num_input_feature_maps == 4) IFMBlock = 4; uint32_t OFMBlock = 32; uint32_t OFMpBlock = 2; if (num_output_feature_maps == 3072 && num_input_feature_maps == 1024) { IFMBlock = 16; OFMBlock = 384; } for (uint_least32_t input_map = 0; input_map < num_input_feature_maps; input_map++) { uint_least32_t element = 0; int16_t value = input_map * 0x0100; for (uint_least32_t row = 0; row < input_feature_map_height; row++) { for (uint_least32_t column = 0; column < input_feature_map_width; column++) { uint_least32_t offset; //ult_nn_convolution_fixedpoint_comp_optimized_set_input_value(inputT, input_feature_map_width, num_input_feature_maps, column, row, input_map, value, offset); ult_nn_merge_convolution_fixedpoint_naive_set_input_value(inputT, input_feature_map_width, input_feature_map_height, column, row, input_map, value, offset); ult_nn_merge_convolution_fixedpoint_naive_set_input_value(input_ref, input_feature_map_width, input_feature_map_height, column, row, input_map, value, offset); value++; } } } for (uint_least32_t outmapa = 0; outmapa < num_output_feature_maps; outmapa++) { for (uint_least32_t input_map = 0; input_map < num_input_feature_maps; input_map++) { uint_least32_t element = 0; int16_t value = input_map * 0x0100 + outmapa * 0x2000; for (uint_least32_t row = 0; row < kernel_height; row++) { for (uint_least32_t column = 0; column < kernel_width; column++) { element++; uint_least32_t offset; //ult_nn_convolution_fixedpoint_comp_optimized_set_kernel_value ult_nn_merge_convolution_fixedpoint_naive_set_kernel_value(kernel, kernel_width, kernel_height, num_input_feature_maps, column, row, input_map, outmapa, value, offset); ult_nn_merge_convolution_fixedpoint_naive_set_kernel_value(kernel_ref, kernel_width, kernel_height, num_input_feature_maps, column, row, input_map, outmapa, value, offset); value++; } } } } for (uint_least32_t outmapa = 0; outmapa < num_output_feature_maps; outmapa++) { for (uint_least32_t row = 0; row < output_feature_map_height + 2 * center_y; row++) { for (uint_least32_t column = 0; column < output_feature_map_width + 2 * center_x; column++) { uint32_t index = column + row * (output_feature_map_width + 2 * center_x) + outmapa * (output_feature_map_width + 2 * center_x) * (output_feature_map_height + 2 * center_y); output[index] = 0; output_ref[index] = 0; } } } for (uint_least32_t outmapa = 0; outmapa < num_output_feature_maps; outmapa++) { biases[outmapa] = outmapa; biases_ref[outmapa] = outmapa; } //prepare right input layout for naive implementation for (size_t y = 0; y < input_feature_map_height; y++) { for (size_t x = 0; x < input_feature_map_width; x++) { for (size_t z = 0; z < num_input_feature_maps; z++) { input[z + x * num_input_feature_maps + y * num_input_feature_maps * input_feature_map_width] = inputT[z * input_feature_map_width * input_feature_map_height + y * input_feature_map_height + x]; } } } _mm_free(inputT); }
static void ult_nn_merge_convolution_fixedpoint_both_initialize_matrices( int16_t* input, int16_t* output, int32_t* biases, int16_t* kernel, int16_t* input_ref, int16_t* output_ref, int32_t* biases_ref, int16_t* kernel_ref, uint_least32_t num_output_feature_maps, uint_least32_t num_input_feature_maps, uint_least32_t output_feature_map_width, uint_least32_t output_feature_map_height, uint_least32_t input_feature_map_width, uint_least32_t input_feature_map_height, uint_least32_t kernel_width, uint_least32_t kernel_height, uint_least32_t center_x, uint_least32_t center_y ) { uint_least32_t input_size = input_feature_map_width * input_feature_map_height * num_input_feature_maps * sizeof(int16_t); int16_t * inputT = (int16_t*)_mm_malloc(input_size, 4096); uint_least32_t kernel_size = num_input_feature_maps * num_output_feature_maps * kernel_width * kernel_height * sizeof(int16_t); int16_t * weightT = (int16_t*)_mm_malloc(kernel_size, 4096); uint32_t IFMBlock = 8; if (num_input_feature_maps == 4) IFMBlock = 4; uint32_t OFMBlock = 32; uint32_t OFMpBlock = 2; if (num_output_feature_maps == 3072 && num_input_feature_maps == 1024) { IFMBlock = 8; OFMBlock = 384; } for (uint_least32_t input_map = 0; input_map < num_input_feature_maps; input_map++) { uint_least32_t element = 0; int16_t value = input_map * 0x0100; for (uint_least32_t row = 0; row < input_feature_map_height; row++) { for (uint_least32_t column = 0; column < input_feature_map_width; column++) { uint_least32_t offset; //ult_nn_convolution_optimized_set_input_value(inputT, input_feature_map_width, num_input_feature_maps, column, row, input_map, value, offset); ult_nn_merge_convolution_fixedpoint_naive_set_input_value(inputT, input_feature_map_width, input_feature_map_height, column, row, input_map, value, offset); ult_nn_merge_convolution_fixedpoint_naive_set_input_value(input_ref, input_feature_map_width, input_feature_map_height, column, row, input_map, value, offset); value++; } } } int16_t value = 0; for (uint_least32_t outmapa = 0; outmapa < num_output_feature_maps; outmapa++) { for (uint_least32_t input_map = 0; input_map < num_input_feature_maps; input_map++) { //uint_least32_t element = 0; //int16_t value = input_map * 0x0100 + outmapa * 0x2000; for (uint_least32_t row = 0; row < kernel_height; row++) { for (uint_least32_t column = 0; column < kernel_width; column++) { //element++; uint_least32_t offset; //ult_nn_convolution_optimized_set_kernel_value ult_nn_merge_convolution_fixedpoint_naive_set_kernel_value(kernel, kernel_width, kernel_height, num_input_feature_maps, column, row, input_map, outmapa, value, offset); ult_nn_merge_convolution_fixedpoint_naive_set_kernel_value(kernel_ref, kernel_width, kernel_height, num_input_feature_maps, column, row, input_map, outmapa, value, offset); value++; } } } } for (uint_least32_t outmapa = 0; outmapa < num_output_feature_maps; outmapa++) { for (uint_least32_t row = 0; row < output_feature_map_height + 2 * center_y; row++) { for (uint_least32_t column = 0; column < output_feature_map_width + 2 * center_x; column++) { uint32_t index = column + row * (output_feature_map_width + 2 * center_x) + outmapa * (output_feature_map_width + 2 * center_x) * (output_feature_map_height + 2 * center_y); output[index] = 0; output_ref[index] = 0; } } } for (uint_least32_t outmapa = 0; outmapa < num_output_feature_maps; outmapa++) { biases[outmapa] = outmapa; biases_ref[outmapa] = outmapa; } //prepare right input layout for zxy for (size_t y = 0; y < input_feature_map_height; y++) { for (size_t x = 0; x < input_feature_map_width; x++) { for (size_t z = 0; z < num_input_feature_maps; z++) { input[z + x * num_input_feature_maps + y * num_input_feature_maps * input_feature_map_width] = inputT[z * input_feature_map_width * input_feature_map_height + y * input_feature_map_height + x]; } } } //prepare right input layout for naive implementation /*for (uint32_t i = 0; i < num_input_feature_maps / IFMBlock; i++) for (uint32_t j = 0; j < input_feature_map_width * input_feature_map_height; j++) for (uint32_t n = 0; n < IFMBlock; n++) input[n + j * IFMBlock + i * input_feature_map_width * input_feature_map_height * IFMBlock] = inputT[n * input_feature_map_width * input_feature_map_height + j + i * input_feature_map_width * input_feature_map_height * IFMBlock]; const uint32_t ItrIn = num_input_feature_maps / 2; const uint32_t ItrOut = num_output_feature_maps / OFMBlock; for (uint32_t k = 0; k < ItrOut; k++) for (uint32_t i = 0; i < kernel_width * kernel_height; i++) for (uint32_t j = 0; j < ItrIn; j++) for (uint32_t n = 0; n < OFMBlock; n++) for (uint32_t m = 0; m < 2; m++) weightT[m + 2 * n + 2 * OFMBlock * j + i * 2 * OFMBlock * ItrIn + k * 2 * OFMBlock * ItrIn * kernel_width * kernel_height] = kernel[m * kernel_width * kernel_height + n * num_input_feature_maps * kernel_width * kernel_height + 2 * j * kernel_width * kernel_height + k * OFMBlock * num_input_feature_maps * kernel_width * kernel_height + i];*/ _mm_free(inputT); //_mm_free(weightT); }