void ar_stride3_vminall_u8_neon(const uint8_t* a, uint32_t n, uint8_t* line_results) { uint8x16x3_t a_loaded; uint8x16_t line0_min = vdupq_n_u8(255); uint8x16_t line1_min = vdupq_n_u8(255); uint8x16_t line2_min = vdupq_n_u8(255); line_results[0] = 255; line_results[1] = 255; line_results[2] = 255; uint8_t line0_array[16]; uint8_t line1_array[16]; uint8_t line2_array[16]; for (uint32_t i = 0; i < n; i += 48) { a_loaded = vld3q_u8(&(a[i])); line0_min = vminq_u8(a_loaded.val[0], line0_min); line1_min = vminq_u8(a_loaded.val[1], line1_min); line2_min = vminq_u8(a_loaded.val[2], line2_min); } vst1q_u8(line0_array, line0_min); vst1q_u8(line1_array, line1_min); vst1q_u8(line2_array, line2_min); for (uint32_t i = 0; i < 16; i++) { line_results[0] = ar_min_u8(line_results[0], line0_array[i]); line_results[1] = ar_min_u8(line_results[1], line1_array[i]); line_results[2] = ar_min_u8(line_results[2], line2_array[i]); } }
/* Combine the RGB palette and the separate alpha palette (trans_alpha) into
 * a single interleaved RGBA palette written to png_ptr->riffled_palette.
 * Only a bit depth of 8 is accepted; anything else is reported through
 * png_error.
 */
void png_riffle_palette_rgba(png_structrp png_ptr, png_row_infop row_info)
{
   /* Alpha lane used for every entry until the real values are patched in. */
   const uint8x16_t opaque = vdupq_n_u8(0xff);
   png_const_colorp rgb_entries = png_ptr->palette;
   png_bytep rgba_out = png_ptr->riffled_palette;
   png_const_bytep alpha_entries = png_ptr->trans_alpha;
   int alpha_count = png_ptr->num_trans;
   int entry_count;
   int idx;

   if (row_info->bit_depth != 8)
   {
      png_error(png_ptr, "bit_depth must be 8 for png_riffle_palette_rgba");
      return;
   }

   entry_count = 1 << row_info->bit_depth;

   /* Pass 1: convert sixteen palette entries per iteration from RGB to
    * RGBA, with every alpha byte set to fully opaque.
    */
   for (idx = 0; idx < entry_count; idx += 16)
   {
      uint8x16x3_t rgb = vld3q_u8((png_const_bytep)(rgb_entries + idx));
      uint8x16x4_t rgba;

      rgba.val[0] = rgb.val[0];
      rgba.val[1] = rgb.val[1];
      rgba.val[2] = rgb.val[2];
      rgba.val[3] = opaque;

      vst4q_u8(rgba_out + idx * 4, rgba);
   }

   /* Pass 2: overwrite the alpha byte of the first num_trans entries with
    * the values from the alpha palette.
    */
   for (idx = 0; idx < alpha_count; ++idx)
      rgba_out[idx * 4 + 3] = alpha_entries[idx];
}
/* Exercises the vld3q_u8 intrinsic with a null address and stores the result
 * in a local that is never read. NOTE(review): this looks like a
 * compile-only check -- running it would load from address 0.
 */
void test_vld3Qu8 (void)
{
  uint8x16x3_t loaded = vld3q_u8 (0);
}
// If we have ARM NEON support, pick off 48 bytes at a time: while (srclen >= 48) { uint8x16x3_t str; uint8x16x4_t res; // Load 48 bytes and deinterleave: str = vld3q_u8((uint8_t *)c); // Reshuffle: res = enc_reshuffle(str); // Translate reshuffled bytes to the Base64 alphabet: res = enc_translate(res); // Interleave and store result: vst4q_u8((uint8_t *)o, res); c += 48; // 3 * 16 bytes of input o += 64; // 4 * 16 bytes of output outl += 64; srclen -= 48; }
/* Type-suffix-free spelling of vld3q_u8: forwards ptr to the intrinsic and
 * returns the three 16-byte vectors it produces.
 */
inline uint8x16x3_t vld3q(const u8 * ptr)
{
    const uint8x16x3_t planes = vld3q_u8(ptr);
    return planes;
}