// Type-dispatched spelling of the NEON widening move for a vector of eight
// signed 8-bit lanes: forwards to vmovl_s8, which sign-extends every lane to
// 16 bits and returns the resulting int16x8_t.
inline int16x8_t vmovl(const int8x8_t & v)
{
    const int16x8_t widened = vmovl_s8(v);
    return widened;
}
// Loads four consecutive signed 8-bit values starting at ptr and widens each
// one (with sign extension) to 32 bits, returning them as a v_int32x4.
//
// ptr must reference at least four readable bytes; no alignment is required.
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    // Build the 32-bit word from individual byte reads instead of
    // dereferencing ptr through an `unsigned*`: that cast violated strict
    // aliasing, could perform a misaligned 32-bit load, and silently dropped
    // the const qualifier. Each byte goes through `unsigned char` first so a
    // negative value does not sign-extend into the neighbouring bytes.
    // ptr[i] lands in bits [8*i, 8*i+7], i.e. in lane i of the vector below.
    const unsigned packed =
          (unsigned)(unsigned char)ptr[0]
        | ((unsigned)(unsigned char)ptr[1] << 8)
        | ((unsigned)(unsigned char)ptr[2] << 16)
        | ((unsigned)(unsigned char)ptr[3] << 24);

    // Only the low four lanes carry data; the upper four are zero.
    int8x8_t v0 = vcreate_s8(packed);
    // 8 -> 16 bit sign extension, then keep the four lanes that hold data.
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
    // 16 -> 32 bit sign extension.
    return v_int32x4(vmovl_s16(v1));
}