// Load string: __m128i str = _mm_loadu_si128((__m128i *)c); // The input consists of six character sets in the Base64 alphabet, // which we need to map back to the 6-bit values they represent. // There are three ranges, two singles, and then there's the rest. // // # From To Add Characters // 1 [43] [62] +19 + // 2 [47] [63] +16 / // 3 [48..57] [52..61] +4 0..9 // 4 [65..90] [0..25] -65 A..Z // 5 [97..122] [26..51] -71 a..z // (6) Everything else => invalid input const __m128i set1 = CMPEQ(str, '+'); const __m128i set2 = CMPEQ(str, '/'); const __m128i set3 = RANGE(str, '0', '9'); const __m128i set4 = RANGE(str, 'A', 'Z'); const __m128i set5 = RANGE(str, 'a', 'z'); __m128i delta = REPLACE(set1, 19); delta = _mm_or_si128(delta, REPLACE(set2, 16)); delta = _mm_or_si128(delta, REPLACE(set3, 4)); delta = _mm_or_si128(delta, REPLACE(set4, -65)); delta = _mm_or_si128(delta, REPLACE(set5, -71)); // Check for invalid input: if any of the delta values are zero, // fall back on bytewise code to do error checking and reporting: if (_mm_movemask_epi8(CMPEQ(delta, 0))) { break;
// Load string: __m256i str = _mm256_loadu_si256((__m256i *)c); // The input consists of six character sets in the Base64 alphabet, // which we need to map back to the 6-bit values they represent. // There are three ranges, two singles, and then there's the rest. // // # From To Add Characters // 1 [43] [62] +19 + // 2 [47] [63] +16 / // 3 [48..57] [52..61] +4 0..9 // 4 [65..90] [0..25] -65 A..Z // 5 [97..122] [26..51] -71 a..z // (6) Everything else => invalid input const __m256i set1 = CMPEQ(str, '+'); const __m256i set2 = CMPEQ(str, '/'); const __m256i set3 = RANGE(str, '0', '9'); const __m256i set4 = RANGE(str, 'A', 'Z'); const __m256i set5 = RANGE(str, 'a', 'z'); __m256i delta = REPLACE(set1, 19); delta = _mm256_or_si256(delta, REPLACE(set2, 16)); delta = _mm256_or_si256(delta, REPLACE(set3, 4)); delta = _mm256_or_si256(delta, REPLACE(set4, -65)); delta = _mm256_or_si256(delta, REPLACE(set5, -71)); // Check for invalid input: if any of the delta values are zero, // fall back on bytewise code to do error checking and reporting: if (_mm256_movemask_epi8(CMPEQ(delta, 0))) { break;
// Map four vectors of 6-bit values (0..63) onto their Base64 alphabet
// characters. Returns the translated vectors; the input is not modified.
//
// The alphabet consists of five contiguous runs, each with its own offset:
//
//   Set  Value     Character  Offset  Step vs. previous set  Alphabet
//   0    0..25     65..90     +65     +65                    A..Z
//   1    26..51    97..122    +71      +6                    a..z
//   2    52..61    48..57      -4     -75                    0..9
//   3    62        43         -19     -15                    +
//   4    63        47         -16      +3                    /
//
// Rather than selecting one offset per lane, the offsets are applied
// cumulatively: every lane gets the set 0 offset, and each further step is
// added only to the lanes that belong to that set or a later one.
static inline uint8x16x4_t
enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t out;

	// The same sequence is applied independently to each of the four vectors.
	for (int i = 0; i < 4; i++) {

		// Baseline: treat every lane as set 0 and add 'A'.
		out.val[i] = vaddq_u8(in.val[i], vdupq_n_u8(65));

		// Lanes above 25 are in sets 1..4: step from +65 to +71.
		// NOTE(review): CMPGT/CMPEQ are presumably per-lane compares producing
		// a mask, and REPLACE presumably yields the constant in masked lanes
		// and zero elsewhere -- confirm against the macro definitions.
		out.val[i] = vaddq_u8(out.val[i], REPLACE(CMPGT(in.val[i], 25), 6));

		// Lanes above 51 are in sets 2..4: step from +71 to -4.
		out.val[i] = vsubq_u8(out.val[i], REPLACE(CMPGT(in.val[i], 51), 75));

		// Lanes above 61 are in sets 3..4: step from -4 to -19.
		out.val[i] = vsubq_u8(out.val[i], REPLACE(CMPGT(in.val[i], 61), 15));

		// Lanes equal to 63 are in set 4 only: step from -19 to -16.
		out.val[i] = vaddq_u8(out.val[i], REPLACE(CMPEQ(in.val[i], 63), 3));
	}

	return out;
}