/**
 * blockmix_salsa8(Bin, Bout, X, r):
 * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  Bin holds 128r bytes of input
 * and Bout receives 128r bytes of output; X is a 64-byte scratch block.  The
 * buffers are indexed in __m128i elements, four of which make up one 64-byte
 * block.  Even-numbered output blocks land in the first half of Bout and
 * odd-numbered ones in the second half.
 */
static void
blockmix_salsa8(const __m128i * Bin, __m128i * Bout, __m128i * X, size_t r)
{
	const __m128i * in_blk = Bin;		/* next 64-byte input block */
	__m128i * lo_half = Bout;		/* destination for Y_0, Y_2, ... */
	__m128i * hi_half = &Bout[r * 4];	/* destination for Y_1, Y_3, ... */
	size_t remaining;

	/* Step 1: seed X with the last input block, B_{2r - 1}. */
	blkcpy(X, &Bin[8 * r - 4], 64);

	/* Step 2: consume the 2r input blocks, two per iteration. */
	for (remaining = r; remaining != 0; remaining--) {
		/* Step 3 (even block): X <-- H(X xor B_i). */
		blkxor(X, in_blk, 64);
		salsa20_8(X);

		/* Steps 4 and 6: store Y_i directly at its shuffled position. */
		blkcpy(lo_half, X, 64);

		/* Step 3 (odd block): X <-- H(X xor B_{i+1}). */
		blkxor(X, in_blk + 4, 64);
		salsa20_8(X);

		/* Steps 4 and 6: odd results go to the upper half. */
		blkcpy(hi_half, X, 64);

		in_blk += 8;
		lo_half += 4;
		hi_half += 4;
	}
}
/**
 * blockmix_salsa8(Bin, Bout, X, r):
 * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  Bin holds 128r bytes of input
 * and Bout receives 128r bytes of output; X is a 64-byte scratch block.  The
 * buffers are indexed in 32-bit words, sixteen of which make up one 64-byte
 * block.  Even-numbered output blocks land in the first half of Bout and
 * odd-numbered ones in the second half.
 */
static void
blockmix_salsa8(uint32_t * Bin, uint32_t * Bout, uint32_t * X, size_t r)
{
	size_t pair;

	/* Step 1: seed X with the last input block, B_{2r - 1}. */
	blkcpy(X, Bin + (2 * r - 1) * 16, 64);

	/* Step 2: walk the 2r input blocks as r (even, odd) pairs. */
	for (pair = 0; pair < r; pair++) {
		/* Word offset of input block B_{2 * pair}. */
		const size_t in_off = pair * 32;
		/* Word offset of output slot `pair` within either half of Bout. */
		const size_t out_off = pair * 16;

		/* Step 3 (even block): X <-- H(X xor B_i). */
		blkxor(X, Bin + in_off, 64);
		salsa20_8(X);

		/* Steps 4 and 6: store Y_i directly at its shuffled position. */
		blkcpy(Bout + out_off, X, 64);

		/* Step 3 (odd block): X <-- H(X xor B_{i+1}). */
		blkxor(X, Bin + in_off + 16, 64);
		salsa20_8(X);

		/* Steps 4 and 6: odd results go to the upper half. */
		blkcpy(Bout + out_off + r * 16, X, 64);
	}
}
/**
 * blockmix_salsa8(B, Y, r):
 * Compute B = BlockMix_{salsa20/8, r}(B) in place.  B must be 128r bytes long;
 * Y is scratch space of the same size that receives the un-shuffled blocks
 * before they are copied back into B.
 */
static void
blockmix_salsa8(uint8_t * B, uint8_t * Y, size_t r)
{
	uint8_t X[64];
	const size_t nblocks = 2 * r;
	size_t n;

	/* Step 1: seed X with the last 64-byte block, B_{2r - 1}. */
	blkcpy(X, B + (nblocks - 1) * 64, 64);

	/* Step 2: chain through every block in order. */
	for (n = 0; n < nblocks; n++) {
		const size_t off = n * 64;

		/* Step 3: X <-- H(X xor B_n). */
		blkxor(X, B + off, 64);
		salsa20_8(X);

		/* Step 4: Y_n <-- X. */
		blkcpy(Y + off, X, 64);
	}

	/*
	 * Step 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}).
	 * Even-numbered blocks fill the lower half of B first, then the
	 * odd-numbered blocks fill the upper half.
	 */
	for (n = 0; n < r; n++)
		blkcpy(B + n * 64, Y + (2 * n) * 64, 64);
	for (n = 0; n < r; n++)
		blkcpy(B + (r + n) * 64, Y + (2 * n + 1) * 64, 64);
}
/*
 * cpu and memory intensive function to transform a 80 byte buffer into a
 * 32 byte output.
 *
 * scratchpad size needs to be at least
 * 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes.
 * This routine itself rounds the scratchpad address up to the next 64-byte
 * boundary and then uses 1024 entries of 128 bytes each from that point.
 *
 * input      - source words handed to PBKDF2_SHA256_80_128() and again to
 *              PBKDF2_SHA256_80_128_32() (presumably the 80-byte buffer
 *              mentioned above -- confirm against those helpers).
 * scratchpad - caller-provided working memory; need not be aligned.
 *
 * Returns whatever PBKDF2_SHA256_80_128_32() returns for (input, X).
 *
 * The 128-byte state is XORed one 32-bit word at a time.  Accessing the
 * uint32_t state through uint64_t pointers would violate the C effective-type
 * (strict aliasing) rules and assume 8-byte alignment of X; the word-wise XOR
 * produces the same bytes without either problem.
 */
static uint32_t
scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad)
{
	uint32_t * V;
	uint32_t X[32];
	uint32_t i;
	uint32_t j;
	uint32_t k;
	const uint32_t * Vj;

	/* Round the scratchpad address up to a multiple of 64 bytes. */
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* Derive the initial 128-byte state from the input. */
	PBKDF2_SHA256_80_128(input, X);

	/* Fill phase: record the state in V[0..1023], mixing after each store. */
	for (i = 0; i < 1024; i += 2) {
		memcpy(&V[i * 32], X, 128);

		salsa20_8(&X[0], &X[16]);
		salsa20_8(&X[16], &X[0]);

		memcpy(&V[(i + 1) * 32], X, 128);

		salsa20_8(&X[0], &X[16]);
		salsa20_8(&X[16], &X[0]);
	}

	/* Mix phase: 1024 data-dependent lookups, two per loop iteration. */
	for (i = 0; i < 1024; i += 2) {
		/* Entry index is the low 10 bits of the first word of the second half. */
		j = X[16] & 1023;
		Vj = &V[j * 32];
		for (k = 0; k < 32; k++)
			X[k] ^= Vj[k];

		salsa20_8(&X[0], &X[16]);
		salsa20_8(&X[16], &X[0]);

		j = X[16] & 1023;
		Vj = &V[j * 32];
		for (k = 0; k < 32; k++)
			X[k] ^= Vj[k];

		salsa20_8(&X[0], &X[16]);
		salsa20_8(&X[16], &X[0]);
	}

	/* Compress the final state together with the input into the result. */
	return PBKDF2_SHA256_80_128_32(input, X);
}