// Run 16 parallel hashes void sha2Hash(vecu16 pointers, int totalBlocks, vecu16 outHashes) { // Initial H values vecu16 A = __builtin_vp_makevectori(0x6A09E667); vecu16 B = __builtin_vp_makevectori(0xBB67AE85); vecu16 C = __builtin_vp_makevectori(0x3C6EF372); vecu16 D = __builtin_vp_makevectori(0xA54FF53A); vecu16 E = __builtin_vp_makevectori(0x510E527F); vecu16 F = __builtin_vp_makevectori(0x9B05688C); vecu16 G = __builtin_vp_makevectori(0x1F83D9AB); vecu16 H = __builtin_vp_makevectori(0x5BE0CD19); for (int i = 0; i < totalBlocks; i++) { vecu16 W[64]; for (int index = 0; index < 16; index++) { W[index] = __builtin_vp_gather_loadi(pointers); pointers += __builtin_vp_makevectori(4); } for (int index = 16; index < 64; index++) W[index] = SIG1(W[index - 2]) + W[index - 7] + SIG0(W[index - 15]) + W[index - 16]; for (int round = 0; round < 64; round++) { vecu16 temp1 = H + SIG1(E) + CH(E, F, G) + __builtin_vp_makevectori(K[round]) + W[round]; vecu16 temp2 = SIG0(A) + MA(A, B, C); H = G; G = F; F = E; E = D + temp1; D = C; C = B; B = A; A = temp1 + temp2; } } // doesn't add padding or length fields to end... __builtin_vp_scatter_storei(outHashes, A); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(4), B); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(8), C); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(12), D); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(16), E); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(20), F); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(24), G); __builtin_vp_scatter_storei(outHashes + __builtin_vp_makevectori(28), H); }
/*
 * Run the 64-round compression step over one 64-byte block and fold the
 * result into the chaining state.
 *
 * context - hash context; State[0..7] is read as the starting chaining value
 *           and updated in place
 * data    - the 64 bytes of the block; read as sixteen big-endian 32-bit
 *           words, never written
 */
static void Sha256_Transform(Sha256ContextType *context, uint8_t * data)
{
    uint32_t a,b,c,d,e,f,g,h,i,j,t1,t2,m[64];

    /*
     * Assemble the first 16 schedule words. Each byte is widened to uint32_t
     * BEFORE shifting: a bare uint8_t promotes to signed int, and shifting a
     * value >= 0x80 left by 24 would overflow into the sign bit (undefined
     * behaviour).
     */
    for (i = 0, j = 0; i < 16; ++i, j += 4)
    {
        m[i] = ((uint32_t)data[j] << 24)
             | ((uint32_t)data[j+1] << 16)
             | ((uint32_t)data[j+2] << 8)
             | ((uint32_t)data[j+3]);
    }

    /* Expand to the full 64-word message schedule. */
    for (; i < 64; i++)
    {
        m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
    }

    /* Working variables start from the current chaining value. */
    a = context->State[0];
    b = context->State[1];
    c = context->State[2];
    d = context->State[3];
    e = context->State[4];
    f = context->State[5];
    g = context->State[6];
    h = context->State[7];

    for (i = 0; i < 64; i++)
    {
        t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
        t2 = EP0(a) + MAJ(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
    }

    /* Feed-forward: add the round output back into the chaining state. */
    context->State[0] += a;
    context->State[1] += b;
    context->State[2] += c;
    context->State[3] += d;
    context->State[4] += e;
    context->State[5] += f;
    context->State[6] += g;
    context->State[7] += h;
}
/*
 * Compress the block currently held in ctx->M into the chaining state
 * ctx->H[0..7].
 *
 * The first 64 bytes of ctx->M are copied verbatim into the first 16 schedule
 * words (no byte reordering is done here), the schedule is expanded to 64
 * words, 64 rounds are run, and the result is added back into ctx->H.
 */
static void SHA256_transform( SHA256_ctx* ctx )
{
    unsigned int sched[ 64 ];  /* message schedule */
    unsigned int v[ 8 ];       /* working variables, v[0]=a ... v[7]=h */
    unsigned int sum1, sum2;
    int step, idx;

    for ( idx = 0; idx < 8; idx++ )
    {
        v[ idx ] = ctx->H[ idx ];
    }

    memcpy( sched, ctx->M, 64 );
    for ( step = 16; step < 64; step++ )
    {
        sched[ step ] = sig1(sched[step-2]) + sched[step-7] + sig0(sched[step-15]) + sched[step-16];
    }

    for ( step = 0; step < 64; step++ )
    {
        sum1 = v[7] + SIG1(v[4]) + Ch(v[4],v[5],v[6]) + K[step] + sched[step];
        sum2 = SIG0(v[0]) + Maj(v[0],v[1],v[2]);

        /* Rotate the working variables one slot down: h=g, g=f, ..., b=a. */
        for ( idx = 7; idx > 0; idx-- )
        {
            v[ idx ] = v[ idx - 1 ];
        }

        v[4] += sum1;          /* e = d + T1 (slot 4 now holds the old d) */
        v[0] = sum1 + sum2;    /* a = T1 + T2 */
    }

    for ( idx = 0; idx < 8; idx++ )
    {
        ctx->H[ idx ] += v[ idx ];
    }
}
// Run 16 parallel hashes void sha2Hash(vecu16_t pointers, int totalBlocks, vecu16_t outHashes) { // Initial H values vecu16_t h0 = __builtin_nyuzi_makevectori(0x6A09E667); vecu16_t h1 = __builtin_nyuzi_makevectori(0xBB67AE85); vecu16_t h2 = __builtin_nyuzi_makevectori(0x3C6EF372); vecu16_t h3 = __builtin_nyuzi_makevectori(0xA54FF53A); vecu16_t h4 = __builtin_nyuzi_makevectori(0x510E527F); vecu16_t h5 = __builtin_nyuzi_makevectori(0x9B05688C); vecu16_t h6 = __builtin_nyuzi_makevectori(0x1F83D9AB); vecu16_t h7 = __builtin_nyuzi_makevectori(0x5BE0CD19); for (int i = 0; i < totalBlocks; i++) { vecu16_t w[64]; for (int index = 0; index < 16; index++) { w[index] = __builtin_nyuzi_gather_loadi(pointers); pointers += __builtin_nyuzi_makevectori(4); } for (int index = 16; index < 64; index++) w[index] = SIG1(w[index - 2]) + w[index - 7] + SIG0(w[index - 15]) + w[index - 16]; vecu16_t a = h0; vecu16_t b = h1; vecu16_t c = h2; vecu16_t d = h3; vecu16_t e = h4; vecu16_t f = h5; vecu16_t g = h6; vecu16_t h = h7; for (int round = 0; round < 64; round++) { vecu16_t temp1 = h + SIG1(e) + CH(e, f, g) + __builtin_nyuzi_makevectori(K[round]) + w[round]; vecu16_t temp2 = SIG0(a) + MA(a, b, c); h = g; g = f; f = e; e = d + temp1; d = c; c = b; b = a; a = temp1 + temp2; } h0 += a; h1 += b; h2 += c; h3 += d; h4 += e; h5 += f; h6 += g; h7 += h; } // doesn't add padding or length fields to end... 
__builtin_nyuzi_scatter_storei(outHashes, h0); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(4), h1); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(8), h2); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(12), h3); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(16), h4); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(20), h5); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(24), h6); __builtin_nyuzi_scatter_storei(outHashes + __builtin_nyuzi_makevectori(28), h7); }
void *subchain_hash_thread(void *pthread_arg) { /* * subchain_hash_thread() * Given a hash value and its position (index) in the chain, determine * the corresponding final hash. This is then used to find a candidate * chain in the main table */ PthreadData *mydata; mydata = (PthreadData*)pthread_arg; // TableHeader *header = mydata->header; TableEntry *entry = mydata->entry; uint thread_idx = mydata->entry_idx; uint8_t M[64]; // Initial string - zero padded and length in bits appended uint32_t W[64]; // Expanded Key Schedule uint32_t H[8]; // Hash int i = 0; // working index uint64_t l = 0; // length of message uint8_t B[64]; // store initial and working passwords here to protect original data uint8_t *in,*out; int reduction_idx,count; if(thread_idx<LINKS) { // set up a pointer to input_hash & final_hash TableEntry *data = entry + thread_idx; // move target hash to H for(i=0;i<8;i++) H[i] = data->input_hash[i]; reduction_idx = thread_idx; count = LINKS - thread_idx - 1; while(count > 0) { // Reduce hash to zero terminated password in B reduce_hash(H,B,reduction_idx); // copy zero terminated string from B to M and note length in = B; out = M; i=0; l=0; while(in[i] != 0x00) { out[i] = in[i]; i++; l++; } out[i++] = 0x80; // zero fill while(i < 56) out[i++]=0x00; /* * The hash algorithm uses 32 bit (4 byte words). * On little endian machines (Intel) the constants * are stored lsb->msb internally. To match this the WORDS * of the input message are subject to endian swap. 
*/ uint8_t *x = M; int y; for(y=0; y<14; y++) { // long swap *(x+3) ^= *x; *x ^= *(x+3); *(x+3) ^= *x; // short swap *(x+2) ^= *(x+1); *(x+1) ^= *(x+2); *(x+2) ^= *(x+1); // move pointer up x += 4; } // need a 32 bit pointer to store length as 2 words l*=8; //length in bits uint32_t *p = (uint32_t*)&l; uint32_t *q = (uint32_t*)&out[i]; *q = *(p+1); *(q+1) = *p; // The 64 bytes in the message block can now be used // to initialise the 64 4-byte words in the message schedule W[64] // REUSE i uint8_t *r = (uint8_t*)M; uint8_t *s = (uint8_t*)W; for(i=0;i<64;i++) s[i] = r[i]; for(i=16;i<64;i++) W[i] = SIG1(W[i-2]) + W[i-7] + SIG0(W[i-15]) + W[i-16]; // set initial hash values initHash(H); // Now calc the hash sha256_transform(W,H); // update the counters reduction_idx += 1; count -= 1; } // while(count>0) // copy comp_hash to final hash for(i=0;i<8;i++) data->final_hash[i] = H[i]; data->sublinks = thread_idx; } // if(thread_idx<LINKS) void *void_ptr=NULL; return(void_ptr); } // hash_calculate