/*
 * md5_final() - finish an MD5 computation and emit the digest.
 * @desc: shash descriptor whose context holds the running MD5 state
 * @out:  receives the 16-byte digest
 *
 * Appends the 0x80 marker plus zero padding so the message ends 8
 * bytes short of a block boundary, stores the 64-bit bit count in the
 * last two words, runs the final transform(s), converts the digest to
 * little-endian byte order, and wipes the context.  Always returns 0.
 */
static int md5_final(struct shash_desc *desc, u8 *out)
{
	struct md5_state *mctx = shash_desc_ctx(desc);
	const unsigned int offset = mctx->byte_count & 0x3f;	/* bytes used in current block */
	char *p = (char *)mctx->block + offset;
	int padding = 56 - (offset + 1);	/* negative => length field will not fit */

	*p++ = 0x80;	/* mandatory first padding byte */
	if (padding < 0) {
		/* No room for the 8-byte length: zero-fill the rest of this
		 * block, flush it, then pad a fresh block instead. */
		memset(p, 0x00, padding + sizeof (u64));
		md5_transform_helper(mctx);
		p = (char *)mctx->block;
		padding = 56;
	}

	memset(p, 0, padding);
	/* Message length in bits, low-order word first (host order). */
	mctx->block[14] = mctx->byte_count << 3;
	mctx->block[15] = mctx->byte_count >> 29;
	/* Swap only the data words; words 14/15 were just written in host
	 * order and must stay that way for md5_transform(). */
	le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
	                  sizeof(u64)) / sizeof(u32));
	md5_transform(mctx->hash, mctx->block);
	/* The MD5 digest is defined as little-endian bytes. */
	cpu_to_le32_array(mctx->hash, sizeof(mctx->hash) / sizeof(u32));
	memcpy(out, mctx->hash, sizeof(mctx->hash));
	memset(mctx, 0, sizeof(*mctx));	/* wipe message-dependent state */

	return 0;
}
Beispiel #2
0
/* Finalize an MD5 computation: append the 0x80 marker and zero
 * padding, store the 64-bit bit count in the last 8 bytes, run the
 * final transform(s), and copy the state words into the digest.
 * NOTE(review): the bit count is stored via memcpy in host byte
 * order — assumes a little-endian target; confirm. */
void md5_finish(struct md_context *context) {
  int used = context->count & 63;
  uint64_t total_bits = context->count * 8;

  /* Mandatory first padding byte, then zeros to the block end. */
  context->buffer[used] = 0x80;
  used++;
  memset(&context->buffer[used], 0, 64 - used);

  /* If the length field no longer fits, flush this block and
   * continue padding in a fresh all-zero block. */
  if (used > 56) {
    md5_transform(context);
    memset(context->buffer, 0, 64);
  }

  /* Length in bits occupies the final 8 bytes. */
  memcpy(&context->buffer[56], &total_bits, sizeof total_bits);
  md5_transform(context);

  memcpy(context->digest, context->h, 16);
}
/* Feed the context's buffered block to the core transform.  In this
 * variant no little-endian conversion is applied to the block first
 * (byte order is handled elsewhere). */
static inline void md5_transform_helper(struct md5_ctx *ctx)
{
#ifdef DEBUG_MD5
	printk("md5_transform_helper running\n");
#endif /* DEBUG_MD5 */

	md5_transform(ctx->hash, ctx->block);
}
Beispiel #4
0
/* Hash a 128-bit IPv6 destination address against the boot-time
 * secret with one MD5 transform; return 32 bits of the result. */
__u32 secure_ipv6_id(const __be32 daddr[4])
{
	__u32 digest[4];

	memcpy(digest, daddr, sizeof(digest));
	md5_transform(digest, net_secret);

	return digest[0];
}
/*
 * One-shot single-block MD5 over a short message.
 *
 * sum      receives the 16-byte digest (four native-endian words).
 * data     input bytes.
 * data_len input length in bytes; reduced modulo 55 below, so longer
 *          inputs are silently truncated.  NOTE(review): confirm the
 *          "%= 55" clamp is the intended contract for callers.
 */
void md5_hash (unsigned char * sum, unsigned char * data, int data_len)
{
	struct md5_context context;
	int i;
	int tail;

	context.A = md5_A;
	context.B = md5_B;
	context.C = md5_C;
	context.D = md5_D;

	/* Single-block implementation: only lengths < 55 fit alongside
	 * the 0x80 marker and the length word. */
	data_len %= 55;

	/* BUGFIX: the original cleared M with "M[i] ^= M[i]", which reads
	 * indeterminate values (undefined behavior, CERT EXP33-C).
	 * Plain assignment is well-defined and just as fast. */
	for (i = 0; i < 16; i++)
		context.M[i] = 0;

	/* Copy whole 32-bit words of the message.
	 * NOTE(review): word-casting an unsigned char* assumes suitable
	 * alignment and aliasing — confirm on the target platform. */
	for (i = 0; i < data_len / 4; i++)
		context.M[i] = ((unsigned int *) data)[i];

	/* Copy the 0-3 leftover bytes, then the mandatory 0x80 marker. */
	i <<= 2;
	for (tail = data_len & 3; tail > 0; tail--) {
		((unsigned char *) context.M)[i] = data[i];
		i++;
	}
	((unsigned char *) context.M)[i] = 0x80;

	/* Message length in bits; fits in one word since data_len < 55,
	 * and M[15] is already zero. */
	context.M[14] = data_len << 3;

	md5_transform(&context);

	*((uint32_t *) &(sum[ 0])) = context.A;
	*((uint32_t *) &(sum[ 4])) = context.B;
	*((uint32_t *) &(sum[ 8])) = context.C;
	*((uint32_t *) &(sum[12])) = context.D;
}
Beispiel #6
0
/*
 * get_random_int() - cheap pseudo-random 32-bit value.
 *
 * Seeds a word with the CPU cycle counter and mixes it through
 * md5_transform().  NOTE(review): `random` is left uninitialized on
 * purpose (extra stack "entropy"), and single __u32 locals are passed
 * where md5_transform() elsewhere in this file takes word arrays —
 * as written this reads adjacent stack memory; confirm the callee's
 * expected buffer sizes before relying on this.
 */
static unsigned int get_random_int(void)
{
	__u32 hash, random;
	unsigned int ret;

	hash = get_cycles();	/* cycle counter as seed */
	md5_transform(&hash, &random);
	ret = hash;

	return ret;
}
Beispiel #7
0
/* Absorb chunk_size bytes into the MD5 state, buffering any partial
 * 64-byte block in context->buffer between calls. */
void md5_update(struct md_context *context, uint8_t *chunk, uint64_t chunk_size) {
  int used = context->count & 63;
  int room = 64 - used;

  context->count += chunk_size;

  /* Input fits in the current partial block — just stash it. */
  if (chunk_size + used < 64) {
    memcpy(&context->buffer[used], chunk, chunk_size);
    return;
  }

  /* Top up and flush the partial block. */
  memcpy(&context->buffer[used], chunk, room);
  md5_transform(context);
  chunk += room;
  chunk_size -= room;

  /* Consume full blocks directly. */
  for (; chunk_size >= 64; chunk += 64, chunk_size -= 64) {
    memcpy(context->buffer, chunk, 64);
    md5_transform(context);
  }

  /* Keep the remainder for the next call. */
  memcpy(context->buffer, chunk, chunk_size);
}
Beispiel #8
0
/*
 * md5_final() - finalize the file-scope MD5 state (the `buffer`,
 * `length` and `state` globals) and return a pointer to the 16-byte
 * digest held in `state`.
 *
 * NOTE(review): the bit count is computed as 8 * length and only the
 * low 32 bits are stored (the high word is forced to zero), so inputs
 * of 512 MiB or more would wrap — confirm `length` is bounded by
 * callers.  The UINT4 stores also cast the byte buffer to a wider
 * type (alignment/strict-aliasing concern).
 */
static unsigned char *
md5_final()
{
  int i, buflen = length & 63;	/* bytes used in the current block */

  buffer[buflen++] = 0x80;	/* mandatory first padding byte */
  memset (buffer+buflen, 0, 64 - buflen);
  if (buflen > 56)
    {
      /* No room for the 8-byte length field: flush this block and
       * continue in a fresh all-zero block. */
      md5_transform (buffer);
      memset (buffer, 0, 64);
      buflen = 0;
    }
  
  *(UINT4 *) (buffer + 56) = cpu_to_le32 (8 * length);	/* bit count (LE) */
  *(UINT4 *) (buffer + 60) = 0;	/* high word of the bit count */
  md5_transform (buffer);

  /* Convert the state words to little-endian digest byte order. */
  for (i = 0; i < 4; i++)
    state[i] = cpu_to_le32 (state[i]);
  return (unsigned char *) state;
}
Beispiel #9
0
/* Ephemeral-port hash: one MD5 transform of (saddr, daddr, dport)
 * keyed with the boot-time secret. */
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
	u32 state[MD5_DIGEST_WORDS];

	state[0] = (__force u32)saddr;
	state[1] = (__force u32)daddr;
	state[2] = (__force u32)dport ^ net_secret[14];
	state[3] = net_secret[15];
	md5_transform(state, net_secret);
	return state[0];
}
Beispiel #10
0
/* IP ID generator: MD5 of the destination address mixed with three
 * words of the boot-time secret. */
__u32 secure_ip_id(__be32 daddr)
{
	u32 in[MD5_DIGEST_WORDS];

	in[0] = (__force __u32) daddr;
	in[1] = net_secret[13];
	in[2] = net_secret[14];
	in[3] = net_secret[15];
	md5_transform(in, net_secret);
	return in[0];
}
Beispiel #11
0
/* exact copy of tcp_sequence number function from net/core/secure.c */
/* Initial TCP sequence number: MD5 of the connection 4-tuple keyed
 * with the boot-time secret, scaled by seq_scale(). */
__u32 secure_tcp_sequence_number(u32 saddr, u32 daddr,
				 u16 sport, u16 dport)
{
	u32 state[MD5_DIGEST_WORDS];

	state[0] = saddr;
	state[1] = daddr;
	state[2] = ((u32)sport << 16) + dport;	/* pack the port pair */
	state[3] = net_secret[15];
	md5_transform(state, net_secret);
	return seq_scale(state[0]);
}
/*
 * md5_final() - skarnet-style MD5 finalization.
 * @ctx:    schedule holding buf (state words), in (block buffer) and
 *          bits (64-bit message length in bits, low word in bits[0])
 * @digest: receives the 16-byte digest
 */
void md5_final (MD5Schedule_ref ctx, char *digest /* 16 chars */)
{
  register unsigned int count = (ctx->bits[0] >> 3) & 0x3F ;	/* bytes in current block */
  register unsigned char *p = ctx->in + count ;
  *p++ = 0x80;	/* mandatory first padding byte */
  count = 63 - count ;	/* free bytes remaining after the marker */
  if (count < 8)
  {
    /* Not enough room for the 8-byte length field: zero-fill,
     * byte-swap the whole block, flush it, and start over with a
     * zeroed block. */
    byte_zero(p, count) ;
    uint32_little_endian(ctx->in, 16) ;
    md5_transform(ctx->buf, (uint32 *)ctx->in) ;
    byte_zero(ctx->in, 56) ;
  }
  else byte_zero(p, count - 8) ;	/* pad up to the length field */
  uint32_little_endian(ctx->in, 14) ;	/* swap the 14 data words only */

  /* Append the bit count (already split into two 32-bit words). */
  byte_copy(ctx->in + 56, 4, (char *)&ctx->bits[0]) ;
  byte_copy(ctx->in + 60, 4, (char *)&ctx->bits[1]) ;

  md5_transform(ctx->buf, (uint32 *)ctx->in) ;
  uint32_little_endian((char *)ctx->buf, 4) ;	/* digest words to LE bytes */
  byte_copy(digest, 16, (char *)ctx->buf) ;
}
Beispiel #13
0
/* Initial TCP sequence number (__be32 variant): MD5 of the 4-tuple
 * keyed with the boot-time secret, scaled by seq_scale(). */
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
				 __be16 sport, __be16 dport)
{
	u32 in[MD5_DIGEST_WORDS];

	in[0] = (__force u32)saddr;
	in[1] = (__force u32)daddr;
	in[2] = ((__force u16)sport << 16) + (__force u16)dport;
	in[3] = net_secret[15];
	md5_transform(in, net_secret);
	return seq_scale(in[0]);
}
/* Generate HMAC-MD5 intermediate Hash */
/*
 * sa_hmac_md5_get_pad() - generate HMAC-MD5 intermediate hashes.
 * @key:    HMAC key bytes
 * @key_sz: key length in bytes (must be <= MD5_MESSAGE_BYTES)
 * @ipad:   receives the MD5 state after absorbing key ^ 0x36..36
 * @opad:   receives the MD5 state after absorbing key ^ 0x5c..5c
 */
static void sa_hmac_md5_get_pad(const u8 *key, u16 key_sz, u32 *ipad, u32 *opad)
{
	u8 k_ipad[MD5_MESSAGE_BYTES];
	u8 k_opad[MD5_MESSAGE_BYTES];
	int i;

	for (i = 0; i < key_sz; i++) {
		k_ipad[i] = key[i] ^ 0x36;
		k_opad[i] = key[i] ^ 0x5c;
	}
	/* Remainder is the bare pad constant (XOR with 0).
	 * BUGFIX: bound the loop by the buffers' own size
	 * (MD5_MESSAGE_BYTES), not SHA_MESSAGE_BYTES — if the two
	 * constants ever differ this overflowed the stack arrays. */
	for (; i < MD5_MESSAGE_BYTES; i++) {
		k_ipad[i] = 0x36;
		k_opad[i] = 0x5c;
	}

	/* MD5 over k_ipad */
	md5_init(ipad);
	md5_transform(ipad, (u32 *)k_ipad);

	/* MD5 over k_opad.  BUGFIX: the original transformed into ipad
	 * again, leaving *opad as a freshly-initialized (wrong) state. */
	md5_init(opad);
	md5_transform(opad, (u32 *)k_opad);
}
/*
 * Final wrapup - pad to a 64-byte boundary with the bit pattern
 * 1 0* followed by the 64-bit count of bits processed, run the last
 * transform(s), emit the 16-byte digest, and wipe the context.
 */
static inline void
md5_final (md5_ctx_t * ctx, u8 out[16])
{
	/* Number of bytes in ctx->in */
	const int offset = ctx->byte_count & 0x3f;

	/* p points after last content byte in ctx->in */
	u8 *p = (u8 *) ctx->in + offset;

	/* Bytes of padding needed to make 56 bytes (-8..55) */
	int padding = 56 - (offset + 1);

	/* Set the first byte of padding to 0x80.  There is always room. */
	*p++ = 0x80;

	if (padding < 0) {	/* Padding forces an extra block */
		memset (p, 0x00, padding + sizeof (u64));

		md5_transform_helper (ctx);

		p = (u8 *) ctx->in;
		padding = 56;
	}

	/* pad remaining bytes w/ 0x00 */
	memset (p, 0x00, padding);

	/* Append length in bits and transform */
	ctx->in[14] = ctx->byte_count << 3;	/* low order word first */
	ctx->in[15] = ctx->byte_count >> 29;

	/* keep the appended bit-count words in host order! */
	le32_to_cpu_array (ctx->in,
			   (sizeof (ctx->in) - sizeof (u64)) / sizeof (u32));
	md5_transform (ctx->hash, ctx->in);

	/* convert digest buf from host to LE byteorder */
	cpu_to_le32_array (ctx->hash, sizeof (ctx->hash) / sizeof (u32));

	/* copy to output buffer */
	memcpy (out, ctx->hash, sizeof (ctx->hash));

	/* wipe context.  BUGFIX: the original used sizeof (ctx) — the
	 * size of the POINTER — leaving nearly all message-dependent
	 * state in memory.  sizeof (*ctx) wipes the whole structure. */
	memset (ctx, 0, sizeof (*ctx));
}
Beispiel #16
0
/* Ephemeral-port hash for IPv6: saddr seeds the MD5 state, while
 * daddr and dport perturb a per-call copy of the boot-time secret. */
u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
			       __be16 dport)
{
	u32 key[MD5_MESSAGE_BYTES / 4];
	u32 state[MD5_DIGEST_WORDS];
	u32 w;

	memcpy(state, saddr, 16);

	for (w = 0; w < MD5_MESSAGE_BYTES / 4; w++)
		key[w] = net_secret[w];
	for (w = 0; w < 4; w++)
		key[w] += (__force u32) daddr[w];
	key[4] += (__force u32)dport;

	md5_transform(state, key);

	return state[0];
}
Beispiel #17
0
/* IPv6 TCP ISN: the 128-bit saddr becomes the MD5 state; daddr and
 * the port pair perturb a copy of the boot-time secret. */
__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr,
				   __be16 sport, __be16 dport)
{
	u32 key[MD5_MESSAGE_BYTES / 4];
	u32 state[MD5_DIGEST_WORDS];
	u32 w;

	memcpy(state, saddr, 16);

	for (w = 0; w < MD5_MESSAGE_BYTES / 4; w++)
		key[w] = net_secret[w];
	for (w = 0; w < 4; w++)
		key[w] += daddr[w];
	key[4] += ((__force u16)sport << 16) + (__force u16)dport;

	md5_transform(state, key);

	return seq_scale(state[0]);
}
Beispiel #18
0
/* DCCP sequence number: 48-bit value derived from the 4-tuple via a
 * keyed MD5 transform, offset by the wall clock in nanoseconds. */
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
				__be16 sport, __be16 dport)
{
	u32 state[MD5_DIGEST_WORDS];
	u64 seq;

	state[0] = (__force u32)saddr;
	state[1] = (__force u32)daddr;
	state[2] = ((__force u16)sport << 16) + (__force u16)dport;
	state[3] = net_secret[15];
	md5_transform(state, net_secret);

	/* 64 bits of hash, time-shifted, masked down to 48 bits. */
	seq = ((u64)state[1] << 32) | state[0];
	seq += ktime_to_ns(ktime_get_real());
	seq &= (1ull << 48) - 1;
	return seq;
}
Beispiel #19
0
/*! \fn static int md5_final(struct shash_desc *desc, u8 *out)
 *  \ingroup IFX_MD5_FUNCTIONS
 *  \brief Compute the final MD5 value: pad the buffered block, append
 *         the byte-swapped bit count, run the (hardware-assisted)
 *         transform, then read the digest from the DEU hash result
 *         registers.
 *  \param desc linux crypto shash descriptor holding the md5 context
 *  \param out  receives the 16-byte digest
 *  \return always 0
 */
static int md5_final(struct shash_desc *desc, u8 *out)
{
    struct md5_ctx *mctx = shash_desc_ctx(desc);
    const unsigned int offset = mctx->byte_count & 0x3f;  /* bytes in current block */
    char *p = (char *)mctx->block + offset;
    int padding = 56 - (offset + 1);  /* negative => length field won't fit */
    volatile struct deu_hash_t *hashs = (struct deu_hash_t *) HASH_START;  /* DEU result registers */
    unsigned long flag;  /* NOTE(review): unused — leftover from an irq-save variant? */

    *p++ = 0x80;  /* mandatory first padding byte */
    if (padding < 0) {
        /* No room for the 8-byte length: pad out, flush, re-pad. */
        memset(p, 0x00, padding + sizeof (u64));
        md5_transform_helper(mctx);
        p = (char *)mctx->block;
        padding = 56;
    }

    memset(p, 0, padding);
    /* 64-bit bit count, byte-swapped for the hardware engine. */
    mctx->block[14] = endian_swap(mctx->byte_count << 3);
    mctx->block[15] = endian_swap(mctx->byte_count >> 29);

#if 0
    le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
                      sizeof(u64)) / sizeof(u32));
#endif

    md5_transform(mctx, mctx->hash, mctx->block);                                                 

    CRTCL_SECT_START;

    /* Read the digest back out of the hardware result registers. */
    *((u32 *) out + 0) = endian_swap (hashs->D1R);
    *((u32 *) out + 1) = endian_swap (hashs->D2R);
    *((u32 *) out + 2) = endian_swap (hashs->D3R);
    *((u32 *) out + 3) = endian_swap (hashs->D4R);

    CRTCL_SECT_END;

    // Wipe context
    memset(mctx, 0, sizeof(*mctx));

    return 0;
}
Beispiel #20
0
/* IPv6 DCCP sequence number: MD5 of saddr keyed by a secret copy
 * perturbed with daddr and the port pair, time-shifted, 48 bits. */
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
				  __be16 sport, __be16 dport)
{
	u32 key[MD5_MESSAGE_BYTES / 4];
	u32 state[MD5_DIGEST_WORDS];
	u64 seq;
	u32 w;

	memcpy(state, saddr, 16);

	for (w = 0; w < MD5_MESSAGE_BYTES / 4; w++)
		key[w] = net_secret[w];
	for (w = 0; w < 4; w++)
		key[w] += daddr[w];
	key[4] += ((__force u16)sport << 16) + (__force u16)dport;

	md5_transform(state, key);

	seq = ((u64)state[1] << 32) | state[0];
	seq += ktime_to_ns(ktime_get_real());
	seq &= (1ull << 48) - 1;
	return seq;
}
/* Convert the buffered block from little-endian wire order to host
 * order, then mix it into the hash state. */
static inline void md5_transform_helper(struct md5_ctx *ctx)
{
	const unsigned int nwords = sizeof(ctx->block) / sizeof(u32);

	le32_to_cpu_array(ctx->block, nwords);
	md5_transform(ctx->hash, ctx->block);
}
/* Byte-swap ctx->in to host order and run one MD5 compression step. */
static inline void
md5_transform_helper (md5_ctx_t * ctx)
{
	const unsigned int n_words = sizeof (ctx->in) / sizeof (u32);

	le32_to_cpu_array (ctx->in, n_words);
	md5_transform (ctx->hash, ctx->in);
}
/*
 * md5_final() - finalize the MD5 computation on the Danube DEU.
 * @ctx: struct md5_ctx holding block buffer, byte count and state
 * @out: receives the 16-byte digest, read from the hash registers
 *
 * Pads the buffered block, appends the little-endian bit count, runs
 * the transform, then (with interrupts off) reads the digest from the
 * hardware result registers and wipes the context.
 */
static void md5_final(void *ctx, u8 *out)
{
	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
	struct md5_ctx				*mctx = ctx;
	const unsigned int			offset = mctx->byte_count & 0x3f;
	char						*p = (char *) mctx->block + offset;
	int							padding = 56 - (offset + 1);
	unsigned long			    flag;
#ifndef ONLY_IN_MEM
	/* BUGFIX: cast through the declared register-block type; the
	 * original cast to "struct hash_t" (a different tag). */
	volatile struct deu_hash_t	*hashs = (struct deu_hash_t *) HASH_START;
#else
	struct deu_hash_t			*hashs = NULL;
	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

	hashs = (struct deu_hash_t *) kmalloc(sizeof(*hashs), GFP_KERNEL);
	if (!hashs)	/* BUGFIX: allocation can fail; bail out instead of oopsing */
		return;
	memset(hashs, 0, sizeof(*hashs));
#endif
#ifdef DEBUG_MD5
	printk("md5_final running\n");

	//hexdump(mctx,sizeof(*mctx));
	printk("block before and after transform\n");
	hexdump(mctx->block, sizeof(mctx->block));
#endif //DEBUG_MD5
	*p++ = 0x80;	/* mandatory first padding byte */

	if(padding < 0)
	{
		/* Length field does not fit: pad out, flush, re-pad. */
		memset(p, 0x00, padding + sizeof(u64));
		md5_transform_helper(mctx);
		p = (char *) mctx->block;
		padding = 56;
	}

	memset(p, 0, padding);

#ifdef DEBUG_MD5
	hexdump(&mctx->byte_count, sizeof(mctx->byte_count));
#endif //DEBUG_MD5
	/* 64-bit bit count in little-endian word order. */
	mctx->block[14] = cpu_to_le32(mctx->byte_count << 3);
	mctx->block[15] = cpu_to_le32(mctx->byte_count >> 29);

#ifdef DEBUG_MD5
	hexdump(mctx->block, sizeof(mctx->block));
#endif //DEBUG_MD5

	//le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
	//                  sizeof(u64)) / sizeof(u32));
	md5_transform(mctx->hash, mctx->block);

	local_irq_save(flag);

#ifdef CONFIG_CRYPTO_DEV_DANUBE_DMA
	//wait for processing
	while(hashs->controlr.BSY)
	{
		// this will not take long
	}
#endif
	/* Digest comes from the hardware result registers. */
	*((u32 *) out + 0) = le32_to_cpu(hashs->D1R);
	*((u32 *) out + 1) = le32_to_cpu(hashs->D2R);
	*((u32 *) out + 2) = le32_to_cpu(hashs->D3R);
	*((u32 *) out + 3) = le32_to_cpu(hashs->D4R);

	hashs->controlr.SM = 0; // switch off again for next dma transfer

	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
#ifdef CONFIG_CRYPTO_DEV_DANUBE_DMA
	struct dma_device_info	*dma_device;
	_ifx_deu_device			*pDev = ifx_deu;
	/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/

	dma_device = pDev->dma_device;

	//if(dma_device) dma_device_release(dma_device);
	if(dma_device)
	{
		dma_device_unregister(pDev->dma_device);
		dma_device_release(pDev->dma_device);
	}
#endif

	//cpu_to_le32_array(mctx->hash, sizeof(mctx->hash) / sizeof(u32));
	//memcpy(out, mctx->hash, sizeof(mctx->hash));


	local_irq_restore(flag);

#ifdef ONLY_IN_MEM
	kfree(hashs);	/* BUGFIX: the in-memory scratch buffer was leaked */
#endif

	// Wipe context
    memset(mctx, 0, sizeof(*mctx));
}
Beispiel #24
0
/*
 * Host driver for a GPU (OpenCL) salted-MD5 cracker.
 *
 * Usage: prog [skip [left]] — skip/left enable resuming and splitting
 * work across machines.  Reads candidate passwords from stdin, appends
 * a fixed salt, MD5-hashes them on the CPU, batches the digests into
 * per-GPU buffers, and runs the comparison kernel once all GPUs are
 * full (plus one final pass for leftovers).
 *
 * NOTE(review): malloc/calloc results are used unchecked, the
 * `!fread` test misfires on zero-length files, and the function
 * always returns -1 (apparently deliberate for pipeline use —
 * confirm).
 */
int main (int argc, char *argv[])
{
  uint64_t skip =  0;
  uint64_t left = -1;   /* -1 wraps to UINT64_MAX: "no limit" */

  if (argc >= 2) skip = atoll (argv[1]);
  if (argc >= 3) left = atoll (argv[2]);

  printf ("Loading Kernel...\n");

  const char *filename = KERNEL_SRC;

  struct stat s;

  if (stat (filename, &s) == -1)
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  FILE *fp = fopen (filename, "rb");

  if (fp == NULL)
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  /* NOTE(review): malloc result unchecked before use. */
  char *source_buf = (char *) malloc (s.st_size + 1);

  /* NOTE(review): !fread also triggers for an empty kernel file. */
  if (!fread (source_buf, sizeof (char), s.st_size, fp))
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  source_buf[s.st_size] = 0;

  fclose (fp);

  const char *sourceBuf[] = { source_buf };

  const size_t sourceLen[] = { s.st_size + 1 };

  printf ("Initializing OpenCL...\n");

  cl_platform_id platform;

  cl_uint num_devices = 0;

  cl_device_id devices[MAX_PLATFORM];

  gc_clGetPlatformIDs (1, &platform, NULL);

  gc_clGetDeviceIDs (platform, DEV_TYPE, MAX_PLATFORM, devices, &num_devices);

  gpu_ctx_t gpu_ctxs[MAX_GPU];

  memset (gpu_ctxs, 0, sizeof (gpu_ctxs));

  /* Per-device setup: build the kernel, size the buffers by compute
   * units, and record everything in a gpu_ctx_t. */
  for (cl_uint device_id = 0; device_id < num_devices; device_id++)
  {
    cl_device_id device = devices[device_id];

    cl_context context = gc_clCreateContext (NULL, 1, &device, NULL, NULL);

    cl_program program = gc_clCreateProgramWithSource (context, 1, sourceBuf, sourceLen);

    gc_clBuildProgram (program, 1, &device, BUILD_OPTS, NULL, NULL);

    cl_kernel kernel = gc_clCreateKernel (program, KERNEL_NAME);

    cl_command_queue command_queue = gc_clCreateCommandQueue (context, device, 0);

    cl_uint max_compute_units;

    gc_clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (max_compute_units), &max_compute_units, NULL);

    char device_name[BUFSIZ];

    memset (device_name, 0, sizeof (device_name));

    gc_clGetDeviceInfo (device, CL_DEVICE_NAME, sizeof (device_name), &device_name, NULL);

    printf ("Found new device #%2d: %s, %u compute units\n", device_id, device_name, max_compute_units);

    const int num_threads  = GPU_THREADS;
    const int num_elements = max_compute_units * num_threads * GPU_ACCEL;

    /**
     * GPU memory
     */

    const size_t size_block   = num_elements * sizeof (block_t);
    const size_t size_results = num_threads  * sizeof (uint32_t);

    cl_mem d_block = gc_clCreateBuffer (context, CL_MEM_READ_ONLY, size_block, NULL);

    cl_mem d_results = gc_clCreateBuffer (context, CL_MEM_WRITE_ONLY, size_results, NULL);

    gc_clSetKernelArg (kernel, 0, sizeof (cl_mem), (void *) &d_block);
    gc_clSetKernelArg (kernel, 1, sizeof (cl_mem), (void *) &d_results);

    /**
     * Host memory
     */

    block_t *h_block = (block_t *) malloc (size_block);

    uint32_t *h_results = (uint32_t *) malloc (size_results);

    /* Results are "found" markers; 0xff.. means not cracked. */
    memset (h_results, 0xff, size_results);

    gc_clEnqueueWriteBuffer (command_queue, d_results, CL_TRUE, 0, size_results, h_results, 0, NULL, NULL);

    /**
     * Buffers for candidates
     */

    uint8_t **plains_buf = (uint8_t **) calloc (num_elements * VECT_SIZE, sizeof (uint8_t *));

    for (int i = 0; i < num_elements * VECT_SIZE; i++)
    {
      /* Agreed, this is not nice. But who cares nowadays? */

      plains_buf[i] = (uint8_t *) malloc (MAX_LINELEN);
    }

    size_t *plains_len = (size_t *) calloc (num_elements * VECT_SIZE, sizeof (size_t));

    gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id];

    gpu_ctx->context           = context;
    gpu_ctx->program           = program;
    gpu_ctx->kernel            = kernel;
    gpu_ctx->command_queue     = command_queue;
    gpu_ctx->max_compute_units = max_compute_units;
    gpu_ctx->d_block           = d_block;
    gpu_ctx->d_results         = d_results;
    gpu_ctx->h_block           = h_block;
    gpu_ctx->h_results         = h_results;
    gpu_ctx->num_threads       = num_threads;
    gpu_ctx->num_elements      = num_elements;
    gpu_ctx->plains_buf        = plains_buf;
    gpu_ctx->plains_len        = plains_len;
  }

  /* static salt */

  const uint8_t salt_buf[16] =
  {
    0x97, 0x48, 0x6C, 0xAA,
    0x22, 0x5F, 0xE8, 0x77,
    0xC0, 0x35, 0xCC, 0x03,
    0x73, 0x23, 0x6D, 0x51
  };

  const size_t salt_len = sizeof (salt_buf);

  /* main loop */

  printf ("Initialization done, accepting candidates from stdin...\n\n");

  cl_uint cur_device_id = 0;

  while (!feof (stdin))
  {
    /* Get new password candidate from stdin */

    uint8_t line_buf[MAX_LINELEN];

    int cur_c = 0;

    int prev_c = 0;

    size_t line_len = 0;

    for (size_t i = 0; i < MAX_LINELEN - 100; i++) // - 100 = we need some space for salt and padding
    {
      cur_c = getchar ();

      if (cur_c == EOF) break;

      if ((prev_c == '\n') && (cur_c == '\0'))
      {
        line_len--;

        break;
      }

      line_buf[line_len] = cur_c;

      line_len++;

      prev_c = cur_c;
    }

    /* chop \r if it exists for some reason (in case user used a dictionary) */

    if (line_len >= 2)
    {
      if ((prev_c == '\r') && (cur_c == '\0')) line_len -= 2;
    }

    /* skip empty lines */

    if (line_len == 0) continue;

    /* The following enables distributed computing / resume work */

    if (skip)
    {
      skip--;

      continue;
    }

    if (left)
    {
      left--;
    }
    else
    {
      break;
    }

    /* Append constant salt */

    memcpy (line_buf + line_len, salt_buf, salt_len);

    line_len += salt_len;

    /* Generate digest out of it */

    uint32_t digest[4];

    md5_transform ((uint32_t *) line_buf, (uint32_t) line_len, digest);

    /* Next garanteed free GPU */

    gpu_ctx_t *gpu_ctx = &gpu_ctxs[cur_device_id];

    /* Save original buffer in case it cracks it */

    memcpy (gpu_ctx->plains_buf[gpu_ctx->num_cached], line_buf, line_len - salt_len);

    gpu_ctx->plains_len[gpu_ctx->num_cached] = line_len - salt_len;

    /* Next garanteed free memory element on that GPU */

    const uint32_t element_div = gpu_ctx->num_cached / 4;
    const uint32_t element_mod = gpu_ctx->num_cached % 4;

    /* Copy new digest */

    gpu_ctx->h_block[element_div].A[element_mod] = digest[0];
    gpu_ctx->h_block[element_div].B[element_mod] = digest[1];
    gpu_ctx->h_block[element_div].C[element_mod] = digest[2];
    gpu_ctx->h_block[element_div].D[element_mod] = digest[3];

    gpu_ctx->num_cached++;

    /* If memory elements on that GPU are full, switch to the next GPU */

    if ((gpu_ctx->num_cached / VECT_SIZE) < gpu_ctx->num_elements) continue;

    cur_device_id++;

    /* If there is no more GPU left, run the calculation */

    if (cur_device_id < num_devices) continue;

    /* Fire! */

    calc_work (num_devices, gpu_ctxs);

    launch_kernel (num_devices, gpu_ctxs);

    /* Collecting data has a blocking effect */

    check_results (num_devices, gpu_ctxs);

    /* Reset buffer state */

    for (cl_uint device_id = 0; device_id < num_devices; device_id++)
    {
      gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id];

      gpu_ctx->num_cached = 0;
    }

    cur_device_id = 0;
  }

  /* Final calculation of leftovers */

  calc_work (num_devices, gpu_ctxs);

  launch_kernel (num_devices, gpu_ctxs);

  check_results (num_devices, gpu_ctxs);

  /* NOTE(review): -1 even on success — confirm callers expect this. */
  return -1;
}