Example #1
0
/*
 * Runs one writer pass over the data supplied by `reader`:
 * sets up a local context, sizes the work, allocates buffers for it,
 * pulls the data in, and finally tears the context down again.
 *
 * NOTE(review): calc_work() is passed `reader` but read() is passed
 * `&reader` (i.e. a `const reader_t **`). If read() expects the same
 * pointer type as calc_work(), the extra address-of is a bug --
 * confirm against the prototypes. Also note that `read` shadows the
 * POSIX read(2) symbol; presumably this is a project-local function.
 */
void writer_run(const reader_t* reader)
{
    context_t context;

    context_initialize(&context);

    calc_work(&context, reader);
    allocate_work(&context);
    read(&context, &reader);   /* NOTE(review): `&reader` here vs `reader` above -- verify */

    context_finalize(&context);
}
Example #2
0
/*
 * Host-side driver for an OpenCL salted-MD5 candidate checker.
 *
 * Usage: prog [skip [left]]
 *   skip - number of leading stdin candidates to ignore (distributed /
 *          resume support)
 *   left - number of candidates to process after skipping; defaults to
 *          "unlimited" via the uint64_t wrap-around of -1
 *
 * Flow: load the kernel source from KERNEL_SRC, set up one context /
 * program / kernel / queue / buffer set per OpenCL device, then read
 * candidates from stdin, pack them into per-device host buffers, and
 * fire calc_work / launch_kernel / check_results whenever every
 * device's buffer is full (plus once more at EOF for the leftovers).
 *
 * Relies on project-local gc_cl* wrappers (presumably error-checking
 * shims over the OpenCL C API -- confirm) and on md5_transform(),
 * calc_work(), launch_kernel(), check_results() defined elsewhere.
 */
int main (int argc, char *argv[])
{
  uint64_t skip =  0;
  uint64_t left = -1;   /* -1 wraps to UINT64_MAX: effectively "no limit" */

  /* NOTE(review): atoll() reports no errors and a negative argument
     wraps to a huge uint64_t -- strtoull() with endptr/errno checking
     would be more robust. */
  if (argc >= 2) skip = atoll (argv[1]);
  if (argc >= 3) left = atoll (argv[2]);

  printf ("Loading Kernel...\n");

  const char *filename = KERNEL_SRC;

  struct stat s;

  /* stat first so we know how big a source buffer to allocate */
  if (stat (filename, &s) == -1)
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  FILE *fp = fopen (filename, "rb");

  if (fp == NULL)
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  /* +1 for the NUL terminator appended below.
     NOTE(review): malloc() result is not checked before use. */
  char *source_buf = (char *) malloc (s.st_size + 1);

  /* NOTE(review): `!fread` only catches a zero-item read; a short read
     (fewer than st_size bytes) is silently accepted. fp is also left
     open on this error path -- harmless at process exit, but untidy. */
  if (!fread (source_buf, sizeof (char), s.st_size, fp))
  {
    fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__);

    return (-1);
  }

  source_buf[s.st_size] = 0;   /* NUL-terminate: OpenCL can take the source as a C string */

  fclose (fp);

  /* clCreateProgramWithSource-style arrays: one source string + its length */
  const char *sourceBuf[] = { source_buf };

  const size_t sourceLen[] = { s.st_size + 1 };

  printf ("Initializing OpenCL...\n");

  cl_platform_id platform;

  cl_uint num_devices = 0;

  cl_device_id devices[MAX_PLATFORM];

  /* only the first platform is used */
  gc_clGetPlatformIDs (1, &platform, NULL);

  gc_clGetDeviceIDs (platform, DEV_TYPE, MAX_PLATFORM, devices, &num_devices);

  /* NOTE(review): gpu_ctxs is sized MAX_GPU but indexed by device_id up
     to num_devices, which is bounded by MAX_PLATFORM -- confirm that
     MAX_GPU >= MAX_PLATFORM or this can overflow the array. */
  gpu_ctx_t gpu_ctxs[MAX_GPU];

  memset (gpu_ctxs, 0, sizeof (gpu_ctxs));

  /* Per-device setup: context, program, kernel, queue, device + host buffers */
  for (cl_uint device_id = 0; device_id < num_devices; device_id++)
  {
    cl_device_id device = devices[device_id];

    cl_context context = gc_clCreateContext (NULL, 1, &device, NULL, NULL);

    cl_program program = gc_clCreateProgramWithSource (context, 1, sourceBuf, sourceLen);

    gc_clBuildProgram (program, 1, &device, BUILD_OPTS, NULL, NULL);

    cl_kernel kernel = gc_clCreateKernel (program, KERNEL_NAME);

    cl_command_queue command_queue = gc_clCreateCommandQueue (context, device, 0);

    cl_uint max_compute_units;

    gc_clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (max_compute_units), &max_compute_units, NULL);

    char device_name[BUFSIZ];

    memset (device_name, 0, sizeof (device_name));

    /* NOTE(review): `&device_name` is a char(*)[BUFSIZ]; same address as
       `device_name`, so it works, but the plain array name is cleaner. */
    gc_clGetDeviceInfo (device, CL_DEVICE_NAME, sizeof (device_name), &device_name, NULL);

    printf ("Found new device #%2d: %s, %u compute units\n", device_id, device_name, max_compute_units);

    /* batch size scales with the device's compute units */
    const int num_threads  = GPU_THREADS;
    const int num_elements = max_compute_units * num_threads * GPU_ACCEL;

    /**
     * GPU memory
     */

    const size_t size_block   = num_elements * sizeof (block_t);
    /* NOTE(review): size_results is sized by num_threads while size_block
       is sized by num_elements -- looks asymmetric; confirm the kernel
       really writes only num_threads result words. */
    const size_t size_results = num_threads  * sizeof (uint32_t);

    cl_mem d_block = gc_clCreateBuffer (context, CL_MEM_READ_ONLY, size_block, NULL);

    cl_mem d_results = gc_clCreateBuffer (context, CL_MEM_WRITE_ONLY, size_results, NULL);

    gc_clSetKernelArg (kernel, 0, sizeof (cl_mem), (void *) &d_block);
    gc_clSetKernelArg (kernel, 1, sizeof (cl_mem), (void *) &d_results);

    /**
     * Host memory
     */

    /* NOTE(review): these malloc()/calloc() results are never checked */
    block_t *h_block = (block_t *) malloc (size_block);

    uint32_t *h_results = (uint32_t *) malloc (size_results);

    /* 0xff = "no hit" sentinel, presumably; the kernel overwrites on a crack */
    memset (h_results, 0xff, size_results);

    gc_clEnqueueWriteBuffer (command_queue, d_results, CL_TRUE, 0, size_results, h_results, 0, NULL, NULL);

    /**
     * Buffers for candidates
     */

    /* one MAX_LINELEN plaintext slot per packed lane (num_elements * VECT_SIZE) */
    uint8_t **plains_buf = (uint8_t **) calloc (num_elements * VECT_SIZE, sizeof (uint8_t *));

    for (int i = 0; i < num_elements * VECT_SIZE; i++)
    {
      /* Agreed, this is not nice. But who cares nowadays? */

      plains_buf[i] = (uint8_t *) malloc (MAX_LINELEN);
    }

    size_t *plains_len = (size_t *) calloc (num_elements * VECT_SIZE, sizeof (size_t));

    gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id];

    gpu_ctx->context           = context;
    gpu_ctx->program           = program;
    gpu_ctx->kernel            = kernel;
    gpu_ctx->command_queue     = command_queue;
    gpu_ctx->max_compute_units = max_compute_units;
    gpu_ctx->d_block           = d_block;
    gpu_ctx->d_results         = d_results;
    gpu_ctx->h_block           = h_block;
    gpu_ctx->h_results         = h_results;
    gpu_ctx->num_threads       = num_threads;
    gpu_ctx->num_elements      = num_elements;
    gpu_ctx->plains_buf        = plains_buf;
    gpu_ctx->plains_len        = plains_len;
  }

  /* static salt */

  const uint8_t salt_buf[16] =
  {
    0x97, 0x48, 0x6C, 0xAA,
    0x22, 0x5F, 0xE8, 0x77,
    0xC0, 0x35, 0xCC, 0x03,
    0x73, 0x23, 0x6D, 0x51
  };

  const size_t salt_len = sizeof (salt_buf);

  /* main loop */

  printf ("Initialization done, accepting candidates from stdin...\n\n");

  cl_uint cur_device_id = 0;   /* round-robin target for incoming candidates */

  while (!feof (stdin))
  {
    /* Get new password candidate from stdin */

    uint8_t line_buf[MAX_LINELEN];

    int cur_c = 0;

    int prev_c = 0;

    size_t line_len = 0;

    for (size_t i = 0; i < MAX_LINELEN - 100; i++) // - 100 = we need some space for salt and padding
    {
      cur_c = getchar ();

      if (cur_c == EOF) break;

      /* NOTE(review): the terminator looks like the two-byte sequence
         "\n\0" (newline followed by a NUL), with the '\n' already stored
         and then removed via line_len--. Presumably the feeder emits
         NUL-separated lines -- confirm against the producing tool. */
      if ((prev_c == '\n') && (cur_c == '\0'))
      {
        line_len--;

        break;
      }

      line_buf[line_len] = cur_c;

      line_len++;

      prev_c = cur_c;
    }

    /* chop \r if it exists for some reason (in case user used a dictionary) */

    if (line_len >= 2)
    {
      if ((prev_c == '\r') && (cur_c == '\0')) line_len -= 2;
    }

    /* skip empty lines */

    if (line_len == 0) continue;

    /* The following enables distributed computing / resume work */

    if (skip)
    {
      skip--;

      continue;
    }

    if (left)
    {
      left--;
    }
    else
    {
      break;
    }

    /* Append constant salt */

    memcpy (line_buf + line_len, salt_buf, salt_len);

    line_len += salt_len;

    /* Generate digest out of it */

    uint32_t digest[4];

    /* NOTE(review): casting uint8_t[] to uint32_t* assumes suitable
       alignment and little-endian word order inside md5_transform --
       line_buf (a local array) is typically aligned, but this is a
       strict-aliasing gray area. */
    md5_transform ((uint32_t *) line_buf, (uint32_t) line_len, digest);

    /* Next garanteed free GPU */

    gpu_ctx_t *gpu_ctx = &gpu_ctxs[cur_device_id];

    /* Save original buffer in case it cracks it */

    /* plaintext only (salt stripped back off) for later reporting */
    memcpy (gpu_ctx->plains_buf[gpu_ctx->num_cached], line_buf, line_len - salt_len);

    gpu_ctx->plains_len[gpu_ctx->num_cached] = line_len - salt_len;

    /* Next garanteed free memory element on that GPU */

    /* NOTE(review): literal 4 here, but VECT_SIZE elsewhere -- these
       must be the same value; use VECT_SIZE to avoid silent breakage. */
    const uint32_t element_div = gpu_ctx->num_cached / 4;
    const uint32_t element_mod = gpu_ctx->num_cached % 4;

    /* Copy new digest */

    /* block_t apparently stores VECT_SIZE lanes per word (SoA layout) */
    gpu_ctx->h_block[element_div].A[element_mod] = digest[0];
    gpu_ctx->h_block[element_div].B[element_mod] = digest[1];
    gpu_ctx->h_block[element_div].C[element_mod] = digest[2];
    gpu_ctx->h_block[element_div].D[element_mod] = digest[3];

    gpu_ctx->num_cached++;

    /* If memory elements on that GPU are full, switch to the next GPU */

    if ((gpu_ctx->num_cached / VECT_SIZE) < gpu_ctx->num_elements) continue;

    cur_device_id++;

    /* If there is no more GPU left, run the calculation */

    if (cur_device_id < num_devices) continue;

    /* Fire! */

    calc_work (num_devices, gpu_ctxs);

    launch_kernel (num_devices, gpu_ctxs);

    /* Collecting data has a blocking effect */

    check_results (num_devices, gpu_ctxs);

    /* Reset buffer state */

    for (cl_uint device_id = 0; device_id < num_devices; device_id++)
    {
      gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id];

      gpu_ctx->num_cached = 0;
    }

    cur_device_id = 0;
  }

  /* Final calculation of leftovers */

  calc_work (num_devices, gpu_ctxs);

  launch_kernel (num_devices, gpu_ctxs);

  check_results (num_devices, gpu_ctxs);

  /* NOTE(review): always exits with -1 and never releases the OpenCL
     objects or heap buffers (OS reclaims at exit). If a success exit
     code matters to callers, return 0 here instead. */
  return -1;
}