/*
 * Drives one writer pass over `reader` using a context that lives only for
 * the duration of this call.
 *
 * The context is initialized, handed to calc_work() together with the reader,
 * then to allocate_work() and read(), and is finalized before returning.
 * Note that read() receives the address of the local `reader` pointer, not
 * the pointer itself.
 */
void writer_run(const reader_t* reader)
{
    context_t ctx;

    context_initialize(&ctx);

    calc_work(&ctx, reader);
    allocate_work(&ctx);
    read(&ctx, &reader);

    context_finalize(&ctx);
}
int main (int argc, char *argv[]) { uint64_t skip = 0; uint64_t left = -1; if (argc >= 2) skip = atoll (argv[1]); if (argc >= 3) left = atoll (argv[2]); printf ("Loading Kernel...\n"); const char *filename = KERNEL_SRC; struct stat s; if (stat (filename, &s) == -1) { fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__); return (-1); } FILE *fp = fopen (filename, "rb"); if (fp == NULL) { fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__); return (-1); } char *source_buf = (char *) malloc (s.st_size + 1); if (!fread (source_buf, sizeof (char), s.st_size, fp)) { fprintf (stderr, "%s: %s in line %d\n", filename, strerror (errno), __LINE__); return (-1); } source_buf[s.st_size] = 0; fclose (fp); const char *sourceBuf[] = { source_buf }; const size_t sourceLen[] = { s.st_size + 1 }; printf ("Initializing OpenCL...\n"); cl_platform_id platform; cl_uint num_devices = 0; cl_device_id devices[MAX_PLATFORM]; gc_clGetPlatformIDs (1, &platform, NULL); gc_clGetDeviceIDs (platform, DEV_TYPE, MAX_PLATFORM, devices, &num_devices); gpu_ctx_t gpu_ctxs[MAX_GPU]; memset (gpu_ctxs, 0, sizeof (gpu_ctxs)); for (cl_uint device_id = 0; device_id < num_devices; device_id++) { cl_device_id device = devices[device_id]; cl_context context = gc_clCreateContext (NULL, 1, &device, NULL, NULL); cl_program program = gc_clCreateProgramWithSource (context, 1, sourceBuf, sourceLen); gc_clBuildProgram (program, 1, &device, BUILD_OPTS, NULL, NULL); cl_kernel kernel = gc_clCreateKernel (program, KERNEL_NAME); cl_command_queue command_queue = gc_clCreateCommandQueue (context, device, 0); cl_uint max_compute_units; gc_clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof (max_compute_units), &max_compute_units, NULL); char device_name[BUFSIZ]; memset (device_name, 0, sizeof (device_name)); gc_clGetDeviceInfo (device, CL_DEVICE_NAME, sizeof (device_name), &device_name, NULL); printf ("Found new device #%2d: %s, %u compute units\n", device_id, 
device_name, max_compute_units); const int num_threads = GPU_THREADS; const int num_elements = max_compute_units * num_threads * GPU_ACCEL; /** * GPU memory */ const size_t size_block = num_elements * sizeof (block_t); const size_t size_results = num_threads * sizeof (uint32_t); cl_mem d_block = gc_clCreateBuffer (context, CL_MEM_READ_ONLY, size_block, NULL); cl_mem d_results = gc_clCreateBuffer (context, CL_MEM_WRITE_ONLY, size_results, NULL); gc_clSetKernelArg (kernel, 0, sizeof (cl_mem), (void *) &d_block); gc_clSetKernelArg (kernel, 1, sizeof (cl_mem), (void *) &d_results); /** * Host memory */ block_t *h_block = (block_t *) malloc (size_block); uint32_t *h_results = (uint32_t *) malloc (size_results); memset (h_results, 0xff, size_results); gc_clEnqueueWriteBuffer (command_queue, d_results, CL_TRUE, 0, size_results, h_results, 0, NULL, NULL); /** * Buffers for candidates */ uint8_t **plains_buf = (uint8_t **) calloc (num_elements * VECT_SIZE, sizeof (uint8_t *)); for (int i = 0; i < num_elements * VECT_SIZE; i++) { /* Agreed, this is not nice. But who cares nowadays? 
*/ plains_buf[i] = (uint8_t *) malloc (MAX_LINELEN); } size_t *plains_len = (size_t *) calloc (num_elements * VECT_SIZE, sizeof (size_t)); gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id]; gpu_ctx->context = context; gpu_ctx->program = program; gpu_ctx->kernel = kernel; gpu_ctx->command_queue = command_queue; gpu_ctx->max_compute_units = max_compute_units; gpu_ctx->d_block = d_block; gpu_ctx->d_results = d_results; gpu_ctx->h_block = h_block; gpu_ctx->h_results = h_results; gpu_ctx->num_threads = num_threads; gpu_ctx->num_elements = num_elements; gpu_ctx->plains_buf = plains_buf; gpu_ctx->plains_len = plains_len; } /* static salt */ const uint8_t salt_buf[16] = { 0x97, 0x48, 0x6C, 0xAA, 0x22, 0x5F, 0xE8, 0x77, 0xC0, 0x35, 0xCC, 0x03, 0x73, 0x23, 0x6D, 0x51 }; const size_t salt_len = sizeof (salt_buf); /* main loop */ printf ("Initialization done, accepting candidates from stdin...\n\n"); cl_uint cur_device_id = 0; while (!feof (stdin)) { /* Get new password candidate from stdin */ uint8_t line_buf[MAX_LINELEN]; int cur_c = 0; int prev_c = 0; size_t line_len = 0; for (size_t i = 0; i < MAX_LINELEN - 100; i++) // - 100 = we need some space for salt and padding { cur_c = getchar (); if (cur_c == EOF) break; if ((prev_c == '\n') && (cur_c == '\0')) { line_len--; break; } line_buf[line_len] = cur_c; line_len++; prev_c = cur_c; } /* chop \r if it exists for some reason (in case user used a dictionary) */ if (line_len >= 2) { if ((prev_c == '\r') && (cur_c == '\0')) line_len -= 2; } /* skip empty lines */ if (line_len == 0) continue; /* The following enables distributed computing / resume work */ if (skip) { skip--; continue; } if (left) { left--; } else { break; } /* Append constant salt */ memcpy (line_buf + line_len, salt_buf, salt_len); line_len += salt_len; /* Generate digest out of it */ uint32_t digest[4]; md5_transform ((uint32_t *) line_buf, (uint32_t) line_len, digest); /* Next garanteed free GPU */ gpu_ctx_t *gpu_ctx = &gpu_ctxs[cur_device_id]; /* Save original buffer 
in case it cracks it */ memcpy (gpu_ctx->plains_buf[gpu_ctx->num_cached], line_buf, line_len - salt_len); gpu_ctx->plains_len[gpu_ctx->num_cached] = line_len - salt_len; /* Next garanteed free memory element on that GPU */ const uint32_t element_div = gpu_ctx->num_cached / 4; const uint32_t element_mod = gpu_ctx->num_cached % 4; /* Copy new digest */ gpu_ctx->h_block[element_div].A[element_mod] = digest[0]; gpu_ctx->h_block[element_div].B[element_mod] = digest[1]; gpu_ctx->h_block[element_div].C[element_mod] = digest[2]; gpu_ctx->h_block[element_div].D[element_mod] = digest[3]; gpu_ctx->num_cached++; /* If memory elements on that GPU are full, switch to the next GPU */ if ((gpu_ctx->num_cached / VECT_SIZE) < gpu_ctx->num_elements) continue; cur_device_id++; /* If there is no more GPU left, run the calculation */ if (cur_device_id < num_devices) continue; /* Fire! */ calc_work (num_devices, gpu_ctxs); launch_kernel (num_devices, gpu_ctxs); /* Collecting data has a blocking effect */ check_results (num_devices, gpu_ctxs); /* Reset buffer state */ for (cl_uint device_id = 0; device_id < num_devices; device_id++) { gpu_ctx_t *gpu_ctx = &gpu_ctxs[device_id]; gpu_ctx->num_cached = 0; } cur_device_id = 0; } /* Final calculation of leftovers */ calc_work (num_devices, gpu_ctxs); launch_kernel (num_devices, gpu_ctxs); check_results (num_devices, gpu_ctxs); return -1; }