/* main.c */
/**
* @author Mikuláš Dítě
* @license Original BSD, see license.txt
*/
/**
* If the application fails at run time, the kernel file (kernel.cl) was most likely not found
* relative to the working directory. To fix this in Xcode:
* 1) select the file under the "Executables" list item and open Get Info (Cmd-I), then
* 2) set the working directory to the Project Directory (not the build directory).
*/
#include <stdio.h>
#include <assert.h>
#include <sys/sysctl.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <OpenCL/OpenCL.h>
#pragma mark Utilities
/**
 * Read a whole text file into a freshly allocated, NUL-terminated buffer.
 *
 * @param filename path of the file to read
 * @return heap buffer holding the file contents (the caller must free() it),
 *         or NULL if the file cannot be opened, sized, allocated or read
 */
char * load_program_source(const char *filename)
{
    struct stat statbuf;
    FILE *fh;
    char *source = NULL;

    if (filename == NULL)
        return NULL;

    fh = fopen(filename, "r");
    if (fh == NULL)
        return NULL;

    if (stat(filename, &statbuf) == 0 && statbuf.st_size >= 0) {
        size_t size = (size_t) statbuf.st_size;

        source = malloc(size + 1);
        if (source != NULL) {
            // Terminate at the number of bytes actually read, so a short read
            // never leaves uninitialised bytes in front of the terminator.
            size_t got = fread(source, 1, size, fh);
            if (ferror(fh)) {
                free(source);
                source = NULL;
            } else {
                source[got] = '\0';
            }
        }
    }

    // The stream is only needed while reading; close it on every path.
    fclose(fh);
    return source;
}
/** globals: OpenCL state shared by initGPU(), computeFitness() and terminateGPU() */

// Program object created from the kernel source file
cl_program program[1];
// Kernel objects; only kernel[0] is created in this file
cl_kernel kernel[2];

// Command queue and context bound to the selected device
cl_command_queue cmd_queue;
cl_context context;

// CPU device (fallback) and the device actually used
cl_device_id cpu = NULL;
cl_device_id device = NULL;

// Last OpenCL status code
cl_int err = 0;
// Size reported back by clGetDeviceInfo
size_t returned_size = 0;
// Size in bytes of each device buffer / host transfer
size_t buffer_size;

// Device buffers: four kernel inputs and one output
cl_mem mem_c_position;
cl_mem mem_c_velocity;
cl_mem mem_p_angle;
cl_mem mem_p_velocity;
cl_mem mem_fitness;

// Non-zero once computeFitness() has called initGPU()
int initiated = 0;
#pragma mark -
#pragma mark OpenCL context
int initGPU(int n)
{
#pragma mark Device Information
// Find the CPU CL device, as a fallback
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
assert(err == CL_SUCCESS);
// Find the GPU CL device, this is what we really want
// If there is no GPU device is CL capable, fall back to CPU
err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) device = cpu;
assert(device);
// Get some information about the returned device
cl_char vendor_name[1024] = {0};
cl_char device_name[1024] = {0};
err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
assert(err == CL_SUCCESS);
printf("Connecting to %s %s...", vendor_name, device_name);
#pragma mark Context and Command Queue
// Now create a context to perform our calculation with the
// specified device
context = clCreateContext(0, 1, &device, NULL, NULL, &err);
assert(err == CL_SUCCESS);
// And also a command queue for the context
cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
#pragma mark Program and Kernel Creation
// Load the program source from disk
// The kernel/program is the project directory and in Xcode the executable
// is set to launch from that directory hence we use a relative path
const char * filename = "kernel.cl";
char *program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err);
assert(err == CL_SUCCESS);
err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
assert(err == CL_SUCCESS);
// Now create the kernel "objects" that we want to use in the example file
kernel[0] = clCreateKernel(program[0], "add", &err);
assert(err == CL_SUCCESS);
#pragma mark Memory Allocation
// Allocate memory on the device to hold our data and store the results into
buffer_size = sizeof(int) * n;
mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
assert(err == CL_SUCCESS);
mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err);
assert(err == CL_SUCCESS);
// Get all of the stuff written and allocated
clFinish(cmd_queue);
printf(" done\n");
return err; // CL_SUCCESS
}
void terminateGPU()
{
#pragma mark Teardown
clReleaseMemObject(mem_c_position);
clReleaseMemObject(mem_c_velocity);
clReleaseMemObject(mem_p_angle);
clReleaseMemObject(mem_p_velocity);
clReleaseMemObject(mem_fitness);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
}
#pragma mark -
#pragma mark Generation context
int computeFitness(int * c_position, int * c_velocity, int * p_angle, int * p_velocity, int * fitness, int n)
{
if (!initiated) {
initGPU(n);
initiated = 1;
}
#pragma mark Writing memory
// Allocate memory on the device to hold our data and store the results into
buffer_size = sizeof(int) * n;
err = clEnqueueWriteBuffer(cmd_queue, mem_c_position, CL_TRUE, 0, buffer_size, (void *) c_position, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, mem_c_velocity, CL_TRUE, 0, buffer_size, (void *) c_velocity, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, mem_p_angle, CL_TRUE, 0, buffer_size, (void *) p_angle, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, mem_p_velocity, CL_TRUE, 0, buffer_size, (void *) p_velocity, 0, NULL, NULL);
assert(err == CL_SUCCESS);
// Get all of the stuff written and allocated
clFinish(cmd_queue);
#pragma mark Kernel Arguments
// Now setup the arguments to our kernel
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), (void *) &mem_c_position);
err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), (void *) &mem_c_velocity);
err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), (void *) &mem_p_angle);
err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), (void *) &mem_p_velocity);
err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), (void *) &mem_fitness);
assert(err == CL_SUCCESS);
#pragma mark Execution and Reading memory
// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
size_t global_work_size = n;
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
assert(err == CL_SUCCESS);
clFinish(cmd_queue);
// Once finished read back the results from the answer
// array into the results array
err = clEnqueueReadBuffer(cmd_queue, mem_fitness, CL_TRUE, 0, buffer_size, fitness, 0, NULL, NULL);
assert(err == CL_SUCCESS);
clFinish(cmd_queue);
return CL_SUCCESS;
}
#pragma mark -
int main (int argc, const char * argv[]) {
#pragma mark Configuration
const int generation_size = 40;
const int generation_count = 10000;
const float mutation = 0.1;
const int time_total = 60000; // should be the same as in kernel.cl
srand(time(NULL));
#pragma mark Allocate standard memory
int * c_position = (int *) malloc(generation_size * sizeof(int));
int * c_velocity = (int *) malloc(generation_size * sizeof(int));
int * p_angle = (int *) malloc(generation_size * sizeof(int));
int * p_velocity = (int *) malloc(generation_size * sizeof(int));
int * fitness = (int *) malloc(generation_size * sizeof(int));
int fitness_sum = 0;
int best_key = 0;
int * next_c_position = (int *) malloc(generation_size * sizeof(int));
int * next_c_velocity = (int *) malloc(generation_size * sizeof(int));
int * next_p_angle = (int *) malloc(generation_size * sizeof(int));
int * next_p_velocity = (int *) malloc(generation_size * sizeof(int));
#pragma mark Generate first generation
for (int i = 0; i < generation_size; i++) {
int sign = rand() % 2 == 1 ? 1 : -1;
next_c_position[i] = sign * rand() % 1000;
next_c_velocity[i] = sign * rand() % 1000;
next_p_angle[i] = sign * rand() % 1000;
next_p_velocity[i] = sign * rand() % 1000;
// fitness[i] = 0;
}
#pragma mark Genetical algorithm
int n;
int last_sum = 0;
for (n = 0; n < generation_count; n++) {
c_position = next_c_position;
c_velocity = next_c_velocity;
p_angle = next_p_angle;
p_velocity = next_p_velocity;
fitness_sum = 0;
best_key = 0;
computeFitness(c_position, c_velocity, p_angle, p_velocity, fitness, generation_size);
// prevent computing generation in the last cycle
if (n == generation_count - 1) break;
int fitness_max = 0;
// TODO: allocate it only once
int * border = (int *) malloc(generation_size * sizeof(int));
for (int i = 0; i < generation_size; i++) {
fitness_sum += fitness[i];
if (fitness[i] > fitness_max) {
fitness_max = fitness[i];
best_key = i;
}
if (i == 0) {
border[i] = fitness[i];
} else {
border[i] = border[i - 1] + fitness[i];
}
}
// break if best solution is already found
if (fitness_max >= time_total) break;
//printf("gen[%d] best_fitness = \t%d\t[%d]\t%s\n", n, fitness_max, fitness_sum, last_sum < fitness_sum ? "up" : "FALLS");
last_sum = fitness_sum;
// Elite - always copy the best one
next_c_position[0] = c_position[best_key];
next_c_velocity[0] = c_velocity[best_key];
next_p_angle[0] = p_angle[best_key];
next_p_velocity[0] = p_velocity[best_key];
for (int k = 1; k < generation_size; k++) {
int key_parent_1 = 0;
int key_parent_2 = 0;
// Get weighted entity (roulette wheel implementation)
int roll = rand() % fitness_sum;
int i;
for (i = 0; i < generation_size; i++) {
if (roll < border[i]) {
break;
}
}
key_parent_1 = i;
roll = rand() % fitness_sum;
for (i = 0; i < generation_size; i++) {
if (roll < border[i]) {
break;
}
}
key_parent_2 = i;
printf("%d\t", c_position[k]);
// Prepare next generation as combination of two parens, with mutation
next_c_position[k] = c_position[key_parent_1] + mutation * (rand() % 2 == 1 ? 1 : -1) * (rand() % (c_position[key_parent_1] == 0 ? 1 : c_position[key_parent_1]));
next_c_velocity[k] = c_velocity[key_parent_1] + mutation * (rand() % 2 == 1 ? 1 : -1) * (rand() % (c_velocity[key_parent_1] == 0 ? 1 : c_velocity[key_parent_1]));
next_p_angle[k] = p_angle[key_parent_2] + mutation * (rand() % 2 == 1 ? 1 : -1) * (rand() % (p_angle[key_parent_2] == 0 ? 1 : p_angle[key_parent_2]));
next_p_velocity[k] = p_velocity[key_parent_2] + mutation * (rand() % 2 == 1 ? 1 : -1) * (rand() % (p_velocity[key_parent_2] == 0 ? 1 : p_velocity[key_parent_2]));
}
printf("\n");
}
printf("Solution:\n\tfitness = %d\n\tc1 = %d\n\tc2 = %d\n\tc3 = %d\n\tc4 = %d\n", fitness[best_key], c_position[best_key], c_velocity[best_key], p_angle[best_key], p_velocity[best_key]);
terminateGPU();
return 0; // comment to run tests
#pragma mark -
#pragma mark Debug
printf("\nENTERING DEBUG SCOPE:\n\n");
initiated = 0; // so the context is new
#pragma mark - GPU test
printf("GPU fitness again:\n");
int k = generation_size;
int * test_c_position = (int *) malloc(k * sizeof(int));
int * test_c_velocity = (int *) malloc(k * sizeof(int));
int * test_p_angle = (int *) malloc(k * sizeof(int));
int * test_p_velocity = (int *) malloc(k * sizeof(int));
int * test_fitness = (int *) malloc(k * sizeof(int));
for (int i = 0; i < k; i++) {
test_c_position[i] = c_position[best_key];
test_c_velocity[i] = c_velocity[best_key];
test_p_angle[i] = p_angle[best_key];
test_p_velocity[i] = p_velocity[best_key];
}
computeFitness(test_c_position, test_c_velocity, test_p_angle, test_p_velocity, test_fitness, 1);
for (int i = 0; i < k; i++) {
printf("Test Solution:\n\tfitness = %d\n\tc1 = %d\n\tc2 = %d\n\tc3 = %d\n\tc4 = %d\n", test_fitness[i], test_c_position[i], test_c_velocity[i], test_p_angle[i], test_p_velocity[i]);
break; // since all the results are the same
}
terminateGPU();
#pragma mark - CPU test and Visualization
printf("CPU fitness:\n");
char command[254];
FILE *fp;
char output[254];
// link this to to the Visualization binary
sprintf(command, "/Volumes/Data/Projects/PoleBalanceGPU/Visualization/build/Debug/Visualization %d %d %d %d", c_position[best_key], c_velocity[best_key], p_angle[best_key], p_velocity[best_key]);
fp = popen(command, "r");
if (fp == NULL) {
printf("Failed to run command\n" );
exit;
}
while (fgets(output, sizeof(output), fp) != NULL) {
printf("\t%s", output);
}
int cpu_fitness = atoi(output);
// this might fail from time to time since CPU and GPU round implementation differs
assert(fitness[best_key] == test_fitness[0] && fitness[best_key] == cpu_fitness);
return 0;
}