/*
 * Parse a command-line argument as a base-10 int.
 *
 * Returns 1 and stores the value in *out when the whole string is a number
 * that fits in an int; returns 0 otherwise and leaves *out untouched.
 */
static int parse_int_arg(const char *text, int *out)
{
    char *end;
    long value = strtol(text, &end, 10);

    /* Reject empty strings and trailing garbage such as "12abc". */
    if (end == text || *end != '\0')
        return 0;

    /*
     * Largest int, computed without <limits.h>: all-ones unsigned shifted
     * right by one. Values outside [-int_max - 1, int_max] cannot be
     * represented in the int that is handed to sincos_.
     */
    const long int_max = (long)(~0u >> 1);
    if (value > int_max || value < -int_max - 1)
        return 0;

    *out = (int)value;
    return 1;
}

/*
 * Entry point: reads three grid dimensions from the command line and passes
 * them by address to sincos_.
 *
 * Usage: <program> <nx> <ny> <nz>
 *
 * Returns 0 on success and 1 when the argument count is wrong or an argument
 * is not a valid integer.
 */
int main(int argc, char* argv[])
{
    if (argc != 4)
    {
        printf("Usage: %s <nx> <ny> <nz>\n", argv[0]);
        /* A usage error is a failure: report it through the exit status. */
        return 1;
    }

    int nx;
    int ny;
    int nz;
    if (!parse_int_arg(argv[1], &nx) ||
        !parse_int_arg(argv[2], &ny) ||
        !parse_int_arg(argv[3], &nz))
    {
        printf("Invalid dimensions: \"%s\" \"%s\" \"%s\" (expected integers)\n",
            argv[1], argv[2], argv[3]);
        return 1;
    }

    sincos_(&nx, &ny, &nz);

    return 0;
}
/** * Create a gear wheel. * * @param inner_radius radius of hole at center * @param outer_radius radius at center of teeth * @param width width of gear * @param teeth number of teeth * @param tooth_depth depth of tooth * * @return pointer to the constructed struct gear */ static struct gear * create_gear(struct pipe_screen *screen, struct pipe_context *pipe, float inner_radius, float outer_radius, float width, int teeth, float tooth_depth) { float r0, r1, r2; float da; GearVertex *v; struct gear *gear; double s[5], c[5]; float normal[3]; int cur_strip = 0; int i; /* Allocate memory for the gear */ gear = malloc(sizeof *gear); if (gear == NULL) return NULL; /* Calculate the radii used in the gear */ r0 = inner_radius; r1 = outer_radius - tooth_depth / 2.0; r2 = outer_radius + tooth_depth / 2.0; da = 2.0 * M_PI / teeth / 4.0; /* Allocate memory for the triangle strip information */ gear->nstrips = STRIPS_PER_TOOTH * teeth; gear->strips = calloc(gear->nstrips, sizeof (*gear->strips)); /* Allocate memory for the vertices */ gear->vertices = calloc(VERTICES_PER_TOOTH * teeth, sizeof(*gear->vertices)); v = gear->vertices; for (i = 0; i < teeth; i++) { /* Calculate needed sin/cos for varius angles */ sincos_(i * 2.0 * M_PI / teeth, &s[0], &c[0]); sincos_(i * 2.0 * M_PI / teeth + da, &s[1], &c[1]); sincos_(i * 2.0 * M_PI / teeth + da * 2, &s[2], &c[2]); sincos_(i * 2.0 * M_PI / teeth + da * 3, &s[3], &c[3]); sincos_(i * 2.0 * M_PI / teeth + da * 4, &s[4], &c[4]); /* A set of macros for making the creation of the gears easier */ #define GEAR_POINT(r, da) { (r) * c[(da)], (r) * s[(da)] } #define SET_NORMAL(x, y, z) do { \ normal[0] = (x); normal[1] = (y); normal[2] = (z); \ } while(0) #define GEAR_VERT(v, point, sign) vert((v), p[(point)].x, p[(point)].y, (sign) * width * 0.5, normal) #define START_STRIP do { \ gear->strips[cur_strip].first = v - gear->vertices; \ } while(0); #define END_STRIP do { \ int _tmp = (v - gear->vertices); \ gear->strips[cur_strip].count = _tmp - 
gear->strips[cur_strip].first; \ cur_strip++; \ } while (0) #define QUAD_WITH_NORMAL(p1, p2) do { \ SET_NORMAL((p[(p1)].y - p[(p2)].y), -(p[(p1)].x - p[(p2)].x), 0); \ v = GEAR_VERT(v, (p1), -1); \ v = GEAR_VERT(v, (p1), 1); \ v = GEAR_VERT(v, (p2), -1); \ v = GEAR_VERT(v, (p2), 1); \ } while(0) struct point { float x; float y; }; /* Create the 7 points (only x,y coords) used to draw a tooth */ struct point p[7] = { GEAR_POINT(r2, 1), // 0 GEAR_POINT(r2, 2), // 1 GEAR_POINT(r1, 0), // 2 GEAR_POINT(r1, 3), // 3 GEAR_POINT(r0, 0), // 4 GEAR_POINT(r1, 4), // 5 GEAR_POINT(r0, 4), // 6 }; /* Front face */ START_STRIP; SET_NORMAL(0, 0, 1.0); v = GEAR_VERT(v, 0, +1); v = GEAR_VERT(v, 1, +1); v = GEAR_VERT(v, 2, +1); v = GEAR_VERT(v, 3, +1); v = GEAR_VERT(v, 4, +1); v = GEAR_VERT(v, 5, +1); v = GEAR_VERT(v, 6, +1); END_STRIP; /* Inner face */ START_STRIP; QUAD_WITH_NORMAL(4, 6); END_STRIP; /* Back face */ START_STRIP; SET_NORMAL(0, 0, -1.0); v = GEAR_VERT(v, 6, -1); v = GEAR_VERT(v, 5, -1); v = GEAR_VERT(v, 4, -1); v = GEAR_VERT(v, 3, -1); v = GEAR_VERT(v, 2, -1); v = GEAR_VERT(v, 1, -1); v = GEAR_VERT(v, 0, -1); END_STRIP; /* Outer face */ START_STRIP; QUAD_WITH_NORMAL(0, 2); END_STRIP; START_STRIP; QUAD_WITH_NORMAL(1, 0); END_STRIP; START_STRIP; QUAD_WITH_NORMAL(3, 1); END_STRIP; START_STRIP; QUAD_WITH_NORMAL(5, 3); END_STRIP; } gear->nvertices = (v - gear->vertices); /* element layout */ struct pipe_vertex_element pipe_vertex_elements[] = { { /* positions */ .src_offset = 0x0, .instance_divisor = 0, .vertex_buffer_index = 0, .src_format = PIPE_FORMAT_R32G32B32_FLOAT }, { /* normals */ .src_offset = 0xc,
int main(int argc, char* argv[]) { if (argc != 5) { printf("Usage: %s <nx> <ny> <ns> <nt>\n", argv[0]); exit(1); } const char* no_timing = getenv("NO_TIMING"); #if defined(_OPENACC) char* regcount_fname = getenv("OPENACC_PROFILING_FNAME"); if (regcount_fname) { char* regcount_lineno = getenv("OPENACC_PROFILING_LINENO"); int lineno = -1; if (regcount_lineno) lineno = atoi(regcount_lineno); //kernelgen_enable_openacc_regcount(regcount_fname, lineno); } #endif parse_arg(nx, argv[1]); parse_arg(ny, argv[2]); parse_arg(ns, argv[3]); parse_arg(nt, argv[4]); size_t szarray = (size_t)nx * ny * ns; size_t szarrayb = szarray * sizeof(real); real* x = (real*)memalign(MEMALIGN, szarrayb); real* y = (real*)memalign(MEMALIGN, szarrayb); real* xy = (real*)memalign(MEMALIGN, szarrayb); if (!x || !y || !xy) { printf("Error allocating memory for arrays: %p, %p, %p\n", x, y, xy); exit(1); } real mean = 0.0f; for (int i = 0; i < szarray; i++) { x[i] = real_rand(); y[i] = real_rand(); xy[i] = real_rand(); mean += x[i] + y[i] + xy[i]; } printf("initial mean = %f\n", mean / szarray / 3); // // MIC or OPENACC: // // 1) Perform an empty offload, that should strip // the initialization time from further offloads. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec init_s, init_f; #if defined(_MIC) get_time(&init_s); #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(0) free_if(0)), \ nocopy(y:length(szarray) alloc_if(0) free_if(0)), \ nocopy(xy:length(szarray) alloc_if(0) free_if(0)) { } get_time(&init_f); #endif #if defined(_OPENACC) get_time(&init_s); acc_init(acc_device_default); get_time(&init_f); #endif double init_t = get_time_diff((struct timespec*)&init_s, (struct timespec*)&init_f); if (!no_timing) printf("init time = %f sec\n", init_t); #endif volatile struct timespec total_s, total_f; get_time(&total_s); // // MIC or OPENACC: // // 2) Allocate data on device, but do not copy anything. 
// #if defined(_MIC) || defined(_OPENACC) volatile struct timespec alloc_s, alloc_f; #if defined(_MIC) get_time(&alloc_s); #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(1) free_if(0)), \ nocopy(y:length(szarray) alloc_if(1) free_if(0)), \ nocopy(xy:length(szarray) alloc_if(1) free_if(0)) { } get_time(&alloc_f); #endif #if defined(_OPENACC) get_time(&alloc_s); #pragma acc data create (x[0:szarray], y[0:szarray], xy[0:szarray]) { get_time(&alloc_f); #endif double alloc_t = get_time_diff((struct timespec*)&alloc_s, (struct timespec*)&alloc_f); if (!no_timing) printf("device buffer alloc time = %f sec\n", alloc_t); #endif // // MIC or OPENACC: // // 3) Transfer data from host to device and leave it there, // i.e. do not allocate deivce memory buffers. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec load_s, load_f; #if defined(_MIC) get_time(&load_s); #pragma offload target(mic) \ in(x:length(szarray) alloc_if(0) free_if(0)), \ in(y:length(szarray) alloc_if(0) free_if(0)), \ in(xy:length(szarray) alloc_if(0) free_if(0)) { } get_time(&load_f); #endif #if defined(_OPENACC) get_time(&load_s); #pragma acc update device(x[0:szarray], y[0:szarray], xy[0:szarray]) get_time(&load_f); #endif double load_t = get_time_diff((struct timespec*)&load_s, (struct timespec*)&load_f); if (!no_timing) printf("data load time = %f sec (%f GB/sec)\n", load_t, 2 * szarrayb / (load_t * 1024 * 1024 * 1024)); #endif // // 4) Perform data processing iterations, keeping all data // on device. 
// volatile struct timespec compute_s, compute_f; get_time(&compute_s); #if defined(_MIC) #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(0) free_if(0)), \ nocopy(y:length(szarray) alloc_if(0) free_if(0)), \ nocopy(xy:length(szarray) alloc_if(0) free_if(0)) #endif { for (int it = 0; it < nt; it++) { #if defined(_PATUS) real* dummy; #pragma omp parallel sincos_patus(&dummy, x, y, xy, nx, ny, ns); #else sincos_(&nx, &ny, &ns, x, y, xy); #endif } } get_time(&compute_f); double compute_t = get_time_diff((struct timespec*)&compute_s, (struct timespec*)&compute_f); if (!no_timing) printf("compute time = %f sec\n", compute_t); // // MIC or OPENACC: // // 5) Transfer output data back from device to host. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec save_s, save_f; #if defined(_MIC) get_time(&save_s); #pragma offload target(mic) \ out(xy:length(szarray) alloc_if(0) free_if(0)) { } get_time(&save_f); #endif #if defined(_OPENACC) get_time(&save_s); #pragma acc update host (xy[0:szarray]) get_time(&save_f); #endif double save_t = get_time_diff((struct timespec*)&save_s, (struct timespec*)&save_f); if (!no_timing) printf("data save time = %f sec (%f GB/sec)\n", save_t, szarrayb / (save_t * 1024 * 1024 * 1024)); #endif // // MIC or OPENACC: // // 6) Deallocate device data buffers. // OPENACC does not seem to have explicit deallocation. 
// #if defined(_OPENACC) } #endif #if defined(_MIC) volatile struct timespec free_s, free_f; get_time(&free_s); #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(0) free_if(1)), \ nocopy(y:length(szarray) alloc_if(0) free_if(1)), \ nocopy(xy:length(szarray) alloc_if(0) free_if(1)) { } get_time(&free_f); double free_t = get_time_diff((struct timespec*)&free_s, (struct timespec*)&free_f); // if (!no_timing) printf("device buffer free time = %f sec\n", free_t); #endif get_time(&total_f); if (!no_timing) printf("device buffer free time = %f sec\n", get_time_diff((struct timespec*)&total_s, (struct timespec*)&total_f)); // For the final mean - account only the norm of the top // most level (tracked by swapping idxs array of indexes). mean = 0.0f; for (int i = 0; i < szarray; i++) mean += xy[i]; printf("final mean = %f\n", mean / szarray); free(x); free(y); free(xy); fflush(stdout); return 0; }