/**
 * Runs K successive box_filter() passes of identical radius over a strided
 * 1-D signal and multiplies the result by 1 / (2r + 1)^K.
 *
 * The passes ping-pong between the destination array and a scratch buffer;
 * the final, scaled result is always left in dest_data.
 *
 * All preconditions below are checked with assert() only.
 *
 * @param dest_data    output samples, accessed at dest_data[stride * n];
 *                     may be equal to src (in-place operation)
 * @param buffer_data  scratch array, must differ from dest_data; accessed
 *                     with unit stride unless it is the same pointer as src,
 *                     in which case it is accessed with `stride`
 * @param src          input samples, accessed with `stride`
 * @param N            number of samples, N > 0
 * @param stride       distance between successive samples in src and dest
 * @param sigma        Gaussian standard deviation, sigma > 0; only used to
 *                     derive the box radius
 * @param K            number of box filter passes, K > 0
 *
 * NOTE(review): box_filter() is defined elsewhere; from the call sites it is
 * used as box_filter(out, out_stride, in, in_stride, N, radius). The final
 * scaling suggests each pass is an unnormalized sum over 2r + 1 samples --
 * confirm against its definition.
 */
void box_gaussian_conv_(T *dest_data, T *buffer_data, const T *src,
    long N, long stride, T sigma, int K)
{
    struct
    {
        T *data;
        long stride;
    } dest, buffer, cur, next;
    T scale;
    long r;
    int step;

    assert(dest_data && buffer_data && src && dest_data != buffer_data
        && N > 0 && sigma > 0 && K > 0);

    /* Compute the box radius according to Wells' formula. */
    r = (long)(0.5 * sqrt((12.0 * sigma * sigma) / K + 1.0));
    /* One common normalization factor for all K passes. */
    scale = (T)(1.0 / (double)pow(2.0*r + 1.0, K));

    dest.data = dest_data;
    dest.stride = stride;
    buffer.data = buffer_data;
    /* A buffer that aliases src has to keep the layout of src. */
    buffer.stride = (buffer_data == src) ? stride : 1;

    /* Here we decide whether dest or buffer should be the first output
       array. If K is even, then buffer is the better choice so that the
       result is in dest after K iterations, e.g. for K = 4,

          src -> buffer -> dest -> buffer -> dest.

       If K is odd, we would like to choose dest, e.g. for K = 3,

          src -> dest -> buffer -> dest.

       However, if src and dest point to the same memory (i.e., in-place
       computation), then we must select buffer as the first output array.
       Conversely, if buffer aliases src, the first output must be dest. */
    if (buffer_data == src || (dest_data != src && K % 2 == 1))
        next = dest;
    else
        next = buffer;

    /* Perform the first step of box filtering. */
    box_filter(next.data, next.stride, src, stride, N, r);

    /* Perform another (K - 1) steps of box filtering, alternating the roles
       of the dest and buffer arrays. */
    for (step = 1; step < K; ++step)
    {
        cur = next;
        next = (cur.data == buffer_data) ? dest : buffer;
        box_filter(next.data, next.stride, cur.data, cur.stride, N, r);
    }

    /* Multiply by the constant scale factor. */
    if (next.data != dest_data)
    {
        /* The result ended up in the buffer: scale while copying to dest.
           The buffer is read with its own stride, which differs from 1
           when buffer_data aliases src. */
        long n, i, j;

        for (n = i = j = 0; n < N; ++n, i += stride, j += buffer.stride)
            dest_data[i] = buffer_data[j] * scale;
    }
    else
    {
        /* The result is already in dest: scale it in place. */
        long i, i_end = stride * N;

        for (i = 0; i < i_end; i += stride)
            dest_data[i] *= scale;
    }
}
// --------------------------------------------------------------------------- /// Creates piece of plasma (velocity distribution to be set later). // --------------------------------------------------------------------------- double tag_plasma (FILE *fp) { // Cell sub-sampling and velocity space resolution. plasma_nx = cfg_readInt (fp); plasma_ny = cfg_readInt (fp); plasma_nz = cfg_readInt (fp); plasma_layers = cfg_readInt (fp); // Reads all exctrusion elements. while (cfg_isParameter (fp)) { const char *element = cfg_readWord (fp); if (!strcmp (element, "planes")) { convex_read (fp); // Reads set of planes. } else if (!strcmp (element, "spheres")) { sphere_read (fp); // Reads set of spheres. } else if (!strcmp (element, "cylinders")) { cylinder_read (fp); // Reads set of cylinders. } else if (!strcmp (element, "boxes")) { box_read (fp); // Reads set of boxes. } else { DIE ("bad shape '%s' ('planes', 'spheres', or 'cylinders' expected)", element); } } // Deactivates axises if necessary. plasma_nx = mc_have_x ? plasma_nx : 1; plasma_ny = mc_have_y ? plasma_ny : 1; plasma_nz = mc_have_z ? plasma_nz : 1; plasma_PPC = plasma_nx*plasma_ny*plasma_nz*plasma_layers; // Checks parameters. ENSURE (plasma_nx > 0 && plasma_ny > 0 && plasma_nz > 0 && plasma_layers > 0, "bad nx(%d), ny(%d), nz(%d) or number of velocity layers (%d)", plasma_nx, plasma_ny, plasma_nz, plasma_layers); say ("tag_plasma: "); say (" - %d x %d x %d in-cell spacing", plasma_nx, plasma_ny, plasma_nz); say (" - %d velocity layers", plasma_layers); say (" - %d particles per cell", plasma_PPC); marker_t cell[plasma_PPC], filtered[plasma_PPC]; double dx = h1*mc_have_x/(double) plasma_nx, dy = h2*mc_have_y/(double) plasma_ny, dz = h3*mc_have_z/(double) plasma_nz; // 'p' enumerates particles in cell. int p = 0; for (int i = 0 ; i < plasma_nx ; ++i) for (int j = 0 ; j < plasma_ny ; ++j) for (int k = 0 ; k < plasma_nz ; ++k) { // Fills sample (NAN) to catch unitialized variables. 
marker_t m = {.x = (i + 0.5)*dx, .y = (j + 0.5)*dy, .z = (k + 0.5)*dz, .vx = NAN, .vy = NAN, .vz = NAN, .rho = NAN, .qDivM = NAN}; for (int l = 0 ; l < plasma_layers ; ++l) { cell[p] = m; cell[p].vx = p++; // Stores enumeration position. } } double memUsage = 0; plasma_newObject (); for (int I = cpu_min[0] ; I < cpu_max[0] + 1 - mc_have_x ; ++I) for (int J = cpu_min[1] ; J < cpu_max[1] + 1 - mc_have_y ; ++J) for (int K = cpu_min[2] ; K < cpu_max[2] + 1 - mc_have_z ; ++K) { // Adds cell data into current cell. for (int p = 0 ; p < plasma_PPC ; ++p) { filtered[p] = cell[p]; filtered[p].x += I*h1*mc_have_x; filtered[p].y += J*h2*mc_have_y; filtered[p].z += K*h3*mc_have_z; } // Set of filter extrusions. int PPC = plasma_PPC; for (convex_t *b = polys, *b2 = b + polysN ; b < b2 ; ++b) { PPC = convex_filter (filtered, PPC, b); } for (sphere_t *s = spheres, *s2 = s + spheresN ; s < s2 ; ++s) { PPC = sphere_filter (filtered, PPC, s); } for (cylinder_t *c = cylinders, *c2 = c + cylindersN ; c < c2 ; ++c) { PPC = cylinder_filter (filtered, PPC, c); } for (box_t *b = boxes, *b2 = b + boxesN ; b < b2 ; ++b) { PPC = box_filter (filtered, PPC, b); } // Adds particles to the storage. for (int p = 0 ; p < PPC && (!memEstimateOnly) ; ++p) { *plasma_marker () = filtered[p]; } memUsage += PPC; } say (" - %.3e particles on cpu %d", memUsage, cpu_here); // Turns off all filters. box_free (); convex_free (); cylinder_free (); sphere_free (); return memUsage*sizeof (marker_t); // Returns memory usage. }