/**
 * Destructor: passes each of the four buffers (m_adata, m_bdata, m_cdata, m_idata)
 * to libxstream_mem_deallocate with device id -1, in that order.
 *
 * The LIBXSTREAM_CHECK_CALL_THROW wrapper is commented out around every call, so the
 * status code returned by each deallocation is discarded rather than checked.
 */
multi_dgemm_type::host_data_type::~host_data_type()
{
  /*LIBXSTREAM_CHECK_CALL_THROW*/static_cast<void>(libxstream_mem_deallocate(-1, m_adata));
  /*LIBXSTREAM_CHECK_CALL_THROW*/static_cast<void>(libxstream_mem_deallocate(-1, m_bdata));
  /*LIBXSTREAM_CHECK_CALL_THROW*/static_cast<void>(libxstream_mem_deallocate(-1, m_cdata));
  /*LIBXSTREAM_CHECK_CALL_THROW*/static_cast<void>(libxstream_mem_deallocate(-1, m_idata));
}
int multi_dgemm_type::deinit() { if (m_host_data) { int device = -1; LIBXSTREAM_CHECK_CALL(libxstream_stream_device(m_stream, &device)); LIBXSTREAM_CHECK_CALL(libxstream_fn_destroy_signature(m_signature)); LIBXSTREAM_CHECK_CALL(libxstream_stream_destroy(m_stream)); LIBXSTREAM_CHECK_CALL(libxstream_event_destroy(m_event)); LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_adata)); LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_bdata)); LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_cdata)); LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_idata)); m_host_data = 0; #if defined(LIBXSTREAM_DEBUG) m_max_batch = 0; m_signature = 0; m_stream = 0; m_event = 0; m_adata = 0; m_bdata = 0; m_cdata = 0; m_idata = 0; #endif } return LIBXSTREAM_ERROR_NONE; }
/**
 * Destructor: deallocates the four buffers (m_adata, m_bdata, m_cdata, m_idata)
 * via libxstream_mem_deallocate using device id -1, in that order.
 *
 * Each call is wrapped in LIBXSTREAM_CHECK_CALL_ASSERT.
 * NOTE(review): presumably the macro asserts on the returned status instead of
 * throwing or returning, which suits a destructor - confirm against its definition.
 */
multi_dgemm_type::host_data_type::~host_data_type()
{
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_adata));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_bdata));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_cdata));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_idata));
}
int main(int argc, char* argv[]) { size_t ndevices = 0; if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) { LIBXSTREAM_PRINT0(2, "No device found or device not ready!"); } size_t filesize = 0; FILE *const file = 1 < argc ? fileopen(argv[1], "rb", &filesize) : 0; const size_t nitems = (1 < argc && 0 == filesize && 0 < atoi(argv[1])) ? (atoi(argv[1]) * (1ULL << 20)/*MB*/) : (0 < filesize ? filesize : (512 << 20)); const size_t mbatch = LIBXSTREAM_MIN(2 < argc ? strtoul(argv[2], 0, 10) : 0/*auto*/, nitems >> 20) << 20; const size_t mstreams = LIBXSTREAM_MIN(LIBXSTREAM_MAX(3 < argc ? atoi(argv[3]) : 2, 0), LIBXSTREAM_MAX_NSTREAMS); #if !defined(_OPENMP) LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!"); #endif const size_t nstreams = LIBXSTREAM_MAX(mstreams, 1) * LIBXSTREAM_MAX(ndevices, 1), nbatch = (0 == mbatch) ? (nitems / nstreams) : mbatch, hsize = 256; size_t histogram[256/*hsize*/]; memset(histogram, 0, sizeof(histogram)); char* data; { /*allocate and initialize host memory*/ size_t i; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(-1/*host*/, (void**)&data, nitems, 0)); if (0 == filesize || nitems > fread(data, 1, filesize, file)) { for (i = 0; i < nitems; ++i) data[i] = (char)LIBXSTREAM_MOD(rand(), hsize/*POT*/); } } struct { libxstream_stream* handle; #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) libxstream_event* event; #endif size_t* histogram; char* data; } stream[(LIBXSTREAM_MAX_NDEVICES)*(LIBXSTREAM_MAX_NSTREAMS)]; { /*allocate and initialize streams and device memory*/ size_t i; for (i = 0; i < nstreams; ++i) { #if defined(NDEBUG) /*no name*/ const char *const name = 0; #else char name[128]; LIBXSTREAM_SNPRINTF(name, sizeof(name), "stream %i", (int)(i + 1)); #endif const int device = (0 < ndevices) ? ((int)(i % ndevices)) : -1; stream[i].handle = 0; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_create(0 < mstreams ? 
&stream[i].handle : 0, device, 0, name)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].data, nbatch, 0)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].histogram, hsize * sizeof(size_t), 0)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memset_zero(stream[i].histogram, hsize * sizeof(size_t), stream[i].handle)); #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_create(&stream[i].event)); #endif } /*start benchmark with no pending work*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(0)); } /*process data in chunks of size nbatch*/ const size_t nstep = nbatch * nstreams; const int end = (int)((nitems + nstep - 1) / nstep); int i; libxstream_type sizetype = LIBXSTREAM_TYPE_U32; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_autotype(sizeof(size_t), sizetype, &sizetype)); #if defined(_OPENMP) /*if (0 == ndevices) omp_set_nested(1);*/ const double start = omp_get_wtime(); #endif for (i = 0; i < end; ++i) { const size_t ibase = i * nstep, n = LIBXSTREAM_MIN(nstreams, nitems - ibase); libxstream_argument* signature; size_t j; for (j = 0; j < n; ++j) { /*enqueue work into streams*/ const size_t base = ibase + j * nbatch, size = base < nitems ? 
LIBXSTREAM_MIN(nbatch, nitems - base) : 0; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(data + base, stream[j].data, size, stream[j].handle)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 0, stream[j].data, LIBXSTREAM_TYPE_CHAR, 1, &size)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 1, stream[j].histogram, sizetype, 1, &hsize)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call((libxstream_function)makehist, signature, stream[j].handle, LIBXSTREAM_CALL_DEFAULT)); #if defined(SYNCMETHOD) && (2 <= SYNCMETHOD) /*record event*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_record(stream[j].event, stream[j].handle)); #endif } #if defined(SYNCMETHOD) for (j = 0; j < n; ++j) { /*synchronize streams*/ const size_t k = n - j - 1; /*j-reverse*/ # if (3 <= (SYNCMETHOD)) /*wait for an event within a stream*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait_event(stream[k].handle, stream[(j+nstreams-1)%n].event)); # elif (2 <= (SYNCMETHOD)) /*wait for an event on the host*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_wait(stream[k].event)); # else LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[k].handle)); # endif } #endif } { /*reduce stream-local histograms*/ LIBXSTREAM_ALIGNED(size_t local[256/*hsize*/], LIBXSTREAM_MAX_SIMD); size_t i, j; for (j = 0; j < nstreams; ++j) { LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(stream[j].histogram, local, sizeof(local), stream[j].handle)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[j].handle)); /*wait for pending work*/ for (i = 0; i < hsize; ++i) histogram[i] += local[i]; } } #if defined(_OPENMP) const double duration = omp_get_wtime() - start; #endif const double kilo = 1.0 / (1 << 10), mega = 1.0 / (1 << 20); double entropy = 0; { /*calculate entropy*/ const double log2_nitems = log2((double)nitems); size_t i; for (i = 0; i < hsize; ++i) { const double h = (double)histogram[i], log2h = 0 < h ? 
log2(h) : log2_nitems; entropy -= h * LIBXSTREAM_MIN(log2h - log2_nitems, 0); } entropy /= nitems; } if (0 < entropy) { if ((1 << 20) <= nitems) { /*mega*/ fprintf(stdout, "Compression %gx: %.1f -> %.1f MB", 8.0 / entropy, mega * nitems, mega * entropy * nitems / 8.0); } else if ((1 << 10) <= nitems) { /*kilo*/ fprintf(stdout, "Compression %gx: %.1f -> %.1f KB", 8.0 / entropy, kilo * nitems, kilo * entropy * nitems / 8.0); } else { fprintf(stdout, "Compression %gx: %.0f -> %0.f B", 8.0 / entropy, 1.0 * nitems, entropy * nitems / 8.0); } fprintf(stdout, " (redundancy %0.f%%, entropy %.0f bit)\n", 100.0 - 12.5 * entropy, entropy); } #if defined(_OPENMP) if (0 < duration) { fprintf(stdout, "Finished after %.1f s", duration); } else { fprintf(stdout, "Finished"); } #endif { /*validate result*/ size_t check = 0, i; for (i = 0; i < hsize; ++i) check += histogram[i]; if (nitems != check) { size_t expected[256/*hsize*/]; memset(expected, 0, sizeof(expected)); LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, nitems, expected); check = 0; for (i = 0; i < hsize; ++i) check += expected[i] == histogram[i] ? 0 : 1; fprintf(stdout, " with %llu error%s\n", (unsigned long long)check, 1 != check ? "s" : ""); } else { fprintf(stdout, "\n"); } } { /*release resources*/ size_t i; for (i = 0; i < nstreams; ++i) { int device = -1; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_device(stream[i].handle, &device)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].histogram)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].data)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_destroy(stream[i].handle)); #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_destroy(stream[i].event)); #endif } LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1/*host*/, data)); } return EXIT_SUCCESS; }