int multi_dgemm_type::init(const char* name, host_data_type& host_data, int device, size_t max_batch) { LIBXSTREAM_CHECK_CALL(deinit()); m_host_data = &host_data; m_max_batch = max_batch; const size_t max_msize = m_max_batch * m_host_data->max_matrix_size(); LIBXSTREAM_CHECK_CALL(libxstream_stream_create(&m_stream, device, 0, name)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_adata), sizeof(double) * max_msize, 0)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_bdata), sizeof(double) * max_msize, 0)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_cdata), sizeof(double) * max_msize, 0)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_idata), sizeof(size_t) * max_batch, 0)); return LIBXSTREAM_ERROR_NONE; }
int multi_dgemm_type::init(const char* name, host_data_type& host_data, int device, int demux, size_t max_batch) { LIBXSTREAM_CHECK_CALL(deinit()); const size_t max_msize = max_batch * host_data.max_matrix_size(); m_host_data = &host_data; m_max_batch = max_batch; LIBXSTREAM_CHECK_CALL(libxstream_stream_create(&m_stream, device, demux, 0, name)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_adata), sizeof(double) * max_msize, 0)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_bdata), sizeof(double) * max_msize, 0)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_cdata), sizeof(double) * max_msize, 0)); LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast<void**>(&m_idata), sizeof(size_t) * max_batch, 0)); LIBXSTREAM_CHECK_CALL(libxstream_fn_create_signature(&m_signature, 6)); LIBXSTREAM_CHECK_CALL(libxstream_fn_input (m_signature, 2, m_idata, libxstream_map_to_type(m_idata), 1, &max_msize)); LIBXSTREAM_CHECK_CALL(libxstream_fn_input (m_signature, 3, m_adata, libxstream_map_to_type(m_adata), 1, &max_msize)); LIBXSTREAM_CHECK_CALL(libxstream_fn_input (m_signature, 4, m_bdata, libxstream_map_to_type(m_bdata), 1, &max_msize)); LIBXSTREAM_CHECK_CALL(libxstream_fn_output(m_signature, 5, m_cdata, libxstream_map_to_type(m_cdata), 1, &max_msize)); return LIBXSTREAM_ERROR_NONE; }
// Builds the host-side test data: `size` square matrices for each of A, B
// and C, stored back to back, plus an index table with `size + 1` offsets.
//
// process  stored in m_process
// size     total number of matrices
// split    split[0] matrices of order 100 come first, followed by split[1]
//          matrices of order 600; all remaining matrices are of order 1000.
//          Counts exceeding `size` are clamped so the index table is never
//          written out of bounds.
//
// m_idata[i] is the element offset of matrix i, and m_idata[size] is the
// total element count. m_flops accumulates nn * (2 * n + 1) per matrix of
// order n (nn = n * n). A and B are filled with pseudo-random values in
// [-1, 1]; C is zeroed.
//
// Allocation failures are reported through LIBXSTREAM_CHECK_CALL_THROW.
// NOTE(review): buffers allocated before a failing step are not released
// here; confirm whether that matters to callers.
multi_dgemm_type::host_data_type::host_data_type(libxstream_function process, size_t size, const size_t split[])
  : m_process(process)
  , m_adata(0), m_bdata(0), m_cdata(0), m_idata(0)
  , m_size(size), m_flops(0)
{
  LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast<void**>(&m_idata), sizeof(size_t) * (size + 1), 0));

  // Clamp the group boundaries to [0, size]; written so that the sum
  // split[0] + split[1] cannot wrap around.
  const size_t small_end = split[0] < size ? split[0] : size;
  const size_t medium_room = size - small_end;
  const size_t medium_end = small_end + (split[1] < medium_room ? split[1] : medium_room);

  // One entry per matrix group: exclusive end index and matrix order.
  const size_t group_end[] = { small_end, medium_end, size };
  const size_t group_order[] = { 100, 600, 1000 };
  const size_t ngroups = sizeof(group_order) / sizeof(*group_order);

  size_t msize = 0;
  size_t i = 0;
  for (size_t g = 0; g < ngroups; ++g) {
    const size_t n = group_order[g], nn = n * n;
    for (; i < group_end[g]; ++i) {
      m_flops += nn * (2 * n + 1);
      m_idata[i] = msize;
      msize += nn;
    }
  }
  m_idata[size] = msize;

  LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast<void**>(&m_adata), sizeof(double) * msize, 0));
  LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast<void**>(&m_bdata), sizeof(double) * msize, 0));
  LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast<void**>(&m_cdata), sizeof(double) * msize, 0));

  static const double scale = 1.0 / RAND_MAX;
  for (size_t k = 0; k < msize; ++k) {
    // Evaluate in double: 2 * rand() overflows int when RAND_MAX == INT_MAX.
    m_adata[k] = scale * (2.0 * std::rand() - RAND_MAX);
    m_bdata[k] = scale * (2.0 * std::rand() - RAND_MAX);
    m_cdata[k] = 0;
  }
}
int main(int argc, char* argv[]) { size_t ndevices = 0; if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) { LIBXSTREAM_PRINT0(2, "No device found or device not ready!"); } size_t filesize = 0; FILE *const file = 1 < argc ? fileopen(argv[1], "rb", &filesize) : 0; const size_t nitems = (1 < argc && 0 == filesize && 0 < atoi(argv[1])) ? (atoi(argv[1]) * (1ULL << 20)/*MB*/) : (0 < filesize ? filesize : (512 << 20)); const size_t mbatch = LIBXSTREAM_MIN(2 < argc ? strtoul(argv[2], 0, 10) : 0/*auto*/, nitems >> 20) << 20; const size_t mstreams = LIBXSTREAM_MIN(LIBXSTREAM_MAX(3 < argc ? atoi(argv[3]) : 2, 0), LIBXSTREAM_MAX_NSTREAMS); #if !defined(_OPENMP) LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!"); #endif const size_t nstreams = LIBXSTREAM_MAX(mstreams, 1) * LIBXSTREAM_MAX(ndevices, 1), nbatch = (0 == mbatch) ? (nitems / nstreams) : mbatch, hsize = 256; size_t histogram[256/*hsize*/]; memset(histogram, 0, sizeof(histogram)); char* data; { /*allocate and initialize host memory*/ size_t i; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(-1/*host*/, (void**)&data, nitems, 0)); if (0 == filesize || nitems > fread(data, 1, filesize, file)) { for (i = 0; i < nitems; ++i) data[i] = (char)LIBXSTREAM_MOD(rand(), hsize/*POT*/); } } struct { libxstream_stream* handle; #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) libxstream_event* event; #endif size_t* histogram; char* data; } stream[(LIBXSTREAM_MAX_NDEVICES)*(LIBXSTREAM_MAX_NSTREAMS)]; { /*allocate and initialize streams and device memory*/ size_t i; for (i = 0; i < nstreams; ++i) { #if defined(NDEBUG) /*no name*/ const char *const name = 0; #else char name[128]; LIBXSTREAM_SNPRINTF(name, sizeof(name), "stream %i", (int)(i + 1)); #endif const int device = (0 < ndevices) ? ((int)(i % ndevices)) : -1; stream[i].handle = 0; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_create(0 < mstreams ? 
&stream[i].handle : 0, device, 0, name)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].data, nbatch, 0)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].histogram, hsize * sizeof(size_t), 0)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memset_zero(stream[i].histogram, hsize * sizeof(size_t), stream[i].handle)); #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_create(&stream[i].event)); #endif } /*start benchmark with no pending work*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(0)); } /*process data in chunks of size nbatch*/ const size_t nstep = nbatch * nstreams; const int end = (int)((nitems + nstep - 1) / nstep); int i; libxstream_type sizetype = LIBXSTREAM_TYPE_U32; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_autotype(sizeof(size_t), sizetype, &sizetype)); #if defined(_OPENMP) /*if (0 == ndevices) omp_set_nested(1);*/ const double start = omp_get_wtime(); #endif for (i = 0; i < end; ++i) { const size_t ibase = i * nstep, n = LIBXSTREAM_MIN(nstreams, nitems - ibase); libxstream_argument* signature; size_t j; for (j = 0; j < n; ++j) { /*enqueue work into streams*/ const size_t base = ibase + j * nbatch, size = base < nitems ? 
LIBXSTREAM_MIN(nbatch, nitems - base) : 0; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(data + base, stream[j].data, size, stream[j].handle)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 0, stream[j].data, LIBXSTREAM_TYPE_CHAR, 1, &size)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 1, stream[j].histogram, sizetype, 1, &hsize)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call((libxstream_function)makehist, signature, stream[j].handle, LIBXSTREAM_CALL_DEFAULT)); #if defined(SYNCMETHOD) && (2 <= SYNCMETHOD) /*record event*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_record(stream[j].event, stream[j].handle)); #endif } #if defined(SYNCMETHOD) for (j = 0; j < n; ++j) { /*synchronize streams*/ const size_t k = n - j - 1; /*j-reverse*/ # if (3 <= (SYNCMETHOD)) /*wait for an event within a stream*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait_event(stream[k].handle, stream[(j+nstreams-1)%n].event)); # elif (2 <= (SYNCMETHOD)) /*wait for an event on the host*/ LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_wait(stream[k].event)); # else LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[k].handle)); # endif } #endif } { /*reduce stream-local histograms*/ LIBXSTREAM_ALIGNED(size_t local[256/*hsize*/], LIBXSTREAM_MAX_SIMD); size_t i, j; for (j = 0; j < nstreams; ++j) { LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(stream[j].histogram, local, sizeof(local), stream[j].handle)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[j].handle)); /*wait for pending work*/ for (i = 0; i < hsize; ++i) histogram[i] += local[i]; } } #if defined(_OPENMP) const double duration = omp_get_wtime() - start; #endif const double kilo = 1.0 / (1 << 10), mega = 1.0 / (1 << 20); double entropy = 0; { /*calculate entropy*/ const double log2_nitems = log2((double)nitems); size_t i; for (i = 0; i < hsize; ++i) { const double h = (double)histogram[i], log2h = 0 < h ? 
log2(h) : log2_nitems; entropy -= h * LIBXSTREAM_MIN(log2h - log2_nitems, 0); } entropy /= nitems; } if (0 < entropy) { if ((1 << 20) <= nitems) { /*mega*/ fprintf(stdout, "Compression %gx: %.1f -> %.1f MB", 8.0 / entropy, mega * nitems, mega * entropy * nitems / 8.0); } else if ((1 << 10) <= nitems) { /*kilo*/ fprintf(stdout, "Compression %gx: %.1f -> %.1f KB", 8.0 / entropy, kilo * nitems, kilo * entropy * nitems / 8.0); } else { fprintf(stdout, "Compression %gx: %.0f -> %0.f B", 8.0 / entropy, 1.0 * nitems, entropy * nitems / 8.0); } fprintf(stdout, " (redundancy %0.f%%, entropy %.0f bit)\n", 100.0 - 12.5 * entropy, entropy); } #if defined(_OPENMP) if (0 < duration) { fprintf(stdout, "Finished after %.1f s", duration); } else { fprintf(stdout, "Finished"); } #endif { /*validate result*/ size_t check = 0, i; for (i = 0; i < hsize; ++i) check += histogram[i]; if (nitems != check) { size_t expected[256/*hsize*/]; memset(expected, 0, sizeof(expected)); LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, nitems, expected); check = 0; for (i = 0; i < hsize; ++i) check += expected[i] == histogram[i] ? 0 : 1; fprintf(stdout, " with %llu error%s\n", (unsigned long long)check, 1 != check ? "s" : ""); } else { fprintf(stdout, "\n"); } } { /*release resources*/ size_t i; for (i = 0; i < nstreams; ++i) { int device = -1; LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_device(stream[i].handle, &device)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].histogram)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].data)); LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_destroy(stream[i].handle)); #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_destroy(stream[i].event)); #endif } LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1/*host*/, data)); } return EXIT_SUCCESS; }