int libxstream_construct(libxstream_argument arguments[], size_t arg, libxstream_argument::kind_type kind, const void* value, libxstream_type type, size_t dims, const size_t shape[]) { size_t typesize = 0; const bool weak_candidate = LIBXSTREAM_TYPE_VOID == type || (LIBXSTREAM_ERROR_NONE == libxstream_get_typesize(type, &typesize) && 1 == typesize); LIBXSTREAM_CHECK_CONDITION((((libxstream_argument::kind_invalid == kind || libxstream_argument::kind_inout == kind) && LIBXSTREAM_TYPE_INVALID == type) || LIBXSTREAM_TYPE_INVALID > type) && ((0 == dims && 0 == shape) || (0 == dims && 0 != shape && weak_candidate) || (0 < dims)) && (LIBXSTREAM_MAX_NDIMS) >= dims); LIBXSTREAM_ASSERT((LIBXSTREAM_MAX_NARGS) >= arg); libxstream_argument& argument = arguments[arg]; #if defined(LIBXSTREAM_DEBUG) memset(argument.data.self, 0, sizeof(libxstream_argument)); // avoid false pos. with mem. analysis #endif #if defined(LIBXSTREAM_PRINT) static const char *const context[] = { "", "input", "output", "inout" }; #endif argument.kind = kind; argument.dims = dims; if (shape) { if (0 < dims || !weak_candidate) { #if defined(LIBXSTREAM_PRINT) if (0 == dims && !weak_candidate) { LIBXSTREAM_PRINT_WARN("libxstream_fn_%s: signature=0x%llx arg=%lu is strong-typed (ignored shape)!", context[kind], reinterpret_cast<unsigned long long>(arguments), static_cast<unsigned long>(arg)); } #endif argument.type = type; } else { // 0 == dims && weak_candidate argument.type = LIBXSTREAM_TYPE_VOID; LIBXSTREAM_CHECK_CONDITION(sizeof(libxstream_argument::data_union) >= *shape); argument.shape[0] = shape[0]; } #if defined(__INTEL_COMPILER) # pragma loop_count min(0), max(LIBXSTREAM_MAX_NDIMS), avg(2) #endif for (size_t i = 0; i < dims; ++i) argument.shape[i] = shape[i]; } else { #if defined(LIBXSTREAM_PRINT) if (0 < dims && 0 == shape) { LIBXSTREAM_PRINT_WARN("libxstream_fn_%s: signature=0x%llx arg=%lu is weak-typed (no shape information)!", context[kind], reinterpret_cast<unsigned long long>(arguments), 
static_cast<unsigned long>(arg)); } #endif std::fill_n(argument.shape, dims, 0); argument.type = type; } return libxstream_argument::kind_invalid != kind ? libxstream_set_value(argument, value) : LIBXSTREAM_ERROR_NONE; }
/**
 * Processes the batch of `size` entries starting at `index` on m_stream.
 *
 * The slices of the host-side a-, b- and c-data as well as the corresponding
 * index entries are copied into the device buffers, the first two arguments of
 * the prepared signature (batch size, extent of the last entry) are updated,
 * the process function is called, and the c-data is copied back to the host.
 *
 * @param index First entry of the batch (position within the host index data).
 * @param size  Number of entries in the batch; nothing is done for zero.
 * @return LIBXSTREAM_ERROR_NONE. The precondition (ready(), batch within the
 *         host data) is handled by LIBXSTREAM_CHECK_CONDITION (presumably an
 *         early error return - confirm against the macro definition).
 */
int multi_dgemm_type::operator()(size_t index, size_t size)
{
  LIBXSTREAM_CHECK_CONDITION(ready() && (index + size) <= m_host_data->size());

  if (0 < size) {
    if (0 == demux()) {
      // This manual synchronization prevents multiple threads from queuing work into the *same* stream (at the same time).
      // This is only needed if the stream was created without demux support in order to rely on manual synchronization.
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_lock(m_stream));
    }

    // The index data delimits the range [i0, i1) of elements belonging to this batch.
    const size_t i0 = m_host_data->idata()[index], i1 = m_host_data->idata()[index+size];
    const size_t nbytes = sizeof(double) * (i1 - i0);
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->adata() + i0, m_adata, nbytes, m_stream));
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->bdata() + i0, m_bdata, nbytes, m_stream));
    // transferring cdata is part of the benchmark; since it is all zeros we could do better with libxstream_memset_zero
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->cdata() + i0, m_cdata, nbytes, m_stream));
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->idata() + index, m_idata, sizeof(size_t) * size, m_stream));

#if defined(LIBXSTREAM_DEBUG)
    // The prepared signature is expected to carry six arguments.
    size_t n = 0;
    LIBXSTREAM_ASSERT(LIBXSTREAM_ERROR_NONE == libxstream_fn_nargs(m_signature, &n) && 6 == n);
#endif
    // Extent of the last entry of the batch (the index data stores offsets only).
    const size_t nn = i1 - m_host_data->idata()[index+size-1];
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(m_signature, 0, &size, libxstream_map_to_type(size), 0, 0));
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(m_signature, 1,   &nn, libxstream_map_to_type(nn  ), 0, 0));
#if defined(LIBXSTREAM_DEBUG)
    // Kept under the same guard as the declaration of "n": used outside of it, the assertion
    // cannot be compiled whenever LIBXSTREAM_ASSERT is active without LIBXSTREAM_DEBUG.
    LIBXSTREAM_ASSERT(LIBXSTREAM_ERROR_NONE == libxstream_get_arity(m_signature, &n) && 6 == n);
#endif
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call(m_host_data->process(), m_signature, m_stream, LIBXSTREAM_CALL_DEFAULT));
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(m_cdata, m_host_data->cdata() + i0, nbytes, m_stream));

    if (0 == demux()) {
      // Counterpart of the lock acquired at the beginning of the batch.
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_unlock(m_stream));
    }
  }

  return LIBXSTREAM_ERROR_NONE;
}
/**
 * Processes the batch of `size` entries starting at `index` on m_stream.
 *
 * The slices of the host-side a-, b- and c-data as well as the corresponding
 * index entries are copied into the device buffers. A signature with six
 * arguments is then set up (batch size, extent of the last entry, index data,
 * a-data, b-data as inputs and c-data as output), the process function is
 * called, and the c-data is copied back to the host.
 *
 * @param index First entry of the batch (position within the host index data).
 * @param size  Number of entries in the batch; nothing is done for zero.
 * @return LIBXSTREAM_ERROR_NONE. The precondition (ready(), batch within the
 *         host data) is handled by LIBXSTREAM_CHECK_CONDITION (presumably an
 *         early error return - confirm against the macro definition).
 */
int multi_dgemm_type::operator()(size_t index, size_t size)
{
  LIBXSTREAM_CHECK_CONDITION(ready() && (index + size) <= m_host_data->size());

  if (0 == size) {
    return LIBXSTREAM_ERROR_NONE; // empty batch
  }

  // The index data delimits the range [begin, end) of elements belonging to this batch.
  const size_t begin = m_host_data->idata()[index];
  const size_t end = m_host_data->idata()[index+size];
  const size_t nbytes = sizeof(double) * (end - begin);

  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->adata() + begin, m_adata, nbytes, m_stream));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->bdata() + begin, m_bdata, nbytes, m_stream));
  // transferring cdata is part of the benchmark; since it is all zeros we could do better with libxstream_memset_zero
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->cdata() + begin, m_cdata, nbytes, m_stream));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->idata() + index, m_idata, sizeof(size_t) * size, m_stream));

  libxstream_argument* signature = 0;
  // Extent announced for each of the buffer arguments.
  const size_t capacity = m_max_batch * m_host_data->max_matrix_size();
  // Extent of the last entry of the batch (the index data stores offsets only).
  const size_t last_extent = end - m_host_data->idata()[index+size-1];

  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 0, &size, libxstream_map_to_type(size), 0, 0));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 1, &last_extent, libxstream_map_to_type(last_extent), 0, 0));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 2, m_idata, libxstream_map_to_type(m_idata), 1, &capacity));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 3, m_adata, libxstream_map_to_type(m_adata), 1, &capacity));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 4, m_bdata, libxstream_map_to_type(m_bdata), 1, &capacity));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 5, m_cdata, libxstream_map_to_type(m_cdata), 1, &capacity));

  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call(m_host_data->process(), signature, m_stream, LIBXSTREAM_CALL_DEFAULT));
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(m_cdata, m_host_data->cdata() + begin, nbytes, m_stream));

  return LIBXSTREAM_ERROR_NONE;
}
/**
 * Initializes a signature with `nargs` argument slots.
 *
 * Each of the first `nargs` entries is constructed as an inout argument of
 * invalid type without value, dimensions or shape; the entry at index `nargs`
 * is constructed with the kind kind_invalid.
 *
 * @param signature Array of arguments; may be NULL only if `nargs` is zero,
 *                  in which case nothing is done.
 * @param nargs     Number of argument slots; must not exceed LIBXSTREAM_MAX_NARGS.
 * @return LIBXSTREAM_ERROR_NONE if every entry was constructed; failures of the
 *         per-entry construction are handled by LIBXSTREAM_CHECK_CALL
 *         (presumably by returning the error - confirm against the macro).
 */
int libxstream_construct(libxstream_argument* signature, size_t nargs)
{
  LIBXSTREAM_CHECK_CONDITION((0 != signature || 0 == nargs) && (LIBXSTREAM_MAX_NARGS) >= nargs);

  if (0 == signature) {
    return LIBXSTREAM_ERROR_NONE; // nothing to initialize
  }

#if defined(__INTEL_COMPILER)
# pragma loop_count min(0), max(LIBXSTREAM_MAX_NARGS), avg(LIBXSTREAM_MAX_NARGS/2)
#endif
  for (size_t slot = 0; slot < nargs; ++slot) {
    LIBXSTREAM_CHECK_CALL(libxstream_construct(signature, slot,
      libxstream_argument::kind_inout, 0, LIBXSTREAM_TYPE_INVALID, 0, 0));
  }

  // The entry following the last argument slot is marked as invalid.
  LIBXSTREAM_CHECK_CALL(libxstream_construct(signature, nargs,
    libxstream_argument::kind_invalid, 0, LIBXSTREAM_TYPE_INVALID, 0, 0));

  return LIBXSTREAM_ERROR_NONE;
}