/**
 * Operation-count-per-time figure for this benchmark.
 *
 * The operation count depends on buffer_.side_:
 *   - clblasLeft : m * (m + 1) * n
 *   - otherwise  : m * (n + 1) * n
 * and is divided by time_in_ns().
 *
 * @return the selected count divided by time_in_ns(); presumably
 *         operations per nanosecond, i.e. GFLOP/s (confirm against
 *         time_in_ns()).
 */
double gflops()
{
    // Every side value other than clblasLeft takes the second formula.
    if (buffer_.side_ != clblasLeft)
    {
        return buffer_.m_ * (buffer_.n_ + 1) * buffer_.n_ / time_in_ns();
    }

    return buffer_.m_ * (buffer_.m_ + 1) * buffer_.n_ / time_in_ns();
}
double bandwidth() { #if 0 //Check VK //Host to GPU: CSR-> [rowOffsets(num_rows + 1) + Column Indices] * sizeof(int) + sizeof(T) * (num_nonzero) //GPU to Host: Dense - > [sizeof(T) * denseMtx.num_rows * denseMTx.num_cols] size_t sparseBytes = sizeof(cl_int) * (csrMtx.num_nonzeros + csrMtx.num_rows + 1) + sizeof(T) * (csrMtx.num_nonzeros) + sizeof(T) * (denseMtx.num_rows * denseMtx.num_cols); return (sparseBytes / time_in_ns()); #endif // Number of Elements converted in unit time return (csrMtx.num_nonzeros / time_in_ns()); }// end
/**
 * gflops() for the cl_double2 specialization of xTrsm.
 *
 * The operation count depends on buffer_.side_ and carries a factor of 4.0:
 *   - clblasLeft : 4.0 * m * (m + 1) * n
 *   - otherwise  : 4.0 * m * (n + 1) * n
 *
 * @return the selected count divided by time_in_ns().
 */
double xTrsm<cl_double2>::gflops()
{
    // Handle the non-left case first; the left-side formula is the fall-through.
    if (buffer_.side_ != clblasLeft)
    {
        return 4.0 * buffer_.m_ * (buffer_.n_ + 1) * buffer_.n_ / time_in_ns();
    }

    return 4.0 * buffer_.m_ * (buffer_.m_ + 1) * buffer_.n_ / time_in_ns();
}
double bandwidth() { #if 0 //Check VK //Host to GPU: CSR-> [rowOffsets(num_rows + 1) + Column Indices] * sizeof(int) + sizeof(T) * (num_nonzero) //GPU to Host: Coo - > row_indices + Col_indices + Values- > [sizeof(T) * num_nonzero] + sizeof(int) size_t sparseBytes = sizeof(cl_int) * (csrMtx.num_nonzeros + csrMtx.num_rows + 1) + sizeof(T) * (csrMtx.num_nonzeros) + sizeof(T) * (cooMtx.num_nonzeros) + sizeof(cl_int) * (cooMtx.num_nonzeros * 2); return (sparseBytes / time_in_ns()); #endif // Number of Elements converted in unit time return (csrMtx.num_nonzeros / time_in_ns()); }// end
double bandwidth( ) { #if 0 // Assuming that accesses to the vector always hit in the cache after the first access // There are NNZ integers in the cols[ ] array // You access each integer value in row_delimiters[ ] once. // There are NNZ float_types in the vals[ ] array // You read num_cols floats from the vector, afterwards they cache perfectly. // Finally, you write num_rows floats out to DRAM at the end of the kernel. return ( sizeof( cl_int )*( csrMtx.num_nonzeros + csrMtx.num_rows ) + sizeof( T ) * ( csrMtx.num_nonzeros + csrMtx.num_cols + csrMtx.num_rows ) ) / time_in_ns( ); #endif // Number of Elements converted in unit time return (csrMtx.num_cols * csrMtx.num_rows / time_in_ns()); }
/**
 * gflops() for the cl_double2 specialization of xGemm.
 *
 * Operation count: 8.0 * m * n * k.
 * Time base: time_in_ns() divided by buffer_.apiCallCount, i.e. the measured
 * time scaled down by the recorded number of API calls.
 *
 * @return the operation count divided by the per-call time.
 */
double xGemm<cl_double2>::gflops()
{
    return 8.0 * buffer_.m_ * buffer_.n_ * buffer_.k_
           / (time_in_ns() / buffer_.apiCallCount);
}
double bandwidth( ) { // Number of Elements converted in unit time return ( n_vals / time_in_ns( ) ); }
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * It is the sum of two rates, each divided by time_in_ns():
 *   - n * (n + 1) * n
 *   - n * (n + 1)
 *
 * @return the sum of both terms.
 */
double gflops()
{
    return buffer_.n_ * (buffer_.n_ + 1) * buffer_.n_ / time_in_ns()
         + buffer_.n_ * (buffer_.n_ + 1) / time_in_ns();
}
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * The count is m * m, evaluated in the type of buffer_.m_ and then converted
 * to double before dividing by time_in_ns().
 *
 * @return (double)(m * m) / time_in_ns().
 */
double gflops()
{
    return static_cast<double>(buffer_.m_ * buffer_.m_) / time_in_ns();
}
/**
 * gflops() for the cl_double2 specialization of xTrmv.
 *
 * The count is 4 * m * m, evaluated in integer arithmetic and then converted
 * to double before dividing by time_in_ns().
 *
 * @return (double)(4 * m * m) / time_in_ns().
 */
double xTrmv<cl_double2>::gflops()
{
    return static_cast<double>(4 * buffer_.m_ * buffer_.m_) / time_in_ns();
}
double bandwidth() // Need to modify this later ********** { // Assuming that accesses to the vector always hit in the cache after the first access // There are NNZ integers in the cols[ ] array // You access each integer value in row_delimiters[ ] once. // There are NNZ float_types in the vals[ ] array // You read num_cols floats from the vector, afterwards they cache perfectly. // Finally, you write num_rows floats out to DRAM at the end of the kernel. return (sizeof(clsparseIdx_t)*(csrMtx.num_nonzeros + csrMtx.num_rows) + sizeof(T) * (csrMtx.num_nonzeros + csrMtx.num_cols + csrMtx.num_rows)) / time_in_ns(); } // end of function
/**
 * gflops() for the cl_double2 specialization of xSyr2k.
 *
 * Operation count: 8 * k * n * n + 2 * n, evaluated in the integer type of
 * the buffer_ members and then divided by time_in_ns().
 *
 * @return the operation count divided by time_in_ns().
 */
double xSyr2k<cl_double2>::gflops()
{
    return (8 * buffer_.k_ * buffer_.n_ * buffer_.n_
            + 2 * buffer_.n_)
           / time_in_ns();
}
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * Operation count: 2 * k * n * n + n, evaluated in the integer type of the
 * buffer_ members and then divided by time_in_ns().
 *
 * @return the operation count divided by time_in_ns().
 */
double gflops()
{
    return (2 * buffer_.k_ * buffer_.n_ * buffer_.n_
            + buffer_.n_)
           / time_in_ns();
}
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * It is the sum of two rates, each divided by time_in_ns() on its own:
 *   - 8 * (K * N * N)
 *   - 2 * N
 * The sum is cast to double for the return value.
 *
 * @return the sum of both terms as a double.
 */
double gflops()
{
    return static_cast<double>(
        8 * (buffer_.K_ * buffer_.N_ * buffer_.N_) / time_in_ns()
        + 2 * buffer_.N_ / time_in_ns());
}
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * Operation count: 2.0 * m * n * k.
 * Time base: time_in_ns() divided by buffer_.apiCallCount, i.e. the measured
 * time scaled down by the recorded number of API calls.
 *
 * @return the operation count divided by the per-call time.
 */
double gflops()
{
    return 2.0 * buffer_.m_ * buffer_.n_ * buffer_.k_
           / (time_in_ns() / buffer_.apiCallCount);
}
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * Operation count: 2.0 * n * n, divided by time_in_ns().
 *
 * @return 2.0 * n * n / time_in_ns().
 */
double gflops()
{
    return 2.0 * buffer_.n_ * buffer_.n_ / time_in_ns();
}
/**
 * gflops() for the cl_double2 specialization of xSyrk.
 *
 * Operation count: 4 * n * (n + 1) * n, evaluated in the integer type of
 * buffer_.n_ and then divided by time_in_ns().
 *
 * @return the operation count divided by time_in_ns().
 */
double xSyrk<cl_double2>::gflops()
{
    return 4 * buffer_.n_ * (buffer_.n_ + 1) * buffer_.n_ / time_in_ns();
}
double bandwidth() { // Number of Elements processed in unit time return (n_rows * n_cols / time_in_ns()); }
/**
 * Operation-count-per-time figure for this benchmark.
 *
 * Operation count: N * (N + 1), evaluated in the type of buffer.N and then
 * divided by time_in_ns().
 *
 * @return N * (N + 1) / time_in_ns().
 */
double gflops()
{
    return buffer.N * (buffer.N + 1) / time_in_ns();
}