52#include <Teuchos_ScalarTraits.hpp>
53#include <Kokkos_ArithTraits.hpp>
63 namespace PerfDetails {
// Fragment of a vector-add timing routine templated on Scalar and Node.
// The signature line is not present in this listing; the names used below
// (KERNEL_REPEATS, VECTOR_SIZE) match
//   double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE).
// NOTE(review): kernel bodies, closing braces and the statements that assign
// `start`/`stop` are also absent from this listing -- confirm against the full file.
64 template<
class Scalar,
class Node>
// Kokkos' internal representation of Scalar, plus the Node's execution/memory spaces.
67 using impl_scalar_type =
typename Kokkos::Details::ArithTraits<Scalar>::val_type;
69 using exec_space =
typename Node::execution_space;
70 using memory_space =
typename Node::memory_space;
71 using range_policy = Kokkos::RangePolicy<exec_space>;
// Three work vectors of VECTOR_SIZE entries each, allocated in the Node's memory space.
73 Kokkos::View<impl_scalar_type*,memory_space> a(
"a", VECTOR_SIZE);
74 Kokkos::View<impl_scalar_type*,memory_space> b(
"b", VECTOR_SIZE);
75 Kokkos::View<impl_scalar_type*,memory_space> c(
"c", VECTOR_SIZE);
// Accumulates the measured time over all repeats.
76 double total_test_time = 0.0;
78 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
// Initialization kernel over [0, VECTOR_SIZE); only the write to `a` is visible here.
80 Kokkos::parallel_for(
"stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t i) {
// a(i) is set to its own index, expressed in impl_scalar_type.
81 a(i) = ONE * (double)i;
86 using clock = std::chrono::high_resolution_clock;
88 clock::time_point start, stop;
// Timed loop: one "stream/add" kernel launch per repeat.
90 for(
int i = 0; i < KERNEL_REPEATS; i++) {
92 Kokkos::parallel_for(
"stream/add",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t j) {
// Elapsed time of this repeat as a double count of seconds.
98 double my_test_time = std::chrono::duration<double>(stop - start).count();
99 total_test_time += my_test_time;
// Result: mean time per repeat.
102 return total_test_time / KERNEL_REPEATS;
// Fragment of a vector-copy timing routine templated on Scalar and Node.
// The signature line is not present in this listing; the names used below
// (KERNEL_REPEATS, VECTOR_SIZE) match
//   double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE).
// NOTE(review): kernel bodies, closing braces and the assignment to `stop`
// are absent from this listing -- confirm against the full file.
105 template<
class Scalar,
class Node>
// Kokkos' internal representation of Scalar, plus the Node's execution/memory spaces.
108 using impl_scalar_type =
typename Kokkos::Details::ArithTraits<Scalar>::val_type;
110 using exec_space =
typename Node::execution_space;
111 using memory_space =
typename Node::memory_space;
112 using range_policy = Kokkos::RangePolicy<exec_space>;
// Two work vectors of VECTOR_SIZE entries each, allocated in the Node's memory space.
114 Kokkos::View<impl_scalar_type*,memory_space> a(
"a", VECTOR_SIZE);
115 Kokkos::View<impl_scalar_type*,memory_space> b(
"b", VECTOR_SIZE);
// Accumulates the measured time over all repeats.
116 double total_test_time = 0.0;
118 impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();
// Initialization kernel over [0, VECTOR_SIZE); its body is not visible here.
120 Kokkos::parallel_for(
"stream/fill",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t i) {
// Fence the execution space before any timing starts.
123 exec_space().fence();
125 using clock = std::chrono::high_resolution_clock;
126 clock::time_point start, stop;
// Timed loop: one "stream/copy" kernel launch per repeat.
128 for(
int i = 0; i < KERNEL_REPEATS; i++) {
129 start = clock::now();
130 Kokkos::parallel_for(
"stream/copy",range_policy(0,VECTOR_SIZE), KOKKOS_LAMBDA (
const size_t j) {
// Fence after the launch and before the elapsed time is computed.
134 exec_space().fence();
// Elapsed time of this repeat as a double count of seconds.
136 double my_test_time = std::chrono::duration<double>(stop - start).count();
137 total_test_time += my_test_time;
// Result: mean time per repeat.
140 return total_test_time / KERNEL_REPEATS;
// Looks up `value` in a table given as parallel arrays x (abscissae) and y
// (ordinates), using the straight line through two adjacent table points.
// Returns NaN when x is empty.
// NOTE(review): the declaration of `hi`, the conditions that select each of
// the two return paths below, and all closing braces are absent from this
// listing -- confirm against the full file.
145 double table_lookup(
const std::vector<int> & x,
const std::vector<double> & y,
int value) {
// An empty table has nothing to interpolate.
147 if(x.size() == 0)
return Teuchos::ScalarTraits<double>::nan();
150 int N = (int) x.size();
// Advance `hi` through the table indices (its starting value is not visible here).
152 for( ; hi < N; hi++) {
// First visible return path: line through (x[hi-1], y[hi-1]) and (x[hi], y[hi]).
// NOTE(review): `run` is an int; equal adjacent x entries would make it zero
// and the slope non-finite -- confirm the table is strictly increasing.
166 int run = x[hi] - x[hi-1];
167 double rise = y[hi] - y[hi-1];
168 double slope = rise / run;
169 int diff = value - x[hi-1];
171 return y[hi-1] + slope * diff;
// Second visible return path: the same formula, evaluated with whatever `hi`
// holds at this point.
176 int run = x[hi] - x[hi-1];
177 double rise = y[hi] - y[hi-1];
178 double slope = rise / run;
179 int diff = value - x[hi-1];
181 return y[hi-1] + slope * diff;
// Fragment converting a measured time into a bandwidth figure.
// The signature line is not present in this listing; the names used below match
//   double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes).
// "GB" here is 1024^3 bytes.
186 const double GB = 1024.0 * 1024.0 * 1024.0;
// Average time of a single call.
188 double time_per_call = time / num_calls;
// (bytes per call, expressed in GB) divided by (time per call).
189 return memory_per_call_bytes /
GB / time_per_call;
// Fragment of a two-rank message exchange timing routine, templated on the
// execution and memory space used for the message buffers.
// The signature line is not present in this listing; the names used below
// (KERNEL_REPEATS, MAX_SIZE, comm, sizes, times) match the pingpong_basic
// entry in the signature list at the end of this file.
// NOTE(review): the definition of `odd`, the branch that picks between the two
// send/receive orders, the write to sizes[i] and all closing braces are absent
// from this listing -- confirm against the full file.
193 template <
class exec_space,
class memory_space>
195 int rank = comm.getRank();
196 int nproc = comm.getSize();
// Needs at least two ranks; returns before `sizes`/`times` are resized.
198 if(nproc < 2)
return;
201 using range_policy = Kokkos::RangePolicy<exec_space>;
// Largest message is 2^MAX_SIZE bytes (buffers are char).
// NOTE(review): floating-point pow() truncated to int is used for a power of two.
202 const int buff_size = (int) pow(2,MAX_SIZE);
// One table entry per exponent 0..MAX_SIZE.
204 sizes.resize(MAX_SIZE+1);
205 times.resize(MAX_SIZE+1);
// Receive and send buffers in the requested memory space; send buffer filled with 1.
208 Kokkos::View<char*,memory_space> r_buf(
"recv",buff_size), s_buf(
"send",buff_size);
209 Kokkos::deep_copy(s_buf,1);
// Partner rank: the neighbour below when `odd` is true, the neighbour above otherwise.
// NOTE(review): whether rank+1 can equal nproc for an odd rank count is not
// determinable from the visible lines.
214 int buddy = odd ? rank - 1 : rank + 1;
// Sweep message sizes 2^0 .. 2^MAX_SIZE.
216 for(
int i = 0; i < MAX_SIZE + 1 ;i ++) {
217 int msg_size = (int) pow(2,i);
220 double t0 = MPI_Wtime();
221 for(
int j = 0; j < KERNEL_REPEATS; j++) {
// Order 1: send to the partner, then receive from it.
224 comm.send(msg_size, (
char*)s_buf.data(), buddy);
225 comm.receive(buddy, msg_size, (
char*)r_buf.data());
// Order 2: receive from the partner, then send to it.
// Presumably each side of a pair takes a different order -- TODO confirm.
228 comm.receive(buddy, msg_size,(
char*)r_buf.data());
229 comm.send(msg_size, (
char*)s_buf.data(), buddy);
// Elapsed wall time divided by 2*KERNEL_REPEATS (one send and one receive per repeat).
234 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
236 times[i] = time_per_call;
246 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Fragment that fills the stream copy/add timing tables for vector lengths
// 2^0 .. 2^LOG_MAX_SIZE.
// The signature line is not present in this listing; the names used below match
//   void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20).
// NOTE(review): closing braces are absent from this listing.
250 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Measure the launch latency first so it can be subtracted from the raw times below.
255 launch_latency_make_table(KERNEL_REPEATS);
256 double latency = launch_latency_lookup();
// One table entry per exponent 0..LOG_MAX_SIZE.
261 stream_sizes_.resize(LOG_MAX_SIZE+1);
262 stream_copy_times_.resize(LOG_MAX_SIZE+1);
263 stream_add_times_.resize(LOG_MAX_SIZE+1);
264 latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE+1);
265 latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE+1);
267 for(
int i=0; i<LOG_MAX_SIZE+1; i++) {
// Vector length for this entry, in Scalars.
// NOTE(review): floating-point pow() truncated to int is used for a power of two.
268 int size = (int) pow(2,i);
269 double c_time = PerfDetails::stream_vector_copy<Scalar,Node>(KERNEL_REPEATS,size);
270 double a_time = PerfDetails::stream_vector_add<Scalar,Node>(KERNEL_REPEATS,size);
272 stream_sizes_[i] = size;
// Stored times are the measured time divided by 2 (copy) or 3 (add) --
// presumably the number of vectors each kernel touches; TODO confirm.
275 stream_copy_times_[i] = c_time / 2.0;
276 stream_add_times_[i] = a_time / 3.0;
// Latency-corrected variant: subtract the launch latency before dividing,
// unless that would leave a non-positive time, in which case the
// uncorrected value is stored instead.
280 latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ( (c_time-latency)/2.0 );
281 latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ( (a_time-latency)/3.0 );
287 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
293 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Fragment returning the smaller of the copy and add table lookups for
// SIZE_IN_BYTES. The signature line is not present in this listing; the
// parameter name matches `double stream_vector_lookup(int SIZE_IN_BYTES)`.
299 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
302 return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES),stream_vector_add_lookup(SIZE_IN_BYTES));
306 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
312 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Fragment returning the smaller of the latency-corrected copy and add table
// lookups for SIZE_IN_BYTES. The signature line is not present in this
// listing; the parameter name matches
// `double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)`.
318 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
321 return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES),latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
// Fragment that prints the stream table to `out` without latency correction:
// it forwards to print_stream_vector_table_impl with the flag set to false.
// The signature line is not present in this listing; presumably this is
// `void print_stream_vector_table(std::ostream &out)` -- TODO confirm.
325 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
328 print_stream_vector_table_impl(out,
false);
// Fragment that prints the stream table to `out` with latency correction:
// it forwards to print_stream_vector_table_impl with the flag set to true.
// The signature line is not present in this listing; presumably this is
// `void print_latency_corrected_stream_vector_table(std::ostream &out)` -- TODO confirm.
331 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
334 print_stream_vector_table_impl(out,
true);
// Fragment that writes the stream copy/add table to `out` as fixed-width
// columns: vector length, copy and add times in microseconds, and copy and
// add bandwidths in GB/s.
// The signature line is not present in this listing; the names used below match
//   void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction).
// NOTE(review): the definitions of `c_bw` and `a_bw` and the closing braces
// are absent from this listing -- confirm against the full file.
338 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Snapshot the caller's stream formatting so it can be restored at the end.
// NOTE(review): NULL is used where nullptr would be the modern spelling.
342 std::ios old_format(NULL);
343 old_format.copyfmt(out);
// Column titles.
345 out << setw(20) <<
"Length in Scalars" << setw(1) <<
" "
346 << setw(20) <<
"COPY (us)" << setw(1) <<
" "
347 << setw(20) <<
"ADD (us)" << setw(1) <<
" "
348 << setw(20) <<
"COPY (GB/s)" << setw(1) <<
" "
349 << setw(20) <<
"ADD (GB/s)" << std::endl;
// Underline row matching the titles.
351 out << setw(20) <<
"-----------------" << setw(1) <<
" "
352 << setw(20) <<
"---------" << setw(1) <<
" "
353 << setw(20) <<
"--------" << setw(1) <<
" "
354 << setw(20) <<
"-----------" << setw(1) <<
" "
355 << setw(20) <<
"----------" << std::endl;
// One output row per table entry.
358 for(
int i=0; i<(int)stream_sizes_.size(); i++) {
359 int size = stream_sizes_[i];
// Pick the latency-corrected or the raw times depending on the flag.
360 double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
361 double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
// Times are scaled by 1e6 for the "(us)" columns; four decimals throughout.
367 out << setw(20) << size << setw(1) <<
" "
368 << setw(20) << fixed << setprecision(4) << (c_time*1e6) << setw(1) <<
" "
369 << setw(20) << fixed << setprecision(4) << (a_time*1e6) << setw(1) <<
" "
370 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) <<
" "
371 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
// Restore the caller's formatting.
374 out.copyfmt(old_format);
// Fragment that builds the ping-pong timing tables by running
// PerfDetails::pingpong_basic twice: once with Kokkos::HostSpace buffers
// (results in pingpong_host_times_) and once with the Node's execution/memory
// space (results in pingpong_device_times_).
// The signature line is not present in this listing; the names used below
// match the pingpong_make_table entry in the signature list at the end of
// this file (`comm` is dereferenced here, consistent with an RCP).
// Both calls receive pingpong_sizes_ as their sizes output.
380 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
384 PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space,Kokkos::HostSpace::memory_space>(KERNEL_REPEATS,LOG_MAX_SIZE,*comm,pingpong_sizes_,pingpong_host_times_);
386 PerfDetails::pingpong_basic<typename Node::execution_space,typename Node::memory_space>(KERNEL_REPEATS,LOG_MAX_SIZE,*comm,pingpong_sizes_,pingpong_device_times_);
390 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
396 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Fragment that writes the ping-pong table to `out` as fixed-width columns:
// message size, host time and device time in microseconds.
// The signature line is not present in this listing; presumably this is
// `void print_pingpong_table(std::ostream &out)` -- TODO confirm.
// NOTE(review): closing braces are absent from this listing.
403 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Nothing to print when no ping-pong table has been built.
406 if(pingpong_sizes_.size() == 0)
return;
// Snapshot the caller's stream formatting so it can be restored at the end.
// NOTE(review): NULL is used where nullptr would be the modern spelling.
409 std::ios old_format(NULL);
410 old_format.copyfmt(out);
// Column titles.
412 out << setw(20) <<
"Message Size" << setw(1) <<
" "
413 << setw(20) <<
"Host (us)" << setw(1) <<
" "
414 << setw(20) <<
"Device (us)" << std::endl;
// Underline row matching the titles.
416 out << setw(20) <<
"------------" << setw(1) <<
" "
417 << setw(20) <<
"---------" << setw(1) <<
" "
418 << setw(20) <<
"-----------" << std::endl;
// One output row per table entry.
421 for(
int i=0; i<(int)pingpong_sizes_.size(); i++) {
422 int size = pingpong_sizes_[i];
423 double h_time = pingpong_host_times_[i];
424 double d_time = pingpong_device_times_[i];
// Times are scaled by 1e6 for the "(us)" columns; four decimals.
// NOTE(review): the final setw(1) is followed only by std::endl.
427 out << setw(20) << size << setw(1) <<
" "
428 << setw(20) << fixed << setprecision(4) << (h_time*1e6) << setw(1) <<
" "
429 << setw(20) << fixed << setprecision(4) << (d_time*1e6) << setw(1) << std::endl;
// Restore the caller's formatting.
432 out.copyfmt(old_format);
// Fragment that measures the average time to launch a one-iteration kernel
// labelled "empty kernel" and fence the execution space, storing the mean
// over KERNEL_REPEATS in launch_and_wait_latency_.
// The signature line is not present in this listing; the parameter name matches
//   void launch_latency_make_table(int KERNEL_REPEATS).
// NOTE(review): the kernel body, the assignment to `stop` and the closing
// braces are absent from this listing -- confirm against the full file.
435 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
438 using exec_space =
typename Node::execution_space;
439 using range_policy = Kokkos::RangePolicy<exec_space>;
440 using clock = std::chrono::high_resolution_clock;
// Accumulates the measured time over all repeats.
442 double total_test_time = 0;
443 clock::time_point start, stop;
444 for(
int i = 0; i < KERNEL_REPEATS; i++) {
445 start = clock::now();
// Single-index range [0,1): the launch itself is what is being timed.
446 Kokkos::parallel_for(
"empty kernel",range_policy(0,1), KOKKOS_LAMBDA (
const size_t j) {
// Fence after the launch and before the elapsed time is computed.
449 exec_space().fence();
// Elapsed time of this repeat as a double count of seconds.
451 double my_test_time = std::chrono::duration<double>(stop - start).count();
452 total_test_time += my_test_time;
// Mean launch-and-wait time per repeat.
455 launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
// Fragment returning the stored launch_and_wait_latency_ value.
// The signature line is not present in this listing; presumably this is
// `double launch_latency_lookup()` -- TODO confirm.
458 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
461 return launch_and_wait_latency_;
// Fragment that writes the stored launch-and-wait latency to `out` as one
// labelled line, in microseconds with four decimals.
// The signature line is not present in this listing; presumably this is
// `void print_launch_latency_table(std::ostream &out)` -- TODO confirm.
465 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
// Snapshot the caller's stream formatting so it can be restored at the end.
// NOTE(review): NULL is used where nullptr would be the modern spelling.
469 std::ios old_format(NULL);
470 old_format.copyfmt(out);
// The label is longer than the 20-character field, so it is written at its full length.
472 out << setw(20) <<
"Launch+Wait Latency (us)" << setw(1) <<
" "
473 << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_*1e6) << std::endl;
// Restore the caller's formatting.
475 out.copyfmt(old_format);
MueLu::DefaultScalar Scalar
void print_pingpong_table(std::ostream &out)
void print_latency_corrected_stream_vector_table(std::ostream &out)
void print_stream_vector_table(std::ostream &out)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction)
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
double launch_latency_lookup()
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
double pingpong_device_lookup(int SIZE_IN_BYTES)
double pingpong_host_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_add_lookup(int SIZE_IN_BYTES)
double stream_vector_lookup(int SIZE_IN_BYTES)
void launch_latency_make_table(int KERNEL_REPEATS)
void print_launch_latency_table(std::ostream &out)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > &times)
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
Namespace for MueLu classes and methods.