Compadre 1.5.5
Loading...
Searching...
No Matches
Compadre_ApplyTargetEvaluations.hpp
Go to the documentation of this file.
1#ifndef _COMPADRE_APPLY_TARGET_EVALUATIONS_HPP_
2#define _COMPADRE_APPLY_TARGET_EVALUATIONS_HPP_
3
4#include "Compadre_GMLS.hpp"
5namespace Compadre {
6
7/*! \brief For applying the evaluations from a target functional to the polynomial coefficients
8 \param data [out/in] - GMLSSolutionData struct (stores solution in data._d_ss._alphas)
9 \param teamMember [in] - Kokkos::TeamPolicy member type (created by parallel_for)
10 \param Q [in] - 2D Kokkos View containing the polynomial coefficients
11 \param P_target_row [in] - 1D Kokkos View where the evaluation of the polynomial basis is stored
12*/
13template <typename SolutionData>
14KOKKOS_INLINE_FUNCTION
15void applyTargetsToCoefficients(const SolutionData& data, const member_type& teamMember, scratch_matrix_right_type Q, scratch_matrix_right_type P_target_row) {
16
17 const int target_index = data._initial_index_for_batch + teamMember.league_rank();
18
19#if defined(COMPADRE_USE_CUDA)
20// // GPU
21// for (int j=0; j<_operations.size(); ++j) {
22// for (int k=0; k<_lro_output_tile_size[j]; ++k) {
23// for (int m=0; m<_lro_input_tile_size[j]; ++m) {
24// const int alpha_offset = (_lro_total_offsets[j] + m*_lro_output_tile_size[j] + k)*_neighbor_lists(target_index,0);
25// const int P_offset =_basis_multiplier*target_NP*(_lro_total_offsets[j] + m*_lro_output_tile_size[j] + k);
26// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,
27// _pc._nla.getNumberOfNeighborsDevice(target_index)), [=] (const int i) {
28//
29// double alpha_ij = 0;
30// if (_sampling_multiplier>1 && m<_sampling_multiplier) {
31// const int m_neighbor_offset = i+m*_pc._nla.getNumberOfNeighborsDevice(target_index);
32// Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember,
33// _basis_multiplier*target_NP), [=] (const int l, double &talpha_ij) {
34// //for (int l=0; l<_basis_multiplier*target_NP; ++l) {
35// talpha_ij += P_target_row(P_offset + l)*Q(ORDER_INDICES(m_neighbor_offset,l));
36// }, alpha_ij);
37// //}
38// } else if (_sampling_multiplier == 1) {
39// Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember,
40// _basis_multiplier*target_NP), [=] (const int l, double &talpha_ij) {
41// //for (int l=0; l<_basis_multiplier*target_NP; ++l) {
42// talpha_ij += P_target_row(P_offset + l)*Q(ORDER_INDICES(i,l));
43// }, alpha_ij);
44// //}
45// }
46// Kokkos::single(Kokkos::PerThread(teamMember), [&] () {
47// t1(i) = alpha_ij;
48// });
49// });
50// Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember,
51// _pc._nla.getNumberOfNeighborsDevice(target_index)), [=] (const int i) {
52// _alphas(ORDER_INDICES(target_index, alpha_offset + i)) = t1(i);
53// });
54// teamMember.team_barrier();
55// }
56// }
57// }
58
59 // GPU
60 const auto n_evaluation_sites_per_target = data.additional_number_of_neighbors_list(target_index) + 1;
61 const auto nn = data.number_of_neighbors_list(target_index);
62 auto alphas = data._d_ss._alphas;
63 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,
64 nn + data._d_ss._added_alpha_size), [&] (const int i) {
65 for (int e=0; e<n_evaluation_sites_per_target; ++e) {
66 for (int j=0; j<(int)data.operations_size; ++j) {
67 for (int k=0; k<data._d_ss._lro_output_tile_size[j]; ++k) {
68 for (int m=0; m<data._d_ss._lro_input_tile_size[j]; ++m) {
69 const int offset_index_jmke = data._d_ss.getTargetOffsetIndex(j,m,k,e);
70 const int alphas_index = data._d_ss.getAlphaIndex(target_index, offset_index_jmke);
71 double alpha_ij = 0;
72 if (data._sampling_multiplier>1 && m<data._sampling_multiplier) {
73 const int m_neighbor_offset = i+m*nn;
74 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, data.this_num_cols),
75 [&] (int& l, double& t_alpha_ij) {
76 t_alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, m_neighbor_offset);
77
78 compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
79 && "NaN in P_target_row matrix.");
80 compadre_kernel_assert_extreme_debug(Q(l, m_neighbor_offset)==Q(l, m_neighbor_offset)
81 && "NaN in Q coefficient matrix.");
82
83 }, alpha_ij);
84 } else if (data._sampling_multiplier == 1) {
85 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, data.this_num_cols),
86 [&] (int& l, double& t_alpha_ij) {
87 t_alpha_ij += P_target_row(offset_index_jmke, l)*Q(l,i);
88
89 compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
90 && "NaN in P_target_row matrix.");
91 compadre_kernel_assert_extreme_debug(Q(l,i)==Q(l,i)
92 && "NaN in Q coefficient matrix.");
93
94 }, alpha_ij);
95 }
96 // could use a PerThread here, but performance takes a hit
97 // and it isn't necessary
98 alphas(alphas_index+i) = alpha_ij;
99 compadre_kernel_assert_extreme_debug(alpha_ij==alpha_ij && "NaN in alphas.");
100
101 }
102 }
103 }
104 }
105 });
106#else
107
108 // CPU
109 const int alphas_per_tile_per_target = data.number_of_neighbors_list(target_index) + data._d_ss._added_alpha_size;
110 const global_index_type base_offset_index_jmke = data._d_ss.getTargetOffsetIndex(0,0,0,0);
111 const global_index_type base_alphas_index = data._d_ss.getAlphaIndex(target_index, base_offset_index_jmke);
112
113 scratch_matrix_right_type this_alphas(data._d_ss._alphas.data() + TO_GLOBAL(base_alphas_index), data._d_ss._total_alpha_values*data._d_ss._max_evaluation_sites_per_target, alphas_per_tile_per_target);
114
115 auto n_evaluation_sites_per_target = data.additional_number_of_neighbors_list(target_index) + 1;
116 const auto nn = data.number_of_neighbors_list(target_index);
117 for (int e=0; e<n_evaluation_sites_per_target; ++e) {
118 // evaluating alpha_ij
119 for (size_t j=0; j<data.operations_size; ++j) {
120 for (int k=0; k<data._d_ss._lro_output_tile_size[j]; ++k) {
121 for (int m=0; m<data._d_ss._lro_input_tile_size[j]; ++m) {
122 const int offset_index_jmke = data._d_ss.getTargetOffsetIndex(j,m,k,e);
123 for (int i=0; i<nn + data._d_ss._added_alpha_size; ++i) {
124 double alpha_ij = 0;
125 const int Q_col = i+m*nn;
126 // one thread per team on CPU, so no reason to add inner parallel reductions here
127 // or Kokkos::single. Either will cause significant slowdown.
128 for (int l=0; l<data.this_num_cols; ++l) {
129 if (data._sampling_multiplier>1 && m<data._sampling_multiplier) {
130
131 alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, Q_col);
132
133 compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
134 && "NaN in P_target_row matrix.");
135 compadre_kernel_assert_extreme_debug(Q(l, i+m*data.number_of_neighbors_list(target_index))==Q(l, i+m*data.number_of_neighbors_list(target_index))
136 && "NaN in Q coefficient matrix.");
137
138 } else if (data._sampling_multiplier == 1) {
139
140 alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, i);
141
142 compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
143 && "NaN in P_target_row matrix.");
145 && "NaN in Q coefficient matrix.");
146
147 } else {
148 alpha_ij += 0;
149 }
150 }
151 this_alphas(offset_index_jmke,i) = alpha_ij;
152 }
153 }
154 }
155 }
156 }
157#endif
158
159 teamMember.team_barrier();
160}
161
162} // Compadre
163#endif
std::size_t global_index_type
team_policy::member_type member_type
#define TO_GLOBAL(variable)
#define compadre_kernel_assert_extreme_debug(condition)
Kokkos::View< double **, layout_right, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_matrix_right_type
KOKKOS_INLINE_FUNCTION void applyTargetsToCoefficients(const SolutionData &data, const member_type &teamMember, scratch_matrix_right_type Q, scratch_matrix_right_type P_target_row)
For applying the evaluations from a target functional to the polynomial coefficients.