Stokhos_CrsProductTensor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP
43 #define STOKHOS_CRSPRODUCTTENSOR_HPP
44 
45 #include "Kokkos_Core.hpp"
46 
47 #include "Stokhos_Multiply.hpp"
48 #include "Stokhos_ProductBasis.hpp"
49 #include "Stokhos_Sparse3Tensor.hpp"
50 #include "Teuchos_ParameterList.hpp"
51 #include "Stokhos_BlockCrsMatrix.hpp"
52 #include "Stokhos_StochasticProductTensor.hpp"
53 #include "Stokhos_TinyVec.hpp"
54 
55 //----------------------------------------------------------------------------
56 //----------------------------------------------------------------------------
57 
58 namespace Stokhos {
59 
77 template< typename ValueType, class ExecutionSpace, class Memory = void >
78 class CrsProductTensor {
79 public:
80 
81  typedef ExecutionSpace execution_space;
82  typedef int size_type;
83  typedef ValueType value_type;
84  typedef Memory memory_type;
85 
86  typedef typename Kokkos::ViewTraits< size_type*, execution_space,void,void >::host_mirror_space host_mirror_space ;
87  typedef CrsProductTensor< value_type, host_mirror_space > HostMirror;
88 
89 // Vectorsize used in multiply algorithm
90 #if defined(__AVX__)
91  static const size_type host_vectorsize = 32/sizeof(value_type);
92  static const bool use_intrinsics = true;
93  static const size_type num_entry_align = 1;
94 #elif defined(__MIC__)
95  static const size_type host_vectorsize = 16;
96  static const bool use_intrinsics = true;
97  static const size_type num_entry_align = 8; // avoid use of mask instructions
98 #else
99  static const size_type host_vectorsize = 2;
100  static const bool use_intrinsics = false;
101  static const size_type num_entry_align = 1;
102 #endif
103  static const size_type cuda_vectorsize = 32;
104  static const bool is_cuda =
105 #if defined( KOKKOS_HAVE_CUDA )
106  Kokkos::Impl::is_same<ExecutionSpace,Kokkos::Cuda>::value;
107 #else
108  false ;
109 #endif
110  static const size_type vectorsize = is_cuda ? cuda_vectorsize : host_vectorsize;
111 
112  // Alignment in terms of number of entries of CRS rows
113  static const size_type tensor_align = vectorsize;
114 
115 private:
116 
117  template <class, class, class> friend class CrsProductTensor;
118 
119  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > vec_type;
120  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > coord_array_type;
121  typedef Kokkos::View< size_type*[2], Kokkos::LayoutLeft, execution_space, memory_type > coord2_array_type;
122  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space, memory_type > value_array_type;
123  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > entry_array_type;
124  typedef Kokkos::View< size_type*, Kokkos::LayoutLeft, execution_space, memory_type > row_map_array_type;
125 
126  coord_array_type m_coord;
127  coord2_array_type m_coord2;
128  value_array_type m_value;
129  entry_array_type m_num_entry;
130  row_map_array_type m_row_map;
131  size_type m_dim;
132  size_type m_entry_max;
133  size_type m_avg_entries_per_row;
134  size_type m_nnz;
135  size_type m_flops;
136 
137  struct CijkRowCount {
138  unsigned count;
139  unsigned basis;
140 
141  CijkRowCount()
142  : count(0)
143  , basis(0)
144  {}
145  };
146 
147  struct CompareCijkRowCount {
148  bool operator() (const CijkRowCount& a, const CijkRowCount& b) const {
149  return a.count < b.count;
150  }
151  };
152 
153 public:
154 
155  KOKKOS_INLINE_FUNCTION
156  ~CrsProductTensor() {}
157 
158  KOKKOS_INLINE_FUNCTION
159  CrsProductTensor() :
160  m_coord(),
161  m_coord2(),
162  m_value(),
163  m_num_entry(),
164  m_row_map(),
165  m_dim(0),
166  m_entry_max(0),
167  m_nnz(0),
168  m_flops(0),
169  m_avg_entries_per_row(0) {}
170 
171  template <class M>
172  KOKKOS_INLINE_FUNCTION
173  CrsProductTensor( const CrsProductTensor<value_type,execution_space,M>& rhs ) :
174  m_coord( rhs.m_coord ),
175  m_coord2( rhs.m_coord2 ),
176  m_value( rhs.m_value ),
177  m_num_entry( rhs.m_num_entry ),
178  m_row_map( rhs.m_row_map ),
179  m_dim( rhs.m_dim ),
180  m_entry_max( rhs.m_entry_max ),
181  m_nnz( rhs.m_nnz ),
182  m_flops( rhs.m_flops ),
183  m_avg_entries_per_row( rhs.m_avg_entries_per_row ) {}
184 
185  template <class M>
186  KOKKOS_INLINE_FUNCTION
187  CrsProductTensor&
188  operator=( const CrsProductTensor<value_type,execution_space,M>& rhs )
189  {
190  m_coord = rhs.m_coord;
191  m_coord2 = rhs.m_coord2;
192  m_value = rhs.m_value;
193  m_num_entry = rhs.m_num_entry;
194  m_row_map = rhs.m_row_map;
195  m_dim = rhs.m_dim;
196  m_entry_max = rhs.m_entry_max;
197  m_nnz = rhs.m_nnz;
198  m_flops = rhs.m_flops;
199  m_avg_entries_per_row = rhs.m_avg_entries_per_row;
200  return *this;
201  }
202 
204  KOKKOS_INLINE_FUNCTION
205  size_type dimension() const { return m_dim; }
206 
208  KOKKOS_INLINE_FUNCTION
209  bool is_empty() const { return m_dim == 0; }
210 
212  KOKKOS_INLINE_FUNCTION
213  size_type entry_count() const
214  { return m_coord.dimension_0(); }
215 
217  KOKKOS_INLINE_FUNCTION
218  size_type entry_maximum() const
219  { return m_entry_max; }
220 
222  KOKKOS_INLINE_FUNCTION
223  size_type entry_begin( size_type i ) const
224  { return m_row_map[i]; }
225 
227  KOKKOS_INLINE_FUNCTION
228  size_type entry_end( size_type i ) const
229  { return m_row_map[i] + m_num_entry(i); }
230 
232  KOKKOS_INLINE_FUNCTION
233  size_type num_entry( size_type i ) const
234  { return m_num_entry(i); }
235 
237  KOKKOS_INLINE_FUNCTION
238  const size_type& coord( const size_type entry, const size_type c ) const
239  { return m_coord2( entry, c ); }
240 
242  KOKKOS_INLINE_FUNCTION
243  const size_type& coord( const size_type entry ) const
244  { return m_coord( entry ); }
245 
247  KOKKOS_INLINE_FUNCTION
248  const value_type & value( const size_type entry ) const
249  { return m_value( entry ); }
250 
252  KOKKOS_INLINE_FUNCTION
253  size_type num_non_zeros() const
254  { return m_nnz; }
255 
257  KOKKOS_INLINE_FUNCTION
258  size_type num_flops() const
259  { return m_flops; }
260 
262  KOKKOS_INLINE_FUNCTION
263  size_type avg_entries_per_row() const
264  { return m_avg_entries_per_row; }
265 
266  template <typename OrdinalType>
267  static CrsProductTensor
268  create( const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
269  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
270  const Teuchos::ParameterList& params = Teuchos::ParameterList())
271  {
272  typedef Stokhos::Sparse3Tensor<OrdinalType,ValueType> Cijk_type;
273 
274  // Note (etp 01/08/15): Commenting out the sorting as it causes a really
275  // weird compiler error when compiling with NVCC. It seems to think the
276  // < in CompareCijkRowCount() is part of a template parameter. We don't
277  // seem to use this option, so I am just commenting it out.
278 
279  // bool sort_nnz = false;
280  // if (params.isParameter("Sort Nonzeros"))
281  // sort_nnz = params.get<bool>("Sort Nonzeros");
282 
283  // Compute number of non-zeros for each i
284  const size_type dimension = basis.size();
285  std::vector< size_t > coord_work( dimension, (size_t) 0 );
286  size_type entry_count = 0;
287  for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
288  i_it!=Cijk.i_end(); ++i_it) {
289  OrdinalType i = index(i_it);
290  for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
291  k_it != Cijk.k_end(i_it); ++k_it) {
292  OrdinalType k = index(k_it);
293  for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
294  j_it != Cijk.j_end(k_it); ++j_it) {
295  OrdinalType j = index(j_it);
296  if (j >= k) {
297  ++coord_work[i];
298  ++entry_count;
299  }
300  }
301  }
302  }
303 
304  // Compute average nonzeros per row (must be before padding)
305  const size_type avg_entries_per_row = entry_count / dimension;
306 
307  // Pad each row to have size divisible by alignment size
308  for ( size_type i = 0; i < dimension; ++i ) {
309  const size_t rem = coord_work[i] % tensor_align;
310  if (rem > 0) {
311  const size_t pad = tensor_align - rem;
312  coord_work[i] += pad;
313  entry_count += pad;
314  }
315  }
316 
317  // Sort based on number of non-zeros
318  std::vector< CijkRowCount > row_count( dimension );
319  for ( size_type i = 0; i < dimension; ++i ) {
320  row_count[i].count = coord_work[i];
321  row_count[i].basis = i;
322  }
323 
324  // Note (etp 01/08/15): See above.
325 
326  // if (sort_nnz)
327  // std::sort( row_count.begin(), row_count.end(), CompareCijkRowCount() );
328  std::vector<size_type> sorted_row_map( dimension );
329  for ( size_type i = 0; i < dimension; ++i ) {
330  coord_work[i] = row_count[i].count;
331  sorted_row_map[ row_count[i].basis ] = i;
332  }
333 
334  // Allocate tensor data
335  // coord and coord2 are initialized to zero because otherwise we get
336  // seg faults in the MIC algorithm when processing the tail of each
337  // tensor row. Not quite sure why as the coord loads are padded to
338  // length 16 and are masked for the remainder (unless it does load x[j]
339  // anyway and masks off the result, so j needs to be valid).
340  CrsProductTensor tensor;
341  tensor.m_coord = coord_array_type("tensor_coord", entry_count );
342  tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
343  tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
344  tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
345  tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
346  tensor.m_dim = dimension;
347  tensor.m_entry_max = 0;
348  tensor.m_avg_entries_per_row = avg_entries_per_row;
349 
350  // Create host mirrors; each is just a view of the original when the data already lives in host memory
351  typename coord_array_type::HostMirror
352  host_coord = Kokkos::create_mirror_view( tensor.m_coord );
353  typename coord2_array_type::HostMirror
354  host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
355  typename value_array_type::HostMirror
356  host_value = Kokkos::create_mirror_view( tensor.m_value );
357  typename entry_array_type::HostMirror
358  host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
359  typename entry_array_type::HostMirror
360  host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
361 
362  // Compute row map
363  size_type sum = 0;
364  host_row_map(0) = 0;
365  for ( size_type i = 0; i < dimension; ++i ) {
366  sum += coord_work[i];
367  host_row_map(i+1) = sum;
368  host_num_entry(i) = 0;
369  }
370 
371  for ( size_type iCoord = 0; iCoord < dimension; ++iCoord ) {
372  coord_work[iCoord] = host_row_map[iCoord];
373  }
374 
375  // Initialize values and coordinates to zero since we will have extra
376  // ones for alignment
377  Kokkos::deep_copy( host_value, 0.0 );
378  Kokkos::deep_copy( host_coord, 0 );
379  Kokkos::deep_copy( host_coord2, 0 );
380 
381  for (typename Cijk_type::i_iterator i_it=Cijk.i_begin();
382  i_it!=Cijk.i_end(); ++i_it) {
383  OrdinalType i = index(i_it);
384  const size_type row = sorted_row_map[i];
385  for (typename Cijk_type::ik_iterator k_it = Cijk.k_begin(i_it);
386  k_it != Cijk.k_end(i_it); ++k_it) {
387  OrdinalType k = index(k_it);
388  for (typename Cijk_type::ikj_iterator j_it = Cijk.j_begin(k_it);
389  j_it != Cijk.j_end(k_it); ++j_it) {
390  OrdinalType j = index(j_it);
391  ValueType c = Stokhos::value(j_it);
392  if (j >= k) {
393  const size_type n = coord_work[row]; ++coord_work[row];
394  host_value(n) = (j != k) ? c : 0.5*c;
395  host_coord2(n,0) = j;
396  host_coord2(n,1) = k;
397  host_coord(n) = ( k << 16 ) | j;
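 // The packed coordinate above stores k in the upper 16 bits and j in the
 // lower 16 bits, so a single 32-bit load recovers both indices; this
 // limits the basis dimension to 2^16 entries.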
398  ++host_num_entry(row);
399  ++tensor.m_nnz;
400  }
401  }
402  }
403  // Align num_entry
404  host_num_entry(row) =
405  (host_num_entry(row) + num_entry_align-1) & ~(num_entry_align-1);
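 // The bit trick above rounds the entry count up to the next multiple of
 // num_entry_align (a power of two), e.g. 13 -> 16 when the alignment is 8;
 // the padded tail entries were zero-initialized above so they contribute
 // nothing to the product.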
406  }
407 
408  // Copy data to device if necessary
409  Kokkos::deep_copy( tensor.m_coord, host_coord );
410  Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
411  Kokkos::deep_copy( tensor.m_value, host_value );
412  Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
413  Kokkos::deep_copy( tensor.m_row_map, host_row_map );
414 
415  for ( size_type i = 0; i < dimension; ++i ) {
416  tensor.m_entry_max = std::max( tensor.m_entry_max, host_num_entry(i) );
417  }
418 
419  tensor.m_flops = 5*tensor.m_nnz + dimension;
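 // Flop estimate: each stored entry costs 5 flops in the multiply kernel
 // (two multiplies and an add for a[j]*x[k] + a[k]*x[j], one multiply by
 // the tensor value, one accumulate), plus roughly one update per row of y.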
420 
421  return tensor;
422  }
423 
424  static CrsProductTensor createMeanBased()
425  {
426  const size_type dimension = 1;
427  const size_type entry_count = 1;
428 
429  // Allocate tensor data
430  // coord and coord2 are initialized to zero because otherwise we get
431  // seg faults in the MIC algorithm when processing the tail of each
432  // tensor row. Not quite sure why as the coord loads are padded to
433  // length 16 and are masked for the remainder (unless it does load x[j]
434  // anyway and masks off the result, so j needs to be valid).
435  CrsProductTensor tensor;
436  tensor.m_coord = coord_array_type("tensor_coord", entry_count );
437  tensor.m_coord2 = coord2_array_type( "tensor_coord2", entry_count );
438  tensor.m_value = value_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_value"), entry_count );
439  tensor.m_num_entry = entry_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_num_entry"), dimension );
440  tensor.m_row_map = row_map_array_type( Kokkos::ViewAllocateWithoutInitializing("tensor_row_map"), dimension+1 );
441  tensor.m_dim = dimension;
442  tensor.m_entry_max = 1;
443  tensor.m_avg_entries_per_row = 1;
444  tensor.m_nnz = 1;
445  tensor.m_flops = 5*tensor.m_nnz + dimension;
446 
447  // Create host mirrors; each is just a view of the original when the data already lives in host memory
448  typename coord_array_type::HostMirror
449  host_coord = Kokkos::create_mirror_view( tensor.m_coord );
450  typename coord2_array_type::HostMirror
451  host_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
452  typename value_array_type::HostMirror
453  host_value = Kokkos::create_mirror_view( tensor.m_value );
454  typename entry_array_type::HostMirror
455  host_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
456  typename entry_array_type::HostMirror
457  host_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
458 
459  // Compute row map
460  host_row_map(0) = 0;
461  host_row_map(1) = 1;
462  host_num_entry(0) = 1;
463 
464  // Compute tensor values
465  host_value(0) = 0.5;
466  host_coord2(0,0) = 0;
467  host_coord2(0,1) = 0;
468  host_coord(0) = 0;
469 
470  // Copy data to device if necessary
471  Kokkos::deep_copy( tensor.m_coord, host_coord );
472  Kokkos::deep_copy( tensor.m_coord2, host_coord2 );
473  Kokkos::deep_copy( tensor.m_value, host_value );
474  Kokkos::deep_copy( tensor.m_num_entry, host_num_entry );
475  Kokkos::deep_copy( tensor.m_row_map, host_row_map );
476 
477  return tensor;
478  }
479 
480  static HostMirror
481  create_mirror_view( const CrsProductTensor & tensor ) {
482  HostMirror host_tensor;
483 
484  host_tensor.m_coord = Kokkos::create_mirror_view( tensor.m_coord );
485  host_tensor.m_coord2 = Kokkos::create_mirror_view( tensor.m_coord2 );
486  host_tensor.m_value = Kokkos::create_mirror_view( tensor.m_value );
487  host_tensor.m_num_entry = Kokkos::create_mirror_view( tensor.m_num_entry );
488  host_tensor.m_row_map = Kokkos::create_mirror_view( tensor.m_row_map );
489 
490  host_tensor.m_dim = tensor.m_dim;
491  host_tensor.m_entry_max = tensor.m_entry_max;
492  host_tensor.m_avg_entries_per_row = tensor.m_avg_entries_per_row;
493  host_tensor.m_nnz = tensor.m_nnz;
494  host_tensor.m_flops = tensor.m_flops;
495 
496  return host_tensor;
497  }
498 
499  template < class DstDevice, class DstMemory >
500  static void
501  deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst,
502  const CrsProductTensor& src ) {
503  Kokkos::deep_copy( dst.m_coord, src.m_coord );
504  Kokkos::deep_copy( dst.m_coord2, src.m_coord2 );
505  Kokkos::deep_copy( dst.m_value, src.m_value );
506  Kokkos::deep_copy( dst.m_num_entry, src.m_num_entry );
507  Kokkos::deep_copy( dst.m_row_map, src.m_row_map );
508  }
509 
510 };
511 
512 template< class Device, typename OrdinalType, typename ValueType>
513 CrsProductTensor<ValueType, Device>
514 create_product_tensor(
515  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
516  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
517  const Teuchos::ParameterList& params = Teuchos::ParameterList())
518 {
519  return CrsProductTensor<ValueType, Device>::create(basis, Cijk, params );
520 }
521 
522 template< class Device, typename OrdinalType, typename ValueType,
523  class Memory >
524 CrsProductTensor<ValueType, Device, Memory>
525 create_product_tensor(
526  const Stokhos::ProductBasis<OrdinalType,ValueType>& basis,
527  const Stokhos::Sparse3Tensor<OrdinalType,ValueType>& Cijk,
528  const Teuchos::ParameterList& params = Teuchos::ParameterList())
529 {
530  return CrsProductTensor<ValueType, Device, Memory>::create(
531  basis, Cijk, params );
532 }
533 
534 template< class Device, typename OrdinalType, typename ValueType>
535 CrsProductTensor<ValueType, Device>
536 create_mean_based_product_tensor()
537 {
538  return CrsProductTensor<ValueType, Device>::createMeanBased();
539 }
540 
541 template< class Device, typename OrdinalType, typename ValueType,
542  class Memory >
543 CrsProductTensor<ValueType, Device, Memory>
544 create_mean_based_product_tensor()
545 {
546  return CrsProductTensor<ValueType, Device, Memory>::createMeanBased();
547 }
548 
549 template < class ValueType, class Device, class Memory >
550 inline
551 typename CrsProductTensor<ValueType,Device,Memory>::HostMirror
552 create_mirror_view( const CrsProductTensor<ValueType,Device,Memory>& src )
553 {
554  return CrsProductTensor<ValueType,Device,Memory>::create_mirror_view( src );
555 }
556 
557  template < class ValueType,
558  class DstDevice, class DstMemory,
559  class SrcDevice, class SrcMemory >
560 void
561 deep_copy( const CrsProductTensor<ValueType,DstDevice,DstMemory>& dst,
562  const CrsProductTensor<ValueType,SrcDevice,SrcMemory>& src )
563 {
564  CrsProductTensor<ValueType,SrcDevice,SrcMemory>::deep_copy( dst, src );
565 }
566 
567 template < typename ValueType, typename Device >
568 class BlockMultiply< CrsProductTensor< ValueType , Device > >
569 {
570 public:
571 
572  typedef Device execution_space;
573  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
574  typedef typename tensor_type::size_type size_type ;
575 
576 // Whether to use manual or auto-vectorization
577 #ifdef __MIC__
578 #define USE_AUTO_VECTORIZATION 1
579 #else
580 #define USE_AUTO_VECTORIZATION 0
581 #endif
582 
583 #if defined(__INTEL_COMPILER) && USE_AUTO_VECTORIZATION
584 
585  // Version leveraging intel vectorization
586  template< typename MatrixValue , typename VectorValue >
587  KOKKOS_INLINE_FUNCTION
588  static void apply( const tensor_type & tensor ,
589  const MatrixValue * const a ,
590  const VectorValue * const x ,
591  VectorValue * const y ,
592  const VectorValue & alpha = VectorValue(1) )
593  {
594  // The intel compiler doesn't seem to be able to vectorize through
595  // the coord() calls, so extract pointers
596  const size_type * cj = &tensor.coord(0,0);
597  const size_type * ck = &tensor.coord(0,1);
598  const size_type nDim = tensor.dimension();
599 
600  for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
601  const size_type nEntry = tensor.num_entry(iy);
602  const size_type iEntryBeg = tensor.entry_begin(iy);
603  const size_type iEntryEnd = iEntryBeg + nEntry;
604  VectorValue ytmp = 0;
605 
606 #pragma simd vectorlength(tensor_type::vectorsize)
607 #pragma ivdep
608 #pragma vector aligned
609  for (size_type iEntry = iEntryBeg; iEntry<iEntryEnd; ++iEntry) {
610  const size_type j = cj[iEntry]; //tensor.coord(iEntry,0);
611  const size_type k = ck[iEntry]; //tensor.coord(iEntry,1);
612  ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
613  }
614 
615  y[iy] += alpha * ytmp ;
616  }
617  }
618 
619 #elif defined(__MIC__)
620 
621  // Version specific to MIC architecture using manual vectorization
622  template< typename MatrixValue , typename VectorValue >
623  KOKKOS_INLINE_FUNCTION
624  static void apply( const tensor_type & tensor ,
625  const MatrixValue * const a ,
626  const VectorValue * const x ,
627  VectorValue * const y ,
628  const VectorValue & alpha = VectorValue(1) )
629  {
630  const size_type nDim = tensor.dimension();
631  for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
632 
633  const size_type nEntry = tensor.num_entry(iy);
634  const size_type iEntryBeg = tensor.entry_begin(iy);
635  const size_type iEntryEnd = iEntryBeg + nEntry;
636  size_type iEntry = iEntryBeg;
637 
638  VectorValue ytmp = 0 ;
639 
640  const size_type nBlock = nEntry / tensor_type::vectorsize;
641  const size_type nEntryB = nBlock * tensor_type::vectorsize;
642  const size_type iEnd = iEntryBeg + nEntryB;
643 
644  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
645  TV vy;
646  vy.zero();
647  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
648  const size_type *j = &tensor.coord(iEntry,0);
649  const size_type *k = &tensor.coord(iEntry,1);
650  TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
651  c(&(tensor.value(iEntry)));
652 
653  // vy += c * ( aj * xk + ak * xj)
654  aj.times_equal(xk);
655  aj.multiply_add(ak, xj);
656  vy.multiply_add(c, aj);
657 
658  }
659  ytmp += vy.sum();
660 
661  // The number of nonzeros is always constrained to be a multiple of 8
662 
663  const size_type rem = iEntryEnd-iEntry;
664  if (rem >= 8) {
665  typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> TV2;
666  const size_type *j = &tensor.coord(iEntry,0);
667  const size_type *k = &tensor.coord(iEntry,1);
668  TV2 aj(a, j), ak(a, k), xj(x, j), xk(x, k),
669  c(&(tensor.value(iEntry)));
670 
671  // vy += c * ( aj * xk + ak * xj)
672  aj.times_equal(xk);
673  aj.multiply_add(ak, xj);
674  aj.times_equal(c);
675  ytmp += aj.sum();
676  }
677 
678  y[iy] += alpha * ytmp ;
679  }
680  }
681 
682 #else
683 
684  // General version
685  template< typename MatrixValue , typename VectorValue >
686  KOKKOS_INLINE_FUNCTION
687  static void apply( const tensor_type & tensor ,
688  const MatrixValue * const a ,
689  const VectorValue * const x ,
690  VectorValue * const y ,
691  const VectorValue & alpha = VectorValue(1) )
692  {
693  const size_type nDim = tensor.dimension();
694  for ( size_type iy = 0 ; iy < nDim ; ++iy ) {
695 
696  const size_type nEntry = tensor.num_entry(iy);
697  const size_type iEntryBeg = tensor.entry_begin(iy);
698  const size_type iEntryEnd = iEntryBeg + nEntry;
699  size_type iEntry = iEntryBeg;
700 
701  VectorValue ytmp = 0 ;
702 
703  // Do entries with a blocked loop of size vectorsize
704  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
705  const size_type nBlock = nEntry / tensor_type::vectorsize;
706  const size_type nEntryB = nBlock * tensor_type::vectorsize;
707  const size_type iEnd = iEntryBeg + nEntryB;
708 
709  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> TV;
710  TV vy;
711  vy.zero();
712  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
713  const size_type *j = &tensor.coord(iEntry,0);
714  const size_type *k = &tensor.coord(iEntry,1);
715  TV aj(a, j), ak(a, k), xj(x, j), xk(x, k), c(&(tensor.value(iEntry)));
716 
717  // vy += c * ( aj * xk + ak * xj)
718  aj.times_equal(xk);
719  aj.multiply_add(ak, xj);
720  vy.multiply_add(c, aj);
721  }
722  ytmp += vy.sum();
723  }
724 
725  // Do remaining entries with a scalar loop
726  for ( ; iEntry<iEntryEnd; ++iEntry) {
727  const size_type j = tensor.coord(iEntry,0);
728  const size_type k = tensor.coord(iEntry,1);
729 
730  ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
731  }
732 
733  y[iy] += alpha * ytmp ;
734  }
735  }
736 #endif
737 
738  KOKKOS_INLINE_FUNCTION
739  static size_type matrix_size( const tensor_type & tensor )
740  { return tensor.dimension(); }
741 
742  KOKKOS_INLINE_FUNCTION
743  static size_type vector_size( const tensor_type & tensor )
744  { return tensor.dimension(); }
745 };
746 
747 // Specialization of Multiply< BlockCrsMatrix< BlockSpec, ... > > for
748 // CrsProductTensor, which provides a version that processes blocks of FEM
749 // columns together to reduce the number of global reads of the sparse 3 tensor
750 
751 // Even though this isn't specific to Threads, templating on Device creates a
752 // duplicate specialization error for Cuda. Need to see if we can fix this,
753 // or put the implementation in another class easily specialized for Threads,
754 // OpenMP, ...
755 template< typename ValueType , typename MatrixValue , typename VectorValue ,
756  typename Device >
757 class MultiplyImpl {
758 public:
759 
760  typedef Device execution_space ;
761  typedef CrsProductTensor< ValueType , execution_space > tensor_type ;
762  typedef StochasticProductTensor< ValueType , tensor_type , execution_space > BlockSpec ;
763  typedef typename BlockSpec::size_type size_type ;
764  typedef Kokkos::View< VectorValue** , Kokkos::LayoutLeft , execution_space > block_vector_type ;
765  typedef BlockCrsMatrix< BlockSpec , MatrixValue , execution_space > matrix_type ;
766 
767  const matrix_type m_A ;
768  const block_vector_type m_x ;
769  const block_vector_type m_y ;
770 
771  MultiplyImpl( const matrix_type & A ,
772  const block_vector_type & x ,
773  const block_vector_type & y )
774  : m_A( A )
775  , m_x( x )
776  , m_y( y )
777  {}
778 
779  //--------------------------------------------------------------------------
780  // A( storage_size( m_A.block.size() ) , m_A.graph.row_map.size() );
781  // x( m_A.block.dimension() , m_A.graph.row_map.first_count() );
782  // y( m_A.block.dimension() , m_A.graph.row_map.first_count() );
783  //
784 
785  KOKKOS_INLINE_FUNCTION
786  void operator()( const size_type iBlockRow ) const
787  {
788  // Prefer that y[ m_A.block.dimension() ] be scratch space
789  // on the local thread, but cannot dynamically allocate
790  VectorValue * const y = & m_y(0,iBlockRow);
791 
792  const size_type iEntryBegin = m_A.graph.row_map[ iBlockRow ];
793  const size_type iEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
794 
795  // Leading dimension guaranteed contiguous for LayoutLeft
796  for ( size_type j = 0 ; j < m_A.block.dimension() ; ++j ) { y[j] = 0 ; }
797 
798  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
799  const VectorValue * const x = & m_x( 0 , m_A.graph.entries(iEntry) );
800  const MatrixValue * const a = & m_A.values( 0 , iEntry );
801  const MatrixValue * const a = & m_A.values( 0 , iEntry );
802  BlockMultiply< BlockSpec >::apply( m_A.block , a , x , y );
803  }
804 
805  }
806 
807  /*
808  * Compute work range = (begin, end) such that adjacent threads write to
809  * separate cache lines
810  */
811  KOKKOS_INLINE_FUNCTION
812  std::pair< size_type , size_type >
813  compute_work_range( const size_type work_count ,
814  const size_type thread_count ,
815  const size_type thread_rank ) const
816  {
817  enum { work_align = 64 / sizeof(VectorValue) };
818  enum { work_shift = Kokkos::Impl::power_of_two< work_align >::value };
819  enum { work_mask = work_align - 1 };
820 
821  const size_type work_per_thread =
822  ( ( ( ( work_count + work_mask ) >> work_shift ) + thread_count - 1 ) /
823  thread_count ) << work_shift ;
824 
825  const size_type work_begin =
826  std::min( thread_rank * work_per_thread , work_count );
827  const size_type work_end =
828  std::min( work_begin + work_per_thread , work_count );
829 
830  return std::make_pair( work_begin , work_end );
831  }
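 // Illustration: with VectorValue = double, work_align = 64/8 = 8, so a
 // work_count of 100 split over 4 threads gives work_per_thread =
 // ceil(ceil(100/8)/4)*8 = 32; thread 0 owns [0,32) and thread 3 owns
 // [96,100), and the boundaries fall on 64-byte cache lines as intended.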
832 
833 #if defined(__MIC__)
834 
835  // A MIC-specific version of the block-multiply algorithm, where block here
836  // means processing multiple FEM columns at a time
837  KOKKOS_INLINE_FUNCTION
838  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
839  {
840  const size_type iBlockRow = device.league_rank();
841 
842  // Check for valid row
843  const size_type row_count = m_A.graph.row_map.dimension_0()-1;
844  if (iBlockRow >= row_count)
845  return;
846 
847  const size_type num_thread = device.team_size();
848  const size_type thread_idx = device.team_rank();
849  std::pair<size_type,size_type> work_range =
850  compute_work_range(m_A.block.dimension(), num_thread, thread_idx);
851 
852  // Prefer that y[ m_A.block.dimension() ] be scratch space
853  // on the local thread, but cannot dynamically allocate
854  VectorValue * const y = & m_y(0,iBlockRow);
855 
856  // Leading dimension guaranteed contiguous for LayoutLeft
857  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
858  y[j] = 0 ;
859 
860  const tensor_type& tensor = m_A.block.tensor();
861 
862  const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
863  const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
864  const size_type BlockSize = 9;
865  const size_type numBlock =
866  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
867 
868  const MatrixValue* sh_A[BlockSize];
869  const VectorValue* sh_x[BlockSize];
870 
871  size_type iBlockEntry = iBlockEntryBeg;
872  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
873  const size_type block_size =
874  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
875 
876  for ( size_type col = 0; col < block_size; ++col ) {
877  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
878  sh_x[col] = & m_x( 0 , iBlockColumn );
879  sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
880  }
881 
882  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
883 
884  const size_type nEntry = tensor.num_entry(iy);
885  const size_type iEntryBeg = tensor.entry_begin(iy);
886  const size_type iEntryEnd = iEntryBeg + nEntry;
887  size_type iEntry = iEntryBeg;
888 
889  VectorValue ytmp = 0 ;
890 
891  // Do entries with a blocked loop of size blocksize
892  const size_type nBlock = nEntry / tensor_type::vectorsize;
893  const size_type nEntryB = nBlock * tensor_type::vectorsize;
894  const size_type iEnd = iEntryBeg + nEntryB;
895 
896  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
897  typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
898  typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
899  VecTV vy;
900  vy.zero();
901  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
902  const size_type *j = &tensor.coord(iEntry,0);
903  const size_type *k = &tensor.coord(iEntry,1);
904  ValTV c(&(tensor.value(iEntry)));
905 
906  for ( size_type col = 0; col < block_size; ++col ) {
907  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
908  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
909 
910  // vy += c * ( aj * xk + ak * xj)
911  aj.times_equal(xk);
912  aj.multiply_add(ak, xj);
913  vy.multiply_add(c, aj);
914  }
915  }
916  ytmp += vy.sum();
917 
918  // The number of nonzeros is always constrained to be a multiple of 8
919 
920  const size_type rem = iEntryEnd-iEntry;
921  if (rem >= 8) {
922  typedef TinyVec<ValueType,8,tensor_type::use_intrinsics> ValTV2;
923  typedef TinyVec<MatrixValue,8,tensor_type::use_intrinsics> MatTV2;
924  typedef TinyVec<VectorValue,8,tensor_type::use_intrinsics> VecTV2;
925  const size_type *j = &tensor.coord(iEntry,0);
926  const size_type *k = &tensor.coord(iEntry,1);
927  ValTV2 c(&(tensor.value(iEntry)));
928 
929  for ( size_type col = 0; col < block_size; ++col ) {
930  MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
931  VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
932 
933  // vy += c * ( aj * xk + ak * xj)
934  aj.times_equal(xk);
935  aj.multiply_add(ak, xj);
936  aj.times_equal(c);
937  ytmp += aj.sum();
938  }
939  }
940 
941  y[iy] += ytmp ;
942  }
943 
944  // Add a team barrier to keep the thread team in-sync before going on
945  // to the next block
946  device.team_barrier();
947  }
948 
949  }
950 
951 #else
952 
953  // A general hand-vectorized version of the block multiply algorithm, where
954  // block here means processing multiple FEM columns at a time. Note that
955  // auto-vectorization of a block algorithm doesn't work, because the
956  // stochastic loop is not the inner-most loop.
957  KOKKOS_INLINE_FUNCTION
958  void operator()( const typename Kokkos::TeamPolicy< execution_space >::member_type & device ) const
959  {
960  const size_type iBlockRow = device.league_rank();
961 
962  // Check for valid row
963  const size_type row_count = m_A.graph.row_map.dimension_0()-1;
964  if (iBlockRow >= row_count)
965  return;
966 
967  const size_type num_thread = device.team_size();
968  const size_type thread_idx = device.team_rank();
969  std::pair<size_type,size_type> work_range =
970  compute_work_range(m_A.block.dimension(), num_thread, thread_idx);
971 
972  // Prefer that y[ m_A.block.dimension() ] be scratch space
973  // on the local thread, but cannot dynamically allocate
974  VectorValue * const y = & m_y(0,iBlockRow);
975 
976  // Leading dimension guaranteed contiguous for LayoutLeft
977  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
978  y[j] = 0 ;
979 
980  const tensor_type& tensor = m_A.block.tensor();
981 
982  const size_type iBlockEntryBeg = m_A.graph.row_map[ iBlockRow ];
983  const size_type iBlockEntryEnd = m_A.graph.row_map[ iBlockRow + 1 ];
984  const size_type BlockSize = 14;
985  const size_type numBlock =
986  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
987 
988  const MatrixValue* sh_A[BlockSize];
989  const VectorValue* sh_x[BlockSize];
990 
991  size_type iBlockEntry = iBlockEntryBeg;
992  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
993  const size_type block_size =
994  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
995 
996  for ( size_type col = 0; col < block_size; ++col ) {
997  const size_type iBlockColumn = m_A.graph.entries( iBlockEntry + col );
998  sh_x[col] = & m_x( 0 , iBlockColumn );
999  sh_A[col] = & m_A.values( 0 , iBlockEntry + col );
1000  }
1001 
1002  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
1003 
1004  const size_type nEntry = tensor.num_entry(iy);
1005  const size_type iEntryBeg = tensor.entry_begin(iy);
1006  const size_type iEntryEnd = iEntryBeg + nEntry;
1007  size_type iEntry = iEntryBeg;
1008 
1009  VectorValue ytmp = 0 ;
1010 
1011  // Do entries with a blocked loop of size blocksize
1012  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
1013  const size_type nBlock = nEntry / tensor_type::vectorsize;
1014  const size_type nEntryB = nBlock * tensor_type::vectorsize;
1015  const size_type iEnd = iEntryBeg + nEntryB;
1016 
1017  typedef TinyVec<ValueType,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
1018  typedef TinyVec<MatrixValue,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
1019  typedef TinyVec<VectorValue,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
1020  VecTV vy;
1021  vy.zero();
1022  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
1023  const size_type *j = &tensor.coord(iEntry,0);
1024  const size_type *k = &tensor.coord(iEntry,1);
1025  ValTV c(&(tensor.value(iEntry)));
1026 
1027  for ( size_type col = 0; col < block_size; ++col ) {
1028  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
1029  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
1030 
1031  // vy += c * ( aj * xk + ak * xj)
1032  aj.times_equal(xk);
1033  aj.multiply_add(ak, xj);
1034  vy.multiply_add(c, aj);
1035  }
1036  }
1037  ytmp += vy.sum();
1038  }
1039 
1040  // Do remaining entries with a scalar loop
1041  for ( ; iEntry<iEntryEnd; ++iEntry) {
1042  const size_type j = tensor.coord(iEntry,0);
1043  const size_type k = tensor.coord(iEntry,1);
1044  ValueType cijk = tensor.value(iEntry);
1045 
1046  for ( size_type col = 0; col < block_size; ++col ) {
1047  ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
1048  sh_A[col][k] * sh_x[col][j] );
1049  }
1050 
1051  }
1052 
1053  y[iy] += ytmp ;
1054  }
1055 
1056  // Add a team barrier to keep the thread team in-sync before going on
1057  // to the next block
1058  device.team_barrier();
1059  }
1060 
1061  }
1062 
1063 #endif
1064 
1065  static void apply( const matrix_type & A ,
1066  const block_vector_type & x ,
1067  const block_vector_type & y )
1068  {
1069  // Generally the block algorithm seems to perform better on the MIC,
1070  // as long as the stochastic size isn't too big, but doesn't perform
1071  // any better on the CPU (probably because the CPU has a fat L3 cache
1072  // to store the sparse 3 tensor).
1073 #ifdef __MIC__
1074  const bool use_block_algorithm = true;
1075 #else
1076  const bool use_block_algorithm = false;
1077 #endif
1078 
1079  const size_t row_count = A.graph.row_map.dimension_0() - 1 ;
1080  if (use_block_algorithm) {
1081 #ifdef __MIC__
1082  const size_t team_size = 4; // 4 hyperthreads for MIC
1083 #else
1084  const size_t team_size = 2; // 2 for everything else
1085 #endif
1086  const size_t league_size = row_count;
1087  Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
1088  Kokkos::parallel_for( config , MultiplyImpl(A,x,y) );
1089  }
1090  else {
1091  Kokkos::parallel_for( row_count , MultiplyImpl(A,x,y) );
1092  }
1093  }
1094 };
1095 
1096 //----------------------------------------------------------------------------
1097 
1098 } /* namespace Stokhos */
1099 
1100 //----------------------------------------------------------------------------
1101 //----------------------------------------------------------------------------
1102 
1103 // Inject some functions into the Kokkos namespace
1104 namespace Kokkos {
1105 
1105 
1106  using Stokhos::create_mirror_view;
1107  using Stokhos::deep_copy;
1108 
1109 } // namespace Kokkos
1110 
1111 #endif /* #ifndef STOKHOS_CRSPRODUCTTENSOR_HPP */
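
For orientation, here is a minimal, illustrative sketch of how the pieces above fit together. It assumes a host-accessible execution space, and that `basis` (a Stokhos::ProductBasis<int,double>) and `Cijk` (the corresponding Stokhos::Sparse3Tensor<int,double>) have been built elsewhere; those names and the use of std::vector for the example data are placeholders, not part of this header.

  #include <vector>
  #include "Stokhos_CrsProductTensor.hpp"

  typedef Kokkos::DefaultHostExecutionSpace Device;
  typedef Stokhos::CrsProductTensor<double, Device> tensor_type;

  // Build the CRS product tensor from the basis and sparse 3-tensor.
  tensor_type tensor = Stokhos::create_product_tensor<Device>(basis, Cijk);

  // y(i) += sum_{j,k} C(i,j,k) * a(j) * x(k), using the symmetrized kernel.
  const int n = tensor.dimension();
  std::vector<double> a(n, 1.0), x(n, 1.0), y(n, 0.0);
  Stokhos::BlockMultiply<tensor_type>::apply(
    tensor, a.data(), x.data(), y.data());   // alpha defaults to 1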