Kokkos_CrsMatrix_MP_Vector_Cuda.hpp
// @HEADER
// ***********************************************************************
//
// Stokhos Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined(__CUDACC__)

#include "Kokkos_Core.hpp"
//----------------------------------------------------------------------------
// Specializations of Kokkos::CrsMatrix for Sacado::MP::Vector scalar type
// and Cuda device
//----------------------------------------------------------------------------

namespace Stokhos {

namespace details {

// We can make the ensemble multiply kernel slightly faster for
// NumPerThread == 1 by limiting it to at most 32 registers per thread,
// which gives 100% occupancy; these launch bounds achieve that on Kepler.
// For NumPerThread > 1 the same bounds are slower because too many
// registers spill.
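// (__launch_bounds__(1024,2) requests 2 resident blocks of 1024 threads
// each, i.e. 2048 resident threads per SM -- the Kepler maximum -- which
// forces the compiler to use at most 64K/2048 = 32 registers per thread.)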
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}

// Kernel implementing y = A * x for Cuda device where
//   A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>*,...>,
//   x and y are rank 1
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently only works for statically sized MP::Vector
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< Kokkos::CrsMatrix<Sacado::MP::Vector<MatrixStorage>,
                                    MatrixOrdinal,
                                    Kokkos::Cuda,
                                    MatrixMemory,
                                    MatrixSize>,
                  Kokkos::View< Sacado::MP::Vector<InputStorage>*,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>*,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Cuda Device;
  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::CrsMatrix<MatrixValue,
                            MatrixOrdinal,
                            Kokkos::Cuda,
                            MatrixMemory,
                            MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Multiply that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-1 vectors
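  // Within a block, threadIdx.y selects the matrix row a thread works on
  // and threadIdx.x selects a strided subset of the ensemble components:
  // each thread accumulates NumPerThread coefficients spaced blockDim.x
  // apart (see the e/ee loops in operator() below).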
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.dimension_0()-1 )
      {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;

        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain
          // coalesced accesses (no __syncthreads() needed assuming
          // blockDim.x <= warp_size).
          // Note: it might be a little faster if we ensured aligned access
          // to m_A.graph.entries() and m_A.values() below.
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          for ( size_type col = 0; col < num_col; ++col ) {
            size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }

        }

        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block. Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM. We use these values by default if
    // the user-specified block dimensions are zero.
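    //
    // For example, a double ensemble of size 16 with dev_config.block_dim
    // left at zero gives threads_per_vector = 16, rows_per_block =
    // 256/16 = 16, block = dim3(16,16,1), and num_per_thread = 1.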
    const size_type value_dimension = Kokkos::dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.dimension_0()-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Check threads_per_vector is not greater than warp size (kernels assume
    // this)
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    TEUCHOS_TEST_FOR_EXCEPTION(
      threads_per_vector > warp_size, std::logic_error,
      "Threads/vector cannot exceed Cuda warp size");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

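    // Shared memory holds block.y+1 staged row offsets followed by
    // block.x*block.y staged column indices, all of type size_type.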
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

// Kernel implementing y = A * x for Cuda device where
//   A == Kokkos::CrsMatrix< Sacado::MP::Vector<...>,...>,
//   x, y == Kokkos::View< Sacado::MP::Vector<...>**,...>,
//   x and y are rank 2
// We spell everything out here to make sure the ranks and devices match.
//
// This implementation uses the underlying 2-D view directly.
// Currently only works for statically sized MP::Vector
template <typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP,
          typename Update>
class MPMultiply< Kokkos::CrsMatrix<Sacado::MP::Vector<MatrixStorage>,
                                    MatrixOrdinal,
                                    Kokkos::Cuda,
                                    MatrixMemory,
                                    MatrixSize>,
                  Kokkos::View< Sacado::MP::Vector<InputStorage>**,
                                InputP... >,
                  Kokkos::View< Sacado::MP::Vector<OutputStorage>**,
                                OutputP... >,
                  Update,
                  void
                  >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage> InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Cuda Device;
  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::CrsMatrix<MatrixValue,
                            MatrixOrdinal,
                            Kokkos::Cuda,
                            MatrixMemory,
                            MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;
  typedef Update update_type;

  // Specialization that uses shared memory for coalesced access of sparse
  // graph row offsets and column indices and Rank-2 vectors
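  // The thread mapping matches the rank-1 kernel above (threadIdx.y = row,
  // threadIdx.x strides through ensemble components); in addition each
  // thread loops over the m_num_vec_cols right-hand-side columns one at a
  // time, re-staging the sparse row for every column (see the note on loop
  // ordering in operator() below).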
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type m_update;
    const size_type m_row_count;
    const size_type m_num_vec_cols;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type& update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.dimension_0()-1 )
      , m_num_vec_cols( x.dimension_1() )
      {}

    __device__
    inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        // We could potentially improve performance by putting this loop
        // inside the matrix column loop, however that would require storing
        // an array for sum over vector columns, which would require shared
        // memory.
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;

          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
                blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time to maintain
            // coalesced accesses (no __syncthreads() needed assuming
            // blockDim.x <= warp_size).
            // Note: it might be a little faster if we ensured aligned access
            // to m_A.graph.entries() and m_A.values() below.
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();

            for ( size_type col = 0; col < num_col; ++col ) {
              size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }

          }

          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }

        }
      }
    } // operator()
  };

  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid sizes.
    //
    // For Kepler, the best block size appears to be 256 threads with
    // 16 threads per vector for double precision, yielding 16 rows per
    // block. Due to register usage, this gives 64 or 48 warps per SM
    // and thus 8 or 6 blocks per SM. We use these values by default if
    // the user-specified block dimensions are zero.
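    //
    // For example, an ensemble of size 32 with dev_config.block_dim left at
    // zero would give threads_per_vector = 32, rows_per_block = 256/32 = 8,
    // block = dim3(32,8,1), and num_per_thread = 1.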
    const size_type value_dimension = dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.dimension_0()-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Launch kernel based on static number of entries per thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error, "Invalid num_per_thread == " << num_per_thread);
  }

private:

  // Function to launch our kernel, templated on number of entries per thread
  // and shared memory choice
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           dim3 block,
                           dim3 grid)
  {
    typedef Kernel<num_per_thread> Krnl;

    // Use this to check occupancy, 64 is 100% on Kepler
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

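    // Shared memory layout is the same as in the rank-1 kernel; select the
    // bank width to match sizeof(size_type) so the staged offsets/indices
    // can be read without bank conflicts (Kepler supports 4- or 8-byte
    // banks).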
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details

} // namespace Stokhos

#endif /* #if defined(__CUDACC__) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */