Actual source code: ex2.c

  1: static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
  2: /*
  3:   SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  4:   operations in the default stream and does not sync these operations since it assumes routines consuming
  5:   the destination data are also on the default stream. However, when destination data is on CPU,
  6:   SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
  7:  */

  9: #include <petscvec.h>
 10: int main(int argc,char **argv)
 11: {
 12:   PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
 13:   PetscScalar        *val;
 14:   const PetscScalar  *yval;
 15:   Vec                x,y;
 16:   PetscMPIInt        size;
 17:   IS                 ix,iy;
 18:   VecScatter         vscat;
 19:   /* Scatter x into y, then read y on the host right away; every PETSc/MPI return code is checked so a failure exits main() with that code */
 20:   PetscCall(PetscInitialize(&argc,&argv,(char*)0,help));
 21:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
 22:   PetscCheck(size == 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uniprocessor test"); /* VecCreateSeq() below needs a one-rank communicator */

 24:   /* Create two CUDA vectors x, y. Though we only care about y's memory on host, we make y a CUDA vector,
 25:      since we want to have y's memory on host pinned (i.e., non-pageable), to really trigger asynchronous
 26:      cudaMemcpyDeviceToHost.
 27:    */
 28:   PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&x));
 29:   PetscCall(VecSetFromOptions(x));
 30:   PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&y));
 31:   PetscCall(VecSetFromOptions(y));

 33:   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
 34:   PetscCall(VecGetArray(x,&val));
 35:   for (i=0; i<n; i++) val[i] = i/2.0;
 36:   PetscCall(VecRestoreArray(x,&val));
 37:   PetscCall(VecScale(x,2.0)); /* x[i] = i after scaling */
 38:   PetscCall(VecSet(y,314));

 40:   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
 41:   PetscCall(VecGetArray(y,&val));
 42:   PetscCall(VecRestoreArray(y,&val));

 44:   /* The vscat is simply a vector copy */
 45:   PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix));
 46:   PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy));
 47:   PetscCall(VecScatterCreate(x,ix,y,iy,&vscat));

 49:   /* Do device to host vecscatter and then immediately use y on host. VecScatter/SF may use asynchronous
 50:      cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed.
 51:    */
 52:   PetscCall(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
 53:   PetscCall(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
 54:   PetscCall(VecGetArrayRead(y,&yval));
 55:   /* Display the first and the last entries of y to see if it is valid on host */
 56:   PetscCall(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1])));
 57:   PetscCall(VecRestoreArrayRead(y,&yval));

 59:   PetscCall(VecDestroy(&x));
 60:   PetscCall(VecDestroy(&y));
 61:   PetscCall(ISDestroy(&ix));
 62:   PetscCall(ISDestroy(&iy));
 63:   PetscCall(VecScatterDestroy(&vscat));
 64:   PetscCall(PetscFinalize());
 65:   return 0;
 66: }

 68: /*TEST

 70:    test:
 71:     requires: cuda
 72:     diff_args: -j
 73:     #make sure the host memory is pinned
 74:     # sf_backend cuda is not needed if compiling only with cuda
 75:     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

 77:    test:
 78:     suffix: hip
 79:     requires: hip
 80:     diff_args: -j
 81:     output_file: output/ex2_1.out
 82:     #make sure the host memory is pinned
 83:     # sf_backend hip is not needed if compiling only with hip
 84:     args:  -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

 86: TEST*/