/* Implements the sequential cusp vectors. */

#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>
#include <cuda_runtime.h>

#undef __FUNCT__
#define __FUNCT__ "VecCUSPAllocateCheck"
/*
    Allocates space for the vector array on the GPU if it does not exist.
    Does NOT change the PetscCUSPFlag for the vector.
    Does NOT zero the CUSP array.
*/
PetscErrorCode VecCUSPAllocateCheck(Vec v)
{
  cudaError_t  err;
  cudaStream_t stream;
  Vec_CUSP     *veccusp;

  PetscFunctionBegin;
  if (!v->spptr) {
    try {
      v->spptr = new Vec_CUSP;
      veccusp  = (Vec_CUSP*)v->spptr;
      veccusp->GPUarray = new CUSPARRAY;
      veccusp->GPUarray->resize((PetscBLASInt)v->map->n);
      err = cudaStreamCreate(&stream);CHKERRCUSP(err);
      veccusp->stream = stream;
      veccusp->hostDataRegisteredAsPageLocked = PETSC_FALSE;
      v->ops->destroy = VecDestroy_SeqCUSP;
      if (v->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
        if (v->data && ((Vec_Seq*)v->data)->array) {
          v->valid_GPU_array = PETSC_CUSP_CPU;
        } else {
          v->valid_GPU_array = PETSC_CUSP_GPU;
        }
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPCopyToGPU"
/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecCUSPCopyToGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUSP       *veccusp;
  CUSPARRAY      *varray;

  PetscFunctionBegin;
  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyToGPU,v,0,0,0);CHKERRQ(ierr);
    try {
      veccusp = (Vec_CUSP*)v->spptr;
      varray  = veccusp->GPUarray;
      err     = cudaMemcpy(varray->data().get(),((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogEventEnd(VEC_CUSPCopyToGPU,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPCopyToGPUSome"
PetscErrorCode VecCUSPCopyToGPUSome(Vec v, PetscCUSPIndices ci)
{
  CUSPARRAY      *varray;
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *cpuPtr, *gpuPtr;
  Vec_Seq        *s;
  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

  PetscFunctionBegin;
  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_CPU) {
    s = (Vec_Seq*)v->data;

    ierr   = PetscLogEventBegin(VEC_CUSPCopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
    varray = ((Vec_CUSP*)v->spptr)->GPUarray;
    gpuPtr = varray->data().get() + ptop_scatter->recvLowestIndex;
    cpuPtr = s->array + ptop_scatter->recvLowestIndex;

    /* Note: this code copies the smallest contiguous chunk of data containing ALL of the indices */
    err = cudaMemcpy(gpuPtr,cpuPtr,ptop_scatter->nr*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);

    /* Set the buffer states */
    v->valid_GPU_array = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(VEC_CUSPCopyToGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
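/*
   The valid_GPU_array flag implements a simple coherence protocol between the
   host and device copies of the data: PETSC_CUSP_UNALLOCATED, PETSC_CUSP_CPU,
   PETSC_CUSP_GPU, and PETSC_CUSP_BOTH. A minimal sketch of how a caller-side
   routine interacts with it (MyScaleOnGPU is a hypothetical helper, not part
   of this file):

     static PetscErrorCode MyScaleOnGPU(Vec v, PetscScalar a)
     {
       PetscErrorCode ierr;
       CUSPARRAY      *va;

       PetscFunctionBegin;
       ierr = VecCUSPGetArrayReadWrite(v,&va);CHKERRQ(ierr);      // triggers VecCUSPCopyToGPU() if the host copy is newer
       cusp::blas::scal(*va,a);                                   // work on the device view
       ierr = VecCUSPRestoreArrayReadWrite(v,&va);CHKERRQ(ierr);  // marks the host copy out of date (PETSC_CUSP_GPU)
       PetscFunctionReturn(0);
     }
*/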
#undef __FUNCT__
#define __FUNCT__ "VecCUSPCopyFromGPU"
/*
     VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecCUSPCopyFromGPU(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;
  Vec_CUSP       *veccusp;
  CUSPARRAY      *varray;

  PetscFunctionBegin;
  ierr = VecCUSPAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    try {
      veccusp = (Vec_CUSP*)v->spptr;
      varray  = veccusp->GPUarray;
      err     = cudaMemcpy(((Vec_Seq*)v->data)->array,varray->data().get(),v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPCopyFromGPUSome"
/* Note that this function only copies *some* of the values up from the GPU to CPU,
   which means that we need to recombine the data at some point before using any of the standard functions.
   We could add another few flag-types to keep track of this, or treat things like
   VecGetArray()/VecRestoreArray(), where the calls always have to come in pairs.
*/
PetscErrorCode VecCUSPCopyFromGPUSome(Vec v, PetscCUSPIndices ci)
{
  CUSPARRAY      *varray;
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscScalar    *cpuPtr, *gpuPtr;
  Vec_Seq        *s;
  VecScatterCUSPIndices_PtoP ptop_scatter = (VecScatterCUSPIndices_PtoP)ci->scatter;

  PetscFunctionBegin;
  ierr = VecCUSPAllocateCheckHost(v);CHKERRQ(ierr);
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);

    varray = ((Vec_CUSP*)v->spptr)->GPUarray;
    s      = (Vec_Seq*)v->data;
    gpuPtr = varray->data().get() + ptop_scatter->sendLowestIndex;
    cpuPtr = s->array + ptop_scatter->sendLowestIndex;

    /* Note: this code copies the smallest contiguous chunk of data containing ALL of the indices */
    err = cudaMemcpy(cpuPtr,gpuPtr,ptop_scatter->ns*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUSP(err);

    ierr = VecCUSPRestoreArrayRead(v,&varray);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
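/*
   The "smallest contiguous chunk" strategy above trades bandwidth for latency:
   rather than issuing one transfer per scattered index, it issues a single
   cudaMemcpy covering the whole index range. For example, with scatter indices
   {3, 7, 9} the chunk covers entries 3..9 inclusive (7 values), of which only
   3 are needed. A sketch of the arithmetic, using hypothetical values for the
   scatter fields:

     // suppose ptop_scatter->sendLowestIndex == 3 and ptop_scatter->ns == 7
     // gpuPtr = device_base + 3;  cpuPtr = host_base + 3;
     // cudaMemcpy(cpuPtr, gpuPtr, 7*sizeof(PetscScalar), cudaMemcpyDeviceToHost);
*/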
/*MC
   VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

   Options Database Keys:
. -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

  Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
M*/

/* for VecAYPX_SeqCUSP */
namespace cusp
{
namespace blas
{
namespace detail
{
  template <typename T>
  struct AYPX : public thrust::binary_function<T,T,T>
  {
    T alpha;

    AYPX(T _alpha) : alpha(_alpha) {}

    __host__ __device__
    T operator()(T x, T y)
    {
      return alpha * y + x;
    }
  };
}

template <typename ForwardIterator1, typename ForwardIterator2, typename ScalarType>
void aypx(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ScalarType alpha)
{
  thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
}

template <typename Array1, typename Array2, typename ScalarType>
void aypx(const Array1& x, Array2& y, ScalarType alpha)
{
#if defined(CUSP_VERSION) && CUSP_VERSION >= 500
  cusp::assert_same_dimensions(x,y);
#else
  detail::assert_same_dimensions(x,y);
#endif
  aypx(x.begin(),x.end(),y.begin(),alpha);
}
}
}

#undef __FUNCT__
#define __FUNCT__ "VecAYPX_SeqCUSP"
PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  try {
    if (alpha != 0.0) {
      cusp::blas::aypx(*xarray,*yarray,alpha);
      ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
    } else {
      cusp::blas::copy(*xarray,*yarray);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecAXPY_SeqCUSP"
PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (alpha != 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpy(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

struct VecCUSPPointwiseDivide
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "VecPointwiseDivide_SeqCUSP"
PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
{
  CUSPARRAY      *warray=NULL,*xarray=NULL,*yarray=NULL;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
  try {
    thrust::for_each(
      thrust::make_zip_iterator(thrust::make_tuple(warray->begin(),xarray->begin(),yarray->begin())),
      thrust::make_zip_iterator(thrust::make_tuple(warray->end(),xarray->end(),yarray->end())),
      VecCUSPPointwiseDivide());
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
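/*
   Naming follows BLAS conventions: AXPY computes y <- y + alpha*x, while AYPX
   (above) swaps the roles, y <- x + alpha*y. A quick host-side usage sketch
   through the public interface, assuming vectors created with -vec_type seqcusp:

     ierr = VecAYPX(y,alpha,x);CHKERRQ(ierr);  // dispatches to VecAYPX_SeqCUSP: y = x + alpha*y
     ierr = VecAXPY(y,alpha,x);CHKERRQ(ierr);  // dispatches to VecAXPY_SeqCUSP: y = y + alpha*x
*/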
struct VecCUSPWAXPY
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
  }
};

struct VecCUSPSum
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
  }
};

struct VecCUSPDiff
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "VecWAXPY_SeqCUSP"
PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  CUSPARRAY      *xarray=NULL,*yarray=NULL,*warray=NULL;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (alpha == 0.0) {
    ierr = VecCopy_SeqCUSP(yin,win);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
    if (alpha == 1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(thrust::make_tuple(warray->begin(),yarray->begin(),xarray->begin())),
          thrust::make_zip_iterator(thrust::make_tuple(warray->end(),yarray->end(),xarray->end())),
          VecCUSPSum());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
    } else if (alpha == -1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(thrust::make_tuple(warray->begin(),yarray->begin(),xarray->begin())),
          thrust::make_zip_iterator(thrust::make_tuple(warray->end(),yarray->end(),xarray->end())),
          VecCUSPDiff());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
    } else {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(thrust::make_tuple(warray->begin(),yarray->begin(),thrust::make_constant_iterator(alpha),xarray->begin())),
          thrust::make_zip_iterator(thrust::make_tuple(warray->end(),yarray->end(),thrust::make_constant_iterator(alpha),xarray->end())),
          VecCUSPWAXPY());
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(2*win->map->n);CHKERRQ(ierr);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */
struct VecCUSPMAXPY4
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
  }
};

struct VecCUSPMAXPY3
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
  }
};

struct VecCUSPMAXPY2
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
  }
};
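/*
   The unrolling strategy: VecMAXPY_SeqCUSP below first handles nv mod 4
   vectors with the 3-, 2-, or 1-term form (the 1-term case is a plain AXPY),
   then consumes the rest four at a time with VecCUSPMAXPY4, so each pass over
   x fuses as many updates as possible. For example, nv = 7 costs one 3-term
   pass plus one 4-term pass instead of seven single AXPY sweeps over global
   memory.
*/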
#undef __FUNCT__
#define __FUNCT__ "VecMAXPY_SeqCUSP"
PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscInt       n = xin->map->n,j,j_rem;
  PetscScalar    alpha0,alpha1,alpha2,alpha3;

  PetscFunctionBegin;
  ierr = PetscLogFlops(nv*2.0*n);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  switch (j_rem=nv&0x3) {
  case 3:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha += 3;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(xarray->begin(),thrust::make_constant_iterator(alpha0),yy0->begin(),thrust::make_constant_iterator(alpha1),yy1->begin(),thrust::make_constant_iterator(alpha2),yy2->begin())),
        thrust::make_zip_iterator(thrust::make_tuple(xarray->end(),thrust::make_constant_iterator(alpha0),yy0->end(),thrust::make_constant_iterator(alpha1),yy1->end(),thrust::make_constant_iterator(alpha2),yy2->end())),
        VecCUSPMAXPY3());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    y   += 3;
    break;
  case 2:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha += 2;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(xarray->begin(),thrust::make_constant_iterator(alpha0),yy0->begin(),thrust::make_constant_iterator(alpha1),yy1->begin())),
        thrust::make_zip_iterator(thrust::make_tuple(xarray->end(),thrust::make_constant_iterator(alpha0),yy0->end(),thrust::make_constant_iterator(alpha1),yy1->end())),
        VecCUSPMAXPY2());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    y   += 2;
    break;
  case 1:
    alpha0 = *alpha++;
    ierr   = VecAXPY_SeqCUSP(xin,alpha0,y[0]);CHKERRQ(ierr);
    y     += 1;
    break;
  }
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha += 4;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[3],&yy3);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(xarray->begin(),thrust::make_constant_iterator(alpha0),yy0->begin(),thrust::make_constant_iterator(alpha1),yy1->begin(),thrust::make_constant_iterator(alpha2),yy2->begin(),thrust::make_constant_iterator(alpha3),yy3->begin())),
        thrust::make_zip_iterator(thrust::make_tuple(xarray->end(),thrust::make_constant_iterator(alpha0),yy0->end(),thrust::make_constant_iterator(alpha1),yy1->end(),thrust::make_constant_iterator(alpha2),yy2->end(),thrust::make_constant_iterator(alpha3),yy3->end())),
        VecCUSPMAXPY4());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[3],&yy3);CHKERRQ(ierr);
    y   += 4;
  }
  ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecDot_SeqCUSP"
PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
#if defined(PETSC_USE_COMPLEX)
    *z = cusp::blas::dotc(*yarray,*xarray);
#else
    *z = cusp::blas::dot(*yarray,*xarray);
#endif
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
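/*
   Argument order matters for the conjugated dot product: PETSc defines
   VecDot(x,y) as sum_i x_i * conj(y_i), and cusp::blas::dotc conjugates its
   first argument, so dotc(*yarray,*xarray) above conjugates y, as required.
*/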
//
// CUDA kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128

// M = 2:
__global__ void VecMDot_SeqCUSP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3:
__global__ void VecMDot_SeqCUSP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}
// M = 4:
__global__ void VecMDot_SeqCUSP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqCUSP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#undef __FUNCT__
#define __FUNCT__ "VecMDot_SeqCUSP"
PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscErrorCode ierr;
  PetscInt       i,j,n = xin->map->n,current_y_index = 0;
  CUSPARRAY      *xarray,*y0array,*y1array,*y2array,*y3array,*y4array,*y5array,*y6array,*y7array;
  PetscScalar    *group_results_gpu,*xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
  PetscScalar    group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
  cudaError_t    cuda_ierr;

  PetscFunctionBegin;
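  /*
     Strategy: the vectors in yin are consumed in groups, eight at a time
     through VecMDot_SeqCUSP_kernel8 while at least eight remain, with the
     4/3/2/1 cases finishing the remainder. Each kernel reads x once per group
     and writes MDOT_WORKGROUP_NUM partial sums per dot product into
     group_results_gpu; the final accumulation over work groups is done on the
     host below.
  */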
  if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUSP not positive.");
  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    PetscFunctionReturn(0);
  }

  // allocate scratchpad memory for the results of individual work groups:
  cuda_ierr = cudaMalloc((void**)&group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not allocate CUDA work memory. Error code: %d", (int)cuda_ierr);

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  xptr = thrust::raw_pointer_cast(xarray->data());

  while (current_y_index < nv) {
    switch (nv - current_y_index) {
    case 7:
    case 6:
    case 5:
    case 4:
      ierr = VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
      z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());
      y3ptr = thrust::raw_pointer_cast(y3array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);

      // copy results back to CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<4; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
      current_y_index += 4;
      break;
    case 3:
      ierr = VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);

      // copy results back to CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<3; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      current_y_index += 3;
      break;

    case 2:
      ierr = VecCUSPGetArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);

      // copy results back to CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<2; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      current_y_index += 2;
      break;

    case 1:
      ierr = VecCUSPGetArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index] = cusp::blas::dotc(*y0array, *xarray);
#else
      z[current_y_index] = cusp::blas::dot(*xarray, *y0array);
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index],&y0array);CHKERRQ(ierr);
      current_y_index += 1;
      break;
    default: // 8 or more vectors left
      ierr = VecCUSPGetArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+4],&y4array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+5],&y5array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+6],&y6array);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayRead(yin[current_y_index+7],&y7array);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      z[current_y_index]   = cusp::blas::dotc(*y0array,*xarray);
      z[current_y_index+1] = cusp::blas::dotc(*y1array,*xarray);
      z[current_y_index+2] = cusp::blas::dotc(*y2array,*xarray);
      z[current_y_index+3] = cusp::blas::dotc(*y3array,*xarray);
      z[current_y_index+4] = cusp::blas::dotc(*y4array,*xarray);
      z[current_y_index+5] = cusp::blas::dotc(*y5array,*xarray);
      z[current_y_index+6] = cusp::blas::dotc(*y6array,*xarray);
      z[current_y_index+7] = cusp::blas::dotc(*y7array,*xarray);
#else
      // extract raw device pointers:
      y0ptr = thrust::raw_pointer_cast(y0array->data());
      y1ptr = thrust::raw_pointer_cast(y1array->data());
      y2ptr = thrust::raw_pointer_cast(y2array->data());
      y3ptr = thrust::raw_pointer_cast(y3array->data());
      y4ptr = thrust::raw_pointer_cast(y4array->data());
      y5ptr = thrust::raw_pointer_cast(y5array->data());
      y6ptr = thrust::raw_pointer_cast(y6array->data());
      y7ptr = thrust::raw_pointer_cast(y7array->data());

      // run kernel:
      VecMDot_SeqCUSP_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);

      // copy results back to CPU:
      cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);
      if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not copy CUDA buffer to host. Error code: %d", (int)cuda_ierr);

      // sum group results into z:
      for (j=0; j<8; ++j) {
        z[current_y_index + j] = 0;
        for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
      }
#endif
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index  ],&y0array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+1],&y1array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+2],&y2array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+3],&y3array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+4],&y4array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+5],&y5array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+6],&y6array);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayRead(yin[current_y_index+7],&y7array);CHKERRQ(ierr);
      current_y_index += 8;
      break;
    }
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);

  cuda_ierr = cudaFree(group_results_gpu);
  if (cuda_ierr != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Could not free CUDA work memory. Error code: %d", (int)cuda_ierr);

  ierr = PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM

#undef __FUNCT__
#define __FUNCT__ "VecSet_SeqCUSP"
PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
{
  CUSPARRAY      *xarray=NULL;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that */
  ierr = VecCUSPGetArrayWrite(xin,&xarray);CHKERRQ(ierr);
  try {
    cusp::blas::fill(*xarray,alpha);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = VecCUSPRestoreArrayWrite(xin,&xarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecScale_SeqCUSP"
PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
{
  CUSPARRAY      *xarray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (alpha == 0.0) {
    ierr = VecSet_SeqCUSP(xin,alpha);CHKERRQ(ierr);
  } else if (alpha != 1.0) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    try {
      cusp::blas::scal(*xarray,alpha);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecTDot_SeqCUSP"
PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* not working for complex */
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
    *z = cusp::blas::dot(*xarray,*yarray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "VecCopy_SeqCUSP"
PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (xin != yin) {
    if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
      ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
      try {
        cusp::blas::copy(*xarray,*yarray);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = WaitForGPU();CHKERRCUSP(ierr);
      ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
    } else if (xin->valid_GPU_array == PETSC_CUSP_CPU) {
      /* copy on the CPU if we are on the CPU */
      ierr = VecCopy_SeqCUSP_Private(xin,yin);CHKERRQ(ierr);
    } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there
         (because it's probably where we'll want to next use it) */
      if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
        /* copy on the CPU */
        ierr = VecCopy_SeqCUSP_Private(xin,yin);CHKERRQ(ierr);
      } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
        /* copy on the GPU */
        ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
        try {
          cusp::blas::copy(*xarray,*yarray);
          ierr = WaitForGPU();CHKERRCUSP(ierr);
        } catch(char *ex) {
          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
        }
        ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the
           earlier call to allocatecheck); default to copying on the GPU (this is an
           arbitrary choice) */
        ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
        try {
          cusp::blas::copy(*xarray,*yarray);
          ierr = WaitForGPU();CHKERRCUSP(ierr);
        } catch(char *ex) {
          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
        }
        ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else {
        ierr = VecCopy_SeqCUSP_Private(xin,yin);CHKERRQ(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "VecSwap_SeqCUSP"
PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn;
  CUSPARRAY      *xarray,*yarray;
  cublasStatus_t cberr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(xin->map->n,&bn);CHKERRQ(ierr);
  if (xin != yin) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
    cberr = cublasCswap(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuFloatComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
#else
    cberr = cublasZswap(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,(cuDoubleComplex*)VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
    cberr = cublasSswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
#else
    cberr = cublasDswap(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);CHKERRCUBLAS(cberr);
#endif
#endif
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

struct VecCUSPAX
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
  }
};

#undef __FUNCT__
#define __FUNCT__ "VecAXPBY_SeqCUSP"
PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode ierr;
  PetscScalar    a = alpha,b = beta;
  CUSPARRAY      *xarray,*yarray;

  PetscFunctionBegin;
  if (a == 0.0) {
    ierr = VecScale_SeqCUSP(yin,beta);CHKERRQ(ierr);
  } else if (b == 1.0) {
    ierr = VecAXPY_SeqCUSP(yin,alpha,xin);CHKERRQ(ierr);
  } else if (a == 1.0) {
    ierr = VecAYPX_SeqCUSP(yin,beta,xin);CHKERRQ(ierr);
  } else if (b == 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(yarray->begin(),thrust::make_constant_iterator(a),xarray->begin())),
        thrust::make_zip_iterator(thrust::make_tuple(yarray->end(),thrust::make_constant_iterator(a),xarray->end())),
        VecCUSPAX());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(3.0*xin->map->n);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
struct VecCUSPXPBYPCZ
{
  /* z = x + b*y + c*z */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};

struct VecCUSPAXPBYPZ
{
  /* z = a*x + b*y + z */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};
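/*
   VecAXPBYPCZ_SeqCUSP below special-cases alpha == 1.0 and gamma == 1.0:
   fixing a known-unit scalar saves one multiplication per entry, which is why
   the fused-functor branches log 4.0*n flops while the general
   cusp::blas::axpbypcz fallback logs 5.0*n.
*/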
#undef __FUNCT__
#define __FUNCT__ "VecAXPBYPCZ_SeqCUSP"
PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = zin->map->n;
  CUSPARRAY      *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  if (alpha == 1.0) {
    /* z = x + beta*y + gamma*z */
    try {
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(zarray->begin(),thrust::make_constant_iterator(gamma),xarray->begin(),yarray->begin(),thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(thrust::make_tuple(zarray->end(),thrust::make_constant_iterator(gamma),xarray->end(),yarray->end(),thrust::make_constant_iterator(beta))),
        VecCUSPXPBYPCZ());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else if (gamma == 1.0) {
    /* z = alpha*x + beta*y + z */
    try {
      thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(zarray->begin(),xarray->begin(),thrust::make_constant_iterator(alpha),yarray->begin(),thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(thrust::make_tuple(zarray->end(),xarray->end(),thrust::make_constant_iterator(alpha),yarray->end(),thrust::make_constant_iterator(beta))),
        VecCUSPAXPBYPZ());
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else {
    try {
      cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(5.0*n);CHKERRQ(ierr);
  }
  /* the arrays are restored on every branch (the original restored them only on the general one) */
  ierr = VecCUSPRestoreArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecPointwiseMult_SeqCUSP"
PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = win->map->n;
  CUSPARRAY      *xarray,*yarray,*warray;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(win,&warray);CHKERRQ(ierr);
  try {
    cusp::blas::xmy(*xarray,*yarray,*warray);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(win,&warray);CHKERRQ(ierr);
  ierr = PetscLogFlops(n);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  PetscFunctionReturn(0);
}

/* should do infinity norm in cusp */

#undef __FUNCT__
#define __FUNCT__ "VecNorm_SeqCUSP"
PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal *z)
{
  const PetscScalar *xx;
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn;
  CUSPARRAY         *xarray;
  cublasStatus_t    cberr;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(n,&bn);CHKERRQ(ierr);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    try {
      *z = cusp::blas::nrm2(*xarray);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    PetscInt  i;
    PetscReal max = 0.0,tmp;

    ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
    for (i=0; i<n; i++) {
      if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
      /* check special case of tmp == NaN */
      if (tmp != tmp) {max = tmp; break;}
      xx++;
    }
    ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
    *z   = max;
  } else if (type == NORM_1) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
    cberr = cublasScasum(cublasv2handle,bn,(cuFloatComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
#else
    cberr = cublasDzasum(cublasv2handle,bn,(cuDoubleComplex*)VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
#endif
#else
#if defined(PETSC_USE_REAL_SINGLE)
    cberr = cublasSasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
#else
    cberr = cublasDasum(cublasv2handle,bn,VecCUSPCastToRawPtr(*xarray),one,z);CHKERRCUBLAS(cberr);
#endif
#endif
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    ierr = VecNorm_SeqCUSP(xin,NORM_1,z);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUSP(xin,NORM_2,z+1);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
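/*
   Re the "should do infinity norm in cusp" note above VecNorm_SeqCUSP: the
   max-abs reduction could stay on the device with thrust::transform_reduce.
   A minimal sketch for the real-scalar case (AbsFunctor is hypothetical, not
   defined elsewhere in this file; the host loop's NaN handling would still
   need separate treatment):

     struct AbsFunctor {
       __host__ __device__ PetscReal operator()(PetscScalar x) const { return PetscAbsScalar(x); }
     };
     // *z = thrust::transform_reduce(xarray->begin(),xarray->end(),
     //                               AbsFunctor(),(PetscReal)0.0,thrust::maximum<PetscReal>());
*/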
/* The following template functions are for VecDotNorm2_SeqCUSP.
   Note that there is no complex support as currently written. */
template <typename T>
struct cuspdotnormcalculate : thrust::unary_function<T,T>
{
  __host__ __device__
  T operator()(T x)
  {
#if defined(PETSC_USE_COMPLEX)
    //return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
#else
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x), thrust::get<1>(x)*thrust::get<1>(x));
#endif
  }
};

template <typename T>
struct cuspdotnormreduce : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y), thrust::get<1>(x)+thrust::get<1>(y));
  }
};

#undef __FUNCT__
#define __FUNCT__ "VecDotNorm2_SeqCUSP"
PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscErrorCode ierr;
  PetscScalar    zero = 0.0;
  PetscReal      n = s->map->n;
  thrust::tuple<PetscScalar,PetscScalar> result;
  CUSPARRAY      *sarray,*tarray;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(t,&tarray);CHKERRQ(ierr);
  try {
#if defined(PETSC_USE_COMPLEX)
    ierr = VecDot_SeqCUSP(s,t,dp);CHKERRQ(ierr);
    ierr = VecDot_SeqCUSP(t,t,nm);CHKERRQ(ierr);
#else
    result = thrust::transform_reduce(
               thrust::make_zip_iterator(thrust::make_tuple(sarray->begin(), tarray->begin())),
               thrust::make_zip_iterator(thrust::make_tuple(sarray->end(), tarray->end())),
               cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
               thrust::make_tuple(zero,zero),                                  /* init */
               cuspdotnormreduce<thrust::tuple<PetscScalar,PetscScalar> >());  /* binary function */
    *dp = thrust::get<0>(result);
    *nm = thrust::get<1>(result);
#endif
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(t,&tarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecDestroy_SeqCUSP"
PetscErrorCode VecDestroy_SeqCUSP(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  try {
    if (v->spptr) {
      delete ((Vec_CUSP*)v->spptr)->GPUarray;
      err = cudaStreamDestroy(((Vec_CUSP*)v->spptr)->stream);CHKERRCUSP(err);
      delete (Vec_CUSP*)v->spptr;
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecDestroy_SeqCUSP_Private(v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return cusp::conj(x);
  }
};
#endif

#undef __FUNCT__
#define __FUNCT__ "VecConjugate_SeqCUSP"
PetscErrorCode VecConjugate_SeqCUSP(Vec xin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
  thrust::transform(xarray->begin(), xarray->end(), xarray->begin(), conjugate());
#endif
  ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
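/*
   VecGetLocalVector_SeqCUSP/VecRestoreLocalVector_SeqCUSP below avoid copies
   whenever v is petscnative: w simply borrows v's data pointer, GPU-validity
   flag, and spptr, and the restore hands them back (flushing the device copy
   via VecCUSPCopyFromGPU). The non-native fallback goes through
   VecGetArray()/VecRestoreArray() instead.
*/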
#undef __FUNCT__
#define __FUNCT__ "VecGetLocalVector_SeqCUSP"
PetscErrorCode VecGetLocalVector_SeqCUSP(Vec v,Vec w)
{
  VecType        t;
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(v,VEC_CLASSID,1);
  PetscValidHeaderSpecific(w,VEC_CLASSID,2);
  ierr = VecGetType(w,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);

  if (w->data) {
    if (((Vec_Seq*)w->data)->array_allocated) {
      ierr = PetscFree(((Vec_Seq*)w->data)->array_allocated);CHKERRQ(ierr);
    }
    ((Vec_Seq*)w->data)->array           = 0;
    ((Vec_Seq*)w->data)->array_allocated = 0;
    ((Vec_Seq*)w->data)->unplacedarray   = 0;
  }
  if (w->spptr) {
    if (((Vec_CUSP*)w->spptr)->GPUarray) delete ((Vec_CUSP*)w->spptr)->GPUarray;
    err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
    delete (Vec_CUSP*)w->spptr;
    w->spptr = 0;
  }

  if (v->petscnative) {
    ierr = PetscFree(w->data);CHKERRQ(ierr);
    w->data            = v->data;
    w->valid_GPU_array = v->valid_GPU_array;
    w->spptr           = v->spptr;
    ierr = PetscObjectStateIncrease((PetscObject)w);CHKERRQ(ierr);
  } else {
    ierr = VecGetArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
    w->valid_GPU_array = PETSC_CUSP_CPU;
    ierr = VecCUSPAllocateCheck(w);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecRestoreLocalVector_SeqCUSP"
PetscErrorCode VecRestoreLocalVector_SeqCUSP(Vec v,Vec w)
{
  VecType        t;
  PetscErrorCode ierr;
  cudaError_t    err;
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(v,VEC_CLASSID,1);
  PetscValidHeaderSpecific(w,VEC_CLASSID,2);
  ierr = VecGetType(w,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed to argument #2. Should be %s.\n",t,VECSEQCUSP);

  if (v->petscnative) {
    v->data            = w->data;
    v->valid_GPU_array = w->valid_GPU_array;
    v->spptr           = w->spptr;
    ierr = VecCUSPCopyFromGPU(v);CHKERRQ(ierr);
    ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
    w->data            = 0;
    w->valid_GPU_array = PETSC_CUSP_UNALLOCATED;
    w->spptr           = 0;
  } else {
    ierr = VecRestoreArray(v,&((Vec_Seq*)w->data)->array);CHKERRQ(ierr);
    if ((Vec_CUSP*)w->spptr) {
      delete ((Vec_CUSP*)w->spptr)->GPUarray;
      err = cudaStreamDestroy(((Vec_CUSP*)w->spptr)->stream);CHKERRCUSP(err);
      delete (Vec_CUSP*)w->spptr;
      w->spptr = 0;
    }
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPGetArrayReadWrite"
/*@C
   VecCUSPGetArrayReadWrite - Provides access to the CUSP vector inside a vector.

   This function has semantics similar to VecGetArray(): the CUSP vector returned by this
   function points to a consistent view of the vector data. This may involve a copy operation
   of data from the host to the device if the data on the device is out of date. If the device
   memory hasn't been allocated previously it will be allocated as part of this function call.
   VecCUSPGetArrayReadWrite() assumes that the user will modify the vector data. This is
   similar to intent(inout) in Fortran.

   The CUSP device vector has to be released by calling VecCUSPRestoreArrayReadWrite(). Upon
   restoring the vector data the data on the host will be marked as out of date. A subsequent
   access of the host data will thus incur a data transfer from the device to the host.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUSP device vector

   Fortran note:
   This function is not currently available from Fortran.
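   Example usage, a minimal sketch of a get/restore pair for read-write access:
.vb
   CUSPARRAY *a;
   ierr = VecCUSPGetArrayReadWrite(v,&a);CHKERRQ(ierr);
   cusp::blas::scal(*a,2.0);
   ierr = VecCUSPRestoreArrayReadWrite(v,&a);CHKERRQ(ierr);
.ve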
   Level: intermediate

.seealso: VecCUSPRestoreArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPGetArrayReadWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *a   = 0;
  ierr = VecCUSPCopyToGPU(v);CHKERRQ(ierr);
  *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreArrayReadWrite"
/*@C
   VecCUSPRestoreArrayReadWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayReadWrite().

   This marks the host data as out of date. Subsequent access to the vector data on the host
   side with, for instance, VecGetArray() incurs a data transfer.

   Input Parameters:
+  v - the vector
-  a - the CUSP device vector. This pointer is invalid after VecCUSPRestoreArrayReadWrite() returns.

   Fortran note:
   This function is not currently available from Fortran.

   Level: intermediate

.seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayReadWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  v->valid_GPU_array = PETSC_CUSP_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPGetArrayRead"
/*@C
   VecCUSPGetArrayRead - Provides read access to the CUSP device vector inside a vector.

   This function is analogous to VecGetArrayRead(): the CUSP vector returned by this function
   points to a consistent view of the vector data. This may involve a copy operation of data
   from the host to the device if the data on the device is out of date. If the device memory
   hasn't been allocated previously it will be allocated as part of this function call.
   VecCUSPGetArrayRead() assumes that the user will not modify the vector data. This is
   analogous to intent(in) in Fortran.

   The CUSP device vector has to be released by calling VecCUSPRestoreArrayRead(). If the data
   on the host side was previously up to date it will remain so, i.e. data on both the device
   and the host is up to date. Accessing data on the host side does not incur a device to host
   data transfer.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUSP device vector

   Fortran note:
   This function is not currently available from Fortran.

   Level: intermediate

.seealso: VecCUSPRestoreArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPGetArrayRead(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *a   = 0;
  ierr = VecCUSPCopyToGPU(v);CHKERRQ(ierr);
  *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreArrayRead"
/*@C
   VecCUSPRestoreArrayRead - Restore a CUSP device vector previously acquired with VecCUSPGetArrayRead().

   If the data on the host side was previously up to date it will remain so, i.e. data on both
   the device and the host is up to date. Accessing data on the host side e.g. with VecGetArray()
   does not incur a device to host data transfer.

   Input Parameters:
+  v - the vector
-  a - the CUSP device vector. This pointer is invalid after VecCUSPRestoreArrayRead() returns.

   Fortran note:
   This function is not currently available from Fortran.
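   Example usage, a minimal sketch of a get/restore pair for read access:
.vb
   CUSPARRAY *a;
   ierr = VecCUSPGetArrayRead(v,&a);CHKERRQ(ierr);
   nrm  = cusp::blas::nrm2(*a);
   ierr = VecCUSPRestoreArrayRead(v,&a);CHKERRQ(ierr);
.ve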
#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreArrayRead"
/*@C
   VecCUSPRestoreArrayRead - Restore a CUSP device vector previously acquired with VecCUSPGetArrayRead().

   If the data on the host side was previously up to date it will remain so,
   i.e. data on both the device and the host is up to date. Accessing data on
   the host side, e.g. with VecGetArray(), does not incur a device to host
   data transfer.

   Input Parameters:
+  v - the vector
-  a - the CUSP device vector. This pointer is invalid after VecCUSPRestoreArrayRead() returns.

   Fortran note:
   This function is not currently available from Fortran.

   Level: intermediate

.seealso: VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayRead(Vec v, CUSPARRAY **a)
{
  PetscFunctionBegin;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPGetArrayWrite"
/*@C
   VecCUSPGetArrayWrite - Provides write access to the CUSP device vector inside a vector.

   The data pointed to by the device vector is uninitialized. The user must
   not read from this data. Furthermore, the entire array needs to be filled
   by the user to obtain well-defined behaviour. The device memory will be
   allocated by this function if it hasn't been allocated previously. This is
   analogous to intent(out) in Fortran.

   The CUSP device vector needs to be released with
   VecCUSPRestoreArrayWrite(). When the vector is released, the host data of
   the vector is marked as out of date. Subsequent access of the host data
   with e.g. VecGetArray() incurs a device to host data transfer.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUSP device vector

   Fortran note:
   This function is not currently available from Fortran.

   Level: intermediate

.seealso: VecCUSPRestoreArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecGetArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPGetArrayWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  *a   = 0;
  ierr = VecCUSPAllocateCheck(v);CHKERRQ(ierr);
  *a   = ((Vec_CUSP*)v->spptr)->GPUarray;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreArrayWrite"
/*@C
   VecCUSPRestoreArrayWrite - Restore a CUSP device vector previously acquired with VecCUSPGetArrayWrite().

   Data on the host will be marked as out of date. Subsequent access of the
   data on the host side, e.g. with VecGetArray(), will incur a device to
   host data transfer.

   Input Parameters:
+  v - the vector
-  a - the CUSP device vector. This pointer is invalid after VecCUSPRestoreArrayWrite() returns.

   Fortran note:
   This function is not currently available from Fortran.

   Level: intermediate

.seealso: VecCUSPGetArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreArrayWrite(Vec v, CUSPARRAY **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  v->valid_GPU_array = PETSC_CUSP_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
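/*
   Usage sketch (untested and illustrative): the write-only accessor avoids a
   host-to-device copy when every entry is about to be overwritten.
   thrust::fill() applies because CUSPARRAY is a cusp::array1d in device
   memory with standard iterators.

     CUSPARRAY *varray;
     ierr = VecCUSPGetArrayWrite(v,&varray);CHKERRQ(ierr);         // contents uninitialized
     thrust::fill(varray->begin(),varray->end(),(PetscScalar)1.0); // must fill ALL entries
     ierr = VecCUSPRestoreArrayWrite(v,&varray);CHKERRQ(ierr);     // host copy now stale
*/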
#undef __FUNCT__
#define __FUNCT__ "VecCUSPGetCUDAArrayReadWrite"
/*@C
   VecCUSPGetCUDAArrayReadWrite - Provides access to the CUDA buffer inside a vector.

   This function has semantics similar to VecGetArray(): the pointer returned
   by this function points to a consistent view of the vector data. This may
   involve a copy operation of data from the host to the device if the data
   on the device is out of date. If the device memory hasn't been allocated
   previously it will be allocated as part of this function call.
   VecCUSPGetCUDAArrayReadWrite() assumes that the user will modify the
   vector data. This is similar to intent(inout) in Fortran.

   The CUDA device pointer has to be released by calling
   VecCUSPRestoreCUDAArrayReadWrite(). Upon restoring the vector data, the
   data on the host will be marked as out of date. A subsequent access of the
   host data will thus incur a data transfer from the device to the host.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUDA device pointer

   Fortran note:
   This function is not currently available from Fortran.

   Level: advanced

.seealso: VecCUSPRestoreCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayReadWrite(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;
  CUSPARRAY      *cusparray;

  PetscFunctionBegin;
  PetscValidPointer(a,2);
  ierr = VecCUSPGetArrayReadWrite(v, &cusparray);CHKERRQ(ierr);
  *a   = thrust::raw_pointer_cast(cusparray->data());
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreCUDAArrayReadWrite"
/*@C
   VecCUSPRestoreCUDAArrayReadWrite - Restore a device pointer previously acquired with VecCUSPGetCUDAArrayReadWrite().

   This marks the host data as out of date. Subsequent access to the vector
   data on the host side with, for instance, VecGetArray() incurs a data
   transfer.

   Input Parameters:
+  v - the vector
-  a - the CUDA device pointer. This pointer is invalid after VecCUSPRestoreCUDAArrayReadWrite() returns.

   Fortran note:
   This function is not currently available from Fortran.

   Level: advanced

.seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayReadWrite(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  v->valid_GPU_array = PETSC_CUSP_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPGetCUDAArrayRead"
/*@C
   VecCUSPGetCUDAArrayRead - Provides read access to the CUDA buffer inside a vector.

   This function is analogous to VecGetArrayRead(): the pointer returned by
   this function points to a consistent view of the vector data. This may
   involve a copy operation of data from the host to the device if the data
   on the device is out of date. If the device memory hasn't been allocated
   previously it will be allocated as part of this function call.
   VecCUSPGetCUDAArrayRead() assumes that the user will not modify the vector
   data. This is analogous to intent(in) in Fortran.

   The CUDA device pointer has to be released by calling
   VecCUSPRestoreCUDAArrayRead(). If the data on the host side was previously
   up to date it will remain so, i.e. data on both the device and the host is
   up to date. Accessing data on the host side does not incur a device to
   host data transfer.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUDA device pointer

   Fortran note:
   This function is not currently available from Fortran.

   Level: advanced

.seealso: VecCUSPRestoreCUDAArrayRead(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayRead(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;
  CUSPARRAY      *cusparray;

  PetscFunctionBegin;
  PetscValidPointer(a,2);
  ierr = VecCUSPGetArrayRead(v, &cusparray);CHKERRQ(ierr);
  *a   = thrust::raw_pointer_cast(cusparray->data());
  PetscFunctionReturn(0);
}
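/*
   Usage sketch (untested and illustrative): the raw-pointer accessors exist
   so that hand-written CUDA kernels (or libraries taking plain device
   pointers) can operate on the vector directly. The kernel square() below is
   an assumed example, not part of this file; n is the assumed local length.

     __global__ void square(PetscScalar *x,PetscInt n)
     {
       PetscInt i = blockIdx.x*blockDim.x + threadIdx.x;
       if (i < n) x[i] *= x[i];
     }

     PetscScalar *d_x;
     ierr = VecCUSPGetCUDAArrayReadWrite(v,&d_x);CHKERRQ(ierr);
     square<<<(n+255)/256,256>>>(d_x,n);
     ierr = VecCUSPRestoreCUDAArrayReadWrite(v,&d_x);CHKERRQ(ierr);
*/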
#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreCUDAArrayRead"
/*@C
   VecCUSPRestoreCUDAArrayRead - Restore a device pointer previously acquired with VecCUSPGetCUDAArrayRead().

   If the data on the host side was previously up to date it will remain so,
   i.e. data on both the device and the host is up to date. Accessing data on
   the host side, e.g. with VecGetArray(), does not incur a device to host
   data transfer.

   Input Parameters:
+  v - the vector
-  a - the CUDA device pointer. This pointer is invalid after VecCUSPRestoreCUDAArrayRead() returns.

   Fortran note:
   This function is not currently available from Fortran.

   Level: advanced

.seealso: VecCUSPGetCUDAArrayRead(), VecCUSPGetCUDAArrayWrite(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayRead(Vec v, PetscScalar **a)
{
  PetscFunctionBegin;
  /* Read-only access: the validity flag is left unchanged, matching
     VecCUSPRestoreArrayRead(). (Unconditionally setting PETSC_CUSP_BOTH here
     would wrongly mark a stale host copy as valid.) */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPGetCUDAArrayWrite"
/*@C
   VecCUSPGetCUDAArrayWrite - Provides write access to the CUDA buffer inside a vector.

   The data pointed to by the device pointer is uninitialized. The user must
   not read from this data. Furthermore, the entire array needs to be filled
   by the user to obtain well-defined behaviour. The device memory will be
   allocated by this function if it hasn't been allocated previously. This is
   analogous to intent(out) in Fortran.

   The device pointer needs to be released with
   VecCUSPRestoreCUDAArrayWrite(). When the pointer is released, the host
   data of the vector is marked as out of date. Subsequent access of the host
   data with e.g. VecGetArray() incurs a device to host data transfer.

   Input Parameter:
.  v - the vector

   Output Parameter:
.  a - the CUDA device pointer

   Fortran note:
   This function is not currently available from Fortran.

   Level: advanced

.seealso: VecCUSPRestoreCUDAArrayWrite(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPGetCUDAArrayWrite(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;
  CUSPARRAY      *cusparray;

  PetscFunctionBegin;
  PetscValidPointer(a,2);
  ierr = VecCUSPGetArrayWrite(v, &cusparray);CHKERRQ(ierr);
  *a   = thrust::raw_pointer_cast(cusparray->data());
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPRestoreCUDAArrayWrite"
/*@C
   VecCUSPRestoreCUDAArrayWrite - Restore a device pointer previously acquired with VecCUSPGetCUDAArrayWrite().

   Data on the host will be marked as out of date. Subsequent access of the
   data on the host side, e.g. with VecGetArray(), will incur a device to
   host data transfer.

   Input Parameters:
+  v - the vector
-  a - the CUDA device pointer. This pointer is invalid after VecCUSPRestoreCUDAArrayWrite() returns.

   Fortran note:
   This function is not currently available from Fortran.

   Level: advanced

.seealso: VecCUSPGetCUDAArrayWrite(), VecCUSPGetCUDAArrayReadWrite(), VecCUSPGetCUDAArrayRead(), VecCUSPGetArrayReadWrite(), VecCUSPGetArrayRead(), VecCUSPGetArrayWrite(), VecGetArray(), VecRestoreArray(), VecGetArrayRead()
@*/
PETSC_EXTERN PetscErrorCode VecCUSPRestoreCUDAArrayWrite(Vec v, PetscScalar **a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  v->valid_GPU_array = PETSC_CUSP_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
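/*
   Usage sketch (untested and illustrative): write-only raw-pointer access,
   here zeroing the device buffer with the CUDA runtime so that no stale data
   is first copied to the device. n is the assumed local length.

     PetscScalar *d_x;
     cudaError_t err;
     ierr = VecCUSPGetCUDAArrayWrite(v,&d_x);CHKERRQ(ierr);
     err  = cudaMemset(d_x,0,n*sizeof(PetscScalar));CHKERRCUSP(err);
     ierr = VecCUSPRestoreCUDAArrayWrite(v,&d_x);CHKERRQ(ierr);
*/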
#undef __FUNCT__
#define __FUNCT__ "VecCUSPPlaceArray"
/*@C
   VecCUSPPlaceArray - Allows one to replace the array in a vector with a
   CUSPARRAY provided by the user. This is useful to avoid copying a
   CUSPARRAY into a vector.

   Not Collective

   Input Parameters:
+  vin - the vector
-  a - the CUSPARRAY

   Notes:
   You can return to the original CUSPARRAY with a call to
   VecCUSPResetArray(). It is not possible to use VecCUSPPlaceArray() and
   VecPlaceArray() at the same time on the same vector.

   Level: developer

.seealso: VecPlaceArray(), VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPReplaceArray()
@*/
PetscErrorCode VecCUSPPlaceArray(Vec vin,CUSPARRAY *a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUSPCopyToGPU(vin);CHKERRQ(ierr);
  if (((Vec_Seq*)vin->data)->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"VecCUSPPlaceArray()/VecPlaceArray() was already called on this vector, without a call to VecCUSPResetArray()/VecResetArray()");
  ((Vec_Seq*)vin->data)->unplacedarray = (PetscScalar*)((Vec_CUSP*)vin->spptr)->GPUarray; /* save the previous CUSPARRAY so reset can bring it back */
  ((Vec_CUSP*)vin->spptr)->GPUarray    = a;
  vin->valid_GPU_array                 = PETSC_CUSP_GPU;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPReplaceArray"
/*@C
   VecCUSPReplaceArray - Allows one to replace the CUSPARRAY in a vector with
   a CUSPARRAY provided by the user. This is useful to avoid copying a
   CUSPARRAY into a vector.

   Not Collective

   Input Parameters:
+  vin - the vector
-  a - the CUSPARRAY

   Notes:
   This permanently replaces the CUSPARRAY and frees the memory associated
   with the old CUSPARRAY. The memory passed in CANNOT be freed by the user.
   It will be freed when the vector is destroyed.

   Not supported from Fortran

   Level: developer

.seealso: VecGetArray(), VecRestoreArray(), VecPlaceArray(), VecResetArray(), VecCUSPResetArray(), VecCUSPPlaceArray(), VecReplaceArray()
@*/
PetscErrorCode VecCUSPReplaceArray(Vec vin,CUSPARRAY *a)
{
  PetscFunctionBegin;
  delete ((Vec_CUSP*)vin->spptr)->GPUarray;
  ((Vec_CUSP*)vin->spptr)->GPUarray = a;
  vin->valid_GPU_array              = PETSC_CUSP_GPU;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCUSPResetArray"
/*@C
   VecCUSPResetArray - Resets a vector to use its default memory. Call this
   after the use of VecCUSPPlaceArray().

   Not Collective

   Input Parameter:
.  vin - the vector

   Level: developer

.seealso: VecGetArray(), VecRestoreArray(), VecReplaceArray(), VecPlaceArray(), VecResetArray(), VecCUSPPlaceArray(), VecCUSPReplaceArray()
@*/
PetscErrorCode VecCUSPResetArray(Vec vin)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCUSPCopyToGPU(vin);CHKERRQ(ierr);
  ((Vec_CUSP*)vin->spptr)->GPUarray    = (CUSPARRAY*)((Vec_Seq*)vin->data)->unplacedarray; /* restore the saved CUSPARRAY */
  ((Vec_Seq*)vin->data)->unplacedarray = 0;
  vin->valid_GPU_array                 = PETSC_CUSP_GPU;
  PetscFunctionReturn(0);
}
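/*
   Usage sketch (untested and illustrative): temporarily substituting a
   user-owned CUSPARRAY for the vector's storage, then restoring the original
   array. myarray is an assumed user-owned cusp::array1d of the correct
   length.

     ierr = VecCUSPPlaceArray(v,myarray);CHKERRQ(ierr); // v now aliases myarray
     ...  all operations on v read and write myarray ...
     ierr = VecCUSPResetArray(v);CHKERRQ(ierr);         // v uses its own array again
*/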