/*
   This file contains routines for Parallel vector operations.
 */
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
#include <../src/vec/vec/impls/seq/seqcuda/cudavecimpl.h>

#undef __FUNCT__
#define __FUNCT__ "VecDestroy_MPICUDA"
PetscErrorCode VecDestroy_MPICUDA(Vec v)
{
  PetscErrorCode ierr;
  cudaError_t    err;

  PetscFunctionBegin;
  if (v->spptr) {
    if (((Vec_CUDA*)v->spptr)->GPUarray) {
      err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray);CHKERRCUDA(err);
      ((Vec_CUDA*)v->spptr)->GPUarray = NULL;
    }
    err  = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
    ierr = PetscFree(v->spptr);CHKERRQ(ierr);
  }
  ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecNorm_MPICUDA"
PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
{
  PetscReal      sum,work = 0.0;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr  = VecNorm_SeqCUDA(xin,NORM_2,&work);CHKERRQ(ierr);
    work *= work;
    ierr  = MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
    *z    = PetscSqrtReal(sum);
  } else if (type == NORM_1) {
    /* Find the local part */
    ierr = VecNorm_SeqCUDA(xin,NORM_1,&work);CHKERRQ(ierr);
    /* Find the global sum */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    /* Find the local max */
    ierr = VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);CHKERRQ(ierr);
    /* Find the global max */
    ierr = MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    PetscReal temp[2];
    ierr = VecNorm_SeqCUDA(xin,NORM_1,temp);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUDA(xin,NORM_2,temp+1);CHKERRQ(ierr);
    temp[1] = temp[1]*temp[1];
    ierr = MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
    z[1] = PetscSqrtReal(z[1]);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecDot_MPICUDA"
PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
{
  PetscScalar    sum,work;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecDot_SeqCUDA(xin,yin,&work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  *z   = sum;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecTDot_MPICUDA"
PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
{
  PetscScalar    sum,work;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecTDot_SeqCUDA(xin,yin,&work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  *z   = sum;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecMDot_MPICUDA"
PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
{
  PetscScalar    awork[128],*work = awork;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (nv > 128) {
    ierr = PetscMalloc1(nv,&work);CHKERRQ(ierr);
  }
  ierr = VecMDot_SeqCUDA(xin,nv,y,work);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));CHKERRQ(ierr);
  if (nv > 128) {
    ierr = PetscFree(work);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA

   Options Database Keys:
. -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI()
M*/
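/*
   Illustrative usage sketch (not part of this file's implementation): one way a caller ends up
   with a VECMPICUDA vector.  The helper name CreateCUDAVectorExample and the local size n are
   hypothetical; alternatively, calling VecSetFromOptions() and passing -vec_type mpicuda selects
   the same type at runtime.

     #include <petscvec.h>

     PetscErrorCode CreateCUDAVectorExample(MPI_Comm comm,PetscInt n,Vec *x)
     {
       PetscErrorCode ierr;

       PetscFunctionBegin;
       ierr = VecCreate(comm,x);CHKERRQ(ierr);
       ierr = VecSetSizes(*x,n,PETSC_DETERMINE);CHKERRQ(ierr);
       ierr = VecSetType(*x,VECMPICUDA);CHKERRQ(ierr);
       ierr = VecSet(*x,1.0);CHKERRQ(ierr);
       PetscFunctionReturn(0);
     }
*/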
#undef __FUNCT__
#define __FUNCT__ "VecDuplicate_MPICUDA"
PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
{
  PetscErrorCode ierr;
  Vec_MPI        *vw,*w = (Vec_MPI*)win->data;
  PetscScalar    *array;

  PetscFunctionBegin;
  ierr = VecCreate(PetscObjectComm((PetscObject)win),v);CHKERRQ(ierr);
  ierr = PetscLayoutReference(win->map,&(*v)->map);CHKERRQ(ierr);

  ierr = VecCreate_MPI_Private(*v,PETSC_FALSE,w->nghost,0);CHKERRQ(ierr);
  vw   = (Vec_MPI*)(*v)->data;
  ierr = PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));CHKERRQ(ierr);

  /* save local representation of the parallel vector (and scatter) if it exists */
  if (w->localrep) {
    ierr = VecGetArray(*v,&array);CHKERRQ(ierr);
    ierr = VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);CHKERRQ(ierr);
    ierr = PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));CHKERRQ(ierr);
    ierr = VecRestoreArray(*v,&array);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);CHKERRQ(ierr);

    vw->localupdate = w->localupdate;
    if (vw->localupdate) {
      ierr = PetscObjectReference((PetscObject)vw->localupdate);CHKERRQ(ierr);
    }
  }

  /* New vector should inherit stashing property of parent */
  (*v)->stash.donotstash   = win->stash.donotstash;
  (*v)->stash.ignorenegidx = win->stash.ignorenegidx;

  /* change type_name appropriately */
  ierr = PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);CHKERRQ(ierr);

  ierr = PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);CHKERRQ(ierr);
  ierr = PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);CHKERRQ(ierr);

  (*v)->map->bs   = PetscAbs(win->map->bs);
  (*v)->bstash.bs = win->bstash.bs;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecDotNorm2_MPICUDA"
PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
{
  PetscErrorCode ierr;
  PetscScalar    work[2],sum[2];

  PetscFunctionBegin;
  ierr = VecDotNorm2_SeqCUDA(s,t,work,work+1);CHKERRQ(ierr);
  ierr = MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));CHKERRQ(ierr);
  *dp  = sum[0];
  *nm  = sum[1];
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCreate_MPICUDA"
PETSC_EXTERN PetscErrorCode VecCreate_MPICUDA(Vec vv)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);CHKERRQ(ierr);

  vv->ops->dotnorm2      = VecDotNorm2_MPICUDA;
  vv->ops->waxpy         = VecWAXPY_SeqCUDA;
  vv->ops->duplicate     = VecDuplicate_MPICUDA;
  vv->ops->dot           = VecDot_MPICUDA;
  vv->ops->mdot          = VecMDot_MPICUDA;
  vv->ops->tdot          = VecTDot_MPICUDA;
  vv->ops->norm          = VecNorm_MPICUDA;
  vv->ops->scale         = VecScale_SeqCUDA;
  vv->ops->copy          = VecCopy_SeqCUDA;
  vv->ops->set           = VecSet_SeqCUDA;
  vv->ops->swap          = VecSwap_SeqCUDA;
  vv->ops->axpy          = VecAXPY_SeqCUDA;
  vv->ops->axpby         = VecAXPBY_SeqCUDA;
  vv->ops->maxpy         = VecMAXPY_SeqCUDA;
  vv->ops->aypx          = VecAYPX_SeqCUDA;
  vv->ops->axpbypcz      = VecAXPBYPCZ_SeqCUDA;
  vv->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
  vv->ops->setrandom     = VecSetRandom_SeqCUDA;
  vv->ops->placearray    = VecPlaceArray_SeqCUDA;
  vv->ops->replacearray  = VecReplaceArray_SeqCUDA;
  vv->ops->resetarray    = VecResetArray_SeqCUDA;
  vv->ops->dot_local     = VecDot_SeqCUDA;
  vv->ops->tdot_local    = VecTDot_SeqCUDA;
  vv->ops->norm_local    = VecNorm_SeqCUDA;
  vv->ops->mdot_local      = VecMDot_SeqCUDA;
  vv->ops->destroy         = VecDestroy_MPICUDA;
  vv->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;

  vv->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
  vv->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
  vv->ops->getlocalvectorread     = VecGetLocalVector_SeqCUDA;
  vv->ops->restorelocalvectorread = VecRestoreLocalVector_SeqCUDA;

  ierr = VecCUDAAllocateCheck(vv);CHKERRQ(ierr);
  vv->valid_GPU_array = PETSC_CUDA_GPU;
  ierr = VecSet(vv,0.0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "VecCreate_CUDA"
PETSC_EXTERN PetscErrorCode VecCreate_CUDA(Vec v)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);CHKERRQ(ierr);
  if (size == 1) {
    ierr = VecSetType(v,VECSEQCUDA);CHKERRQ(ierr);
  } else {
    ierr = VecSetType(v,VECMPICUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
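/*
   Illustrative usage sketch (not part of this file's implementation): VecCreate_CUDA() is,
   by PETSc naming convention, presumably registered as the constructor for the generic "cuda"
   type (VECCUDA), so a caller can request a CUDA vector without knowing the communicator size
   and the dispatch above selects VECSEQCUDA or VECMPICUDA.  The global size 100 below is an
   arbitrary example value.

     ierr = VecCreate(PETSC_COMM_WORLD,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(v,VECCUDA);CHKERRQ(ierr);
*/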