Actual source code: bvcuda.cu
slepc-3.20.2 2024-03-15
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   CUDA-related code common to several BV impls
*/

#include <slepc/private/bvimpl.h>
#include <slepccublas.h>

#define BLOCKSIZE 64
/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_BLAS_CUDA(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_)
{
  PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
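/*
   Illustrative sketch (not part of the original file): in a PETSc build with
   real double-precision scalars, the cublasXgemm dispatch macro above resolves
   to cublasDgemm, so the call is equivalent to the raw cuBLAS invocation
   below. All matrices are column-major with leading dimensions lda/ldb/ldc.
*/
#if 0
  double alpha = 1.0, beta = 0.0;
  /* C (m x n) := alpha * A (m x k) * B (k x n) + beta * C */
  cublasDgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc);
#endif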
/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y)
{
  PetscCuBLASInt n=0,k=0,lda=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
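/*
   Illustrative sketch (assumes a real double-precision build): the cublasXgemv
   call above is the plain cuBLAS matrix-vector product below, where A holds
   the n local rows of the k basis vectors, and the unit strides mean x and y
   are contiguous device vectors.
*/
#if 0
  /* y (n) := alpha * A (n x k) * x (k) + beta * y */
  cublasDgemv(handle,CUBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,1,&beta,d_y,1);
#endif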
/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_BLAS_CUDA(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans)
{
  const PetscScalar *d_B1;
  PetscScalar       *d_work,sone=1.0,szero=0.0;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  cublasOperation_t bt;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(e-s,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  if (PetscUnlikely(btrans)) {
    d_B1 = d_B+s;
    bt   = CUBLAS_OP_C;
  } else {
    d_B1 = d_B+s*ldb;
    bt   = CUBLAS_OP_N;
  }
  /* try to allocate the whole matrix */
  PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
  if (freemem>=lda*n*sizeof(PetscScalar)) {
    PetscCallCUDA(cudaMalloc((void**)&d_work,lda*n*sizeof(PetscScalar)));
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda));
    PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  } else {
    PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
    PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
    PetscCall(PetscCuBLASIntCast(m % bs,&l));
    if (l) {
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l));
      PetscCallCUDA(cudaMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
    for (;l<m;l+=bs) {
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs));
      PetscCallCUDA(cudaMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
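/*
   Note on the fallback branch above (explanatory, not part of the original
   file): when the full lda*n work matrix does not fit in free device memory,
   the m rows of A are updated in row blocks of bs rows, with the remainder
   handled first so every row is processed exactly once. A hypothetical sketch
   of the decomposition, with process_rows standing in for the gemm+memcpy2D
   pair:
*/
#if 0
  int l = m % bs;                        /* leading remainder block, possibly empty */
  if (l) process_rows(0,l);              /* rows 0..l-1 */
  for (; l<m; l+=bs) process_rows(l,bs); /* full blocks of bs rows */
#endif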
/*
    B := alpha*A + beta*B
*/
PetscErrorCode BVAXPY_BLAS_CUDA(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_)
{
  PetscCuBLASInt n=0,k=0,lda=0,ldb=0;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgeam(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
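/*
   Illustrative note (assumes a real double-precision build): cublasXgeam
   resolves to cublasDgeam, which computes C := alpha*op(A) + beta*op(B) in a
   single fused kernel; passing d_B as both B and C yields the in-place update
   B := alpha*A + beta*B, which cuBLAS permits when B and C coincide with the
   same leading dimension and no transposition on B.
*/
#if 0
  /* B (n x k) := alpha*A + beta*B, in place */
  cublasDgeam(handle,CUBLAS_OP_N,CUBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb);
#endif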
/*
    C := A'*B

    C is a CPU array
*/
PetscErrorCode BVDot_BLAS_CUDA(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi)
{
  PetscScalar    *d_work,sone=1.0,szero=0.0,*CC;
  PetscInt       j;
  PetscCuBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  PetscMPIInt    len;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(m_,&m));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  PetscCall(PetscCuBLASIntCast(ldb_,&ldb));
  PetscCall(PetscCuBLASIntCast(ldc_,&ldc));
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  if (mpi) {
    if (ldc==m) {
      PetscCall(BVAllocateWork_Private(bv,m*n));
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
    } else {
      PetscCall(BVAllocateWork_Private(bv,2*m*n));
      CC = bv->work+m*n;
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m));
    }
  } else {
    if (k) {
      PetscCall(BVAllocateWork_Private(bv,m*n));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m));
    }
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
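/*
   Sketch of the parallel reduction pattern above (hypothetical standalone
   code, real double precision assumed, where CUBLAS_OP_T equals CUBLAS_OP_C):
   each rank forms its local block C_local = A'*B on the GPU, copies it to the
   host, and the global Gram matrix is the elementwise sum of the local blocks
   over the BV communicator.
*/
#if 0
  cublasDgemm(handle,CUBLAS_OP_T,CUBLAS_OP_N,m,n,k,&one,d_A,lda,d_B,ldb,&zero,d_work,m);
  cudaMemcpy(work,d_work,m*n*sizeof(double),cudaMemcpyDeviceToHost);
  MPI_Allreduce(work,C,m*n,MPI_DOUBLE,MPI_SUM,comm);  /* sum local contributions */
#endif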
/*
    y := A'*x computed as y' := x'*A

    y is a CPU array; if NULL, bv->buffer is used as a workspace
*/
PetscErrorCode BVDotVec_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi)
{
  PetscScalar    *d_work,szero=0.0,sone=1.0,*yy=y;
  PetscCuBLASInt n=0,k=0,lda=0,one=1;
  PetscMPIInt    len;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscCuBLASIntCast(n_,&n));
  PetscCall(PetscCuBLASIntCast(k_,&k));
  PetscCall(PetscCuBLASIntCast(lda_,&lda));
  if (!y) PetscCall(VecCUDAGetArrayWrite(bv->buffer,&d_work));
  else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
  if (mpi) {
    PetscCall(BVAllocateWork_Private(bv,k));
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,lda,&szero,d_work,one));
      PetscCall(BV_ConjugateCUDAArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,lda,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(bv->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    } else PetscCall(PetscArrayzero(bv->work,k));
    if (!y) {
      PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
      PetscCall(VecGetArray(bv->buffer,&yy));
    } else PetscCallCUDA(cudaFree(d_work));
    PetscCall(PetscMPIIntCast(k,&len));
    PetscCall(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
  } else {
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,lda,&szero,d_work,one));
      PetscCall(BV_ConjugateCUDAArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,lda,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
    }
    if (!y) PetscCall(VecCUDARestoreArrayWrite(bv->buffer,&d_work));
    else {
      PetscCallCUDA(cudaMemcpy(y,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
      PetscCallCUDA(cudaFree(d_work));
    }
  }
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
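/*
   Explanatory note (not part of the original file): the gemm calls above
   compute the row vector y' := x'*A with m=1, so a single kernel produces all
   k dot products against the columns of A. In the complex branch CUBLAS_OP_C
   conjugates x rather than A, which yields conj(A'*x); hence the explicit
   conjugation of the result afterwards. Hypothetical real-precision form:
*/
#if 0
  /* y (1 x k) := x' (1 x n) * A (n x k) */
  cublasDgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,1,k,n,&one,d_x,1,d_A,lda,&zero,d_work,1);
#endif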
/*
    Scale n scalars
*/
PetscErrorCode BVScale_BLAS_CUDA(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha)
{
  PetscCuBLASInt n=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(n_,&n));
  if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallCUDA(cudaMemset(d_A,0,n*sizeof(PetscScalar)));
  else if (alpha != (PetscScalar)1.0) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXscal(cublasv2handle,n,&alpha,d_A,one));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*n));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>

struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};

PetscErrorCode BV_ConjugateCUDAArray(PetscScalar *a,PetscInt n)
{
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s",ex);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscScalar *d_hh,*d_a;
  PetscInt    i;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
    PetscCall(PetscLogGpuTimeBegin());
    d_hh = d_a + j*(bv->nc+bv->m);
    PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
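/*
   Note on the buffer layout assumed here and in the functions below
   (explanatory, not part of the original file): bv->buffer stores the
   coefficients column-wise, each column holding bv->nc constraint entries
   followed by bv->m regular entries, so column j starts at offset
   j*(bv->nc+bv->m) and its first bv->nc+j entries are the ones zeroed above.
*/
#if 0
  PetscScalar *col_j = d_a + j*(bv->nc+bv->m);  /* start of column j */
#endif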
/*
   BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscCuBLASInt idx=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
    d_h = d_c + j*(bv->nc+bv->m);
    PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscScalar *d_h,*a;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&a));
    PetscCall(PetscLogGpuTimeBegin());
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArray(bv->buffer,&a));
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscCuBLASInt    idx=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
    PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
    *sum = PetscRealPart(dot);
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
    PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* pointwise multiplication */
static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) a[x] *= PetscRealPart(b[x]);
}

/* pointwise division */
static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt x;

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  if (x<n) a[x] /= PetscRealPart(b[x]);
}
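/*
   Note on the xcount argument (explanatory, not part of the original file):
   a single launch can index at most gridDim.x*blockDim.x elements, so for
   larger n the kernel is launched several times, and launch i covers the
   slice starting at i*gridDim.x*blockDim.x; SlepcKernelSetGrid1D() returns
   the required number of launches as xcount, as used below in
   BV_ApplySignature_CUDA. Hypothetical host-side pattern:
*/
#if 0
  PetscCall(SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&xcount));
  for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_a,d_b,n);
#endif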
/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d,threads3d;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
  if (!h) {
    PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
    PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
    PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
    PetscCall(PetscLogGpuTimeBegin());
    if (inverse) {
      for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
    } else {
      for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
    }
    PetscCallCUDA(cudaGetLastError());
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
    PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
  } else {
    PetscCall(VecGetArrayRead(bv->omega,&omega));
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    PetscCall(VecRestoreArrayRead(bv->omega,&omega));
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  const PetscScalar *d_h;
  PetscScalar       hh;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(BV_SafeSqrt(bv,hh,beta));
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
  } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  const PetscScalar *d_h,*d_a;
  PetscInt          i;

  PetscFunctionBegin;
  if (!h) {
    PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
    PetscCall(PetscLogGpuTimeBegin());
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}