Actual source code: fnutil.c

slepc-3.16.1 2021-11-17
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Utility subroutines common to several impls
 12: */

 14: #include <slepc/private/fnimpl.h>
 15: #include <slepcblaslapack.h>

 17: /*
 18:    Compute the square root of an upper quasi-triangular matrix T,
 19:    using Higham's algorithm (LAA 88, 1987). T is overwritten with sqrtm(T).
 20:  */
 21: static PetscErrorCode SlepcMatDenseSqrt(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld)
 22: {
 23:   PetscScalar  one=1.0,mone=-1.0;
 24:   PetscReal    scal;
 25:   PetscBLASInt i,j,si,sj,r,ione=1,info;
 26: #if !defined(PETSC_USE_COMPLEX)
 27:   PetscReal    alpha,theta,mu,mu2;
 28: #endif

      /* Sweep the (quasi-)diagonal blocks of T left to right: take the square
         root of each diagonal block (size sj=1, or sj=2 for a 2x2 block of a
         real Schur form), then fill in the off-diagonal blocks of the same
         block column by solving Sylvester equations (Higham's recurrence) */
 31:   for (j=0;j<n;j++) {
 32: #if defined(PETSC_USE_COMPLEX)
 33:     sj = 1;
 34:     T[j+j*ld] = PetscSqrtScalar(T[j+j*ld]);
 35: #else
      /* a nonzero subdiagonal entry T(j+1,j) signals a 2x2 diagonal block */
 36:     sj = (j==n-1 || T[j+1+j*ld] == 0.0)? 1: 2;
 37:     if (sj==1) {
 38:       if (T[j+j*ld]<0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Matrix has a real negative eigenvalue, no real primary square root exists");
 39:       T[j+j*ld] = PetscSqrtReal(T[j+j*ld]);
 40:     } else {
 41:       /* square root of 2x2 block */
      /* closed formula: with theta the mean of the diagonal entries and alpha
         the real part of the square root of the block's complex-conjugate
         eigenvalues, sqrtm(B) = B/(2*alpha) + (alpha - theta/(2*alpha))*I */
 42:       theta = (T[j+j*ld]+T[j+1+(j+1)*ld])/2.0;
 43:       mu    = (T[j+j*ld]-T[j+1+(j+1)*ld])/2.0;
 44:       mu2   = -mu*mu-T[j+1+j*ld]*T[j+(j+1)*ld];
 45:       mu    = PetscSqrtReal(mu2);
 46:       if (theta>0.0) alpha = PetscSqrtReal((theta+PetscSqrtReal(theta*theta+mu2))/2.0);
 47:       else alpha = mu/PetscSqrtReal(2.0*(-theta+PetscSqrtReal(theta*theta+mu2)));
 48:       T[j+j*ld]       /= 2.0*alpha;
 49:       T[j+1+(j+1)*ld] /= 2.0*alpha;
 50:       T[j+(j+1)*ld]   /= 2.0*alpha;
 51:       T[j+1+j*ld]     /= 2.0*alpha;
 52:       T[j+j*ld]       += alpha-theta/(2.0*alpha);
 53:       T[j+1+(j+1)*ld] += alpha-theta/(2.0*alpha);
 54:     }
 55: #endif
      /* update the blocks above the new diagonal block, bottom-up */
 56:     for (i=j-1;i>=0;i--) {
 57: #if defined(PETSC_USE_COMPLEX)
 58:       si = 1;
 59: #else
 60:       si = (i==0 || T[i+(i-1)*ld] == 0.0)? 1: 2;
      /* for a 2x2 block, move i to its top row */
 61:       if (si==2) i--;
 62: #endif
 63:       /* solve Sylvester equation of order si x sj */
      /* gemm forms the right-hand side T_ij - sum of products of the
         already-computed square-root blocks between rows i and column j */
 64:       r = j-i-si;
 65:       if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&si,&sj,&r,&mone,T+i+(i+si)*ld,&ld,T+i+si+j*ld,&ld,&one,T+i+j*ld,&ld));
 66:       PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,&si,&sj,T+i+i*ld,&ld,T+j+j*ld,&ld,T+i+j*ld,&ld,&scal,&info));
 67:       SlepcCheckLapackInfo("trsyl",info);
 68:       if (scal!=1.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Current implementation cannot handle scale factor %g",scal);
 69:     }
      /* skip the second column of a 2x2 diagonal block */
 70:     if (sj==2) j++;
 71:   }
 72:   return(0);
 73: }

 75: #define BLOCKSIZE 64

 77: /*
 78:    Schur method for the square root of an upper quasi-triangular matrix T.
 79:    T is overwritten with sqrtm(T).
 80:    If firstonly then only the first column of T will contain relevant values.
 81:  */
 82: PetscErrorCode FNSqrtmSchur(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool firstonly)
 83: {
 85:   PetscBLASInt   i,j,k,r,ione=1,sdim,lwork,*s,*p,info,bs=BLOCKSIZE;
 86:   PetscScalar    *wr,*W,*Q,*work,one=1.0,zero=0.0,mone=-1.0;
 87:   PetscInt       m,nblk;
 88:   PetscReal      scal;
 89: #if defined(PETSC_USE_COMPLEX)
 90:   PetscReal      *rwork;
 91: #else
 92:   PetscReal      *wi;
 93: #endif

      /* k = number of columns of the result formed in the backtransform;
         nblk is first an upper bound on the number of diagonal blocks */
 96:   m     = n;
 97:   nblk  = (m+bs-1)/bs;
 98:   lwork = 5*n;
 99:   k     = firstonly? 1: n;

101:   /* compute Schur decomposition A*Q = Q*T */
102: #if !defined(PETSC_USE_COMPLEX)
103:   PetscMalloc7(m,&wr,m,&wi,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
104:   PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,wi,Q,&ld,work,&lwork,NULL,&info));
105: #else
106:   PetscMalloc7(m,&wr,m,&rwork,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
107:   PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,Q,&ld,work,&lwork,rwork,NULL,&info));
108: #endif
109:   SlepcCheckLapackInfo("gees",info);

111:   /* determine block sizes and positions, to avoid cutting 2x2 blocks */
      /* s[j] = size of block j, p[j] = index of its first row/column */
112:   j = 0;
113:   p[j] = 0;
114:   do {
115:     s[j] = PetscMin(bs,n-p[j]);
116: #if !defined(PETSC_USE_COMPLEX)
      /* grow the block by one column if its boundary would split a 2x2 block */
117:     if (p[j]+s[j]!=n && T[p[j]+s[j]+(p[j]+s[j]-1)*ld]!=0.0) s[j]++;
118: #endif
119:     if (p[j]+s[j]==n) break;
120:     j++;
121:     p[j] = p[j-1]+s[j-1];
122:   } while (1);
123:   nblk = j+1;

      /* blocked version of the recurrence used in SlepcMatDenseSqrt, with
         diagonal blocks of size s[j] instead of 1 or 2 */
125:   for (j=0;j<nblk;j++) {
126:     /* evaluate f(T_jj) */
127:     SlepcMatDenseSqrt(s[j],T+p[j]+p[j]*ld,ld);
128:     for (i=j-1;i>=0;i--) {
129:       /* solve Sylvester equation for block (i,j) */
      /* r = total size of the blocks strictly between block i and block j */
130:       r = p[j]-p[i]-s[i];
131:       if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",s+i,s+j,&r,&mone,T+p[i]+(p[i]+s[i])*ld,&ld,T+p[i]+s[i]+p[j]*ld,&ld,&one,T+p[i]+p[j]*ld,&ld));
132:       PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,s+i,s+j,T+p[i]+p[i]*ld,&ld,T+p[j]+p[j]*ld,&ld,T+p[i]+p[j]*ld,&ld,&scal,&info));
133:       SlepcCheckLapackInfo("trsyl",info);
134:       if (scal!=1.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Current implementation cannot handle scale factor %g",scal);
135:     }
136:   }

138:   /* backtransform B = Q*T*Q' */
139:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","C",&n,&k,&n,&one,T,&ld,Q,&ld,&zero,W,&ld));
140:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&k,&n,&one,Q,&ld,W,&ld,&zero,T,&ld));

142:   /* flop count: Schur decomposition, triangular square root, and backtransform */
143:   PetscLogFlops(25.0*n*n*n+n*n*n/3.0+4.0*n*n*k);

145: #if !defined(PETSC_USE_COMPLEX)
146:   PetscFree7(wr,wi,W,Q,work,s,p);
147: #else
148:   PetscFree7(wr,rwork,W,Q,work,s,p);
149: #endif
150:   return(0);
151: }

153: #define DBMAXIT 25

155: /*
156:    Computes the principal square root of the matrix T using the product form
157:    of the Denman-Beavers iteration.
158:    T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
159:  */
160: PetscErrorCode FNSqrtmDenmanBeavers(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
161: {
162:   PetscScalar        *Told,*M=NULL,*invM,*work,work1,prod,alpha;
163:   PetscScalar        szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sp25=0.25;
164:   PetscReal          tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT,rwork[1];
165:   PetscBLASInt       N,i,it,*piv=NULL,info,query=-1,lwork;
166:   const PetscBLASInt one=1;
167:   PetscBool          converged=PETSC_FALSE,scale=PETSC_FALSE;
168:   PetscErrorCode     ierr;
169:   unsigned int       ftz;

172:   N = n*n;
173:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
174:   SlepcSetFlushToZero(&ftz);

176:   /* query work size */
177:   PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M,&ld,piv,&work1,&query,&info));
178:   PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
      /* NOTE(review): the n*n workspaces are indexed with stride ld and measured
         with lange using lda=n below, so this routine assumes ld==n -- confirm
         at call sites */
179:   PetscMalloc5(lwork,&work,n,&piv,n*n,&Told,n*n,&M,n*n,&invM);
      /* in the product form of the iteration, M tends to I while T tends to
         sqrtm(A) (or to inv(sqrtm(A)) when starting from I, see below) */
180:   PetscArraycpy(M,T,n*n);

182:   if (inv) {  /* start recurrence with I instead of A */
183:     PetscArrayzero(T,n*n);
184:     for (i=0;i<n;i++) T[i+i*ld] += 1.0;
185:   }

187:   for (it=0;it<DBMAXIT && !converged;it++) {

189:     if (scale) {  /* g = (abs(det(M)))^(-1/(2*n)) */
      /* determinant obtained as the product of the diagonal of the LU factor */
190:       PetscArraycpy(invM,M,n*n);
191:       PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
192:       SlepcCheckLapackInfo("getrf",info);
193:       prod = invM[0];
194:       for (i=1;i<n;i++) prod *= invM[i+i*ld];
195:       detM = PetscAbsScalar(prod);
196:       g = PetscPowReal(detM,-1.0/(2.0*n));
197:       alpha = g;
198:       PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,T,&one));
199:       alpha = g*g;
200:       PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,M,&one));
201:       PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n);
202:     }

204:     PetscArraycpy(Told,T,n*n);
205:     PetscArraycpy(invM,M,n*n);

      /* invM = inv(M) via LU factorization */
207:     PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
208:     SlepcCheckLapackInfo("getrf",info);
209:     PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,invM,&ld,piv,work,&lwork,&info));
210:     SlepcCheckLapackInfo("getri",info);
211:     PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);

      /* T = 0.5*Told*(I + inv(M)); the diagonal shift is undone afterwards */
213:     for (i=0;i<n;i++) invM[i+i*ld] += 1.0;
214:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Told,&ld,invM,&ld,&szero,T,&ld));
215:     for (i=0;i<n;i++) invM[i+i*ld] -= 1.0;

      /* M = 0.25*(M + inv(M)) - 0.5*I, i.e. M_new - I */
217:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&sone,invM,&one,M,&one));
218:     PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sp25,M,&one));
219:     for (i=0;i<n;i++) M[i+i*ld] -= 0.5;
220:     PetscLogFlops(2.0*n*n*n+2.0*n*n);

      /* convergence test on ||M_new - I||_F; then restore M = M_new */
222:     Mres = LAPACKlange_("F",&n,&n,M,&n,rwork);
223:     for (i=0;i<n;i++) M[i+i*ld] += 1.0;

225:     if (scale) {
226:       /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */
227:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,T,&one,Told,&one));
228:       fnormdiff = LAPACKlange_("F",&n,&n,Told,&n,rwork);
229:       fnormT = LAPACKlange_("F",&n,&n,T,&n,rwork);
230:       PetscLogFlops(7.0*n*n);
231:       reldiff = fnormdiff/fnormT;
232:       PetscInfo4(fn,"it: %D reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g);
233:       if (reldiff<1e-2) scale = PETSC_FALSE;  /* Switch off scaling */
234:     }

236:     if (Mres<=tol) converged = PETSC_TRUE;
237:   }

239:   if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",DBMAXIT);
240:   PetscFree5(work,piv,Told,M,invM);
241:   SlepcResetFlushToZero(&ftz);
242:   return(0);
243: }

245: #define NSMAXIT 50

247: /*
248:    Computes the principal square root of the matrix A using the Newton-Schulz iteration.
249:    T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
250:  */
251: PetscErrorCode FNSqrtmNewtonSchulz(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
252: {
      /* coupled iteration: Y converges to sqrtm(A) (stored in place of A),
         Z converges to inv(sqrtm(A)) (copied back when inv is true) */
253:   PetscScalar    *Y=A,*Yold,*Z,*Zold,*M;
254:   PetscScalar    szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
255:   PetscReal      sqrtnrm,tol,Yres=0.0,nrm,rwork[1],done=1.0;
256:   PetscBLASInt   info,i,it,N,one=1,zero=0;
257:   PetscBool      converged=PETSC_FALSE;
259:   unsigned int   ftz;

262:   N = n*n;
263:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
264:   SlepcSetFlushToZero(&ftz);

      /* NOTE(review): workspaces are N=n*n long but indexed with stride ld, and
         lange is called with lda=n, so this routine assumes ld==n -- confirm */
266:   PetscMalloc4(N,&Yold,N,&Z,N,&Zold,N,&M);

268:   /* scale A so that ||I-A|| < 1 */
      /* nrm = ||A - I||_F (computed via the Z workspace); A is divided by nrm
         and the convergence tolerance is relaxed by the same factor */
269:   PetscArraycpy(Z,A,N);
270:   for (i=0;i<n;i++) Z[i+i*ld] -= 1.0;
271:   nrm = LAPACKlange_("fro",&n,&n,Z,&n,rwork);
272:   sqrtnrm = PetscSqrtReal(nrm);
273:   PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,A,&N,&info));
274:   SlepcCheckLapackInfo("lascl",info);
275:   tol *= nrm;
276:   PetscInfo2(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
277:   PetscLogFlops(2.0*n*n);

279:   /* Z = I */
280:   PetscArrayzero(Z,N);
281:   for (i=0;i<n;i++) Z[i+i*ld] = 1.0;

283:   for (it=0;it<NSMAXIT && !converged;it++) {
284:     /* Yold = Y, Zold = Z */
285:     PetscArraycpy(Yold,Y,N);
286:     PetscArraycpy(Zold,Z,N);

288:     /* M = (3*I-Zold*Yold) */
289:     PetscArrayzero(M,N);
290:     for (i=0;i<n;i++) M[i+i*ld] = sthree;
291:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&smone,Zold,&ld,Yold,&ld,&sone,M,&ld));

293:     /* Y = (1/2)*Yold*M, Z = (1/2)*M*Zold */
294:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Yold,&ld,M,&ld,&szero,Y,&ld));
295:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,M,&ld,Zold,&ld,&szero,Z,&ld));

297:     /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
298:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,Y,&one,Yold,&one));
299:     Yres = LAPACKlange_("fro",&n,&n,Yold,&n,rwork);
300:     if (PetscIsNanReal(Yres)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number");
301:     if (Yres<=tol) converged = PETSC_TRUE;
302:     PetscInfo2(fn,"it: %D res: %g\n",it,(double)Yres);

304:     PetscLogFlops(6.0*n*n*n+2.0*n*n);
305:   }

307:   if (Yres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",NSMAXIT);

309:   /* undo scaling */
      /* sqrtm(A) = sqrt(nrm)*sqrtm(A/nrm); for the inverse, divide by sqrt(nrm) */
310:   if (inv) {
311:     PetscArraycpy(A,Z,N);
312:     PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&sqrtnrm,&done,&N,&one,A,&N,&info));
313:   } else PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&done,&sqrtnrm,&N,&one,A,&N,&info));
314:   SlepcCheckLapackInfo("lascl",info);

316:   PetscFree4(Yold,Z,Zold,M);
317:   SlepcResetFlushToZero(&ftz);
318:   return(0);
319: }

321: #if defined(PETSC_HAVE_CUDA)
322: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
323: #include <slepccublas.h>

325: /*
326:  * Matrix square root by Newton-Schulz iteration. CUDA version.
327:  * Computes the principal square root of the matrix T using the
328:  * Newton-Schulz iteration. T is overwritten with sqrtm(T).
329:  */
330: PetscErrorCode FNSqrtmNewtonSchulz_CUDA(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
331: {
      /* d_A plays the role of the Y iterate (-> sqrtm(A)), d_Z of the Z
         iterate (-> inv(sqrtm(A))); same algorithm as FNSqrtmNewtonSchulz */
332:   PetscScalar        *d_A,*d_Yold,*d_Z,*d_Zold,*d_M;
333:   PetscReal          nrm,sqrtnrm;
334:   const PetscScalar  szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
335:   PetscReal          tol,Yres=0.0,alpha;
336:   PetscInt           it;
337:   PetscBLASInt       N;
338:   const PetscBLASInt one=1,zero=0;
339:   PetscBool          converged=PETSC_FALSE;
340:   cublasHandle_t     cublasv2handle;
341:   PetscErrorCode     ierr;
342:   cublasStatus_t     cberr;
343:   cudaError_t        cerr;

346:   PetscCUDAInitializeCheck(); /* For CUDA event timers */
347:   PetscCUBLASGetHandle(&cublasv2handle);
348:   N = n*n;
349:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

351:   cerr = cudaMalloc((void **)&d_A,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
352:   cerr = cudaMalloc((void **)&d_Yold,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
353:   cerr = cudaMalloc((void **)&d_Z,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
354:   cerr = cudaMalloc((void **)&d_Zold,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
355:   cerr = cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);

357:   PetscLogGpuTimeBegin();

359:   /* Y = A; */
360:   cerr = cudaMemcpy(d_A,A,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
361:   /* Z = I; */
      /* NOTE(review): CHKERRQ(cerr) after set_diagonal checks the previous
         cudaError_t, not set_diagonal's return value -- confirm intended */
362:   cerr = cudaMemset(d_Z,zero,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
363:   set_diagonal(n,d_Z,ld,sone);CHKERRQ(cerr);

365:   /* scale A so that ||I-A|| < 1 */
      /* Z currently holds I, so after the axpy d_Z = I - A and the nrm2 over
         all N entries gives nrm = ||I-A||_F; A is then scaled by 1/nrm */
366:   cberr = cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Z,one);CHKERRCUBLAS(cberr);
367:   cberr = cublasXnrm2(cublasv2handle,N,d_Z,one,&nrm);CHKERRCUBLAS(cberr);
368:   sqrtnrm = PetscSqrtReal(nrm);
369:   alpha = 1.0/nrm;
370:   cberr = cublasXscal(cublasv2handle,N,&alpha,d_A,one);CHKERRCUBLAS(cberr);
371:   tol *= nrm;
372:   PetscInfo2(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
373:   PetscLogGpuFlops(2.0*n*n);

375:   /* Z = I; */
376:   cerr = cudaMemset(d_Z,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
377:   set_diagonal(n,d_Z,ld,sone);CHKERRQ(cerr);

379:   for (it=0;it<NSMAXIT && !converged;it++) {
380:     /* Yold = Y, Zold = Z */
381:     cerr = cudaMemcpy(d_Yold,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
382:     cerr = cudaMemcpy(d_Zold,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);

384:     /* M = (3*I - Zold*Yold) */
385:     cerr = cudaMemset(d_M,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
386:     set_diagonal(n,d_M,ld,sthree);CHKERRQ(cerr);
387:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&smone,d_Zold,ld,d_Yold,ld,&sone,d_M,ld);CHKERRCUBLAS(cberr);

389:     /* Y = (1/2) * Yold * M, Z = (1/2) * M * Zold */
390:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Yold,ld,d_M,ld,&szero,d_A,ld);CHKERRCUBLAS(cberr);
391:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_M,ld,d_Zold,ld,&szero,d_Z,ld);CHKERRCUBLAS(cberr);

393:     /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
      /* nrm2 over the N entries of the difference = Frobenius norm of Y-Yold */
394:     cberr = cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Yold,one);CHKERRCUBLAS(cberr);
395:     cberr = cublasXnrm2(cublasv2handle,N,d_Yold,one,&Yres);CHKERRCUBLAS(cberr);
396:     if (PetscIsNanReal(Yres)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number");
397:     if (Yres<=tol) converged = PETSC_TRUE;
398:     PetscInfo2(fn,"it: %D res: %g\n",it,(double)Yres);

400:     PetscLogGpuFlops(6.0*n*n*n+2.0*n*n);
401:   }

403:   if (Yres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations", NSMAXIT);

405:   /* undo scaling */
      /* sqrtm(A) = sqrt(nrm)*sqrtm(A/nrm); the inverse is divided instead */
406:   if (inv) {
407:     sqrtnrm = 1.0/sqrtnrm;
408:     cberr = cublasXscal(cublasv2handle,N,&sqrtnrm,d_Z,one);CHKERRCUBLAS(cberr);
409:     cerr = cudaMemcpy(A,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
410:   } else {
411:     cberr = cublasXscal(cublasv2handle,N,&sqrtnrm,d_A,one);CHKERRCUBLAS(cberr);
412:     cerr = cudaMemcpy(A,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
413:   }

415:   PetscLogGpuTimeEnd();
416:   cerr = cudaFree(d_A);CHKERRCUDA(cerr);
417:   cerr = cudaFree(d_Yold);CHKERRCUDA(cerr);
418:   cerr = cudaFree(d_Z);CHKERRCUDA(cerr);
419:   cerr = cudaFree(d_Zold);CHKERRCUDA(cerr);
420:   cerr = cudaFree(d_M);CHKERRCUDA(cerr);
421:   return(0);
422: }

424: #if defined(PETSC_HAVE_MAGMA)
425: #include <slepcmagma.h>

427: /*
428:  * Matrix square root by product form of Denman-Beavers iteration. CUDA version.
429:  * Computes the principal square root of the matrix T using the product form
430:  * of the Denman-Beavers iteration. T is overwritten with sqrtm(T).
431:  */
432: PetscErrorCode FNSqrtmDenmanBeavers_CUDAm(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
433: {
      /* GPU mirror of FNSqrtmDenmanBeavers: d_M -> I, d_T -> sqrtm(T)
         (or inv(sqrtm(T)) when the recurrence is started from I) */
434:   PetscScalar    *d_T,*d_Told,*d_M,*d_invM,*d_work,zero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sneg_pfive=-0.5,sp25=0.25,alpha;
435:   PetscReal      tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT,prod;
436:   PetscInt       i,it,lwork,nb;
437:   PetscBLASInt   N,one=1,info,*piv=NULL;
438:   PetscBool      converged=PETSC_FALSE,scale=PETSC_FALSE;
439:   cublasHandle_t cublasv2handle;
441:   cublasStatus_t cberr;
442:   cudaError_t    cerr;
443:   magma_int_t    mierr;

446:   PetscCUDAInitializeCheck(); /* For CUDA event timers */
447:   PetscCUBLASGetHandle(&cublasv2handle);
448:   magma_init();
449:   N = n*n;
450:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

452:   /* query work size */
453:   nb = magma_get_xgetri_nb(n);
454:   lwork = nb*n;
455:   PetscMalloc1(n,&piv);
456:   cerr = cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork);CHKERRCUDA(cerr);
457:   cerr = cudaMalloc((void **)&d_T,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
458:   cerr = cudaMalloc((void **)&d_Told,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
459:   cerr = cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
460:   cerr = cudaMalloc((void **)&d_invM,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);

462:   PetscLogGpuTimeBegin();
463:   cerr = cudaMemcpy(d_M,T,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
464:   if (inv) {  /* start recurrence with I instead of A */
465:     cerr = cudaMemset(d_T,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
466:     set_diagonal(n,d_T,ld,1.0);CHKERRQ(cerr);
467:   } else {
468:     cerr = cudaMemcpy(d_T,T,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
469:   }

471:   for (it=0;it<DBMAXIT && !converged;it++) {

473:     if (scale) { /* g = (abs(det(M)))^(-1/(2*n)); */
      /* NOTE(review): mierr is presumably assigned inside the mmagma_* macros
         checked by CHKERRMAGMA -- verify against slepcmagma.h */
474:       cerr = cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
475:       mmagma_xgetrf_gpu(n,n,d_invM,ld,piv,&info);CHKERRMAGMA(mierr);
476:       if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
477:       if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);

479:       /* XXX pending */
      /* determinant of the LU factor computed on the host until the
         mult_diagonal kernel below is available */
480: //      mult_diagonal(d_invM,n,ld,&detM);CHKERRQ(cerr);
481:       cerr = cudaMemcpy(T,d_invM,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
482:       prod = T[0];
483:       for (i=1;i<n;i++) { prod *= T[i+i*ld]; }
484:       detM = PetscAbsReal(prod);
485:       g = PetscPowReal(detM,-1.0/(2.0*n));
486:       alpha = g;
487:       cberr = cublasXscal(cublasv2handle,N,&alpha,d_T,one);CHKERRCUBLAS(cberr);
488:       alpha = g*g;
489:       cberr = cublasXscal(cublasv2handle,N,&alpha,d_M,one);CHKERRCUBLAS(cberr);
490:       PetscLogGpuFlops(2.0*n*n*n/3.0+2.0*n*n);
491:     }

493:     cerr = cudaMemcpy(d_Told,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
494:     cerr = cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);

      /* d_invM = inv(M) via MAGMA LU factorization + inversion */
496:     mmagma_xgetrf_gpu(n,n,d_invM,ld,piv,&info);CHKERRMAGMA(mierr);
497:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
498:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
499:     mmagma_xgetri_gpu(n,d_invM,ld,piv,d_work,lwork,&info);CHKERRMAGMA(mierr);
500:     if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
501:     if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
502:     PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);

      /* T = 0.5*Told*(I + inv(M)); diagonal shift undone afterwards */
504:     shift_diagonal(n,d_invM,ld,sone);CHKERRQ(cerr);
505:     cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Told,ld,d_invM,ld,&zero,d_T,ld);CHKERRCUBLAS(cberr);
506:     shift_diagonal(n,d_invM,ld,smone);CHKERRQ(cerr);

      /* M = 0.25*(M + inv(M)) - 0.5*I, i.e. M_new - I */
508:     cberr = cublasXaxpy(cublasv2handle,N,&sone,d_invM,one,d_M,one);CHKERRCUBLAS(cberr);
509:     cberr = cublasXscal(cublasv2handle,N,&sp25,d_M,one);CHKERRCUBLAS(cberr);
510:     shift_diagonal(n,d_M,ld,sneg_pfive);CHKERRQ(cerr);
511:     PetscLogGpuFlops(2.0*n*n*n+2.0*n*n);

      /* Mres = ||M_new - I||_F; then restore M = M_new */
513:     cberr = cublasXnrm2(cublasv2handle,N,d_M,one,&Mres);CHKERRCUBLAS(cberr);
514:     shift_diagonal(n,d_M,ld,sone);CHKERRQ(cerr);

516:     if (scale) {
517:       // reldiff = norm(T - Told,'fro')/norm(T,'fro');
518:       cberr = cublasXaxpy(cublasv2handle,N,&smone,d_T,one,d_Told,one);CHKERRCUBLAS(cberr);
519:       cberr = cublasXnrm2(cublasv2handle,N,d_Told,one,&fnormdiff);CHKERRCUBLAS(cberr);
520:       cberr = cublasXnrm2(cublasv2handle,N,d_T,one,&fnormT);CHKERRCUBLAS(cberr);
521:       PetscLogGpuFlops(7.0*n*n);
522:       reldiff = fnormdiff/fnormT;
523:       PetscInfo4(fn,"it: %D reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g);
524:       if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch to no scaling. */
525:     }

527:     PetscInfo2(fn,"it: %D Mres: %g\n",it,(double)Mres);
528:     if (Mres<=tol) converged = PETSC_TRUE;
529:   }

531:   if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations", DBMAXIT);
532:   cerr = cudaMemcpy(T,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
533:   PetscLogGpuTimeEnd();
534:   PetscFree(piv);
535:   cerr = cudaFree(d_work);CHKERRCUDA(cerr);
536:   cerr = cudaFree(d_T);CHKERRCUDA(cerr);
537:   cerr = cudaFree(d_Told);CHKERRCUDA(cerr);
538:   cerr = cudaFree(d_M);CHKERRCUDA(cerr);
539:   cerr = cudaFree(d_invM);CHKERRCUDA(cerr);
540:   magma_finalize();
541:   return(0);
542: }
543: #endif /* PETSC_HAVE_MAGMA */

545: #endif /* PETSC_HAVE_CUDA */

547: #define ITMAX 5
548: #define SWAP(a,b,t) {t=a;a=b;b=t;}

550: /*
551:    Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
552: */
553: static PetscErrorCode SlepcNormEst1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscBool *nrm)
554: {
555:   PetscScalar    *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
556:   PetscReal      est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
557:   PetscBLASInt   i,j,t=2,it=0,ind[2],est_j=0,m1;

      /* workspace layout (11*n scalars total): X, Y, Z, S, S_old are n x t
         (t=2) matrices; zvals holds n reals for the row norms of Z */
561:   X = work;
562:   Y = work + 2*n;
563:   Z = work + 4*n;
564:   S = work + 6*n;
565:   S_old = work + 8*n;
566:   zvals = (PetscReal*)(work + 10*n);

      /* column 1 is the uniform vector, column 2 has random +/-1/n entries */
568:   for (i=0;i<n;i++) {  /* X has columns of unit 1-norm */
569:     X[i] = 1.0/n;
570:     PetscRandomGetValue(rand,&val);
571:     if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
572:     else X[i+n] = 1.0/n;
573:   }
574:   for (i=0;i<t*n;i++) S[i] = 0.0;
575:   ind[0] = 0; ind[1] = 0;
576:   est_old = 0;
577:   while (1) {
578:     it++;
579:     for (j=0;j<m;j++) {  /* Y = A^m*X */
580:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
581:       if (j<m-1) SWAP(X,Y,aux);
582:     }
583:     for (j=0;j<t;j++) {  /* vals[j] = norm(Y(:,j),1) */
584:       vals[j] = 0.0;
585:       for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
586:     }
      /* keep the larger column norm in vals[0]; m1 indexes its original column */
587:     if (vals[0]<vals[1]) {
588:       SWAP(vals[0],vals[1],raux);
589:       m1 = 1;
590:     } else m1 = 0;
591:     est = vals[0];
      /* est_j remembers the unit-vector index that produced the best estimate */
592:     if (est>est_old || it==2) est_j = ind[m1];
593:     if (it>=2 && est<=est_old) {
594:       est = est_old;
595:       break;
596:     }
597:     est_old = est;
598:     if (it>ITMAX) break;
599:     SWAP(S,S_old,aux);
600:     for (i=0;i<t*n;i++) {  /* S = sign(Y) */
601:       S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
602:     }
603:     for (j=0;j<m;j++) {  /* Z = (A^T)^m*S */
604:       PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
605:       if (j<m-1) SWAP(S,Z,aux);
606:     }
      /* pick the two rows of Z with largest infinity norm as next unit vectors */
607:     maxzval[0] = -1; maxzval[1] = -1;
608:     ind[0] = 0; ind[1] = 0;
609:     for (i=0;i<n;i++) {  /* zvals[i] = norm(Z(i,:),inf) */
610:       zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
611:       if (zvals[i]>maxzval[0]) {
612:         maxzval[0] = zvals[i];
613:         ind[0] = i;
614:       } else if (zvals[i]>maxzval[1]) {
615:         maxzval[1] = zvals[i];
616:         ind[1] = i;
617:       }
618:     }
619:     if (it>=2 && maxzval[0]==zvals[est_j]) break;
      /* X = unit vectors e_ind[0], e_ind[1] for the next iteration */
620:     for (i=0;i<t*n;i++) X[i] = 0.0;
621:     for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
622:   }
623:   *nrm = est;
624:   /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
625:   PetscLogFlops(4.0*it*m*t*n*n);
626:   return(0);
627: }

629: #define SMALLN 100

631: /*
632:    Estimate norm(A^m,1) (required workspace is 2*n*n)
633: */
634: PetscErrorCode SlepcNormAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
635: {
636:   PetscScalar    *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
637:   PetscReal      rwork[1],tmp;
638:   PetscBLASInt   i,j,one=1;
639:   PetscBool      isrealpos=PETSC_TRUE;

643:   if (n<SMALLN) {   /* compute matrix power explicitly */
644:     if (m==1) {
      /* lange with norm type "O" returns the 1-norm */
645:       *nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
646:       PetscLogFlops(1.0*n*n);
647:     } else {  /* m>=2 */
      /* v = A^m accumulated by repeated multiplication, ping-ponging v and w */
648:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
649:       for (j=0;j<m-2;j++) {
650:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
651:         SWAP(v,w,aux);
652:       }
653:       *nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
654:       PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
655:     }
656:   } else {
      /* check whether A is entrywise real and nonnegative; note the break only
         leaves the inner loop, so remaining rows are still scanned (harmless
         for correctness, just not a full short-circuit) */
657:     for (i=0;i<n;i++)
658:       for (j=0;j<n;j++)
659: #if defined(PETSC_USE_COMPLEX)
660:         if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
661: #else
662:         if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
663: #endif
664:     if (isrealpos) {   /* for positive matrices only */
      /* for nonnegative A, norm(A^m,1) = norm((A^T)^m * ones, inf), which
         needs only m matrix-vector products */
665:       for (i=0;i<n;i++) v[i] = 1.0;
666:       for (j=0;j<m;j++) {  /* w = A'*v */
667:         PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
668:         SWAP(v,w,aux);
669:       }
670:       PetscLogFlops(2.0*n*n*m);
671:       *nrm = 0.0;
672:       for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > *nrm) *nrm = tmp;   /* norm(v,inf) */
673:     } else {
      /* general case: randomized block 1-norm estimator */
674:       SlepcNormEst1(n,A,m,work,rand,nrm);
675:     }
676:   }
677:   return(0);
678: }
678: }