Actual source code: ilu.h

/* $Id: ilu.h,v 1.29 2001/08/07 03:01:46 balay Exp $ */
/*
    Kernels used in sparse ILU (and LU) and in the resulting triangular
 solves. These are for block algorithms where the block sizes are on
 the order of 2-6+.

    There are TWO versions of the macros below:
    1) the standard version, for MatScalar == PetscScalar, which uses the
       standard BLAS for block sizes larger than 7, and
    2) handcoded Fortran kernels for single precision matrices, since the
       BLAS does not accept some arguments in single and some in double
       precision.
*/

#if !defined(__ILU_H)
#define __ILU_H

#include "petscblaslapack.h"

/*
      These are C kernels; they are contained in
   src/mat/impls/baij/seq
*/

EXTERN int  LINPACKdgefa(MatScalar *,int,int *);
EXTERN int  LINPACKdgedi(MatScalar *,int,int *,MatScalar*);
EXTERN int  Kernel_A_gets_inverse_A_2(MatScalar *);
EXTERN int  Kernel_A_gets_inverse_A_3(MatScalar *);

/*
      A = inv(A), where A is a 4 by 4 block stored as a dense array of
   length 16; in-place inversion with NO pivoting.  The macro expands to
   "0;" followed by a statement block, so its "value" can be assigned to
   an error-code variable before the block runs.
*/
#define Kernel_A_gets_inverse_A_4_nopivot(mat) 0;\
{\
  MatScalar d, di;\
\
  di = mat[0];\
  mat[0] = d = 1.0 / di;\
  mat[4] *= -d;\
  mat[8] *= -d;\
  mat[12] *= -d;\
  mat[1] *= d;\
  mat[2] *= d;\
  mat[3] *= d;\
  mat[5] += mat[4] * mat[1] * di;\
  mat[6] += mat[4] * mat[2] * di;\
  mat[7] += mat[4] * mat[3] * di;\
  mat[9] += mat[8] * mat[1] * di;\
  mat[10] += mat[8] * mat[2] * di;\
  mat[11] += mat[8] * mat[3] * di;\
  mat[13] += mat[12] * mat[1] * di;\
  mat[14] += mat[12] * mat[2] * di;\
  mat[15] += mat[12] * mat[3] * di;\
  di = mat[5];\
  mat[5] = d = 1.0 / di;\
  mat[1] *= -d;\
  mat[9] *= -d;\
  mat[13] *= -d;\
  mat[4] *= d;\
  mat[6] *= d;\
  mat[7] *= d;\
  mat[0] += mat[1] * mat[4] * di;\
  mat[2] += mat[1] * mat[6] * di;\
  mat[3] += mat[1] * mat[7] * di;\
  mat[8] += mat[9] * mat[4] * di;\
  mat[10] += mat[9] * mat[6] * di;\
  mat[11] += mat[9] * mat[7] * di;\
  mat[12] += mat[13] * mat[4] * di;\
  mat[14] += mat[13] * mat[6] * di;\
  mat[15] += mat[13] * mat[7] * di;\
  di = mat[10];\
  mat[10] = d = 1.0 / di;\
  mat[2] *= -d;\
  mat[6] *= -d;\
  mat[14] *= -d;\
  mat[8] *= d;\
  mat[9] *= d;\
  mat[11] *= d;\
  mat[0] += mat[2] * mat[8] * di;\
  mat[1] += mat[2] * mat[9] * di;\
  mat[3] += mat[2] * mat[11] * di;\
  mat[4] += mat[6] * mat[8] * di;\
  mat[5] += mat[6] * mat[9] * di;\
  mat[7] += mat[6] * mat[11] * di;\
  mat[12] += mat[14] * mat[8] * di;\
  mat[13] += mat[14] * mat[9] * di;\
  mat[15] += mat[14] * mat[11] * di;\
  di = mat[15];\
  mat[15] = d = 1.0 / di;\
  mat[3] *= -d;\
  mat[7] *= -d;\
  mat[11] *= -d;\
  mat[12] *= d;\
  mat[13] *= d;\
  mat[14] *= d;\
  mat[0] += mat[3] * mat[12] * di;\
  mat[1] += mat[3] * mat[13] * di;\
  mat[2] += mat[3] * mat[14] * di;\
  mat[4] += mat[7] * mat[12] * di;\
  mat[5] += mat[7] * mat[13] * di;\
  mat[6] += mat[7] * mat[14] * di;\
  mat[8] += mat[11] * mat[12] * di;\
  mat[9] += mat[11] * mat[13] * di;\
  mat[10] += mat[11] * mat[14] * di;\
}
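
/*
   Illustrative usage sketch (hypothetical helper, not part of the original
   header): invert a 4 by 4 diagonal block in place with the macro above.
   Because the macro expands to "0;" followed by a statement block, its
   "value" can be assigned to an error-code variable before the block runs.
*/
static int sketch_invert_diag_block_4(MatScalar *diag)
{
  int ierr;
  ierr = Kernel_A_gets_inverse_A_4_nopivot(diag);
  return ierr;
}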

EXTERN int  Kernel_A_gets_inverse_A_4(MatScalar *);
# if defined(PETSC_HAVE_SSE)
EXTERN int  Kernel_A_gets_inverse_A_4_SSE(MatScalar *);
# endif

EXTERN int  Kernel_A_gets_inverse_A_5(MatScalar *);
EXTERN int  Kernel_A_gets_inverse_A_6(MatScalar *);
EXTERN int  Kernel_A_gets_inverse_A_7(MatScalar *);


/*
    A = inv(A)    A_gets_inverse_A

   A      - square bs by bs array stored in column major order
   pivots - integer work array of length bs
   W      - work array of length bs
*/
#define Kernel_A_gets_inverse_A(bs,A,pivots,W) \
{ \
  LINPACKdgefa((A),(bs),(pivots)); \
  LINPACKdgedi((A),(bs),(pivots),(W)); \
}
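
/*
   Illustrative usage sketch (hypothetical helper, not part of the original
   header): invert one bs by bs block in place with the LINPACK-based macro
   above.  The workspace sizes follow the comment: pivots has length bs and
   W has length bs.
*/
static void sketch_invert_block(int bs,MatScalar *A,int *pivots,MatScalar *W)
{
  Kernel_A_gets_inverse_A(bs,A,pivots,W);
}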

/* -----------------------------------------------------------------------*/

#if !defined(PETSC_USE_MAT_SINGLE)
/*
        Version that calls the BLAS directly
*/
/*
      A = A * B   A_gets_A_times_B

   A, B - square bs by bs arrays stored in column major order
   W    - square bs by bs work array

*/
#define Kernel_A_gets_A_times_B(bs,A,B,W) \
{ \
  PetscScalar _one = 1.0,_zero = 0.0; \
  int    _ierr; \
  _ierr = PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
  BLgemm_("N","N",&(bs),&(bs),&(bs),&_one,(W),&(bs),(B),&(bs),&_zero,(A),&(bs)); \
}
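
/*
   Illustrative usage sketch (hypothetical helper, not part of the original
   header), assuming the PETSc error macros are available here.  Two points
   the macro relies on are visible: bs must be an int variable, because the
   BLAS call takes its address, and W must be a separate bs*bs work array,
   since A is first copied into W and then overwritten by the product (the
   BLAS gemm output may not alias its inputs).  CHKERRQ inside the macro can
   return early, so the caller returns an int error code.
*/
static int sketch_block_multiply(int bs,MatScalar *A,MatScalar *B,MatScalar *W)
{
  Kernel_A_gets_A_times_B(bs,A,B,W);
  return 0;
}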

/*

    A = A - B * C  A_gets_A_minus_B_times_C

   A, B, C - square bs by bs arrays stored in column major order
*/
#define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
{ \
  PetscScalar _mone = -1.0,_one = 1.0; \
  BLgemm_("N","N",&(bs),&(bs),&(bs),&_mone,(B),&(bs),(C),&(bs),&_one,(A),&(bs)); \
}

/*

    A = A + B^T * C  A_gets_A_plus_Btranspose_times_C

   A, B, C - square bs by bs arrays stored in column major order
*/
#define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
{ \
  PetscScalar _one = 1.0; \
  BLgemm_("T","N",&(bs),&(bs),&(bs),&_one,(B),&(bs),(C),&(bs),&_one,(A),&(bs)); \
}

/*
    v = v + A^T w  v_gets_v_plus_Atranspose_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
{  \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("T",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione); \
}

/*
    v = v - A w  v_gets_v_minus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
{  \
  PetscScalar _mone = -1.0,_one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(bs),&_mone,A,&(bs),w,&_ione,&_one,v,&_ione); \
}

/*
    v = v + A w  v_gets_v_plus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
{  \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),v,&_ione,&_one,v,&_ione); \
}

/*
    w = w + A v   w_gets_w_plus_Ar_times_v

   w - array of length bs
   A - rectangular bs by ncols array stored in column major order
   v - array of length ncols
*/
#define  Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
{  \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(ncols),&_one,A,&(bs),v,&_ione,&_one,w,&_ione); \
}

/*
    w = A v   w_gets_A_times_v

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define Kernel_w_gets_A_times_v(bs,v,A,w) \
{  \
  PetscScalar _zero = 0.0,_one = 1.0; \
  int    _ione = 1; \
  LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),v,&_ione,&_zero,w,&_ione); \
}
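
/*
   Illustrative sketch (hypothetical helper, not part of the original header)
   of how the gemv-style macros above are typically combined in a block
   triangular solve: subtract the contributions of the off-diagonal blocks of
   a row from the current right-hand-side block, then apply the (already
   inverted) diagonal block.  The loop structure and array layout (aj block
   column indices, a packed bs*bs blocks) are assumptions of the sketch, not
   the actual BAIJ solve code.
*/
static void sketch_block_forward_solve_row(int bs,int nblocks,const int *aj,
                                           MatScalar *a,PetscScalar *x,
                                           MatScalar *diag_inv,PetscScalar *t,
                                           PetscScalar *xrow)
{
  int k;
  for (k=0; k<nblocks; k++) {
    /* t = t - A_k * x(aj[k]); A_k is the k-th bs by bs block of this row */
    Kernel_v_gets_v_minus_A_times_w(bs,t,a + k*bs*bs,x + bs*aj[k]);
  }
  /* xrow = inv(D) * t, with diag_inv holding the inverted diagonal block */
  Kernel_w_gets_A_times_v(bs,t,diag_inv,xrow);
}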

/*
        z = A*x   w_gets_Ar_times_v

   A - rectangular bs by ncols array stored in column major order
   x - array of length ncols
   z - array of length bs
*/
#define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
{ \
  PetscScalar _one = 1.0,_zero = 0.0; \
  int    _ione = 1; \
  LAgemv_("N",&bs,&ncols,&_one,A,&bs,x,&_ione,&_zero,z,&_ione); \
}

/*
        z = z + trans(A)*x   w_gets_w_plus_trans_Ar_times_v

   A - rectangular bs by ncols array stored in column major order
   x - array of length bs
   z - array of length ncols
*/
#define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
{ \
  PetscScalar _one = 1.0; \
  int    _ione = 1; \
  LAgemv_("T",&bs,&ncols,&_one,A,&bs,x,&_ione,&_one,z,&_ione); \
}
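
/*
   Illustrative sketch (hypothetical helper, not part of the original header):
   the "Ar" kernels treat a whole block row as one bs by ncols rectangular
   matrix, so a block-row product can be done with a single call instead of
   one call per block.  The pre-gathered work array "work" (the needed x
   entries packed contiguously) is an assumption of this sketch.
*/
static void sketch_block_row_mult(int bs,int nz,MatScalar *arow,
                                  PetscScalar *work,PetscScalar *z)
{
  int ncols = bs*nz;  /* the row holds nz blocks, i.e. bs*nz columns */
  /* z = arow * work, arow stored column major with leading dimension bs */
  Kernel_w_gets_Ar_times_v(bs,ncols,work,arow,z);
}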

#else
/*
       Version that calls Fortran routines; can handle different precision
   of matrix (array) and vectors
*/
/*
     These are Fortran kernels: they replace certain BLAS routines but
   have some arguments that may be single precision, rather than double.
   These routines are provided in src/fortran/kernels/sgemv.F.
   They are pretty pitiful but get the job done. The intention is
   that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
   code is used.
*/
#ifdef PETSC_HAVE_FORTRAN_CAPS
#define msgemv_  MSGEMV
#define msgemvp_ MSGEMVP
#define msgemvm_ MSGEMVM
#define msgemvt_ MSGEMVT
#define msgemmi_ MSGEMMI
#define msgemm_  MSGEMM
#elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
#define msgemv_  msgemv
#define msgemvp_ msgemvp
#define msgemvm_ msgemvm
#define msgemvt_ msgemvt
#define msgemmi_ msgemmi
#define msgemm_  msgemm
#endif
EXTERN_C_BEGIN
EXTERN void msgemv_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemvp_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemvm_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemvt_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
EXTERN void msgemmi_(int *,MatScalar*,MatScalar*,MatScalar*);
EXTERN void msgemm_(int *,MatScalar*,MatScalar*,MatScalar*);
EXTERN_C_END
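
/*
   Illustrative plain-C sketch (not the actual Fortran kernel) of the kind of
   operation the declarations above provide: a matrix-vector product in which
   the matrix entries are MatScalar (possibly single precision) while the
   vectors are PetscScalar.  This is exactly what the standard BLAS cannot
   do, which is why these kernels exist.  The name and the column major,
   leading-dimension-m layout are assumptions of the sketch.
*/
static void sketch_mixed_precision_gemv(int m,int n,const MatScalar *A,
                                        const PetscScalar *v,PetscScalar *w)
{
  int i,j;
  for (i=0; i<m; i++) w[i] = 0.0;
  for (j=0; j<n; j++) {
    for (i=0; i<m; i++) w[i] += ((PetscScalar)A[i+j*m])*v[j];
  }
}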

/*
      A = A * B   A_gets_A_times_B

   A, B - square bs by bs arrays stored in column major order
   W    - square bs by bs work array

*/
#define Kernel_A_gets_A_times_B(bs,A,B,W) \
{ \
  int _ierr; \
  _ierr = PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
  msgemmi_(&bs,A,B,W); \
}

/*

    A = A - B * C  A_gets_A_minus_B_times_C

   A, B, C - square bs by bs arrays stored in column major order
*/
#define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
{ \
  msgemm_(&bs,A,B,C); \
}

/*
    v = v - A w  v_gets_v_minus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
{  \
  msgemvm_(&bs,&bs,A,w,v); \
}

/*
    v = v + A w  v_gets_v_plus_A_times_w

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define  Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
{  \
  msgemvp_(&bs,&bs,A,w,v); \
}

/*
    w = w + A v   w_gets_w_plus_Ar_times_v

   w - array of length bs
   A - rectangular bs by ncol array stored in column major order
   v - array of length ncol
*/
#define  Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
{  \
  msgemvp_(&bs,&ncol,A,v,w); \
}

/*
    w = A v   w_gets_A_times_v

   v - array of length bs
   A - square bs by bs array
   w - array of length bs
*/
#define Kernel_w_gets_A_times_v(bs,v,A,w) \
{  \
  msgemv_(&bs,&bs,A,v,w); \
}

/*
        z = A*x   w_gets_Ar_times_v

   A - rectangular bs by ncols array stored in column major order
   x - array of length ncols
   z - array of length bs
*/
#define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
{ \
  msgemv_(&bs,&ncols,A,x,z); \
}

/*
        z = z + trans(A)*x   w_gets_w_plus_trans_Ar_times_v

   A - rectangular bs by ncols array stored in column major order
   x - array of length bs
   z - array of length ncols
*/
#define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
{ \
  msgemvt_(&bs,&ncols,A,x,z); \
}

/* These do not work yet */
#define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
#define  Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)


#endif

#endif