Actual source code: ilu.h
1: /* $Id: ilu.h,v 1.29 2001/08/07 03:01:46 balay Exp $ */
2: /*
3: Kernels used in sparse ILU (and LU) and in the resulting triangular
4: solves. These are for block algorithms where the block sizes are on
5: the order of 2-6+.
7: There are TWO versions of the macros below.
8: 1) standard for MatScalar == PetscScalar use the standard BLAS for
9: block size larger than 7 and
10: 2) handcoded Fortran single precision for the matrices, since BLAS
11: does not have some arguments in single and some in double.
13: */
17: #include petscblaslapack.h
19: /*
20: These are C kernels,they are contained in
21: src/mat/impls/baij/seq
22: */
24: EXTERN int LINPACKdgefa(MatScalar *,int,int *);
25: EXTERN int LINPACKdgedi(MatScalar *,int,int *,MatScalar*);
26: EXTERN int Kernel_A_gets_inverse_A_2(MatScalar *);
27: EXTERN int Kernel_A_gets_inverse_A_3(MatScalar *);
29: #define Kernel_A_gets_inverse_A_4_nopivot(mat) 0;
30: {
31: MatScalar d, di;
32:
33: di = mat[0];
34: mat[0] = d = 1.0 / di;
35: mat[4] *= -d;
36: mat[8] *= -d;
37: mat[12] *= -d;
38: mat[1] *= d;
39: mat[2] *= d;
40: mat[3] *= d;
41: mat[5] += mat[4] * mat[1] * di;
42: mat[6] += mat[4] * mat[2] * di;
43: mat[7] += mat[4] * mat[3] * di;
44: mat[9] += mat[8] * mat[1] * di;
45: mat[10] += mat[8] * mat[2] * di;
46: mat[11] += mat[8] * mat[3] * di;
47: mat[13] += mat[12] * mat[1] * di;
48: mat[14] += mat[12] * mat[2] * di;
49: mat[15] += mat[12] * mat[3] * di;
50: di = mat[5];
51: mat[5] = d = 1.0 / di;
52: mat[1] *= -d;
53: mat[9] *= -d;
54: mat[13] *= -d;
55: mat[4] *= d;
56: mat[6] *= d;
57: mat[7] *= d;
58: mat[0] += mat[1] * mat[4] * di;
59: mat[2] += mat[1] * mat[6] * di;
60: mat[3] += mat[1] * mat[7] * di;
61: mat[8] += mat[9] * mat[4] * di;
62: mat[10] += mat[9] * mat[6] * di;
63: mat[11] += mat[9] * mat[7] * di;
64: mat[12] += mat[13] * mat[4] * di;
65: mat[14] += mat[13] * mat[6] * di;
66: mat[15] += mat[13] * mat[7] * di;
67: di = mat[10];
68: mat[10] = d = 1.0 / di;
69: mat[2] *= -d;
70: mat[6] *= -d;
71: mat[14] *= -d;
72: mat[8] *= d;
73: mat[9] *= d;
74: mat[11] *= d;
75: mat[0] += mat[2] * mat[8] * di;
76: mat[1] += mat[2] * mat[9] * di;
77: mat[3] += mat[2] * mat[11] * di;
78: mat[4] += mat[6] * mat[8] * di;
79: mat[5] += mat[6] * mat[9] * di;
80: mat[7] += mat[6] * mat[11] * di;
81: mat[12] += mat[14] * mat[8] * di;
82: mat[13] += mat[14] * mat[9] * di;
83: mat[15] += mat[14] * mat[11] * di;
84: di = mat[15];
85: mat[15] = d = 1.0 / di;
86: mat[3] *= -d;
87: mat[7] *= -d;
88: mat[11] *= -d;
89: mat[12] *= d;
90: mat[13] *= d;
91: mat[14] *= d;
92: mat[0] += mat[3] * mat[12] * di;
93: mat[1] += mat[3] * mat[13] * di;
94: mat[2] += mat[3] * mat[14] * di;
95: mat[4] += mat[7] * mat[12] * di;
96: mat[5] += mat[7] * mat[13] * di;
97: mat[6] += mat[7] * mat[14] * di;
98: mat[8] += mat[11] * mat[12] * di;
99: mat[9] += mat[11] * mat[13] * di;
100: mat[10] += mat[11] * mat[14] * di;
101: }
103: EXTERN int Kernel_A_gets_inverse_A_4(MatScalar *);
104: # if defined(PETSC_HAVE_SSE)
105: EXTERN int Kernel_A_gets_inverse_A_4_SSE(MatScalar *);
106: # endif
108: EXTERN int Kernel_A_gets_inverse_A_5(MatScalar *);
109: EXTERN int Kernel_A_gets_inverse_A_6(MatScalar *);
110: EXTERN int Kernel_A_gets_inverse_A_7(MatScalar *);
113: /*
114: A = inv(A) A_gets_inverse_A
116: A - square bs by bs array stored in column major order
117: pivots - integer work array of length bs
118: W - bs by 1 work array
119: */
120: #define Kernel_A_gets_inverse_A(bs,A,pivots,W)
121: {
122: LINPACKdgefa((A),(bs),(pivots));
123: LINPACKdgedi((A),(bs),(pivots),(W));
124: }
126: /* -----------------------------------------------------------------------*/
128: #if !defined(PETSC_USE_MAT_SINGLE)
129: /*
130: Version that calls the BLAS directly
131: */
132: /*
133: A = A * B A_gets_A_times_B
135: A, B - square bs by bs arrays stored in column major order
136: W - square bs by bs work array
138: */
139: #define Kernel_A_gets_A_times_B(bs,A,B,W)
140: {
141: PetscScalar _one = 1.0,_zero = 0.0;
142: int _ierr;
143: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr);
144: BLgemm_("N","N",&(bs),&(bs),&(bs),&_one,(W),&(bs),(B),&(bs),&_zero,(A),&(bs));
145: }
147: /*
149: A = A - B * C A_gets_A_minus_B_times_C
151: A, B, C - square bs by bs arrays stored in column major order
152: */
153: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C)
154: {
155: PetscScalar _mone = -1.0,_one = 1.0;
156: BLgemm_("N","N",&(bs),&(bs),&(bs),&_mone,(B),&(bs),(C),&(bs),&_one,(A),&(bs));
157: }
159: /*
161: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
163: A, B, C - square bs by bs arrays stored in column major order
164: */
165: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
166: {
167: PetscScalar _one = 1.0;
168: BLgemm_("T","N",&(bs),&(bs),&(bs),&_one,(B),&(bs),(C),&(bs),&_one,(A),&(bs));
169: }
171: /*
172: v = v + A^T w v_gets_v_plus_Atranspose_times_w
174: v - array of length bs
175: A - square bs by bs array
176: w - array of length bs
177: */
178: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
179: {
180: PetscScalar _one = 1.0;
181: int _ione = 1;
182: LAgemv_("T",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione);
183: }
185: /*
186: v = v - A w v_gets_v_minus_A_times_w
188: v - array of length bs
189: A - square bs by bs array
190: w - array of length bs
191: */
192: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w)
193: {
194: PetscScalar _mone = -1.0,_one = 1.0;
195: int _ione = 1;
196: LAgemv_("N",&(bs),&(bs),&_mone,A,&(bs),w,&_ione,&_one,v,&_ione);
197: }
199: /*
200: v = v + A w v_gets_v_plus_A_times_w
202: v - array of length bs
203: A - square bs by bs array
204: w - array of length bs
205: */
206: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w)
207: {
208: PetscScalar _one = 1.0;
209: int _ione = 1;
210: LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),w,&_ione,&_one,v,&_ione);
211: }
213: /*
214: v = v + A w v_gets_v_plus_Ar_times_w
216: v - array of length bs
217: A - square bs by bs array
218: w - array of length bs
219: */
220: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w)
221: {
222: PetscScalar _one = 1.0;
223: int _ione = 1;
224: LAgemv_("N",&(bs),&(ncols),&_one,A,&(bs),v,&_ione,&_one,w,&_ione);
225: }
227: /*
228: w = A v w_gets_A_times_v
230: v - array of length bs
231: A - square bs by bs array
232: w - array of length bs
233: */
234: #define Kernel_w_gets_A_times_v(bs,v,A,w)
235: {
236: PetscScalar _zero = 0.0,_one = 1.0;
237: int _ione = 1;
238: LAgemv_("N",&(bs),&(bs),&_one,A,&(bs),v,&_ione,&_zero,w,&_ione);
239: }
241: /*
242: z = A*x
243: */
244: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z)
245: {
246: PetscScalar _one = 1.0,_zero = 0.0;
247: int _ione = 1;
248: LAgemv_("N",&bs,&ncols,&_one,A,&bs,x,&_ione,&_zero,z,&_ione);
249: }
251: /*
252: z = trans(A)*x
253: */
254: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z)
255: {
256: PetscScalar _one = 1.0;
257: int _ione = 1;
258: LAgemv_("T",&bs,&ncols,&_one,A,&bs,x,&_ione,&_one,z,&_ione);
259: }
261: #else
262: /*
263: Version that calls Fortran routines; can handle different precision
264: of matrix (array) and vectors
265: */
266: /*
267: These are Fortran kernels: They replace certain BLAS routines but
268: have some arguments that may be single precision,rather than double
269: These routines are provided in src/fortran/kernels/sgemv.F
270: They are pretty pitiful but get the job done. The intention is
271: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
272: code is used.
273: */
274: #ifdef PETSC_HAVE_FORTRAN_CAPS
275: #define msgemv_ MSGEMV
276: #define msgemvp_ MSGEMVP
277: #define msgemvm_ MSGEMVM
278: #define msgemvt_ MSGEMVT
279: #define msgemmi_ MSGEMMI
280: #define msgemm_ MSGEMM
281: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
282: #define msgemv_ msgemv
283: #define msgemvp_ msgemvp
284: #define msgemvm_ msgemvm
285: #define msgemvt_ msgemvt
286: #define msgemmi_ msgemmi
287: #define msgemm_ msgemm
288: #endif
289: EXTERN_C_BEGIN
290: EXTERN void msgemv_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
291: EXTERN void msgemvp_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
292: EXTERN void msgemvm_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
293: EXTERN void msgemvt_(int *,int *,MatScalar*,PetscScalar*,PetscScalar*);
294: EXTERN void msgemmi_(int *,MatScalar*,MatScalar*,MatScalar*);
295: EXTERN void msgemm_(int *,MatScalar*,MatScalar*,MatScalar*);
296: EXTERN_C_END
298: /*
299: A = A * B A_gets_A_times_B
301: A, B - square bs by bs arrays stored in column major order
302: W - square bs by bs work array
304: */
305: #define Kernel_A_gets_A_times_B(bs,A,B,W)
306: {
307: int _ierr;
308: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr);
309: msgemmi_(&bs,A,B,W);
310: }
312: /*
314: A = A - B * C A_gets_A_minus_B_times_C
316: A, B, C - square bs by bs arrays stored in column major order
317: */
318: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C)
319: {
320: msgemm_(&bs,A,B,C);
321: }
323: /*
324: v = v - A w v_gets_v_minus_A_times_w
326: v - array of length bs
327: A - square bs by bs array
328: w - array of length bs
329: */
330: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w)
331: {
332: msgemvm_(&bs,&bs,A,w,v);
333: }
335: /*
336: v = v + A w v_gets_v_plus_A_times_w
338: v - array of length bs
339: A - square bs by bs array
340: w - array of length bs
341: */
342: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w)
343: {
344: msgemvp_(&bs,&bs,A,w,v);
345: }
347: /*
348: v = v + A w v_gets_v_plus_Ar_times_w
350: v - array of length bs
351: A - square bs by bs array
352: w - array of length bs
353: */
354: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w)
355: {
356: msgemvp_(&bs,&ncol,A,v,w);
357: }
359: /*
360: w = A v w_gets_A_times_v
362: v - array of length bs
363: A - square bs by bs array
364: w - array of length bs
365: */
366: #define Kernel_w_gets_A_times_v(bs,v,A,w)
367: {
368: msgemv_(&bs,&bs,A,v,w);
369: }
370:
371: /*
372: z = A*x
373: */
374: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z)
375: {
376: msgemv_(&bs,&ncols,A,x,z);
377: }
379: /*
380: z = trans(A)*x
381: */
382: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z)
383: {
384: msgemvt_(&bs,&ncols,A,x,z);
385: }
387: /* These do not work yet */
388: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
389: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
392: #endif
394: #endif