Actual source code: aijnode.c

  1: /*$Id: aijnode.c,v 1.128 2001/08/07 03:02:47 balay Exp $*/
  2: /*
  3:   This file provides high performance routines for the AIJ (compressed row)
  4:   format by taking advantage of rows with identical nonzero structure (I-nodes).
  5: */
 6:  #include src/mat/impls/aij/seq/aij.h
 7:  #include src/vec/vecimpl.h

  9: EXTERN int Mat_AIJ_CheckInode(Mat,PetscTruth);
 10: EXTERN int MatSolve_SeqAIJ_Inode(Mat,Vec,Vec);
 11: EXTERN int MatLUFactorNumeric_SeqAIJ_Inode(Mat,Mat *);

 13: EXTERN int MatMult_SeqAIJ(Mat,Vec,Vec);
 14: EXTERN int MatMultAdd_SeqAIJ(Mat,Vec,Vec,Vec);
 15: EXTERN int MatSolve_SeqAIJ(Mat,Vec,Vec);
 16: EXTERN int MatLUFactorNumeric_SeqAIJ(Mat,Mat*);
 17: EXTERN int MatGetRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
 18: EXTERN int MatRestoreRowIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
 19: EXTERN int MatGetColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);
 20: EXTERN int MatRestoreColumnIJ_SeqAIJ(Mat,int,PetscTruth,int*,int**,int**,PetscTruth*);

 22: #undef __FUNCT__  
 24: static int Mat_AIJ_CreateColInode(Mat A,int* size,int ** ns)
 25: {
 26:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 27:   int        ierr,i,count,m,n,min_mn,*ns_row,*ns_col;

 30:   n      = A->n;
 31:   m      = A->m;
 32:   ns_row = a->inode.size;
 33: 
 34:   min_mn = (m < n) ? m : n;
 35:   if (!ns) {
 36:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 37:     for(; count+1 < n; count++,i++);
 38:     if (count < n)  {
 39:       i++;
 40:     }
 41:     *size = i;
 42:     return(0);
 43:   }
 44:   PetscMalloc((n+1)*sizeof(int),&ns_col);
 45: 
 46:   /* Use the same row structure wherever feasible. */
 47:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 48:     ns_col[i] = ns_row[i];
 49:   }

 51:   /* if m < n; pad up the remainder with inode_limit */
 52:   for(; count+1 < n; count++,i++) {
 53:     ns_col[i] = 1;
 54:   }
 55:   /* The last node is the odd ball. padd it up with the remaining rows; */
 56:   if (count < n)  {
 57:     ns_col[i] = n - count;
 58:     i++;
 59:   } else if (count > n) {
 60:     /* Adjust for the over estimation */
 61:     ns_col[i-1] += n - count;
 62:   }
 63:   *size = i;
 64:   *ns   = ns_col;
 65:   return(0);
 66: }


 69: /*
 70:       This builds symmetric version of nonzero structure,
 71: */
 72: #undef __FUNCT__  
 74: static int MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
 75: {
 76:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
 77:   int        *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n,ierr;
 78:   int        *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 81:   nslim_row = a->inode.node_count;
 82:   m         = A->m;
 83:   n         = A->n;
 84:   if (m != n) SETERRQ(1,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix shoul be square");
 85: 
 86:   /* Use the row_inode as column_inode */
 87:   nslim_col = nslim_row;
 88:   ns_col    = ns_row;

 90:   /* allocate space for reformated inode structure */
 91:   PetscMalloc((nslim_col+1)*sizeof(int),&tns);
 92:   PetscMalloc((n+1)*sizeof(int),&tvc);
 93:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 95:   for (i1=0,col=0; i1<nslim_col; ++i1){
 96:     nsz = ns_col[i1];
 97:     for (i2=0; i2<nsz; ++i2,++col)
 98:       tvc[col] = i1;
 99:   }
100:   /* allocate space for row pointers */
101:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
102:   *iia = ia;
103:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
104:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

106:   /* determine the number of columns in each row */
107:   ia[0] = oshift;
108:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

110:     j    = aj + ai[row] + ishift;
111:     jmax = aj + ai[row+1] + ishift;
112:     i2   = 0;
113:     col  = *j++ + ishift;
114:     i2   = tvc[col];
115:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
116:       ia[i1+1]++;
117:       ia[i2+1]++;
118:       i2++;                     /* Start col of next node */
119:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
120:       i2 = tvc[col];
121:     }
122:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
123:   }

125:   /* shift ia[i] to point to next row */
126:   for (i1=1; i1<nslim_row+1; i1++) {
127:     row        = ia[i1-1];
128:     ia[i1]    += row;
129:     work[i1-1] = row - oshift;
130:   }

132:   /* allocate space for column pointers */
133:   nz   = ia[nslim_row] + (!ishift);
134:   PetscMalloc(nz*sizeof(int),&ja);
135:   *jja = ja;

137:  /* loop over lower triangular part putting into ja */
138:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
139:     j    = aj + ai[row] + ishift;
140:     jmax = aj + ai[row+1] + ishift;
141:     i2   = 0;                     /* Col inode index */
142:     col  = *j++ + ishift;
143:     i2   = tvc[col];
144:     while (i2<i1 && j<jmax) {
145:       ja[work[i2]++] = i1 + oshift;
146:       ja[work[i1]++] = i2 + oshift;
147:       ++i2;
148:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
149:       i2 = tvc[col];
150:     }
151:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

153:   }
154:   PetscFree(work);
155:   PetscFree(tns);
156:   PetscFree(tvc);
157:   return(0);
158: }

160: /*
161:       This builds nonsymmetric version of nonzero structure,
162: */
163: #undef __FUNCT__  
165: static int MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
166: {
167:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
168:   int        *work,*ia,*ja,*j,nz,nslim_row,n,row,col,ierr,*ns_col,nslim_col;
169:   int        *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

172:   nslim_row = a->inode.node_count;
173:   n         = A->n;

175:   /* Create The column_inode for this matrix */
176:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
177: 
178:   /* allocate space for reformated column_inode structure */
179:   PetscMalloc((nslim_col +1)*sizeof(int),&tns);
180:   PetscMalloc((n +1)*sizeof(int),&tvc);
181:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

183:   for (i1=0,col=0; i1<nslim_col; ++i1){
184:     nsz = ns_col[i1];
185:     for (i2=0; i2<nsz; ++i2,++col)
186:       tvc[col] = i1;
187:   }
188:   /* allocate space for row pointers */
189:   PetscMalloc((nslim_row+1)*sizeof(int),&ia);
190:   *iia = ia;
191:   PetscMemzero(ia,(nslim_row+1)*sizeof(int));
192:   PetscMalloc((nslim_row+1)*sizeof(int),&work);

194:   /* determine the number of columns in each row */
195:   ia[0] = oshift;
196:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
197:     j   = aj + ai[row] + ishift;
198:     col = *j++ + ishift;
199:     i2  = tvc[col];
200:     nz  = ai[row+1] - ai[row];
201:     while (nz-- > 0) {           /* off-diagonal elemets */
202:       ia[i1+1]++;
203:       i2++;                     /* Start col of next node */
204:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
205:       i2 = tvc[col];
206:     }
207:   }

209:   /* shift ia[i] to point to next row */
210:   for (i1=1; i1<nslim_row+1; i1++) {
211:     row        = ia[i1-1];
212:     ia[i1]    += row;
213:     work[i1-1] = row - oshift;
214:   }

216:   /* allocate space for column pointers */
217:   nz   = ia[nslim_row] + (!ishift);
218:   PetscMalloc(nz*sizeof(int),&ja);
219:   *jja = ja;

221:  /* loop over matrix putting into ja */
222:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
223:     j   = aj + ai[row] + ishift;
224:     i2  = 0;                     /* Col inode index */
225:     col = *j++ + ishift;
226:     i2  = tvc[col];
227:     nz  = ai[row+1] - ai[row];
228:     while (nz-- > 0) {
229:       ja[work[i1]++] = i2 + oshift;
230:       ++i2;
231:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
232:       i2 = tvc[col];
233:     }
234:   }
235:   PetscFree(ns_col);
236:   PetscFree(work);
237:   PetscFree(tns);
238:   PetscFree(tvc);
239:   return(0);
240: }

242: #undef __FUNCT__  
244: static int MatGetRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
245: {
246:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
247:   int        ierr,ishift;

250:   *n     = a->inode.node_count;
251:   if (!ia) return(0);

253:   ishift = a->indexshift;
254:   if (symmetric) {
255:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
256:   } else {
257:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
258:   }
259:   return(0);
260: }

262: #undef __FUNCT__  
264: static int MatRestoreRowIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
265: {

269:   if (!ia) return(0);
270:   PetscFree(*ia);
271:   PetscFree(*ja);
272:   return(0);
273: }

275: /* ----------------------------------------------------------- */

277: #undef __FUNCT__  
279: static int MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,int **iia,int **jja,int ishift,int oshift)
280: {
281:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
282:   int *work,*ia,*ja,*j,nz,nslim_row, n,row,col,ierr,*ns_col,nslim_col;
283:   int *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

286:   nslim_row = a->inode.node_count;
287:   n         = A->n;

289:   /* Create The column_inode for this matrix */
290:   Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
291: 
292:   /* allocate space for reformated column_inode structure */
293:   PetscMalloc((nslim_col + 1)*sizeof(int),&tns);
294:   PetscMalloc((n + 1)*sizeof(int),&tvc);
295:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

297:   for (i1=0,col=0; i1<nslim_col; ++i1){
298:     nsz = ns_col[i1];
299:     for (i2=0; i2<nsz; ++i2,++col)
300:       tvc[col] = i1;
301:   }
302:   /* allocate space for column pointers */
303:   PetscMalloc((nslim_col+1)*sizeof(int),&ia);
304:   *iia = ia;
305:   PetscMemzero(ia,(nslim_col+1)*sizeof(int));
306:   PetscMalloc((nslim_col+1)*sizeof(int),&work);

308:   /* determine the number of columns in each row */
309:   ia[0] = oshift;
310:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
311:     j   = aj + ai[row] + ishift;
312:     col = *j++ + ishift;
313:     i2  = tvc[col];
314:     nz  = ai[row+1] - ai[row];
315:     while (nz-- > 0) {           /* off-diagonal elemets */
316:       /* ia[i1+1]++; */
317:       ia[i2+1]++;
318:       i2++;
319:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
320:       i2 = tvc[col];
321:     }
322:   }

324:   /* shift ia[i] to point to next col */
325:   for (i1=1; i1<nslim_col+1; i1++) {
326:     col        = ia[i1-1];
327:     ia[i1]    += col;
328:     work[i1-1] = col - oshift;
329:   }

331:   /* allocate space for column pointers */
332:   nz   = ia[nslim_col] + (!ishift);
333:   PetscMalloc(nz*sizeof(int),&ja);
334:   *jja = ja;

336:  /* loop over matrix putting into ja */
337:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
338:     j   = aj + ai[row] + ishift;
339:     i2  = 0;                     /* Col inode index */
340:     col = *j++ + ishift;
341:     i2  = tvc[col];
342:     nz  = ai[row+1] - ai[row];
343:     while (nz-- > 0) {
344:       /* ja[work[i1]++] = i2 + oshift; */
345:       ja[work[i2]++] = i1 + oshift;
346:       i2++;
347:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
348:       i2 = tvc[col];
349:     }
350:   }
351:   PetscFree(ns_col);
352:   PetscFree(work);
353:   PetscFree(tns);
354:   PetscFree(tvc);
355:   return(0);
356: }

358: #undef __FUNCT__  
360: static int MatGetColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
361: {
362:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
363:   int        ierr,ishift;

366:   Mat_AIJ_CreateColInode(A,n,PETSC_NULL);
367:   if (!ia) return(0);

369:   ishift = a->indexshift;
370:   if (symmetric) {
371:     /* Since the indices are symmetric it does'nt matter */
372:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,ishift,oshift);
373:   } else {
374:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,ishift,oshift);
375:   }
376:   return(0);
377: }

379: #undef __FUNCT__  
381: static int MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,int oshift,PetscTruth symmetric,int *n,int **ia,int **ja,PetscTruth *done)
382: {

386:   if (!ia) return(0);
387:   PetscFree(*ia);
388:   PetscFree(*ja);
389:   return(0);
390: }

392: /* ----------------------------------------------------------- */

394: #undef __FUNCT__  
396: static int MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
397: {
398:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
399:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
400:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y;
401:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
402:   int          shift = a->indexshift;
403: 
404: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
405: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
406: #endif

409:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
410:   node_max = a->inode.node_count;
411:   ns       = a->inode.size;     /* Node Size array */
412:   VecGetArray(xx,&x);
413:   VecGetArray(yy,&y);
414:   x    = x + shift;             /* shift for Fortran start by 1 indexing */
415:   idx  = a->j;
416:   v1   = a->a;
417:   ii   = a->i;

419:   for (i = 0,row = 0; i< node_max; ++i){
420:     nsz  = ns[i];
421:     n    = ii[1] - ii[0];
422:     ii  += nsz;
423:     sz   = n;                   /*No of non zeros in this row */
424:                                 /* Switch on the size of Node */
425:     switch (nsz){               /* Each loop in 'case' is unrolled */
426:     case 1 :
427:       sum1  = 0;
428: 
429:       for(n = 0; n< sz-1; n+=2) {
430:         i1   = idx[0];          /* The instructions are ordered to */
431:         i2   = idx[1];          /* make the compiler's job easy */
432:         idx += 2;
433:         tmp0 = x[i1];
434:         tmp1 = x[i2];
435:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436:        }
437: 
438:       if (n == sz-1){          /* Take care of the last nonzero  */
439:         tmp0  = x[*idx++];
440:         sum1 += *v1++ * tmp0;
441:       }
442:       y[row++]=sum1;
443:       break;
444:     case 2:
445:       sum1  = 0;
446:       sum2  = 0;
447:       v2    = v1 + n;
448: 
449:       for (n = 0; n< sz-1; n+=2) {
450:         i1   = idx[0];
451:         i2   = idx[1];
452:         idx += 2;
453:         tmp0 = x[i1];
454:         tmp1 = x[i2];
455:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457:       }
458:       if (n == sz-1){
459:         tmp0  = x[*idx++];
460:         sum1 += *v1++ * tmp0;
461:         sum2 += *v2++ * tmp0;
462:       }
463:       y[row++]=sum1;
464:       y[row++]=sum2;
465:       v1      =v2;              /* Since the next block to be processed starts there*/
466:       idx    +=sz;
467:       break;
468:     case 3:
469:       sum1  = 0;
470:       sum2  = 0;
471:       sum3  = 0;
472:       v2    = v1 + n;
473:       v3    = v2 + n;
474: 
475:       for (n = 0; n< sz-1; n+=2) {
476:         i1   = idx[0];
477:         i2   = idx[1];
478:         idx += 2;
479:         tmp0 = x[i1];
480:         tmp1 = x[i2];
481:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484:       }
485:       if (n == sz-1){
486:         tmp0  = x[*idx++];
487:         sum1 += *v1++ * tmp0;
488:         sum2 += *v2++ * tmp0;
489:         sum3 += *v3++ * tmp0;
490:       }
491:       y[row++]=sum1;
492:       y[row++]=sum2;
493:       y[row++]=sum3;
494:       v1       =v3;             /* Since the next block to be processed starts there*/
495:       idx     +=2*sz;
496:       break;
497:     case 4:
498:       sum1  = 0;
499:       sum2  = 0;
500:       sum3  = 0;
501:       sum4  = 0;
502:       v2    = v1 + n;
503:       v3    = v2 + n;
504:       v4    = v3 + n;
505: 
506:       for (n = 0; n< sz-1; n+=2) {
507:         i1   = idx[0];
508:         i2   = idx[1];
509:         idx += 2;
510:         tmp0 = x[i1];
511:         tmp1 = x[i2];
512:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516:       }
517:       if (n == sz-1){
518:         tmp0  = x[*idx++];
519:         sum1 += *v1++ * tmp0;
520:         sum2 += *v2++ * tmp0;
521:         sum3 += *v3++ * tmp0;
522:         sum4 += *v4++ * tmp0;
523:       }
524:       y[row++]=sum1;
525:       y[row++]=sum2;
526:       y[row++]=sum3;
527:       y[row++]=sum4;
528:       v1      =v4;              /* Since the next block to be processed starts there*/
529:       idx    +=3*sz;
530:       break;
531:     case 5:
532:       sum1  = 0;
533:       sum2  = 0;
534:       sum3  = 0;
535:       sum4  = 0;
536:       sum5  = 0;
537:       v2    = v1 + n;
538:       v3    = v2 + n;
539:       v4    = v3 + n;
540:       v5    = v4 + n;
541: 
542:       for (n = 0; n<sz-1; n+=2) {
543:         i1   = idx[0];
544:         i2   = idx[1];
545:         idx += 2;
546:         tmp0 = x[i1];
547:         tmp1 = x[i2];
548:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553:       }
554:       if (n == sz-1){
555:         tmp0  = x[*idx++];
556:         sum1 += *v1++ * tmp0;
557:         sum2 += *v2++ * tmp0;
558:         sum3 += *v3++ * tmp0;
559:         sum4 += *v4++ * tmp0;
560:         sum5 += *v5++ * tmp0;
561:       }
562:       y[row++]=sum1;
563:       y[row++]=sum2;
564:       y[row++]=sum3;
565:       y[row++]=sum4;
566:       y[row++]=sum5;
567:       v1      =v5;       /* Since the next block to be processed starts there */
568:       idx    +=4*sz;
569:       break;
570:     default :
571:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
572:     }
573:   }
574:   VecRestoreArray(xx,&x);
575:   VecRestoreArray(yy,&y);
576:   PetscLogFlops(2*a->nz - A->m);
577:   return(0);
578: }
579: /* ----------------------------------------------------------- */
580: /* Almost same code as the MatMult_SeqAij_Inode() */
581: #undef __FUNCT__  
583: static int MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584: {
585:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
586:   PetscScalar  sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587:   PetscScalar  *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
588:   int          ierr,*idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
589:   int          shift = a->indexshift;
590: 
592:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
593:   node_max = a->inode.node_count;
594:   ns       = a->inode.size;     /* Node Size array */
595:   VecGetArray(xx,&x);
596:   VecGetArray(yy,&y);
597:   if (zz != yy) {
598:     VecGetArray(zz,&z);
599:   } else {
600:     z = y;
601:   }
602:   zt = z;

604:   x    = x + shift;             /* shift for Fortran start by 1 indexing */
605:   idx  = a->j;
606:   v1   = a->a;
607:   ii   = a->i;

609:   for (i = 0,row = 0; i< node_max; ++i){
610:     nsz  = ns[i];
611:     n    = ii[1] - ii[0];
612:     ii  += nsz;
613:     sz   = n;                   /*No of non zeros in this row */
614:                                 /* Switch on the size of Node */
615:     switch (nsz){               /* Each loop in 'case' is unrolled */
616:     case 1 :
617:       sum1  = *zt++;
618: 
619:       for(n = 0; n< sz-1; n+=2) {
620:         i1   = idx[0];          /* The instructions are ordered to */
621:         i2   = idx[1];          /* make the compiler's job easy */
622:         idx += 2;
623:         tmp0 = x[i1];
624:         tmp1 = x[i2];
625:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
626:        }
627: 
628:       if(n   == sz-1){          /* Take care of the last nonzero  */
629:         tmp0  = x[*idx++];
630:         sum1 += *v1++ * tmp0;
631:       }
632:       y[row++]=sum1;
633:       break;
634:     case 2:
635:       sum1  = *zt++;
636:       sum2  = *zt++;
637:       v2    = v1 + n;
638: 
639:       for(n = 0; n< sz-1; n+=2) {
640:         i1   = idx[0];
641:         i2   = idx[1];
642:         idx += 2;
643:         tmp0 = x[i1];
644:         tmp1 = x[i2];
645:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
646:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
647:       }
648:       if(n   == sz-1){
649:         tmp0  = x[*idx++];
650:         sum1 += *v1++ * tmp0;
651:         sum2 += *v2++ * tmp0;
652:       }
653:       y[row++]=sum1;
654:       y[row++]=sum2;
655:       v1      =v2;              /* Since the next block to be processed starts there*/
656:       idx    +=sz;
657:       break;
658:     case 3:
659:       sum1  = *zt++;
660:       sum2  = *zt++;
661:       sum3  = *zt++;
662:       v2    = v1 + n;
663:       v3    = v2 + n;
664: 
665:       for (n = 0; n< sz-1; n+=2) {
666:         i1   = idx[0];
667:         i2   = idx[1];
668:         idx += 2;
669:         tmp0 = x[i1];
670:         tmp1 = x[i2];
671:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
672:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
673:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
674:       }
675:       if (n == sz-1){
676:         tmp0  = x[*idx++];
677:         sum1 += *v1++ * tmp0;
678:         sum2 += *v2++ * tmp0;
679:         sum3 += *v3++ * tmp0;
680:       }
681:       y[row++]=sum1;
682:       y[row++]=sum2;
683:       y[row++]=sum3;
684:       v1       =v3;             /* Since the next block to be processed starts there*/
685:       idx     +=2*sz;
686:       break;
687:     case 4:
688:       sum1  = *zt++;
689:       sum2  = *zt++;
690:       sum3  = *zt++;
691:       sum4  = *zt++;
692:       v2    = v1 + n;
693:       v3    = v2 + n;
694:       v4    = v3 + n;
695: 
696:       for (n = 0; n< sz-1; n+=2) {
697:         i1   = idx[0];
698:         i2   = idx[1];
699:         idx += 2;
700:         tmp0 = x[i1];
701:         tmp1 = x[i2];
702:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
703:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
704:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
705:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
706:       }
707:       if (n == sz-1){
708:         tmp0  = x[*idx++];
709:         sum1 += *v1++ * tmp0;
710:         sum2 += *v2++ * tmp0;
711:         sum3 += *v3++ * tmp0;
712:         sum4 += *v4++ * tmp0;
713:       }
714:       y[row++]=sum1;
715:       y[row++]=sum2;
716:       y[row++]=sum3;
717:       y[row++]=sum4;
718:       v1      =v4;              /* Since the next block to be processed starts there*/
719:       idx    +=3*sz;
720:       break;
721:     case 5:
722:       sum1  = *zt++;
723:       sum2  = *zt++;
724:       sum3  = *zt++;
725:       sum4  = *zt++;
726:       sum5  = *zt++;
727:       v2    = v1 + n;
728:       v3    = v2 + n;
729:       v4    = v3 + n;
730:       v5    = v4 + n;
731: 
732:       for (n = 0; n<sz-1; n+=2) {
733:         i1   = idx[0];
734:         i2   = idx[1];
735:         idx += 2;
736:         tmp0 = x[i1];
737:         tmp1 = x[i2];
738:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
739:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
740:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
741:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
742:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
743:       }
744:       if(n   == sz-1){
745:         tmp0  = x[*idx++];
746:         sum1 += *v1++ * tmp0;
747:         sum2 += *v2++ * tmp0;
748:         sum3 += *v3++ * tmp0;
749:         sum4 += *v4++ * tmp0;
750:         sum5 += *v5++ * tmp0;
751:       }
752:       y[row++]=sum1;
753:       y[row++]=sum2;
754:       y[row++]=sum3;
755:       y[row++]=sum4;
756:       y[row++]=sum5;
757:       v1      =v5;       /* Since the next block to be processed starts there */
758:       idx    +=4*sz;
759:       break;
760:     default :
761:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
762:     }
763:   }
764:   VecRestoreArray(xx,&x);
765:   VecRestoreArray(yy,&y);
766:   if (zz != yy) {
767:     VecRestoreArray(zz,&z);
768:   }
769:   PetscLogFlops(2*a->nz);
770:   return(0);
771: }
772: /* ----------------------------------------------------------- */
773: EXTERN int MatColoringPatch_SeqAIJ_Inode(Mat,int,int,int *,ISColoring *);

775: /*
776:     samestructure indicates that the matrix has not changed its nonzero structure so we 
777:     do not need to recompute the inodes 
778: */
779: #undef __FUNCT__  
781: int Mat_AIJ_CheckInode(Mat A,PetscTruth samestructure)
782: {
783:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
784:   int        ierr,i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
785:   PetscTruth flag,flg;

788:   if (samestructure && a->inode.checked) return(0);

790:   a->inode.checked = PETSC_TRUE;

792:   /* Notes: We set a->inode.limit=5 in MatCreateSeqAIJ(). */
793:   if (!a->inode.use) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODESn"); return(0);}
794:   PetscOptionsHasName(A->prefix,"-mat_aij_no_inode",&flg);
795:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_aij_no_inoden");return(0);}
796:   PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
797:   if (flg) {PetscLogInfo(A,"Mat_AIJ_CheckInode: Not using Inode routines due to -mat_no_unrolln");return(0);}
798:   PetscOptionsGetInt(A->prefix,"-mat_aij_inode_limit",&a->inode.limit,PETSC_NULL);
799:   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
800:   m = A->m;
801:   if (a->inode.size) {ns = a->inode.size;}
802:   else {PetscMalloc((m+1)*sizeof(int),&ns);}

804:   i          = 0;
805:   node_count = 0;
806:   idx        = a->j;
807:   ii         = a->i;
808:   while (i < m){                /* For each row */
809:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
810:     /* Limits the number of elements in a node to 'a->inode.limit' */
811:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
812:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
813:       if (nzy != nzx) break;
814:       idy  += nzx;             /* Same nonzero pattern */
815:       PetscMemcmp(idx,idy,nzx*sizeof(int),&flag);
816:       if (flag == PETSC_FALSE) break;
817:     }
818:     ns[node_count++] = blk_size;
819:     idx += blk_size*nzx;
820:     i    = j;
821:   }
822:   /* If not enough inodes found,, do not use inode version of the routines */
823:   if (!a->inode.size && m && node_count > 0.9*m) {
824:     PetscFree(ns);
825:     A->ops->mult            = MatMult_SeqAIJ;
826:     A->ops->multadd         = MatMultAdd_SeqAIJ;
827:     A->ops->solve           = MatSolve_SeqAIJ;
828:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ;
829:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ;
830:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ;
831:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ;
832:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
833:     A->ops->coloringpatch   = 0;
834:     a->inode.node_count     = 0;
835:     a->inode.size           = 0;
836:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes out of %d rows. Not using Inode routinesn",node_count,m);
837:   } else {
838:     A->ops->mult            = MatMult_SeqAIJ_Inode;
839:     A->ops->multadd         = MatMultAdd_SeqAIJ_Inode;
840:     A->ops->solve           = MatSolve_SeqAIJ_Inode;
841:     A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode;
842:     A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
843:     A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
844:     A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
845:     A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
846:     A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
847:     a->inode.node_count     = node_count;
848:     a->inode.size           = ns;
849:     PetscLogInfo(A,"Mat_AIJ_CheckInode: Found %d nodes of %d. Limit used: %d. Using Inode routinesn",node_count,m,a->inode.limit);
850:   }
851:   return(0);
852: }

854: /* ----------------------------------------------------------- */
855: #undef __FUNCT__  
857: int MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
858: {
859:   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
860:   IS          iscol = a->col,isrow = a->row;
861:   int         *r,*c,ierr,i,j,n = A->m,*ai = a->i,nz,shift = a->indexshift,*a_j = a->j;
862:   int         node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
863:   PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
864:   PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

867:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
868:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
869:   node_max = a->inode.node_count;
870:   ns       = a->inode.size;     /* Node Size array */

872:   VecGetArray(bb,&b);
873:   VecGetArray(xx,&x);
874:   tmp  = a->solve_work;
875: 
876:   ISGetIndices(isrow,&rout); r = rout;
877:   ISGetIndices(iscol,&cout); c = cout + (n-1);
878: 
879:   /* forward solve the lower triangular */
880:   tmps = tmp + shift;
881:   aa   = a_a +shift;
882:   aj   = a_j + shift;
883:   ad   = a->diag;

885:   for (i = 0,row = 0; i< node_max; ++i){
886:     nsz = ns[i];
887:     aii = ai[row];
888:     v1  = aa + aii;
889:     vi  = aj + aii;
890:     nz  = ad[row]- aii;
891: 
892:     switch (nsz){               /* Each loop in 'case' is unrolled */
893:     case 1 :
894:       sum1 = b[*r++];
895:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
896:       for(j=0; j<nz-1; j+=2){
897:         i0   = vi[0];
898:         i1   = vi[1];
899:         vi  +=2;
900:         tmp0 = tmps[i0];
901:         tmp1 = tmps[i1];
902:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
903:       }
904:       if(j == nz-1){
905:         tmp0 = tmps[*vi++];
906:         sum1 -= *v1++ *tmp0;
907:       }
908:       tmp[row ++]=sum1;
909:       break;
910:     case 2:
911:       sum1 = b[*r++];
912:       sum2 = b[*r++];
913:       v2   = aa + ai[row+1];

915:       for(j=0; j<nz-1; j+=2){
916:         i0   = vi[0];
917:         i1   = vi[1];
918:         vi  +=2;
919:         tmp0 = tmps[i0];
920:         tmp1 = tmps[i1];
921:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
922:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
923:       }
924:       if(j == nz-1){
925:         tmp0 = tmps[*vi++];
926:         sum1 -= *v1++ *tmp0;
927:         sum2 -= *v2++ *tmp0;
928:       }
929:       sum2 -= *v2++ * sum1;
930:       tmp[row ++]=sum1;
931:       tmp[row ++]=sum2;
932:       break;
933:     case 3:
934:       sum1 = b[*r++];
935:       sum2 = b[*r++];
936:       sum3 = b[*r++];
937:       v2   = aa + ai[row+1];
938:       v3   = aa + ai[row+2];
939: 
940:       for (j=0; j<nz-1; j+=2){
941:         i0   = vi[0];
942:         i1   = vi[1];
943:         vi  +=2;
944:         tmp0 = tmps[i0];
945:         tmp1 = tmps[i1];
946:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
947:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
948:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
949:       }
950:       if (j == nz-1){
951:         tmp0 = tmps[*vi++];
952:         sum1 -= *v1++ *tmp0;
953:         sum2 -= *v2++ *tmp0;
954:         sum3 -= *v3++ *tmp0;
955:       }
956:       sum2 -= *v2++ * sum1;
957:       sum3 -= *v3++ * sum1;
958:       sum3 -= *v3++ * sum2;
959:       tmp[row ++]=sum1;
960:       tmp[row ++]=sum2;
961:       tmp[row ++]=sum3;
962:       break;
963: 
964:     case 4:
965:       sum1 = b[*r++];
966:       sum2 = b[*r++];
967:       sum3 = b[*r++];
968:       sum4 = b[*r++];
969:       v2   = aa + ai[row+1];
970:       v3   = aa + ai[row+2];
971:       v4   = aa + ai[row+3];
972: 
973:       for (j=0; j<nz-1; j+=2){
974:         i0   = vi[0];
975:         i1   = vi[1];
976:         vi  +=2;
977:         tmp0 = tmps[i0];
978:         tmp1 = tmps[i1];
979:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
980:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
981:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
982:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
983:       }
984:       if (j == nz-1){
985:         tmp0 = tmps[*vi++];
986:         sum1 -= *v1++ *tmp0;
987:         sum2 -= *v2++ *tmp0;
988:         sum3 -= *v3++ *tmp0;
989:         sum4 -= *v4++ *tmp0;
990:       }
991:       sum2 -= *v2++ * sum1;
992:       sum3 -= *v3++ * sum1;
993:       sum4 -= *v4++ * sum1;
994:       sum3 -= *v3++ * sum2;
995:       sum4 -= *v4++ * sum2;
996:       sum4 -= *v4++ * sum3;
997: 
998:       tmp[row ++]=sum1;
999:       tmp[row ++]=sum2;
1000:       tmp[row ++]=sum3;
1001:       tmp[row ++]=sum4;
1002:       break;
1003:     case 5:
1004:       sum1 = b[*r++];
1005:       sum2 = b[*r++];
1006:       sum3 = b[*r++];
1007:       sum4 = b[*r++];
1008:       sum5 = b[*r++];
1009:       v2   = aa + ai[row+1];
1010:       v3   = aa + ai[row+2];
1011:       v4   = aa + ai[row+3];
1012:       v5   = aa + ai[row+4];
1013: 
1014:       for (j=0; j<nz-1; j+=2){
1015:         i0   = vi[0];
1016:         i1   = vi[1];
1017:         vi  +=2;
1018:         tmp0 = tmps[i0];
1019:         tmp1 = tmps[i1];
1020:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1021:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1022:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1023:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1024:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1025:       }
1026:       if (j == nz-1){
1027:         tmp0 = tmps[*vi++];
1028:         sum1 -= *v1++ *tmp0;
1029:         sum2 -= *v2++ *tmp0;
1030:         sum3 -= *v3++ *tmp0;
1031:         sum4 -= *v4++ *tmp0;
1032:         sum5 -= *v5++ *tmp0;
1033:       }

1035:       sum2 -= *v2++ * sum1;
1036:       sum3 -= *v3++ * sum1;
1037:       sum4 -= *v4++ * sum1;
1038:       sum5 -= *v5++ * sum1;
1039:       sum3 -= *v3++ * sum2;
1040:       sum4 -= *v4++ * sum2;
1041:       sum5 -= *v5++ * sum2;
1042:       sum4 -= *v4++ * sum3;
1043:       sum5 -= *v5++ * sum3;
1044:       sum5 -= *v5++ * sum4;
1045: 
1046:       tmp[row ++]=sum1;
1047:       tmp[row ++]=sum2;
1048:       tmp[row ++]=sum3;
1049:       tmp[row ++]=sum4;
1050:       tmp[row ++]=sum5;
1051:       break;
1052:     default:
1053:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1054:     }
1055:   }
1056:   /* backward solve the upper triangular */
1057:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1058:     nsz = ns[i];
1059:     aii = ai[row+1] -1;
1060:     v1  = aa + aii;
1061:     vi  = aj + aii;
1062:     nz  = aii- ad[row];
1063:     switch (nsz){               /* Each loop in 'case' is unrolled */
1064:     case 1 :
1065:       sum1 = tmp[row];

1067:       for(j=nz ; j>1; j-=2){
1068:         i0   = vi[0];
1069:         i1   = vi[-1];
1070:         vi  -=2;
1071:         tmp0 = tmps[i0];
1072:         tmp1 = tmps[i1];
1073:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1074:       }
1075:       if (j==1){
1076:         tmp0  = tmps[*vi--];
1077:         sum1 -= *v1-- * tmp0;
1078:       }
1079:       x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1080:       break;
1081:     case 2 :
1082:       sum1 = tmp[row];
1083:       sum2 = tmp[row -1];
1084:       v2   = aa + ai[row]-1;
1085:       for (j=nz ; j>1; j-=2){
1086:         i0   = vi[0];
1087:         i1   = vi[-1];
1088:         vi  -=2;
1089:         tmp0 = tmps[i0];
1090:         tmp1 = tmps[i1];
1091:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1092:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1093:       }
1094:       if (j==1){
1095:         tmp0  = tmps[*vi--];
1096:         sum1 -= *v1-- * tmp0;
1097:         sum2 -= *v2-- * tmp0;
1098:       }
1099: 
1100:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1101:       sum2   -= *v2-- * tmp0;
1102:       x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1103:       break;
1104:     case 3 :
1105:       sum1 = tmp[row];
1106:       sum2 = tmp[row -1];
1107:       sum3 = tmp[row -2];
1108:       v2   = aa + ai[row]-1;
1109:       v3   = aa + ai[row -1]-1;
1110:       for (j=nz ; j>1; j-=2){
1111:         i0   = vi[0];
1112:         i1   = vi[-1];
1113:         vi  -=2;
1114:         tmp0 = tmps[i0];
1115:         tmp1 = tmps[i1];
1116:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1117:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1118:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1119:       }
1120:       if (j==1){
1121:         tmp0  = tmps[*vi--];
1122:         sum1 -= *v1-- * tmp0;
1123:         sum2 -= *v2-- * tmp0;
1124:         sum3 -= *v3-- * tmp0;
1125:       }
1126:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1127:       sum2   -= *v2-- * tmp0;
1128:       sum3   -= *v3-- * tmp0;
1129:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1130:       sum3   -= *v3-- * tmp0;
1131:       x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1132: 
1133:       break;
1134:     case 4 :
1135:       sum1 = tmp[row];
1136:       sum2 = tmp[row -1];
1137:       sum3 = tmp[row -2];
1138:       sum4 = tmp[row -3];
1139:       v2   = aa + ai[row]-1;
1140:       v3   = aa + ai[row -1]-1;
1141:       v4   = aa + ai[row -2]-1;

1143:       for (j=nz ; j>1; j-=2){
1144:         i0   = vi[0];
1145:         i1   = vi[-1];
1146:         vi  -=2;
1147:         tmp0 = tmps[i0];
1148:         tmp1 = tmps[i1];
1149:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1150:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1151:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1152:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1153:       }
1154:       if (j==1){
1155:         tmp0  = tmps[*vi--];
1156:         sum1 -= *v1-- * tmp0;
1157:         sum2 -= *v2-- * tmp0;
1158:         sum3 -= *v3-- * tmp0;
1159:         sum4 -= *v4-- * tmp0;
1160:       }

1162:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1163:       sum2   -= *v2-- * tmp0;
1164:       sum3   -= *v3-- * tmp0;
1165:       sum4   -= *v4-- * tmp0;
1166:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1167:       sum3   -= *v3-- * tmp0;
1168:       sum4   -= *v4-- * tmp0;
1169:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1170:       sum4   -= *v4-- * tmp0;
1171:       x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1172:       break;
1173:     case 5 :
1174:       sum1 = tmp[row];
1175:       sum2 = tmp[row -1];
1176:       sum3 = tmp[row -2];
1177:       sum4 = tmp[row -3];
1178:       sum5 = tmp[row -4];
1179:       v2   = aa + ai[row]-1;
1180:       v3   = aa + ai[row -1]-1;
1181:       v4   = aa + ai[row -2]-1;
1182:       v5   = aa + ai[row -3]-1;
1183:       for (j=nz ; j>1; j-=2){
1184:         i0   = vi[0];
1185:         i1   = vi[-1];
1186:         vi  -=2;
1187:         tmp0 = tmps[i0];
1188:         tmp1 = tmps[i1];
1189:         sum1 -= v1[0] * tmp0 + v1[-1] * tmp1; v1 -= 2;
1190:         sum2 -= v2[0] * tmp0 + v2[-1] * tmp1; v2 -= 2;
1191:         sum3 -= v3[0] * tmp0 + v3[-1] * tmp1; v3 -= 2;
1192:         sum4 -= v4[0] * tmp0 + v4[-1] * tmp1; v4 -= 2;
1193:         sum5 -= v5[0] * tmp0 + v5[-1] * tmp1; v5 -= 2;
1194:       }
1195:       if (j==1){
1196:         tmp0  = tmps[*vi--];
1197:         sum1 -= *v1-- * tmp0;
1198:         sum2 -= *v2-- * tmp0;
1199:         sum3 -= *v3-- * tmp0;
1200:         sum4 -= *v4-- * tmp0;
1201:         sum5 -= *v5-- * tmp0;
1202:       }

1204:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]+shift]; row--;
1205:       sum2   -= *v2-- * tmp0;
1206:       sum3   -= *v3-- * tmp0;
1207:       sum4   -= *v4-- * tmp0;
1208:       sum5   -= *v5-- * tmp0;
1209:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]+shift]; row--;
1210:       sum3   -= *v3-- * tmp0;
1211:       sum4   -= *v4-- * tmp0;
1212:       sum5   -= *v5-- * tmp0;
1213:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]+shift]; row--;
1214:       sum4   -= *v4-- * tmp0;
1215:       sum5   -= *v5-- * tmp0;
1216:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]+shift]; row--;
1217:       sum5   -= *v5-- * tmp0;
1218:       x[*c--] = tmp[row] = sum5*a_a[ad[row]+shift]; row--;
1219:       break;
1220:     default:
1221:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1222:     }
1223:   }
1224:   ISRestoreIndices(isrow,&rout);
1225:   ISRestoreIndices(iscol,&cout);
1226:   VecRestoreArray(bb,&b);
1227:   VecRestoreArray(xx,&x);
1228:   PetscLogFlops(2*a->nz - A->n);
1229:   return(0);
1230: }


1233: #undef __FUNCT__  
1235: int MatLUFactorNumeric_SeqAIJ_Inode(Mat A,Mat *B)
1236: {
1237:   Mat          C = *B;
1238:   Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ *)C->data;
1239:   IS           iscol = b->col,isrow = b->row,isicol = b->icol;
1240:   int          shift = a->indexshift,*r,*ic,*c,ierr,n = A->m,*bi = b->i;
1241:   int          *bj = b->j+shift,*nbj=b->j +(!shift),*ajtmp,*bjtmp,nz,row,prow;
1242:   int          *ics,i,j,idx,*ai = a->i,*aj = a->j+shift,*bd = b->diag,node_max,nsz;
1243:   int          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj,ndamp = 0;
1244:   PetscScalar  *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1245:   PetscScalar  tmp,*ba = b->a+shift,*aa = a->a+shift,*pv,*rtmps1,*rtmps2,*rtmps3;
1246:   PetscReal    damping = b->lu_damping,zeropivot = b->lu_zeropivot;
1247:   PetscTruth   damp = PETSC_FALSE;

1250:   ierr   = ISGetIndices(isrow,&r);
1251:   ierr   = ISGetIndices(iscol,&c);
1252:   ierr   = ISGetIndices(isicol,&ic);
1253:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1254:   ierr   = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1255:   ics    = ic + shift; rtmps1 = rtmp1 + shift;
1256:   rtmp2  = rtmp1 + n;  rtmps2 = rtmp2 + shift;
1257:   rtmp3  = rtmp2 + n;  rtmps3 = rtmp3 + shift;
1258: 
1259:   node_max = a->inode.node_count;
1260:   ns       = a->inode.size ;
1261:   if (!ns){
1262:     SETERRQ(1,"Matrix without inode information");
1263:   }

1265:   /* If max inode size > 3, split it into two inodes.*/
1266:   /* also map the inode sizes according to the ordering */
1267:   PetscMalloc((n+1)* sizeof(int),&tmp_vec1);
1268:   for (i=0,j=0; i<node_max; ++i,++j){
1269:     if (ns[i]>3) {
1270:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1271:       ++j;
1272:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1273:     } else {
1274:       tmp_vec1[j] = ns[i];
1275:     }
1276:   }
1277:   /* Use the correct node_max */
1278:   node_max = j;

1280:   /* Now reorder the inode info based on mat re-ordering info */
1281:   /* First create a row -> inode_size_array_index map */
1282:   PetscMalloc(n*sizeof(int)+1,&nsmap);
1283:   PetscMalloc(node_max*sizeof(int)+1,&tmp_vec2);
1284:   for (i=0,row=0; i<node_max; i++) {
1285:     nsz = tmp_vec1[i];
1286:     for (j=0; j<nsz; j++,row++) {
1287:       nsmap[row] = i;
1288:     }
1289:   }
1290:   /* Using nsmap, create a reordered ns structure */
1291:   for (i=0,j=0; i< node_max; i++) {
1292:     nsz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1293:     tmp_vec2[i] = nsz;
1294:     j        += nsz;
1295:   }
1296:   PetscFree(nsmap);
1297:   PetscFree(tmp_vec1);
1298:   /* Now use the correct ns */
1299:   ns = tmp_vec2;


1302:   do {
1303:     /* Now loop over each block-row, and do the factorization */
1304:     for (i=0,row=0; i<node_max; i++) {
1305:       nsz   = ns[i];
1306:       nz    = bi[row+1] - bi[row];
1307:       bjtmp = bj + bi[row];
1308: 
1309:       switch (nsz){
1310:       case 1:
1311:         for  (j=0; j<nz; j++){
1312:           idx         = bjtmp[j];
1313:           rtmps1[idx] = 0.0;
1314:         }
1315: 
1316:         /* load in initial (unfactored row) */
1317:         idx   = r[row];
1318:         nz    = ai[idx+1] - ai[idx];
1319:         ajtmp = aj + ai[idx];
1320:         v1    = aa + ai[idx];

1322:         for (j=0; j<nz; j++) {
1323:           idx        = ics[ajtmp[j]];
1324:           rtmp1[idx] = v1[j];
1325:           if (damp && ajtmp[j] == r[row]) {
1326:             rtmp1[idx] += damping;
1327:           }
1328:         }
1329:         prow = *bjtmp++ + shift;
1330:         while (prow < row) {
1331:           pc1 = rtmp1 + prow;
1332:           if (*pc1 != 0.0){
1333:             pv   = ba + bd[prow];
1334:             pj   = nbj + bd[prow];
1335:             mul1 = *pc1 * *pv++;
1336:             *pc1 = mul1;
1337:             nz   = bi[prow+1] - bd[prow] - 1;
1338:             PetscLogFlops(2*nz);
1339:             for (j=0; j<nz; j++) {
1340:               tmp = pv[j];
1341:               idx = pj[j];
1342:               rtmps1[idx] -= mul1 * tmp;
1343:             }
1344:           }
1345:           prow = *bjtmp++ + shift;
1346:         }
1347:         nz  = bi[row+1] - bi[row];
1348:         pj  = bj + bi[row];
1349:         pc1 = ba + bi[row];
1350:         if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1351:           if (damping) {
1352:             if (damp) damping *= 2.0;
1353:             damp = PETSC_TRUE;
1354:             ndamp++;
1355:             goto endofwhile;
1356:           } else {
1357:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1358:           }
1359:         }
1360:         rtmp1[row] = 1.0/rtmp1[row];
1361:         for (j=0; j<nz; j++) {
1362:           idx    = pj[j];
1363:           pc1[j] = rtmps1[idx];
1364:         }
1365:         break;
1366: 
1367:       case 2:
1368:         for  (j=0; j<nz; j++) {
1369:           idx         = bjtmp[j];
1370:           rtmps1[idx] = 0.0;
1371:           rtmps2[idx] = 0.0;
1372:         }
1373: 
1374:         /* load in initial (unfactored row) */
1375:         idx   = r[row];
1376:         nz    = ai[idx+1] - ai[idx];
1377:         ajtmp = aj + ai[idx];
1378:         v1    = aa + ai[idx];
1379:         v2    = aa + ai[idx+1];
1380: 
1381:         for (j=0; j<nz; j++) {
1382:           idx        = ics[ajtmp[j]];
1383:           rtmp1[idx] = v1[j];
1384:           rtmp2[idx] = v2[j];
1385:           if (damp && ajtmp[j] == r[row]) {
1386:             rtmp1[idx] += damping;
1387:           }
1388:           if (damp && ajtmp[j] == r[row+1]) {
1389:             rtmp2[idx] += damping;
1390:           }
1391:         }
1392:         prow = *bjtmp++ + shift;
1393:         while (prow < row) {
1394:           pc1 = rtmp1 + prow;
1395:           pc2 = rtmp2 + prow;
1396:           if (*pc1 != 0.0 || *pc2 != 0.0){
1397:             pv   = ba + bd[prow];
1398:             pj   = nbj + bd[prow];
1399:             mul1 = *pc1 * *pv;
1400:             mul2 = *pc2 * *pv;
1401:             ++pv;
1402:             *pc1 = mul1;
1403:             *pc2 = mul2;
1404: 
1405:             nz   = bi[prow+1] - bd[prow] - 1;
1406:             PetscLogFlops(2*2*nz);
1407:             for (j=0; j<nz; j++) {
1408:               tmp = pv[j];
1409:               idx = pj[j];
1410:               rtmps1[idx] -= mul1 * tmp;
1411:               rtmps2[idx] -= mul2 * tmp;
1412:             }
1413:           }
1414:           prow = *bjtmp++ + shift;
1415:         }
1416:         /* Now take care of the odd element*/
1417:         pc1 = rtmp1 + prow;
1418:         pc2 = rtmp2 + prow;
1419:         if (*pc2 != 0.0){
1420:           pj   = nbj + bd[prow];
1421:           if (PetscAbsScalar(*pc1) < zeropivot) {
1422:             if (damping) {
1423:               if (damp) damping *= 2.0;
1424:               damp = PETSC_TRUE;
1425:               ndamp++;
1426:               goto endofwhile;
1427:             } else {
1428:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1429:             }
1430:           }
1431:           mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1432:           *pc2 = mul2;
1433:           nz   = bi[prow+1] - bd[prow] - 1;
1434:           PetscLogFlops(2*nz);
1435:           for (j=0; j<nz; j++) {
1436:             idx = pj[j] + shift;
1437:             tmp = rtmp1[idx];
1438:             rtmp2[idx] -= mul2 * tmp;
1439:           }
1440:         }
1441: 
1442:         nz  = bi[row+1] - bi[row];
1443:         pj  = bj + bi[row];
1444:         pc1 = ba + bi[row];
1445:         pc2 = ba + bi[row+1];
1446:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1447:           if (damping) {
1448:             if (damp) damping *= 2.0;
1449:             damp = PETSC_TRUE;
1450:             ndamp++;
1451:             goto endofwhile;
1452:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1453:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1454:           } else {
1455:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1456:           }
1457:         }
1458:         rtmp1[row]   = 1.0/rtmp1[row];
1459:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1460:         for (j=0; j<nz; j++) {
1461:           idx    = pj[j];
1462:           pc1[j] = rtmps1[idx];
1463:           pc2[j] = rtmps2[idx];
1464:         }
1465:         break;

1467:       case 3:
1468:         for  (j=0; j<nz; j++) {
1469:           idx         = bjtmp[j];
1470:           rtmps1[idx] = 0.0;
1471:           rtmps2[idx] = 0.0;
1472:           rtmps3[idx] = 0.0;
1473:         }
1474:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1475:         idx   = r[row];
1476:         nz    = ai[idx+1] - ai[idx];
1477:         ajtmp = aj + ai[idx];
1478:         v1    = aa + ai[idx];
1479:         v2    = aa + ai[idx+1];
1480:         v3    = aa + ai[idx+2];
1481:         for (j=0; j<nz; j++) {
1482:           idx        = ics[ajtmp[j]];
1483:           rtmp1[idx] = v1[j];
1484:           rtmp2[idx] = v2[j];
1485:           rtmp3[idx] = v3[j];
1486:           if (damp && ajtmp[j] == r[row]) {
1487:             rtmp1[idx] += damping;
1488:           }
1489:           if (damp && ajtmp[j] == r[row+1]) {
1490:             rtmp2[idx] += damping;
1491:           }
1492:           if (damp && ajtmp[j] == r[row+2]) {
1493:             rtmp3[idx] += damping;
1494:           }
1495:         }
1496:         /* loop over all pivot row blocks above this row block */
1497:         prow = *bjtmp++ + shift;
1498:         while (prow < row) {
1499:           pc1 = rtmp1 + prow;
1500:           pc2 = rtmp2 + prow;
1501:           pc3 = rtmp3 + prow;
1502:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1503:             pv   = ba  + bd[prow];
1504:             pj   = nbj + bd[prow];
1505:             mul1 = *pc1 * *pv;
1506:             mul2 = *pc2 * *pv;
1507:             mul3 = *pc3 * *pv;
1508:             ++pv;
1509:             *pc1 = mul1;
1510:             *pc2 = mul2;
1511:             *pc3 = mul3;
1512: 
1513:             nz   = bi[prow+1] - bd[prow] - 1;
1514:             PetscLogFlops(3*2*nz);
1515:             /* update this row based on pivot row */
1516:             for (j=0; j<nz; j++) {
1517:               tmp = pv[j];
1518:               idx = pj[j];
1519:               rtmps1[idx] -= mul1 * tmp;
1520:               rtmps2[idx] -= mul2 * tmp;
1521:               rtmps3[idx] -= mul3 * tmp;
1522:             }
1523:           }
1524:           prow = *bjtmp++ + shift;
1525:         }
1526:         /* Now take care of diagonal block in this set of rows */
1527:         pc1 = rtmp1 + prow;
1528:         pc2 = rtmp2 + prow;
1529:         pc3 = rtmp3 + prow;
1530:         if (*pc2 != 0.0 || *pc3 != 0.0){
1531:           pj   = nbj + bd[prow];
1532:           if (PetscAbsScalar(*pc1) < zeropivot) {
1533:             if (damping) {
1534:               if (damp) damping *= 2.0;
1535:               damp = PETSC_TRUE;
1536:               ndamp++;
1537:               goto endofwhile;
1538:             } else {
1539:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc1),zeropivot);
1540:             }
1541:           }
1542:           mul2 = (*pc2)/(*pc1);
1543:           mul3 = (*pc3)/(*pc1);
1544:           *pc2 = mul2;
1545:           *pc3 = mul3;
1546:           nz   = bi[prow+1] - bd[prow] - 1;
1547:           PetscLogFlops(2*2*nz);
1548:           for (j=0; j<nz; j++) {
1549:             idx = pj[j] + shift;
1550:             tmp = rtmp1[idx];
1551:             rtmp2[idx] -= mul2 * tmp;
1552:             rtmp3[idx] -= mul3 * tmp;
1553:           }
1554:         }
1555:         ++prow;
1556:         pc2 = rtmp2 + prow;
1557:         pc3 = rtmp3 + prow;
1558:         if (*pc3 != 0.0){
1559:           pj   = nbj + bd[prow];
1560:           pj   = nbj + bd[prow];
1561:           if (PetscAbsScalar(*pc2) < zeropivot) {
1562:             if (damping) {
1563:               if (damp) damping *= 2.0;
1564:               damp = PETSC_TRUE;
1565:               ndamp++;
1566:               goto endofwhile;
1567:             } else {
1568:               SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",prow,PetscAbsScalar(*pc2),zeropivot);
1569:             }
1570:           }
1571:           mul3 = (*pc3)/(*pc2);
1572:           *pc3 = mul3;
1573:           nz   = bi[prow+1] - bd[prow] - 1;
1574:           PetscLogFlops(2*2*nz);
1575:           for (j=0; j<nz; j++) {
1576:             idx = pj[j] + shift;
1577:             tmp = rtmp2[idx];
1578:             rtmp3[idx] -= mul3 * tmp;
1579:           }
1580:         }
1581:         nz  = bi[row+1] - bi[row];
1582:         pj  = bj + bi[row];
1583:         pc1 = ba + bi[row];
1584:         pc2 = ba + bi[row+1];
1585:         pc3 = ba + bi[row+2];
1586:         if (PetscAbsScalar(rtmp1[row]) < zeropivot || PetscAbsScalar(rtmp2[row+1]) < zeropivot || PetscAbsScalar(rtmp3[row+2]) < zeropivot) {
1587:           if (damping) {
1588:             if (damp) damping *= 2.0;
1589:             damp = PETSC_TRUE;
1590:             ndamp++;
1591:             goto endofwhile;
1592:           } else if (PetscAbsScalar(rtmp1[row]) < zeropivot) {
1593:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row,PetscAbsScalar(rtmp1[row]),zeropivot);
1594:           } else if (PetscAbsScalar(rtmp2[row+1]) < zeropivot) {
1595:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+1,PetscAbsScalar(rtmp2[row+1]),zeropivot);
1596:         } else {
1597:             SETERRQ3(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %d value %g tolerance %g",row+2,PetscAbsScalar(rtmp3[row+2]),zeropivot);
1598:           }
1599:         }
1600:         rtmp1[row]   = 1.0/rtmp1[row];
1601:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1602:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1603:         /* copy row entries from dense representation to sparse */
1604:         for (j=0; j<nz; j++) {
1605:           idx    = pj[j];
1606:           pc1[j] = rtmps1[idx];
1607:           pc2[j] = rtmps2[idx];
1608:           pc3[j] = rtmps3[idx];
1609:         }
1610:         break;

1612:       default:
1613:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported n");
1614:       }
1615:       row += nsz;                 /* Update the row */
1616:     }
1617:     endofwhile:;
1618:   } while (damp);
1619:   PetscFree(rtmp1);
1620:   PetscFree(tmp_vec2);
1621:   ISRestoreIndices(isicol,&ic);
1622:   ISRestoreIndices(isrow,&r);
1623:   ISRestoreIndices(iscol,&c);
1624:   C->factor      = FACTOR_LU;
1625:   C->assembled   = PETSC_TRUE;
1626:   if (ndamp || b->lu_damping) {
1627:     PetscLogInfo(0,"MatLUFactorNumerical_SeqAIJ_Inode: number of damping tries %d damping value %gn",ndamp,damping);
1628:   }
1629:   PetscLogFlops(C->n);
1630:   return(0);
1631: }

1633: /*
1634:      This is really ugly. if inodes are used this replaces the 
1635:   permutations with ones that correspond to rows/cols of the matrix
1636:   rather then inode blocks
1637: */
1638: #undef __FUNCT__  
1640: int MatAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1641: {
1642:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1643:   int        ierr,m = A->m,n = A->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1644:   int        row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1645:   int        nslim_col,*ns_col;
1646:   IS         ris = *rperm,cis = *cperm;
1647:   PetscTruth flg;

1650:   PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1651:   if (!flg) return(0);

1653:   if (!a->inode.size) return(0); /* no inodes so return */
1654:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1656:   ierr  = Mat_AIJ_CreateColInode(A,&nslim_col,&ns_col);
1657:   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(int),&tns);
1658:   ierr  = PetscMalloc((m+n+1)*sizeof(int),&permr);
1659:   permc = permr + m;

1661:   ierr  = ISGetIndices(ris,&ridx);
1662:   ierr  = ISGetIndices(cis,&cidx);

1664:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1665:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1667:   /* Construct the permutations for rows*/
1668:   for (i=0,row = 0; i<nslim_row; ++i){
1669:     indx      = ridx[i];
1670:     start_val = tns[indx];
1671:     end_val   = tns[indx + 1];
1672:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1673:   }

1675:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1676:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1678:  /*Construct permutations for columns*/
1679:   for (i=0,col=0; i<nslim_col; ++i){
1680:     indx      = cidx[i];
1681:     start_val = tns[indx];
1682:     end_val   = tns[indx + 1];
1683:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1684:   }

1686:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1687:   ISSetPermutation(*rperm);
1688:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1689:   ISSetPermutation(*cperm);
1690: 
1691:   ierr  = ISRestoreIndices(ris,&ridx);
1692:   ierr  = ISRestoreIndices(cis,&cidx);

1694:   PetscFree(ns_col);
1695:   PetscFree(permr);
1696:   ISDestroy(cis);
1697:   ISDestroy(ris);
1698:   PetscFree(tns);
1699:   return(0);
1700: }


1703: #undef __FUNCT__  

1706: /*@C
1707:    MatSeqAIJGetInodeSizes - Returns the inode information of the SeqAIJ matrix.

1709:    Collective on Mat

1711:    Input Parameter:
1712: .  A - the SeqAIJ matrix.

1714:    Output Parameter:
1715: +  node_count - no of inodes present in the matrix.
1716: .  sizes      - an array of size node_count,with sizes of each inode.
1717: -  limit      - the max size used to generate the inodes.

1719:    Level: advanced

1721:    Notes: This routine returns some internal storage information
1722:    of the matrix, it is intended to be used by advanced users.
1723:    It should be called after the matrix is assembled.
1724:    The contents of the sizes[] array should not be changed.

1726: .keywords: matrix, seqaij, get, inode

1728: .seealso: MatGetInfo()
1729: @*/
1730: int MatSeqAIJGetInodeSizes(Mat A,int *node_count,int *sizes[],int *limit)
1731: {
1732:   Mat_SeqAIJ *a;
1733:   PetscTruth flg;
1734:   int        ierr;

1738:   PetscTypeCompare((PetscObject)A,MATSEQAIJ,&flg);
1739:   if (!flg) SETERRQ(PETSC_ERR_ARG_WRONG,"MatSeqAIJ only");
1740:   a           = (Mat_SeqAIJ*)A->data;
1741:   *node_count = a->inode.node_count;
1742:   *sizes      = a->inode.size;
1743:   *limit      = a->inode.limit;
1744:   return(0);
1745: }