Actual source code: ex2.c

  1: /*$Id: ex2.c,v 1.46 2001/09/11 16:34:32 bsmith Exp $*/

  3: static char help[] = "Reads a simple unstructured grid from a file. Partitions it,\n
  4: and distributes the grid data accordingly\n\n";

  6: /*T
  7:    Concepts: Mat^partitioning a matrix;
  8:    Processors: n
  9: T*/

 11: /*
 12:     Updates of this example MAY be found at 
 13:        http://www.mcs.anl.gov/petsc/src/dm/ao/examples/tutorials/ex2.c

 15:     This is a very basic, even crude, example of managing an unstructured
 16:     grid in parallel.

 18:     This particular code is for a Galerkin-style finite element method. 

 20:     After the calls below, each processor will have 
 21:       1) a list of elements it "owns"; for each "owned" element it will have the global 
 22:          numbering of the three vertices; stored in gdata->ele;
 23:       2) a list of vertices it "owns". For each owned vertex it will have the x and y 
 24:          coordinates; stored in gdata->vert

 26:     It will not have 
 27:       1) a list of ghost elements (since they are not needed for traditional 
 28:          Galerkin-style finite element methods). For various finite volume methods
 29:          you may need the ghost element lists; these may be generated using the 
 30:          element neighbor information given in the file database.

 32:     To compute the local element stiffness matrix and load vector, each processor
 33:     will need the vertex coordinates for all of the vertices on the locally 
 34:     "owned" elements.  This data could be obtained by doing the appropriate vector
 35:     scatter on the data stored in gdata->vert; an illustrative sketch of such a scatter is given after DataMoveVertices() below.

 37:     Clearly writing a complete parallel unstructured grid code with PETSc is still
 38:     a good deal of work and requires a lot of application level coding.  BUT, at least
 39:     PETSc can manage all of the nonlinear and linear solvers (including matrix assembly 
 40:     etc.), which allows the programmer to concentrate his or her efforts on managing 
 41:     the unstructured grid. The PETSc team is developing additional library objects
 42:     to help manage parallel unstructured grid computations.  Unfortunately we have 
 43:     not had time to complete these yet, so the application programmer still must
 44:     manage much of the parallel grid manipulation as indicated below.

 46: */

 48: /* 
 49:   Include "petscmat.h" so that we can use matrices.
 50:   automatically includes:
 51:      petsc.h       - base PETSc routines   petscvec.h    - vectors
 52:      petscsys.h    - system routines       petscmat.h    - matrices
 53:      petscis.h     - index sets            petscviewer.h - viewers               

 55:   Include "petscao.h" allows use of the AO (application ordering) commands,
 56:   used below for renumbering the vertex numbers after the partitioning.

 58:   Include "petscbt.h" for managing logical bit arrays that are used to 
 59:   conserve space. Note that the code does use order N bit arrays on each 
 60:   processor, so it is theoretically not scalable; however, even with 64 million 
 61:   vertices it only needs 8 megabytes of memory temporarily for the 
 62:   bit array, so one can still run very large problems with this approach, 
 63:   since the bit arrays are freed before the vectors and matrices are
 64:   created.
 65: */
 66:  #include "petscmat.h"
 67:  #include "petscao.h"
 68:  #include "petscbt.h"
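/*
   Illustrative sketch only -- NOT part of the original ex2.c and never called.
   It shows the PetscBT calls exactly as they are used in DataPartitionVertices()
   below: one bit per vertex, set the first time the vertex is claimed. With
   64 million vertices the array occupies 64e6 bits = 8e6 bytes = 8 megabytes,
   the figure quoted in the comment above.
*/
static int BitArrayUsageSketch(int n_vert)
{
  PetscBT mask;

  PetscBTCreate(n_vert,mask);          /* all bits start cleared                        */
  if (!PetscBTLookupSet(mask,0)) {     /* returns the old value of bit 0, then sets it  */
    /* first time vertex 0 has been claimed */
  }
  if (PetscBTLookup(mask,0)) {         /* bit 0 is now set                              */
    /* vertex 0 has already been claimed */
  }
  PetscBTDestroy(mask);
  return 0;
}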

 70: /* 
 71:     This is the user-defined grid data context 
 72: */
 73: typedef struct {
 74:   int    n_vert,n_ele;
 75:   int    mlocal_vert,mlocal_ele;
 76:   int    *ele;
 77:   PetscScalar *vert;
 78:   int    *ia,*ja;
 79:   IS     isnewproc;
 80:   int    *localvert,nlocal; /* used to stash temporarily old global vertex number of new vertex */
 81: } GridData;

 83: /*
 84:    Variables on all processors:
 85:       n_vert          - total number of vertices
 86:       mlocal_vert     - number of vertices on this processor
 87:       vert            - x,y coordinates of local vertices

 89:       n_ele           - total number of elements
 90:       mlocal_ele      - number of elements on this processor
 91:       ele             - vertices of elements on this processor

 93:       ia, ja          - adjacency graph of elements (for partitioning)
 94:     
 95:    Variables on processor 0 during data reading from file:
 96:       mmlocal_vert[i] - number of vertices on each processor
 97:       tmpvert         - x,y coordinates of vertices on any processor (as read in)

 99:       mmlocal_ele[i]  - number of elements on each processor

101:       tmpia, tmpja    - adjacency graph of elements for other processors

103:    Notes:
104:    The code below has a great deal of IO (print statements). This is to allow one to track 
105:    the renumbering and movement of data among processors. In an actual 
106:    production run, IO of this type would be deactivated.

108:    To use the ParMETIS partitioner, run with the option -mat_partitioning_type parmetis;
109:    otherwise it defaults to the initial element partitioning induced when the data 
110:    is read in.

112:    To understand the parallel performance of this type of code, it is important
113:    to profile the time spent in various events in the code; running with the
114:    option -log_summary will indicate how much time is spent in the routines 
115:    below.   Of course, for very small problems, such as the sample grid used here,
116:    the profiling results are meaningless.
117: */
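/*
   Illustrative run line (not from the original file; the launcher and the
   executable name depend on the local installation):

       mpirun -np 4 ex2 -mat_partitioning_type parmetis -log_summary
*/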

119: extern int DataRead(GridData *);
120: extern int DataPartitionElements(GridData *);
121: extern int DataMoveElements(GridData *);
122: extern int DataPartitionVertices(GridData *);
123: extern int DataMoveVertices(GridData *);
124: extern int DataDestroy(GridData *);

126: #undef __FUNCT__
127: #define __FUNCT__ "main"
128: int main(int argc,char **args)
129: {
130:   int          ierr;
131:   int          READ_EVENT,PARTITION_ELEMENT_EVENT,MOVE_ELEMENT_EVENT;
132:   int          PARTITION_VERTEX_EVENT,MOVE_VERTEX_EVENT;
133:   GridData     gdata;

135:   PetscInitialize(&argc,&args,(char *)0,help);

137:   PetscLogEventRegister(&READ_EVENT,             "Read Data",0);
138:   PetscLogEventRegister(&PARTITION_ELEMENT_EVENT,"Partition elemen",0);
139:   PetscLogEventRegister(&MOVE_ELEMENT_EVENT,     "Move elements",0);
140:   PetscLogEventRegister(&PARTITION_VERTEX_EVENT, "Partition vertic",0);
141:   PetscLogEventRegister(&MOVE_VERTEX_EVENT,      "Move vertices",0);

143:   PetscLogEventBegin(READ_EVENT,0,0,0,0);
144:   DataRead(&gdata);
145:   PetscLogEventEnd(READ_EVENT,0,0,0,0);
146:   PetscLogEventBegin(PARTITION_ELEMENT_EVENT,0,0,0,0);
147:   DataPartitionElements(&gdata);
148:   PetscLogEventEnd(PARTITION_ELEMENT_EVENT,0,0,0,0);
149:   PetscLogEventBegin(MOVE_ELEMENT_EVENT,0,0,0,0);
150:   DataMoveElements(&gdata);
151:   PetscLogEventEnd(MOVE_ELEMENT_EVENT,0,0,0,0);
152:   PetscLogEventBegin(PARTITION_VERTEX_EVENT,0,0,0,0);
153:   DataPartitionVertices(&gdata);
154:   PetscLogEventEnd(PARTITION_VERTEX_EVENT,0,0,0,0);
155:   PetscLogEventBegin(MOVE_VERTEX_EVENT,0,0,0,0);
156:   DataMoveVertices(&gdata);
157:   PetscLogEventEnd(MOVE_VERTEX_EVENT,0,0,0,0);
158:   DataDestroy(&gdata);

160:   PetscFinalize();
161:   return 0;
162: }


165: #undef __FUNCT__
166: #define __FUNCT__ "DataRead"
167: /*
168:      Reads in the grid data from a file; each processor is naively 
169:   assigned a contiguous chunk of vertex and element data. Later the data
170:   will be partitioned and moved to the appropriate processor.
171: */
172: int DataRead(GridData *gdata)
173: {
174:   int          rank,size,n_vert,*mmlocal_vert,mlocal_vert,i,*ia,*ja,cnt,j;
175:   int          mlocal_ele,*mmlocal_ele,*ele,*tmpele,n_ele,net,a1,a2,a3;
176:   int          *iatmp,*jatmp,ierr;
177:   char         msg[128];
178:   PetscScalar  *vert,*tmpvert;
179:   MPI_Status   status;

182:   /*
183:      Processor 0 opens the file, reads in chunks of data and sends a portion off to
184:    each other processor.

186:      Note: For a truly scalable IO portion of the code, one would store
187:    the grid data in a binary file and use MPI-IO commands to have each 
188:    processor read in the parts that it needs. However, in most circumstances
189:    involving up to, say, a million nodes and 100 processors, the approach 
190:    here is fine. (An illustrative MPI-IO sketch is given after this routine.)
191:   */
192:   MPI_Comm_size(PETSC_COMM_WORLD,&size);
193:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

195:   if (!rank) {
196:     FILE *fd;
197:     fd = fopen("usgdata","r"); if (!fd) SETERRQ(1,"Cannot open grid file");

199:     /* read in number of vertices */
200:     fgets(msg,128,fd);
201:     printf("File msg:%s",msg);
202:     fscanf(fd,"Number Vertices = %d\n",&n_vert);
203:     printf("Number of grid vertices %d\n",n_vert);

205:     /* broadcast number of vertices to all processors */
206:     MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
207:     mlocal_vert  = n_vert/size + ((n_vert % size) > 0);

209:     /* 
210:       allocate enough room for the first processor to keep track of how many 
211:       vertices are assigned to each processor. The vertices are split as evenly
212:       as possible among all processors.
213:     */
214:     PetscMalloc(size*sizeof(int),&mmlocal_vert);
215:     for (i=0; i<size; i++) {
216:       mmlocal_vert[i] = n_vert/size + ((n_vert % size) > i);
217:       printf("Processor %d assigned %d vertices\n",i,mmlocal_vert[i]);
218:     }
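      /* For example, with n_vert = 10 and size = 4 the loop above assigns
         3,3,2,2 vertices to processors 0,1,2,3: the first (n_vert % size)
         processors each receive one extra vertex. */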

220:     /*
221:        Read in vertices assigned to first processor
222:     */
223:     PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&vert);
224:     printf("Vertices assigned to processor 0\n");
225:     for (i=0; i<mlocal_vert; i++) {
226:       fscanf(fd,"%d %lf %lf\n",&cnt,vert+2*i,vert+2*i+1);
227:       printf("%d %g %g\n",cnt,PetscRealPart(vert[2*i]),PetscRealPart(vert[2*i+1]));
228:     }

230:     /* 
231:        Read in vertices for all the other processors 
232:     */
233:     PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&tmpvert);
234:     for (j=1; j<size; j++) {
235:       printf("Vertices assigned to processor %d\n",j);
236:       for (i=0; i<mmlocal_vert[j]; i++) {
237:         fscanf(fd,"%d %lf %lf\n",&cnt,tmpvert+2*i,tmpvert+2*i+1);
238:         printf("%d %g %g\n",cnt,tmpvert[2*i],tmpvert[2*i+1]);
239:       }
240:       MPI_Send(tmpvert,2*mmlocal_vert[j],MPIU_SCALAR,j,0,PETSC_COMM_WORLD);
241:     }
242:     PetscFree(tmpvert);
243:     PetscFree(mmlocal_vert);

245:     fscanf(fd,"Number Elements = %d\n",&n_ele);
246:     printf("Number of grid elements %d\n",n_ele);

248:     /* 
249:        Broadcast number of elements to all processors
250:     */
251:     MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
252:     mlocal_ele  = n_ele/size + ((n_ele % size) > 0);

254:     /* 
255:       Allocate enough room for the first processor to keep track of how many 
256:       elements are assigned to each processor.
257:     */
258:     PetscMalloc(size*sizeof(int),&mmlocal_ele);
259:     for (i=0; i<size; i++) {
260:       mmlocal_ele[i] = n_ele/size + ((n_ele % size) > i);
261:       printf("Processor %d assigned %d elements\n",i,mmlocal_ele[i]);
262:     }
263: 
264:     /*
265:         read in element information for the first processor
266:     */
267:     PetscMalloc(3*mmlocal_ele[0]*sizeof(int),&ele);
268:     printf("Elements assigned to processor 0\n");
269:     for (i=0; i<mlocal_ele; i++) {
270:       fscanf(fd,"%d %d %d %d\n",&cnt,ele+3*i,ele+3*i+1,ele+3*i+2);
271:       printf("%d %d %d %d\n",cnt,ele[3*i],ele[3*i+1],ele[3*i+2]);
272:     }

274:     /* 
275:        Read in elements for all the other processors 
276:     */
277:     PetscMalloc(3*mmlocal_ele[0]*sizeof(int),&tmpele);
278:     for (j=1; j<size; j++) {
279:       printf("Elements assigned to processor %d\n",j);
280:       for (i=0; i<mmlocal_ele[j]; i++) {
281:         fscanf(fd,"%d %d %d %d\n",&cnt,tmpele+3*i,tmpele+3*i+1,tmpele+3*i+2);
282:         printf("%d %d %d %d\n",cnt,tmpele[3*i],tmpele[3*i+1],tmpele[3*i+2]);
283:       }
284:       MPI_Send(tmpele,3*mmlocal_ele[j],MPI_INT,j,0,PETSC_COMM_WORLD);
285:     }
286:     PetscFree(tmpele);

288:     /* 
289:          Read in element neighbors for processor 0 
290:          We don't know how many entries of ja[] to allocate, so we allocate 
291:        3 times the number of local elements; this is the maximum it could be
292:     */
293:     PetscMalloc((mlocal_ele+1)*sizeof(int),&ia);
294:     PetscMalloc((3*mlocal_ele+1)*sizeof(int),&ja);
295:     net   = 0;
296:     ia[0] = 0;
297:     printf("Element neighbors on processor 0\n");
298:     fgets(msg,128,fd);
299:     for (i=0; i<mlocal_ele; i++) {
300:       fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
301:       printf("%d %d %d %d\n",cnt,a1,a2,a3);
302:       if (a1 >= 0) {ja[net++] = a1;}
303:       if (a2 >= 0) {ja[net++] = a2;}
304:       if (a3 >= 0) {ja[net++] = a3;}
305:       ia[i+1] = net;
306:     }
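    /* ia[] and ja[] now hold the element adjacency graph in compressed-row form:
       the neighbors of local element i are ja[ia[i]], ..., ja[ia[i+1]-1].  A
       negative entry in the file marks a missing neighbor (a boundary edge) and
       was skipped above. */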

308:     printf("ia values for processor 0\n");
309:     for (i=0; i<mlocal_ele+1; i++) {
310:       printf("%d ",ia[i]);
311:     }
312:     printf("\n");
313:     printf("ja values for processor 0\n");
314:     for (i=0; i<ia[mlocal_ele]; i++) {
315:       printf("%d ",ja[i]);
316:     }
317:     printf("\n");

319:     /*
320:        Read in element neighbor information for all other processors
321:     */
322:     PetscMalloc((mlocal_ele+1)*sizeof(int),&iatmp);
323:     PetscMalloc((3*mlocal_ele+1)*sizeof(int),&jatmp);
324:     for (j=1; j<size; j++) {
325:       net   = 0;
326:       iatmp[0] = 0;
327:       printf("Element neighbors on processor %d\n",j);
328:       for (i=0; i<mmlocal_ele[j]; i++) {
329:         fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
330:         printf("%d %d %d %d\n",cnt,a1,a2,a3);
331:         if (a1 >= 0) {jatmp[net++] = a1;}
332:         if (a2 >= 0) {jatmp[net++] = a2;}
333:         if (a3 >= 0) {jatmp[net++] = a3;}
334:         iatmp[i+1] = net;
335:       }

337:       printf("ia values for processor %d\n",j);
338:       for (i=0; i<mmlocal_ele[j]+1; i++) {
339:         printf("%d ",iatmp[i]);
340:       }
341:       printf("\n");
342:       printf("ja values for processor %d\n",j);
343:       for (i=0; i<iatmp[mmlocal_ele[j]]; i++) {
344:         printf("%d ",jatmp[i]);
345:       }
346:       printf("\n");

348:       /* send graph off to appropriate processor */
349:       MPI_Send(iatmp,mmlocal_ele[j]+1,MPI_INT,j,0,PETSC_COMM_WORLD);
350:       MPI_Send(jatmp,iatmp[mmlocal_ele[j]],MPI_INT,j,0,PETSC_COMM_WORLD);
351:     }
352:     PetscFree(iatmp);
353:     PetscFree(jatmp);
354:     PetscFree(mmlocal_ele);

356:     fclose(fd);
357:   } else {
358:     /*
359:         We are not the zeroth processor, so we do not open the file;
360:       rather we wait for processor 0 to send us our data.
361:     */

363:     /* receive total number of vertices */
364:     MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
365:     mlocal_vert = n_vert/size + ((n_vert % size) > rank);

367:     /* receive vertices */
368:     PetscMalloc(2*(mlocal_vert+1)*sizeof(PetscScalar),&vert);
369:     MPI_Recv(vert,2*mlocal_vert,MPIU_SCALAR,0,0,PETSC_COMM_WORLD,&status);

371:     /* receive total number of elements */
372:     MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
373:     mlocal_ele = n_ele/size + ((n_ele % size) > rank);

375:     /* receive elements */
376:     PetscMalloc(3*(mlocal_ele+1)*sizeof(int),&ele);
377:     MPI_Recv(ele,3*mlocal_ele,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

379:     /* receive element adjacency graph */
380:     PetscMalloc((mlocal_ele+1)*sizeof(int),&ia);
381:     MPI_Recv(ia,mlocal_ele+1,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

383:     PetscMalloc((ia[mlocal_ele]+1)*sizeof(int),&ja);
384:     MPI_Recv(ja,ia[mlocal_ele],MPI_INT,0,0,PETSC_COMM_WORLD,&status);
385:   }

387:   gdata->n_vert      = n_vert;
388:   gdata->n_ele       = n_ele;
389:   gdata->mlocal_vert = mlocal_vert;
390:   gdata->mlocal_ele  = mlocal_ele;
391:   gdata->ele         = ele;
392:   gdata->vert        = vert;

394:   gdata->ia          = ia;
395:   gdata->ja          = ja;

397:   return(0);
398: }
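/*
   Illustrative sketch only -- NOT part of the original ex2.c and never called.
   The note at the top of DataRead() says a truly scalable reader would keep the
   grid in a binary file and use MPI-IO so that each process reads just its own
   piece. The fragment below assumes a hypothetical binary file "usgdata.bin"
   holding the x,y coordinates as contiguous doubles (and a real-valued
   PetscScalar); first_vert is the global number of this process's first vertex.
*/
int DataReadVerticesMPIIOSketch(int first_vert,int mlocal_vert,PetscScalar *vert)
{
  MPI_File   fh;
  MPI_Offset disp;

  MPI_File_open(PETSC_COMM_WORLD,"usgdata.bin",MPI_MODE_RDONLY,MPI_INFO_NULL,&fh);
  /* each process seeks directly to its own 2*mlocal_vert coordinates */
  disp = (MPI_Offset)(2*first_vert)*(MPI_Offset)sizeof(double);
  MPI_File_read_at_all(fh,disp,vert,2*mlocal_vert,MPI_DOUBLE,MPI_STATUS_IGNORE);
  MPI_File_close(&fh);
  return 0;
}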


401: #undef __FUNCT__
402: #define __FUNCT__ "DataPartitionElements"
403: /*
404:          Given the grid data spread across the processors, determines a
405:    new partitioning of the CELLS (elements) to reduce the number of cut edges between
406:    cells (elements).
407: */
408: int DataPartitionElements(GridData *gdata)
409: {
410:   Mat             Adj;                /* adjacency matrix */
411:   int             *ia,*ja;
412:   int             mlocal_ele,n_ele,ierr;
413:   MatPartitioning part;
414:   IS              isnewproc;

417:   n_ele       = gdata->n_ele;
418:   mlocal_ele  = gdata->mlocal_ele;

420:   ia          = gdata->ia;
421:   ja          = gdata->ja;

423:   /*
424:       Create the adjacency graph matrix
425:   */
426:   MatCreateMPIAdj(PETSC_COMM_WORLD,mlocal_ele,n_ele,ia,ja,PETSC_NULL,&Adj);
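  /* Each local row of Adj corresponds to one locally owned element; row i lists
     the elements adjacent to it, taken directly from the ia/ja arrays built in
     DataRead(). */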

428:   /*
429:       Create the partitioning object
430:   */
431:   MatPartitioningCreate(PETSC_COMM_WORLD,&part);
432:   MatPartitioningSetAdjacency(part,Adj);
433:   MatPartitioningSetFromOptions(part);
434:   MatPartitioningApply(part,&isnewproc);
435:   MatPartitioningDestroy(part);

437:   /*
438:        isnewproc - indicates for each local element the new processor it is assigned to
439:   */
440:   PetscPrintf(PETSC_COMM_WORLD,"New processor assignment for each element\n");
441:   ISView(isnewproc,PETSC_VIEWER_STDOUT_WORLD);
442:   gdata->isnewproc = isnewproc;

444:   /*
445:       Free the adjacency graph data structures
446:   */
447:   MatDestroy(Adj);


450:   return(0);
451: }

453: #undef __FUNCT__
454: #define __FUNCT__ "DataMoveElements"
455: /*
456:       Moves the grid element data to be on the correct processor for the new
457:    element partitioning.
458: */
459: int DataMoveElements(GridData *gdata)
460: {
461:   int          ierr,*counts,rank,size,i,*idx;
462:   Vec          vele,veleold;
463:   PetscScalar  *array;
464:   IS           isscat,isnum;
465:   VecScatter   vecscat;


469:   MPI_Comm_size(PETSC_COMM_WORLD,&size);
470:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

472:   /* 
473:       Determine how many elements are assigned to each processor 
474:   */
475:   PetscMalloc(size*sizeof(int),&counts);
476:   ierr   = ISPartitioningCount(gdata->isnewproc,counts);

478:   /* 
479:      Create a vector to contain the newly ordered element information 

481:      Note: we use vectors to communicate this data since we can use the powerful
482:      VecScatter routines to get the data to the correct location. This is a little
483:      wasteful since the vectors hold double precision numbers instead of integers,
484:      but since this is just a setup phase of the entire numerical computation that 
485:      is only called once, it is not a measurable performance bottleneck.
486:   */
487:   VecCreate(PETSC_COMM_WORLD,&vele);
488:   VecSetSizes(vele,3*counts[rank],PETSC_DECIDE);
489:   VecSetFromOptions(vele);

491:   /* 
492:       Create an index set from the isnewproc index set to indicate the mapping TO 
493:   */
494:   ISPartitioningToNumbering(gdata->isnewproc,&isnum);
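  /* isnum now gives, for each local element, its new global number under the new
     partitioning (elements assigned to processor 0 are numbered first, then those
     assigned to processor 1, and so on). */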
495:   ISDestroy(gdata->isnewproc);
496:   /* 
497:       There are three data items per cell (element): the integer numbers of its three 
498:     vertices (we convert them to double to use the scatter). One can think 
499:     of the vectors as having a block size of 3, with one index in idx[] for each block.
500:   */
501:   ISGetIndices(isnum,&idx);
502:   for (i=0; i<gdata->mlocal_ele; i++) {
503:     idx[i] *= 3;
504:   }
505:   ISCreateBlock(PETSC_COMM_WORLD,3,gdata->mlocal_ele,idx,&isscat);
506:   ISRestoreIndices(isnum,&idx);
507:   ISDestroy(isnum);

509:   /* 
510:      Create a vector to contain the old ordered element information
511:   */
512:   VecCreateSeq(PETSC_COMM_SELF,3*gdata->mlocal_ele,&veleold);
513:   VecGetArray(veleold,&array);
514:   for (i=0; i<3*gdata->mlocal_ele; i++) {
515:     array[i] = gdata->ele[i];
516:   }
517:   VecRestoreArray(veleold,&array);
518: 
519:   /* 
520:      Scatter the element vertex information to the correct processor
521:   */
522:   VecScatterCreate(veleold,PETSC_NULL,vele,isscat,&vecscat);
523:   ISDestroy(isscat);
524:   VecScatterBegin(veleold,vele,INSERT_VALUES,SCATTER_FORWARD,vecscat);
525:   VecScatterEnd(veleold,vele,INSERT_VALUES,SCATTER_FORWARD,vecscat);
526:   VecScatterDestroy(vecscat);
527:   VecDestroy(veleold);

529:   /* 
530:      Put the element vertex data into a new allocation of the gdata->ele 
531:   */
532:   PetscFree(gdata->ele);
533:   gdata->mlocal_ele = counts[rank];
534:   PetscFree(counts);
535:   PetscMalloc(3*gdata->mlocal_ele*sizeof(int),&gdata->ele);
536:   VecGetArray(vele,&array);
537:   for (i=0; i<3*gdata->mlocal_ele; i++) {
538:     gdata->ele[i] = (int)PetscRealPart(array[i]);
539:   }
540:   VecRestoreArray(vele,&array);
541:   VecDestroy(vele);

543:   PetscPrintf(PETSC_COMM_WORLD,"Old vertex numbering in new element ordering\n");
544:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
545:   for (i=0; i<gdata->mlocal_ele; i++) {
546:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
547:                             gdata->ele[3*i+2]);
548:   }
549:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

551:   return(0);
552: }

554: #undef __FUNCT__
555: #define __FUNCT__ "DataPartitionVertices"
556: /*
557:          Given the newly partitioned cells (elements), this routine partitions the 
558:      vertices.

560:      The code is not completely scalable since it
561:      1) requires O(n_vert) bits of memory per processor
562:      2) uses O(size) stages of communication, each of size O(n_vert) bits
563:      3) is sequential (each processor marks its vertices ONLY after all processors
564:         to the left have marked theirs)
565:      4) places O(n_vert) work on the last processor

567:      The algorithm also does not take advantage of vertices that are "interior" to a
568:      processor's elements (that is, not contained in any element on another processor).
569:      A better algorithm would first have all processors determine "interior" vertices and
570:      make sure they are retained on that processor before listing "boundary" vertices.

572:      The algorithm is:
573:      a) each processor waits for a message from the left containing the mask of all marked vertices
574:      b) it loops over all local elements, generating a list of vertices it will 
575:         claim (not claiming ones that have already been marked in the bit array);
576:         it claims at most n_vert/size vertices
577:      c) it sends the mask on to the right

579:      This is a quick-and-dirty implementation; it should work fine for many problems,
580:      but will need to be replaced once profiling shows that it takes a large amount of
581:      time. An advantage is that it requires no searching or sorting.
582:      
583: */
584: int DataPartitionVertices(GridData *gdata)
585: {
586:   int        n_vert = gdata->n_vert,ierr,*localvert;
587:   int        rank,size,mlocal_ele = gdata->mlocal_ele,*ele = gdata->ele,i,j,nlocal = 0,nmax;
588:   PetscBT    mask;
589:   MPI_Status status;

592:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
593:   MPI_Comm_size(PETSC_COMM_WORLD,&size);

595:   /*
596:       Allocate space to store the bit array indicating which vertices have been marked
597:   */
598:   PetscBTCreate(n_vert,mask);

600:   /*
601:      All processors except the last can have at most n_vert/size vertices assigned
602:      (because the last processor needs to handle any leftovers)
603:   */
604:   nmax = n_vert/size;
605:   if (rank == size-1) {
606:     nmax = n_vert;
607:   }

609:   /* 
610:      Receive list of marked vertices from left 
611:   */
612:   if (rank) {
613:     MPI_Recv(mask,PetscBTLength(n_vert),MPI_CHAR,rank-1,0,PETSC_COMM_WORLD,&status);
614:   }

616:   if (rank == size-1) {
617:     /* last processor gets all the rest */
618:     for (i=0; i<n_vert; i++) {
619:       if (!PetscBTLookup(mask,i)) {
620:         nlocal++;
621:       }
622:     }
623:     nmax = nlocal;
624:   }

626:   /* 
627:      Now we know how many are local; allocate enough space for them and mark them 
628:   */
629:   PetscMalloc((nmax+1)*sizeof(int),&localvert);

631:   /* generate local list and fill in mask */
632:   nlocal = 0;
633:   if (rank < size-1) {
634:     /* count my vertices */
635:     for (i=0; i<mlocal_ele; i++) {
636:       for (j=0; j<3; j++) {
637:         if (!PetscBTLookupSet(mask,ele[3*i+j])) {
638:           localvert[nlocal++] = ele[3*i+j];
639:           if (nlocal >= nmax) goto foundenough2;
640:         }
641:       }
642:     }
643:     foundenough2:;
644:   } else {
645:     /* last processor gets all the rest */
646:     for (i=0; i<n_vert; i++) {
647:       if (!PetscBTLookup(mask,i)) {
648:         localvert[nlocal++] = i;
649:       }
650:     }
651:   }
652:   /* 
653:       Send bit mask on to next processor
654:   */
655:   if (rank < size-1) {
656:     MPI_Send(mask,PetscBTLength(n_vert),MPI_CHAR,rank+1,0,PETSC_COMM_WORLD);
657:   }
658:   PetscBTDestroy(mask);

660:   gdata->localvert = localvert;
661:   gdata->nlocal    = nlocal;

663:   /* print lists of owned vertices */
664:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Number vertices assigned %d\n",rank,nlocal);
665:   PetscSynchronizedFlush(PETSC_COMM_WORLD);
666:   PetscIntView(nlocal,localvert,PETSC_VIEWER_STDOUT_WORLD);

668:   return(0);
669: }

671: #undef __FUNCT__
672: #define __FUNCT__ "DataMoveVertices"
673: /*
674:      Given the partitioning of the vertices, renumbers the element vertex lists for the 
675:      new vertex numbering and moves the vertex coordinate values to the correct processor.
676: */
677: int DataMoveVertices(GridData *gdata)
678: {
679:   AO           ao;
680:   int          ierr,rank,i;
681:   Vec          vert,overt;
682:   VecScatter   vecscat;
683:   IS           isscat;
684:   PetscScalar  *avert;


688:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

690:   /* ---------------------------------------------------------------------
691:       Create a global reordering of the vertex numbers
692:   */
693:   AOCreateBasic(PETSC_COMM_WORLD,gdata->nlocal,gdata->localvert,PETSC_NULL,&ao);

695:   /*
696:      Change the element vertex information to the new vertex numbering
697:   */
698:   AOApplicationToPetsc(ao,3*gdata->mlocal_ele,gdata->ele);
699:   PetscPrintf(PETSC_COMM_WORLD,"New vertex numbering in new element ordering\n");
700:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
701:   for (i=0; i<gdata->mlocal_ele; i++) {
702:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
703:                             gdata->ele[3*i+2]);
704:   }
705:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

707:   /*
708:      Destroy the AO that is no longer needed
709:   */
710:   AODestroy(ao);

712:   /* --------------------------------------------------------------------
713:       Finally, ship the vertex coordinate information to its owning process;
714:       note that we do this in a way very similar to what was done for the element info
715:   */
716:   /* create a vector to contain the newly ordered vertex information */
717:   VecCreateSeq(PETSC_COMM_SELF,2*gdata->nlocal,&vert);

719:   /* create a vector to contain the old ordered vertex information */
720:   VecCreateMPIWithArray(PETSC_COMM_WORLD,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,
721:                                &overt);

723:   /* 
724:       There are two data items per vertex, the x and y coordinates; i.e., one can think 
725:     of the vectors as having a block size of 2, with one index in localvert[] for each block.
726:   */
727:   for (i=0; i<gdata->nlocal; i++) gdata->localvert[i] *= 2;
728:   ISCreateBlock(PETSC_COMM_WORLD,2,gdata->nlocal,gdata->localvert,&isscat);
729:   PetscFree(gdata->localvert);

731:   /* 
732:       Scatter the element vertex information to the correct processor
733:   */
734:   VecScatterCreate(overt,isscat,vert,PETSC_NULL,&vecscat);
735:   ISDestroy(isscat);
736:   VecScatterBegin(overt,vert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
737:   VecScatterEnd(overt,vert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
738:   VecScatterDestroy(vecscat);

740:   VecDestroy(overt);
741:   PetscFree(gdata->vert);
742: 
743:   /*
744:         Put resulting vertex information into gdata->vert array
745:   */
746:   PetscMalloc(2*gdata->nlocal*sizeof(PetscScalar),&gdata->vert);
747:   VecGetArray(vert,&avert);
748:   PetscMemcpy(gdata->vert,avert,2*gdata->nlocal*sizeof(PetscScalar));
749:   VecRestoreArray(vert,&avert);
750:   gdata->mlocal_vert = gdata->nlocal;
751:   VecDestroy(vert);

753:   PetscPrintf(PETSC_COMM_WORLD,"Vertex coordinates in new numbering\n");
754:   for (i=0; i<2*gdata->mlocal_vert; i++) {
755:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%g\n",gdata->vert[i]);
756:   }
757:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

759:   return(0);
760: }
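/*
   Illustrative sketch only -- NOT part of the original ex2.c and never called.
   The comment at the top of this file notes that, to build the element stiffness
   matrices, each process needs the x,y coordinates of every vertex of its locally
   owned elements, and that these could be obtained with a vector scatter on
   gdata->vert. The routine below sketches that scatter, mirroring the calls used
   in DataMoveVertices(); it assumes it is called after DataMoveVertices(), so that
   gdata->ele already uses the new global vertex numbering and gdata->vert holds
   the owned vertex coordinates. On return (*coords) holds 6 PetscScalars per local
   element: x0,y0,x1,y1,x2,y2 of its three vertices.
*/
int GatherElementVertexCoordinatesSketch(GridData *gdata,PetscScalar **coords)
{
  int         i,j,nblocks = 3*gdata->mlocal_ele,*indices;
  Vec         overt,lvert;
  IS          isfrom;
  VecScatter  vecscat;
  PetscScalar *avert;

  /* parallel vector that views the owned vertex coordinates (2 per vertex) */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&overt);

  /* one block of 2 coordinates for each vertex of each local element; vertices
     shared by several elements are simply fetched more than once */
  PetscMalloc((nblocks+1)*sizeof(int),&indices);
  for (i=0; i<gdata->mlocal_ele; i++) {
    for (j=0; j<3; j++) indices[3*i+j] = 2*gdata->ele[3*i+j];
  }
  ISCreateBlock(PETSC_COMM_WORLD,2,nblocks,indices,&isfrom);
  PetscFree(indices);

  /* sequential vector that receives the coordinates in local element order */
  VecCreateSeq(PETSC_COMM_SELF,2*nblocks,&lvert);
  VecScatterCreate(overt,isfrom,lvert,PETSC_NULL,&vecscat);
  ISDestroy(isfrom);
  VecScatterBegin(overt,lvert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterEnd(overt,lvert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterDestroy(vecscat);
  VecDestroy(overt);

  /* copy the result into a plain array the application keeps */
  PetscMalloc((2*nblocks+1)*sizeof(PetscScalar),coords);
  VecGetArray(lvert,&avert);
  PetscMemcpy(*coords,avert,2*nblocks*sizeof(PetscScalar));
  VecRestoreArray(lvert,&avert);
  VecDestroy(lvert);
  return 0;
}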


763: #undef __FUNCT__
764: #define __FUNCT__ "DataDestroy"
765: int DataDestroy(GridData *gdata)
766: {

770:   PetscFree(gdata->ele);
771:   PetscFree(gdata->vert);
772:   return(0);
773: }