Actual source code: ex2.c
/*$Id: ex2.c,v 1.46 2001/09/11 16:34:32 bsmith Exp $*/
static char help[] = "Reads a simple unstructured grid from a file. Partitions it,\n"
                     "and distributes the grid data accordingly\n\n";
/*T
   Concepts: Mat^partitioning a matrix;
   Processors: n
T*/
/*
    Updates of this example may be found at
       http://www.mcs.anl.gov/petsc/src/dm/ao/examples/tutorials/ex2.c

    This is a very basic, even crude, example of managing an unstructured
    grid in parallel.

    This particular code is for a Galerkin-style finite element method.

    After the calls below, each processor will have
      1) a list of elements it "owns"; for each "owned" element it will have the global
         numbering of the three vertices; stored in gdata->ele;
      2) a list of vertices it "owns". For each owned vertex it will have the x and y
         coordinates; stored in gdata->vert

    It will not have
      1) a list of ghost elements (since they are not needed for traditional
         Galerkin-style finite element methods). For various finite volume methods
         you may need the ghost element lists; these may be generated using the
         element neighbor information given in the file database.

    To compute the local element stiffness matrix and load vector, each processor
    will need the vertex coordinates for all of the vertices on the locally
    "owned" elements. This data could be obtained by doing the appropriate vector
    scatter on the data stored in gdata->vert; we haven't had time to demonstrate this
    (an illustrative sketch of such a scatter is appended at the end of this file).

    Clearly writing a complete parallel unstructured grid code with PETSc is still
    a good deal of work and requires a lot of application level coding. BUT, at least
    PETSc can manage all of the nonlinear and linear solvers (including matrix assembly
    etc.), which allows the programmer to concentrate his or her efforts on managing
    the unstructured grid. The PETSc team is developing additional library objects
    to help manage parallel unstructured grid computations. Unfortunately we have
    not had time to complete these yet, so the application programmer still must
    manage much of the parallel grid manipulation as indicated below.
*/
/*
    Include "petscmat.h" so that we can use matrices. It automatically includes:
       petsc.h    - base PETSc routines     petscvec.h    - vectors
       petscsys.h - system routines         petscmat.h    - matrices
       petscis.h  - index sets              petscviewer.h - viewers

    Include "petscao.h" to allow use of the AO (application ordering) commands,
    used below for renumbering the vertex numbers after the partitioning.

    Include "petscbt.h" for managing logical bit arrays that are used to
    conserve space. Note that the code does use order N bit arrays on each
    processor, so it is theoretically not scalable; but even with 64 million
    vertices it will only temporarily need 8 megabytes of memory for the
    bit array, so one can still do very large problems with this approach,
    since the bit arrays are freed before the vectors and matrices are
    created.
*/
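/*
    Illustrative summary (not in the original source) of the PetscBT calls used later in
    DataPartitionVertices() to mark vertices:

        PetscBT mask;
        PetscBTCreate(n_vert,mask);       - allocate a bit array with n_vert bits, all zero
        PetscBTLookup(mask,i);            - return the current value of bit i
        PetscBTLookupSet(mask,i);         - return the current value of bit i, then set it
        PetscBTDestroy(mask);             - free the bit array
*/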
#include "petscmat.h"
#include "petscao.h"
#include "petscbt.h"
/*
    This is the user-defined grid data context
*/
typedef struct {
  int         n_vert,n_ele;
  int         mlocal_vert,mlocal_ele;
  int         *ele;
  PetscScalar *vert;
  int         *ia,*ja;
  IS          isnewproc;
  int         *localvert,nlocal; /* used to temporarily stash the old global numbers of the new local vertices */
} GridData;
/*
   Variables on all processors:
      n_vert          - total number of vertices
      mlocal_vert     - number of vertices on this processor
      vert            - x,y coordinates of local vertices

      n_ele           - total number of elements
      mlocal_ele      - number of elements on this processor
      ele             - vertices of elements on this processor

      ia, ja          - adjacency graph of elements (for partitioning)

   Variables on processor 0 during data reading from file:
      mmlocal_vert[i] - number of vertices on each processor
      tmpvert         - x,y coordinates of vertices on any processor (as read in)

      mmlocal_ele[i]  - number of elements on each processor

      tmpia, tmpja    - adjacency graph of elements for other processors

   Notes:
   The code below has a great deal of IO (print statements). This is to allow one to track
   the renumbering and movement of data among processors. In an actual
   production run, IO of this type would be deactivated.

   To use the ParMETIS partitioner run with the option -mat_partitioning_type parmetis;
   otherwise it defaults to the initial element partitioning induced when the data
   is read in.

   To understand the parallel performance of this type of code, it is important
   to profile the time spent in various events in the code; running with the
   option -log_summary will indicate how much time is spent in the routines
   below. Of course, for very small problems, such as the sample grid used here,
   the profiling results are meaningless.
*/
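/*
   For example, an illustrative command line (the executable name and process count are
   arbitrary, not part of the original example):

       mpirun -np 4 ./ex2 -mat_partitioning_type parmetis -log_summary
*/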
extern int DataRead(GridData *);
extern int DataPartitionElements(GridData *);
extern int DataMoveElements(GridData *);
extern int DataPartitionVertices(GridData *);
extern int DataMoveVertices(GridData *);
extern int DataDestroy(GridData *);
#undef __FUNCT__
#define __FUNCT__ "main"
int main(int argc,char **args)
{
  int      ierr;
  int      READ_EVENT,PARTITION_ELEMENT_EVENT,MOVE_ELEMENT_EVENT;
  int      PARTITION_VERTEX_EVENT,MOVE_VERTEX_EVENT;
  GridData gdata;

  PetscInitialize(&argc,&args,(char *)0,help);

  PetscLogEventRegister(&READ_EVENT,             "Read Data",0);
  PetscLogEventRegister(&PARTITION_ELEMENT_EVENT,"Partition elemen",0);
  PetscLogEventRegister(&MOVE_ELEMENT_EVENT,     "Move elements",0);
  PetscLogEventRegister(&PARTITION_VERTEX_EVENT, "Partition vertic",0);
  PetscLogEventRegister(&MOVE_VERTEX_EVENT,      "Move vertices",0);

  /* read the grid from the file; each processor gets a naive initial chunk */
  PetscLogEventBegin(READ_EVENT,0,0,0,0);
  DataRead(&gdata);
  PetscLogEventEnd(READ_EVENT,0,0,0,0);

  /* compute a better element partitioning, then move the element data accordingly */
  PetscLogEventBegin(PARTITION_ELEMENT_EVENT,0,0,0,0);
  DataPartitionElements(&gdata);
  PetscLogEventEnd(PARTITION_ELEMENT_EVENT,0,0,0,0);
  PetscLogEventBegin(MOVE_ELEMENT_EVENT,0,0,0,0);
  DataMoveElements(&gdata);
  PetscLogEventEnd(MOVE_ELEMENT_EVENT,0,0,0,0);

  /* partition the vertices consistently with the elements and move their coordinates */
  PetscLogEventBegin(PARTITION_VERTEX_EVENT,0,0,0,0);
  DataPartitionVertices(&gdata);
  PetscLogEventEnd(PARTITION_VERTEX_EVENT,0,0,0,0);
  PetscLogEventBegin(MOVE_VERTEX_EVENT,0,0,0,0);
  DataMoveVertices(&gdata);
  PetscLogEventEnd(MOVE_VERTEX_EVENT,0,0,0,0);

  DataDestroy(&gdata);

  PetscFinalize();
  return 0;
}
#undef __FUNCT__
#define __FUNCT__ "DataRead"
/*
     Reads in the grid data from a file; each processor is naively
   assigned a contiguous chunk of vertex and element data. Later the data
   will be partitioned and moved to the appropriate processor.
*/
int DataRead(GridData *gdata)
{
  int         rank,size,n_vert,*mmlocal_vert,mlocal_vert,i,*ia,*ja,cnt,j;
  int         mlocal_ele,*mmlocal_ele,*ele,*tmpele,n_ele,net,a1,a2,a3;
  int         *iatmp,*jatmp,ierr;
  char        msg[128];
  PetscScalar *vert,*tmpvert;
  MPI_Status  status;
  /*
     Processor 0 opens the file, reads in chunks of data and sends a portion off to
     each other processor.

     Note: For a truly scalable IO portion of the code, one would store
     the grid data in a binary file and use MPI-IO commands to have each
     processor read in the parts that it needs. However, in most circumstances
     involving up to, say, a million nodes and 100 processors the approach
     here is fine.
  */
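  /*
     A minimal sketch of such an MPI-IO read (not part of the original example). It assumes
     a hypothetical binary file "usgdata.bin" laid out as consecutive x,y coordinate pairs
     and a hypothetical starting vertex "vstart" for this processor:

         MPI_File fh;
         MPI_File_open(PETSC_COMM_WORLD,"usgdata.bin",MPI_MODE_RDONLY,MPI_INFO_NULL,&fh);
         MPI_File_read_at(fh,(MPI_Offset)(2*vstart*sizeof(PetscScalar)),vert,
                          2*mlocal_vert,MPIU_SCALAR,&status);
         MPI_File_close(&fh);
  */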
  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

  if (!rank) {
    FILE *fd;
    fd = fopen("usgdata","r"); if (!fd) SETERRQ(1,"Cannot open grid file");

    /* read in number of vertices */
    fgets(msg,128,fd);
    printf("File msg:%s",msg);
    fscanf(fd,"Number Vertices = %d\n",&n_vert);
    printf("Number of grid vertices %d\n",n_vert);

    /* broadcast number of vertices to all processors */
    MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_vert = n_vert/size + ((n_vert % size) > 0);

    /*
       Allocate enough room for the first processor to keep track of how many
       vertices are assigned to each processor, splitting the vertices as equally
       as possible among all processors.
    */
    PetscMalloc(size*sizeof(int),&mmlocal_vert);
    for (i=0; i<size; i++) {
      mmlocal_vert[i] = n_vert/size + ((n_vert % size) > i);
      printf("Processor %d assigned %d vertices\n",i,mmlocal_vert[i]);
    }

    /*
       Read in vertices assigned to the first processor
    */
    PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&vert);
    printf("Vertices assigned to processor 0\n");
    for (i=0; i<mlocal_vert; i++) {
      fscanf(fd,"%d %lf %lf\n",&cnt,vert+2*i,vert+2*i+1);
      printf("%d %g %g\n",cnt,PetscRealPart(vert[2*i]),PetscRealPart(vert[2*i+1]));
    }

    /*
       Read in vertices for all the other processors
    */
    PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&tmpvert);
    for (j=1; j<size; j++) {
      printf("Vertices assigned to processor %d\n",j);
      for (i=0; i<mmlocal_vert[j]; i++) {
        fscanf(fd,"%d %lf %lf\n",&cnt,tmpvert+2*i,tmpvert+2*i+1);
        printf("%d %g %g\n",cnt,PetscRealPart(tmpvert[2*i]),PetscRealPart(tmpvert[2*i+1]));
      }
      MPI_Send(tmpvert,2*mmlocal_vert[j],MPIU_SCALAR,j,0,PETSC_COMM_WORLD);
    }
    PetscFree(tmpvert);
    PetscFree(mmlocal_vert);
245: fscanf(fd,"Number Elements = %dn",&n_ele);
246: printf("Number of grid elements %dn",n_ele);
248: /*
249: Broadcast number of elements to all processors
250: */
251: MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
252: mlocal_ele = n_ele/size + ((n_ele % size) > 0);
254: /*
255: Allocate enough room for the first processor to keep track of how many
256: elements are assigned to each processor.
257: */
258: PetscMalloc(size*sizeof(int),&mmlocal_ele);
259: for (i=0; i<size; i++) {
260: mmlocal_ele[i] = n_ele/size + ((n_ele % size) > i);
261: printf("Processor %d assigned %d elementsn",i,mmlocal_ele[i]);
262: }
263:
264: /*
265: read in element information for the first processor
266: */
267: PetscMalloc(3*mmlocal_ele[0]*sizeof(int),&ele);
268: printf("Elements assigned to processor 0n");
269: for (i=0; i<mlocal_ele; i++) {
270: fscanf(fd,"%d %d %d %dn",&cnt,ele+3*i,ele+3*i+1,ele+3*i+2);
271: printf("%d %d %d %dn",cnt,ele[3*i],ele[3*i+1],ele[3*i+2]);
272: }
    /*
       Read in elements for all the other processors
    */
    PetscMalloc(3*mmlocal_ele[0]*sizeof(int),&tmpele);
    for (j=1; j<size; j++) {
      printf("Elements assigned to processor %d\n",j);
      for (i=0; i<mmlocal_ele[j]; i++) {
        fscanf(fd,"%d %d %d %d\n",&cnt,tmpele+3*i,tmpele+3*i+1,tmpele+3*i+2);
        printf("%d %d %d %d\n",cnt,tmpele[3*i],tmpele[3*i+1],tmpele[3*i+2]);
      }
      MPI_Send(tmpele,3*mmlocal_ele[j],MPI_INT,j,0,PETSC_COMM_WORLD);
    }
    PetscFree(tmpele);

    /*
       Read in the element neighbors for processor 0.
       We don't know how many entries ja[] will need, so we allocate
       3*the number of local elements, which is the maximum it could be.
    */
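    /*
       Illustrative example (not from the grid file): if local element 0 neighbors
       elements 1 and 3, and local element 1 neighbors only element 0, then the
       compressed row storage built below would begin

           ia = {0, 2, 3, ...}     ja = {1, 3, 0, ...}

       i.e. ia[i+1]-ia[i] is the number of neighbors of element i and
       ja[ia[i]..ia[i+1]-1] lists them.
    */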
    PetscMalloc((mlocal_ele+1)*sizeof(int),&ia);
    PetscMalloc((3*mlocal_ele+1)*sizeof(int),&ja);
    net   = 0;
    ia[0] = 0;
    printf("Element neighbors on processor 0\n");
    fgets(msg,128,fd);
    for (i=0; i<mlocal_ele; i++) {
      fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
      printf("%d %d %d %d\n",cnt,a1,a2,a3);
      if (a1 >= 0) {ja[net++] = a1;}
      if (a2 >= 0) {ja[net++] = a2;}
      if (a3 >= 0) {ja[net++] = a3;}
      ia[i+1] = net;
    }

    printf("ia values for processor 0\n");
    for (i=0; i<mlocal_ele+1; i++) {
      printf("%d ",ia[i]);
    }
    printf("\n");
    printf("ja values for processor 0\n");
    for (i=0; i<ia[mlocal_ele]; i++) {
      printf("%d ",ja[i]);
    }
    printf("\n");

    /*
       Read in element neighbor information for all other processors
    */
    PetscMalloc((mlocal_ele+1)*sizeof(int),&iatmp);
    PetscMalloc((3*mlocal_ele+1)*sizeof(int),&jatmp);
    for (j=1; j<size; j++) {
      net      = 0;
      iatmp[0] = 0;
      printf("Element neighbors on processor %d\n",j);
      for (i=0; i<mmlocal_ele[j]; i++) {
        fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
        printf("%d %d %d %d\n",cnt,a1,a2,a3);
        if (a1 >= 0) {jatmp[net++] = a1;}
        if (a2 >= 0) {jatmp[net++] = a2;}
        if (a3 >= 0) {jatmp[net++] = a3;}
        iatmp[i+1] = net;
      }

      printf("ia values for processor %d\n",j);
      for (i=0; i<mmlocal_ele[j]+1; i++) {
        printf("%d ",iatmp[i]);
      }
      printf("\n");
      printf("ja values for processor %d\n",j);
      for (i=0; i<iatmp[mmlocal_ele[j]]; i++) {
        printf("%d ",jatmp[i]);
      }
      printf("\n");

      /* send the adjacency graph off to the appropriate processor */
      MPI_Send(iatmp,mmlocal_ele[j]+1,MPI_INT,j,0,PETSC_COMM_WORLD);
      MPI_Send(jatmp,iatmp[mmlocal_ele[j]],MPI_INT,j,0,PETSC_COMM_WORLD);
    }
    PetscFree(iatmp);
    PetscFree(jatmp);
    PetscFree(mmlocal_ele);

    fclose(fd);
  } else {
    /*
       We are not the zeroth processor so we do not open the file;
       rather we wait for processor 0 to send us our data.
    */

    /* receive total number of vertices */
    MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_vert = n_vert/size + ((n_vert % size) > rank);

    /* receive vertices */
    PetscMalloc(2*(mlocal_vert+1)*sizeof(PetscScalar),&vert);
    MPI_Recv(vert,2*mlocal_vert,MPIU_SCALAR,0,0,PETSC_COMM_WORLD,&status);

    /* receive total number of elements */
    MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
    mlocal_ele = n_ele/size + ((n_ele % size) > rank);

    /* receive elements */
    PetscMalloc(3*(mlocal_ele+1)*sizeof(int),&ele);
    MPI_Recv(ele,3*mlocal_ele,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

    /* receive element adjacency graph */
    PetscMalloc((mlocal_ele+1)*sizeof(int),&ia);
    MPI_Recv(ia,mlocal_ele+1,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

    PetscMalloc((ia[mlocal_ele]+1)*sizeof(int),&ja);
    MPI_Recv(ja,ia[mlocal_ele],MPI_INT,0,0,PETSC_COMM_WORLD,&status);
  }
  gdata->n_vert      = n_vert;
  gdata->n_ele       = n_ele;
  gdata->mlocal_vert = mlocal_vert;
  gdata->mlocal_ele  = mlocal_ele;
  gdata->ele         = ele;
  gdata->vert        = vert;

  gdata->ia          = ia;
  gdata->ja          = ja;

  return(0);
}
#undef __FUNCT__
#define __FUNCT__ "DataPartitionElements"
/*
      Given the grid data spread across the processors, determines a
   new partitioning of the CELLS (elements) to reduce the number of cut edges between
   cells (elements).
*/
int DataPartitionElements(GridData *gdata)
{
  Mat             Adj;       /* adjacency matrix */
  int             *ia,*ja;
  int             mlocal_ele,n_ele,ierr;
  MatPartitioning part;
  IS              isnewproc;

  n_ele      = gdata->n_ele;
  mlocal_ele = gdata->mlocal_ele;

  ia = gdata->ia;
  ja = gdata->ja;

  /*
      Create the adjacency graph matrix
  */
  MatCreateMPIAdj(PETSC_COMM_WORLD,mlocal_ele,n_ele,ia,ja,PETSC_NULL,&Adj);

  /*
      Create the partitioning object
  */
  MatPartitioningCreate(PETSC_COMM_WORLD,&part);
  MatPartitioningSetAdjacency(part,Adj);
  MatPartitioningSetFromOptions(part);
  MatPartitioningApply(part,&isnewproc);
  MatPartitioningDestroy(part);

  /*
      isnewproc - indicates for each local element the new processor it is assigned to
  */
  PetscPrintf(PETSC_COMM_WORLD,"New processor assignment for each element\n");
  ISView(isnewproc,PETSC_VIEWER_STDOUT_WORLD);
  gdata->isnewproc = isnewproc;

  /*
      Free the adjacency graph data structures
  */
  MatDestroy(Adj);

  return(0);
}
#undef __FUNCT__
#define __FUNCT__ "DataMoveElements"
/*
      Moves the grid element data to be on the correct processor for the new
   element partitioning.
*/
int DataMoveElements(GridData *gdata)
{
  int         ierr,*counts,rank,size,i,*idx;
  Vec         vele,veleold;
  PetscScalar *array;
  IS          isscat,isnum;
  VecScatter  vecscat;

  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

  /*
      Determine how many elements are assigned to each processor
  */
  PetscMalloc(size*sizeof(int),&counts);
  ierr = ISPartitioningCount(gdata->isnewproc,counts);

  /*
     Create a vector to contain the newly ordered element information

     Note: we use vectors to communicate this data since we can use the powerful
     VecScatter routines to get the data to the correct location. This is a little
     wasteful since the vectors hold double precision numbers instead of integers,
     but since this is just a setup phase in the entire numerical computation that
     is called only once, it is not a measurable performance bottleneck.
  */
  VecCreate(PETSC_COMM_WORLD,&vele);
  VecSetSizes(vele,3*counts[rank],PETSC_DECIDE);
  VecSetFromOptions(vele);

  /*
      Create an index set from the isnewproc index set to indicate the mapping TO
  */
  ISPartitioningToNumbering(gdata->isnewproc,&isnum);
  ISDestroy(gdata->isnewproc);

  /*
      There are three data items per cell (element): the integer global numbers of its
      three vertices (converted to double so the scatter can be used). One can think of
      the vectors as having a block size of 3, with one index in idx[] for each block.
  */
  ISGetIndices(isnum,&idx);
  for (i=0; i<gdata->mlocal_ele; i++) {
    idx[i] *= 3;
  }
  ISCreateBlock(PETSC_COMM_WORLD,3,gdata->mlocal_ele,idx,&isscat);
  ISRestoreIndices(isnum,&idx);
  ISDestroy(isnum);

  /*
      Create a vector to contain the element information in the old (current) ordering
  */
  VecCreateSeq(PETSC_COMM_SELF,3*gdata->mlocal_ele,&veleold);
  VecGetArray(veleold,&array);
  for (i=0; i<3*gdata->mlocal_ele; i++) {
    array[i] = gdata->ele[i];
  }
  VecRestoreArray(veleold,&array);

  /*
      Scatter the element vertex information to the correct processor
  */
  VecScatterCreate(veleold,PETSC_NULL,vele,isscat,&vecscat);
  ISDestroy(isscat);
  VecScatterBegin(veleold,vele,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterEnd(veleold,vele,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterDestroy(vecscat);
  VecDestroy(veleold);

  /*
      Put the element vertex data into a new allocation of gdata->ele
  */
  PetscFree(gdata->ele);
  gdata->mlocal_ele = counts[rank];
  PetscFree(counts);
  PetscMalloc(3*gdata->mlocal_ele*sizeof(int),&gdata->ele);
  VecGetArray(vele,&array);
  for (i=0; i<3*gdata->mlocal_ele; i++) {
    gdata->ele[i] = (int)PetscRealPart(array[i]);
  }
  VecRestoreArray(vele,&array);
  VecDestroy(vele);

  PetscPrintf(PETSC_COMM_WORLD,"Old vertex numbering in new element ordering\n");
  PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
  for (i=0; i<gdata->mlocal_ele; i++) {
    PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
                            gdata->ele[3*i+2]);
  }
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  return(0);
}
#undef __FUNCT__
#define __FUNCT__ "DataPartitionVertices"
/*
      Given the newly partitioned cells (elements), this routine partitions the
   vertices.

   The code is not completely scalable since it requires
   1) O(n_vert) bits of memory per processor
   2) O(size) stages of communication, each of size O(n_vert) bits
   3) sequential execution (each processor marks its vertices ONLY after all processors
      to the left have marked theirs)
   4) O(n_vert) work on the last processor

   The algorithm also does not take advantage of vertices that are "interior" to a
   processor's elements (that is, not contained in any element on another processor).
   A better algorithm would first have all processors determine their "interior" vertices and
   make sure they are retained on that processor before listing "boundary" vertices.

   The algorithm is:
       a) each processor waits for a message from the left containing the mask of all marked vertices
       b) it loops over all local elements, generating a list of vertices it will
          claim (not claiming ones that have already been marked in the bit-array);
          it claims at most n_vert/size vertices
       c) it sends the mask on to the right

   This is a quick-and-dirty implementation; it should work fine for many problems,
   but will need to be replaced once profiling shows that it takes a large amount of
   time. An advantage is that it requires no searching or sorting.
*/
int DataPartitionVertices(GridData *gdata)
{
  int        n_vert = gdata->n_vert,ierr,*localvert;
  int        rank,size,mlocal_ele = gdata->mlocal_ele,*ele = gdata->ele,i,j,nlocal = 0,nmax;
  PetscBT    mask;
  MPI_Status status;

  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);

  /*
      Allocate space to store the bit-array indicating which vertices have been marked
  */
  PetscBTCreate(n_vert,mask);

  /*
     All processors except the last can have a maximum of n_vert/size vertices assigned
     (because the last processor needs to handle any leftovers)
  */
  nmax = n_vert/size;
  if (rank == size-1) {
    nmax = n_vert;
  }

  /*
     Receive the list of marked vertices from the left
  */
  if (rank) {
    MPI_Recv(mask,PetscBTLength(n_vert),MPI_CHAR,rank-1,0,PETSC_COMM_WORLD,&status);
  }

  if (rank == size-1) {
    /* last processor gets all the rest */
    for (i=0; i<n_vert; i++) {
      if (!PetscBTLookup(mask,i)) {
        nlocal++;
      }
    }
    nmax = nlocal;
  }

  /*
     Now that we know how many vertices are local, allocate enough space for them and mark them
  */
  PetscMalloc((nmax+1)*sizeof(int),&localvert);

  /* generate the local list and fill in the mask */
  nlocal = 0;
  if (rank < size-1) {
    /* count my vertices */
    for (i=0; i<mlocal_ele; i++) {
      for (j=0; j<3; j++) {
        if (!PetscBTLookupSet(mask,ele[3*i+j])) {
          localvert[nlocal++] = ele[3*i+j];
          if (nlocal >= nmax) goto foundenough2;
        }
      }
    }
    foundenough2:;
  } else {
    /* last processor gets all the rest */
    for (i=0; i<n_vert; i++) {
      if (!PetscBTLookup(mask,i)) {
        localvert[nlocal++] = i;
      }
    }
  }

  /*
      Send the bit mask on to the next processor
  */
  if (rank < size-1) {
    MPI_Send(mask,PetscBTLength(n_vert),MPI_CHAR,rank+1,0,PETSC_COMM_WORLD);
  }
  PetscBTDestroy(mask);

  gdata->localvert = localvert;
  gdata->nlocal    = nlocal;

  /* print the lists of owned vertices */
  PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Number vertices assigned %d\n",rank,nlocal);
  PetscSynchronizedFlush(PETSC_COMM_WORLD);
  PetscIntView(nlocal,localvert,PETSC_VIEWER_STDOUT_WORLD);

  return(0);
}
#undef __FUNCT__
#define __FUNCT__ "DataMoveVertices"
/*
     Given the partitioning of the vertices, renumbers the element vertex lists for the
   new vertex numbering and moves the vertex coordinate values to the correct processor
*/
int DataMoveVertices(GridData *gdata)
{
  AO          ao;
  int         ierr,rank,i;
  Vec         vert,overt;
  VecScatter  vecscat;
  IS          isscat;
  PetscScalar *avert;

  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

  /* ---------------------------------------------------------------------
      Create a global reordering of the vertex numbers
  */
  AOCreateBasic(PETSC_COMM_WORLD,gdata->nlocal,gdata->localvert,PETSC_NULL,&ao);

  /*
     Change the element vertex information to the new vertex numbering
  */
  AOApplicationToPetsc(ao,3*gdata->mlocal_ele,gdata->ele);
  PetscPrintf(PETSC_COMM_WORLD,"New vertex numbering in new element ordering\n");
  PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
  for (i=0; i<gdata->mlocal_ele; i++) {
    PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
                            gdata->ele[3*i+2]);
  }
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  /*
     Destroy the AO that is no longer needed
  */
  AODestroy(ao);

  /* --------------------------------------------------------------------
      Finally ship the vertex coordinate information to its owning process;
      note, we do this in a way very similar to what was done for the element info
  */
  /* create a vector to contain the newly ordered vertex information */
  VecCreateSeq(PETSC_COMM_SELF,2*gdata->nlocal,&vert);

  /* create a vector to contain the vertex information in the old ordering */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,
                        &overt);

  /*
      There are two data items per vertex, the x and y coordinates (i.e. one can think
      of the vectors as having a block size of 2, with one index in localvert[] for each block)
  */
  for (i=0; i<gdata->nlocal; i++) gdata->localvert[i] *= 2;
  ISCreateBlock(PETSC_COMM_WORLD,2,gdata->nlocal,gdata->localvert,&isscat);
  PetscFree(gdata->localvert);

  /*
      Scatter the vertex coordinate information to the correct processor
  */
  VecScatterCreate(overt,isscat,vert,PETSC_NULL,&vecscat);
  ISDestroy(isscat);
  VecScatterBegin(overt,vert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterEnd(overt,vert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterDestroy(vecscat);

  VecDestroy(overt);
  PetscFree(gdata->vert);

  /*
      Put the resulting vertex information into the gdata->vert array
  */
  PetscMalloc(2*gdata->nlocal*sizeof(PetscScalar),&gdata->vert);
  VecGetArray(vert,&avert);
  PetscMemcpy(gdata->vert,avert,2*gdata->nlocal*sizeof(PetscScalar));
  VecRestoreArray(vert,&avert);
  gdata->mlocal_vert = gdata->nlocal;
  VecDestroy(vert);

  PetscPrintf(PETSC_COMM_WORLD,"Vertex coordinates in new numbering\n");
  for (i=0; i<2*gdata->mlocal_vert; i++) {
    PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%g\n",PetscRealPart(gdata->vert[i]));
  }
  PetscSynchronizedFlush(PETSC_COMM_WORLD);

  return(0);
}
#undef __FUNCT__
#define __FUNCT__ "DataDestroy"
int DataDestroy(GridData *gdata)
{
  PetscFree(gdata->ele);
  PetscFree(gdata->vert);
  return(0);
}
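#undef __FUNCT__
#define __FUNCT__ "DataGatherElementCoordinates"
/*
     Illustrative sketch (NOT part of the original example) of the vertex-coordinate
   scatter mentioned in the introductory comments: gathering, onto each processor, the
   x,y coordinates of every vertex of its locally "owned" elements, as one would need
   to compute the local element stiffness matrices and load vectors. The routine name
   and the output array elecoords (2 coordinates per element vertex, 6 per element) are
   hypothetical; only calls already used above are employed, and it assumes the state of
   gdata after DataMoveVertices() has been called.
*/
int DataGatherElementCoordinates(GridData *gdata,PetscScalar **elecoords)
{
  int         i,*idx,nidx = 3*gdata->mlocal_ele;
  Vec         overt,lcoords;
  IS          isscat;
  VecScatter  vecscat;
  PetscScalar *array;

  /* global (new) vertex numbers of all vertices of the local elements; duplicates are fine */
  PetscMalloc((nidx+1)*sizeof(int),&idx);
  for (i=0; i<nidx; i++) idx[i] = 2*gdata->ele[i];   /* scaled by the block size of 2 (x,y) */

  /* parallel vector wrapping the owned coordinates (2 values per owned vertex) */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&overt);

  /* sequential vector that will receive the 2 coordinates of each requested vertex */
  VecCreateSeq(PETSC_COMM_SELF,2*nidx,&lcoords);

  /* blocked index set: one block of 2 scalars (x,y) per element vertex */
  ISCreateBlock(PETSC_COMM_WORLD,2,nidx,idx,&isscat);
  PetscFree(idx);

  /* scatter the coordinates from their owning processors to the local work vector */
  VecScatterCreate(overt,isscat,lcoords,PETSC_NULL,&vecscat);
  ISDestroy(isscat);
  VecScatterBegin(overt,lcoords,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterEnd(overt,lcoords,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterDestroy(vecscat);
  VecDestroy(overt);

  /* copy the result into a plain array owned by the caller */
  PetscMalloc((2*nidx+1)*sizeof(PetscScalar),elecoords);
  VecGetArray(lcoords,&array);
  PetscMemcpy(*elecoords,array,2*nidx*sizeof(PetscScalar));
  VecRestoreArray(lcoords,&array);
  VecDestroy(lcoords);

  return(0);
}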