// Coalescing.cu
// To measure effects of memory coalescing.
// B. Wilkinson, Jan 30, 2011
//
// Each thread of a 2-D grid of 2-D blocks stores its flattened global thread
// ID into an N x N int array T times; the kernel is timed with CUDA events and
// a sample of the array is printed so the access pattern can be checked.

// NOTE(review): the header names after the four #include directives were
// missing from the reviewed copy. The list below is what this file needs to
// build -- confirm against the original.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>

// Threads per block edge; blocks are BlockSize x BlockSize (16 x 16 = 256
// threads). Fixed, used globally.
#define BlockSize 16

// Largest grid edge accepted from the keyboard. 46340 is floor(sqrt(INT_MAX))
// for 32-bit int, so N * N (N = B * BlockSize) still fits in the int indices
// used by the kernel.
static const int kMaxBlocksPerDim = 46340 / BlockSize;

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Kernel: loads the array with the global thread ID that accesses each location.
//
//   h  device array of N * N ints
//   N  edge length of the array
//   T  number of times the store is repeated (to reduce other time effects)
//
// Expected launch: 2-D grid of 2-D blocks covering at least N x N threads.
// Threads outside the N x N region do nothing.
__global__ void gpu_Comput (int *h, int N, int T)
{
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard so a grid larger than the array cannot write out of bounds.
    if (col >= N || row >= N)
        return;

    int threadID = col + row * N;   // flattened global thread ID

    // Threads that are adjacent in x (consecutive col) write locations N ints
    // apart. This strided pattern is the one being measured; using
    // col + row * N instead would make adjacent threads write adjacent ints.
    int index = row + col * N;

    // NOTE(review): every iteration stores the same value to the same
    // location; an optimizing compiler may merge the stores -- confirm from
    // the generated code if the timing does not scale with T.
    for (int t = 0; t < T; t++)
        h[index] = threadID;
}

// Prints a sample of the N x N host array h: every (N/8)-th element of every
// (N/8)-th row. For N < 8 the step is clamped to 1 (every element is printed)
// instead of looping forever with a step of 0.
void printArray(int *h, int N)
{
    printf("Results of computation, every N/8 numbers, eight numbers\n");

    int step = N / 8;
    if (step < 1)
        step = 1;

    for (int row = 0; row < N; row += step) {
        for (int col = 0; col < N; col += step)
            printf("%6d ", h[(size_t)row * N + col]);
        printf("\n");
    }
}

// Runs one timed experiment with a B x B grid and T kernel iterations.
// Allocates the host and device arrays, times the kernel between the two
// events, copies the result back, prints it, and releases both arrays before
// returning, so repeated runs do not leak memory.
// Returns true on success; on failure a message has been written to stderr.
static bool runExperiment(int B, int T, cudaEvent_t start, cudaEvent_t stop)
{
    int N = B * BlockSize;          // size of data array, given input data
    printf("Array size (and total grid-block size) %d x %d\n", N, N);

    dim3 Block(BlockSize, BlockSize);   // block structure, BlockSize x BlockSize
    dim3 Grid(B, B);                    // grid structure, B x B

    // size_t arithmetic: N * N * sizeof(int) can exceed INT_MAX.
    size_t size = (size_t)N * N * sizeof(int);

    int *h = (int *) malloc(size);      // array on host
    int *dev_h = NULL;                  // array on device
    float elapsed_time_ms1 = 0.0f;
    bool ok = false;

    if (h == NULL) {
        fprintf(stderr, "Host allocation of %llu bytes failed\n",
                (unsigned long long)size);
    } else if (cudaOk(cudaMalloc((void **)&dev_h, size), "cudaMalloc")) {
        // Zero the array (outside the timed region) so the printed values are
        // defined even when T <= 0 and the kernel stores nothing.
        ok = cudaOk(cudaMemset(dev_h, 0, size), "cudaMemset")
          && cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");

        if (ok) {
            gpu_Comput<<< Grid, Block >>>(dev_h, N, T);

            // A launch does not return a status itself: cudaGetLastError()
            // reports a bad configuration, and the event synchronize / memcpy
            // below report faults raised while the kernel ran.
            ok = cudaOk(cudaGetLastError(), "gpu_Comput launch")
              && cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
              && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize")
              && cudaOk(cudaEventElapsedTime(&elapsed_time_ms1, start, stop),
                        "cudaEventElapsedTime")
              && cudaOk(cudaMemcpy(h, dev_h, size, cudaMemcpyDeviceToHost),
                        "cudaMemcpy");          // get results to check
        }
    }

    if (ok) {
        printArray(h, N);
        printf("\nTime to calculate results on GPU: %f ms.\n", elapsed_time_ms1);
    }

    // Release per-run buffers; both calls accept NULL.
    cudaFree(dev_h);
    free(h);
    return ok;
}

// Interactive driver: repeatedly reads the grid size and iteration count from
// the keyboard, runs the experiment, and asks whether to repeat.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    int T = 100;        // number of iterations, entered at keyboard
    int B = 1;          // number of blocks, entered at keyboard
    char key = '\0';
    cudaEvent_t start, stop;    // cuda events to measure time

    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)"))
        return EXIT_FAILURE;
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return EXIT_FAILURE;
    }

    /* ------------------------- Keyboard input -------------------------- */
    do {    // loop to repeat complete program
        key = '\0';

        printf("Grid Structure 2-D grid, 2-D blocks\n");
        printf("Blocks fixed at %d x %d threads, %d threads per block\n",
               BlockSize, BlockSize, BlockSize * BlockSize);

        printf("Enter number of blocks in grid, each dimension, currently %d\n", B);
        if (scanf("%d", &B) != 1) {
            fprintf(stderr, "Invalid or missing input, terminating\n");
            break;
        }
        printf("Enter number of iterations, currently %d\n", T);
        if (scanf("%d", &T) != 1) {
            fprintf(stderr, "Invalid or missing input, terminating\n");
            break;
        }

        if (B < 1 || B > kMaxBlocksPerDim) {
            fprintf(stderr, "Number of blocks must be between 1 and %d\n",
                    kMaxBlocksPerDim);
            B = 1;      // back to a usable value for the next prompt
        } else {
            runExperiment(B, T, start, stop);
        }

        /* --------------------- REPEAT PROGRAM INPUT -------------------- */
        printf("\nEnter c to repeat, return to terminate\n");
        // The first read consumes the newline left behind by the preceding
        // %d conversion; the second read is the user's answer.
        if (scanf("%c", &key) != 1 || scanf("%c", &key) != 1)
            key = '\0';
    } while (key == 'c');   // loop of complete program

    /* ------------------------------ clean up --------------------------- */
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}