#include <string>
#include <iostream>
#include <vector>

#include <mpi.h>

#include "Mapper2D.h"
#include "Parameter.h"
#include "utilities.h"

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the process
    int myRank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

    MPI_Status status;
    Timer timer;
2. Simulation Parameters and Grid Setup
    constexpr SimulationParameter parameter;

    // The fixed number of partitions in x and y direction, e.g. 2x2 = 4.
    // The total number of processes has to match the call of mpirun,
    // in this case: mpirun -np 4 ./Laplace_mpi
    constexpr int numPartsX = 2;
    constexpr int numPartsY = 2;

    constexpr int localNx = parameter.gridNx / numPartsX; // integer division -> may truncate
    constexpr int localNy = parameter.gridNy / numPartsY;

    // Because of the truncation, the effective global grid size may differ from the one
    // specified in Parameter.h, so we recompute it from the local sizes.
    constexpr int realGridNx = localNx * numPartsX;
    constexpr int realGridNy = localNy * numPartsY;

    // innerGrid describes the local grid of dimension localNx * localNy
    // (constexpr: the object is created at compile time).
    constexpr Mapper2D innerGrid(localNx, localNy);

    // Derive the 2D process coordinates from the 1D MPI rank.
    Mapper2D processTopology = Mapper2D(numPartsX, numPartsY);
    int myXRank = processTopology.xForPos(myRank); // xForPos is a method of Mapper2D
    int myYRank = processTopology.yForPos(myRank);

    // The entire local grid has a ghost layer on each side.
    constexpr Mapper2D entireGrid(innerGrid.nx() + 2, innerGrid.ny() + 2);
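Mapper2D.h itself is not shown here. The following is only a hypothetical sketch of the class, reconstructed from how it is used in this file (nx(), ny(), size(), pos(x, y), xForPos, yForPos, constexpr construction); the real header may differ in details such as the storage order.

// Hypothetical sketch of Mapper2D, assuming row-major storage (x runs fastest).
class Mapper2D
{
public:
    constexpr Mapper2D(size_t nx, size_t ny) : nx_(nx), ny_(ny) {}

    constexpr size_t nx() const { return nx_; }
    constexpr size_t ny() const { return ny_; }
    constexpr size_t size() const { return nx_ * ny_; }

    constexpr size_t pos(size_t x, size_t y) const { return y * nx_ + x; } // linear index
    constexpr size_t xForPos(size_t pos) const { return pos % nx_; }       // inverse mapping
    constexpr size_t yForPos(size_t pos) const { return pos / nx_; }

private:
    size_t nx_;
    size_t ny_;
};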
3. Buffers and Data Initialization
    /* receive buffers for the ghost-layer data */
    double* leftReceiveBuffer   = new double[innerGrid.ny()];
    double* rightReceiveBuffer  = new double[innerGrid.ny()];
    double* topReceiveBuffer    = new double[innerGrid.nx()];
    double* bottomReceiveBuffer = new double[innerGrid.nx()];

    /* send buffers for data from the inner grid
       (innerGrid.ny() == entireGrid.ny() - 2, innerGrid.nx() == entireGrid.nx() - 2) */
    double* leftSendBuffer   = new double[innerGrid.ny()];
    double* rightSendBuffer  = new double[innerGrid.ny()];
    double* topSendBuffer    = new double[innerGrid.nx()]; // note: nx entries, not ny
    double* bottomSendBuffer = new double[innerGrid.nx()];

    std::vector<double> oldData(entireGrid.size());
    std::vector<double> newData(entireGrid.size());

    /* initialization */
    for (size_t i = 0; i < entireGrid.size(); i++)
        oldData[i] = 0.0;

    /* In the parallel version also initialize the send buffers here. */
    for (size_t i = 0; i < innerGrid.ny(); i++) {
        leftSendBuffer[i]  = 0.0;
        rightSendBuffer[i] = 0.0;
    }
    for (size_t i = 0; i < innerGrid.nx(); i++) {
        topSendBuffer[i]    = 0.0;
        bottomSendBuffer[i] = 0.0;
    }
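As a side note (this is not what the code above does), the manually managed new[]/delete[] buffers could also be std::vector<double>, which zero-initializes them and removes the need for the delete[] calls at the end. A minimal standalone sketch with placeholder sizes:

// Sketch: halo buffers as std::vector instead of raw new[] (placeholder sizes).
#include <cstdio>
#include <vector>

int main()
{
    const std::size_t localNx = 64, localNy = 64; // placeholders for illustration only

    // zero-initialized buffers; no matching delete[] needed
    std::vector<double> leftRecv(localNy, 0.0), rightRecv(localNy, 0.0);
    std::vector<double> topRecv(localNx, 0.0),  bottomRecv(localNx, 0.0);
    std::vector<double> leftSend(localNy, 0.0), rightSend(localNy, 0.0);
    std::vector<double> topSend(localNx, 0.0),  bottomSend(localNx, 0.0);

    // An MPI call would then take the raw pointer via .data(), e.g.
    // MPI_Isend(topSend.data(), localNx, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD, &request);
    std::printf("buffers ready: %zu / %zu entries\n", leftSend.size(), topSend.size());
    return 0;
}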
4. Boundary Condition Setup
    /* In the parallel version the following variables have to be calculated from the
       process coordinates. The name "cell" is used as an equivalent for "process".
       A boundary cell has no neighbour on that side, hence no communication is needed there. */
    bool isLeftBoundaryCell   = (myXRank == 0);
    bool isRightBoundaryCell  = (myXRank == numPartsX - 1);
    bool isTopBoundaryCell    = (myYRank == 0);
    bool isBottomBoundaryCell = (myYRank == numPartsY - 1);

    /* set boundary conditions */
    if (isLeftBoundaryCell)
        for (size_t i = 0; i < innerGrid.ny(); i++)
            leftReceiveBuffer[i] = parameter.bcLeft;
    if (isRightBoundaryCell)
        for (size_t i = 0; i < innerGrid.ny(); i++)
            rightReceiveBuffer[i] = parameter.bcRight;
    if (isTopBoundaryCell)
        for (size_t i = 0; i < innerGrid.nx(); i++)
            topReceiveBuffer[i] = parameter.bcTop;
    if (isBottomBoundaryCell)
        for (size_t i = 0; i < innerGrid.nx(); i++)
            bottomReceiveBuffer[i] = parameter.bcBottom;
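The boundary flags and neighbour ranks can also be obtained from MPI's built-in Cartesian topology support instead of the hand-written Mapper2D mapping. The following standalone sketch is an alternative idea, not the approach used in this code; the 2x2 layout in dims is an assumption. MPI_Cart_shift returns MPI_PROC_NULL where no neighbour exists, which plays the role of the is...BoundaryCell flags, since sends and receives to MPI_PROC_NULL are legal no-ops.

// Standalone sketch: 2D process topology via MPI_Cart_create / MPI_Cart_shift.
// Run with e.g. "mpirun -np 4 ./cart_demo".
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int dims[2]    = {2, 2}; // numPartsX x numPartsY (assumed here)
    int periods[2] = {0, 0}; // non-periodic: missing neighbours become MPI_PROC_NULL
    MPI_Comm cartComm;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cartComm);

    int rank, coords[2];
    MPI_Comm_rank(cartComm, &rank);
    MPI_Cart_coords(cartComm, rank, 2, coords);

    int left, right, top, bottom;
    MPI_Cart_shift(cartComm, 0, 1, &left, &right);  // neighbours along the first dimension
    MPI_Cart_shift(cartComm, 1, 1, &top, &bottom);  // neighbours along the second dimension

    std::printf("rank %d at (%d,%d): left=%d right=%d top=%d bottom=%d\n",
                rank, coords[0], coords[1], left, right, top, bottom);

    MPI_Comm_free(&cartComm);
    MPI_Finalize();
    return 0;
}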
5. Iteration and Computation
    int iteration = 0;
    timer.startNupsTimer();
    bool done = false;

    while (!done) {
        double error = 0.0;
        double diff;

        /* In the parallel version: do the send and receive here.
           Prefer doing it in the background (nonblocking / async).
           Watch out for deadlocks! */

        MPI_Request requests[8]; // holds the requests of the non-blocking communication:
                                 // at most 8 (4 x MPI_Isend and 4 x MPI_Irecv for top, bottom, left, right)
        int reqCount = 0;        // number of requests actually posted

        // Tags: 0 = data travelling to the left, 1 = to the right, 2 = upwards, 3 = downwards.
        // Each receive therefore uses the tag of the matching send posted by the neighbour,
        // which travels in the opposite direction to our own send on that side.

        if (!isLeftBoundaryCell) { // communicate only if the neighbouring process/cell exists, i.e. we are not at the boundary
            for (size_t i = 0; i < innerGrid.ny(); i++) {
                leftSendBuffer[i] = oldData[entireGrid.pos(1, i + 1)]; // i + 1 skips the ghost layer
            }
            // myXRank - 1 is the process to the left of the current one
            MPI_Isend(leftSendBuffer, innerGrid.ny(), MPI_DOUBLE,
                      processTopology.pos(myXRank - 1, myYRank), 0, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(leftReceiveBuffer, innerGrid.ny(), MPI_DOUBLE,
                      processTopology.pos(myXRank - 1, myYRank), 1, MPI_COMM_WORLD, &requests[reqCount++]);
        }

        if (!isRightBoundaryCell) {
            for (size_t i = 0; i < innerGrid.ny(); i++) {
                rightSendBuffer[i] = oldData[entireGrid.pos(innerGrid.nx(), i + 1)];
            }
            MPI_Isend(rightSendBuffer, innerGrid.ny(), MPI_DOUBLE,
                      processTopology.pos(myXRank + 1, myYRank), 1, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(rightReceiveBuffer, innerGrid.ny(), MPI_DOUBLE,
                      processTopology.pos(myXRank + 1, myYRank), 0, MPI_COMM_WORLD, &requests[reqCount++]);
        }

        // Exchange data with the top and bottom neighbours
        if (!isTopBoundaryCell) {
            for (size_t i = 0; i < innerGrid.nx(); i++) {
                topSendBuffer[i] = oldData[entireGrid.pos(i + 1, 1)];
            }
            MPI_Isend(topSendBuffer, innerGrid.nx(), MPI_DOUBLE,
                      processTopology.pos(myXRank, myYRank - 1), 2, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(topReceiveBuffer, innerGrid.nx(), MPI_DOUBLE,
                      processTopology.pos(myXRank, myYRank - 1), 3, MPI_COMM_WORLD, &requests[reqCount++]);
        }

        if (!isBottomBoundaryCell) {
            for (size_t i = 0; i < innerGrid.nx(); i++) {
                bottomSendBuffer[i] = oldData[entireGrid.pos(i + 1, innerGrid.ny())];
            }
            MPI_Isend(bottomSendBuffer, innerGrid.nx(), MPI_DOUBLE,
                      processTopology.pos(myXRank, myYRank + 1), 3, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(bottomReceiveBuffer, innerGrid.nx(), MPI_DOUBLE,
                      processTopology.pos(myXRank, myYRank + 1), 2, MPI_COMM_WORLD, &requests[reqCount++]);
        }

        MPI_Waitall(reqCount, requests, MPI_STATUSES_IGNORE);
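A deadlock-free alternative to the pairs of MPI_Isend/MPI_Irecv above, mentioned here only for comparison (it is not what this code uses), is MPI_Sendrecv, which combines the matching send and receive into a single blocking call. A minimal standalone sketch for exactly two ranks:

// Standalone sketch (assumes exactly 2 ranks): halo-style exchange with MPI_Sendrecv.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2) MPI_Abort(MPI_COMM_WORLD, 1);

    const int neighbour = 1 - rank;
    double sendValue = 100.0 + rank; // stands in for a send buffer row/column
    double recvValue = 0.0;          // stands in for the ghost-layer buffer

    // Send and receive in one call; MPI handles the ordering internally,
    // so no deadlock occurs even though both ranks call it at the same time.
    MPI_Sendrecv(&sendValue, 1, MPI_DOUBLE, neighbour, 0,
                 &recvValue, 1, MPI_DOUBLE, neighbour, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    std::printf("rank %d received %.1f from rank %d\n", rank, recvValue, neighbour);

    MPI_Finalize();
    return 0;
}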
With four processes arranged in a 2x2 grid, and writing each process as (row, column), which is the transposed order compared with the (myXRank, myYRank) coordinates in the code, the communication pattern is as follows (see also the sketch after this list):
Processor (0,0): Communicates with Processor (1,0) (down) and Processor (0,1) (right).
Processor (0,1): Communicates with Processor (1,1) (down) and Processor (0,0) (left).
Processor (1,0): Communicates with Processor (0,0) (up) and Processor (1,1) (right).
Processor (1,1): Communicates with Processor (0,1) (up) and Processor (1,0) (left).
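To double-check this pattern, the neighbour relations can be enumerated offline. The following plain C++ sketch assumes the same row-major rank numbering that the code appears to use (rank = y * numPartsX + x); that mapping is an assumption about Mapper2D::pos, not something stated in this file.

// Standalone sketch: enumerate the neighbours of each rank in a 2x2 decomposition.
#include <cstdio>

int main()
{
    const int numPartsX = 2, numPartsY = 2;
    auto rankOf = [&](int x, int y) { return y * numPartsX + x; }; // assumed row-major mapping

    for (int y = 0; y < numPartsY; ++y)
        for (int x = 0; x < numPartsX; ++x) {
            std::printf("rank %d (x=%d, y=%d):", rankOf(x, y), x, y);
            if (x > 0)             std::printf("  left -> rank %d",   rankOf(x - 1, y));
            if (x < numPartsX - 1) std::printf("  right -> rank %d",  rankOf(x + 1, y));
            if (y > 0)             std::printf("  top -> rank %d",    rankOf(x, y - 1));
            if (y < numPartsY - 1) std::printf("  bottom -> rank %d", rankOf(x, y + 1));
            std::printf("\n");
        }
    return 0;
}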
        // Update the ghost layers after receiving the data
        // (the loops run over the inner rows/columns, i.e. indices 1 ... n-2).
        if (!isTopBoundaryCell) {
            for (size_t x = 1; x < entireGrid.nx() - 1; x++) {
                oldData[entireGrid.pos(x, 0)] = topReceiveBuffer[x - 1];
            }
        }
        if (!isBottomBoundaryCell) {
            for (size_t x = 1; x < entireGrid.nx() - 1; x++) {
                oldData[entireGrid.pos(x, entireGrid.ny() - 1)] = bottomReceiveBuffer[x - 1];
            }
        }
        if (!isLeftBoundaryCell) {
            for (size_t y = 1; y < entireGrid.ny() - 1; y++) {
                oldData[entireGrid.pos(0, y)] = leftReceiveBuffer[y - 1];
            }
        }
        if (!isRightBoundaryCell) {
            for (size_t y = 1; y < entireGrid.ny() - 1; y++) {
                oldData[entireGrid.pos(entireGrid.nx() - 1, y)] = rightReceiveBuffer[y - 1];
            }
        }

        /* first do the calculation for the cells that do not touch the ghost layers */
        for (size_t y = 2; y < entireGrid.ny() - 2; y++)
            for (size_t x = 2; x < entireGrid.nx() - 2; x++) {
                newData[entireGrid.pos(x, y)] = 0.25 * (oldData[entireGrid.pos(x - 1, y)] +
                                                        oldData[entireGrid.pos(x + 1, y)] +
                                                        oldData[entireGrid.pos(x, y - 1)] +
                                                        oldData[entireGrid.pos(x, y + 1)]);
                diff  = newData[entireGrid.pos(x, y)] - oldData[entireGrid.pos(x, y)];
                error = error + diff * diff;
            }

        /* now the ghost layers should have been received ... */
        /* insert the ghost layers (for boundary cells these buffers still hold the boundary conditions) */
        for (size_t x = 1; x < entireGrid.nx() - 1; x++) {
            oldData[entireGrid.pos(x, 0)]                   = topReceiveBuffer[x - 1];
            oldData[entireGrid.pos(x, entireGrid.ny() - 1)] = bottomReceiveBuffer[x - 1];
        }
        for (size_t y = 1; y < entireGrid.ny() - 1; y++) {
            oldData[entireGrid.pos(0, y)]                   = leftReceiveBuffer[y - 1];
            oldData[entireGrid.pos(entireGrid.nx() - 1, y)] = rightReceiveBuffer[y - 1];
        }

        /* Now do the rest of the calculation, using the ghost layers. */
        for (size_t x = 1; x < entireGrid.nx() - 1; x++) {
            // top row
            newData[entireGrid.pos(x, 1)] = 0.25 * (oldData[entireGrid.pos(x - 1, 1)] +
                                                    oldData[entireGrid.pos(x + 1, 1)] +
                                                    oldData[entireGrid.pos(x, 0)] +
                                                    oldData[entireGrid.pos(x, 2)]);
            diff  = newData[entireGrid.pos(x, 1)] - oldData[entireGrid.pos(x, 1)];
            error = error + diff * diff;
            topSendBuffer[x - 1] = newData[entireGrid.pos(x, 1)];

            // bottom row
            newData[entireGrid.pos(x, entireGrid.ny() - 2)] = 0.25 * (oldData[entireGrid.pos(x - 1, entireGrid.ny() - 2)] +
                                                                      oldData[entireGrid.pos(x + 1, entireGrid.ny() - 2)] +
                                                                      oldData[entireGrid.pos(x, entireGrid.ny() - 3)] +
                                                                      oldData[entireGrid.pos(x, entireGrid.ny() - 1)]);
            diff  = newData[entireGrid.pos(x, entireGrid.ny() - 2)] - oldData[entireGrid.pos(x, entireGrid.ny() - 2)];
            error = error + diff * diff;
            bottomSendBuffer[x - 1] = newData[entireGrid.pos(x, entireGrid.ny() - 2)];
        }

        /* Insert the corner values into the left/right send buffers */
        leftSendBuffer[0]                    = newData[entireGrid.pos(1, 1)];
        leftSendBuffer[entireGrid.ny() - 3]  = newData[entireGrid.pos(1, entireGrid.ny() - 2)];
        rightSendBuffer[0]                   = newData[entireGrid.pos(entireGrid.nx() - 2, 1)];
        rightSendBuffer[entireGrid.ny() - 3] = newData[entireGrid.pos(entireGrid.nx() - 2, entireGrid.ny() - 2)];

        // left and right columns
        for (size_t y = 2; y < entireGrid.ny() - 2; y++) {
            newData[entireGrid.pos(1, y)] = 0.25 * (oldData[entireGrid.pos(1, y - 1)] +
                                                    oldData[entireGrid.pos(1, y + 1)] +
                                                    oldData[entireGrid.pos(0, y)] +
                                                    oldData[entireGrid.pos(2, y)]);
            diff  = newData[entireGrid.pos(1, y)] - oldData[entireGrid.pos(1, y)];
            error = error + diff * diff;
            leftSendBuffer[y - 1] = newData[entireGrid.pos(1, y)];

            newData[entireGrid.pos(entireGrid.nx() - 2, y)] = 0.25 * (oldData[entireGrid.pos(entireGrid.nx() - 3, y)] +
                                                                      oldData[entireGrid.pos(entireGrid.nx() - 1, y)] +
                                                                      oldData[entireGrid.pos(entireGrid.nx() - 2, y - 1)] +
                                                                      oldData[entireGrid.pos(entireGrid.nx() - 2, y + 1)]);
            diff  = newData[entireGrid.pos(entireGrid.nx() - 2, y)] - oldData[entireGrid.pos(entireGrid.nx() - 2, y)];
            error = error + diff * diff;
            rightSendBuffer[y - 1] = newData[entireGrid.pos(entireGrid.nx() - 2, y)];
        }

        std::swap(oldData, newData);

        /* Stop in case of little change. In the parallel version the local errors have to be
           combined, otherwise the processes would stop at different iterations and the
           communication above would deadlock. */
        double globalError = 0.0;
        MPI_Allreduce(&error, &globalError, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        done = (globalError < 1.0e-4);

        iteration++;
        if ((iteration % parameter.outputInterval) == 0) {
            const auto mnups = timer.getMNups(innerGrid.size() * parameter.outputInterval);
            std::cout << "time step: " << iteration << " error: " << globalError
                      << " MNUPS: " << mnups << "\n";
            timer.startNupsTimer();
        }
    }
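In the loop above, MPI_Waitall is called directly after the requests are posted, so communication and computation do not actually overlap, even though the comments (and the split into "inner" and "ghost-adjacent" updates) suggest that intention. The idea of deferring the wait is sketched below as a standalone toy program, not as a change to the solver itself; the work loop is just a placeholder for the interior update.

// Standalone sketch (assumes exactly 2 ranks): overlap communication with local work.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    const int neighbour = 1 - rank;                 // assumes 2 ranks

    double sendValue = 1.0 + rank, recvValue = 0.0; // stands in for the halo buffers
    MPI_Request requests[2];
    MPI_Irecv(&recvValue, 1, MPI_DOUBLE, neighbour, 0, MPI_COMM_WORLD, &requests[0]);
    MPI_Isend(&sendValue, 1, MPI_DOUBLE, neighbour, 0, MPI_COMM_WORLD, &requests[1]);

    // "Interior" work that does not need the received ghost data yet.
    double localWork = 0.0;
    for (int i = 0; i < 1000000; ++i)
        localWork += 0.25 * i;

    // Only now wait; afterwards recvValue (the "ghost layer") may be used.
    MPI_Waitall(2, requests, MPI_STATUSES_IGNORE);
    std::printf("rank %d: received %.1f, local work %.1f\n", rank, recvValue, localWork);

    MPI_Finalize();
    return 0;
}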
6. Output Results and Finalization
    /* Output (only process 0). In the parallel case process 0 needs to collect
       the necessary data for the output from the other processes. */
    if (myRank == 0) {
        double* resultData = new double[realGridNx * realGridNy];
        Mapper2D globalGrid = Mapper2D(realGridNx, realGridNy);

        // copy the own partition (0,0) into the global result array
        for (size_t x = 1; x < entireGrid.nx() - 1; x++)
            for (size_t y = 1; y < entireGrid.ny() - 1; y++)
                resultData[globalGrid.pos(x - 1, y - 1)] = oldData[entireGrid.pos(x, y)];

        // receive the partitions of all other processes
        for (int partX = 0; partX < numPartsX; partX++)
            for (int partY = 0; partY < numPartsY; partY++)
                if (partX || partY) // skip the own partition (0,0)
                {
                    std::cout << "Partition X = " << partX << ", Partition Y = " << partY << std::endl;
                    for (size_t y = 0; y < entireGrid.ny() - 2; y++) // line by line
                        MPI_Recv(resultData + globalGrid.pos(partX * localNx, partY * localNy + y),
                                 entireGrid.nx() - 2, MPI_DOUBLE,
                                 processTopology.pos(partX, partY), 0, MPI_COMM_WORLD, &status);
                }

        writeUCDFile(parameter.outputFileName, resultData, globalGrid);
        delete[] resultData;
    }
    else {
        for (size_t y = 1; y < entireGrid.ny() - 1; y++) // line by line
            MPI_Ssend(oldData.data() + entireGrid.pos(1, y),
                      entireGrid.nx() - 2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
    }

    const auto runtime = timer.getRuntimeSeconds();
    std::cout << "Runtime: " << runtime << " s." << std::endl;
    std::cout << "Average MNUPS: " << timer.getAverageMNups(innerGrid.size() * iteration) << std::endl;

    delete[] leftReceiveBuffer;
    delete[] rightReceiveBuffer;
    delete[] topReceiveBuffer;
    delete[] bottomReceiveBuffer;
    delete[] leftSendBuffer;
    delete[] rightSendBuffer;
    delete[] topSendBuffer;
    delete[] bottomSendBuffer;

    MPI_Finalize();
    return 0;
}
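Since every process prints its own runtime and MNUPS above, the console output contains one line per rank. If a single figure per job is preferred, the per-rank runtimes could first be reduced onto rank 0. This is an optional refinement, not part of the code above; the standalone sketch below uses MPI_Wtime and MPI_Reduce.

// Standalone sketch: one runtime figure for the whole job via MPI_Reduce.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const double t0 = MPI_Wtime();
    // ... the solver iterations would run here ...
    const double localRuntime = MPI_Wtime() - t0;

    double maxRuntime = 0.0; // the slowest rank determines the overall runtime
    MPI_Reduce(&localRuntime, &maxRuntime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0)
        std::printf("Runtime (max over ranks): %f s\n", maxRuntime);

    MPI_Finalize();
    return 0;
}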