1. Initialization and MPI Setup

#include <string>
#include <iostream>
#include <vector>
#include <mpi.h>
#include "Mapper2D.h"
#include "Parameter.h"
#include "utilities.h"
 
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv); 
    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 
    // Get the rank of the process
    int myRank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
   
    MPI_Status status;
    Timer timer;

2. Simulation Parameters and Grid Setup

    constexpr SimulationParameter parameter;
    // The fixed number of partitions in x and y direction, e.g. 2x2 = 4.
    // The total number of processes has to match the number passed to mpirun,
    // in this case: mpirun -np 4 ./Laplace_mpi
    constexpr int numPartsX = 2;
    constexpr int numPartsY = 2;
 
    constexpr int localNx = parameter.gridNx / numPartsX; // integer division -> the remainder is truncated
    constexpr int localNy = parameter.gridNy / numPartsY;

    // Hence we recompute the effective grid size: because of the truncation,
    // realGridNx/realGridNy may differ from the values specified in Parameter.h.
    constexpr int realGridNx = localNx * numPartsX;
    constexpr int realGridNy = localNy * numPartsY;
 
    constexpr Mapper2D innerGrid(localNx, localNy); // index mapper for the local inner grid of size localNx * localNy
    // constexpr: the object is constructed at compile time
 
    // Derive the 2D process coordinates from the 1D MPI rank
    Mapper2D processTopology = Mapper2D(numPartsX, numPartsY);
    int myXRank = processTopology.xForPos(myRank);  // xForPos is a method of Mapper2D
    int myYRank = processTopology.yForPos(myRank);
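    // A minimal sketch of the resulting layout, assuming Mapper2D enumerates positions
    // row by row (rank = yRank * numPartsX + xRank), which is how pos/xForPos/yForPos
    // are used throughout this program:
    //
    //   rank 0 -> (xRank, yRank) = (0,0)    rank 1 -> (1,0)
    //   rank 2 -> (xRank, yRank) = (0,1)    rank 3 -> (1,1)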
    
    // The entire grid has a ghost layer on each side.
    constexpr Mapper2D entireGrid(innerGrid.nx() + 2, innerGrid.ny() + 2);
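    // Index layout of entireGrid (local grid plus one ghost/boundary layer on each side):
    //   x = 0 : left ghost column      x = entireGrid.nx() - 1 : right ghost column
    //   y = 0 : top ghost row          y = entireGrid.ny() - 1 : bottom ghost row
    //   x in 1 .. nx-2, y in 1 .. ny-2 : the innerGrid cells that are actually updated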

3. Buffers and Data Initialization

    /* receive buffers for ghost layer data */
    double *leftReceiveBuffer   = new double[innerGrid.ny()];
    double *rightReceiveBuffer  = new double[innerGrid.ny()];
    double *topReceiveBuffer    = new double[innerGrid.nx()];
    double *bottomReceiveBuffer = new double[innerGrid.nx()];
 
    /* send buffers for data from the inner grid */
    double* leftSendBuffer   = new double[entireGrid.ny() - 2]; // entireGrid.ny() - 2 == innerGrid.ny()
    double* rightSendBuffer  = new double[entireGrid.ny() - 2];
    double* topSendBuffer    = new double[entireGrid.nx() - 2]; // top/bottom buffers hold one row, hence size nx
    double* bottomSendBuffer = new double[entireGrid.nx() - 2];
 
    std::vector<double> oldData (entireGrid.size());
    std::vector<double> newData (entireGrid.size());
 
    /* initialization */
    for (size_t i = 0; i < entireGrid.size(); i++)
        oldData[i] = 0.0;
 
    /* In the parallel version also initialize send buffers here ... */
    for (size_t i = 0; i < innerGrid.ny(); i++) {
        leftSendBuffer[i] = 0.0;
        rightSendBuffer[i] = 0.0;
    }
 
    for (size_t i = 0; i < innerGrid.nx(); i++) {
        topSendBuffer[i] = 0.0;
        bottomSendBuffer[i] = 0.0;
    }

4. Boundary Condition Setup

    /* In the parallel version the following flags are derived from the process coordinates.
       The name "cell" is used as a synonym for process. A boundary cell has no neighbour on
       that side, so no communication is needed there; the fixed boundary condition applies instead. */
    bool isLeftBoundaryCell   = (myXRank == 0);
    bool isRightBoundaryCell  = (myXRank == numPartsX - 1);
    bool isTopBoundaryCell    = (myYRank == 0);
    bool isBottomBoundaryCell = (myYRank == numPartsY - 1);
 
    /* set boundary conditions ... */
    if (isLeftBoundaryCell)
        for (size_t i = 0; i < innerGrid.ny(); i++)
            leftReceiveBuffer[i] = parameter.bcLeft;
 
    if (isRightBoundaryCell)
        for (size_t i = 0; i < innerGrid.ny(); i++)
            rightReceiveBuffer[i] = parameter.bcRight;
 
    if (isTopBoundaryCell)
        for (size_t i = 0; i < innerGrid.nx(); i++)
            topReceiveBuffer[i] = parameter.bcTop;
 
    if (isBottomBoundaryCell)
        for (size_t i = 0; i < innerGrid.nx(); i++)
            bottomReceiveBuffer[i] = parameter.bcBottom;
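    /* Note on the design: a boundary process never posts a receive on its boundary side,
       so the corresponding receive buffer keeps these fixed Dirichlet values for the whole run.
       The unconditional "insert ghost layers" step inside the iteration loop copies them into
       the ghost layer every iteration, so boundary conditions and received halo data are
       handled by exactly the same code path. */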

5. Iteration and Computation

    int iteration = 0;
    timer.startNupsTimer();
    bool done = false;
 
    while (!done)
    {
        double error = 0.0;
        double diff;
 
        /* In the parallel version: do the send and receive here. Prefer doing it in the background
           (nonblocking / asynchronous). Watch out for deadlocks! */
        MPI_Request requests[8];   // at most 8 requests: 4 MPI_Isend + 4 MPI_Irecv (left, right, top, bottom)
        int reqCount = 0;          // number of requests actually posted
        // Tag convention: tag 0 = data moving left, tag 1 = right, tag 2 = up, tag 3 = down.
        // Each receive therefore uses the tag of the message travelling towards this process.
        if (!isLeftBoundaryCell) { // communicate only if a neighbouring process ("cell") exists, i.e. not at the boundary
            for (size_t i = 0; i < innerGrid.ny(); i++) {
                leftSendBuffer[i] = oldData[entireGrid.pos(1, i + 1)]; // x = 1 is the first inner column (x = 0 is the ghost layer)
            }
            // myXRank - 1 addresses the process to the left of the current one
            MPI_Isend(leftSendBuffer, innerGrid.ny(), MPI_DOUBLE, processTopology.pos(myXRank - 1, myYRank), 0, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(leftReceiveBuffer, innerGrid.ny(), MPI_DOUBLE, processTopology.pos(myXRank - 1, myYRank), 1, MPI_COMM_WORLD, &requests[reqCount++]);
        }
        if (!isRightBoundaryCell) {
            for (size_t i = 0; i < innerGrid.ny(); i++) {
                rightSendBuffer[i] = oldData[entireGrid.pos(innerGrid.nx(), i + 1)];
            }
            MPI_Isend(rightSendBuffer, innerGrid.ny(), MPI_DOUBLE, processTopology.pos(myXRank + 1, myYRank), 1, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(rightReceiveBuffer, innerGrid.ny(), MPI_DOUBLE, processTopology.pos(myXRank + 1, myYRank), 0, MPI_COMM_WORLD, &requests[reqCount++]); // right neighbour sends its left edge with tag 0
        }
 
        // Exchange data with the top and bottom neighbours
        if (!isTopBoundaryCell) {
            for (size_t i = 0; i < innerGrid.nx(); i++) {
                topSendBuffer[i] = oldData[entireGrid.pos(i + 1, 1)];
            }
            MPI_Isend(topSendBuffer, innerGrid.nx(), MPI_DOUBLE, processTopology.pos(myXRank, myYRank - 1), 2, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(topReceiveBuffer, innerGrid.nx(), MPI_DOUBLE, processTopology.pos(myXRank, myYRank - 1), 3, MPI_COMM_WORLD, &requests[reqCount++]); // top neighbour sends its bottom edge with tag 3
        }
        if (!isBottomBoundaryCell) {
            for (size_t i = 0; i < innerGrid.nx(); i++) {
                bottomSendBuffer[i] = oldData[entireGrid.pos(i + 1, innerGrid.ny())];
            }
            MPI_Isend(bottomSendBuffer, innerGrid.nx(), MPI_DOUBLE, processTopology.pos(myXRank, myYRank + 1), 3, MPI_COMM_WORLD, &requests[reqCount++]);
            MPI_Irecv(bottomReceiveBuffer, innerGrid.nx(), MPI_DOUBLE, processTopology.pos(myXRank, myYRank + 1), 2, MPI_COMM_WORLD, &requests[reqCount++]); // bottom neighbour sends its top edge with tag 2
        }
 
        MPI_Waitall(reqCount, requests, MPI_STATUSES_IGNORE); // wait until all posted sends and receives have completed
 
  • With 4 processes arranged in a 2x2 grid (coordinates written as (xRank, yRank), matching the code), the communication pattern is as follows:
    • Process (0,0): communicates with (1,0) (right) and (0,1) (bottom).
    • Process (1,0): communicates with (0,0) (left) and (1,1) (bottom).
    • Process (0,1): communicates with (1,1) (right) and (0,0) (top).
    • Process (1,1): communicates with (0,1) (left) and (1,0) (top).
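  • Using nonblocking MPI_Isend/MPI_Irecv followed by a single MPI_Waitall avoids the classic
    deadlock that can occur if every process first does a blocking send and then a blocking
    receive in the same order: all posted transfers are in flight before any process waits.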
 
 
 
        // Update ghost layers after receiving data
        if (!isTopBoundaryCell) {
            for (size_t x = 1; x < entireGrid.nx() - 1; x++) { // x runs over the inner columns 1 .. entireGrid.nx() - 2
                oldData[entireGrid.pos(x, 0)] = topReceiveBuffer[x - 1];
            }
        }
        if (!isBottomBoundaryCell) {
            for (size_t x = 1; x < entireGrid.nx() - 1; x++) {
                oldData[entireGrid.pos(x, entireGrid.ny() - 1)] = bottomReceiveBuffer[x - 1];
            }
        }
        if (!isLeftBoundaryCell) {
            for (size_t y = 1; y < entireGrid.ny() - 1; y++) {
                oldData[entireGrid.pos(0, y)] = leftReceiveBuffer[y - 1];
            }
        }
        if (!isRightBoundaryCell) {
            for (size_t y = 1; y < entireGrid.ny() - 1; y++) {
                oldData[entireGrid.pos(entireGrid.nx() - 1, y)] = rightReceiveBuffer[y - 1];
            }
        }
 
        /* first do the calculations without the ghost layers */
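        /* The update below is one Jacobi sweep for the Laplace equation: each inner cell is
           replaced by the average of its four neighbours,
               u_new(x,y) = 0.25 * (u(x-1,y) + u(x+1,y) + u(x,y-1) + u(x,y+1)),
           and the squared change per cell is accumulated into "error" as the convergence measure. */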
        for (size_t y = 2; y < entireGrid.ny() - 2; y++)
            for (size_t x = 2; x < entireGrid.nx() - 2; x++)
            {
                newData[entireGrid.pos(x, y)] = 0.25 * (oldData[entireGrid.pos(x - 1, y)] +
                                                        oldData[entireGrid.pos(x + 1, y)] +
                                                        oldData[entireGrid.pos(x, y - 1)] +
                                                        oldData[entireGrid.pos(x, y + 1)]);
                diff = newData[entireGrid.pos(x, y)] - oldData[entireGrid.pos(x, y)];
                error = error + diff * diff;
            }
 
        /* at this point the ghost-layer data has been received (see MPI_Waitall above);
           on boundary sides the receive buffers still hold the fixed boundary values */

        /* insert ghost layers */
 
        for (size_t x = 1; x < entireGrid.nx() - 1; x++)
        {
            oldData[entireGrid.pos(x, 0)] = topReceiveBuffer[x - 1];
            oldData[entireGrid.pos(x, entireGrid.ny() - 1)] = bottomReceiveBuffer[x - 1];
        }
 
        for (size_t y = 1; y < entireGrid.ny() - 1; y++)
        {
            oldData[entireGrid.pos(0, y)] = leftReceiveBuffer[y - 1];
            oldData[entireGrid.pos(entireGrid.nx() - 1, y)] = rightReceiveBuffer[y - 1];
        }
 
        /* Now update the remaining edge cells, which need the ghost-layer values. */
        for (size_t x = 1; x < entireGrid.nx() - 1; x++)
        {
            // top
            newData[entireGrid.pos(x, 1)] = 0.25 * (oldData[entireGrid.pos(x - 1, 1)] +
                                                    oldData[entireGrid.pos(x + 1, 1)] +
                                                    oldData[entireGrid.pos(x, 0)] +
                                                    oldData[entireGrid.pos(x, 2)]);
            diff = newData[entireGrid.pos(x, 1)] - oldData[entireGrid.pos(x, 1)];
            error = error + diff * diff;
            topSendBuffer[x-1] = newData[entireGrid.pos(x,1)];
 
            // bottom
            newData[entireGrid.pos(x, entireGrid.ny() - 2)] = 0.25 * (oldData[entireGrid.pos(x - 1, entireGrid.ny() - 2)] +
                                                                         oldData[entireGrid.pos(x + 1, entireGrid.ny() - 2)] +
                                                                         oldData[entireGrid.pos(x, entireGrid.ny() - 3)] +
                                                                         oldData[entireGrid.pos(x, entireGrid.ny() - 1)]);
            diff = newData[entireGrid.pos(x, entireGrid.ny() - 2)] - oldData[entireGrid.pos(x, entireGrid.ny() - 2)];
            error = error + diff * diff;
            bottomSendBuffer[x-1] = newData[entireGrid.pos(x,entireGrid.ny()-2)];
        }
 
        /* Store the corner values in the left/right send buffers (the loop below covers only y = 2 .. ny-3) */
        leftSendBuffer[0] = newData[entireGrid.pos(1,1)];
        leftSendBuffer[entireGrid.ny()-3] = newData[entireGrid.pos(1,entireGrid.ny()-2)];
        rightSendBuffer[0] = newData[entireGrid.pos(entireGrid.nx()-2,1)];
        rightSendBuffer[entireGrid.ny()-3] = newData[entireGrid.pos(entireGrid.nx()-2,entireGrid.ny()-2)];
        for (size_t y = 2; y < entireGrid.ny() - 2; y++)
        {
            newData[entireGrid.pos(1, y)] = 0.25 * (oldData[entireGrid.pos(1, y - 1)] +
                                                    oldData[entireGrid.pos(1, y + 1)] +
                                                    oldData[entireGrid.pos(0, y)] +
                                                    oldData[entireGrid.pos(2, y)]);
            diff = newData[entireGrid.pos(1, y)] - oldData[entireGrid.pos(1, y)];
            error = error + diff * diff;
            leftSendBuffer[y-1] = newData[entireGrid.pos(1,y)];
 
            newData[entireGrid.pos(entireGrid.nx() - 2, y)] = 0.25 * (oldData[entireGrid.pos(entireGrid.nx() - 3, y)] +
                                                                         oldData[entireGrid.pos(entireGrid.nx() - 1, y)] +
                                                                         oldData[entireGrid.pos(entireGrid.nx() - 2, y - 1)] +
                                                                         oldData[entireGrid.pos(entireGrid.nx() - 2, y + 1)]);
            diff = newData[entireGrid.pos(entireGrid.nx() - 2, y)] - oldData[entireGrid.pos(entireGrid.nx() - 2, y)];
            error = error + diff * diff;
            rightSendBuffer[y-1] = newData[entireGrid.pos(entireGrid.nx()-2,y)];
        }
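        /* The edge rows/columns are handled separately because they need the ghost-layer values.
           While they are computed, the freshly updated edge values are written straight into the
           send buffers; the non-boundary branches at the top of the loop refill the buffers from
           oldData anyway, which after the swap holds the same values. */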
 
        std::swap(oldData, newData);
 
        /* Stop in case of little change. In the parallel version the stopping decision has to be
           global: combine the local errors so that all processes leave the loop in the same
           iteration (otherwise some would wait forever for halo data in the next exchange). */
        double globalError = 0.0;
        MPI_Allreduce(&error, &globalError, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        done = (globalError < 1.0e-4);
 
        iteration++;
        if ((iteration % parameter.outputInterval) == 0)
        {
            const auto mnups = timer.getMNups(innerGrid.size() * parameter.outputInterval);
            std::cout << "time step: " << iteration << " error: " << error << " MNUPS: " << mnups << "\n";
 
            timer.startNupsTimer();
        }
    }

6. Output Results and Finalization

    /* Output (only process 0). In the parallel case process 0 collects the necessary data for the output from the other processes. */
 
    if (myRank == 0)
    {
        double *resultData = new double[realGridNx * realGridNy];
        Mapper2D globalGrid = Mapper2D(realGridNx, realGridNy);
        for (size_t x = 1; x < entireGrid.nx() - 1; x++)
            for (size_t y = 1; y < entireGrid.ny() - 1; y++)
                resultData[globalGrid.pos(x - 1, y - 1)] = oldData[entireGrid.pos(x, y)];
 
        for (int partX = 0; partX < numPartsX; partX++)
            for (int partY = 0; partY < numPartsY; partY++)
                if (partX || partY) // skip process 0's own partition, i.e. !(partX == 0 && partY == 0)
                {
                    std::cout << "Partition X = " << partX << ", Partition Y = " << partY << std::endl;
                    for (size_t y = 0; y < entireGrid.ny() - 2; y++) // line by line
                        MPI_Recv(resultData + globalGrid.pos(partX * localNx, partY * localNy + y), entireGrid.nx() - 2, MPI_DOUBLE, processTopology.pos(partX, partY), 0, MPI_COMM_WORLD, &status);
                }
        writeUCDFile(parameter.outputFileName, resultData, globalGrid);
        delete[] resultData;
    }
    else
    {
        for (size_t y = 1; y < entireGrid.ny() - 1; y++) // line by line
            MPI_Ssend(oldData.data() + entireGrid.pos(1, y), entireGrid.nx() - 2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
    }
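    /* Gather protocol: every non-root process sends its localNx * localNy inner cells to process 0
       row by row (ghost layers stripped), and process 0 places each row at the matching position in
       the global result grid. Because MPI_Recv names an explicit source rank and the rows are sent
       and received in the same y-order, the rows cannot get mixed up even though they all use tag 0. */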
 
    const auto runtime = timer.getRuntimeSeconds();
    std::cout << "Runtime: " << runtime << " s. " << std::endl;
    std::cout << "Average MNUPS:" << timer.getAverageMNups(innerGrid.size() * iteration) << std::endl;
 
 
    delete[] leftReceiveBuffer;
    delete[] rightReceiveBuffer;
    delete[] topReceiveBuffer;
    delete[] bottomReceiveBuffer;
    delete[] leftSendBuffer;
    delete[] rightSendBuffer;
    delete[] topSendBuffer;
    delete[] bottomSendBuffer;
   
    MPI_Finalize();
 
    return 0;
}