#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Allocate a grid of (rows+2) row pointers, each to (cols+2) ints. */
int** allocate(int rows, int cols);
/* Write random 0/1 values into cells [1..rows][1..cols] and zeros into the border. */
void fill_world(int** mat, int rows, int cols);
/* Return the sum of cells [1..rows][1..cols]. */
int alive(int** world, int rows, int cols);
/* Copy cells [1..rows][1..cols] of world into oworld. */
void save(int** world, int** oworld, int rows, int cols);
/* Update world in place, taking neighbour counts from oworld. */
void next(int** world, int** oworld, int rows, int cols);
/* Release a grid obtained from allocate(). */
void destroy(int** mat, int rows, int cols);

/*
 * Parse a decimal command-line argument into *out.
 *
 * Accepts only a complete, non-negative number no larger than INT_MAX - 2,
 * so that the caller can safely form value + 2 and iterate up to value + 1
 * in an int. Returns 1 on success, 0 (leaving *out untouched) otherwise.
 */
static int parse_count(const char* text, int* out)
{
    char* end = NULL;
    long value = strtol(text, &end, 10);

    /* No digits consumed, or trailing characters after the number. */
    if (end == text || *end != '\0')
        return 0;
    /* Out-of-range input saturates to LONG_MIN/LONG_MAX and is rejected here too. */
    if (value < 0 || value > INT_MAX - 2)
        return 0;

    *out = (int) value;
    return 1;
}

/*
 * Usage: <program> rows cols generations
 *
 * Builds a randomly filled world, then for each generation saves the current
 * state, advances it, and prints the number of cells alive.
 * Returns 0 on success and 1 on bad arguments or allocation failure.
 */
int main(int argc, char** argv)
{
    int rows, cols, generations;
    int** world;
    int** oworld;

    if (argc < 4)
    {
        printf("Wrong number of arguments: Please give rows cols and generations\n");
        return 1;
    }

    if (!parse_count(argv[1], &rows) ||
        !parse_count(argv[2], &cols) ||
        !parse_count(argv[3], &generations))
    {
        fprintf(stderr, "Invalid argument: rows, cols and generations must be non-negative integers\n");
        return 1;
    }

    world = allocate(rows, cols);
    oworld = allocate(rows, cols);
    /* Guard against a NULL grid so an allocation failure is reported instead of dereferenced. */
    if (world == NULL || oworld == NULL)
    {
        fprintf(stderr, "Could not allocate a %d x %d world\n", rows, cols);
        if (world != NULL)
            destroy(world, rows, cols);
        if (oworld != NULL)
            destroy(oworld, rows, cols);
        return 1;
    }

    fill_world(world, rows, cols);
    /* The world was filled on the host; push it to the device before any kernel reads it. */
#pragma acc update device(world[0:rows+2][0:cols+2])
    printf("Cells alive at generation %d: %d\n", 0, alive(world, rows, cols));
    for (int g=1; g <= generations; ++g)
    {
        /* next() reads the previous state from oworld, so snapshot it first. */
        save(world, oworld, rows, cols);
        next(world, oworld, rows, cols);

        printf("Cells alive at generation %d: %d\n", g, alive(world, rows, cols));
    }

    destroy(world, rows, cols);
    destroy(oworld, rows, cols);

    return 0;
}

/*
 * Advance the world by one generation.
 *
 * world  - grid updated in place; only cells [1..rows][1..cols] are written.
 * oworld - grid the neighbour counts are read from; cells [0..rows+1][0..cols+1]
 *          are read, so the border must hold valid values.
 *
 * A cell that is 1 in oworld with fewer than 2 or more than 3 neighbours is
 * set to 0; any cell with exactly 3 neighbours is set to 1. Every other cell
 * of world is left as it is, so world must already hold the same state as
 * oworld for the result to be a complete generation.
 */
void next(int** restrict world, int** restrict  oworld, int rows, int cols)
{
#pragma acc parallel loop collapse(2)
    for (int r=1; r<=rows; ++r)
        for (int c=1; c<=cols; ++c)
        {
            /* Declared inside the loop body so every iteration owns its own
             * copy instead of relying on the compiler to privatise a scalar
             * that lives outside the parallel region. */
            const int neigh =
                    oworld[r-1][c-1] + oworld[r][c-1]+ oworld[r+1][c-1] +
                    oworld[r-1][c]   + oworld[r+1][c] +
                    oworld[r-1][c+1] + oworld[r][c+1]+ oworld[r+1][c+1];
            if (oworld[r][c] == 1 && (neigh<2||neigh>3))
                world[r][c] = 0;
            else if (neigh==3)
                world[r][c] = 1;
        }
}

/*
 * Snapshot the interior of one grid into another.
 *
 * Copies cells [1..rows][1..cols] from world to oworld. Row 0, row rows+1,
 * column 0 and column cols+1 of oworld are not written.
 */
void save(int** restrict world, int** restrict oworld, int rows, int cols)
{
#pragma acc parallel loop collapse(2)
    for (int row = 1; row <= rows; ++row)
    {
        for (int col = 1; col <= cols; ++col)
        {
            oworld[row][col] = world[row][col];
        }
    }
}

/*
 * Sum the interior cells [1..rows][1..cols] of world and return the total.
 * With cells holding only 0 or 1 this is the number of live cells.
 * Border cells are not included.
 */
int alive(int** restrict world, int rows, int cols)
{
    int population = 0;

#pragma acc parallel loop collapse(2) reduction(+:population)
    for (int row = 1; row <= rows; ++row)
    {
        for (int col = 1; col <= cols; ++col)
        {
            population += world[row][col];
        }
    }

    return population;
}

/*
 * Initialise a (rows+2) x (cols+2) grid.
 *
 * Interior cells [1..rows][1..cols] receive a random 0 or 1 drawn from
 * rand(); every border cell (row 0, row rows+1, column 0, column cols+1,
 * corners included) is set to 0.
 */
void fill_world(int** restrict mat, int rows, int cols)
{
    for (int r=1; r <= rows; ++r)
        for (int c=1; c <= cols; ++c)
            mat[r][c] = (rand()%4)%2;

    /* Left and right border columns: one cell per row, bounded by rows. */
    for (int r=0; r <= rows+1; ++r)
    {
        mat[r][0] = 0;
        mat[r][cols+1] = 0;
    }
    /* Top and bottom border rows: one cell per column, bounded by cols.
     * Kept separate from the loop above so non-square grids neither skip
     * border cells nor index past the end of a row. */
    for (int c=0; c <= cols+1; ++c)
    {
        mat[0][c] = 0;
        mat[rows+1][c] = 0;
    }
}

/*
 * Allocate a zero-filled grid of (rows+2) rows by (cols+2) columns, i.e. the
 * requested interior plus a one-cell border on every side, and mirror it on
 * the accelerator.
 *
 * Returns the array of row pointers, or NULL if rows or cols is negative,
 * too large for rows+2 / cols+2 to fit in an int, or memory is exhausted.
 * The caller owns the result and releases it with destroy().
 */
int** allocate(int rows, int cols)
{
    if (rows < 0 || cols < 0 || rows > INT_MAX - 2 || cols > INT_MAX - 2)
        return NULL;

    const size_t nrows = (size_t) rows + 2;
    const size_t ncols = (size_t) cols + 2;

    int** mat = calloc(nrows, sizeof *mat);
    if (mat == NULL)
        return NULL;

    for (size_t i = 0; i < nrows; ++i)
    {
        mat[i] = calloc(ncols, sizeof **mat);
        if (mat[i] == NULL)
        {
            /* Unwind the rows obtained so far so nothing leaks. */
            while (i > 0)
                free(mat[--i]);
            free(mat);
            return NULL;
        }
    }
    /* copyin rather than create: the device copy then starts with the same
     * zeros as the host copy, so border cells that are never written by a
     * kernel still hold a defined value. */
#pragma acc enter data copyin(mat[0:rows+2][0:cols+2])
    return mat;
}

/*
 * Release a grid of (rows+2) row pointers: drop the accelerator copy, free
 * every row buffer, then free the row-pointer array itself.
 * rows and cols must be the values the grid was allocated with.
 * A NULL mat is ignored.
 */
void destroy(int** mat, int rows, int cols)
{
    if (mat == NULL)
        return;
#pragma acc exit data delete(mat[0:rows+2][0:cols+2])
    /* The grid has rows+2 row buffers (indices 0..rows+1); free all of them. */
    for (int i=0; i < rows+2; ++i)
        free(mat[i]);
    free(mat);
}
