//
//  TurboGenC
//
//  The MIT License (MIT)
//
//  Copyright (c) 2015, Tony Saad
//
//  Permission is hereby granted, free of charge, to any person obtaining a copy
//  of this software and associated documentation files (the "Software"), to deal
//  in the Software without restriction, including without limitation the rights
//  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
//  copies of the Software, and to permit persons to whom the Software is
//  furnished to do so, subject to the following conditions:
//
//  The above copyright notice and this permission notice shall be included in
//  all copies or substantial portions of the Software.
//
//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
//  THE SOFTWARE.
//

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <math.h>
#include <time.h>
#include <ctype.h>
#include <unistd.h>

/*
 *
 */
void help()
{
  printf("Usage: TurboGen -n number_of_points -m number_of_modes -l domain_length -o \n");
}

/*
 *
 */
double karman_spec(const double k)
{
  // INPUTS
  const double ke_ = 40.0;   // ke - peak wave number
  const double nu = 1.0e-5;
  const double urms = 0.25;
  
  // computed from inputs to satisfy isotropic turbulence properties
  const double ke = sqrt(5.0/12)*ke_;
  const double L = 0.746834/ke; //integral length scale - sqrt(Pi)*Gamma(5/6)/Gamma(1/3)*1/ke
  const double alpha = 1.452762113; // 55/(9 Sqrt[Pi]) Gamma[5/6]/Gamma[1/3]
  const double epsilone = urms*urms*urms/L;
  const double keta = pow(epsilone, 0.25)*pow(nu,-3.0/4.0);
  const double r1 = k/ke;
  const double r2 = k/keta;
  const double tke = alpha*(urms*urms/ke)*(r1*r1*r1*r1 / pow(1.0 + r1*r1, 17.0/6.0))*exp(-2.0*r2*r2);
  return tke;
}

void generate_turbulence(const int myid,
                             const double km0,
                             const double kmmax,
                             const int nModes,
                             double (*whichspec)(double),
                             const double lx,
                             const double ly,
                             const double lz,
                             const double xlo,
                             const double ylo,
                             const double zlo,
                             const int npx, // patch nx
                             const int npy, // patch ny
                             const int npz, // patch nz
                             const int nx,
                             const int ny,
                             const int nz,
                             double** u,
                             double** v,
                             double** w)
{
  // find dx and half dx (hdx) et al.
  const double dx = lx/nx;
  const double hdx = dx/2.0;
  const double dy = ly/ny;
  const double hdy = dy/2.0;
  const double dz = lz/nz;
  const double hdz = dz/2.0;
  
  //______________________________________________________________________
  // compute wave arrays using standard random number generation
  double* phi   = (double*) malloc(sizeof(double)*(nModes+1));
  double* nu    = (double*) malloc(sizeof(double)*(nModes+1));
  double* theta = (double*) malloc(sizeof(double)*(nModes+1));
  double* psi   = (double*) malloc(sizeof(double)*(nModes+1));
  
  
  //time_t t;
  //srand((unsigned) time(&t));
  srand(0);
  for (int i=0; i<=nModes; ++i) {
    phi[i]   = 2.0*M_PI* (double) rand()/RAND_MAX;
    nu[i]    = (double) rand()/RAND_MAX;
    theta[i] = acos(2.0*nu[i] -1.0);
    psi[i]   = M_PI* (double) rand()/RAND_MAX - M_PI/2.0;
  }
  
  // maximum wave number supported on this grid
  //const double kmmax = M_PI/dx;
  if (myid == 0 ) printf ("I will generate data up to wave number: %f\n", kmmax);
  
  // find spacing in wave-space
  const double dk = (kmmax-km0)/nModes;
  // create an array of wave numbers
  double* km = malloc(sizeof(double)*(nModes+1));
  for (int i=0; i<=nModes; ++i) {
    km[i] = km0 + i*dk;
  }
  
  // create wave vector (kx, ky, kz)
  double* kx = (double*) malloc(sizeof(double)*(nModes+1));
  double* ky = (double*) malloc(sizeof(double)*(nModes+1));
  double* kz = (double*) malloc(sizeof(double)*(nModes+1));
  for (int i=0; i<=nModes; ++i) {
    kx[i] = sin(theta[i])*cos(phi[i])*km[i];
    ky[i] = sin(theta[i])*sin(phi[i])*km[i];
    kz[i] = cos(theta[i])*km[i];
  }
  
  //________________________________________________________
  // ENFORCE MASS CONSERVATION. compute k_tilde vector (see Saad et al. Manuscript)
  double* ktx = (double*) malloc(sizeof(double)*(nModes+1));
  double* kty = (double*) malloc(sizeof(double)*(nModes+1));
  double* ktz = (double*) malloc(sizeof(double)*(nModes+1));
  
  for (int i=0; i<=nModes; ++i) {
    ktx[i] = sin(kx[i]*hdx)/dx;
    kty[i] = sin(ky[i]*hdy)/dy;
    ktz[i] = sin(kz[i]*hdz)/dz;
  }
  
  // find the direction vector sigma
  double* sxm = (double*) malloc(sizeof(double)*(nModes+1));
  double* sym = (double*) malloc(sizeof(double)*(nModes+1));
  double* szm = (double*) malloc(sizeof(double)*(nModes+1));
  
  // CREATE VECTOR ZETA AND MAKE SIGMA = ZETA x K_TILDE
  for (int i=0; i<=nModes; ++i) {
    phi[i] = 2.0*M_PI* (double) rand()/RAND_MAX;
    nu[i]    = (double) rand()/RAND_MAX;
    theta[i] = acos(2.0*nu[i] -1.0);
  }
  for (int i=0; i<=nModes; ++i) {
    double zetax = sin(theta[i])*cos(phi[i]);
    double zetay = sin(theta[i])*sin(phi[i]);
    double zetaz = cos(theta[i]);
    // take cross product of zeta with k_tilde
    sxm[i] =  (zetay*ktz[i] - zetaz*kty[i]);
    sym[i] = -(zetax*ktz[i] - zetaz*ktx[i]);
    szm[i] =  (zetax*kty[i] - zetay*ktx[i]);
    // now make sigma a unit vector
    double smag = sqrt(sxm[i]*sxm[i] + sym[i]*sym[i] + szm[i]*szm[i]);
    sxm[i] = sxm[i]/smag;
    sym[i] = sym[i]/smag;
    szm[i] = szm[i]/smag;
  }
  
  //________________________________________________________
  // update the sigma vector with the amplitude (reduces amount of computation)
  double espec;
  for (int i=0; i<=nModes; ++i) {
    espec = sqrt( whichspec(km[i]) * dk);
    sxm[i] *= 2.0*espec;
    sym[i] *= 2.0*espec;
    szm[i] *= 2.0*espec;
  }
  
  //___________________________________________________________________________________________
  // cell centered coordinates
  double* xc = (double*) malloc(sizeof(double)*npx);
  double* yc = (double*) malloc(sizeof(double)*npy);
  double* zc = (double*) malloc(sizeof(double)*npz);
  
  for (int i=0; i<npx; ++i) {
    xc[i] = xlo + i*dx;
  }

  for (int j=0; j<npy; ++j) {
    yc[j] = ylo + j*dy;
  }

  for (int k=0; k<npz; ++k) {
    zc[k] = zlo + k*dz;
  }
  
  //___________________________________________________________________________________________
  // compute the Fourier series at each point!
  for (int k=0; k<npz; ++k) {
    const int knpxnpy = k*npx*npy;
    for (int j=0; j<npy; ++j) {
      const int jnpx = j*npx;
      for (int i=0; i<npx; ++i) {
        const int flat = i + jnpx + knpxnpy;
        int m = 0;
        double arg = kx[m]*xc[i] + ky[m]*yc[j] + kz[m]*zc[k] + psi[m];
        (*u)[flat] = cos(arg - kx[m]*hdx)*sxm[m];
        (*v)[flat] = cos(arg - ky[m]*hdy)*sym[m];
        (*w)[flat] = cos(arg - kz[m]*hdz)*szm[m];
        for (int m=1; m<=nModes; ++m) {
          arg = kx[m]*xc[i] + ky[m]*yc[j] + kz[m]*zc[k] - psi[m];
          (*u)[flat] += cos(arg - kx[m]*hdx)*sxm[m];
          (*v)[flat] += cos(arg - ky[m]*hdy)*sym[m];
          (*w)[flat] += cos(arg - kz[m]*hdz)*szm[m];
        }
      }
    }
  }

#ifdef DEBUG
  // check for divergence
  if (myid==0) {
    int count = 0;
    double src = 0.0;
    for (int k = 0; k < npz-1; ++k) {
      for (int j = 0; j < npy-1; ++j) {
        for (int i = 0; i < npx-1; ++i) {
          const int flat = i + j*npx + k*npx*npy;
          const int ip1 = flat + 1;
          const int jp1 = flat + npx;
          const int kp1 = flat + npx*npy;
          src = ( (*u)[ip1] - (*u)[flat])/dx + ( (*v)[jp1] - (*v)[flat])/dy + ( (*w)[kp1] - (*w)[flat])/dz;
          if (src > 1.0e-10) {
            count++;
          }
        }
      }
    }
    printf("cells with divergence = %i\n", count);
  }
#endif // DEBUG

  // cleanup
  free(phi);
  free(nu);
  free(theta);
  free(psi);
  
  free(km);
  free(kx);
  free(ky);
  free(kz);
  
  free(ktx);
  free(kty);
  free(ktz);
  
  free(sxm);
  free(sym);
  free(szm);
  
  free(xc);
  free(yc);
  free(zc);
  
}

int main(int argc, char ** argv) {
  
  extern char *optarg;
  int myid = 0, numprocs = 1;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  if (myid == 0) printf("initializing MPI \n");

  //__________________________________
  // Parse arguments
  int N = 32;
  int nModes = 100;
  double L = 9.0*2.0*M_PI/100;
  bool enableIO = false;
  for (int i=0; i<argc; ++i) {
    if (argv[i][1] == 'n') {
      N = atoi(argv[i+1]);
    }  else if (argv[i][1] == 'm') {
      nModes = atoi(argv[i+1]);
    }  else if (argv[i][1] == 'l') {
      L = atoi(argv[i+1]);
    } else if (argv[i][1] == 'o') {
      enableIO = true;
    } else if (argv[i][1] == 'h') {
      help();
      return 0;
    }
  }
  
  if (numprocs > N) {
    if(myid ==0 )
      printf("Error: In this version of the turbulence generator, you cannot use more processors than points.\n");
    MPI_Finalize();
    return 0;
  }


  // timer
  if (myid == 0) printf("starting clock...\n");
  clock_t begin, end;
  double time_spent;
  begin = clock();

  // specify which spectrum to use.
  double (*which_spec)(const double);
  which_spec = &karman_spec;

  // lowest wave number
  const double km0 = 2.0*M_PI/L;
    printf("km0 = %f\n", km0);
  // specify domain size
  const double lx = L;
  const double ly = L;
  const double lz = L;

  // domain decomposition in the z direction only :)
  const int px = 1;
  const int py = 1;
  const int pz = numprocs;

  // get patch IJK indexes. I know that we're decomposing in the z direction only, but keep
  // this for future updates if we change our minds
  int patchIJK[3];
  patchIJK[0] =  myid % px;
  patchIJK[1] = (myid/px) % py;
  patchIJK[2] =  myid/(px*py);

#ifdef DEBUG
  printf("my id = %i. my patch index is %i, %i, %i\n", myid, patchIJK[0], patchIJK[1],patchIJK[2]);
#endif

  // set the x, y, and z resolutions. Currently support same res in all directions
  const int nx = N;
  const int ny = N;
  const int nz = N;
  
  // get the number of grid points on the current processor
  const int npx = nx/px;
  const int npy = ny/py;
  int       npz = nz/pz;
  const int remaining = N - npz*numprocs;
  if (myid < remaining) npz +=1;
#ifdef  DEBUG
  printf("id = %i, npz = %i \n", myid, npz);
#endif
  // find dx and half dx (hdx) et al.
  const double dx = lx/nx;
  const double hdx = dx/2.0;
  const double dy = ly/ny;
  const double hdy = dy/2.0;
  const double dz = lz/nz;
  const double hdz = dz/2.0;
  // maximum wave number supported on this grid
  const double kmmax = M_PI/(dx);
  const double xlo = hdx + npx*patchIJK[0]*dx;
  const double ylo = hdy + npy*patchIJK[1]*dy;
  const double zlo = hdz + npz*patchIJK[2]*dz;
#ifdef DEBUG
  printf("xlo [%i, %i, %i] = %f \n", patchIJK[0], patchIJK[1], patchIJK[2], xlo);
  printf("ylo [%i, %i, %i] = %f \n", patchIJK[0], patchIJK[1], patchIJK[2], ylo);
  printf("zlo [%i, %i, %i] = %f \n", patchIJK[0], patchIJK[1], patchIJK[2], zlo);
#endif
//
//  //___________________________________________________________________________________________
//  // compute the Fourier series at each point!
  const double nt = npx*npy*npz; // total number of points on this processor
  double* u = (double*) malloc(sizeof(double)*nt);
  double* v = (double*) malloc(sizeof(double)*nt);
  double* w = (double*) malloc(sizeof(double)*nt);

  generate_turbulence(myid, km0, kmmax, nModes, which_spec , lx, ly, lz, xlo, ylo, zlo, npx, npy, npz, nx, ny, nz, &u, &v, &w);
  // after Fourier series is computed, get the average time spend on computing
  end = clock();
  time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
  double totaltime;
  MPI_Reduce(&time_spent, &totaltime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  if (myid == 0) {
    printf("Done generating Turbulence in %f s \n", totaltime/numprocs);
  }
  
//  MPI_Barrier(MPI_COMM_WORLD);

  
  // parallel write to file
  begin = clock();
  MPI_Offset offset;
  MPI_File ufile, vfile, wfile;
  MPI_Status status;
  const int msgsize = nt*sizeof(double);
  offset = msgsize*myid;
  char fname[50];
  
  sprintf(fname, "u_n%i_m%i.dat",N,nModes);
  MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &ufile);
  
  sprintf(fname, "v_n%i_m%i.dat",N,nModes);
  MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &vfile);

  sprintf(fname, "w_n%i_m%i.dat",N,nModes);
  MPI_File_open(MPI_COMM_WORLD, fname, MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &wfile);
  
  MPI_File_seek(ufile, offset, MPI_SEEK_SET);
  MPI_File_seek(vfile, offset, MPI_SEEK_SET);
  MPI_File_seek(wfile, offset, MPI_SEEK_SET);
  
  MPI_File_write(ufile, u, nt, MPI_DOUBLE, &status);
  MPI_File_write(vfile, v, nt, MPI_DOUBLE, &status);
  MPI_File_write(wfile, w, nt, MPI_DOUBLE, &status);
  
  MPI_File_close(&ufile);
  MPI_File_close(&vfile);
  MPI_File_close(&wfile);
  end = clock();
  time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
  totaltime = 0.0;
  MPI_Reduce(&time_spent, &totaltime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  if (myid == 0) {
    printf("Done writing to disk in %f s \n", totaltime/numprocs);
  }

//  for (int i =0 ; i<nx*ny*nz; ++i) {
//    printf("u = %f \n", u[i]);
//  }

//  // Now collect all u arrays from the different processors and lump them into giant u, v, w arrays
//  begin = clock();
//  double* uall=NULL;
//  if(myid ==0) uall = (double*)malloc(sizeof(double)*nx*ny*nz);
//  int *counts = (int*)malloc(sizeof(int)*numprocs);
//  MPI_Gather(&npz, 1, MPI_INT, counts, 1, MPI_INT, 0, MPI_COMM_WORLD);
//  // Displacements in the receive buffer for MPI_GATHERV
//  int *disps = (int*)malloc(sizeof(int)*numprocs);
//  // Displacement for the first chunk of data - 0
//  for (int i = 0; i < numprocs; i++) {
//    counts[i] *= nx*ny;
//    disps[i] = (i > 0) ? (disps[i-1] + counts[i-1]) : 0;
//  }
//  MPI_Gatherv(u, nt, MPI_DOUBLE, uall, counts, disps, MPI_DOUBLE, 0, MPI_COMM_WORLD);
////  MPI_Gather(u, nt, MPI_DOUBLE, uall, nt, MPI_DOUBLE, 0, MPI_COMM_WORLD);
//  double* vall;
//  if (myid ==0) vall = (double*)malloc(sizeof(double)*nx*ny*nz);
//  MPI_Gatherv(v, nt, MPI_DOUBLE, vall, counts, disps, MPI_DOUBLE, 0, MPI_COMM_WORLD);
////  MPI_Gather(v, nt, MPI_DOUBLE, vall, nt, MPI_DOUBLE, 0, MPI_COMM_WORLD);
//
//  double* wall;
//  if(myid ==0 ) wall = (double*)malloc(sizeof(double)*nx*ny*nz);
//  MPI_Gatherv(w, nt, MPI_DOUBLE, wall, counts, disps, MPI_DOUBLE, 0, MPI_COMM_WORLD);
////  MPI_Gather(w, nt, MPI_DOUBLE, wall, nt, MPI_DOUBLE, 0, MPI_COMM_WORLD);
//  
//  // after Fourier series is computed, get the average time spend on computing
//  end = clock();
//  time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
//  totaltime = 0;
//  MPI_Reduce(&time_spent, &totaltime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
//  
//  if (myid == 0) {
//    printf("Done collecting data in %f s \n", totaltime/numprocs);
//  }
//
//  //___________________________________________________________________________________________
//  // write to disk
//  
//  if (enableIO && myid == 0) {
//    
//    begin = clock();
//    char fname[50];
//    
//    sprintf(fname, "u_n%i_m%i.txt", N, nModes);
//    FILE *uFile = fopen(fname, "w");
//    fprintf(uFile, "FLAT\n");
//    fprintf(uFile, "%i \t %i \t %i \t \n", nx, ny, nz);
//    for (int i =0; i<nx*ny*nz; ++i) {
//      fprintf(uFile, "%.32f\n",uall[i]);
//    }
//    fclose(uFile);
//
//    sprintf(fname, "v_n%i_m%i.txt", N, nModes);
//    FILE *vFile = fopen(fname, "w");
//    fprintf(vFile, "FLAT\n");
//    fprintf(vFile, "%i \t %i \t %i \t \n", nx, ny, nz);
//    for (int i =0; i<nx*ny*nz; ++i) {
//      fprintf(vFile, "%.32f\n",vall[i]);
//    }
//    fclose(vFile);
//
//    sprintf(fname, "w_n%i_m%i.txt", N, nModes);
//    FILE *wFile = fopen(fname, "w");
//    fprintf(wFile, "FLAT\n");
//    fprintf(wFile, "%i \t %i \t %i \t \n", nx, ny, nz);
//    for (int i =0; i<nx*ny*nz; ++i) {
//      fprintf(wFile, "%.32f\n",wall[i]);
//    }
//    fclose(wFile);
//
//    end = clock();
//    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
//    printf("Done writing to disk in %f s \n", time_spent);
//  }
//  
//  free(u);
//  free(v);
//  free(w);
//
//  if (myid ==0) free(uall);
//  if (myid ==0) free(vall);
//  if (myid ==0) free(wall);
  //
  MPI_Finalize();
  return 0;
}