#include <stdio.h>
#include <omp.h>
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <ostream>
#include <istream>
#include <sstream>
#include <offload.h>

using namespace std;

#define NP 1000


// double *U_old, *U_new, *malha_x , *malha_y, *X_Foot, *Y_Foot;

__declspec ( target (mic)) double x , y , h , velX , velY , tempoFinal , deltaT , deltaX , deltaY, tempo , alfa , beta , gama;

double t_ini , t_fim;
double xTil , yTil , suporteInferior , suporteSuperior;

__declspec ( target (mic)) long int m , k , N , contagemTempo;
__declspec ( target (mic)) int myRank , numProcs , numLocalPontos, inicioLocal, finalLocal, vizNorte , vizSul;


__declspec ( target (mic)) float U_new[(NP+1)*(NP+1)] __attribute__((aligned(64)));
__declspec ( target (mic)) float U_old[(NP+1)*(NP+1)] __attribute__((aligned(64)));
__declspec ( target (mic)) double malha_x[(NP+1)*(NP+1)] __attribute__((aligned(64)));
__declspec ( target (mic)) double malha_y[(NP+1)*(NP+1)] __attribute__((aligned(64)));


double pulso (double D , double xo , double yo , double x , double y) {
  return (exp(-D*((x-xo)*(x-xo) + (y-yo)*(y-yo))));
}


int main (int argc , char * argv[]){


  N = NP+1;
  h = (double) 1/(N-1);
  deltaX = h;
  deltaY = h;


  for (int i = 0 ; i <= N-1 ; i++) {
      for (int j = 0 ; j <= N-1 ; j++) {
          x = (double) i*h;
          y = (double) j*h;
          malha_x[i*(NP+1)+j] = x;
          malha_y[i*(NP+1)+j] = y;
          U_old[i*(NP+1)+j] = pulso (100 , 0.5 , 0.5 , x , y);
          U_new[i*(NP+1)+j] = U_old[i*(NP+1)+j];
      }
   }

   gama = atof(argv[1]);
   tempoFinal = atof(argv[2]);
   deltaT = atof(argv[3]);
   velX = atof(argv[4]);
   velY = atof(argv[5]);

   alfa = (gama*deltaT)/(deltaX*deltaX);
   beta  = (gama*deltaT)/(deltaY*deltaY);  

   k = 0;
   m = 0;

   t_ini = omp_get_wtime();


   #pragma offload target(mic:0) inout (U_new,U_old,k) in(m,tempoFinal,deltaT,N,alfa,beta,gama,velX,velY) out(tempo)
   {

      // cout << omp_get_num_threads() << "Threads" << endl;

      #pragma omp parallel firstprivate (tempo,tempoFinal,myRank,numProcs,inicioLocal,finalLocal,numLocalPontos,k,m)
      {
                //cout << omp_get_num_threads() << " Threads" << endl;

      		numProcs = omp_get_num_threads();
      		myRank = omp_get_thread_num();
      		numLocalPontos = (N-1)/numProcs;
      		inicioLocal = numLocalPontos*myRank + 1;
      		finalLocal = inicioLocal + numLocalPontos - 1;
      		if (myRank==numProcs-1){
        		finalLocal--;
      		}
  

  		while (tempo < tempoFinal){

       			tempo = (double) (k*deltaT); 


                        // ********************************* Calcula informacao no PE da Caracteristica ************************** //
         		#pragma omp for private (x , y)
          		for (int j = 1 ; j <= N-2 ; j++) {
             			//#pragma simd
				#pragma ivdep
				#pragma vector aligned                    
             			for (int i = inicioLocal ; i <= finalLocal ; i++) {
                			double x = (double) i*deltaX;
                			double y = (double) j*deltaY;
                			double y2 = y;
                			double y1 = y2 - deltaY;
                			double x2 = x;
                			double x1 = x2 - deltaX;
                			double xTil = x - 2*velX*deltaT;
                			double yTil = y - 2*velY*deltaT;
                			U_new[i*(NP+1)+j] = ((y2 - yTil)/(y2 - y1))*(((x2 - xTil)/(x2 - x1))*(U_old[(i-1)*(NP+1)+(j-1)]) + ((xTil-x1)/(x2 - x1))*(U_old[i*(NP+1)+(j-1)]))
                        		+
                              				    ((yTil - y1)/(y2 - y1))*(((x2 - xTil)/(x2 - x1))*(U_old[(i-1)*(NP+1)+j]) + ((xTil-x1)/(x2 - x1))*(U_old[i*(NP+1)+j]));
             			}
          		}


                        // ********************************* 1o Semi Passo Explicito ************************** //
          		#pragma omp for
          		for (int j = 1 ; j <= N-2 ; j++) {
           	 		//#pragma simd
                                #pragma ivdep
                                #pragma vector aligned           
            			for (int i = inicioLocal ; i <= finalLocal ; i++) { 
              				if ( (i+j+m) % 2 == 0 ) {
                 				U_old[i*(NP+1)+j] = U_new[i*(NP+1)+j] + alfa*(U_new[(i+1)*(NP+1)+j] - 2*U_new[i*(NP+1)+j] + U_new[(i-1)*(NP+1)+j]) + 
							beta*(U_new[i*(NP+1)+j+1] - 2*U_new[i*(NP+1)+j] + U_new[i*(NP+1)+j-1]);
              				}
            			}
          		} 

                        // ********************************* 1o Semi Passo Implicito ************************** //
          		#pragma omp for
          		for (int j = 1 ; j <= N-2 ; j++) {
             			//#pragma simd
                                #pragma ivdep
                                #pragma vector aligned         
	     			for (int i = inicioLocal ; i <= finalLocal ; i++) {     
                			if ( (i+j+m) % 2 == 1 ) {
                   				U_old[i*(NP+1)+j] = (U_new[i*(NP+1)+j] + alfa*(U_old[(i+1)*(NP+1)+j] +
							 U_old[(i-1)*(NP+1)+j]) + beta*(U_old[i*(NP+1)+(j+1)] + U_old[i*(NP+1)+(j-1)]))/(1 + 2*alfa + 2*beta);
                			}
             			}
          		}


         		m++;


                        // ********************************* 2o Semi Passo Explicito ************************** //
          		#pragma omp for
          		for (int j = 1 ; j <= N-2 ; j++) {
            			//#pragma simd
                                #pragma ivdep
                                #pragma vector aligned           
            			for (int i = inicioLocal ; i <= finalLocal ; i++) {
              				if ( (i+j+m) % 2 == 0 ) {
                 				U_new[i*(NP+1)+j] = U_old[i*(NP+1)+j] + alfa*(U_old[(i+1)*(NP+1)+j] - 2*U_old[i*(NP+1)+j] + U_old[(i-1)*(NP+1)+j]) +
                                        		beta*(U_old[i*(NP+1)+j+1] - 2*U_old[i*(NP+1)+j] + U_old[i*(NP+1)+j-1]);
              				}
            			}
          		}


                        // ********************************* 2o Semi Passo Implicito ************************** //
          		#pragma omp for
          		for (int j = 1 ; j <= N-2 ; j++) {
             			//#pragma simd
                                #pragma ivdep
                                #pragma vector aligned         
             			for (int i = inicioLocal ; i <= finalLocal ; i++) {
                			if ( (i+j+m) % 2 == 1 ) {
                   				U_new[i*(NP+1)+j] = (U_old[i*(NP+1)+j] + alfa*(U_new[(i+1)*(NP+1)+j] + U_new[(i-1)*(NP+1)+j]) + 
							beta*(U_new[i*(NP+1)+(j+1)] + U_new[i*(NP+1)+(j-1)]))/(1 + 2*alfa + 2*beta);
                			}
             			}
          		}

          		#pragma omp for
          		for (int j = 1 ; j <= N-2 ; j++) {
             			//#pragma simd
                                #pragma ivdep
                                #pragma vector aligned         
             			for (int i = inicioLocal ; i <= finalLocal ; i++) {
                 			U_old[i*(NP+1)+j] = U_new[i*(NP+1)+j];
             			}
          		}

          		m++;
			k++;
		
		} // Fim WHILE

	} // Fim Regiao Paralela

   } // Fim OFFLOAD


   t_fim = omp_get_wtime();

   cout << "with(plots):" << endl;
   cout << "pointplot3d([";
   for (int i = 0 ; i <= N-1 ; i+=10) {
       for (int j = 0 ; j <= N-1 ; j+=10) {
           cout << "[" << malha_x[i*(NP+1)+j] << "," << malha_y[i*(NP+1)+j] << "," << U_old[i*(NP+1)+j] << "]";
           if ((i != N-1) || (j != N-1))
              cout << ",";
       }
   }
   cout << "]);" << endl << endl;
   cout << "# Tempo = " << "  " << (double) t_fim - t_ini << " segundos ..." << endl;  
   cout << "# Dados: Tempo Final = " << tempo << " Difusividade = " << gama << " iteracoes = " << k << endl;

   return(0);

}
