/***********************************************************************/
/*    LICENSED MATERIALS - PROPERTY OF IBM                             */
/*    "RESTRICTED MATERIALS OF IBM"                                    */
/*                                                                     */
/*    5765-422                                                         */
/*    (C) COPYRIGHT IBM CORP. 1995. ALL RIGHTS RESERVED.               */
/*                                                                     */
/*    U.S. GOVERNMENT USERS RESTRICTED RIGHTS - USE, DUPLICATION       */
/*    OR DISCLOSURE RESTRICTED BY GSA ADP SCHEDULE CONTRACT WITH       */
/*    IBM CORP.                                                        */
/***********************************************************************/
#include "diffus.h"

void factor_nodes(void);

int pn_fac[5];

void get_diffusion_matrix(double **f, G_index *f_i, int *f_d)
  {
/**************************************************************************
    f is the diffusion operator in the sine function basis
    f_i contains the global indices of the function matrix f
    f_d is the PESSL description array of f, with a few extra elements
**************************************************************************/
 

/**************************************************************************
                 Local variables 
   df contains the diffusion coefficient before the call to pdcft2
   df contains the fourier transpose of diffusion coefficients after the call
   dfg contains the entire fourier transpose of df on each node
**************************************************************************/

  dcmplx **df;
  int df_d[DESC_DIM];
  G_index *df_i;
  double **dfg;
  
/**************************************************************************
   ixi and iyi are arrays which, given a global index
   returns the x and y offsets.  Recall that the large arrays
   are 4 dimensional arrays collapsed into 2 dimensions
   where i = iy*dif_nx + ix
**************************************************************************/
 
  int *ixi, *iyi;
  double scale_f;
  int nx, ny, ix, iy, ixp, iyp;
  int ixdiff, iydiff, ixsum, iysum, num_errors=0;
  int naux1, naux2, i, j, factor1, factor2, idum;
  int ib, jb, il, jl, lda;


/*  ip is a support array for pdcft2 */
  int ip[40];
  int blk_index;
 
  /* start of executable code */

  factor_nodes();
  factor1 = 1;
  if(pn_fac[4])
    factor1 *= 11;
  if(pn_fac[3])
    factor1 *= 7;
  if(pn_fac[2])
    factor1 *= 5;
  while ( pn_fac[1] )
    {
    factor1 *=3;
    pn_fac[1]--;
    }
  factor1 << pn_fac[0];
/* check it out */

/********************************************************************
   Here we are trying to find the smallest number which is evenly
   divisible by the number of processors and is larger than 4*(n+1)
*********************************************************************/
  factor2 = (4*(dif_nx+1) + factor1 -1)/factor1;
  nx = minpower2( factor2,&idum) * factor1;

  factor2 = (4*(dif_ny+1) + factor1 -1)/factor1;
  ny = minpower2( factor2,&idum) * factor1;

  scale_f = 1.0/ (double ) (nx*ny);

/*  get storage for diffusion array */

  initialize_scale(ny, &blk_index);
  initialize_carray(&df, df_d, &df_i, nx, ny, blk_index);
  
/*********************************************************************
     here, we initialize the local part of the global array df, which
     contains the value of the diffusion coefficient function, evenly
     evaluated between (0, 2*pi).  We will do a two dimensional Fourier
     transform on the data.  Since the size of this array is so small,
     nx by ny, and ultimately we have to transfer the whole array to 
     each node, it would probably be more efficient to locally do the
     calculation on each node.
*********************************************************************/
 

/*    get the value of the diffusion coefficient function at */
/*    the necessary points                                   */

/*********************************************************************
    this loop can be simplified considerably. Since blocks of the
    array are column distributed with the block size equal to the number
    of columns divided by the number of processors there is only a single
    column block.  Also since the processor are distributed in a 1 x np
    arrangement, the local row index will equal the global row index.
    However, the loop is perfectly general for other processor arrangements
    and is correct for this particular case
*********************************************************************/
 
  jl = -1;
  for( jb=0; jb< df_i->num_col_blks; jb++)
    for( j=df_i->g_start_col_blk[jb]; j<= df_i->g_end_col_blk[jb]; j++)
      {
      jl++;
      il = -1;
      for (ib=0; ib<df_i->num_row_blks; ib++)
        for(i=df_i->g_start_row_blk[ib]; i<=df_i->g_end_row_blk[ib]; i++)
          {
          il++;
          RE(df[jl][il]) =      /* real part of complex value */
                 diff_coef((TWOPI*(double) i)/(double) nx,
                           (TWOPI*(double) j)/(double) ny);
          IM(df[jl][il]) = 0.;  /* imaginary part of complex value */
          }
      }      


  allocate_rarray( &dfg, nx, ny);

/*  do the Fourier transform */
 
  for(i=0; i < 40; i++)
        ip[i] = 0;
/*  store the array in normal mode overwritting original array */
  ip[0] = 1;
  ip[1] = 1;

  pdcft2(*df, *df, nx, ny, 1, scale_f, sc_icontext, ip); 
 

/*   df now has the fourier coefficients for the diffusion coefficient       */
/*      function                                                             */
/*                                                                           */
/*   Since each processor will need most of the fourier transformed diffusion*/
/*   coefficients, it is useful to broadcast all parts of this matrix        */
/*   to each processor.                                                      */

/*  first allocate the index arrays */

  num_errors=0;
  ixi = ( int *) malloc(sizeof(int) * dif_nx * dif_ny);
  iyi = ( int *) malloc(sizeof(int) * dif_nx * dif_ny);


/*  allocate array for holding global fourier transform */
  
  if( iyi == NULL || ixi == NULL || dfg == NULL)
       num_errors++;

  igamx2d(sc_icontext,"A"," ",1,1,&num_errors,1,-1, -1,-1,-1,-1);
  if( num_errors > 0 ) 
    if( sc_iam == 0 ) 
      {
      printf("Error in allocating scratch arrays in get_diffusion_matrix");
      blacs_abort(sc_icontext, 1);
      }


  clocal_to_rglobal( df, df_d, *dfg, nx );
  blacs_barrier(sc_icontext,"ALL");


/**************************************************************************
    here df contains only local portions of the global array, while
    dfg contains the entire global array.
 
   now load up the diffusion operator
     f(ix,iy;ix',iy')
     
     here we transform the 4d matrix into the 2d matrix where
        i = (iy-1)* dif_nx + ix + 1
        j = (iy'-1)* dif_nx + ix' + 1
**************************************************************************/

 
   /* first calculate the index arrays */
 
  for( ix=1; ix <= dif_nx; ix++ )
    for( iy=1; iy <= dif_ny; iy++ )
      {
      ixi[A_INDEX(ix-1,iy-1,dif_nx)] = ix;
      iyi[A_INDEX(ix-1,iy-1,dif_nx)] = iy;
      }

  jl = -1;
/* loop over column blocks and column index */
  for( jb=0; jb < f_i->num_col_blks; jb++)
    for( j=f_i->g_start_col_blk[jb]; j <= f_i->g_end_col_blk[jb]; j++)
      {
      jl++;  /* jl is local array column index */
      iyp = iyi[j];
      ixp = ixi[j];
      il = -1;
/* now loop over row blocks and row indices */
      for( ib=0; ib < f_i->num_row_blks; ib++)
        for( i=f_i->g_start_row_blk[ib]; i <= f_i->g_end_row_blk[ib]; i++)
          {
          il++;  /*il is local array column index */
	  iy = iyi[i];
	  ix = ixi[i];
	  ixdiff = abs(ix-ixp); 
	  iydiff = abs(iy-iyp);
	  ixsum = (ix+ixp); 
	  iysum = (iy+iyp);
          f[jl][il]  =
             ( ix*ixp + iy*iyp*dif_ly_ratio ) *
             (dfg[iydiff][ixdiff] - dfg[iysum][ixsum] )
             +  ( ix*ixp - iy*iyp*dif_ly_ratio ) * 
             (dfg[iydiff][ixsum] - dfg[iysum][ixdiff]);
          }
      }
  free(ixi);
  free(iyi);
  }

void expand_temp_profile(double **a, G_index *a_i, int *a_d, int lda)
  {
  dcmplx **atmp;
  double *aux1, *aux2;
  int i,j, nx, ny, naux1, naux2, nerrs=0, jl;
  int idum, jb, jx, jy;
  double scale_f;

        
/*********************************************************************
       calculate the minimum power of 2 to hold twice the number of
       Fourier coefficients a sine coefficients. The top half of the
       Fourier coefficients will just minus the bottom half since
       we are forcing the temperature profile to be odd
*********************************************************************/
  nx = minpower2( 2*(dif_nx+1), &idum);
  ny = minpower2( 2*(dif_ny+1), &idum);
  scale_f = -TWOPI / (double)  (nx*ny );

/*       temperature profile allocation */
  allocate_carray( &atmp, nx, ny);
  /*atmp = ( dcmplx *) malloc(nx * ny * sizeof(dcmplx) ); */

  naux1 = 40000 + 2.28*( nx + ny);
  naux2 = 20000 + 66*( 256 + 2*max(nx , ny));

/*  allocate work storage */

  aux1 = ( double *) malloc( sizeof(double) * naux1);
  aux2 = ( double *) malloc( sizeof(double) * naux2);


/*   check for allocation errors */
  if(atmp == NULL || aux1 == NULL || aux2 == NULL )
        nerrs++;
  igamx2d(sc_icontext,"A"," ",1,1,&nerrs,1,-1,-1, -1,-1,-1);
  
  if( nerrs > 0 )
    if( sc_iam == 0 )
      {
      printf("Error in allocating scratch arrays in expand_temp_profile\n");
      blacs_abort(sc_icontext, 1);
      }

/*       fill atmp with the initial temperatures */

  for (i=0; i< nx; i++)
    for (j=0; j< ny; j++)
      {
      RE(atmp[j][i]) =    /* real part of array */
	     init_temp( TWOPI*(double) i/ (double) nx, 
                        TWOPI*(double) j/ (double) ny);
      IM(atmp[j][i]) = 0.; /* imaginary part of array */
      }



/*  do the 2d fourier transform of atmp */
 
/*  first do initialization */
 

  dcft2(1,*atmp,1,nx,*atmp,1,nx,nx,ny,1,scale_f,
        aux1,naux1,aux2,naux2 );

/* then transform */

  dcft2(0,*atmp,1,nx,*atmp,1,nx,nx,ny,1,scale_f,
        aux1,naux1,aux2,naux2);

/* end of fourier transforms */



  jl = -1;
    for( jb=0; jb < a_i->num_col_blks; jb++)
      {
      for( j=a_i->g_start_col_blk[jb]; j <= a_i->g_end_col_blk[jb]; j++)
        {
	jx = j % dif_nx + 1;         
	jy = j / dif_nx + 1;
	jl++;
	a[jl][0] = RE(atmp[jy][jx]);
        }   /* all the imaginary values are 0 by design */
     }

  free(aux1);
  free(aux2);
  return;
  }

void factor_nodes(void)
  {
  int n1, n2, l2;

/*************************************************************
    Determine the prime factorization of nnodes, which must
        be of the form 2**n *3**m * 5**i * 7**j * 11**k
        where m cannot be greater than 2 and i, j or k cannot
        be greater than 1
***************************************************************/
 
  n2 = sc_nnodes;
  n1 = n2/11;
  if( n1*11 == n2)
    {
    pn_fac[4] = 1;
    n2 = n1;
    }

  n1 = n2/7;
  if( n1*7 == n2) 
    {
    pn_fac[3] = 1;
    n2 = n1;
    }
        
  n1 = n2/5;
  if( n1*5 == n2) 
    {
    pn_fac[2] = 1;
    n2 = n1;
    }
        
  n1 = n2/3;
  if( n1*3 == n2) 
    if ( (n1/3)*3 == n1 )
      {
      pn_fac[1] = 2;
      n2 = n1/3;
      }
    else
      {
      pn_fac[1] = 1;
      n2 = n1;
      }
        
  n1 = minpower2(n2,&l2);
  pn_fac[0] = l2;

  if( n1 != n2)
    if( sc_iam == 0)
      {
      printf("Invalid number of nodes, it must have the form:\n");
      printf("2**n * 3**m * 5**i * 7**j * 11**k, where \n");
      printf(" n >= 0, 0<=m<=2 and 0<= i,j,k <=1 \n");
      printf(" choose a power of 2 for best performance\n");
      blacs_abort(sc_icontext,1);
      }
	
  }

int minpower2( int n, int *log2n)
  {
  int  i=-1, m;

  m = n;

  if( m < 1 ) 
    {
    printf("n cannot be negative or zero\n");
    blacs_abort(sc_icontext, 1);
    }
 
  while (m)
    {
    i++;
    m /= 2;
    }

  if ( i > 30 )
    {
    printf("N may not be greater than 2^30, n is %d\n",n);
    blacs_abort(sc_icontext, 1);
    }

  if ( 1 << i != n )
    *log2n = ++i;
  else
    *log2n = i;

  return 1 << i;
  }
int ipow( int i, int j)
  {
   int l1=1;
   if( i == 0) 
      return 0;
   if( j < 0 ) 
      return 0;
   while(j--) 
      l1 *= i;
   return l1;
  }


