/***********************************************************************/
/*    LICENSED MATERIALS - PROPERTY OF IBM                             */
/*    "RESTRICTED MATERIALS OF IBM"                                    */
/*                                                                     */
/*    5765-422                                                         */
/*    (C) COPYRIGHT IBM CORP. 1995. ALL RIGHTS RESERVED.               */
/*                                                                     */
/*    U.S. GOVERNMENT USERS RESTRICTED RIGHTS - USE, DUPLICATION       */
/*    OR DISCLOSURE RESTRICTED BY GSA ADP SCHEDULE CONTRACT WITH       */
/*    IBM CORP.                                                        */
/***********************************************************************/
#include "diffus.h"

/*   external variables */

int sc_icontext, sc_iam, sc_nnodes;

static int nprow, npcol, myrow, mycol;
/**************************************************************************
 
   in order to simplify the calculations, the initialization routines
   will calculate a block size based on an estimated array size and
   return an integer offset into block size description arrays. That
   way we can have a routine which will allocate array spaces for 
   different array sizes, but have the same or different block size
   under user control
***************************************************************************/ 

static int nblock[MAX_SC_INDEX], mblock[MAX_SC_INDEX];
static int nsize[MAX_SC_INDEX];

static int icontext, iam, nnodes;
static int sc_indx=-1;
static int rsrc=0, csrc=0;
static int initialized = 0;


void initialize_scale(int n, int *index)
  {

/**************************************************************************
 
   this routine really just assigns the block size based on a given
   n and returns an index, so that multiple arrays can be created with
   compatible block sizes 
***************************************************************************/ 
 
  int npc, npr, i;
        
  if ( ! initialized ) 
    {
    blacs_pinfo(&iam, &nnodes);

/*   get the number of nodes and my node */
    sc_iam = iam;
    sc_nnodes = nnodes;

/****************************************************************
    The Fourier transform routines require that the processors 
    be layed out in a 1 x nnodes arrangement
*****************************************************************/
    nprow = 1;
    npcol = nnodes;
        
    blacs_get(0,0,&icontext);
    blacs_gridinit( &icontext, "R", nprow, npcol);
    blacs_gridinfo( icontext, &npr, &npc, &myrow, &mycol);
    sc_icontext = icontext;

/****************************************************************
       This is primarialy a sanity check that the
       system was gridded as expected
*****************************************************************/

    if( npr != nprow || npc != npcol)
      {
      if( iam == 0)
        {
        printf("number of processor rows and columns is incorrect\n");
        printf("nprow, npr, npcol, npc are %d %d %d %d\n",
			nprow, npr, npcol, npc);
        blacs_abort(icontext,1);
        }
      }
    initialized = 1;
    }
	
/*****************************************************************
  if we have already calculated an block size based on estimated
  array size, return the index for that block size
*****************************************************************/

  for( i=0; i < sc_indx; i++)
    if( n == nsize[i])
       {
       *index = i;
       return;
       }
        
  sc_indx++;
/*     compute a block size */
        
  if ( sc_indx > MAX_SC_INDEX )
    if( iam == 0 ) 
      {
      printf("Used more than the maximum number of indices available\n");
      printf("in initialize_scale, Maximum is %d\n", MAX_SC_INDEX);
      blacs_abort(icontext,1);
      }
		
  *index = sc_indx;
  nsize[*index] = n;
        
 
/*  alway choose a square block with a maximum dimension of MAXBLOCK */
/*  Note that MAXBLOCK has been set to a very high number so that the */
/*  fft routines will be properly blocked */

  if( n / nnodes > MAXBLOCK ) 
    mblock[*index] = MAXBLOCK;
  else
    mblock[*index] = n / nnodes;

  if( mblock[*index] < 1 )
    if( iam == 0 )
      {
      printf("problem size too small for number of nodes\n");
      printf("try increasing the nx and ny\n");
      blacs_abort(icontext,1);
      }
  nblock[*index] = mblock[*index];
  }

void allocate_rarray( double ***array, int m, int n)
  {
  int i,j,errors=0;


/* allocate an array of pointers, a pointer for each row */
  *array = ( double **) malloc( sizeof(double *) * n);
  if( *array == NULL ) errors++;


/* allocate all of the space required for the array */
  (*array)[0]  = ( double *) malloc( sizeof(double) * m * n);
  if( (*array)[0] == NULL ) errors++;

/* for each column give a pointer to the column of data */
  if ( errors == 0)
    for( j = 1; j< n; j++)
      (*array)[j] = (*array)[0] + j*m;

  /* at this point, check if there is any allocation errors */
  /* collect the variable errors from each processor */

  igamx2d(sc_icontext,"A"," ",1,1,&errors,1,-1,-1,-1,-1,-1);

  /* if errors is non-zero, abort */
  if( errors)
    {
    printf("allocation failure in initialize_rarray\n");
    blacs_abort(sc_icontext,1);
    }
  }

void allocate_carray( dcmplx ***array, int m, int n)
  {
  int i,j,errors=0;


/* allocate an array of pointers, a pointer for each row */
  *array = ( dcmplx **) malloc( sizeof(dcmplx *) * n);
  if( *array == NULL ) errors++;


/* allocate all of the space required for the array */
  (*array)[0]  = ( dcmplx *) malloc( sizeof(dcmplx) * m * n);
  if( (*array)[0] == NULL ) errors++;

/* for each column give a pointer to the column of data */
  if ( errors == 0)
    for( j = 1; j< n; j++)
      (*array)[j] = (*array)[0] + j*m;

  /* at this point, check if there is any allocation errors */
  /* collect the variable errors from each processor */

  igamx2d(sc_icontext,"A"," ",1,1,&errors,1,-1,-1,-1,-1,-1);
  /* if errors is non-zero, abort */
  if( errors)
    {
    printf("allocation failure in initialize_rarray\n");
    blacs_abort(sc_icontext,1);
    }
  }


/****************************************************************
 
    this routine is provided to dynamically allocate the space
    needed the local part of a global array and initialize the
    associated descriptor array.  It also returns array useful
    indices to do local to global index conversion
 
   initialize_rarray => real array initialization
   initialize_carray => complex array initialization
*****************************************************************/
 
void initialize_rarray( double ***array, int *desc_array,
                        G_index **index_array, int m, int n, int blk_index)
  {

/****************************************************************
    array is a pointer to space which will be initialize
    desc_array contains the descriptor array
    index_array contains the global indices of for each block of the 
                 array
    m    contains the number of rows in the global array
    n    contains the number of columns in the global array
    blk_index contains an index to previously calculated block size
*****************************************************************/
 

  int *address;
  int irows, icols, istat, i, j;
  int start_row_block, start_col_block;
  int mb, nb, num_mb, num_nb,errors=0;


 
/*   check to see if I have calculated block sizes already */

  if ( blk_index < 0 || blk_index > sc_indx )
    if( iam == 0 )
      {
      printf("No initialization done for index %d\n",blk_index);
      blacs_abort(icontext, 1);
      }

  mb = mblock[blk_index];
  nb = nblock[blk_index];

  irows = numroc( m, mb, myrow, rsrc, nprow );
  icols = numroc( n, nb, mycol, csrc, npcol );
        
/* allocate an array of pointers, a pointer for each row */
  *array = ( double **) malloc( sizeof(double *) * icols);
  if( *array == NULL ) errors++;

/* allocate all of the space required for the array */
  (*array)[0]  = ( double *) malloc( sizeof(double) * irows * icols);
  if( (*array)[0] == NULL ) errors++;

/* for each column give a pointer to the row of data */
  for( j = 1; j< icols; j++)
   (*array)[j] = (*array)[0] + j*irows;   


/*   calculate the number of row and column blocks */
 
  num_mb = ( (irows + mb -1)/ mb );
  num_nb = ( (icols + nb -1)/ nb );
	  
  
/* allocate memory for the global index array */
  
  *index_array = ( struct _G_index *) malloc( sizeof( struct _G_index ));
  if ( *index_array == NULL ) 
    {
    errors++;
    }
  else
    {
      /* room for row start block */
    (**index_array).g_start_row_blk = ( int *) malloc( sizeof(int) * num_mb);
    if( (**index_array).g_start_row_blk == NULL ) errors++;

      /* room for row end block */
    (**index_array).g_end_row_blk = ( int *) malloc( sizeof(int) * num_mb);
    if( (**index_array).g_end_row_blk == NULL ) errors++;

      /* room for column start block */
    (**index_array).g_start_col_blk = ( int *) malloc( sizeof(int) * num_nb);
    if( (**index_array).g_start_col_blk == NULL ) errors++;

      /* room for column end block */
    (**index_array).g_end_col_blk = ( int *) malloc( sizeof(int) * num_nb);
    if( (**index_array).g_end_col_blk == NULL ) errors++;
    }

  /* at this point, check if there is any allocation errors */
  /* collect the variable errors from each processor */
  igamx2d(sc_icontext,"A"," ",1,1,&errors,1,-1,-1,-1, -1,-1);

  /* if errors is non-zero, abort */
  if( errors)
    {
    printf("allocation failure in initialize_rarray\n");
    blacs_abort(sc_icontext,1);
    }
   
  /* collect the maximum error from all processes */
  desc_array[DTYPE] = TWOD_TYPE;
  desc_array[M] = m;
  desc_array[N] = n;
  desc_array[MB] = mb;
  desc_array[NB] = nb;
  desc_array[RSRC] = rsrc;
  desc_array[CSRC] = csrc;
  desc_array[CTXT] = icontext;
  desc_array[LLD] = irows;


  start_row_block = ( nprow + myrow - rsrc ) % nprow;
  start_col_block = ( npcol + mycol - csrc ) % npcol;

  (**index_array).num_row_blks = num_mb;
  (**index_array).num_col_blks = num_nb;

/* fill in the row portion of the global index array */
  for ( i=0; i < num_mb; i++)
    {
    (**index_array).g_start_row_blk[i] = (start_row_block + i*nprow)*mb;
    (**index_array).g_end_row_blk[i] = (start_row_block + i*nprow)*mb + mb -1;
    }
    (**index_array).g_end_row_blk[num_mb-1] =
	   (**index_array).g_start_row_blk[num_mb-1] + (irows -1) % mb;
                     
  for ( i=0; i < num_nb; i++)
    {
    (**index_array).g_start_col_blk[i] = (start_col_block + i*npcol)*nb;
    (**index_array).g_end_col_blk[i] = (start_col_block + i*npcol)*nb + nb -1;
    }
    (**index_array).g_end_col_blk[num_nb-1] =
	   (**index_array).g_start_col_blk[num_nb-1] + (icols -1) % nb;
                     
  }



void initialize_carray( dcmplx ***array, int *desc_array,
                        G_index **index_array, int m, int n, int blk_index)
  {

/****************************************************************
    array is a pointer to space which will be initialize
    desc_array contains the PESSL array descriptor
    index_array contains the global indices of for each block of the 
                 array
    m    contains the number of rows in the global array
    n    contains the number of columns in the global array
    blk_index contains an index to previously calculated block size
*****************************************************************/
 

int irows, icols, istat, i, j;
int start_row_block, start_col_block;
int mb, nb, num_mb, num_nb,errors=0;


 
/*   check to see if I have calculated block sizes already */

  if ( blk_index < 1 || blk_index > sc_indx )
    if( iam == 0 )
      {
      printf("No initialization done for index %d\n",blk_index);
      blacs_abort(icontext, 1);
      }

  mb = mblock[blk_index];
  nb = nblock[blk_index];

  irows = numroc( m, mb, myrow, rsrc, nprow );
  icols = numroc( n, nb, mycol, csrc, npcol );
        
/* this is the only real difference between the real and complex   */
/* array initialization, we give the complex array twice the space */

/* allocate an array of pointers, a pointer for each row */
  *array = ( dcmplx **) malloc( sizeof(dcmplx *) * icols);
  if( *array == NULL ) errors++;

/* allocate all of the space required for the array */
  (*array)[0]  = ( dcmplx *) malloc( sizeof(dcmplx) * irows * icols);
  if( (*array)[0] == NULL ) errors++;

/* for each column give a pointer to the row of data */
  for( j = 1; j< icols; j++)
   (*array)[j] = (*array)[0] + j*irows;



/*   calculate the number of row and column blocks */
 
  num_mb = ( (irows + mb -1)/ mb );
  num_nb = ( (icols + nb -1)/ nb );
	  
  
/* allocate memory for the global index array */
  
  *index_array = ( struct _G_index *) malloc( sizeof( struct _G_index ));
  if ( *index_array == NULL ) 
    {
    errors++;
    }
  else
    {
      /* room for row start block */
    (**index_array).g_start_row_blk = ( int *) malloc( sizeof(int) * num_mb);
    if( (**index_array).g_start_row_blk == NULL ) errors++;

      /* room for row end block */
    (**index_array).g_end_row_blk = ( int *) malloc( sizeof(int) * num_mb);
    if( (**index_array).g_end_row_blk == NULL ) errors++;

      /* room for column start block */
    (**index_array).g_start_col_blk = ( int *) malloc( sizeof(int) * num_nb);
    if( (**index_array).g_start_col_blk == NULL ) errors++;

      /* room for column end block */
    (**index_array).g_end_col_blk = ( int *) malloc( sizeof(int) * num_nb);
    if( (**index_array).g_end_col_blk == NULL ) errors++;
    }

  /* at this point, check if there is any allocation errors */
  /* collect the variable errors from each processor */
  igamx2d(sc_icontext,"A"," ",1,1,&errors,1,-1,-1,-1, -1,-1);

  /* if errors is non-zero, abort */
  if( errors)
    {
    printf("allocation failure in initialize_rarray\n");
    blacs_abort(sc_icontext,1);
    }
   
  /* collect the maximum error from all processes */

  desc_array[DTYPE] = TWOD_TYPE;
  desc_array[M] = m;
  desc_array[N] = n;
  desc_array[MB] = mb;
  desc_array[NB] = nb;
  desc_array[RSRC] = rsrc;
  desc_array[CSRC] = csrc;
  desc_array[CTXT] = icontext;
  desc_array[LLD] = irows;


  start_row_block = ( nprow + myrow - rsrc ) % nprow;
  start_col_block = ( npcol + mycol - csrc ) % npcol;

  (**index_array).num_row_blks = num_mb;
  (**index_array).num_col_blks = num_nb;

/* fill in the row portion of the global index array */
  for ( i=0; i < num_mb; i++)
    {
    (**index_array).g_start_row_blk[i] = (start_row_block + i*nprow)*mb;
    (**index_array).g_end_row_blk[i] = (start_row_block + i*nprow)*mb + mb -1;
    }
    (**index_array).g_end_row_blk[num_mb-1] =
	   (**index_array).g_start_row_blk[num_mb-1] + (irows -1) % mb;
                     
  for ( i=0; i < num_nb; i++)
    {
    (**index_array).g_start_col_blk[i] = (start_col_block + i*npcol)*nb;
    (**index_array).g_end_col_blk[i] = (start_col_block + i*npcol)*nb + nb -1;
    }
    (**index_array).g_end_col_blk[num_nb-1] =
	   (**index_array).g_start_col_blk[num_nb-1] + (icols -1) % nb;
                     
  }





void clocal_to_rglobal(dcmplx **a, int *a_d, double *a_glob, int ldag)
  {
/*********************************************************************
     a is the local part of a global complex array
     a_d is the PESSL array descriptor for a
     a_glob is the global matrix outputted on each node
*********************************************************************/
 

/*    Local variables */

int nrow_blks, ncol_blks, ib, jb, ibl, jbl, i, j;
int m, n, nb, mb, ig, jg, il, jl, prow, pcol;
int iarow, iacol, ni, nj, ldal;

/***********************************************************************
     m is number of rows in global matrix
     n is number of columns in global matrix
     mb and nb are rows an columns in each block
     prow and  pcol are the processor row and column containing a block
    ib, jb, ibl, jbl are global and local  block indices
    il, jb, ig, jg are local and global matrix indices
***********************************************************************/
 

/*     Start of executable code */

/* determine the total number of row and column blocks */
  m = a_d[M];
  n = a_d[N];
  mb = a_d[MB];
  nb = a_d[NB];
  iarow = a_d[RSRC];
  iacol = a_d[CSRC];
  ldal = a_d[LLD];

  nrow_blks =  ( m + mb -1 )/ mb;
  ncol_blks = ( n + nb - 1 )/ nb;

 
 /*  loop over all of the blocks */
 
  for ( jb=0; jb < ncol_blks; jb++)
    {

/*  loop over column blocks, determining both local and global j indices */

    jg = jb * nb;
    nj = nb;
    if (jb == ncol_blks - 1) 
         nj = (n - 1) % nb + 1; 
    jbl = ( jb + iacol ) / npcol  - (mycol + iacol) / npcol;
    jl = jbl * nb;
    pcol = (jb + iacol) % npcol;

    for( ib=0; ib < nrow_blks; ib++)
      {

/*  loop over row blocks, determining both local and global i indices */
      ig = ib * mb;
      ni = mb;
      if (ib == nrow_blks - 1) 
              ni = (m - 1) % mb + 1;
      ibl = ( ib + iarow ) / nprow - (myrow + iarow) /nprow;
      il = ibl * mb;
      prow = (ib + iarow) % nprow;

/*   determine if this block is on my processor */

      if( prow == myrow  && pcol == mycol)
        {
/*   block is on my processor */

      /* copy local portion of array to global array block */
        for( j=0; j < nj; j++ )
          for( i=0; i < ni; i++ )
            {
            a_glob[A_INDEX(ig+i, jg+j, ldag)] = RE(a[jl+j][il+i]);
            }

     /* send that array block to all other processors */
        dgebs2d(icontext,"ALL"," ",ni,nj, 
                     &a_glob[A_INDEX(ig,jg,ldag)],ldag);
        }

      else
/*   block is on somebody elses processor */
        {
        dgebr2d(icontext,"ALL"," ",ni,nj,
                  &a_glob[A_INDEX(ig,jg,ldag)],ldag, prow, pcol);
        }
      }
    }
  return;
  }







void rlocal_to_rglobal(double **a, int *a_d, double *a_glob, int ldag)
  {
/*********************************************************************
     a is the local part of a global real array
     a_d is the PESSL array descriptor for a
     a_glob is the global matrix outputted on each node
*********************************************************************/
 

/*    Local variables */

int nrow_blks, ncol_blks, ib, jb, ibl, jbl, i, j;
int m, n, nb, mb, ig, jg, il, jl, prow, pcol;
int iarow, iacol, ni, nj, ldal;

/***********************************************************************
     m is number of rows in global matrix
     n is number of columns in global matrix
     mb and nb are rows an columns in each block
     prow and  pcol are the processor row and column containing a block
    ib, jb, ibl, jbl are global and local  block indices
    il, jb, ig, jg are local and global matrix indices
***********************************************************************/
 

/*     Start of executable code */

/* determine the total number of row and column blocks */
  m = a_d[M];
  n = a_d[N];
  mb = a_d[MB];
  nb = a_d[NB];
  iarow = a_d[RSRC];
  iacol = a_d[CSRC];
  ldal = a_d[LLD];

  nrow_blks =  ( m + mb -1 )/ mb;
  ncol_blks = ( n + nb - 1 )/ nb;

 
 /*  loop over all of the blocks */
 
  for ( jb=0; jb < ncol_blks; jb++)
    {

/*  loop over column blocks, determining both local and global j indices */

    jg = jb * nb;
    nj = nb;
    if (jb == ncol_blks - 1) 
         nj = (n - 1) % nb + 1; 
    jbl = ( jb + iacol ) / npcol  - (mycol + iacol) / npcol;
    jl = jbl * nb;
    pcol = (jb + iacol) % npcol;

    for( ib=0; ib < nrow_blks; ib++)
      {

/*  loop over row blocks, determining both local and global i indices */
      ig = ib * mb;
      ni = mb;
      if (ib == nrow_blks - 1) 
              ni = (m - 1) % mb + 1;
      ibl = ( ib + iarow ) / nprow - (myrow + iarow) /nprow;
      il = ibl * mb;
      prow = (ib + iarow) % nprow;

/*   determine if this block is on my processor */

      if( prow == myrow  && pcol == mycol)
        {
/*   block is on my processor */

      /* copy local portion of array to global array block */
        for( j=0; j < nj; j++ )
          for( i=0; i < ni; i++ )
            a_glob[A_INDEX(ig+i, jg+j, ldag)] = a[jl+j][il+i];

     /* send that array block to all other processors */
        dgebs2d(icontext,"ALL"," ",ni,nj, 
                     &a_glob[A_INDEX(ig,jg,ldag)],ldag);
        }

      else
/*   block is on somebody elses processor */
        {
        dgebr2d(icontext,"ALL"," ",ni,nj,
                  &a_glob[A_INDEX(ig,jg,ldag)],ldag, prow, pcol);
        }
      }
    }
  return;
  }




