/** \file wannier.c
 * Maximally Localized Wannier Functions.
 * 
 * Contains the on function that minimises the spread of all orbitals in one rush in a parallel
 * Jacobi-Diagonalization implementation, ComputeMLWF(), and one routine CalculateSpread() to
 * calculate the spread of a specific orbital, which may be useful in checking on the change of
 * spread during other calculations. convertComplex() helps in typecasting fftw_complex to gsl_complex.
 * 
  Project: ParallelCarParrinello
 \author Frederik Heber
 \date 2006

  File: wannier.c
  $Id: wannier.c,v 1.7 2007-10-12 15:50:38 heber Exp $
*/

#include <math.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_eigen.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_complex.h>
#include <gsl/gsl_complex_math.h>
#include <gsl/gsl_sort_vector.h>
#include <gsl/gsl_heapsort.h>
#include <gsl/gsl_blas.h>
#include <string.h>

#include "data.h"
#include "density.h"
#include "errors.h"
#include "gramsch.h"
#include "helpers.h"
#include "init.h"
#include "myfft.h"
#include "mymath.h"
#include "output.h"
#include "perturbed.h"
#include "wannier.h"


#define max_operators NDIM*2 //!< number of chosen self-adjoint operators when evaluating the spread
#define type Occupied


/** Converts type fftw_complex to gsl_complex.
 * \param a complex number
 * \return b complex number
 */
gsl_complex convertComplex (fftw_complex a) {
  return gsl_complex_rect(c_re(a),c_im(a));
}

/** "merry go round" implementation for parallel index ordering.
 * Given two arrays, one for the upper/left matrix columns, one for the lower/right ones, one step of an index generation is
 * performed which generates once each possible pairing.  
 * \param *top index array 1
 * \param *bot index array 2
 * \param m N/2, where N is the matrix row/column dimension
 * \note taken from [Golub, Matrix computations, 1989, p451]
 */
void MerryGoRoundIndices(int *top, int *bot, int m)
{
  int *old_top, *old_bot;
  int k;
  old_top = (int *) Malloc(sizeof(int)*m, "music: old_top");
  old_bot = (int *) Malloc(sizeof(int)*m, "music: old_bot");
/*  fprintf(stderr,"oldtop\t");
  for (k=0;k<m;k++)
    fprintf(stderr,"%i\t", top[k]);
  fprintf(stderr,"\n");
  fprintf(stderr,"oldbot\t");
  for (k=0;k<m;k++)
    fprintf(stderr,"%i\t", bot[k]);
  fprintf(stderr,"\n");*/
  // first copy arrays    
  for (k=0;k<m;k++) {
    old_top[k] = top[k];
    old_bot[k] = bot[k];
  }
  // then let the music play
  for (k=0;k<m;k++) {
    if (k==1)
      top[k] = old_bot[0];
    else if (k > 1)
      top[k] = old_top[k-1];
    if (k==m-1)
      bot[k] = old_top[k];
    else
      bot[k] = old_bot[k+1];
  }
/*  fprintf(stderr,"top\t");
  for (k=0;k<m;k++)
    fprintf(stderr,"%i\t", top[k]);
  fprintf(stderr,"\n");
  fprintf(stderr,"bot\t");
  for (k=0;k<m;k++)
    fprintf(stderr,"%i\t", bot[k]);
  fprintf(stderr,"\n");*/
  // and finito
  Free(old_top, "MerryGoRoundIndices: old_top");
  Free(old_bot, "MerryGoRoundIndices: old_bot");
}

/** merry-go-round for matrix columns.
 * The trick here is that we must be aware of multiple rotations per process, thus only some of the
 * whole lot of local columns get sent/received, most of them are just shifted via exchanging the various
 * pointers to the matrix columns within the local array.
 * \param comm communicator for circulation
 * \param *Aloc local array of columns
 * \param Num entries per column
 * \param max_rounds number of column pairs in \a *Around
 * \param k offset for tag
 * \param tagS0 MPI tag for sending left column
 * \param tagS1 MPI tag for sending right column
 * \param tagR0 MPI tag for receiving left column
 * \param tagR1 MPI tag for receiving right column
 */ 
void MerryGoRoundColumns(MPI_Comm comm, double **Aloc, int Num, int max_rounds, int k, int tagS0, int tagS1, int tagR0, int tagR1) {
  //double *A_locS1, *A_locS2;  // local columns of A[k]
  //double *A_locR1, *A_locR2;  // local columns of A[k]
  MPI_Request requestS0, requestS1, requestR0, requestR1;
  MPI_Status status;
  int ProcRank, ProcNum;
  int l; 
  MPI_Comm_size (comm, &ProcNum);
  MPI_Comm_rank (comm, &ProcRank);
  double *Abuffer1, *Abuffer2;  // mark the columns that are circulated
  
  //fprintf(stderr,"shifting...");
  if (ProcRank == 0) {
    if (max_rounds > 1) { 
      // get last left column
      Abuffer1 = Aloc[2*(max_rounds-1)]; // note down the free column
      MPI_Isend(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, WannierALTag+2*k, comm, &requestS0);
    } else {
      // get right column
      Abuffer1 = Aloc[1]; // note down the free column
      MPI_Isend(Abuffer1, Num, MPI_DOUBLE, ProcRank+1,  tagS1+2*k, comm, &requestS0);
    } 
    
    //fprintf(stderr,"...left columns...");
    for(l=2*max_rounds-2;l>2;l-=2) // left columns become shifted one place to the right
      Aloc[l] = Aloc[l-2];

    if (max_rounds > 1) { 
      //fprintf(stderr,"...first right...");
      Aloc[2] = Aloc[1]; // get first right column
    }

    //fprintf(stderr,"...right columns...");
    for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
      Aloc[l] = Aloc[l+2];

    //fprintf(stderr,"...last right...");
    Aloc[(2*max_rounds-1)] = Abuffer1;
    MPI_Irecv(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, WannierARTag+2*k, comm, &requestR1);  

  } else if (ProcRank == ProcNum-1) {
    //fprintf(stderr,"...first right...");
    // get first right column
    Abuffer2 = Aloc[1]; // note down the free column
    MPI_Isend(Abuffer2, Num, MPI_DOUBLE, ProcRank-1, WannierARTag+2*k, comm, &requestS1);

    //fprintf(stderr,"...right columns...");
    for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
      Aloc[(l)] = Aloc[(l+2)];

    //fprintf(stderr,"...last right...");
    Aloc[(2*max_rounds-1)] = Aloc[2*(max_rounds-1)]; // Put last left into last right column

    //fprintf(stderr,"...left columns...");
    for(l=2*(max_rounds-1);l>0;l-=2) // left columns become shifted one place to the right
      Aloc[(l)] = Aloc[(l-2)];

    //fprintf(stderr,"...first left...");
//    if (max_rounds > 1)
      Aloc[0] = Abuffer2; // get first left column
    MPI_Irecv(Abuffer2, Num, MPI_DOUBLE, ProcRank-1, WannierALTag+2*k, comm, &requestR0);

  } else {
    // get last left column
    MPI_Isend(Aloc[2*(max_rounds-1)], Num, MPI_DOUBLE, ProcRank+1, WannierALTag+2*k, comm, &requestS0);
    Abuffer1 = Aloc[2*(max_rounds-1)]; // note down the free column

    //fprintf(stderr,"...first right...");
    // get first right column
    MPI_Isend(Aloc[1], Num, MPI_DOUBLE, ProcRank-1, WannierARTag+2*k, comm, &requestS1);
    Abuffer2 = Aloc[1]; // note down the free column

    //fprintf(stderr,"...left columns...");
    for(l=2*(max_rounds-1);l>0;l-=2) // left columns become shifted one place to the right
      Aloc[(l)] = Aloc[(l-2)];

    //fprintf(stderr,"...right columns...");
    for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
      Aloc[(l)] = Aloc[(l+2)];

    //fprintf(stderr,"...first left...");
    Aloc[0] = Abuffer1; // get first left column
    MPI_Irecv(Aloc[0], Num, MPI_DOUBLE, ProcRank-1, WannierALTag+2*k, comm, &requestR0);

    //fprintf(stderr,"...last right...");
    Aloc[(2*max_rounds-1)] = Abuffer2;
    MPI_Irecv(Aloc[(2*max_rounds-1)], Num, MPI_DOUBLE, ProcRank+1, WannierARTag+2*k, comm, &requestR1);
  }
    
  //fprintf(stderr,"...waiting...");
  if (ProcRank != ProcNum-1)
    MPI_Wait(&requestS0, &status);
  if (ProcRank != 0)  // first left column
    MPI_Wait(&requestR0, &status);
  if (ProcRank != 0)
    MPI_Wait(&requestS1, &status);
  if (ProcRank != ProcNum-1)
    MPI_Wait(&requestR1, &status);
  //fprintf(stderr,"...done\n");
}

/** By testing of greatest common divisor of the matrix rows (\a AllocNum) finds suitable parallel cpu group.
 * \param *P Problem at hand
 * \param AllocNum number of rows in matrix
 * \return address of MPI communicator
 */
#ifdef HAVE_INLINE
inline MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
#else
MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
#endif
{
  MPI_Comm *comm = &P->Par.comm_ST;
  
  //if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Comparing groups - AllocNum %i --- All %i\t Psi %i\t PsiT %i\n",P->Par.me, AllocNum, P->Par.Max_me_comm_ST, P->Par.Max_me_comm_ST_Psi, P->Par.Max_my_color_comm_ST_Psi);
  //if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Jacobi diagonalization is done parallely by ", P->Par.me);
  if (AllocNum % (P->Par.Max_me_comm_ST*2) == 0) {  // all parallel
    comm = &P->Par.comm_ST;
    //if (P->Call.out[ReadOut]) fprintf(stderr,"all\n");           
  } else if (P->Par.Max_me_comm_ST_Psi >= P->Par.Max_my_color_comm_ST_Psi) { // always the bigger group comes first  
    if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) {  // coefficients parallel
      comm = &P->Par.comm_ST_Psi;
      //if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");       
    } else if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) {  // Psis parallel
      comm = &P->Par.comm_ST_PsiT;
      //if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");       
    }
  } else {
    if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) {  // Psis parallel 
      comm = &P->Par.comm_ST_PsiT;
      //if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");       
    } else if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) {  // coefficients parallel
      comm = &P->Par.comm_ST_Psi;
      //if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");       
    }
  }
  return comm;
}

/** Allocates and fills Lookup table for sin/cos values at each grid node.
 * \param ***cos_table pointer to two-dimensional lookup table for cosine values
 * \param ***sin_table pointer to two-dimensional lookup table for sine values
 * \param *N array with number of nodes per \a NDIM axis
 */
void CreateSinCosLookupTable(double ***cos_table, double ***sin_table, int *N)
{
  int i, j;
  double argument;
  double **cos_lookup, **sin_lookup;
  
  // create lookup table for sin/cos values
  cos_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *cos_lookup");
  sin_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *sin_lookup");  
  for (i=0;i<NDIM;i++) {
    // allocate memory
    cos_lookup[i] = (double *) Malloc(sizeof(double)*N[i], "ComputeMLWF: cos_lookup");
    sin_lookup[i] = (double *) Malloc(sizeof(double)*N[i], "ComputeMLWF: sin_lookup");
    
    // reset arrays
    SetArrayToDouble0(cos_lookup[i],N[i]);
    SetArrayToDouble0(sin_lookup[i],N[i]);
    
    // create lookup values
    for (j=0;j<N[i];j++) {
      argument = 2*PI/(double)N[i]*(double)j; 
      cos_lookup[i][j] = cos(argument);
      sin_lookup[i][j] = sin(argument);
    }
  }
  *cos_table = cos_lookup;
  *sin_table = sin_lookup;
}

/** Frees memory allocated during CreateSinCosLookupTable().
 * \param ***cos_lookup pointer to two-dimensional lookup table for cosine values
 * \param ***sin_lookup pointer to two-dimensional lookup table for sine values
 */
void FreeSinCosLookupTable(double **cos_lookup, double **sin_lookup)
{
  int i;
  // free lookups
  for (i=0;i<NDIM;i++) {
    Free(cos_lookup[i], "FreeSinCosLookupTable: cos_lookup[i]");
    Free(sin_lookup[i], "FreeSinCosLookupTable: sin_lookup[i]");
  }
  Free(cos_lookup, "FreeSinCosLookupTable: cos_lookup");
  Free(sin_lookup, "FreeSinCosLookupTable: sin_lookup");
}

/** Fills the entries of the six variance matrices.
 * These matrices are parallely diagonalized during Wannier Localization. They are calculated from the
 * wave function and by diagonalization one obtains the unitary transformation with which the Psis are
 * treated afterwards.
 * \param *P Problem at hand
 * \param AllocNum number of rows/columns
 * \param **A pointer to variance matrices
 * \sa ComputeMLWF() - master function. 
 */
void FillHigherOrderRealMomentsMatrices(struct Problem *P, int AllocNum, gsl_matrix **A)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  struct OnePsiElement *OnePsiA, *OnePsiB, *LOnePsiB;
  struct fft_plan_3d *plan = Lat->plan; 
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *)PsiC;
  fftw_complex *work = Dens0->DensityCArray[Temp2Density]; 
  fftw_real **HGcR = &Dens0->DensityArray[HGDensity];  // use HGDensity, 4x Gap..Density, TempDensity as a storage array
  fftw_complex **HGcRC = (fftw_complex**)HGcR;
  fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity];  // use HGcDensity, 4x Gap..Density, TempDensity as an array
  fftw_real **HGcR2 = (fftw_real**)HGcR2C;
  int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
  MPI_Status status;
  fftw_complex *LPsiDatA=NULL, *LPsiDatB=NULL;
  int n[NDIM],n0,i0,iS, Index;
  int Num = Psi->NoOfPsis;  // is number of occupied plus unoccupied states for rows
  int N0 = LevS->Plan0.plan->local_nx;
  int *N = LevS->Plan0.plan->N;
  const int NUpx = LevS->NUp[0];
  const int NUpy = LevS->NUp[1];
  const int NUpz = LevS->NUp[2];
  double argument, PsiAtNode;
  int e,g,i,j,k,l,m,p,u;
  double a_ij = 0, b_ij = 0, A_ij = 0, B_ij = 0;
  double **cos_lookup =  NULL,**sin_lookup = NULL; 

  if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 2\n",P->Par.me);

  debug(P, "Creating Lookup Table");
  CreateSinCosLookupTable(&cos_lookup, &sin_lookup, N);

  debug(P, "Calculating each entry of variance matrices");
  l=-1; // to access U matrix element (0..Num-1)
  // fill the matrices
  for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
    OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
      l++;  // increase l if it is non-extra wave function
      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
        LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
      else
        LPsiDatA = NULL;  // otherwise processes won't enter second loop, though they're supposed to send coefficients!
      
      //fprintf(stderr,"(%i),(%i,%i): fft'd, A[..] and B, back-fft'd acting on \\phi_A\n",P->Par.me,l,0);
      if (LPsiDatA != NULL) {
        CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
        // note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
        for (n0=0;n0<N0;n0++)  
          for (n[1]=0;n[1]<N[1];n[1]++) 
            for (n[2]=0;n[2]<N[2];n[2]++) {
              i0 = n[2]*NUpz+N[2]*NUpz*(n[1]*NUpy+N[1]*NUpy*n0*NUpx);
              iS = n[2]+N[2]*(n[1]+N[1]*n0);
              n[0] = n0 + LevS->Plan0.plan->start_nx;
              for (k=0;k<max_operators;k+=2) {
                e = k/2;
                argument = 2.*PI/(double)(N[e])*(double)(n[e]);
                PsiAtNode = PsiCR[i0] /LevS->MaxN;
                // check lookup
                if (!l) // perform check on first wave function only
                  if ((fabs(cos(argument) - cos_lookup[e][n[e]]) > MYEPSILON) || (fabs(sin(argument) - sin_lookup[e][n[e]]) > MYEPSILON)) {
                    Error(SomeError, "Lookup table does not match real value!");
                  }
                HGcR[k][iS] = cos_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
                HGcR2[k][iS] = cos_lookup[e][n[e]] * HGcR[k][iS]; /* Matrix Vector Mult */
                HGcR[k+1][iS] = sin_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
                HGcR2[k+1][iS] = sin_lookup[e][n[e]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
              }
            }
        for (u=0;u<max_operators;u++) {
          fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[u], work);
          fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[u], work);
        }
      }  
      m = -1; // to access U matrix element (0..Num-1)
      for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
        OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
        if (OnePsiB->PsiType == type) {   // drop all but occupied ones
          m++;  // increase m if it is non-extra wave function
          if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
             LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
          else 
             LOnePsiB = NULL;
          if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
            RecvSource = OnePsiB->my_color_comm_ST_Psi;
            MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
            LPsiDatB=LevS->LPsi->TempPsi;
          } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
            for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
              if (p != OnePsiB->my_color_comm_ST_Psi) 
                MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
            LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
          } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received

          for (u=0;u<max_operators;u++) {
            a_ij = 0;
            b_ij = 0;
            if (LPsiDatA != NULL) { // calculate, reduce and send to all ...
              //fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,u);
              g=0;
              if (LevS->GArray[0].GSq == 0.0) {
                Index = LevS->GArray[g].Index;
                a_ij = (LPsiDatB[0].re*HGcRC[u][Index].re + LPsiDatB[0].im*HGcRC[u][Index].im);
                b_ij = (LPsiDatB[0].re*HGcR2C[u][Index].re + LPsiDatB[0].im*HGcR2C[u][Index].im);
                g++;
              }
              for (; g < LevS->MaxG; g++) {
                Index = LevS->GArray[g].Index;
                a_ij += 2*(LPsiDatB[g].re*HGcRC[u][Index].re + LPsiDatB[g].im*HGcRC[u][Index].im);
                b_ij += 2*(LPsiDatB[g].re*HGcR2C[u][Index].re + LPsiDatB[g].im*HGcR2C[u][Index].im);
              } // due to the symmetry the resulting matrix element is real and symmetric in (i,j) ! (complex multiplication simplifies ...)
              // sum up elements from all coefficients sharing processes
              MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
              MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
              a_ij = A_ij;
              b_ij = B_ij;
              // send element to all Psi-sharing who don't have l local (MPI_Send is a lot slower than AllReduce!)
              MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
              MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
            } else { // receive ...
              MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
              MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);              
            }
            // ... and store
            //fprintf(stderr,"(%i),(%i,%i): A[%i]: setting component (local: %lg, total: %lg)\n",P->Par.me, l,m,u,a_ij,A_ij);
            //fprintf(stderr,"(%i),(%i,%i): B: adding upon component (local: %lg, total: %lg)\n",P->Par.me, l,m,b_ij,B_ij);
            gsl_matrix_set(A[u], l, m, A_ij);
            gsl_matrix_set(A[max_operators], l, m, B_ij + gsl_matrix_get(A[max_operators],l,m));
          }
        }
      }
    }
  }
  // reset extra entries
  for (u=0;u<=max_operators;u++) {
    for (i=Num;i<AllocNum;i++)
      for (j=0;j<AllocNum;j++)
        gsl_matrix_set(A[u], i,j, 0.);
    for (i=Num;i<AllocNum;i++)
      for (j=0;j<AllocNum;j++)
        gsl_matrix_set(A[u], j,i, 0.);
  }
  FreeSinCosLookupTable(cos_lookup, sin_lookup);

/*// print A matrices for debug
  if (P->Par.me == 0) 
    for (u=0;u<max_operators+1;u++) {
     fprintf(stderr, "A[%i] = \n",u);
      for (i=0;i<Num;i++) {
        for (j=0;j<Num;j++)
          fprintf(stderr, "%e\t",gsl_matrix_get(A[u],i,j));
        fprintf(stderr, "\n");
      }
    }  
*/
}

/** Calculates reciprocal second order moments of each PsiType#Occupied orbital.
 * First order is zero due to wave function being real (symmetry condition).
 * \param *P Problem at hand
 */
void CalculateSecondOrderReciprocalMoment(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct FileData *F = &P->Files;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *LevS = R->LevS;
  double result, Result; 
  fftw_complex *LPsiDatA=NULL;
  struct OnePsiElement *OnePsiA;
  int i,j,l,g;
  char spin[12], suffix[18];

  switch (Lat->Psi.PsiST) {
  case SpinDouble:
    strcpy(suffix,".recispread.csv");    
    strcpy(spin,"SpinDouble");
    break;
  case SpinUp:
    strcpy(suffix,".recispread_up.csv");    
    strcpy(spin,"SpinUp");
    break;
  case SpinDown:
    strcpy(suffix,".recispread_down.csv");    
    strcpy(spin,"SpinDown");
    break;
  }
  if(P->Par.me_comm_ST == 0) {
    if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Calculating reciprocal moments ...\n",P->Par.me);
    if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level 
      OpenFile(P, &F->ReciSpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
    else if (F->ReciSpreadFile == NULL) // re-op,18en if not first level and not opened yet (or closed from ParseWannierFile)
      OpenFile(P, &F->ReciSpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
    if (F->ReciSpreadFile == NULL) {
      Error(SomeError,"ComputeMLWF: Error opening Reciprocal spread File!\n");
    } else {   
      fprintf(F->ReciSpreadFile,"===== Reciprocal Spreads of type %s ==========================================================================\n", spin);
    }
  }

  // integrate second order moment
  for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
    OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
        LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
      else
        LPsiDatA = NULL;
      
      if (LPsiDatA != NULL) {
        if (P->Par.me_comm_ST == 0) 
          fprintf(F->ReciSpreadFile,"Psi%d_Lev%d\t", Psi->AllPsiStatus[l].MyGlobalNo, R->LevSNo); 
        for (i=0;i<NDIM;i++) {
          for (j=0;j<NDIM;j++) {
            result = 0.;
            g = 0;
            if (LevS->GArray[0].GSq == 0.0) {
              result += LevS->GArray[g].G[i]*LevS->GArray[g].G[j]*(LPsiDatA[0].re * LPsiDatA[0].re);
              g++;
            }
            for (;g<LevS->MaxG;g++)
              result += LevS->GArray[g].G[i]*LevS->GArray[g].G[j]*2.*(LPsiDatA[g].re * LPsiDatA[g].re + LPsiDatA[g].im * LPsiDatA[g].im);
            //result *= Lat->Volume/LevS->MaxG;
            MPI_Allreduce ( &result, &Result, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
            if (P->Par.me_comm_ST == 0) 
              fprintf(F->ReciSpreadFile,"%2.5lg  ", Result);
          }
        }
        if (P->Par.me_comm_ST == 0) 
          fprintf(F->ReciSpreadFile,"\n");
      }
    }
  }
  if(P->Par.me_comm_ST == 0) { 
    fprintf(F->ReciSpreadFile,"====================================================================================================================\n\n");
    fflush(F->ReciSpreadFile);
  }
}

/** Given the unitary matrix the transformation is performed on the Psis.
 * \param *P Problem at hand
 * \param *U gsl matrix containing the transformation matrix
 * \param Num dimension parameter for the matrix, i.e. number of wave functions
 */
void UnitaryTransformationOnWavefunctions(struct Problem *P, gsl_matrix *U, int Num)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *LevS = R->LevS;
  MPI_Status status;
  struct OnePsiElement *OnePsiB, *OnePsiA, *LOnePsiB;
  int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
  fftw_complex *LPsiDatA=NULL, *LPsiDatB=NULL;
  int g,i,j,l,k,m,p;

  //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation of all wave functions according to U\n",P->Par.me); 
  
  Num = Psi->TypeStartIndex[type+1] - Psi->TypeStartIndex[type]; // recalc Num as we can only work with local Psis from here
  fftw_complex **coeffs_buffer = Malloc(sizeof(fftw_complex *)*Num, "ComputeMLWF: **coeffs_buffer");

  for (l=0;l<Num;l++) // allocate for each local wave function
    coeffs_buffer[l] = LevS->LPsi->OldLocalPsi[l];
  
  //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation ...\n",P->Par.me);
  l=-1; // to access U matrix element (0..Num-1)
  k=-1; // to access the above swap coeffs_buffer (0..LocalNo-1)
  for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
    OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
      l++;  // increase l if it is occupied wave function
      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // local?
        k++; // increase k only if it is a local, non-extra orbital wave function
        LPsiDatA = (fftw_complex *) coeffs_buffer[k];    // new coeffs first go to copy buffer, old ones must not be overwritten yet
        SetArrayToDouble0((double *)LPsiDatA, 2*LevS->MaxG);  // zero buffer part
      } else
        LPsiDatA = NULL;  // otherwise processes won't enter second loop, though they're supposed to send coefficients!

      m = -1; // to access U matrix element (0..Num-1)
      for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
        OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
        if (OnePsiB->PsiType == type) {   // drop all but occupied ones
          m++;  // increase m if it is occupied wave function
          if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
             LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
          else 
             LOnePsiB = NULL;
          if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
            RecvSource = OnePsiB->my_color_comm_ST_Psi;
            MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
            LPsiDatB=LevS->LPsi->TempPsi;
          } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
            for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
              if (p != OnePsiB->my_color_comm_ST_Psi) 
                MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
            LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
          } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received

          if (LPsiDatA != NULL) {
            double tmp = gsl_matrix_get(U,l,m);
            g=0;
            if (LevS->GArray[0].GSq == 0.0) {
              LPsiDatA[g].re += LPsiDatB[g].re * tmp;
              LPsiDatA[g].im += LPsiDatB[g].im * tmp;
              g++;
            }
            for (; g < LevS->MaxG; g++) {
              LPsiDatA[g].re += LPsiDatB[g].re * tmp;
              LPsiDatA[g].im += LPsiDatB[g].im * tmp;
            }
          }
        }
      }
    }
  }

  //if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) STEP 6: Swapping buffer mem\n",P->Par.me);
  // now, as all wave functions are updated, swap the buffer
  l = -1;
  for (k=0;k<Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT;k++) { // go through each local occupied wave function
    if (Psi->AllPsiStatus[k].PsiType == type && Psi->AllPsiStatus[k].my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {
      l++;
      //if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) (k:%i,l:%i) LocalNo = (%i,%i)\t AllPsiNo = (%i,%i)\n", P->Par.me, k,l,Psi->LocalPsiStatus[l].MyLocalNo, Psi->LocalPsiStatus[l].MyGlobalNo, Psi->AllPsiStatus[k].MyLocalNo, Psi->AllPsiStatus[k].MyGlobalNo);
      LPsiDatA = (fftw_complex *)coeffs_buffer[l];
      LPsiDatB = LevS->LPsi->LocalPsi[l];
      for (g=0;g<LevS->MaxG;g++) {
        LPsiDatB[g].re = LPsiDatA[g].re;
        LPsiDatB[g].im = LPsiDatA[g].im;
      }
      // recalculating non-local form factors which are coefficient dependent!
      CalculateNonLocalEnergyNoRT(P, Psi->LocalPsiStatus[l].MyLocalNo);
    }
  } 
  // and free allocated buffer memory
  Free(coeffs_buffer, "UnitaryTransformationOnWavefunctions: coeffs_buffer");
}

/** Changes Wannier Centres according to RunStruct#CommonWannier.
 * \param *P Problem at hand.
 * \param Num number of Psis
 * \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
 * \param *WannierSpread array with wannier spread per wave function
 */
void ChangeWannierCentres(struct Problem *P, int Num, double **WannierCentre, double *WannierSpread)
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct LatticeLevel *LevS = R->LevS;
  int *marker, **group;
  int partner[Num];
  int i,j,l,k;
  int totalflag, flag;
  double q[NDIM], center[NDIM];
  double Spread;
  int *N = LevS->Plan0.plan->N;
  
  switch (R->CommonWannier) {
   case 4:
    debug(P,"Shifting each Wannier centers to cell center");
    for (j=0;j<NDIM;j++)  // center point in [0,1]^3
      center[j] = 0.5;
    RMat33Vec3(q,Lat->RealBasis, center); // transform to real coordinates
    for (i=0; i < Num; i++) {  // go through all occupied wave functions
      for (j=0;j<NDIM;j++)  // put into Wannier centres
        WannierCentre[i][j] = q[j]; 
    }    
    break;
   case 3:
    debug(P,"Shifting Wannier centers individually to nearest grid point");
    for (i=0;i < Num; i++) {  // go through all wave functions
      RMat33Vec3(q, Lat->ReciBasis, WannierCentre[i]);
      for (j=0;j<NDIM;j++) {  // Recibasis is not true inverse but times 2.*PI
        q[j] *= (double)N[j]/(2.*PI);
        
        //fprintf(stderr,"(%i) N[%i]: %i\t tmp %e\t floor %e\t ceil %e\n",P->Par.me, j, N[j], tmp, floor(tmp), ceil(tmp));
        if (fabs((double)floor(q[j]) - q[j]) < fabs((double)ceil(q[j]) - q[j]))
          q[j] = floor(q[j])/(double)N[j];
        else
          q[j] = ceil(q[j])/(double)N[j];
      }
      RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
    }
    break;
   case 2:
    debug(P,"Combining individual orbitals according to spread.");
    //fprintf(stderr,"(%i) Finding multiple bindings and Reweighting Wannier centres\n",P->Par.me);
    debug(P,"finding partners");
    marker = (int*) Malloc(sizeof(int)*(Num+1),"ComputeMLWF: marker");
    group = (int**) Malloc(sizeof(int *)*Num,"ComputeMLWF: group");
    for (l=0;l<Num;l++) {
      group[l] = (int*) Malloc(sizeof(int)*(Num+1),"ComputeMLWF: group[l]");  // each must group must have one more as end marker
      for (k=0;k<=Num;k++)
        group[l][k] = -1; // reset partner group
    }
    for (k=0;k<Num;k++)
      partner[k] = 0;
    debug(P,"mem allocated");
    // go for each orbital through every other, check distance against the sum of both spreads
    // if smaller add to group of this orbital
    for (l=0;l<Num;l++) {
      j=0;  // index for partner group
      for (k=0;k<Num;k++) {  // check this against l
        Spread = 0.;
        for (i=0;i<NDIM;i++) {
          //fprintf(stderr,"(%i) Spread += (%e - %e)^2 \n", P->Par.me, WannierCentre[l][i], WannierCentre[k][i]); 
          Spread += (WannierCentre[l][i] - WannierCentre[k][i])*(WannierCentre[l][i] - WannierCentre[k][i]);
        }
        Spread = sqrt(Spread);  // distance in Spread
        //fprintf(stderr,"(%i) %i to %i: distance %e, SpreadSum = %e + %e = %e \n", P->Par.me, l, k, Spread, WannierSpread[l], WannierSpread[k], WannierSpread[l]+WannierSpread[k]); 
        if (Spread < 1.5*(WannierSpread[l]+WannierSpread[k])) {// if distance smaller than sum of spread
          group[l][j++] = k;  // add k to group of l
          partner[l]++;
          //fprintf(stderr,"(%i) %i added as %i-th member to %i's group.\n", P->Par.me, k, j, l);
        } 
      }
    }
    
    // consistency, for each orbital check if this orbital is also in the group of each referred orbital
    debug(P,"checking consistency");
    totalflag = 1;
    for (l=0;l<Num;l++) // checking l's group
      for (k=0;k<Num;k++) { // k is partner index
        if (group[l][k] != -1) {  // if current index k is a partner
          flag = 0;
          for(j=0;j<Num;j++) {  // go through each entry in l partner's partner group if l exists
            if ((group[ group[l][k] ][j] == l))
              flag = 1;
          }
          //if (flag == 0) fprintf(stderr, "(%i) in %i's group %i is referred as a partner, but not the other way round!\n", P->Par.me, l, group[l][k]);
          if (totalflag == 1) totalflag = flag;
        }
      }
    // for each orbital group (marker group) weight each center to a total and put this into the local WannierCentres
    debug(P,"weight and calculate new centers for partner groups");
    for (l=0;l<=Num;l++)
      marker[l] = 1;
    if (totalflag) {
      for (l=0;l<Num;l++) { // go through each orbital
        if (marker[l] != 0) { // if it hasn't been reweighted
          marker[l] = 0;
          for (i=0;i<NDIM;i++) 
            q[i] = 0.;
          j = 0;
          while (group[l][j] != -1) {
            marker[group[l][j]] = 0;
            for (i=0;i<NDIM;i++) {
              //fprintf(stderr,"(%i) Adding to %i's group, %i entry of %i: %e\n", P->Par.me, l, i, group[l][j], WannierCentre[ group[l][j] ][i]);
              q[i] += WannierCentre[ group[l][j] ][i];
            } 
            j++;
          }
          //fprintf(stderr,"(%i) %i's group: (%e,%e,%e)/%i = (%e,%e,%e)\n", P->Par.me, l, q[0], q[1], q[2], j, q[0]/(double)j, q[1]/(double)j, q[2]/(double)j);
          for (i=0;i<NDIM;i++) {// weight by number of elements in partner group
            q[i] /= (double)(j);
          }
  
          // put WannierCentre into own and all partners'      
          for (i=0;i<NDIM;i++)
            WannierCentre[l][i] = q[i];
          j = 0;  
          while (group[l][j] != -1) {
            for (i=0;i<NDIM;i++)
              WannierCentre[group[l][j]][i] = q[i];
            j++;
          }
        }
      }
    }
    if (P->Call.out[StepLeaderOut]) {
      fprintf(stderr,"Summary:\n");
      fprintf(stderr,"========\n");
      for (i=0;i<Num;i++)
        fprintf(stderr,"%i belongs to a %i-ple binding.\n",i,partner[i]);
    } 
    debug(P,"done");
  
    Free(marker, "ChangeWannierCentres: marker");
    for (l=0;l<Num;l++)
      Free(group[l], "ChangeWannierCentres: group[l]");
    Free(group, "ChangeWannierCentres: group");
    break;
   case 1:
    debug(P,"Individual orbitals are changed to center of all.");
    for (i=0;i<NDIM;i++)  // zero center of weight
      q[i] = 0.;
    for (k=0;k<Num;k++)
      for (i=0;i<NDIM;i++) {  // sum up all orbitals each component
        q[i] += WannierCentre[k][i];
      }
    for (i=0;i<NDIM;i++)  // divide by number
      q[i] /= Num;
    for (k=0;k<Num;k++)
      for (i=0;i<NDIM;i++) {  // put into this function's array
        WannierCentre[k][i] = q[i];
      }
    break;
   case 0:
   default:
   break;
  }
}

/** From the entries of the variance matrices the spread is calculated.
 * WannierCentres are evaluated according to Resta operator: \f$\langle X \rangle = \frac{L}{2\pi} \sum_j {\cal I} \ln{\langle \Psi_j | \exp{(i \frac{2\pi}{L} X)} | \Psi_j \rangle}\f$
 * WannierSpread is: \f$ \sum_j \langle \Psi_j | r^2 | \Psi_j \rangle - \langle \Psi_j | r | psi_j \rangle^2\f$
 * \param *P Problem at hand
 * \param *A variance matrices
 * \param old_spread first term of wannier spread
 * \param spread second term of wannier spread
 * \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
 * \param *WannierSpread array with wannier spread per wave function
 */
void ComputeWannierCentresfromVarianceMatrices(struct Problem *P, gsl_matrix **A, double *spread, double *old_spread, double **WannierCentre, double *WannierSpread)
{
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  struct OnePsiElement *OnePsiA;
  int i,j,k,l;
  double tmp, q[NDIM];
  
  *old_spread = 0;
  *spread = 0;

  // the spread for x,y,z resides in the respective diagonal element of A_.. for each orbital
  i=-1;
  for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
    OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
      i++;  // increase l if it is occupied wave function
      //fprintf(stderr,"(%i) Wannier for %i\n", P->Par.me, i);
      
      // calculate Wannier Centre
      for (j=0;j<NDIM;j++) {
        q[j] = 1./(2.*PI) * GSL_IMAG( gsl_complex_log( gsl_complex_rect(gsl_matrix_get(A[j*2],i,i),gsl_matrix_get(A[j*2+1],i,i))));
        if (q[j] < 0) // change wrap around of above operator to smooth 0...Lat->RealBasisSQ
          q[j] += 1.;
      }
      RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
      
      // store orbital spread and centre in file
      tmp = - pow(gsl_matrix_get(A[0],i,i),2) - pow(gsl_matrix_get(A[1],i,i),2)
            - pow(gsl_matrix_get(A[2],i,i),2) - pow(gsl_matrix_get(A[3],i,i),2)
            - pow(gsl_matrix_get(A[4],i,i),2) - pow(gsl_matrix_get(A[5],i,i),2);          
      WannierSpread[i] = gsl_matrix_get(A[max_operators],i,i) + tmp; 
      //fprintf(stderr,"(%i) WannierSpread[%i] = %e\n", P->Par.me, i, WannierSpread[i]);
      //if (P->Par.me == 0) fprintf(F->SpreadFile,"Orbital %d:\t Wannier center (x,y,z)=(%lg,%lg,%lg)\t Spread sigma^2 = %lg - %lg = %lg\n",
        //Psi->AllPsiStatus[i].MyGlobalNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], gsl_matrix_get(A[max_operators],i,i), -tmp, WannierSpread[i]); 
      //if (P->Par.me == 0) fprintf(F->SpreadFile,"%e\t%e\t%e\n",
        //WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2]);
      
      // gather all spreads
      *old_spread += gsl_matrix_get(A[max_operators],i,i); // tr(U^H B U)
      for (k=0;k<max_operators;k++)
        *spread += pow(gsl_matrix_get(A[k],i,i),2);
    }
  }
}

/** Prints gsl_matrix nicely to screen.
 * \param *P Problem at hand
 * \param *U gsl matrix
 * \param Num number of rows/columns
 * \param *msg name of matrix, prepended before entries
 */
void PrintGSLMatrix(struct Problem *P, gsl_matrix *U, int Num, const char *msg)
{
  int k,l;
  fprintf(stderr,"(%i) %s = \n",P->Par.me, msg);
  for (k=0;k<Num;k++) {    
    for (l=0;l<Num;l++)
      fprintf(stderr,"%e\t",gsl_matrix_get(U,l,k));
    fprintf(stderr,"\n");
  }
}

/** Allocates memory for the  DiagonalizationData structure
 * \param *P Problem at hand
 * \param DiagData pointer to structure
 * \param Num number of rows and columns in matrix
 * \param NumMatrices number of matrices to be simultaneously diagonalized
 * \param extra number of extra matrices to be passively also diagonalized
 */
void InitDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData, int Num, int NumMatrices, int extra)
{
  int i;
  
  // store integer values
  //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 1\n",P->Par.me);
  DiagData->Num = Num;
  DiagData->AllocNum = ceil((double)Num / 2. ) *2;
  DiagData->NumMatrices = NumMatrices;
  DiagData->extra = extra;
  
  // determine communicator and our rank and its size
  DiagData->comm = DetermineParallelGroupbyGCD(P,DiagData->AllocNum);
  MPI_Comm_size (*(DiagData->comm), &(DiagData->ProcNum));
  MPI_Comm_rank (*(DiagData->comm), &(DiagData->ProcRank));
  
  // allocate memory
  DiagData->U = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
  gsl_matrix_set_identity(DiagData->U);
  
  DiagData->A = (gsl_matrix **) Malloc((DiagData->NumMatrices+DiagData->extra) * sizeof(gsl_matrix *), "InitDiagonalization: *A"); 
  for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++) {
    DiagData->A[i] = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
    gsl_matrix_set_zero(DiagData->A[i]);
  }

  // init merry-go-round array for diagonilzation
  DiagData->top = (int *) Malloc(sizeof(int)*DiagData->AllocNum/2, "InitDiagonalization: top");
  DiagData->bot = (int *) Malloc(sizeof(int)*DiagData->AllocNum/2, "InitDiagonalization: bot");
  for (i=0;i<DiagData->AllocNum/2;i++) {
    DiagData->top[i] = 2*i;
    DiagData->bot[i] = 2*i+1;      
  }
/*    // print starting values of index generation tables top and bot
      fprintf(stderr,"top\t");
      for (k=0;k<AllocNum/2;k++)
        fprintf(stderr,"%i\t", top[k]);
      fprintf(stderr,"\n");
      fprintf(stderr,"bot\t");
      for (k=0;k<AllocNum/2;k++)
        fprintf(stderr,"%i\t", bot[k]);
      fprintf(stderr,"\n");*/
}

/** Frees allocated memory for the DiagonalizationData structure.
 * \param DiagData pointer to structure
 */
void FreeDiagonalization(struct DiagonalizationData *DiagData)
{
  int i;
  
  Free(DiagData->top, "FreeDiagonalization: DiagData->top");
  Free(DiagData->bot, "FreeDiagonalization: DiagData->bot");
  gsl_matrix_free(DiagData->U);
  for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++)
    gsl_matrix_free(DiagData->A[i]);   
  Free(DiagData->A, "FreeDiagonalization: DiagData->A");
}

/** Orthogonalizing PsiType#Occupied wave functions for MD.
 * Orthogonalizes wave functions by finding a unitary transformation such that the density remains unchanged.
 * To this end, the overlap matrix \f$\langle \Psi_i | \Psi_j \rangle\f$ is created and diagonalised by Jacobi
 * transformation (product of rotation matrices). The found transformation matrix is applied to all Psis.
 * \param *P Problem at hand
 * \note [Payne] states that GramSchmidt is not well-suited. However, this Jacobi diagonalisation is not well
 * suited for our CG minimisation either. If one wave functions is changed in course of the minimsation
 * procedure, in a Gram-Schmidt-Orthogonalisation only this wave function is changed (although it remains under
 * debate whether subsequent Psis are still orthogonal to this changed wave function!), in a Jacobi-Orthogonali-
 * stion however at least two if not all wave functions are changed. Thus inhibits our fast way of updating the
 * density after a minimisation step -- UpdateDensityCalculation(). Thus, this routine can only be used for a
 * final orthogonalisation of all Psis, after the CG minimisation.
 */
void OrthogonalizePsis(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *LevS = P->R.LevS;
  int Num = Psi->NoOfPsis;
  int i,j,l;
  struct DiagonalizationData DiagData;
  double PsiSP;

  InitDiagonalization(P, &DiagData, Num, 1, 0);
    
  // Calculate Overlap matrix
  for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
    CalculateOverlap(P, l, Occupied);  
  for (i=0;i<Num;i++) // fill gsl matrix with overlap values
    for (j=0;j<Num;j++)
      gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
  //if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i|Psi_J> (before)");
  
  //if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (before)");

  // diagonalize overlap matrix
  Diagonalize(P, &DiagData);

  //if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (after)");

  // apply found unitary transformation to wave functions
  UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);

  // update GramSch stati
  for (i=Psi->TypeStartIndex[Occupied];i<Psi->TypeStartIndex[UnOccupied];i++) {
    GramSchNormalize(P,LevS,LevS->LPsi->LocalPsi[i], 0.);   // calculates square product if 0. is given...
    PsiSP = GramSchGetNorm2(P,LevS,LevS->LPsi->LocalPsi[i]);
    //if (P->Par.me_comm_ST_Psi == 0) fprintf(stderr,"(%i) PsiSP[%i] = %lg\n", P->Par.me, i, PsiSP);
    Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)IsOrthonormal;
  }
  UpdateGramSchAllPsiStatus(P,Psi);
/*
  for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
    CalculateOverlap(P, l, Occupied);  
  for (i=0;i<Num;i++) // fill gsl matrix with overlap values
    for (j=0;j<Num;j++)
      gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
  if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i|Psi_J> (after)");
  */
  // free memory
  FreeDiagonalization(&DiagData);
}

/** Orthogonalizing PsiType#Occupied wave functions and their time derivatives for MD.
 * Ensures that two orthonormality condition are met for Psis: 
 * \f$\langle \Psi_i | \Psi_j \rangle = \delta_{ij}\f$
 * \f$\langle \dot{\Psi_i} | \dot{\Psi_j} \rangle = \delta_{ij}\f$
 * \param *P Problem at hand
 * \note [Payne] states that GramSchmidt is not well-suited, and this stronger
 *       condition is needed for numerical stability
 * \todo create and fill 1stoverlap with finite difference between Psi, OldPsi with R->Deltat
 */
void StrongOrthogonalizePsis(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  int Num = Psi->NoOfPsis;
  int i,j,l;
  struct DiagonalizationData DiagData;

  InitDiagonalization(P, &DiagData, Num, 2, 0);
    
  // Calculate Overlap matrix
  for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++) {
    CalculateOverlap(P, l, Occupied);  
    //Calculate1stOverlap(P, l, Occupied);
  }  
  for (i=0;i<Num;i++) // fill gsl matrix with overlap values
    for (j=0;j<Num;j++) {
      gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
      //gsl_matrix_set(DiagData.A[1],i,j,Psi->1stOverlap[i][j]);
    }
      
  // diagonalize overlap matrix
  Diagonalize(P, &DiagData);
  
  // apply found unitary transformation to wave functions
  UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
  
  // free memory
  FreeDiagonalization(&DiagData);
}

/** Uses either serial or parallel diagonalization.
 * \param *P Problem at hand
 * \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
 * \sa ParallelDiagonalization(), SerialDiagonalization()
 */
void Diagonalize(struct Problem *P, struct DiagonalizationData *DiagData)
{
  if (DiagData->Num != 1) { // one- or multi-process case?
    if (((DiagData->AllocNum % 2) == 0) && (DiagData->ProcNum != 1) && ((DiagData->AllocNum / 2) % DiagData->ProcNum == 0)) {
      /*
      debug(P,"Testing with silly matrix");
      ParallelDiagonalization(P, test, Utest, 1, 0, 4, Num, comm, ProcRank, ProcNum, top, bot);
      if(P->Call.out[ReadOut]) // && P->Par.me == 0) 
        PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
      if(P->Call.out[ReadOut]) // && P->Par.me == 0) 
        PrintGSLMatrix(P, Utest, 4, "Utest (final)");
        */  
      debug(P,"ParallelDiagonalization");
      ParallelDiagonalization(P, DiagData);
    } else {/*
      debug(P,"Testing with silly matrix");
      SerialDiagonalization(P, test, Utest, 1, 0, 4, Num, top, bot);
      if(P->Call.out[ReadOut]) // && P->Par.me == 0) 
        PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
      if(P->Call.out[ReadOut]) // && P->Par.me == 0) 
        PrintGSLMatrix(P, Utest, 4, "Utest (final)");
         */ 
      debug(P,"SerialDiagonalization");
      SerialDiagonalization(P, DiagData);
    }  
    
    //if(P->Call.out[ReadOut]) // && P->Par.me == 0) 
      //PrintGSLMatrix(P, DiagData->U, DiagData->AllocNum, "U");
  }
}

/** Computation of Maximally Localized Wannier Functions.
 * Maximally localized functions are prime when evulating a Hamiltonian with
 * magnetic fields under periodic boundary conditions, as the common position
 * operator is no longer valid. These can be obtained by orbital rotations, which
 * are looked for iteratively and gathered in one transformation matrix, to be
 * later applied to the set of orbital wave functions.
 * 
 * In order to obtain these, the following algorithm is applied:
 * -# Initialize U (identity) as the sought-for transformation matrix
 * -# Compute the real symmetric (due to Gamma point symmetry!) matrix elements
 *    \f$A^{(k)}_{ij} = \langle \phi_i | A^{(k)} | \phi_j \rangle\f$ for the six operators
 *    \f$A^{(k)}\f$
 * -# For each pair of indices (i,j) (i<j) do the following:
 *    -# Compute the 2x2 matrix \f$G = \Re \Bigl ( \sum_k h^H(A^{(k)}) h(A^{(k)}) \Bigr)\f$
 *       where \f$h(A) = [a_{ii} - a_{jj}, a_{ij} + a_{ji}]\f$
 *    -# Obtain eigenvalues and eigenvectors of G. Set \f$[x,y]^T\f$ to the eigenvector of G
 *       corresponding to the greatest eigenvalue, such that \f$x\geq0\f$
 *    -# Compute the rotation matrix R elements (ii,ij,ji,jj) \f$[c,s,-s,c]\f$ different from the 
 *       identity matrix by \f$r=\sqrt{x^2+y^2}\f$, \f$c = \sqrt{\frac{x+r}{2r}}\f$
 *       \f$s=\frac{y}{\sqrt{2r(x+r)}}\f$
 *    -# Perform the similarity operation \f$A^{(k)} \rightarrow R A^{(k)} R\f$
 *    -# Gather the rotations in \f$U = U R\f$
 * -# Compute the total spread \f$\sigma^2_{A^{(k)}}\f$
 * -# Compare the change in spread to a desired minimum RunStruct#EpsWannier, if still greater go to step 3.
 * -# Apply transformations to the orbital wavefunctions \f$ | \phi_i \rangle = \sum_j U_{ij} | \phi_j \rangle\f$
 * -# Compute the position of the Wannier centers from diagonal elements of \f$A^{(k)}\f$, store in
 *    OnePsiElementAddData#WannierCentre
 * 
 * Afterwards, the routine applies the found unitary rotation to the unperturbed group of wave functions.
 * Note that hereby additional memory is needed as old and transformed wave functions must be present at the same
 * time.
 * 
 * The routine uses parallelization if possible. A parallel Jacobi-Diagonalization is implemented using the index
 * generation in music() and shift-columns() such that the evaluated position operator eigenvalue matrices
 * may be diagonalized simultaneously and parallely. We use the implementation explained in
 * [Golub, Matrix computations, 1989, p451].
 * 
 * \param *P Problem at hand
 */
void ComputeMLWF(struct Problem *P) {
  // variables and allocation
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  int i;
  int Num = Psi->NoOfPsis;  // is number of occupied plus unoccupied states for rows
  double **WannierCentre;
  double *WannierSpread;
  double spread, spreadSQ;
  struct DiagonalizationData DiagData;

  if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Beginning localization of orbitals ...\n",P->Par.me);

  InitDiagonalization(P, &DiagData, Num, max_operators, 1);

  // STEP 2: Calculate A[k]_ij = V/N \sum_{G1,G2} C^\ast_{l,G1} c_{m,G2} \sum_R A^{(k)}(R) exp(iR(G2-G1))
  //debug (P,"Calculatung Variance matrices");
  FillHigherOrderRealMomentsMatrices(P, DiagData.AllocNum, DiagData.A);

  //debug (P,"Diagonalizing");
  Diagonalize(P, &DiagData);

  // STEP 6: apply transformation U to all wave functions \sum_i^Num U_ji | \phi_i \rangle = | \phi_j^\ast \rangle
  //debug (P,"Performing Unitary transformation");
  UnitaryTransformationOnWavefunctions(P, DiagData.U, Num);

  if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 7: Compute centres and spread printout\n",P->Par.me);
  WannierCentre = (double **) Malloc(sizeof(double *)*Num, "ComputeMLWF: *WannierCentre");
  WannierSpread = (double *) Malloc(sizeof(double)*Num, "ComputeMLWF: WannierSpread"); 
  for (i=0;i<Num;i++)
    WannierCentre[i] = (double *) Malloc(sizeof(double)*NDIM, "ComputeMLWF: WannierCentre");

  //debug (P,"Computing Wannier centres from variance matrix"); 
  ComputeWannierCentresfromVarianceMatrices(P, DiagData.A, &spread, &spreadSQ, WannierCentre, WannierSpread);

  // join Wannier orbital to groups with common centres under certain conditions
  //debug (P,"Changing Wannier Centres according to CommonWannier");
  ChangeWannierCentres(P, Num, WannierCentre, WannierSpread); 
  
  // write to file
  //debug (P,"Writing spread file");
  WriteWannierFile(P, spread, spreadSQ, WannierCentre, WannierSpread);  

  debug(P,"Free'ing memory");
  // free all remaining memory
  FreeDiagonalization(&DiagData);
  for (i=0;i<Num;i++)
    Free(WannierCentre[i], "ComputeMLWF: WannierCentre[i]");
  Free(WannierCentre, "ComputeMLWF: WannierCentre");
  Free(WannierSpread, "ComputeMLWF: WannierSpread");
}

/** Solves directly for eigenvectors and eigenvalues of a real 2x2 matrix.
 * The eigenvalues are determined by \f$\det{(A-\lambda \cdot I)}\f$, where \f$I\f$ is the unit matrix and
 * \f$\lambda\f$ an eigenvalue. This leads to a pq-formula which is easily evaluated in the first part.
 * 
 * The eigenvectors are then obtained by solving \f$A-\lambda \cdot I)x = 0\f$ for a given eigenvalue
 * \f$\lambda\f$. However, the eigenvector magnitudes are not specified, thus we the equation system is
 * still lacking such an equation. We use this fact to set either coordinate arbitrarily to 1 and then
 * derive the other by solving the equation system. Finally, the eigenvector is normalized.  
 * \param *P Problem at hand
 * \param *A matrix whose eigenvalues/-vectors are to be found
 * \param *eval vector with eigenvalues
 * \param *evec matrix with corresponding eigenvectors in columns
 */
#ifdef HAVE_INLINE 
inline void EigensolverFor22Matrix(struct Problem *P, gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec)
#else
void EigensolverFor22Matrix(struct Problem *P, gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec)
#endif
{
  double a11,a12,a21,a22;
  double ev[2], summand1, summand2, norm;
  int i;
  // find eigenvalues
  a11 = gsl_matrix_get(A,0,0);
  a12 = gsl_matrix_get(A,0,1);
  a21 = gsl_matrix_get(A,1,0);
  a22 = gsl_matrix_get(A,1,1);
  summand1 = (a11+a22)/2.;
  summand2 = sqrt(summand1*summand1 + a12*a21 - a11*a22);
  ev[0] = summand1 + summand2;
  ev[1] = summand1 - summand2;
  gsl_vector_set(eval, 0, ev[0]);
  gsl_vector_set(eval, 1, ev[1]);
  //fprintf (stderr,"(%i) ev1 %lg \t ev2 %lg\n", P->Par.me, ev1, ev2);
  
  // find eigenvectors
  for(i=0;i<2;i++) {
    if (fabs(ev[i]) < MYEPSILON) {
      gsl_matrix_set(evec, 0,i, 0.);
      gsl_matrix_set(evec, 1,i, 0.);
    } else if (fabs(a22-ev[i]) > MYEPSILON) {
      norm = sqrt(1*1 + (a21*a21/(a22-ev[i])/(a22-ev[i])));
      gsl_matrix_set(evec, 0,i, 1./norm);
      gsl_matrix_set(evec, 1,i, -(a21/(a22-ev[i]))/norm);
      //fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
    } else if (fabs(a12) > MYEPSILON) {
      norm = sqrt(1*1 + (a11-ev[i])*(a11-ev[i])/(a12*a12));
      gsl_matrix_set(evec, 0,i, 1./norm);
      gsl_matrix_set(evec, 1,i, -((a11-ev[i])/a12)/norm);
      //fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
    } else {
      if (fabs(a11-ev[i]) > MYEPSILON) {
        norm = sqrt(1*1 + (a12*a12/(a11-ev[i])/(a11-ev[i])));
        gsl_matrix_set(evec, 0,i, -(a12/(a11-ev[i]))/norm);
        gsl_matrix_set(evec, 1,i, 1./norm);
        //fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
      } else if (fabs(a21) > MYEPSILON) {
        norm = sqrt(1*1 + (a22-ev[i])*(a22-ev[i])/(a21*a21));
        gsl_matrix_set(evec, 0,i, -(a22-ev[i])/a21/norm);
        gsl_matrix_set(evec, 1,i, 1./norm);
        //fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
      } else {
        //gsl_matrix_set(evec, 0,i, 0.);
        //gsl_matrix_set(evec, 1,i, 1.);
        fprintf (stderr,"\t evec %i undetermined\n", i);
      }
      //gsl_matrix_set(evec, 0,0, 1.);
      //gsl_matrix_set(evec, 1,0, 0.);
      //fprintf (stderr,"(%i) evec1 undetermined", P->Par.me);
    }
  }
}

/** Calculates sine and cosine values for multiple matrix diagonalization.
 * \param *evec eigenvectors in columns
 * \param *eval corresponding eigenvalues
 * \param *c cosine to be returned
 * \param *s sine to be returned
 */
#ifdef HAVE_INLINE
inline void CalculateRotationAnglesFromEigenvalues(gsl_matrix *evec, gsl_vector *eval, double *c, double *s)
#else
void CalculateRotationAnglesFromEigenvalues(gsl_matrix *evec, gsl_vector *eval, double *c, double *s)
#endif
{
  int index;
  double x,y,r;
  
  index = gsl_vector_max_index (eval);  // get biggest eigenvalue
  //fprintf(stderr,"\t1st: %lg\t2nd: %lg ---  biggest: %i\n", gsl_vector_get(eval, 0), gsl_vector_get(eval, 1), index);
  x = gsl_matrix_get(evec, 0, index);
  y = gsl_matrix_get(evec, 1, index);
  if (x < 0) {  // ensure x>=0 so that rotation angles remain smaller Pi/4
    y = -y;
    x = -x;  
  }
  //z = gsl_matrix_get(evec, 2, index) * x/fabs(x);
  //fprintf(stderr,"\tx %lg\ty %lg\n", x,y);
  
  //fprintf(stderr,"(%i),(%i,%i) STEP 3c\n",P->Par.me,i,j);
  // STEP 3c: calculate R = [[c,s^\ast],[-s,c^\ast]]
  r = sqrt(x*x + y*y); // + z*z);
  if (fabs(r) > MYEPSILON) {
    *c = sqrt((x + r) / (2.*r));
    *s = y / sqrt(2.*r*(x+r)); //, -z / sqrt(2*r*(x+r)));
  } else {
    *c = 1.;
    *s = 0.;
  }
}

/*
 * \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
 * \param *U transformation matrix set to unity matrix
 * \param NumMatrices number of matrices to be diagonalized simultaneously
 * \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
 * \param AllocNum number of rows/columns in matrices
 * \param Num number of wave functions
 * \param ProcRank index in group for this cpu
 * \param ProcNum number of cpus in group
 * \param *top array with top row indices (merry-go-round)
 * \param *bot array with bottom row indices (merry-go-round) 
 */

/** Simultaneous diagonalization of matrices with multiple cpus.
 * \param *P Problem at hand
 * \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
 * \note this is slower given one cpu only than SerialDiagonalization()
 */
void ParallelDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData)
{
  struct RunStruct *R =&P->R;
  int tagR0, tagR1, tagS0, tagS1;
  int iloc, jloc;
  double *s_all, *c_all;
  int round, max_rounds;
  int start;
  int *rcounts, *rdispls;
  double *c, *s;
  int Lsend, Rsend, Lrecv, Rrecv; // where left(right) column is sent to or where it originates from
  int left, right; // left or right neighbour for process
  double spread = 0., old_spread=0., Spread=0.;
  int i,j,k,l,m,u;
  int set;
  int it_steps; // iteration step counter
  double **Aloc[DiagData->NumMatrices+1], **Uloc; // local columns for one step of A[k]
  double *Around[DiagData->NumMatrices+1], *Uround; // all local columns for one round of A[k]
  double *Atotal[DiagData->NumMatrices+1], *Utotal; // all local columns for one round of A[k]
  double a_i, a_j;
  gsl_matrix *G;
  gsl_vector *h;
  gsl_vector *eval;
  gsl_matrix *evec;
  //gsl_eigen_symmv_workspace *w;

  max_rounds = (DiagData->AllocNum / 2)/DiagData->ProcNum;  // each process must perform multiple rotations per step of a set
  if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) start %i\tstep %i\tmax.rounds %i\n",P->Par.me, DiagData->ProcRank, DiagData->ProcNum, max_rounds);
  
  // allocate column vectors for interchange of columns
  debug(P,"allocate column vectors for interchange of columns");
  c = (double *) Malloc(sizeof(double)*max_rounds, "ComputeMLWF: c");
  s = (double *) Malloc(sizeof(double)*max_rounds, "ComputeMLWF: s");
  c_all = (double *) Malloc(sizeof(double)*DiagData->AllocNum/2, "ComputeMLWF: c_all");
  s_all = (double *) Malloc(sizeof(double)*DiagData->AllocNum/2, "ComputeMLWF: s_all");
  rcounts = (int *) Malloc(sizeof(int)*DiagData->ProcNum, "ComputeMLWF: rcounts");
  rdispls = (int *) Malloc(sizeof(int)*DiagData->ProcNum, "ComputeMLWF: rdispls");

  // allocate eigenvector stuff
  debug(P,"allocate eigenvector stuff");
  G = gsl_matrix_calloc (2,2);
  h = gsl_vector_alloc (2);
  eval = gsl_vector_alloc (2);
  evec = gsl_matrix_alloc (2,2);
  //w = gsl_eigen_symmv_alloc(2);
  
  // establish communication partners
  debug(P,"establish communication partners");
  if (DiagData->ProcRank == 0) {
    tagS0 = WannierALTag;   // left p0 always remains left p0
  } else {
    tagS0 = (DiagData->ProcRank == DiagData->ProcNum - 1) ? WannierARTag : WannierALTag; // left p_last becomes right p_last          
  }
  tagS1 = (DiagData->ProcRank == 0) ? WannierALTag : WannierARTag; // right p0 always goes into left p1
  tagR0 = WannierALTag; // 
  tagR1 = WannierARTag; // first process
  if (DiagData->ProcRank == 0) {
    left = DiagData->ProcNum-1;
    right = 1;
    Lsend = 0;
    Rsend = 1;
    Lrecv = 0;
    Rrecv = 1;        
  } else if (DiagData->ProcRank == DiagData->ProcNum - 1) {
    left = DiagData->ProcRank - 1;
    right = 0;
    Lsend = DiagData->ProcRank;
    Rsend = DiagData->ProcRank - 1;
    Lrecv = DiagData->ProcRank - 1;
    Rrecv = DiagData->ProcRank;        
  } else {
    left = DiagData->ProcRank - 1;
    right = DiagData->ProcRank + 1;
    Lsend = DiagData->ProcRank+1;
    Rsend = DiagData->ProcRank - 1;
    Lrecv = DiagData->ProcRank - 1;
    Rrecv = DiagData->ProcRank+1;
  }
  //if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) left %i\t right %i --- Lsend %i\tRsend%i\tLrecv %i\tRrecv%i\n",P->Par.me, left, right, Lsend, Rsend, Lrecv, Rrecv);
  
  // initialise A_loc
  debug(P,"initialise A_loc");
  for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
    //Aloc[k] = (double *) Malloc(sizeof(double)*AllocNum*2, "ComputeMLWF: Aloc[k]");
    Around[k] = (double *) Malloc(sizeof(double)*DiagData->AllocNum*2*max_rounds, "ComputeMLWF: Around[k]");
    Atotal[k] = (double *) Malloc(sizeof(double)*DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Atotal[k]");
    Aloc[k] = (double **) Malloc(sizeof(double *)*2*max_rounds, "ComputeMLWF: Aloc[k]");
    //Around[k] = &Atotal[k][ProcRank*AllocNum*2*max_rounds];
    
    for (round=0;round<max_rounds;round++) {
      Aloc[k][2*round] = &Around[k][DiagData->AllocNum*(2*round)];
      Aloc[k][2*round+1] = &Around[k][DiagData->AllocNum*(2*round+1)];
      for (l=0;l<DiagData->AllocNum;l++) {
        Aloc[k][2*round][l] = gsl_matrix_get(DiagData->A[k],l,2*(DiagData->ProcRank*max_rounds+round));
        Aloc[k][2*round+1][l] = gsl_matrix_get(DiagData->A[k],l,2*(DiagData->ProcRank*max_rounds+round)+1);
        //fprintf(stderr,"(%i) (%i, 0/1) A_loc1 %e\tA_loc2 %e\n",P->Par.me, l, Aloc[k][l],Aloc[k][l+AllocNum]);
      }
    }
  }
  // initialise U_loc
  debug(P,"initialise U_loc");
  //Uloc = (double *) Malloc(sizeof(double)*AllocNum*2, "ComputeMLWF: Uloc");
  Uround = (double *) Malloc(sizeof(double)*DiagData->AllocNum*2*max_rounds, "ComputeMLWF: Uround");
  Utotal = (double *) Malloc(sizeof(double)*DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Utotal");
  Uloc = (double **) Malloc(sizeof(double *)*2*max_rounds, "ComputeMLWF: Uloc");
  //Uround = &Utotal[ProcRank*AllocNum*2*max_rounds];
  for (round=0;round<max_rounds;round++) {
    Uloc[2*round] = &Uround[DiagData->AllocNum*(2*round)];
    Uloc[2*round+1] = &Uround[DiagData->AllocNum*(2*round+1)];
    for (l=0;l<DiagData->AllocNum;l++) {
      Uloc[2*round][l] = gsl_matrix_get(DiagData->U,l,2*(DiagData->ProcRank*max_rounds+round));
      Uloc[2*round+1][l] = gsl_matrix_get(DiagData->U,l,2*(DiagData->ProcRank*max_rounds+round)+1);
      //fprintf(stderr,"(%i) (%i, 0/1) U_loc1 %e\tU_loc2 %e\n",P->Par.me, l, Uloc[l+AllocNum*0],Uloc[l+AllocNum*1]);
    }
  }
  
  // now comes the iteration loop
  debug(P,"now comes the iteration loop");
  it_steps = 0;
  do {
    it_steps++;
    //if (P->Par.me == 0) fprintf(stderr,"(%i) Beginning parallel iteration %i ... ",P->Par.me,it_steps);
    for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 staying at its place all the time
      //fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
      for (round = 0; round < max_rounds;round++) {
        start = DiagData->ProcRank * max_rounds + round;
        // get indices
        i = DiagData->top[start] < DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // minimum of the two indices
        iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1; 
        j = DiagData->top[start] > DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // maximum of the two indices: thus j >  i
        jloc =  DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
        //fprintf(stderr,"(%i) my (%i,%i), loc(%i,%i)\n",P->Par.me, i,j, iloc, jloc);
        
        // calculate rotation angle, i.e. c and s
        //fprintf(stderr,"(%i),(%i,%i) calculate rotation angle\n",P->Par.me,i,j);
        gsl_matrix_set_zero(G);        
        for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
          // Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]         
          //fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,Aloc[k][2*round+iloc][i], Aloc[k][2*round+jloc][j],Aloc[k][2*round+iloc][i] - Aloc[k][2*round+jloc][j]);          
          //fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,Aloc[k][2*round+jloc][i], Aloc[k][2*round+iloc][j],Aloc[k][2*round+jloc][i] + Aloc[k][2*round+iloc][j]);          
          gsl_vector_set(h, 0, Aloc[k][2*round+iloc][i] - Aloc[k][2*round+jloc][j]);
          gsl_vector_set(h, 1, Aloc[k][2*round+jloc][i] + Aloc[k][2*round+iloc][j]);
          
          // Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
          for (l=0;l<2;l++)
            for (m=0;m<2;m++)
                gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
        }
        //fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
        // STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
        EigensolverFor22Matrix(P,G,eval,evec);
        //gsl_eigen_symmv(G, eval, evec, w);  // calculates eigenvalues and eigenvectors of G    

        CalculateRotationAnglesFromEigenvalues(evec, eval, &c[round], &s[round]);
        //fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[round],s[round]);

        //fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
        // V_loc = V_loc * V_small
        //debug(P,"apply rotation to local U");
        for (l=0;l<DiagData->AllocNum;l++) {
          a_i = Uloc[2*round+iloc][l];
          a_j = Uloc[2*round+jloc][l];
          Uloc[2*round+iloc][l] =  c[round] * a_i + s[round] * a_j;
          Uloc[2*round+jloc][l] = -s[round] * a_i + c[round] * a_j;
        }
      }  // end of round        
      // circulate the rotation angles
      //debug(P,"circulate the rotation angles");
      MPI_Allgather(c, max_rounds, MPI_DOUBLE, c_all, max_rounds, MPI_DOUBLE, *(DiagData->comm));   // MPI_Allgather is waaaaay faster than ring circulation
      MPI_Allgather(s, max_rounds, MPI_DOUBLE, s_all, max_rounds, MPI_DOUBLE, *(DiagData->comm));
      //m = start;
      for (l=0;l<DiagData->AllocNum/2;l++) { // for each process
        // we have V_small from process k
        //debug(P,"Apply V_small from other process");
        i = DiagData->top[l] < DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // minimum of the two indices
        j = DiagData->top[l] > DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // maximum of the two indices: thus j >  i
        iloc = DiagData->top[l] < DiagData->bot[l] ? 0 : 1; 
        jloc = DiagData->top[l] > DiagData->bot[l] ? 0 : 1;
        for (m=0;m<max_rounds;m++) {
          //fprintf(stderr,"(%i) %i processes' (%i,%i)\n",P->Par.me, m,i,j);
          // apply row rotation to each A[k]
          for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
            //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+iloc][i],j,Aloc[k][2*m+iloc][j]);
            //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+jloc][i],j,Aloc[k][2*m+jloc][j]);
            a_i = Aloc[k][2*m+iloc][i];
            a_j = Aloc[k][2*m+iloc][j];
            Aloc[k][2*m+iloc][i] =  c_all[l] * a_i + s_all[l] * a_j;
            Aloc[k][2*m+iloc][j] = -s_all[l] * a_i + c_all[l] * a_j;
            a_i = Aloc[k][2*m+jloc][i];
            a_j = Aloc[k][2*m+jloc][j];
            Aloc[k][2*m+jloc][i] =  c_all[l] * a_i + s_all[l] * a_j;
            Aloc[k][2*m+jloc][j] = -s_all[l] * a_i + c_all[l] * a_j;
            //fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+iloc][i],j,Aloc[k][2*m+iloc][j]);
            //fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+jloc][i],j,Aloc[k][2*m+jloc][j]);
          }
        }
      }
      // apply rotation to local operator matrices
      // A_loc = A_loc * V_small 
      //debug(P,"apply rotation to local operator matrices A[k]");
      for (m=0;m<max_rounds;m++) {
        start = DiagData->ProcRank * max_rounds + m;
        iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1; 
        jloc = DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
        for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// extra for B matrix !
          for (l=0;l<DiagData->AllocNum;l++) {
            // Columns, i and j belong to this process only!
            a_i = Aloc[k][2*m+iloc][l];
            a_j = Aloc[k][2*m+jloc][l];
            Aloc[k][2*m+iloc][l] =  c[m] * a_i + s[m] * a_j;
            Aloc[k][2*m+jloc][l] = -s[m] * a_i + c[m] * a_j;
            //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, l, Aloc[k][2*m+iloc][l],l,Aloc[k][2*m+jloc][l]);
          }
        }
      }
      // Shuffling of these round's columns to prepare next rotation set
      for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix ! 
        // extract columns from A
        //debug(P,"extract columns from A");
        MerryGoRoundColumns(*(DiagData->comm), Aloc[k], DiagData->AllocNum, max_rounds, k, tagS0, tagS1, tagR0, tagR1);
                  
      }       
      // and also for V ...
      //debug(P,"extract columns from U");
      MerryGoRoundColumns(*(DiagData->comm), Uloc, DiagData->AllocNum, max_rounds, 0, tagS0, tagS1, tagR0, tagR1);


      // and merry-go-round for the indices too            
      //debug(P,"and merry-go-round for the indices too");
      MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
    }

    //fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
    // STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
    old_spread = Spread;
    spread = 0.;
    for(k=0;k<DiagData->NumMatrices;k++) {   // go through all self-adjoint operators
      for (i=0; i < 2*max_rounds; i++) {  // go through all wave functions
          spread += Aloc[k][i][i+DiagData->ProcRank*2*max_rounds]*Aloc[k][i][i+DiagData->ProcRank*2*max_rounds];
          //spread += gsl_matrix_get(A[k],i,i)*gsl_matrix_get(A[k],i,i);
      }
    }
    MPI_Allreduce(&spread, &Spread, 1, MPI_DOUBLE, MPI_SUM, *(DiagData->comm));
    //Spread = spread;
    if (P->Par.me == 0) { 
      //if(P->Call.out[ReadOut]) 
      //  fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,Spread,R->EpsWannier);
      //else 
        //fprintf(stderr,"%2.9e\n",Spread);
    }
    // STEP 5: check change of variance
  } while (fabs(old_spread-Spread) >= R->EpsWannier);
  // end of iterative diagonalization loop: We have found our final orthogonal U!

  // gather local parts of U into complete matrix 
  for (l=0;l<DiagData->ProcNum;l++)
    rcounts[l] = DiagData->AllocNum;
  debug(P,"allgather U");
  for (round=0;round<2*max_rounds;round++) {
    for (l=0;l<DiagData->ProcNum;l++)
      rdispls[l] = (l*2*max_rounds + round)*DiagData->AllocNum;
    MPI_Allgatherv(Uloc[round], DiagData->AllocNum, MPI_DOUBLE, Utotal, rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
  }
  for (k=0;k<DiagData->AllocNum;k++) {    
    for(l=0;l<DiagData->AllocNum;l++) {
      gsl_matrix_set(DiagData->U,k,l, Utotal[l+k*DiagData->AllocNum]);
    }
  }

  // after one set, gather A[k] from all and calculate spread
  for (l=0;l<DiagData->ProcNum;l++)
    rcounts[l] = DiagData->AllocNum;
  debug(P,"gather A[k] for spread");
  for (u=0;u<DiagData->NumMatrices+DiagData->extra;u++) {// extra for B matrix !
    debug(P,"A[k] all gather");
    for (round=0;round<2*max_rounds;round++) {
      for (l=0;l<DiagData->ProcNum;l++)
        rdispls[l] = (l*2*max_rounds + round)*DiagData->AllocNum;
      MPI_Allgatherv(Aloc[u][round], DiagData->AllocNum, MPI_DOUBLE, Atotal[u], rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
    }
    for (k=0;k<DiagData->AllocNum;k++) {    
      for(l=0;l<DiagData->AllocNum;l++) {
        gsl_matrix_set(DiagData->A[u],k,l, Atotal[u][l+k*DiagData->AllocNum]);
      }
    }
  }
   
  // free eigenvector stuff     
  gsl_vector_free(h);
  gsl_matrix_free(G);
  //gsl_eigen_symmv_free(w);  
  gsl_vector_free(eval);
  gsl_matrix_free(evec);
 
  // Free column vectors
  for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
    Free(Atotal[k], "ParallelDiagonalization: Atotal[k]");
    Free(Around[k], "ParallelDiagonalization: Around[k]");
  }
  Free(Uround, "ParallelDiagonalization: Uround");
  Free(Utotal, "ParallelDiagonalization: Utotal");
  Free(c_all, "ParallelDiagonalization: c_all");
  Free(s_all, "ParallelDiagonalization: s_all");
  Free(c, "ParallelDiagonalization: c");
  Free(s, "ParallelDiagonalization: s");
  Free(rcounts, "ParallelDiagonalization: rcounts");
  Free(rdispls, "ParallelDiagonalization: rdispls");
}
/*
 * \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
 * \param *U transformation matrix set to unity matrix
 * \param NumMatrices number of matrices to be diagonalized
 * \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
 * \param AllocNum number of rows/columns in matrices
 * \param Num number of wave functions
 * \param *top array with top row indices (merry-go-round)
 * \param *bot array with bottom row indices (merry-go-round) 
 */

/** Simultaneous Diagonalization of variances matrices with one cpu.
 * \param *P Problem at hand
 * \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
 * \note this is faster given one cpu only than ParallelDiagonalization()
 */
void SerialDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData)
{
  struct RunStruct *R = &P->R;
  gsl_matrix *G;
  gsl_vector *h;
  gsl_vector *eval;
  gsl_matrix *evec;
  //gsl_eigen_symmv_workspace *w;
  double *c,*s,a_i,a_j;
  int it_steps, set, ProcRank;
  int i,j,k,l,m;
  double spread = 0., old_spread = 0.;\

  // allocate eigenvector stuff
  debug(P,"allocate eigenvector stuff");
  G = gsl_matrix_calloc (2,2);
  h = gsl_vector_alloc (2);
  eval = gsl_vector_alloc (2);
  evec = gsl_matrix_alloc (2,2);
  //w = gsl_eigen_symmv_alloc(2);

  c = (double *) Malloc(sizeof(double), "ComputeMLWF: c");
  s = (double *) Malloc(sizeof(double), "ComputeMLWF: s");
  debug(P,"now comes the iteration loop");
  it_steps=0;
  do {
    it_steps++;
    //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 3: Iteratively maximize negative spread part\n",P->Par.me);
    //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Beginning iteration %i ... ",P->Par.me,it_steps);
    for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 stating at its place all the time
      //fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
      // STEP 3: for all index pairs 0<= i<j <AllocNum
      for (ProcRank=0;ProcRank<DiagData->AllocNum/2;ProcRank++) {
        // get indices
        i = DiagData->top[ProcRank] < DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // minimum of the two indices
        j = DiagData->top[ProcRank] > DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // maximum of the two indices: thus j >  i 
        //fprintf(stderr,"(%i),(%i,%i) STEP 3a\n",P->Par.me,i,j);
        // STEP 3a: Calculate G
        gsl_matrix_set_zero(G);
        
        for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
          // Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
          //fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,i), gsl_matrix_get(DiagData->A[k],j,j),gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));          
          //fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,j), gsl_matrix_get(DiagData->A[k],j,i),gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));          
          gsl_vector_set(h, 0, gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));
          gsl_vector_set(h, 1, gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));
          //gsl_vector_complex_set(h, 2, gsl_complex_mul_imag(gsl_complex_add(gsl_matrix_complex_get(A[k],j,i), gsl_matrix_complex_get(A[k],i,j)),1));
          
          // Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
          for (l=0;l<2;l++)
            for (m=0;m<2;m++)
                gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
                
          //PrintGSLMatrix(P,G,2, "G");
        }
      
        //fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
        // STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
        EigensolverFor22Matrix(P,G,eval,evec);
        //gsl_eigen_symmv(G, eval, evec, w);  // calculates eigenvalues and eigenvectors of G

        //PrintGSLMatrix(P, evec, 2, "Eigenvectors");
        CalculateRotationAnglesFromEigenvalues(evec, eval, c, s);
        //fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[0],s[0]);
     
        //fprintf(stderr,"(%i),(%i,%i) STEP 3d\n",P->Par.me,i,j);
        // STEP 3d: apply rotation R to rows and columns of A^{(k)}
        for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
          //PrintGSLMatrix(P,A[k],AllocNum, "A (before rot)");
          for (l=0;l<DiagData->AllocNum;l++) {
            // Rows
            a_i = gsl_matrix_get(DiagData->A[k],i,l);
            a_j = gsl_matrix_get(DiagData->A[k],j,l);
            gsl_matrix_set(DiagData->A[k], i, l,  c[0] * a_i + s[0] * a_j);
            gsl_matrix_set(DiagData->A[k], j, l, -s[0] * a_i + c[0] * a_j);
          }
          for (l=0;l<DiagData->AllocNum;l++) {
            // Columns
            a_i = gsl_matrix_get(DiagData->A[k],l,i);
            a_j = gsl_matrix_get(DiagData->A[k],l,j);
            gsl_matrix_set(DiagData->A[k], l, i,  c[0] * a_i + s[0] * a_j);
            gsl_matrix_set(DiagData->A[k], l, j, -s[0] * a_i + c[0] * a_j);
          }
          //PrintGSLMatrix(P,A[k],AllocNum, "A (after rot)");
        }
        //fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
        //PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (before rot)");
        // STEP 3e: apply U = R*U
        for (l=0;l<DiagData->AllocNum;l++) {
            a_i = gsl_matrix_get(DiagData->U,i,l);
            a_j = gsl_matrix_get(DiagData->U,j,l);
            gsl_matrix_set(DiagData->U, i, l,  c[0] * a_i + s[0] * a_j);
            gsl_matrix_set(DiagData->U, j, l, -s[0] * a_i + c[0] * a_j);
        }
        //PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (after rot)");
      }
      // and merry-go-round for the indices too            
      //debug(P,"and merry-go-round for the indices too");
      if (DiagData->AllocNum > 2) MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
    }
   
    //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
    // STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
    old_spread = spread;
    spread = 0.;
    for(k=0;k<DiagData->NumMatrices;k++) {   // go through all self-adjoint operators
      for (i=0; i < DiagData->AllocNum; i++) {  // go through all wave functions
          spread += pow(gsl_matrix_get(DiagData->A[k],i,i),2);
      }
    }
    if(P->Par.me == 0) {
      //if(P->Call.out[ReadOut]) 
      //  fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,spread,R->EpsWannier);
      //else 
        //fprintf(stderr,"%2.9e\n",spread);
    }
    // STEP 5: check change of variance
  } while (fabs(old_spread-spread) >= R->EpsWannier);
  // end of iterative diagonalization loop: We have found our final orthogonal U!
  Free(c, "SerialDiagonalization: c");
  Free(s, "SerialDiagonalization: s");
  // free eigenvector stuff     
  gsl_vector_free(h);
  gsl_matrix_free(G);
  //gsl_eigen_symmv_free(w);  
  gsl_vector_free(eval);
  gsl_matrix_free(evec);
}

/** Writes the wannier centres and spread to file.
 * Also puts centres from array into OnePsiElementAddData structure.
 * \param *P Problem at hand
 * \param old_spread first term of wannier spread
 * \param spread second term of wannier spread
 * \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
 * \param *WannierSpread array with wannier spread per wave function
 */
void WriteWannierFile(struct Problem *P, double spread, double old_spread, double **WannierCentre, double *WannierSpread)
{
  struct FileData *F = &P->Files;
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  struct OnePsiElement *OnePsiA;
  char spin[12], suffix[18];
  int i,j,l;

  if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Spread printout\n", P->Par.me);

  switch (Lat->Psi.PsiST) {
  case SpinDouble:
    strcpy(suffix,".spread.csv");    
    strcpy(spin,"SpinDouble");
    break;
  case SpinUp:
    strcpy(suffix,".spread_up.csv");    
    strcpy(spin,"SpinUp");
    break;
  case SpinDown:
    strcpy(suffix,".spread_down.csv");    
    strcpy(spin,"SpinDown");
    break;
  }
  if (P->Par.me_comm_ST == 0) {
    if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level 
      OpenFile(P, &F->SpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
    else if (F->SpreadFile == NULL) // re-open if not first level and not opened yet (or closed from ParseWannierFile)
      OpenFile(P, &F->SpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
    if (F->SpreadFile == NULL) {
      Error(SomeError,"WriteWannierFile: Error opening Wannier File!\n");
    } else {
      fprintf(F->SpreadFile,"#===== W A N N I E R  C E N T R E S for Level %d of type %s ========================\n", R->LevSNo, spin);
      fprintf(F->SpreadFile,"# Orbital+Level\tx\ty\tz\tSpread\n");
    }
  }
  
  // put (new) WannierCentres into local ones and into file
  i=-1;
  for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
    OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
      i++;  // increase l if it is occupied wave function
      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {// is this local?
        for (j=0;j<NDIM;j++)
          Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[i][j];
      }
      if (P->Par.me_comm_ST == 0) 
        fprintf(F->SpreadFile,"Psi%d_Lev%d\t%lg\t%lg\t%lg\t%lg\n", Psi->AllPsiStatus[i].MyGlobalNo, R->LevSNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], WannierSpread[i]); 
    }
  }    
  if (P->Par.me_comm_ST == 0) {
    fprintf(F->SpreadFile,"\n#Matrix traces\tB_ii\tA_ii^2\tTotal (B_ii - A_ii^2)\n");
    fprintf(F->SpreadFile,"TotalSpread_L%d\t%lg\t%lg\t%lg\n\n",R->LevSNo, old_spread, spread, old_spread - spread);
    fflush(F->SpreadFile);
  }
}


/** Parses the spread file and puts values into OnePsiElementAddData#WannierCentre.
 * \param *P Problem at hand
 * \return 1 - success, 0 - failure
 */
int ParseWannierFile(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi;
  struct OnePsiElement *OnePsiA;
  int i,l,j, msglen;
  FILE *SpreadFile;
  char *tagname = NULL;
  char suffix[18];
  double WannierCentre[NDIM+1]; // combined centre and spread
  MPI_Status status;
  int signal = 0; // 1 - ok, 0 - error

  switch (Lat->Psi.PsiST) {
  case SpinDouble:
    strcpy(suffix,".spread.csv");    
    break;
  case SpinUp:
    strcpy(suffix,".spread_up.csv");    
    break;
  case SpinDown:
    strcpy(suffix,".spread_down.csv");    
    break;
  }
  if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Parsing Wannier Centres from file ... \n", P->Par.me);
  
  if (P->Par.me_comm_ST == 0) {
    tagname = (char *) Malloc(sizeof(char)*255, "ParseWannierFile: *tagname");
    if(!OpenFile(P, &SpreadFile, suffix, "r", P->Call.out[ReadOut])) { // check if file exists
      if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
        Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
      return 0;
      //Error(SomeError,"ParseWannierFile: Opening failed\n");
    }
    signal = 1;
    if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
      Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
  } else {
    if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
      Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
    if (signal == 0)
      return 0;
  }
  i=-1;
  for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
    OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
      i++;  // increase l if it is occupied wave function
      if (P->Par.me_comm_ST == 0) { // only process 0 may access the spread file
        sprintf(tagname,"Psi%d_Lev%d",i,R->LevSNo);
        signal = 0;
        if (!ParseForParameter(0,SpreadFile,tagname,0,3,1,row_double,WannierCentre,1,optional)) {
          //Error(SomeError,"ParseWannierFile: Parsing WannierCentre failed");
          if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
            Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
          return 0;
        }
        if (!ParseForParameter(0,SpreadFile,tagname,0,4,1,double_type,&WannierCentre[NDIM],1,optional)) {
          //Error(SomeError,"ParseWannierFile: Parsing WannierSpread failed");
          if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
            Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
          return 0;
        }
        signal = 1;
        if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
          Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
      } else {
        if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
          Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
        if (signal == 0)
          return 0;
      }
      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // is this Psi local?
        if ((P->Par.me_comm_ST != 0) && (P->Par.me_comm_ST_Psi == 0)) { // if they don't belong to process 0 and we are a leader of a Psi group, receive 'em
          if (MPI_Recv(WannierCentre, NDIM+1, MPI_DOUBLE, 0, ParseWannierTag, P->Par.comm_ST_PsiT, &status) != MPI_SUCCESS)
            Error(SomeError,"ParseWannierFile: MPI_Recv of WannierCentre/Spread from process 0 failed");
            //return 0;
          MPI_Get_count(&status, MPI_DOUBLE, &msglen);
          if (msglen != NDIM+1) 
            Error(SomeError,"ParseWannierFile: MPI_Recv of WannierCentre/Spread from process 0 failed due to wrong item count");
            //return 0;
        }
        if (MPI_Bcast(WannierCentre, NDIM+1, MPI_DOUBLE, 0, P->Par.comm_ST_Psi) != MPI_SUCCESS) // Bcast to all processes of the Psi group from leader
          Error(SomeError,"ParseWannierFile: MPI_Bcast of WannierCentre/Spread to sub process in Psi group failed");
          //return 0;
        // and store 'em (for all who have this Psi local)
        if ((P->Par.me == 0) && P->Call.out[ValueOut]) fprintf(stderr,"(%i) Psi %i, L %i: (x,y,z) = (%lg, %lg, %lg), Spread %lg\n",P->Par.me,i, R->LevSNo, WannierCentre[0], WannierCentre[1], WannierCentre[2], WannierCentre[NDIM]);
        for (j=0;j<NDIM;j++) Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[j];
        Psi->AddData[OnePsiA->MyLocalNo].WannierSpread = WannierCentre[NDIM];
        //if (P->Par.me == 0 && P->Call.out[ValueOut]) fprintf(stderr,"(%i) %s\t%lg\t%lg\t%lg\t\t%lg\n",P->Par.me, tagname,Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[0],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[1],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[2],Psi->AddData[OnePsiA->MyLocalNo].WannierSpread);
      } else if (P->Par.me_comm_ST == 0) { // if they are not local, yet we are process 0, send 'em to leader of its Psi group
        if (MPI_Send(WannierCentre, NDIM+1, MPI_DOUBLE, OnePsiA->my_color_comm_ST_Psi, ParseWannierTag, P->Par.comm_ST_PsiT) != MPI_SUCCESS)
          Error(SomeError,"ParseWannierFile: MPI_Send of WannierCentre/Spread to process 0 of owning Psi group failed");
          //return 0;
      }
    }
  }
  if ((SpreadFile != NULL) && (P->Par.me_comm_ST == 0)) { 
    fclose(SpreadFile);
    Free(tagname, "ParseWannierFile: *tagname");
  }
  //fprintf(stderr,"(%i) Parsing Wannier files succeeded!\n", P->Par.me);
  return 1;
}

/** Calculates the spread of orbital \a i.
 * Stored in OnePsiElementAddData#WannierSpread.
 * \param *P Problem at hand
 * \param i i-th wave function (note "extra" ones are not counted!)
 * \return spread \f$\sigma^2_{A^{(k)}}\f$
 */
double CalculateSpread(struct Problem *P, int i) {
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  struct fft_plan_3d *plan = Lat->plan; 
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *)PsiC;
  fftw_complex *work = Dens0->DensityCArray[Temp2Density];
  fftw_real **HGcR = &Dens0->DensityArray[HGcDensity];  // use HGcDensity, 4x Gap..Density, TempDensity as a storage array
  fftw_complex **HGcRC = (fftw_complex**)HGcR;
  fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity];  // use HGcDensity, 4x Gap..Density, TempDensity as an array
  fftw_real **HGcR2 = (fftw_real**)HGcR2C;
  MPI_Status status;
  struct OnePsiElement *OnePsiA, *LOnePsiA;
  int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
  fftw_complex *LPsiDatA=NULL;
  int k,n[NDIM],n0, *N,N0, g, p, iS, i0, Index;
  N0 = LevS->Plan0.plan->local_nx;
  N = LevS->Plan0.plan->N;
  const int NUpx = LevS->NUp[0];
  const int NUpy = LevS->NUp[1];
  const int NUpz = LevS->NUp[2];
  double a_ij, b_ij, A_ij, B_ij;
  double tmp, tmp2, spread = 0;
  double **cos_lookup, **sin_lookup;
  
  b_ij = 0;

  CreateSinCosLookupTable(&cos_lookup,&sin_lookup,N);

  // fill matrices
  OnePsiA = &Psi->AllPsiStatus[i];    // grab the desired OnePsiA
  if (OnePsiA->PsiType != Extra) {   // drop if extra one
    if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
       LOnePsiA = &Psi->LocalPsiStatus[OnePsiA->MyLocalNo];
    else 
       LOnePsiA = NULL;
    if (LOnePsiA == NULL) {   // if it's not local ... receive it from respective process into TempPsi
      RecvSource = OnePsiA->my_color_comm_ST_Psi;
      MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag1, P->Par.comm_ST_PsiT, &status );
      LPsiDatA=LevS->LPsi->TempPsi;
    } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
      for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
        if (p != OnePsiA->my_color_comm_ST_Psi) 
          MPI_Send( LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag1, P->Par.comm_ST_PsiT);
      LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
    } // LPsiDatA is now set to the coefficients of OnePsi either stored or MPI_Received
      
    CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
    // note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
    for (n0=0;n0<N0;n0++)  
      for (n[1]=0;n[1]<N[1];n[1]++) 
        for (n[2]=0;n[2]<N[2];n[2]++) {
          i0 = n[2]*NUpz+N[2]*NUpz*(n[1]*NUpy+N[1]*NUpy*n0*NUpx);
          iS = n[2]+N[2]*(n[1]+N[1]*n0);
          n[0] = n0 + LevS->Plan0.plan->start_nx;
          for (k=0;k<max_operators;k+=2) {
            tmp = 2*PI/(double)(N[k/2])*(double)(n[k/2]);
            tmp2 = PsiCR[i0] /LevS->MaxN;
            // check lookup
            if ((fabs(cos(tmp) - cos_lookup[k/2][n[k/2]]) > MYEPSILON) || (fabs(sin(tmp) - sin_lookup[k/2][n[k/2]]) > MYEPSILON)) {
              fprintf(stderr,"(%i) (cos) %2.15e against (lookup) %2.15e,\t(sin) %2.15e against (lookup) %2.15e\n", P->Par.me, cos(tmp), cos_lookup[k/2][n[k/2]],sin(tmp),sin_lookup[k/2][n[k/2]]);   
              Error(SomeError, "Lookup table does not match real value!");
            }
//            HGcR[k][iS] = cos_lookup[k/2][n[k/2]] * tmp2; /* Matrix Vector Mult */
//            HGcR2[k][iS] = cos_lookup[k/2][n[k/2]] * HGcR[k][iS]; /* Matrix Vector Mult */
//            HGcR[k+1][iS] = sin_lookup[k/2][n[k/2]] * tmp2; /* Matrix Vector Mult */
//            HGcR2[k+1][iS] = sin_lookup[k/2][n[k/2]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
            HGcR[k][iS] = cos(tmp) * tmp2; /* Matrix Vector Mult */
            HGcR2[k][iS] = pow(cos(tmp),2) * tmp2; /* Matrix Vector Mult */
            HGcR[k+1][iS] = sin(tmp) * tmp2; /* Matrix Vector Mult */
            HGcR2[k+1][iS] = pow(sin(tmp),2) * tmp2; /* Matrix Vector Mult */
          }
        }
    for (k=0;k<max_operators;k++) {
      fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[k], work);
      fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[k], work);
    }


    for (k=0;k<max_operators;k++) {
      a_ij = 0;
      //fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,k);
      // sum directly in a_ij and b_ij the two desired terms
      g=0;
      if (LevS->GArray[0].GSq == 0.0) {
        Index = LevS->GArray[g].Index;
        a_ij += (LPsiDatA[0].re*HGcRC[k][Index].re + LPsiDatA[0].im*HGcRC[k][Index].im);
        b_ij += (LPsiDatA[0].re*HGcR2C[k][Index].re + LPsiDatA[0].im*HGcR2C[k][Index].im);
        g++;
      }
      for (; g < LevS->MaxG; g++) {
        Index = LevS->GArray[g].Index;
        a_ij += 2*(LPsiDatA[g].re*HGcRC[k][Index].re + LPsiDatA[g].im*HGcRC[k][Index].im);
        b_ij += 2*(LPsiDatA[g].re*HGcR2C[k][Index].re + LPsiDatA[g].im*HGcR2C[k][Index].im);
      } // due to the symmetry the resulting matrix element is real and symmetric in (i,i) ! (complex multiplication simplifies ...)
      MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
      spread += pow(A_ij,2);
    }
  }
  MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);

  // store spread in OnePsiElementAdd
  Psi->AddData[i].WannierSpread = B_ij - spread;

  FreeSinCosLookupTable(cos_lookup,sin_lookup);
  
  return (B_ij - spread);
}