Changeset 4f1369


Ignore:
Timestamp:
Apr 23, 2008, 1:45:37 PM (17 years ago)
Author:
Frederik Heber <heber@…>
Children:
77f1ae
Parents:
9bdd86
Message:

Wannier-file split up: ComputeMWLF() into smaller functions

This split up was done in the SVN repository and tested there and is working. It was impossible to piece it up into smaller chunks in order to maintain "bisectability", hence the last wannier.* from the SVN rep. was used. However, there was one bug in ParseWannierFile: There is the define "type" and there was a enum variable also of name type which caused errors on reading perturbed densities in the resulting shielding values. Removing the variable completely fixed this.
Note: With regards to the shielding values of c2h4 at 24Ht the resulting values are still absolutely the same as with revision initial_commit

Location:
pcp/src
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • pcp/src/wannier.c

    r9bdd86 r4f1369  
    1212
    1313  File: wannier.c
    14   $Id: wannier.c,v 1.63 2007-02-13 14:15:29 foo Exp $
     14  $Id: wannier.c,v 1.7 2007-10-12 15:50:38 heber Exp $
    1515*/
    1616
     
    3030#include "density.h"
    3131#include "errors.h"
     32#include "gramsch.h"
    3233#include "helpers.h"
    3334#include "init.h"
     
    3536#include "mymath.h"
    3637#include "output.h"
     38#include "perturbed.h"
    3739#include "wannier.h"
    3840
    3941
    4042#define max_operators NDIM*2 //!< number of chosen self-adjoint operators when evaluating the spread
     43#define type Occupied
    4144
    4245
     
    5760 * \note taken from [Golub, Matrix computations, 1989, p451]
    5861 */
    59 void music(int *top, int *bot, int m)
     62void MerryGoRoundIndices(int *top, int *bot, int m)
    6063{
    6164  int *old_top, *old_bot;
     
    9699  fprintf(stderr,"\n");*/
    97100  // and finito
    98   Free(old_top, "bla");
    99   Free(old_bot, "bla");
     101  Free(old_top, "MerryGoRoundIndices: old_top");
     102  Free(old_bot, "MerryGoRoundIndices: old_bot");
    100103}
    101104
     
    114117 * \param tagR1 MPI tag for receiving right column
    115118 */
    116 void shiftcolumns(MPI_Comm comm, double **Aloc, int Num, int max_rounds, int k, int tagS0, int tagS1, int tagR0, int tagR1) {
     119void MerryGoRoundColumns(MPI_Comm comm, double **Aloc, int Num, int max_rounds, int k, int tagS0, int tagS1, int tagR0, int tagR1) {
    117120  //double *A_locS1, *A_locS2;  // local columns of A[k]
    118121  //double *A_locR1, *A_locR2;  // local columns of A[k]
     
    213216    MPI_Wait(&requestR1, &status);
    214217  //fprintf(stderr,"...done\n");
     218}
     219
     220/** By testing of greatest common divisor of the matrix rows (\a AllocNum) finds suitable parallel cpu group.
     221 * \param *P Problem at hand
     222 * \param AllocNum number of rows in matrix
     223 * \return address of MPI communicator
     224 */
     225#ifdef HAVE_INLINE
     226inline MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
     227#else
     228MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
     229#endif
     230{
     231  MPI_Comm *comm = &P->Par.comm_ST;
     232 
     233  //if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Comparing groups - AllocNum %i --- All %i\t Psi %i\t PsiT %i\n",P->Par.me, AllocNum, P->Par.Max_me_comm_ST, P->Par.Max_me_comm_ST_Psi, P->Par.Max_my_color_comm_ST_Psi);
     234  //if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Jacobi diagonalization is done parallely by ", P->Par.me);
     235  if (AllocNum % (P->Par.Max_me_comm_ST*2) == 0) {  // all parallel
     236    comm = &P->Par.comm_ST;
     237    //if (P->Call.out[ReadOut]) fprintf(stderr,"all\n");           
     238  } else if (P->Par.Max_me_comm_ST_Psi >= P->Par.Max_my_color_comm_ST_Psi) { // always the bigger group comes first 
     239    if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) {  // coefficients parallel
     240      comm = &P->Par.comm_ST_Psi;
     241      //if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");       
     242    } else if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) {  // Psis parallel
     243      comm = &P->Par.comm_ST_PsiT;
     244      //if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");       
     245    }
     246  } else {
     247    if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) {  // Psis parallel
     248      comm = &P->Par.comm_ST_PsiT;
     249      //if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");       
     250    } else if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) {  // coefficients parallel
     251      comm = &P->Par.comm_ST_Psi;
     252      //if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");       
     253    }
     254  }
     255  return comm;
     256}
     257
     258/** Allocates and fills Lookup table for sin/cos values at each grid node.
     259 * \param ***cos_table pointer to two-dimensional lookup table for cosine values
     260 * \param ***sin_table pointer to two-dimensional lookup table for sine values
     261 * \param *N array with number of nodes per \a NDIM axis
     262 */
     263void CreateSinCosLookupTable(double ***cos_table, double ***sin_table, int *N)
     264{
     265  int i, j;
     266  double argument;
     267  double **cos_lookup, **sin_lookup;
     268 
     269  // create lookup table for sin/cos values
     270  cos_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *cos_lookup");
     271  sin_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *sin_lookup"); 
     272  for (i=0;i<NDIM;i++) {
     273    // allocate memory
     274    cos_lookup[i] = (double *) Malloc(sizeof(double)*N[i], "ComputeMLWF: cos_lookup");
     275    sin_lookup[i] = (double *) Malloc(sizeof(double)*N[i], "ComputeMLWF: sin_lookup");
     276   
     277    // reset arrays
     278    SetArrayToDouble0(cos_lookup[i],N[i]);
     279    SetArrayToDouble0(sin_lookup[i],N[i]);
     280   
     281    // create lookup values
     282    for (j=0;j<N[i];j++) {
     283      argument = 2*PI/(double)N[i]*(double)j;
     284      cos_lookup[i][j] = cos(argument);
     285      sin_lookup[i][j] = sin(argument);
     286    }
     287  }
     288  *cos_table = cos_lookup;
     289  *sin_table = sin_lookup;
     290}
     291
     292/** Frees memory allocated during CreateSinCosLookupTable().
     293 * \param ***cos_lookup pointer to two-dimensional lookup table for cosine values
     294 * \param ***sin_lookup pointer to two-dimensional lookup table for sine values
     295 */
     296void FreeSinCosLookupTable(double **cos_lookup, double **sin_lookup)
     297{
     298  int i;
     299  // free lookups
     300  for (i=0;i<NDIM;i++) {
     301    Free(cos_lookup[i], "FreeSinCosLookupTable: cos_lookup[i]");
     302    Free(sin_lookup[i], "FreeSinCosLookupTable: sin_lookup[i]");
     303  }
     304  Free(cos_lookup, "FreeSinCosLookupTable: cos_lookup");
     305  Free(sin_lookup, "FreeSinCosLookupTable: sin_lookup");
     306}
     307
     308/** Fills the entries of the six variance matrices.
     309 * These matrices are parallely diagonalized during Wannier Localization. They are calculated from the
     310 * wave function and by diagonalization one obtains the unitary transformation with which the Psis are
     311 * treated afterwards.
     312 * \param *P Problem at hand
     313 * \param AllocNum number of rows/columns
     314 * \param **A pointer to variance matrices
     315 * \sa ComputeMLWF() - master function.
     316 */
     317void FillHigherOrderRealMomentsMatrices(struct Problem *P, int AllocNum, gsl_matrix **A)
     318{
     319  struct Lattice *Lat = &P->Lat;
     320  struct RunStruct *R = &P->R;
     321  struct Psis *Psi = &Lat->Psi;
     322  struct LatticeLevel *Lev0 = R->Lev0;
     323  struct LatticeLevel *LevS = R->LevS;
     324  struct Density *Dens0 = Lev0->Dens;
     325  struct OnePsiElement *OnePsiA, *OnePsiB, *LOnePsiB;
     326  struct fft_plan_3d *plan = Lat->plan;
     327  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
     328  fftw_real *PsiCR = (fftw_real *)PsiC;
     329  fftw_complex *work = Dens0->DensityCArray[Temp2Density];
     330  fftw_real **HGcR = &Dens0->DensityArray[HGDensity];  // use HGDensity, 4x Gap..Density, TempDensity as a storage array
     331  fftw_complex **HGcRC = (fftw_complex**)HGcR;
     332  fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity];  // use HGcDensity, 4x Gap..Density, TempDensity as an array
     333  fftw_real **HGcR2 = (fftw_real**)HGcR2C;
     334  int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
     335  MPI_Status status;
     336  fftw_complex *LPsiDatA=NULL, *LPsiDatB=NULL;
     337  int n[NDIM],n0,i0,iS, Index;
     338  int Num = Psi->NoOfPsis;  // is number of occupied plus unoccupied states for rows
     339  int N0 = LevS->Plan0.plan->local_nx;
     340  int *N = LevS->Plan0.plan->N;
     341  const int NUpx = LevS->NUp[0];
     342  const int NUpy = LevS->NUp[1];
     343  const int NUpz = LevS->NUp[2];
     344  double argument, PsiAtNode;
     345  int e,g,i,j,k,l,m,p,u;
     346  double a_ij = 0, b_ij = 0, A_ij = 0, B_ij = 0;
     347  double **cos_lookup =  NULL,**sin_lookup = NULL;
     348
     349  if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 2\n",P->Par.me);
     350
     351  debug(P, "Creating Lookup Table");
     352  CreateSinCosLookupTable(&cos_lookup, &sin_lookup, N);
     353
     354  debug(P, "Calculating each entry of variance matrices");
     355  l=-1; // to access U matrix element (0..Num-1)
     356  // fill the matrices
     357  for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
     358    OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
     359    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
     360      l++;  // increase l if it is non-extra wave function
     361      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
     362        LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
     363      else
     364        LPsiDatA = NULL;  // otherwise processes won't enter second loop, though they're supposed to send coefficients!
     365     
     366      //fprintf(stderr,"(%i),(%i,%i): fft'd, A[..] and B, back-fft'd acting on \\phi_A\n",P->Par.me,l,0);
     367      if (LPsiDatA != NULL) {
     368        CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
     369        // note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
     370        for (n0=0;n0<N0;n0++) 
     371          for (n[1]=0;n[1]<N[1];n[1]++)
     372            for (n[2]=0;n[2]<N[2];n[2]++) {
     373              i0 = n[2]*NUpz+N[2]*NUpz*(n[1]*NUpy+N[1]*NUpy*n0*NUpx);
     374              iS = n[2]+N[2]*(n[1]+N[1]*n0);
     375              n[0] = n0 + LevS->Plan0.plan->start_nx;
     376              for (k=0;k<max_operators;k+=2) {
     377                e = k/2;
     378                argument = 2.*PI/(double)(N[e])*(double)(n[e]);
     379                PsiAtNode = PsiCR[i0] /LevS->MaxN;
     380                // check lookup
     381                if (!l) // perform check on first wave function only
     382                  if ((fabs(cos(argument) - cos_lookup[e][n[e]]) > MYEPSILON) || (fabs(sin(argument) - sin_lookup[e][n[e]]) > MYEPSILON)) {
     383                    Error(SomeError, "Lookup table does not match real value!");
     384                  }
     385                HGcR[k][iS] = cos_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
     386                HGcR2[k][iS] = cos_lookup[e][n[e]] * HGcR[k][iS]; /* Matrix Vector Mult */
     387                HGcR[k+1][iS] = sin_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
     388                HGcR2[k+1][iS] = sin_lookup[e][n[e]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
     389              }
     390            }
     391        for (u=0;u<max_operators;u++) {
     392          fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[u], work);
     393          fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[u], work);
     394        }
     395      } 
     396      m = -1; // to access U matrix element (0..Num-1)
     397      for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
     398        OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
     399        if (OnePsiB->PsiType == type) {   // drop all but occupied ones
     400          m++;  // increase m if it is non-extra wave function
     401          if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
     402             LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
     403          else
     404             LOnePsiB = NULL;
     405          if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
     406            RecvSource = OnePsiB->my_color_comm_ST_Psi;
     407            MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
     408            LPsiDatB=LevS->LPsi->TempPsi;
     409          } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
     410            for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
     411              if (p != OnePsiB->my_color_comm_ST_Psi)
     412                MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
     413            LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
     414          } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
     415
     416          for (u=0;u<max_operators;u++) {
     417            a_ij = 0;
     418            b_ij = 0;
     419            if (LPsiDatA != NULL) { // calculate, reduce and send to all ...
     420              //fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,u);
     421              g=0;
     422              if (LevS->GArray[0].GSq == 0.0) {
     423                Index = LevS->GArray[g].Index;
     424                a_ij = (LPsiDatB[0].re*HGcRC[u][Index].re + LPsiDatB[0].im*HGcRC[u][Index].im);
     425                b_ij = (LPsiDatB[0].re*HGcR2C[u][Index].re + LPsiDatB[0].im*HGcR2C[u][Index].im);
     426                g++;
     427              }
     428              for (; g < LevS->MaxG; g++) {
     429                Index = LevS->GArray[g].Index;
     430                a_ij += 2*(LPsiDatB[g].re*HGcRC[u][Index].re + LPsiDatB[g].im*HGcRC[u][Index].im);
     431                b_ij += 2*(LPsiDatB[g].re*HGcR2C[u][Index].re + LPsiDatB[g].im*HGcR2C[u][Index].im);
     432              } // due to the symmetry the resulting matrix element is real and symmetric in (i,j) ! (complex multiplication simplifies ...)
     433              // sum up elements from all coefficients sharing processes
     434              MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
     435              MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
     436              a_ij = A_ij;
     437              b_ij = B_ij;
     438              // send element to all Psi-sharing who don't have l local (MPI_Send is a lot slower than AllReduce!)
     439              MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
     440              MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
     441            } else { // receive ...
     442              MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
     443              MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);             
     444            }
     445            // ... and store
     446            //fprintf(stderr,"(%i),(%i,%i): A[%i]: setting component (local: %lg, total: %lg)\n",P->Par.me, l,m,u,a_ij,A_ij);
     447            //fprintf(stderr,"(%i),(%i,%i): B: adding upon component (local: %lg, total: %lg)\n",P->Par.me, l,m,b_ij,B_ij);
     448            gsl_matrix_set(A[u], l, m, A_ij);
     449            gsl_matrix_set(A[max_operators], l, m, B_ij + gsl_matrix_get(A[max_operators],l,m));
     450          }
     451        }
     452      }
     453    }
     454  }
     455  // reset extra entries
     456  for (u=0;u<=max_operators;u++) {
     457    for (i=Num;i<AllocNum;i++)
     458      for (j=0;j<AllocNum;j++)
     459        gsl_matrix_set(A[u], i,j, 0.);
     460    for (i=Num;i<AllocNum;i++)
     461      for (j=0;j<AllocNum;j++)
     462        gsl_matrix_set(A[u], j,i, 0.);
     463  }
     464  FreeSinCosLookupTable(cos_lookup, sin_lookup);
     465
     466/*// print A matrices for debug
     467  if (P->Par.me == 0)
     468    for (u=0;u<max_operators+1;u++) {
     469     fprintf(stderr, "A[%i] = \n",u);
     470      for (i=0;i<Num;i++) {
     471        for (j=0;j<Num;j++)
     472          fprintf(stderr, "%e\t",gsl_matrix_get(A[u],i,j));
     473        fprintf(stderr, "\n");
     474      }
     475    } 
     476*/
     477}
     478
     479/** Calculates reciprocal second order moments of each PsiType#Occupied orbital.
     480 * First order is zero due to wave function being real (symmetry condition).
     481 * \param *P Problem at hand
     482 */
     483void CalculateSecondOrderReciprocalMoment(struct Problem *P)
     484{
     485  struct Lattice *Lat = &P->Lat;
     486  struct RunStruct *R = &P->R;
     487  struct FileData *F = &P->Files;
     488  struct Psis *Psi = &Lat->Psi;
     489  struct LatticeLevel *LevS = R->LevS;
     490  double result, Result;
     491  fftw_complex *LPsiDatA=NULL;
     492  struct OnePsiElement *OnePsiA;
     493  int i,j,l,g;
     494  char spin[12], suffix[18];
     495
     496  switch (Lat->Psi.PsiST) {
     497  case SpinDouble:
     498    strcpy(suffix,".recispread.csv");   
     499    strcpy(spin,"SpinDouble");
     500    break;
     501  case SpinUp:
     502    strcpy(suffix,".recispread_up.csv");   
     503    strcpy(spin,"SpinUp");
     504    break;
     505  case SpinDown:
     506    strcpy(suffix,".recispread_down.csv");   
     507    strcpy(spin,"SpinDown");
     508    break;
     509  }
     510  if(P->Par.me_comm_ST == 0) {
     511    if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Calculating reciprocal moments ...\n",P->Par.me);
     512    if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level
     513      OpenFile(P, &F->ReciSpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
     514    else if (F->ReciSpreadFile == NULL) // re-op,18en if not first level and not opened yet (or closed from ParseWannierFile)
     515      OpenFile(P, &F->ReciSpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
     516    if (F->ReciSpreadFile == NULL) {
     517      Error(SomeError,"ComputeMLWF: Error opening Reciprocal spread File!\n");
     518    } else {   
     519      fprintf(F->ReciSpreadFile,"===== Reciprocal Spreads of type %s ==========================================================================\n", spin);
     520    }
     521  }
     522
     523  // integrate second order moment
     524  for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
     525    OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
     526    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
     527      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
     528        LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
     529      else
     530        LPsiDatA = NULL;
     531     
     532      if (LPsiDatA != NULL) {
     533        if (P->Par.me_comm_ST == 0)
     534          fprintf(F->ReciSpreadFile,"Psi%d_Lev%d\t", Psi->AllPsiStatus[l].MyGlobalNo, R->LevSNo);
     535        for (i=0;i<NDIM;i++) {
     536          for (j=0;j<NDIM;j++) {
     537            result = 0.;
     538            g = 0;
     539            if (LevS->GArray[0].GSq == 0.0) {
     540              result += LevS->GArray[g].G[i]*LevS->GArray[g].G[j]*(LPsiDatA[0].re * LPsiDatA[0].re);
     541              g++;
     542            }
     543            for (;g<LevS->MaxG;g++)
     544              result += LevS->GArray[g].G[i]*LevS->GArray[g].G[j]*2.*(LPsiDatA[g].re * LPsiDatA[g].re + LPsiDatA[g].im * LPsiDatA[g].im);
     545            //result *= Lat->Volume/LevS->MaxG;
     546            MPI_Allreduce ( &result, &Result, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
     547            if (P->Par.me_comm_ST == 0)
     548              fprintf(F->ReciSpreadFile,"%2.5lg  ", Result);
     549          }
     550        }
     551        if (P->Par.me_comm_ST == 0)
     552          fprintf(F->ReciSpreadFile,"\n");
     553      }
     554    }
     555  }
     556  if(P->Par.me_comm_ST == 0) {
     557    fprintf(F->ReciSpreadFile,"====================================================================================================================\n\n");
     558    fflush(F->ReciSpreadFile);
     559  }
     560}
     561
     562/** Given the unitary matrix the transformation is performed on the Psis.
     563 * \param *P Problem at hand
     564 * \param *U gsl matrix containing the transformation matrix
     565 * \param Num dimension parameter for the matrix, i.e. number of wave functions
     566 */
     567void UnitaryTransformationOnWavefunctions(struct Problem *P, gsl_matrix *U, int Num)
     568{
     569  struct Lattice *Lat = &P->Lat;
     570  struct RunStruct *R = &P->R;
     571  struct Psis *Psi = &Lat->Psi;
     572  struct LatticeLevel *LevS = R->LevS;
     573  MPI_Status status;
     574  struct OnePsiElement *OnePsiB, *OnePsiA, *LOnePsiB;
     575  int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
     576  fftw_complex *LPsiDatA=NULL, *LPsiDatB=NULL;
     577  int g,i,j,l,k,m,p;
     578
     579  //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation of all wave functions according to U\n",P->Par.me);
     580 
     581  Num = Psi->TypeStartIndex[type+1] - Psi->TypeStartIndex[type]; // recalc Num as we can only work with local Psis from here
     582  fftw_complex **coeffs_buffer = Malloc(sizeof(fftw_complex *)*Num, "ComputeMLWF: **coeffs_buffer");
     583
     584  for (l=0;l<Num;l++) // allocate for each local wave function
     585    coeffs_buffer[l] = LevS->LPsi->OldLocalPsi[l];
     586 
     587  //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation ...\n",P->Par.me);
     588  l=-1; // to access U matrix element (0..Num-1)
     589  k=-1; // to access the above swap coeffs_buffer (0..LocalNo-1)
     590  for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
     591    OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
     592    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
     593      l++;  // increase l if it is occupied wave function
     594      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // local?
     595        k++; // increase k only if it is a local, non-extra orbital wave function
     596        LPsiDatA = (fftw_complex *) coeffs_buffer[k];    // new coeffs first go to copy buffer, old ones must not be overwritten yet
     597        SetArrayToDouble0((double *)LPsiDatA, 2*LevS->MaxG);  // zero buffer part
     598      } else
     599        LPsiDatA = NULL;  // otherwise processes won't enter second loop, though they're supposed to send coefficients!
     600
     601      m = -1; // to access U matrix element (0..Num-1)
     602      for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
     603        OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
     604        if (OnePsiB->PsiType == type) {   // drop all but occupied ones
     605          m++;  // increase m if it is occupied wave function
     606          if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
     607             LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
     608          else
     609             LOnePsiB = NULL;
     610          if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
     611            RecvSource = OnePsiB->my_color_comm_ST_Psi;
     612            MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
     613            LPsiDatB=LevS->LPsi->TempPsi;
     614          } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
     615            for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
     616              if (p != OnePsiB->my_color_comm_ST_Psi)
     617                MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
     618            LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
     619          } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
     620
     621          if (LPsiDatA != NULL) {
     622            double tmp = gsl_matrix_get(U,l,m);
     623            g=0;
     624            if (LevS->GArray[0].GSq == 0.0) {
     625              LPsiDatA[g].re += LPsiDatB[g].re * tmp;
     626              LPsiDatA[g].im += LPsiDatB[g].im * tmp;
     627              g++;
     628            }
     629            for (; g < LevS->MaxG; g++) {
     630              LPsiDatA[g].re += LPsiDatB[g].re * tmp;
     631              LPsiDatA[g].im += LPsiDatB[g].im * tmp;
     632            }
     633          }
     634        }
     635      }
     636    }
     637  }
     638
     639  //if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) STEP 6: Swapping buffer mem\n",P->Par.me);
     640  // now, as all wave functions are updated, swap the buffer
     641  l = -1;
     642  for (k=0;k<Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT;k++) { // go through each local occupied wave function
     643    if (Psi->AllPsiStatus[k].PsiType == type && Psi->AllPsiStatus[k].my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {
     644      l++;
     645      //if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) (k:%i,l:%i) LocalNo = (%i,%i)\t AllPsiNo = (%i,%i)\n", P->Par.me, k,l,Psi->LocalPsiStatus[l].MyLocalNo, Psi->LocalPsiStatus[l].MyGlobalNo, Psi->AllPsiStatus[k].MyLocalNo, Psi->AllPsiStatus[k].MyGlobalNo);
     646      LPsiDatA = (fftw_complex *)coeffs_buffer[l];
     647      LPsiDatB = LevS->LPsi->LocalPsi[l];
     648      for (g=0;g<LevS->MaxG;g++) {
     649        LPsiDatB[g].re = LPsiDatA[g].re;
     650        LPsiDatB[g].im = LPsiDatA[g].im;
     651      }
     652      // recalculating non-local form factors which are coefficient dependent!
     653      CalculateNonLocalEnergyNoRT(P, Psi->LocalPsiStatus[l].MyLocalNo);
     654    }
     655  }
     656  // and free allocated buffer memory
     657  Free(coeffs_buffer, "UnitaryTransformationOnWavefunctions: coeffs_buffer");
     658}
     659
     660/** Changes Wannier Centres according to RunStruct#CommonWannier.
     661 * \param *P Problem at hand.
     662 * \param Num number of Psis
     663 * \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
     664 * \param *WannierSpread array with wannier spread per wave function
     665 */
     666void ChangeWannierCentres(struct Problem *P, int Num, double **WannierCentre, double *WannierSpread)
     667{
     668  struct RunStruct *R = &P->R;
     669  struct Lattice *Lat = &P->Lat;
     670  struct LatticeLevel *LevS = R->LevS;
     671  int *marker, **group;
     672  int partner[Num];
     673  int i,j,l,k;
     674  int totalflag, flag;
     675  double q[NDIM], center[NDIM];
     676  double Spread;
     677  int *N = LevS->Plan0.plan->N;
     678 
     679  switch (R->CommonWannier) {
     680   case 4:
     681    debug(P,"Shifting each Wannier centers to cell center");
     682    for (j=0;j<NDIM;j++)  // center point in [0,1]^3
     683      center[j] = 0.5;
     684    RMat33Vec3(q,Lat->RealBasis, center); // transform to real coordinates
     685    for (i=0; i < Num; i++) {  // go through all occupied wave functions
     686      for (j=0;j<NDIM;j++)  // put into Wannier centres
     687        WannierCentre[i][j] = q[j];
     688    }   
     689    break;
     690   case 3:
     691    debug(P,"Shifting Wannier centers individually to nearest grid point");
     692    for (i=0;i < Num; i++) {  // go through all wave functions
     693      RMat33Vec3(q, Lat->ReciBasis, WannierCentre[i]);
     694      for (j=0;j<NDIM;j++) {  // Recibasis is not true inverse but times 2.*PI
     695        q[j] *= (double)N[j]/(2.*PI);
     696       
     697        //fprintf(stderr,"(%i) N[%i]: %i\t tmp %e\t floor %e\t ceil %e\n",P->Par.me, j, N[j], tmp, floor(tmp), ceil(tmp));
     698        if (fabs((double)floor(q[j]) - q[j]) < fabs((double)ceil(q[j]) - q[j]))
     699          q[j] = floor(q[j])/(double)N[j];
     700        else
     701          q[j] = ceil(q[j])/(double)N[j];
     702      }
     703      RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
     704    }
     705    break;
     706   case 2:
     707    debug(P,"Combining individual orbitals according to spread.");
     708    //fprintf(stderr,"(%i) Finding multiple bindings and Reweighting Wannier centres\n",P->Par.me);
     709    debug(P,"finding partners");
     710    marker = (int*) Malloc(sizeof(int)*(Num+1),"ComputeMLWF: marker");
     711    group = (int**) Malloc(sizeof(int *)*Num,"ComputeMLWF: group");
     712    for (l=0;l<Num;l++) {
     713      group[l] = (int*) Malloc(sizeof(int)*(Num+1),"ComputeMLWF: group[l]");  // each must group must have one more as end marker
     714      for (k=0;k<=Num;k++)
     715        group[l][k] = -1; // reset partner group
     716    }
     717    for (k=0;k<Num;k++)
     718      partner[k] = 0;
     719    debug(P,"mem allocated");
     720    // go for each orbital through every other, check distance against the sum of both spreads
     721    // if smaller add to group of this orbital
     722    for (l=0;l<Num;l++) {
     723      j=0;  // index for partner group
     724      for (k=0;k<Num;k++) {  // check this against l
     725        Spread = 0.;
     726        for (i=0;i<NDIM;i++) {
     727          //fprintf(stderr,"(%i) Spread += (%e - %e)^2 \n", P->Par.me, WannierCentre[l][i], WannierCentre[k][i]);
     728          Spread += (WannierCentre[l][i] - WannierCentre[k][i])*(WannierCentre[l][i] - WannierCentre[k][i]);
     729        }
     730        Spread = sqrt(Spread);  // distance in Spread
     731        //fprintf(stderr,"(%i) %i to %i: distance %e, SpreadSum = %e + %e = %e \n", P->Par.me, l, k, Spread, WannierSpread[l], WannierSpread[k], WannierSpread[l]+WannierSpread[k]);
     732        if (Spread < 1.5*(WannierSpread[l]+WannierSpread[k])) {// if distance smaller than sum of spread
     733          group[l][j++] = k;  // add k to group of l
     734          partner[l]++;
     735          //fprintf(stderr,"(%i) %i added as %i-th member to %i's group.\n", P->Par.me, k, j, l);
     736        }
     737      }
     738    }
     739   
     740    // consistency, for each orbital check if this orbital is also in the group of each referred orbital
     741    debug(P,"checking consistency");
     742    totalflag = 1;
     743    for (l=0;l<Num;l++) // checking l's group
     744      for (k=0;k<Num;k++) { // k is partner index
     745        if (group[l][k] != -1) {  // if current index k is a partner
     746          flag = 0;
     747          for(j=0;j<Num;j++) {  // go through each entry in l partner's partner group if l exists
     748            if ((group[ group[l][k] ][j] == l))
     749              flag = 1;
     750          }
     751          //if (flag == 0) fprintf(stderr, "(%i) in %i's group %i is referred as a partner, but not the other way round!\n", P->Par.me, l, group[l][k]);
     752          if (totalflag == 1) totalflag = flag;
     753        }
     754      }
     755    // for each orbital group (marker group) weight each center to a total and put this into the local WannierCentres
     756    debug(P,"weight and calculate new centers for partner groups");
     757    for (l=0;l<=Num;l++)
     758      marker[l] = 1;
     759    if (totalflag) {
     760      for (l=0;l<Num;l++) { // go through each orbital
     761        if (marker[l] != 0) { // if it hasn't been reweighted
     762          marker[l] = 0;
     763          for (i=0;i<NDIM;i++)
     764            q[i] = 0.;
     765          j = 0;
     766          while (group[l][j] != -1) {
     767            marker[group[l][j]] = 0;
     768            for (i=0;i<NDIM;i++) {
     769              //fprintf(stderr,"(%i) Adding to %i's group, %i entry of %i: %e\n", P->Par.me, l, i, group[l][j], WannierCentre[ group[l][j] ][i]);
     770              q[i] += WannierCentre[ group[l][j] ][i];
     771            }
     772            j++;
     773          }
     774          //fprintf(stderr,"(%i) %i's group: (%e,%e,%e)/%i = (%e,%e,%e)\n", P->Par.me, l, q[0], q[1], q[2], j, q[0]/(double)j, q[1]/(double)j, q[2]/(double)j);
     775          for (i=0;i<NDIM;i++) {// weight by number of elements in partner group
     776            q[i] /= (double)(j);
     777          }
     778 
     779          // put WannierCentre into own and all partners'     
     780          for (i=0;i<NDIM;i++)
     781            WannierCentre[l][i] = q[i];
     782          j = 0; 
     783          while (group[l][j] != -1) {
     784            for (i=0;i<NDIM;i++)
     785              WannierCentre[group[l][j]][i] = q[i];
     786            j++;
     787          }
     788        }
     789      }
     790    }
     791    if (P->Call.out[StepLeaderOut]) {
     792      fprintf(stderr,"Summary:\n");
     793      fprintf(stderr,"========\n");
     794      for (i=0;i<Num;i++)
     795        fprintf(stderr,"%i belongs to a %i-ple binding.\n",i,partner[i]);
     796    }
     797    debug(P,"done");
     798 
     799    Free(marker, "ChangeWannierCentres: marker");
     800    for (l=0;l<Num;l++)
     801      Free(group[l], "ChangeWannierCentres: group[l]");
     802    Free(group, "ChangeWannierCentres: group");
     803    break;
     804   case 1:
     805    debug(P,"Individual orbitals are changed to center of all.");
     806    for (i=0;i<NDIM;i++)  // zero center of weight
     807      q[i] = 0.;
     808    for (k=0;k<Num;k++)
     809      for (i=0;i<NDIM;i++) {  // sum up all orbitals each component
     810        q[i] += WannierCentre[k][i];
     811      }
     812    for (i=0;i<NDIM;i++)  // divide by number
     813      q[i] /= Num;
     814    for (k=0;k<Num;k++)
     815      for (i=0;i<NDIM;i++) {  // put into this function's array
     816        WannierCentre[k][i] = q[i];
     817      }
     818    break;
     819   case 0:
     820   default:
     821   break;
     822  }
     823}
     824
     825/** From the entries of the variance matrices the spread is calculated.
     826 * WannierCentres are evaluated according to Resta operator: \f$\langle X \rangle = \frac{L}{2\pi} \sum_j {\cal I} \ln{\langle \Psi_j | \exp{(i \frac{2\pi}{L} X)} | \Psi_j \rangle}\f$
     827 * WannierSpread is: \f$ \sum_j \langle \Psi_j | r^2 | \Psi_j \rangle - \langle \Psi_j | r | psi_j \rangle^2\f$
     828 * \param *P Problem at hand
     829 * \param *A variance matrices
     830 * \param old_spread first term of wannier spread
     831 * \param spread second term of wannier spread
     832 * \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
     833 * \param *WannierSpread array with wannier spread per wave function
     834 */
     835void ComputeWannierCentresfromVarianceMatrices(struct Problem *P, gsl_matrix **A, double *spread, double *old_spread, double **WannierCentre, double *WannierSpread)
     836{
     837  struct Lattice *Lat = &P->Lat;
     838  struct Psis *Psi = &Lat->Psi;
     839  struct OnePsiElement *OnePsiA;
     840  int i,j,k,l;
     841  double tmp, q[NDIM];
     842 
     843  *old_spread = 0;
     844  *spread = 0;
     845
     846  // the spread for x,y,z resides in the respective diagonal element of A_.. for each orbital
     847  i=-1;
     848  for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
     849    OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
     850    if (OnePsiA->PsiType == type) {   // drop all but occupied ones
     851      i++;  // increase l if it is occupied wave function
     852      //fprintf(stderr,"(%i) Wannier for %i\n", P->Par.me, i);
     853     
     854      // calculate Wannier Centre
     855      for (j=0;j<NDIM;j++) {
     856        q[j] = 1./(2.*PI) * GSL_IMAG( gsl_complex_log( gsl_complex_rect(gsl_matrix_get(A[j*2],i,i),gsl_matrix_get(A[j*2+1],i,i))));
     857        if (q[j] < 0) // change wrap around of above operator to smooth 0...Lat->RealBasisSQ
     858          q[j] += 1.;
     859      }
     860      RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
     861     
     862      // store orbital spread and centre in file
     863      tmp = - pow(gsl_matrix_get(A[0],i,i),2) - pow(gsl_matrix_get(A[1],i,i),2)
     864            - pow(gsl_matrix_get(A[2],i,i),2) - pow(gsl_matrix_get(A[3],i,i),2)
     865            - pow(gsl_matrix_get(A[4],i,i),2) - pow(gsl_matrix_get(A[5],i,i),2);         
     866      WannierSpread[i] = gsl_matrix_get(A[max_operators],i,i) + tmp;
     867      //fprintf(stderr,"(%i) WannierSpread[%i] = %e\n", P->Par.me, i, WannierSpread[i]);
     868      //if (P->Par.me == 0) fprintf(F->SpreadFile,"Orbital %d:\t Wannier center (x,y,z)=(%lg,%lg,%lg)\t Spread sigma^2 = %lg - %lg = %lg\n",
     869        //Psi->AllPsiStatus[i].MyGlobalNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], gsl_matrix_get(A[max_operators],i,i), -tmp, WannierSpread[i]);
     870      //if (P->Par.me == 0) fprintf(F->SpreadFile,"%e\t%e\t%e\n",
     871        //WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2]);
     872     
     873      // gather all spreads
     874      *old_spread += gsl_matrix_get(A[max_operators],i,i); // tr(U^H B U)
     875      for (k=0;k<max_operators;k++)
     876        *spread += pow(gsl_matrix_get(A[k],i,i),2);
     877    }
     878  }
     879}
     880
     881/** Prints gsl_matrix nicely to screen.
     882 * \param *P Problem at hand
     883 * \param *U gsl matrix
     884 * \param Num number of rows/columns
     885 * \param *msg name of matrix, prepended before entries
     886 */
     887void PrintGSLMatrix(struct Problem *P, gsl_matrix *U, int Num, const char *msg)
     888{
     889  int k,l;
     890  fprintf(stderr,"(%i) %s = \n",P->Par.me, msg);
     891  for (k=0;k<Num;k++) {   
     892    for (l=0;l<Num;l++)
     893      fprintf(stderr,"%e\t",gsl_matrix_get(U,l,k));
     894    fprintf(stderr,"\n");
     895  }
     896}
     897
     898/** Allocates memory for the  DiagonalizationData structure
     899 * \param *P Problem at hand
     900 * \param DiagData pointer to structure
     901 * \param Num number of rows and columns in matrix
     902 * \param NumMatrices number of matrices to be simultaneously diagonalized
     903 * \param extra number of extra matrices to be passively also diagonalized
     904 */
     905void InitDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData, int Num, int NumMatrices, int extra)
     906{
     907  int i;
     908 
     909  // store integer values
     910  //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 1\n",P->Par.me);
     911  DiagData->Num = Num;
     912  DiagData->AllocNum = ceil((double)Num / 2. ) *2;
     913  DiagData->NumMatrices = NumMatrices;
     914  DiagData->extra = extra;
     915 
     916  // determine communicator and our rank and its size
     917  DiagData->comm = DetermineParallelGroupbyGCD(P,DiagData->AllocNum);
     918  MPI_Comm_size (*(DiagData->comm), &(DiagData->ProcNum));
     919  MPI_Comm_rank (*(DiagData->comm), &(DiagData->ProcRank));
     920 
     921  // allocate memory
     922  DiagData->U = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
     923  gsl_matrix_set_identity(DiagData->U);
     924 
     925  DiagData->A = (gsl_matrix **) Malloc((DiagData->NumMatrices+DiagData->extra) * sizeof(gsl_matrix *), "InitDiagonalization: *A");
     926  for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++) {
     927    DiagData->A[i] = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
     928    gsl_matrix_set_zero(DiagData->A[i]);
     929  }
     930
     931  // init merry-go-round array for diagonilzation
     932  DiagData->top = (int *) Malloc(sizeof(int)*DiagData->AllocNum/2, "InitDiagonalization: top");
     933  DiagData->bot = (int *) Malloc(sizeof(int)*DiagData->AllocNum/2, "InitDiagonalization: bot");
     934  for (i=0;i<DiagData->AllocNum/2;i++) {
     935    DiagData->top[i] = 2*i;
     936    DiagData->bot[i] = 2*i+1;     
     937  }
     938/*    // print starting values of index generation tables top and bot
     939      fprintf(stderr,"top\t");
     940      for (k=0;k<AllocNum/2;k++)
     941        fprintf(stderr,"%i\t", top[k]);
     942      fprintf(stderr,"\n");
     943      fprintf(stderr,"bot\t");
     944      for (k=0;k<AllocNum/2;k++)
     945        fprintf(stderr,"%i\t", bot[k]);
     946      fprintf(stderr,"\n");*/
     947}
     948
     949/** Frees allocated memory for the DiagonalizationData structure.
     950 * \param DiagData pointer to structure
     951 */
     952void FreeDiagonalization(struct DiagonalizationData *DiagData)
     953{
     954  int i;
     955 
     956  Free(DiagData->top, "FreeDiagonalization: DiagData->top");
     957  Free(DiagData->bot, "FreeDiagonalization: DiagData->bot");
     958  gsl_matrix_free(DiagData->U);
     959  for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++)
     960    gsl_matrix_free(DiagData->A[i]);   
     961  Free(DiagData->A, "FreeDiagonalization: DiagData->A");
     962}
     963
     964/** Orthogonalizing PsiType#Occupied wave functions for MD.
     965 * Orthogonalizes wave functions by finding a unitary transformation such that the density remains unchanged.
     966 * To this end, the overlap matrix \f$\langle \Psi_i | \Psi_j \rangle\f$ is created and diagonalised by Jacobi
     967 * transformation (product of rotation matrices). The found transformation matrix is applied to all Psis.
     968 * \param *P Problem at hand
     969 * \note [Payne] states that GramSchmidt is not well-suited. However, this Jacobi diagonalisation is not well
     970 * suited for our CG minimisation either. If one wave functions is changed in course of the minimsation
     971 * procedure, in a Gram-Schmidt-Orthogonalisation only this wave function is changed (although it remains under
     972 * debate whether subsequent Psis are still orthogonal to this changed wave function!), in a Jacobi-Orthogonali-
     973 * stion however at least two if not all wave functions are changed. Thus inhibits our fast way of updating the
     974 * density after a minimisation step -- UpdateDensityCalculation(). Thus, this routine can only be used for a
     975 * final orthogonalisation of all Psis, after the CG minimisation.
     976 */
     977void OrthogonalizePsis(struct Problem *P)
     978{
     979  struct Lattice *Lat = &P->Lat;
     980  struct Psis *Psi = &Lat->Psi;
     981  struct LatticeLevel *LevS = P->R.LevS;
     982  int Num = Psi->NoOfPsis;
     983  int i,j,l;
     984  struct DiagonalizationData DiagData;
     985  double PsiSP;
     986
     987  InitDiagonalization(P, &DiagData, Num, 1, 0);
     988   
     989  // Calculate Overlap matrix
     990  for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
     991    CalculateOverlap(P, l, Occupied); 
     992  for (i=0;i<Num;i++) // fill gsl matrix with overlap values
     993    for (j=0;j<Num;j++)
     994      gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
     995  //if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i|Psi_J> (before)");
     996 
     997  //if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (before)");
     998
     999  // diagonalize overlap matrix
     1000  Diagonalize(P, &DiagData);
     1001
     1002  //if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (after)");
     1003
     1004  // apply found unitary transformation to wave functions
     1005  UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
     1006
     1007  // update GramSch stati
     1008  for (i=Psi->TypeStartIndex[Occupied];i<Psi->TypeStartIndex[UnOccupied];i++) {
     1009    GramSchNormalize(P,LevS,LevS->LPsi->LocalPsi[i], 0.);   // calculates square product if 0. is given...
     1010    PsiSP = GramSchGetNorm2(P,LevS,LevS->LPsi->LocalPsi[i]);
     1011    //if (P->Par.me_comm_ST_Psi == 0) fprintf(stderr,"(%i) PsiSP[%i] = %lg\n", P->Par.me, i, PsiSP);
     1012    Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)IsOrthonormal;
     1013  }
     1014  UpdateGramSchAllPsiStatus(P,Psi);
     1015/*
     1016  for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
     1017    CalculateOverlap(P, l, Occupied); 
     1018  for (i=0;i<Num;i++) // fill gsl matrix with overlap values
     1019    for (j=0;j<Num;j++)
     1020      gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
     1021  if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i|Psi_J> (after)");
     1022  */
     1023  // free memory
     1024  FreeDiagonalization(&DiagData);
     1025}
     1026
     1027/** Orthogonalizing PsiType#Occupied wave functions and their time derivatives for MD.
     1028 * Ensures that two orthonormality condition are met for Psis:
     1029 * \f$\langle \Psi_i | \Psi_j \rangle = \delta_{ij}\f$
     1030 * \f$\langle \dot{\Psi_i} | \dot{\Psi_j} \rangle = \delta_{ij}\f$
     1031 * \param *P Problem at hand
     1032 * \note [Payne] states that GramSchmidt is not well-suited, and this stronger
     1033 *       condition is needed for numerical stability
     1034 * \todo create and fill 1stoverlap with finite difference between Psi, OldPsi with R->Deltat
     1035 */
     1036void StrongOrthogonalizePsis(struct Problem *P)
     1037{
     1038  struct Lattice *Lat = &P->Lat;
     1039  struct Psis *Psi = &Lat->Psi;
     1040  int Num = Psi->NoOfPsis;
     1041  int i,j,l;
     1042  struct DiagonalizationData DiagData;
     1043
     1044  InitDiagonalization(P, &DiagData, Num, 2, 0);
     1045   
     1046  // Calculate Overlap matrix
     1047  for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++) {
     1048    CalculateOverlap(P, l, Occupied); 
     1049    //Calculate1stOverlap(P, l, Occupied);
     1050  } 
     1051  for (i=0;i<Num;i++) // fill gsl matrix with overlap values
     1052    for (j=0;j<Num;j++) {
     1053      gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
     1054      //gsl_matrix_set(DiagData.A[1],i,j,Psi->1stOverlap[i][j]);
     1055    }
     1056     
     1057  // diagonalize overlap matrix
     1058  Diagonalize(P, &DiagData);
     1059 
     1060  // apply found unitary transformation to wave functions
     1061  UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
     1062 
     1063  // free memory
     1064  FreeDiagonalization(&DiagData);
     1065}
     1066
     1067/** Uses either serial or parallel diagonalization.
     1068 * \param *P Problem at hand
     1069 * \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
     1070 * \sa ParallelDiagonalization(), SerialDiagonalization()
     1071 */
     1072void Diagonalize(struct Problem *P, struct DiagonalizationData *DiagData)
     1073{
     1074  if (DiagData->Num != 1) { // one- or multi-process case?
     1075    if (((DiagData->AllocNum % 2) == 0) && (DiagData->ProcNum != 1) && ((DiagData->AllocNum / 2) % DiagData->ProcNum == 0)) {
     1076      /*
     1077      debug(P,"Testing with silly matrix");
     1078      ParallelDiagonalization(P, test, Utest, 1, 0, 4, Num, comm, ProcRank, ProcNum, top, bot);
     1079      if(P->Call.out[ReadOut]) // && P->Par.me == 0)
     1080        PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
     1081      if(P->Call.out[ReadOut]) // && P->Par.me == 0)
     1082        PrintGSLMatrix(P, Utest, 4, "Utest (final)");
     1083        */ 
     1084      debug(P,"ParallelDiagonalization");
     1085      ParallelDiagonalization(P, DiagData);
     1086    } else {/*
     1087      debug(P,"Testing with silly matrix");
     1088      SerialDiagonalization(P, test, Utest, 1, 0, 4, Num, top, bot);
     1089      if(P->Call.out[ReadOut]) // && P->Par.me == 0)
     1090        PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
     1091      if(P->Call.out[ReadOut]) // && P->Par.me == 0)
     1092        PrintGSLMatrix(P, Utest, 4, "Utest (final)");
     1093         */
     1094      debug(P,"SerialDiagonalization");
     1095      SerialDiagonalization(P, DiagData);
     1096    } 
     1097   
     1098    //if(P->Call.out[ReadOut]) // && P->Par.me == 0)
     1099      //PrintGSLMatrix(P, DiagData->U, DiagData->AllocNum, "U");
     1100  }
    2151101}
    2161102
     
    2561142void ComputeMLWF(struct Problem *P) {
    2571143  // variables and allocation
    258   struct FileData *F = &P->Files;
    2591144  struct Lattice *Lat = &P->Lat;
    260   struct RunStruct *R = &P->R;
    2611145  struct Psis *Psi = &Lat->Psi;
    262   struct LatticeLevel *Lev0 = R->Lev0;
    263   struct LatticeLevel *LevS = R->LevS;
    264   struct Density *Dens0 = Lev0->Dens;
    265   struct fft_plan_3d *plan = Lat->plan;
    266   fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
    267   fftw_real *PsiCR = (fftw_real *)PsiC;
    268   fftw_complex *work = Dens0->DensityCArray[Temp2Density];
    269   fftw_real **HGcR = &Dens0->DensityArray[HGDensity];  // use HGDensity, 4x Gap..Density, TempDensity as a storage array
    270   fftw_complex **HGcRC = (fftw_complex**)HGcR;
    271   fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity];  // use HGcDensity, 4x Gap..Density, TempDensity as an array
    272   fftw_real **HGcR2 = (fftw_real**)HGcR2C;
    273   MPI_Status status;
    274   struct OnePsiElement *OnePsiB, *OnePsiA, *LOnePsiB;
    275   int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
    276   fftw_complex *LPsiDatA=NULL, *LPsiDatB=NULL;
    277   int n[NDIM],n0,i0,iS, Index;
    278   int N0;
    279   int N[NDIM];
    280   const int NUpx = LevS->NUp[0];
    281   const int NUpy = LevS->NUp[1];
    282   const int NUpz = LevS->NUp[2];
    283   int e,i,j,k,l,m,u,p,g;
     1146  int i;
    2841147  int Num = Psi->NoOfPsis;  // is number of occupied plus unoccupied states for rows
     1148  double **WannierCentre;
     1149  double *WannierSpread;
     1150  double spread, spreadSQ;
     1151  struct DiagonalizationData DiagData;
     1152
     1153  if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Beginning localization of orbitals ...\n",P->Par.me);
     1154
     1155  InitDiagonalization(P, &DiagData, Num, max_operators, 1);
     1156
     1157  // STEP 2: Calculate A[k]_ij = V/N \sum_{G1,G2} C^\ast_{l,G1} c_{m,G2} \sum_R A^{(k)}(R) exp(iR(G2-G1))
     1158  //debug (P,"Calculatung Variance matrices");
     1159  FillHigherOrderRealMomentsMatrices(P, DiagData.AllocNum, DiagData.A);
     1160
     1161  //debug (P,"Diagonalizing");
     1162  Diagonalize(P, &DiagData);
     1163
     1164  // STEP 6: apply transformation U to all wave functions \sum_i^Num U_ji | \phi_i \rangle = | \phi_j^\ast \rangle
     1165  //debug (P,"Performing Unitary transformation");
     1166  UnitaryTransformationOnWavefunctions(P, DiagData.U, Num);
     1167
     1168  if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 7: Compute centres and spread printout\n",P->Par.me);
     1169  WannierCentre = (double **) Malloc(sizeof(double *)*Num, "ComputeMLWF: *WannierCentre");
     1170  WannierSpread = (double *) Malloc(sizeof(double)*Num, "ComputeMLWF: WannierSpread");
     1171  for (i=0;i<Num;i++)
     1172    WannierCentre[i] = (double *) Malloc(sizeof(double)*NDIM, "ComputeMLWF: WannierCentre");
     1173
     1174  //debug (P,"Computing Wannier centres from variance matrix");
     1175  ComputeWannierCentresfromVarianceMatrices(P, DiagData.A, &spread, &spreadSQ, WannierCentre, WannierSpread);
     1176
     1177  // join Wannier orbital to groups with common centres under certain conditions
     1178  //debug (P,"Changing Wannier Centres according to CommonWannier");
     1179  ChangeWannierCentres(P, Num, WannierCentre, WannierSpread);
     1180 
     1181  // write to file
     1182  //debug (P,"Writing spread file");
     1183  WriteWannierFile(P, spread, spreadSQ, WannierCentre, WannierSpread); 
     1184
     1185  debug(P,"Free'ing memory");
     1186  // free all remaining memory
     1187  FreeDiagonalization(&DiagData);
     1188  for (i=0;i<Num;i++)
     1189    Free(WannierCentre[i], "ComputeMLWF: WannierCentre[i]");
     1190  Free(WannierCentre, "ComputeMLWF: WannierCentre");
     1191  Free(WannierSpread, "ComputeMLWF: WannierSpread");
     1192}
     1193
     1194/** Solves directly for eigenvectors and eigenvalues of a real 2x2 matrix.
     1195 * The eigenvalues are determined by \f$\det{(A-\lambda \cdot I)}\f$, where \f$I\f$ is the unit matrix and
     1196 * \f$\lambda\f$ an eigenvalue. This leads to a pq-formula which is easily evaluated in the first part.
     1197 *
     1198 * The eigenvectors are then obtained by solving \f$A-\lambda \cdot I)x = 0\f$ for a given eigenvalue
     1199 * \f$\lambda\f$. However, the eigenvector magnitudes are not specified, thus we the equation system is
     1200 * still lacking such an equation. We use this fact to set either coordinate arbitrarily to 1 and then
     1201 * derive the other by solving the equation system. Finally, the eigenvector is normalized. 
     1202 * \param *P Problem at hand
     1203 * \param *A matrix whose eigenvalues/-vectors are to be found
     1204 * \param *eval vector with eigenvalues
     1205 * \param *evec matrix with corresponding eigenvectors in columns
     1206 */
     1207#ifdef HAVE_INLINE
     1208inline void EigensolverFor22Matrix(struct Problem *P, gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec)
     1209#else
     1210void EigensolverFor22Matrix(struct Problem *P, gsl_matrix *A, gsl_vector *eval, gsl_matrix *evec)
     1211#endif
     1212{
     1213  double a11,a12,a21,a22;
     1214  double ev[2], summand1, summand2, norm;
     1215  int i;
     1216  // find eigenvalues
     1217  a11 = gsl_matrix_get(A,0,0);
     1218  a12 = gsl_matrix_get(A,0,1);
     1219  a21 = gsl_matrix_get(A,1,0);
     1220  a22 = gsl_matrix_get(A,1,1);
     1221  summand1 = (a11+a22)/2.;
     1222  summand2 = sqrt(summand1*summand1 + a12*a21 - a11*a22);
     1223  ev[0] = summand1 + summand2;
     1224  ev[1] = summand1 - summand2;
     1225  gsl_vector_set(eval, 0, ev[0]);
     1226  gsl_vector_set(eval, 1, ev[1]);
     1227  //fprintf (stderr,"(%i) ev1 %lg \t ev2 %lg\n", P->Par.me, ev1, ev2);
     1228 
     1229  // find eigenvectors
     1230  for(i=0;i<2;i++) {
     1231    if (fabs(ev[i]) < MYEPSILON) {
     1232      gsl_matrix_set(evec, 0,i, 0.);
     1233      gsl_matrix_set(evec, 1,i, 0.);
     1234    } else if (fabs(a22-ev[i]) > MYEPSILON) {
     1235      norm = sqrt(1*1 + (a21*a21/(a22-ev[i])/(a22-ev[i])));
     1236      gsl_matrix_set(evec, 0,i, 1./norm);
     1237      gsl_matrix_set(evec, 1,i, -(a21/(a22-ev[i]))/norm);
     1238      //fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
     1239    } else if (fabs(a12) > MYEPSILON) {
     1240      norm = sqrt(1*1 + (a11-ev[i])*(a11-ev[i])/(a12*a12));
     1241      gsl_matrix_set(evec, 0,i, 1./norm);
     1242      gsl_matrix_set(evec, 1,i, -((a11-ev[i])/a12)/norm);
     1243      //fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
     1244    } else {
     1245      if (fabs(a11-ev[i]) > MYEPSILON) {
     1246        norm = sqrt(1*1 + (a12*a12/(a11-ev[i])/(a11-ev[i])));
     1247        gsl_matrix_set(evec, 0,i, -(a12/(a11-ev[i]))/norm);
     1248        gsl_matrix_set(evec, 1,i, 1./norm);
     1249        //fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
     1250      } else if (fabs(a21) > MYEPSILON) {
     1251        norm = sqrt(1*1 + (a22-ev[i])*(a22-ev[i])/(a21*a21));
     1252        gsl_matrix_set(evec, 0,i, -(a22-ev[i])/a21/norm);
     1253        gsl_matrix_set(evec, 1,i, 1./norm);
     1254        //fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
     1255      } else {
     1256        //gsl_matrix_set(evec, 0,i, 0.);
     1257        //gsl_matrix_set(evec, 1,i, 1.);
     1258        fprintf (stderr,"\t evec %i undetermined\n", i);
     1259      }
     1260      //gsl_matrix_set(evec, 0,0, 1.);
     1261      //gsl_matrix_set(evec, 1,0, 0.);
     1262      //fprintf (stderr,"(%i) evec1 undetermined", P->Par.me);
     1263    }
     1264  }
     1265}
     1266
     1267/** Calculates sine and cosine values for multiple matrix diagonalization.
     1268 * \param *evec eigenvectors in columns
     1269 * \param *eval corresponding eigenvalues
     1270 * \param *c cosine to be returned
     1271 * \param *s sine to be returned
     1272 */
     1273#ifdef HAVE_INLINE
     1274inline void CalculateRotationAnglesFromEigenvalues(gsl_matrix *evec, gsl_vector *eval, double *c, double *s)
     1275#else
     1276void CalculateRotationAnglesFromEigenvalues(gsl_matrix *evec, gsl_vector *eval, double *c, double *s)
     1277#endif
     1278{
     1279  int index;
    2851280  double x,y,r;
    286   double q[NDIM];
    287   double *c,*s;
    288   int index;
    289   double spread = 0., old_spread=0., Spread=0.;
    290   double WannierCentre[Num][NDIM];
    291   double WannierSpread[Num];
    292   double tmp,tmp2;
    293   double a_ij = 0, b_ij = 0, A_ij = 0, B_ij = 0;
    294   double **cos_lookup,**sin_lookup;
    295   gsl_matrix *G;
    296   gsl_vector *h;
    297   gsl_vector *eval;
    298   gsl_matrix *evec;
    299   gsl_eigen_symmv_workspace *w;
    300   int ProcNum, ProcRank, set;
    301   int it_steps; // iteration step counter
    302   int *top, *bot;
    303   int Lsend, Rsend, Lrecv, Rrecv; // where left(right) column is sent to or where it originates from
    304   int left, right; // left or right neighbour for process
    305   double **Aloc[max_operators+1], **Uloc; // local columns for one step of A[k]
    306   double *Around[max_operators+1], *Uround; // all local columns for one round of A[k]
    307   double *Atotal[max_operators+1], *Utotal; // all local columns for one round of A[k]
    308   double a_i, a_j;
     1281 
     1282  index = gsl_vector_max_index (eval);  // get biggest eigenvalue
     1283  //fprintf(stderr,"\t1st: %lg\t2nd: %lg ---  biggest: %i\n", gsl_vector_get(eval, 0), gsl_vector_get(eval, 1), index);
     1284  x = gsl_matrix_get(evec, 0, index);
     1285  y = gsl_matrix_get(evec, 1, index);
     1286  if (x < 0) {  // ensure x>=0 so that rotation angles remain smaller Pi/4
     1287    y = -y;
     1288    x = -x; 
     1289  }
     1290  //z = gsl_matrix_get(evec, 2, index) * x/fabs(x);
     1291  //fprintf(stderr,"\tx %lg\ty %lg\n", x,y);
     1292 
     1293  //fprintf(stderr,"(%i),(%i,%i) STEP 3c\n",P->Par.me,i,j);
     1294  // STEP 3c: calculate R = [[c,s^\ast],[-s,c^\ast]]
     1295  r = sqrt(x*x + y*y); // + z*z);
     1296  if (fabs(r) > MYEPSILON) {
     1297    *c = sqrt((x + r) / (2.*r));
     1298    *s = y / sqrt(2.*r*(x+r)); //, -z / sqrt(2*r*(x+r)));
     1299  } else {
     1300    *c = 1.;
     1301    *s = 0.;
     1302  }
     1303}
     1304
     1305/*
     1306 * \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
     1307 * \param *U transformation matrix set to unity matrix
     1308 * \param NumMatrices number of matrices to be diagonalized simultaneously
     1309 * \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
     1310 * \param AllocNum number of rows/columns in matrices
     1311 * \param Num number of wave functions
     1312 * \param ProcRank index in group for this cpu
     1313 * \param ProcNum number of cpus in group
     1314 * \param *top array with top row indices (merry-go-round)
     1315 * \param *bot array with bottom row indices (merry-go-round)
     1316 */
     1317
     1318/** Simultaneous diagonalization of matrices with multiple cpus.
     1319 * \param *P Problem at hand
     1320 * \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
     1321 * \note this is slower given one cpu only than SerialDiagonalization()
     1322 */
     1323void ParallelDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData)
     1324{
     1325  struct RunStruct *R =&P->R;
    3091326  int tagR0, tagR1, tagS0, tagS1;
    3101327  int iloc, jloc;
     
    3131330  int start;
    3141331  int *rcounts, *rdispls;
    315   int AllocNum = ceil((double)Num / 2. ) *2;
    316   int totalflag, flag;
    317   int *marker, **group;
    318   int partner[Num];
    319   int type = Occupied;
    320   MPI_Comm *comm;
    321   char spin[12], suffix[18];
    322  
    323   N0 = LevS->Plan0.plan->local_nx;
    324   N[0] = LevS->Plan0.plan->N[0];
    325   N[1] = LevS->Plan0.plan->N[1];
    326   N[2] = LevS->Plan0.plan->N[2];
    327  
    328   comm = &P->Par.comm_ST;
    329  
    330   fprintf(stderr,"(%i) Comparing groups - AllocNum %i --- All %i\t Psi %i\t PsiT %i\n",P->Par.me, AllocNum, P->Par.Max_me_comm_ST, P->Par.Max_me_comm_ST_Psi, P->Par.Max_my_color_comm_ST_Psi);
    331   if (AllocNum % (P->Par.Max_me_comm_ST*2) == 0) {  // all parallel
    332     comm = &P->Par.comm_ST;
    333     fprintf(stderr,"(%i) Jacobi is done parallely by all\n", P->Par.me);           
    334   } else if (P->Par.Max_me_comm_ST_Psi >= P->Par.Max_my_color_comm_ST_Psi) { // always the bigger group comes first 
    335     if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) {  // coefficients parallel
    336       comm = &P->Par.comm_ST_Psi;
    337       fprintf(stderr,"(%i) Jacobi is done parallely by Psi\n", P->Par.me);       
    338     } else if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) {  // Psis parallel
    339       comm = &P->Par.comm_ST_PsiT;
    340       fprintf(stderr,"(%i) Jacobi is done parallely by PsiT\n", P->Par.me);       
    341     }
     1332  double *c, *s;
     1333  int Lsend, Rsend, Lrecv, Rrecv; // where left(right) column is sent to or where it originates from
     1334  int left, right; // left or right neighbour for process
     1335  double spread = 0., old_spread=0., Spread=0.;
     1336  int i,j,k,l,m,u;
     1337  int set;
     1338  int it_steps; // iteration step counter
     1339  double **Aloc[DiagData->NumMatrices+1], **Uloc; // local columns for one step of A[k]
     1340  double *Around[DiagData->NumMatrices+1], *Uround; // all local columns for one round of A[k]
     1341  double *Atotal[DiagData->NumMatrices+1], *Utotal; // all local columns for one round of A[k]
     1342  double a_i, a_j;
     1343  gsl_matrix *G;
     1344  gsl_vector *h;
     1345  gsl_vector *eval;
     1346  gsl_matrix *evec;
     1347  //gsl_eigen_symmv_workspace *w;
     1348
     1349  max_rounds = (DiagData->AllocNum / 2)/DiagData->ProcNum;  // each process must perform multiple rotations per step of a set
     1350  if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) start %i\tstep %i\tmax.rounds %i\n",P->Par.me, DiagData->ProcRank, DiagData->ProcNum, max_rounds);
     1351 
     1352  // allocate column vectors for interchange of columns
     1353  debug(P,"allocate column vectors for interchange of columns");
     1354  c = (double *) Malloc(sizeof(double)*max_rounds, "ComputeMLWF: c");
     1355  s = (double *) Malloc(sizeof(double)*max_rounds, "ComputeMLWF: s");
     1356  c_all = (double *) Malloc(sizeof(double)*DiagData->AllocNum/2, "ComputeMLWF: c_all");
     1357  s_all = (double *) Malloc(sizeof(double)*DiagData->AllocNum/2, "ComputeMLWF: s_all");
     1358  rcounts = (int *) Malloc(sizeof(int)*DiagData->ProcNum, "ComputeMLWF: rcounts");
     1359  rdispls = (int *) Malloc(sizeof(int)*DiagData->ProcNum, "ComputeMLWF: rdispls");
     1360
     1361  // allocate eigenvector stuff
     1362  debug(P,"allocate eigenvector stuff");
     1363  G = gsl_matrix_calloc (2,2);
     1364  h = gsl_vector_alloc (2);
     1365  eval = gsl_vector_alloc (2);
     1366  evec = gsl_matrix_alloc (2,2);
     1367  //w = gsl_eigen_symmv_alloc(2);
     1368 
     1369  // establish communication partners
     1370  debug(P,"establish communication partners");
     1371  if (DiagData->ProcRank == 0) {
     1372    tagS0 = WannierALTag;   // left p0 always remains left p0
    3421373  } else {
    343     if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) {  // Psis parallel
    344       comm = &P->Par.comm_ST_PsiT;
    345       fprintf(stderr,"(%i) Jacobi is done parallely by PsiT\n", P->Par.me);       
    346     } else if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) {  // coefficients parallel
    347       comm = &P->Par.comm_ST_Psi;
    348       fprintf(stderr,"(%i) Jacobi is done parallely by Psi\n", P->Par.me);       
    349     }
    350   }
    351 
    352   MPI_Comm_size (*comm, &ProcNum);
    353   MPI_Comm_rank (*comm, &ProcRank);
    354  
    355   if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Beginning localization of orbitals ...\n",P->Par.me);
    356  
    357   if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 2\n",P->Par.me);
    358 
    359   // STEP 2: Calculate A[k]_ij = V/N \sum_{G1,G2} C^\ast_{l,G1} c_{m,G2} \sum_R A^{(k)}(R) exp(iR(G2-G1))
    360   gsl_matrix *A[max_operators+1];   // one extra for B matrix
    361   for (u=0;u<=max_operators;u++)
    362     A[u] = gsl_matrix_calloc (AllocNum,AllocNum);  // allocate matrix   
    363 
    364   // create lookup table for sin/cos values
    365   cos_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *cos_lookup");
    366   sin_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *sin_lookup");
    367   for (i=0;i<NDIM;i++) {
    368     // allocate memory
    369     cos_lookup[i] = (double *) Malloc(sizeof(double)*LevS->Plan0.plan->N[i], "ComputeMLWF: cos_lookup");
    370     sin_lookup[i] = (double *) Malloc(sizeof(double)*LevS->Plan0.plan->N[i], "ComputeMLWF: sin_lookup");
    371     // reset arrays
    372     SetArrayToDouble0(cos_lookup[i],LevS->Plan0.plan->N[i]);
    373     SetArrayToDouble0(sin_lookup[i],LevS->Plan0.plan->N[i]);
    374     // create lookup values
    375     for (j=0;j<LevS->Plan0.plan->N[i];j++) {
    376       tmp = 2*PI/(double)LevS->Plan0.plan->N[i]*(double)j;
    377       cos_lookup[i][j] = cos(tmp);
    378       sin_lookup[i][j] = sin(tmp);
    379     }
    380   }
    381   l=-1; // to access U matrix element (0..Num-1)
    382   // fill the matrices
    383   for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
    384     OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
    385     if (OnePsiA->PsiType == type) {   // drop all but occupied ones
    386       l++;  // increase l if it is non-extra wave function
    387       if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
    388         LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
    389       else
    390         LPsiDatA = NULL;  // otherwise processes won't enter second loop, though they're supposed to send coefficients!
    391      
    392       //fprintf(stderr,"(%i),(%i,%i): fft'd, A[..] and B, back-fft'd acting on \\phi_A\n",P->Par.me,l,0);
    393       if (LPsiDatA != NULL) {
    394         CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
    395         // note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
    396         for (n0=0;n0<N0;n0++) 
    397           for (n[1]=0;n[1]<N[1];n[1]++)
    398             for (n[2]=0;n[2]<N[2];n[2]++) {
    399               i0 = n[2]*NUpz+N[2]*NUpz*(n[1]*NUpy+N[1]*NUpy*n0*NUpx);
    400               iS = n[2]+N[2]*(n[1]+N[1]*n0);
    401               n[0] = n0 + LevS->Plan0.plan->start_nx;
    402               for (k=0;k<max_operators;k+=2) {
    403                 e = k/2;
    404                 tmp = 2*PI/(double)(N[e])*(double)(n[e]);
    405                 tmp2 = PsiCR[i0] /LevS->MaxN;
    406                 // check lookup
    407                 if (!l) // perform check on first wave function only
    408                   if ((fabs(cos(tmp) - cos_lookup[e][n[e]]) > MYEPSILON) || (fabs(sin(tmp) - sin_lookup[e][n[e]]) > MYEPSILON)) {
    409                     Error(SomeError, "Lookup table does not match real value!");
    410                   }
    411                 HGcR[k][iS] = cos_lookup[e][n[e]] * tmp2; /* Matrix Vector Mult */
    412                 HGcR2[k][iS] = cos_lookup[e][n[e]] * HGcR[k][iS]; /* Matrix Vector Mult */
    413                 HGcR[k+1][iS] = sin_lookup[e][n[e]] * tmp2; /* Matrix Vector Mult */
    414                 HGcR2[k+1][iS] = sin_lookup[e][n[e]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
    415               }
    416             }
    417         for (u=0;u<max_operators;u++) {
    418           fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[u], work);
    419           fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[u], work);
     1374    tagS0 = (DiagData->ProcRank == DiagData->ProcNum - 1) ? WannierARTag : WannierALTag; // left p_last becomes right p_last         
     1375  }
     1376  tagS1 = (DiagData->ProcRank == 0) ? WannierALTag : WannierARTag; // right p0 always goes into left p1
     1377  tagR0 = WannierALTag; //
     1378  tagR1 = WannierARTag; // first process
     1379  if (DiagData->ProcRank == 0) {
     1380    left = DiagData->ProcNum-1;
     1381    right = 1;
     1382    Lsend = 0;
     1383    Rsend = 1;
     1384    Lrecv = 0;
     1385    Rrecv = 1;       
     1386  } else if (DiagData->ProcRank == DiagData->ProcNum - 1) {
     1387    left = DiagData->ProcRank - 1;
     1388    right = 0;
     1389    Lsend = DiagData->ProcRank;
     1390    Rsend = DiagData->ProcRank - 1;
     1391    Lrecv = DiagData->ProcRank - 1;
     1392    Rrecv = DiagData->ProcRank;       
     1393  } else {
     1394    left = DiagData->ProcRank - 1;
     1395    right = DiagData->ProcRank + 1;
     1396    Lsend = DiagData->ProcRank+1;
     1397    Rsend = DiagData->ProcRank - 1;
     1398    Lrecv = DiagData->ProcRank - 1;
     1399    Rrecv = DiagData->ProcRank+1;
     1400  }
     1401  //if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) left %i\t right %i --- Lsend %i\tRsend%i\tLrecv %i\tRrecv%i\n",P->Par.me, left, right, Lsend, Rsend, Lrecv, Rrecv);
     1402 
     1403  // initialise A_loc
     1404  debug(P,"initialise A_loc");
     1405  for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
     1406    //Aloc[k] = (double *) Malloc(sizeof(double)*AllocNum*2, "ComputeMLWF: Aloc[k]");
     1407    Around[k] = (double *) Malloc(sizeof(double)*DiagData->AllocNum*2*max_rounds, "ComputeMLWF: Around[k]");
     1408    Atotal[k] = (double *) Malloc(sizeof(double)*DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Atotal[k]");
     1409    Aloc[k] = (double **) Malloc(sizeof(double *)*2*max_rounds, "ComputeMLWF: Aloc[k]");
     1410    //Around[k] = &Atotal[k][ProcRank*AllocNum*2*max_rounds];
     1411   
     1412    for (round=0;round<max_rounds;round++) {
     1413      Aloc[k][2*round] = &Around[k][DiagData->AllocNum*(2*round)];
     1414      Aloc[k][2*round+1] = &Around[k][DiagData->AllocNum*(2*round+1)];
     1415      for (l=0;l<DiagData->AllocNum;l++) {
     1416        Aloc[k][2*round][l] = gsl_matrix_get(DiagData->A[k],l,2*(DiagData->ProcRank*max_rounds+round));
     1417        Aloc[k][2*round+1][l] = gsl_matrix_get(DiagData->A[k],l,2*(DiagData->ProcRank*max_rounds+round)+1);
     1418        //fprintf(stderr,"(%i) (%i, 0/1) A_loc1 %e\tA_loc2 %e\n",P->Par.me, l, Aloc[k][l],Aloc[k][l+AllocNum]);
     1419      }
     1420    }
     1421  }
     1422  // initialise U_loc
     1423  debug(P,"initialise U_loc");
     1424  //Uloc = (double *) Malloc(sizeof(double)*AllocNum*2, "ComputeMLWF: Uloc");
     1425  Uround = (double *) Malloc(sizeof(double)*DiagData->AllocNum*2*max_rounds, "ComputeMLWF: Uround");
     1426  Utotal = (double *) Malloc(sizeof(double)*DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Utotal");
     1427  Uloc = (double **) Malloc(sizeof(double *)*2*max_rounds, "ComputeMLWF: Uloc");
     1428  //Uround = &Utotal[ProcRank*AllocNum*2*max_rounds];
     1429  for (round=0;round<max_rounds;round++) {
     1430    Uloc[2*round] = &Uround[DiagData->AllocNum*(2*round)];
     1431    Uloc[2*round+1] = &Uround[DiagData->AllocNum*(2*round+1)];
     1432    for (l=0;l<DiagData->AllocNum;l++) {
     1433      Uloc[2*round][l] = gsl_matrix_get(DiagData->U,l,2*(DiagData->ProcRank*max_rounds+round));
     1434      Uloc[2*round+1][l] = gsl_matrix_get(DiagData->U,l,2*(DiagData->ProcRank*max_rounds+round)+1);
     1435      //fprintf(stderr,"(%i) (%i, 0/1) U_loc1 %e\tU_loc2 %e\n",P->Par.me, l, Uloc[l+AllocNum*0],Uloc[l+AllocNum*1]);
     1436    }
     1437  }
     1438 
     1439  // now comes the iteration loop
     1440  debug(P,"now comes the iteration loop");
     1441  it_steps = 0;
     1442  do {
     1443    it_steps++;
     1444    //if (P->Par.me == 0) fprintf(stderr,"(%i) Beginning parallel iteration %i ... ",P->Par.me,it_steps);
     1445    for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 staying at its place all the time
     1446      //fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
     1447      for (round = 0; round < max_rounds;round++) {
     1448        start = DiagData->ProcRank * max_rounds + round;
     1449        // get indices
     1450        i = DiagData->top[start] < DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // minimum of the two indices
     1451        iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1;
     1452        j = DiagData->top[start] > DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // maximum of the two indices: thus j >  i
     1453        jloc =  DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
     1454        //fprintf(stderr,"(%i) my (%i,%i), loc(%i,%i)\n",P->Par.me, i,j, iloc, jloc);
     1455       
     1456        // calculate rotation angle, i.e. c and s
     1457        //fprintf(stderr,"(%i),(%i,%i) calculate rotation angle\n",P->Par.me,i,j);
     1458        gsl_matrix_set_zero(G);       
     1459        for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
     1460          // Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]         
     1461          //fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,Aloc[k][2*round+iloc][i], Aloc[k][2*round+jloc][j],Aloc[k][2*round+iloc][i] - Aloc[k][2*round+jloc][j]);         
     1462          //fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,Aloc[k][2*round+jloc][i], Aloc[k][2*round+iloc][j],Aloc[k][2*round+jloc][i] + Aloc[k][2*round+iloc][j]);         
     1463          gsl_vector_set(h, 0, Aloc[k][2*round+iloc][i] - Aloc[k][2*round+jloc][j]);
     1464          gsl_vector_set(h, 1, Aloc[k][2*round+jloc][i] + Aloc[k][2*round+iloc][j]);
     1465         
     1466          // Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
     1467          for (l=0;l<2;l++)
     1468            for (m=0;m<2;m++)
     1469                gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
    4201470        }
    421       } 
    422       m = -1; // to access U matrix element (0..Num-1)
    423       for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
    424         OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
    425         if (OnePsiB->PsiType == type) {   // drop all but occupied ones
    426           m++;  // increase m if it is non-extra wave function
    427           if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
    428              LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
    429           else
    430              LOnePsiB = NULL;
    431           if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
    432             RecvSource = OnePsiB->my_color_comm_ST_Psi;
    433             MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
    434             LPsiDatB=LevS->LPsi->TempPsi;
    435           } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
    436             for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
    437               if (p != OnePsiB->my_color_comm_ST_Psi)
    438                 MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
    439             LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
    440           } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
    441 
    442           for (u=0;u<max_operators;u++) {
    443             a_ij = 0;
    444             b_ij = 0;
    445             if (LPsiDatA != NULL) { // calculate, reduce and send to all ...
    446               //fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,u);
    447               g=0;
    448               if (LevS->GArray[0].GSq == 0.0) {
    449                 Index = LevS->GArray[g].Index;
    450                 a_ij = (LPsiDatB[0].re*HGcRC[u][Index].re + LPsiDatB[0].im*HGcRC[u][Index].im);
    451                 b_ij = (LPsiDatB[0].re*HGcR2C[u][Index].re + LPsiDatB[0].im*HGcR2C[u][Index].im);
    452                 g++;
    453               }
    454               for (; g < LevS->MaxG; g++) {
    455                 Index = LevS->GArray[g].Index;
    456                 a_ij += 2*(LPsiDatB[g].re*HGcRC[u][Index].re + LPsiDatB[g].im*HGcRC[u][Index].im);
    457                 b_ij += 2*(LPsiDatB[g].re*HGcR2C[u][Index].re + LPsiDatB[g].im*HGcR2C[u][Index].im);
    458               } // due to the symmetry the resulting matrix element is real and symmetric in (i,j) ! (complex multiplication simplifies ...)
    459               // sum up elements from all coefficients sharing processes
    460               MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
    461               MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
    462               a_ij = A_ij;
    463               b_ij = B_ij;
    464               // send element to all Psi-sharing who don't have l local (MPI_Send is a lot slower than AllReduce!)
    465               MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
    466               MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
    467             } else { // receive ...
    468               MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
    469               MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);             
    470             }
    471             // ... and store
    472             //fprintf(stderr,"(%i),(%i,%i): A[%i]: setting component (local: %lg, total: %lg)\n",P->Par.me, l,m,u,a_ij,A_ij);
    473             //fprintf(stderr,"(%i),(%i,%i): B: adding upon component (local: %lg, total: %lg)\n",P->Par.me, l,m,b_ij,B_ij);
    474             gsl_matrix_set(A[u], l, m, A_ij);
    475             gsl_matrix_set(A[max_operators], l, m, B_ij + gsl_matrix_get(A[max_operators],l,m));
     1471        //fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
     1472        // STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
     1473        EigensolverFor22Matrix(P,G,eval,evec);
     1474        //gsl_eigen_symmv(G, eval, evec, w);  // calculates eigenvalues and eigenvectors of G   
     1475
     1476        CalculateRotationAnglesFromEigenvalues(evec, eval, &c[round], &s[round]);
     1477        //fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[round],s[round]);
     1478
     1479        //fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
     1480        // V_loc = V_loc * V_small
     1481        //debug(P,"apply rotation to local U");
     1482        for (l=0;l<DiagData->AllocNum;l++) {
     1483          a_i = Uloc[2*round+iloc][l];
     1484          a_j = Uloc[2*round+jloc][l];
     1485          Uloc[2*round+iloc][l] =  c[round] * a_i + s[round] * a_j;
     1486          Uloc[2*round+jloc][l] = -s[round] * a_i + c[round] * a_j;
     1487        }
     1488      }  // end of round       
     1489      // circulate the rotation angles
     1490      //debug(P,"circulate the rotation angles");
     1491      MPI_Allgather(c, max_rounds, MPI_DOUBLE, c_all, max_rounds, MPI_DOUBLE, *(DiagData->comm));   // MPI_Allgather is waaaaay faster than ring circulation
     1492      MPI_Allgather(s, max_rounds, MPI_DOUBLE, s_all, max_rounds, MPI_DOUBLE, *(DiagData->comm));
     1493      //m = start;
     1494      for (l=0;l<DiagData->AllocNum/2;l++) { // for each process
     1495        // we have V_small from process k
     1496        //debug(P,"Apply V_small from other process");
     1497        i = DiagData->top[l] < DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // minimum of the two indices
     1498        j = DiagData->top[l] > DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // maximum of the two indices: thus j >  i
     1499        iloc = DiagData->top[l] < DiagData->bot[l] ? 0 : 1;
     1500        jloc = DiagData->top[l] > DiagData->bot[l] ? 0 : 1;
     1501        for (m=0;m<max_rounds;m++) {
     1502          //fprintf(stderr,"(%i) %i processes' (%i,%i)\n",P->Par.me, m,i,j);
     1503          // apply row rotation to each A[k]
     1504          for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
     1505            //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+iloc][i],j,Aloc[k][2*m+iloc][j]);
     1506            //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+jloc][i],j,Aloc[k][2*m+jloc][j]);
     1507            a_i = Aloc[k][2*m+iloc][i];
     1508            a_j = Aloc[k][2*m+iloc][j];
     1509            Aloc[k][2*m+iloc][i] =  c_all[l] * a_i + s_all[l] * a_j;
     1510            Aloc[k][2*m+iloc][j] = -s_all[l] * a_i + c_all[l] * a_j;
     1511            a_i = Aloc[k][2*m+jloc][i];
     1512            a_j = Aloc[k][2*m+jloc][j];
     1513            Aloc[k][2*m+jloc][i] =  c_all[l] * a_i + s_all[l] * a_j;
     1514            Aloc[k][2*m+jloc][j] = -s_all[l] * a_i + c_all[l] * a_j;
     1515            //fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+iloc][i],j,Aloc[k][2*m+iloc][j]);
     1516            //fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+jloc][i],j,Aloc[k][2*m+jloc][j]);
    4761517          }
    4771518        }
    4781519      }
    479     }
    480   }
    481   // reset extra entries
    482   for (u=0;u<=max_operators;u++) {
    483     for (i=Num;i<AllocNum;i++)
    484       for (j=0;j<AllocNum;j++)
    485         gsl_matrix_set(A[u], i,j, 0.);
    486     for (i=Num;i<AllocNum;i++)
    487       for (j=0;j<AllocNum;j++)
    488         gsl_matrix_set(A[u], j,i, 0.);
    489   }
    490   // free lookups
    491   for (i=0;i<NDIM;i++) {
    492     Free(cos_lookup[i], "bla");
    493     Free(sin_lookup[i], "bla");
    494   }
    495   Free(cos_lookup, "bla");
    496   Free(sin_lookup, "bla");
    497  
    498   if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 1\n",P->Par.me);
    499   // STEP 1: Initialize U = 1
    500   gsl_matrix *U = gsl_matrix_alloc (AllocNum,AllocNum);
    501   gsl_matrix_set_identity(U);
    502 
    503   // init merry-go-round array
    504   top = (int *) Malloc(sizeof(int)*AllocNum/2, "ComputeMLWF: top");
    505   bot = (int *) Malloc(sizeof(int)*AllocNum/2, "ComputeMLWF: bot");
    506   //debug(P,"init merry-go-round array");
    507   for (i=0;i<AllocNum/2;i++) {
    508     top[i] = 2*i;
    509     bot[i] = 2*i+1;     
    510   }
    511 
    512 /*// print A matrices for debug
    513   if (P->Par.me == 0)
    514     for (u=0;u<max_operators+1;u++) {
    515      fprintf(stderr, "A[%i] = \n",u);
    516       for (i=0;i<Num;i++) {
    517         for (j=0;j<Num;j++)
    518           fprintf(stderr, "%e\t",gsl_matrix_get(A[u],i,j));
    519         fprintf(stderr, "\n");
    520       }
    521     } 
    522 */
    523   if (Num != 1) {
    524     // one- or multi-process case?
    525     if (((AllocNum % 2) == 0) && (ProcNum != 1) && ((AllocNum / 2) % ProcNum == 0)) {
    526       max_rounds = (AllocNum / 2)/ProcNum;  // each process must perform multiple rotations per step of a set
    527       //fprintf(stderr,"(%i) start %i\tstep %i\tmax.rounds %i\n",P->Par.me, ProcRank, ProcNum, max_rounds);
    528       // allocate column vectors for interchange of columns
    529       //debug(P,"allocate column vectors for interchange of columns");
    530       c = (double *) Malloc(sizeof(double)*max_rounds, "ComputeMLWF: c");
    531       s = (double *) Malloc(sizeof(double)*max_rounds, "ComputeMLWF: s");
    532       c_all = (double *) Malloc(sizeof(double)*AllocNum/2, "ComputeMLWF: c_all");
    533       s_all = (double *) Malloc(sizeof(double)*AllocNum/2, "ComputeMLWF: s_all");
    534       rcounts = (int *) Malloc(sizeof(int)*ProcNum, "ComputeMLWF: rcounts");
    535       rdispls = (int *) Malloc(sizeof(int)*ProcNum, "ComputeMLWF: rdispls");
    536 /*    // print starting values of index generation tables top and bot
    537       fprintf(stderr,"top\t");
    538       for (k=0;k<AllocNum/2;k++)
    539         fprintf(stderr,"%i\t", top[k]);
    540       fprintf(stderr,"\n");
    541       fprintf(stderr,"bot\t");
    542       for (k=0;k<AllocNum/2;k++)
    543         fprintf(stderr,"%i\t", bot[k]);
    544       fprintf(stderr,"\n");*/
    545       // establish communication partners
    546       //debug(P,"establish communication partners");
    547       if (ProcRank == 0) {
    548         tagS0 = WannierALTag;   // left p0 always remains left p0
    549       } else {
    550         tagS0 = ProcRank == ProcNum - 1 ? WannierARTag : WannierALTag; // left p_last becomes right p_last         
    551       }
    552       tagS1 = ProcRank == 0 ? WannierALTag : WannierARTag; // right p0 always goes into left p1
    553       tagR0 = WannierALTag; //
    554       tagR1 = WannierARTag; // first process
    555       if (ProcRank == 0) {
    556         left = ProcNum-1;
    557         right = 1;
    558         Lsend = 0;
    559         Rsend = 1;
    560         Lrecv = 0;
    561         Rrecv = 1;       
    562       } else if (ProcRank == ProcNum - 1) {
    563         left = ProcRank - 1;
    564         right = 0;
    565         Lsend = ProcRank;
    566         Rsend = ProcRank - 1;
    567         Lrecv = ProcRank - 1;
    568         Rrecv = ProcRank;       
    569       } else {
    570         left = ProcRank - 1;
    571         right = ProcRank + 1;
    572         Lsend = ProcRank+1;
    573         Rsend = ProcRank - 1;
    574         Lrecv = ProcRank - 1;
    575         Rrecv = ProcRank+1;
    576       }
    577       //fprintf(stderr,"(%i) left %i\t right %i --- Lsend %i\tRsend%i\tLrecv %i\tRrecv%i\n",P->Par.me, left, right, Lsend, Rsend, Lrecv, Rrecv);
    578       // allocate eigenvector stuff
    579       //debug(P,"allocate eigenvector stuff");
    580       G = gsl_matrix_calloc (2,2);
    581       h = gsl_vector_alloc (2);
    582       eval = gsl_vector_alloc (2);
    583       evec = gsl_matrix_alloc (2,2);
    584       w = gsl_eigen_symmv_alloc(2);
    585       // initialise A_loc
    586       //debug(P,"initialise A_loc");
    587       for (k=0;k<max_operators+1;k++) {
    588         //Aloc[k] = (double *) Malloc(sizeof(double)*AllocNum*2, "ComputeMLWF: Aloc[k]");
    589         Around[k] = (double *) Malloc(sizeof(double)*AllocNum*2*max_rounds, "ComputeMLWF: Around[k]");
    590         Atotal[k] = (double *) Malloc(sizeof(double)*AllocNum*AllocNum, "ComputeMLWF: Atotal[k]");
    591         Aloc[k] = (double **) Malloc(sizeof(double *)*2*max_rounds, "ComputeMLWF: Aloc[k]");
    592         //Around[k] = &Atotal[k][ProcRank*AllocNum*2*max_rounds];
    593        
    594         for (round=0;round<max_rounds;round++) {
    595           Aloc[k][2*round] = &Around[k][AllocNum*(2*round)];
    596           Aloc[k][2*round+1] = &Around[k][AllocNum*(2*round+1)];
    597           for (l=0;l<AllocNum;l++) {
    598             Aloc[k][2*round][l] = gsl_matrix_get(A[k],l,2*(ProcRank*max_rounds+round));
    599             Aloc[k][2*round+1][l] = gsl_matrix_get(A[k],l,2*(ProcRank*max_rounds+round)+1);
    600             //fprintf(stderr,"(%i) (%i, 0/1) A_loc1 %e\tA_loc2 %e\n",P->Par.me, l, Aloc[k][l],Aloc[k][l+AllocNum]);
     1520      // apply rotation to local operator matrices
     1521      // A_loc = A_loc * V_small
     1522      //debug(P,"apply rotation to local operator matrices A[k]");
     1523      for (m=0;m<max_rounds;m++) {
     1524        start = DiagData->ProcRank * max_rounds + m;
     1525        iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1;
     1526        jloc = DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
     1527        for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// extra for B matrix !
     1528          for (l=0;l<DiagData->AllocNum;l++) {
     1529            // Columns, i and j belong to this process only!
     1530            a_i = Aloc[k][2*m+iloc][l];
     1531            a_j = Aloc[k][2*m+jloc][l];
     1532            Aloc[k][2*m+iloc][l] =  c[m] * a_i + s[m] * a_j;
     1533            Aloc[k][2*m+jloc][l] = -s[m] * a_i + c[m] * a_j;
     1534            //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, l, Aloc[k][2*m+iloc][l],l,Aloc[k][2*m+jloc][l]);
    6011535          }
    6021536        }
    6031537      }
    604       // initialise U_loc
    605       //debug(P,"initialise U_loc");
    606       //Uloc = (double *) Malloc(sizeof(double)*AllocNum*2, "ComputeMLWF: Uloc");
    607       Uround = (double *) Malloc(sizeof(double)*AllocNum*2*max_rounds, "ComputeMLWF: Uround");
    608       Utotal = (double *) Malloc(sizeof(double)*AllocNum*AllocNum, "ComputeMLWF: Utotal");
    609       Uloc = (double **) Malloc(sizeof(double *)*2*max_rounds, "ComputeMLWF: Uloc");
    610       //Uround = &Utotal[ProcRank*AllocNum*2*max_rounds];
    611       for (round=0;round<max_rounds;round++) {
    612         Uloc[2*round] = &Uround[AllocNum*(2*round)];
    613         Uloc[2*round+1] = &Uround[AllocNum*(2*round+1)];
    614         for (l=0;l<AllocNum;l++) {
    615           Uloc[2*round][l] = gsl_matrix_get(U,l,2*(ProcRank*max_rounds+round));
    616           Uloc[2*round+1][l] = gsl_matrix_get(U,l,2*(ProcRank*max_rounds+round)+1);
    617           //fprintf(stderr,"(%i) (%i, 0/1) U_loc1 %e\tU_loc2 %e\n",P->Par.me, l, Uloc[l+AllocNum*0],Uloc[l+AllocNum*1]);
     1538      // Shuffling of these round's columns to prepare next rotation set
     1539      for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
     1540        // extract columns from A
     1541        //debug(P,"extract columns from A");
     1542        MerryGoRoundColumns(*(DiagData->comm), Aloc[k], DiagData->AllocNum, max_rounds, k, tagS0, tagS1, tagR0, tagR1);
     1543                 
     1544      }       
     1545      // and also for V ...
     1546      //debug(P,"extract columns from U");
     1547      MerryGoRoundColumns(*(DiagData->comm), Uloc, DiagData->AllocNum, max_rounds, 0, tagS0, tagS1, tagR0, tagR1);
     1548
     1549
     1550      // and merry-go-round for the indices too           
     1551      //debug(P,"and merry-go-round for the indices too");
     1552      MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
     1553    }
     1554
     1555    //fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
     1556    // STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
     1557    old_spread = Spread;
     1558    spread = 0.;
     1559    for(k=0;k<DiagData->NumMatrices;k++) {   // go through all self-adjoint operators
     1560      for (i=0; i < 2*max_rounds; i++) {  // go through all wave functions
     1561          spread += Aloc[k][i][i+DiagData->ProcRank*2*max_rounds]*Aloc[k][i][i+DiagData->ProcRank*2*max_rounds];
     1562          //spread += gsl_matrix_get(A[k],i,i)*gsl_matrix_get(A[k],i,i);
     1563      }
     1564    }
     1565    MPI_Allreduce(&spread, &Spread, 1, MPI_DOUBLE, MPI_SUM, *(DiagData->comm));
     1566    //Spread = spread;
     1567    if (P->Par.me == 0) {
     1568      //if(P->Call.out[ReadOut])
     1569      //  fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,Spread,R->EpsWannier);
     1570      //else
     1571        //fprintf(stderr,"%2.9e\n",Spread);
     1572    }
     1573    // STEP 5: check change of variance
     1574  } while (fabs(old_spread-Spread) >= R->EpsWannier);
     1575  // end of iterative diagonalization loop: We have found our final orthogonal U!
     1576
     1577  // gather local parts of U into complete matrix
     1578  for (l=0;l<DiagData->ProcNum;l++)
     1579    rcounts[l] = DiagData->AllocNum;
     1580  debug(P,"allgather U");
     1581  for (round=0;round<2*max_rounds;round++) {
     1582    for (l=0;l<DiagData->ProcNum;l++)
     1583      rdispls[l] = (l*2*max_rounds + round)*DiagData->AllocNum;
     1584    MPI_Allgatherv(Uloc[round], DiagData->AllocNum, MPI_DOUBLE, Utotal, rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
     1585  }
     1586  for (k=0;k<DiagData->AllocNum;k++) {   
     1587    for(l=0;l<DiagData->AllocNum;l++) {
     1588      gsl_matrix_set(DiagData->U,k,l, Utotal[l+k*DiagData->AllocNum]);
     1589    }
     1590  }
     1591
     1592  // after one set, gather A[k] from all and calculate spread
     1593  for (l=0;l<DiagData->ProcNum;l++)
     1594    rcounts[l] = DiagData->AllocNum;
     1595  debug(P,"gather A[k] for spread");
     1596  for (u=0;u<DiagData->NumMatrices+DiagData->extra;u++) {// extra for B matrix !
     1597    debug(P,"A[k] all gather");
     1598    for (round=0;round<2*max_rounds;round++) {
     1599      for (l=0;l<DiagData->ProcNum;l++)
     1600        rdispls[l] = (l*2*max_rounds + round)*DiagData->AllocNum;
     1601      MPI_Allgatherv(Aloc[u][round], DiagData->AllocNum, MPI_DOUBLE, Atotal[u], rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
     1602    }
     1603    for (k=0;k<DiagData->AllocNum;k++) {   
     1604      for(l=0;l<DiagData->AllocNum;l++) {
     1605        gsl_matrix_set(DiagData->A[u],k,l, Atotal[u][l+k*DiagData->AllocNum]);
     1606      }
     1607    }
     1608  }
     1609   
     1610  // free eigenvector stuff     
     1611  gsl_vector_free(h);
     1612  gsl_matrix_free(G);
     1613  //gsl_eigen_symmv_free(w); 
     1614  gsl_vector_free(eval);
     1615  gsl_matrix_free(evec);
     1616 
     1617  // Free column vectors
     1618  for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
     1619    Free(Atotal[k], "ParallelDiagonalization: Atotal[k]");
     1620    Free(Around[k], "ParallelDiagonalization: Around[k]");
     1621  }
     1622  Free(Uround, "ParallelDiagonalization: Uround");
     1623  Free(Utotal, "ParallelDiagonalization: Utotal");
     1624  Free(c_all, "ParallelDiagonalization: c_all");
     1625  Free(s_all, "ParallelDiagonalization: s_all");
     1626  Free(c, "ParallelDiagonalization: c");
     1627  Free(s, "ParallelDiagonalization: s");
     1628  Free(rcounts, "ParallelDiagonalization: rcounts");
     1629  Free(rdispls, "ParallelDiagonalization: rdispls");
     1630}
     1631/*
     1632 * \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
     1633 * \param *U transformation matrix set to unity matrix
     1634 * \param NumMatrices number of matrices to be diagonalized
     1635 * \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
     1636 * \param AllocNum number of rows/columns in matrices
     1637 * \param Num number of wave functions
     1638 * \param *top array with top row indices (merry-go-round)
     1639 * \param *bot array with bottom row indices (merry-go-round)
     1640 */
     1641
     1642/** Simultaneous Diagonalization of variances matrices with one cpu.
     1643 * \param *P Problem at hand
     1644 * \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
     1645 * \note this is faster given one cpu only than ParallelDiagonalization()
     1646 */
     1647void SerialDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData)
     1648{
     1649  struct RunStruct *R = &P->R;
     1650  gsl_matrix *G;
     1651  gsl_vector *h;
     1652  gsl_vector *eval;
     1653  gsl_matrix *evec;
     1654  //gsl_eigen_symmv_workspace *w;
     1655  double *c,*s,a_i,a_j;
     1656  int it_steps, set, ProcRank;
     1657  int i,j,k,l,m;
     1658  double spread = 0., old_spread = 0.;\
     1659
     1660  // allocate eigenvector stuff
     1661  debug(P,"allocate eigenvector stuff");
     1662  G = gsl_matrix_calloc (2,2);
     1663  h = gsl_vector_alloc (2);
     1664  eval = gsl_vector_alloc (2);
     1665  evec = gsl_matrix_alloc (2,2);
     1666  //w = gsl_eigen_symmv_alloc(2);
     1667
     1668  c = (double *) Malloc(sizeof(double), "ComputeMLWF: c");
     1669  s = (double *) Malloc(sizeof(double), "ComputeMLWF: s");
     1670  debug(P,"now comes the iteration loop");
     1671  it_steps=0;
     1672  do {
     1673    it_steps++;
     1674    //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 3: Iteratively maximize negative spread part\n",P->Par.me);
     1675    //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Beginning iteration %i ... ",P->Par.me,it_steps);
     1676    for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 stating at its place all the time
     1677      //fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
     1678      // STEP 3: for all index pairs 0<= i<j <AllocNum
     1679      for (ProcRank=0;ProcRank<DiagData->AllocNum/2;ProcRank++) {
     1680        // get indices
     1681        i = DiagData->top[ProcRank] < DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // minimum of the two indices
     1682        j = DiagData->top[ProcRank] > DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // maximum of the two indices: thus j >  i
     1683        //fprintf(stderr,"(%i),(%i,%i) STEP 3a\n",P->Par.me,i,j);
     1684        // STEP 3a: Calculate G
     1685        gsl_matrix_set_zero(G);
     1686       
     1687        for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
     1688          // Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
     1689          //fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,i), gsl_matrix_get(DiagData->A[k],j,j),gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));         
     1690          //fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,j), gsl_matrix_get(DiagData->A[k],j,i),gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));         
     1691          gsl_vector_set(h, 0, gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));
     1692          gsl_vector_set(h, 1, gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));
     1693          //gsl_vector_complex_set(h, 2, gsl_complex_mul_imag(gsl_complex_add(gsl_matrix_complex_get(A[k],j,i), gsl_matrix_complex_get(A[k],i,j)),1));
     1694         
     1695          // Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
     1696          for (l=0;l<2;l++)
     1697            for (m=0;m<2;m++)
     1698                gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
     1699               
     1700          //PrintGSLMatrix(P,G,2, "G");
    6181701        }
    619       }
    620       // now comes the iteration loop
    621       //debug(P,"now comes the iteration loop");
    622       it_steps = 0;
    623       do {
    624         it_steps++;
    625         fprintf(stderr,"(%i) Beginning parallel iteration %i ... ",P->Par.me,it_steps);
    626         for (set=0; set < AllocNum-1; set++) { // one column less due to column 0 stating at its place all the time
    627           //fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
    628           for (round = 0; round < max_rounds;round++) {
    629             start = ProcRank * max_rounds + round;
    630             // get indices
    631             i = top[start] < bot[start] ? top[start] : bot[start]; // minimum of the two indices
    632             iloc = top[start] < bot[start] ? 0 : 1;
    633             j = top[start] > bot[start] ? top[start] : bot[start]; // maximum of the two indices: thus j >  i
    634             jloc =  top[start] > bot[start] ? 0 : 1;
    635             //fprintf(stderr,"(%i) my (%i,%i), loc(%i,%i)\n",P->Par.me, i,j, iloc, jloc);
    636            
    637             // calculate rotation angle, i.e. c and s
    638             //fprintf(stderr,"(%i),(%i,%i) calculate rotation angle\n",P->Par.me,i,j);
    639             gsl_matrix_set_zero(G);       
    640             for (k=0;k<max_operators;k++) { // go through all operators ...
    641               // Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]         
    642               //fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,Aloc[k][2*round+iloc][i], Aloc[k][2*round+jloc][j],Aloc[k][2*round+iloc][i] - Aloc[k][2*round+jloc][j]);         
    643               //fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e - %e = %e\n",P->Par.me, k,Aloc[k][2*round+jloc][i], Aloc[k][2*round+iloc][j],Aloc[k][2*round+jloc][i] + Aloc[k][2*round+iloc][j]);         
    644               gsl_vector_set(h, 0, Aloc[k][2*round+iloc][i] - Aloc[k][2*round+jloc][j]);
    645               gsl_vector_set(h, 1, Aloc[k][2*round+jloc][i] + Aloc[k][2*round+iloc][j]);
    646              
    647               // Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
    648               for (l=0;l<2;l++)
    649                 for (m=0;m<2;m++)
    650                     gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
    651             }
    652             //fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
    653             // STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
    654             gsl_eigen_symmv(G, eval, evec, w);  // calculates eigenvalues and eigenvectors of G   
    655             index = gsl_vector_max_index (eval);  // get biggest eigenvalue
    656             x = gsl_matrix_get(evec, 0, index);
    657             y = gsl_matrix_get(evec, 1, index) * x/fabs(x);
    658             x = fabs(x);  // ensure x>=0 so that rotation angles remain smaller Pi/4     
    659             //fprintf(stderr,"(%i),(%i,%i) STEP 3c\n",P->Par.me,i,j);
    660             // STEP 3c: calculate R = [[c,s^\ast],[-s,c^\ast]]
    661             r = sqrt(x*x + y*y);
    662             c[round] = sqrt((x + r) / (2*r));
    663             s[round] = y / sqrt(2*r*(x+r));
    664             // [[c,s],[-s,c]]= V_small
    665             //fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[round],s[round]);
    666    
    667             //fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
    668             // V_loc = V_loc * V_small
    669             //debug(P,"apply rotation to local U");
    670             for (l=0;l<AllocNum;l++) {
    671               a_i = Uloc[2*round+iloc][l];
    672               a_j = Uloc[2*round+jloc][l];
    673               Uloc[2*round+iloc][l] =  c[round] * a_i + s[round] * a_j;
    674               Uloc[2*round+jloc][l] = -s[round] * a_i + c[round] * a_j;
    675             }
    676           }  // end of round       
    677           // circulate the rotation angles
    678           //debug(P,"circulate the rotation angles");
    679           MPI_Allgather(c, max_rounds, MPI_DOUBLE, c_all, max_rounds, MPI_DOUBLE, *comm);   // MPI_Allgather is waaaaay faster than ring circulation
    680           MPI_Allgather(s, max_rounds, MPI_DOUBLE, s_all, max_rounds, MPI_DOUBLE, *comm);
    681           //m = start;
    682           for (l=0;l<AllocNum/2;l++) { // for each process
    683             // we have V_small from process k
    684             //debug(P,"Apply V_small from other process");
    685             i = top[l] < bot[l] ? top[l] : bot[l]; // minimum of the two indices
    686             j = top[l] > bot[l] ? top[l] : bot[l]; // maximum of the two indices: thus j >  i
    687             iloc = top[l] < bot[l] ? 0 : 1;
    688             jloc =  top[l] > bot[l] ? 0 : 1;
    689             for (m=0;m<max_rounds;m++) {
    690               //fprintf(stderr,"(%i) %i processes' (%i,%i)\n",P->Par.me, m,i,j);
    691               // apply row rotation to each A[k]
    692               for (k=0;k<max_operators+1;k++) {// one extra for B matrix !
    693                 //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+iloc][i],j,Aloc[k][2*m+iloc][j]);
    694                 //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+jloc][i],j,Aloc[k][2*m+jloc][j]);
    695                 a_i = Aloc[k][2*m+iloc][i];
    696                 a_j = Aloc[k][2*m+iloc][j];
    697                 Aloc[k][2*m+iloc][i] =  c_all[l] * a_i + s_all[l] * a_j;
    698                 Aloc[k][2*m+iloc][j] = -s_all[l] * a_i + c_all[l] * a_j;
    699                 a_i = Aloc[k][2*m+jloc][i];
    700                 a_j = Aloc[k][2*m+jloc][j];
    701                 Aloc[k][2*m+jloc][i] =  c_all[l] * a_i + s_all[l] * a_j;
    702                 Aloc[k][2*m+jloc][j] = -s_all[l] * a_i + c_all[l] * a_j;
    703                 //fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+iloc][i],j,Aloc[k][2*m+iloc][j]);
    704                 //fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2*m+jloc][i],j,Aloc[k][2*m+jloc][j]);
    705               }
    706             }
     1702     
     1703        //fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
     1704        // STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
     1705        EigensolverFor22Matrix(P,G,eval,evec);
     1706        //gsl_eigen_symmv(G, eval, evec, w);  // calculates eigenvalues and eigenvectors of G
     1707
     1708        //PrintGSLMatrix(P, evec, 2, "Eigenvectors");
     1709        CalculateRotationAnglesFromEigenvalues(evec, eval, c, s);
     1710        //fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[0],s[0]);
     1711     
     1712        //fprintf(stderr,"(%i),(%i,%i) STEP 3d\n",P->Par.me,i,j);
     1713        // STEP 3d: apply rotation R to rows and columns of A^{(k)}
     1714        for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
     1715          //PrintGSLMatrix(P,A[k],AllocNum, "A (before rot)");
     1716          for (l=0;l<DiagData->AllocNum;l++) {
     1717            // Rows
     1718            a_i = gsl_matrix_get(DiagData->A[k],i,l);
     1719            a_j = gsl_matrix_get(DiagData->A[k],j,l);
     1720            gsl_matrix_set(DiagData->A[k], i, l,  c[0] * a_i + s[0] * a_j);
     1721            gsl_matrix_set(DiagData->A[k], j, l, -s[0] * a_i + c[0] * a_j);
    7071722          }
    708           // apply rotation to local operator matrices
    709           // A_loc = A_loc * V_small
    710           //debug(P,"apply rotation to local operator matrices A[k]");
    711           for (m=0;m<max_rounds;m++) {
    712             start = ProcRank * max_rounds + m;
    713             iloc = top[start] < bot[start] ? 0 : 1;
    714             jloc =  top[start] > bot[start] ? 0 : 1;
    715             for (k=0;k<max_operators+1;k++) {// one extra for B matrix !
    716               for (l=0;l<AllocNum;l++) {
    717                 // Columns, i and j belong to this process only!
    718                 a_i = Aloc[k][2*m+iloc][l];
    719                 a_j = Aloc[k][2*m+jloc][l];
    720                 Aloc[k][2*m+iloc][l] =  c[m] * a_i + s[m] * a_j;
    721                 Aloc[k][2*m+jloc][l] = -s[m] * a_i + c[m] * a_j;
    722                 //fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, l, Aloc[k][2*m+iloc][l],l,Aloc[k][2*m+jloc][l]);
    723               }
    724             }
     1723          for (l=0;l<DiagData->AllocNum;l++) {
     1724            // Columns
     1725            a_i = gsl_matrix_get(DiagData->A[k],l,i);
     1726            a_j = gsl_matrix_get(DiagData->A[k],l,j);
     1727            gsl_matrix_set(DiagData->A[k], l, i,  c[0] * a_i + s[0] * a_j);
     1728            gsl_matrix_set(DiagData->A[k], l, j, -s[0] * a_i + c[0] * a_j);
    7251729          }
    726           // Shuffling of these round's columns to prepare next rotation set
    727           for (k=0;k<max_operators+1;k++) {// one extra for B matrix !
    728             // extract columns from A
    729             //debug(P,"extract columns from A");
    730             shiftcolumns(*comm, Aloc[k], AllocNum, max_rounds, k, tagS0, tagS1, tagR0, tagR1);
    731                      
    732           }       
    733           // and also for V ...
    734           //debug(P,"extract columns from U");
    735           shiftcolumns(*comm, Uloc, AllocNum, max_rounds, 0, tagS0, tagS1, tagR0, tagR1);
    736 
    737    
    738           // and merry-go-round for the indices too           
    739           //debug(P,"and merry-go-round for the indices too");
    740           music(top, bot, AllocNum/2);
     1730          //PrintGSLMatrix(P,A[k],AllocNum, "A (after rot)");
    7411731        }
    742  
    743         //fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
    744         // STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
    745         old_spread = Spread;
    746         spread = 0.;
    747         for(k=0;k<max_operators;k++) {   // go through all self-adjoint operators
    748           for (i=0; i < 2*max_rounds; i++) {  // go through all wave functions
    749               spread += Aloc[k][i][i+ProcRank*2*max_rounds]*Aloc[k][i][i+ProcRank*2*max_rounds];
    750               //spread += gsl_matrix_get(A[k],i,i)*gsl_matrix_get(A[k],i,i);
    751           }
     1732        //fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
     1733        //PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (before rot)");
     1734        // STEP 3e: apply U = R*U
     1735        for (l=0;l<DiagData->AllocNum;l++) {
     1736            a_i = gsl_matrix_get(DiagData->U,i,l);
     1737            a_j = gsl_matrix_get(DiagData->U,j,l);
     1738            gsl_matrix_set(DiagData->U, i, l,  c[0] * a_i + s[0] * a_j);
     1739            gsl_matrix_set(DiagData->U, j, l, -s[0] * a_i + c[0] * a_j);
    7521740        }
    753         MPI_Allreduce(&spread, &Spread, 1, MPI_DOUBLE, MPI_SUM, *comm);
    754         //Spread = spread;
    755         if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,Spread,R->EpsWannier);
    756         else fprintf(stderr,"%2.9e\n",Spread);
    757         // STEP 5: check change of variance
    758       } while (fabs(old_spread-Spread) >= R->EpsWannier);
    759       // end of iterative diagonalization loop: We have found our final orthogonal U!
    760 
    761       for (l=0;l<ProcNum;l++)
    762         rcounts[l] = AllocNum;
    763       //debug(P,"allgather U");
    764       for (round=0;round<2*max_rounds;round++) {
    765         for (l=0;l<ProcNum;l++)
    766           rdispls[l] = (l*2*max_rounds + round)*AllocNum;
    767         MPI_Allgatherv(Uloc[round], AllocNum, MPI_DOUBLE, Utotal, rcounts, rdispls, MPI_DOUBLE, *comm);
    768       }
    769       for (k=0;k<AllocNum;k++) {   
    770         for(l=0;l<AllocNum;l++) {
    771           gsl_matrix_set(U,k,l, Utotal[l+k*AllocNum]);
    772         }
    773       }
    774 
    775 
    776       // after one set, gather A[k] from all and calculate spread
    777       for (l=0;l<ProcNum;l++)
    778         rcounts[l] = AllocNum;
    779       //debug(P,"gather A[k] for spread");
    780       for (u=0;u<max_operators+1;u++) {// one extra for B matrix !
    781         //debug(P,"A[k] all gather");
    782         for (round=0;round<2*max_rounds;round++) {
    783           for (l=0;l<ProcNum;l++)
    784             rdispls[l] = (l*2*max_rounds + round)*AllocNum;
    785           MPI_Allgatherv(Aloc[u][round], AllocNum, MPI_DOUBLE, Atotal[u], rcounts, rdispls, MPI_DOUBLE, *comm);
    786         }
    787         for (k=0;k<AllocNum;k++) {   
    788           for(l=0;l<AllocNum;l++) {
    789             gsl_matrix_set(A[u],k,l, Atotal[u][l+k*AllocNum]);
    790           }
    791         }
    792       }
    793      
    794       // free eigenvector stuff     
    795       gsl_vector_free(h);
    796       gsl_matrix_free(G);
    797       gsl_eigen_symmv_free(w); 
    798       gsl_vector_free(eval);
    799       gsl_matrix_free(evec);
    800       // Free column vectors
    801       for (k=0;k<max_operators+1;k++) {
    802         Free(Atotal[k], "bla");
    803         Free(Around[k], "bla");
    804       }
    805       Free(Uround, "bla");
    806       Free(Utotal, "bla");
    807       Free(c_all, "bla");
    808       Free(s_all, "bla");
    809       Free(c, "bla");
    810       Free(s, "bla");
    811       Free(rcounts, "bla");
    812       Free(rdispls, "bla");
    813      
    814     } else {
    815      
    816       c = (double *) Malloc(sizeof(double), "ComputeMLWF: c");
    817       s = (double *) Malloc(sizeof(double), "ComputeMLWF: s");
    818       G = gsl_matrix_calloc (2,2);
    819       h = gsl_vector_alloc (2);
    820       eval = gsl_vector_alloc (2);
    821       evec = gsl_matrix_alloc (2,2);
    822       w = gsl_eigen_symmv_alloc(2);
    823       //debug(P,"now comes the iteration loop");
    824       it_steps=0;
    825       do {
    826         it_steps++;
    827         //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 3: Iteratively maximize negative spread part\n",P->Par.me);
    828         fprintf(stderr,"(%i) Beginning iteration %i ... ",P->Par.me,it_steps);
    829         for (set=0; set < AllocNum-1; set++) { // one column less due to column 0 stating at its place all the time
    830           //fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
    831           // STEP 3: for all index pairs 0<= i<j <AllocNum
    832           for (ProcRank=0;ProcRank<AllocNum/2;ProcRank++) {
    833             // get indices
    834             i = top[ProcRank] < bot[ProcRank] ? top[ProcRank] : bot[ProcRank]; // minimum of the two indices
    835             j = top[ProcRank] > bot[ProcRank] ? top[ProcRank] : bot[ProcRank]; // maximum of the two indices: thus j >  i
    836             //fprintf(stderr,"(%i),(%i,%i) STEP 3a\n",P->Par.me,i,j);
    837             // STEP 3a: Calculate G
    838             gsl_matrix_set_zero(G);
    839            
    840             for (k=0;k<max_operators;k++) { // go through all operators ...
    841               // Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
    842               //fprintf(stderr,"(%i) k%i [a_ii - a_ij] = %e - %e = %e\n",P->Par.me, k,gsl_matrix_get(A[k],i,i), gsl_matrix_get(A[k],j,j),gsl_matrix_get(A[k],i,i) - gsl_matrix_get(A[k],j,j));         
    843               //fprintf(stderr,"(%i) k%i [a_ij + a_jij] = %e - %e = %e\n",P->Par.me, k,gsl_matrix_get(A[k],i,j), gsl_matrix_get(A[k],j,i),gsl_matrix_get(A[k],i,j) + gsl_matrix_get(A[k],j,i));         
    844               gsl_vector_set(h, 0, gsl_matrix_get(A[k],i,i) - gsl_matrix_get(A[k],j,j));
    845               gsl_vector_set(h, 1, gsl_matrix_get(A[k],i,j) + gsl_matrix_get(A[k],j,i));
    846               //gsl_vector_complex_set(h, 2, gsl_complex_mul_imag(gsl_complex_add(gsl_matrix_complex_get(A[k],j,i), gsl_matrix_complex_get(A[k],i,j)),1));
    847              
    848               // Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
    849               for (l=0;l<2;l++)
    850                 for (m=0;m<2;m++)
    851                     gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
    852             }
    853          
    854             //fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
    855             // STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
    856             gsl_eigen_symmv(G, eval, evec, w);  // calculates eigenvalues and eigenvectors of G
    857        
    858             index = gsl_vector_max_index (eval);  // get biggest eigenvalue
    859             x = gsl_matrix_get(evec, 0, index);
    860             y = gsl_matrix_get(evec, 1, index) * x/fabs(x);
    861             //z = gsl_matrix_get(evec, 2, index) * x/fabs(x);
    862             x = fabs(x);  // ensure x>=0 so that rotation angles remain smaller Pi/4
    863          
    864             //fprintf(stderr,"(%i),(%i,%i) STEP 3c\n",P->Par.me,i,j);
    865             // STEP 3c: calculate R = [[c,s^\ast],[-s,c^\ast]]
    866             r = sqrt(x*x + y*y); // + z*z);
    867             c[0] = sqrt((x + r) / (2*r));
    868             s[0] = y / sqrt(2*r*(x+r)); //, -z / sqrt(2*r*(x+r)));
    869             //fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[0],s[0]);
    870          
    871             //fprintf(stderr,"(%i),(%i,%i) STEP 3d\n",P->Par.me,i,j);
    872             // STEP 3d: apply rotation R to rows and columns of A^{(k)}
    873             for (k=0;k<max_operators+1;k++) {// one extra for B matrix !
    874               for (l=0;l<AllocNum;l++) {
    875                 // Rows
    876                 a_i = gsl_matrix_get(A[k],i,l);
    877                 a_j = gsl_matrix_get(A[k],j,l);
    878                 gsl_matrix_set(A[k], i, l,  c[0] * a_i + s[0] * a_j);
    879                 gsl_matrix_set(A[k], j, l, -s[0] * a_i + c[0] * a_j);
    880               }
    881               for (l=0;l<AllocNum;l++) {
    882                 // Columns
    883                 a_i = gsl_matrix_get(A[k],l,i);
    884                 a_j = gsl_matrix_get(A[k],l,j);
    885                 gsl_matrix_set(A[k], l, i,  c[0] * a_i + s[0] * a_j);
    886                 gsl_matrix_set(A[k], l, j, -s[0] * a_i + c[0] * a_j);
    887               }
    888             }
    889             //fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
    890             // STEP 3e: apply U = R*U
    891             for (l=0;l<AllocNum;l++) {
    892                 a_i = gsl_matrix_get(U,i,l);
    893                 a_j = gsl_matrix_get(U,j,l);
    894                 gsl_matrix_set(U, i, l,  c[0] * a_i + s[0] * a_j);
    895                 gsl_matrix_set(U, j, l, -s[0] * a_i + c[0] * a_j);
    896             }
    897           }
    898           // and merry-go-round for the indices too           
    899           //debug(P,"and merry-go-round for the indices too");
    900           if (AllocNum > 2) music(top, bot, AllocNum/2);
    901         }
    902        
    903         //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
    904         // STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
    905         old_spread = spread;
    906         spread = 0;
    907         for(k=0;k<max_operators;k++) {   // go through all self-adjoint operators
    908           for (i=0; i < AllocNum; i++) {  // go through all wave functions
    909               spread += pow(gsl_matrix_get(A[k],i,i),2);
    910           }
    911         }
    912         if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,spread,R->EpsWannier);
    913         else fprintf(stderr,"%2.9e\n",spread);
    914         // STEP 5: check change of variance
    915       } while (fabs(old_spread-spread) >= R->EpsWannier);
    916       // end of iterative diagonalization loop: We have found our final orthogonal U!
    917       gsl_vector_free(h);
    918       gsl_matrix_free(G);
    919       gsl_eigen_symmv_free(w); 
    920       gsl_vector_free(eval);
    921       gsl_matrix_free(evec);
    922       Free(c, "bla");
    923       Free(s, "bla");
    924     } 
    925    
    926     if(P->Call.out[ReadOut]) {// && P->Par.me == 0) {
    927       //debug(P,"output total U");
    928       fprintf(stderr,"(%i) U_tot = \n",P->Par.me);
    929       for (k=0;k<Num;k++) {   
    930         for (l=0;l<Num;l++)
    931           fprintf(stderr,"%e\t",gsl_matrix_get(U,l,k));
    932         fprintf(stderr,"\n");
    933       }
    934     }
    935   }
    936   Free(top, "bla");
    937   Free(bot, "bla");
    938 
    939   if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Allocating buffer mem\n",P->Par.me);
    940   // STEP 6: apply transformation U to all wave functions \sum_i^Num U_ji | \phi_i \rangle = | \phi_j^\ast \rangle
    941   Num = Psi->TypeStartIndex[type+1] - Psi->TypeStartIndex[type]; // recalc Num as we can only work with local Psis from here
    942   fftw_complex **coeffs_buffer = Malloc(sizeof(fftw_complex *)*Num, "ComputeMLWF: **coeffs_buffer");
    943   for (l=0;l<Num;l++) // allocate for each local wave function
    944     coeffs_buffer[l] = LevS->LPsi->OldLocalPsi[l];
    945  
    946   if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation ...\n",P->Par.me);
    947   l=-1; // to access U matrix element (0..Num-1)
    948   k=-1; // to access the above swap coeffs_buffer (0..LocalNo-1)
    949   for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
    950     OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
    951     if (OnePsiA->PsiType == type) {   // drop all but occupied ones
    952       l++;  // increase l if it is occupied wave function
    953       if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // local?
    954         k++; // increase k only if it is a local, non-extra orbital wave function
    955         LPsiDatA = (fftw_complex *) coeffs_buffer[k];    // new coeffs first go to copy buffer, old ones must not be overwritten yet
    956         SetArrayToDouble0((double *)LPsiDatA, 2*LevS->MaxG);  // zero buffer part
    957       } else
    958         LPsiDatA = NULL;  // otherwise processes won't enter second loop, though they're supposed to send coefficients!
    959 
    960       m = -1; // to access U matrix element (0..Num-1)
    961       for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
    962         OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
    963         if (OnePsiB->PsiType == type) {   // drop all but occupied ones
    964           m++;  // increase m if it is occupied wave function
    965           if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
    966              LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
    967           else
    968              LOnePsiB = NULL;
    969           if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
    970             RecvSource = OnePsiB->my_color_comm_ST_Psi;
    971             MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
    972             LPsiDatB=LevS->LPsi->TempPsi;
    973           } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
    974             for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
    975               if (p != OnePsiB->my_color_comm_ST_Psi)
    976                 MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
    977             LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
    978           } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
    979 
    980           if (LPsiDatA != NULL) {
    981             double tmp = gsl_matrix_get(U,l,m);
    982             g=0;
    983             if (LevS->GArray[0].GSq == 0.0) {
    984               LPsiDatA[g].re += LPsiDatB[g].re * tmp;
    985               LPsiDatA[g].im += LPsiDatB[g].im * tmp;
    986               g++;
    987             }
    988             for (; g < LevS->MaxG; g++) {
    989               LPsiDatA[g].re += LPsiDatB[g].re * tmp;
    990               LPsiDatA[g].im += LPsiDatB[g].im * tmp;
    991             }
    992           }
    993         }
    994       }
    995     }
    996   }
    997   gsl_matrix_free(U);
    998 
    999   if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) STEP 6: Swapping buffer mem\n",P->Par.me);
    1000   // now, as all wave functions are updated, swap the buffer
    1001   l = -1;
    1002   for (k=0;k<Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT;k++) { // go through each local occupied wave function
    1003     if (Psi->AllPsiStatus[k].PsiType == type && Psi->AllPsiStatus[k].my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {
    1004       l++;
    1005       if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) (k:%i,l:%i) LocalNo = (%i,%i)\t AllPsiNo = (%i,%i)\n", P->Par.me, k,l,Psi->LocalPsiStatus[l].MyLocalNo, Psi->LocalPsiStatus[l].MyGlobalNo, Psi->AllPsiStatus[k].MyLocalNo, Psi->AllPsiStatus[k].MyGlobalNo);
    1006       LPsiDatA = (fftw_complex *)coeffs_buffer[l];
    1007       LPsiDatB = LevS->LPsi->LocalPsi[l];
    1008       for (g=0;g<LevS->MaxG;g++) {
    1009         LPsiDatB[g].re = LPsiDatA[g].re;
    1010         LPsiDatB[g].im = LPsiDatA[g].im;
    1011       }
    1012       // recalculating non-local form factors which are coefficient dependent!
    1013       CalculateNonLocalEnergyNoRT(P, Psi->LocalPsiStatus[l].MyLocalNo);
    1014     }
    1015   }
    1016   // and free allocated buffer memory
    1017   Free(coeffs_buffer, "bla");
    1018 
    1019   if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 7\n",P->Par.me);
    1020   // STEP 7: Compute Wannier centers, spread and printout
    1021   // the spread for x,y,z resides in the respective diagonal element of A_.. for each orbital
     1741        //PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (after rot)");
     1742      }
     1743      // and merry-go-round for the indices too           
     1744      //debug(P,"and merry-go-round for the indices too");
     1745      if (DiagData->AllocNum > 2) MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
     1746    }
     1747   
     1748    //if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
     1749    // STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
     1750    old_spread = spread;
     1751    spread = 0.;
     1752    for(k=0;k<DiagData->NumMatrices;k++) {   // go through all self-adjoint operators
     1753      for (i=0; i < DiagData->AllocNum; i++) {  // go through all wave functions
     1754          spread += pow(gsl_matrix_get(DiagData->A[k],i,i),2);
     1755      }
     1756    }
     1757    if(P->Par.me == 0) {
     1758      //if(P->Call.out[ReadOut])
     1759      //  fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,spread,R->EpsWannier);
     1760      //else
     1761        //fprintf(stderr,"%2.9e\n",spread);
     1762    }
     1763    // STEP 5: check change of variance
     1764  } while (fabs(old_spread-spread) >= R->EpsWannier);
     1765  // end of iterative diagonalization loop: We have found our final orthogonal U!
     1766  Free(c, "SerialDiagonalization: c");
     1767  Free(s, "SerialDiagonalization: s");
     1768  // free eigenvector stuff     
     1769  gsl_vector_free(h);
     1770  gsl_matrix_free(G);
     1771  //gsl_eigen_symmv_free(w); 
     1772  gsl_vector_free(eval);
     1773  gsl_matrix_free(evec);
     1774}
     1775
     1776/** Writes the wannier centres and spread to file.
     1777 * Also puts centres from array into OnePsiElementAddData structure.
     1778 * \param *P Problem at hand
     1779 * \param old_spread first term of wannier spread
     1780 * \param spread second term of wannier spread
     1781 * \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
     1782 * \param *WannierSpread array with wannier spread per wave function
     1783 */
     1784void WriteWannierFile(struct Problem *P, double spread, double old_spread, double **WannierCentre, double *WannierSpread)
     1785{
     1786  struct FileData *F = &P->Files;
     1787  struct RunStruct *R = &P->R;
     1788  struct Lattice *Lat = &P->Lat;
     1789  struct Psis *Psi = &Lat->Psi;
     1790  struct OnePsiElement *OnePsiA;
     1791  char spin[12], suffix[18];
     1792  int i,j,l;
     1793
    10221794  if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Spread printout\n", P->Par.me);
    10231795
    10241796  switch (Lat->Psi.PsiST) {
    10251797  case SpinDouble:
    1026     strncpy(suffix,".spread.csv",18);   
    1027     strncat(spin,"SpinDouble",12);
     1798    strcpy(suffix,".spread.csv");   
     1799    strcpy(spin,"SpinDouble");
    10281800    break;
    10291801  case SpinUp:
    1030     strncpy(suffix,".spread_up.csv",18);   
    1031     strncat(spin,"SpinUp",12);
     1802    strcpy(suffix,".spread_up.csv");   
     1803    strcpy(spin,"SpinUp");
    10321804    break;
    10331805  case SpinDown:
    1034     strncpy(suffix,".spread_down.csv",18);   
    1035     strncat(spin,"SpinDown",12);
     1806    strcpy(suffix,".spread_down.csv");   
     1807    strcpy(spin,"SpinDown");
    10361808    break;
    10371809  }
     
    10421814      OpenFile(P, &F->SpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
    10431815    if (F->SpreadFile == NULL) {
    1044       Error(SomeError,"ComputeMLWF: Error opening Wannier File!\n");
     1816      Error(SomeError,"WriteWannierFile: Error opening Wannier File!\n");
    10451817    } else {
    10461818      fprintf(F->SpreadFile,"#===== W A N N I E R  C E N T R E S for Level %d of type %s ========================\n", R->LevSNo, spin);
    10471819      fprintf(F->SpreadFile,"# Orbital+Level\tx\ty\tz\tSpread\n");
    10481820    }
    1049   }
    1050   old_spread = 0;
    1051   spread = 0;
    1052   i=-1;
    1053   for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) {  // go through all wave functions
    1054     OnePsiA = &Psi->AllPsiStatus[l];    // grab OnePsiA
    1055     if (OnePsiA->PsiType == type) {   // drop all but occupied ones
    1056       i++;  // increase l if it is occupied wave function
    1057       //fprintf(stderr,"(%i) Wannier for %i\n", P->Par.me, i);
    1058      
    1059       // calculate Wannier Centre
    1060       for (j=0;j<NDIM;j++) {
    1061         WannierCentre[i][j] = sqrt(Lat->RealBasisSQ[j])/(2*PI) * GSL_IMAG( gsl_complex_log( gsl_complex_rect(gsl_matrix_get(A[j*2],i,i),gsl_matrix_get(A[j*2+1],i,i))));
    1062         if (WannierCentre[i][j] < 0) // change wrap around of above operator to smooth 0...Lat->RealBasisSQ
    1063           WannierCentre[i][j] = sqrt(Lat->RealBasisSQ[j]) + WannierCentre[i][j];
    1064       }
    1065      
    1066       // store orbital spread and centre in file
    1067       tmp = - pow(gsl_matrix_get(A[0],i,i),2) - pow(gsl_matrix_get(A[1],i,i),2)
    1068             - pow(gsl_matrix_get(A[2],i,i),2) - pow(gsl_matrix_get(A[3],i,i),2)
    1069             - pow(gsl_matrix_get(A[4],i,i),2) - pow(gsl_matrix_get(A[5],i,i),2);         
    1070       WannierSpread[i] = gsl_matrix_get(A[max_operators],i,i) + tmp;
    1071       //fprintf(stderr,"(%i) WannierSpread[%i] = %e\n", P->Par.me, i, WannierSpread[i]);
    1072       //if (P->Par.me == 0) fprintf(F->SpreadFile,"Orbital %d:\t Wannier center (x,y,z)=(%lg,%lg,%lg)\t Spread sigma^2 = %lg - %lg = %lg\n",
    1073         //Psi->AllPsiStatus[i].MyGlobalNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], gsl_matrix_get(A[max_operators],i,i), -tmp, WannierSpread[i]);
    1074       //if (P->Par.me == 0) fprintf(F->SpreadFile,"%e\t%e\t%e\n",
    1075         //WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2]);
    1076      
    1077       // gather all spreads
    1078       old_spread += gsl_matrix_get(A[max_operators],i,i); // tr(U^H B U)
    1079       for (k=0;k<max_operators;k++)
    1080         spread += pow(gsl_matrix_get(A[k],i,i),2);
    1081 
    1082       // store calculated Wannier centre
    1083       if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // is this local?
    1084         for (j=0;j<NDIM;j++)
    1085           Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[i][j];
    1086     }
    1087   }
    1088  
    1089   // join Wannier orbital to groups with common centres under certain conditions
    1090   switch (R->CommonWannier) {
    1091    case 4:
    1092     debug(P,"Shifting each Wannier centers to cell center");
    1093     for (i=0; i < Num; i++) {  // go through all occupied wave functions
    1094       for (j=0;j<NDIM;j++)
    1095         WannierCentre[i][j] = sqrt(Lat->RealBasisSQ[j])/2.;
    1096     }   
    1097     break;
    1098    case 3:
    1099     debug(P,"Shifting Wannier centers individually to nearest grid point");
    1100     for (i=0;i < Num; i++) {  // go through all wave functions
    1101       for (j=0;j<NDIM;j++) {
    1102         tmp = WannierCentre[i][j]/sqrt(Lat->RealBasisSQ[j])*(double)N[j];
    1103         //fprintf(stderr,"(%i) N[%i]: %i\t tmp %e\t floor %e\t ceil %e\n",P->Par.me, j, N[j], tmp, floor(tmp), ceil(tmp));
    1104         if (fabs((double)floor(tmp) - tmp) < fabs((double)ceil(tmp) - tmp))
    1105           WannierCentre[i][j] = (double)floor(tmp)/(double)N[j]*sqrt(Lat->RealBasisSQ[j]);
    1106         else
    1107           WannierCentre[i][j] = (double)ceil(tmp)/(double)N[j]*sqrt(Lat->RealBasisSQ[j]);
    1108       }
    1109     }
    1110     break;
    1111    case 2:
    1112     debug(P,"Combining individual orbitals according to spread.");
    1113     //fprintf(stderr,"(%i) Finding multiple bindings and Reweighting Wannier centres\n",P->Par.me);
    1114     //debug(P,"finding partners");
    1115     marker = (int*) Malloc(sizeof(int)*(Num+1),"ComputeMLWF: marker");
    1116     group = (int**) Malloc(sizeof(int *)*Num,"ComputeMLWF: group");
    1117     for (l=0;l<Num;l++) {
    1118       group[l] = (int*) Malloc(sizeof(int)*(Num+1),"ComputeMLWF: group[l]");  // each must group must have one more as end marker
    1119       for (k=0;k<=Num;k++)
    1120         group[l][k] = -1; // reset partner group
    1121     }
    1122     for (k=0;k<Num;k++)
    1123       partner[k] = 0;
    1124     //debug(P,"mem allocated");
    1125     // go for each orbital through every other, check distance against the sum of both spreads
    1126     // if smaller add to group of this orbital
    1127     for (l=0;l<Num;l++) {
    1128       j=0;  // index for partner group
    1129       for (k=0;k<Num;k++) {  // check this against l
    1130         Spread = 0.;
    1131         for (i=0;i<NDIM;i++) {
    1132           //fprintf(stderr,"(%i) Spread += (%e - %e)^2 \n", P->Par.me, WannierCentre[l][i], WannierCentre[k][i]);
    1133           Spread += (WannierCentre[l][i] - WannierCentre[k][i])*(WannierCentre[l][i] - WannierCentre[k][i]);
    1134         }
    1135         Spread = sqrt(Spread);  // distance in Spread
    1136         //fprintf(stderr,"(%i) %i to %i: distance %e, SpreadSum = %e + %e = %e \n", P->Par.me, l, k, Spread, WannierSpread[l], WannierSpread[k], WannierSpread[l]+WannierSpread[k]);
    1137         if (Spread < 1.5*(WannierSpread[l]+WannierSpread[k])) {// if distance smaller than sum of spread
    1138           group[l][j++] = k;  // add k to group of l
    1139           partner[l]++;
    1140           //fprintf(stderr,"(%i) %i added as %i-th member to %i's group.\n", P->Par.me, k, j, l);
    1141         }
    1142       }
    1143     }
    1144    
    1145     // consistency, for each orbital check if this orbital is also in the group of each referred orbital
    1146     //debug(P,"checking consistency");
    1147     totalflag = 1;
    1148     for (l=0;l<Num;l++) // checking l's group
    1149       for (k=0;k<Num;k++) { // k is partner index
    1150         if (group[l][k] != -1) {  // if current index k is a partner
    1151           flag = 0;
    1152           for(j=0;j<Num;j++) {  // go through each entry in l partner's partner group if l exists
    1153             if ((group[ group[l][k] ][j] == l))
    1154               flag = 1;
    1155           }
    1156           //if (flag == 0) fprintf(stderr, "(%i) in %i's group %i is referred as a partner, but not the other way round!\n", P->Par.me, l, group[l][k]);
    1157           if (totalflag == 1) totalflag = flag;
    1158         }
    1159       }
    1160     // for each orbital group (marker group) weight each center to a total and put this into the local WannierCentres
    1161     //debug(P,"weight and calculate new centers for partner groups");
    1162     for (l=0;l<=Num;l++)
    1163       marker[l] = 1;
    1164     if (totalflag) {
    1165       for (l=0;l<Num;l++) { // go through each orbital
    1166         if (marker[l] != 0) { // if it hasn't been reweighted
    1167           marker[l] = 0;
    1168           for (i=0;i<NDIM;i++)
    1169             q[i] = 0.;
    1170           j = 0;
    1171           while (group[l][j] != -1) {
    1172             marker[group[l][j]] = 0;
    1173             for (i=0;i<NDIM;i++) {
    1174               //fprintf(stderr,"(%i) Adding to %i's group, %i entry of %i: %e\n", P->Par.me, l, i, group[l][j], WannierCentre[ group[l][j] ][i]);
    1175               q[i] += WannierCentre[ group[l][j] ][i];
    1176             }
    1177             j++;
    1178           }
    1179           //fprintf(stderr,"(%i) %i's group: (%e,%e,%e)/%i = (%e,%e,%e)\n", P->Par.me, l, q[0], q[1], q[2], j, q[0]/(double)j, q[1]/(double)j, q[2]/(double)j);
    1180           for (i=0;i<NDIM;i++) {// weight by number of elements in partner group
    1181             q[i] /= (double)(j);
    1182           }
    1183  
    1184           // put WannierCentre into own and all partners'     
    1185           for (i=0;i<NDIM;i++)
    1186             WannierCentre[l][i] = q[i];
    1187           j = 0; 
    1188           while (group[l][j] != -1) {
    1189             for (i=0;i<NDIM;i++)
    1190               WannierCentre[group[l][j]][i] = q[i];
    1191             j++;
    1192           }
    1193         }
    1194       }
    1195     }
    1196     if (P->Call.out[StepLeaderOut]) {
    1197       fprintf(stderr,"Summary:\n");
    1198       fprintf(stderr,"========\n");
    1199       for (i=0;i<Num;i++)
    1200         fprintf(stderr,"%i belongs to a %i-ple binding.\n",i,partner[i]);
    1201     }
    1202     //debug(P,"done");
    1203  
    1204     Free(marker, "bla");
    1205     for (l=0;l<Num;l++)
    1206       Free(group[l], "bla");
    1207     Free(group, "bla");
    1208     break;
    1209    case 1:
    1210     debug(P,"Individual orbitals are changed to center of all.");
    1211     for (i=0;i<NDIM;i++)  // zero center of weight
    1212       q[i] = 0.;
    1213     for (k=0;k<Num;k++)
    1214       for (i=0;i<NDIM;i++) {  // sum up all orbitals each component
    1215         q[i] += WannierCentre[k][i];
    1216       }
    1217     for (i=0;i<NDIM;i++)  // divide by number
    1218       q[i] /= Num;
    1219     for (k=0;k<Num;k++)
    1220       for (i=0;i<NDIM;i++) {  // put into this function's array
    1221         WannierCentre[k][i] = q[i];
    1222       }
    1223     break;
    1224    case 0:
    1225    default:
    1226    break;
    12271821  }
    12281822 
     
    12441838    fprintf(F->SpreadFile,"\n#Matrix traces\tB_ii\tA_ii^2\tTotal (B_ii - A_ii^2)\n");
    12451839    fprintf(F->SpreadFile,"TotalSpread_L%d\t%lg\t%lg\t%lg\n\n",R->LevSNo, old_spread, spread, old_spread - spread);
    1246   }
    1247   fflush(F->SpreadFile);
    1248 
    1249   // and the spread was calculated in the loop above
    1250 /*
    1251   i=-1;     
    1252   for (l=0;l<Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT;l++)
    1253     if (Psi->AllPsiStatus[l].PsiType == type) {
    1254       i++;
    1255       spread = CalculateSpread(P,l);
    1256       tmp = gsl_matrix_get(A[max_operators],i,i) - pow(gsl_matrix_get(A[0],i,i),2) - pow(gsl_matrix_get(A[1],i,i),2)
    1257           - pow(gsl_matrix_get(A[2],i,i),2) - pow(gsl_matrix_get(A[3],i,i),2)
    1258           - pow(gsl_matrix_get(A[4],i,i),2) - pow(gsl_matrix_get(A[5],i,i),2);
    1259       if(P->Call.out[ValueOut]) fprintf(stderr, "(%i) Check of spread of %ith wave function: %lg against %lg\n",P->Par.me, i, Psi->AddData[i].WannierSpread, tmp); 
    1260     }*/
    1261   // free all remaining memory
    1262   for (k=0;k<max_operators+1;k++)
    1263     gsl_matrix_free(A[k]);
    1264 }
     1840    fflush(F->SpreadFile);
     1841  }
     1842}
     1843
    12651844
    12661845/** Parses the spread file and puts values into OnePsiElementAddData#WannierCentre.
     
    12801859  double WannierCentre[NDIM+1]; // combined centre and spread
    12811860  MPI_Status status;
    1282   enum PsiTypeTag type = Occupied;
    12831861  int signal = 0; // 1 - ok, 0 - error
    12841862
    12851863  switch (Lat->Psi.PsiST) {
    12861864  case SpinDouble:
    1287     strncpy(suffix,".spread.csv",18);   
     1865    strcpy(suffix,".spread.csv");   
    12881866    break;
    12891867  case SpinUp:
    1290     strncpy(suffix,".spread_up.csv",18);   
     1868    strcpy(suffix,".spread_up.csv");   
    12911869    break;
    12921870  case SpinDown:
    1293     strncpy(suffix,".spread_down.csv",18);   
     1871    strcpy(suffix,".spread_down.csv");   
    12941872    break;
    12951873  }
     1874  if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Parsing Wannier Centres from file ... \n", P->Par.me);
    12961875 
    12971876  if (P->Par.me_comm_ST == 0) {
     
    13541933          //return 0;
    13551934        // and store 'em (for all who have this Psi local)
    1356         fprintf(stderr,"(%i) Psi %i, L %i: (x,y,z) = (%lg, %lg, %lg), Spread %lg\n",P->Par.me,i, R->LevSNo, WannierCentre[0], WannierCentre[1], WannierCentre[2], WannierCentre[NDIM]);
     1935        if ((P->Par.me == 0) && P->Call.out[ValueOut]) fprintf(stderr,"(%i) Psi %i, L %i: (x,y,z) = (%lg, %lg, %lg), Spread %lg\n",P->Par.me,i, R->LevSNo, WannierCentre[0], WannierCentre[1], WannierCentre[2], WannierCentre[NDIM]);
    13571936        for (j=0;j<NDIM;j++) Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[j];
    13581937        Psi->AddData[OnePsiA->MyLocalNo].WannierSpread = WannierCentre[NDIM];
    1359         if (P->Call.out[ValueOut]) fprintf(stderr,"(%i) %s\t%lg\t%lg\t%lg\t\t%lg\n",P->Par.me, tagname,Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[0],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[1],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[2],Psi->AddData[OnePsiA->MyLocalNo].WannierSpread);
     1938        //if (P->Par.me ==0 && P->Call.out[ValueOut]) fprintf(stderr,"(%i) %s\t%lg\t%lg\t%lg\t\t%lg\n",P->Par.me, tagname,Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[0],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[1],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[2],Psi->AddData[OnePsiA->MyLocalNo].WannierSpread);
    13601939      } else if (P->Par.me_comm_ST == 0) { // if they are not local, yet we are process 0, send 'em to leader of its Psi group
    13611940        if (MPI_Send(WannierCentre, NDIM+1, MPI_DOUBLE, OnePsiA->my_color_comm_ST_Psi, ParseWannierTag, P->Par.comm_ST_PsiT) != MPI_SUCCESS)
     
    13671946  if ((SpreadFile != NULL) && (P->Par.me_comm_ST == 0))
    13681947    fclose(SpreadFile);
    1369   fprintf(stderr,"(%i) Parsing Wannier files succeeded!\n", P->Par.me);
     1948  //fprintf(stderr,"(%i) Parsing Wannier files succeeded!\n", P->Par.me);
    13701949  return 1;
    13711950}
     
    14081987  b_ij = 0;
    14091988
    1410   // create lookup table for sin/cos values
    1411   cos_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *cos_lookup");
    1412   sin_lookup = (double **) Malloc(sizeof(double *)*NDIM, "ComputeMLWF: *sin_lookup");
    1413   for (k=0;k<NDIM;k++) {
    1414     // allocate memory
    1415     cos_lookup[k] = (double *) Malloc(sizeof(double)*LevS->Plan0.plan->N[k], "ComputeMLWF: cos_lookup");
    1416     sin_lookup[k] = (double *) Malloc(sizeof(double)*LevS->Plan0.plan->N[k], "ComputeMLWF: sin_lookup");
    1417     // reset arrays
    1418     SetArrayToDouble0(cos_lookup[k],LevS->Plan0.plan->N[k]);
    1419     SetArrayToDouble0(sin_lookup[k],LevS->Plan0.plan->N[k]);
    1420     // create lookup values
    1421     for (g=0;g<LevS->Plan0.plan->N[k];g++) {
    1422       tmp = 2*PI/(double)LevS->Plan0.plan->N[k]*(double)g;
    1423       cos_lookup[k][g] = cos(tmp);
    1424       sin_lookup[k][g] = sin(tmp);
    1425     }
    1426   } 
     1989  CreateSinCosLookupTable(&cos_lookup,&sin_lookup,N);
     1990
    14271991  // fill matrices
    14281992  OnePsiA = &Psi->AllPsiStatus[i];    // grab the desired OnePsiA
     
    14992063  // store spread in OnePsiElementAdd
    15002064  Psi->AddData[i].WannierSpread = B_ij - spread;
    1501   // free lookups
    1502   for (k=0;k<NDIM;k++) {
    1503     Free(cos_lookup[k], "bla");
    1504     Free(sin_lookup[k], "bla");
    1505   }
    1506   Free(cos_lookup, "bla");
    1507   Free(sin_lookup, "bla");
    1508 
     2065
     2066  FreeSinCosLookupTable(cos_lookup,sin_lookup);
     2067 
    15092068  return (B_ij - spread);
    15102069}
  • pcp/src/wannier.h

    r9bdd86 r4f1369  
    1212
    1313  File: wannier.h
    14   $Id: wannier.h,v 1.8 2007/02/05 14:00:58 foo Exp $
     14  $Id: wannier.h,v 1.3 2007-10-08 15:43:29 heber Exp $
    1515*/
    1616
    1717#include <gsl/gsl_complex.h>
    1818
     19/** Structure contains all variables needed for diagonalization of a matrix with
     20 * SerialDiagonalization() or ParallelDiagonalization()
     21 */
     22struct DiagonalizationData {
     23  int Num;        //!< Number of rows/columns
     24  int AllocNum;   //!< even number of rows/columns
     25  int NumMatrices;//!< number of matrices to be simultaneously "actively" diagonalized
     26  int extra;      //!< number of additional matrices that are also, yet "passively" diagonalized (not considered in rotation angle evaluation)
     27  gsl_matrix *U;   //!< transformation matrix
     28  gsl_matrix **A;  //!< matrix to be diagonlized
     29  int *top;       //!< merry-go-round top row of indices
     30  int *bot;       //!< merry-go-round bottom row of indices
     31  MPI_Comm *comm; //!< MPI communicator for ParallelDiagonalization()
     32  int ProcRank;   //!< Rank of this process, used in ParallelDiagonalization()
     33  int ProcNum;    //!< Number of process in communicator, used in ParallelDiagonalization()
     34};
     35
     36void PrintGSLMatrix(struct Problem *P, gsl_matrix *U, int Num, const char *msg);
    1937void ComputeMLWF(struct Problem *P);
     38void WriteWannierFile(struct Problem *P, double spread, double old_spread, double **WannierCentre, double *WannierSpread);
    2039int ParseWannierFile(struct Problem *P);
     40void SerialDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData);
     41void ParallelDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData);
    2142double CalculateSpread(struct Problem *P, int i);
    2243gsl_complex convertComplex (fftw_complex a);
     44void InitDiagonalization(struct Problem *P, struct DiagonalizationData *DiagData, int Num, int NumMatrices, int extra);
     45void FreeDiagonalization(struct DiagonalizationData *DiagData);
     46void OrthogonalizePsis(struct Problem *P);
     47void StrongOrthogonalizePsis(struct Problem *P);
     48void Diagonalize(struct Problem *P, struct DiagonalizationData *DiagData);
     49
     50void CalculateSecondOrderReciprocalMoment(struct Problem *P);
    2351
    2452#endif /*WANNIER_H_*/
Note: See TracChangeset for help on using the changeset viewer.