/** \file gramsch.c
 * Gram-Schmidt-Orthonormalisation.
 * Herein are all the functions necessary to orthogonalize and normalize the wave
 * functions OnePsiElement, such as initialization FirstInitGramSchData(), norm
 * GramSchNormalize(), scalar product GramSchSP() and the actual Gram-Schmidt-routine 
 * GramSch(). All depending on the current status of the wave function.
 * 
  Project: ParallelCarParrinello
 \author Jan Hamaekers
 \date 2000

  File: gramsch.c
  $Id: gramsch.c,v 1.70.2.1 2007-04-21 12:49:50 foo Exp $
*/

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>

// use double precision fft when we have it
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef HAVE_DFFTW_H
#include "dfftw.h"
#else
#include "fftw.h"
#endif

#include "data.h"
#include "helpers.h"
#include "errors.h"
#include "myfft.h"
#include "mymath.h"
#include "gramsch.h"
#include "run.h"
#include "mergesort2.h"

/** Deallocates the defined OnePsiElement datatype.
 */
void FreeMPI_OnePsiElement()
{
  MPI_Type_free(&MPI_OnePsiElement);
}

/** Initialization of Gram-Schmidt-Orthogonalization.
 * \param *P Problem at hand
 * \param *Psi wave functions
 * \sa RemoveEverything()
 */
void FirstInitGramSchData(struct Problem *P, struct Psis *Psi) {
  int i, type;
  int GramSchLocalNo = Psi->LocalNo+1;
  MPI_Datatype type1[10] = { MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_UB}; // type of each OPE array element
  int blocklen1[10] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; // block length of each element within the OPE array
  MPI_Aint base, disp1[10];  // holds adresses in memory
  struct OnePsiElement OPE[2];

  /// Create MPI_OnePsiElement, simulacrum of OnePsiElement, enabling exchange of these among the processes
  // store adresses of its various elements in disp1 array
  MPI_Address( OPE, disp1);
  MPI_Address( &OPE[0].my_color_comm_ST_Psi, disp1+1);
  MPI_Address( &OPE[0].MyLocalNo, disp1+2);
  MPI_Address( &OPE[0].MyGlobalNo, disp1+3);
  MPI_Address( &OPE[0].PsiGramSchStatus, disp1+4);
  MPI_Address( &OPE[0].PsiType, disp1+5);
  MPI_Address( &OPE[0].PsiFactor, disp1+6);
  MPI_Address( &OPE[0].PsiReciNorm2, disp1+7);
  MPI_Address( &OPE[0].DoBrent, disp1+8);
  MPI_Address( OPE+1, disp1+9);
  base = disp1[0];
  for (i=0; i < 10; i++) disp1[i] -= base; // make the adresses of OPE elements relativ to base -> byte displacement of each entry
  MPI_Type_struct( 10, blocklen1, disp1, type1, &MPI_OnePsiElement); // creates MPI_OnePsiElement as an MPI_struct(ure)
  MPI_Type_commit( &MPI_OnePsiElement);   // commits new data type, now it's usable
  if (P->Call.out[NormalOut]) fprintf(stderr, "(%i)FirstInitGramSchData\n", P->Par.me);

  /// Allocates and fills Psis::AllLocalNo (MPI_Allgathered from all other processes).
  Psi->AllLocalNo = (int *)
    Malloc(sizeof(int)*P->Par.Max_me_comm_ST_PsiT,"FirstInitGramSchData: Psi->AllLocalNo");
  MPI_Allgather ( &GramSchLocalNo, 1, MPI_INT,
		  Psi->AllLocalNo, 1, MPI_INT, P->Par.comm_ST_PsiT );

  /// Calculates from this Psis::MaxPsiOfType.
  Psi->MaxPsiOfType = 0;
  for (i=0;i<P->Par.Max_me_comm_ST_PsiT;i++) Psi->MaxPsiOfType += Psi->AllLocalNo[i]-1; // sum up all local (orthogonalizable) Psis in the transposed communicator PsiT
  fprintf(stderr,"(%i) MaxPsiOfType = %i\n",P->Par.me, Psi->MaxPsiOfType);
  
  /// Calculates from this Psis::MaxPsiOfType and at which index this process' Psis start Psis::MyStartNo.
  Psi->MyStartNo = 0;
  for (i=0;i<P->Par.me_comm_ST_PsiT;i++) Psi->MyStartNo += Psi->AllLocalNo[i];  // where do my Psis start
  fprintf(stderr,"(%i) MyStartNo = %i\n",P->Par.me, Psi->MyStartNo);
  
  //fprintf(stderr,"(%i) OtherPsiLocalNo %d\n",P->Par.me, RecvCount);
  /// Allocates arrays Psis::AllPsiStatus, Psis::AllPsiStatusForSort and Psis::LocalPsiStatus (up 'til Extra in PsiTagType)
  Psi->AllPsiStatus = (struct OnePsiElement *)
    Malloc(sizeof(struct OnePsiElement)*(Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT),"FirstInitGramSchData: Psi->AllPsiStatus");
  Psi->AllPsiStatusForSort = (struct OnePsiElement *)
    Malloc(sizeof(struct OnePsiElement)*(Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT+1),"FirstInitGramSchData: Psi->AllPsiStatusForSort");
  Psi->LocalPsiStatus = (struct OnePsiElement *)
    Malloc(sizeof(struct OnePsiElement)*GramSchLocalNo,"FirstInitGramSchData: Psi->LocalPsiStatus");
  /// Psis::LocalPsiStatus is initialized and distributed among all processes as Psis::AllPsiStatus.
  for (i=0;i<GramSchLocalNo;i++) {
    Psi->LocalPsiStatus[i].me_comm_ST_Psi = P->Par.me_comm_ST_Psi;
    Psi->LocalPsiStatus[i].my_color_comm_ST_Psi = P->Par.my_color_comm_ST_Psi;
    Psi->LocalPsiStatus[i].MyLocalNo = i;
    Psi->LocalPsiStatus[i].MyGlobalNo = Psi->MyStartNo + i;
    Psi->LocalPsiStatus[i].DoBrent = 4;
    switch (Psi->PsiST) { // set occupation number for the regular local, one extra(!) per process (without current one!) and the additional orbitals (the "latterest" ;) are set to zero of course) (NOTE: extra orbit must always be the very last one (that's why Par->.. - 1)
      case SpinDouble:
        for (type=Occupied;type<=Extra;type++) {
          if (i >= Psi->TypeStartIndex[type] && i < Psi->TypeStartIndex[type+1]) {
            Psi->LocalPsiStatus[i].PsiType = type;
            Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)(type != Occupied ? NotUsedToOrtho : NotOrthogonal);  // extra or occupied wave function
          }
        }
        if (Psi->LocalPsiStatus[i].PsiType != UnOccupied)
          Psi->LocalPsiStatus[i].PsiFactor = 2.0;
        else
          Psi->LocalPsiStatus[i].PsiFactor = 1.0;
        break;
      case SpinUp:
        for (type=Occupied;type<=Extra;type++) {
          if (i >= Psi->TypeStartIndex[type] && i < Psi->TypeStartIndex[type+1]) {
            Psi->LocalPsiStatus[i].PsiType = type;
            Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)(type != Occupied ? NotUsedToOrtho : NotOrthogonal);  // extra or occupied wave function
          }
        }
        Psi->LocalPsiStatus[i].PsiFactor = 1.0;
        break;
      case SpinDown:
        for (type=Occupied;type<=Extra;type++) {
          if (i >= Psi->TypeStartIndex[type] && i < Psi->TypeStartIndex[type+1]) {
            Psi->LocalPsiStatus[i].PsiType = type;
            Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)(type != Occupied ? NotUsedToOrtho : NotOrthogonal);  // extra or occupied wave function
          }
        }
        Psi->LocalPsiStatus[i].PsiFactor = 1.0;
        break;
    }
    Psi->LocalPsiStatus[i].PsiReciNorm2 = 0.0; 
  }
  
  // Update AllPsiStatus from changed LocalPsiStatus
  UpdateGramSchAllPsiStatus(P,Psi);  

  /// Psis::TempSendA, Psis::AllActualLocalPsiNo and Psis::AllOldActualLocalPsiNo are allocated, the latter two zeroed.
  Psi->TempSendA = (int *)
    Malloc(sizeof(int)*P->Par.Max_me_comm_ST_PsiT,"FirstInitGramSchData: Psi->TempSendA");
  Psi->AllActualLocalPsiNo = (int *)
    Malloc(sizeof(int)*P->Par.Max_me_comm_ST_PsiT,"FirstInitGramSchData: Psi->AllActualLocalPsiNo");
  Psi->AllOldActualLocalPsiNo = (int *)
    Malloc(sizeof(int)*P->Par.Max_me_comm_ST_PsiT,"FirstInitGramSchData: Psi->AllOldActualLocalPsiNo");
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    Psi->AllActualLocalPsiNo[i] = 0;
    Psi->AllOldActualLocalPsiNo[i] = 0;
  }
}

/** Normalize the coefficients of a given wave function.
 * Calculates the norm (see GramSchGetNorm2()) and divides each (for all reciprocal grid
 * vectors) complex coefficient by the norm.
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure
 * \param *LPsiDat Array of complex wave function coefficients
 * \param PsiSP If norm already calculated, can be passed on here, otherweise (== 0.0) is calculated
 * \return Squared norm of wave function
 */
static double GramSchNormalize(const struct Problem *P, struct LatticeLevel *Lev, fftw_complex *LPsiDat, double PsiSP) {
  double LocalSP=0.0;
  int i,s = 0;
  /* Falls PsiSP == 0.0 dann noch SP berechnen */
  if (PsiSP == 0.0) { // see GramSchGetNorm2()
    if (Lev->GArray[0].GSq == 0.0) {
      LocalSP += LPsiDat[0].re*LPsiDat[0].re;
      s++;
    }
    for (i=s; i < Lev->MaxG; i++) {
      LocalSP += 2*(LPsiDat[i].re*LPsiDat[i].re+LPsiDat[i].im*LPsiDat[i].im);
    }
    MPI_Allreduce ( &LocalSP, &PsiSP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  } 
  if (PsiSP < MYEPSILON) fprintf(stderr,"GramSchNormalize: PsiSP = %lg\n",PsiSP);
  PsiSP = sqrt(PsiSP); // take square root
  for (i=0; i < Lev->MaxG; i++) { // and divide each coefficient by the norm
    LPsiDat[i].re /= PsiSP;
    LPsiDat[i].im /= PsiSP;
  }
  return(PsiSP);
}

/** Calculate squared norm of given wave function coefficients.
 * Go through each node of the reciprocal vector grid, calculate the complex product for this
 * coefficient and sum up, gathering the results from all processes before return - remember
 * that the coefficients are - for the parallel calculation of the fft - split up among the
 * processes.
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure
 * \param *LPsiDat array over G of complex i-th wave function coefficients \f$c_{i,G}\f$
 * \return \f$\sum_G c_{i,G} /cdot {c_{i,G}}^{\ast}\f$
 */
double GramSchGetNorm2(const struct Problem *P, struct LatticeLevel *Lev, fftw_complex *LPsiDat) {
  double LocalSP=0.0, PsiSP=0.0;
  int i,s = 0;
  /* Falls PsiSP == 0.0 dann noch SP berechnen */
  if (LPsiDat != NULL) {
    if (Lev->GArray[0].GSq == 0.0) {
      LocalSP += LPsiDat[0].re*LPsiDat[0].re;
      s++;
    }
    for (i=s; i < Lev->MaxG; i++) {
      LocalSP += 2*(LPsiDat[i].re*LPsiDat[i].re+LPsiDat[i].im*LPsiDat[i].im);
    }
    // send local result to all processes and received summed from all into PsiSP
    MPI_Allreduce ( &LocalSP, &PsiSP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  } 
  return(PsiSP);
}

/** Scalar Product of two arrays of wave function coefficients.
 * Goes through each reciprocal grid vectors and calculates the complex product
 * between the two coefficients, summing up, MPI_Allreducing and returning.
 * (See also GramSchGetNorm2())
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure
 * \param *LPsiDatA first array of wave function coefficients
 * \param *LPsiDatB second array of wave function coefficients
 * \return \f$\sum_G c_{a,G} \cdot c_{b,G}^{\ast}\f$
 */
static double GramSchSP(const struct Problem *P, struct LatticeLevel *Lev, fftw_complex *LPsiDatA, fftw_complex *LPsiDatB) {
  double LocalSP=0.0,PsiSP;
  int i,s = 0;
  if (Lev->GArray[0].GSq == 0.0) {
    LocalSP += LPsiDatA[0].re*LPsiDatB[0].re;
    s++;
  }
  for (i=s; i < Lev->MaxG; i++) { // go through all nodes and calculate complex scalar product
    LocalSP += 2*(LPsiDatA[i].re*LPsiDatB[i].re+LPsiDatA[i].im*LPsiDatB[i].im);
  }
  // send local result to all processes and received summed from all into PsiSP
  MPI_Allreduce ( &LocalSP, &PsiSP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  return(PsiSP);
}

/** Sort criteria for natueralmergesort(): Returns re-ordered OnePsiElement::PsiGramSchStatus.
 * The current status in the Gram-Schmidt-Orthonormalization is returned as sort criteria.
 * \param *a OnePsiElement
 * \param i i-th wave function
 * \param *Args
 * \return integer value for each PsiGramSchStatusType, from IsOrthonormal (0) up to NotOrthogonal(2) and NotUsedToOrtho(3)
 * \note The enum PsiGramSchStatusType is not simply copied due to a different ordering in the enumeration other than used here.
 */
static double GetKeyOnePsi(void *a, int i, void *Args) {
  double res=-1;
  switch ((enum PsiGramSchStatusType)((struct OnePsiElement *)a)[i].PsiGramSchStatus) {
  case NotOrthogonal:
    res = 2.;
    break;
  case IsOrthogonal: 
    res = 1.;
    break;
  case IsOrthonormal:
    res = 0.;
    break;
  case NotUsedToOrtho:    
    res = 100.; // extra before unoccupied and perturbed ones
    break;
  }
  switch (((struct OnePsiElement *)a)[i].PsiType) {
  case Occupied:
    res += 0.;
    break;
  case UnOccupied: 
    res += 10.;
    break;
  case Perturbed_P0:
  case Perturbed_P1:
  case Perturbed_P2:
  case Perturbed_RxP0:
  case Perturbed_RxP1:
  case Perturbed_RxP2:
    res += 20.;
    break;
  case Extra:
    res += 30.;
    break;
  }
  return(res);
}

/** Sort criteria for natueralmergesort(): Returns the global number of the Psi among all.
 * \param *a OnePsiElement
 * \param i i-th wave function
 * \param *Args unused, for contingency with GetKeyOnePsi()
 * \return \a i-th OnePsiElement::MyGlobalNo
 */
static double GetKeyOnePsi2(void *a, int i, void *Args) {
  return(((struct OnePsiElement *)a)[i].MyGlobalNo);
}

/** Copies wave function OnePsiElement.
 * Copy each entry in OnePsiElement structure from \a b[j] to \a a[i].
 * \param *a destination OnePsiElement array
 * \param i i-th element to be overwritten
 * \param *b source OnePsiElement array
 * \param j j-th element's entries to be copied
 */
static void CopyElementOnePsi(void *a, int i, void *b, int j) 
{
  ((struct OnePsiElement *)a)[i].me_comm_ST_Psi = ((struct OnePsiElement *)b)[j].me_comm_ST_Psi;
  ((struct OnePsiElement *)a)[i].my_color_comm_ST_Psi = ((struct OnePsiElement *)b)[j].my_color_comm_ST_Psi;
  ((struct OnePsiElement *)a)[i].MyLocalNo = ((struct OnePsiElement *)b)[j].MyLocalNo;
  ((struct OnePsiElement *)a)[i].MyGlobalNo = ((struct OnePsiElement *)b)[j].MyGlobalNo;
  ((struct OnePsiElement *)a)[i].PsiGramSchStatus = ((struct OnePsiElement *)b)[j].PsiGramSchStatus;
  ((struct OnePsiElement *)a)[i].PsiType = ((struct OnePsiElement *)b)[j].PsiType;
  ((struct OnePsiElement *)a)[i].PsiFactor = ((struct OnePsiElement *)b)[j].PsiFactor;
  ((struct OnePsiElement *)a)[i].PsiReciNorm2 = ((struct OnePsiElement *)b)[j].PsiReciNorm2;
}


/** Performs Gram-Schmidt-Orthonormalization on all Psis.
 * Herein the known Gram-Schmidt-Orthogonalization (with subsequent normalization) is implemented in a
 * parallel way. The problem arises due to the fact that the complex wave function coefficients are not
 * all accessible from one process, but are shared among them. Thus there are four different cases to
 * deal with - where O is one orthogonal Psi and P the Psi currently to be orthogonalized:
 * -# O and P are local\n
 *      The projection is simply calculated via scalar product and subtracted from P.
 * -# O is local, P not\n
 *      P is received from the respective process and the projetion calculated, noting down this
 *      value for later sending it back to this respective process owning the P coefficients,
 *      who will substract them
 * -# O is not local, however P is\n
 *      Send the coefficient to every process in need of them and in the end gather projections to
 *      be subtracted from our local P.
 * -# O and P are not local\n
 *      Nothing to do.
 *
 * Afterwards, a division by the norm of the Psi may additionally be called in for. The current status of
 * a Psi is always noted in OnePsiElement::PsiGramSchStatus.
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure
 * \param *Psi wave functions structure Psis
 * \param ToDo states what to do in this function: Orthogonalize or Orthonormalize
 */
void GramSch(struct Problem *P, struct LatticeLevel *Lev, struct Psis *Psi, enum PsiGramSchToDoType ToDo) 
{
  int i, j, k, TempRecv, TempSend, RecvSource;
  //int ResetNo=0;
  double GlobalSP;
  struct RunStruct *R = &P->R;
  struct OnePsiElement *OnePsi = NULL, *LOnePsi = NULL, *ROnePsi = NULL, *RLOnePsi = NULL;
  int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  fftw_complex *Temp, *Temp2;
  int *TempSendA = Psi->TempSendA;
  MPI_Status status;
  // Mergesort the wavefunction by their current status from 0 to all plus all extra ones (one for each process)
  naturalmergesort(Psi->AllPsiStatus,Psi->AllPsiStatusForSort,0,Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT-1,&GetKeyOnePsi,NULL,&CopyElementOnePsi);
  //fprintf(stderr,"(%i) GramSch: ",P->Par.me);
  for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // then go through each of the ToDo-order sorted Psis (Each Psi plus an extra one from each process)
    OnePsi = &Psi->AllPsiStatus[i]; // Mark OnePsi wave function
    if (OnePsi->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // stored in this process? => L means local
      LOnePsi = &Psi->LocalPsiStatus[OnePsi->MyLocalNo];
    else 
      LOnePsi = NULL;
      
		switch ((enum PsiGramSchStatusType)OnePsi->PsiGramSchStatus) { // depending on their ToDo-status do ...
    case NotOrthogonal:   // ORTHOGONALIZE!
      //fprintf(stderr,"(%i) ", Psi->AllPsiStatus[i].MyGlobalNo);
      //fprintf(stderr,"Orthogonalizing %i, status was: (L)%i\t(A)%i!\n", OnePsi->MyGlobalNo, Psi->LocalPsiStatus[OnePsi->MyLocalNo].PsiGramSchStatus, OnePsi->PsiGramSchStatus);
      if (LOnePsi != NULL) {  // if current Psi is local, copy (reciprocal) complex coefficients from LocalPsi to TempPsi
				memcpy(Lev->LPsi->TempPsi, Lev->LPsi->LocalPsi[OnePsi->MyLocalNo], ElementSize*Lev->MaxG*sizeof(double)); 
      }
      Temp = Lev->LPsi->TempPsi2; // another complex coefficients array (reciprocal) ...
      SetArrayToDouble0((double *)Temp, Lev->MaxG*2); // ... which is zeroed
      TempRecv = 0;   // count how often a needed local current Psi has been received (and thus if it has been already)
      TempSend = 0;   // count how often a local current Psi has been sent to other processes
      for(k=0; k < P->Par.Max_me_comm_ST_PsiT; k++) TempSendA[k] = 0; // zero array counting how often a process has sent its local Psi to others
      
      for (j=i-1; j >= 0; j--) {    // go through all wave functions from the one before the current downto first one
				ROnePsi = &Psi->AllPsiStatus[j];  // get the Psi that should be orthogonal to it
        if (ROnePsi->PsiType <= UnOccupied) { // only orthogonalize against non-perturbed wave functions
  				if (ROnePsi->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // stored in this process? => L means local
  				  RLOnePsi = &Psi->LocalPsiStatus[ROnePsi->MyLocalNo];
  				else 
  				  RLOnePsi = NULL;
//          if (((OnePsi->PsiType == Extra && (R->CurrentMin <= UnOccupied || ((LOnePsi != NULL && RLOnePsi != NULL) && ROnePsi->MyLocalNo == R->ActualLocalPsiNo))) 
//            || OnePsi->PsiType <= UnOccupied)) {
          if   ((ROnePsi->PsiType == Occupied) 
            || ((ROnePsi->PsiType == UnOccupied) && (OnePsi->PsiType == UnOccupied || OnePsi->PsiType == Extra)) 
            || ((LOnePsi != NULL && RLOnePsi != NULL) && ROnePsi->MyLocalNo == R->ActualLocalPsiNo)) {
                // occupied are orthogonal to occupied
                // unoccupied are orthogonal to occupied and unoccupied
                // perturbed are orthogonal to occupied, unoccupied and to their (process-) specific extra
                // extra is orthogonal dependent on R->CurrentMin (to occupied, occupied&unoccupied, occupied&specific perturbed)
             if (RLOnePsi != NULL && LOnePsi != NULL) {  // if both are stored locally in this process
  				    GlobalSP = GramSchSP(P,Lev,Lev->LPsi->LocalPsi[ROnePsi->MyLocalNo],Lev->LPsi->LocalPsi[OnePsi->MyLocalNo]); // scalar product of the two
    				  GlobalSP *= RLOnePsi->PsiReciNorm2; // divide by norm
    				  Temp = Lev->LPsi->TempPsi;
    				  Temp2 = Lev->LPsi->LocalPsi[ROnePsi->MyLocalNo]; 
    				  for(k=0; k < Lev->MaxG; k++) {  // orthogonalize it (subtract the projected part, real and imaginary)
    				    Temp[k].re -= GlobalSP*Temp2[k].re;
    				    Temp[k].im -= GlobalSP*Temp2[k].im;
    				  }          // the orthogonalized wave function of LocalPsi resides now in Temp!
    				}
    				if (RLOnePsi != NULL && LOnePsi == NULL) {  // if the current Psi is not local, the one to which it ought be orthogonal however is
    				  /* Recv i and put it to jLocal in TempPsi */
    				  if (TempRecv == 0) {
    				    MPI_Recv( Lev->LPsi->TempPsi, Lev->MaxG*ElementSize, MPI_DOUBLE, OnePsi->my_color_comm_ST_Psi, GramSchTag1, P->Par.comm_ST_PsiT, &status );
    				    TempRecv++;
    				  }
    				  GlobalSP = GramSchSP(P,Lev,Lev->LPsi->LocalPsi[ROnePsi->MyLocalNo],Lev->LPsi->TempPsi);
    				  GlobalSP *= RLOnePsi->PsiReciNorm2;
    				  Temp = Lev->LPsi->TempPsi2;
    				  Temp2 = Lev->LPsi->LocalPsi[ROnePsi->MyLocalNo];
    				  for(k=0; k < Lev->MaxG; k++) {
    				    Temp[k].re -= GlobalSP*Temp2[k].re;
    				    Temp[k].im -= GlobalSP*Temp2[k].im;
    				  }         // the negative orthogonal projection resides in Temp (not the local wave function part!)
    				}
    				if (RLOnePsi == NULL && LOnePsi != NULL) {  // if the current Psi is local, the one to which it ought be orthogonal yet is not
    				  /* Send i to jLocal in TempPsi */
    				  if (TempSendA[ROnePsi->my_color_comm_ST_Psi] == 0) {  // just send it out to everyone who needs it
    				    MPI_Send( Lev->LPsi->LocalPsi[OnePsi->MyLocalNo], Lev->MaxG*ElementSize, MPI_DOUBLE, ROnePsi->my_color_comm_ST_Psi, GramSchTag1, P->Par.comm_ST_PsiT);
    				    TempSend++;
    				    TempSendA[ROnePsi->my_color_comm_ST_Psi]++; // note that coefficients were sent once more to this process
    				  }
    				}
          }
        }
      }
      if (LOnePsi != NULL) {  // holds the current local Psi (TempPsi) and receives results from all other which "ought be orthogonal to this one" 
				/* Hat was in TempPsi und ist local*/
				for (j=0; j < TempSend; j++) {  // each of the recipients before should send something back now
				  MPI_Probe( MPI_ANY_SOURCE, GramSchTag2, P->Par.comm_ST_PsiT, &status );
				  RecvSource = status.MPI_SOURCE;
				  MPI_Recv( Lev->LPsi->TempPsi2, Lev->MaxG*ElementSize, MPI_DOUBLE, RecvSource, GramSchTag2, P->Par.comm_ST_PsiT, &status );
				  Temp2 = Lev->LPsi->TempPsi2;
				  Temp = Lev->LPsi->TempPsi;
				  for(k=0; k < Lev->MaxG; k++) {  // sum received projetion onto (temporary) local wave function
				    Temp[k].re += Temp2[k].re;
				    Temp[k].im += Temp2[k].im;
				  }
				}
				Temp2 = Lev->LPsi->TempPsi;
				Temp = Lev->LPsi->LocalPsi[OnePsi->MyLocalNo];
				memcpy(Temp, Temp2, sizeof(fftw_complex)*Lev->MaxG);  // finally copy back onto original one
      } 
      if (LOnePsi == NULL && TempRecv) {  // has calculated a projection to another Psi (TempPsi2) and sends it to the respective (local) owner of the current one
				/* Hat was in TempPsi2 und ist nicht local*/
				MPI_Send( Lev->LPsi->TempPsi2, Lev->MaxG*ElementSize, MPI_DOUBLE, OnePsi->my_color_comm_ST_Psi, GramSchTag2, P->Par.comm_ST_PsiT);
      }
      /*if (LOnePsi != NULL) {  // finally we set the status of our local (multi-projection subtracted) Psi
        //fprintf(stderr,"Setting L-Status of %i to %i\n",LOnePsi->MyGlobalNo, IsOrthogonal);        
				LOnePsi->PsiGramSchStatus = (int)IsOrthogonal;
      }
      fprintf(stderr,"Setting A-Status of %i to %i\n",OnePsi->MyGlobalNo, IsOrthogonal);        
      OnePsi->PsiGramSchStatus = (int)IsOrthogonal; // and also set the status in all processes for this Psi*/
      // note: There is no break here, normalization will be performed right away!
        //fprintf(stderr,"-> ");
		case IsOrthogonal: // NORMALIZE!
        //fprintf(stderr,"%i ", Psi->AllPsiStatus[i].MyGlobalNo);
	      switch (ToDo) {
	      case Orthonormalize: // ... normalize and store 1 as norm
					if (LOnePsi != NULL) {
            //fprintf(stderr,"Setting L-Status of %i to %i\n",LOnePsi->MyLocalNo, IsOrthonormal);        
					  LOnePsi->PsiGramSchStatus = (int)IsOrthonormal;  // set status and ...
					  GramSchNormalize(P,Lev,Lev->LPsi->LocalPsi[OnePsi->MyLocalNo],0.0); // ... do it
					  /*LOnePsi->PsiReciNorm2 = GramSchGetNorm2(P,Lev,Lev->LPsi->LocalPsi[OnePsi->MyLocalNo]);
					    LOnePsi->PsiReciNorm2 = 1./LOnePsi->PsiReciNorm2;*/
					  LOnePsi->PsiReciNorm2 = 1.;
					}
          //fprintf(stderr,"Setting A-Status of %i to %i\n",OnePsi->MyGlobalNo, IsOrthonormal);        
					OnePsi->PsiGramSchStatus = (int)IsOrthonormal;
					break;
	      case Orthogonalize: // ... calculate norm and store
					if (LOnePsi != NULL) {
            //fprintf(stderr,"Setting L-Status of %i to %i\n",LOnePsi->MyLocalNo, IsOrthogonal);        
					  LOnePsi->PsiGramSchStatus = (int)IsOrthogonal;
					  LOnePsi->PsiReciNorm2 = GramSchGetNorm2(P,Lev,Lev->LPsi->LocalPsi[OnePsi->MyLocalNo]);
            if (LOnePsi->PsiReciNorm2 < MYEPSILON) fprintf(stderr,"GramSch: LOnePsi->PsiReciNorm2 Nr. %d = %lg\n",LOnePsi->MyGlobalNo,LOnePsi->PsiReciNorm2);
            //if (P->Call.out[LeaderOut]) fprintf(stderr,"GramSch: Setting norm of LOnePsi->PsiReciNorm2 Nr. %d = %lg\n",LOnePsi->MyGlobalNo,LOnePsi->PsiReciNorm2);
					  LOnePsi->PsiReciNorm2 = 1./LOnePsi->PsiReciNorm2;
					}
          //fprintf(stderr,"Setting A-Status of %i to %i\n",OnePsi->MyGlobalNo, IsOrthogonal);
					OnePsi->PsiGramSchStatus = (int)IsOrthogonal;
					break;
	      }
      break;
		case IsOrthonormal:  // NOTHING TO DO ANY MORE!
      //fprintf(stderr,"%i ", Psi->AllPsiStatus[i].MyGlobalNo);
			break;
		case NotUsedToOrtho:
      //fprintf(stderr,"[%i] ", Psi->AllPsiStatus[i].MyGlobalNo);
      break;
    }
  }
  //fprintf(stderr,"\n");
  /* Reset */
  naturalmergesort(Psi->AllPsiStatus,Psi->AllPsiStatusForSort,0,Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT-1,&GetKeyOnePsi2,NULL,&CopyElementOnePsi);
/*  fprintf(stderr,"Setting L-Status of %i to %i\n",Psi->LocalNo, NotUsedToOrtho);
  Psi->LocalPsiStatus[Psi->LocalNo].PsiGramSchStatus = (int)NotUsedToOrtho;
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    ResetNo += Psi->AllLocalNo[i];
    OnePsi = &Psi->AllPsiStatus[ResetNo-1];
    fprintf(stderr,"Setting A-Status of %i to %i\n",OnePsi->MyGlobalNo, NotUsedToOrtho);
    OnePsi->PsiGramSchStatus = (int)NotUsedToOrtho; 
  }*/
}

/** Reset status of Gram-Schmidt-Orthogonalization for each and every Psi.
 * Sets all locally accessible Psis::LocalPsiStatus to PsiGramSchStatusType::NotOrthogonal
 * and the norm to zero, except the last (extra) and unoccupied ones which are NotUsedToOrtho, then
 * do the same for all Psis::AllPsiStatus (again exception for extra  and unoccupied ones).
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis 
 */
void ResetGramSch(const struct Problem *P, struct Psis *Psi)
{
  int i,j, ResetNo=0;
  struct OnePsiElement *OnePsi = NULL;
  for (i=0; i < Psi->LocalNo; i++) {  // go through all local Psis
    Psi->LocalPsiStatus[i].PsiGramSchStatus = (Psi->LocalPsiStatus[i].PsiType == Occupied) ? (int)NotOrthogonal : (int)NotUsedToOrtho;
    //fprintf(stderr,"Setting L-Status of %i to %i\n",i, Psi->LocalPsiStatus[i].PsiGramSchStatus);
    Psi->LocalPsiStatus[i].PsiReciNorm2 = 0.0;
  }
  //fprintf(stderr,"Setting L-Status of %i to %i\n",Psi->LocalNo, NotUsedToOrtho);
  Psi->LocalPsiStatus[Psi->LocalNo].PsiGramSchStatus = (int)NotUsedToOrtho; // extra wave function
  Psi->LocalPsiStatus[Psi->LocalNo].PsiReciNorm2 = 0.0;
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    for (j=ResetNo; j < ResetNo+Psi->AllLocalNo[i]-1; j++) {
      Psi->AllPsiStatus[j].PsiGramSchStatus = (Psi->AllPsiStatus[j].PsiType == Occupied) ? (int)NotOrthogonal : (int)NotUsedToOrtho;
      //fprintf(stderr,"Setting A-Status of %i to %i\n",j, Psi->AllPsiStatus[j].PsiGramSchStatus);
      Psi->AllPsiStatus[j].PsiReciNorm2 = 0.0;
    }
    ResetNo += Psi->AllLocalNo[i];
    OnePsi = &Psi->AllPsiStatus[ResetNo-1];
    //fprintf(stderr,"Setting A-Status of %i to %i\n",ResetNo-1, NotUsedToOrtho);
    OnePsi->PsiGramSchStatus = (int)NotUsedToOrtho;   // extra wave function
    OnePsi->PsiReciNorm2 = 0.0; 
  }
}

/** Reset status of Gram-Schmidt-Orthogonalization for each Psi of PsiTagType \a type.
 * Sets all locally accessible Psis::LocalPsiStatus to PsiGramSchStatusType::NotOrthogonal
 * and the norm to zero, except the last (extra) and unoccupied ones which are NotUsedToOrtho, then
 * do the same for all Psis::AllPsiStatus (again exception for extra  and unoccupied ones).
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis 
 * \param type PsiTagType of orbitals whose PsiGramSchStatus is to be reset
 * \param ToDo - set PsiGramSchToDoType for the \a type states to this
 * \sa ResetGramSch() - same procedure for occupied states
 */
void ResetGramSchTagType(const struct Problem *P, struct Psis *Psi, enum PsiTypeTag type, enum PsiGramSchStatusType ToDo)
{
  int i,j, ResetNo=0;
  struct OnePsiElement *OnePsi = NULL;
  for (i=0; i < Psi->LocalNo; i++) {  // go through all local Psis
    if (Psi->LocalPsiStatus[i].PsiType == type)  {
      //fprintf(stderr,"Setting L-Status of %i to %i\n",i, ToDo);
      Psi->LocalPsiStatus[i].PsiGramSchStatus = ToDo;
      switch(ToDo) {
        case NotOrthogonal:
          Psi->LocalPsiStatus[i].PsiReciNorm2 = 0.0;
          break;
        default:
          break;
      }
    }
  }
  //fprintf(stderr,"Setting L-Status of %i to %i\n",Psi->LocalNo, NotUsedToOrtho);
  Psi->LocalPsiStatus[Psi->LocalNo].PsiGramSchStatus = (int)NotUsedToOrtho; // extra wave function
  Psi->LocalPsiStatus[Psi->LocalNo].PsiReciNorm2 = 0.0;
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    for (j=ResetNo; j < ResetNo+Psi->AllLocalNo[i]-1; j++) {
      if (Psi->AllPsiStatus[j].PsiType == type)  {
        //fprintf(stderr,"Setting A-Status of %i to %i\n",j, ToDo);
        Psi->AllPsiStatus[j].PsiGramSchStatus = ToDo;
        switch(ToDo) {
          case NotOrthogonal:
            Psi->AllPsiStatus[j].PsiReciNorm2 = 0.0;
            break;
          default:
            break;
        }
      }
    }
    ResetNo += Psi->AllLocalNo[i];
    OnePsi = &Psi->AllPsiStatus[ResetNo-1];
    //fprintf(stderr,"Setting A-Status of %i to %i\n",ResetNo-1, NotUsedToOrtho);
    OnePsi->PsiGramSchStatus = (int)NotUsedToOrtho;   // extra wave function
//    OnePsi->PsiReciNorm2 = 0.0; 
  }
}
/** Set Gram-Schmidt status of the extra Psis::LocalPsiStatus and Psis::AllPsiStatus Psis to \a PsiGramSchStatus.
 * The number of these "extra" Psis is Psis::AllLocalNo - 1 for each process.
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis
 * \param PsiGramSchStatus status to be set
 */
void SetGramSchExtraPsi(const struct Problem *P, struct Psis *Psi, enum PsiGramSchStatusType PsiGramSchStatus) 
{
  int i, ResetNo=0;   // offset to the respective local Psis
  struct OnePsiElement *OnePsi = NULL;
  //fprintf(stderr,"Setting L-Status of %i to %i\n",Psi->LocalNo, PsiGramSchStatus);
  Psi->LocalPsiStatus[Psi->LocalNo].PsiGramSchStatus = (int)PsiGramSchStatus;
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    ResetNo += Psi->AllLocalNo[i];
    OnePsi = &Psi->AllPsiStatus[ResetNo-1];
    //fprintf(stderr,"Setting A-Status of %i to %i\n",ResetNo-1, PsiGramSchStatus);
    OnePsi->PsiGramSchStatus = (int)PsiGramSchStatus; 
  }
}

/** Set Gram-Schmidt status of the actual Psis::LocalPsiStatus and Psis::AllPsiStatus Psis to \a PsiGramSchStatus.
 * The number of these "extra" Psis is Psis::AllActualLocalPsiNo for each process.
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis
 * \param PsiGramSchStatus status to be set
 */
void SetGramSchActualPsi(const struct Problem *P, struct Psis *Psi, enum PsiGramSchStatusType PsiGramSchStatus)
{
  int i, ResetNo=0;   // offset to the respective local Psis
  struct OnePsiElement *OnePsi = NULL;
  //fprintf(stderr,"Setting L-Status of %i to %i\n",P->R.ActualLocalPsiNo, PsiGramSchStatus);
  //BUG: Psi->LocalPsiStatus[Psi->LocalNo].PsiGramSchStatus = (int)PsiGramSchStatus;
  Psi->LocalPsiStatus[P->R.ActualLocalPsiNo].PsiGramSchStatus = (int)PsiGramSchStatus;
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    OnePsi = &Psi->AllPsiStatus[ResetNo+Psi->AllActualLocalPsiNo[i]];
    //fprintf(stderr,"Setting A-Status of %i to %i\n",ResetNo+Psi->AllActualLocalPsiNo[i], PsiGramSchStatus);
    OnePsi->PsiGramSchStatus = (int)PsiGramSchStatus;
    ResetNo += Psi->AllLocalNo[i];
  }
}

/** Set Gram-Schmidt status of the former actual Psis::LocalPsiStatus and Psis::AllPsiStatus Psis to \a PsiGramSchStatus.
 * The former number of these "extra" Psis is Psis::AllOldActualLocalPsiNo for each process.
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis
 * \param PsiGramSchStatus status to be set
 */
void SetGramSchOldActualPsi(const struct Problem *P, struct Psis *Psi, enum PsiGramSchStatusType PsiGramSchStatus)
{
  int i, ResetNo=0;
  struct OnePsiElement *OnePsi = NULL;
  //fprintf(stderr,"Setting L-Status of %i to %i\n",P->R.OldActualLocalPsiNo, PsiGramSchStatus);
  Psi->LocalPsiStatus[P->R.OldActualLocalPsiNo].PsiGramSchStatus = (int)PsiGramSchStatus;
  for (i=0; i < P->Par.Max_me_comm_ST_PsiT; i++) {
    OnePsi = &Psi->AllPsiStatus[ResetNo+Psi->AllOldActualLocalPsiNo[i]];
    //fprintf(stderr,"Setting A-Status of %i to %i\n",ResetNo+Psi->AllOldActualLocalPsiNo[i], PsiGramSchStatus);
    OnePsi->PsiGramSchStatus = (int)PsiGramSchStatus; 
    ResetNo += Psi->AllLocalNo[i];
  }
}

/** Updates array Psis::AllActualLocalPsiNo from the RunStruct::ActualLocalPsiNo by MPI_Allgather.
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis
 */
void UpdateGramSchActualPsiNo(struct Problem *P, struct Psis *Psi)
{
  struct RunStruct *R = &P->R;
  MPI_Allgather ( &(R->ActualLocalPsiNo), 1, MPI_INT,
		  Psi->AllActualLocalPsiNo, 1, MPI_INT, P->Par.comm_ST_PsiT );
}

/** Updates array Psis::AllPsiStatus from the Psis::LocalPsiStatus by MPI_Allgather.
 * First, calculates number of MPI_OnePsiElement to be received and their displacements in the
 * global Array Psis::AllPsiStatus, then follows MPI_Allgather and afterwards a printout to screen
 * if verbose is specified.
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis
 * \warning Don't use before FirstInitGramSch() due to needed declaration of MPI_OnePsiElement
 */
void UpdateGramSchAllPsiStatus(struct Problem *P, struct Psis *Psi)
{
  int *recvcounts, *displs;
  int GramSchLocalNo = Psi->LocalNo+1;
  int MyStartNo = 0;
  int i;
  
  //recvcounts = (int *)Malloc(sizeof(int)*P->Par.Max_me_comm_ST_PsiT,"UpdateGramSchAllPsiStatus: recvcounts");
  //displs = (int *)Malloc(sizeof(int)*P->Par.Max_me_comm_ST_PsiT,"UpdateGramSchAllPsiStatus: displs");
  recvcounts = (int *)Malloc(sizeof(int)*P->Par.procs,"UpdateGramSchAllPsiStatus: recvcounts");
  displs = (int *)Malloc(sizeof(int)*P->Par.procs,"UpdateGramSchAllPsiStatus: displs");
  
  for (i=0;i<P->Par.Max_me_comm_ST_PsiT;i++) {
    recvcounts[i] = Psi->AllLocalNo[i];   // how many Psis should be received
    displs[i] = MyStartNo;                // displacement for these Psis
    MyStartNo += Psi->AllLocalNo[i];      // 
  }
  // send all (GramSchLocalNo) own local Psis and gather the AllPsiStatuses of all other processes
  MPI_Allgatherv ( Psi->LocalPsiStatus, GramSchLocalNo,  MPI_OnePsiElement, 
       Psi->AllPsiStatus, recvcounts, displs, MPI_OnePsiElement, P->Par.comm_ST_PsiT );
      
  if(P->Call.out[PsiOut])
    for (i=0;i< MyStartNo;i++)
      fprintf(stderr,"(%i) MyLocalNo = %i, MyGlobalNo = %i/%i, f = %lg, Type: %i, GramSch: %i, me_comm: %d, my_color_comm: %d \n",P->Par.me, Psi->AllPsiStatus[i].MyLocalNo, i, Psi->AllPsiStatus[i].MyGlobalNo, Psi->AllPsiStatus[i].PsiFactor, Psi->AllPsiStatus[i].PsiType, Psi->AllPsiStatus[i].PsiGramSchStatus, Psi->AllPsiStatus[i].me_comm_ST_Psi, Psi->AllPsiStatus[i].my_color_comm_ST_Psi);
      
  Free(recvcounts, "UpdateGramSchAllPsiStatus: recvcounts");
  Free(displs, "UpdateGramSchAllPsiStatus: displs");
}

/** Updates array Psis::AllOldActualLocalPsiNo from the RunStruct::OldActualLocalPsiNo by MPI_Allgather.
 * \param *P Problem at hand
 * \param *Psi wave functions structure Psis
 */
void UpdateGramSchOldActualPsiNo(struct Problem *P, struct Psis *Psi)
{
  struct RunStruct *R = &P->R;
  MPI_Allgather ( &(R->OldActualLocalPsiNo), 1, MPI_INT,
		  Psi->AllOldActualLocalPsiNo, 1, MPI_INT, P->Par.comm_ST_PsiT );
}

#define max_GramSch_iter 3

/** Test Gram-Schmidt-Orthogonalization.
 * Test if all pairs of Psis are orthogonal respectively normalized (scalar product <= 1). 
 * Give output to stderr if not so.
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure
 * \param *Psi wave functions structure Psis
 * \param Type2test basically current minimisation type, see RunStruct#CurrentMin
 */
void TestGramSch(struct Problem *P, struct LatticeLevel *Lev, struct Psis *Psi, int Type2test) {
  double LocalSP=0.0,PsiSP;
  int i,j,k,s,RecvSource;
  MPI_Status status;
  struct OnePsiElement *OnePsiA, *LOnePsiA, *LOnePsiB;
  int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  int NotOk;      // counts pairs that are not orthogonal
  int iter = 0;
  fftw_complex *LPsiDatA, *LPsiDatB;
  
  do {
    NotOk = 0;
    //fprintf(stderr,"(%i) Testing Orthogonality ... \n", P->Par.me);
    for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions (plus the extra one for each process)
      OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
      //fprintf(stderr,"(%i) OnePsiA: Type %d, GlobalNo %d \n", P->Par.me, OnePsiA->PsiType, OnePsiA->MyGlobalNo);
      if (OnePsiA->PsiGramSchStatus == (int)IsOrthonormal ||
  	     OnePsiA->PsiGramSchStatus == (int)IsOrthogonal) {   // if it has been orthogonalized
        //fprintf(stderr,"(%i) ... orthogonal\n", P->Par.me);
        if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
  	       LOnePsiA = &Psi->LocalPsiStatus[OnePsiA->MyLocalNo];
        else 
  	       LOnePsiA = NULL;
  
        if (LOnePsiA == NULL) {   // if it's not local ... receive it from respective process into TempPsi
        	RecvSource = OnePsiA->my_color_comm_ST_Psi;
        	MPI_Recv( Lev->LPsi->TempPsi, Lev->MaxG*ElementSize, MPI_DOUBLE, RecvSource, GramSchTag3, P->Par.comm_ST_PsiT, &status );
        	LPsiDatA=Lev->LPsi->TempPsi;
        } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
        	for (k=0;k<P->Par.Max_me_comm_ST_PsiT;k++)
        	  if (k != OnePsiA->my_color_comm_ST_Psi) 
        	    MPI_Send( Lev->LPsi->LocalPsi[OnePsiA->MyLocalNo], Lev->MaxG*ElementSize, MPI_DOUBLE, k, GramSchTag3, P->Par.comm_ST_PsiT);
          LPsiDatA=Lev->LPsi->LocalPsi[OnePsiA->MyLocalNo];
        } // LPsiDatA is now set to the coefficients of OnePsi either stored or MPI_Received
        
        for (j=Psi->TypeStartIndex[Occupied]; j < Psi->TypeStartIndex[Extra]+1; j++) {  // for all locally accessible including extra Psis        
        	LOnePsiB = &Psi->LocalPsiStatus[j];
          //if (LOnePsiB->PsiType > UnOccupied || OnePsiA->PsiType > UnOccupied) fprintf(stderr,"(%i) Checking global %i against local %i/%i\n",P->Par.me, OnePsiA->MyGlobalNo, LOnePsiB->MyLocalNo, LOnePsiB->MyGlobalNo);
        	if (LOnePsiB->PsiGramSchStatus == (int)IsOrthonormal ||
        	    LOnePsiB->PsiGramSchStatus == (int)IsOrthogonal) {   // if it's orthogonalized
        	  LPsiDatB=Lev->LPsi->LocalPsi[LOnePsiB->MyLocalNo];     // set LPsiDatB onto it
        	  s=0;
        	  LocalSP = 0.0;
        	  if (Lev->GArray[0].GSq == 0.0) {   // calculate scalar product of LPsiDatA and LPsiDatB
        	    LocalSP += LPsiDatA[0].re*LPsiDatB[0].re;
        	    s++;
        	  }
        	  for (k=s; k < Lev->MaxG; k++) {
        	    LocalSP += 2*(LPsiDatA[k].re*LPsiDatB[k].re+LPsiDatA[k].im*LPsiDatB[k].im);
        	  }
        	  MPI_Allreduce ( &LocalSP, &PsiSP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // gather by summation results from the group sharing the coefficients
            //if (P->Call.out[LeaderOut]) 
            switch (Type2test) {
              default:
              case -1: // test all, checked!
                if (((LOnePsiB->PsiType <= UnOccupied || (LOnePsiB->MyLocalNo == P->R.ActualLocalPsiNo && OnePsiA->PsiType == Extra)) || (LOnePsiB->MyGlobalNo == OnePsiA->MyGlobalNo))) {    // check if it's zero (orthogonal) and give output if wanted
                  if (i == LOnePsiB->MyGlobalNo && LOnePsiB->PsiGramSchStatus == (int)IsOrthonormal) {
                    if (fabs(PsiSP -1.0) >= MYEPSILON) {
                      fprintf(stderr,"(%i)(%i,%i) = %g ?= 1.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP -1.0), MYEPSILON);
                      NotOk++;
                    }
                  } else {
                    if (fabs(PsiSP) >= MYEPSILON && (LOnePsiB != OnePsiA && LOnePsiB->PsiType > UnOccupied)) {
                      fprintf(stderr,"(%i)(%i,%i) = %g ?= 0.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP), MYEPSILON);
                      NotOk++;
                    }
                  }
                }
                break;
              case Occupied:  // test unperturbed orthogonality, checked!
              case UnOccupied:
                if ((LOnePsiB->PsiType <= UnOccupied) && (OnePsiA->PsiType <= UnOccupied || OnePsiA->PsiType == Extra)) {
                  if (i == LOnePsiB->MyGlobalNo && LOnePsiB->PsiGramSchStatus == (int)IsOrthonormal) {
                    if (fabs(PsiSP -1.0) >= MYEPSILON) {
                      fprintf(stderr,"(%i)(%i,%i) = %g != 1.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,i,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP -1.0), MYEPSILON);
                      NotOk++;
                    } else {
                      //fprintf(stderr,"(%i)(%i,%i) = %g != 1.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,i,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP -1.0), MYEPSILON);
                    }
                  } else {
                    if (fabs(PsiSP) >= MYEPSILON) {
                      fprintf(stderr,"(%i)(%i,%i) = %g != 0.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,i,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP), MYEPSILON);
                      NotOk++;
                    } else {
                      //fprintf(stderr,"(%i)(%i,%i) = %g != 0.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,i,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP), MYEPSILON);
                    }
                  }
                } else {
                      //fprintf(stderr,"(%i)(%i,%i) Not (Un)Occupied\n",P->Par.my_color_comm_ST,i,LOnePsiB->MyGlobalNo);
                }
                break;
              case Perturbed_P0: // test perturbed orthogonality and normalization of all, checked!
              case Perturbed_P1:
              case Perturbed_P2:
              case Perturbed_RxP0:
              case Perturbed_RxP1:
              case Perturbed_RxP2:
                if ((((LOnePsiB->PsiType <= UnOccupied || LOnePsiB->PsiType == Type2test) && (OnePsiA->PsiType <= UnOccupied || OnePsiA->PsiType == Type2test) && (OnePsiA->PsiType != LOnePsiB->PsiType)) || (LOnePsiB->MyGlobalNo == OnePsiA->MyGlobalNo))) {    // check if it's zero (orthogonal) and give output if wanted
                  if (i == LOnePsiB->MyGlobalNo && LOnePsiB->PsiGramSchStatus == (int)IsOrthonormal) {
                    if (fabs(PsiSP -1.0) >= MYEPSILON) {
                      fprintf(stderr,"(%i)(%i,%i) ... is not orthonormal: %g != 1.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP -1.0), MYEPSILON);
                      NotOk++;
                    }
                  } else {
                    if (fabs(PsiSP) >= MYEPSILON && (LOnePsiB->MyGlobalNo != OnePsiA->MyGlobalNo)) {
                      fprintf(stderr,"(%i)(%i,%i) ... is not orthogonal: %g != 0.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP), MYEPSILON);
                      NotOk++;
                    }
                  }
                }
                break;
              case Extra:
                if (((LOnePsiB->PsiType == Extra) || (LOnePsiB->PsiType == Occupied)) && ((OnePsiA->PsiType == Extra) || (OnePsiA->PsiType == Occupied))) {
                  if (i == LOnePsiB->MyGlobalNo && LOnePsiB->PsiGramSchStatus == (int)IsOrthonormal) {
                    if (fabs(PsiSP -1.0) >= MYEPSILON) {
                      fprintf(stderr,"(%i)(%i,%i) = %g != 1.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP -1.0), MYEPSILON);
                      NotOk++;
                    }
                  } else {
                    if (fabs(PsiSP) >= MYEPSILON) {
                      fprintf(stderr,"(%i)(%i,%i) = %g != 0.0 eps(%g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,LOnePsiB->MyGlobalNo,PsiSP, fabs(PsiSP), MYEPSILON);
                      NotOk++;
                    }
                  }
                }
                break;
            }
        	}
        }
      }
    }
/*    if (NotOk != 0) {
      fprintf(stderr,"(%i) NotOk %i ... re-orthogonalizing type %i for the %ith time\n",P->Par.me, NotOk, Type2test, ++iter);
      ResetGramSchTagType(P, Psi, Type2test, NotOrthogonal);
      GramSch(P,Lev,Psi,Orthonormalize);
    }*/
  } while ((NotOk != 0) && (iter < max_GramSch_iter));
  if (P->Call.out[StepLeaderOut]) { // final check if there have been any un-orthogonal pairs
    if (Type2test != -1) {
      if (NotOk == 0) {
        fprintf(stderr,"(%i)TestGramSchm on %s: Ok !\n",P->Par.my_color_comm_ST, P->R.MinimisationName[Type2test]);
      } else {
        fprintf(stderr,"(%i)TestGramSchm on %s: There are %i pairs not Ok!\n",P->Par.my_color_comm_ST, P->R.MinimisationName[Type2test], NotOk);
        //Error(SomeError,"Wave functions not orthonormal as they should be!");
      }
    } else {
      if (NotOk == 0) {
        fprintf(stderr,"(%i)TestGramSchm on all: Ok !\n",P->Par.my_color_comm_ST);
      } else {
        fprintf(stderr,"(%i)TestGramSchm on all: There are %i pairs not Ok!\n",P->Par.my_color_comm_ST,NotOk);
          //Error(SomeError,"Wave functions not orthonormal as they should be!");
      }
    }
  }
}


/** Test if a given wave function to all others. 
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure
 * \param *psi pointer to array with wave function coefficients
 */
void TestForOrth(struct Problem *P, struct LatticeLevel *Lev, fftw_complex *psi) {
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  double LocalSP=0.0,PsiSP;
  int i,k,RecvSource;
  MPI_Status status;
  struct OnePsiElement *OnePsiA, *LOnePsiA;
  int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  int NotOk = 0;      // counts pairs that are not orthogonal
  fftw_complex *LPsiDatA;
  for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions (plus the extra one for each process)
    OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
    if (OnePsiA->PsiGramSchStatus == (int)IsOrthonormal ||
       OnePsiA->PsiGramSchStatus == (int)IsOrthogonal) {   // if it has been orthogonalized
      if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
         LOnePsiA = &Psi->LocalPsiStatus[OnePsiA->MyLocalNo];
      else 
         LOnePsiA = NULL;
      if (LOnePsiA == NULL) {   // if it's not local ... receive it from respective process into TempPsi
        RecvSource = OnePsiA->my_color_comm_ST_Psi;
        MPI_Recv( Lev->LPsi->TempPsi, Lev->MaxG*ElementSize, MPI_DOUBLE, RecvSource, GramSchTag3, P->Par.comm_ST_PsiT, &status );
        LPsiDatA=Lev->LPsi->TempPsi;
      } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
        for (k=0;k<P->Par.Max_me_comm_ST_PsiT;k++)
          if (k != OnePsiA->my_color_comm_ST_Psi) 
            MPI_Send( Lev->LPsi->LocalPsi[OnePsiA->MyLocalNo], Lev->MaxG*ElementSize, MPI_DOUBLE, k, GramSchTag3, P->Par.comm_ST_PsiT);
        LPsiDatA=Lev->LPsi->LocalPsi[OnePsiA->MyLocalNo];
      } // LPsiDatA is now set to the coefficients of OnePsi either stored or MPI_Received
      
      LocalSP = 0.0;
      k=0;
      if (Lev->GArray[0].GSq == 0.0) {   // calculate scalar product of LPsiDatA and LPsiDatB
        LocalSP += LPsiDatA[0].re*psi[0].re;
        k++;
      }
      for (; k < Lev->MaxG; k++) {
        LocalSP += 2*(LPsiDatA[k].re*psi[k].re+LPsiDatA[k].im*psi[k].im);
      }
      MPI_Allreduce ( &LocalSP, &PsiSP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // gather by summation results from the group sharing the coefficients
      if ((fabs(PsiSP -1.0) >= MYEPSILON) && (fabs(PsiSP) >= MYEPSILON)) {
          fprintf(stderr,"(%i)(%i,Psi) = %g ?= 1.0 or 0.0 eps(%g or %g >= %g)\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo,PsiSP, fabs(PsiSP -1.0), fabs(PsiSP), MYEPSILON);
          NotOk++;
      } else
          fprintf(stderr,"(%i)(%i,Psi) ok.\n",P->Par.my_color_comm_ST,OnePsiA->MyGlobalNo);
    }
  }
  if (P->Call.out[LeaderOut]) { // final check if there have been any un-orthogonal pairs
    if (NotOk == 0) {
      fprintf(stderr,"(%i)TestGramSchm: Ok !\n",P->Par.my_color_comm_ST);
    } else {
      fprintf(stderr,"(%i)TestGramSchm: There are %i pairs not orthogonal!\n",P->Par.my_color_comm_ST,NotOk);
    }
  }
}
