/** \file perturbed.c
 * Perturbation calculation due to external magnetic field.
 * 
 * Central function is MinimisePerturbed() wherein the actual minimisation of the two different operators with each
 * three components takes place subsequently. Helpful routines are CalculatePerturbationOperator_P() - which applies a
 * specified component of p on the current wave function - and CalculatePerturbationOperator_RxP() - which does the
 * same for the RxP operator. 
 * The actual minimisation loop FindPerturbedMinimum() depends on the same routines also used for the occupied orbitals,
 * however with a different energy functional and derivatives, evaluated in Calculate1stPerturbedDerivative() and 
 * Calculate2ndPerturbedDerivative(). InitPerturbedEnergyCalculation() calculates the total energy functional
 * perturbed in second order for all wave functions, UpdatePerturbedEnergyCalculation() just updates the one
 * for the wave function after it has been minimised during the line search. Both use CalculatePerturbedEnergy() which
 * evaluates the energy functional (and the gradient if specified).
 * Finally, FillCurrentDensity() evaluates the current density at a given point in space using the perturbed
 * wave functions. Afterwards by calling CalculateMagneticSusceptibility() or 
 * CalculateChemicalShieldingByReciprocalCurrentDensity() susceptibility respectively shielding tensor are possible uses
 * of this current density.
 * 
 * There are also some test routines: TestCurrent() checks whether the integrated current is zero in each component.
 * test_fft_symmetry() tests the "pulling out imaginary unit" before fourier transformation on a given wave function.
 * CheckOrbitalOverlap() outputs the overlap matrix for the wave functions of a given minimisation state, this might
 * be important for the additional \f$\Delta J{ij}\f$ contribution to the current density, which is non-zero for
 * non-zero mutual overlap, which is evaluated if FillDeltaCurrentDensity() is called.
 * 
 * Finally, there are also some smaller routines: truedist() gives the correct relative distance between two points
 * in the unit cell under periodic boundary conditions with minimum image convention. ApplyTotalHamiltonian() returns
 * the hamiltonian applied to a given wave function. sawtooth() is a sawtooth implementation which is needed in order
 * to avoid flipping of position eigenvalues for nodes close to or on the cell boundary. CalculateOverlap()
 * is used in the energy functional derivatives, keeping an overlap table between perturbed wave functions up to date.
 * fft_Psi() is very similar to CalculateOneDensityR(), it does the extension of the wave function to the upper level
 * RunStruct#Lev0 while fouriertransforming it to real space. cross() gives correct indices in evaluating a vector cross
 * product. AllocCurrentDensity() and DisAllocCurrentDensity() mark the current density arrays as currently being in use or not.
 * 
  Project: ParallelCarParrinello
 \author Frederik Heber
 \date 2006

*/

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_eigen.h>
#include <gsl/gsl_complex.h>
#include <gsl/gsl_complex_math.h>
#include <gsl/gsl_sort_vector.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_multimin.h>

#include "data.h"
#include "density.h"
#include "energy.h"
#include "excor.h"
#include "errors.h"
#include "grad.h"
#include "gramsch.h"
#include "mergesort2.h"
#include "helpers.h"
#include "init.h"
#include "myfft.h"
#include "mymath.h"
#include "output.h"
#include "pcp.h"
#include "perturbed.h"
#include "run.h"
#include "wannier.h"


/** Minimisation of the PsiTypeTag#Perturbed_RxP0, PsiTypeTag#Perturbed_P0 and other orbitals.
 * For each of the above PsiTypeTag we go through the following before the minimisation loop:
 * -# ResetGramSchTagType() resets current type that is to be minimised to NotOrthogonal.
 * -# UpdateActualPsiNo() steps on to next perturbed of current PsiTypeTag type.
 * -# GramSch() orthonormalizes perturbed wave functions.
 * -# TestGramSch() tests if orthonormality was achieved.
 * -# InitDensityCalculation() gathers densities from all wave functions (and all processes), within SpeedMeasure() DensityTime.
 * -# InitPerturbedEnergyCalculation() performs initial calculation of the perturbed energy functional.
 * -# RunStruct#OldActualLocalPsiNo is set to RunStruct#ActualLocalPsiNo, immediately followed by UpdateGramSchOldActualPsiNo()
 *    to bring info on all processes on par.
 * -# UpdatePerturbedEnergyCalculation() re-calculates Gradient and GradientTypes#H1cGradient for RunStruct#ActualLocalPsiNo
 * -# EnergyAllReduce() gathers various energy terms and sums up into Energy#TotalEnergy.
 * 
 * And during the minimisation loop:
 * -# FindPerturbedMinimum() performs the gradient conjugation, the line search and wave function update.
 * -# UpdateActualPsiNo() steps on to the next wave function, orthonormalizing by GramSch() if necessary.
 * -# UpdateEnergyArray() shifts TotalEnergy values to make space for new one.
 * -# There is no density update as the energy function does not depend on the changing perturbed density but only on the fixed
 *    unperturbed one.
 * -# UpdatePerturbedEnergyCalculation() re-calculates the perturbed energy of the changed wave function.
 * -# EnergyAllReduce() gathers energy terms and sums up.
 * -# CheckCPULIM() checks if external Stop signal has been given.
 * -# CalculateMinimumStop() checks whether we have dropped below a certain minimum change during minimisation of total energy.
 * -# finally step counters LatticeLevel#Step and SpeedStruct#Steps are increased.
 * 
 * After the minimisation loop:
 * -# SetGramSchExtraPsi() removes extra Psis from orthogonaliy check.
 * -# ResetGramSchTagType() sets GramSchToDoType to NotUsedtoOrtho.
 * 
 * And after all minimisation runs are done:
 * -# UpdateActualPsiNo() steps back to PsiTypeTag#Occupied type.
 * 
 * At the end we return to Occupied wave functions.
 * \param *P at hand
 * \param *Stop flag to determine if epsilon stop conditions have met
 * \param *SuperStop flag to determinte whether external signal's required end of calculations
 */
void MinimisePerturbed (struct Problem *P, int *Stop, int *SuperStop) {
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  int type, flag = 0;//,i;
  
  for (type=Perturbed_P0;type<=Perturbed_RxP2;type++) {  // go through each perturbation group separately //
    *Stop=0;   // reset stop flag
    if(P->Call.out[LeaderOut]) fprintf(stderr,"(%i)Beginning perturbed minimisation of type %s ...\n", P->Par.me, R->MinimisationName[type]);
    //OutputOrbitalPositions(P, Occupied);
    R->PsiStep = R->MaxPsiStep; // reset in-Psi-minimisation-counter, so that we really advance to the next wave function
    UpdateActualPsiNo(P, type); // step on to next perturbed one

    if(P->Call.out[MinOut]) fprintf(stderr, "(%i) Re-initializing perturbed psi array for type %s ", P->Par.me, R->MinimisationName[type]);
    if (P->Call.ReadSrcFiles && (flag = ReadSrcPsiDensity(P,type,1, R->LevS->LevelNo))) {// in flag store whether stored Psis are readible or not
      SpeedMeasure(P, InitSimTime, StartTimeDo);  
      if(P->Call.out[MinOut]) fprintf(stderr,"from source file of recent calculation\n");
      ReadSrcPsiDensity(P,type, 0, R->LevS->LevelNo);
      ResetGramSchTagType(P, Psi, type, IsOrthogonal); // loaded values are orthonormal
      SpeedMeasure(P, DensityTime, StartTimeDo);
      //InitDensityCalculation(P);
      SpeedMeasure(P, DensityTime, StopTimeDo);
      R->OldActualLocalPsiNo = R->ActualLocalPsiNo; // needed otherwise called routines in function below crash
      UpdateGramSchOldActualPsiNo(P,Psi);
      InitPerturbedEnergyCalculation(P, 1);  // go through all orbitals calculate each H^{(0)}-eigenvalue, recalc HGDensity, cause InitDensityCalc zero'd it
      UpdatePerturbedEnergyCalculation(P);  // H1cGradient and Gradient must be current ones
      EnergyAllReduce(P);   // gather energies for minimum search
      SpeedMeasure(P, InitSimTime, StopTimeDo);  
    }
    if ((P->Call.ReadSrcFiles != 1) || (!flag)) {  // read and don't minimise only if SrcPsi were parsable!
      SpeedMeasure(P, InitSimTime, StartTimeDo);  
      ResetGramSchTagType(P, Psi, type, NotOrthogonal); // perturbed now shall be orthonormalized
      if ((P->Call.ReadSrcFiles != 2) || (!flag)) {
        if (R->LevSNo == Lat->MaxLevel-1) { // is it the starting level? (see InitRunLevel())
          if(P->Call.out[MinOut]) fprintf(stderr, "randomly.\n");
          InitPsisValue(P, Psi->TypeStartIndex[type], Psi->TypeStartIndex[type+1]);  // initialize perturbed array for this run
        } else {
          if(P->Call.out[MinOut]) fprintf(stderr, "from source file of last level.\n");
          ReadSrcPerturbedPsis(P, type);
        }
      }
      SpeedMeasure(P, InitGramSchTime, StartTimeDo);  
      GramSch(P, R->LevS, Psi, Orthogonalize);
      SpeedMeasure(P, InitGramSchTime, StopTimeDo);  
      SpeedMeasure(P, InitDensityTime, StartTimeDo);
      //InitDensityCalculation(P);
      SpeedMeasure(P, InitDensityTime, StopTimeDo);
      InitPerturbedEnergyCalculation(P, 1);  // go through all orbitals calculate each H^{(0)}-eigenvalue, recalc HGDensity, cause InitDensityCalc zero'd it
      R->OldActualLocalPsiNo = R->ActualLocalPsiNo; // needed otherwise called routines in function below crash
      UpdateGramSchOldActualPsiNo(P,Psi);
      UpdatePerturbedEnergyCalculation(P);  // H1cGradient and Gradient must be current ones
      EnergyAllReduce(P);   // gather energies for minimum search
      SpeedMeasure(P, InitSimTime, StopTimeDo);  
      R->LevS->Step++;
      EnergyOutput(P,0);
      while (*Stop != 1) {
        //debug(P,"FindPerturbedMinimum");
        FindPerturbedMinimum(P);   // find minimum
        //debug(P,"UpdateActualPsiNo");
        UpdateActualPsiNo(P, type); // step on to next perturbed Psi
        //debug(P,"UpdateEnergyArray");
        UpdateEnergyArray(P); // shift energy values in their array by one
        //debug(P,"UpdatePerturbedEnergyCalculation");
        UpdatePerturbedEnergyCalculation(P);  // re-calc energies (which is hopefully lower)
        EnergyAllReduce(P); // gather from all processes and sum up to total energy
        //ControlNativeDensity(P);  // check total density (summed up PertMixed must be zero!)
        //printf ("(%i,%i,%i)S(%i,%i,%i):\t %5d %10.5f\n",P->Par.my_color_comm_ST,P->Par.me_comm_ST, P->Par.me_comm_ST_PsiT, R->MinStep, R->ActualLocalPsiNo, R->PsiStep, (int)iter, s_multi->f);
        if (*SuperStop != 1)
          *SuperStop = CheckCPULIM(P);
        *Stop = CalculateMinimumStop(P, *SuperStop);
        P->Speed.Steps++; // step on
        R->LevS->Step++; 
      }
      // now release normalization condition and minimize wrt to norm
      if(P->Call.out[MinOut]) fprintf(stderr,"(%i) Writing %s srcpsi to disk\n", P->Par.me, R->MinimisationName[type]);
      OutputSrcPsiDensity(P, type);
//      if (!TestReadnWriteSrcDensity(P,type))
//        Error(SomeError,"TestReadnWriteSrcDensity failed!");
    }

    TestGramSch(P,R->LevS,Psi, type); // functions are orthonormal?
    // calculate current density summands
    //if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Filling current density grid ...\n",P->Par.me);
    SpeedMeasure(P, CurrDensTime, StartTimeDo);
    if (*SuperStop != 1) {
      if ((R->DoFullCurrent == 1) || ((R->DoFullCurrent == 2) && (CheckOrbitalOverlap(P) == 1))) { //test to check whether orbitals have mutual overlap and thus \\DeltaJ_{xc} must not be dropped
        R->DoFullCurrent = 1; // set to 1 if it was 2 but Check...() yielded necessity
        //debug(P,"Filling with Delta j ...");
        //FillDeltaCurrentDensity(P);
      }// else
        //debug(P,"There is no overlap between orbitals.");
      //debug(P,"Filling with j ...");
      FillCurrentDensity(P);
    }
    SpeedMeasure(P, CurrDensTime, StopTimeDo);
    
    SetGramSchExtraPsi(P,Psi,NotUsedToOrtho); // remove extra Psis from orthogonality check
    ResetGramSchTagType(P, Psi, type, NotUsedToOrtho);  // remove this group from the check for the next minimisation group as well!
  }
  UpdateActualPsiNo(P, Occupied); // step on back to an occupied one
}

/** Tests overlap matrix between each pair of orbitals for non-diagonal form.
 * We simply check whether the overlap matrix Psis#lambda has off-diagonal entries greater MYEPSILON or not.
 * \param *P Problem at hand
 * \note The routine is meant as atest criteria if \f$\Delta J_[ij]\f$ contribution is necessary, as it is only non-zero if
 *       there is mutual overlap between the two orbitals.
 */
int CheckOrbitalOverlap(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  int i,j;
  int counter = 0;

  // output matrix
  if (P->Par.me == 0) fprintf(stderr, "(%i) S_ij =\n", P->Par.me);
  for (i=0;i<Psi->NoOfPsis;i++) {
    for (j=0;j<Psi->NoOfPsis;j++) {
      if (fabs(Psi->lambda[i][j]) > MYEPSILON) counter++;
      if (P->Par.me == 0) fprintf(stderr, "%e\t", Psi->lambda[i][j]); //Overlap[i][j]
    }
    if (P->Par.me == 0) fprintf(stderr, "\n");
  }

  fprintf(stderr, "(%i) CheckOverlap: %i overlaps found.\t", P->Par.me, counter);
  if (counter > 0) return (1);
  else return(0);
}

/** Initialization of perturbed energy.
 * For each local wave function of the current minimisation type RunStruct#CurrentMin it is called:
 * - CalculateNonLocalEnergyNoRT(): for the coefficient-dependent form factors
 * - CalculatePerturbedEnergy(): for the perturbed energy, yet without gradient calculation
 * - CalculateOverlap(): for the overlap between the perturbed wave functions of the current RunStruct#CurrentMin state.
 * 
 * Afterwards for the two types AllPsiEnergyTypes#Perturbed1_0Energy and AllPsiEnergyTypes#Perturbed0_1Energy the
 * energy contribution from each wave function is added up in Energy#AllLocalPsiEnergy.
 * \param *P Problem at hand
 * \param first state whether it is the first (1) or successive call (0), which avoids some initial calculations. 
 * \sa UpdatePerturbedEnergy()
 * \note Afterwards EnergyAllReduce() must be called.
 */
void InitPerturbedEnergyCalculation(struct Problem *P, const int first)
{
  struct Lattice *Lat = &(P->Lat);
  int p,i;
  const enum PsiTypeTag state = P->R.CurrentMin;
  for (p=Lat->Psi.TypeStartIndex[state]; p < Lat->Psi.TypeStartIndex[state+1]; p++) {
    //if (p < 0 || p >= Lat->Psi.LocalNo) Error(SomeError,"InitPerturbedEnergyCalculation: p out of range");
    //CalculateNonLocalEnergyNoRT(P, p); // recalculating non-local form factors which are coefficient dependent!
    CalculatePsiEnergy(P,p,1);
    CalculatePerturbedEnergy(P, p, 0, first);
    CalculateOverlap(P, p, state);
  }
  for (i=0; i<= Perturbed0_1Energy; i++) {
    Lat->E->AllLocalPsiEnergy[i] = 0.0;
    for (p=0; p < Lat->Psi.LocalNo; p++)
      if (P->Lat.Psi.LocalPsiStatus[p].PsiType == state)
        Lat->E->AllLocalPsiEnergy[i] += Lat->E->PsiEnergy[i][p];
  }
}


/** Updating of perturbed energy.
 * For current and former (if not the same) local wave function RunStruct#ActualLocal, RunStruct#OldActualLocalPsiNo it is called:
 * - CalculateNonLocalEnergyNoRT(): for the form factors
 * - CalculatePerturbedEnergy(): for the perturbed energy, gradient only for RunStruct#ActualLocal
 * - CalculatePerturbedOverlap(): for the overlap between the perturbed wave functions
 * 
 * Afterwards for the two types AllPsiEnergyTypes#Perturbed1_0Energy and AllPsiEnergyTypes#Perturbed0_1Energy the
 * energy contribution from each wave function is added up in Energy#AllLocalPsiEnergy.
 * \param *P Problem at hand
 * \sa CalculatePerturbedEnergy() called from here.
 * \note Afterwards EnergyAllReduce() must be called.
 */
void UpdatePerturbedEnergyCalculation(struct Problem *P)
{
  struct Lattice *Lat = &(P->Lat);
  struct Psis *Psi = &Lat->Psi;
  struct RunStruct *R = &P->R;
  const enum PsiTypeTag state = R->CurrentMin;
  int p = R->ActualLocalPsiNo;
  const int p_old = R->OldActualLocalPsiNo;
  int i;
  
  if (p != p_old) {
    //if (p_old < 0 || p_old >= Lat->Psi.LocalNo) Error(SomeError,"UpdatePerturbedEnergyCalculation: p_old out of range");
    //CalculateNonLocalEnergyNoRT(P, p_old);
    CalculatePsiEnergy(P,p_old,0);
    CalculatePerturbedEnergy(P, p_old, 0, 0);
    CalculateOverlap(P, p_old, state);
  }
  //if (p < 0 || p >= Lat->Psi.LocalNo) Error(SomeError,"InitPerturbedEnergyCalculation: p out of range");
  // recalculating non-local form factors which are coefficient dependent!
  //CalculateNonLocalEnergyNoRT(P,p);
  CalculatePsiEnergy(P,p,0);
  CalculatePerturbedEnergy(P, p, 1, 0);
  CalculateOverlap(P, p, state);
  
  for (i=0; i<= Perturbed0_1Energy; i++) {
    Lat->E->AllLocalPsiEnergy[i] = 0.0;
    for (p=0; p < Psi->LocalNo; p++)
      if (Psi->LocalPsiStatus[p].PsiType == state)
        Lat->E->AllLocalPsiEnergy[i] += Lat->E->PsiEnergy[i][p];
  }
}

/** Calculates gradient and evaluates second order perturbed energy functional for specific wave function.
 * The in second order perturbed energy functional reads as follows.
 * \f[
 * E^{(2)} = \sum_{kl} \langle \varphi_k^{(1)} | H^{(0)} \delta_{kl} - \lambda_{kl} | \varphi_l^{(1)} \rangle
 *  + \underbrace{\langle \varphi_l^{(0)} | H^{(1)} | \varphi_l^{(1)} \rangle + \langle \varphi_l^{(1)} | H^{(1)} | \varphi_l^{(0)} \rangle}_{2 {\cal R} \langle \varphi_l^{(1)} | H^{(1)} | \varphi_l^{(0)} \rangle}
 * \f]
 * And the gradient
 * \f[
 *  \widetilde{\varphi}_k^{(1)} = - \sum_l ({\cal H}^{(0)} \delta_{kl} - \lambda_{kl} | \varphi_l^{(1)} \rangle + {\cal H}^{(1)} | \varphi_k^{(0)} \rangle
 * \f]
 * First, the HGDensity is recalculated if \a first says so - see ApplyTotalHamiltonian().
 *
 * Next, we need the perturbation hamiltonian acting on both the respective occupied and current wave function,
 * see perturbed.c for respective function calls.
 * 
 * Finally, the scalar product between the wave function and Hc_Gradient yields the eigenvalue of the hamiltonian,
 * which is summed up over all reciprocal grid vectors and stored in OnePsiElementAddData#Lambda. The Gradient is
 * the inverse of Hc_Gradient and with the following summation over all perturbed wave functions (MPI exchange of
 * non-local coefficients) the gradient is computed. Here we need Psis#lambda, which is computed in CalculateHamiltonian().
 * 
 * Also \f${\cal H}^{(1)} | \varphi_l^{(0)} \rangle\f$ is stored in GradientTypes#H1cGradient.
 * \param *P Problem at hand, contains RunStruct, Lattice, LatticeLevel RunStruct#LevS
 * \param l offset of perturbed wave function within Psi#LocalPsiStatus (\f$\varphi_l^{(1)}\f$)
 * \param DoGradient (1 = yes, 0 = no) whether gradient shall be calculated or not
 * \param first recaculate HGDensity (1) or not (0)
 * \note DensityTypes#ActualPsiDensity must be recent for gradient calculation!
 * \sa CalculateGradientNoRT() - same procedure for evaluation of \f${\cal H}^{(0)}| \varphi_l^{(1)} \rangle\f$
 * \note without the simplification of \f$2 {\cal R} \langle \varphi_l^{(1)} | H^{(1)} | \varphi_l^{(0)} \rangle\f$ the
 *       calculation would be impossible due to non-local nature of perturbed wave functions. The position operator would
 *       be impossible to apply in a sensible manner.
 */
void CalculatePerturbedEnergy(struct Problem *P, const int l, const int DoGradient, const int first) 
{
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  struct Energy *E = Lat->E;
  struct PseudoPot *PP = &P->PP;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  const int state = R->CurrentMin;
  const int l_normal = Psi->TypeStartIndex[Occupied] + (l - Psi->TypeStartIndex[state]);  // offset l to \varphi_l^{(0)}
  const int ActNum = l - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[1] * Psi->LocalPsiStatus[l].my_color_comm_ST_Psi;
  int g, i, m, j;
  double lambda, Lambda;
  double RElambda10, RELambda10;
  const fftw_complex *source = LevS->LPsi->LocalPsi[l];
  fftw_complex *grad = P->Grad.GradientArray[ActualGradient];
  fftw_complex *Hc_grad = P->Grad.GradientArray[HcGradient];
  fftw_complex *H1c_grad = P->Grad.GradientArray[H1cGradient];
  fftw_complex *TempPsi_0 = H1c_grad;
  fftw_complex *varphi_1, *varphi_0;
  struct OnePsiElement *OnePsiB, *LOnePsiB;
  fftw_complex *LPsiDatB=NULL;
  const int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  int RecvSource;
  MPI_Status status;
  
  // ============ Calculate H^(0) psi^(1) =============================
  //if (Hc_grad != P->Grad.GradientArray[HcGradient]) Error(SomeError,"CalculatePerturbedEnergy: Hc_grad corrupted");
  SetArrayToDouble0((double *)Hc_grad,2*R->InitLevS->MaxG);
  ApplyTotalHamiltonian(P,source,Hc_grad, PP->fnl[l], 1, first);
    
  // ============ ENERGY FUNCTIONAL Evaluation  PART 1 ================  
  //if (l_normal < 0 || l_normal >= Psi->LocalNo) Error(SomeError,"CalculatePerturbedEnergy: l_normal out of range");
  varphi_0 = LevS->LPsi->LocalPsi[l_normal];
  //if (l < 0 || l >= Psi->LocalNo) Error(SomeError,"CalculatePerturbedEnergy: l out of range");
  varphi_1 = LevS->LPsi->LocalPsi[l];
  //if (TempPsi_0 != P->Grad.GradientArray[H1cGradient]) Error(SomeError,"CalculatePerturbedEnergy: TempPsi_0 corrupted");
  SetArrayToDouble0((double *)TempPsi_0,2*R->InitLevS->MaxG);
  switch (state) {
    case Perturbed_P0:
      CalculatePerturbationOperator_P(P,varphi_0,TempPsi_0,0); //  \nabla_0 | \varphi_l^{(0)} \rangle
      break;
    case Perturbed_P1:
      CalculatePerturbationOperator_P(P,varphi_0,TempPsi_0,1); //  \nabla_1 | \varphi_l^{(0)} \rangle
      break;
    case Perturbed_P2:
      CalculatePerturbationOperator_P(P,varphi_0,TempPsi_0,2); //  \nabla_1 | \varphi_l^{(0)} \rangle
      break;
    case Perturbed_RxP0:
      CalculatePerturbationOperator_RxP(P,varphi_0,TempPsi_0,l_normal,0); //  r \times \nabla | \varphi_l^{(0)} \rangle
      break;
    case Perturbed_RxP1:
      CalculatePerturbationOperator_RxP(P,varphi_0,TempPsi_0,l_normal,1); //  r \times \nabla | \varphi_l^{(0)} \rangle
      break;
    case Perturbed_RxP2:
      CalculatePerturbationOperator_RxP(P,varphi_0,TempPsi_0,l_normal,2); //  r \times \nabla | \varphi_l^{(0)} \rangle
      break;
    default:
      fprintf(stderr,"(%i) CalculatePerturbedEnergy called whilst not within perturbation run: CurrentMin = %i !\n",P->Par.me, R->CurrentMin);
      break;
  }

  // ============ GRADIENT and EIGENVALUE Evaluation  Part 1==============
  lambda = 0.0;
  if ((DoGradient) && (grad != NULL)) {
    g = 0;
    if (LevS->GArray[0].GSq == 0.0) {
      lambda += Hc_grad[0].re*source[0].re;
      //if (grad != P->Grad.GradientArray[ActualGradient]) Error(SomeError,"CalculatePerturbedEnergy: grad corrupted");
      grad[0].re = -(Hc_grad[0].re + TempPsi_0[0].re);
      grad[0].im = -(Hc_grad[0].im + TempPsi_0[0].im);
      g++;
    }
    for (;g<LevS->MaxG;g++) {
      lambda += 2.*(Hc_grad[g].re*source[g].re + Hc_grad[g].im*source[g].im);
      //if (grad != P->Grad.GradientArray[ActualGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"CalculatePerturbedEnergy: grad corrupted");
      grad[g].re = -(Hc_grad[g].re + TempPsi_0[g].re);
      grad[g].im = -(Hc_grad[g].im + TempPsi_0[g].im);
    }

    m = -1;
    for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
      OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
      if (OnePsiB->PsiType == state) {   // drop all but the ones of current min state
        m++;  // increase m if it is type-specific wave function
        if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
           LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
        else
           LOnePsiB = NULL;
        if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
          RecvSource = OnePsiB->my_color_comm_ST_Psi;
          MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, PerturbedTag, P->Par.comm_ST_PsiT, &status );
          LPsiDatB=LevS->LPsi->TempPsi;
        } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
          for (i=0;i<P->Par.Max_me_comm_ST_PsiT;i++)
            if (i != OnePsiB->my_color_comm_ST_Psi)
              MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, i, PerturbedTag, P->Par.comm_ST_PsiT);
          LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
        } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
  
        g = 0;
        if (LevS->GArray[0].GSq == 0.0) { // perform the summation
          //if (grad != P->Grad.GradientArray[ActualGradient]) Error(SomeError,"CalculatePerturbedEnergy: grad corrupted");
          grad[0].re += Lat->Psi.lambda[ActNum][m]*LPsiDatB[0].re;
          grad[0].im += Lat->Psi.lambda[ActNum][m]*LPsiDatB[0].im;
          g++;
        }
        for (;g<LevS->MaxG;g++) {
          //if (grad != P->Grad.GradientArray[ActualGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"CalculatePerturbedEnergy: grad corrupted");
          grad[g].re += Lat->Psi.lambda[ActNum][m]*LPsiDatB[g].re;
          grad[g].im += Lat->Psi.lambda[ActNum][m]*LPsiDatB[g].im;
        }
      }
    }
  } else {
    lambda = GradSP(P,LevS,Hc_grad,source);
  }
  MPI_Allreduce ( &lambda, &Lambda, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  //fprintf(stderr,"(%i) Lambda[%i] = %lg\n",P->Par.me, l, Lambda);
  //if (l < 0 || l >= Psi->LocalNo) Error(SomeError,"CalculatePerturbedEnergy: l out of range");
  Lat->Psi.AddData[l].Lambda = Lambda; 

  // ============ ENERGY FUNCTIONAL Evaluation  PART 2 ================
  // varphi_1 jas negative symmetry, returning TempPsi_0 from CalculatePerturbedOperator also, thus real part of scalar product
  // "-" due to purely imaginary wave function is on left hand side, thus becomes complex conjugated: i -> -i
  // (-i goes into pert. op., "-" remains when on right hand side)  
  RElambda10 =  GradSP(P,LevS,varphi_1,TempPsi_0) * sqrt(Psi->LocalPsiStatus[l].PsiFactor * Psi->LocalPsiStatus[l_normal].PsiFactor);
  //RElambda01 = -GradSP(P,LevS,varphi_0,TempPsi_1) * sqrt(Psi->LocalPsiStatus[l].PsiFactor * Psi->LocalPsiStatus[l_normal].PsiFactor);
  
  MPI_Allreduce ( &RElambda10, &RELambda10, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  //MPI_Allreduce ( &RElambda01, &RELambda01, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  
  //if (l < 0 || l >= Psi->LocalNo) Error(SomeError,"CalculatePerturbedEnergy: l out of range");
  E->PsiEnergy[Perturbed1_0Energy][l] = RELambda10;
  E->PsiEnergy[Perturbed0_1Energy][l] = RELambda10;
//  if (P->Par.me == 0) {
//    fprintf(stderr,"RE.Lambda10[%i-%i] = %lg\t RE.Lambda01[%i-%i] = %lg\n", l, l_normal, RELambda10, l_normal, l, RELambda01);
//  }
  // GradImSP() is only applicable to a product of wave functions with uneven symmetry!
  // Otherwise, due to the nature of symmetry, a sum over only half of the coefficients will in most cases not result in zero!
}

/** Applies \f$H^{(0)}\f$ to a given \a source.
 * The DensityTypes#HGDensity is computed, the exchange potential added and the
 * whole multiplied - coefficient by coefficient - with the current wave function, taken from its density coefficients,
 * on the upper LatticeLevel (RunStruct#Lev0), which (DensityTypes#ActualPsiDensity) is updated beforehand.
 * After an inverse fft (now G-dependent) the non-local potential is added and
 * within the reciprocal basis set, the kinetic energy can be evaluated easily. 
 * \param *P Problem at hand
 * \param *source pointer to source coefficient array, \f$| \varphi(G) \rangle\f$
 * \param *dest pointer to dest coefficient array,\f$H^{(0)} | \varphi(G) \rangle\f$
 * \param **fnl pointer to non-local form factor array
 * \param PsiFactor occupation number of orbital
 * \param first 1 - Re-calculate DensityTypes#HGDensity, 0 - don't 
 * \sa CalculateConDirHConDir() - same procedure
 */
void ApplyTotalHamiltonian(struct Problem *P, const fftw_complex *source, fftw_complex *dest, fftw_complex ***fnl, const double PsiFactor, const int first) {
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = Lev0->Dens;
  struct fft_plan_3d *plan = Lat->plan; 
  struct PseudoPot *PP = &P->PP;
  struct Ions *I = &P->Ion;
  fftw_complex *work = Dens0->DensityCArray[TempDensity];
  fftw_real *HGcR = Dens0->DensityArray[HGcDensity];
  fftw_complex *HGcRC = (fftw_complex*)HGcR;
  fftw_complex *HGC = Dens0->DensityCArray[HGDensity];
  fftw_real *HGCR = (fftw_real *)HGC;
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *)PsiC;
  //const fftw_complex *dest_bak = dest;
  int nx,ny,nz,iS,i0;
  const int Nx = LevS->Plan0.plan->local_nx;
  const int Ny = LevS->Plan0.plan->N[1];
  const int Nz = LevS->Plan0.plan->N[2];
  const int NUpx = LevS->NUp[0];
  const int NUpy = LevS->NUp[1];
  const int NUpz = LevS->NUp[2];
  const double HGcRCFactor = 1./LevS->MaxN;
  int g, Index, i, it;
  fftw_complex vp,rp,rhog,TotalPsiDensity;
  double Fac;

  if (first) {
    // recalculate HGDensity
    //if (HGC != Dens0->DensityCArray[HGDensity]) Error(SomeError,"ApplyTotalHamiltonian: HGC corrupted");
    SetArrayToDouble0((double *)HGC,2*Dens0->TotalSize);
    g=0;
    if (Lev0->GArray[0].GSq == 0.0) {
      Index = Lev0->GArray[0].Index;
      c_re(vp) = 0.0;
      c_im(vp) = 0.0;
      for (it = 0; it < I->Max_Types; it++) {
        c_re(vp) += (c_re(I->I[it].SFactor[0])*PP->phi_ps_loc[it][0]); 
        c_im(vp) += (c_im(I->I[it].SFactor[0])*PP->phi_ps_loc[it][0]);
      }
      //if (HGC != Dens0->DensityCArray[HGDensity] || Index<0 || Index>=Dens0->LocalSizeC) Error(SomeError,"ApplyTotalHamiltonian: HGC corrupted");
      c_re(HGC[Index]) = c_re(vp);
      c_re(TotalPsiDensity) = c_re(Dens0->DensityCArray[TotalDensity][Index]);
      c_im(TotalPsiDensity) = c_im(Dens0->DensityCArray[TotalDensity][Index]);
  
      g++;
    }
    for (; g < Lev0->MaxG; g++) {
      Index = Lev0->GArray[g].Index;
      Fac = 4.*PI/(Lev0->GArray[g].GSq);
      c_re(vp) = 0.0;
      c_im(vp) = 0.0;
      c_re(rp) = 0.0;
      c_im(rp) = 0.0;
      for (it = 0; it < I->Max_Types; it++) {
        c_re(vp) += (c_re(I->I[it].SFactor[g])*PP->phi_ps_loc[it][g]); 
        c_im(vp) += (c_im(I->I[it].SFactor[g])*PP->phi_ps_loc[it][g]);
        c_re(rp) += (c_re(I->I[it].SFactor[g])*PP->FacGauss[it][g]); 
        c_im(rp) += (c_im(I->I[it].SFactor[g])*PP->FacGauss[it][g]);
      } // rp = n^{Gauss)(G)
  
      // n^{tot} = n^0 + \lambda n^1 + ...
      //if (isnan(c_re(Dens0->DensityCArray[TotalDensity][Index]))) { fprintf(stderr,"(%i) WARNING in CalculatePerturbedEnergy(): TotalDensity[%i] = NaN!\n", P->Par.me, Index); Error(SomeError, "NaN-Fehler!"); }
      c_re(TotalPsiDensity) = c_re(Dens0->DensityCArray[TotalDensity][Index]);
      c_im(TotalPsiDensity) = c_im(Dens0->DensityCArray[TotalDensity][Index]);
  
      c_re(rhog) = c_re(TotalPsiDensity)*R->HGcFactor+c_re(rp);
      c_im(rhog) = c_im(TotalPsiDensity)*R->HGcFactor+c_im(rp);
      // rhog = n(G) + n^{Gauss}(G), rhoe = n(G)
      //if (HGC != Dens0->DensityCArray[HGDensity] || Index<0 || Index>=Dens0->LocalSizeC) Error(SomeError,"ApplyTotalHamiltonian: HGC corrupted");
      c_re(HGC[Index]) = c_re(vp)+Fac*c_re(rhog);
      c_im(HGC[Index]) = c_im(vp)+Fac*c_im(rhog);
    }
    //
    for (i=0; i<Lev0->MaxDoubleG; i++) {
      //if (HGC != Dens0->DensityCArray[HGDensity] || Lev0->DoubleG[2*i+1]<0 || Lev0->DoubleG[2*i+1]>Dens0->LocalSizeC || Lev0->DoubleG[2*i]<0 || Lev0->DoubleG[2*i]>Dens0->LocalSizeC) Error(SomeError,"CalculatePerturbedEnergy: grad corrupted");
      HGC[Lev0->DoubleG[2*i+1]].re =  HGC[Lev0->DoubleG[2*i]].re;
      HGC[Lev0->DoubleG[2*i+1]].im = -HGC[Lev0->DoubleG[2*i]].im;
    }
  }
  // ============ GRADIENT and EIGENVALUE Evaluation  Part 1==============
  // \lambda_l^{(1)} = \langle \varphi_l^{(1)} | H^{(0)} | \varphi_l^{(1)} \rangle and gradient calculation
  SpeedMeasure(P, LocTime, StartTimeDo);
  // back-transform HGDensity: (G) -> (R)
  //if (HGC != Dens0->DensityCArray[HGDensity]) Error(SomeError,"ApplyTotalHamiltonian: HGC corrupted");
  if (first) fft_3d_complex_to_real(plan, Lev0->LevelNo, FFTNF1, HGC, work);
  // evaluate exchange potential with this density, add up onto HGCR
  //if (HGCR != (fftw_real *)Dens0->DensityCArray[HGDensity]) Error(SomeError,"ApplyTotalHamiltonian: HGCR corrupted");
  if (first) CalculateXCPotentialNoRT(P, HGCR); // add V^{xc} on V^H + V^{ps}
  // make sure that ActualPsiDensity is recent
  CalculateOneDensityR(Lat, LevS, Dens0, source, Dens0->DensityArray[ActualDensity], R->FactorDensityR*PsiFactor, 1);
  for (nx=0;nx<Nx;nx++)  
    for (ny=0;ny<Ny;ny++) 
      for (nz=0;nz<Nz;nz++) {
        i0 = nz*NUpz+Nz*NUpz*(ny*NUpy+Ny*NUpy*nx*NUpx);
        iS = nz+Nz*(ny+Ny*nx);
        //if (HGcR != Dens0->DensityArray[HGcDensity] || iS<0 || iS>=LevS->Dens->LocalSizeR) Error(SomeError,"ApplyTotalHamiltonian: HGC corrupted");
        HGcR[iS] = HGCR[i0]*PsiCR[i0]; /* Matrix Vector Mult */
      }
  // (R) -> (G)
  //if (HGcRC != (fftw_complex *)Dens0->DensityArray[HGcDensity]) Error(SomeError,"ApplyTotalHamiltonian: HGcRC corrupted");
  fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC, work);
  SpeedMeasure(P, LocTime, StopTimeDo);
  /* NonLocalPP */
  SpeedMeasure(P, NonLocTime, StartTimeDo);
  //if (dest != dest_bak) Error(SomeError,"ApplyTotalHamiltonian: dest corrupted");
  CalculateAddNLPot(P, dest, fnl, PsiFactor);  // wave function hidden in form factors fnl, also resets Hc_grad beforehand
  SpeedMeasure(P, NonLocTime, StopTimeDo);
  
  /* create final vector */
  for (g=0;g<LevS->MaxG;g++) {
    Index = LevS->GArray[g].Index; /* FIXME - factoren */
    //if (dest != dest_bak || g<0 || g>=LevS->MaxG) Error(SomeError,"ApplyTotalHamiltonian: dest corrupted");
    dest[g].re += PsiFactor*(HGcRC[Index].re*HGcRCFactor + 0.5*LevS->GArray[g].GSq*source[g].re);
    dest[g].im += PsiFactor*(HGcRC[Index].im*HGcRCFactor + 0.5*LevS->GArray[g].GSq*source[g].im);
  }
}

#define stay_above 0.001 //!< value above which the coefficient of the wave function will always remain

/** Finds the minimum of perturbed energy in regards of actual wave function.
 * The following happens step by step:
 * -# The Gradient is copied into GradientTypes#GraSchGradient (which is nothing but a pointer to
 *    one array in LPsiDat) and orthonormalized via GramSch() to all occupied wave functions
 *    except to the current perturbed one.
 * -# Then comes pre-conditioning, analogous to CalculatePreConGrad().
 * -# The Gradient is projected onto the current perturbed wave function and this is subtracted, i.e.
 *    vector is the conjugate gradient.
 * -# Finally, Calculate1stPerturbedDerivative() and Calculate2ndPerturbedDerivative() are called and
 *    with these results and the current total energy, CalculateDeltaI() finds the parameter for the one-
 *    dimensional minimisation. The current wave function is set to newly found minimum and approximated
 *    total energy is printed.
 * 
 * \param *P Problem at hand
 * \sa CalculateNewWave() and functions therein
 */
void FindPerturbedMinimum(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi; 
  struct PseudoPot *PP = &P->PP;
  struct LatticeLevel *LevS = R->LevS;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens = Lev0->Dens;
  struct Energy *En = Lat->E;
  struct FileData *F = &P->Files;
  int g,p,i;
  int step = R->PsiStep;
  double *GammaDiv = &Lat->Psi.AddData[R->ActualLocalPsiNo].Gamma;
  const int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  fftw_complex *source = LevS->LPsi->LocalPsi[R->ActualLocalPsiNo];
  fftw_complex *grad = P->Grad.GradientArray[ActualGradient];
  fftw_complex *GradOrtho = P->Grad.GradientArray[GraSchGradient];
  fftw_complex *PCgrad = P->Grad.GradientArray[PreConGradient];
  fftw_complex *PCOrtho = P->Grad.GradientArray[GraSchGradient];
  fftw_complex *ConDir = P->Grad.GradientArray[ConDirGradient];
  fftw_complex *ConDir_old = P->Grad.GradientArray[OldConDirGradient];
  fftw_complex *Ortho = P->Grad.GradientArray[GraSchGradient];
  const fftw_complex *Hc_grad = P->Grad.GradientArray[HcGradient];
  const fftw_complex *H1c_grad = P->Grad.GradientArray[H1cGradient];
  fftw_complex *HConDir = Dens->DensityCArray[ActualDensity];
  const double PsiFactor = Lat->Psi.LocalPsiStatus[R->ActualLocalPsiNo].PsiFactor;
  //double Lambda = Lat->Psi.AddData[R->ActualLocalPsiNo].Lambda;
  double T;
  double x, K; //, dK;
  double dS[2], S[2], Gamma, GammaDivOld = *GammaDiv;
  double LocalSP, PsiSP;
  double dEdt0, ddEddt0, ConDirHConDir, ConDirConDir;//, sourceHsource;
  double E0, E, delta;
  //double E0, E, dE, ddE, delta, dcos, dsin;
  //double EI, dEI, ddEI, deltaI, dcosI, dsinI;
  //double HartreeddEddt0, XCddEddt0;
  double d[4],D[4], Diff;
  const int Num = Psi->NoOfPsis;
  
  // ORTHOGONALIZED-GRADIENT
  for (g=0;g<LevS->MaxG;g++) {
    //if (GradOrtho != P->Grad.GradientArray[GraSchGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: GradOrtho corrupted");
    GradOrtho[g].re = grad[g].re; //+Lambda*source[g].re;
    GradOrtho[g].im = grad[g].im; //+Lambda*source[g].im;
  }
  // include the ExtraPsi (which is the GraSchGradient!)
  SetGramSchExtraPsi(P, Psi, NotOrthogonal);
  // exclude the minimised Psi
  SetGramSchActualPsi(P, Psi, NotUsedToOrtho);
  SpeedMeasure(P, GramSchTime, StartTimeDo);
  // makes conjugate gradient orthogonal to all other orbits
  //fprintf(stderr,"CalculateCGGradient: GramSch() for extra orbital\n");
  GramSch(P, LevS, Psi, Orthogonalize);
  SpeedMeasure(P, GramSchTime, StopTimeDo);
  //if (grad != P->Grad.GradientArray[ActualGradient]) Error(SomeError,"FindPerturbedMinimum: grad corrupted");
  memcpy(grad, GradOrtho, ElementSize*LevS->MaxG*sizeof(double));
  //memcpy(PCOrtho, GradOrtho, ElementSize*LevS->MaxG*sizeof(double));

  // PRE-CONDITION-GRADIENT
  //if (fabs(T) < MYEPSILON) T = 1;
  T = 0.;
  for (i=0;i<Num;i++)
    T += Psi->lambda[i][i];
  for (g=0;g<LevS->MaxG;g++) {
    x = .5*LevS->GArray[g].GSq;
    // FIXME: Good way of accessing reciprocal Lev0 Density coefficients on LevS! (not so trivial)  
    //x += sqrt(Dens->DensityCArray[HGDensity][g].re*Dens->DensityCArray[HGDensity][g].re+Dens->DensityCArray[HGDensity][g].im*Dens->DensityCArray[HGDensity][g].im);
    x -= T/(double)Num;
    K = x/(x*x+stay_above*stay_above);
    //if (PCOrtho != P->Grad.GradientArray[GraSchGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: PCOrtho corrupted");
    c_re(PCOrtho[g]) = K*c_re(grad[g]);
    c_im(PCOrtho[g]) = K*c_im(grad[g]);  
  }
  SetGramSchExtraPsi(P, Psi, NotOrthogonal);
  SpeedMeasure(P, GramSchTime, StartTimeDo);
  // preconditioned direction is orthogonalized
  //fprintf(stderr,"CalculatePreConGrad: GramSch() for extra orbital\n");
  GramSch(P, LevS, Psi, Orthogonalize);
  SpeedMeasure(P, GramSchTime, StopTimeDo);
  //if (PCgrad != P->Grad.GradientArray[PreConGradient]) Error(SomeError,"FindPerturbedMinimum: PCgrad corrupted");
  memcpy(PCgrad, PCOrtho, ElementSize*LevS->MaxG*sizeof(double));

  //debug(P, "Before ConDir");
  //fprintf(stderr,"|(%i)|^2 = %lg\t |PCgrad|^2 = %lg\t |PCgrad,(%i)| = %lg\n", R->ActualLocalPsiNo, GradSP(P,LevS,source,source),GradSP(P,LevS,PCgrad,PCgrad), R->ActualLocalPsiNo, GradSP(P,LevS,PCgrad,source));
  // CONJUGATE-GRADIENT
  LocalSP = GradSP(P, LevS, PCgrad, grad);
  MPI_Allreduce ( &LocalSP, &PsiSP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  *GammaDiv = dS[0] = PsiSP;
  dS[1] = GammaDivOld;
  S[0]=dS[0]; S[1]=dS[1];
  /*MPI_Allreduce ( dS, S, 2, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);*/
  if (step) { // only in later steps is the scalar product used, but always condir stored in oldcondir and Ortho (working gradient)
    if (fabs(S[1]) < MYEPSILON) fprintf(stderr,"CalculateConDir: S[1] = %lg\n",S[1]);
    Gamma = S[0]/S[1];
    if (fabs(S[1]) < MYEPSILON) {
      if (fabs(S[0]) < MYEPSILON)  
        Gamma = 1.0;
      else
        Gamma = 0.0;
    }
    for (g=0; g < LevS->MaxG; g++) {
      //if (ConDir != P->Grad.GradientArray[ConDirGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: ConDir corrupted");
      c_re(ConDir[g]) = c_re(PCgrad[g]) + Gamma*c_re(ConDir_old[g]);
      c_im(ConDir[g]) = c_im(PCgrad[g]) + Gamma*c_im(ConDir_old[g]);
      //if (ConDir_old != P->Grad.GradientArray[OldConDirGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: ConDir_old corrupted");
      c_re(ConDir_old[g]) = c_re(ConDir[g]);
      c_im(ConDir_old[g]) = c_im(ConDir[g]);
      //if (Ortho != P->Grad.GradientArray[GraSchGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: Ortho corrupted");
      c_re(Ortho[g]) = c_re(ConDir[g]);
      c_im(Ortho[g]) = c_im(ConDir[g]);
    }
  } else {
    Gamma = 0.0;
    for (g=0; g < LevS->MaxG; g++) {
      //if (ConDir != P->Grad.GradientArray[ConDirGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: ConDir corrupted");
      c_re(ConDir[g]) = c_re(PCgrad[g]);
      c_im(ConDir[g]) = c_im(PCgrad[g]);
      //if (ConDir_old != P->Grad.GradientArray[OldConDirGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: ConDir_old corrupted");
      c_re(ConDir_old[g]) = c_re(ConDir[g]);
      c_im(ConDir_old[g]) = c_im(ConDir[g]);
      //if (Ortho != P->Grad.GradientArray[GraSchGradient] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: Ortho corrupted");
      c_re(Ortho[g]) = c_re(ConDir[g]);
      c_im(Ortho[g]) = c_im(ConDir[g]);
    }
  }
  // orthonormalize
  SetGramSchExtraPsi(P, Psi, NotOrthogonal);
  SpeedMeasure(P, GramSchTime, StartTimeDo);
  //fprintf(stderr,"CalculateConDir: GramSch() for extra orbital\n");
  GramSch(P, LevS, Psi, Orthogonalize);
  SpeedMeasure(P, GramSchTime, StopTimeDo);
  //if (ConDir != P->Grad.GradientArray[ConDirGradient]) Error(SomeError,"FindPerturbedMinimum: ConDir corrupted");
  memcpy(ConDir, Ortho, ElementSize*LevS->MaxG*sizeof(double));
  //debug(P, "Before LineSearch");
  //fprintf(stderr,"|(%i)|^2 = %lg\t |ConDir|^2 = %lg\t |ConDir,(%i)| = %lg\n", R->ActualLocalPsiNo, GradSP(P,LevS,source,source),GradSP(P,LevS,ConDir,ConDir), R->ActualLocalPsiNo, GradSP(P,LevS,ConDir,source));
  SetGramSchActualPsi(P, Psi, IsOrthogonal);
    
  //fprintf(stderr,"(%i) Testing conjugate gradient for Orthogonality ...\n", P->Par.me);
  //TestForOrth(P,LevS,ConDir);

  // ONE-DIMENSIONAL LINE-SEARCH

  // ========= dE / dt | 0 ============
  p = Lat->Psi.TypeStartIndex[Occupied] + (R->ActualLocalPsiNo - Lat->Psi.TypeStartIndex[R->CurrentMin]);
  //if (Hc_grad != P->Grad.GradientArray[HcGradient]) Error(SomeError,"FindPerturbedMinimum: Hc_grad corrupted");
  //if (H1c_grad != P->Grad.GradientArray[H1cGradient]) Error(SomeError,"FindPerturbedMinimum: H1c_grad corrupted");
  d[0] = Calculate1stPerturbedDerivative(P, LevS->LPsi->LocalPsi[p], source, ConDir, Hc_grad, H1c_grad);
  //CalculateConDirHConDir(P, ConDir, PsiFactor, &d[1], &d[2], &d[3]);
  //if (ConDir != P->Grad.GradientArray[ConDirGradient]) Error(SomeError,"FindPerturbedMinimum: ConDir corrupted");
  CalculateCDfnl(P, ConDir, PP->CDfnl); // calculate needed non-local form factors
  //if (HConDir != Dens->DensityCArray[ActualDensity]) Error(SomeError,"FindPerturbedMinimum: HConDir corrupted");
  SetArrayToDouble0((double *)HConDir,Dens->TotalSize*2);
  //if (ConDir != P->Grad.GradientArray[ConDirGradient]) Error(SomeError,"FindPerturbedMinimum: ConDir corrupted");
  ApplyTotalHamiltonian(P,ConDir,HConDir, PP->CDfnl, PsiFactor, 0); // applies H^(0) with total perturbed density!
  d[1] = GradSP(P,LevS,ConDir,HConDir);
  d[2] = GradSP(P,LevS,ConDir,ConDir);
  d[3] = 0.;

  // gather results  
  MPI_Allreduce ( &d, &D, 4, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  // ========== ddE / ddt | 0 =========
  dEdt0 = D[0];
  for (i=MAXOLD-1; i > 0; i--)
    En->dEdt0[i] = En->dEdt0[i-1];
  En->dEdt0[0] = dEdt0;
  ConDirHConDir = D[1];
  ConDirConDir = D[2];
  ddEddt0 = 0.0;
  //if (ConDir != P->Grad.GradientArray[ConDirGradient]) Error(SomeError,"FindPerturbedMinimum: ConDir corrupted");
  //if (H1c_grad != P->Grad.GradientArray[H1cGradient]) Error(SomeError,"FindPerturbedMinimum: H1c_grad corrupted");
  ddEddt0 = Calculate2ndPerturbedDerivative(P, LevS->LPsi->LocalPsi[p], source, ConDir, Lat->Psi.AddData[R->ActualLocalPsiNo].Lambda * Psi->LocalPsiStatus[R->ActualLocalPsiNo].PsiFactor, ConDirHConDir, ConDirConDir);

  for (i=MAXOLD-1; i > 0; i--)
    En->ddEddt0[i] = En->ddEddt0[i-1];
  En->ddEddt0[0] = ddEddt0;
  E0 = En->TotalEnergy[0];
  // delta
  //if (isnan(E0)) { fprintf(stderr,"(%i) WARNING in CalculateLineSearch(): E0_%i[%i] = NaN!\n", P->Par.me, i, 0); Error(SomeError, "NaN-Fehler!"); }
  //if (isnan(dEdt0)) { fprintf(stderr,"(%i) WARNING in CalculateLineSearch(): dEdt0_%i[%i] = NaN!\n", P->Par.me, i, 0); Error(SomeError, "NaN-Fehler!"); }
  //if (isnan(ddEddt0)) { fprintf(stderr,"(%i) WARNING in CalculateLineSearch(): ddEddt0_%i[%i] = NaN!\n", P->Par.me, i, 0); Error(SomeError, "NaN-Fehler!"); }

  ////deltaI = CalculateDeltaI(E0, dEdt0, ddEddt0, 
  ////      &EI, &dEI, &ddEI, &dcosI, &dsinI);
  ////delta = deltaI; E = EI; dE = dEI; ddE = ddEI; dcos = dcosI; dsin = dsinI;
  if (ddEddt0 > 0) {
    delta = - dEdt0/ddEddt0;
    E = E0 + delta * dEdt0 + delta*delta/2. * ddEddt0;
  } else {
    delta = 0.;
    E = E0;
    fprintf(stderr,"(%i) Taylor approximation leads not to minimum!\n",P->Par.me);
  }

  // shift energy delta values
  for (i=MAXOLD-1; i > 0; i--) {
    En->delta[i] = En->delta[i-1];
    En->ATE[i] = En->ATE[i-1];
  }
  // store new one
  En->delta[0] = delta;
  En->ATE[0] = E;
  if (En->TotalEnergy[1] != 0.)
    Diff = fabs(En->TotalEnergy[1] - E0)/(En->TotalEnergy[1] - E0)*fabs((E0 - En->ATE[1])/E0);
  else
    Diff = 0.;
  R->Diffcount += pow(Diff,2);
  
  // reinstate actual density (only needed for UpdateDensityCalculation) ... 
  //CalculateOneDensityR(Lat, LevS, Dens, source, Dens->DensityArray[ActualDensity], R->FactorDensityR*Psi->LocalPsiStatus[R->ActualLocalPsiNo].PsiFactor, 1);
  // ... before changing actual local Psi  
  for (g = 0; g < LevS->MaxG; g++) {  // Here all coefficients are updated for the new found wave function
    //if (isnan(ConDir[g].re)) { fprintf(stderr,"WARNGING: CalculateLineSearch(): ConDir_%i(%i) = NaN!\n", R->ActualLocalPsiNo, g); Error(SomeError, "NaN-Fehler!"); }
    //if (source != LevS->LPsi->LocalPsi[R->ActualLocalPsiNo] || g<0 || g>=LevS->MaxG) Error(SomeError,"FindPerturbedMinimum: source corrupted");
    ////c_re(source[g]) = c_re(source[g])*dcos + c_re(ConDir[g])*dsin;
    ////c_im(source[g]) = c_im(source[g])*dcos + c_im(ConDir[g])*dsin;
    c_re(source[g]) = c_re(source[g]) + c_re(ConDir[g])*delta;
    c_im(source[g]) = c_im(source[g]) + c_im(ConDir[g])*delta;
  }
  if (P->Call.out[StepLeaderOut]) {
    fprintf(stderr, "(%i,%i,%i)S(%i,%i,%i):\tTE: %e\tATE: %e\t Diff: %e\t --- d: %e\tdEdt0: %e\tddEddt0: %e\n",P->Par.my_color_comm_ST,P->Par.me_comm_ST, P->Par.me_comm_ST_PsiT, R->MinStep, R->ActualLocalPsiNo, R->PsiStep, E0, E, Diff,delta, dEdt0, ddEddt0);
    //fprintf(stderr, "(%i,%i,%i)S(%i,%i,%i):\tp0: %e p1: %e p2: %e \tATE: %e\t Diff: %e\t --- d: %e\tdEdt0: %e\tddEddt0: %e\n",P->Par.my_color_comm_ST,P->Par.me_comm_ST, P->Par.me_comm_ST_PsiT, R->MinStep, R->ActualLocalPsiNo, R->PsiStep, En->parts[0], En->parts[1], En->parts[2], E, Diff,delta, dEdt0, ddEddt0);
  }
  if (P->Par.me == 0) {
    fprintf(F->MinimisationFile, "%i\t%i\t%i\t%e\t%e\t%e\t%e\t%e\n",R->MinStep, R->ActualLocalPsiNo, R->PsiStep, E0, E, delta, dEdt0, ddEddt0);
    fflush(F->MinimisationFile);
  }
}

/** Applies perturbation operator \f$\nabla_{index}\f$ to \a *source.
 * As wave functions are stored in the reciprocal basis set, the application is straight-forward,
 * for every G vector, the by \a index specified component is multiplied with the respective
 * coefficient. Afterwards, 1/i is applied by flipping real and imaginary components (and an additional minus sign on the new imaginary term).
 * \param *P Problem at hand
 * \param *source complex coefficients of wave function \f$\varphi(G)\f$
 * \param *dest returned complex coefficients of wave function \f$\widehat{p}_{index}|\varphi(G)\f$
 * \param index_g vectorial index of operator to be applied
 */
void CalculatePerturbationOperator_P(struct Problem *P, const fftw_complex *source, fftw_complex *dest, const int index_g)
{
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  //const fftw_complex *dest_bak = dest;
  int g = 0;
  if (LevS->GArray[0].GSq == 0.0) {
    //if (dest != dest_bak) Error(SomeError,"CalculatePerturbationOperator_P: dest corrupted");
    dest[0].re =  LevS->GArray[0].G[index_g]*source[0].im;
    dest[0].im = -LevS->GArray[0].G[index_g]*source[0].re; 
    g++;
  }
  for (;g<LevS->MaxG;g++) {
    //if (dest != dest_bak || g<0 || g>=LevS->MaxG) Error(SomeError,"CalculatePerturbationOperator_P: g out of range");
    dest[g].re =  LevS->GArray[g].G[index_g]*source[g].im;
    dest[g].im = -LevS->GArray[g].G[index_g]*source[g].re;
  }
  // don't put dest[0].im = 0! Otherwise real parts of perturbed01/10 are not the same anymore!
}

/** Applies perturbation operator \f$\widehat{r}_{index}\f$ to \a *source.
 * The \a *source wave function is blown up onto upper level LatticeLevel RunStruct#Lev0, fourier
 * transformed. Afterwards, for each point on the real mesh the coefficient is multiplied times the real
 * vector pointing within the cell to the mesh point, yet on LatticeLevel RunStruct#LevS. The new wave
 * function is inverse fourier transformed and the resulting reciprocal coefficients stored in *dest.
 * \param *P Problem at hand
 * \param *source source coefficients
 * \param *source2 second source coefficients, e.g. in the evaluation of a scalar product
 * \param *dest destination coefficienta array, is overwrittten!
 * \param index_r index of real vector.
 * \param wavenr index of respective PsiTypeTag#Occupied(!) OnePsiElementAddData for the needed Wanner centre of the wave function.
 */
void CalculatePerturbationOperator_R(struct Problem *P, const fftw_complex *source, fftw_complex *dest, const fftw_complex *source2, const int index_r, const int wavenr)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  struct fft_plan_3d *plan = Lat->plan;
  fftw_complex *TempPsi = Dens0->DensityCArray[Temp2Density];
  fftw_real *TempPsiR = (fftw_real *) TempPsi;
  fftw_complex *workC = Dens0->DensityCArray[TempDensity];
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *) PsiC;
  fftw_complex *tempdestRC = (fftw_complex *)Dens0->DensityArray[TempDensity];
  fftw_complex *posfac, *destsnd, *destrcv;
  double x[NDIM], X[NDIM], fac[NDIM], Wcentre[NDIM];
  const int k_normal = Lat->Psi.TypeStartIndex[Occupied] + (wavenr - Lat->Psi.TypeStartIndex[R->CurrentMin]);
  int n[NDIM], n0, g, Index, pos, iS, i0;
  int N[NDIM], NUp[NDIM];
  const int N0 = LevS->Plan0.plan->local_nx;
  N[0] = LevS->Plan0.plan->N[0];
  N[1] = LevS->Plan0.plan->N[1];
  N[2] = LevS->Plan0.plan->N[2];
  NUp[0] = LevS->NUp[0]; 
  NUp[1] = LevS->NUp[1]; 
  NUp[2] = LevS->NUp[2]; 
  Wcentre[0] = Lat->Psi.AddData[k_normal].WannierCentre[0];
  Wcentre[1] = Lat->Psi.AddData[k_normal].WannierCentre[1];
  Wcentre[2] = Lat->Psi.AddData[k_normal].WannierCentre[2];
  // init pointers and values
  const int myPE = P->Par.me_comm_ST_Psi;
  const double FFTFactor = 1./LevS->MaxN; 
  double vector;
  //double result, Result;

  // blow up source coefficients
  LockDensityArray(Dens0,TempDensity,real); // tempdestRC
  LockDensityArray(Dens0,Temp2Density,imag); // TempPsi
  LockDensityArray(Dens0,ActualPsiDensity,imag); // PsiC
  //if (tempdestRC != (fftw_complex *)Dens0->DensityArray[TempDensity]) Error(SomeError,"CalculatePerturbationOperator_R: tempdestRC corrupted");
  SetArrayToDouble0((double *)tempdestRC ,Dens0->TotalSize*2);
  //if (TempPsi != Dens0->DensityCArray[Temp2Density]) Error(SomeError,"CalculatePerturbationOperator_R: TempPsi corrupted");
  SetArrayToDouble0((double *)TempPsi ,Dens0->TotalSize*2);
  //if (PsiC != Dens0->DensityCArray[ActualPsiDensity]) Error(SomeError,"CalculatePerturbationOperator_R: PsiC corrupted");
  SetArrayToDouble0((double *)PsiC,Dens0->TotalSize*2);
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    posfac = &LevS->PosFactorUp[LevS->MaxNUp*g];
    destrcv = &tempdestRC[LevS->MaxNUp*Index]; 
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      //if (destrcv != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->LocalSizeC) Error(SomeError,"CalculatePerturbationOperator_R: destrcv corrupted");
      destrcv [pos].re = (( source[g].re)*posfac[pos].re-(source[g].im)*posfac[pos].im);
      destrcv [pos].im = (( source[g].re)*posfac[pos].im+(source[g].im)*posfac[pos].re);
    }
  }
  for (g=0; g<LevS->MaxDoubleG; g++) {
    destsnd  = &tempdestRC [LevS->DoubleG[2*g]*LevS->MaxNUp];
    destrcv  = &tempdestRC [LevS->DoubleG[2*g+1]*LevS->MaxNUp];
    for (pos=0; pos<LevS->MaxNUp; pos++) {
      //if (destrcv != &tempdestRC [LevS->DoubleG[2*g+1]*LevS->MaxNUp] || LevS->DoubleG[2*g]*LevS->MaxNUp+pos<0 || LevS->DoubleG[2*g]*LevS->MaxNUp+pos>=Dens0->LocalSizeC|| LevS->DoubleG[2*g+1]*LevS->MaxNUp+pos<0 || LevS->DoubleG[2*g+1]*LevS->MaxNUp+pos>=Dens0->LocalSizeC) Error(SomeError,"CalculatePerturbationOperator_R: destrcv corrupted");
      destrcv [pos].re =  destsnd [pos].re;
      destrcv [pos].im = -destsnd [pos].im;
    }    
  }  
  // fourier transform blown up wave function
  //if (tempdestRC != (fftw_complex *)Dens0->DensityArray[TempDensity]) Error(SomeError,"CalculatePerturbationOperator_R: tempdestRC corrupted");
  //if (workC != Dens0->DensityCArray[TempDensity]) Error(SomeError,"CalculatePerturbationOperator_R: workC corrupted");
  fft_3d_complex_to_real(plan,LevS->LevelNo, FFTNFUp, tempdestRC , workC);
  //if (tempdestRC != (fftw_complex *)Dens0->DensityArray[TempDensity]) Error(SomeError,"CalculatePerturbationOperator_R: tempdestRC corrupted");
  //if (TempPsiR != (fftw_real *)Dens0->DensityCArray[Temp2Density]) Error(SomeError,"CalculatePerturbationOperator_R: TempPsiR corrupted");
  DensityRTransformPos(LevS,(fftw_real*)tempdestRC ,TempPsiR );
  UnLockDensityArray(Dens0,TempDensity,real); // TempdestRC
 
  //result = 0.;
  // for every point on the real grid multiply with component of position vector
  for (n0=0; n0<N0; n0++)
    for (n[1]=0; n[1]<N[1]; n[1]++)
      for (n[2]=0; n[2]<N[2]; n[2]++) {
        n[0] = n0 + N0 * myPE;
        fac[0] = (double)(n[0])/(double)((N[0]));
        fac[1] = (double)(n[1])/(double)((N[1]));
        fac[2] = (double)(n[2])/(double)((N[2]));
        RMat33Vec3(x,Lat->RealBasis,fac);
        iS = n[2] + N[2]*(n[1] + N[1]*n0);  // mind splitting of x axis due to multiple processes
        i0 = n[2]*NUp[2]+N[2]*NUp[2]*(n[1]*NUp[1]+N[1]*NUp[1]*n0*NUp[0]);
        //PsiCR[iS] = ((double)n[0]/(double)N[0]*Lat->RealBasis[0] - fabs(Wcentre[0]))*TempPsiR[i0] - ((double)n[1]/(double)N[1]*Lat->RealBasis[4] - fabs(Wcentre[1]))*TempPsi2R[i0];
        //fprintf(stderr,"(%i) R[%i] = (%lg,%lg,%lg)\n",P->Par.me, i0, x[0], x[1], x[2]);
        //else fprintf(stderr,"(%i) WCentre[%i] = %e \n",P->Par.me, index_r, Wcentre[index_r]);
        MinImageConv(Lat,x, Wcentre, X);
        vector = sawtooth(Lat,X[index_r],index_r);
        //vector = 1.;//sin((double)(n[index_r])/(double)((N[index_r]))*2*PI);      
         PsiCR[iS] = vector * TempPsiR[i0];
        //fprintf(stderr,"(%i) vector(%i/%i,%i/%i,%i/%i): %lg\tx[%i] = %e\tWcentre[%i] = %e\tTempPsiR[%i] = %e\tPsiCR[%i] = %e\n",P->Par.me, n[0], N[0], n[1], N[1], n[2], N[2], vector, index_r, x[index_r],index_r, Wcentre[index_r],i0,TempPsiR[i0],iS,PsiCR[iS]);
         
        //truedist(Lat,x[cross(index_r,2)],Wcentre[cross(index_r,2)],cross(index_r,2)) * TempPsiR[i0]; 
        //tmp += truedist(Lat,x[index_r],WCentre[index_r],index_r) * RealPhiR[i0];
        //tmp += sawtooth(Lat,truedist(Lat,x[index_r],WCentre[index_r],index_r), index_r)*RealPhiR[i0];
        //(Fehler mit falschem Ort ist vor dieser Stelle!): ueber result =  RealPhiR[i0] * (x[index_r]) * RealPhiR[i0]; gecheckt
        //result += TempPsiR[i0] * PsiCR[iS];
      }
  UnLockDensityArray(Dens0,Temp2Density,imag); // TempPsi
  //MPI_Allreduce( &result, &Result, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
  //if (P->Par.me == 0) fprintf(stderr,"(%i) PerturbationOpertator_R: %e\n",P->Par.me, Result/LevS->MaxN);
  // inverse fourier transform 
  fft_3d_real_to_complex(plan,LevS->LevelNo, FFTNF1, PsiC, workC);
  //fft_3d_real_to_complex(plan,LevS->LevelNo, FFTNF1, Psi2C, workC);
  
  // copy to destination array
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    dest[g].re = ( PsiC[Index].re)*FFTFactor; 
    dest[g].im = ( PsiC[Index].im)*FFTFactor;
  }
  UnLockDensityArray(Dens0,ActualPsiDensity,imag);  //PsiC
  //if (LevS->GArray[0].GSq == 0)
  //  dest[0].im = 0; // imaginary of G=0 is zero
}
/*
{
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Lattice *Lat = &P->Lat;
  struct fft_plan_3d *plan = Lat->plan;
  struct Density *Dens0 = Lev0->Dens;
  fftw_complex *tempdestRC =  Dens0->DensityCArray[TempDensity];
  fftw_real *tempdestR = (fftw_real *) tempdestRC;
  fftw_complex *work =  Dens0->DensityCArray[Temp2Density];
  fftw_complex *PsiC = (fftw_complex *) Dens0->DensityCArray[ActualPsiDensity];;
  fftw_real *PsiCR = (fftw_real *)  PsiC;
  fftw_real *RealPhiR = (fftw_real *) Dens0->DensityArray[Temp2Density];
  fftw_complex *posfac, *destsnd, *destrcv;
  double x[NDIM], fac[NDIM], WCentre[NDIM];
  int n[NDIM], N0, n0, g, Index, pos, iS, i0;

  // init pointers and values
  int myPE = P->Par.me_comm_ST_Psi;
  double FFTFactor = 1./LevS->MaxN; 
  int N[NDIM], NUp[NDIM];
  N[0] = LevS->Plan0.plan->N[0];
  N[1] = LevS->Plan0.plan->N[1];
  N[2] = LevS->Plan0.plan->N[2];
  NUp[0] = LevS->NUp[0]; 
  NUp[1] = LevS->NUp[1]; 
  NUp[2] = LevS->NUp[2]; 
  N0 = LevS->Plan0.plan->local_nx;
  wavenr = Lat->Psi.TypeStartIndex[Occupied] + (wavenr - Lat->Psi.TypeStartIndex[R->CurrentMin]);
  Wcentre[0] = Lat->Psi.AddData[wavenr].WannierCentre[0];
  Wcentre[1] = Lat->Psi.AddData[wavenr].WannierCentre[1];
  Wcentre[2] = Lat->Psi.AddData[wavenr].WannierCentre[2];
  
  // blow up source coefficients
  SetArrayToDouble0((double *)tempdestRC,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)RealPhiR,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)PsiC,Dens0->TotalSize*2);
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    posfac = &LevS->PosFactorUp[LevS->MaxNUp*g];
    destrcv = &tempdestRC[LevS->MaxNUp*Index]; 
    for (pos=0; pos<LevS->MaxNUp; pos++) {
      destrcv[pos].re = (( source[g].re)*posfac[pos].re-( source[g].im)*posfac[pos].im);
      destrcv[pos].im = (( source[g].re)*posfac[pos].im+( source[g].im)*posfac[pos].re);
    }
  }
  for (g=0; g<LevS->MaxDoubleG; g++) {
    destsnd = &tempdestRC[LevS->DoubleG[2*g]*LevS->MaxNUp];
    destrcv = &tempdestRC[LevS->DoubleG[2*g+1]*LevS->MaxNUp];
    for (pos=0; pos<LevS->MaxNUp; pos++) {
      destrcv[pos].re =  destsnd[pos].re;
      destrcv[pos].im = -destsnd[pos].im;
    }    
  }
  
  // fourier transform blown up wave function
  fft_3d_complex_to_real(plan,LevS->LevelNo, FFTNFUp, tempdestRC, work);
  DensityRTransformPos(LevS,tempdestR,RealPhiR);

  //fft_Psi(P,source,RealPhiR,0,0);
  
  // for every point on the real grid multiply with component of position vector
  for (n0=0; n0<N0; n0++)
    for (n[1]=0; n[1]<N[1]; n[1]++)
      for (n[2]=0; n[2]<N[2]; n[2]++) {
        n[0] = n0 + N0 * myPE;
        fac[0] = (double)(n[0])/(double)((N[0]));
        fac[1] = (double)(n[1])/(double)((N[1]));
        fac[2] = (double)(n[2])/(double)((N[2]));
        RMat33Vec3(x,Lat->RealBasis,fac);
        iS = n[2] + N[2]*(n[1] + N[1]*n0);  // mind splitting of x axis due to multiple processes
        i0 = n[2]*NUp[2]+N[2]*NUp[2]*(n[1]*NUp[1]+N[1]*NUp[1]*n0*NUp[0]);
        //PsiCR[iS] = (x[index_r]) * RealPhiR[i0]; //- WCentre[index_r] 
        PsiCR[iS] = truedist(Lat,x[index_r],WCentre[index_r],index_r) * RealPhiR[i0];
        //PsiCR[iS] = truedist(Lat,x[index_r],0.,index_r) * RealPhiR[i0];
        //PsiCR[iS] = sawtooth(Lat,truedist(Lat,x[index_r],WCentre[index_r],index_r), index_r)*RealPhiR[i0];
        //(Fehler mit falschem Ort ist vor dieser Stelle!): ueber result =  RealPhiR[i0] * (x[index_r]) * RealPhiR[i0]; gecheckt
      }
  
  // inverse fourier transform 
  fft_3d_real_to_complex(plan,LevS->LevelNo, FFTNF1, PsiC, work);
  
  // copy to destination array
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    dest[g].re = ( PsiC[Index].re)*FFTFactor; 
    dest[g].im = ( PsiC[Index].im)*FFTFactor;
    if (LevS->GArray[g].GSq == 0)
      dest[g].im = 0; // imaginary of G=0 is zero
  }
}*/

/** Prints the positions of all unperturbed orbitals to screen.
 * \param *P Problem at hand
 * \param type PsiTypeTag specifying group of orbitals
 * \sa CalculatePerturbationOperator_R() 
 */
void OutputOrbitalPositions(struct Problem *P, const enum PsiTypeTag type)
{
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS; 
  fftw_complex *temp = LevS->LPsi->TempPsi;
  fftw_complex *source; 
  int wavenr, index;
  double result[NDIM], Result[NDIM];
  //double imsult[NDIM], Imsult[NDIM];
  double norm[NDIM], Norm[NDIM];
  //double imnorm[NDIM], imNorm[NDIM];
  double Wcentre[NDIM];

  // for every unperturbed wave function
  for (wavenr=Psi->TypeStartIndex[type]; wavenr<Psi->TypeStartIndex[type+1]; wavenr++) {
    source = LevS->LPsi->LocalPsi[wavenr];
    Wcentre[0] = Psi->AddData[wavenr].WannierCentre[0];
    Wcentre[1] = Psi->AddData[wavenr].WannierCentre[1];
    Wcentre[2] = Psi->AddData[wavenr].WannierCentre[2];
    for (index=0; index<NDIM; index++) {
      SetArrayToDouble0((double *)temp,2*R->InitLevS->MaxG);
      // apply position operator      
      CalculatePerturbationOperator_R(P,source,temp,source,index, wavenr + Psi->TypeStartIndex[R->CurrentMin]);
      // take scalar product
      result[index] = GradSP(P,LevS,source,temp);
      //imsult[index] = GradImSP(P,LevS,source,temp);
      norm[index] = GradSP(P,LevS,source,source);
      //imnorm[index] = GradImSP(P,LevS,source,source);
      MPI_Allreduce( result, Result, NDIM, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
      //MPI_Allreduce( imsult, Imsult, NDIM, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
      MPI_Allreduce( norm, Norm, NDIM, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
      //MPI_Allreduce( imnorm, imNorm, NDIM, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
    }
    // print output to stderr
    if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Position of Orbital %i: (%e,%e,%e)\n",P->Par.me, wavenr, Result[0]/Norm[0]+Wcentre[0], Result[1]/Norm[1]+Wcentre[1], Result[2]/Norm[2]+Wcentre[2]);
    //fprintf(stderr,"(%i) Position of Orbital %i wrt Wannier: (%e,%e,%e)\n",P->Par.me, wavenr, Result[0]/Norm[0], Result[1]/Norm[1], Result[2]/Norm[2]);
    //fprintf(stderr,"(%i) with Norm: (%e,%e,%e) + i (%e,%e,%e)\n",P->Par.me, Norm[0], Norm[1], Norm[2], imNorm[0], imNorm[1], imNorm[2]);
    //if (P->Par.me == 0) fprintf(stderr,"(%i) Position of Orbital %i: (%e,%e,%e)\n",P->Par.me, wavenr, Result[0]/Norm[0], Result[1]/Norm[1], Result[2]/Norm[2]);
  }
}

#define borderstart 0.9

/** Applies perturbation operator \f$(\widehat{r} \times \nabla)_{index}\f$ to \a *source.
 * The source is fourier-transformed by transforming it to a density (on the next higher level RunStruct#Lev0) 
 * and at the same time multiply it with the respective component of the reciprocal G vector - the momentum. This
 * is done by callinf fft_Psi(). Thus we get \f$\nabla_k | \varphi (R) \rangle\f$.
 * 
 * Next, we apply the two of three components of the position operator r, which ones stated by cross(), while going 
 * in a loop through every point of the grid. In order to do this sensibly, truedist() is used to map the coordinates
 * onto -L/2...L/2, by subtracting the OneElementPsiAddData#WannierCentre R and wrapping. Also, due to the breaking up
 * of the x axis into equally sized chunks for each coefficient sharing process, we need to step only over local
 * x-axis grid points, however shift them to the global position when being used as position. In the end, we get
 * \f$\epsilon_{index,j,k} (\widehat{r}-R)_j \nabla_k | \varphi (R) \rangle\f$.
 * 
 * One last fft brings the wave function back to reciprocal basis and it is copied to \a *dest.
 * \param *P Problem at hand
 * \param *source complex coefficients of wave function \f$\varphi(G)\f$
 * \param *dest returned complex coefficients of wave function \f$(\widehat{r} \times \widehat{p})_{index}|\varphi(G)\rangle\f$
 * \param phi0nr number within LocalPsi of the unperturbed pendant of the given perturbed wavefunction \a *source.
 * \param index_rxp index desired of the vector product
 * \sa CalculateConDirHConDir() - the procedure of fft and inverse fft is very similar.
 */
void CalculatePerturbationOperator_RxP(struct Problem *P, const fftw_complex *source, fftw_complex *dest, const int phi0nr, const int index_rxp)

{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  struct fft_plan_3d *plan = Lat->plan;
  fftw_complex *TempPsi = Dens0->DensityCArray[Temp2Density];
  fftw_real *TempPsiR = (fftw_real *) TempPsi;
  fftw_complex *TempPsi2 = (fftw_complex *)Dens0->DensityArray[Temp2Density];
  fftw_real *TempPsi2R =  (fftw_real *) TempPsi2;
  fftw_complex *workC = Dens0->DensityCArray[TempDensity];
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *) PsiC;
  double x[NDIM], X[NDIM], fac[NDIM], *Wcentre;
  int n[NDIM], n0, g, Index, iS, i0; //pos, 
  const int *N, *NUp;
  const int N0 = LevS->Plan0.plan->local_nx;
  N = LevS->Plan0.plan->N;
  NUp = LevS->NUp; 
  Wcentre = Lat->Psi.AddData[phi0nr].WannierCentre;
  // init pointers and values
  const int myPE = P->Par.me_comm_ST_Psi;
  const double FFTFactor = 1./LevS->MaxN; //
//  double max[NDIM], max_psi[NDIM];
//  double max_n[NDIM];
  int index[4];
//  double smooth, wall[NDIM];
//  for (g=0;g<NDIM;g++) {
//    max[g] = 0.;
//    max_psi[g] = 0.;
//    max_n[g] = -1.;
//  }

  //fprintf(stderr,"(%i) Wannier[%i] (%2.13e, %2.13e, %2.13e)\n", P->Par.me, phi0nr, 10.-Wcentre[0], 10.-Wcentre[1], 10.-Wcentre[2]);
  for (g=0;g<4;g++)
    index[g] = cross(index_rxp,g);

  // blow up source coefficients
  LockDensityArray(Dens0,Temp2Density,imag); // TempPsi
  LockDensityArray(Dens0,Temp2Density,real); // TempPsi2
  LockDensityArray(Dens0,ActualPsiDensity,imag); // PsiC

  fft_Psi(P,source,TempPsiR ,index[1],7);
  fft_Psi(P,source,TempPsi2R,index[3],7);
  
  //result = 0.;
  // for every point on the real grid multiply with component of position vector
  for (n0=0; n0<N0; n0++)
    for (n[1]=0; n[1]<N[1]; n[1]++)
      for (n[2]=0; n[2]<N[2]; n[2]++) {
        n[0] = n0 + N0 * myPE;
        fac[0] = (double)(n[0])/(double)((N[0]));
        fac[1] = (double)(n[1])/(double)((N[1]));
        fac[2] = (double)(n[2])/(double)((N[2]));
        RMat33Vec3(x,Lat->RealBasis,fac);
//        fac[0] = (fac[0] > .9) ? fac[0]-0.9 : 0.;
//        fac[1] = (fac[1] > .9) ? fac[1]-0.9 : 0.;
//        fac[2] = (fac[2] > .9) ? fac[2]-0.9 : 0.;
//        RMat33Vec3(wall,Lat->RealBasis,fac);
//        smooth = exp(wall[0]*wall[0]+wall[1]*wall[1]+wall[2]*wall[2]);  // smoothing near the borders of the virtual cell
        iS = n[2] + N[2]*(n[1] + N[1]*n0);  // mind splitting of x axis due to multiple processes
        i0 = n[2]*NUp[2]+N[2]*NUp[2]*(n[1]*NUp[1]+N[1]*NUp[1]*n0*NUp[0]);

//        if (fabs(truedist(Lat,x[index[1]],Wcentre[index[1]],index[1])) >= borderstart * sqrt(Lat->RealBasisSQ[index[1]])/2.)
//          if (max[index[1]] < sawtooth(Lat,truedist(Lat,x[index[1]],Wcentre[index[1]],index[1]),index[1]) * TempPsiR [i0]) {
//            max[index[1]] = sawtooth(Lat,truedist(Lat,x[index[1]],Wcentre[index[1]],index[1]),index[1]) * TempPsiR [i0];  
//            max_psi[index[1]] = TempPsiR [i0];
//            max_n[index[1]] = truedist(Lat,x[index[1]],Wcentre[index[1]],index[1]);
//          }
//
//        if (fabs(truedist(Lat,x[index[3]],Wcentre[index[3]],index[3])) >= borderstart * sqrt(Lat->RealBasisSQ[index[3]])/2.)
//          if (max[index[3]] < sawtooth(Lat,truedist(Lat,x[index[3]],Wcentre[index[3]],index[3]),index[3]) * TempPsiR [i0]) {
//            max[index[3]] = sawtooth(Lat,truedist(Lat,x[index[3]],Wcentre[index[3]],index[3]),index[3]) * TempPsiR [i0];  
//            max_psi[index[3]] = TempPsiR [i0];  
//            max_n[index[3]] = truedist(Lat,x[index[3]],Wcentre[index[3]],index[3]);
//          }
        
        MinImageConv(Lat, x, Wcentre, X);
         PsiCR[iS] = //vector * TempPsiR[i0]; 
          sawtooth(Lat,X[index[0]],index[0]) * TempPsiR [i0]
         -sawtooth(Lat,X[index[2]],index[2]) * TempPsi2R[i0];
//          ShiftGaugeOrigin(P,X,index[0]) * TempPsiR [i0]
//         -ShiftGaugeOrigin(P,X,index[2]) * TempPsi2R[i0];
//        PsiCR[iS] = (x[index[0]] - Wcentre[index[0]]) * TempPsiR [i0] - (x[index[2]] - Wcentre[index[2]]) * TempPsi2R[i0];
      }
  //if (P->Par.me == 0) fprintf(stderr,"(%i) PerturbationOpertator_R(xP): %e\n",P->Par.me, Result/LevS->MaxN);
  UnLockDensityArray(Dens0,Temp2Density,imag); // TempPsi
  UnLockDensityArray(Dens0,Temp2Density,real); // TempPsi2
  
//  // print maximum values
//  fprintf (stderr,"(%i) RxP: Maximum values = (",P->Par.me);
//  for (g=0;g<NDIM;g++)
//    fprintf(stderr,"%lg\t", max[g]);
//  fprintf(stderr,"\b)\t(");
//  for (g=0;g<NDIM;g++)
//    fprintf(stderr,"%lg\t", max_psi[g]);
//  fprintf(stderr,"\b)\t");
//  fprintf (stderr,"at (");
//  for (g=0;g<NDIM;g++)
//    fprintf(stderr,"%lg\t", max_n[g]);
//  fprintf(stderr,"\b)\n");
  
  // inverse fourier transform 
  //if (PsiC != Dens0->DensityCArray[ActualPsiDensity]) Error(SomeError,"CalculatePerturbationOperator_RxP: PsiC corrupted");
  fft_3d_real_to_complex(plan,LevS->LevelNo, FFTNF1, PsiC, workC);
  
  // copy to destination array
  SetArrayToDouble0((double *)dest, 2*R->InitLevS->MaxG);
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    dest[g].re += ( PsiC[Index].re)*FFTFactor; // factor confirmed, see grad.c:CalculateConDirHConDir()
    dest[g].im += ( PsiC[Index].im)*FFTFactor;
    //fprintf(stderr,"(%i) PsiC[(%lg,%lg,%lg)] = %lg +i %lg\n", P->Par.me, LevS->GArray[g].G[0], LevS->GArray[g].G[1], LevS->GArray[g].G[2], dest[g].re, dest[g].im);
  }
  UnLockDensityArray(Dens0,ActualPsiDensity,imag); // PsiC
  //if (LevS->GArray[0].GSq == 0.)
    //dest[0].im = 0.; // don't do this, see ..._P()
}

/** Applies perturbation operator \f$-(\nabla \times \widehat{r})_{index}\f$ to \a *source.
 * Is analogous to CalculatePerturbationOperator_RxP(), only the order is reversed, first position operator, then
 * momentum operator
 * \param *P Problem at hand
 * \param *source complex coefficients of wave function \f$\varphi(G)\f$
 * \param *dest returned complex coefficients of wave function \f$(\widehat{r} \times \widehat{p})_{index}|\varphi(G)\rangle\f$
 * \param phi0nr number within LocalPsi of the unperturbed pendant of the given perturbed wavefunction \a *source.
 * \param index_pxr index of position operator
 * \note Only third component is important due to initial rotiation of cell such that B field is aligned with z axis.
 * \sa CalculateConDirHConDir() - the procedure of fft and inverse fft is very similar.
 * \bug routine is not tested (but should work), as it offers no advantage over CalculatePerturbationOperator_RxP() 
 */
void CalculatePerturbationOperator_PxR(struct Problem *P, const fftw_complex *source, fftw_complex *dest, const int phi0nr, const int index_pxr)

{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  struct fft_plan_3d *plan = Lat->plan;
  fftw_complex *TempPsi = Dens0->DensityCArray[Temp2Density];
  fftw_real *TempPsiR = (fftw_real *) TempPsi;
  fftw_complex *workC = Dens0->DensityCArray[TempDensity];
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *) PsiC;
  fftw_complex *Psi2C = Dens0->DensityCArray[ActualDensity];
  fftw_real *Psi2CR = (fftw_real *) Psi2C;
  fftw_complex *tempdestRC = (fftw_complex *)Dens0->DensityArray[Temp2Density];
  fftw_complex *posfac, *destsnd, *destrcv;
  double x[NDIM], X[NDIM], fac[NDIM], Wcentre[NDIM];
  int n[NDIM], n0, g, Index, pos, iS, i0;
  int N[NDIM], NUp[NDIM];
  const int N0 = LevS->Plan0.plan->local_nx;
  N[0] = LevS->Plan0.plan->N[0];
  N[1] = LevS->Plan0.plan->N[1];
  N[2] = LevS->Plan0.plan->N[2];
  NUp[0] = LevS->NUp[0]; 
  NUp[1] = LevS->NUp[1]; 
  NUp[2] = LevS->NUp[2]; 
  Wcentre[0] = Lat->Psi.AddData[phi0nr].WannierCentre[0];
  Wcentre[1] = Lat->Psi.AddData[phi0nr].WannierCentre[1];
  Wcentre[2] = Lat->Psi.AddData[phi0nr].WannierCentre[2];
  // init pointers and values
  const int myPE = P->Par.me_comm_ST_Psi;
  const double FFTFactor = 1./LevS->MaxN; 

  // blow up source coefficients
  SetArrayToDouble0((double *)tempdestRC ,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)TempPsi ,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)PsiC,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)Psi2C,Dens0->TotalSize*2);
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    posfac = &LevS->PosFactorUp[LevS->MaxNUp*g];
    destrcv = &tempdestRC[LevS->MaxNUp*Index]; 
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      destrcv [pos].re = (( source[g].re)*posfac[pos].re-( source[g].im)*posfac[pos].im);
      destrcv [pos].im = (( source[g].re)*posfac[pos].im+( source[g].im)*posfac[pos].re);
    }
  }
  for (g=0; g<LevS->MaxDoubleG; g++) {
    destsnd  = &tempdestRC [LevS->DoubleG[2*g]*LevS->MaxNUp];
    destrcv  = &tempdestRC [LevS->DoubleG[2*g+1]*LevS->MaxNUp];
    for (pos=0; pos<LevS->MaxNUp; pos++) {
      destrcv [pos].re =  destsnd [pos].re;
      destrcv [pos].im = -destsnd [pos].im;
    }    
  }  
  // fourier transform blown up wave function
  fft_3d_complex_to_real(plan,LevS->LevelNo, FFTNFUp, tempdestRC , workC);
  DensityRTransformPos(LevS,(fftw_real*)tempdestRC ,TempPsiR );

  //fft_Psi(P,source,TempPsiR ,cross(index_pxr,1),7);
  //fft_Psi(P,source,TempPsi2R,cross(index_pxr,3),7);
  
  //result = 0.;
  // for every point on the real grid multiply with component of position vector
  for (n0=0; n0<N0; n0++)
    for (n[1]=0; n[1]<N[1]; n[1]++)
      for (n[2]=0; n[2]<N[2]; n[2]++) {
        n[0] = n0 + N0 * myPE;
        fac[0] = (double)(n[0])/(double)((N[0]));
        fac[1] = (double)(n[1])/(double)((N[1]));
        fac[2] = (double)(n[2])/(double)((N[2]));
        RMat33Vec3(x,Lat->RealBasis,fac);
        iS = n[2] + N[2]*(n[1] + N[1]*n0);  // mind splitting of x axis due to multiple processes
        i0 = n[2]*NUp[2]+N[2]*NUp[2]*(n[1]*NUp[1]+N[1]*NUp[1]*n0*NUp[0]);
//         PsiCR[iS] = sawtooth(Lat,X[cross(index_pxr,1)],cross(index_pxr,1)) * TempPsiR[i0];
//         Psi2CR[iS] = sawtooth(Lat,X[cross(index_pxr,3)],cross(index_pxr,3)) * TempPsiR[i0]; 
        MinImageConv(Lat,x,Wcentre,X); 
        PsiCR[iS] = ShiftGaugeOrigin(P,X,cross(index_pxr,1)) * TempPsiR[i0];
        Psi2CR[iS] = ShiftGaugeOrigin(P,X,cross(index_pxr,3)) * TempPsiR[i0]; 
      }
  
  // inverse fourier transform 
  fft_3d_real_to_complex(plan,LevS->LevelNo, FFTNF1, PsiC, workC);
  fft_3d_real_to_complex(plan,LevS->LevelNo, FFTNF1, Psi2C, workC);
  
  // copy to destination array
  for (g=0; g<LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    dest[g].re = -LevS->GArray[g].G[cross(index_pxr,0)]*( PsiC[Index].im)*FFTFactor; 
    dest[g].im = -LevS->GArray[g].G[cross(index_pxr,0)]*(-PsiC[Index].re)*FFTFactor;
    dest[g].re -= -LevS->GArray[g].G[cross(index_pxr,2)]*( Psi2C[Index].im)*FFTFactor; 
    dest[g].im -= -LevS->GArray[g].G[cross(index_pxr,2)]*(-Psi2C[Index].re)*FFTFactor;
  }
  if (LevS->GArray[0].GSq == 0.)
    dest[0].im = 0.; // don't do this, see ..._P()
}

/** Evaluates first derivative of perturbed energy functional with respect to minimisation parameter \f$\Theta\f$.
 * \f[
 *  \frac{\delta {\cal E}^{(2)}} {\delta \Theta} =
 *    2 {\cal R} \langle \widetilde{\varphi}_i^{(1)} | {\cal H}^{(0)} | \varphi_i^{(1)} \rangle
 *    - \sum_l \lambda_{il} \langle \widetilde{\varphi}_i^{(1)} | \varphi_l^{(1)} \rangle
 *    - \sum_k \lambda_{ki} \langle \varphi_k^{(1)} | \widetilde{\varphi}_i^{(1)} \rangle
 *    + 2 {\cal R} \langle \widetilde{\varphi}_i^{(1)} | {\cal H}^{(1)} | \varphi_i^{(0)} \rangle
 * \f]
 * 
 * The summation over all Psis has again to be done with an MPI exchange of non-local coefficients, as the conjugate
 * directions are not the same in situations where PePGamma > 1 (Psis split up among processes = multiple minimisation)
 * \param *P Problem at hand
 * \param source0 unperturbed wave function \f$\varphi_l^{(0)}\f$
 * \param source perturbed wave function \f$\varphi_l^{(1)} (G)\f$
 * \param ConDir normalized conjugate direction \f$\widetilde{\varphi}_l^{(1)} (G)\f$
 * \param Hc_grad complex coefficients of \f$H^{(0)} | \varphi_l^{(1)} (G) \rangle\f$, see GradientArray#HcGradient
 * \param H1c_grad complex coefficients of \f$H^{(1)} | \varphi_l^{(0)} (G) \rangle\f$, see GradientArray#H1cGradient
 * \sa CalculateLineSearch() - used there, \sa CalculateConDirHConDir() - same principles
 * \warning The MPI_Allreduce for the scalar product in the end has not been done and must not have been done for given
 *          parameters yet!
 */
double Calculate1stPerturbedDerivative(struct Problem *P, const fftw_complex *source0, const fftw_complex *source, const fftw_complex *ConDir, const fftw_complex *Hc_grad, const fftw_complex *H1c_grad)
{
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &P->Lat.Psi;
  struct LatticeLevel *LevS = R->LevS;
  double result = 0., E0 = 0., Elambda = 0., E1 = 0.;//, E2 = 0.;
  int i,m,j;
  const int state = R->CurrentMin;
  //const int l_normal = R->ActualLocalPsiNo - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[Occupied]; 
  const int ActNum =  R->ActualLocalPsiNo - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[1] * Psi->LocalPsiStatus[R->ActualLocalPsiNo].my_color_comm_ST_Psi;
  //int l = R->ActualLocalPsiNo;
  //int l_normal = Psi->TypeStartIndex[Occupied] + (l - Psi->TypeStartIndex[state]);  // offset l to \varphi_l^{(0)}
  struct OnePsiElement *OnePsiB, *LOnePsiB;
  //fftw_complex *HConGrad = LevS->LPsi->TempPsi;
  fftw_complex *LPsiDatB=NULL;
  const int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  int RecvSource;
  MPI_Status status;

  //CalculateCDfnl(P,ConDir,PP->CDfnl);  
  //ApplyTotalHamiltonian(P,ConDir,HConDir, PP->CDfnl, 1, 0);
  //E0 = (GradSP(P, LevS, ConDir, Hc_grad) + GradSP(P, LevS, source, HConDir)) * Psi->LocalPsiStatus[R->ActualLocalPsiNo].PsiFactor;
  E0 = 2.*GradSP(P, LevS, ConDir, Hc_grad) * Psi->LocalPsiStatus[R->ActualLocalPsiNo].PsiFactor;
  result = E0;
  //fprintf(stderr,"(%i) 1st: E0 = \t\t%lg\n", P->Par.me, E0);

  m = -1;
  for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
    OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
    if (OnePsiB->PsiType == state) {   // drop all but the ones of current min state
      m++;  // increase m if it is type-specific wave function
      if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
         LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
      else
         LOnePsiB = NULL;
      if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
        RecvSource = OnePsiB->my_color_comm_ST_Psi;
        MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, PerturbedTag, P->Par.comm_ST_PsiT, &status );
        LPsiDatB=LevS->LPsi->TempPsi;
      } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
        for (i=0;i<P->Par.Max_me_comm_ST_PsiT;i++)
          if (i != OnePsiB->my_color_comm_ST_Psi)
            MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, i, PerturbedTag, P->Par.comm_ST_PsiT);
        LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
      } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received

      Elambda -= 2.*Psi->lambda[ActNum][m]*GradSP(P, LevS, ConDir, LPsiDatB) * OnePsiB->PsiFactor;  // lambda is symmetric
    }
  }
  result += Elambda;
  //fprintf(stderr,"(%i) 1st: Elambda = \t%lg\n", P->Par.me, Elambda);

  E1 =  2.*GradSP(P,LevS,ConDir,H1c_grad) * sqrt(Psi->AllPsiStatus[ActNum].PsiFactor*Psi->LocalPsiStatus[R->ActualLocalPsiNo].PsiFactor);
  result += E1;
  //fprintf(stderr,"(%i) 1st: E1 = \t\t%lg\n", P->Par.me, E1);

  return result;
}


/** Evaluates second derivative of perturbed energy functional with respect to minimisation parameter \f$\Theta\f$.
 * \f[
 *   \frac{\delta^2 {\cal E}^{(2)}} {\delta \Theta^2} =
 *        2 \bigl( \langle \widetilde{\varphi}_l^{(1)} | {\cal H}^{(0)} | \widetilde{\varphi}_l^{(1)} \rangle
 *      - \langle \varphi_l^{(1)} | {\cal H}^{(0)} | \varphi_l^{(1)} \rangle \bigr )
 *      + 2 \sum_{i,i \neq l } \lambda_{il} \langle \varphi_i^{(1)} | \varphi_l^{(1)} \rangle
 *      - 2 {\cal R} \langle \varphi_l^{(1)} | {\cal H}^{(1)} | \varphi_l^{(0)} \rangle
 * \f]
 * 
 * The energy eigenvalues of \a ConDir and \a source must be supplied, they can be calculated via CalculateConDirHConDir() and/or
 * by the due to CalculatePerturbedEnergy() already present OnePsiElementAddData#Lambda eigenvalue. The summation over the
 * unperturbed lambda within the scalar product of perturbed wave functions is evaluated with Psis#lambda and Psis#Overlap.
 * Afterwards, the ConDir density is calculated and also the i-th perturbed density to first degree. With these in a sum over
 * all real mesh points the exchange-correlation first and second derivatives and also the Hartree potential ones can be calculated
 * and summed up.
 * \param *P Problem at hand
 * \param source0 unperturbed wave function \f$\varphi_l^{(0)}\f$
 * \param source wave function \f$\varphi_l^{(1)}\f$
 * \param ConDir conjugated direction \f$\widetilde{\varphi}_l^{(1)}\f$
 * \param sourceHsource eigenvalue of wave function \f$\langle \varphi_l^{(1)} | H^{(0)} | \varphi_l^{(1)}\rangle\f$
 * \param ConDirHConDir perturbed eigenvalue of conjugate direction \f$\langle \widetilde{\varphi}_l^{(1)} | H^{(0)} | \widetilde{\varphi}_l^{(1)}\rangle\f$
 * \param ConDirConDir norm of conjugate direction \f$\langle \widetilde{\varphi}_l^{(1)} | \widetilde{\varphi}_l^{(1)}\rangle\f$
 * \warning No MPI_AllReduce() takes place, parameters have to be reduced already.
 */
double Calculate2ndPerturbedDerivative(struct Problem *P, const fftw_complex *source0,const  fftw_complex *source, const fftw_complex *ConDir,const  double sourceHsource, const double ConDirHConDir, const double ConDirConDir)
{
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &P->Lat.Psi;
  //struct Lattice *Lat = &P->Lat;
  //struct Energy *E = Lat->E;
  double result = 0.;
  double Con0 = 0., Elambda = 0.;//, E0 = 0., E1 = 0.;
  //int i;
  const int state = R->CurrentMin;
  //const int l_normal = R->ActualLocalPsiNo - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[Occupied]; 
  const int ActNum =  R->ActualLocalPsiNo - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[1] * Psi->LocalPsiStatus[R->ActualLocalPsiNo].my_color_comm_ST_Psi;

  Con0 = 2.*ConDirHConDir;
  result += Con0;
  ////E0 = -2.*sourceHsource;
  ////result += E0;
  ////E1 = -E->PsiEnergy[Perturbed1_0Energy][R->ActualLocalPsiNo] - E->PsiEnergy[Perturbed0_1Energy][R->ActualLocalPsiNo];
  ////result += E1;
  //fprintf(stderr,"(%i) 2nd: E1 = \t%lg\n", P->Par.me, E1);

  ////for (i=0;i<Lat->Psi.NoOfPsis;i++) {
  ////  if (i != ActNum) Elambda += Psi->lambda[i][ActNum]*Psi->Overlap[i][ActNum]+ Psi->lambda[ActNum][i]*Psi->Overlap[ActNum][i];   // overlap contains PsiFactor
  ////}
  ////Elambda = Psi->lambda[ActNum][ActNum]*Psi->Overlap[ActNum][ActNum];
  Elambda = 2.*Psi->lambda[ActNum][ActNum]*ConDirConDir;
  result -= Elambda;

  //fprintf(stderr,"(%i) 2ndPerturbedDerivative: Result = Con0 + E0 + E1 + Elambda + dEdt0_XC + ddEddt0_XC + dEdt0_H + ddEddt0_H = %lg + %lg + %lg + %lg + %lg + %lg + %lg + %lg = %lg\n", P->Par.me, Con0, E0, E1, Elambda, VolumeFactorR*dEdt0_XC, VolumeFactorR*ddEddt0_XC, dEdt0_H, ddEddt0_H, result);

  return (result);
}

/** Returns index of specific component in 3x3 cross product.
 * \param i vector product component index, ranging from 0..NDIM
 * \param j index specifies which one of the four vectors in x*y - y*x, ranging from 0..3 (0,1 positive sign, 2,3 negative sign)
 * \return Component 0..2 of vector to be taken to evaluate a vector product 
 * \sa crossed() - is the same but vice versa, return value must be specified, \a i is returned.
 */
inline int cross(int i, int j)
{
  const int matrix[NDIM*4] = {1,2,2,1,2,0,0,2,0,1,1,0};
  if (i>=0 && i<NDIM && j>=0 && j<4)
    return (matrix[i*4+j]);
  else {
    Error(SomeError,"cross: i or j out of range!");
    return (0);
  }
}

/** Returns index of resulting vector component in 3x3 cross product.
 * In the column specified by the \a j index \a i is looked for and the found row index returned.  
 * \param i vector component index, ranging from 0..NDIM
 * \param j index specifies which one of the four vectors in x*y - y*x, ranging from 0..3 (0,1 positive sign, 2,3 negative sign)
 * \return Component 0..2 of resulting vector
 * \sa cross() - is the same but vice versa, return value must be specified, \a i is returned.
 */
inline int crossed(int i, int j)
{
  const int matrix[NDIM*4] = {1,2,2,1,2,0,0,2,0,1,1,0};
  int k;
  if (i>=0 && i<NDIM && j>=0 && j<4) {
    for (k=0;k<NDIM;k++)
      if (matrix[4*k+j] == i) return(k);
    Error(SomeError,"crossed: given component not found!");
    return(-1);
  } else {
    Error(SomeError,"crossed: i or j out of range!");
    return (-1);
  }
}

#define Nsin 16  //!< should be dependent on MaxG/MaxN per axis!

/** Returns sawtooth shaped profile for position operator within cell.
 * This is a mapping from -L/2...L/2 (L = length of unit cell derived from Lattice#RealBasisSQ) to -L/2 to L/2 with a smooth transition:
 * \f[
 *  f(x): x \rightarrow \left \{ 
 *    \begin{array}{l} 
 *      -\frac{L}{2} \cdot \sin \left ( \frac{x}{0,05\cdot L} \cdot \frac{\pi}{2} \right ), 0<x<0,05\cdot L \\ 
 *      (x - 0,05\cdot L) \cdot \frac{10}{9} - \frac{L}{2}, 0,05\cdot L \leq x<0,95\cdot L \\
 *       \frac{L}{2} \cdot \cos \left ( \frac{x-0,95\cdot L}{0,05\cdot L} \cdot \frac{\pi}{2} \right), 0,95\cdot L<x<L
 *    \end{array} \right \}
 * \f]
 * \param *Lat pointer to Lattice structure for Lattice#RealBasisSQ
 * \param L parameter x
 * \param index component index for Lattice#RealBasisSQ
 */
inline double sawtooth(struct Lattice *Lat, double L, const int index)
{
  double axis = sqrt(Lat->RealBasisSQ[index]);
  double sawstart = Lat->SawtoothStart;
  double sawend = 1. - sawstart;
  double sawfactor = (sawstart+sawend)/(sawend-sawstart);
  //return(L);

  //fprintf(stderr, "sawstart: %e\tsawend: %e\tsawfactor: %e\tL: %e\n", sawstart, sawend, sawfactor, L);
  // transform and return (sawtooth profile checked, 04.08.06)
  L += axis/2.; // transform to 0 ... L
  if (L < (sawstart*axis)) return (-axis/(2*sawfactor)*sin(L/(sawstart*axis)*PI/2.));  // first smooth transition from 0 ... -L/2
  if (L > (sawend*axis)) return ( axis/(2*sawfactor)*cos((L-sawend*axis)/(sawstart*axis)*PI/2.));  // second smooth transition from +L/2 ... 0
  //fprintf(stderr,"L %e\t sawstart %e\t sawend %e\t sawfactor %e\t axis%e\n", L, sawstart, sawend, sawfactor, axis);
  //return ((L - sawstart*axis) - axis/(2*sawfactor));  // area in between scale to -L/2 ... +L/2
  return (L - axis/2);  // area in between return as it was 
}

/** Shifts the origin of the gauge according to the CSDGT method.
 * \f[
 *  d(r) = r - \sum_{I_s,I_a} (r-R_{I_s,I_a}) exp{(-\alpha_{I_s,I_a}(r-R_{I_s,I_a})^4)}
 * \f]
 * This trafo is necessary as the current otherweise (CSGT) sensitively depends on the current around
 * the core region inadequately/only moderately well approximated by a plane-wave-pseudo-potential-method.
 * \param *P Problem at hand, containing Lattice and Ions
 * \param r coordinate vector
 * \param index index of the basis vector
 * \return \f$d(r)\f$ 
 * \note Continuous Set of Damped Gauge Transformations according to Keith and Bader 
 */
inline double ShiftGaugeOrigin(struct Problem *P, double r[NDIM], const int index)
{
  struct Ions *I = &P->Ion;
  struct Lattice *Lat = &P->Lat;
  double x[NDIM], tmp; 
  int is,ia, i;
  
  // loop over all ions to calculate the sum
  for(i=0;i<NDIM;i++)
    x[i] = r[i];
  for (is=0; is < I->Max_Types; is++)
    for (ia=0; ia < I->I[is].Max_IonsOfType; ia++)
      for(i=0;i<NDIM;i++) {
        tmp = (r[i] - I->I[is].R[NDIM*ia]);
        x[i] -= tmp*exp(- I->I[is].alpha[ia] * tpow(tmp,4));
      }
  
  return(sawtooth(Lat,x[index],index));  // still use sawtooth due to the numerical instability around the border region of the cell
}

/** Print sawtooth() for each node along one axis.
 * \param *P Problem at hand, containing RunStruct, Lattice and LatticeLevel RunStruct#LevS
 * \param index index of axis
 */
void TestSawtooth(struct Problem *P, const int index)
{
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  struct Lattice *Lat =&P->Lat;
  double x[NDIM];
  double n[NDIM];
  int N[NDIM];
  N[0] = LevS->Plan0.plan->N[0];
  N[1] = LevS->Plan0.plan->N[1];
  N[2] = LevS->Plan0.plan->N[2];
  
  n[0] = n[1] = n[2] = 0.;
  for (n[index]=0;n[index]<N[index];n[index]++) {
    n[index] = (double)n[index]/(double)N[index] * sqrt(Lat->RealBasisSQ[index]);
    //fprintf(stderr,"(%i) x %e\t Axis/2 %e\n",P->Par.me, x, sqrt(Lat->RealBasisSQ[index])/2. );     
	  MinImageConv(Lat, n, Lat->RealBasisCenter, x);
  	fprintf(stderr,"%e\t%e\n", n[index], sawtooth(Lat,n[index],index));  
  }
}

/** Secures minimum image convention between two given points \a R[] and \a r[] within periodic boundary.
 * Each distance component within a periodic boundary must always be between -L/2 ... L/2
 * \param *Lat pointer to Lattice structure
 * \param R[] first vector, NDIM, each must be between 0...L
 * \param r[] second vector, NDIM, each must be between 0...L
 * \param result[] return vector
 */
inline void MinImageConv(struct Lattice *Lat, const double R[NDIM], const double r[NDIM], double *result)
{
  //double axis = Lat->RealBasisQ[index];
  double x[NDIM], X[NDIM], Result[NDIM];
  int i;
  
  for(i=0;i<NDIM;i++)
    result[i] = x[i] = x[i] = 0.;
  //fprintf(stderr, "R = (%lg, %lg, %lg), r = (%lg, %lg, %lg)\n", R[0], R[1], R[2], r[0], r[1], r[2]);
  RMat33Vec3(X, Lat->ReciBasis, R); // transform both to [0,1]^3
  RMat33Vec3(x, Lat->ReciBasis, r);
  //fprintf(stderr, "X = (%lg, %lg, %lg), x = (%lg, %lg, %lg)\n", X[0], X[1], X[2], x[0], x[1], x[2]);
  for(i=0;i<NDIM;i++) {
//    if (fabs(X[i]) > 1.)
//      fprintf(stderr,"X[%i] > 1. : %lg!\n", i, X[i]);
//    if (fabs(x[i]) > 1.)
//      fprintf(stderr,"x[%i] > 1. : %lg!\n", i, x[i]);
    if (fabs(Result[i] = X[i] - x[i] + 2.*PI) < PI) { }
    else if (fabs(Result[i] = X[i] - x[i]) <= PI) { }
     else if (fabs(Result[i] = X[i] - x[i] - 2.*PI) < PI) { }
      else Error(SomeError, "MinImageConv: None of the three cases applied!");
  }
  for(i=0;i<NDIM;i++) // ReciBasis is not true inverse, but times 2.*PI
    Result[i] /= 2.*PI;
  RMat33Vec3(result, Lat->RealBasis, Result);
}

/** Linear interpolation for coordinate \a R that lies between grid nodes of \a *grid.
 * \param *P Problem at hand
 * \param *Lat Lattice structure for grid axis
 * \param *Lev LatticeLevel structure for grid axis node counts
 * \param R[] coordinate vector
 * \param *grid grid with fixed nodes
 * \return linearly interpolated value of \a *grid for position \a R[NDIM]
 */
double LinearInterpolationBetweenGrid(struct Problem *P, struct Lattice *Lat, struct LatticeLevel *Lev, double R[NDIM], fftw_real *grid) 
{
  double x[2][NDIM];
  const int myPE = P->Par.me_comm_ST_Psi;
  int N[NDIM];
  const int N0 = Lev->Plan0.plan->local_nx;
  N[0] = Lev->Plan0.plan->N[0];
  N[1] = Lev->Plan0.plan->N[1];
  N[2] = Lev->Plan0.plan->N[2];
  int g;
  double n[NDIM];
  int k[2][NDIM];
  double sigma;
  
  RMat33Vec3(n, Lat->ReciBasis, &R[0]); // transform real coordinates to [0,1]^3 vector
  for (g=0;g<NDIM;g++) {
    // k[i] are right and left nearest neighbour node to true position
    k[0][g] = floor(n[g]/(2.*PI)*(double)N[g]);  // n[2] is floor grid
    k[1][g] = ceil(n[g]/(2.*PI)*(double)N[g]); // n[1] is ceil grid
    // x[i] give weights of left and right neighbours (the nearer the true point is to one, the closer its weight to 1) 
    x[0][g] = (k[1][g] - n[g]/(2.*PI)*(double)N[g]);
    x[1][g] = 1. - x[0][g];
    //fprintf(stderr,"(%i) n = %lg, n_floor[%i] = %i\tn_ceil[%i] = %i --- x_floor[%i] = %e\tx_ceil[%i] = %e\n",P->Par.me, n[g], g,k[0][g], g,k[1][g], g,x[0][g], g,x[1][g]);
  }
  sigma = 0.;
  for (g=0;g<2;g++) { // interpolate linearly between adjacent grid points per axis
    if ((k[g][0] >= N0*myPE) && (k[g][0] < N0*(myPE+1))) {
      //fprintf(stderr,"(%i) grid[%i]: sigma = %e\n", P->Par.me,  k[g][2]+N[2]*(k[g][1]+N[1]*(k[g][0]-N0*myPE)), sigma);
      sigma += (x[g][0]*x[0][1]*x[0][2])*grid[k[0][2]+N[2]*(k[0][1]+N[1]*(k[g][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
      //fprintf(stderr,"(%i) grid[%i]: sigma += %e * %e \n", P->Par.me,  k[g][2]+N[2]*(k[g][1]+N[1]*(k[g][0]-N0*myPE)), (x[g][0]*x[0][1]*x[0][2]), grid[k[0][2]+N[2]*(k[0][1]+N[1]*(k[g][0]-N0*myPE))]*mu0);
      sigma += (x[g][0]*x[0][1]*x[1][2])*grid[k[1][2]+N[2]*(k[0][1]+N[1]*(k[g][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
      //fprintf(stderr,"(%i) grid[%i]: sigma += %e * %e \n", P->Par.me,  k[g][2]+N[2]*(k[g][1]+N[1]*(k[g][0]-N0*myPE)), (x[g][0]*x[0][1]*x[1][2]), grid[k[1][2]+N[2]*(k[0][1]+N[1]*(k[g][0]-N0*myPE))]*mu0);
      sigma += (x[g][0]*x[1][1]*x[0][2])*grid[k[0][2]+N[2]*(k[1][1]+N[1]*(k[g][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
      //fprintf(stderr,"(%i) grid[%i]: sigma += %e * %e \n", P->Par.me,  k[g][2]+N[2]*(k[g][1]+N[1]*(k[g][0]-N0*myPE)), (x[g][0]*x[1][1]*x[0][2]), grid[k[0][2]+N[2]*(k[1][1]+N[1]*(k[g][0]-N0*myPE))]*mu0);
      sigma += (x[g][0]*x[1][1]*x[1][2])*grid[k[1][2]+N[2]*(k[1][1]+N[1]*(k[g][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
      //fprintf(stderr,"(%i) grid[%i]: sigma += %e * %e \n", P->Par.me,  k[g][2]+N[2]*(k[g][1]+N[1]*(k[g][0]-N0*myPE)), (x[g][0]*x[1][1]*x[1][2]), grid[k[1][2]+N[2]*(k[1][1]+N[1]*(k[g][0]-N0*myPE))]*mu0);
    }
  }
  return sigma;
}

/** Linear Interpolation from all eight corners of the box that singles down to a point on the lower level.
 * \param *P Problem at hand
 * \param *Lev LatticeLevel structure for node numbers
 * \param upperNode Node around which to interpolate
 * \param *upperGrid array of grid points
 * \return summed up and then averaged octant around \a upperNode 
 */  
double LinearPullDownFromUpperLevel(struct Problem *P, struct LatticeLevel *Lev, int upperNode, fftw_real *upperGrid) 
{
  const int N0 = Lev->Plan0.plan->local_nx;
  const int N1 = Lev->Plan0.plan->N[1];
  const int N2 = Lev->Plan0.plan->N[2];
  double lowerGrid = 0.;
  int nr=1;
  lowerGrid += upperGrid[upperNode];
  if (upperNode % N0 != N0-1) {
    lowerGrid += upperGrid[upperNode+1];
    nr++;
    if (upperNode % N1 != N1-1) {
      lowerGrid += upperGrid[upperNode + 0 + N2*(1 + N1*1)];
      nr++;
      if (upperNode % N2 != N2-1) {
       lowerGrid += upperGrid[upperNode + 1 + N2*(1 + N1*1)];
        nr++;
      }
    }
    if (upperNode % N2 != N2-1) {
      lowerGrid += upperGrid[upperNode + 1 + N2*(0 + N1*1)];
      nr++;
    }
  }
  if (upperNode % N1 != N1-1) {
    lowerGrid += upperGrid[upperNode + 0 + N2*(1 + N1*0)];
    nr++;
    if (upperNode % N2 != N2-1) {
      lowerGrid += upperGrid[upperNode + 1 + N2*(1 + N1*0)];
      nr++;
    }
  }
  if (upperNode % N2 != N2-1) {
    lowerGrid += upperGrid[upperNode + 1 + N2*(0 + N1*0)];
    nr++;
  }
  return (lowerGrid/(double)nr);
} 

/** Evaluates the 1-stern in order to evaluate the first derivative on the grid.
 * \param *P Problem at hand
 * \param *Lev Level to interpret the \a *density on
 * \param *density array with gridded values
 * \param *n 3 vector with indices on the grid
 * \param axis axis along which is derived
 * \param myPE number of processes who share the density
 * \return  [+1/2 -1/2] of \a *n
 */
double FirstDiscreteDerivative(struct Problem *P, struct LatticeLevel *Lev, fftw_real *density, int *n, int axis, int myPE)
{
  int *N = Lev->Plan0.plan->N;  // maximum nodes per axis
  const int N0 = Lev->Plan0.plan->local_nx; // special local number due to parallel split up
  double ret[NDIM],  Ret[NDIM];  // return value local/global
  int i;

  for (i=0;i<NDIM;i++) {
    ret[i] = Ret[i] = 0.;
  }
  if (((n[0]+1)%N[0] >= N0*myPE) && ((n[0]+1)%N[0] < N0*(myPE+1)))   // next cell belongs to this process 
    ret[0] += 1./2. * (density[n[2]+N[2]*(n[1]+N[1]*(n[0]+1-N0*myPE))]);
  if (((n[0]-1)%N[0] >= N0*myPE) && ((n[0]-1)%N[0] < N0*(myPE+1))) // previous cell belongs to this process 
    ret[0] -= 1./2. * (density[n[2]+N[2]*(n[1]+N[1]*(n[0]-1-N0*myPE))]);
  if ((n[0] >= N0*myPE) && (n[0] < N0*(myPE+1))) { 
    ret[1] += 1./2. * (density[n[2]+N[2]*((n[1]+1)%N[1] + N[1]*(n[0]%N0))]);
    ret[1] -= 1./2. * (density[n[2]+N[2]*((n[1]-1)%N[1] + N[1]*(n[0]%N0))]);
  }     
  if ((n[0] >= N0*myPE) && (n[0] < N0*(myPE+1))) {
    ret[2] += 1./2. * (density[(n[2]+1)%N[2] + N[2]*(n[1]+N[1]*(n[0]%N0))]);
    ret[2] -= 1./2. * (density[(n[2]-1)%N[2] + N[2]*(n[1]+N[1]*(n[0]%N0))]);
  }
  
  if (MPI_Allreduce(ret, Ret, 3, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi) != MPI_SUCCESS)
    Error(SomeError, "FirstDiscreteDerivative: MPI_Allreduce failure!");
   
  for (i=0;i<NDIM;i++)  // transform from node count to [0,1]^3
    Ret[i] *= N[i]; 
  RMat33Vec3(ret, P->Lat.ReciBasis, Ret); // this actually divides it by mesh length in real coordinates
  //fprintf(stderr, "(%i) sum at (%i,%i,%i) : %lg\n",P->Par.me, n[0],n[1],n[2], ret[axis]);  
  return ret[axis]; ///(P->Lat.RealBasisQ[axis]/N[axis]);
} 

/** Fouriertransforms given \a source.
 * By the use of the symmetry parameter an additional imaginary unit and/or the momentum operator can
 * be applied at the same time.
 * \param *P Problem at hand
 * \param *Psi source array of reciprocal coefficients
 * \param *PsiR destination array, becoming filled with real coefficients
 * \param index_g component of G vector (only needed for symmetry=4..7)
 * \param symmetry 0 - do nothing, 1 - factor by "-1", 2 - factor by "i", 3 - factor by "1/i = -i", from 4 to 7 the same 
 *        but additionally with momentum operator 
 */
void fft_Psi(struct Problem *P, const fftw_complex *Psi, fftw_real *PsiR, const int index_g, const int symmetry)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  struct  fft_plan_3d *plan = Lat->plan;
  fftw_complex *tempdestRC = (fftw_complex *)Dens0->DensityArray[TempDensity];
  fftw_complex *work = Dens0->DensityCArray[TempDensity];
  fftw_complex *posfac, *destpos, *destRCS, *destRCD;
  int i, Index, pos;

  LockDensityArray(Dens0,TempDensity,imag); // tempdestRC
  SetArrayToDouble0((double *)tempdestRC, Dens0->TotalSize*2);
  SetArrayToDouble0((double *)PsiR, Dens0->TotalSize*2);
  switch (symmetry) {
    case 0:
      for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is positive
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = (Psi[i].re)*posfac[pos].re-(Psi[i].im)*posfac[pos].im;
          destpos[pos].im = (Psi[i].re)*posfac[pos].im+(Psi[i].im)*posfac[pos].re;
        }
      }
      break;
    case 1:
      for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is - positive
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = -((Psi[i].re)*posfac[pos].re-(Psi[i].im)*posfac[pos].im);
          destpos[pos].im = -((Psi[i].re)*posfac[pos].im+(Psi[i].im)*posfac[pos].re);
        }
      }
      break;
    case 2:
      for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is negative
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = (-Psi[i].im)*posfac[pos].re-(Psi[i].re)*posfac[pos].im;
          destpos[pos].im = (-Psi[i].im)*posfac[pos].im+(Psi[i].re)*posfac[pos].re;
        }
      }
      break;
    case 3:
      for (i=0;i<LevS->MaxG;i++) { // incoming is negative, outgoing is positive
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = (Psi[i].im)*posfac[pos].re-(-Psi[i].re)*posfac[pos].im;
          destpos[pos].im = (Psi[i].im)*posfac[pos].im+(-Psi[i].re)*posfac[pos].re;
        }
      }
      break;
    case 4:
      for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is positive
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = LevS->GArray[i].G[index_g]*((Psi[i].re)*posfac[pos].re-(Psi[i].im)*posfac[pos].im);
          destpos[pos].im = LevS->GArray[i].G[index_g]*((Psi[i].re)*posfac[pos].im+(Psi[i].im)*posfac[pos].re);
        }
      }
      break;
    case 5:
      for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is - positive
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = -LevS->GArray[i].G[index_g]*((Psi[i].re)*posfac[pos].re-(Psi[i].im)*posfac[pos].im);
          destpos[pos].im = -LevS->GArray[i].G[index_g]*((Psi[i].re)*posfac[pos].im+(Psi[i].im)*posfac[pos].re);
        }
      }
      break;
    case 6:
      for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is negative
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = LevS->GArray[i].G[index_g]*((-Psi[i].im)*posfac[pos].re-(Psi[i].re)*posfac[pos].im);
          destpos[pos].im = LevS->GArray[i].G[index_g]*((-Psi[i].im)*posfac[pos].im+(Psi[i].re)*posfac[pos].re);
        }
      }
      break;
    case 7:
      for (i=0;i<LevS->MaxG;i++) { // incoming is negative, outgoing is positive
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          //if (destpos != &tempdestRC[LevS->MaxNUp*Index] || LevS->MaxNUp*Index+pos<0 || LevS->MaxNUp*Index+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destpos corrupted");
          destpos[pos].re = LevS->GArray[i].G[index_g]*((Psi[i].im)*posfac[pos].re-(-Psi[i].re)*posfac[pos].im);
          destpos[pos].im = LevS->GArray[i].G[index_g]*((Psi[i].im)*posfac[pos].im+(-Psi[i].re)*posfac[pos].re);
        }
      }
      break;
  }
  for (i=0; i<LevS->MaxDoubleG; i++) {
    destRCS = &tempdestRC[LevS->DoubleG[2*i]*LevS->MaxNUp];
    destRCD = &tempdestRC[LevS->DoubleG[2*i+1]*LevS->MaxNUp];
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      //if (destRCD != &tempdestRC[LevS->DoubleG[2*i+1]*LevS->MaxNUp] || LevS->DoubleG[2*i+1]*LevS->MaxNUp+pos<0 || LevS->DoubleG[2*i+1]*LevS->MaxNUp+pos>=Dens0->TotalSize) Error(SomeError,"fft_Psi: destRCD corrupted");
      destRCD[pos].re =  destRCS[pos].re;
      destRCD[pos].im = -destRCS[pos].im;
    }
  }
  fft_3d_complex_to_real(plan, LevS->LevelNo, FFTNFUp, tempdestRC, work);
  DensityRTransformPos(LevS,(fftw_real*)tempdestRC, PsiR);
  UnLockDensityArray(Dens0,TempDensity,imag); // tempdestRC
}

/** Locks all NDIM_NDIM current density arrays
 * \param Dens0 Density structure to be locked (in the current parts)
 */
void AllocCurrentDensity(struct Density *Dens0) {
  // real
  LockDensityArray(Dens0,CurrentDensity0,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity1,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity2,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity3,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity4,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity5,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity6,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity7,real); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity8,real); // CurrentDensity[B_index]
  // imaginary
  LockDensityArray(Dens0,CurrentDensity0,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity1,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity2,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity3,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity4,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity5,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity6,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity7,imag); // CurrentDensity[B_index]
  LockDensityArray(Dens0,CurrentDensity8,imag); // CurrentDensity[B_index]
}

/** Reset and unlocks all NDIM_NDIM current density arrays
 * \param Dens0 Density structure to be unlocked/resetted (in the current parts)
 */
void DisAllocCurrentDensity(struct Density *Dens0) {
  //int i;
  // real
//  for(i=0;i<NDIM*NDIM;i++)
//    SetArrayToDouble0((double *)Dens0->DensityArray[i], Dens0->TotalSize*2);
  UnLockDensityArray(Dens0,CurrentDensity0,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity1,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity2,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity3,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity4,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity5,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity6,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity7,real); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity8,real); // CurrentDensity[B_index]
  // imaginary
//  for(i=0;i<NDIM*NDIM;i++)
//    SetArrayToDouble0((double *)Dens0->DensityCArray[i], Dens0->TotalSize*2);
  UnLockDensityArray(Dens0,CurrentDensity0,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity1,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity2,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity3,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity4,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity5,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity6,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity7,imag); // CurrentDensity[B_index]
  UnLockDensityArray(Dens0,CurrentDensity8,imag); // CurrentDensity[B_index]
}

// these defines safe-guard same symmetry for same kind of wave function
#define Psi0symmetry 0  // //0 //0 //0 // regard psi0 as real
#define Psi1symmetry 0  // //3 //0 //0 // regard psi0 as real
#define Psip0symmetry 6 //6  //6 //6 //6 // momentum times "i" due to operation on left hand 
#define Psip1symmetry 7 //7  //4 //6 //7 // momentum times "-i" as usual (right hand)

/** Evaluates the 3x3 current density arrays.
 * The formula we want to evaluate is as follows
 * \f[
 *  j_k(r) = \langle \psi_k^{(0)} | \Bigl (  p|r'\rangle\langle r' | + | r' \rangle \langle r' | p \Bigr )
              \Bigl [ | \psi_k^{(r\times p )} \rangle - r' \times | \psi_k^{(p)} \rangle \Bigr ] \cdot B.
 * \f]
 * Most of the DensityTypes-arrays are locked for temporary use. Pointers are set to their
 * start address and afterwards the current density arrays locked and reset'ed. Then for every
 * unperturbed wave function we do:
 * -# FFT unperturbed p-perturbed and rxp-perturbed wave function
 * -# FFT wave function with applied momentum operator for all three indices
 * -# For each index of the momentum operator:
 *  -# FFT p-perturbed wave function
 *  -# For every index of the external field:
 *    -# FFT rxp-perturbed wave function
 *    -# Evaluate current density for these momentum index and external field indices
 * 
 * Afterwards the temporary densities are unlocked and the density ones gathered from all Psi-
 * sharing processes. 
 * 
 * \param *P Problem at hand, containing Lattice and RunStruct
 */ 
void FillCurrentDensity(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *LevS = R->LevS;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = Lev0->Dens;
  fftw_complex *Psi0;
  fftw_real *Psi0R, *Psip0R;
  fftw_real *CurrentDensity[NDIM*NDIM];
  fftw_real *Psi1R;
  fftw_real *Psip1R;
  fftw_real *tempArray; // intendedly the same
  double r_bar[NDIM], x[NDIM], X[NDIM], fac[NDIM];
  double Current;//, current;
  const double UnitsFactor = 1.; ///LevS->MaxN;  // 1/N (from ff-backtransform)
  int i, index, B_index;
  int k, j, i0;
  int n[NDIM], n0;
  int *N;
  N = Lev0->Plan0.plan->N;
  const int N0 = Lev0->Plan0.plan->local_nx;
  //int ActNum;
  const int myPE =  P->Par.me_comm_ST_Psi;
  const int type = R->CurrentMin;
  MPI_Status status;
  int cross_lookup_1[4], cross_lookup_3[4], l_1 = 0, l_3 = 0;
  double Factor;//, factor;

  //fprintf(stderr,"(%i) FactoR %e\n", P->Par.me, R->FactorDensityR);

  // Init values and pointers
  if (P->Call.out[PsiOut]) {
    fprintf(stderr,"(%i) LockArray: ", P->Par.me);
    for(i=0;i<MaxDensityTypes;i++)
      fprintf(stderr,"(%i,%i) ",Dens0->LockArray[i],Dens0->LockCArray[i]);
    fprintf(stderr,"\n");
  }
  LockDensityArray(Dens0,Temp2Density,real); // Psi1R
  LockDensityArray(Dens0,Temp2Density,imag); // Psip1R and tempArray
  LockDensityArray(Dens0,GapDensity,real); // Psi0R
  LockDensityArray(Dens0,GapLocalDensity,real); // Psip0R

  Psi0R = (fftw_real *)Dens0->DensityArray[GapDensity];
  Psip0R = (fftw_real *)Dens0->DensityArray[GapLocalDensity];  
  Psi1R = (fftw_real *)Dens0->DensityArray[Temp2Density];
  tempArray = Psip1R = (fftw_real *)Dens0->DensityCArray[Temp2Density];
  SetArrayToDouble0((double *)Psi0R,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)Psip0R,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)Psi1R,Dens0->TotalSize*2);
  SetArrayToDouble0((double *)Psip1R,Dens0->TotalSize*2);

  if (P->Call.out[PsiOut]) {
    fprintf(stderr,"(%i) LockArray: ", P->Par.me);
    for(i=0;i<MaxDensityTypes;i++)
      fprintf(stderr,"(%i,%i) ",Dens0->LockArray[i],Dens0->LockCArray[i]);
    fprintf(stderr,"\n");
  }
  
  // don't put the following stuff into a for loop, they might not be continuous! (preprocessor values: CurrentDensity...) 
  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];
 
  // initialize the array if it is the first of all six perturbation run
  if ((R->DoFullCurrent == 0) && (R->CurrentMin == Perturbed_P0)) { // reset if FillDelta...() hasn't done it before 
    debug(P,"resetting CurrentDensity...");
    for (B_index=0; B_index<NDIM*NDIM; B_index++) // initialize current density array
      SetArrayToDouble0((double *)CurrentDensity[B_index],Dens0->TotalSize*2); // DensityArray is fftw_real, no 2*LocalSizeR here!
  } 

  switch(type) { // set j (which is linked to the index from derivation wrt to B^{ext})
    case Perturbed_P0:
    case Perturbed_P1:
    case Perturbed_P2:
      j = type - Perturbed_P0;
      l_1 = crossed(j,1);
      l_3 = crossed(j,3);
      for(k=0;k<4;k++) {
        cross_lookup_1[k] = cross(l_1,k);
        cross_lookup_3[k] = cross(l_3,k);
      }
      break;
    case Perturbed_RxP0:
    case Perturbed_RxP1:
    case Perturbed_RxP2:
      j = type - Perturbed_RxP0;
      break;
    default:
      j = 0;
      Error(SomeError,"FillCurrentDensity() called while not in perturbed minimisation!");
    break;
  }
  
  int wished = -1;
  FILE *file =  fopen(P->Call.MainParameterFile,"r");
  if (!ParseForParameter(0,file,"Orbital",0,1,1,int_type,&wished, 1, optional)) {
     if (P->Call.out[ReadOut]) fprintf(stderr,"Desired Orbital missing, using: All!\n");
    wished = -1;
  } else if (wished != -1) {
     if (P->Call.out[ReadOut]) fprintf(stderr,"Desired Orbital is: %i.\n", wished);
  } else { 
     if (P->Call.out[ReadOut]) fprintf(stderr,"Desired Orbital is: All.\n");
  }
  fclose(file);
  
   // Commence grid filling
  for (k=Psi->TypeStartIndex[Occupied];k<Psi->TypeStartIndex[Occupied+1];k++)  // every local wave functions adds up its part of the current
    if ((k + P->Par.me_comm_ST_PsiT*(Psi->TypeStartIndex[UnOccupied]-Psi->TypeStartIndex[Occupied]) == wished) || (wished == -1)) {  // compare with global number
    if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i)Calculating Current Density Summand of type %s for Psi (%i/%i) ... \n", P->Par.me, R->MinimisationName[type], Psi->LocalPsiStatus[k].MyGlobalNo, k);
    //ActNum = k - Psi->TypeStartIndex[Occupied] + Psi->TypeStartIndex[1] * Psi->LocalPsiStatus[k].my_color_comm_ST_Psi; // global number of unperturbed Psi
    Psi0 = LevS->LPsi->LocalPsi[k]; // Local unperturbed psi

    // now some preemptive ffts for the whole grid      
    if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Bringing |Psi0> one level up and fftransforming\n", P->Par.me);
    fft_Psi(P, Psi0, Psi0R, 0, Psi0symmetry);  //0 // 0 //0

    if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Bringing |Psi1> one level up and fftransforming\n", P->Par.me);
    fft_Psi(P, LevS->LPsi->LocalPsi[Psi->TypeStartIndex[type]+k], Psi1R, 0, Psi1symmetry); //3 //0 //0

    for (index=0;index<NDIM;index++) { // for all NDIM components of momentum operator
      
      if ((P->Call.out[StepLeaderOut]) && (!index)) fprintf(stderr,"(%i) Bringing p|Psi0> one level up and fftransforming\n", P->Par.me);
      fft_Psi(P, Psi0, Psip0R, index, Psip0symmetry); //6 //6 //6

      if ((P->Call.out[StepLeaderOut]) && (!index)) fprintf(stderr,"(%i) Bringing p|Psi1> one level up and fftransforming\n", P->Par.me);
      fft_Psi(P, LevS->LPsi->LocalPsi[Psi->TypeStartIndex[type]+k], Psip1R, index, Psip1symmetry);  //4 //6 //7
      
      // then for every point on the grid in real space ...

      //if (Psi1R != (fftw_real *)Dens0->DensityArray[Temp2Density] || i0<0 || i0>=Dens0->LocalSizeR) Error(SomeError,"fft_Psi: Psi1R corrupted");
       //Psi1R[i0] = (Psi1_rxp_R[j])[i0] - (r_bar[cross(j,0)] *  (Psi1_p_R[cross(j,1)])[i0] - r_bar[cross(j,2)] *  (Psi1_p_R[cross(j,3)])[i0]); // 
      //if (Psip1R != (fftw_real *)Dens0->DensityCArray[Temp2Density] || i0<0 || i0>=Dens0->LocalSizeR) Error(SomeError,"fft_Psi: Psip1R corrupted");
      //Psip1R[i0] = Psi1_rxp_pR[i0] - (r_bar[cross(j,0)] * (Psi1_p_pR[cross(j,1)])[i0] - r_bar[cross(j,2)] * (Psi1_p_pR[cross(j,3)])[i0]); //  

      switch(type) {
        case Perturbed_P0:
        case Perturbed_P1:
        case Perturbed_P2:
/*          // evaluate factor to compensate r x normalized phi(r) against normalized phi(rxp)
          factor = 0.;
          for (n0=0;n0<N0;n0++)  // only local points on x axis
            for (n[1]=0;n[1]<N[1];n[1]++)
              for (n[2]=0;n[2]<N[2];n[2]++) {
                i0 = n[2]+N[2]*(n[1]+N[1]*n0);
                n[0]=n0 + N0*myPE; // global relative coordinate: due to partitoning of x-axis in PEPGamma>1 case
                fac[0] = (double)n[0]/(double)N[0];
                fac[1] = (double)n[1]/(double)N[1];
                fac[2] = (double)n[2]/(double)N[2];
                RMat33Vec3(x, Lat->RealBasis, fac); // relative coordinate times basis matrix gives absolute ones
                MinImageConv(Lat, x, Psi->AddData[k].WannierCentre, X)
                for (i=0;i<NDIM;i++) // build gauge-translated r_bar evaluation point
                  r_bar[i] = sawtooth(Lat,X,i);
//                    ShiftGaugeOrigin(P,X,i);
                    //truedist(Lat, x[i], Psi->AddData[k].WannierCentre[i], i);
                factor += Psi1R[i0] * (r_bar[cross_lookup_1[0]] * Psi1R[i0]);
              }
          MPI_Allreduce (&factor, &Factor, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
          Factor *= R->FactorDensityR;  // discrete integration constant
          fprintf(stderr,"(%i) normalization factor of Phi^(RxP%i)_{%i} is %lg\n", P->Par.me, type, k, Factor);
          Factor =  1./sqrt(fabs(Factor)); //Factor/fabs(Factor) */
          Factor = 1.;
          for (n0=0;n0<N0;n0++)  // only local points on x axis
            for (n[1]=0;n[1]<N[1];n[1]++)
              for (n[2]=0;n[2]<N[2];n[2]++) {
                i0 = n[2]+N[2]*(n[1]+N[1]*n0);
                n[0]=n0 + N0*myPE; // global relative coordinate: due to partitoning of x-axis in PEPGamma>1 case
                fac[0] = (double)n[0]/(double)N[0];
                fac[1] = (double)n[1]/(double)N[1];
                fac[2] = (double)n[2]/(double)N[2];
                RMat33Vec3(x, Lat->RealBasis, fac); // relative coordinate times basis matrix gives absolute ones
                MinImageConv(Lat, x, Psi->AddData[k].WannierCentre, X);
                for (i=0;i<NDIM;i++) // build gauge-translated r_bar evaluation point
                  r_bar[i] = sawtooth(Lat,X[i],i);
//                    ShiftGaugeOrigin(P,X,i);
                    //X[i];
                Current = Psip0R[i0] * (r_bar[cross_lookup_1[0]] * Psi1R[i0]);
                Current += (Psi0R[i0] * r_bar[cross_lookup_1[0]] * Psip1R[i0]);
                Current *= .5 * UnitsFactor * Psi->LocalPsiStatus[k].PsiFactor * R->FactorDensityR; // factor confirmed, see CalculateOneDensityR() and InitDensityCalculation()
                ////if (CurrentDensity[index+j*NDIM] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index+j*NDIM] || i0<0 || i0>=Dens0->LocalSizeR || (index+j*NDIM)<0 || (index+j*NDIM)>=NDIM*NDIM) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
                CurrentDensity[index+l_1*NDIM][i0] -= Current; // note: sign of cross product resides in Current itself (here: plus)
                Current = - Psip0R[i0] * (r_bar[cross_lookup_3[2]] * Psi1R[i0]);
                Current += - (Psi0R[i0] * r_bar[cross_lookup_3[2]] * Psip1R[i0]);
                Current *= .5 * UnitsFactor * Psi->LocalPsiStatus[k].PsiFactor * R->FactorDensityR; // factor confirmed, see CalculateOneDensityR() and InitDensityCalculation()
                ////if (CurrentDensity[index+j*NDIM] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index+j*NDIM] || i0<0 || i0>=Dens0->LocalSizeR || (index+j*NDIM)<0 || (index+j*NDIM)>=NDIM*NDIM) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
                CurrentDensity[index+l_3*NDIM][i0] -= Current; // note: sign of cross product resides in Current itself (here: minus)
              }
          break;
        case Perturbed_RxP0:
        case Perturbed_RxP1:
        case Perturbed_RxP2:
          for (n0=0;n0<N0;n0++)  // only local points on x axis
            for (n[1]=0;n[1]<N[1];n[1]++)
              for (n[2]=0;n[2]<N[2];n[2]++) {
                i0 = n[2]+N[2]*(n[1]+N[1]*n0);
                Current = (Psip0R[i0] * Psi1R[i0] + Psi0R[i0] * Psip1R[i0]);
                Current *= .5 * UnitsFactor * Psi->LocalPsiStatus[k].PsiFactor * R->FactorDensityR; // factor confirmed, see CalculateOneDensityR() and InitDensityCalculation()
                ////if (CurrentDensity[index+j*NDIM] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index+j*NDIM] || i0<0 || i0>=Dens0->LocalSizeR || (index+j*NDIM)<0 || (index+j*NDIM)>=NDIM*NDIM) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
                CurrentDensity[index+j*NDIM][i0] += Current;
              }
          break;
         default:
          break;
      }
    }
    //OutputCurrentDensity(P);
  }
  
  //debug(P,"Unlocking arrays");
  //debug(P,"GapDensity");
  UnLockDensityArray(Dens0,GapDensity,real); // Psi0R
  //debug(P,"GapLocalDensity");
  UnLockDensityArray(Dens0,GapLocalDensity,real); // Psip0R
  //debug(P,"Temp2Density");
  UnLockDensityArray(Dens0,Temp2Density,real); // Psi1R
  
//  if (P->Call.out[StepLeaderOut]) 
//    fprintf(stderr,"\n\n");
    
  //debug(P,"MPI operation");
  // and in the end gather partial densities from other processes
  if (type == Perturbed_RxP2) // exchange all (due to shared wave functions) only after last pertubation run
    for (index=0;index<NDIM*NDIM;index++) {
      //if (tempArray != (fftw_real *)Dens0->DensityCArray[Temp2Density]) Error(SomeError,"FillCurrentDensity: tempArray corrupted");
      //debug(P,"tempArray to zero");
      SetArrayToDouble0((double *)tempArray, Dens0->TotalSize*2);
      ////if (CurrentDensity[index] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index]) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
      //debug(P,"CurrentDensity exchange");
      MPI_Allreduce( CurrentDensity[index], tempArray, Dens0->LocalSizeR, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT); // gather results from all wave functions ...
      switch(Psi->PsiST) {  // ... and also from SpinUp/Downs
        default:
            //debug(P,"CurrentDensity = tempArray");
            for (i=0;i<Dens0->LocalSizeR;i++) {
              ////if (CurrentDensity[index] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index] || i<0 || i>=Dens0->LocalSizeR) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
              CurrentDensity[index][i] = tempArray[i];
            }
            break;
        case SpinUp:
            //debug(P,"CurrentDensity exchange spinup");
            MPI_Sendrecv(tempArray, Dens0->LocalSizeR, MPI_DOUBLE, P->Par.me_comm_ST, CurrentTag1,
             CurrentDensity[index], Dens0->LocalSizeR, MPI_DOUBLE, P->Par.me_comm_ST, CurrentTag2, P->Par.comm_STInter, &status );
            //debug(P,"CurrentDensity += tempArray");
            for (i=0;i<Dens0->LocalSizeR;i++) {
              ////if (CurrentDensity[index] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index] || i<0 || i>=Dens0->LocalSizeR) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
              CurrentDensity[index][i] += tempArray[i];
            }
            break;
        case SpinDown:
            //debug(P,"CurrentDensity exchange spindown");
            MPI_Sendrecv(tempArray, Dens0->LocalSizeR, MPI_DOUBLE, P->Par.me_comm_ST, CurrentTag2,
             CurrentDensity[index], Dens0->LocalSizeR, MPI_DOUBLE, P->Par.me_comm_ST, CurrentTag1, P->Par.comm_STInter, &status );
            //debug(P,"CurrentDensity += tempArray");
            for (i=0;i<Dens0->LocalSizeR;i++) {
              ////if (CurrentDensity[index] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index] || i<0 || i>=Dens0->LocalSizeR) Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
              CurrentDensity[index][i] += tempArray[i];
            }
            break;
      }
    }
  //debug(P,"Temp2Density");
  UnLockDensityArray(Dens0,Temp2Density,imag); // Psip1R and tempArray
  //debug(P,"CurrentDensity end");
}

/** Structure holding Problem at hand and two indices, defining the greens function to be inverted.
 */
struct params
{
  struct Problem *P;
  int *k;
  int *l;
  int *iter;
  fftw_complex *x_l;
};

/** Wrapper function to solve G_kl x = b for x.
 * \param *x above x
 * \param *param additional parameters, here Problem at hand
 * \return evaluated to be minimized functional \f$\frac{1}{2}x \cdot Ax - xb\f$ at \a x on return
 */
static double DeltaCurrent_f(const gsl_vector * x, void * param) 
{
  struct Problem *P = ((struct params *)param)->P;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  struct Psis *Psi = &P->Lat.Psi; 
  struct PseudoPot *PP = &P->PP;
  const double PsiFactor = Psi->AllPsiStatus[*((struct params *)param)->k].PsiFactor;
  double result = 0.;
  fftw_complex *TempPsi = LevS->LPsi->TempPsi;
  fftw_complex *TempPsi2 = LevS->LPsi->TempPsi2;
  int u;

  //fprintf(stderr,"Evaluating f(%i,%i) for %i-th time\n", *((struct params *)param)->k, *((struct params *)param)->l, *((struct params *)param)->iter);

  // extract gsl_vector
  for (u=0;u<LevS->MaxG;u++) {
    TempPsi[u].re = gsl_vector_get(x, 2*u);
    TempPsi[u].im = gsl_vector_get(x, 2*u+1);
  }
  // generate fnl
  CalculateCDfnl(P, TempPsi, PP->CDfnl); // calculate needed non-local form factors
  // Apply Hamiltonian to x
  ApplyTotalHamiltonian(P,TempPsi,TempPsi2, PP->CDfnl,PsiFactor,0);
  // take scalar product to get eigen value
  result = .5 * PsiFactor * (((*((struct params *)param)->k == *((struct params *)param)->l ? GradSP(P,LevS,TempPsi,TempPsi2) : 0.) - Psi->lambda[*((struct params *)param)->k][*((struct params *)param)->l])) - GradSP(P,LevS,TempPsi,LevS->LPsi->LocalPsi[*((struct params *)param)->l]);
  return result;
}

/** Wrapper function to solve G_kl x = b for x.
 * \param *x above x
 * \param *param additional parameters, here Problem at hand
 * \param *g gradient vector on return
 * \return error code
 */
static void DeltaCurrent_df(const gsl_vector * x, void * param, gsl_vector * g)
{
  struct Problem *P = ((struct params *)param)->P;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  struct Psis *Psi = &P->Lat.Psi; 
  struct PseudoPot *PP = &P->PP;
  const double PsiFactor = Psi->AllPsiStatus[*((struct params *)param)->k].PsiFactor;
  fftw_complex *TempPsi = LevS->LPsi->TempPsi;
  fftw_complex *TempPsi2 = LevS->LPsi->TempPsi2;
  fftw_complex *x_l = ((struct params *)param)->x_l;
  int u;

  //fprintf(stderr,"Evaluating df(%i,%i) for %i-th time\n", *((struct params *)param)->k, *((struct params *)param)->l, *((struct params *)param)->iter);

  // extract gsl_vector
  for (u=0;u<LevS->MaxG;u++) {
    TempPsi[u].re = gsl_vector_get(x, 2*u);
    TempPsi[u].im = gsl_vector_get(x, 2*u+1);
  }
  // generate fnl
  CalculateCDfnl(P, TempPsi, PP->CDfnl); // calculate needed non-local form factors
  // Apply Hamiltonian to x
  ApplyTotalHamiltonian(P,TempPsi,TempPsi2, PP->CDfnl,PsiFactor,0);
  // put into returning vector
  for (u=0;u<LevS->MaxG;u++) {
    gsl_vector_set(g, 2*u, TempPsi2[u].re - x_l[u].re);
    gsl_vector_set(g, 2*u+1, TempPsi2[u].im - x_l[u].im);
  }
}

/** Wrapper function to solve G_kl x = b for x.
 * \param *x above x
 * \param *param additional parameters, here Problem at hand
 * \param *f evaluated to be minimized functional \f$\frac{1}{2}x \cdot Ax - xb\f$ at \a x on return
 * \param *g gradient vector on return
 * \return error code
 */
static void DeltaCurrent_fdf(const gsl_vector * x, void * param, double * f, gsl_vector * g)
{
  struct Problem *P = ((struct params *)param)->P;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  struct Psis *Psi = &P->Lat.Psi; 
  struct PseudoPot *PP = &P->PP;
  const double PsiFactor = Psi->AllPsiStatus[*((struct params *)param)->k].PsiFactor;
  fftw_complex *TempPsi = LevS->LPsi->TempPsi;
  fftw_complex *TempPsi2 = LevS->LPsi->TempPsi2;
  fftw_complex *x_l = ((struct params *)param)->x_l;
  int u;

  //fprintf(stderr,"Evaluating fdf(%i,%i) for %i-th time\n", *((struct params *)param)->k, *((struct params *)param)->l, *((struct params *)param)->iter);

  // extract gsl_vector
  for (u=0;u<LevS->MaxG;u++) {
    TempPsi[u].re = gsl_vector_get(x, 2*u);
    TempPsi[u].im = gsl_vector_get(x, 2*u+1);
  }
  // generate fnl
  CalculateCDfnl(P, TempPsi, PP->CDfnl); // calculate needed non-local form factors
  // Apply Hamiltonian to x
  ApplyTotalHamiltonian(P,TempPsi,TempPsi2, PP->CDfnl,PsiFactor,0);
  // put into returning vector
  for (u=0;u<LevS->MaxG;u++) {
    gsl_vector_set(g, 2*u, TempPsi[u].re - x_l[u].re);
    gsl_vector_set(g, 2*u+1, TempPsi[u].im - x_l[u].im);
  }
  
  *f = .5 * PsiFactor * (((*((struct params *)param)->k == *((struct params *)param)->l ? GradSP(P,LevS,TempPsi,TempPsi2) : 0.) - Psi->lambda[*((struct params *)param)->k][*((struct params *)param)->l])) - GradSP(P,LevS,TempPsi,LevS->LPsi->LocalPsi[*((struct params *)param)->l]);
}

/** Evaluates the \f$\Delta j_k(r')\f$ component of the current density.
 * \f[
 *  \Delta j_k(r') = \frac{e}{m} \sum_l \langle \varphi^{(0)}_k | \left ( p |r'\rangle \langle r'| + | r'\rangle\langle r'|p \right ) {\cal G}_{kl} (d_k - d_l) \times p | \varphi^{(1)}_l \rangle \cdot B 
 * \f]
 * \param *P Problem at hand
 * \note result has not yet been MPI_Allreduced for ParallelSimulationData#comm_ST_inter or ParallelSimulationData#comm_ST_PsiT groups!
 * \warning the routine is checked but does  not yet produce sensible results.
 */
void FillDeltaCurrentDensity(struct Problem *P)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct Density *Dens0 = Lev0->Dens;
  int i,j,s;
  int k,l,u, in, dex, index,i0;
  //const int Num = Psi->NoOfPsis;
  int RecvSource;
  MPI_Status status;
  struct OnePsiElement *OnePsiB, *LOnePsiB, *OnePsiA, *LOnePsiA;
  const int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  int n[NDIM], n0;
  int N[NDIM];
  N[0] = Lev0->Plan0.plan->N[0];
  N[1] = Lev0->Plan0.plan->N[1];
  N[2] = Lev0->Plan0.plan->N[2];
  const int N0 = Lev0->Plan0.plan->local_nx;
  fftw_complex *LPsiDatB; 
  fftw_complex *Psi0, *Psi1;
  fftw_real *Psi0R, *Psip0R;
  fftw_real *Psi1R, *Psip1R;
  fftw_complex *x_l = LevS->LPsi->TempPsi;//, **x_l_bak;
  fftw_real *CurrentDensity[NDIM*NDIM];
  int mem_avail, MEM_avail;
  double Current;
  double X[NDIM];
  const double UnitsFactor = 1.;
  int cross_lookup[4];
  struct params param;
  double factor;    // temporary factor in Psi1 pre-evaluation
  
  LockDensityArray(Dens0,GapDensity,real); // Psi0R
  LockDensityArray(Dens0,GapLocalDensity,real); // Psip0R
  LockDensityArray(Dens0,Temp2Density,imag); // Psi1
  LockDensityArray(Dens0,GapUpDensity,real); // Psi1R
  LockDensityArray(Dens0,GapDownDensity,real); // Psip1R

  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];

  Psi0R = (fftw_real *)Dens0->DensityArray[GapDensity];
  Psip0R = (fftw_real *)Dens0->DensityArray[GapLocalDensity];  
  Psi1 = (fftw_complex *) Dens0->DensityCArray[Temp2Density];
  Psi1R = (fftw_real *)Dens0->DensityArray[GapUpDensity];
  Psip1R = (fftw_real *)Dens0->DensityArray[GapDownDensity];  

//  if (R->CurrentMin == Perturbed_P0)  
//    for (B_index=0; B_index<NDIM*NDIM; B_index++) { // initialize current density array
//      debug(P,"resetting CurrentDensity...");
//      SetArrayToDouble0((double *)CurrentDensity[B_index],Dens0->TotalSize*2); // DensityArray is fftw_real, no 2*LocalSizeR here! 
//    }
  //if (Psi1 != (fftw_complex *) Dens0->DensityCArray[Temp2Density]) Error(SomeError,"FillDeltaCurrentDensity: Psi1 corrupted");
  SetArrayToDouble0((double *)Psi1,2*Dens0->TotalSize);
  
//  gsl_vector *x = gsl_vector_alloc(Num);
//  gsl_matrix *G = gsl_matrix_alloc(Num,Num);
//  gsl_permutation *p = gsl_permutation_alloc(Num);
  //int signum;
  // begin of GSL linearer CG solver stuff
  int iter, Status;
 
  const gsl_multimin_fdfminimizer_type *T;
  gsl_multimin_fdfminimizer *minset;
 
  /* Position of the minimum (1,2). */
  //double par[2] = { 1.0, 2.0 };
 
  gsl_vector *x;
  gsl_multimin_function_fdf my_func;
  
  param.P = P;
  param.k = &k;
  param.l = &l;
  param.iter = &iter;
  param.x_l = x_l;
  
  my_func.f = &DeltaCurrent_f;
  my_func.df = &DeltaCurrent_df;
  my_func.fdf = &DeltaCurrent_fdf;
  my_func.n = 2*LevS->MaxG;
  my_func.params = (void *)&param;

  T = gsl_multimin_fdfminimizer_conjugate_pr;
  minset = gsl_multimin_fdfminimizer_alloc (T, 2*LevS->MaxG);
  x = gsl_vector_alloc (2*LevS->MaxG);
   // end of GSL CG stuff


//  // construct G_kl = - (H^{(0)} \delta_{kl} -\langle \varphi^{(0)}_k |H^{(0)}| \varphi^{(0)}_l|rangle)^{-1} = A^{-1}
//  for (k=0;k<Num;k++)
//    for (l=0;l<Num;l++)
//      gsl_matrix_set(G, k, l, k == l ? 0. : Psi->lambda[k][l]);
//  // and decompose G_kl = L U
  
  mem_avail = MEM_avail = 0;
//  x_l_bak = x_l = (fftw_complex **) Malloc(sizeof(fftw_complex *)*Num,"FillDeltaCurrentDensity: *x_l");
//  for (i=0;i<Num;i++) {
//    x_l[i] = NULL;
//    x_l[i] = (fftw_complex *) malloc(sizeof(fftw_complex)*LevS->MaxG);
//    if (x_l[i] == NULL) {
//      mem_avail = 1;    // there was not enough memory for this node
//      fprintf(stderr,"(%i) FillDeltaCurrentDensity: x_l[%i] ... insufficient memory.\n",P->Par.me,i);
//    }
//  }
//  MPI_Allreduce(&mem_avail,&MEM_avail,1,MPI_INT,MPI_SUM,P->Par.comm_ST);  // sum results from all processes

  if (MEM_avail != 0) { // means at least node couldn't allocate sufficient memory, skipping...
    fprintf(stderr,"(%i) FillDeltaCurrentDensity: x_l[], not enough memory: %i! Skipping FillDeltaCurrentDensity evaluation.", P->Par.me, MEM_avail);
  } else {
    // sum over k and calculate \Delta j_k(r')
    k=-1;
    for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) {  // go through all wave functions
      //fprintf(stderr,"(%i) GlobalNo: %d\tLocalNo: %d\n", P->Par.me,Psi->AllPsiStatus[i].MyGlobalNo,Psi->AllPsiStatus[i].MyLocalNo);
      OnePsiA = &Psi->AllPsiStatus[i];    // grab OnePsiA
      if (OnePsiA->PsiType == Occupied) {   // drop the extra and perturbed ones
        k++;
        if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
           LOnePsiA = &Psi->LocalPsiStatus[OnePsiA->MyLocalNo];
        else 
           LOnePsiA = NULL;
        if (LOnePsiA != NULL) { 
          Psi0=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
          
          if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Bringing |Psi0> one level up and fftransforming\n", P->Par.me);
          //if (Psi0R != (fftw_real *)Dens0->DensityArray[GapDensity]) Error(SomeError,"FillDeltaCurrentDensity: Psi0R corrupted");
          fft_Psi(P,Psi0,Psi0R, 0, Psi0symmetry);  //0 // 0 //0

          for (in=0;in<NDIM;in++) { // in is the index from derivation wrt to B^{ext}   
            l = -1;
            for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
              OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiA
              if (OnePsiB->PsiType == Occupied)
                l++;
              if ((OnePsiB != OnePsiA) && (OnePsiB->PsiType == Occupied)) {   // drop the same and the extra ones
                if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
                   LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
                else 
                   LOnePsiB = NULL;
                if (LOnePsiB == NULL) {   // if it's not local ... receive x from respective process
                  RecvSource = OnePsiB->my_color_comm_ST_Psi;
                  MPI_Recv( x_l, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, HamiltonianTag, P->Par.comm_ST_PsiT, &status );
                } else {  // .. otherwise setup wave function as x ...             
                  // Evaluate cross product: \epsilon_{ijm} (d_k - d_l)_j p_m | \varphi^{(0)} \rangle = b_i ... and
                  LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
                  //LPsiDatx=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo+Psi->TypeStartIndex[Perturbed_P0]];
                  //CalculatePerturbationOperator_P(P,LPsiDatB,LPsiDatB_p0,cross(in,1),0);
                  //CalculatePerturbationOperator_P(P,LPsiDatB,LPsiDatB_p1,cross(in,3),0);
                  for (dex=0;dex<4;dex++)
                    cross_lookup[dex] = cross(in,dex);
                  MinImageConv(Lat,Psi->AddData[LOnePsiA->MyLocalNo].WannierCentre, Psi->AddData[LOnePsiB->MyLocalNo].WannierCentre,X);
                  for(s=0;s<LevS->MaxG;s++) {
                    //if (x_l != x_l_bak || s<0 || s>LevS->MaxG) Error(SomeError,"FillDeltaCurrentDensity: x_l[] corrupted");
                    factor = (X[cross_lookup[0]] * LevS->GArray[s].G[cross_lookup[1]] - X[cross_lookup[2]] * LevS->GArray[s].G[cross_lookup[3]]); 
                    x_l[s].re = factor * (-LPsiDatB[s].im); // switched due to factorization with "-i G"
                    x_l[s].im = factor * (LPsiDatB[s].re);
                  }
                  // ... and send it to all other processes (Max_me... - 1)
                  for (u=0;u<P->Par.Max_me_comm_ST_PsiT;u++)
                    if (u != OnePsiB->my_color_comm_ST_Psi) 
                      MPI_Send( x_l, LevS->MaxG*ElementSize, MPI_DOUBLE, u, HamiltonianTag, P->Par.comm_ST_PsiT);
                } // x_l row is now filled (either by receiving result or evaluating it on its own)
                // Solve Ax = b by minimizing 1/2 xAx -xb (gradient is residual Ax - b) with conjugate gradient polak-ribiere
    
                debug(P,"fill starting point x with values from b");
                /* Starting point, x = b */
                for (u=0;u<LevS->MaxG;u++) {
                  gsl_vector_set (x, 2*u, x_l[u].re);
                  gsl_vector_set (x, 2*u+1, x_l[u].im);
                }
     
                gsl_multimin_fdfminimizer_set (minset, &my_func, x, 0.01, 1e-4);
     
                fprintf(stderr,"(%i) Start solving for (%i,%i) and index %i\n",P->Par.me, k,l,in);
                // start solving
                iter = 0;
                do
                {
                  iter++;
                  Status = gsl_multimin_fdfminimizer_iterate (minset);
     
                  if (Status)
                    break;
     
                  Status = gsl_multimin_test_gradient (minset->gradient, 1e-3);
     
                  if (Status == GSL_SUCCESS)
                    fprintf (stderr,"(%i) Minimum found after %i iterations.\n", P->Par.me, iter);
     
                } while (Status == GSL_CONTINUE && iter < 100);
     
                debug(P,"Put solution into Psi1");
                // ... and what do we do now? Put solution into Psi1!
                for(s=0;s<LevS->MaxG;s++) {
                  //if (Psi1 != (fftw_complex *) Dens0->DensityCArray[Temp2Density] || s<0 || s>LevS->MaxG) Error(SomeError,"FillDeltaCurrentDensity: Psi1 corrupted");
                  Psi1[s].re = gsl_vector_get (minset->x, 2*s);
                  Psi1[s].im = gsl_vector_get (minset->x, 2*s+1);
                }
                
    //            // Solve A^{-1} b_i = x
    //            for(s=0;s<LevS->MaxG;s++) {
    //              // REAL PART
    //              // retrieve column from gathered matrix
    //              for(u=0;u<Num;u++)
    //                gsl_vector_set(x,u,x_l[u][s].re);
    //      
    //              // solve: sum_l A_{kl}^(-1) b_l (s) = x_k (s)
    //              gsl_linalg_LU_svx (G, p, x);
    //      
    //              // put solution back into x_l[s]
    //              for(u=0;u<Num;u++) {
    //                //if (x_l != x_l_bak || s<0 || s>=LevS->MaxG) Error(SomeError,"FillDeltaCurrentDensity: x_l[] corrupted");
    //                x_l[u][s].re = gsl_vector_get(x,u);
    //              }
    //              
    //              // IMAGINARY PART
    //              // retrieve column from gathered matrix
    //              for(u=0;u<Num;u++)
    //                gsl_vector_set(x,u,x_l[u][s].im);
    //      
    //              // solve: sum_l A_{kl}^(-1) b_l (s) = x_k (s)
    //              gsl_linalg_LU_svx (G, p, x);
    //      
    //              // put solution back into x_l[s]
    //              for(u=0;u<Num;u++) {
    //                //if (x_l != x_l_bak || s<0 || s>=LevS->MaxG) Error(SomeError,"FillDeltaCurrentDensity: x_l[] corrupted");
    //                x_l[u][s].im = gsl_vector_get(x,u);
    //              }
    //            } // now we have in x_l a vector similar to "Psi1" which we use to evaluate the current density
    //      
    //            // evaluate \Delta J_k ... mind the minus sign from G_kl!  
    //            // fill Psi1
    //            for(s=0;s<LevS->MaxG;s++) {
    //              //if (Psi1 != (fftw_complex *) Dens0->DensityCArray[Temp2Density] || s<0 || s>LevS->MaxG) Error(SomeError,"FillDeltaCurrentDensity: Psi1 corrupted");
    //              Psi1[s].re = x_l[k][s].re;
    //              Psi1[s].im = x_l[k][s].im;
    //            }
                            
                if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Bringing |Psi1> one level up and fftransforming\n", P->Par.me);
                //if (Psi1R != (fftw_real *)Dens0->DensityArray[GapUpDensity]) Error(SomeError,"FillDeltaCurrentDensity: Psi1R corrupted");
                fft_Psi(P,Psi1,Psi1R, 0, Psi1symmetry);  //2 // 0 //0
            
                for (index=0;index<NDIM;index++) { // for all NDIM components of momentum operator
          
                  if ((P->Call.out[StepLeaderOut]) && (!index)) fprintf(stderr,"(%i) Bringing p|Psi0> one level up and fftransforming\n", P->Par.me);
                  //if (Psip0R != (fftw_real *)Dens0->DensityArray[GapLocalDensity]) Error(SomeError,"FillDeltaCurrentDensity: Psip0R corrupted");
                  fft_Psi(P,Psi0,Psip0R, index, Psip0symmetry); //6 //6 //6
            
                  if ((P->Call.out[StepLeaderOut]) && (!index)) fprintf(stderr,"(%i) Bringing p|Psi1> one level up and fftransforming\n", P->Par.me);
                  //if (Psip1R != (fftw_real *)Dens0->DensityArray[GapDownDensity]) Error(SomeError,"FillDeltaCurrentDensity: Psip1R corrupted");
                  fft_Psi(P,Psi1,Psip1R, index, Psip1symmetry); //4 //6 //6
            
                    // then for every point on the grid in real space ...
                    for (n0=0;n0<N0;n0++)  // only local points on x axis
                      for (n[1]=0;n[1]<N[1];n[1]++)
                        for (n[2]=0;n[2]<N[2];n[2]++) {
                          i0 = n[2]+N[2]*(n[1]+N[1]*n0);
                          // and take the product
                          Current = (Psip0R[i0] * Psi1R[i0] + Psi0R[i0] * Psip1R[i0]);
                          Current *= 0.5 * UnitsFactor * Psi->AllPsiStatus[OnePsiA->MyGlobalNo].PsiFactor * R->FactorDensityR;
                          ////if (CurrentDensity[index+in*NDIM] != (fftw_real *) Dens0->DensityArray[CurrentDensity0 + index+in*NDIM])  Error(SomeError,"FillCurrentDensity: CurrentDensity[] corrupted");
                          //if (i0<0 || i0>=Dens0->LocalSizeR)  Error(SomeError,"FillDeltaCurrentDensity: i0 out of range");
                          //if ((index+in*NDIM)<0 || (index+in*NDIM)>=NDIM*NDIM) Error(SomeError,"FillDeltaCurrentDensity: index out of range");
                          CurrentDensity[index+in*NDIM][i0] += Current; // minus sign is from G_kl
                        }
                }
              }
            }
          }
        }
      }
    }
  }
  UnLockDensityArray(Dens0,GapDensity,real); // Psi0R
  UnLockDensityArray(Dens0,GapLocalDensity,real); // Psip0R
  UnLockDensityArray(Dens0,Temp2Density,imag); // Psi1
  UnLockDensityArray(Dens0,GapUpDensity,real); // Psi1R
  UnLockDensityArray(Dens0,GapDownDensity,real); // Psip1R
//  for (i=0;i<Num;i++)
//    if (x_l[i] != NULL) Free(x_l[i], "FillDeltaCurrentDensity: x_l[i]");         
//  Free(x_l, "FillDeltaCurrentDensity: x_l");
  gsl_multimin_fdfminimizer_free (minset);
  gsl_vector_free (x);            
//  gsl_matrix_free(G);
//  gsl_permutation_free(p);
//  gsl_vector_free(x);
}


/** Evaluates the overlap integral between \a state wave functions.
 * \f[
 *  S_{kl} = \langle \varphi_k^{(1)} | \varphi_l^{(1)} \rangle
 * \f]
 * The scalar product is calculated via GradSP(), MPI_Allreduced among comm_ST_Psi and the result
 * stored in Psis#Overlap. The rows have to be MPI exchanged, as otherwise processes will add
 * to the TotalEnergy overlaps calculated with old wave functions - they have been minimised after
 * the product with exchanged coefficients was taken.
 * \param *P Problem at hand
 * \param l local number of perturbed wave function.
 * \param state PsiTypeTag minimisation state of wave functions to be overlapped
 */
void CalculateOverlap(struct Problem *P, const int l, const enum PsiTypeTag state) 
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &(P->Lat);
  struct Psis *Psi = &Lat->Psi;
  struct LatticeLevel *LevS = R->LevS;
  struct OnePsiElement *OnePsiB, *LOnePsiB;
  fftw_complex *LPsiDatB=NULL, *LPsiDatA=NULL;
  const int ElementSize = (sizeof(fftw_complex) / sizeof(double));
  int RecvSource;
  MPI_Status status;
  int i,j,m,p;
  //const int l_normal = l - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[Occupied]; 
  const int ActNum = l - Psi->TypeStartIndex[state] + Psi->TypeStartIndex[1] * Psi->LocalPsiStatus[l].my_color_comm_ST_Psi;
  double *sendbuf, *recvbuf;
  double tmp,TMP;
  const int gsize = P->Par.Max_me_comm_ST_PsiT;  //number of processes in PsiT
  int p_num;  // number of wave functions (for overlap)

  // update overlap table after wave function has changed
  LPsiDatA = LevS->LPsi->LocalPsi[l];
  m = -1; // to access U matrix element (0..Num-1)
  for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) {  // go through all wave functions
    OnePsiB = &Psi->AllPsiStatus[j];    // grab OnePsiB
    if (OnePsiB->PsiType == state) {   // drop all but the ones of current min state
      m++;  // increase m if it is non-extra wave function
      if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
         LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
      else
         LOnePsiB = NULL;
      if (LOnePsiB == NULL) {   // if it's not local ... receive it from respective process into TempPsi
        RecvSource = OnePsiB->my_color_comm_ST_Psi;
        MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, OverlapTag, P->Par.comm_ST_PsiT, &status );
        LPsiDatB=LevS->LPsi->TempPsi;
      } else {                  // .. otherwise send it to all other processes (Max_me... - 1)
        for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
          if (p != OnePsiB->my_color_comm_ST_Psi)
            MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, OverlapTag, P->Par.comm_ST_PsiT);
        LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
      } // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received

      tmp = GradSP(P, LevS, LPsiDatA, LPsiDatB) * sqrt(Psi->LocalPsiStatus[l].PsiFactor * OnePsiB->PsiFactor);
      MPI_Allreduce ( &tmp, &TMP, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
      //fprintf(stderr,"(%i) Setting Overlap [%i][%i] = %lg\n",P->Par.me, ActNum,m,TMP);
      Psi->Overlap[ActNum][m] = TMP; //= Psi->Overlap[m][ActNum] 
    }
  }
 
  // exchange newly calculated rows among PsiT
  p_num = (m+1) + 1;  // number of Psis: one more due to ActNum
  sendbuf = (double *) Malloc(p_num * sizeof(double), "CalculateOverlap: sendbuf");
  sendbuf[0] = ActNum;  // first entry is the global row number
  for (i=1;i<p_num;i++)
    sendbuf[i] = Psi->Overlap[ActNum][i-1]; // then follow up each entry of overlap row
  recvbuf = (double *) Malloc(gsize * p_num * sizeof(double), "CalculateOverlap: recvbuf");
  MPI_Allgather(sendbuf, p_num, MPI_DOUBLE, recvbuf, p_num, MPI_DOUBLE, P->Par.comm_ST_PsiT);
  Free(sendbuf, "CalculateOverlap: sendbuf");
  for (i=0;i<gsize;i++) {// extract results from other processes out of receiving buffer
    m = recvbuf[i*p_num]; // m is ActNum of the process whose results we've just received
    //fprintf(stderr,"(%i) Received row %i from process %i\n", P->Par.me, m, i);
    for (j=1;j<p_num;j++)
      Psi->Overlap[m][j-1] = Psi->Overlap[j-1][m] = recvbuf[i*p_num+j]; // put each entry into correspondent Overlap row
  }
  Free(recvbuf, "CalculateOverlap: recvbuf");
}


/** Calculates magnetic susceptibility from known current density.
 * The bulk susceptibility tensor can be expressed as a function of the current density.
 * \f[
 *  \chi_{ij} = \frac{\mu_0}{2\Omega} \frac{\delta}{\delta B_i^{ext}} \int_\Omega d^3 r \left (r \times j(r) \right )_j
 * \f]
 * Thus the integral over real space and subsequent MPI_Allreduce() over results from ParallelSimulationData#comm_ST_Psi is 
 * straightforward. Tensor is diagonalized afterwards and split into its various sub-tensors of lower rank (e.g., isometric
 * value is tensor of rank 0) which are printed to screen and the tensorial elements to file '....chi.csv'
  * \param *P Problem at hand
  */
void CalculateMagneticSusceptibility(struct Problem *P)
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = R->Lev0->Dens;
  struct Ions *I = &P->Ion;
  fftw_real *CurrentDensity[NDIM*NDIM];
  int in, dex, i, i0, n0;
  int n[NDIM];
  const int N0 = Lev0->Plan0.plan->local_nx;
  int N[NDIM];
  N[0] = Lev0->Plan0.plan->N[0];
  N[1] = Lev0->Plan0.plan->N[1];
  N[2] = Lev0->Plan0.plan->N[2];
  double chi[NDIM*NDIM],Chi[NDIM*NDIM], x[NDIM], X[NDIM], fac[NDIM];
  const double discrete_factor = Lat->Volume/Lev0->MaxN;
  const int myPE =  P->Par.me_comm_ST_Psi;
  double eta, delta_chi, S, A, iso;
  int cross_lookup[4];
  char filename[256];
  FILE *ChiFile;
  time_t seconds;  
  time(&seconds); // get current time

 // set pointers onto current density
  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];
  //for(i=0;i<NDIM;i++) {
//    field[i] = Dens0->DensityArray[TempDensity+i];
    //LockDensityArray(Dens0,TempDensity+i,real);
//    SetArrayToDouble0((double *)field[i],Dens0->TotalSize*2);  
  //}
  gsl_matrix_complex *H = gsl_matrix_complex_calloc(NDIM,NDIM);
    
  
  if (P->Call.out[ValueOut]) fprintf(stderr,"(%i) magnetic susceptibility tensor \\Chi_ij = \n",P->Par.me);
  for (in=0; in<NDIM; in++) { // index i of integrand vector component
    for(dex=0;dex<4;dex++)  // initialise cross lookup
      cross_lookup[dex] = cross(in,dex);
    for (dex=0; dex<NDIM; dex++) {  // index j of derivation wrt B field 
      chi[in+dex*NDIM] = 0.;
      // do the integration over real space
      for(n0=0;n0<N0;n0++)
        for(n[1]=0;n[1]<N[1];n[1]++)
          for(n[2]=0;n[2]<N[2];n[2]++) {
            n[0]=n0 + N0*myPE; // global relative coordinate: due to partitoning of x-axis in PEPGamma>1 case
            fac[0] = (double)(n[0])/(double)N[0];
            fac[1] = (double)(n[1])/(double)N[1];
            fac[2] = (double)(n[2])/(double)N[2];
            RMat33Vec3(x, Lat->RealBasis, fac);
            i0 = n[2]+N[2]*(n[1]+N[1]*(n0));  // the index of current density must match LocalSizeR!
            MinImageConv(Lat,x, Lat->RealBasisCenter, X);
            chi[in+dex*NDIM] += X[cross_lookup[0]] * CurrentDensity[dex*NDIM+cross_lookup[1]][i0]; // x[cross(in,0)], Lat->RealBasisCenter[cross_lookup[0]]
            chi[in+dex*NDIM] -= X[cross_lookup[2]] * CurrentDensity[dex*NDIM+cross_lookup[3]][i0]; // x[cross(in,2)], Lat->RealBasisCenter[cross_lookup[2]]
//            if (in == dex) field[in][i0] = 
//                truedist(Lat,x[cross_lookup[0]], sqrt(Lat->RealBasisSQ[c[0]])/2.,cross_lookup[0]) * CurrentDensity[dex*NDIM+cross_lookup[1]][i0] 
//              - truedist(Lat,x[cross_lookup[2]], sqrt(Lat->RealBasisSQ[c[2]])/2.,cross_lookup[2]) * CurrentDensity[dex*NDIM+cross_lookup[3]][i0];
            //fprintf(stderr,"(%i) temporary susceptiblity \\chi[%i][%i] += %e * %e = r[%i] * CurrDens[%i][%i] = %e\n",P->Par.me,in,dex,(double)n[cross_lookup[0]]/(double)N[cross_lookup[0]]*(sqrt(Lat->RealBasisSQ[cross_lookup[0]])),CurrentDensity[dex*NDIM+cross_lookup[1]][i0],cross_lookup[0],dex*NDIM+cross_lookup[1],i0,chi[in*NDIM+dex]);
          }
      chi[in+dex*NDIM] *= mu0*discrete_factor/(2.*Lat->Volume); // integral factor
      chi[in+dex*NDIM] *= (-1625.); // empirical gauge factor ... sigh
      MPI_Allreduce ( &chi[in+dex*NDIM], &Chi[in+dex*NDIM], 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // sum "LocalSize to TotalSize"
      I->I[0].chi[in+dex*NDIM] = Chi[in+dex*NDIM];
      Chi[in+dex*NDIM] *= Lat->Volume*loschmidt_constant; // factor for _molar_ susceptibility
      if (P->Call.out[ValueOut]) {
        fprintf(stderr,"%e\t", Chi[in+dex*NDIM]);
        if (dex == NDIM-1) fprintf(stderr,"\n");
      }
    }
  }
  // store symmetrized matrix
  for (in=0;in<NDIM;in++)
    for (dex=0;dex<NDIM;dex++)
      gsl_matrix_complex_set(H,in,dex,gsl_complex_rect((Chi[in+dex*NDIM]+Chi[dex+in*NDIM])/2.,0));
  // output tensor to file
  if (P->Par.me == 0) {
    sprintf(filename, ".chi.L%i.csv", Lev0->LevelNo);
    OpenFile(P, &ChiFile, filename, "a", P->Call.out[ReadOut]);
    fprintf(ChiFile,"# magnetic susceptibility tensor chi[01,02,03,10,11,12,20,21,22], seed %i, config %s, run on %s", R->Seed, P->Files.default_path, ctime(&seconds));
    fprintf(ChiFile,"%lg\t", P->Lat.ECut/(Lat->LevelSizes[0]*Lat->LevelSizes[0]));
    for (in=0;in<NDIM*NDIM;in++)
      fprintf(ChiFile,"%e\t", Chi[in]);
    fprintf(ChiFile,"\n");
    fclose(ChiFile);
  }
  // diagonalize chi
  gsl_vector *eval = gsl_vector_alloc(NDIM);
  gsl_eigen_herm_workspace *w = gsl_eigen_herm_alloc(NDIM);
  gsl_eigen_herm(H, eval, w);
  gsl_eigen_herm_free(w);
  gsl_sort_vector(eval);  // sort eigenvalues
  // print eigenvalues
  iso = 0;
  for (i=0;i<NDIM;i++) {
    I->I[0].chi_PAS[i] = gsl_vector_get(eval,i);
    iso += Chi[i+i*NDIM]/3.;
  } 
  eta = (gsl_vector_get(eval,1)-gsl_vector_get(eval,0))/(gsl_vector_get(eval,2)-iso);
  delta_chi = gsl_vector_get(eval,2) - 0.5*(gsl_vector_get(eval,0)+gsl_vector_get(eval,1));
  S = (delta_chi*delta_chi)*(1+1./3.*eta*eta);
  A = 0.;
  for (i=0;i<NDIM;i++) {
    in = cross(i,0);
    dex = cross(i,1);
    A += pow(-1,i)*pow(0.5*(Chi[in+dex*NDIM]-Chi[dex+in*NDIM]),2);
  }
  if (P->Call.out[ValueOut]) {
    fprintf(stderr,"(%i) converted to Principal Axis System\n==================\nDiagonal entries:", P->Par.me);
    for (i=0;i<NDIM;i++)
      fprintf(stderr,"\t%lg",gsl_vector_get(eval,i));
    fprintf(stderr,"\nsusceptib. : %e\n", iso);
    fprintf(stderr,"anisotropy : %e\n", delta_chi);
    fprintf(stderr,"asymmetry  : %e\n", eta);
    fprintf(stderr,"S          : %e\n", S);
    fprintf(stderr,"A          : %e\n", A);
    fprintf(stderr,"==================\n");
  }
  //for(i=0;i<NDIM;i++)
    //UnLockDensityArray(Dens0,TempDensity+i,real);  
  gsl_vector_free(eval);
  gsl_matrix_complex_free(H);
}

/** Fouriertransforms all nine current density components and calculates shielding tensor.
 * \f[
 *  \sigma_{ij} = \left ( \frac{G}{|G|^2} \times J_i(G) \right )_j
 * \f]
 * The CurrentDensity has to be fouriertransformed to reciprocal subspace in order to be useful, and the final
 * product \f$\sigma_{ij}(G)\f$ has to be back-transformed to real space. However, the shielding is the only evaluated
 * at the grid points and not where the real ion position is. The shieldings there are interpolated between the eight
 * adjacent grid points by a simple linear weighting. Afterwards follows the same analaysis and printout of the rank-2-tensor
 * as in the case of CalculateMagneticShielding().
 * \param *P Problem at hand
 * \note Lots of arrays are used temporarily during the routine for the fft'ed Current density tensor.
 * \note MagneticSusceptibility is needed for G=0-component and thus has to be computed beforehand
 */
void CalculateChemicalShieldingByReciprocalCurrentDensity(struct Problem *P)
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Ions *I = &P->Ion;
  struct Density *Dens0 = Lev0->Dens;
  struct OneGData *GArray = Lev0->GArray;
  struct fft_plan_3d *plan = Lat->plan; 
  fftw_real *CurrentDensity[NDIM*NDIM];
  fftw_complex *CurrentDensityC[NDIM*NDIM];
  fftw_complex *work = (fftw_complex *)Dens0->DensityCArray[TempDensity];
  //fftw_complex *sigma_imag = (fftw_complex *)Dens0->DensityCArray[Temp2Density];
  //fftw_real *sigma_real = (fftw_real *)sigma_imag;
  fftw_complex *sigma_imag[NDIM_NDIM];
  fftw_real *sigma_real[NDIM_NDIM];
  double sigma,Sigma;
  double x[2][NDIM];
  int it, ion, in, dex, g, Index, i;
  //const double FFTfactor = 1.;///Lev0->MaxN;
  int n[2][NDIM];
  double eta, delta_sigma, S, A, iso, tmp;
  FILE *SigmaFile;
  char suffixsigma[255];
  const int myPE = P->Par.me_comm_ST_Psi;
  int N[NDIM];
  int cross_lookup[4]; // cross lookup table
  N[0] = Lev0->Plan0.plan->N[0];
  N[1] = Lev0->Plan0.plan->N[1];
  N[2] = Lev0->Plan0.plan->N[2];
  const int N0 = Lev0->Plan0.plan->local_nx;
  const double factorDC = R->FactorDensityC;
  gsl_matrix_complex *H = gsl_matrix_complex_calloc(NDIM,NDIM);
  
  time_t seconds;  
  time(&seconds); // get current time

  // inverse Fourier transform current densities
  CurrentDensityC[0] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity0];
  CurrentDensityC[1] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity1];
  CurrentDensityC[2] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity2];
  CurrentDensityC[3] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity3];
  CurrentDensityC[4] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity4];
  CurrentDensityC[5] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity5];
  CurrentDensityC[6] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity6];
  CurrentDensityC[7] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity7];
  CurrentDensityC[8] = (fftw_complex *) Dens0->DensityCArray[CurrentDensity8];
  // don't put the following stuff into a for loop, they are not continuous! (preprocessor values CurrentDensity.)
  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];

  // inverse Fourier transform current densities
  if (P->Call.out[LeaderOut]) fprintf(stderr,"(%i) Transforming and checking J_{ij} (G=0) = 0 for each i,j ... \n",P->Par.me);
  for (in=0;in<NDIM*NDIM;in++) {
    CalculateOneDensityC(Lat, R->LevS, Dens0, CurrentDensity[in], CurrentDensityC[in], factorDC);
    TestReciprocalCurrent(P, CurrentDensityC[in], GArray, in);
  }

  // linking pointers to the arrays
  for (in=0;in<NDIM*NDIM;in++) {
    LockDensityArray(Dens0,in,real); // Psi1R
    sigma_imag[in] = (fftw_complex *) Dens0->DensityArray[in];
    sigma_real[in] = (fftw_real *) sigma_imag[in];
  }

  LockDensityArray(Dens0,TempDensity,imag); // work
  LockDensityArray(Dens0,Temp2Density,imag); // tempdestRC and field
  // go through reciprocal nodes and calculate shielding tensor sigma
  for (in=0; in<NDIM; in++) {// index i of vector component in integrand
    for(dex=0;dex<4;dex++)  // initialise cross lookup
      cross_lookup[dex] = cross(in,dex);
    for (dex=0; dex<NDIM; dex++) { // index j of B component derivation in current density tensor
      //if (tempdestRC != (fftw_complex *)Dens0->DensityCArray[Temp2Density]) Error(SomeError,"CalculateChemicalShieldingByReciprocalCurrentDensity: tempdestRC corrupted");
      SetArrayToDouble0((double *)sigma_imag[in+dex*NDIM],Dens0->TotalSize*2);
      for (g=0; g < Lev0->MaxG; g++)
        if (GArray[g].GSq > MYEPSILON) { // skip due to divisor
          Index = GArray[g].Index; // re = im, im = -re due to "i" in formula
          //if (tempdestRC != (fftw_complex *)Dens0->DensityCArray[Temp2Density] || Index<0 || Index>=Dens0->LocalSizeC) Error(SomeError,"CalculateChemicalShieldingByReciprocalCurrentDensity: tempdestRC corrupted");
          sigma_imag[in+dex*NDIM][Index].re  = GArray[g].G[cross_lookup[0]] * (-CurrentDensityC[dex*NDIM+cross_lookup[1]][Index].im)/GArray[g].GSq;//*FFTfactor;
          sigma_imag[in+dex*NDIM][Index].re -= GArray[g].G[cross_lookup[2]] * (-CurrentDensityC[dex*NDIM+cross_lookup[3]][Index].im)/GArray[g].GSq;//*FFTfactor;
          sigma_imag[in+dex*NDIM][Index].im  = GArray[g].G[cross_lookup[0]] * ( CurrentDensityC[dex*NDIM+cross_lookup[1]][Index].re)/GArray[g].GSq;//*FFTfactor;
          sigma_imag[in+dex*NDIM][Index].im -= GArray[g].G[cross_lookup[2]] * ( CurrentDensityC[dex*NDIM+cross_lookup[3]][Index].re)/GArray[g].GSq;//*FFTfactor;
        } else {  // G=0-component stems from magnetic susceptibility
          sigma_imag[in+dex*NDIM][GArray[g].Index].re = 2./3.*I->I[0].chi[in+dex*NDIM];//-4.*M_PI*(0.5*I->I[0].chi[0+0*NDIM]+0.5*I->I[0].chi[1+1*NDIM]+2./3.*I->I[0].chi[2+2*NDIM]);
        }
      for (g=0; g<Lev0->MaxDoubleG; g++) { // apply symmetry
        //if (tempdestRC != (fftw_complex *)Dens0->DensityCArray[Temp2Density] || Lev0->DoubleG[2*g+1]<0 || Lev0->DoubleG[2*g+1]>=Dens0->LocalSizeC) Error(SomeError,"CalculateChemicalShieldingByReciprocalCurrentDensity: tempdestRC corrupted");
        sigma_imag[in+dex*NDIM][Lev0->DoubleG[2*g+1]].re =  sigma_imag[in+dex*NDIM][Lev0->DoubleG[2*g]].re;
        sigma_imag[in+dex*NDIM][Lev0->DoubleG[2*g+1]].im = -sigma_imag[in+dex*NDIM][Lev0->DoubleG[2*g]].im;
      }
      // fourier transformation of sigma
      //if (tempdestRC != (fftw_complex *)Dens0->DensityCArray[Temp2Density]) Error(SomeError,"CalculateChemicalShieldingByReciprocalCurrentDensity: tempdestRC corrupted");
      fft_3d_complex_to_real(plan, Lev0->LevelNo, FFTNF1, sigma_imag[in+dex*NDIM], work);

      for (it=0; it < I->Max_Types; it++) {  // integration over all types
        for (ion=0; ion < I->I[it].Max_IonsOfType; ion++) { // and each ion of type
          // read transformed sigma at core position and MPI_Allreduce
          for (g=0;g<NDIM;g++) {
            n[0][g] = floor(I->I[it].R[NDIM*ion+g]/sqrt(Lat->RealBasisSQ[g])*(double)N[g]);
            n[1][g] = ceil(I->I[it].R[NDIM*ion+g]/sqrt(Lat->RealBasisSQ[g])*(double)N[g]);
            x[1][g] = I->I[it].R[NDIM*ion+g]/sqrt(Lat->RealBasisSQ[g])*(double)N[g] - (double)n[0][g];
            x[0][g] = 1. - x[1][g];
            //fprintf(stderr,"(%i) n_floor[%i] = %i\tn_ceil[%i] = %i --- x_floor[%i] = %e\tx_ceil[%i] = %e\n",P->Par.me, g,n[0][g], g,n[1][g], g,x[0][g], g,x[1][g]);
          }
          sigma = 0.;
          for (i=0;i<2;i++) { // interpolate linearly between adjacent grid points per axis
            if ((n[i][0] >= N0*myPE) && (n[i][0] < N0*(myPE+1))) {
//              fprintf(stderr,"(%i) field[%i]: sigma = %e\n", P->Par.me,  n[i][2]+N[2]*(n[i][1]+N[1]*(n[i][0]-N0*myPE)), sigma);
              sigma += (x[i][0]*x[0][1]*x[0][2])*sigma_real[in+dex*NDIM][n[0][2]+N[2]*(n[0][1]+N[1]*(n[i][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
              //fprintf(stderr,"(%i) field[%i]: sigma += %e * %e \n", P->Par.me,  n[i][2]+N[2]*(n[i][1]+N[1]*(n[i][0]-N0*myPE)), (x[i][0]*x[0][1]*x[0][2]), field[n[0][2]+N[2]*(n[0][1]+N[1]*(n[i][0]-N0*myPE))]*mu0);
              sigma += (x[i][0]*x[0][1]*x[1][2])*sigma_real[in+dex*NDIM][n[1][2]+N[2]*(n[0][1]+N[1]*(n[i][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
              //fprintf(stderr,"(%i) field[%i]: sigma += %e * %e \n", P->Par.me,  n[i][2]+N[2]*(n[i][1]+N[1]*(n[i][0]-N0*myPE)), (x[i][0]*x[0][1]*x[1][2]), field[n[1][2]+N[2]*(n[0][1]+N[1]*(n[i][0]-N0*myPE))]*mu0);
              sigma += (x[i][0]*x[1][1]*x[0][2])*sigma_real[in+dex*NDIM][n[0][2]+N[2]*(n[1][1]+N[1]*(n[i][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
              //fprintf(stderr,"(%i) field[%i]: sigma += %e * %e \n", P->Par.me,  n[i][2]+N[2]*(n[i][1]+N[1]*(n[i][0]-N0*myPE)), (x[i][0]*x[1][1]*x[0][2]), field[n[0][2]+N[2]*(n[1][1]+N[1]*(n[i][0]-N0*myPE))]*mu0);
              sigma += (x[i][0]*x[1][1]*x[1][2])*sigma_real[in+dex*NDIM][n[1][2]+N[2]*(n[1][1]+N[1]*(n[i][0]-N0*myPE))]*mu0; // if it's local and factor from inverse fft
              //fprintf(stderr,"(%i) field[%i]: sigma += %e * %e \n", P->Par.me,  n[i][2]+N[2]*(n[i][1]+N[1]*(n[i][0]-N0*myPE)), (x[i][0]*x[1][1]*x[1][2]), field[n[1][2]+N[2]*(n[1][1]+N[1]*(n[i][0]-N0*myPE))]*mu0);
            }
          }
          sigma *= -R->FactorDensityR; // factor from inverse fft? (and its defined as negative proportionaly factor)
          MPI_Allreduce ( &sigma, &Sigma, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);  // sum local to total
          I->I[it].sigma_rezi[ion][in+dex*NDIM] = Sigma;
        }
      }
      // fabs() all sigma values, as we need them as a positive density: OutputVis plots them in logarithmic scale and
      // thus cannot deal with negative values!
      for (i=0; i< Dens0->LocalSizeR; i++)
        sigma_real[in+dex*NDIM][i] = fabs(sigma_real[in+dex*NDIM][i]);
    }
  }
  UnLockDensityArray(Dens0,TempDensity,imag); // work
  UnLockDensityArray(Dens0,Temp2Density,imag); // tempdestRC and field

  // output tensor to file
  if (P->Par.me == 0) {
    sprintf(&suffixsigma[0], ".sigma_chi_rezi.L%i.csv", Lev0->LevelNo);
    OpenFile(P, &SigmaFile, suffixsigma, "a", P->Call.out[ReadOut]);
    fprintf(SigmaFile,"# chemical shielding tensor sigma_rezi[01,02,03,10,11,12,20,21,22], seed %i, config %s, run on %s", R->Seed, P->Files.default_path, ctime(&seconds));
    fprintf(SigmaFile,"%lg\t", P->Lat.ECut/(Lat->LevelSizes[0]*Lat->LevelSizes[0]));
    for (in=0;in<NDIM;in++)
      for (dex=0;dex<NDIM;dex++)
        fprintf(SigmaFile,"%e\t", GSL_REAL(gsl_matrix_complex_get(H,in,dex)));
    fprintf(SigmaFile,"\n");
    fclose(SigmaFile);
  }

  gsl_vector *eval = gsl_vector_alloc(NDIM);
  gsl_eigen_herm_workspace *w = gsl_eigen_herm_alloc(NDIM);

  for (it=0; it < I->Max_Types; it++) {  // integration over all types
    for (ion=0; ion < I->I[it].Max_IonsOfType; ion++) { // and each ion of type
      if (P->Call.out[ValueOut]) fprintf(stderr,"(%i) Shielding Tensor for Ion %i of element %s \\sigma_ij = \n",P->Par.me, ion, I->I[it].Name);
      for (in=0; in<NDIM; in++) { // index i of vector component in integrand
        for (dex=0; dex<NDIM; dex++) {// index j of B component derivation in current density tensor
          gsl_matrix_complex_set(H,in,dex,gsl_complex_rect((I->I[it].sigma_rezi[ion][in+dex*NDIM]+I->I[it].sigma_rezi[ion][dex+in*NDIM])/2.,0));
          if (P->Call.out[ValueOut]) fprintf(stderr,"%e\t", I->I[it].sigma_rezi[ion][in+dex*NDIM]);
        }
        if (P->Call.out[ValueOut]) fprintf(stderr,"\n");
      }
      // output tensor to file
      if (P->Par.me == 0) {
        sprintf(&suffixsigma[0], ".sigma_i%i_%s_rezi.L%i.csv", ion, I->I[it].Symbol, Lev0->LevelNo);
        OpenFile(P, &SigmaFile, suffixsigma, "a", P->Call.out[ReadOut]);
        fprintf(SigmaFile,"# chemical shielding tensor sigma_rezi[01,02,03,10,11,12,20,21,22], seed %i, config %s, run on %s", R->Seed, P->Files.default_path, ctime(&seconds));
        fprintf(SigmaFile,"%lg\t", P->Lat.ECut/(Lat->LevelSizes[0]*Lat->LevelSizes[0]));
        for (in=0;in<NDIM;in++)
          for (dex=0;dex<NDIM;dex++)
            fprintf(SigmaFile,"%e\t", I->I[it].sigma_rezi[ion][in+dex*NDIM]);
        fprintf(SigmaFile,"\n");
        fclose(SigmaFile);
      }
      // diagonalize sigma
      gsl_eigen_herm(H, eval, w);
      gsl_sort_vector(eval);  // sort eigenvalues
//      print eigenvalues
//      if (P->Call.out[ValueOut]) {
//        fprintf(stderr,"(%i) diagonal shielding for Ion %i of element %s:", P->Par.me, ion, I->I[it].Name);
//        for (in=0;in<NDIM;in++)
//          fprintf(stderr,"\t%lg",gsl_vector_get(eval,in));
//        fprintf(stderr,"\n\n");
//      }
      iso = 0.;
      for (i=0;i<NDIM;i++) {
        I->I[it].sigma_rezi_PAS[ion][i] = gsl_vector_get(eval,i);
        iso += I->I[it].sigma_rezi[ion][i+i*NDIM]/3.;
      } 
      eta = (gsl_vector_get(eval,1)-gsl_vector_get(eval,0))/(gsl_vector_get(eval,2)-iso);
      delta_sigma = gsl_vector_get(eval,2) - 0.5*(gsl_vector_get(eval,0)+gsl_vector_get(eval,1));
      S = (delta_sigma*delta_sigma)*(1+1./3.*eta*eta);
      A = 0.;
      for (i=0;i<NDIM;i++) {
        in = cross(i,0);
        dex = cross(i,1);
        A += pow(-1,i)*pow(0.5*(I->I[it].sigma_rezi[ion][in+dex*NDIM]-I->I[it].sigma_rezi[ion][dex+in*NDIM]),2);
      }
      if (P->Call.out[ValueOut]) {
        fprintf(stderr,"(%i) converted to Principal Axis System\n==================\nDiagonal entries:", P->Par.me);
        for (i=0;i<NDIM;i++)
          fprintf(stderr,"\t%lg",gsl_vector_get(eval,i));
        fprintf(stderr,"\nshielding  : %e\n", iso);
        fprintf(stderr,"anisotropy : %e\n", delta_sigma);
        fprintf(stderr,"asymmetry  : %e\n", eta);
        fprintf(stderr,"S          : %e\n", S);
        fprintf(stderr,"A          : %e\n", A);
        fprintf(stderr,"==================\n");
        
      }
    }
  }
  
  // Output of magnetic field densities for each direction
  for (i=0;i<NDIM*NDIM;i++)
    OutputVis(P, sigma_real[i]);    
  // Diagonalizing the tensor "field" B_ij [r]
  fprintf(stderr,"(%i) Diagonalizing B_ij [r] ... \n", P->Par.me);
  for (i=0; i< Dens0->LocalSizeR; i++) {
    for (in=0; in<NDIM; in++) // index i of vector component in integrand
      for (dex=0; dex<NDIM; dex++) { // index j of B component derivation in current density tensor
        //fprintf(stderr,"(%i) Setting B_(%i,%i)[%i] ... \n", P->Par.me, in,dex,i);
        gsl_matrix_complex_set(H,in,dex,gsl_complex_rect((sigma_real[in+dex*NDIM][i]+sigma_real[dex+in*NDIM][i])/2.,0));
      }
    gsl_eigen_herm(H, eval, w);
    gsl_sort_vector(eval);  // sort eigenvalues
    for (in=0;in<NDIM;in++)
      sigma_real[in][i] = gsl_vector_get(eval,in);
  }
  // Output of diagonalized magnetic field densities for each direction
  for (i=0;i<NDIM;i++)
    OutputVis(P, sigma_real[i]);    
  for (i=0;i<NDIM*NDIM;i++)
    UnLockDensityArray(Dens0,i,real);  // sigma_imag/real free

  gsl_eigen_herm_free(w);
  gsl_vector_free(eval);
  gsl_matrix_complex_free(H);
}


/** Calculates the chemical shielding tensor at the positions of the nuclei.
 * The chemical shielding tensor at position R is defined as the proportionality factor between the induced and
 * the externally applied field.
 * \f[
 *  \sigma_{ij} (R) = \frac{\delta B_j^{ind} (R)}{\delta B_i^{ext}} 
 *  = \frac{\mu_0}{4 \pi} \int d^3 r' \left ( \frac{r'-r}{| r' - r |^3} \times J_i (r') \right )_j 
 * \f]
 * One after another for each nuclear position is the tensor evaluated and the result printed
 * to screen. Tensor is diagonalized afterwards.
 * \param *P Problem at hand
 * \sa CalculateMagneticSusceptibility() - similar calculation, yet without translation to ion centers.
 * \warning This routine is out-dated due to being numerically unstable because of the singularity which is not
 * considered carefully, recommendend replacement is CalculateChemicalShieldingByReciprocalCurrentDensity().
 */
void CalculateChemicalShielding(struct Problem *P)
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = R->Lev0->Dens;
  struct Ions *I = &P->Ion;
  double sigma[NDIM*NDIM],Sigma[NDIM*NDIM];
  fftw_real *CurrentDensity[NDIM*NDIM];
  int it, ion, in, dex, i0, n[NDIM], n0, i;//, *NUp;
  double r[NDIM], fac[NDIM], X[NDIM];
  const double discrete_factor = Lat->Volume/Lev0->MaxN;
  double eta, delta_sigma, S, A, iso;
  const int myPE =  P->Par.me_comm_ST_Psi;
  int N[NDIM];
  N[0] = Lev0->Plan0.plan->N[0];
  N[1] = Lev0->Plan0.plan->N[1];
  N[2] = Lev0->Plan0.plan->N[2];
  const int N0 = Lev0->Plan0.plan->local_nx;
  FILE *SigmaFile;
  char suffixsigma[255];
  time_t seconds;  
  time(&seconds); // get current time
  
  // set pointers onto current density
  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];
  gsl_matrix_complex *H = gsl_matrix_complex_calloc(NDIM,NDIM);
  
  for (it=0; it < I->Max_Types; it++) {  // integration over all types
    for (ion=0; ion < I->I[it].Max_IonsOfType; ion++) { // and each ion of type
      if (P->Call.out[ValueOut]) fprintf(stderr,"(%i) Shielding Tensor for Ion %i of element %s \\sigma_ij = \n",P->Par.me, ion, I->I[it].Name);
      for (in=0; in<NDIM; in++) {// index i of vector component in integrand
        for (dex=0; dex<NDIM; dex++) { // index j of B component derivation in current density tensor
          sigma[in+dex*NDIM] = 0.;
    
          for(n0=0;n0<N0;n0++) // do the integration over real space
            for(n[1]=0;n[1]<N[1];n[1]++)
              for(n[2]=0;n[2]<N[2];n[2]++) {
                n[0]=n0 + N0*myPE; // global relative coordinate: due to partitoning of x-axis in PEPGamma>1 case
                fac[0] = (double)n[0]/(double)N[0];
                fac[1] = (double)n[1]/(double)N[1];
                fac[2] = (double)n[2]/(double)N[2];
                RMat33Vec3(r, Lat->RealBasis, fac);
                MinImageConv(Lat,r, &(I->I[it].R[NDIM*ion]),X);
                i0 = n[2]+N[2]*(n[1]+N[1]*(n0));  // the index of current density must match LocalSizeR!
                //z = MinImageConv(Lat,r, I->I[it].R[NDIM*ion],in);  // "in" always is missing third component in cross product
                sigma[in+dex*NDIM] += (X[cross(in,0)] * CurrentDensity[dex*NDIM+cross(in,1)][i0] - X[cross(in,2)] * CurrentDensity[dex*NDIM+cross(in,3)][i0]);
                //if (it == 0 && ion == 0) fprintf(stderr,"(%i) moment[%i][%i] += (%e * %e - %e * %e) = %e\n", P->Par.me, in, dex, x,CurrentDensity[dex*NDIM+cross(in,1)][i0],y,CurrentDensity[dex*NDIM+cross(in,3)][i0],moment[in+dex*NDIM]); 
              }
          sigma[in+dex*NDIM] *= -mu0*discrete_factor/(4.*PI); // due to summation instead of integration
          MPI_Allreduce ( &sigma[in+dex*NDIM], &Sigma[in+dex*NDIM], 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);  // sum "LocalSize to TotalSize"
          I->I[it].sigma[ion][in+dex*NDIM] = Sigma[in+dex*NDIM];          
          if (P->Call.out[ValueOut]) fprintf(stderr," %e", Sigma[in+dex*NDIM]);
        }
        if (P->Call.out[ValueOut]) fprintf(stderr,"\n");
      }
      // store symmetrized matrix
      for (in=0;in<NDIM;in++)
        for (dex=0;dex<NDIM;dex++)
          gsl_matrix_complex_set(H,in,dex,gsl_complex_rect((Sigma[in+dex*NDIM]+Sigma[dex+in*NDIM])/2.,0));
      // output tensor to file
      if (P->Par.me == 0) {
        sprintf(&suffixsigma[0], ".sigma_i%i_%s.L%i.csv", ion, I->I[it].Symbol, Lev0->LevelNo);
        OpenFile(P, &SigmaFile, suffixsigma, "a", P->Call.out[ReadOut]);
        fprintf(SigmaFile,"# chemical shielding tensor sigma[01,02,03,10,11,12,20,21,22], seed %i, config %s, run on %s", R->Seed, P->Files.default_path, ctime(&seconds));
        fprintf(SigmaFile,"%lg\t", P->Lat.ECut/(Lat->LevelSizes[0]*Lat->LevelSizes[0]));
        for (in=0;in<NDIM*NDIM;in++)
          fprintf(SigmaFile,"%e\t", Sigma[in]);
        fprintf(SigmaFile,"\n");
        fclose(SigmaFile);
      }
      // diagonalize sigma
      gsl_vector *eval = gsl_vector_alloc(NDIM);
      gsl_eigen_herm_workspace *w = gsl_eigen_herm_alloc(NDIM);
      gsl_eigen_herm(H, eval, w);
      gsl_eigen_herm_free(w);
      gsl_sort_vector(eval);  // sort eigenvalues
      // print eigenvalues
//      if (P->Call.out[ValueOut]) {
//        fprintf(stderr,"(%i) diagonal shielding for Ion %i of element %s:", P->Par.me, ion, I->I[it].Name);
//        for (in=0;in<NDIM;in++)
//          fprintf(stderr,"\t%lg",gsl_vector_get(eval,in));
//        fprintf(stderr,"\n\n");
//      }
      // print eigenvalues
      iso = 0;
      for (i=0;i<NDIM;i++) {
        I->I[it].sigma[ion][i] = gsl_vector_get(eval,i);
        iso += Sigma[i+i*NDIM]/3.;
      } 
      eta = (gsl_vector_get(eval,1)-gsl_vector_get(eval,0))/(gsl_vector_get(eval,2)-iso);
      delta_sigma = gsl_vector_get(eval,2) - 0.5*(gsl_vector_get(eval,0)+gsl_vector_get(eval,1));
      S = (delta_sigma*delta_sigma)*(1+1./3.*eta*eta);
      A = 0.;
      for (i=0;i<NDIM;i++) {
        in = cross(i,0);
        dex = cross(i,1);
        A += pow(-1,i)*pow(0.5*(Sigma[in+dex*NDIM]-Sigma[dex+in*NDIM]),2);
      }
      if (P->Call.out[ValueOut]) {
        fprintf(stderr,"(%i) converted to Principal Axis System\n==================\nDiagonal entries:", P->Par.me);
        for (i=0;i<NDIM;i++)
          fprintf(stderr,"\t%lg",gsl_vector_get(eval,i));
        fprintf(stderr,"\nshielding  : %e\n", iso);
        fprintf(stderr,"anisotropy : %e\n", delta_sigma);
        fprintf(stderr,"asymmetry  : %e\n", eta);
        fprintf(stderr,"S          : %e\n", S);
        fprintf(stderr,"A          : %e\n", A);
        fprintf(stderr,"==================\n");
        
      }
      gsl_vector_free(eval);
    }
  }

  gsl_matrix_complex_free(H);
}

/** Test if G=0-component of reciprocal current is 0.
 * In most cases we do not reach a numerical sensible zero as in MYEPSILON and remain satisfied as long
 * as the integrated current density is very small (e.g. compared to single entries in the current density array)
 * \param *P Problem at hand
 * \param *CurrentC pointer to reciprocal current density
 * \param *GArray pointer to array with G vectors
 * \param in index of current component
 * \sa TestCurrent() these two tests are equivalent and follow by fourier transformation
 */
void TestReciprocalCurrent(struct Problem *P, const fftw_complex *CurrentC, struct OneGData *GArray, int in)
{
  double tmp;
  tmp = sqrt(CurrentC[0].re*CurrentC[0].re+CurrentC[0].im*CurrentC[0].im);
  if ((P->Call.out[LeaderOut]) && (GArray[0].GSq < MYEPSILON)) {
    if (in % NDIM == 0) fprintf(stderr,"(%i) ",P->Par.me); 
    if (tmp > MYEPSILON) {
      fprintf(stderr,"J_{%i,%i} = |%e + i%e| < %e ? (%e)\t", in / NDIM, in%NDIM, CurrentC[0].re, CurrentC[0].im, MYEPSILON, tmp - MYEPSILON);
    } else { 
      fprintf(stderr,"J_{%i,%i} ok\t", in / NDIM, in%NDIM);
    }
    if (in % NDIM == (NDIM-1)) fprintf(stderr,"\n");
  }   
}

/** Test if integrated current over cell is 0.
 * In most cases we do not reach a numerical sensible zero as in MYEPSILON and remain satisfied as long
 * as the integrated current density is very small (e.g. compared to single entries in the current density array)
 * \param *P Problem at hand
 * \param index index of current component
 * \sa CalculateNativeIntDens() for integration of one current tensor component
 */
 void TestCurrent(struct Problem *P, const int index)
{
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = Lev0->Dens;
  fftw_real *CurrentDensity[NDIM*NDIM];
  int in;
  double result[NDIM*NDIM], res = 0.;

  // set pointers onto current density array and get number of grid points in each direction
  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];
  for(in=0;in<NDIM;in++) {
    result[in] = CalculateNativeIntDens(P,Lev0,CurrentDensity[in + NDIM*index],R->FactorDensityR);
    res += pow(result[in],2.);
  }
  res = sqrt(res);
  // if greater than 0, complain about it
  if ((res > MYEPSILON) && (P->Call.out[LeaderOut])) 
    fprintf(stderr, "(%i) \\int_\\Omega d^3 r j_%i(r) = (%e,%e,%e), %e > %e!\n",P->Par.me, index, result[0], result[1], result[2], res, MYEPSILON);
}

/** Testing whether re<->im switches (due to symmetry) confuses fft.
 * \param *P Problem at hand
 * \param l local wave function number
 */
void test_fft_symmetry(struct Problem *P, const int l)
{
  struct Lattice *Lat = &P->Lat;
  struct RunStruct *R = &P->R;
  struct LatticeLevel *LevS = R->LevS;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = Lev0->Dens;
  struct  fft_plan_3d *plan = Lat->plan;
  fftw_complex *tempdestRC = (fftw_complex *)Dens0->DensityCArray[Temp2Density];
  fftw_complex *work = Dens0->DensityCArray[TempDensity];
  fftw_complex *workC = (fftw_complex *)Dens0->DensityArray[TempDensity];
  fftw_complex *posfac, *destpos, *destRCS, *destRCD;
  fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
  fftw_real *PsiCR = (fftw_real *) PsiC;
  fftw_complex *Psi0 = LevS->LPsi->LocalPsi[l];
  fftw_complex *dest = LevS->LPsi->TempPsi;
  fftw_real *Psi0R = (fftw_real *)Dens0->DensityArray[Temp2Density];
  int i,Index, pos, i0, iS,g; //, NoOfPsis = Psi->TypeStartIndex[UnOccupied] - Psi->TypeStartIndex[Occupied];
  int n[NDIM], n0;
  const int N0 = LevS->Plan0.plan->local_nx; // we don't want to build global density, but local
  int N[NDIM], NUp[NDIM];
  N[0] = LevS->Plan0.plan->N[0];
  N[1] = LevS->Plan0.plan->N[1];
  N[2] = LevS->Plan0.plan->N[2];
  NUp[0] = LevS->NUp[0]; 
  NUp[1] = LevS->NUp[1]; 
  NUp[2] = LevS->NUp[2]; 
  //const int k_normal = Lat->Psi.TypeStartIndex[Occupied] + (l - Lat->Psi.TypeStartIndex[R->CurrentMin]);
  //const double *Wcentre = Lat->Psi.AddData[k_normal].WannierCentre;
  //double x[NDIM], fac[NDIM]; 
  double result1=0., result2=0., result3=0., result4=0.;
  double Result1=0., Result2=0., Result3=0., Result4=0.;
  const double HGcRCFactor = 1./LevS->MaxN; // factor for inverse fft

  
  // fft to real space
  SetArrayToDouble0((double *)tempdestRC, Dens0->TotalSize*2);
  SetArrayToDouble0((double *)PsiC, Dens0->TotalSize*2);
  for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is positive
    Index = LevS->GArray[i].Index;
    posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
    destpos = &tempdestRC[LevS->MaxNUp*Index];
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      destpos[pos].re = (Psi0[i].re)*posfac[pos].re-(Psi0[i].im)*posfac[pos].im;
      destpos[pos].im = (Psi0[i].re)*posfac[pos].im+(Psi0[i].im)*posfac[pos].re;
      //destpos[pos].re = (Psi0[i].im)*posfac[pos].re-(-Psi0[i].re)*posfac[pos].im;
      //destpos[pos].im = (Psi0[i].im)*posfac[pos].im+(-Psi0[i].re)*posfac[pos].re;
    }
  }
  for (i=0; i<LevS->MaxDoubleG; i++) {
    destRCS = &tempdestRC[LevS->DoubleG[2*i]*LevS->MaxNUp];
    destRCD = &tempdestRC[LevS->DoubleG[2*i+1]*LevS->MaxNUp];
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      destRCD[pos].re =  destRCS[pos].re;
      destRCD[pos].im = -destRCS[pos].im;
    }
  }
  fft_3d_complex_to_real(plan, LevS->LevelNo, FFTNFUp, tempdestRC, work);
  DensityRTransformPos(LevS,(fftw_real*)tempdestRC, Psi0R);  
  
  // apply position operator and do first result
  for (n0=0;n0<N0;n0++)  // only local points on x axis
    for (n[1]=0;n[1]<N[1];n[1]++)
      for (n[2]=0;n[2]<N[2];n[2]++) {
        n[0]=n0 + LevS->Plan0.plan->start_nx; // global relative coordinate: due to partitoning of x-axis in PEPGamma>1 case
        i0 = n[2]*NUp[2]+N[2]*NUp[2]*(n[1]*NUp[1]+N[1]*NUp[1]*n0*NUp[0]);
        iS = n[2]+N[2]*(n[1]+N[1]*n0);
        //x[0] += 1; // shifting expectation value of x coordinate from 0 to 1 
        PsiCR[iS] = Psi0R[i0]; // truedist(Lat, x[0], Wcentre[0],0) * 
        result1 += PsiCR[iS] * Psi0R[i0];         
      }               
  result1 /= LevS->MaxN; // factor due to discrete integration
  MPI_Allreduce ( &result1, &Result1, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // sum "LocalSize to TotalSize"
  if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) 1st result: %e\n",P->Par.me, Result1);
    
  // fft to reciprocal space and do second result
  fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, PsiC, workC);
  SetArrayToDouble0((double *)dest, 2*R->InitLevS->MaxG);
  for (g=0; g < LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    dest[g].re = (Psi0[Index].re)*HGcRCFactor; 
    dest[g].im = (Psi0[Index].im)*HGcRCFactor; 
  }
  result2 = GradSP(P,LevS,Psi0,dest);  
  MPI_Allreduce ( &result2, &Result2, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // sum "LocalSize to TotalSize"
  if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) 2nd result: %e\n",P->Par.me, Result2);  
  
  // fft again to real space, this time change symmetry
  SetArrayToDouble0((double *)tempdestRC, Dens0->TotalSize*2);
  SetArrayToDouble0((double *)PsiC, Dens0->TotalSize*2);
  for (i=0;i<LevS->MaxG;i++) { // incoming is positive, outgoing is positive
    Index = LevS->GArray[i].Index;
    posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
    destpos = &tempdestRC[LevS->MaxNUp*Index];
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      destpos[pos].re = (Psi0[i].im)*posfac[pos].re-(-Psi0[i].re)*posfac[pos].im;
      destpos[pos].im = (Psi0[i].im)*posfac[pos].im+(-Psi0[i].re)*posfac[pos].re;
    }
  }
  for (i=0; i<LevS->MaxDoubleG; i++) {
    destRCS = &tempdestRC[LevS->DoubleG[2*i]*LevS->MaxNUp];
    destRCD = &tempdestRC[LevS->DoubleG[2*i+1]*LevS->MaxNUp];
    for (pos=0; pos < LevS->MaxNUp; pos++) {
      destRCD[pos].re =  destRCS[pos].re;
      destRCD[pos].im = -destRCS[pos].im;
    }
  }
  fft_3d_complex_to_real(plan, LevS->LevelNo, FFTNFUp, tempdestRC, work);
  DensityRTransformPos(LevS,(fftw_real*)tempdestRC, Psi0R);
  
  // bring down from Lev0 to LevS
  for (n0=0;n0<N0;n0++)  // only local points on x axis
    for (n[1]=0;n[1]<N[1];n[1]++)
      for (n[2]=0;n[2]<N[2];n[2]++) {
        i0 = n[2]*NUp[2]+N[2]*NUp[2]*(n[1]*NUp[1]+N[1]*NUp[1]*n0*NUp[0]);
        iS = n[2]+N[2]*(n[1]+N[1]*n0);
        PsiCR[iS] = Psi0R[i0]; // truedist(Lat, x[0], Wcentre[0],0) * 
        result3 += PsiCR[iS] * Psi0R[i0];         
      }               
  result3 /= LevS->MaxN; // factor due to discrete integration
  MPI_Allreduce ( &result3, &Result3, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // sum "LocalSize to TotalSize"
  if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) 3rd result: %e\n",P->Par.me, Result3);  
  
  // fft back to reciprocal space, change symmetry back and do third result
  fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, PsiC, workC);
  SetArrayToDouble0((double *)dest, 2*R->InitLevS->MaxG);
  for (g=0; g < LevS->MaxG; g++) {
    Index = LevS->GArray[g].Index;
    dest[g].re = (-PsiC[Index].im)*HGcRCFactor; 
    dest[g].im = ( PsiC[Index].re)*HGcRCFactor; 
  }
  result4 = GradSP(P,LevS,Psi0,dest);
  MPI_Allreduce ( &result4, &Result4, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);   // sum "LocalSize to TotalSize"
  if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) 4th result: %e\n",P->Par.me, Result4);  
}


/** Test function to check RxP application.
 * Checks applied solution to an analytic for a specific and simple wave function -
 * where just one coefficient is unequal to zero.
 * \param *P Problem at hand
         exp(I b G) - I exp(I b G) b G - exp(I a G) + I exp(I a G) a G
 -------------------------------------------------------------
                               2                              
                              G                               
 */
void test_rxp(struct Problem *P)
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  //struct LatticeLevel *Lev0 = R->Lev0;
  struct LatticeLevel *LevS = R->LevS;
  struct OneGData *GA = LevS->GArray;
  //struct Density *Dens0 = Lev0->Dens;
  fftw_complex *Psi0 = LevS->LPsi->TempPsi;
  fftw_complex *Psi2 = P->Grad.GradientArray[GraSchGradient];
  fftw_complex *Psi3 = LevS->LPsi->TempPsi2;
  int g, g_bar, i, j, k, k_normal = 0;
  double tmp, a,b, G;
  //const double *Wcentre = Lat->Psi.AddData[k_normal].WannierCentre;
  const double discrete_factor = 1.;//Lat->Volume/LevS->MaxN;
  fftw_complex integral;
  
  // reset coefficients
  debug (P,"Creating RxP test function.");
  SetArrayToDouble0((double *)Psi0,2*R->InitLevS->MaxG);
  SetArrayToDouble0((double *)Psi2,2*R->InitLevS->MaxG);

  // pick one which becomes non-zero
  g = 3;
  
  //for (g=0;g<LevS->MaxG;g++) {
    Psi0[g].re = 1.;
    Psi0[g].im = 0.;
  //}
  fprintf(stderr,"(%i) G[%i] = (%e,%e,%e) \n",P->Par.me, g, GA[g].G[0], GA[g].G[1], GA[g].G[2]);
  i = 0;

  // calculate analytic result
  debug (P,"Calculating analytic solution.");
  for (g_bar=0;g_bar<LevS->MaxG;g_bar++) {
    for (g=0;g<LevS->MaxG;g++) {
      if (GA[g].G[i] == GA[g_bar].G[i]) {
        j = cross(i,0);
        k = cross(i,1);
        if (GA[g].G[k] == GA[g_bar].G[k]) {
          //b = truedist(Lat, sqrt(Lat->RealBasisSQ[j]), Wcentre[j], j); 
          b = sqrt(Lat->RealBasisSQ[j]);
          //a = truedist(Lat, 0., Wcentre[j], j);  
          a = 0.;
          G = 1; //GA[g].G[k];
          if (GA[g].G[j] == GA[g_bar].G[j]) {
            Psi2[g_bar].re += G*Psi0[g].re * (.5 * b * b - .5 * a * a) * discrete_factor;
            Psi2[g_bar].im += G*Psi0[g].im * (.5 * b * b - .5 * a * a) * discrete_factor;
            //if ((G != 0) && ((fabs(Psi0[g].re) > MYEPSILON) || (fabs(Psi0[g].im) > MYEPSILON))) 
              //fprintf(stderr,"(%i) Psi[%i].re += %e +i %e\n",P->Par.me, g_bar, G*Psi0[g].re * (.5 * b * b - .5 * a * a) * discrete_factor, G*Psi0[g].im * (.5 * b * b - .5 * a * a) * discrete_factor);
          } else {
            tmp = GA[g].G[j]-GA[g_bar].G[j];
            integral.re = (cos(tmp*b)+sin(tmp*b)*b*tmp - cos(tmp*a)-sin(tmp*a)*a*tmp) / (tmp * tmp);
            integral.im = (sin(tmp*b)-cos(tmp*b)*b*tmp - sin(tmp*a)+cos(tmp*a)*a*tmp) / (tmp * tmp);
            Psi2[g_bar].re += G*(Psi0[g].re*integral.re - Psi0[g].im*integral.im) * discrete_factor; 
            Psi2[g_bar].im += G*(Psi0[g].re*integral.im + Psi0[g].im*integral.re) * discrete_factor;
            //if ((G != 0) && ((fabs(Psi0[g].re) > MYEPSILON) || (fabs(Psi0[g].im) > MYEPSILON)))  
              //fprintf(stderr,"(%i) Psi[%i].re += %e\tPsi[%i].im += %e \n",P->Par.me, g_bar, G*(Psi0[g].re*integral.re - Psi0[g].im*integral.im) * discrete_factor, g_bar, G*(Psi0[g].re*integral.im + Psi0[g].im*integral.re) * discrete_factor);
          }
        }
        j = cross(i,2);
        k = cross(i,3);
        if (GA[g].G[k] == GA[g_bar].G[k]) {
          //b = truedist(Lat, sqrt(Lat->RealBasisSQ[j]), Wcentre[j], j); 
          b = sqrt(Lat->RealBasisSQ[j]);
          //a = truedist(Lat, 0., Wcentre[j], j);  
          a = 0.;
          G = 1; //GA[g].G[k];
          if (GA[g].G[j] == GA[g_bar].G[j]) {
            Psi2[g_bar].re += G*Psi0[g].re * (.5 * b * b - .5 * a * a) * discrete_factor;
            Psi2[g_bar].im += G*Psi0[g].im * (.5 * b * b - .5 * a * a) * discrete_factor;
            //if ((G != 0) && ((fabs(Psi0[g].re) > MYEPSILON) || (fabs(Psi0[g].im) > MYEPSILON))) 
              //fprintf(stderr,"(%i) Psi[%i].re += %e +i %e\n",P->Par.me, g_bar, G*Psi0[g].re * (.5 * b * b - .5 * a * a) * discrete_factor, G*Psi0[g].im * (.5 * b * b - .5 * a * a) * discrete_factor);
          } else {
            tmp = GA[g].G[j]-GA[g_bar].G[j];
            integral.re = (cos(tmp*b)+sin(tmp*b)*b*tmp - cos(tmp*a)-sin(tmp*a)*a*tmp) / (tmp * tmp);
            integral.im = (sin(tmp*b)-cos(tmp*b)*b*tmp - sin(tmp*a)+cos(tmp*a)*a*tmp) / (tmp * tmp);
            Psi2[g_bar].re += G*(Psi0[g].re*integral.re - Psi0[g].im*integral.im) * discrete_factor; 
            Psi2[g_bar].im += G*(Psi0[g].re*integral.im + Psi0[g].im*integral.re) * discrete_factor;
            //if ((G != 0) && ((fabs(Psi0[g].re) > MYEPSILON) || (fabs(Psi0[g].im) > MYEPSILON)))  
              //fprintf(stderr,"(%i) Psi[%i].re += %e\tPsi[%i].im += %e \n",P->Par.me, g_bar, G*(Psi0[g].re*integral.re - Psi0[g].im*integral.im) * discrete_factor, g_bar, G*(Psi0[g].re*integral.im + Psi0[g].im*integral.re) * discrete_factor);
          }
        }
      }
    }
  }

  // apply rxp
  debug (P,"Applying RxP to test function.");
  CalculatePerturbationOperator_RxP(P,Psi0,Psi3,k_normal,i);

  // compare both coefficient arrays
  debug(P,"Beginning comparison of analytic and Rxp applied solution.");
  for (g=0;g<LevS->MaxG;g++) {
    if ((fabs(Psi3[g].re-Psi2[g].re) >= MYEPSILON) || (fabs(Psi3[g].im-Psi2[g].im) >= MYEPSILON))
      fprintf(stderr,"(%i) Psi3[%i] = %e +i %e != Psi2[%i] = %e +i %e\n",P->Par.me, g, Psi3[g].re, Psi3[g].im, g, Psi2[g].re, Psi2[g].im);
    //else
      //fprintf(stderr,"(%i) Psi1[%i] == Psi2[%i] = %e +i %e\n",P->Par.me, g, g, Psi1[g].re, Psi1[g].im);
  }    
  fprintf(stderr,"(%i) <0|1> = <0|r|0> == %e +i %e\n",P->Par.me, GradSP(P,LevS,Psi0,Psi3), GradImSP(P,LevS,Psi0,Psi3));
  fprintf(stderr,"(%i) <1|1> = |r|� == %e +i %e\n",P->Par.me, GradSP(P,LevS,Psi3,Psi3), GradImSP(P,LevS,Psi3,Psi3));
  fprintf(stderr,"(%i) <0|0> = %e +i %e\n",P->Par.me, GradSP(P,LevS,Psi0,Psi0), GradImSP(P,LevS,Psi0,Psi0));
  fprintf(stderr,"(%i) <0|2> = %e +i %e\n",P->Par.me, GradSP(P,LevS,Psi0,Psi2), GradImSP(P,LevS,Psi0,Psi2));
}


/** Output of a (X,Y,DX,DY) 2d-vector plot.
 * For a printable representation of the induced current two-dimensional vector plots are useful, as three-dimensional
 * isospheres are sometimes mis-leading or do not represent the desired flow direction. The routine simply extracts a
 * two-dimensional cut orthogonal to one of the lattice axis at a certain node. 
 * \param *P Problem at hand
 * \param B_index direction of B field
 * \param n_orth grid node in B_index direction of the plane (the order in which the remaining two coordinate axis
 *        appear is the same as in a cross product, which is used to determine orthogonality)
 */
void PlotVectorPlane(struct Problem *P, int B_index, int n_orth)
{
  struct RunStruct *R = &P->R;
  struct LatticeLevel *Lev0 = R->Lev0;
  struct Density *Dens0 = Lev0->Dens;
  char filename[255];
  char *suchpointer;
  FILE *PlotFile = NULL;
  const int myPE = P->Par.me_comm_ST;
  time_t seconds;  
  fftw_real *CurrentDensity[NDIM*NDIM];
  CurrentDensity[0] = (fftw_real *) Dens0->DensityArray[CurrentDensity0];
  CurrentDensity[1] = (fftw_real *) Dens0->DensityArray[CurrentDensity1];
  CurrentDensity[2] = (fftw_real *) Dens0->DensityArray[CurrentDensity2];
  CurrentDensity[3] = (fftw_real *) Dens0->DensityArray[CurrentDensity3];
  CurrentDensity[4] = (fftw_real *) Dens0->DensityArray[CurrentDensity4];
  CurrentDensity[5] = (fftw_real *) Dens0->DensityArray[CurrentDensity5];
  CurrentDensity[6] = (fftw_real *) Dens0->DensityArray[CurrentDensity6];
  CurrentDensity[7] = (fftw_real *) Dens0->DensityArray[CurrentDensity7];
  CurrentDensity[8] = (fftw_real *) Dens0->DensityArray[CurrentDensity8];
  time(&seconds); // get current time

  if (!myPE) { // only process 0 writes to file
    // open file
    sprintf(&filename[0], ".current.L%i.csv", Lev0->LevelNo);
    OpenFile(P, &PlotFile, filename, "w", P->Call.out[ReadOut]);
    strcpy(filename, ctime(&seconds));
    suchpointer = strchr(filename, '\n');
    if (suchpointer != NULL)
      *suchpointer = '\0'; 
    if (PlotFile != NULL) {      
      fprintf(PlotFile,"# current vector plot of plane perpendicular to direction e_%i at node %i, seed %i, config %s, run on %s, #cpus %i", B_index, n_orth, R->Seed, P->Files.default_path, filename, P->Par.Max_me_comm_ST_Psi);
      fprintf(PlotFile,"\n");
    } else { Error(SomeError, "PlotVectorPlane: Opening Plot File"); }
  }

  // plot density
  if (!P->Par.me_comm_ST_PsiT) // only first wave function group as current density of all psis was gathered
    PlotRealDensity(P, Lev0, PlotFile, B_index, n_orth, CurrentDensity[B_index*NDIM+cross(B_index,0)], CurrentDensity[B_index*NDIM+cross(B_index,1)]);

  if (PlotFile != NULL) {      
    // close file
    fclose(PlotFile);
  }
}


/** Reads psi coefficients of \a type from file and transforms to new level.
 * \param *P Problem at hand
 * \param type PsiTypeTag of which minimisation group to load from file
 * \sa ReadSrcPsiDensity() - reading the coefficients, ChangePsiAndDensToLevUp() - transformation to upper level
 */
void ReadSrcPerturbedPsis(struct Problem *P, enum PsiTypeTag type)
{
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct LatticeLevel *Lev0 = &P->Lat.Lev[R->Lev0No+1]; // one level higher than current (ChangeLevUp already occurred)
  struct LatticeLevel *LevS = &P->Lat.Lev[R->LevSNo+1];
  struct Density *Dens = Lev0->Dens;
  struct Psis *Psi = &Lat->Psi;
  struct fft_plan_3d *plan = Lat->plan; 
  fftw_complex *work = (fftw_complex *)Dens->DensityCArray[TempDensity];
  fftw_complex *tempdestRC = (fftw_complex *)Dens->DensityArray[TempDensity];
  fftw_complex *posfac, *destpos, *destRCS, *destRCD;
  fftw_complex *source, *source0;
  int Index,i,pos;
  double factorC = 1./Lev0->MaxN; 
  int p,g;
  
  // ================= read coefficients from file to LocalPsi ============
  ReadSrcPsiDensity(P, type, 0, R->LevSNo+1);
  
  // ================= transform to upper level ===========================
  // for all local Psis do the usual transformation (completing coefficients for all grid vectors, fft, permutation)
  LockDensityArray(Dens, TempDensity, real);
  LockDensityArray(Dens, TempDensity, imag);
  for (p=Psi->LocalNo-1; p >= 0; p--) 
    if (Psi->LocalPsiStatus[p].PsiType == type) { // only for the desired type
      source = LevS->LPsi->LocalPsi[p];
      source0 = Lev0->LPsi->LocalPsi[p];
      //fprintf(stderr,"(%i) ReadSrcPerturbedPsis: LevSNo %i\t Lev0No %i\tp %i\t source %p\t source0 %p\n", P->Par.me, LevS->LevelNo, Lev0->LevelNo, p, source, source0);
      SetArrayToDouble0((double *)tempdestRC, Dens->TotalSize*2);
      for (i=0;i<LevS->MaxG;i++) {
        Index = LevS->GArray[i].Index;
        posfac = &LevS->PosFactorUp[LevS->MaxNUp*i];
        destpos = &tempdestRC[LevS->MaxNUp*Index];
        //if (isnan(source[i].re)) { fprintf(stderr,"(%i) WARNING in ReadSrcPerturbedPsis(): source_%i[%i] = NaN!\n", P->Par.me, p, i); Error(SomeError, "NaN-Fehler!"); }
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          destpos[pos].re = source[i].re*posfac[pos].re-source[i].im*posfac[pos].im;
          destpos[pos].im = source[i].re*posfac[pos].im+source[i].im*posfac[pos].re;
       }
      }
      for (i=0; i<LevS->MaxDoubleG; i++) {
        destRCS = &tempdestRC[LevS->DoubleG[2*i]*LevS->MaxNUp];
        destRCD = &tempdestRC[LevS->DoubleG[2*i+1]*LevS->MaxNUp];
        for (pos=0; pos < LevS->MaxNUp; pos++) {
          destRCD[pos].re =  destRCS[pos].re;
          destRCD[pos].im = -destRCS[pos].im;
        }
      }  
      fft_3d_complex_to_real(plan, LevS->LevelNo, FFTNFUp, tempdestRC, work);
      DensityRTransformPos(LevS,(fftw_real*)tempdestRC,(fftw_real *)Dens->DensityCArray[ActualPsiDensity]);
      // now we have density in the upper level, fft back to complex and store it as wave function coefficients
      fft_3d_real_to_complex(plan, Lev0->LevelNo, FFTNF1, Dens->DensityCArray[ActualPsiDensity], work);
      for (g=0; g < Lev0->MaxG; g++) {
        Index = Lev0->GArray[g].Index;
        source0[g].re = Dens->DensityCArray[ActualPsiDensity][Index].re*factorC; 
        source0[g].im = Dens->DensityCArray[ActualPsiDensity][Index].im*factorC; 
        //if (isnan(source0[g].re)) { fprintf(stderr,"(%i) WARNING in ReadSrcPerturbedPsis(): source0_%i[%i] = NaN!\n", P->Par.me, p, g); Error(SomeError, "NaN-Fehler!"); }
      }
      if (Lev0->GArray[0].GSq == 0.0)
        source0[g].im = 0.0;
    }
  UnLockDensityArray(Dens, TempDensity, real);
  UnLockDensityArray(Dens, TempDensity, imag);
  // finished.
}

/** evaluates perturbed energy functional
 * \param norm norm of current Psi in functional
 * \param *params void-pointer to parameter array
 * \return evaluated functional at f(x) with \a norm
 */
double perturbed_function (double norm, void *params) {
  struct Problem *P = (struct Problem *)params;
  int i, n = P->R.LevS->MaxG;
  double old_norm  = GramSchGetNorm2(P,P->R.LevS,P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo]);
  fftw_complex *currentPsi = P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo];
  fprintf(stderr,"(%i) perturbed_function: setting norm to %lg ...", P->Par.me, norm);
  // set desired norm for current Psi
  for (i=0; i< n; i++) {
    currentPsi[i].re *= norm/old_norm; // real part
    currentPsi[i].im *= norm/old_norm; // imaginary part
  }
  P->R.PsiStep = 0; // make it not advance to next Psi
  
  //debug(P,"UpdateActualPsiNo");
  UpdateActualPsiNo(P, P->R.CurrentMin); // orthogonalize
  //debug(P,"UpdateEnergyArray");
  UpdateEnergyArray(P); // shift energy values in their array by one
  //debug(P,"UpdatePerturbedEnergyCalculation");
  UpdatePerturbedEnergyCalculation(P);  // re-calc energies (which is hopefully lower)
  EnergyAllReduce(P); // gather from all processes and sum up to total energy
/*
  for (i=0; i< n; i++) {
    currentPsi[i].re /= norm/old_norm; // real part
    currentPsi[i].im /= norm/old_norm; // imaginary part
  }*/
  
  fprintf(stderr,"%lg\n", P->Lat.E->TotalEnergy[0]);
  return P->Lat.E->TotalEnergy[0];   // and return evaluated functional
}

/** evaluates perturbed energy functional.
 * \param *x current position in functional
 * \param *params void-pointer to parameter array
 * \return evaluated functional at f(x)
 */
double perturbed_f (const gsl_vector *x, void *params) {
  struct Problem *P = (struct Problem *)params;
  int i, n = P->R.LevS->MaxG*2;
  fftw_complex *currentPsi = P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo];
  //int diff = 0;
  //debug(P,"f");
  // put x into current Psi
  for (i=0; i< n; i+=2) {
    //if ((currentPsi[i/2].re != gsl_vector_get (x, i)) || (currentPsi[i/2].im != gsl_vector_get (x, i+1))) diff++;
    currentPsi[i/2].re = gsl_vector_get (x, i);   // real part
    currentPsi[i/2].im = gsl_vector_get (x, i+1); // imaginary part
  }
  //if (diff) fprintf(stderr,"(%i) %i differences between old and new currentPsi.\n", P->Par.me, diff);
  P->R.PsiStep = 0; // make it not advance to next Psi
  
  //debug(P,"UpdateActualPsiNo");
  UpdateActualPsiNo(P, P->R.CurrentMin); // orthogonalize
  //debug(P,"UpdateEnergyArray");
  UpdateEnergyArray(P); // shift energy values in their array by one
  //debug(P,"UpdatePerturbedEnergyCalculation");
  UpdatePerturbedEnergyCalculation(P);  // re-calc energies (which is hopefully lower)
  EnergyAllReduce(P); // gather from all processes and sum up to total energy
  
  return P->Lat.E->TotalEnergy[0];   // and return evaluated functional
}

/** evaluates perturbed energy gradient.
 * \param *x current position in functional
 * \param *params void-pointer to parameter array
 * \param *g array for gradient vector on return
 */
void perturbed_df (const gsl_vector *x, void *params, gsl_vector *g) {
  struct Problem *P = (struct Problem *)params;
  int i, n = P->R.LevS->MaxG*2;
  fftw_complex *currentPsi = P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo];
  fftw_complex *gradient = P->Grad.GradientArray[ActualGradient];
  //int diff = 0;
  //debug(P,"df");
  // put x into current Psi
  for (i=0; i< n; i+=2) {
    //if ((currentPsi[i/2].re != gsl_vector_get (x, i)) || (currentPsi[i/2].im != gsl_vector_get (x, i+1))) diff++;
    currentPsi[i/2].re = gsl_vector_get (x, i);   // real part
    currentPsi[i/2].im = gsl_vector_get (x, i+1); // imaginary part
  }
  //if (diff) fprintf(stderr,"(%i) %i differences between old and new currentPsi.\n", P->Par.me, diff);
  P->R.PsiStep = 0; // make it not advance to next Psi
  
  //debug(P,"UpdateActualPsiNo");
  UpdateActualPsiNo(P, P->R.CurrentMin); // orthogonalize
  //debug(P,"UpdateEnergyArray");
  UpdateEnergyArray(P); // shift energy values in their array by one
  //debug(P,"UpdatePerturbedEnergyCalculation");
  UpdatePerturbedEnergyCalculation(P);  // re-calc energies (which is hopefully lower)
  EnergyAllReduce(P); // gather from all processes and sum up to total energy
  
  // checkout gradient
  //diff = 0;
  for (i=0; i< n; i+=2) {
    //if ((-gradient[i/2].re != gsl_vector_get (g, i)) || (-gradient[i/2].im != gsl_vector_get (g, i+1)))  diff++;
    gsl_vector_set (g, i, -gradient[i/2].re);   // real part
    gsl_vector_set (g, i+1, -gradient[i/2].im); // imaginary part
  }
  //if (diff) fprintf(stderr,"(%i) %i differences between old and new gradient.\n", P->Par.me, diff);
}

/** evaluates perturbed energy functional and gradient.
 * \param *x current position in functional
 * \param *params void-pointer to parameter array
 * \param *f pointer to energy function value on return
 * \param *g array for gradient vector on return
 */
void perturbed_fdf (const gsl_vector *x, void *params, double *f, gsl_vector *g) {
  struct Problem *P = (struct Problem *)params;
  int i, n = P->R.LevS->MaxG*2;
  fftw_complex *currentPsi = P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo];
  fftw_complex *gradient = P->Grad.GradientArray[ActualGradient];
  //int diff = 0;
  //debug(P,"fdf");
  // put x into current Psi
  for (i=0; i< n; i+=2) {
    //if ((currentPsi[i/2].re != gsl_vector_get (x, i)) || (currentPsi[i/2].im != gsl_vector_get (x, i+1))) diff++;
    currentPsi[i/2].re = gsl_vector_get (x, i);   // real part
    currentPsi[i/2].im = gsl_vector_get (x, i+1); // imaginary part
  }
  //if (diff) fprintf(stderr,"(%i) %i differences between old and new currentPsi.\n", P->Par.me, diff);
  P->R.PsiStep = 0; // make it not advance to next Psi
  
  //debug(P,"UpdateActualPsiNo");
  UpdateActualPsiNo(P, P->R.CurrentMin); // orthogonalize
  //debug(P,"UpdateEnergyArray");
  UpdateEnergyArray(P); // shift energy values in their array by one
  //debug(P,"UpdatePerturbedEnergyCalculation");
  UpdatePerturbedEnergyCalculation(P);  // re-calc energies (which is hopefully lower)
  EnergyAllReduce(P); // gather from all processes and sum up to total energy
  
  // checkout gradient
  //diff = 0;
  for (i=0; i< n; i+=2) {
    //if ((-gradient[i/2].re != gsl_vector_get (g, i)) || (-gradient[i/2].im != gsl_vector_get (g, i+1)))  diff++;
    gsl_vector_set (g, i, -gradient[i/2].re);   // real part
    gsl_vector_set (g, i+1, -gradient[i/2].im); // imaginary part
  }
  //if (diff) fprintf(stderr,"(%i) %i differences between old and new gradient.\n", P->Par.me, diff);
  
  *f = P->Lat.E->TotalEnergy[0];   // and return evaluated functional
}

/* MinimisePerturbed with all the brent minimisation approach
void MinimisePerturbed (struct Problem *P, int *Stop, int *SuperStop) {
  struct RunStruct *R = &P->R;
  struct Lattice *Lat = &P->Lat;
  struct Psis *Psi = &Lat->Psi;
  int type;
  //int i;
  
  // stuff for GSL minimization
  //size_t iter;
  //int status, Status
  int n = R->LevS->MaxG*2;
  const gsl_multimin_fdfminimizer_type *T_multi;
  const gsl_min_fminimizer_type *T;
  gsl_multimin_fdfminimizer *s_multi;
  gsl_min_fminimizer *s;
  gsl_vector *x;//, *ss;
  gsl_multimin_function_fdf my_func;
  gsl_function F;
  //fftw_complex *currentPsi;
  //double a,b,m, f_m, f_a, f_b;
  //double old_norm;
  
  my_func.f = &perturbed_f;
  my_func.df = &perturbed_df;
  my_func.fdf = &perturbed_fdf;
  my_func.n = n;
  my_func.params = P;
  F.function = &perturbed_function;
  F.params = P;
     
  x = gsl_vector_alloc (n);
  //ss = gsl_vector_alloc (Psi->NoOfPsis);
  T_multi = gsl_multimin_fdfminimizer_vector_bfgs;
  s_multi = gsl_multimin_fdfminimizer_alloc (T_multi, n);
  T = gsl_min_fminimizer_brent;
  s = gsl_min_fminimizer_alloc (T);
  
  for (type=Perturbed_P0;type<=Perturbed_RxP2;type++) {  // go through each perturbation group separately //
    *Stop=0;   // reset stop flag
    fprintf(stderr,"(%i)Beginning perturbed minimisation of type %s ...\n", P->Par.me, R->MinimisationName[type]);
    //OutputOrbitalPositions(P, Occupied);
    R->PsiStep = R->MaxPsiStep; // reset in-Psi-minimisation-counter, so that we really advance to the next wave function
    UpdateActualPsiNo(P, type); // step on to next perturbed one
    fprintf(stderr, "(%i) Re-initializing perturbed psi array for type %s ", P->Par.me, R->MinimisationName[type]);
    if (P->Call.ReadSrcFiles && ReadSrcPsiDensity(P,type,1, R->LevSNo)) {
      SpeedMeasure(P, InitSimTime, StartTimeDo);  
      fprintf(stderr,"from source file of recent calculation\n");
      ReadSrcPsiDensity(P,type, 0, R->LevSNo);
      ResetGramSchTagType(P, Psi, type, IsOrthogonal); // loaded values are orthonormal
      SpeedMeasure(P, DensityTime, StartTimeDo);
      //InitDensityCalculation(P);
      SpeedMeasure(P, DensityTime, StopTimeDo);
      R->OldActualLocalPsiNo = R->ActualLocalPsiNo; // needed otherwise called routines in function below crash
      UpdateGramSchOldActualPsiNo(P,Psi);
      InitPerturbedEnergyCalculation(P, 1);  // go through all orbitals calculate each H^{(0)}-eigenvalue, recalc HGDensity, cause InitDensityCalc zero'd it
      UpdatePerturbedEnergyCalculation(P);  // H1cGradient and Gradient must be current ones
      EnergyAllReduce(P);   // gather energies for minimum search
      SpeedMeasure(P, InitSimTime, StopTimeDo);  
    }
    if (P->Call.ReadSrcFiles != 1) {
      SpeedMeasure(P, InitSimTime, StartTimeDo);  
      ResetGramSchTagType(P, Psi, type, NotOrthogonal); // perturbed now shall be orthonormalized
      if (P->Call.ReadSrcFiles != 2) {
        if (R->LevSNo == Lat->MaxLevel-1) { // is it the starting level? (see InitRunLevel())
          fprintf(stderr, "randomly.\n");
          InitPsisValue(P, Psi->TypeStartIndex[type], Psi->TypeStartIndex[type+1]);  // initialize perturbed array for this run
        } else {
          fprintf(stderr, "from source file of last level.\n");
          ReadSrcPerturbedPsis(P, type);
        }
      }
      SpeedMeasure(P, InitGramSchTime, StartTimeDo);  
      GramSch(P, R->LevS, Psi, Orthogonalize);
      SpeedMeasure(P, InitGramSchTime, StopTimeDo);  
      SpeedMeasure(P, InitDensityTime, StartTimeDo);
      //InitDensityCalculation(P);
      SpeedMeasure(P, InitDensityTime, StopTimeDo);
      InitPerturbedEnergyCalculation(P, 1);  // go through all orbitals calculate each H^{(0)}-eigenvalue, recalc HGDensity, cause InitDensityCalc zero'd it
      R->OldActualLocalPsiNo = R->ActualLocalPsiNo; // needed otherwise called routines in function below crash
      UpdateGramSchOldActualPsiNo(P,Psi);
      UpdatePerturbedEnergyCalculation(P);  // H1cGradient and Gradient must be current ones
      EnergyAllReduce(P);   // gather energies for minimum search
      SpeedMeasure(P, InitSimTime, StopTimeDo);  
      R->LevS->Step++;
      EnergyOutput(P,0);
      while (*Stop != 1) {
        // copy current Psi into starting vector
        currentPsi = R->LevS->LPsi->LocalPsi[R->ActualLocalPsiNo];
        for (i=0; i< n; i+=2) {
          gsl_vector_set (x, i, currentPsi[i/2].re);   // real part
          gsl_vector_set (x, i+1, currentPsi[i/2].im); // imaginary part
        }
        gsl_multimin_fdfminimizer_set (s_multi, &my_func, x, 0.01, 1e-2);  
        iter = 0;
        status = 0;
        do {  // look for minimum along current local psi
          iter++;
          status = gsl_multimin_fdfminimizer_iterate (s_multi);
          MPI_Allreduce(&status, &Status, 1, MPI_INT, MPI_MAX, P->Par.comm_ST_Psi);
          if (Status)
            break;
          status = gsl_multimin_test_gradient (s_multi->gradient, 1e-2);
          MPI_Allreduce(&status, &Status, 1, MPI_INT, MPI_MAX, P->Par.comm_ST_Psi);
          //if (Status == GSL_SUCCESS)
            //printf ("Minimum found at:\n");
          if (P->Par.me == 0) fprintf (stderr,"(%i,%i,%i)S(%i,%i,%i):\t %5d %10.5f\n",P->Par.my_color_comm_ST,P->Par.me_comm_ST, P->Par.me_comm_ST_PsiT, R->MinStep, R->ActualLocalPsiNo, R->PsiStep, (int)iter, s_multi->f);
          //TestGramSch(P,R->LevS,Psi, type); // functions are orthonormal?
        } while (Status == GSL_CONTINUE && iter < 3);
        // now minimize norm of currentPsi (one-dim)
        if (0) { 
          iter = 0;
        status = 0;
        m = 1.;
        a = MYEPSILON;
        b = 100.;
        f_a = perturbed_function (a, P);
        f_b = perturbed_function (b, P);
        f_m = perturbed_function (m, P);
        //if ((f_m < f_a) && (f_m < f_b)) {
          gsl_min_fminimizer_set (s, &F, m, a, b);
          do {  // look for minimum along current local psi
            iter++;
            status = gsl_min_fminimizer_iterate (s);
            m = gsl_min_fminimizer_x_minimum (s);
            a = gsl_min_fminimizer_x_lower (s);
            b = gsl_min_fminimizer_x_upper (s);         
            status = gsl_min_test_interval (a, b, 0.001, 0.0);
            if (status == GSL_SUCCESS)
              printf ("Minimum found at:\n");
            printf ("%5d [%.7f, %.7f] %.7f %.7f\n",
               (int) iter, a, b,
               m, b - a);
          } while (status == GSL_CONTINUE && iter < 100);
          old_norm = GramSchGetNorm2(P,P->R.LevS,P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo]);
          for (i=0; i< n; i++) {
            currentPsi[i].re *= m/old_norm; // real part
            currentPsi[i].im *= m/old_norm; // imaginary part
          }
        } else debug(P,"Norm not minimizable!");
        //P->R.PsiStep = P->R.MaxPsiStep; // make it advance to next Psi
        FindPerturbedMinimum(P);        
        //debug(P,"UpdateActualPsiNo");
        UpdateActualPsiNo(P, type); // step on to next perturbed Psi
        //debug(P,"UpdateEnergyArray");
        UpdateEnergyArray(P); // shift energy values in their array by one
        //debug(P,"UpdatePerturbedEnergyCalculation");
        UpdatePerturbedEnergyCalculation(P);  // re-calc energies (which is hopefully lower)
        EnergyAllReduce(P); // gather from all processes and sum up to total energy
        //ControlNativeDensity(P);  // check total density (summed up PertMixed must be zero!)
        //printf ("(%i,%i,%i)S(%i,%i,%i):\t %5d %10.5f\n",P->Par.my_color_comm_ST,P->Par.me_comm_ST, P->Par.me_comm_ST_PsiT, R->MinStep, R->ActualLocalPsiNo, R->PsiStep, (int)iter, s_multi->f);
        if (*SuperStop != 1)
          *SuperStop = CheckCPULIM(P);
        *Stop = CalculateMinimumStop(P, *SuperStop);
        P->Speed.Steps++; // step on
        R->LevS->Step++; 
      }
      // now release normalization condition and minimize wrt to norm
      *Stop = 0;
      while (*Stop != 1) {
        currentPsi = R->LevS->LPsi->LocalPsi[R->ActualLocalPsiNo];
        iter = 0;
        status = 0;
        m = 1.;
        a = 0.001;
        b = 10.;
        f_a = perturbed_function (a, P);
        f_b = perturbed_function (b, P);
        f_m = perturbed_function (m, P);
        if ((f_m < f_a) && (f_m < f_b)) {
          gsl_min_fminimizer_set (s, &F, m, a, b);
          do {  // look for minimum along current local psi
            iter++;
            status = gsl_min_fminimizer_iterate (s);
            m = gsl_min_fminimizer_x_minimum (s);
            a = gsl_min_fminimizer_x_lower (s);
            b = gsl_min_fminimizer_x_upper (s);         
            status = gsl_min_test_interval (a, b, 0.001, 0.0);
            if (status == GSL_SUCCESS)
              printf ("Minimum found at:\n");
            printf ("%5d [%.7f, %.7f] %.7f %.7f\n",
               (int) iter, a, b,
               m, b - a);
          } while (status == GSL_CONTINUE && iter < 100);
          old_norm = GramSchGetNorm2(P,P->R.LevS,P->R.LevS->LPsi->LocalPsi[P->R.ActualLocalPsiNo]);
          for (i=0; i< n; i++) {
            currentPsi[i].re *= m/old_norm; // real part
            currentPsi[i].im *= m/old_norm; // imaginary part
          }
        }
        P->R.PsiStep = P->R.MaxPsiStep; // make it advance to next Psi
        //debug(P,"UpdateActualPsiNo");
        UpdateActualPsiNo(P, type); // step on to next perturbed Psi
        if (*SuperStop != 1)
          *SuperStop = CheckCPULIM(P);
        *Stop = CalculateMinimumStop(P, *SuperStop);
        P->Speed.Steps++; // step on
        R->LevS->Step++; 
      }
      if(P->Call.out[NormalOut]) fprintf(stderr,"(%i) Write %s srcpsi to disk\n", P->Par.me, R->MinimisationName[type]);
      OutputSrcPsiDensity(P, type);
//      if (!TestReadnWriteSrcDensity(P,type))
//        Error(SomeError,"TestReadnWriteSrcDensity failed!");
    }

    TestGramSch(P,R->LevS,Psi, type); // functions are orthonormal?
    // calculate current density summands
    //if (P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Filling current density grid ...\n",P->Par.me);
    SpeedMeasure(P, CurrDensTime, StartTimeDo);
    if (*SuperStop != 1) {
      if ((R->DoFullCurrent == 1) || ((R->DoFullCurrent == 2) && (CheckOrbitalOverlap(P) == 1))) { //test to check whether orbitals have mutual overlap and thus \\DeltaJ_{xc} must not be dropped
        R->DoFullCurrent = 1; // set to 1 if it was 2 but Check...() yielded necessity
        //debug(P,"Filling with Delta j ...");
        //FillDeltaCurrentDensity(P);
      }// else
        //debug(P,"There is no overlap between orbitals.");
      //debug(P,"Filling with j ...");
      FillCurrentDensity(P);
    }
    SpeedMeasure(P, CurrDensTime, StopTimeDo);
    
    SetGramSchExtraPsi(P,Psi,NotUsedToOrtho); // remove extra Psis from orthogonality check
    ResetGramSchTagType(P, Psi, type, NotUsedToOrtho);  // remove this group from the check for the next minimisation group as well!
  }
  UpdateActualPsiNo(P, Occupied); // step on back to an occupied one
  
  gsl_multimin_fdfminimizer_free (s_multi);
  gsl_min_fminimizer_free (s);
  gsl_vector_free (x);
  //gsl_vector_free (ss);
}
*/