Context Navigation

wannier.c@ 4f1369

Visit:

Last change on this file since 4f1369 was 4f1369, checked in by Frederik Heber <heber@…>, 17 years ago

Wannier-file split up: ComputeMWLF() into smaller functions

This split up was done in the SVN repository and tested there and is working. It was impossible to piece it up into smaller chunks in order to maintain "bisectability", hence the last wannier.* from the SVN rep. was used. However, there was one bug in ParseWannierFile: There is the define "type" and there was a enum variable also of name type which caused errors on reading perturbed densities in the resulting shielding values. Removing the variable completely fixed this.
Note: With regards to the shielding values of c2h4 at 24Ht the resulting values are still absolutely the same as with revision initial_commit

Property mode set to 100644

File size: 93.4 KB

Line
1	/** \file wannier.c
2	* Maximally Localized Wannier Functions.
3	*
4	* Contains the on function that minimises the spread of all orbitals in one rush in a parallel
5	* Jacobi-Diagonalization implementation, ComputeMLWF(), and one routine CalculateSpread() to
6	* calculate the spread of a specific orbital, which may be useful in checking on the change of
7	* spread during other calculations. convertComplex() helps in typecasting fftw_complex to gsl_complex.
8	*
9	Project: ParallelCarParrinello
10	\author Frederik Heber
11	\date 2006
12
13	File: wannier.c
14	$Id: wannier.c,v 1.7 2007-10-12 15:50:38 heber Exp $
15	*/
16
17	#include <math.h>
18	#include <gsl/gsl_math.h>
19	#include <gsl/gsl_eigen.h>
20	#include <gsl/gsl_matrix.h>
21	#include <gsl/gsl_vector.h>
22	#include <gsl/gsl_complex.h>
23	#include <gsl/gsl_complex_math.h>
24	#include <gsl/gsl_sort_vector.h>
25	#include <gsl/gsl_heapsort.h>
26	#include <gsl/gsl_blas.h>
27	#include <string.h>
28
29	#include "data.h"
30	#include "density.h"
31	#include "errors.h"
32	#include "gramsch.h"
33	#include "helpers.h"
34	#include "init.h"
35	#include "myfft.h"
36	#include "mymath.h"
37	#include "output.h"
38	#include "perturbed.h"
39	#include "wannier.h"
40
41
42	#define max_operators NDIM*2 //!< number of chosen self-adjoint operators when evaluating the spread
43	#define type Occupied
44
45
46	/** Converts type fftw_complex to gsl_complex.
47	* \param a complex number
48	* \return b complex number
49	*/
50	gsl_complex convertComplex (fftw_complex a) {
51	return gsl_complex_rect(c_re(a),c_im(a));
52	}
53
54	/** "merry go round" implementation for parallel index ordering.
55	* Given two arrays, one for the upper/left matrix columns, one for the lower/right ones, one step of an index generation is
56	* performed which generates once each possible pairing.
57	* \param *top index array 1
58	* \param *bot index array 2
59	* \param m N/2, where N is the matrix row/column dimension
60	* \note taken from [Golub, Matrix computations, 1989, p451]
61	*/
62	void MerryGoRoundIndices(int top, int bot, int m)
63	{
64	int old_top, old_bot;
65	int k;
66	old_top = (int ) Malloc(sizeof(int)m, "music: old_top");
67	old_bot = (int ) Malloc(sizeof(int)m, "music: old_bot");
68	/* fprintf(stderr,"oldtop\t");
69	for (k=0;k<m;k++)
70	fprintf(stderr,"%i\t", top[k]);
71	fprintf(stderr,"\n");
72	fprintf(stderr,"oldbot\t");
73	for (k=0;k<m;k++)
74	fprintf(stderr,"%i\t", bot[k]);
75	fprintf(stderr,"\n");*/
76	// first copy arrays
77	for (k=0;k<m;k++) {
78	old_top[k] = top[k];
79	old_bot[k] = bot[k];
80	}
81	// then let the music play
82	for (k=0;k<m;k++) {
83	if (k==1)
84	top[k] = old_bot[0];
85	else if (k > 1)
86	top[k] = old_top[k-1];
87	if (k==m-1)
88	bot[k] = old_top[k];
89	else
90	bot[k] = old_bot[k+1];
91	}
92	/* fprintf(stderr,"top\t");
93	for (k=0;k<m;k++)
94	fprintf(stderr,"%i\t", top[k]);
95	fprintf(stderr,"\n");
96	fprintf(stderr,"bot\t");
97	for (k=0;k<m;k++)
98	fprintf(stderr,"%i\t", bot[k]);
99	fprintf(stderr,"\n");*/
100	// and finito
101	Free(old_top, "MerryGoRoundIndices: old_top");
102	Free(old_bot, "MerryGoRoundIndices: old_bot");
103	}
104
105	/** merry-go-round for matrix columns.
106	* The trick here is that we must be aware of multiple rotations per process, thus only some of the
107	* whole lot of local columns get sent/received, most of them are just shifted via exchanging the various
108	* pointers to the matrix columns within the local array.
109	* \param comm communicator for circulation
110	* \param *Aloc local array of columns
111	* \param Num entries per column
112	* \param max_rounds number of column pairs in \a *Around
113	* \param k offset for tag
114	* \param tagS0 MPI tag for sending left column
115	* \param tagS1 MPI tag for sending right column
116	* \param tagR0 MPI tag for receiving left column
117	* \param tagR1 MPI tag for receiving right column
118	*/
119	void MerryGoRoundColumns(MPI_Comm comm, double **Aloc, int Num, int max_rounds, int k, int tagS0, int tagS1, int tagR0, int tagR1) {
120	//double A_locS1, A_locS2; // local columns of A[k]
121	//double A_locR1, A_locR2; // local columns of A[k]
122	MPI_Request requestS0, requestS1, requestR0, requestR1;
123	MPI_Status status;
124	int ProcRank, ProcNum;
125	int l;
126	MPI_Comm_size (comm, &ProcNum);
127	MPI_Comm_rank (comm, &ProcRank);
128	double Abuffer1, Abuffer2; // mark the columns that are circulated
129
130	//fprintf(stderr,"shifting...");
131	if (ProcRank == 0) {
132	if (max_rounds > 1) {
133	// get last left column
134	Abuffer1 = Aloc[2*(max_rounds-1)]; // note down the free column
135	MPI_Isend(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, WannierALTag+2*k, comm, &requestS0);
136	} else {
137	// get right column
138	Abuffer1 = Aloc[1]; // note down the free column
139	MPI_Isend(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, tagS1+2*k, comm, &requestS0);
140	}
141
142	//fprintf(stderr,"...left columns...");
143	for(l=2*max_rounds-2;l>2;l-=2) // left columns become shifted one place to the right
144	Aloc[l] = Aloc[l-2];
145
146	if (max_rounds > 1) {
147	//fprintf(stderr,"...first right...");
148	Aloc[2] = Aloc[1]; // get first right column
149	}
150
151	//fprintf(stderr,"...right columns...");
152	for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
153	Aloc[l] = Aloc[l+2];
154
155	//fprintf(stderr,"...last right...");
156	Aloc[(2*max_rounds-1)] = Abuffer1;
157	MPI_Irecv(Abuffer1, Num, MPI_DOUBLE, ProcRank+1, WannierARTag+2*k, comm, &requestR1);
158
159	} else if (ProcRank == ProcNum-1) {
160	//fprintf(stderr,"...first right...");
161	// get first right column
162	Abuffer2 = Aloc[1]; // note down the free column
163	MPI_Isend(Abuffer2, Num, MPI_DOUBLE, ProcRank-1, WannierARTag+2*k, comm, &requestS1);
164
165	//fprintf(stderr,"...right columns...");
166	for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
167	Aloc[(l)] = Aloc[(l+2)];
168
169	//fprintf(stderr,"...last right...");
170	Aloc[(2max_rounds-1)] = Aloc[2(max_rounds-1)]; // Put last left into last right column
171
172	//fprintf(stderr,"...left columns...");
173	for(l=2*(max_rounds-1);l>0;l-=2) // left columns become shifted one place to the right
174	Aloc[(l)] = Aloc[(l-2)];
175
176	//fprintf(stderr,"...first left...");
177	// if (max_rounds > 1)
178	Aloc[0] = Abuffer2; // get first left column
179	MPI_Irecv(Abuffer2, Num, MPI_DOUBLE, ProcRank-1, WannierALTag+2*k, comm, &requestR0);
180
181	} else {
182	// get last left column
183	MPI_Isend(Aloc[2(max_rounds-1)], Num, MPI_DOUBLE, ProcRank+1, WannierALTag+2k, comm, &requestS0);
184	Abuffer1 = Aloc[2*(max_rounds-1)]; // note down the free column
185
186	//fprintf(stderr,"...first right...");
187	// get first right column
188	MPI_Isend(Aloc[1], Num, MPI_DOUBLE, ProcRank-1, WannierARTag+2*k, comm, &requestS1);
189	Abuffer2 = Aloc[1]; // note down the free column
190
191	//fprintf(stderr,"...left columns...");
192	for(l=2*(max_rounds-1);l>0;l-=2) // left columns become shifted one place to the right
193	Aloc[(l)] = Aloc[(l-2)];
194
195	//fprintf(stderr,"...right columns...");
196	for(l=1;l<2*max_rounds-1;l+=2) // right columns become shifted one place to the left
197	Aloc[(l)] = Aloc[(l+2)];
198
199	//fprintf(stderr,"...first left...");
200	Aloc[0] = Abuffer1; // get first left column
201	MPI_Irecv(Aloc[0], Num, MPI_DOUBLE, ProcRank-1, WannierALTag+2*k, comm, &requestR0);
202
203	//fprintf(stderr,"...last right...");
204	Aloc[(2*max_rounds-1)] = Abuffer2;
205	MPI_Irecv(Aloc[(2max_rounds-1)], Num, MPI_DOUBLE, ProcRank+1, WannierARTag+2k, comm, &requestR1);
206	}
207
208	//fprintf(stderr,"...waiting...");
209	if (ProcRank != ProcNum-1)
210	MPI_Wait(&requestS0, &status);
211	if (ProcRank != 0) // first left column
212	MPI_Wait(&requestR0, &status);
213	if (ProcRank != 0)
214	MPI_Wait(&requestS1, &status);
215	if (ProcRank != ProcNum-1)
216	MPI_Wait(&requestR1, &status);
217	//fprintf(stderr,"...done\n");
218	}
219
220	/** By testing of greatest common divisor of the matrix rows (\a AllocNum) finds suitable parallel cpu group.
221	* \param *P Problem at hand
222	* \param AllocNum number of rows in matrix
223	* \return address of MPI communicator
224	*/
225	#ifdef HAVE_INLINE
226	inline MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
227	#else
228	MPI_Comm * DetermineParallelGroupbyGCD (struct Problem *P, int AllocNum)
229	#endif
230	{
231	MPI_Comm *comm = &P->Par.comm_ST;
232
233	//if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Comparing groups - AllocNum %i --- All %i\t Psi %i\t PsiT %i\n",P->Par.me, AllocNum, P->Par.Max_me_comm_ST, P->Par.Max_me_comm_ST_Psi, P->Par.Max_my_color_comm_ST_Psi);
234	//if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) Jacobi diagonalization is done parallely by ", P->Par.me);
235	if (AllocNum % (P->Par.Max_me_comm_ST*2) == 0) { // all parallel
236	comm = &P->Par.comm_ST;
237	//if (P->Call.out[ReadOut]) fprintf(stderr,"all\n");
238	} else if (P->Par.Max_me_comm_ST_Psi >= P->Par.Max_my_color_comm_ST_Psi) { // always the bigger group comes first
239	if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) { // coefficients parallel
240	comm = &P->Par.comm_ST_Psi;
241	//if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");
242	} else if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) { // Psis parallel
243	comm = &P->Par.comm_ST_PsiT;
244	//if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");
245	}
246	} else {
247	if (AllocNum % (P->Par.Max_my_color_comm_ST_Psi*2) == 0) { // Psis parallel
248	comm = &P->Par.comm_ST_PsiT;
249	//if (P->Call.out[ReadOut]) fprintf(stderr,"PsiT\n");
250	} else if (AllocNum % (P->Par.Max_me_comm_ST_Psi*2) == 0) { // coefficients parallel
251	comm = &P->Par.comm_ST_Psi;
252	//if (P->Call.out[ReadOut]) fprintf(stderr,"Psi\n");
253	}
254	}
255	return comm;
256	}
257
258	/** Allocates and fills Lookup table for sin/cos values at each grid node.
259	* \param ***cos_table pointer to two-dimensional lookup table for cosine values
260	* \param ***sin_table pointer to two-dimensional lookup table for sine values
261	* \param *N array with number of nodes per \a NDIM axis
262	*/
263	void CreateSinCosLookupTable(double *cos_table, double sin_table, int N)
264	{
265	int i, j;
266	double argument;
267	double cos_lookup, sin_lookup;
268
269	// create lookup table for sin/cos values
270	cos_lookup = (double *) Malloc(sizeof(double )NDIM, "ComputeMLWF: cos_lookup");
271	sin_lookup = (double *) Malloc(sizeof(double )NDIM, "ComputeMLWF: sin_lookup");
272	for (i=0;i<NDIM;i++) {
273	// allocate memory
274	cos_lookup[i] = (double ) Malloc(sizeof(double)N[i], "ComputeMLWF: cos_lookup");
275	sin_lookup[i] = (double ) Malloc(sizeof(double)N[i], "ComputeMLWF: sin_lookup");
276
277	// reset arrays
278	SetArrayToDouble0(cos_lookup[i],N[i]);
279	SetArrayToDouble0(sin_lookup[i],N[i]);
280
281	// create lookup values
282	for (j=0;j<N[i];j++) {
283	argument = 2PI/(double)N[i](double)j;
284	cos_lookup[i][j] = cos(argument);
285	sin_lookup[i][j] = sin(argument);
286	}
287	}
288	*cos_table = cos_lookup;
289	*sin_table = sin_lookup;
290	}
291
292	/** Frees memory allocated during CreateSinCosLookupTable().
293	* \param ***cos_lookup pointer to two-dimensional lookup table for cosine values
294	* \param ***sin_lookup pointer to two-dimensional lookup table for sine values
295	*/
296	void FreeSinCosLookupTable(double cos_lookup, double sin_lookup)
297	{
298	int i;
299	// free lookups
300	for (i=0;i<NDIM;i++) {
301	Free(cos_lookup[i], "FreeSinCosLookupTable: cos_lookup[i]");
302	Free(sin_lookup[i], "FreeSinCosLookupTable: sin_lookup[i]");
303	}
304	Free(cos_lookup, "FreeSinCosLookupTable: cos_lookup");
305	Free(sin_lookup, "FreeSinCosLookupTable: sin_lookup");
306	}
307
308	/** Fills the entries of the six variance matrices.
309	* These matrices are parallely diagonalized during Wannier Localization. They are calculated from the
310	* wave function and by diagonalization one obtains the unitary transformation with which the Psis are
311	* treated afterwards.
312	* \param *P Problem at hand
313	* \param AllocNum number of rows/columns
314	* \param **A pointer to variance matrices
315	* \sa ComputeMLWF() - master function.
316	*/
317	void FillHigherOrderRealMomentsMatrices(struct Problem P, int AllocNum, gsl_matrix *A)
318	{
319	struct Lattice *Lat = &P->Lat;
320	struct RunStruct *R = &P->R;
321	struct Psis *Psi = &Lat->Psi;
322	struct LatticeLevel *Lev0 = R->Lev0;
323	struct LatticeLevel *LevS = R->LevS;
324	struct Density *Dens0 = Lev0->Dens;
325	struct OnePsiElement OnePsiA, OnePsiB, *LOnePsiB;
326	struct fft_plan_3d *plan = Lat->plan;
327	fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
328	fftw_real PsiCR = (fftw_real )PsiC;
329	fftw_complex *work = Dens0->DensityCArray[Temp2Density];
330	fftw_real **HGcR = &Dens0->DensityArray[HGDensity]; // use HGDensity, 4x Gap..Density, TempDensity as a storage array
331	fftw_complex HGcRC = (fftw_complex)HGcR;
332	fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity]; // use HGcDensity, 4x Gap..Density, TempDensity as an array
333	fftw_real HGcR2 = (fftw_real)HGcR2C;
334	int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
335	MPI_Status status;
336	fftw_complex LPsiDatA=NULL, LPsiDatB=NULL;
337	int n[NDIM],n0,i0,iS, Index;
338	int Num = Psi->NoOfPsis; // is number of occupied plus unoccupied states for rows
339	int N0 = LevS->Plan0.plan->local_nx;
340	int *N = LevS->Plan0.plan->N;
341	const int NUpx = LevS->NUp[0];
342	const int NUpy = LevS->NUp[1];
343	const int NUpz = LevS->NUp[2];
344	double argument, PsiAtNode;
345	int e,g,i,j,k,l,m,p,u;
346	double a_ij = 0, b_ij = 0, A_ij = 0, B_ij = 0;
347	double cos_lookup = NULL,sin_lookup = NULL;
348
349	if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 2\n",P->Par.me);
350
351	debug(P, "Creating Lookup Table");
352	CreateSinCosLookupTable(&cos_lookup, &sin_lookup, N);
353
354	debug(P, "Calculating each entry of variance matrices");
355	l=-1; // to access U matrix element (0..Num-1)
356	// fill the matrices
357	for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) { // go through all wave functions
358	OnePsiA = &Psi->AllPsiStatus[i]; // grab OnePsiA
359	if (OnePsiA->PsiType == type) { // drop all but occupied ones
360	l++; // increase l if it is non-extra wave function
361	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
362	LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
363	else
364	LPsiDatA = NULL; // otherwise processes won't enter second loop, though they're supposed to send coefficients!
365
366	//fprintf(stderr,"(%i),(%i,%i): fft'd, A[..] and B, back-fft'd acting on \\phi_A\n",P->Par.me,l,0);
367	if (LPsiDatA != NULL) {
368	CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
369	// note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
370	for (n0=0;n0<N0;n0++)
371	for (n[1]=0;n[1]<N[1];n[1]++)
372	for (n[2]=0;n[2]<N[2];n[2]++) {
373	i0 = n[2]NUpz+N[2]NUpz(n[1]NUpy+N[1]NUpyn0*NUpx);
374	iS = n[2]+N[2](n[1]+N[1]n0);
375	n[0] = n0 + LevS->Plan0.plan->start_nx;
376	for (k=0;k<max_operators;k+=2) {
377	e = k/2;
378	argument = 2.PI/(double)(N[e])(double)(n[e]);
379	PsiAtNode = PsiCR[i0] /LevS->MaxN;
380	// check lookup
381	if (!l) // perform check on first wave function only
382	if ((fabs(cos(argument) - cos_lookup[e][n[e]]) > MYEPSILON) \|\| (fabs(sin(argument) - sin_lookup[e][n[e]]) > MYEPSILON)) {
383	Error(SomeError, "Lookup table does not match real value!");
384	}
385	HGcR[k][iS] = cos_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
386	HGcR2[k][iS] = cos_lookup[e][n[e]] * HGcR[k][iS]; /* Matrix Vector Mult */
387	HGcR[k+1][iS] = sin_lookup[e][n[e]] * PsiAtNode; /* Matrix Vector Mult */
388	HGcR2[k+1][iS] = sin_lookup[e][n[e]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
389	}
390	}
391	for (u=0;u<max_operators;u++) {
392	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[u], work);
393	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[u], work);
394	}
395	}
396	m = -1; // to access U matrix element (0..Num-1)
397	for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) { // go through all wave functions
398	OnePsiB = &Psi->AllPsiStatus[j]; // grab OnePsiB
399	if (OnePsiB->PsiType == type) { // drop all but occupied ones
400	m++; // increase m if it is non-extra wave function
401	if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
402	LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
403	else
404	LOnePsiB = NULL;
405	if (LOnePsiB == NULL) { // if it's not local ... receive it from respective process into TempPsi
406	RecvSource = OnePsiB->my_color_comm_ST_Psi;
407	MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
408	LPsiDatB=LevS->LPsi->TempPsi;
409	} else { // .. otherwise send it to all other processes (Max_me... - 1)
410	for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
411	if (p != OnePsiB->my_color_comm_ST_Psi)
412	MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
413	LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
414	} // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
415
416	for (u=0;u<max_operators;u++) {
417	a_ij = 0;
418	b_ij = 0;
419	if (LPsiDatA != NULL) { // calculate, reduce and send to all ...
420	//fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,u);
421	g=0;
422	if (LevS->GArray[0].GSq == 0.0) {
423	Index = LevS->GArray[g].Index;
424	a_ij = (LPsiDatB[0].reHGcRC[u][Index].re + LPsiDatB[0].imHGcRC[u][Index].im);
425	b_ij = (LPsiDatB[0].reHGcR2C[u][Index].re + LPsiDatB[0].imHGcR2C[u][Index].im);
426	g++;
427	}
428	for (; g < LevS->MaxG; g++) {
429	Index = LevS->GArray[g].Index;
430	a_ij += 2(LPsiDatB[g].reHGcRC[u][Index].re + LPsiDatB[g].im*HGcRC[u][Index].im);
431	b_ij += 2(LPsiDatB[g].reHGcR2C[u][Index].re + LPsiDatB[g].im*HGcR2C[u][Index].im);
432	} // due to the symmetry the resulting matrix element is real and symmetric in (i,j) ! (complex multiplication simplifies ...)
433	// sum up elements from all coefficients sharing processes
434	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
435	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
436	a_ij = A_ij;
437	b_ij = B_ij;
438	// send element to all Psi-sharing who don't have l local (MPI_Send is a lot slower than AllReduce!)
439	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
440	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
441	} else { // receive ...
442	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
443	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_PsiT);
444	}
445	// ... and store
446	//fprintf(stderr,"(%i),(%i,%i): A[%i]: setting component (local: %lg, total: %lg)\n",P->Par.me, l,m,u,a_ij,A_ij);
447	//fprintf(stderr,"(%i),(%i,%i): B: adding upon component (local: %lg, total: %lg)\n",P->Par.me, l,m,b_ij,B_ij);
448	gsl_matrix_set(A[u], l, m, A_ij);
449	gsl_matrix_set(A[max_operators], l, m, B_ij + gsl_matrix_get(A[max_operators],l,m));
450	}
451	}
452	}
453	}
454	}
455	// reset extra entries
456	for (u=0;u<=max_operators;u++) {
457	for (i=Num;i<AllocNum;i++)
458	for (j=0;j<AllocNum;j++)
459	gsl_matrix_set(A[u], i,j, 0.);
460	for (i=Num;i<AllocNum;i++)
461	for (j=0;j<AllocNum;j++)
462	gsl_matrix_set(A[u], j,i, 0.);
463	}
464	FreeSinCosLookupTable(cos_lookup, sin_lookup);
465
466	/*// print A matrices for debug
467	if (P->Par.me == 0)
468	for (u=0;u<max_operators+1;u++) {
469	fprintf(stderr, "A[%i] = \n",u);
470	for (i=0;i<Num;i++) {
471	for (j=0;j<Num;j++)
472	fprintf(stderr, "%e\t",gsl_matrix_get(A[u],i,j));
473	fprintf(stderr, "\n");
474	}
475	}
476	*/
477	}
478
479	/** Calculates reciprocal second order moments of each PsiType#Occupied orbital.
480	* First order is zero due to wave function being real (symmetry condition).
481	* \param *P Problem at hand
482	*/
483	void CalculateSecondOrderReciprocalMoment(struct Problem *P)
484	{
485	struct Lattice *Lat = &P->Lat;
486	struct RunStruct *R = &P->R;
487	struct FileData *F = &P->Files;
488	struct Psis *Psi = &Lat->Psi;
489	struct LatticeLevel *LevS = R->LevS;
490	double result, Result;
491	fftw_complex *LPsiDatA=NULL;
492	struct OnePsiElement *OnePsiA;
493	int i,j,l,g;
494	char spin[12], suffix[18];
495
496	switch (Lat->Psi.PsiST) {
497	case SpinDouble:
498	strcpy(suffix,".recispread.csv");
499	strcpy(spin,"SpinDouble");
500	break;
501	case SpinUp:
502	strcpy(suffix,".recispread_up.csv");
503	strcpy(spin,"SpinUp");
504	break;
505	case SpinDown:
506	strcpy(suffix,".recispread_down.csv");
507	strcpy(spin,"SpinDown");
508	break;
509	}
510	if(P->Par.me_comm_ST == 0) {
511	if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Calculating reciprocal moments ...\n",P->Par.me);
512	if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level
513	OpenFile(P, &F->ReciSpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
514	else if (F->ReciSpreadFile == NULL) // re-op,18en if not first level and not opened yet (or closed from ParseWannierFile)
515	OpenFile(P, &F->ReciSpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
516	if (F->ReciSpreadFile == NULL) {
517	Error(SomeError,"ComputeMLWF: Error opening Reciprocal spread File!\n");
518	} else {
519	fprintf(F->ReciSpreadFile,"===== Reciprocal Spreads of type %s ==========================================================================\n", spin);
520	}
521	}
522
523	// integrate second order moment
524	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
525	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
526	if (OnePsiA->PsiType == type) { // drop all but occupied ones
527	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
528	LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
529	else
530	LPsiDatA = NULL;
531
532	if (LPsiDatA != NULL) {
533	if (P->Par.me_comm_ST == 0)
534	fprintf(F->ReciSpreadFile,"Psi%d_Lev%d\t", Psi->AllPsiStatus[l].MyGlobalNo, R->LevSNo);
535	for (i=0;i<NDIM;i++) {
536	for (j=0;j<NDIM;j++) {
537	result = 0.;
538	g = 0;
539	if (LevS->GArray[0].GSq == 0.0) {
540	result += LevS->GArray[g].G[i]LevS->GArray[g].G[j](LPsiDatA[0].re * LPsiDatA[0].re);
541	g++;
542	}
543	for (;g<LevS->MaxG;g++)
544	result += LevS->GArray[g].G[i]LevS->GArray[g].G[j]2.(LPsiDatA[g].re LPsiDatA[g].re + LPsiDatA[g].im * LPsiDatA[g].im);
545	//result *= Lat->Volume/LevS->MaxG;
546	MPI_Allreduce ( &result, &Result, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
547	if (P->Par.me_comm_ST == 0)
548	fprintf(F->ReciSpreadFile,"%2.5lg ", Result);
549	}
550	}
551	if (P->Par.me_comm_ST == 0)
552	fprintf(F->ReciSpreadFile,"\n");
553	}
554	}
555	}
556	if(P->Par.me_comm_ST == 0) {
557	fprintf(F->ReciSpreadFile,"====================================================================================================================\n\n");
558	fflush(F->ReciSpreadFile);
559	}
560	}
561
562	/** Given the unitary matrix the transformation is performed on the Psis.
563	* \param *P Problem at hand
564	* \param *U gsl matrix containing the transformation matrix
565	* \param Num dimension parameter for the matrix, i.e. number of wave functions
566	*/
567	void UnitaryTransformationOnWavefunctions(struct Problem P, gsl_matrix U, int Num)
568	{
569	struct Lattice *Lat = &P->Lat;
570	struct RunStruct *R = &P->R;
571	struct Psis *Psi = &Lat->Psi;
572	struct LatticeLevel *LevS = R->LevS;
573	MPI_Status status;
574	struct OnePsiElement OnePsiB, OnePsiA, *LOnePsiB;
575	int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
576	fftw_complex LPsiDatA=NULL, LPsiDatB=NULL;
577	int g,i,j,l,k,m,p;
578
579	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation of all wave functions according to U\n",P->Par.me);
580
581	Num = Psi->TypeStartIndex[type+1] - Psi->TypeStartIndex[type]; // recalc Num as we can only work with local Psis from here
582	fftw_complex *coeffs_buffer = Malloc(sizeof(fftw_complex )Num, "ComputeMLWF: *coeffs_buffer");
583
584	for (l=0;l<Num;l++) // allocate for each local wave function
585	coeffs_buffer[l] = LevS->LPsi->OldLocalPsi[l];
586
587	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 6: Transformation ...\n",P->Par.me);
588	l=-1; // to access U matrix element (0..Num-1)
589	k=-1; // to access the above swap coeffs_buffer (0..LocalNo-1)
590	for (i=0; i < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; i++) { // go through all wave functions
591	OnePsiA = &Psi->AllPsiStatus[i]; // grab OnePsiA
592	if (OnePsiA->PsiType == type) { // drop all but occupied ones
593	l++; // increase l if it is occupied wave function
594	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // local?
595	k++; // increase k only if it is a local, non-extra orbital wave function
596	LPsiDatA = (fftw_complex *) coeffs_buffer[k]; // new coeffs first go to copy buffer, old ones must not be overwritten yet
597	SetArrayToDouble0((double )LPsiDatA, 2LevS->MaxG); // zero buffer part
598	} else
599	LPsiDatA = NULL; // otherwise processes won't enter second loop, though they're supposed to send coefficients!
600
601	m = -1; // to access U matrix element (0..Num-1)
602	for (j=0; j < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; j++) { // go through all wave functions
603	OnePsiB = &Psi->AllPsiStatus[j]; // grab OnePsiB
604	if (OnePsiB->PsiType == type) { // drop all but occupied ones
605	m++; // increase m if it is occupied wave function
606	if (OnePsiB->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
607	LOnePsiB = &Psi->LocalPsiStatus[OnePsiB->MyLocalNo];
608	else
609	LOnePsiB = NULL;
610	if (LOnePsiB == NULL) { // if it's not local ... receive it from respective process into TempPsi
611	RecvSource = OnePsiB->my_color_comm_ST_Psi;
612	MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag2, P->Par.comm_ST_PsiT, &status );
613	LPsiDatB=LevS->LPsi->TempPsi;
614	} else { // .. otherwise send it to all other processes (Max_me... - 1)
615	for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
616	if (p != OnePsiB->my_color_comm_ST_Psi)
617	MPI_Send( LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag2, P->Par.comm_ST_PsiT);
618	LPsiDatB=LevS->LPsi->LocalPsi[OnePsiB->MyLocalNo];
619	} // LPsiDatB is now set to the coefficients of OnePsi either stored or MPI_Received
620
621	if (LPsiDatA != NULL) {
622	double tmp = gsl_matrix_get(U,l,m);
623	g=0;
624	if (LevS->GArray[0].GSq == 0.0) {
625	LPsiDatA[g].re += LPsiDatB[g].re * tmp;
626	LPsiDatA[g].im += LPsiDatB[g].im * tmp;
627	g++;
628	}
629	for (; g < LevS->MaxG; g++) {
630	LPsiDatA[g].re += LPsiDatB[g].re * tmp;
631	LPsiDatA[g].im += LPsiDatB[g].im * tmp;
632	}
633	}
634	}
635	}
636	}
637	}
638
639	//if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) STEP 6: Swapping buffer mem\n",P->Par.me);
640	// now, as all wave functions are updated, swap the buffer
641	l = -1;
642	for (k=0;k<Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT;k++) { // go through each local occupied wave function
643	if (Psi->AllPsiStatus[k].PsiType == type && Psi->AllPsiStatus[k].my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {
644	l++;
645	//if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) (k:%i,l:%i) LocalNo = (%i,%i)\t AllPsiNo = (%i,%i)\n", P->Par.me, k,l,Psi->LocalPsiStatus[l].MyLocalNo, Psi->LocalPsiStatus[l].MyGlobalNo, Psi->AllPsiStatus[k].MyLocalNo, Psi->AllPsiStatus[k].MyGlobalNo);
646	LPsiDatA = (fftw_complex *)coeffs_buffer[l];
647	LPsiDatB = LevS->LPsi->LocalPsi[l];
648	for (g=0;g<LevS->MaxG;g++) {
649	LPsiDatB[g].re = LPsiDatA[g].re;
650	LPsiDatB[g].im = LPsiDatA[g].im;
651	}
652	// recalculating non-local form factors which are coefficient dependent!
653	CalculateNonLocalEnergyNoRT(P, Psi->LocalPsiStatus[l].MyLocalNo);
654	}
655	}
656	// and free allocated buffer memory
657	Free(coeffs_buffer, "UnitaryTransformationOnWavefunctions: coeffs_buffer");
658	}
659
660	/** Changes Wannier Centres according to RunStruct#CommonWannier.
661	* \param *P Problem at hand.
662	* \param Num number of Psis
663	* \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
664	* \param *WannierSpread array with wannier spread per wave function
665	*/
666	void ChangeWannierCentres(struct Problem P, int Num, double WannierCentre, double WannierSpread)
667	{
668	struct RunStruct *R = &P->R;
669	struct Lattice *Lat = &P->Lat;
670	struct LatticeLevel *LevS = R->LevS;
671	int marker, *group;
672	int partner[Num];
673	int i,j,l,k;
674	int totalflag, flag;
675	double q[NDIM], center[NDIM];
676	double Spread;
677	int *N = LevS->Plan0.plan->N;
678
679	switch (R->CommonWannier) {
680	case 4:
681	debug(P,"Shifting each Wannier centers to cell center");
682	for (j=0;j<NDIM;j++) // center point in [0,1]^3
683	center[j] = 0.5;
684	RMat33Vec3(q,Lat->RealBasis, center); // transform to real coordinates
685	for (i=0; i < Num; i++) { // go through all occupied wave functions
686	for (j=0;j<NDIM;j++) // put into Wannier centres
687	WannierCentre[i][j] = q[j];
688	}
689	break;
690	case 3:
691	debug(P,"Shifting Wannier centers individually to nearest grid point");
692	for (i=0;i < Num; i++) { // go through all wave functions
693	RMat33Vec3(q, Lat->ReciBasis, WannierCentre[i]);
694	for (j=0;j<NDIM;j++) { // Recibasis is not true inverse but times 2.*PI
695	q[j] = (double)N[j]/(2.PI);
696
697	//fprintf(stderr,"(%i) N[%i]: %i\t tmp %e\t floor %e\t ceil %e\n",P->Par.me, j, N[j], tmp, floor(tmp), ceil(tmp));
698	if (fabs((double)floor(q[j]) - q[j]) < fabs((double)ceil(q[j]) - q[j]))
699	q[j] = floor(q[j])/(double)N[j];
700	else
701	q[j] = ceil(q[j])/(double)N[j];
702	}
703	RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
704	}
705	break;
706	case 2:
707	debug(P,"Combining individual orbitals according to spread.");
708	//fprintf(stderr,"(%i) Finding multiple bindings and Reweighting Wannier centres\n",P->Par.me);
709	debug(P,"finding partners");
710	marker = (int) Malloc(sizeof(int)(Num+1),"ComputeMLWF: marker");
711	group = (int*) Malloc(sizeof(int )*Num,"ComputeMLWF: group");
712	for (l=0;l<Num;l++) {
713	group[l] = (int) Malloc(sizeof(int)(Num+1),"ComputeMLWF: group[l]"); // each must group must have one more as end marker
714	for (k=0;k<=Num;k++)
715	group[l][k] = -1; // reset partner group
716	}
717	for (k=0;k<Num;k++)
718	partner[k] = 0;
719	debug(P,"mem allocated");
720	// go for each orbital through every other, check distance against the sum of both spreads
721	// if smaller add to group of this orbital
722	for (l=0;l<Num;l++) {
723	j=0; // index for partner group
724	for (k=0;k<Num;k++) { // check this against l
725	Spread = 0.;
726	for (i=0;i<NDIM;i++) {
727	//fprintf(stderr,"(%i) Spread += (%e - %e)^2 \n", P->Par.me, WannierCentre[l][i], WannierCentre[k][i]);
728	Spread += (WannierCentre[l][i] - WannierCentre[k][i])*(WannierCentre[l][i] - WannierCentre[k][i]);
729	}
730	Spread = sqrt(Spread); // distance in Spread
731	//fprintf(stderr,"(%i) %i to %i: distance %e, SpreadSum = %e + %e = %e \n", P->Par.me, l, k, Spread, WannierSpread[l], WannierSpread[k], WannierSpread[l]+WannierSpread[k]);
732	if (Spread < 1.5*(WannierSpread[l]+WannierSpread[k])) {// if distance smaller than sum of spread
733	group[l][j++] = k; // add k to group of l
734	partner[l]++;
735	//fprintf(stderr,"(%i) %i added as %i-th member to %i's group.\n", P->Par.me, k, j, l);
736	}
737	}
738	}
739
740	// consistency, for each orbital check if this orbital is also in the group of each referred orbital
741	debug(P,"checking consistency");
742	totalflag = 1;
743	for (l=0;l<Num;l++) // checking l's group
744	for (k=0;k<Num;k++) { // k is partner index
745	if (group[l][k] != -1) { // if current index k is a partner
746	flag = 0;
747	for(j=0;j<Num;j++) { // go through each entry in l partner's partner group if l exists
748	if ((group[ group[l][k] ][j] == l))
749	flag = 1;
750	}
751	//if (flag == 0) fprintf(stderr, "(%i) in %i's group %i is referred as a partner, but not the other way round!\n", P->Par.me, l, group[l][k]);
752	if (totalflag == 1) totalflag = flag;
753	}
754	}
755	// for each orbital group (marker group) weight each center to a total and put this into the local WannierCentres
756	debug(P,"weight and calculate new centers for partner groups");
757	for (l=0;l<=Num;l++)
758	marker[l] = 1;
759	if (totalflag) {
760	for (l=0;l<Num;l++) { // go through each orbital
761	if (marker[l] != 0) { // if it hasn't been reweighted
762	marker[l] = 0;
763	for (i=0;i<NDIM;i++)
764	q[i] = 0.;
765	j = 0;
766	while (group[l][j] != -1) {
767	marker[group[l][j]] = 0;
768	for (i=0;i<NDIM;i++) {
769	//fprintf(stderr,"(%i) Adding to %i's group, %i entry of %i: %e\n", P->Par.me, l, i, group[l][j], WannierCentre[ group[l][j] ][i]);
770	q[i] += WannierCentre[ group[l][j] ][i];
771	}
772	j++;
773	}
774	//fprintf(stderr,"(%i) %i's group: (%e,%e,%e)/%i = (%e,%e,%e)\n", P->Par.me, l, q[0], q[1], q[2], j, q[0]/(double)j, q[1]/(double)j, q[2]/(double)j);
775	for (i=0;i<NDIM;i++) {// weight by number of elements in partner group
776	q[i] /= (double)(j);
777	}
778
779	// put WannierCentre into own and all partners'
780	for (i=0;i<NDIM;i++)
781	WannierCentre[l][i] = q[i];
782	j = 0;
783	while (group[l][j] != -1) {
784	for (i=0;i<NDIM;i++)
785	WannierCentre[group[l][j]][i] = q[i];
786	j++;
787	}
788	}
789	}
790	}
791	if (P->Call.out[StepLeaderOut]) {
792	fprintf(stderr,"Summary:\n");
793	fprintf(stderr,"========\n");
794	for (i=0;i<Num;i++)
795	fprintf(stderr,"%i belongs to a %i-ple binding.\n",i,partner[i]);
796	}
797	debug(P,"done");
798
799	Free(marker, "ChangeWannierCentres: marker");
800	for (l=0;l<Num;l++)
801	Free(group[l], "ChangeWannierCentres: group[l]");
802	Free(group, "ChangeWannierCentres: group");
803	break;
804	case 1:
805	debug(P,"Individual orbitals are changed to center of all.");
806	for (i=0;i<NDIM;i++) // zero center of weight
807	q[i] = 0.;
808	for (k=0;k<Num;k++)
809	for (i=0;i<NDIM;i++) { // sum up all orbitals each component
810	q[i] += WannierCentre[k][i];
811	}
812	for (i=0;i<NDIM;i++) // divide by number
813	q[i] /= Num;
814	for (k=0;k<Num;k++)
815	for (i=0;i<NDIM;i++) { // put into this function's array
816	WannierCentre[k][i] = q[i];
817	}
818	break;
819	case 0:
820	default:
821	break;
822	}
823	}
824
825	/** From the entries of the variance matrices the spread is calculated.
826	* WannierCentres are evaluated according to Resta operator: \f$\langle X \rangle = \frac{L}{2\pi} \sum_j {\cal I} \ln{\langle \Psi_j \| \exp{(i \frac{2\pi}{L} X)} \| \Psi_j \rangle}\f$
827	* WannierSpread is: \f$ \sum_j \langle \Psi_j \| r^2 \| \Psi_j \rangle - \langle \Psi_j \| r \| psi_j \rangle^2\f$
828	* \param *P Problem at hand
829	* \param *A variance matrices
830	* \param old_spread first term of wannier spread
831	* \param spread second term of wannier spread
832	* \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
833	* \param *WannierSpread array with wannier spread per wave function
834	*/
835	void ComputeWannierCentresfromVarianceMatrices(struct Problem P, gsl_matrix A, double spread, double old_spread, double WannierCentre, double WannierSpread)
836	{
837	struct Lattice *Lat = &P->Lat;
838	struct Psis *Psi = &Lat->Psi;
839	struct OnePsiElement *OnePsiA;
840	int i,j,k,l;
841	double tmp, q[NDIM];
842
843	*old_spread = 0;
844	*spread = 0;
845
846	// the spread for x,y,z resides in the respective diagonal element of A_.. for each orbital
847	i=-1;
848	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
849	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
850	if (OnePsiA->PsiType == type) { // drop all but occupied ones
851	i++; // increase l if it is occupied wave function
852	//fprintf(stderr,"(%i) Wannier for %i\n", P->Par.me, i);
853
854	// calculate Wannier Centre
855	for (j=0;j<NDIM;j++) {
856	q[j] = 1./(2.PI) GSL_IMAG( gsl_complex_log( gsl_complex_rect(gsl_matrix_get(A[j2],i,i),gsl_matrix_get(A[j2+1],i,i))));
857	if (q[j] < 0) // change wrap around of above operator to smooth 0...Lat->RealBasisSQ
858	q[j] += 1.;
859	}
860	RMat33Vec3(WannierCentre[i], Lat->RealBasis, q);
861
862	// store orbital spread and centre in file
863	tmp = - pow(gsl_matrix_get(A[0],i,i),2) - pow(gsl_matrix_get(A[1],i,i),2)
864	- pow(gsl_matrix_get(A[2],i,i),2) - pow(gsl_matrix_get(A[3],i,i),2)
865	- pow(gsl_matrix_get(A[4],i,i),2) - pow(gsl_matrix_get(A[5],i,i),2);
866	WannierSpread[i] = gsl_matrix_get(A[max_operators],i,i) + tmp;
867	//fprintf(stderr,"(%i) WannierSpread[%i] = %e\n", P->Par.me, i, WannierSpread[i]);
868	//if (P->Par.me == 0) fprintf(F->SpreadFile,"Orbital %d:\t Wannier center (x,y,z)=(%lg,%lg,%lg)\t Spread sigma^2 = %lg - %lg = %lg\n",
869	//Psi->AllPsiStatus[i].MyGlobalNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], gsl_matrix_get(A[max_operators],i,i), -tmp, WannierSpread[i]);
870	//if (P->Par.me == 0) fprintf(F->SpreadFile,"%e\t%e\t%e\n",
871	//WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2]);
872
873	// gather all spreads
874	*old_spread += gsl_matrix_get(A[max_operators],i,i); // tr(U^H B U)
875	for (k=0;k<max_operators;k++)
876	*spread += pow(gsl_matrix_get(A[k],i,i),2);
877	}
878	}
879	}
880
881	/** Prints gsl_matrix nicely to screen.
882	* \param *P Problem at hand
883	* \param *U gsl matrix
884	* \param Num number of rows/columns
885	* \param *msg name of matrix, prepended before entries
886	*/
887	void PrintGSLMatrix(struct Problem P, gsl_matrix U, int Num, const char *msg)
888	{
889	int k,l;
890	fprintf(stderr,"(%i) %s = \n",P->Par.me, msg);
891	for (k=0;k<Num;k++) {
892	for (l=0;l<Num;l++)
893	fprintf(stderr,"%e\t",gsl_matrix_get(U,l,k));
894	fprintf(stderr,"\n");
895	}
896	}
897
898	/** Allocates memory for the DiagonalizationData structure
899	* \param *P Problem at hand
900	* \param DiagData pointer to structure
901	* \param Num number of rows and columns in matrix
902	* \param NumMatrices number of matrices to be simultaneously diagonalized
903	* \param extra number of extra matrices to be passively also diagonalized
904	*/
905	void InitDiagonalization(struct Problem P, struct DiagonalizationData DiagData, int Num, int NumMatrices, int extra)
906	{
907	int i;
908
909	// store integer values
910	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 1\n",P->Par.me);
911	DiagData->Num = Num;
912	DiagData->AllocNum = ceil((double)Num / 2. ) *2;
913	DiagData->NumMatrices = NumMatrices;
914	DiagData->extra = extra;
915
916	// determine communicator and our rank and its size
917	DiagData->comm = DetermineParallelGroupbyGCD(P,DiagData->AllocNum);
918	MPI_Comm_size (*(DiagData->comm), &(DiagData->ProcNum));
919	MPI_Comm_rank (*(DiagData->comm), &(DiagData->ProcRank));
920
921	// allocate memory
922	DiagData->U = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
923	gsl_matrix_set_identity(DiagData->U);
924
925	DiagData->A = (gsl_matrix *) Malloc((DiagData->NumMatrices+DiagData->extra) sizeof(gsl_matrix ), "InitDiagonalization: A");
926	for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++) {
927	DiagData->A[i] = gsl_matrix_calloc(DiagData->AllocNum,DiagData->AllocNum);
928	gsl_matrix_set_zero(DiagData->A[i]);
929	}
930
931	// init merry-go-round array for diagonilzation
932	DiagData->top = (int ) Malloc(sizeof(int)DiagData->AllocNum/2, "InitDiagonalization: top");
933	DiagData->bot = (int ) Malloc(sizeof(int)DiagData->AllocNum/2, "InitDiagonalization: bot");
934	for (i=0;i<DiagData->AllocNum/2;i++) {
935	DiagData->top[i] = 2*i;
936	DiagData->bot[i] = 2*i+1;
937	}
938	/* // print starting values of index generation tables top and bot
939	fprintf(stderr,"top\t");
940	for (k=0;k<AllocNum/2;k++)
941	fprintf(stderr,"%i\t", top[k]);
942	fprintf(stderr,"\n");
943	fprintf(stderr,"bot\t");
944	for (k=0;k<AllocNum/2;k++)
945	fprintf(stderr,"%i\t", bot[k]);
946	fprintf(stderr,"\n");*/
947	}
948
949	/** Frees allocated memory for the DiagonalizationData structure.
950	* \param DiagData pointer to structure
951	*/
952	void FreeDiagonalization(struct DiagonalizationData *DiagData)
953	{
954	int i;
955
956	Free(DiagData->top, "FreeDiagonalization: DiagData->top");
957	Free(DiagData->bot, "FreeDiagonalization: DiagData->bot");
958	gsl_matrix_free(DiagData->U);
959	for (i=0;i<(DiagData->NumMatrices+DiagData->extra);i++)
960	gsl_matrix_free(DiagData->A[i]);
961	Free(DiagData->A, "FreeDiagonalization: DiagData->A");
962	}
963
964	/** Orthogonalizing PsiType#Occupied wave functions for MD.
965	* Orthogonalizes wave functions by finding a unitary transformation such that the density remains unchanged.
966	* To this end, the overlap matrix \f$\langle \Psi_i \| \Psi_j \rangle\f$ is created and diagonalised by Jacobi
967	* transformation (product of rotation matrices). The found transformation matrix is applied to all Psis.
968	* \param *P Problem at hand
969	* \note [Payne] states that GramSchmidt is not well-suited. However, this Jacobi diagonalisation is not well
970	* suited for our CG minimisation either. If one wave functions is changed in course of the minimsation
971	* procedure, in a Gram-Schmidt-Orthogonalisation only this wave function is changed (although it remains under
972	* debate whether subsequent Psis are still orthogonal to this changed wave function!), in a Jacobi-Orthogonali-
973	* stion however at least two if not all wave functions are changed. Thus inhibits our fast way of updating the
974	* density after a minimisation step -- UpdateDensityCalculation(). Thus, this routine can only be used for a
975	* final orthogonalisation of all Psis, after the CG minimisation.
976	*/
977	void OrthogonalizePsis(struct Problem *P)
978	{
979	struct Lattice *Lat = &P->Lat;
980	struct Psis *Psi = &Lat->Psi;
981	struct LatticeLevel *LevS = P->R.LevS;
982	int Num = Psi->NoOfPsis;
983	int i,j,l;
984	struct DiagonalizationData DiagData;
985	double PsiSP;
986
987	InitDiagonalization(P, &DiagData, Num, 1, 0);
988
989	// Calculate Overlap matrix
990	for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
991	CalculateOverlap(P, l, Occupied);
992	for (i=0;i<Num;i++) // fill gsl matrix with overlap values
993	for (j=0;j<Num;j++)
994	gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
995	//if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i\|Psi_J> (before)");
996
997	//if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (before)");
998
999	// diagonalize overlap matrix
1000	Diagonalize(P, &DiagData);
1001
1002	//if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.U,Num,"Transformation (after)");
1003
1004	// apply found unitary transformation to wave functions
1005	UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
1006
1007	// update GramSch stati
1008	for (i=Psi->TypeStartIndex[Occupied];i<Psi->TypeStartIndex[UnOccupied];i++) {
1009	GramSchNormalize(P,LevS,LevS->LPsi->LocalPsi[i], 0.); // calculates square product if 0. is given...
1010	PsiSP = GramSchGetNorm2(P,LevS,LevS->LPsi->LocalPsi[i]);
1011	//if (P->Par.me_comm_ST_Psi == 0) fprintf(stderr,"(%i) PsiSP[%i] = %lg\n", P->Par.me, i, PsiSP);
1012	Psi->LocalPsiStatus[i].PsiGramSchStatus = (int)IsOrthonormal;
1013	}
1014	UpdateGramSchAllPsiStatus(P,Psi);
1015	/*
1016	for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++)
1017	CalculateOverlap(P, l, Occupied);
1018	for (i=0;i<Num;i++) // fill gsl matrix with overlap values
1019	for (j=0;j<Num;j++)
1020	gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
1021	if (P->Call.out[ReadOut]) PrintGSLMatrix(P,DiagData.A[0],Num,"<Psi_i\|Psi_J> (after)");
1022	*/
1023	// free memory
1024	FreeDiagonalization(&DiagData);
1025	}
1026
1027	/** Orthogonalizing PsiType#Occupied wave functions and their time derivatives for MD.
1028	* Ensures that two orthonormality condition are met for Psis:
1029	* \f$\langle \Psi_i \| \Psi_j \rangle = \delta_{ij}\f$
1030	* \f$\langle \dot{\Psi_i} \| \dot{\Psi_j} \rangle = \delta_{ij}\f$
1031	* \param *P Problem at hand
1032	* \note [Payne] states that GramSchmidt is not well-suited, and this stronger
1033	* condition is needed for numerical stability
1034	* \todo create and fill 1stoverlap with finite difference between Psi, OldPsi with R->Deltat
1035	*/
1036	void StrongOrthogonalizePsis(struct Problem *P)
1037	{
1038	struct Lattice *Lat = &P->Lat;
1039	struct Psis *Psi = &Lat->Psi;
1040	int Num = Psi->NoOfPsis;
1041	int i,j,l;
1042	struct DiagonalizationData DiagData;
1043
1044	InitDiagonalization(P, &DiagData, Num, 2, 0);
1045
1046	// Calculate Overlap matrix
1047	for (l=Psi->TypeStartIndex[Occupied]; l<Psi->TypeStartIndex[UnOccupied]; l++) {
1048	CalculateOverlap(P, l, Occupied);
1049	//Calculate1stOverlap(P, l, Occupied);
1050	}
1051	for (i=0;i<Num;i++) // fill gsl matrix with overlap values
1052	for (j=0;j<Num;j++) {
1053	gsl_matrix_set(DiagData.A[0],i,j,Psi->Overlap[i][j]);
1054	//gsl_matrix_set(DiagData.A[1],i,j,Psi->1stOverlap[i][j]);
1055	}
1056
1057	// diagonalize overlap matrix
1058	Diagonalize(P, &DiagData);
1059
1060	// apply found unitary transformation to wave functions
1061	UnitaryTransformationOnWavefunctions(P, DiagData.U, DiagData.AllocNum);
1062
1063	// free memory
1064	FreeDiagonalization(&DiagData);
1065	}
1066
1067	/** Uses either serial or parallel diagonalization.
1068	* \param *P Problem at hand
1069	* \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
1070	* \sa ParallelDiagonalization(), SerialDiagonalization()
1071	*/
1072	void Diagonalize(struct Problem P, struct DiagonalizationData DiagData)
1073	{
1074	if (DiagData->Num != 1) { // one- or multi-process case?
1075	if (((DiagData->AllocNum % 2) == 0) && (DiagData->ProcNum != 1) && ((DiagData->AllocNum / 2) % DiagData->ProcNum == 0)) {
1076	/*
1077	debug(P,"Testing with silly matrix");
1078	ParallelDiagonalization(P, test, Utest, 1, 0, 4, Num, comm, ProcRank, ProcNum, top, bot);
1079	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1080	PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
1081	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1082	PrintGSLMatrix(P, Utest, 4, "Utest (final)");
1083	*/
1084	debug(P,"ParallelDiagonalization");
1085	ParallelDiagonalization(P, DiagData);
1086	} else {/*
1087	debug(P,"Testing with silly matrix");
1088	SerialDiagonalization(P, test, Utest, 1, 0, 4, Num, top, bot);
1089	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1090	PrintGSLMatrix(P, test[0], 4, "test[0] (diagonalized)");
1091	if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1092	PrintGSLMatrix(P, Utest, 4, "Utest (final)");
1093	*/
1094	debug(P,"SerialDiagonalization");
1095	SerialDiagonalization(P, DiagData);
1096	}
1097
1098	//if(P->Call.out[ReadOut]) // && P->Par.me == 0)
1099	//PrintGSLMatrix(P, DiagData->U, DiagData->AllocNum, "U");
1100	}
1101	}
1102
1103	/** Computation of Maximally Localized Wannier Functions.
1104	* Maximally localized functions are prime when evulating a Hamiltonian with
1105	* magnetic fields under periodic boundary conditions, as the common position
1106	* operator is no longer valid. These can be obtained by orbital rotations, which
1107	* are looked for iteratively and gathered in one transformation matrix, to be
1108	* later applied to the set of orbital wave functions.
1109	*
1110	* In order to obtain these, the following algorithm is applied:
1111	* -# Initialize U (identity) as the sought-for transformation matrix
1112	* -# Compute the real symmetric (due to Gamma point symmetry!) matrix elements
1113	* \f$A^{(k)}_{ij} = \langle \phi_i \| A^{(k)} \| \phi_j \rangle\f$ for the six operators
1114	* \f$A^{(k)}\f$
1115	* -# For each pair of indices (i,j) (i<j) do the following:
1116	* -# Compute the 2x2 matrix \f$G = \Re \Bigl ( \sum_k h^H(A^{(k)}) h(A^{(k)}) \Bigr)\f$
1117	* where \f$h(A) = [a_{ii} - a_{jj}, a_{ij} + a_{ji}]\f$
1118	* -# Obtain eigenvalues and eigenvectors of G. Set \f$[x,y]^T\f$ to the eigenvector of G
1119	* corresponding to the greatest eigenvalue, such that \f$x\geq0\f$
1120	* -# Compute the rotation matrix R elements (ii,ij,ji,jj) \f$[c,s,-s,c]\f$ different from the
1121	* identity matrix by \f$r=\sqrt{x^2+y^2}\f$, \f$c = \sqrt{\frac{x+r}{2r}}\f$
1122	* \f$s=\frac{y}{\sqrt{2r(x+r)}}\f$
1123	* -# Perform the similarity operation \f$A^{(k)} \rightarrow R A^{(k)} R\f$
1124	* -# Gather the rotations in \f$U = U R\f$
1125	* -# Compute the total spread \f$\sigma^2_{A^{(k)}}\f$
1126	* -# Compare the change in spread to a desired minimum RunStruct#EpsWannier, if still greater go to step 3.
1127	* -# Apply transformations to the orbital wavefunctions \f$ \| \phi_i \rangle = \sum_j U_{ij} \| \phi_j \rangle\f$
1128	* -# Compute the position of the Wannier centers from diagonal elements of \f$A^{(k)}\f$, store in
1129	* OnePsiElementAddData#WannierCentre
1130	*
1131	* Afterwards, the routine applies the found unitary rotation to the unperturbed group of wave functions.
1132	* Note that hereby additional memory is needed as old and transformed wave functions must be present at the same
1133	* time.
1134	*
1135	* The routine uses parallelization if possible. A parallel Jacobi-Diagonalization is implemented using the index
1136	* generation in music() and shift-columns() such that the evaluated position operator eigenvalue matrices
1137	* may be diagonalized simultaneously and parallely. We use the implementation explained in
1138	* [Golub, Matrix computations, 1989, p451].
1139	*
1140	* \param *P Problem at hand
1141	*/
1142	void ComputeMLWF(struct Problem *P) {
1143	// variables and allocation
1144	struct Lattice *Lat = &P->Lat;
1145	struct Psis *Psi = &Lat->Psi;
1146	int i;
1147	int Num = Psi->NoOfPsis; // is number of occupied plus unoccupied states for rows
1148	double **WannierCentre;
1149	double *WannierSpread;
1150	double spread, spreadSQ;
1151	struct DiagonalizationData DiagData;
1152
1153	if(P->Call.out[StepLeaderOut]) fprintf(stderr,"(%i) Beginning localization of orbitals ...\n",P->Par.me);
1154
1155	InitDiagonalization(P, &DiagData, Num, max_operators, 1);
1156
1157	// STEP 2: Calculate A[k]_ij = V/N \sum_{G1,G2} C^\ast_{l,G1} c_{m,G2} \sum_R A^{(k)}(R) exp(iR(G2-G1))
1158	//debug (P,"Calculatung Variance matrices");
1159	FillHigherOrderRealMomentsMatrices(P, DiagData.AllocNum, DiagData.A);
1160
1161	//debug (P,"Diagonalizing");
1162	Diagonalize(P, &DiagData);
1163
1164	// STEP 6: apply transformation U to all wave functions \sum_i^Num U_ji \| \phi_i \rangle = \| \phi_j^\ast \rangle
1165	//debug (P,"Performing Unitary transformation");
1166	UnitaryTransformationOnWavefunctions(P, DiagData.U, Num);
1167
1168	if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 7: Compute centres and spread printout\n",P->Par.me);
1169	WannierCentre = (double *) Malloc(sizeof(double )Num, "ComputeMLWF: WannierCentre");
1170	WannierSpread = (double ) Malloc(sizeof(double)Num, "ComputeMLWF: WannierSpread");
1171	for (i=0;i<Num;i++)
1172	WannierCentre[i] = (double ) Malloc(sizeof(double)NDIM, "ComputeMLWF: WannierCentre");
1173
1174	//debug (P,"Computing Wannier centres from variance matrix");
1175	ComputeWannierCentresfromVarianceMatrices(P, DiagData.A, &spread, &spreadSQ, WannierCentre, WannierSpread);
1176
1177	// join Wannier orbital to groups with common centres under certain conditions
1178	//debug (P,"Changing Wannier Centres according to CommonWannier");
1179	ChangeWannierCentres(P, Num, WannierCentre, WannierSpread);
1180
1181	// write to file
1182	//debug (P,"Writing spread file");
1183	WriteWannierFile(P, spread, spreadSQ, WannierCentre, WannierSpread);
1184
1185	debug(P,"Free'ing memory");
1186	// free all remaining memory
1187	FreeDiagonalization(&DiagData);
1188	for (i=0;i<Num;i++)
1189	Free(WannierCentre[i], "ComputeMLWF: WannierCentre[i]");
1190	Free(WannierCentre, "ComputeMLWF: WannierCentre");
1191	Free(WannierSpread, "ComputeMLWF: WannierSpread");
1192	}
1193
1194	/** Solves directly for eigenvectors and eigenvalues of a real 2x2 matrix.
1195	* The eigenvalues are determined by \f$\det{(A-\lambda \cdot I)}\f$, where \f$I\f$ is the unit matrix and
1196	* \f$\lambda\f$ an eigenvalue. This leads to a pq-formula which is easily evaluated in the first part.
1197	*
1198	* The eigenvectors are then obtained by solving \f$A-\lambda \cdot I)x = 0\f$ for a given eigenvalue
1199	* \f$\lambda\f$. However, the eigenvector magnitudes are not specified, thus we the equation system is
1200	* still lacking such an equation. We use this fact to set either coordinate arbitrarily to 1 and then
1201	* derive the other by solving the equation system. Finally, the eigenvector is normalized.
1202	* \param *P Problem at hand
1203	* \param *A matrix whose eigenvalues/-vectors are to be found
1204	* \param *eval vector with eigenvalues
1205	* \param *evec matrix with corresponding eigenvectors in columns
1206	*/
1207	#ifdef HAVE_INLINE
1208	inline void EigensolverFor22Matrix(struct Problem P, gsl_matrix A, gsl_vector eval, gsl_matrix evec)
1209	#else
1210	void EigensolverFor22Matrix(struct Problem P, gsl_matrix A, gsl_vector eval, gsl_matrix evec)
1211	#endif
1212	{
1213	double a11,a12,a21,a22;
1214	double ev[2], summand1, summand2, norm;
1215	int i;
1216	// find eigenvalues
1217	a11 = gsl_matrix_get(A,0,0);
1218	a12 = gsl_matrix_get(A,0,1);
1219	a21 = gsl_matrix_get(A,1,0);
1220	a22 = gsl_matrix_get(A,1,1);
1221	summand1 = (a11+a22)/2.;
1222	summand2 = sqrt(summand1summand1 + a12a21 - a11*a22);
1223	ev[0] = summand1 + summand2;
1224	ev[1] = summand1 - summand2;
1225	gsl_vector_set(eval, 0, ev[0]);
1226	gsl_vector_set(eval, 1, ev[1]);
1227	//fprintf (stderr,"(%i) ev1 %lg \t ev2 %lg\n", P->Par.me, ev1, ev2);
1228
1229	// find eigenvectors
1230	for(i=0;i<2;i++) {
1231	if (fabs(ev[i]) < MYEPSILON) {
1232	gsl_matrix_set(evec, 0,i, 0.);
1233	gsl_matrix_set(evec, 1,i, 0.);
1234	} else if (fabs(a22-ev[i]) > MYEPSILON) {
1235	norm = sqrt(11 + (a21a21/(a22-ev[i])/(a22-ev[i])));
1236	gsl_matrix_set(evec, 0,i, 1./norm);
1237	gsl_matrix_set(evec, 1,i, -(a21/(a22-ev[i]))/norm);
1238	//fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
1239	} else if (fabs(a12) > MYEPSILON) {
1240	norm = sqrt(11 + (a11-ev[i])(a11-ev[i])/(a12*a12));
1241	gsl_matrix_set(evec, 0,i, 1./norm);
1242	gsl_matrix_set(evec, 1,i, -((a11-ev[i])/a12)/norm);
1243	//fprintf (stderr,"(%i) evec %i (%lg,%lg)", P->Par.me, i, -(a12/(a11-ev[i]))/norm, 1./norm);
1244	} else {
1245	if (fabs(a11-ev[i]) > MYEPSILON) {
1246	norm = sqrt(11 + (a12a12/(a11-ev[i])/(a11-ev[i])));
1247	gsl_matrix_set(evec, 0,i, -(a12/(a11-ev[i]))/norm);
1248	gsl_matrix_set(evec, 1,i, 1./norm);
1249	//fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
1250	} else if (fabs(a21) > MYEPSILON) {
1251	norm = sqrt(11 + (a22-ev[i])(a22-ev[i])/(a21*a21));
1252	gsl_matrix_set(evec, 0,i, -(a22-ev[i])/a21/norm);
1253	gsl_matrix_set(evec, 1,i, 1./norm);
1254	//fprintf (stderr,"\t evec %i (%lg,%lg)\n", i, -(a12/(a11-ev[i]))/norm, 1./norm);
1255	} else {
1256	//gsl_matrix_set(evec, 0,i, 0.);
1257	//gsl_matrix_set(evec, 1,i, 1.);
1258	fprintf (stderr,"\t evec %i undetermined\n", i);
1259	}
1260	//gsl_matrix_set(evec, 0,0, 1.);
1261	//gsl_matrix_set(evec, 1,0, 0.);
1262	//fprintf (stderr,"(%i) evec1 undetermined", P->Par.me);
1263	}
1264	}
1265	}
1266
1267	/** Calculates sine and cosine values for multiple matrix diagonalization.
1268	* \param *evec eigenvectors in columns
1269	* \param *eval corresponding eigenvalues
1270	* \param *c cosine to be returned
1271	* \param *s sine to be returned
1272	*/
1273	#ifdef HAVE_INLINE
1274	inline void CalculateRotationAnglesFromEigenvalues(gsl_matrix evec, gsl_vector eval, double c, double s)
1275	#else
1276	void CalculateRotationAnglesFromEigenvalues(gsl_matrix evec, gsl_vector eval, double c, double s)
1277	#endif
1278	{
1279	int index;
1280	double x,y,r;
1281
1282	index = gsl_vector_max_index (eval); // get biggest eigenvalue
1283	//fprintf(stderr,"\t1st: %lg\t2nd: %lg --- biggest: %i\n", gsl_vector_get(eval, 0), gsl_vector_get(eval, 1), index);
1284	x = gsl_matrix_get(evec, 0, index);
1285	y = gsl_matrix_get(evec, 1, index);
1286	if (x < 0) { // ensure x>=0 so that rotation angles remain smaller Pi/4
1287	y = -y;
1288	x = -x;
1289	}
1290	//z = gsl_matrix_get(evec, 2, index) * x/fabs(x);
1291	//fprintf(stderr,"\tx %lg\ty %lg\n", x,y);
1292
1293	//fprintf(stderr,"(%i),(%i,%i) STEP 3c\n",P->Par.me,i,j);
1294	// STEP 3c: calculate R = [[c,s^\ast],[-s,c^\ast]]
1295	r = sqrt(xx + yy); // + z*z);
1296	if (fabs(r) > MYEPSILON) {
1297	c = sqrt((x + r) / (2.r));
1298	s = y / sqrt(2.r(x+r)); //, -z / sqrt(2r*(x+r)));
1299	} else {
1300	*c = 1.;
1301	*s = 0.;
1302	}
1303	}
1304
1305	/*
1306	* \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
1307	* \param *U transformation matrix set to unity matrix
1308	* \param NumMatrices number of matrices to be diagonalized simultaneously
1309	* \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
1310	* \param AllocNum number of rows/columns in matrices
1311	* \param Num number of wave functions
1312	* \param ProcRank index in group for this cpu
1313	* \param ProcNum number of cpus in group
1314	* \param *top array with top row indices (merry-go-round)
1315	* \param *bot array with bottom row indices (merry-go-round)
1316	*/
1317
1318	/** Simultaneous diagonalization of matrices with multiple cpus.
1319	* \param *P Problem at hand
1320	* \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
1321	* \note this is slower given one cpu only than SerialDiagonalization()
1322	*/
1323	void ParallelDiagonalization(struct Problem P, struct DiagonalizationData DiagData)
1324	{
1325	struct RunStruct *R =&P->R;
1326	int tagR0, tagR1, tagS0, tagS1;
1327	int iloc, jloc;
1328	double s_all, c_all;
1329	int round, max_rounds;
1330	int start;
1331	int rcounts, rdispls;
1332	double c, s;
1333	int Lsend, Rsend, Lrecv, Rrecv; // where left(right) column is sent to or where it originates from
1334	int left, right; // left or right neighbour for process
1335	double spread = 0., old_spread=0., Spread=0.;
1336	int i,j,k,l,m,u;
1337	int set;
1338	int it_steps; // iteration step counter
1339	double Aloc[DiagData->NumMatrices+1], Uloc; // local columns for one step of A[k]
1340	double Around[DiagData->NumMatrices+1], Uround; // all local columns for one round of A[k]
1341	double Atotal[DiagData->NumMatrices+1], Utotal; // all local columns for one round of A[k]
1342	double a_i, a_j;
1343	gsl_matrix *G;
1344	gsl_vector *h;
1345	gsl_vector *eval;
1346	gsl_matrix *evec;
1347	//gsl_eigen_symmv_workspace *w;
1348
1349	max_rounds = (DiagData->AllocNum / 2)/DiagData->ProcNum; // each process must perform multiple rotations per step of a set
1350	if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) start %i\tstep %i\tmax.rounds %i\n",P->Par.me, DiagData->ProcRank, DiagData->ProcNum, max_rounds);
1351
1352	// allocate column vectors for interchange of columns
1353	debug(P,"allocate column vectors for interchange of columns");
1354	c = (double ) Malloc(sizeof(double)max_rounds, "ComputeMLWF: c");
1355	s = (double ) Malloc(sizeof(double)max_rounds, "ComputeMLWF: s");
1356	c_all = (double ) Malloc(sizeof(double)DiagData->AllocNum/2, "ComputeMLWF: c_all");
1357	s_all = (double ) Malloc(sizeof(double)DiagData->AllocNum/2, "ComputeMLWF: s_all");
1358	rcounts = (int ) Malloc(sizeof(int)DiagData->ProcNum, "ComputeMLWF: rcounts");
1359	rdispls = (int ) Malloc(sizeof(int)DiagData->ProcNum, "ComputeMLWF: rdispls");
1360
1361	// allocate eigenvector stuff
1362	debug(P,"allocate eigenvector stuff");
1363	G = gsl_matrix_calloc (2,2);
1364	h = gsl_vector_alloc (2);
1365	eval = gsl_vector_alloc (2);
1366	evec = gsl_matrix_alloc (2,2);
1367	//w = gsl_eigen_symmv_alloc(2);
1368
1369	// establish communication partners
1370	debug(P,"establish communication partners");
1371	if (DiagData->ProcRank == 0) {
1372	tagS0 = WannierALTag; // left p0 always remains left p0
1373	} else {
1374	tagS0 = (DiagData->ProcRank == DiagData->ProcNum - 1) ? WannierARTag : WannierALTag; // left p_last becomes right p_last
1375	}
1376	tagS1 = (DiagData->ProcRank == 0) ? WannierALTag : WannierARTag; // right p0 always goes into left p1
1377	tagR0 = WannierALTag; //
1378	tagR1 = WannierARTag; // first process
1379	if (DiagData->ProcRank == 0) {
1380	left = DiagData->ProcNum-1;
1381	right = 1;
1382	Lsend = 0;
1383	Rsend = 1;
1384	Lrecv = 0;
1385	Rrecv = 1;
1386	} else if (DiagData->ProcRank == DiagData->ProcNum - 1) {
1387	left = DiagData->ProcRank - 1;
1388	right = 0;
1389	Lsend = DiagData->ProcRank;
1390	Rsend = DiagData->ProcRank - 1;
1391	Lrecv = DiagData->ProcRank - 1;
1392	Rrecv = DiagData->ProcRank;
1393	} else {
1394	left = DiagData->ProcRank - 1;
1395	right = DiagData->ProcRank + 1;
1396	Lsend = DiagData->ProcRank+1;
1397	Rsend = DiagData->ProcRank - 1;
1398	Lrecv = DiagData->ProcRank - 1;
1399	Rrecv = DiagData->ProcRank+1;
1400	}
1401	//if (P->Call.out[ReadOut]) fprintf(stderr,"(%i) left %i\t right %i --- Lsend %i\tRsend%i\tLrecv %i\tRrecv%i\n",P->Par.me, left, right, Lsend, Rsend, Lrecv, Rrecv);
1402
1403	// initialise A_loc
1404	debug(P,"initialise A_loc");
1405	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
1406	//Aloc[k] = (double ) Malloc(sizeof(double)AllocNum*2, "ComputeMLWF: Aloc[k]");
1407	Around[k] = (double ) Malloc(sizeof(double)DiagData->AllocNum2max_rounds, "ComputeMLWF: Around[k]");
1408	Atotal[k] = (double ) Malloc(sizeof(double)DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Atotal[k]");
1409	Aloc[k] = (double *) Malloc(sizeof(double )2max_rounds, "ComputeMLWF: Aloc[k]");
1410	//Around[k] = &Atotal[k][ProcRankAllocNum2*max_rounds];
1411
1412	for (round=0;round<max_rounds;round++) {
1413	Aloc[k][2round] = &Around[k][DiagData->AllocNum(2*round)];
1414	Aloc[k][2round+1] = &Around[k][DiagData->AllocNum(2*round+1)];
1415	for (l=0;l<DiagData->AllocNum;l++) {
1416	Aloc[k][2round][l] = gsl_matrix_get(DiagData->A[k],l,2(DiagData->ProcRank*max_rounds+round));
1417	Aloc[k][2round+1][l] = gsl_matrix_get(DiagData->A[k],l,2(DiagData->ProcRank*max_rounds+round)+1);
1418	//fprintf(stderr,"(%i) (%i, 0/1) A_loc1 %e\tA_loc2 %e\n",P->Par.me, l, Aloc[k][l],Aloc[k][l+AllocNum]);
1419	}
1420	}
1421	}
1422	// initialise U_loc
1423	debug(P,"initialise U_loc");
1424	//Uloc = (double ) Malloc(sizeof(double)AllocNum*2, "ComputeMLWF: Uloc");
1425	Uround = (double ) Malloc(sizeof(double)DiagData->AllocNum2max_rounds, "ComputeMLWF: Uround");
1426	Utotal = (double ) Malloc(sizeof(double)DiagData->AllocNum*DiagData->AllocNum, "ComputeMLWF: Utotal");
1427	Uloc = (double *) Malloc(sizeof(double )2max_rounds, "ComputeMLWF: Uloc");
1428	//Uround = &Utotal[ProcRankAllocNum2*max_rounds];
1429	for (round=0;round<max_rounds;round++) {
1430	Uloc[2round] = &Uround[DiagData->AllocNum(2*round)];
1431	Uloc[2round+1] = &Uround[DiagData->AllocNum(2*round+1)];
1432	for (l=0;l<DiagData->AllocNum;l++) {
1433	Uloc[2round][l] = gsl_matrix_get(DiagData->U,l,2(DiagData->ProcRank*max_rounds+round));
1434	Uloc[2round+1][l] = gsl_matrix_get(DiagData->U,l,2(DiagData->ProcRank*max_rounds+round)+1);
1435	//fprintf(stderr,"(%i) (%i, 0/1) U_loc1 %e\tU_loc2 %e\n",P->Par.me, l, Uloc[l+AllocNum0],Uloc[l+AllocNum1]);
1436	}
1437	}
1438
1439	// now comes the iteration loop
1440	debug(P,"now comes the iteration loop");
1441	it_steps = 0;
1442	do {
1443	it_steps++;
1444	//if (P->Par.me == 0) fprintf(stderr,"(%i) Beginning parallel iteration %i ... ",P->Par.me,it_steps);
1445	for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 staying at its place all the time
1446	//fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
1447	for (round = 0; round < max_rounds;round++) {
1448	start = DiagData->ProcRank * max_rounds + round;
1449	// get indices
1450	i = DiagData->top[start] < DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // minimum of the two indices
1451	iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1;
1452	j = DiagData->top[start] > DiagData->bot[start] ? DiagData->top[start] : DiagData->bot[start]; // maximum of the two indices: thus j > i
1453	jloc = DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
1454	//fprintf(stderr,"(%i) my (%i,%i), loc(%i,%i)\n",P->Par.me, i,j, iloc, jloc);
1455
1456	// calculate rotation angle, i.e. c and s
1457	//fprintf(stderr,"(%i),(%i,%i) calculate rotation angle\n",P->Par.me,i,j);
1458	gsl_matrix_set_zero(G);
1459	for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
1460	// Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
1461	//fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,Aloc[k][2round+iloc][i], Aloc[k][2round+jloc][j],Aloc[k][2round+iloc][i] - Aloc[k][2round+jloc][j]);
1462	//fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,Aloc[k][2round+jloc][i], Aloc[k][2round+iloc][j],Aloc[k][2round+jloc][i] + Aloc[k][2round+iloc][j]);
1463	gsl_vector_set(h, 0, Aloc[k][2round+iloc][i] - Aloc[k][2round+jloc][j]);
1464	gsl_vector_set(h, 1, Aloc[k][2round+jloc][i] + Aloc[k][2round+iloc][j]);
1465
1466	// Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
1467	for (l=0;l<2;l++)
1468	for (m=0;m<2;m++)
1469	gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
1470	}
1471	//fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
1472	// STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
1473	EigensolverFor22Matrix(P,G,eval,evec);
1474	//gsl_eigen_symmv(G, eval, evec, w); // calculates eigenvalues and eigenvectors of G
1475
1476	CalculateRotationAnglesFromEigenvalues(evec, eval, &c[round], &s[round]);
1477	//fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[round],s[round]);
1478
1479	//fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
1480	// V_loc = V_loc * V_small
1481	//debug(P,"apply rotation to local U");
1482	for (l=0;l<DiagData->AllocNum;l++) {
1483	a_i = Uloc[2*round+iloc][l];
1484	a_j = Uloc[2*round+jloc][l];
1485	Uloc[2round+iloc][l] = c[round] a_i + s[round] * a_j;
1486	Uloc[2round+jloc][l] = -s[round] a_i + c[round] * a_j;
1487	}
1488	} // end of round
1489	// circulate the rotation angles
1490	//debug(P,"circulate the rotation angles");
1491	MPI_Allgather(c, max_rounds, MPI_DOUBLE, c_all, max_rounds, MPI_DOUBLE, *(DiagData->comm)); // MPI_Allgather is waaaaay faster than ring circulation
1492	MPI_Allgather(s, max_rounds, MPI_DOUBLE, s_all, max_rounds, MPI_DOUBLE, *(DiagData->comm));
1493	//m = start;
1494	for (l=0;l<DiagData->AllocNum/2;l++) { // for each process
1495	// we have V_small from process k
1496	//debug(P,"Apply V_small from other process");
1497	i = DiagData->top[l] < DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // minimum of the two indices
1498	j = DiagData->top[l] > DiagData->bot[l] ? DiagData->top[l] : DiagData->bot[l]; // maximum of the two indices: thus j > i
1499	iloc = DiagData->top[l] < DiagData->bot[l] ? 0 : 1;
1500	jloc = DiagData->top[l] > DiagData->bot[l] ? 0 : 1;
1501	for (m=0;m<max_rounds;m++) {
1502	//fprintf(stderr,"(%i) %i processes' (%i,%i)\n",P->Par.me, m,i,j);
1503	// apply row rotation to each A[k]
1504	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
1505	//fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+iloc][i],j,Aloc[k][2m+iloc][j]);
1506	//fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+jloc][i],j,Aloc[k][2m+jloc][j]);
1507	a_i = Aloc[k][2*m+iloc][i];
1508	a_j = Aloc[k][2*m+iloc][j];
1509	Aloc[k][2m+iloc][i] = c_all[l] a_i + s_all[l] * a_j;
1510	Aloc[k][2m+iloc][j] = -s_all[l] a_i + c_all[l] * a_j;
1511	a_i = Aloc[k][2*m+jloc][i];
1512	a_j = Aloc[k][2*m+jloc][j];
1513	Aloc[k][2m+jloc][i] = c_all[l] a_i + s_all[l] * a_j;
1514	Aloc[k][2m+jloc][j] = -s_all[l] a_i + c_all[l] * a_j;
1515	//fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+iloc][i],j,Aloc[k][2m+iloc][j]);
1516	//fprintf(stderr,"(%i) A^%i: a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, i, Aloc[k][2m+jloc][i],j,Aloc[k][2m+jloc][j]);
1517	}
1518	}
1519	}
1520	// apply rotation to local operator matrices
1521	// A_loc = A_loc * V_small
1522	//debug(P,"apply rotation to local operator matrices A[k]");
1523	for (m=0;m<max_rounds;m++) {
1524	start = DiagData->ProcRank * max_rounds + m;
1525	iloc = DiagData->top[start] < DiagData->bot[start] ? 0 : 1;
1526	jloc = DiagData->top[start] > DiagData->bot[start] ? 0 : 1;
1527	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// extra for B matrix !
1528	for (l=0;l<DiagData->AllocNum;l++) {
1529	// Columns, i and j belong to this process only!
1530	a_i = Aloc[k][2*m+iloc][l];
1531	a_j = Aloc[k][2*m+jloc][l];
1532	Aloc[k][2m+iloc][l] = c[m] a_i + s[m] * a_j;
1533	Aloc[k][2m+jloc][l] = -s[m] a_i + c[m] * a_j;
1534	//fprintf(stderr,"(%i) A:(k%i) a_i[%i] %e\ta_j[%i] %e\n",P->Par.me, k, l, Aloc[k][2m+iloc][l],l,Aloc[k][2m+jloc][l]);
1535	}
1536	}
1537	}
1538	// Shuffling of these round's columns to prepare next rotation set
1539	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
1540	// extract columns from A
1541	//debug(P,"extract columns from A");
1542	MerryGoRoundColumns(*(DiagData->comm), Aloc[k], DiagData->AllocNum, max_rounds, k, tagS0, tagS1, tagR0, tagR1);
1543
1544	}
1545	// and also for V ...
1546	//debug(P,"extract columns from U");
1547	MerryGoRoundColumns(*(DiagData->comm), Uloc, DiagData->AllocNum, max_rounds, 0, tagS0, tagS1, tagR0, tagR1);
1548
1549
1550	// and merry-go-round for the indices too
1551	//debug(P,"and merry-go-round for the indices too");
1552	MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
1553	}
1554
1555	//fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
1556	// STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
1557	old_spread = Spread;
1558	spread = 0.;
1559	for(k=0;k<DiagData->NumMatrices;k++) { // go through all self-adjoint operators
1560	for (i=0; i < 2*max_rounds; i++) { // go through all wave functions
1561	spread += Aloc[k][i][i+DiagData->ProcRank2max_rounds]Aloc[k][i][i+DiagData->ProcRank2*max_rounds];
1562	//spread += gsl_matrix_get(A[k],i,i)*gsl_matrix_get(A[k],i,i);
1563	}
1564	}
1565	MPI_Allreduce(&spread, &Spread, 1, MPI_DOUBLE, MPI_SUM, *(DiagData->comm));
1566	//Spread = spread;
1567	if (P->Par.me == 0) {
1568	//if(P->Call.out[ReadOut])
1569	// fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,Spread,R->EpsWannier);
1570	//else
1571	//fprintf(stderr,"%2.9e\n",Spread);
1572	}
1573	// STEP 5: check change of variance
1574	} while (fabs(old_spread-Spread) >= R->EpsWannier);
1575	// end of iterative diagonalization loop: We have found our final orthogonal U!
1576
1577	// gather local parts of U into complete matrix
1578	for (l=0;l<DiagData->ProcNum;l++)
1579	rcounts[l] = DiagData->AllocNum;
1580	debug(P,"allgather U");
1581	for (round=0;round<2*max_rounds;round++) {
1582	for (l=0;l<DiagData->ProcNum;l++)
1583	rdispls[l] = (l2max_rounds + round)*DiagData->AllocNum;
1584	MPI_Allgatherv(Uloc[round], DiagData->AllocNum, MPI_DOUBLE, Utotal, rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
1585	}
1586	for (k=0;k<DiagData->AllocNum;k++) {
1587	for(l=0;l<DiagData->AllocNum;l++) {
1588	gsl_matrix_set(DiagData->U,k,l, Utotal[l+k*DiagData->AllocNum]);
1589	}
1590	}
1591
1592	// after one set, gather A[k] from all and calculate spread
1593	for (l=0;l<DiagData->ProcNum;l++)
1594	rcounts[l] = DiagData->AllocNum;
1595	debug(P,"gather A[k] for spread");
1596	for (u=0;u<DiagData->NumMatrices+DiagData->extra;u++) {// extra for B matrix !
1597	debug(P,"A[k] all gather");
1598	for (round=0;round<2*max_rounds;round++) {
1599	for (l=0;l<DiagData->ProcNum;l++)
1600	rdispls[l] = (l2max_rounds + round)*DiagData->AllocNum;
1601	MPI_Allgatherv(Aloc[u][round], DiagData->AllocNum, MPI_DOUBLE, Atotal[u], rcounts, rdispls, MPI_DOUBLE, *(DiagData->comm));
1602	}
1603	for (k=0;k<DiagData->AllocNum;k++) {
1604	for(l=0;l<DiagData->AllocNum;l++) {
1605	gsl_matrix_set(DiagData->A[u],k,l, Atotal[u][l+k*DiagData->AllocNum]);
1606	}
1607	}
1608	}
1609
1610	// free eigenvector stuff
1611	gsl_vector_free(h);
1612	gsl_matrix_free(G);
1613	//gsl_eigen_symmv_free(w);
1614	gsl_vector_free(eval);
1615	gsl_matrix_free(evec);
1616
1617	// Free column vectors
1618	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {
1619	Free(Atotal[k], "ParallelDiagonalization: Atotal[k]");
1620	Free(Around[k], "ParallelDiagonalization: Around[k]");
1621	}
1622	Free(Uround, "ParallelDiagonalization: Uround");
1623	Free(Utotal, "ParallelDiagonalization: Utotal");
1624	Free(c_all, "ParallelDiagonalization: c_all");
1625	Free(s_all, "ParallelDiagonalization: s_all");
1626	Free(c, "ParallelDiagonalization: c");
1627	Free(s, "ParallelDiagonalization: s");
1628	Free(rcounts, "ParallelDiagonalization: rcounts");
1629	Free(rdispls, "ParallelDiagonalization: rdispls");
1630	}
1631	/*
1632	* \param **A matrices (pointer array with \a NumMatrices entries) to be diagonalized
1633	* \param *U transformation matrix set to unity matrix
1634	* \param NumMatrices number of matrices to be diagonalized
1635	* \param extra number of additional matrices the rotation is applied to however which actively diagonalized (follow in \a **A)
1636	* \param AllocNum number of rows/columns in matrices
1637	* \param Num number of wave functions
1638	* \param *top array with top row indices (merry-go-round)
1639	* \param *bot array with bottom row indices (merry-go-round)
1640	*/
1641
1642	/** Simultaneous Diagonalization of variances matrices with one cpu.
1643	* \param *P Problem at hand
1644	* \param *DiagData pointer to structure DiagonalizationData containing necessary information for diagonalization
1645	* \note this is faster given one cpu only than ParallelDiagonalization()
1646	*/
1647	void SerialDiagonalization(struct Problem P, struct DiagonalizationData DiagData)
1648	{
1649	struct RunStruct *R = &P->R;
1650	gsl_matrix *G;
1651	gsl_vector *h;
1652	gsl_vector *eval;
1653	gsl_matrix *evec;
1654	//gsl_eigen_symmv_workspace *w;
1655	double c,s,a_i,a_j;
1656	int it_steps, set, ProcRank;
1657	int i,j,k,l,m;
1658	double spread = 0., old_spread = 0.;\
1659
1660	// allocate eigenvector stuff
1661	debug(P,"allocate eigenvector stuff");
1662	G = gsl_matrix_calloc (2,2);
1663	h = gsl_vector_alloc (2);
1664	eval = gsl_vector_alloc (2);
1665	evec = gsl_matrix_alloc (2,2);
1666	//w = gsl_eigen_symmv_alloc(2);
1667
1668	c = (double *) Malloc(sizeof(double), "ComputeMLWF: c");
1669	s = (double *) Malloc(sizeof(double), "ComputeMLWF: s");
1670	debug(P,"now comes the iteration loop");
1671	it_steps=0;
1672	do {
1673	it_steps++;
1674	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 3: Iteratively maximize negative spread part\n",P->Par.me);
1675	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Beginning iteration %i ... ",P->Par.me,it_steps);
1676	for (set=0; set < DiagData->AllocNum-1; set++) { // one column less due to column 0 stating at its place all the time
1677	//fprintf(stderr,"(%i) Beginning rotation set %i ...\n",P->Par.me,set);
1678	// STEP 3: for all index pairs 0<= i<j <AllocNum
1679	for (ProcRank=0;ProcRank<DiagData->AllocNum/2;ProcRank++) {
1680	// get indices
1681	i = DiagData->top[ProcRank] < DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // minimum of the two indices
1682	j = DiagData->top[ProcRank] > DiagData->bot[ProcRank] ? DiagData->top[ProcRank] : DiagData->bot[ProcRank]; // maximum of the two indices: thus j > i
1683	//fprintf(stderr,"(%i),(%i,%i) STEP 3a\n",P->Par.me,i,j);
1684	// STEP 3a: Calculate G
1685	gsl_matrix_set_zero(G);
1686
1687	for (k=0;k<DiagData->NumMatrices;k++) { // go through all operators ...
1688	// Calculate vector h(a) = [a_ii - a_jj, a_ij + a_ji, i(a_ji - a_ij)]
1689	//fprintf(stderr,"(%i) k%i [a_ii - a_jj] = %e - %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,i), gsl_matrix_get(DiagData->A[k],j,j),gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));
1690	//fprintf(stderr,"(%i) k%i [a_ij + a_ji] = %e + %e = %e\n",P->Par.me, k,gsl_matrix_get(DiagData->A[k],i,j), gsl_matrix_get(DiagData->A[k],j,i),gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));
1691	gsl_vector_set(h, 0, gsl_matrix_get(DiagData->A[k],i,i) - gsl_matrix_get(DiagData->A[k],j,j));
1692	gsl_vector_set(h, 1, gsl_matrix_get(DiagData->A[k],i,j) + gsl_matrix_get(DiagData->A[k],j,i));
1693	//gsl_vector_complex_set(h, 2, gsl_complex_mul_imag(gsl_complex_add(gsl_matrix_complex_get(A[k],j,i), gsl_matrix_complex_get(A[k],i,j)),1));
1694
1695	// Calculate G = Re[ \sum_k h^H (A^{(k)}) h(A^{(k)}) ]
1696	for (l=0;l<2;l++)
1697	for (m=0;m<2;m++)
1698	gsl_matrix_set(G,l,m, gsl_vector_get(h,l) * gsl_vector_get(h,m) + gsl_matrix_get(G,l,m));
1699
1700	//PrintGSLMatrix(P,G,2, "G");
1701	}
1702
1703	//fprintf(stderr,"(%i),(%i,%i) STEP 3b\n",P->Par.me,i,j);
1704	// STEP 3b: retrieve eigenvector which belongs to greatest eigenvalue of G
1705	EigensolverFor22Matrix(P,G,eval,evec);
1706	//gsl_eigen_symmv(G, eval, evec, w); // calculates eigenvalues and eigenvectors of G
1707
1708	//PrintGSLMatrix(P, evec, 2, "Eigenvectors");
1709	CalculateRotationAnglesFromEigenvalues(evec, eval, c, s);
1710	//fprintf(stderr,"(%i),(%i,%i) COS %e\t SIN %e\n",P->Par.me,i,j,c[0],s[0]);
1711
1712	//fprintf(stderr,"(%i),(%i,%i) STEP 3d\n",P->Par.me,i,j);
1713	// STEP 3d: apply rotation R to rows and columns of A^{(k)}
1714	for (k=0;k<DiagData->NumMatrices+DiagData->extra;k++) {// one extra for B matrix !
1715	//PrintGSLMatrix(P,A[k],AllocNum, "A (before rot)");
1716	for (l=0;l<DiagData->AllocNum;l++) {
1717	// Rows
1718	a_i = gsl_matrix_get(DiagData->A[k],i,l);
1719	a_j = gsl_matrix_get(DiagData->A[k],j,l);
1720	gsl_matrix_set(DiagData->A[k], i, l, c[0] * a_i + s[0] * a_j);
1721	gsl_matrix_set(DiagData->A[k], j, l, -s[0] * a_i + c[0] * a_j);
1722	}
1723	for (l=0;l<DiagData->AllocNum;l++) {
1724	// Columns
1725	a_i = gsl_matrix_get(DiagData->A[k],l,i);
1726	a_j = gsl_matrix_get(DiagData->A[k],l,j);
1727	gsl_matrix_set(DiagData->A[k], l, i, c[0] * a_i + s[0] * a_j);
1728	gsl_matrix_set(DiagData->A[k], l, j, -s[0] * a_i + c[0] * a_j);
1729	}
1730	//PrintGSLMatrix(P,A[k],AllocNum, "A (after rot)");
1731	}
1732	//fprintf(stderr,"(%i),(%i,%i) STEP 3e\n",P->Par.me,i,j);
1733	//PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (before rot)");
1734	// STEP 3e: apply U = R*U
1735	for (l=0;l<DiagData->AllocNum;l++) {
1736	a_i = gsl_matrix_get(DiagData->U,i,l);
1737	a_j = gsl_matrix_get(DiagData->U,j,l);
1738	gsl_matrix_set(DiagData->U, i, l, c[0] * a_i + s[0] * a_j);
1739	gsl_matrix_set(DiagData->U, j, l, -s[0] * a_i + c[0] * a_j);
1740	}
1741	//PrintGSLMatrix(P,DiagData->U,DiagData->AllocNum, "U (after rot)");
1742	}
1743	// and merry-go-round for the indices too
1744	//debug(P,"and merry-go-round for the indices too");
1745	if (DiagData->AllocNum > 2) MerryGoRoundIndices(DiagData->top, DiagData->bot, DiagData->AllocNum/2);
1746	}
1747
1748	//if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) STEP 4\n",P->Par.me);
1749	// STEP 4: calculate new variance: \sum_{ik} (A^{(k)}_ii)^2
1750	old_spread = spread;
1751	spread = 0.;
1752	for(k=0;k<DiagData->NumMatrices;k++) { // go through all self-adjoint operators
1753	for (i=0; i < DiagData->AllocNum; i++) { // go through all wave functions
1754	spread += pow(gsl_matrix_get(DiagData->A[k],i,i),2);
1755	}
1756	}
1757	if(P->Par.me == 0) {
1758	//if(P->Call.out[ReadOut])
1759	// fprintf(stderr,"(%i) STEP 5: %2.9e - %2.9e <= %lg ?\n",P->Par.me,old_spread,spread,R->EpsWannier);
1760	//else
1761	//fprintf(stderr,"%2.9e\n",spread);
1762	}
1763	// STEP 5: check change of variance
1764	} while (fabs(old_spread-spread) >= R->EpsWannier);
1765	// end of iterative diagonalization loop: We have found our final orthogonal U!
1766	Free(c, "SerialDiagonalization: c");
1767	Free(s, "SerialDiagonalization: s");
1768	// free eigenvector stuff
1769	gsl_vector_free(h);
1770	gsl_matrix_free(G);
1771	//gsl_eigen_symmv_free(w);
1772	gsl_vector_free(eval);
1773	gsl_matrix_free(evec);
1774	}
1775
1776	/** Writes the wannier centres and spread to file.
1777	* Also puts centres from array into OnePsiElementAddData structure.
1778	* \param *P Problem at hand
1779	* \param old_spread first term of wannier spread
1780	* \param spread second term of wannier spread
1781	* \param **WannierCentre 2D array (NDIM, \a Num) with wannier centres
1782	* \param *WannierSpread array with wannier spread per wave function
1783	*/
1784	void WriteWannierFile(struct Problem P, double spread, double old_spread, double WannierCentre, double WannierSpread)
1785	{
1786	struct FileData *F = &P->Files;
1787	struct RunStruct *R = &P->R;
1788	struct Lattice *Lat = &P->Lat;
1789	struct Psis *Psi = &Lat->Psi;
1790	struct OnePsiElement *OnePsiA;
1791	char spin[12], suffix[18];
1792	int i,j,l;
1793
1794	if(P->Call.out[ReadOut]) fprintf(stderr,"(%i) Spread printout\n", P->Par.me);
1795
1796	switch (Lat->Psi.PsiST) {
1797	case SpinDouble:
1798	strcpy(suffix,".spread.csv");
1799	strcpy(spin,"SpinDouble");
1800	break;
1801	case SpinUp:
1802	strcpy(suffix,".spread_up.csv");
1803	strcpy(spin,"SpinUp");
1804	break;
1805	case SpinDown:
1806	strcpy(suffix,".spread_down.csv");
1807	strcpy(spin,"SpinDown");
1808	break;
1809	}
1810	if (P->Par.me_comm_ST == 0) {
1811	if (R->LevSNo == Lat->MaxLevel-1) // open freshly if first level
1812	OpenFile(P, &F->SpreadFile, suffix, "w", P->Call.out[ReadOut]); // only open on starting level
1813	else if (F->SpreadFile == NULL) // re-open if not first level and not opened yet (or closed from ParseWannierFile)
1814	OpenFile(P, &F->SpreadFile, suffix, "a", P->Call.out[ReadOut]); // only open on starting level
1815	if (F->SpreadFile == NULL) {
1816	Error(SomeError,"WriteWannierFile: Error opening Wannier File!\n");
1817	} else {
1818	fprintf(F->SpreadFile,"#===== W A N N I E R C E N T R E S for Level %d of type %s ========================\n", R->LevSNo, spin);
1819	fprintf(F->SpreadFile,"# Orbital+Level\tx\ty\tz\tSpread\n");
1820	}
1821	}
1822
1823	// put (new) WannierCentres into local ones and into file
1824	i=-1;
1825	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
1826	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
1827	if (OnePsiA->PsiType == type) { // drop all but occupied ones
1828	i++; // increase l if it is occupied wave function
1829	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) {// is this local?
1830	for (j=0;j<NDIM;j++)
1831	Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[i][j];
1832	}
1833	if (P->Par.me_comm_ST == 0)
1834	fprintf(F->SpreadFile,"Psi%d_Lev%d\t%lg\t%lg\t%lg\t%lg\n", Psi->AllPsiStatus[i].MyGlobalNo, R->LevSNo, WannierCentre[i][0], WannierCentre[i][1], WannierCentre[i][2], WannierSpread[i]);
1835	}
1836	}
1837	if (P->Par.me_comm_ST == 0) {
1838	fprintf(F->SpreadFile,"\n#Matrix traces\tB_ii\tA_ii^2\tTotal (B_ii - A_ii^2)\n");
1839	fprintf(F->SpreadFile,"TotalSpread_L%d\t%lg\t%lg\t%lg\n\n",R->LevSNo, old_spread, spread, old_spread - spread);
1840	fflush(F->SpreadFile);
1841	}
1842	}
1843
1844
1845	/** Parses the spread file and puts values into OnePsiElementAddData#WannierCentre.
1846	* \param *P Problem at hand
1847	* \return 1 - success, 0 - failure
1848	*/
1849	int ParseWannierFile(struct Problem *P)
1850	{
1851	struct Lattice *Lat = &P->Lat;
1852	struct RunStruct *R = &P->R;
1853	struct Psis *Psi = &Lat->Psi;
1854	struct OnePsiElement *OnePsiA;
1855	int i,l,j, msglen;
1856	FILE *SpreadFile;
1857	char tagname[255];
1858	char suffix[18];
1859	double WannierCentre[NDIM+1]; // combined centre and spread
1860	MPI_Status status;
1861	int signal = 0; // 1 - ok, 0 - error
1862
1863	switch (Lat->Psi.PsiST) {
1864	case SpinDouble:
1865	strcpy(suffix,".spread.csv");
1866	break;
1867	case SpinUp:
1868	strcpy(suffix,".spread_up.csv");
1869	break;
1870	case SpinDown:
1871	strcpy(suffix,".spread_down.csv");
1872	break;
1873	}
1874	if (P->Call.out[NormalOut]) fprintf(stderr,"(%i) Parsing Wannier Centres from file ... \n", P->Par.me);
1875
1876	if (P->Par.me_comm_ST == 0) {
1877	if(!OpenFile(P, &SpreadFile, suffix, "r", P->Call.out[ReadOut])) { // check if file exists
1878	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1879	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1880	return 0;
1881	//Error(SomeError,"ParseWannierFile: Opening failed\n");
1882	}
1883	signal = 1;
1884	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1885	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1886	} else {
1887	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1888	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1889	if (signal == 0)
1890	return 0;
1891	}
1892	i=-1;
1893	for (l=0; l < Psi->MaxPsiOfType+P->Par.Max_me_comm_ST_PsiT; l++) { // go through all wave functions
1894	OnePsiA = &Psi->AllPsiStatus[l]; // grab OnePsiA
1895	if (OnePsiA->PsiType == type) { // drop all but occupied ones
1896	i++; // increase l if it is occupied wave function
1897	if (P->Par.me_comm_ST == 0) { // only process 0 may access the spread file
1898	sprintf(tagname,"Psi%d_Lev%d",i,R->LevSNo);
1899	signal = 0;
1900	if (!ParseForParameter(0,SpreadFile,tagname,0,3,1,row_double,WannierCentre,1,optional)) {
1901	//Error(SomeError,"ParseWannierFile: Parsing WannierCentre failed");
1902	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1903	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1904	return 0;
1905	}
1906	if (!ParseForParameter(0,SpreadFile,tagname,0,4,1,double_type,&WannierCentre[NDIM],1,optional)) {
1907	//Error(SomeError,"ParseWannierFile: Parsing WannierSpread failed");
1908	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1909	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1910	return 0;
1911	}
1912	signal = 1;
1913	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1914	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1915	} else {
1916	if (MPI_Bcast(&signal,1,MPI_INT,0,P->Par.comm_ST) != MPI_SUCCESS)
1917	Error(SomeError,"ParseWannierFile: Bcast of signal failed\n");
1918	if (signal == 0)
1919	return 0;
1920	}
1921	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) { // is this Psi local?
1922	if ((P->Par.me_comm_ST != 0) && (P->Par.me_comm_ST_Psi == 0)) { // if they don't belong to process 0 and we are a leader of a Psi group, receive 'em
1923	if (MPI_Recv(WannierCentre, NDIM+1, MPI_DOUBLE, 0, ParseWannierTag, P->Par.comm_ST_PsiT, &status) != MPI_SUCCESS)
1924	Error(SomeError,"ParseWannierFile: MPI_Recv of WannierCentre/Spread from process 0 failed");
1925	//return 0;
1926	MPI_Get_count(&status, MPI_DOUBLE, &msglen);
1927	if (msglen != NDIM+1)
1928	Error(SomeError,"ParseWannierFile: MPI_Recv of WannierCentre/Spread from process 0 failed due to wrong item count");
1929	//return 0;
1930	}
1931	if (MPI_Bcast(WannierCentre, NDIM+1, MPI_DOUBLE, 0, P->Par.comm_ST_Psi) != MPI_SUCCESS) // Bcast to all processes of the Psi group from leader
1932	Error(SomeError,"ParseWannierFile: MPI_Bcast of WannierCentre/Spread to sub process in Psi group failed");
1933	//return 0;
1934	// and store 'em (for all who have this Psi local)
1935	if ((P->Par.me == 0) && P->Call.out[ValueOut]) fprintf(stderr,"(%i) Psi %i, L %i: (x,y,z) = (%lg, %lg, %lg), Spread %lg\n",P->Par.me,i, R->LevSNo, WannierCentre[0], WannierCentre[1], WannierCentre[2], WannierCentre[NDIM]);
1936	for (j=0;j<NDIM;j++) Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[j] = WannierCentre[j];
1937	Psi->AddData[OnePsiA->MyLocalNo].WannierSpread = WannierCentre[NDIM];
1938	//if (P->Par.me ==0 && P->Call.out[ValueOut]) fprintf(stderr,"(%i) %s\t%lg\t%lg\t%lg\t\t%lg\n",P->Par.me, tagname,Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[0],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[1],Psi->AddData[OnePsiA->MyLocalNo].WannierCentre[2],Psi->AddData[OnePsiA->MyLocalNo].WannierSpread);
1939	} else if (P->Par.me_comm_ST == 0) { // if they are not local, yet we are process 0, send 'em to leader of its Psi group
1940	if (MPI_Send(WannierCentre, NDIM+1, MPI_DOUBLE, OnePsiA->my_color_comm_ST_Psi, ParseWannierTag, P->Par.comm_ST_PsiT) != MPI_SUCCESS)
1941	Error(SomeError,"ParseWannierFile: MPI_Send of WannierCentre/Spread to process 0 of owning Psi group failed");
1942	//return 0;
1943	}
1944	}
1945	}
1946	if ((SpreadFile != NULL) && (P->Par.me_comm_ST == 0))
1947	fclose(SpreadFile);
1948	//fprintf(stderr,"(%i) Parsing Wannier files succeeded!\n", P->Par.me);
1949	return 1;
1950	}
1951
1952	/** Calculates the spread of orbital \a i.
1953	* Stored in OnePsiElementAddData#WannierSpread.
1954	* \param *P Problem at hand
1955	* \param i i-th wave function (note "extra" ones are not counted!)
1956	* \return spread \f$\sigma^2_{A^{(k)}}\f$
1957	*/
1958	double CalculateSpread(struct Problem *P, int i) {
1959	struct Lattice *Lat = &P->Lat;
1960	struct RunStruct *R = &P->R;
1961	struct Psis *Psi = &Lat->Psi;
1962	struct LatticeLevel *Lev0 = R->Lev0;
1963	struct LatticeLevel *LevS = R->LevS;
1964	struct Density *Dens0 = Lev0->Dens;
1965	struct fft_plan_3d *plan = Lat->plan;
1966	fftw_complex *PsiC = Dens0->DensityCArray[ActualPsiDensity];
1967	fftw_real PsiCR = (fftw_real )PsiC;
1968	fftw_complex *work = Dens0->DensityCArray[Temp2Density];
1969	fftw_real **HGcR = &Dens0->DensityArray[HGcDensity]; // use HGcDensity, 4x Gap..Density, TempDensity as a storage array
1970	fftw_complex HGcRC = (fftw_complex)HGcR;
1971	fftw_complex **HGcR2C = &Dens0->DensityCArray[HGcDensity]; // use HGcDensity, 4x Gap..Density, TempDensity as an array
1972	fftw_real HGcR2 = (fftw_real)HGcR2C;
1973	MPI_Status status;
1974	struct OnePsiElement OnePsiA, LOnePsiA;
1975	int ElementSize = (sizeof(fftw_complex) / sizeof(double)), RecvSource;
1976	fftw_complex *LPsiDatA=NULL;
1977	int k,n[NDIM],n0, *N,N0, g, p, iS, i0, Index;
1978	N0 = LevS->Plan0.plan->local_nx;
1979	N = LevS->Plan0.plan->N;
1980	const int NUpx = LevS->NUp[0];
1981	const int NUpy = LevS->NUp[1];
1982	const int NUpz = LevS->NUp[2];
1983	double a_ij, b_ij, A_ij, B_ij;
1984	double tmp, tmp2, spread = 0;
1985	double cos_lookup, sin_lookup;
1986
1987	b_ij = 0;
1988
1989	CreateSinCosLookupTable(&cos_lookup,&sin_lookup,N);
1990
1991	// fill matrices
1992	OnePsiA = &Psi->AllPsiStatus[i]; // grab the desired OnePsiA
1993	if (OnePsiA->PsiType != Extra) { // drop if extra one
1994	if (OnePsiA->my_color_comm_ST_Psi == P->Par.my_color_comm_ST_Psi) // local?
1995	LOnePsiA = &Psi->LocalPsiStatus[OnePsiA->MyLocalNo];
1996	else
1997	LOnePsiA = NULL;
1998	if (LOnePsiA == NULL) { // if it's not local ... receive it from respective process into TempPsi
1999	RecvSource = OnePsiA->my_color_comm_ST_Psi;
2000	MPI_Recv( LevS->LPsi->TempPsi, LevS->MaxG*ElementSize, MPI_DOUBLE, RecvSource, WannierTag1, P->Par.comm_ST_PsiT, &status );
2001	LPsiDatA=LevS->LPsi->TempPsi;
2002	} else { // .. otherwise send it to all other processes (Max_me... - 1)
2003	for (p=0;p<P->Par.Max_me_comm_ST_PsiT;p++)
2004	if (p != OnePsiA->my_color_comm_ST_Psi)
2005	MPI_Send( LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo], LevS->MaxG*ElementSize, MPI_DOUBLE, p, WannierTag1, P->Par.comm_ST_PsiT);
2006	LPsiDatA=LevS->LPsi->LocalPsi[OnePsiA->MyLocalNo];
2007	} // LPsiDatA is now set to the coefficients of OnePsi either stored or MPI_Received
2008
2009	CalculateOneDensityR(Lat, LevS, Dens0, LPsiDatA, Dens0->DensityArray[ActualDensity], R->FactorDensityR, 1);
2010	// note: factor is not used when storing result in DensityCArray[ActualPsiDensity] in CalculateOneDensityR()!
2011	for (n0=0;n0<N0;n0++)
2012	for (n[1]=0;n[1]<N[1];n[1]++)
2013	for (n[2]=0;n[2]<N[2];n[2]++) {
2014	i0 = n[2]NUpz+N[2]NUpz(n[1]NUpy+N[1]NUpyn0*NUpx);
2015	iS = n[2]+N[2](n[1]+N[1]n0);
2016	n[0] = n0 + LevS->Plan0.plan->start_nx;
2017	for (k=0;k<max_operators;k+=2) {
2018	tmp = 2PI/(double)(N[k/2])(double)(n[k/2]);
2019	tmp2 = PsiCR[i0] /LevS->MaxN;
2020	// check lookup
2021	if ((fabs(cos(tmp) - cos_lookup[k/2][n[k/2]]) > MYEPSILON) \|\| (fabs(sin(tmp) - sin_lookup[k/2][n[k/2]]) > MYEPSILON)) {
2022	fprintf(stderr,"(%i) (cos) %2.15e against (lookup) %2.15e,\t(sin) %2.15e against (lookup) %2.15e\n", P->Par.me, cos(tmp), cos_lookup[k/2][n[k/2]],sin(tmp),sin_lookup[k/2][n[k/2]]);
2023	Error(SomeError, "Lookup table does not match real value!");
2024	}
2025	// HGcR[k][iS] = cos_lookup[k/2][n[k/2]] * tmp2; /* Matrix Vector Mult */
2026	// HGcR2[k][iS] = cos_lookup[k/2][n[k/2]] * HGcR[k][iS]; /* Matrix Vector Mult */
2027	// HGcR[k+1][iS] = sin_lookup[k/2][n[k/2]] * tmp2; /* Matrix Vector Mult */
2028	// HGcR2[k+1][iS] = sin_lookup[k/2][n[k/2]] * HGcR[k+1][iS]; /* Matrix Vector Mult */
2029	HGcR[k][iS] = cos(tmp) * tmp2; /* Matrix Vector Mult */
2030	HGcR2[k][iS] = pow(cos(tmp),2) * tmp2; /* Matrix Vector Mult */
2031	HGcR[k+1][iS] = sin(tmp) * tmp2; /* Matrix Vector Mult */
2032	HGcR2[k+1][iS] = pow(sin(tmp),2) * tmp2; /* Matrix Vector Mult */
2033	}
2034	}
2035	for (k=0;k<max_operators;k++) {
2036	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcRC[k], work);
2037	fft_3d_real_to_complex(plan, LevS->LevelNo, FFTNF1, HGcR2C[k], work);
2038	}
2039
2040
2041	for (k=0;k<max_operators;k++) {
2042	a_ij = 0;
2043	//fprintf(stderr,"(%i),(%i,%i): A[%i]: multiplying with \\phi_B\n",P->Par.me, l,m,k);
2044	// sum directly in a_ij and b_ij the two desired terms
2045	g=0;
2046	if (LevS->GArray[0].GSq == 0.0) {
2047	Index = LevS->GArray[g].Index;
2048	a_ij += (LPsiDatA[0].reHGcRC[k][Index].re + LPsiDatA[0].imHGcRC[k][Index].im);
2049	b_ij += (LPsiDatA[0].reHGcR2C[k][Index].re + LPsiDatA[0].imHGcR2C[k][Index].im);
2050	g++;
2051	}
2052	for (; g < LevS->MaxG; g++) {
2053	Index = LevS->GArray[g].Index;
2054	a_ij += 2(LPsiDatA[g].reHGcRC[k][Index].re + LPsiDatA[g].im*HGcRC[k][Index].im);
2055	b_ij += 2(LPsiDatA[g].reHGcR2C[k][Index].re + LPsiDatA[g].im*HGcR2C[k][Index].im);
2056	} // due to the symmetry the resulting matrix element is real and symmetric in (i,i) ! (complex multiplication simplifies ...)
2057	MPI_Allreduce ( &a_ij, &A_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
2058	spread += pow(A_ij,2);
2059	}
2060	}
2061	MPI_Allreduce ( &b_ij, &B_ij, 1, MPI_DOUBLE, MPI_SUM, P->Par.comm_ST_Psi);
2062
2063	// store spread in OnePsiElementAdd
2064	Psi->AddData[i].WannierSpread = B_ij - spread;
2065
2066	FreeSinCosLookupTable(cos_lookup,sin_lookup);
2067
2068	return (B_ij - spread);
2069	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: pcp/src/wannier.c@ 4f1369

Download in other formats: