// // csgrad.cc // // Copyright (C) 1996 Limit Point Systems, Inc. // // Author: Ida Nielsen // Maintainer: LPS // // This file is part of the SC Toolkit. // // The SC Toolkit is free software; you can redistribute it and/or modify // it under the terms of the GNU Library General Public License as published by // the Free Software Foundation; either version 2, or (at your option) // any later version. // // The SC Toolkit is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Library General Public License for more details. // // You should have received a copy of the GNU Library General Public License // along with the SC Toolkit; see the file COPYING.LIB. If not, write to // the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. // // The U.S. Government is granted a limited license as per AL 91-7. // #include #include #include #include #include #include #include #include #include #include

#include #include #include #include #include #include #include #include #include using namespace std;
using namespace sc;

// Compile-time switches for threading.
// When SINGLE_THREAD_E12 is nonzero, compute_cs_grad() calls run() directly on
// each CSGradErep12Qtr object right after adding it to the thread group,
// instead of calling start_threads()/wait_threads() on the thread group.
// SINGLE_THREAD_QBT34 does the same for the CSGrad34Qbtr objects.
// NOTE(review): no use of SINGLE_THREAD_S2PDM is visible in this part of the
// file -- presumably the analogous switch for another threaded step; confirm.
#define SINGLE_THREAD_E12   0
#define SINGLE_THREAD_QBT34 0
#define SINGLE_THREAD_S2PDM 0

// Compile-time switches for optional debug output in compute_cs_grad().
// PRINT2Q and PRINT3Q guard node-0-only blocks placed after the second and
// third quarter transformations, respectively.
// NOTE(review): the guarded use of PRINT4Q is not visible in this view --
// presumably the matching switch for the fourth quarter transformation.
#define PRINT2Q 0
#define PRINT3Q 0
#define PRINT4Q 0
// PRINT_BIGGEST_INTS is not defined in this file section; when it evaluates
// to nonzero, a file-scope BiggestContribs object is created here and later
// combined across nodes in compute_cs_grad().
#if PRINT_BIGGEST_INTS
BiggestContribs biggest_ints_1(4,40);
#endif

// NOTE(review): WRITE_DOUBLES appears to enable a debugging block in
// compute_cs_grad() that writes data to a ".mp2" file and aborts for
// unsupported cases -- confirm against the guarded code.
#define WRITE_DOUBLES 0

// Forward declarations of file-local helpers; their definitions are not
// visible in this part of the file.
// NOTE(review): judging by the names and signatures, these sum (across
// nodes), zero, and accumulate n1 x n2 arrays of gradient values -- confirm
// against the definitions.
static void sum_gradients(const Ref & msg, double **f, int n1, int n2);
static void zero_gradients(double **f, int n1, int n2);
static void accum_gradients(double **g, double **f, int n1, int n2);

// NOTE(review): no use of PRINT1Q is visible in this view -- presumably the
// debug-print switch for the first quarter transformation.
#define PRINT1Q 0

#if PRINT_CONTRIB
// Exchange the values held by the two integer references.
// Calling it with both arguments bound to the same object leaves the
// value unchanged.
static void
sw(int&i,int&j)
{
  const int old_j = j;
  j = i;
  i = old_j;
}

// Debug helper: print a contribution of +tmpval labelled with num and of
// -tmpval labelled with onum, for the index sets (P Q R S) and (p q r s).
//
// Each of the two contributions is printed twice:
//   1. prefixed with "noncanon: ", using the indices exactly as passed in;
//   2. without prefix, after permuting the indices so that
//      p >= q, r >= s, and the pair (p,q) is not ordered before (r,s).
// The upper-case indices are always permuted together with their
// lower-case partners.  The arguments are taken by value, so the
// reordering is local to this function.
static void
print_contrib(double tmpval, int num, int onum,
              int P,int Q,int R,int S, int p,int q,int r,int s)
{
  int pass;

  // Indices as given: pass 0 -> (num, +tmpval), pass 1 -> (onum, -tmpval)
  for (pass=0; pass<2; pass++) {
      printf("noncanon: z(%d)(%d %d %d %d)(%d %d %d %d) contrib = % 6.4f\n",
             (pass ? onum : num), P, Q, R, S, p, q, r, s,
             (pass ? -tmpval : tmpval));
    }

  // Order the indices within the first pair ...
  if (q > p) {
      sw(P,Q);
      sw(p,q);
    }
  // ... and within the second pair
  if (s > r) {
      sw(R,S);
      sw(r,s);
    }

  // Exchange the two pairs if the second one comes first
  const int pairs_out_of_order = (r > p) || (r == p && s > q);
  if (pairs_out_of_order) {
      sw(p,r);
      sw(P,R);
      sw(q,s);
      sw(Q,S);
    }

  // Reordered indices: same two contributions as above
  for (pass=0; pass<2; pass++) {
      printf("z(%d)(%d %d %d %d)(%d %d %d %d) contrib = % 6.4f\n",
             (pass ? onum : num), P, Q, R, S, p, q, r, s,
             (pass ? -tmpval : tmpval));
    }
}
#endif

void
MBPT2::compute_cs_grad()
{

  // New version of MP2 gradient program which uses the full
  // permutational symmetry of the two-electron integral derivatives

  Ref kit = basis()->matrixkit();

  int do_d2_ = 1;  // if true, compute d2 diagnostic

  int nij;        // number of i,j pairs on a node (for e.g., mo_int)
  double *mo_int; // MO integrals of type (ov|ov)
                  // (and these integrals divided by
                  // orbital energy denominators)
  double *integral_iqjs; // half-transformed integrals

  int nocc_act, nvir_act;
  int i, j, k;
  int ii, bb;
  int x, y;
  int a, b, c;
  int nshell;
  int offset;
  int ik_offset;
  int i_offset; 
  int npass, pass;
  int tmpint;
  int np, nq, nr, ns; 
  int P, Q, R, S;
  int p, q, r, s;
  int bf1, bf2, bf3, bf4;
  int index;
  int me;
  int nproc;
  int rest;
  int p_offset, q_offset, r_offset, s_offset;

  int aoint_computed = 0; 
  int aointder_computed = 0; 
  int xyz;
  int natom = molecule()->natom();     // the number of atoms
  int int_index;
  size_t mem_static;    // static memory in bytes
  int ij_proc;          // the processor which has ij pair
  int ij_index;         // of the ij pairs on a proc, this ij pair is number ij_index
                        // (i.e., ij_index < nij)
  int ik_proc;          // the processor which has ik pair
  int ik_index;
  int jloop, kloop;

  int ni;

  double *evals;              // scf eigenvalues
  double *iajb_ptr, *ibja_ptr, *iakb_ptr, *ibka_ptr;
  double *iajc_ptr, *ibjc_ptr, *icjb_ptr, *icja_ptr;
  double *ijkb_ptr, *ibkj_ptr;
  double pqrs;
  double *c_sa, c_rj;
  double *c_pi, *c_qi, *c_sj;
  double *c_qx, *c_qa, *c_sb, *c_pa, *c_pq, *c_sy;
  double delta_ijab, delta_ijbc, delta_ijac;
  double ecorr_mp2 = 0.0;
  double escf;
  double emp2=0.0;
  int tol;                    // log2 of the erep tolerance
                              // (erep < 2^tol => discard)
  double *Wkj=0,*Wab=0,*Waj=0;// occ-occ, vir-vir and vir-occ parts of 
                              // second order correction to MP2
                              // energy weighted density matrix
  double *Pkj=0,*Pab=0;       // occ-occ and vir-vir parts of second order
                              // correction to MP2 density matrix
  double *d2occ_mat, *d2vir_mat; // matrices for computation of D2 diagnostic
  double *Laj=0;              // MP2 Lagrangian
  double *Lpi;                // contrib to MP2 Lagrangian partially in AO basis
  double *pkj_ptr=0, *pab_ptr;
  double *d2occ_mat_ptr;
  double *d2vir_mat_ptr;
  double *wkj_ptr, *wjk_ptr, *wab_ptr, *wba_ptr, *waj_ptr=0;
  double *laj_ptr, *lpi_ptr, *lqi_ptr;
  double *gamma_iajs, *gamma_iajs_tmp; 
                              // partially back-transformed non-sep 2PDM's
  double *gamma_iqjs_tmp;
  double *gamma_iajs_ptr;
  double *gamma_iqjs_ptr;
  double *gammabuf;           // buffer used for sending elements of gamma_iqjs
  double *mo_intbuf;          // buffer used for sending mo integrals
  double tmpval, tmpval1;
  double *P2AO, *W2AO;
  double *p2ao_ptr, *w2ao_ptr;
  double *PHF, *WHF;
  double *phf_ptr, *whf_ptr;
  double *PMP2, *WMP2;
  double *pmp2_ptr, *wmp2_ptr;

  double *ixjs_tmp;      // three-quarter transformed two-el integrals
  double *integral_ixjs;  // all three-quarter transformed two-el integrals
  double *integral_iajy; // mo integrals (y = any MO)
  double *integral_ikja; // mo integrals
  double *integral_iqjs_ptr;
  double *iajy_ptr;
  double *ixjs_ptr;
  double *ikja_ptr;
  double *iajs_ptr, *ikjs_ptr;

  double **gradient=0, *gradient_dat=0;  // The MP2 gradient
  double **hf_gradient=0, *hf_gradient_dat=0;  // The HF gradient
  double **ginter=0;    // Intermediates for the MP2 gradient
  double **hf_ginter=0;    // Intermediates for the HF gradient
  double d2o, d2v, d2_diag;

  BiggestContribs biggest_coefs(5,10);
  CharacterTable ct = molecule()->point_group()->char_table();

#if PRINT_BIGGEST_INTS
  BiggestContribs biggest_ints_2(4,40);
  BiggestContribs biggest_ints_2s(4,40);
  BiggestContribs biggest_ints_3a(4,40);
  BiggestContribs biggest_ints_3(4,40);
#endif

  int dograd = gradient_needed();

  tim_enter("mp2-mem");

  nfuncmax = basis()->max_nfunction_in_shell();

  nshell = basis()->nshell();

  me = msg_->me();

  if (me == 0) {
    ExEnv::out0() << endl << indent
         << "Entered memgrp based MP2 routine" << endl;
    }
  
  nproc = msg_->n();
  if (me == 0)
    ExEnv::out0() << indent << scprintf("nproc = %i", nproc) << endl;

  tol = (int) (-10.0/log10(2.0));  // discard ereps smaller than 10^-10

  nocc = 0;
  for (i=0; i n(); i++) {
    if (reference_->occupation(i) == 2.0) nocc++;
    }

  nocc_act = nocc - nfzc;
  nvir  = noso - nocc;
  nvir_act = nvir - nfzv;

  // Do a few preliminary tests to make sure the desired calculation
  // can be done (and appears to be meaningful!)

  if (nocc_act <= 0) {
    if (me == 0) {
      ExEnv::err0() << "There are no active occupied orbitals; program exiting" << endl;
      }
    abort();
    }

  if (nvir_act <= 0) {
    if (me == 0) {
      ExEnv::err0() << "There are no active virtual orbitals; program exiting" << endl;
      }
    abort();
    }
    
  if (restart_orbital_memgrp_) {
    if (!dograd && !do_d1_ && !do_d2_) {
      ExEnv::out0() << indent
           << scprintf("Restarting at orbital %d with partial energy %18.14f",
                       restart_orbital_memgrp_, restart_ecorr_)
           << endl;
      ecorr_mp2 = restart_ecorr_;
      }
    else {
      ExEnv::out0() << indent
           << "Restart requested but not possible with gradients, D1, or D2"
           << endl;
      restart_ecorr_ = 0.0;
      restart_orbital_memgrp_ = 0;
      }
    }
  else {
      restart_ecorr_ = 0.0;
    }

  ////////////////////////////////////////////////////////
  // Compute batch size ni for mp2 loops;
  //
  // The following arrays are kept throughout (all of type double):
  //   scf_vector, gradient, ginter, Pkj, Pab, Wkj, Wab, Waj, Laj
  // and memory allocated for these arrays  and integral evaluators
  // is called mem_static
  //
  ////////////////////////////////////////////////////////
  if (me == 0) {
    mem_static = nbasis*noso; // scf vector
    mem_static += 2*nbasis*nfuncmax; // iqjs & iqjr
    if (dograd) {
      mem_static += 9*natom; // gradient & ginter & hf_ginter
      mem_static += (nocc*(nocc+1))/2; // Pkj
      mem_static += (nvir*(nvir+1))/2; // Pab
      mem_static += nocc*nocc; // Wkj
      mem_static += nvir*nvir; // Wab
      mem_static += 2*nocc*nvir; // Waj & Laj
      if (do_d2_) {
        mem_static += (nocc_act*(nocc_act+1))/2; // d2occ_mat
        mem_static += (nvir_act*(nvir_act+1))/2; // d2vir_mat
        }
      }
    else if (do_d1_) {
      mem_static += nocc*nvir; // partial Laj
      }
    mem_static *= sizeof(double);
    int nthreads = thr_->nthread();
    mem_static += nthreads * integral()->storage_required_eri(basis()); // integral evaluators
    ni = compute_cs_batchsize(mem_static, nocc_act-restart_orbital_memgrp_); 
    }

  if (max_norb_ > 0 && ni > max_norb_) {
      ExEnv::out0() << indent
           << "\"max_norb\" set: could have done "
           << ni << " orbitals per pass otherwise."
           << endl;
      ni = max_norb_;
    }

  // Send value of ni and mem_static to other nodes
  msg_->bcast(ni);
  double dmem_static = mem_static;
  msg_->bcast(dmem_static);
  mem_static = size_t(dmem_static);

  // Compute the storage to be used by the integral routines (required plus optional)
  size_t dyn_mem = distsize_to_size(compute_cs_dynamic_memory(ni,nocc_act));
  int mem_remaining;
  if (mem_alloc <= (dyn_mem + mem_static)) mem_remaining = 0;
  else mem_remaining = mem_alloc - dyn_mem - mem_static;
  mem_remaining += thr_->nthread() * integral()->storage_required_eri(basis());

  ExEnv::out0() << indent
       << "Memory available per node:      " << mem_alloc << " Bytes"
       << endl;
  ExEnv::out0() << indent
       << "Static memory used per node:    " << mem_static << " Bytes"
       << endl;
  ExEnv::out0() << indent
       << "Total memory used per node:     " << dyn_mem+mem_static << " Bytes"
       << endl;
  ExEnv::out0() << indent
       << "Memory required for one pass:   "
       << compute_cs_dynamic_memory(nocc_act,nocc_act)+mem_static
       << " Bytes"
       << endl;
  ExEnv::out0() << indent
       << "Minimum memory required:        "
       << compute_cs_dynamic_memory(1,nocc_act)+mem_static
       << " Bytes"
       << endl;
  ExEnv::out0() << indent
       << "Batch size:                     " << ni
       << endl;

  if (ni == 0) {
    ExEnv::err0() << "Batch size is 0: more memory or processors are needed"
         << endl;
    abort();
    }

  if (dynamic_) {
    ExEnv::out0() << indent << "Using dynamic load balancing." << endl;
    }

  if (ni == nocc_act-restart_orbital_memgrp_) {
    npass = 1;
    rest = 0;
    }
  else {
    rest = (nocc_act-restart_orbital_memgrp_)%ni;
    npass = (nocc_act-restart_orbital_memgrp_ - rest)/ni + 1;
    if (rest == 0) npass--;
    }

  if (me == 0) {
    ExEnv::out0() << indent
         << scprintf(" npass  rest  nbasis  nshell  nfuncmax") << endl;
    ExEnv::out0() << indent
         << scprintf("  %-4i   %-3i   %-5i    %-4i     %-3i",
                     npass,rest,nbasis,nshell,nfuncmax)
         << endl;
    ExEnv::out0() << indent
         << scprintf(" nocc   nvir   nfzc   nfzv") << endl;
    ExEnv::out0() << indent
         << scprintf("  %-4i   %-4i   %-4i   %-4i",
                     nocc,nvir,nfzc,nfzv)
         << endl;
    }

  int nijmax = 0;
  index = 0;
  for (i=0; i energy();
  hf_energy_ = escf;

  RefDiagSCMatrix occ;
  RefSCMatrix Scf_Vec;
  RefDiagSCMatrix evalmat;
  eigen(evalmat, Scf_Vec, occ);

  if (debug_ > 1) {
    evalmat.print("eigenvalues");
    Scf_Vec.print("eigenvectors");
    }

  double *scf_vector_dat = new double[nbasis*noso];
  Scf_Vec.t()->convert(scf_vector_dat);

  evals = new double[noso];
  double** scf_vector = new double*[nbasis];
  for (i=0; i 2 && me == 0) {
    for (j=0; j set_storage(mem_remaining);
  tbints_ = new Ref [thr_->nthread()];
  for (i=0; i nthread(); i++) {
      tbints_[i] = integral()->electron_repulsion();
    }
  if (dograd || do_d1_) {
    tbintder_ = new Ref [thr_->nthread()];
    for (i=0; i nthread(); i++) {
      tbintder_[i] = integral()->electron_repulsion_deriv();
      }
    }

  int mem_integral_intermediates = integral()->storage_used();
  int mem_integral_storage = (mem_remaining - mem_integral_intermediates) / thr_->nthread();
  if (mem_integral_storage<0) mem_integral_storage = 0;
  for (i=0; i nthread(); i++) {
      tbints_[i]->set_integral_storage(mem_integral_storage);
    }

  ExEnv::out0() << endl << indent
       << scprintf("Memory used for integral intermediates: %i Bytes",
                   mem_integral_intermediates)
       << endl;
  ExEnv::out0() << indent
       << scprintf("Memory used for integral storage:       %i Bytes",
                   mem_integral_storage)
       << endl;

  if (mem.null()) {
      ExEnv::errn() << "MBPT2: memory group not initialized" << endl;
      abort();
    }

  mem->set_localsize(size_t(nijmax)*nbasis*nbasis*sizeof(double));
  ExEnv::out0() << indent
       << "Size of global distributed array:       "
       << mem->totalsize()
       << " Bytes"
       << endl;

  MemoryGrpBuf membuf_remote(mem);

  int usep4 = !dograd;

  Ref lock = thr_->new_lock();
  CSGradErep12Qtr** e12thread = new CSGradErep12Qtr*[thr_->nthread()];
  DistShellPair::SharedData sp_e_data, sp_g_data;
  for (i=0; i nthread(); i++) {
    e12thread[i] = new CSGradErep12Qtr(i, thr_->nthread(), me, nproc,
                                       mem, msg_, lock, basis(), tbints_[i],
                                       nocc, scf_vector, tol, debug_,
                                       dynamic_, print_percent_,
                                       &sp_e_data, usep4);
    }

    CSGrad34Qbtr** qbt34thread;
    if (dograd || do_d1_) {
      qbt34thread = new CSGrad34Qbtr*[thr_->nthread()];
      for (i=0; i nthread(); i++) {
        qbt34thread[i] = new CSGrad34Qbtr(i, thr_->nthread(), me, nproc,
                                          mem, msg_, lock, basis(), tbints_[i],
                                          tbintder_[i], nocc, nfzc, scf_vector,
                                          tol, debug_, dynamic_, print_percent_,
                                          &sp_g_data, dograd, natom);
        }
      }

  tim_enter("mp2 passes");
  for (pass=0; pass nthread(); i++) {
      e12thread[i]->set_i_offset(i_offset);
      e12thread[i]->set_ni(ni);
      thr_->add_thread(i,e12thread[i]);
#     if SINGLE_THREAD_E12
      e12thread[i]->run();
#     endif
      }
#   if !SINGLE_THREAD_E12
    thr_->start_threads();
    thr_->wait_threads();
#   endif
    tim_exit("erep+1.qt+2.qt");

    if (me == 0) {
      ExEnv::out0() << indent << "End of loop over shells" << endl;
      }

    mem->sync();  // Make sure iqjs is complete on each node before continuing

    integral_iqjs = (double*) mem->localdata();

#if PRINT2Q
    if (me == 0) {
      int index = 0;
      int ij_index = 0;
      for (int i = 0; i = nfzc) {
	      double *integral_ij_offset = integral_iqjs + nbasis*nbasis*ij_index;
	      for (int s = 0; s =nocc) {
                biggest_ints_3a.insert(*ixjs_ptr,i+i_offset,j,s,x-nocc);
                }
#endif
              *integral_iqjs_ptr++ = *ixjs_ptr++;
              }
            }   // exit s loop
          ij_index++;
          }     // endif
        }       // exit j loop
      }         // exit i loop
    // end of third quarter transformation
    tim_exit("3. q.t.");

    if (me == 0) {
      ExEnv::out0() << indent << "End of third q.t." << endl;
      }

    delete[] ixjs_tmp;

    // The array of half-transformed integrals integral_iqjs has now
    // been overwritten by three-quarter transformed integrals ixjs;
    // rename the array integral_ixjs, where x = any MO
    integral_ixjs = integral_iqjs;

#if PRINT3Q
    if (me == 0) {
      int index = 0;
      int ij_index = 0;
      for (int i = 0; i = nfzc) {
	      double *integral_ij_offset = integral_ixjs + nbasis*nbasis*ij_index;
	      for (int s = 0; s = nfzc) {
            for (k=0; k =nfzc) {
            for (b=0; b 1 || npass > 1) {
      ExEnv::outn() << "csgrad.cc: WRITE_DOUBLES set but case not allowed" << endl;
      abort();
      }
    ExEnv::outn() << "csgrad.cc: WRITING DOUBLES: CHECK ORDER" << endl;
    char *doutname = SCFormIO::fileext_to_filename(".mp2");
    FILE *dout = fopen(doutname,"w");
    delete[] doutname;
    fwrite(&nocc_act, sizeof(int), 1, dout);
    fwrite(&nvir_act, sizeof(int), 1, dout);
    for (j=nfzc; j 1.0e-8) {
              ExEnv::outn() << scprintf(" Djbia(%2d %2d %2d %2d) = %12.8f",
                               j+1-nfzc,b+1,i+1,a+1,iajb_ptr[a])
                   << endl;
              }
            }
          }
        }
      }
    fclose(dout);
#endif

    tim_enter("compute ecorr");

    index = 0;
    ij_index = 0;
    for (i=0; i =nfzc) {
            for (b=0; b =b && i_offset+i>=j) {
                  if (a>b && i_offset+i>j) {
                    // aaaa or bbbb
                    biggest_coefs.insert(*iajb_ptr - *ibja_ptr,
                                         i_offset+i,j,a,b,1111);
                    // aabb or bbaa or abba or baab
                    biggest_coefs.insert(*ibja_ptr,i_offset+i,j,b,a,1212);
                    } // endif
                  // aabb or bbaa or abba or baab
                  biggest_coefs.insert(*iajb_ptr,i_offset+i,j,a,b,1212);
                  } // endif

                tmpval = *iajb_ptr*(2**iajb_ptr - *ibja_ptr)*delta_ijab;
                ecorr_mp2 += tmpval;
                if (debug_) ecorr_ij += tmpval;
                iajb_ptr++;
                ibja_ptr += nbasis;;
                } // exit a loop
              }   // exit b loop
            }     // endif
          ij_index++;
          }       // endif
        if (debug_) {
          msg_->sum(ecorr_ij);
          ecorr_i += ecorr_ij;
          ExEnv::out0() << indent
               << scprintf("correlation energy for pair %3d %3d = %16.12f",
                           i+i_offset, j, ecorr_ij)
               << endl;
          }
        }         // exit j loop
      if (debug_) {
        ExEnv::out0() << indent
             << scprintf("correlation energy for orbital %3d = %16.12f",
                         i+i_offset, ecorr_i)
             << endl;
        }
      }           // exit i loop
    tim_exit("compute ecorr");

    // debug print
    if (debug_ && me == 0) {
      ExEnv::out0() << indent << "End of ecorr" << endl;
      }
    // end of debug print

    if (npass > 1 && pass < npass - 1) {
      double passe = ecorr_mp2;
      msg_->sum(passe);
      ExEnv::out0() << indent
           << "Partial correlation energy for pass " << pass << ":" << endl;
      ExEnv::out0() << indent
           << scprintf("  restart_ecorr          = %18.14f", passe)
           << endl;
      ExEnv::out0() << indent
           << scprintf("  restart_orbital_memgrp = %d", ((pass+1) * ni))
           << endl;
      }

    integral_iqjs = 0;
    mem->sync(); // Make sure MO integrals are complete on all nodes before continuing

    // don't go beyond this point if only the energy is needed
    if (!dograd && !do_d1_) continue;

    mo_int = (double*) mem->localdata();

    if (!dograd) goto compute_L;

    // Update the matrices Pkj and Wkj with
    // contributions from (occ vir|occ vir) integrals
    index = 0;
    ij_index = 0;
    tim_enter("Pkj and Wkj");
    for (i=0; i =nfzc) {
            for (kloop=me; kloop =nfzc) {
                  d2occ_mat_ptr = &d2occ_mat[(j-nfzc)*(j-nfzc+1)/2 + k-nfzc];
                  }
                }
              wjk_ptr = &Wkj[j*nocc + k];
              // Send for iakb, if necessary
              ik_index = (i*nocc + k)/nproc;
              ik_proc = (i*nocc + k)%nproc;
              ik_offset = nocc + nocc*nbasis + nbasis*nbasis*ik_index;
              mo_intbuf = (double*) membuf_remote.readonly_on_node(ik_offset,
                                                                   nbasis*nvir-nocc,
                                                                   ik_proc);
              for (a=0; a =nfzc) {
                    delta_ijab = evals[i_offset+i]+evals[j]-evals[nocc+a]-evals[nocc+b];
                    *wjk_ptr += tmpval*delta_ijab;
                    } 
                  } // exit b loop
                }   // exit a loop
              mo_intbuf = 0;
              membuf_remote.release();
              }     // end kloop loop
            }       // endif

          ij_index++;
          }         // endif
        }           // exit j loop
      }             // exit i loop
    tim_exit("Pkj and Wkj");

    // debug print
    if (debug_ && me == 0) {
      ExEnv::out0() << indent << "End of Pkj and Wkj" << endl;
      }
    // end of debug print

    // Update the matrices Pab and Wab with
    // contributions from (occ vir|occ vir) integrals
    tim_enter("Pab and Wab");
    index = 0;
    ij_index = 0;
    for (i=0; i =nfzc) {

            offset = nocc + nocc*nbasis + nbasis*nbasis*ij_index;
            for (a=0; a =nfzc) {
            offset = nbasis*nocc + nbasis*nbasis*ik_index;
            for (j=0; j =nfzc) {
            offset = nocc + nbasis*nbasis*ik_index;
            for (b=0; b sync(); // Need to synchronize before deleting mo_intbuf

    mo_int = (double*) mem->localdata();

    gamma_iajs_tmp = new double[nbasis*nvir_act];
    if (!gamma_iajs_tmp) {
      ExEnv::outn() << indent << "Could not allocate gamma_iajs_tmp" << endl;
      abort();
      }

    // debug print
    if (debug_ && me == 0) {
      ExEnv::out0() << indent << "Begin first and second q.b.t." << endl;
      }
    // end of debug print

    ///////////////////////////////////////////////////////////
    // Perform first and second quarter back-transformation.
    // Each node produces gamma_iajs, and gamma_iqjs 
    // for a subset of i and j, all a and all s;
    // the back-transf. is done only for active i, j, a, and b
    ///////////////////////////////////////////////////////////

    // Begin first quarter back-transformation
    tim_enter("1. q.b.t.");
    index = 0;
    ij_index = 0;
    for (i=0; i =nfzc) {
            bzerofast(gamma_iajs_tmp,nbasis*nvir_act);
            offset = nocc + nocc*nbasis + nbasis*nbasis*ij_index;

            for (a=0; a sync(); // Make sure all nodes are done with gamma_iajs_tmp before renaming

    delete[] gamma_iajs_tmp;

    // The array mo_int has now been overwritten by the quarter 
    // back-transformed non-sep 2PDM gamma_iajs, so rename
    gamma_iajs = (double*) mem->localdata();

    gamma_iqjs_tmp = new double[nbasis];
    if (!gamma_iqjs_tmp) {
      ExEnv::errn() << "Could not allocate gamma_iqjs_tmp" << endl;
      abort();
      }

    if (debug_ && me == 0) {
      ExEnv::out0() << indent << "Begin second q.b.t." << endl;
      }

    // Begin second quarter back-transformation
    // (gamma_iqjs elements ordered as i,j,s,q,
    // i.e., q varies fastest)
    tim_enter("2. q.b.t.");
    index = 0;
    ij_index = 0;
    for (i=0; i =nfzc) {
            offset = nbasis*nbasis*ij_index;

            for (s=0; s sync(); // Keep this here to make sure all nodes have gamma_iqjs
                 // before it is needed below, and that gamma_iajs is not
                 // deleted prematurely

    // The quarter back-transformed elements gamma_iajs have now been
    // overwritten by the half back-transformed elements gamma_iqjs

    delete[] gamma_iqjs_tmp;

    /////////////////////////////////////////////////
    // End of 1. and 2. quarter back-transformation
    /////////////////////////////////////////////////

    Lpi = new double[nbasis*ni];
    bzerofast(Lpi,nbasis*ni);

    if (me == 0) {
      ExEnv::out0() << indent << "Begin third and fourth q.b.t." << endl;
      }

    //////////////////////////////////////////////////////////
    // Perform third and fourth quarter back-transformation
    // and compute contribution to gradient from non-sep 2PDM
    //////////////////////////////////////////////////////////

    tim_enter("3.qbt+4.qbt+non-sep contrib.");
    sp_g_data.init();
    for (i=0; i nthread(); i++) {
      qbt34thread[i]->set_i_offset(i_offset);
      qbt34thread[i]->set_ni(ni);
      thr_->add_thread(i,qbt34thread[i]);
#     if SINGLE_THREAD_QBT34
      qbt34thread[i]->run();
#     endif
      }
#   if !SINGLE_THREAD_QBT34
    thr_->start_threads();
    thr_->wait_threads();
#   endif
    tim_exit("3.qbt+4.qbt+non-sep contrib.");
    // Add thread contributions to Lpi and ginter
    for (i=0; i nthread(); i++) {
      double *Lpi_thread = qbt34thread[i]->get_Lpi();
      double **ginter_thread = qbt34thread[i]->get_ginter();
      for (j=0; j get_aointder_computed();
      }

    if (me == 0) {
      ExEnv::out0() << indent << "End of third and fourth q.b.t." << endl;
      }

    mem->sync(); // Make sure all nodes are done before deleting arrays

    if (debug_ > 1) {
      RefSCDimension ni_dim(new SCDimension(ni,1));
      ni_dim->blocks()->set_subdim(0, new SCDimension(ni));
      RefSCDimension nbasis_dim(new SCDimension(nbasis,1));
      nbasis_dim->blocks()->set_subdim(0, new SCDimension(nbasis));
      RefSCMatrix Lpi_mat(nbasis_dim, ni_dim, kit);
      Lpi_mat->assign(Lpi);
      Lpi_mat.print("Lpi");
      }

    if (debug_ && me == 0) {
      ExEnv::out0() << indent << "Back-transform Lpi" << endl;
      }

    // Back-transform Lpi to MO basis
    lpi_ptr = Lpi;
    for (p=0; p nthread(); i++) {
      delete qbt34thread[i];
    }
    delete[] qbt34thread;
  }

  mem->set_localsize(0);

  // debug print
  if (debug_ && me == 0) {
    ExEnv::out0() << indent << "Exited loop over i-batches" << endl;
    }
  // end of debug print

  ///////////////////////////////////////////////////////////////
  // The computation of the MP2 energy is now complete on each
  // node; add the nodes' contributions and print out the energy
  ///////////////////////////////////////////////////////////////
  msg_->sum(ecorr_mp2);
  msg_->sum(aoint_computed);
  msg_->sum(aointder_computed);

  biggest_coefs.combine(msg_);
#if PRINT_BIGGEST_INTS
  biggest_ints_1.combine(msg_);
  biggest_ints_2.combine(msg_);
  biggest_ints_2s.combine(msg_);
  biggest_ints_3a.combine(msg_);
  biggest_ints_3.combine(msg_);
#endif

  if (me == 0) {
    emp2 = escf + ecorr_mp2;

#if PRINT_BIGGEST_INTS
    ExEnv::out0() << "biggest 1/4 transformed ints" << endl;
    for (i=0; i %2d %3s %2d %3s (%s)",
                         i+1, biggest_coefs.val(i),
                         symorb_num_[i0]+1,
                         ct.gamma(symorb_irrep_[i0]).symbol(),
                         symorb_num_[i1]+1,
                         ct.gamma(symorb_irrep_[i1]).symbol(),
                         symorb_num_[i2]+1,
                         ct.gamma(symorb_irrep_[i2]).symbol(),
                         symorb_num_[i3]+1,
                         ct.gamma(symorb_irrep_[i3]).symbol(),
                         (spincase==1111?"++++":"+-+-")
               )
             << endl;
        }
      }

    // Print out various energies etc.

    if (debug_) {
      ExEnv::out0() << indent << "Number of shell quartets for which AO integrals\n"
           << indent << "(or integral derivatives) would have been computed\n"
           << indent << "without bounds checking: "
           << npass*nshell*nshell*(nshell+1)*(nshell+1)/2
           << endl;

      ExEnv::out0() << indent << "Number of shell quartets for which AO integrals\n"
           << indent << "were computed: " << aoint_computed
           << endl;

      if (dograd) {
        ExEnv::out0() << indent
             << "Number of shell quartets for which AO integral derivatives\n"
             << indent << "were computed: " << aointder_computed
             << endl;
        }
      }

    ExEnv::out0()<