/* * Project: MoleCuilder * Description: creates and alters molecular systems * Copyright (C) 2010-2012 University of Bonn. All rights reserved. * Copyright (C) 2013 Frederik Heber. All rights reserved. * * * This file is part of MoleCuilder. * * MoleCuilder is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 2 of the License, or * (at your option) any later version. * * MoleCuilder is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with MoleCuilder. If not, see . */ /* * Fragmentation.cpp * * Created on: Oct 18, 2011 * Author: heber */ #ifdef HAVE_CONFIG_H #include #endif #include #include "CodePatterns/MemDebug.hpp" #include "Fragmentation.hpp" #include "CodePatterns/Assert.hpp" #include "CodePatterns/Info.hpp" #include "CodePatterns/IteratorAdaptors.hpp" #include "CodePatterns/Log.hpp" #include "Atom/atom.hpp" #include "Bond/bond.hpp" #include "Descriptors/MoleculeDescriptor.hpp" #include "Element/element.hpp" #include "Element/periodentafel.hpp" #include "Fragmentation/AdaptivityMap.hpp" #include "Fragmentation/AtomMask.hpp" #include "Fragmentation/fragmentation_helpers.hpp" #include "Fragmentation/Graph.hpp" #include "Fragmentation/helpers.hpp" #include "Fragmentation/KeySet.hpp" #include "Fragmentation/PowerSetGenerator.hpp" #include "Fragmentation/UniqueFragments.hpp" #include "Graph/BondGraph.hpp" #include "Graph/AdjacencyList.hpp" #include "Graph/ListOfLocalAtoms.hpp" #include "molecule.hpp" #include "World.hpp" /** Constructor of class Fragmentation. * * \param _mol molecule which we currently fragment * \param _FileChecker instance contains adjacency parsed from elsewhere * \param _treatment whether to treat hydrogen special and saturate dangling bonds or not */ Fragmentation::Fragmentation(molecule *_mol, AdjacencyList &_FileChecker, const enum HydrogenTreatment _treatment) : mol(_mol), treatment(_treatment), FileChecker(_FileChecker) {} /** Destructor of class Fragmentation. * */ Fragmentation::~Fragmentation() {} /** Performs a many-body bond order analysis for a given bond order. * * \note during fragmentation we switch to so-called local ids, atomic ids * that are valid only for the specific molecule (representing a connected * subgraph of the molecular system). * * -# create the local id to global id mapping * -# parse the adjacency file and require the above mapping for translation * -# initialize a mask for the molecule's atoms, telling which atoms are * treated and which atoms neglected during fragmentation * -# parse and orderatsite file and check whether there's something to do. This allows for iterative calls to fragmentation * -# fragments from the last fragmentation stored to file are converted into * keysets (sets of atomic indices that describe one fragment) * -# in a loop as long as order at site is not correct yet * -# prepare a stack containing the initial ids where the fragmentation or * rather the graph algorithms start from * -# call fragmentBOSSANOVA() * -# afterwards in case we don't saturate we remove single-atom fragments * -# translate the local ids of the keysets into global ids. * -# updated order at site file is written * * note that all created fragments or rather their describing key sets are * contained in the Graph Fragmentation::FragmentList. * * \param atomids atomic ids (local to Fragmentation::mol) to fragment, used in AtomMask * \param Order up to how many neighbouring bonds a fragment contains in BondOrderScheme::BottumUp scheme * \param prefix prefix string for every fragment file name (may include path) * \param ParsedFragmentList all already created key sets * \return 1 - continue, 2 - stop (no fragmentation occured) */ int Fragmentation::FragmentMolecule( const std::vector &atomids, int Order, const std::string &prefix, const Graph &ParsedFragmentList, const bool _ParseStateFile) { std::fstream File; bool CheckOrder = false; int TotalNumberOfKeySets = 0; LOG(0, std::endl); switch (treatment) { case ExcludeHydrogen: LOG(1, "INFO: I will treat hydrogen special."); break; case IncludeHydrogen: LOG(1, "INFO: Hydrogen is treated just like the rest of the lot."); break; default: ASSERT(0, "Fragmentation::FragmentMolecule() - there is a HydrogenTreatment setting which I have no idea about."); break; } // ++++++++++++++++++++++++++++ INITIAL STUFF: Bond structure analysis, file parsing, ... ++++++++++++++++++++++++++++++++++++++++++ bool FragmentationToDo = true; // ===== 1. Check whether bond structure is same as stored in files ==== // create a lookup global <-> local id for atomic ids valid in World and in molecule Global_local_bimap_t Global_local_bimap; for (std::vector::const_iterator iter = atomids.begin(); iter != atomids.end(); ++iter) { const atom * _atom = mol->FindAtom(*iter); ASSERT( _atom != NULL, "Fragmentation::FragmentMolecule() - could not find atom "+toString(*iter)+"."); Global_local_bimap.insert( idpair_t( (global_t)(_atom->getId()), (local_t)(*iter) ) ); } // === compare it with adjacency file === { const std::vector globalids( MapKeyConstIterator(Global_local_bimap.left.begin()), MapKeyConstIterator(Global_local_bimap.left.end()) ); AdjacencyList WorldAdjacency(globalids); FragmentationToDo = FragmentationToDo && (FileChecker > WorldAdjacency); } // ===== 2. create AtomMask that takes Saturation condition into account AtomMask_t AtomMask(atomids); for (molecule::const_iterator iter = const_cast(mol)->begin(); iter != const_cast(mol)->end(); ++iter) { // remove in hydrogen and we do saturate if ((treatment == ExcludeHydrogen) && ((*iter)->getType()->getAtomicNumber() == 1)) // skip hydrogen AtomMask.setFalse((*iter)->getNr()); } // ===== 4. check globally whether there's something to do actually (first adaptivity check) if (_ParseStateFile) FragmentationToDo = FragmentationToDo && ParseOrderAtSiteFromFile(atomids, prefix, Global_local_bimap); // =================================== Begin of FRAGMENTATION =============================== // ===== 6a. assign each keyset to its respective subgraph ===== ListOfLocalAtoms_t ListOfLocalAtoms; Graph FragmentList; AssignKeySetsToFragment(ParsedFragmentList, ListOfLocalAtoms, FragmentList, true); // ===== 6b. prepare and go into the adaptive (Order<0), single-step (Order==0) or incremental (Order>0) cycle KeyStack RootStack; FragmentationToDo = false; // if CheckOrderAtSite just ones recommends fragmentation, we will save fragments afterwards bool LoopDoneAlready = false; while ((CheckOrder = CheckOrderAtSite(AtomMask, ParsedFragmentList, Order, prefix, LoopDoneAlready))) { FragmentationToDo = FragmentationToDo || CheckOrder; LoopDoneAlready = true; // last plus one entry is used as marker that we have been through this loop once already in CheckOrderAtSite() // ===== 6b. fill RootStack for each subgraph (second adaptivity check) ===== FillRootStackForSubgraphs(RootStack, AtomMask); // call BOSSANOVA method FragmentBOSSANOVA(mol, FragmentList, RootStack); } LOG(3, "DEBUG: CheckOrder is " << CheckOrder << "."); // ==================================== End of FRAGMENTATION ============================================ // if hydrogen is not treated special, we may have single hydrogens and other // fragments which are note single-determinant. These need to be removed if (treatment == IncludeHydrogen) { // remove all single atom fragments from FragmentList Graph::iterator iter = FragmentList.begin(); while ( iter != FragmentList.end()) { if ((*iter).first.size() == 1) { LOG(1, "INFO: Removing KeySet " << (*iter).first << " as is not single-determinant."); Graph::iterator eraseiter = iter++; FragmentList.erase(eraseiter); } else ++iter; } } // ===== 8a. translate list into global numbers (i.e. ones that are valid in "this" molecule, not in MolecularWalker->Leaf) TranslateIndicesToGlobalIDs(FragmentList, TotalNumberOfKeySets, TotalGraph); LOG(1, "STATUS: We have created " << TotalGraph.size() << " fragments."); // store adaptive orders into file StoreOrderAtSiteFile(prefix); return ((int)(!FragmentationToDo)+1); // 1 - continue, 2 - stop (no fragmentation occured) }; /** Performs BOSSANOVA decomposition at selected sites, increasing the cutoff by one at these sites. * -# constructs a complete keyset of the molecule * -# In a loop over all possible roots from the given rootstack * -# increases order of root site * -# calls PowerSetGenerator with this order, the complete keyset and the rootkeynr * -# for all consecutive lower levels PowerSetGenerator is called with the suborder, the higher order keyset as the restricted one and each site in the set as the root) * -# these are merged into a fragment list of keysets * -# All fragment lists (for all orders, i.e. from all destination fields) are merged into one list for return * Important only is that we create all fragments, it is not important if we create them more than once * as these copies are filtered out via use of the hash table (KeySet). * \param *out output stream for debugging * \param Fragment&*List list of already present keystacks (adaptive scheme) or empty list * \param &RootStack stack with all root candidates (unequal to each atom in complete molecule if adaptive scheme is applied) * \return pointer to Graph list */ void Fragmentation::FragmentBOSSANOVA(molecule *mol, Graph &FragmentList, KeyStack &RootStack) { Info FunctionInfo(__func__); std::vector *FragmentLowerOrdersList = NULL; size_t NumLevels = 0; // size_t NumMolecules = 0; size_t TotalNumMolecules = 0; int *NumMoleculesOfOrder = NULL; int Order = 0; int UpgradeCount = RootStack.size(); KeyStack FragmentRootStack; int RootKeyNr = 0; int RootNr = 0; // FragmentLowerOrdersList is a 2D-array of pointer to vector of molecule objects, one dimension represents the ANOVA expansion of a single order (i.e. 5) // with all needed lower orders that are subtracted, the other dimension is the BondOrder (i.e. from 1 to 5) NumMoleculesOfOrder = new int[UpgradeCount]; FragmentLowerOrdersList = new std::vector[UpgradeCount]; for(int i=0;iFindAtom(RootKeyNr); // check cyclic lengths //if ((MinimumRingSize[Walker->GetTrueFather()->getNr()] != -1) && (Walker->GetTrueFather()->getAdaptiveOrder()+1 > MinimumRingSize[Walker->GetTrueFather()->getNr()])) { // LOG(0, "Bond order " << Walker->GetTrueFather()->getAdaptiveOrder() << " of Root " << *Walker << " greater than or equal to Minimum Ring size of " << MinimumRingSize << " found is not allowed."); //} else { // set adaptive order to desired max order Walker->GetTrueFather()->setAdaptiveOrder(Walker->GetTrueFather()->getMaxOrder()); Order = Walker->GetTrueFather()->getAdaptiveOrder(); Walker->setAdaptiveOrder(Order); // allocate memory for all lower level orders NumLevels = Order; FragmentLowerOrdersList[RootNr].resize(NumLevels, NULL); for( size_t i=0;igetAdaptiveOrder()); // create top order where nothing is reduced LOG(0, "=============================================================================================================="); LOG(0, "Creating KeySets up till Bond Order " << Order << " for " << *Walker << ", " << (RootStack.size()-RootNr) << " Roots remaining."); // , NumLevels is " << NumLevels << " // Create list of Graphs of current Bond Order (i.e. F_{ij}) NumMoleculesOfOrder[RootNr] = PSG(RestrictedKeyset, treatment); // output resulting number LOG(1, "INFO: Number of resulting KeySets is: " << NumMoleculesOfOrder[RootNr] << "."); // if (NumMoleculesOfOrder[RootNr] != 0) { // NumMolecules = 0; // } // now, we have completely filled each cell of FragmentLowerOrdersList[] for the current Walker->getAdaptiveOrder() //NumMoleculesOfOrder[Walker->getAdaptiveOrder()-1] = NumMolecules; TotalNumMolecules += NumMoleculesOfOrder[RootNr]; // LOG(1, "Number of resulting molecules for Order " << (int)Walker->GetTrueFather()->getAdaptiveOrder() << " is: " << NumMoleculesOfOrder[RootNr] << "."); RootStack.push_back(RootKeyNr); // put back on stack RootNr++; } } LOG(0, "=============================================================================================================="); LOG(0, "\tTotal number of resulting fragments is: " << TotalNumMolecules << "."); LOG(0, "=============================================================================================================="); // now, FragmentLowerOrdersList is complete, it looks - for BondOrder 5 - as this (number is the ANOVA Order of the terms therein) // 5433222211111111 // 43221111 // 3211 // 21 // 1 // Subsequently, we combine all into a single list (FragmentList) CombineAllOrderListIntoOne(FragmentList, FragmentLowerOrdersList, RootStack, mol); FreeAllOrdersList(FragmentLowerOrdersList, RootStack, mol); delete[](NumMoleculesOfOrder); }; /** Estimates by educated guessing (using upper limit) the expected number of fragments. * The upper limit is * \f[ * n = N \cdot C^k * \f] * where \f$C=2^c\f$ and c is the maximum bond degree over N number of atoms. * \param *out output stream for debugging * \param order bond order k * \return number n of fragments */ int Fragmentation::GuesstimateFragmentCount(int order) { size_t c = 0; int FragmentCount; // get maximum bond degree for (molecule::const_iterator iter = const_cast(mol)->begin(); iter != const_cast(mol)->end(); ++iter) { const BondList& ListOfBonds = (*iter)->getListOfBonds(); c = (ListOfBonds.size() > c) ? ListOfBonds.size() : c; } FragmentCount = (treatment == ExcludeHydrogen ? mol->getNoNonHydrogen() : mol->getAtomCount()) *(1 << (c*order)); LOG(1, "INFO: Upper limit for this subgraph is " << FragmentCount << " for " << mol->getNoNonHydrogen() << " non-H atoms with maximum bond degree of " << c << "."); return FragmentCount; }; /** Checks whether the OrderAtSite is still below \a Order at some site. * \param AtomMask defines true/false per global Atom::Nr to mask in/out each nuclear site, used to activate given number of site to increment order adaptively * \param *GlobalKeySetList list of keysets with global ids (valid in "this" molecule) needed for adaptive increase * \param Order desired Order if positive, desired exponent in threshold criteria if negative (0 is single-step) * \param path path to ENERGYPERFRAGMENT file (may be NULL if Order is non-negative) * \param LoopDoneAlready indicate whether we have done a fragmentation loop already * \return true - needs further fragmentation, false - does not need fragmentation */ bool Fragmentation::CheckOrderAtSite(AtomMask_t &AtomMask, const Graph &GlobalKeySetList, int Order, const std::string &path, bool LoopDoneAlready) { bool status = false; if (Order < 0) { // adaptive increase of BondOrder per site if (LoopDoneAlready) // break after one step return false; // transmorph graph keyset list into indexed KeySetList AdaptivityMap * IndexKeySetList = GlobalKeySetList.GraphToAdaptivityMap(); // parse the EnergyPerFragment file IndexKeySetList->ScanAdaptiveFileIntoMap(path); // (Root No., (Value, Order)) ! // then map back onto (Value, (Root Nr., Order)) (i.e. sorted by value to pick the highest ones) IndexKeySetList->ReMapAdaptiveCriteriaListToValue(mol); // pick the ones still below threshold and mark as to be adaptively updated if (IndexKeySetList->IsAdaptiveCriteriaListEmpty()) { ELOG(2, "Unable to parse file, incrementing all."); status = true; } else { // mark as false all sites that are below threshold already status = IndexKeySetList->MarkUpdateCandidates(AtomMask, Order, mol); } delete[](IndexKeySetList); } else { // global increase of Bond Order for(molecule::iterator iter = mol->begin(); iter != mol->end(); ++iter) { atom * const Walker = *iter; if (AtomMask.isTrue(Walker->getNr())) { // skip masked out Walker->setMaxOrder((Order != 0 ? Order : Walker->getMaxOrder()+1)); // remove all that have reached desired order if (Walker->getAdaptiveOrder() >= Walker->getMaxOrder()) // && (Walker->getAdaptiveOrder() < MinimumRingSize[Walker->getNr()])) AtomMask.setFalse(Walker->getNr()); else status = true; } } if ((!Order) && (!LoopDoneAlready)) // single stepping, just check status = true; if (!status) { if (Order == 0) LOG(1, "INFO: Single stepping done."); else LOG(1, "INFO: Order at every site is already equal or above desired order " << Order << "."); } } PrintAtomMask(AtomMask, AtomMask.size()); // for debugging return status; }; /** Stores pairs (Atom::Nr, Atom::AdaptiveOrder) into file. * Atoms not present in the file get "-1". * \param &path path to file ORDERATSITEFILE * \return true - file writable, false - not writable */ bool Fragmentation::StoreOrderAtSiteFile( const std::string &path) { string line; ofstream file; line = path + ORDERATSITEFILE; file.open(line.c_str(), std::ofstream::out | std::ofstream::app); std::stringstream output; output << "INFO: Writing OrderAtSite " << ORDERATSITEFILE << " ... "; if (file.good()) { for (molecule::const_iterator iter = const_cast(mol)->begin(); iter != const_cast(mol)->end(); ++iter) { file << (*iter)->getId() << "\t" << (int)(*iter)->getAdaptiveOrder() << "\t" << (int)(*iter)->getMaxOrder() << std::endl; } file.close(); output << "done."; return true; } else { output << "failed to open file " << line << "."; return false; } LOG(1, output.str()); return true; }; /** Parses pairs(Atom::Nr, Atom::AdaptiveOrder) from file and stores in molecule's Atom's. * Atoms not present in the file get "0". * \param atomids atoms to fragment, used in AtomMask * \param &path path to file ORDERATSITEFILE * \param global_local_bimap translate global to local id * \return true - file found and scanned, false - file not found * \sa ParseKeySetFile() and CheckAdjacencyFileAgainstMolecule() as this is meant to be used in conjunction with the two */ bool Fragmentation::ParseOrderAtSiteFromFile( const std::vector &atomids, const std::string &path, const Global_local_bimap_t &global_local_bimap) { // Info FunctionInfo(__func__); typedef unsigned char order_t; typedef std::map OrderArray_t; OrderArray_t OrderArray; AtomMask_t MaxArray(atomids); bool status; int AtomNr, ordervalue, maxvalue; string line; ifstream file; line = path + ORDERATSITEFILE; file.open(line.c_str()); if (file.good()) { while (!file.eof()) { // parse from file AtomNr = -1; file >> AtomNr; file >> ordervalue; file >> maxvalue; if (AtomNr != -1) { // test whether we really parsed something (this is necessary, otherwise last atom is set twice and to 0 on second time) // parsed id is global, must be translated to local id Global_local_bimap_t::left_const_iterator iter = global_local_bimap.left.find(AtomNr); // skip global ids we don't know about, must be in other molecule if (iter != global_local_bimap.left.end()) { const int LocalId = iter->second; OrderArray[LocalId] = ordervalue; MaxArray.setValue(LocalId, (bool)maxvalue); //LOG(2, "AtomNr " << LocalId << " with order " << (int)OrderArray[LocalId] << " and max order set to " << (int)MaxArray[LocalId] << "."); } } } file.close(); // set atom values for(molecule::iterator iter=mol->begin();iter!=mol->end();++iter){ (*iter)->setAdaptiveOrder(OrderArray[(*iter)->getNr()]); (*iter)->setMaxOrder(OrderArray[(*iter)->getNr()]); //MaxArray.isTrue((*iter)->getNr()); } //SetAtomValueToIndexedArray( OrderArray, &atom::getNr(), &atom::AdaptiveOrder ); //SetAtomValueToIndexedArray( MaxArray, &atom::getNr(), &atom::MaxOrder ); status = true; } else { ELOG(1, "Failed to open OrdersAtSite file " << line << "."); status = false; } return status; }; /** Fills the root stack for sites to be used as root in fragmentation depending on order or adaptivity criteria * Again, as in \sa FillBondStructureFromReference steps recursively through each Leaf in this chain list of molecule's. * \param &RootStack stack to be filled * \param AtomMask defines true/false per global Atom::Nr to mask in/out each nuclear site * \return true - stack is non-empty, fragmentation necessary, false - stack is empty, no more sites to update */ void Fragmentation::FillRootStackForSubgraphs(KeyStack &RootStack, const AtomMask_t &AtomMask) { for(molecule::const_iterator iter = const_cast(mol)->begin(); iter != const_cast(mol)->end(); ++iter) { const atom * const Father = (*iter)->GetTrueFather(); if (AtomMask.isTrue(Father->getNr())) // apply mask if ((treatment == IncludeHydrogen) || ((*iter)->getType()->getAtomicNumber() != 1)) // skip hydrogen RootStack.push_front((*iter)->getNr()); } } /** The indices per keyset are compared to the respective father's Atom::Nr in each subgraph and thus put into \a **&FragmentList. * \param *KeySetList list with all keysets * \param ListOfLocalAtoms Lookup table for each subgraph and index of each atom in global molecule, may be NULL on start, then it is filled * \param **&FragmentList list to be allocated and returned * \param FreeList true - ***ListOfLocalAtoms is free'd before return, false - it is not * \retuen true - success, false - failure */ bool Fragmentation::AssignKeySetsToFragment(const Graph &KeySetList, ListOfLocalAtoms_t &ListOfLocalAtoms, Graph &FragmentList, bool FreeList) { // Info FunctionInfo(__func__); bool status = true; size_t KeySetCounter = 0; // fill ListOfLocalAtoms if NULL was given if (!mol->FillListOfLocalAtoms(ListOfLocalAtoms, mol->getAtomCount())) { ELOG(1, "Filling of ListOfLocalAtoms failed."); return false; } if (KeySetList.size() != 0) { // if there are some scanned keysets at all // assign scanned keysets KeySet TempSet; for (Graph::const_iterator runner = KeySetList.begin(); runner != KeySetList.end(); runner++) { // key sets contain global numbers! if (ListOfLocalAtoms[mol->FindAtom(*((*runner).first.begin()))->getNr()] != NULL) {// as we may assume that that bond structure is unchanged, we only test the first key in each set // translate keyset to local numbers for (KeySet::iterator sprinter = (*runner).first.begin(); sprinter != (*runner).first.end(); sprinter++) TempSet.insert(ListOfLocalAtoms[mol->FindAtom(*sprinter)->getNr()]->getNr()); // insert into FragmentList FragmentList.insert(GraphPair(TempSet, pair (KeySetCounter++, (*runner).second.second))); } TempSet.clear(); } } else ELOG(2, "KeySetList is NULL or empty."); if (FreeList) { // free the index lookup list ListOfLocalAtoms.clear(); } return status; } /** Translate list into global numbers (i.e. ones that are valid in "this" molecule, not in MolecularWalker->Leaf) * \param &FragmentList Graph with local numbers per fragment * \param &TotalNumberOfKeySets global key set counter * \param &TotalGraph Graph to be filled with global numbers */ void Fragmentation::TranslateIndicesToGlobalIDs(Graph &FragmentList, int &TotalNumberOfKeySets, Graph &TotalGraph) { // Info FunctionInfo(__func__); for (Graph::iterator runner = FragmentList.begin(); runner != FragmentList.end(); runner++) { KeySet TempSet; for (KeySet::iterator sprinter = (*runner).first.begin(); sprinter != (*runner).first.end(); sprinter++) TempSet.insert((mol->FindAtom(*sprinter))->GetTrueFather()->getId()); TotalGraph.insert(GraphPair(TempSet, pair (TotalNumberOfKeySets++, (*runner).second.second))); } }