Ignore:
Timestamp:
Jun 29, 2012, 8:01:38 AM (13 years ago)
Author:
Frederik Heber <heber@…>
Branches:
Action_Thermostats, Add_AtomRandomPerturbation, Add_FitFragmentPartialChargesAction, Add_RotateAroundBondAction, Add_SelectAtomByNameAction, Added_ParseSaveFragmentResults, AddingActions_SaveParseParticleParameters, Adding_Graph_to_ChangeBondActions, Adding_MD_integration_tests, Adding_ParticleName_to_Atom, Adding_StructOpt_integration_tests, AtomFragments, Automaking_mpqc_open, AutomationFragmentation_failures, Candidate_v1.5.4, Candidate_v1.6.0, Candidate_v1.6.1, Candidate_v1.7.0, ChangeBugEmailaddress, ChangingTestPorts, ChemicalSpaceEvaluator, CombiningParticlePotentialParsing, Combining_Subpackages, Debian_Package_split, Debian_package_split_molecuildergui_only, Disabling_MemDebug, Docu_Python_wait, EmpiricalPotential_contain_HomologyGraph, EmpiricalPotential_contain_HomologyGraph_documentation, Enable_parallel_make_install, Enhance_userguide, Enhanced_StructuralOptimization, Enhanced_StructuralOptimization_continued, Example_ManyWaysToTranslateAtom, Exclude_Hydrogens_annealWithBondGraph, FitPartialCharges_GlobalError, Fix_BoundInBox_CenterInBox_MoleculeActions, Fix_ChargeSampling_PBC, Fix_ChronosMutex, Fix_FitPartialCharges, Fix_FitPotential_needs_atomicnumbers, Fix_ForceAnnealing, Fix_IndependentFragmentGrids, Fix_ParseParticles, Fix_ParseParticles_split_forward_backward_Actions, Fix_PopActions, Fix_QtFragmentList_sorted_selection, Fix_Restrictedkeyset_FragmentMolecule, Fix_StatusMsg, Fix_StepWorldTime_single_argument, Fix_Verbose_Codepatterns, Fix_fitting_potentials, Fixes, ForceAnnealing_goodresults, ForceAnnealing_oldresults, ForceAnnealing_tocheck, ForceAnnealing_with_BondGraph, ForceAnnealing_with_BondGraph_continued, ForceAnnealing_with_BondGraph_continued_betteresults, ForceAnnealing_with_BondGraph_contraction-expansion, FragmentAction_writes_AtomFragments, FragmentMolecule_checks_bonddegrees, GeometryObjects, Gui_Fixes, Gui_displays_atomic_force_velocity, ImplicitCharges, IndependentFragmentGrids, IndependentFragmentGrids_IndividualZeroInstances, IndependentFragmentGrids_IntegrationTest, IndependentFragmentGrids_Sole_NN_Calculation, JobMarket_RobustOnKillsSegFaults, JobMarket_StableWorkerPool, JobMarket_unresolvable_hostname_fix, MoreRobust_FragmentAutomation, ODR_violation_mpqc_open, PartialCharges_OrthogonalSummation, PdbParser_setsAtomName, PythonUI_with_named_parameters, QtGui_reactivate_TimeChanged_changes, Recreated_GuiChecks, Rewrite_FitPartialCharges, RotateToPrincipalAxisSystem_UndoRedo, SaturateAtoms_findBestMatching, SaturateAtoms_singleDegree, StoppableMakroAction, Subpackage_CodePatterns, Subpackage_JobMarket, Subpackage_LinearAlgebra, Subpackage_levmar, Subpackage_mpqc_open, Subpackage_vmg, Switchable_LogView, ThirdParty_MPQC_rebuilt_buildsystem, TrajectoryDependenant_MaxOrder, TremoloParser_IncreasedPrecision, TremoloParser_MultipleTimesteps, TremoloParser_setsAtomName, Ubuntu_1604_changes, stable
Children:
f98c8e
Parents:
559385
git-author:
Frederik Heber <heber@…> (03/04/12 16:18:32)
git-committer:
Frederik Heber <heber@…> (06/29/12 08:01:38)
Message:

FragmentQueue can now resubmit jobs.

  • we keep an internal list of jobs currently being worked on. If the worker returns with a failure, they can be resubmit to the queue.
  • there are Max_Attempts (currently set to 1) tries to resubmit.
  • added unit test function on this.
  • added regression test Fragmentation/Automation resubmitjobs.
Location:
src/Fragmentation/Automation
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • src/Fragmentation/Automation/FragmentQueue.cpp

    r559385 rfe95b7  
    2323
    2424#include "CodePatterns/Assert.hpp"
     25#include "CodePatterns/Log.hpp"
    2526
    2627FragmentResult::ptr FragmentQueue::NoResult( new FragmentResult(-1) );
    2728FragmentResult::ptr FragmentQueue::NoResultQueued( new FragmentResult(-2) );
    2829FragmentResult::ptr FragmentQueue::ResultDelivered( new FragmentResult(-3) );
     30size_t FragmentQueue::Max_Attempts = (size_t)1;
    2931
    3032/** Constructor for class FragmentQueue.
     
    9395      "FragmentQueue::popJob() - there are no jobs on the queue.");
    9496  FragmentJob::ptr job = jobs.front();
     97#ifndef NDEBUG
     98  std::pair< BackupMap::iterator, bool> inserter =
     99#endif
     100  backup.insert( std::make_pair( job->getId(), job ));
     101  ASSERT (inserter.second,
     102      "FragmentQueue::popJob() - job "+toString(job->getId())+
     103      " is already in the backup.");
    95104  ResultMap::iterator iter = results.find(job->getId());
    96105  ASSERT(iter != results.end(),
     
    197206void FragmentQueue::pushResult(FragmentResult::ptr &_result)
    198207{
     208  const JobId_t id = _result->getId();
    199209  /// check for presence
    200   ResultMap::iterator iter = results.find(_result->getId());
     210  ResultMap::iterator iter = results.find(id);
    201211  ASSERT(iter != results.end(),
    202       "FragmentQueue::pushResult() - job "+toString(_result->getId())+" is not known to us.");
     212      "FragmentQueue::pushResult() - job "+toString(id)+" is not known to us.");
    203213  ASSERT(*iter->second == *NoResultQueued,
    204       "FragmentQueue::pushResult() - is not waiting for the result of job "+toString(_result->getId())+".");
    205   /// and overwrite NoResult in found entry
    206   iter->second = _result;
    207 }
    208 
     214      "FragmentQueue::pushResult() - is not waiting for the result of job "+toString(id)+".");
     215  // check whether this is a resubmitted job
     216  AttemptsMap::iterator attemptiter = attempts.find(id);
     217  // check whether succeeded or (finally) failed
     218  if ((_result->exitflag == 0) || ((attemptiter != attempts.end()) && (attemptiter->second >= Max_Attempts))) {
     219    // give notice if it is resubmitted job
     220    if (attemptiter != attempts.end()) {
     221      if (attemptiter->second >= Max_Attempts)
     222        ELOG(1, "Job #" << id << " failed on " << Max_Attempts << "th attempt for the last time.");
     223      else
     224        LOG(1, "INFO: Job #" << id << " succeeded on " << attemptiter->second << "th attempt.");
     225    }
     226    // remove in attempts
     227    if (attemptiter != attempts.end())
     228      attempts.erase(attemptiter);
     229    // remove in backup map
     230    BackupMap::iterator backupiter = backup.find(id);
     231    ASSERT( backupiter != backup.end(),
     232        "FragmentQueue::pushResult() - cannot find job "+toString(id)
     233        +" in backup.");
     234    backup.erase(backupiter);
     235    /// and overwrite NoResult in found entry
     236    iter->second = _result;
     237  } else {
     238    LOG(1, "Job " << id << " failed, resubmitting.");
     239    // increase attempts
     240    if (attemptiter != attempts.end())
     241      ++(attemptiter->second);
     242    else
     243      attempts.insert( std::make_pair(id, (size_t)1) );
     244    // resubmit job
     245    resubmitJob(id);
     246  }
     247}
     248
     249/** Resubmit a job which a worker failed to calculate.
     250 *
     251 * @param jobid id of the failed job
     252 */
     253void FragmentQueue::resubmitJob(const JobId_t jobid)
     254{
     255  BackupMap::iterator iter = backup.find(jobid);
     256  ASSERT( iter != backup.end(),
     257      "FragmentQueue::resubmitJob() - job id "+toString(jobid)
     258      +" not stored in backup.");
     259  if (iter != backup.end()) {
     260    // remove result
     261    ResultMap::iterator resiter = results.find(jobid);
     262    ASSERT( resiter != results.end(),
     263        "FragmentQueue::resubmitJob() - resubmitting job "+toString(jobid)
     264        +" for which no result is present.");
     265    results.erase(resiter);
     266    pushJob(iter->second);
     267    backup.erase(iter);
     268  }
     269}
     270
  • src/Fragmentation/Automation/FragmentQueue.hpp

    r559385 rfe95b7  
    4040  FragmentJob::ptr popJob();
    4141  bool isJobPresent() const;
     42  void resubmitJob(const JobId_t jobid);
    4243
    4344  // querying for results
     
    5960  //!> result that takes place in ResultQueue after real result has been delivered.
    6061  static FragmentResult::ptr ResultDelivered;
     62  //!> maximum number of attempts before job is not resubmitted
     63  static size_t Max_Attempts;
    6164
    6265  typedef std::deque<FragmentJob::ptr> JobQueue;
     66  typedef std::map<JobId_t, FragmentJob::ptr> BackupMap;
    6367  typedef std::map<JobId_t, FragmentResult::ptr> ResultMap;
     68  typedef std::map<JobId_t, size_t> AttemptsMap;
    6469
    6570  //!> queue for all jobs
    6671  JobQueue jobs;
     72  //!> backup map for popped jobs that workers failed to calculate
     73  BackupMap backup;
    6774  //!> map for all results that have been polled by the server
    6875  ResultMap results;
     76  //!> map for all jobs that had to be resubmitted storing how often this has been tried
     77  AttemptsMap attempts;
    6978};
    7079
  • src/Fragmentation/Automation/unittests/FragmentQueueUnitTest.cpp

    r559385 rfe95b7  
    294294}
    295295
     296/** Unit test for resubmitJob().
     297 *
     298 */
     299void FragmentQueueTest::resubmitTest()
     300{
     301  FragmentJob::ptr testJob(new FragmentJobStub(1));
     302
     303  // push a Job into queue
     304#ifndef NDEBUG
     305  CPPUNIT_ASSERT_NO_THROW( queue->pushJob(testJob) );
     306#else
     307  queue->pushJob(testJob);
     308#endif
     309  CPPUNIT_ASSERT_EQUAL((size_t)1, queue->jobs.size());
     310  CPPUNIT_ASSERT_EQUAL((size_t)1, queue->results.size());
     311
     312  // pop the job
     313  // pop both as if some work was being done
     314#ifndef NDEBUG
     315  CPPUNIT_ASSERT_NO_THROW( queue->popJob() );
     316#else
     317  queue->popJob();
     318#endif
     319  CPPUNIT_ASSERT_EQUAL((size_t)0, queue->jobs.size());
     320  CPPUNIT_ASSERT_EQUAL((size_t)1, queue->results.size());
     321
     322  // resubmit
     323#ifndef NDEBUG
     324  CPPUNIT_ASSERT_NO_THROW( queue->resubmitJob((JobId_t)1) );
     325#else
     326  queue->resubmitJob((JobId_t)1);
     327#endif
     328
     329  // check whethers it's present again
     330  CPPUNIT_ASSERT_EQUAL((size_t)1, queue->jobs.size());
     331  CPPUNIT_ASSERT_EQUAL((size_t)1, queue->results.size());
     332}
     333
  • src/Fragmentation/Automation/unittests/FragmentQueueUnitTest.hpp

    r559385 rfe95b7  
    2828    CPPUNIT_TEST ( ResultsTest );
    2929    CPPUNIT_TEST ( AllResultsTest );
     30    CPPUNIT_TEST ( resubmitTest );
    3031    CPPUNIT_TEST_SUITE_END();
    3132
     
    3738      void ResultsTest();
    3839      void AllResultsTest();
     40      void resubmitTest();
    3941
    4042private:
Note: See TracChangeset for help on using the changeset viewer.