diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d8ab9e..852f7cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ option(KR_ENABLE_VELOC "use VeloC backend for automatic checkpointing" ON) option(KR_ENABLE_STDFILE "use StdFile backend for automatic checkpointing" OFF) option(KR_ENABLE_MAGISTRATE "use Magistrate for serializing and deserializing" OFF) +option(KR_ENABLE_RESILIENT_EXEC "enable resilient execution spaces" OFF) include(CMakeDependentOption) diff --git a/examples/benchmark_multiviews.cpp b/examples/benchmark_multiviews.cpp index 8c622c5..2491193 100644 --- a/examples/benchmark_multiviews.cpp +++ b/examples/benchmark_multiviews.cpp @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { wtime = MPI_Wtime(); std::size_t i = 1 + KokkosResilience::latest_version(*ctx, "test_kokkos"); - while(i < nsteps) { + while(i < nsteps ) { KokkosResilience::checkpoint(*ctx, "test_kokkos", i, [=]() { // Nic, tell me what should I put for []/ diff --git a/src/resilience/AutomaticCheckpoint.hpp b/src/resilience/AutomaticCheckpoint.hpp index ad1bff1..798f466 100644 --- a/src/resilience/AutomaticCheckpoint.hpp +++ b/src/resilience/AutomaticCheckpoint.hpp @@ -50,6 +50,7 @@ #include #include "view_hooks/ViewHolder.hpp" #include "view_hooks/DynamicViewHooks.hpp" +#include "registration/ViewHolder.hpp" #include "context/ContextBase.hpp" @@ -117,7 +118,7 @@ namespace KokkosResilience //Figure out how we should be handling this bool recover_region = false, checkpoint_region = false; - if(last_region != regions.end() && last_region.label() == label) { + if(last_region.iter() != regions.end() && last_region.label() == label) { active_region = last_region; } else { active_region = regions.insert({label, {}}).first; diff --git a/src/resilience/CMakeLists.txt b/src/resilience/CMakeLists.txt index 3dd1e57..679edb6 100644 --- a/src/resilience/CMakeLists.txt +++ b/src/resilience/CMakeLists.txt @@ -23,11 +23,11 @@ endif() add_subdirectory(view_hooks) -if (KR_CUDA_EXEC_SPACE) +if (KR_ENABLE_CUDA_EXEC_SPACE) add_subdirectory(cuda) endif() -if (KR_OPENMP_EXEC_SPACE) +if (KR_ENABLE_OPENMP_EXEC_SPACE) add_subdirectory(openMP) endif() diff --git a/src/resilience/context/ContextBase.hpp b/src/resilience/context/ContextBase.hpp index 72465f8..ee67b1f 100644 --- a/src/resilience/context/ContextBase.hpp +++ b/src/resilience/context/ContextBase.hpp @@ -45,6 +45,7 @@ #if defined(KOKKOS_ENABLE_HPX) #include #endif + #include #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #include "resilience/Config.hpp" #include "resilience/CheckpointFilter.hpp" @@ -59,6 +61,10 @@ #include "resilience/view_hooks/ViewHolder.hpp" #include "resilience/util/Trace.hpp" +#ifdef KR_ENABLE_MAGISTRATE +#include "../registration/Magistrate.hpp" +#endif + namespace KokkosResilience { class ContextBase @@ -69,11 +75,11 @@ namespace KokkosResilience virtual ~ContextBase() {}; template - void run(const std::string& label, int iteration, RegionFunc&& fun, FilterFunc&& filter, + void run(const std::string& label, int iteration, RegionFunc&& fun, FilterFunc&& filter, Detail::RegInfo&... explicit_members); template - void run(const std::string& label, int iteration, RegionFunc&& fun, + void run(const std::string& label, int iteration, RegionFunc&& fun, Detail::RegInfo&... explicit_members) { run(label, iteration, std::forward(fun), default_filter(), explicit_members...); } @@ -93,7 +99,7 @@ namespace KokkosResilience virtual void register_members(const std::set< KokkosResilience::Registration > &members) { for(auto& member : members) register_member(member); }; - + //Registers to the active region, requires an active region. template void register_to_active(T& member, const std::string& label = ""){ @@ -103,11 +109,11 @@ namespace KokkosResilience //Registers only if in an active region. template bool register_if_active(T& member, const std::string& label){ - if(active_region == regions.end()) return false; + if(active_region.iter() == regions.end()) return false; register_to_active(member, label); return true; } - + template void register_globally(T& member, const std::string& label){ global_members.insert(impl_register(member, label)); @@ -127,30 +133,41 @@ namespace KokkosResilience //Pointer not guaranteed to remain valid, use immediately & discard. char* get_buffer(size_t minimum_size); - + template void register_to_active(const ViewHolder& view){ Registration registration = create_registration>(*this, view); register_member(registration); //Virtual function to whatever inheriting class - active_region.insert(registration); + active_region.insert(registration); } protected: using RegionsMap = std::unordered_map>; - struct Region : RegionsMap::iterator { - Region(RegionsMap::iterator iter) : RegionsMap::iterator(iter) {}; + class Region + { + public: + + using map_iterator = RegionsMap::iterator; + + Region(map_iterator iter) : m_map_iterator(iter) {}; const std::string& label() const { - return (*this)->first; + return m_map_iterator->first; } - const std::set members() const { - return (*this)->second; + const std::set &members() const { + return m_map_iterator->second; } std::set& members(){ - return (*this)->second; + return m_map_iterator->second; } - void insert(Registration& member){ + void insert(const Registration& member){ members().insert(member); } + + auto iter() { return m_map_iterator; } + + private: + + map_iterator m_map_iterator; }; //Create Registration and register to implementation @@ -161,12 +178,12 @@ namespace KokkosResilience register_member(registration); //Virtual function to whatever inheriting class return registration; } - + template void register_to_active(Detail::RegInfo& info){ register_to_active(info.member, info.label); } - + private: //Detect views being copied in, register them and any explicitly-listed members. template @@ -183,10 +200,10 @@ namespace KokkosResilience RegionsMap regions; Region active_region = regions.end(); - + //Performance helper Region last_region = regions.end(); - + std::set global_members; public: diff --git a/src/resilience/registration/Magistrate.hpp b/src/resilience/registration/Magistrate.hpp index 8b5340c..443782d 100644 --- a/src/resilience/registration/Magistrate.hpp +++ b/src/resilience/registration/Magistrate.hpp @@ -1,22 +1,30 @@ +#ifndef INC_RESILIENCE_MAGISTRATE_HPP +#define INC_RESILIENCE_MAGISTRATE_HPP + #ifdef KR_ENABLE_MAGISTRATE #include "resilience/registration/Registration.hpp" #include "resilience/view_hooks/ViewHolder.hpp" #include +#include + +namespace KokkosResilience { + class ContextBase; +} namespace KokkosResilience::Detail { struct Checkpoint_Trait {}; //Registration for some type which Magistrate knows how to checkpoint. - template + template < typename MemberType, typename... Traits > struct MagistrateRegistration : public RegistrationBase { MagistrateRegistration() = delete; - - MagistrateRegistration(MemberType& member, std::string name) + + MagistrateRegistration(MemberType& member, std::string name) : RegistrationBase(name), m_member(member) {} const serializer_t serializer() const override{ @@ -39,13 +47,13 @@ namespace KokkosResilience::Detail { const bool is_same_reference(const Registration& other_reg) const override{ auto other = dynamic_cast(other_reg.get()); - + if(!other){ //We wouldn't expect this to happen, and it may indicate a hash collision fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str()); return false; } - + return &m_member == &other->m_member; } @@ -63,7 +71,7 @@ namespace KokkosResilience { T, std::tuple, std::enable_if_t< - checkpoint::SerializableTraits::is_traversable + checkpoint::SerializableTraits>::is_traversable >* > { using BaseT = Detail::MagistrateRegistration; @@ -72,7 +80,7 @@ namespace KokkosResilience { create_registration(ContextBase& ctx, T& member, std::string label) : reg(std::make_shared(member, label)) {}; - auto get() { + auto get() && { return std::move(reg); } }; @@ -80,3 +88,5 @@ namespace KokkosResilience { #endif + +#endif // INC_RESILIENCE_MAGISTRATE_HPP diff --git a/src/resilience/registration/Registration.hpp b/src/resilience/registration/Registration.hpp index c55e835..de19073 100644 --- a/src/resilience/registration/Registration.hpp +++ b/src/resilience/registration/Registration.hpp @@ -8,9 +8,9 @@ #include namespace KokkosResilience -{ +{ struct Registration; - + namespace Detail { struct RegistrationBase { typedef std::function serializer_t; @@ -24,7 +24,7 @@ namespace KokkosResilience virtual const serializer_t serializer() const = 0; virtual const deserializer_t deserializer() const = 0; virtual const bool is_same_reference(const Registration&) const = 0; - + bool operator==(const RegistrationBase& other) const { return this->name == other.name; } @@ -39,14 +39,14 @@ namespace KokkosResilience } return static_cast(hash%INT_MAX); } - + protected: - RegistrationBase(const std::string member_name) : + RegistrationBase(const std::string member_name) : name(member_name) { } }; - - - //Helper for explicitly-listing data that a + + + //Helper for explicitly-listing data that a //checkpoint region should also use. template struct RegInfo { @@ -56,28 +56,28 @@ namespace KokkosResilience }; } - + template, typename enable = void*> struct create_registration; struct Registration : public std::shared_ptr { using serializer_t = typename Detail::RegistrationBase::serializer_t; using deserializer_t = typename Detail::RegistrationBase::deserializer_t; - + template - Registration(std::shared_ptr base) + Registration(std::shared_ptr base) : std::shared_ptr(std::move(base)) {} template - Registration(create_registration reg) - : Registration(reg.get()) {}; + Registration(create_registration reg) + : Registration(std::move(reg).get()) {}; const size_t hash() const { return (*this)->hash(); } bool operator==(const Registration& other){ - return this->get() == other.get(); + return this->get() == other.get(); } }; } //namespace KokkosResilience diff --git a/src/resilience/registration/ViewHolder.hpp b/src/resilience/registration/ViewHolder.hpp index 86bcb80..1b23a31 100644 --- a/src/resilience/registration/ViewHolder.hpp +++ b/src/resilience/registration/ViewHolder.hpp @@ -1,3 +1,6 @@ +#ifndef INC_RESILIENCE_REGISTRATION_VIEWHOLDER_HPP +#define INC_RESILIENCE_REGISTRATION_VIEWHOLDER_HPP + #include "Registration.hpp" #include "resilience/view_hooks/ViewHolder.hpp" #include "resilience/context/ContextBase.hpp" @@ -6,14 +9,14 @@ namespace KokkosResilience::Detail { struct ViewHolderRegistration : public RegistrationBase { ViewHolderRegistration() = delete; - ViewHolderRegistration(ContextBase& ctx, const KokkosResilience::ViewHolder& view) : + ViewHolderRegistration(ContextBase& ctx, const KokkosResilience::ViewHolder& view) : RegistrationBase(view->label()), m_view(view), m_ctx(ctx) {}; const serializer_t serializer() const override{ return [&, this](std::ostream& stream){ size_t buffer_size = need_buffer ? m_view->data_type_size()*m_view->span() : 0; char* buf = m_ctx.get_buffer(buffer_size); - + m_view->serialize(stream, buf); return stream.good(); }; @@ -23,7 +26,7 @@ struct ViewHolderRegistration : public RegistrationBase { return [&, this](std::istream& stream){ size_t buffer_size = need_buffer ? m_view->data_type_size()*m_view->span() : 0; char* buf = m_ctx.get_buffer(buffer_size); - + m_view->deserialize(stream, buf); return stream.good(); }; @@ -31,18 +34,18 @@ struct ViewHolderRegistration : public RegistrationBase { const bool is_same_reference(const Registration& other_reg) const override{ auto other = dynamic_cast(other_reg.get()); - + if(!other){ //We wouldn't expect this to happen, and it may indicate a hash collision fprintf(stderr, "KokkosResilience: Warning, member name %s is shared by more than 1 registration type\n", name.c_str()); return false; } - - //Handle subviews! We want to checkpoint the largest view/subview, so report that the other is + + //Handle subviews! We want to checkpoint the largest view/subview, so report that the other is //the same reference if they're a subset of me. // //TODO: This currently assumes the two views are equal or subviews (ie no name collisions), - // and that a larger data() pointer implies a subview (ie we can deal well with subviews of + // and that a larger data() pointer implies a subview (ie we can deal well with subviews of // subviews, but not two different subviews of the same view). Does Kokkos expose anything // that can help with this? return m_view->data() <= other->m_view->data(); @@ -51,7 +54,7 @@ struct ViewHolderRegistration : public RegistrationBase { private: const KokkosResilience::ViewHolder m_view; - const bool need_buffer = + const bool need_buffer = #ifdef KR_ENABLE_MAGISTRATE false; #else @@ -68,7 +71,7 @@ namespace KokkosResilience { using RegT = Detail::ViewHolderRegistration; std::shared_ptr reg; - create_registration(ContextBase& ctx, const KokkosResilience::ViewHolder& view, std::string unused = "") + create_registration(ContextBase& ctx, const KokkosResilience::ViewHolder& view, std::string unused = "") : reg(std::make_shared(ctx, view)) {}; auto get() { @@ -76,3 +79,5 @@ namespace KokkosResilience { } }; } + +#endif diff --git a/src/resilience/view_hooks/ViewHolder.hpp b/src/resilience/view_hooks/ViewHolder.hpp index d3d2960..d1c820e 100644 --- a/src/resilience/view_hooks/ViewHolder.hpp +++ b/src/resilience/view_hooks/ViewHolder.hpp @@ -252,7 +252,7 @@ class ViewHolderImpl : public ViewHolderImplBase { stream.read(buf, data_type_size() * span()); deep_copy_from_buffer((const unsigned char*)buf); } else { - stream.read(data(), data_type_size() * span()); + stream.read(static_cast< char * >( data() ), data_type_size() * span()); } } #endif diff --git a/tests/TestLambdaCapture.cpp b/tests/TestLambdaCapture.cpp index a098b90..a1fdbd6 100644 --- a/tests/TestLambdaCapture.cpp +++ b/tests/TestLambdaCapture.cpp @@ -53,7 +53,6 @@ auto get_view_list( F &&_fun ) auto f = _fun; - KokkosResilience::Detail::Cref::check_ref_list = nullptr; KokkosResilience::DynamicViewHooks::copy_constructor_set.reset(); f(); diff --git a/tests/TestVelocMemoryBackend.cpp b/tests/TestVelocMemoryBackend.cpp index 8f68755..3d9c601 100644 --- a/tests/TestVelocMemoryBackend.cpp +++ b/tests/TestVelocMemoryBackend.cpp @@ -42,7 +42,7 @@ #include #include -#include +#include #include #include