diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21951fd7ea1..8a7bae7687f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -237,15 +237,19 @@ if (LBANN_WITH_DISTCONV)
   find_package(DiHydrogen 0.3.0 CONFIG REQUIRED
     COMPONENTS Meta Patterns DistConv)
   set(LBANN_HAS_DISTCONV TRUE)
   set(LBANN_H2_LIBS
+    H2::H2Core
     H2::H2Meta
     H2::H2Patterns
     H2::H2DistConv)
 else ()
   find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns)
   set(LBANN_H2_LIBS
+    H2::H2Core
     H2::H2Meta
     H2::H2Patterns)
 endif ()
+#FIXME(KLG): There is no H2CoreConfig.cmake in H2
+#find_package(H2Core REQUIRED)
 set(LBANN_HAS_DIHYDROGEN TRUE)
 
 message(STATUS "Found DiHydrogen: ${DiHydrogen_DIR}")
@@ -660,6 +664,7 @@ target_link_libraries(lbann PUBLIC
   ${CLARA_LIBRARIES}
   ${LBANN_PYTHON_LIBS}
   protobuf::libprotobuf
+  spdlog::spdlog
   ${CEREAL_LIBRARIES}
   ZSTR::ZSTR)
 
diff --git a/docs/callbacks/mlperf_logging.rst b/docs/callbacks/mlperf_logging.rst
new file mode 100644
index 00000000000..4fdbb4c50b0
--- /dev/null
+++ b/docs/callbacks/mlperf_logging.rst
@@ -0,0 +1,69 @@
+.. role:: python(code)
+   :language: python
+
+.. role:: c(code)
+   :language: c
+
+.. _mlperf-logging-callback:
+
+============================================================
+MLPerf Logging Callback
+============================================================
+
+The MLPerf callback exports an MLPerf-compatible log for running
+benchmarks on LBANN. The logging output is written to the out.log
+file located in the LBANN run directory.
+
+---------------------------------------------
+Execution Points
+---------------------------------------------
+
++ setup
++ on setup end
++ on epoch begin
++ on epoch end
++ on train begin
++ on train end
++ on batch evaluate begin
++ on batch evaluate end
+
+.. _callback-arguments:
+
+---------------------------------------------
+Callback Arguments
+---------------------------------------------
+
+ .. note:: While technically optional, omitting arguments will
+           result in "UNKNOWN_" appearing in the log
+           results (with the exception of sub_org).
+
+ :sub_benchmark: (``string``) Benchmark name. A list of benchmarks
+                 can be found in the MLPerf Benchmarks Suite.
+
+ :sub_org: (``string``, optional) Organization running the
+           benchmarks. Default: `LBANN`.
+
+ :sub_division: (``string``) Closed or open division. See the MLPerf
+                documentation on divisions.
+
+ :sub_status: (``string``) Submission status: onprem, cloud, or
+              preview.
+
+ :sub_platform: (``string``) Submission platform/hardware.
diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp
new file mode 100644
--- /dev/null
+++ b/include/lbann/callbacks/mlperf_logging.hpp
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//
+// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED
+#define LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED
+
+#include "lbann/callbacks/callback.hpp"
+#include <h2/utils/Logger.hpp>
+
+namespace lbann_data {
+class Callback;
+}
+
+namespace lbann {
+namespace callback {
+
+/** @class mlperf_logging
+ * @brief Callback to print mlperf compliant benchmark logs
+ */
+class mlperf_logging : public callback_base {
+
+public:
+
+  enum class event_type {
+    TIME_POINT,
+    INT_START,
+    INT_END,
+  };
+
+public:
+
+  /** @brief mlperf_logging Constructor.
+   * @param string sub_benchmark Name of benchmark.
+   * @param string sub_org Name of submission organization (Default: LBANN)
+   * @param string sub_division Division of benchmark suite (open or closed)
+   * @param string sub_status Submission status (onprem, cloud, or preview)
+   * @param string sub_platform Submission platform/hardware
+   */
+  mlperf_logging(std::string sub_benchmark, std::string sub_org,
+                 std::string sub_division, std::string sub_status,
+                 std::string sub_platform)
+    : callback_base(/*batch_interval=*/1),
+      m_sub_benchmark{sub_benchmark.size() ?
+                      std::move(sub_benchmark) :
+                      std::string("UNKNOWN_SUBMISSION_BENCHMARK")},
+      m_sub_org{sub_org.size() ?
+                std::move(sub_org) :
+                std::string("LBANN")},
+      m_sub_division{sub_division.size() ?
+                     std::move(sub_division) :
+                     std::string("UNKNOWN_SUBMISSION_DIVISION")},
+      m_sub_status{sub_status.size() ?
+                   std::move(sub_status) :
+                   std::string("UNKNOWN_SUBMISSION_STATUS")},
+      m_sub_platform{sub_platform.size() ?
+                     std::move(sub_platform) :
+                     std::string("UNKNOWN_SUBMISSION_PLATFORM")}
+  {}
+
+  /** @brief Copy interface */
+  mlperf_logging* copy() const override {
+    return new mlperf_logging(*this);
+  }
+
+  /** @brief Return name of callback */
+  std::string name() const override { return "mlperf_logging"; }
+
+  /** @brief Push mlperf formatted log string to stream object.
+   * @param ostringstream os Stores log strings.
+   * @param event_type et Type of mlperf style event.
+   * @param string key Mlperf log key.
+   * @param T value Mlperf log value.
+   * @param char const* file Current file name.
+   * @param size_t line File line number.
+   * @param double epoch Current epoch number.
+   */
+  template <typename T>
+  void print(std::ostringstream& os, mlperf_logging::event_type et,
+             std::string key, T value, char const* file, size_t line,
+             double epoch = -1) const;
+
+  void setup(model *m) override;
+  void on_setup_end(model *m) override;
+  void on_epoch_begin(model *m) override;
+  void on_epoch_end(model *m) override;
+  void on_train_begin(model *m) override;
+  void on_train_end(model *m) override;
+  void on_batch_evaluate_begin(model *m) override;
+  void on_batch_evaluate_end(model *m) override;
+
+private:
+
+  void write_specific_proto(lbann_data::Callback& proto) const final;
+
+  /** @brief Populate log with mlperf event type.
+   * @param ostringstream os Stores log string.
+   * @param event_type et Type of mlperf style event.
+   */
+  void print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const;
+
+  /** @brief Populate log with value.
+   * @param ostringstream os Stores log string.
+   * @param event_type et Mlperf log value.
+   */
+
+  static size_t get_ms_since_epoch();
+
+private:
+
+  std::string m_sub_benchmark;
+  std::string m_sub_org;
+  std::string m_sub_division;
+  std::string m_sub_status;
+  std::string m_sub_platform;
+  /* @brief name of output file. Default = results.txt */
+  //std::string m_output_filename;
+  /* @brief DiHydrogen logger */
+  mutable h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"};
+
+
+}; // class mlperf_logging
+
+std::unique_ptr<callback_base>
+build_mlperf_logging_callback_from_pbuf(
+  const google::protobuf::Message& proto_msg,
+  const std::shared_ptr<lbann_summary>&);
+
+} // namespace callback
+} // namespace lbann
+
+#endif // LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED
diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt
index a423c3eb315..8b6713f966d 100644
--- a/src/callbacks/CMakeLists.txt
+++ b/src/callbacks/CMakeLists.txt
@@ -52,6 +52,7 @@ set_full_path(THIS_DIR_SOURCES
   load_model.cpp
   ltfb.cpp
   mixup.cpp
+  mlperf_logging.cpp
   monitor_io.cpp
   perturb_adam.cpp
   perturb_dropout.cpp
diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp
new file mode 100644
index 00000000000..adc752df35f
--- /dev/null
+++ b/src/callbacks/mlperf_logging.cpp
@@ -0,0 +1,341 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//
+// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/callbacks/mlperf_logging.hpp"
+#include "lbann/metrics/metric.hpp"
+#include "lbann/weights/weights.hpp"
+#include "lbann/trainers/trainer.hpp"
+#include "lbann/models/model.hpp"
+#include "lbann/data_coordinator/data_coordinator.hpp"
+#include "lbann/execution_algorithms/sgd_execution_context.hpp"
+#include "lbann/optimizers/optimizer.hpp"
+
+#include "lbann/proto/callbacks.pb.h"
+
+#include <chrono>
+#include <cstdlib>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+
+namespace lbann {
+namespace callback {
+
+namespace {
+void print_value(std::ostringstream& os, int value)
+{
+  os << value;
+}
+void print_value(std::ostringstream& os, double value)
+{
+  os << value;
+}
+void print_value(std::ostringstream& os, long value)
+{
+  os << value;
+}
+void print_value(std::ostringstream& os, size_t value)
+{
+  os << value;
+}
+void print_value(std::ostringstream& os, std::string const& value)
+{
+  os << "\"" << value << "\"";
+}
+void print_value(std::ostringstream& os, char const* value)
+{
+  os << "\"" << value << "\"";
+}
+template <typename T>
+void print_value(std::ostringstream& os, T value)
+{
+  //FIXME: Should I push the value anyway?
+ os << "\"UNKNOWN_DATA_TYPE\""; +} + +//FIXME: Tom's problem +int get_real_num_accelerators() +{ + return 0; +} + +int get_num_nodes() +{ + if (std::getenv("SLURM_NNODES")) + return atoi(std::getenv("SLURM_NNODES")); + else if (std::getenv("FLUX_JOB_NNODES")) + return atoi(std::getenv("FLUX_JOB_NNODES")); + else return -1; + //FIXME: count number of unique hostnames in universe? +} +}// namespace + +void mlperf_logging::write_specific_proto(lbann_data::Callback& proto) const +{ + auto* msg = proto.mutable_mlperf_logging(); + msg->set_sub_benchmark(m_sub_benchmark); + msg->set_sub_org(m_sub_org); + msg->set_sub_division(m_sub_division); + msg->set_sub_status(m_sub_status); + msg->set_sub_platform(m_sub_platform); +} + +template +void mlperf_logging::print(std::ostringstream& os, + mlperf_logging::event_type et, std::string key, + T value, char const* file, size_t line, + double epoch) const +{ + os << "{" + << "\"namespace\": \"\", " + << "\"time_ms\": " << get_ms_since_epoch() << ", " + << "\"event_type\": \""; + print_event_type(os, et); + + os << "\", " + << "\"key\": \"" << key << "\", " + << "\"value\": "; + print_value(os, value); + os << ", " + << "\"metadata\": {\"file\": \"" << file << "\", " + << "\"lineno\": " << line; + if(epoch < 0) + os << "}}"; + else + os << ", " << "\"epoch_num\": " << epoch << "}}"; + + m_logger.get().info(os.str()); + os.flush(); +} + +void mlperf_logging::print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const +{ + switch (et) { + case mlperf_logging::event_type::TIME_POINT: os << "POINT_IN_TIME"; break; + case mlperf_logging::event_type::INT_START: os << "INTERVAL_START"; break; + case mlperf_logging::event_type::INT_END: os << "INTERVAL_END"; break; + default: os << "INVALID_EVENT_TYPE"; break; + } +} + +size_t mlperf_logging::get_ms_since_epoch() +{ + using namespace std::chrono; + return duration_cast< milliseconds >( + system_clock::now().time_since_epoch()).count(); +} + +void mlperf_logging::setup(model *m) +{ + std::ostringstream os; + + // Not a good/portable way to do this in C++ + // std::string value = "null"; + // print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, + // __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", + m_sub_benchmark, __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "submission_org", + m_sub_org, __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "submission_division", + m_sub_division, __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "submission_status", + m_sub_status, __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", + m_sub_platform, __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::INT_START, "init_start", "null", + __FILE__, __LINE__); +} +void mlperf_logging::on_setup_end(model *m) +{ + std::ostringstream os; + lbann_comm *comm = m->get_comm(); + auto const& trainer = get_const_trainer(); + + print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", + static_cast(comm->get_procs_in_world()), __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", + static_cast(get_num_nodes()), __FILE__, __LINE__); + + auto accelerators = get_real_num_accelerators(); + print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", + static_cast(accelerators), __FILE__, __LINE__); + + auto const seed = trainer.get_random_seed(); + print(os, 
+        seed, __FILE__, __LINE__);
+
+  auto const& dc = trainer.get_data_coordinator();
+  auto const batch_size = dc.get_global_mini_batch_size(
+    execution_mode::training);
+  print(os, mlperf_logging::event_type::TIME_POINT, "global_batch_size",
+        batch_size, __FILE__, __LINE__);
+
+  auto samples = dc.get_total_num_samples(execution_mode::training);
+  print(os, mlperf_logging::event_type::TIME_POINT, "train_samples",
+        samples, __FILE__, __LINE__);
+
+  //FIXME: Should this be execution_mode::validation? Tom thinks no
+  auto eval_samples = dc.get_total_num_samples(execution_mode::testing);
+  print(os, mlperf_logging::event_type::TIME_POINT, "eval_samples",
+        eval_samples, __FILE__, __LINE__);
+
+  auto const weights = m->get_weights();
+  for (auto const w : weights)
+    if( w->get_optimizer() != nullptr ){
+      std::string opt = w->get_optimizer()->get_type();
+      print(os, mlperf_logging::event_type::TIME_POINT, "opt_name",
+            opt, __FILE__, __LINE__);
+
+      auto opt_learning_rate = w->get_optimizer()->get_learning_rate();
+      print(os, mlperf_logging::event_type::TIME_POINT,
+            "opt_base_learning_rate", static_cast<double>(opt_learning_rate),
+            __FILE__, __LINE__);
+      break;
+    }
+
+  // LBANN does not perform warmup steps.
+  // auto opt_warmup_steps = -1;
+  // print(os, mlperf_logging::event_type::TIME_POINT,
+  //       "opt_learning_rate_warmup_steps",
+  //       static_cast<size_t>(opt_warmup_steps),
+  //       __FILE__, __LINE__);
+
+  // auto opt_warmup_factor = -1;
+  // print(os, mlperf_logging::event_type::TIME_POINT,
+  //       "opt_learning_rate_warmup_factor",
+  //       static_cast<double>(opt_warmup_factor),
+  //       __FILE__, __LINE__);
+
+  // FIXME (Tom's problem)
+  //auto opt_decay_bound_steps = -1;
+  //print(os, mlperf_logging::event_type::TIME_POINT,
+  //      "opt_learning_rate_decay_boundary_steps",
+  //      static_cast<size_t>(opt_decay_bound_steps),
+  //      __FILE__, __LINE__);
+
+  // auto opt_decay_factor = -1;
+  // print(os, mlperf_logging::event_type::TIME_POINT,
+  //       "opt_learning_rate_decay_factor",
+  //       static_cast<double>(opt_decay_factor),
+  //       __FILE__, __LINE__);
+
+  print(os, mlperf_logging::event_type::INT_END, "init_stop", "null",
+        __FILE__, __LINE__);
+}
+
+void mlperf_logging::on_epoch_begin(model *m)
+{
+  std::ostringstream os;
+  const auto& epoch = static_cast<const SGDExecutionContext&>(
+    m->get_execution_context()).get_epoch();
+
+  print(os, mlperf_logging::event_type::INT_START, "epoch_start", "null",
+        __FILE__, __LINE__, epoch);
+}
+
+void mlperf_logging::on_epoch_end(model *m)
+{
+  std::ostringstream os;
+  const auto& epoch = static_cast<const SGDExecutionContext&>(
+    m->get_execution_context()).get_epoch();
+
+  print(os, mlperf_logging::event_type::INT_END, "epoch_stop", "null",
+        __FILE__, __LINE__, epoch);
+}
+
+void mlperf_logging::on_train_begin(model *m)
+{
+  std::ostringstream os;
+  const auto& epoch = static_cast<const SGDExecutionContext&>(
+    m->get_execution_context()).get_epoch();
+
+  print(os, mlperf_logging::event_type::INT_START, "run_start", "null",
+        __FILE__, __LINE__, epoch);
+}
+
+void mlperf_logging::on_train_end(model *m)
+{
+  std::ostringstream os;
+  const auto& epoch = static_cast<const SGDExecutionContext&>(
+    m->get_execution_context()).get_epoch();
+
+  print(os, mlperf_logging::event_type::INT_END, "run_stop", "null",
+        __FILE__, __LINE__, epoch);
+}
+
+void mlperf_logging::on_batch_evaluate_begin(model *m)
+{
+  std::ostringstream os;
+  const auto& epoch = static_cast<const SGDExecutionContext&>(
+    m->get_execution_context()).get_epoch();
+
+  print(os, mlperf_logging::event_type::INT_START, "eval_start", "null",
+        __FILE__, __LINE__, epoch);
+}
+
+void mlperf_logging::on_batch_evaluate_end(model *m)
+{
+  std::ostringstream os;
+  const auto& epoch = static_cast<const SGDExecutionContext&>(
+    m->get_execution_context()).get_epoch();
+
+  print(os, mlperf_logging::event_type::INT_END, "eval_stop", "null",
+        __FILE__, __LINE__, epoch);
+
+  //FIXME (Tom's problem)
+  auto eval_error = -1;
+  print(os, mlperf_logging::event_type::TIME_POINT, "eval_error",
+        static_cast<double>(eval_error), __FILE__,
+        __LINE__, epoch);
+}
+
+std::unique_ptr<callback_base>
+build_mlperf_logging_callback_from_pbuf(
+  const google::protobuf::Message& proto_msg,
+  const std::shared_ptr<lbann_summary>&)
+{
+  const auto& params =
+    dynamic_cast<const lbann_data::CallbackMlperfLogging&>(proto_msg);
+  return std::make_unique<mlperf_logging>(params.sub_benchmark(),
+                                          params.sub_org(),
+                                          params.sub_division(),
+                                          params.sub_status(),
+                                          params.sub_platform());
+  //params.num_nodes());
+}
+} // namespace callback
+} // namespace lbann
diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto
index 71fee45c9ac..b75a19bb17d 100644
--- a/src/proto/callbacks.proto
+++ b/src/proto/callbacks.proto
@@ -86,7 +86,8 @@ message Callback {
     CallbackPerturbWeights perturb_weights = 52;
     CallbackExportOnnx export_onnx = 53;
     CallbackAlternateUpdates alternate_updates = 54;
-  }
+    CallbackMlperfLogging mlperf_logging = 56;
+}
 
 message CallbackLTFB {
   int64 batch_interval = 1;
@@ -433,8 +434,17 @@
   /** @brief Export trained model in onnx format */
   message CallbackExportOnnx {
-    string output_filename = 1; // name of onnx output file
-    string debug_string_filename = 2; // print debug string to file
+    string output_filename = 1; // Name of onnx output file
+    string debug_string_filename = 2; // Print debug string to file
+  }
+
+  /** @brief Prints mlperf compliant benchmark logs */
+  message CallbackMlperfLogging {
+    string sub_benchmark = 1; // Name of benchmark
+    string sub_org = 2; // Name of submission organization
+    string sub_division = 3; // Open or closed division
+    string sub_status = 4; // Submission status: onprem, cloud, or preview
+    string sub_platform = 5; // Submission platform/hardware
   }
 
   message CallbackAlternateUpdates {
diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp
index 8302cb29840..2ee5a4847d7 100644
--- a/src/proto/factories/callback_factory.cpp
+++ b/src/proto/factories/callback_factory.cpp
@@ -55,6 +55,7 @@
 #include "lbann/callbacks/load_model.hpp"
 #include "lbann/callbacks/ltfb.hpp"
 #include "lbann/callbacks/mixup.hpp"
+#include "lbann/callbacks/mlperf_logging.hpp"
 #include "lbann/callbacks/monitor_io.hpp"
 #include "lbann/callbacks/perturb_adam.hpp"
 #include "lbann/callbacks/perturb_dropout.hpp"
@@ -162,6 +163,8 @@ void register_default_builders(factory_type& factory)
   factory.register_builder("CallbackMinibatchSchedule",
                            build_minibatch_schedule_callback_from_pbuf);
   factory.register_builder("CallbackMixup", build_mixup_callback_from_pbuf);
+  factory.register_builder("CallbackMlperfLogging",
+                           build_mlperf_logging_callback_from_pbuf);
   factory.register_builder(
     "CallbackOptimizerwiseAdaptiveLearningRate",
     build_optimizerwise_adaptive_learning_rate_callback_from_pbuf);
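
A minimal usage sketch, not part of the patch above: it constructs the new callback directly through the constructor declared in mlperf_logging.hpp, which is what build_mlperf_logging_callback_from_pbuf does after extracting the five strings from a CallbackMlperfLogging message. The benchmark and platform strings are illustrative placeholders, and the helper function name is hypothetical.

    // Sketch only: build the MLPerf logging callback with placeholder
    // submission fields. Empty strings fall back to the constructor's
    // "UNKNOWN_*" defaults ("LBANN" for sub_org).
    #include "lbann/callbacks/mlperf_logging.hpp"

    #include <memory>

    std::unique_ptr<lbann::callback::mlperf_logging> make_example_mlperf_logger()
    {
      return std::make_unique<lbann::callback::mlperf_logging>(
        /*sub_benchmark=*/"cosmoflow",       // placeholder benchmark name
        /*sub_org=*/"LBANN",
        /*sub_division=*/"closed",
        /*sub_status=*/"onprem",
        /*sub_platform=*/"example-system");  // placeholder platform string
    }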