Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added callback for mlperf logs #2155

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -237,15 +237,19 @@ if (LBANN_WITH_DISTCONV)
find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv)
set(LBANN_HAS_DISTCONV TRUE)
set(LBANN_H2_LIBS
H2::H2Core
H2::H2Meta
H2::H2Patterns
H2::H2DistConv)
else ()
find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns)
set(LBANN_H2_LIBS
H2::H2Core
H2::H2Meta
H2::H2Patterns)
endif ()
#FIXME(KLG): There is no H2CoreConfig.cmake in H2
#find_package(H2Core REQUIRED)
set(LBANN_HAS_DIHYDROGEN TRUE)
message(STATUS "Found DiHydrogen: ${DiHydrogen_DIR}")

Expand Down Expand Up @@ -660,6 +664,7 @@ target_link_libraries(lbann PUBLIC
${CLARA_LIBRARIES}
${LBANN_PYTHON_LIBS}
protobuf::libprotobuf
spdlog::spdlog
${CEREAL_LIBRARIES}
ZSTR::ZSTR)

Expand Down
69 changes: 69 additions & 0 deletions docs/callbacks/mlperf_logging.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
.. role:: python(code)
:language: python

.. role:: c(code)
:language: c

.. _mlperf-logging-callback:

============================================================
MLPerf Logging Callback
============================================================

The MLPerf callback exports an MLPerf compatible log for running
benchmarks on LBANN. The logging output is included in the out.log
file located in the LBANN run directory.

---------------------------------------------
Execution Points
---------------------------------------------

+ setup
+ on setup end
+ on epoch begin
+ on epoch end
+ on train begin
+ on train end
+ on batch evaluate begin
+ on batch evaluate end

.. _callback-arguments:

---------------------------------------------
Callback Arguments
---------------------------------------------

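.. note:: While technically optional, omitting an argument will
result in a placeholder of the form "UNKNOWN_SUBMISSION_<FIELD>"
(for example, "UNKNOWN_SUBMISSION_BENCHMARK") appearing in the log
results. The exception is sub_org, which defaults to "LBANN".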

:sub_benchmark: (``string``) Benchmark name. A list of benchmarks
can be found in the `MLPerf Benchmarks Suite
<https://github.com/mlcommons/training_policies/blob/master/training_rules.adoc#3-benchmarks>`_.

:sub_org: (``string``, optional) Organization running the
benchmarks. Default: `LBANN`.

:sub_division: (``string``) Closed or open division. See `Divisions <https://github.com/mlcommons/training_policies/blob/master/training_rules.adoc#4-divisions>`_.

:sub_status: (``string``) Submission status. (onprem, cloud, or
preview)

:sub_platform: (``string``) Submission platform/hardware. (Example:
Longhorn, NVIDIA DGX A100, JUWELS_Booster)

.. _examples-using-mlperf-logging:

--------------------------------------------------------
Example Using MLPerf Logging Callback (Python Front-End)
--------------------------------------------------------

.. code-block:: python

# Pass parameters to callback
mlperf_logging = lbann.CallbackMlperfLogging(
sub_benchmark="SUBMISSION_BENCHMARK",
sub_org="SUBMISSION_ORGANIZATION",
sub_division="SUBMISSION_DIVISION",
sub_status="SUBMISSION_STATUS",
sub_platform="SUBMISSION_PLATFORM")
1 change: 1 addition & 0 deletions include/lbann/callbacks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ set_full_path(THIS_DIR_HEADERS
learning_rate.hpp
ltfb.hpp
mixup.hpp
mlperf_logging.hpp
monitor_io.hpp
perturb_adam.hpp
perturb_dropout.hpp
Expand Down
156 changes: 156 additions & 0 deletions include/lbann/callbacks/mlperf_logging.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory.
// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
//
// LLNL-CODE-697807.
// All rights reserved.
//
// This file is part of LBANN: Livermore Big Artificial Neural Network
// Toolkit. For details, see http://software.llnl.gov/LBANN or
// https://github.com/LLNL/LBANN.
//
// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
// may not use this file except in compliance with the License. You may
// obtain a copy of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the license.
//
// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs
////////////////////////////////////////////////////////////////////////////////

#ifndef LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED
#define LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED

#include "lbann/callbacks/callback.hpp"
#include <h2/utils/Logger.hpp>

// Forward declaration only: this header refers to lbann_data::Callback
// solely by reference (see write_specific_proto), so the full definition
// is not required here.
namespace lbann_data {
class Callback;
}

namespace lbann {
namespace callback {

/** @class mlperf_logging
* @brief Callback to print mlperf compliant benchmark logs
*/
class mlperf_logging : public callback_base {

public:

  /** @brief Kind of event recorded by a log entry.
   *
   *  NOTE(review): these presumably map onto the MLPerf logging event
   *  kinds (point-in-time, interval start, interval end) -- confirm
   *  against the implementation of print_event_type().
   */
  enum class event_type {
    TIME_POINT,
    INT_START,
    INT_END,
  };

  /** @brief mlperf_logging Constructor.
   *
   *  Every argument is taken by value and moved into the matching
   *  member. An empty argument is replaced by a fallback string.
   *
   *  @param sub_benchmark Name of benchmark. Empty falls back to
   *                       "UNKNOWN_SUBMISSION_BENCHMARK".
   *  @param sub_org Name of submission organization. Empty falls back
   *                 to "LBANN".
   *  @param sub_division Division of benchmark suite (open or closed).
   *                      Empty falls back to
   *                      "UNKNOWN_SUBMISSION_DIVISION".
   *  @param sub_status Submission status (onprem, cloud, or preview).
   *                    Empty falls back to "UNKNOWN_SUBMISSION_STATUS".
   *  @param sub_platform Submission platform/hardware. Empty falls back
   *                      to "UNKNOWN_SUBMISSION_PLATFORM".
   */
  mlperf_logging(std::string sub_benchmark, std::string sub_org,
                 std::string sub_division, std::string sub_status,
                 std::string sub_platform)
    : callback_base(/*batch_interval=*/1),
      m_sub_benchmark{value_or_default(std::move(sub_benchmark),
                                       "UNKNOWN_SUBMISSION_BENCHMARK")},
      m_sub_org{value_or_default(std::move(sub_org), "LBANN")},
      m_sub_division{value_or_default(std::move(sub_division),
                                      "UNKNOWN_SUBMISSION_DIVISION")},
      m_sub_status{value_or_default(std::move(sub_status),
                                    "UNKNOWN_SUBMISSION_STATUS")},
      m_sub_platform{value_or_default(std::move(sub_platform),
                                      "UNKNOWN_SUBMISSION_PLATFORM")}
  {}

  /** @brief Copy interface.
   *  @return A heap-allocated copy of this callback; the caller takes
   *          ownership of the returned raw pointer.
   */
  mlperf_logging* copy() const override {
    return new mlperf_logging(*this);
  }

  /** @brief Return name of callback. */
  std::string name() const override { return "mlperf_logging"; }

  /** @brief Push mlperf formatted log string to stream object.
   *
   *  NOTE(review): only the declaration is visible in this header; the
   *  definition presumably lives in mlperf_logging.cpp, which would
   *  restrict usable instantiations to that translation unit -- confirm.
   *
   *  @tparam T Type of the logged value.
   *  @param os Stream that stores log strings.
   *  @param et Type of mlperf style event.
   *  @param key Mlperf log key.
   *  @param value Mlperf log value.
   *  @param file Current file name.
   *  @param line File line number.
   *  @param epoch Current epoch number. Defaults to -1.
   *               NOTE(review): -1 presumably means "no epoch" --
   *               confirm in the definition.
   */
  template <typename T>
  void print(std::ostringstream& os, mlperf_logging::event_type et,
             std::string key, T value, char const* file, size_t line,
             double epoch = -1) const;

  /** @name Callback hooks
   *  Overrides of the callback_base execution points at which this
   *  callback acts. Definitions are not part of this header.
   */
  ///@{
  void setup(model *m) override;
  void on_setup_end(model *m) override;
  void on_epoch_begin(model *m) override;
  void on_epoch_end(model *m) override;
  void on_train_begin(model *m) override;
  void on_train_end(model *m) override;
  void on_batch_evaluate_begin(model *m) override;
  void on_batch_evaluate_end(model *m) override;
  ///@}

private:

  /** @brief Substitute a fallback for an empty string.
   *  @param value Candidate string; moved into the result when
   *               non-empty.
   *  @param fallback String used when @p value is empty.
   *  @return @p value if it is non-empty, otherwise @p fallback.
   */
  static std::string value_or_default(std::string value,
                                      char const* fallback)
  {
    return value.empty() ? std::string(fallback) : std::move(value);
  }

  /** @brief Write this callback's settings to a protobuf message.
   *  @param proto Callback message to populate.
   */
  void write_specific_proto(lbann_data::Callback& proto) const final;

  /** @brief Populate log with mlperf event type.
   *  @param os Stream that stores the log string.
   *  @param et Type of mlperf style event.
   */
  void print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const;

  /** @brief Timestamp helper.
   *
   *  NOTE(review): judging by the name, this returns the number of
   *  milliseconds elapsed since the clock epoch -- confirm against the
   *  definition, which is not part of this header.
   */
  static size_t get_ms_since_epoch();

private:

  /** @brief Benchmark name. */
  std::string m_sub_benchmark;
  /** @brief Submission organization. */
  std::string m_sub_org;
  /** @brief Benchmark suite division. */
  std::string m_sub_division;
  /** @brief Submission status. */
  std::string m_sub_status;
  /** @brief Submission platform/hardware. */
  std::string m_sub_platform;
  // Disabled member kept from the original: name of output file
  // (documented default was results.txt).
  //std::string m_output_filename;
  /** @brief DiHydrogen logger. Declared mutable so that const member
   *  functions can use it.
   */
  mutable h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"};

}; // class mlperf_logging

/** @brief Build an mlperf_logging callback from a protobuf message.
 *
 *  @param proto_msg Protobuf message describing the callback.
 *         NOTE(review): presumably expected to be the mlperf_logging
 *         callback message carrying the sub_* fields -- confirm in the
 *         definition.
 *  @param (unnamed) Shared pointer to an lbann_summary. The parameter is
 *         unnamed in this declaration; NOTE(review): presumably unused by
 *         this callback -- confirm in the definition.
 *  @return Owning pointer to the newly built callback.
 */
std::unique_ptr<callback_base>
build_mlperf_logging_callback_from_pbuf(
const google::protobuf::Message& proto_msg,
const std::shared_ptr<lbann_summary>&);

} // namespace callback
} // namespace lbann

#endif // LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED
1 change: 1 addition & 0 deletions src/callbacks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ set_full_path(THIS_DIR_SOURCES
load_model.cpp
ltfb.cpp
mixup.cpp
mlperf_logging.cpp
monitor_io.cpp
perturb_adam.cpp
perturb_dropout.cpp
Expand Down
Loading