From afbfb96c21cbcc039a21c95524c70a14758f2a6c Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 27 Oct 2020 15:58:52 +0200 Subject: [PATCH 1/4] MLPFProducer new version, squash all --- .../EventContent/python/EventContent_cff.py | 2 + .../ProcessModifiers/python/mlpf_cff.py | 4 + .../PyReleaseValidation/python/relval_2017.py | 2 + .../python/upgradeWorkflowComponents.py | 24 + .../RecoParticleFlow_EventContent_cff.py | 6 + .../python/RecoParticleFlow_cff.py | 5 + .../PFProducer/interface/MLPFModel.h | 67 ++ .../PFProducer/plugins/BuildFile.xml | 3 + .../PFProducer/plugins/MLPFProducer.cc | 139 +++ .../plugins/MLPFProducerSonicTriton.cc | 136 +++ .../python/mlpf_EventContent_cff.py | 8 + RecoParticleFlow/PFProducer/src/MLPFModel.cc | 218 ++++ .../PFProducer/test/mlpf_training/generate.sh | 67 ++ .../test/mlpf_training/preprocessing.py | 598 +++++++++ .../PFProducer/test/mlpf_training/run.sh | 15 + .../PFProducer/test/mlpf_training/tf_data.py | 147 +++ .../PFProducer/test/mlpf_training/tf_model.py | 1070 +++++++++++++++++ .../{PFAnalysis.cc => PFAnalysisNtuplizer.cc} | 214 ++-- .../python/customize_pfanalysis.py | 7 + .../test/pfanalysis_ntuple.py | 3 +- 20 files changed, 2634 insertions(+), 101 deletions(-) create mode 100644 Configuration/ProcessModifiers/python/mlpf_cff.py create mode 100644 RecoParticleFlow/PFProducer/interface/MLPFModel.h create mode 100644 RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc create mode 100644 RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc create mode 100644 RecoParticleFlow/PFProducer/python/mlpf_EventContent_cff.py create mode 100644 RecoParticleFlow/PFProducer/src/MLPFModel.cc create mode 100755 RecoParticleFlow/PFProducer/test/mlpf_training/generate.sh create mode 100644 RecoParticleFlow/PFProducer/test/mlpf_training/preprocessing.py create mode 100755 RecoParticleFlow/PFProducer/test/mlpf_training/run.sh create mode 100644 RecoParticleFlow/PFProducer/test/mlpf_training/tf_data.py create mode 100644 RecoParticleFlow/PFProducer/test/mlpf_training/tf_model.py rename Validation/RecoParticleFlow/plugins/{PFAnalysis.cc => PFAnalysisNtuplizer.cc} (89%) diff --git a/Configuration/EventContent/python/EventContent_cff.py b/Configuration/EventContent/python/EventContent_cff.py index e7bd292dbf241..9195a125c8ea2 100644 --- a/Configuration/EventContent/python/EventContent_cff.py +++ b/Configuration/EventContent/python/EventContent_cff.py @@ -227,6 +227,7 @@ def SwapKeepAndDrop(l): outputCommands = RECOEventContent.outputCommands + RecoCTPPSRECO.outputCommands) phase2_hgcal.toModify(RECOEventContent, outputCommands = RECOEventContent.outputCommands + TICL_RECO.outputCommands) + # # # RAWRECO Data Tier definition @@ -400,6 +401,7 @@ def SwapKeepAndDrop(l): outputCommands = RECOSIMEventContent.outputCommands + RecoLocalFastTimeRECO.outputCommands) phase2_timing_layer.toModify(RECOSIMEventContent, outputCommands = RECOSIMEventContent.outputCommands + RecoMTDRECO.outputCommands) + # # # GENRAW Data Tier definition diff --git a/Configuration/ProcessModifiers/python/mlpf_cff.py b/Configuration/ProcessModifiers/python/mlpf_cff.py new file mode 100644 index 0000000000000..d1cedca2a2e53 --- /dev/null +++ b/Configuration/ProcessModifiers/python/mlpf_cff.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +# This modifier is for activating MLPF reconstruction in 2021 +mlpf = cms.Modifier() diff --git a/Configuration/PyReleaseValidation/python/relval_2017.py b/Configuration/PyReleaseValidation/python/relval_2017.py index 
d7218ad988efd..35ecc44228bfe 100644 --- a/Configuration/PyReleaseValidation/python/relval_2017.py +++ b/Configuration/PyReleaseValidation/python/relval_2017.py @@ -35,6 +35,7 @@ # (Patatrack ECAL-only: TTbar - on CPU, on GPU, both, auto) # (Patatrack HCAL-only: TTbar - on CPU, on GPU, both, auto) # (TTbar 0T, TTbar PU 0T) +# (TTbar PU MLPF) # 2023 (TTbar, TTbar PU, TTbar PU premix) # 2024 (TTbar, TTbar PU, TTbar PU premix) numWFIB = [10001.0,10002.0,10003.0,10004.0,10005.0,10006.0,10007.0,10008.0,10009.0,10059.0,10071.0, @@ -59,6 +60,7 @@ 11634.511,11634.512, # 11634.513,11634.514, 11634.521,11634.522, # 11634.523,11634.524 11634.24,11834.24, + 11834.13, 12434.0,12634.0,12634.99, 12834.0,13034.0,13034.99] for numWF in numWFIB: diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py index a9876705bf631..f253bbb22d23d 100644 --- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py +++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py @@ -328,6 +328,30 @@ def condition(self, fragment, stepList, key, hasHarvest): offset = 0.9, ) +# MLPF workflows +class UpgradeWorkflow_mlpf(UpgradeWorkflow): + def setup_(self, step, stepName, stepDict, k, properties): + if 'Reco' in step: + stepDict[stepName][k] = merge([self.step3, stepDict[step][k]]) + def condition(self, fragment, stepList, key, hasHarvest): + return fragment=="TTbar_14TeV" and '2021' in key + +upgradeWFs['mlpf'] = UpgradeWorkflow_mlpf( + steps = [ + 'Reco', + ], + PU = [ + 'Reco', + ], + suffix = '_mlpf', + offset = 0.13, +) +upgradeWFs['mlpf'].step3 = { + '--datatier': 'GEN-SIM-RECO,RECOSIM,MINIAODSIM,DQMIO', + '--eventcontent': 'FEVTDEBUGHLT,RECOSIM,MINIAODSIM,DQM', + '--procModifiers': 'mlpf' +} + # Patatrack workflows class UpgradeWorkflowPatatrack(UpgradeWorkflow): def condition(self, fragment, stepList, key, hasHarvest): diff --git a/RecoParticleFlow/Configuration/python/RecoParticleFlow_EventContent_cff.py b/RecoParticleFlow/Configuration/python/RecoParticleFlow_EventContent_cff.py index d35456e4f2cf8..40b41d38c7767 100644 --- a/RecoParticleFlow/Configuration/python/RecoParticleFlow_EventContent_cff.py +++ b/RecoParticleFlow/Configuration/python/RecoParticleFlow_EventContent_cff.py @@ -87,3 +87,9 @@ outputCommands = RecoParticleFlowFEVT.outputCommands + ['keep recoPFRecHits_particleFlowClusterECAL__*', 'keep recoPFRecHits_particleFlowRecHitHGC__*', 'keep *_simPFProducer_*_*']) + +from Configuration.ProcessModifiers.mlpf_cff import mlpf +from RecoParticleFlow.PFProducer.mlpf_EventContent_cff import MLPF_RECO + +mlpf.toModify(RecoParticleFlowRECO, + outputCommands = RecoParticleFlowRECO.outputCommands + MLPF_RECO.outputCommands) diff --git a/RecoParticleFlow/Configuration/python/RecoParticleFlow_cff.py b/RecoParticleFlow/Configuration/python/RecoParticleFlow_cff.py index 3c91c95484e95..5a8d6b6885046 100644 --- a/RecoParticleFlow/Configuration/python/RecoParticleFlow_cff.py +++ b/RecoParticleFlow/Configuration/python/RecoParticleFlow_cff.py @@ -86,4 +86,9 @@ e.toModify(pfPileUp, enable = cms.bool(False)) +from Configuration.ProcessModifiers.mlpf_cff import mlpf +from RecoParticleFlow.PFProducer.mlpfProducer_cfi import mlpfProducer +_mlpfTask = cms.Task(mlpfProducer, particleFlowRecoTask.copy()) + +mlpf.toReplaceWith(particleFlowRecoTask, _mlpfTask) diff --git a/RecoParticleFlow/PFProducer/interface/MLPFModel.h b/RecoParticleFlow/PFProducer/interface/MLPFModel.h new file mode 100644 index 
0000000000000..9c850859981ef --- /dev/null +++ b/RecoParticleFlow/PFProducer/interface/MLPFModel.h @@ -0,0 +1,67 @@ +#ifndef RecoParticleFlow_PFProducer_interface_MLPFModel +#define RecoParticleFlow_PFProducer_interface_MLPFModel + +#include "FWCore/Framework/interface/Event.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlockElement.h" +#include "DataFormats/ParticleFlowCandidate/interface/PFCandidate.h" + +namespace reco::mlpf { + //The model takes the following number of features for each input PFElement + static constexpr unsigned int NUM_ELEMENT_FEATURES = 15; + + //these are defined at model creation time and set the random LSH codebook size + static constexpr int NUM_MAX_ELEMENTS_BATCH = 20000; + static constexpr int LSH_BIN_SIZE = 100; + + //In CPU mode, we only want to evaluate each event separately + static constexpr int BATCH_SIZE = 1; + + //The model has 12 outputs for each particle: + // out[0-7]: particle classification logits + // out[8]: regressed eta + // out[9]: regressed phi + // out[10]: regressed energy + // out[11]: regressed charge logit + static constexpr unsigned int NUM_OUTPUTS = 12; + static constexpr unsigned int NUM_CLASS = 7; + static constexpr unsigned int IDX_ETA = 8; + static constexpr unsigned int IDX_PHI = 9; + static constexpr unsigned int IDX_ENERGY = 10; + static constexpr unsigned int IDX_CHARGE = 11; + + //index [0, N_pdgids) -> PDGID + //this maps the absolute values of the predicted PDGIDs to an array of ascending indices + static const std::vector pdgid_encoding = {0, 1, 2, 11, 13, 22, 130, 211}; + + //PFElement::type -> index [0, N_types) + //this maps the type of the PFElement to an ascending index that is used by the model to distinguish between different elements + static const std::map elem_type_encoding = { + {0, 0}, + {1, 1}, + {2, 2}, + {3, 3}, + {4, 4}, + {5, 5}, + {6, 6}, + {7, 7}, + {8, 8}, + {9, 9}, + {10, 10}, + {11, 11}, + }; + + std::array getElementProperties(const reco::PFBlockElement& orig); + float normalize(float in); + + int argMax(std::vector const& vec); + + reco::PFCandidate makeCandidate(int pred_pid, int pred_charge, float pred_e, float pred_eta, float pred_phi); + + const std::vector getPFElements(const reco::PFBlockCollection& blocks); + + void setCandidateRefs(reco::PFCandidate& cand, + const std::vector elems, + size_t ielem_originator); +}; // namespace reco::mlpf + +#endif \ No newline at end of file diff --git a/RecoParticleFlow/PFProducer/plugins/BuildFile.xml b/RecoParticleFlow/PFProducer/plugins/BuildFile.xml index 676d8a1ecc962..75ce00d280222 100644 --- a/RecoParticleFlow/PFProducer/plugins/BuildFile.xml +++ b/RecoParticleFlow/PFProducer/plugins/BuildFile.xml @@ -79,5 +79,8 @@ + + + diff --git a/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc b/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc new file mode 100644 index 0000000000000..1ae5c1877052e --- /dev/null +++ b/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc @@ -0,0 +1,139 @@ +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" + +#include "DataFormats/ParticleFlowCandidate/interface/PFCandidate.h" +#include "PhysicsTools/TensorFlow/interface/TensorFlow.h" +#include "RecoParticleFlow/PFProducer/interface/MLPFModel.h" + +struct MLPFCache { + std::atomic graph_def; +}; + +class MLPFProducer : public edm::stream::EDProducer > { +public: + explicit MLPFProducer(const 
edm::ParameterSet&, const MLPFCache*); + void produce(edm::Event& event, const edm::EventSetup& setup) override; + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + // static methods for handling the global cache + static std::unique_ptr initializeGlobalCache(const edm::ParameterSet&); + static void globalEndJob(MLPFCache*); + +private: + const edm::EDPutTokenT pfCandidatesPutToken_; + const edm::EDGetTokenT inputTagBlocks_; + const std::string model_path_; + tensorflow::Session* session_; +}; + +MLPFProducer::MLPFProducer(const edm::ParameterSet& cfg, const MLPFCache* cache) + : pfCandidatesPutToken_{produces()}, + inputTagBlocks_(consumes(cfg.getParameter("src"))), + model_path_(cfg.getParameter("model_path")) { + session_ = tensorflow::createSession(cache->graph_def); +} + +void MLPFProducer::produce(edm::Event& event, const edm::EventSetup& setup) { + using namespace reco::mlpf; + + const auto& blocks = event.get(inputTagBlocks_); + const auto& all_elements = getPFElements(blocks); + + const long long int num_elements_total = all_elements.size(); + + //tensor size must be a multiple of the bin size and larger than the number of elements + const auto tensor_size = LSH_BIN_SIZE * (num_elements_total / LSH_BIN_SIZE + 1); + assert(tensor_size <= NUM_MAX_ELEMENTS_BATCH); + + //Create the input tensor + tensorflow::TensorShape shape({BATCH_SIZE, tensor_size, NUM_ELEMENT_FEATURES}); + tensorflow::Tensor input(tensorflow::DT_FLOAT, shape); + input.flat().setZero(); + + //Fill the input tensor + unsigned int ielem = 0; + for (const auto* pelem : all_elements) { + const auto& elem = *pelem; + + //prepare the input array from the PFElement + const auto& props = getElementProperties(elem); + + //copy features to the input array + for (unsigned int iprop = 0; iprop < NUM_ELEMENT_FEATURES; iprop++) { + input.tensor()(0, ielem, iprop) = normalize(props[iprop]); + } + ielem += 1; + } + + //TF model input and output tensor names + const tensorflow::NamedTensorList input_list = {{"x:0", input}}; + const std::vector output_names = {"Identity:0"}; + + //Prepare the output tensor + std::vector outputs; + + //run the GNN inference, given the inputs and the output. + //Note that the GNN enables information transfer between the input PFElements, + //such that the output ML-PFCandidates are in general combinations of the input PFElements, in the form of + //y_out = Adj.x_in, where x_in is input matrix (num_elem, NUM_ELEMENT_FEATURES), y_out is the output matrix (num_elem, NUM_OUTPUT_FEATURES) + //and Adj is an adjacency matrix between the elements that is constructed on the fly during model inference. + tensorflow::run(session_, input_list, output_names, &outputs); + + //process the output tensor to ML-PFCandidates. + //The output can contain up to num_elem particles, with predicted PDGID=0 corresponding to no particles predicted. 
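+  //the output tensor has shape (BATCH_SIZE, tensor_size, NUM_OUTPUTS);
+  //rows beyond all_elements.size() are zero padding up to the LSH bin boundary and are skipped in the loop below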
+ const auto out_arr = outputs[0].tensor(); + + std::vector pOutputCandidateCollection; + for (unsigned int ielem = 0; ielem < all_elements.size(); ielem++) { + //get the coefficients in the output corresponding to the class probabilities (raw logits) + std::vector pred_id_logits; + for (unsigned int idx_id = 0; idx_id <= NUM_CLASS; idx_id++) { + pred_id_logits.push_back(out_arr(0, ielem, idx_id)); + } + + //get the most probable class PDGID + int pred_pid = pdgid_encoding[argMax(pred_id_logits)]; + + //get the predicted momentum components + float pred_eta = out_arr(0, ielem, IDX_ETA); + float pred_phi = out_arr(0, ielem, IDX_PHI); + float pred_charge = out_arr(0, ielem, IDX_CHARGE); + float pred_e = out_arr(0, ielem, IDX_ENERGY); + + //a particle was predicted for this PFElement, otherwise it was a spectator + if (pred_pid != 0) { + auto cand = makeCandidate(pred_pid, pred_charge, pred_e, pred_eta, pred_phi); + setCandidateRefs(cand, all_elements, ielem); + pOutputCandidateCollection.push_back(cand); + } + } //loop over PFElements + + event.emplace(pfCandidatesPutToken_, pOutputCandidateCollection); +} + +std::unique_ptr MLPFProducer::initializeGlobalCache(const edm::ParameterSet& params) { + // this method is supposed to create, initialize and return a MLPFCache instance + std::unique_ptr cache = std::make_unique(); + + //load the frozen TF graph of the GNN model + std::string path = params.getParameter("model_path"); + auto fullPath = edm::FileInPath(path).fullPath(); + LogDebug("MLPFProducer") << "Initializing MLPF model from " << fullPath; + + cache->graph_def = tensorflow::loadGraphDef(fullPath); + + return cache; +} + +void MLPFProducer::globalEndJob(MLPFCache* cache) { delete cache->graph_def; } + +void MLPFProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("particleFlowBlock")); + desc.add("model_path", "RecoParticleFlow/PFProducer/data/mlpf/mlpf_2020_11_04.pb"); + descriptions.addWithDefaultLabel(desc); +} + +DEFINE_FWK_MODULE(MLPFProducer); diff --git a/RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc b/RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc new file mode 100644 index 0000000000000..43c3a25b56fd4 --- /dev/null +++ b/RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc @@ -0,0 +1,136 @@ +#include +#include +#include +#include +#include +#include + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" + +#include "HeterogeneousCore/SonicCore/interface/SonicEDProducer.h" +#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" +#include "RecoParticleFlow/PFProducer/interface/MLPFModel.h" + +class MLPFProducerSonicTriton : public SonicEDProducer { +public: + explicit MLPFProducerSonicTriton(edm::ParameterSet const& cfg) + : SonicEDProducer(cfg), + pfCandidatesPutToken_{produces()}, + inputTagBlocks_(consumes(cfg.getParameter("src"))) { + this->setDebugName("MLPFProducerSonic"); + } + + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, Input& iInput) override { + 
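+    //acquire() packs the PFElement features into the request tensor and ships it to the Triton server;
+    //the server response is unpacked into ML-PFCandidates in produce() below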
using namespace reco::mlpf; + + //get the PFElements in the event. Currently use PFBlock for convenience, but we don't need anything + //else the block does, later we can get the events directly from the event. + const auto& blocks = iEvent.get(inputTagBlocks_); + const auto& all_elements = getPFElements(blocks); + + const auto num_elements_total = all_elements.size(); + + //tensor size must be a multiple of the bin size and larger than the number of elements + const auto tensor_size = LSH_BIN_SIZE * (num_elements_total / LSH_BIN_SIZE + 1); + assert(tensor_size <= NUM_MAX_ELEMENTS_BATCH); + + auto& input1 = iInput.at("x"); + + //we ignore Sonic/Triton batching, as it doesn't create a dim-3 input for batch size 1. + //instead, we specify the batch dim as a model dim. + input1.setShape(0, 1); + input1.setShape(1, tensor_size); + + auto data1 = std::make_shared>(1); + auto& vdata1 = (*data1)[0]; + vdata1.reserve(input1.sizeShape()); + + //Fill the input tensor + for (const auto* pelem : all_elements) { + const auto& elem = *pelem; + + //prepare the input array from the PFElement + const auto& props = getElementProperties(elem); + + //copy features to the input array + for (unsigned int iprop = 0; iprop < NUM_ELEMENT_FEATURES; iprop++) { + vdata1.push_back(normalize(props[iprop])); + } + } + input1.toServer(data1); + } + + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, Output const& iOutput) override { + using namespace reco::mlpf; + + //we need the input element list to set the refs on the produced candidate. Currently use PFBlock for convenience, but we don't need anything + //else the block does, later we can get the events directly from the event. + const auto& blocks = iEvent.get(inputTagBlocks_); + const auto& all_elements = getPFElements(blocks); + + std::vector pOutputCandidateCollection; + const auto& output1 = iOutput.at("Identity"); + + //get the data of the first (and only) batch + const auto& out_data = output1.fromServer(); + + //batch size 1 + assert(output1.shape()[0] == 1); + + //model should have the correct number of outputs + assert(output1.shape()[2] == NUM_OUTPUTS); + + //we process only uyp to the true number of input elements, the predicion is padded to the bin size + const auto num_elem = all_elements.size(); + + for (size_t ielem = 0; ielem < num_elem; ielem++) { + //get the coefficients in the output corresponding to the class probabilities (raw logits) + std::vector pred_id_logits; + for (unsigned int idx_id = 0; idx_id <= NUM_CLASS; idx_id++) { + pred_id_logits.push_back(out_data[0][ielem * NUM_OUTPUTS + idx_id]); + } + + //get the most probable class PDGID + int pred_pid = pdgid_encoding[argMax(pred_id_logits)]; + + //get the predicted momentum components + float pred_eta = out_data[0][ielem * NUM_OUTPUTS + IDX_ETA]; + float pred_phi = out_data[0][ielem * NUM_OUTPUTS + IDX_PHI]; + float pred_e = out_data[0][ielem * NUM_OUTPUTS + IDX_ENERGY]; + float pred_charge = out_data[0][ielem * NUM_OUTPUTS + IDX_CHARGE]; + + //a particle was predicted for this PFElement, otherwise it was a spectator + if (pred_pid != 0) { + auto cand = makeCandidate(pred_pid, pred_charge, pred_e, pred_eta, pred_phi); + setCandidateRefs(cand, all_elements, ielem); + pOutputCandidateCollection.push_back(cand); + } + } //loop over PFElements + + iEvent.emplace(pfCandidatesPutToken_, pOutputCandidateCollection); + } + + ~MLPFProducerSonicTriton() override {} + + //to ensure distinct cfi names - specialized below + static void fillDescriptions(edm::ConfigurationDescriptions& 
descriptions) { + edm::ParameterSetDescription desc; + TritonClient::fillPSetDescription(desc); + desc.add("src", edm::InputTag("particleFlowBlock")); + descriptions.addWithDefaultLabel(desc); + } + +private: + const edm::EDPutTokenT pfCandidatesPutToken_; + const edm::EDGetTokenT inputTagBlocks_; +}; + +DEFINE_FWK_MODULE(MLPFProducerSonicTriton); \ No newline at end of file diff --git a/RecoParticleFlow/PFProducer/python/mlpf_EventContent_cff.py b/RecoParticleFlow/PFProducer/python/mlpf_EventContent_cff.py new file mode 100644 index 0000000000000..0e91687fa40a6 --- /dev/null +++ b/RecoParticleFlow/PFProducer/python/mlpf_EventContent_cff.py @@ -0,0 +1,8 @@ +import FWCore.ParameterSet.Config as cms + +MLPF_RECO = cms.PSet( + outputCommands = cms.untracked.vstring( + 'keep recoPFCandidates_mlpfProducer_*_*', + ) +) + diff --git a/RecoParticleFlow/PFProducer/src/MLPFModel.cc b/RecoParticleFlow/PFProducer/src/MLPFModel.cc new file mode 100644 index 0000000000000..ff05f2dd65e7e --- /dev/null +++ b/RecoParticleFlow/PFProducer/src/MLPFModel.cc @@ -0,0 +1,218 @@ +#include "RecoParticleFlow/PFProducer/interface/MLPFModel.h" +#include "DataFormats/ParticleFlowReco/interface/PFCluster.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlock.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlockElementSuperCluster.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlockElementGsfTrack.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlockElementTrack.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlockElementBrem.h" +#include "DataFormats/ParticleFlowReco/interface/PFBlockElementCluster.h" + +namespace reco::mlpf { + + //Prepares the input array of floats for a single PFElement + std::array getElementProperties(const reco::PFBlockElement& orig) { + const auto type = orig.type(); + float pt = 0.0; + //these are placeholders for the the future + [[maybe_unused]] float deltap = 0.0; + [[maybe_unused]] float sigmadeltap = 0.0; + [[maybe_unused]] float px = 0.0; + [[maybe_unused]] float py = 0.0; + [[maybe_unused]] float pz = 0.0; + float eta = 0.0; + float phi = 0.0; + float energy = 0.0; + float trajpoint = 0.0; + float eta_ecal = 0.0; + float phi_ecal = 0.0; + float eta_hcal = 0.0; + float phi_hcal = 0.0; + float charge = 0; + float layer = 0; + float depth = 0; + float muon_dt_hits = 0.0; + float muon_csc_hits = 0.0; + + if (type == reco::PFBlockElement::TRACK) { + const auto& matched_pftrack = orig.trackRefPF(); + if (matched_pftrack.isNonnull()) { + const auto& atECAL = matched_pftrack->extrapolatedPoint(reco::PFTrajectoryPoint::ECALShowerMax); + const auto& atHCAL = matched_pftrack->extrapolatedPoint(reco::PFTrajectoryPoint::HCALEntrance); + if (atHCAL.isValid()) { + eta_hcal = atHCAL.positionREP().eta(); + phi_hcal = atHCAL.positionREP().phi(); + } + if (atECAL.isValid()) { + eta_ecal = atECAL.positionREP().eta(); + phi_ecal = atECAL.positionREP().phi(); + } + } + const auto& ref = ((const reco::PFBlockElementTrack*)&orig)->trackRef(); + pt = ref->pt(); + px = ref->px(); + py = ref->py(); + pz = ref->pz(); + eta = ref->eta(); + phi = ref->phi(); + energy = ref->p(); + charge = ref->charge(); + + reco::MuonRef muonRef = orig.muonRef(); + if (muonRef.isNonnull()) { + reco::TrackRef standAloneMu = muonRef->standAloneMuon(); + if (standAloneMu.isNonnull()) { + muon_dt_hits = standAloneMu->hitPattern().numberOfValidMuonDTHits(); + muon_csc_hits = standAloneMu->hitPattern().numberOfValidMuonCSCHits(); + } + } + + } else if (type == reco::PFBlockElement::BREM) { + const auto* 
orig2 = (const reco::PFBlockElementBrem*)&orig; + const auto& ref = orig2->GsftrackRef(); + if (ref.isNonnull()) { + deltap = orig2->DeltaP(); + sigmadeltap = orig2->SigmaDeltaP(); + pt = ref->pt(); + px = ref->px(); + py = ref->py(); + pz = ref->pz(); + eta = ref->eta(); + phi = ref->phi(); + energy = ref->p(); + trajpoint = orig2->indTrajPoint(); + charge = ref->charge(); + } + } else if (type == reco::PFBlockElement::GSF) { + //requires to keep GsfPFRecTracks + const auto* orig2 = (const reco::PFBlockElementGsfTrack*)&orig; + const auto& vec = orig2->Pin(); + pt = vec.pt(); + px = vec.px(); + py = vec.py(); + pz = vec.pz(); + eta = vec.eta(); + phi = vec.phi(); + energy = vec.energy(); + if (!orig2->GsftrackRefPF().isNull()) { + charge = orig2->GsftrackRefPF()->charge(); + } + } else if (type == reco::PFBlockElement::ECAL || type == reco::PFBlockElement::PS1 || + type == reco::PFBlockElement::PS2 || type == reco::PFBlockElement::HCAL || + type == reco::PFBlockElement::HO || type == reco::PFBlockElement::HFHAD || + type == reco::PFBlockElement::HFEM) { + const auto& ref = ((const reco::PFBlockElementCluster*)&orig)->clusterRef(); + if (ref.isNonnull()) { + eta = ref->eta(); + phi = ref->phi(); + px = ref->position().x(); + py = ref->position().y(); + pz = ref->position().z(); + energy = ref->energy(); + layer = ref->layer(); + depth = ref->depth(); + } + } else if (type == reco::PFBlockElement::SC) { + const auto& clref = ((const reco::PFBlockElementSuperCluster*)&orig)->superClusterRef(); + if (clref.isNonnull()) { + eta = clref->eta(); + phi = clref->phi(); + px = clref->position().x(); + py = clref->position().y(); + pz = clref->position().z(); + energy = clref->energy(); + } + } + + float typ_idx = static_cast(elem_type_encoding.at(orig.type())); + + //Must be the same order as in tf_model.py + return std::array({{typ_idx, + pt, + eta, + phi, + energy, + layer, + depth, + charge, + trajpoint, + eta_ecal, + phi_ecal, + eta_hcal, + phi_hcal, + muon_dt_hits, + muon_csc_hits}}); + } + + //to make sure DNN inputs are within numerical bounds, use the same in training + float normalize(float in) { + if (std::abs(in) > 1e4f) { + return 0.0; + } else if (std::isnan(in)) { + return 0.0; + } + return in; + } + + int argMax(std::vector const& vec) { + return static_cast(std::distance(vec.begin(), max_element(vec.begin(), vec.end()))); + } + + reco::PFCandidate makeCandidate(int pred_pid, int pred_charge, float pred_e, float pred_eta, float pred_phi) { + pred_phi = angle0to2pi::make0To2pi(pred_phi); + + //currently, set the pT from a massless approximation. + //later versions of the model may predict predict both the energy and pT of the particle + float pred_pt = pred_e / cosh(pred_eta); + + //set the charge to +1 or -1 for PFCandidates that are charged, according to the sign of the predicted charge + reco::PFCandidate::Charge charge = 0; + if (pred_pid == 11 || pred_pid == 13 || pred_pid == 211) { + charge = pred_charge > 0 ? 
+1 : -1; + } + + math::PtEtaPhiELorentzVectorD p4(pred_pt, pred_eta, pred_phi, pred_e); + + reco::PFCandidate cand( + 0, math::XYZTLorentzVector(p4.X(), p4.Y(), p4.Z(), p4.E()), reco::PFCandidate::ParticleType(0)); + cand.setPdgId(pred_pid); + cand.setCharge(charge); + + return cand; + } + + const std::vector getPFElements(const reco::PFBlockCollection& blocks) { + std::vector pOutputCandidateCollection; + + std::vector all_elements; + for (const auto& block : blocks) { + const auto& elems = block.elements(); + for (const auto& elem : elems) { + if (all_elements.size() < NUM_MAX_ELEMENTS_BATCH) { + all_elements.push_back(&elem); + } else { + //model needs to be created with a bigger LSH codebook size + edm::LogError("MLPFProducer") << "too many input PFElements for predefined model size: " << elems.size(); + break; + } + } + } + return all_elements; + } + + void setCandidateRefs(reco::PFCandidate& cand, + const std::vector elems, + size_t ielem_originator) { + const reco::PFBlockElement* elem = elems[ielem_originator]; + //set the track ref in case the originating element was a track + if (elem->type() == reco::PFBlockElement::TRACK && cand.charge() != 0 && elem->trackRef().isNonnull()) { + cand.setTrackRef(elem->trackRef()); + + //set the muon ref in case the originator was a muon + const auto& muonref = elem->muonRef(); + if (muonref.isNonnull()) { + cand.setMuonRef(muonref); + } + } + } + +}; // namespace reco::mlpf \ No newline at end of file diff --git a/RecoParticleFlow/PFProducer/test/mlpf_training/generate.sh b/RecoParticleFlow/PFProducer/test/mlpf_training/generate.sh new file mode 100755 index 0000000000000..ef8467080b48e --- /dev/null +++ b/RecoParticleFlow/PFProducer/test/mlpf_training/generate.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -e +set -x + +#sleep randomly up to 120s to stagger job start times +#sleep $((RANDOM % 120)) + +#seed must be greater than 0 +SAMPLE=$1 +SEED=$2 +N=$3 + +#examples: +#SAMPLE=TTbar_14TeV_TuneCUETP8M1_cfi +#SEED=1 +#N=5 + +PILEUP=Run3_Flat55To75_PoissonOOTPU +#PILEUP=NoPileUp + +#in case of locally downloaded minbias files, use the following +#PILEUP_INPUT=filelist:/storage/user/jpata/particleflow/test/pu_files.txt +PILEUP_INPUT=dbs:/MinBias_TuneCP5_13TeV-pythia8/RunIIFall18GS-102X_upgrade2018_realistic_v9-v1/GEN-SIM +#and add this line to cmsDriver for step2 +#--pileup_input $PILEUP_INPUT \ + +#Generate the MC +cmsDriver.py $SAMPLE \ + --conditions auto:phase1_2021_realistic \ + -n $N \ + --era Run3 \ + --eventcontent FEVTDEBUGHLT \ + -s GEN,SIM,DIGI,L1,DIGI2RAW,HLT \ + --datatier GEN-SIM \ + --geometry DB:Extended \ + --pileup $PILEUP \ + --pileup_input $PILEUP_INPUT \ + --no_exec \ + --fileout step2_phase1_new.root \ + --customise Validation/RecoParticleFlow/customize_pfanalysis.customize_step2 \ + --python_filename=step2_phase1_new.py + +#Run the reco sequences +cmsDriver.py step3 \ + --conditions auto:phase1_2021_realistic \ + --era Run3 \ + -n -1 \ + --eventcontent FEVTDEBUGHLT \ + --runUnscheduled \ + -s RAW2DIGI,L1Reco,RECO,RECOSIM \ + --datatier GEN-SIM-RECO \ + --geometry DB:Extended \ + --no_exec \ + --filein file:step2_phase1_new.root \ + --fileout step3_phase1_new.root \ + --customise Validation/RecoParticleFlow/customize_pfanalysis.customize_step3 \ + --python_filename=step3_phase1_new.py + +pwd +ls -lrt + +echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py +cmsRun step2_phase1_new.py +cmsRun step3_phase1_new.py +cmsRun $CMSSW_BASE/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py + 
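+#pfanalysis_ntuple.py writes the flat PF ntuple (pfntuple.root) that preprocessing.py consumes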
+mv pfntuple.root pfntuple_${SEED}.root diff --git a/RecoParticleFlow/PFProducer/test/mlpf_training/preprocessing.py b/RecoParticleFlow/PFProducer/test/mlpf_training/preprocessing.py new file mode 100644 index 0000000000000..24212674716de --- /dev/null +++ b/RecoParticleFlow/PFProducer/test/mlpf_training/preprocessing.py @@ -0,0 +1,598 @@ +import sys +import pickle +import networkx as nx +import numpy as np +#import numba +import os +import uproot +import uproot_methods +import math + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +import scipy +import scipy.sparse +from networkx.readwrite import json_graph +from networkx.drawing.nx_pydot import graphviz_layout + +map_candid_to_pdgid = { + 0: [0], + 211: [211, 2212, 321, -3112, 3222, -3312, -3334], + -211: [-211, -2212, -321, 3112, -3222, 3312, 3334], + 130: [111, 130, 2112, -2112, 310, 3122, -3122, 3322, -3322], + 22: [22], + 11: [11], + -11: [-11], + 13: [13], + -13: [-13] +} + +map_pdgid_to_candid = {} + +for candid, pdgids in map_candid_to_pdgid.items(): + for p in pdgids: + map_pdgid_to_candid[p] = candid + +#@numba.njit +def get_charge(pid): + abs_pid = abs(pid) + if pid == 130 or pid == 22 or pid == 1 or pid == 2: + return 0.0 + #13: mu-, 11: e- + elif abs_pid == 13 or abs_pid == 11: + return -math.copysign(1.0, pid) + #211: pi+ + elif abs_pid == 211: + return math.copysign(1.0, pid) + +def save_ego_graph(g, node, radius=4, undirected=False): + sg = nx.ego_graph(g, node, radius, undirected=undirected).reverse() + + #remove BREM PFElements from plotting + nodes_to_remove = [n for n in sg.nodes if (n[0]=="elem" and sg.nodes[n]["typ"] in [7,])] + sg.remove_nodes_from(nodes_to_remove) + + fig = plt.figure(figsize=(2*len(sg.nodes)+2, 10)) + sg_pos = graphviz_layout(sg, prog='dot') + + edge_labels = {} + for e in sg.edges: + if e[1][0] == "elem" and not (sg.nodes[e[1]]["typ"] in [1,10]): + edge_labels[e] = "{:.2f} GeV".format(sg.edges[e].get("weight", 0)) + else: + edge_labels[e] = "" + + node_labels = {} + for node in sg.nodes: + labels = {"sc": "SimCluster", "elem": "PFElement", "tp": "TrackingParticle", "pfcand": "PFCandidate"} + node_labels[node] = "[{label} {idx}] \ntype: {typ}\ne: {e:.4f} GeV\npt: {pt:.4f} GeV\neta: {eta:.4f}\nphi: {phi:.4f}\nc/p: {children}/{parents}".format( + label=labels[node[0]], idx=node[1], **sg.nodes[node]) + tp = sg.nodes[node]["typ"] + + nx.draw_networkx(sg, pos=sg_pos, node_shape=".", node_color="grey", edge_color="grey", node_size=0, alpha=0.5, labels={}) + nx.draw_networkx_labels(sg, pos=sg_pos, labels=node_labels) + nx.draw_networkx_edge_labels(sg, pos=sg_pos, edge_labels=edge_labels); + plt.tight_layout() + plt.axis("off") + + return fig + +def draw_event(g): + pos = {} + for node in g.nodes: + pos[node] = (g.nodes[node]["eta"], g.nodes[node]["phi"]) + + fig = plt.figure(figsize=(10,10)) + + nodes_to_draw = [n for n in g.nodes if n[0]=="elem"] + nx.draw_networkx(g, pos=pos, with_labels=False, node_size=5, nodelist=nodes_to_draw, edgelist=[], node_color="red", node_shape="s", alpha=0.5) + + nodes_to_draw = [n for n in g.nodes if n[0]=="pfcand"] + nx.draw_networkx(g, pos=pos, with_labels=False, node_size=10, nodelist=nodes_to_draw, edgelist=[], node_color="green", node_shape="x", alpha=0.5) + + nodes_to_draw = [n for n in g.nodes if (n[0]=="sc" or n[0]=="tp")] + nx.draw_networkx(g, pos=pos, with_labels=False, node_size=1, nodelist=nodes_to_draw, edgelist=[], node_color="blue", node_shape=".", alpha=0.5) + + #draw edges between genparticles and elements + edges_to_draw = [e 
for e in g.edges if e[0] in nodes_to_draw] + nx.draw_networkx_edges(g, pos, edgelist=edges_to_draw, arrows=False, alpha=0.1) + + plt.xlim(-6,6) + plt.ylim(-4,4) + plt.tight_layout() + plt.axis("on") + return fig + +def cleanup_graph(g, edge_energy_threshold=0.01, edge_fraction_threshold=0.05, genparticle_energy_threshold=0.2, genparticle_pt_threshold=0.01): + g = g.copy() + + edges_to_remove = [] + nodes_to_remove = [] + + #remove edges that contribute little + for edge in g.edges: + if edge[0][0] == "sc": + w = g.edges[edge]["weight"] + if w < edge_energy_threshold: + edges_to_remove += [edge] + if edge[0][0] == "sc" or edge[0][0] == "tp": + if g.nodes[edge[1]]["typ"] == 10: + g.edges[edge]["weight"] = 1.0 + + #remove genparticles below energy threshold + for node in g.nodes: + if (node[0]=="sc" or node[0]=="tp") and g.nodes[node]["e"] < genparticle_energy_threshold: + nodes_to_remove += [node] + + g.remove_edges_from(edges_to_remove) + g.remove_nodes_from(nodes_to_remove) + + rg = g.reverse() + + #for each element, remove the incoming edges that contribute less than 5% of the total + edges_to_remove = [] + nodes_to_remove = [] + for node in rg.nodes: + if node[0] == "elem": + ##check for generator pairs with very similar eta,phi, which can come from gamma->e+ e- + #if rg.nodes[node]["typ"] == 4: + # by_eta_phi = {} + # for neigh in rg.neighbors(node): + # k = (round(rg.nodes[neigh]["eta"], 2), round(rg.nodes[neigh]["phi"], 2)) + # if not k in by_eta_phi: + # by_eta_phi[k] = [] + # by_eta_phi[k] += [neigh] + + # for k in by_eta_phi: + # #if there were genparticles with the same eta,phi, assume it was a photon with nuclear interaction + # if len(by_eta_phi[k])>=2: + # #print(by_eta_phi[k][0]) + # rg.nodes[by_eta_phi[k][0]]["typ"] = 22 + # rg.nodes[by_eta_phi[k][0]]["e"] += sum(rg.nodes[n]["e"] for n in by_eta_phi[k][1:]) + # rg.nodes[by_eta_phi[k][0]]["pt"] = 0 #fixme + # nodes_to_remove += by_eta_phi[k][1:] + + #remove links that don't contribute above a threshold + ew = [((node, node2), rg.edges[node, node2]["weight"]) for node2 in rg.neighbors(node)] + ew = filter(lambda x: x[1] != 1.0, ew) + ew = sorted(ew, key=lambda x: x[1], reverse=True) + if len(ew) > 1: + max_in = ew[0][1] + for e, w in ew[1:]: + if w / max_in < edge_fraction_threshold: + edges_to_remove += [e] + + rg.remove_edges_from(edges_to_remove) + rg.remove_nodes_from(nodes_to_remove) + g = rg.reverse() + + #remove genparticles not linked to any elements + nodes_to_remove = [] + for node in g.nodes: + if node[0]=="sc" or node[0]=="tp": + deg = g.degree[node] + if deg==0: + nodes_to_remove += [node] + g.remove_nodes_from(nodes_to_remove) + + #compute number of children and parents, save on node for visualization + for node in g.nodes: + g.nodes[node]["children"] = len(list(g.neighbors(node))) + + rg = g.reverse() + + for node in rg.nodes: + g.nodes[node]["parents"] = len(list(rg.neighbors(node))) + rg.nodes[node]["parents"] = len(list(rg.neighbors(node))) + + return g + +def prepare_normalized_table(g, genparticle_energy_threshold=0.2): + rg = g.reverse() + + all_genparticles = [] + all_elements = [] + all_pfcandidates = [] + for node in rg.nodes: + if node[0] == "elem": + all_elements += [node] + for parent in rg.neighbors(node): + all_genparticles += [parent] + elif node[0] == "pfcand": + all_pfcandidates += [node] + all_genparticles = list(set(all_genparticles)) + all_elements = sorted(all_elements) + + #assign genparticles in reverse pt order uniquely to best element + elem_to_gp = {} + unmatched_gp = [] + for gp in 
sorted(all_genparticles, key=lambda x: g.nodes[x]["pt"], reverse=True): + elems = [e for e in g.neighbors(gp)] + + #don't assign any genparticle to these elements (PS, BREM, SC) + elems = [e for e in elems if not (g.nodes[e]["typ"] in [2,3,7,10])] + + #sort elements by energy from genparticle + elems_sorted = sorted([(g.edges[gp, e]["weight"], e) for e in elems], key=lambda x: x[0], reverse=True) + + if len(elems_sorted) == 0: + continue + + chosen_elem = None + for _, elem in elems_sorted: + if not (elem in elem_to_gp): + chosen_elem = elem + elem_to_gp[elem] = [] + break + if chosen_elem is None: + unmatched_gp += [gp] + else: + elem_to_gp[elem] += [gp] + + #assign unmatched genparticles to best element, allowing for overlaps + for gp in sorted(unmatched_gp, key=lambda x: g.nodes[x]["pt"], reverse=True): + elems = [e for e in g.neighbors(gp)] + #we don't want to assign any genparticles to PS, BREM or SC - links are not reliable + elems = [e for e in elems if not (g.nodes[e]["typ"] in [2,3,7,10])] + elems_sorted = sorted([(g.edges[gp, e]["weight"], e) for e in elems], key=lambda x: x[0], reverse=True) + _, elem = elems_sorted[0] + elem_to_gp[elem] += [gp] + + unmatched_cand = [] + elem_to_cand = {} + for cand in sorted(all_pfcandidates, key=lambda x: g.nodes[x]["pt"], reverse=True): + tp = g.nodes[cand]["typ"] + neighbors = list(rg.neighbors(cand)) + + chosen_elem = None + + #Pions and muons will be assigned to tracks + if abs(tp) == 211 or abs(tp) == 13: + for elem in neighbors: + tp_neighbor = g.nodes[elem]["typ"] + if tp_neighbor == 1: + if not (elem in elem_to_cand): + chosen_elem = elem + elem_to_cand[elem] = cand + break + #other particles will be assigned to the highest-energy cluster (ECAL, HCAL, HFEM, HFHAD, SC) + else: + neighbors = [n for n in neighbors if g.nodes[n]["typ"] in [4,5,8,9,10]] + sorted_neighbors = sorted(neighbors, key=lambda x: g.nodes[x]["e"], reverse=True) + for elem in sorted_neighbors: + if not (elem in elem_to_cand): + chosen_elem = elem + elem_to_cand[elem] = cand + break + + if chosen_elem is None: + print("unmatched candidate {}, {}".format(cand, g.nodes[cand])) + unmatched_cand += [cand] + + elem_branches = [ + "typ", "pt", "eta", "phi", "e", + "layer", "depth", "charge", "trajpoint", + "eta_ecal", "phi_ecal", "eta_hcal", "phi_hcal", "muon_dt_hits", "muon_csc_hits" + ] + target_branches = ["typ", "pt", "eta", "phi", "e", "px", "py", "pz", "charge"] + + Xelem = np.recarray((len(all_elements),), dtype=[(name, np.float32) for name in elem_branches]) + Xelem.fill(0.0) + ygen = np.recarray((len(all_elements),), dtype=[(name, np.float32) for name in target_branches]) + ygen.fill(0.0) + ycand = np.recarray((len(all_elements),), dtype=[(name, np.float32) for name in target_branches]) + ycand.fill(0.0) + + #find which elements should be linked together in the output when regressing to PFCandidates or GenParticles + graph_elem_cand = nx.Graph() + graph_elem_gen = nx.Graph() + for elem in all_elements: + graph_elem_cand.add_node(elem) + graph_elem_gen.add_node(elem) + + for cand in all_pfcandidates: + for elem1 in rg.neighbors(cand): + for elem2 in rg.neighbors(cand): + if (elem1 != elem2): + graph_elem_cand.add_edge(elem1, elem2) + + for gp in all_genparticles: + for elem1 in g.neighbors(gp): + for elem2 in g.neighbors(gp): + if (elem1 != elem2): + graph_elem_gen.add_edge(elem1, elem2) + + for ielem, elem in enumerate(all_elements): + elem_type = g.nodes[elem]["typ"] + elem_eta = g.nodes[elem]["eta"] + genparticles = sorted(elem_to_gp.get(elem, []), key=lambda x: 
g.nodes[x]["e"], reverse=True) + genparticles = [gp for gp in genparticles if g.nodes[gp]["e"] > genparticle_energy_threshold] + candidate = elem_to_cand.get(elem, None) + + lv = uproot_methods.TLorentzVector(0, 0, 0, 0) + + pid = 0 + if len(genparticles) > 0: + pid = map_pdgid_to_candid.get(g.nodes[genparticles[0]]["typ"], 0) + + for gp in genparticles: + try: + lv += uproot_methods.TLorentzVector.from_ptetaphie( + g.nodes[gp]["pt"], + g.nodes[gp]["eta"], + g.nodes[gp]["phi"], + g.nodes[gp]["e"] + ) + except OverflowError: + lv += uproot_methods.TLorentzVector.from_ptetaphie( + g.nodes[gp]["pt"], + np.nan, + g.nodes[gp]["phi"], + g.nodes[gp]["e"] + ) + + if len(genparticles) > 0: + if abs(elem_eta) > 3.0: + #HFHAD -> always produce hadronic candidate + if elem_type == 9: + pid = 1 + #HFEM -> decide based on pid + elif elem_type == 8: + if abs(pid) in [11, 22]: + pid = 2 #produce EM candidate + else: + pid = 1 #produce hadronic + + #remap PID in case of HCAL cluster + if elem_type == 5 and (pid == 22 or abs(pid) == 11): + pid = 130 + + #reproduce ROOT.TLorentzVector behavior (https://root.cern.ch/doc/master/TVector3_8cxx_source.html#l00320) + try: + eta = lv.eta + except ZeroDivisionError: + eta = np.sign(lv.z)*10e10 + + gp = { + "pt": lv.pt, "eta": eta, "phi": lv.phi, "e": lv.energy, "typ": pid, "px": lv.x, "py": lv.y, "pz": lv.z, "charge": get_charge(pid) + } + + for j in range(len(elem_branches)): + Xelem[elem_branches[j]][ielem] = g.nodes[elem][elem_branches[j]] + + for j in range(len(target_branches)): + if not (candidate is None): + ycand[target_branches[j]][ielem] = g.nodes[candidate][target_branches[j]] + ygen[target_branches[j]][ielem] = gp[target_branches[j]] + + dm_elem_cand = scipy.sparse.coo_matrix(nx.to_numpy_matrix(graph_elem_cand, nodelist=all_elements)) + dm_elem_gen = scipy.sparse.coo_matrix(nx.to_numpy_matrix(graph_elem_gen, nodelist=all_elements)) + return Xelem, ycand, ygen, dm_elem_cand, dm_elem_gen +#end of prepare_normalized_table + + +def process(args): + infile = args.input + outpath = os.path.join(args.outpath, os.path.basename(infile).split(".")[0]) + tf = uproot.open(infile) + tt = tf["ana/pftree"] + + events_to_process = [i for i in range(tt.numentries)] + if not (args.event is None): + events_to_process = [args.event] + + all_data = [] + ifile = 0 + for iev in events_to_process: + print("processing event {}".format(iev)) + + ev = tt.arrays(flatten=True,entrystart=iev,entrystop=iev+1) + + element_type = ev[b'element_type'] + element_pt = ev[b'element_pt'] + element_e = ev[b'element_energy'] + element_eta = ev[b'element_eta'] + element_phi = ev[b'element_phi'] + element_eta_ecal = ev[b'element_eta_ecal'] + element_phi_ecal = ev[b'element_phi_ecal'] + element_eta_hcal = ev[b'element_eta_hcal'] + element_phi_hcal = ev[b'element_phi_hcal'] + element_trajpoint = ev[b'element_trajpoint'] + element_layer = ev[b'element_layer'] + element_charge = ev[b'element_charge'] + element_depth = ev[b'element_depth'] + element_deltap = ev[b'element_deltap'] + element_sigmadeltap = ev[b'element_sigmadeltap'] + element_px = ev[b'element_px'] + element_py = ev[b'element_py'] + element_pz = ev[b'element_pz'] + element_muon_dt_hits = ev[b'element_muon_dt_hits'] + element_muon_csc_hits = ev[b'element_muon_csc_hits'] + + trackingparticle_pid = ev[b'trackingparticle_pid'] + trackingparticle_pt = ev[b'trackingparticle_pt'] + trackingparticle_e = ev[b'trackingparticle_energy'] + trackingparticle_eta = ev[b'trackingparticle_eta'] + trackingparticle_phi = ev[b'trackingparticle_phi'] + 
trackingparticle_phi = ev[b'trackingparticle_phi'] + trackingparticle_px = ev[b'trackingparticle_px'] + trackingparticle_py = ev[b'trackingparticle_py'] + trackingparticle_pz = ev[b'trackingparticle_pz'] + + simcluster_pid = ev[b'simcluster_pid'] + simcluster_pt = ev[b'simcluster_pt'] + simcluster_e = ev[b'simcluster_energy'] + simcluster_eta = ev[b'simcluster_eta'] + simcluster_phi = ev[b'simcluster_phi'] + simcluster_px = ev[b'simcluster_px'] + simcluster_py = ev[b'simcluster_py'] + simcluster_pz = ev[b'simcluster_pz'] + + simcluster_idx_trackingparticle = ev[b'simcluster_idx_trackingparticle'] + pfcandidate_pdgid = ev[b'pfcandidate_pdgid'] + pfcandidate_pt = ev[b'pfcandidate_pt'] + pfcandidate_e = ev[b'pfcandidate_energy'] + pfcandidate_eta = ev[b'pfcandidate_eta'] + pfcandidate_phi = ev[b'pfcandidate_phi'] + pfcandidate_px = ev[b'pfcandidate_px'] + pfcandidate_py = ev[b'pfcandidate_py'] + pfcandidate_pz = ev[b'pfcandidate_pz'] + + g = nx.DiGraph() + for iobj in range(len(element_type)): + g.add_node(("elem", iobj), + typ=element_type[iobj], + pt=element_pt[iobj], + e=element_e[iobj], + eta=element_eta[iobj], + phi=element_phi[iobj], + eta_ecal=element_eta_ecal[iobj], + phi_ecal=element_phi_ecal[iobj], + eta_hcal=element_eta_hcal[iobj], + phi_hcal=element_phi_hcal[iobj], + trajpoint=element_trajpoint[iobj], + layer=element_layer[iobj], + charge=element_charge[iobj], + depth=element_depth[iobj], + deltap=element_deltap[iobj], + sigmadeltap=element_sigmadeltap[iobj], + px=element_px[iobj], + py=element_py[iobj], + pz=element_pz[iobj], + muon_dt_hits=element_muon_dt_hits[iobj], + muon_csc_hits=element_muon_csc_hits[iobj], + ) + for iobj in range(len(trackingparticle_pid)): + g.add_node(("tp", iobj), + typ=trackingparticle_pid[iobj], + pt=trackingparticle_pt[iobj], + e=trackingparticle_e[iobj], + eta=trackingparticle_eta[iobj], + phi=trackingparticle_phi[iobj], + px=trackingparticle_px[iobj], + py=trackingparticle_py[iobj], + pz=trackingparticle_pz[iobj], + ) + for iobj in range(len(simcluster_pid)): + g.add_node(("sc", iobj), + typ=simcluster_pid[iobj], + pt=simcluster_pt[iobj], + e=simcluster_e[iobj], + eta=simcluster_eta[iobj], + phi=simcluster_phi[iobj], + px=simcluster_px[iobj], + py=simcluster_py[iobj], + pz=simcluster_pz[iobj], + ) + + trackingparticle_to_element_first = ev[b'trackingparticle_to_element.first'] + trackingparticle_to_element_second = ev[b'trackingparticle_to_element.second'] + #for trackingparticles associated to elements, set a very high edge weight + for tp, elem in zip(trackingparticle_to_element_first, trackingparticle_to_element_second): + g.add_edge(("tp", tp), ("elem", elem), weight=99999.0) + + simcluster_to_element_first = ev[b'simcluster_to_element.first'] + simcluster_to_element_second = ev[b'simcluster_to_element.second'] + simcluster_to_element_cmp = ev[b'simcluster_to_element_cmp'] + for sc, elem, c in zip(simcluster_to_element_first, simcluster_to_element_second, simcluster_to_element_cmp): + g.add_edge(("sc", sc), ("elem", elem), weight=c) + + print("contracting nodes: trackingparticle to simcluster") + nodes_to_remove = [] + for idx_sc, idx_tp in enumerate(simcluster_idx_trackingparticle): + if idx_tp != -1: + for elem in g.neighbors(("sc", idx_sc)): + g.add_edge(("tp", idx_tp), elem, weight=g.edges[("sc", idx_sc), elem]["weight"]) + g.nodes[("tp", idx_tp)]["idx_sc"] = idx_sc + nodes_to_remove += [("sc", idx_sc)] + g.remove_nodes_from(nodes_to_remove) + + for iobj in range(len(pfcandidate_pdgid)): + g.add_node(("pfcand", iobj), + 
typ=pfcandidate_pdgid[iobj], + pt=pfcandidate_pt[iobj], + e=pfcandidate_e[iobj], + eta=pfcandidate_eta[iobj], + phi=pfcandidate_phi[iobj], + px=pfcandidate_px[iobj], + py=pfcandidate_py[iobj], + pz=pfcandidate_pz[iobj], + charge=get_charge(pfcandidate_pdgid[iobj]), + ) + + element_to_candidate_first = ev[b'element_to_candidate.first'] + element_to_candidate_second = ev[b'element_to_candidate.second'] + for elem, pfcand in zip(element_to_candidate_first, element_to_candidate_second): + g.add_edge(("elem", elem), ("pfcand", pfcand), weight=1.0) + print("Graph created: {} nodes, {} edges".format(len(g.nodes), len(g.edges))) + + g = cleanup_graph(g) + rg = g.reverse() + + #make tree visualizations for PFCandidates + ncand = 0 + for node in sorted(filter(lambda x: x[0]=="pfcand", g.nodes), key=lambda x: g.nodes[x]["pt"], reverse=True): + if ncand < args.plot_candidates: + print(node, g.nodes[node]["pt"]) + fig = save_ego_graph(rg, node, 3, False) + plt.savefig(outpath + "_ev_{}_cand_{}_idx_{}.pdf".format(iev, ncand, node[1]), bbox_inches="tight") + plt.clf() + del fig + ncand += 1 + + #fig = draw_event(g) + #plt.savefig(outpath + "_ev_{}.pdf".format(iev)) + #plt.clf() + + #do one-to-one associations + Xelem, ycand, ygen, dm_elem_cand, dm_elem_gen = prepare_normalized_table(g) + #dm = prepare_elem_distance_matrix(ev) + data = {} + + if args.save_normalized_table: + data = { + "Xelem": Xelem, + "ycand": ycand, + "ygen": ygen, + #"dm": dm, + "dm_elem_cand": dm_elem_cand, + "dm_elem_gen": dm_elem_gen + } + + if args.save_full_graph: + data["full_graph"] = g + + all_data += [data] + + if args.events_per_file > 0: + if len(all_data) == args.events_per_file: + print(outpath + "_{}.pkl".format(ifile)) + with open(outpath + "_{}.pkl".format(ifile), "wb") as fi: + pickle.dump(all_data, fi) + ifile += 1 + all_data = [] + + if args.events_per_file == -1: + print(outpath) + with open(outpath + ".pkl", "wb") as fi: + pickle.dump(all_data, fi) + +def parse_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, help="Input file from PFAnalysis", required=True) + parser.add_argument("--event", type=int, default=None, help="event index to process, omit to process all") + parser.add_argument("--outpath", type=str, default="raw", help="output path") + parser.add_argument("--plot-candidates", type=int, default=0, help="number of PFCandidates to plot as trees in pt-descending order") + parser.add_argument("--events-per-file", type=int, default=-1, help="number of events per output file, -1 for all") + parser.add_argument("--save-full-graph", action="store_true", help="save the full event graph") + parser.add_argument("--save-normalized-table", action="store_true", help="save the uniquely identified table") + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = parse_args() + process(args) + diff --git a/RecoParticleFlow/PFProducer/test/mlpf_training/run.sh b/RecoParticleFlow/PFProducer/test/mlpf_training/run.sh new file mode 100755 index 0000000000000..dee272ce416cd --- /dev/null +++ b/RecoParticleFlow/PFProducer/test/mlpf_training/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -e +set -x +mkdir -p TTbar_14TeV_TuneCUETP8M1_cfi/root +mkdir -p TTbar_14TeV_TuneCUETP8M1_cfi/raw +mkdir -p TTbar_14TeV_TuneCUETP8M1_cfi/tfr/cand + +./generate.sh TTbar_14TeV_TuneCUETP8M1_cfi 1 10 +cp pfntuple_1.root TTbar_14TeV_TuneCUETP8M1_cfi/root/ + +echo "now initialize TF 2.3" +source training_env/bin/activate +python3 preprocessing.py --input 
TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_1.root --save-normalized-table --outpath TTbar_14TeV_TuneCUETP8M1_cfi/raw/ --events-per-file 5 +python3 tf_data.py --datapath TTbar_14TeV_TuneCUETP8M1_cfi --target gen --num-files-per-tfr 1 +python3 tf_model.py --datapath TTbar_14TeV_TuneCUETP8M1_cfi --target gen --ntrain 5 --ntest 5 diff --git a/RecoParticleFlow/PFProducer/test/mlpf_training/tf_data.py b/RecoParticleFlow/PFProducer/test/mlpf_training/tf_data.py new file mode 100644 index 0000000000000..51cb89eea73f8 --- /dev/null +++ b/RecoParticleFlow/PFProducer/test/mlpf_training/tf_data.py @@ -0,0 +1,147 @@ +import numpy as np +import glob +import multiprocessing +import os + +import tensorflow as tf +from tf_model import load_one_file + +def parse_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="cand") + parser.add_argument("--datapath", type=str, required=True, help="Input data path") + parser.add_argument("--num-files-per-tfr", type=int, default=100, help="Number of pickle files to merge to one TFRecord file") + args = parser.parse_args() + return args + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + +#https://stackoverflow.com/questions/47861084/how-to-store-numpy-arrays-as-tfrecord +def _bytes_feature(value): + """Returns a bytes_list from a string / byte.""" + if isinstance(value, type(tf.constant(0))): # if value ist tensor + value = value.numpy() # get value of tensor + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + +def _parse_tfr_element(element): + parse_dic = { + 'X': tf.io.FixedLenFeature([], tf.string), + 'y': tf.io.FixedLenFeature([], tf.string), + 'w': tf.io.FixedLenFeature([], tf.string), + #'dm_row': tf.io.FixedLenFeature([], tf.string), + #'dm_col': tf.io.FixedLenFeature([], tf.string), + #'dm_data': tf.io.FixedLenFeature([], tf.string), + } + example_message = tf.io.parse_single_example(element, parse_dic) + + X = example_message['X'] + arr_X = tf.io.parse_tensor(X, out_type=tf.float32) + y = example_message['y'] + arr_y = tf.io.parse_tensor(y, out_type=tf.float32) + w = example_message['w'] + arr_w = tf.io.parse_tensor(w, out_type=tf.float32) + + #dm_row = example_message['dm_row'] + #arr_dm_row = tf.io.parse_tensor(dm_row, out_type=tf.int64) + #dm_col = example_message['dm_col'] + #arr_dm_col = tf.io.parse_tensor(dm_col, out_type=tf.int64) + #dm_data = example_message['dm_data'] + #arr_dm_data = tf.io.parse_tensor(dm_data, out_type=tf.float32) + + #https://github.com/tensorflow/tensorflow/issues/24520#issuecomment-577325475 + arr_X.set_shape(tf.TensorShape((None, 15))) + arr_y.set_shape(tf.TensorShape((None, 5))) + arr_w.set_shape(tf.TensorShape((None, ))) + #inds = tf.stack([arr_dm_row, arr_dm_col], axis=-1) + #dm_sparse = tf.SparseTensor(values=arr_dm_data, indices=inds, dense_shape=[tf.shape(arr_X)[0], tf.shape(arr_X)[0]]) + + return arr_X, arr_y, arr_w + +def serialize_X_y_w(writer, X, y, w): + feature = { + 'X': _bytes_feature(tf.io.serialize_tensor(X)), + 'y': _bytes_feature(tf.io.serialize_tensor(y)), + 'w': _bytes_feature(tf.io.serialize_tensor(w)), + #'dm_row': _bytes_feature(tf.io.serialize_tensor(np.array(dm.row, np.int64))), + #'dm_col': _bytes_feature(tf.io.serialize_tensor(np.array(dm.col, np.int64))), + #'dm_data': _bytes_feature(tf.io.serialize_tensor(dm.data)), + } + sample = 
tf.train.Example(features=tf.train.Features(feature=feature)) + writer.write(sample.SerializeToString()) + +def serialize_chunk(args): + path, files, ichunk, target = args + print(path, len(files), ichunk, target) + out_filename = os.path.join(path, "chunk_{}.tfrecords".format(ichunk)) + writer = tf.io.TFRecordWriter(out_filename) + Xs = [] + ys = [] + ws = [] + dms = [] + + for fi in files: + print(fi) + X, y, ycand = load_one_file(fi) + + Xs += X + if target == "cand": + ys += ycand + elif target == "gen": + ys += y + else: + raise Exception("Unknown target") + + #set weights for each sample to be equal to the number of samples of this type + #in the training script, this can be used to compute either inverse or class-balanced weights + uniq_vals, uniq_counts = np.unique(np.concatenate([y[:, 0] for y in ys]), return_counts=True) + for i in range(len(ys)): + w = np.ones(len(ys[i]), dtype=np.float32) + for uv, uc in zip(uniq_vals, uniq_counts): + w[ys[i][:, 0]==uv] = uc + ws += [w] + + for X, y, w in zip(Xs, ys, ws): + #print("serializing", X.shape, y.shape, w.shape) + serialize_X_y_w(writer, X, y, w) + + writer.close() + +if __name__ == "__main__": + args = parse_args() + #tf.config.experimental_run_functions_eagerly(True) + + datapath = args.datapath + + filelist = sorted(glob.glob("{}/raw/*.pkl".format(datapath))) + print("found {} files".format(len(filelist))) + assert(len(filelist) > 0) + #means, stds = extract_means_stds(filelist) + outpath = "{}/tfr/{}".format(datapath, args.target) + + if not os.path.isdir(outpath): + os.makedirs(outpath) + + pars = [] + for ichunk, files in enumerate(chunks(filelist, args.num_files_per_tfr)): + pars += [(outpath, files, ichunk, args.target)] + assert(len(pars) > 0) + #serialize_chunk(pars[0]) + #pool = multiprocessing.Pool(20) + for par in pars: + serialize_chunk(par) + + #Load and test the dataset + tfr_dataset = tf.data.TFRecordDataset(glob.glob(outpath + "/*.tfrecords")) + dataset = tfr_dataset.map(_parse_tfr_element) + num_ev = 0 + num_particles = 0 + for X, y, w in dataset: + num_ev += 1 + num_particles += len(X) + assert(num_ev > 0) + print("Created TFRecords dataset in {} with {} events, {} particles".format( + datapath, num_ev, num_particles)) diff --git a/RecoParticleFlow/PFProducer/test/mlpf_training/tf_model.py b/RecoParticleFlow/PFProducer/test/mlpf_training/tf_model.py new file mode 100644 index 0000000000000..2ae4b78c6dff5 --- /dev/null +++ b/RecoParticleFlow/PFProducer/test/mlpf_training/tf_model.py @@ -0,0 +1,1070 @@ +import os +import sys +import random +os.environ["KERAS_BACKEND"] = "tensorflow" + +import glob +try: + if not ("CUDA_VISIBLE_DEVICES" in os.environ): + print("importing setGPU") + import setGPU +except: + print("Could not import setGPU, please make sure you configure CUDA_VISIBLE_DEVICES manually") + pass + +try: + from comet_ml import Experiment + comet_enabled = True +except ImportError as e: + print("could not import comet, online dashboard disabled") + comet_enabled = False + +import pickle +import matplotlib.pyplot as plt +import numpy as np +from sklearn.metrics import confusion_matrix, accuracy_score +import pandas +import time +import itertools +import io +import tensorflow as tf + +#physical_devices = tf.config.list_physical_devices('GPU') +#tf.config.experimental.set_memory_growth(physical_devices[0], True) + +from numpy.lib.recfunctions import append_fields + +elem_labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] +class_labels = [0, 1, 2, 11, 13, 22, 130, 211] + +num_max_elems = 5000 + 
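+#the mult_* factors below set the relative weights of the individual terms (classification, charge, energy, phi, eta) in the combined loss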
+mult_classification_loss = 1e3 +mult_charge_loss = 1.0 +mult_energy_loss = 10.0 +mult_phi_loss = 10.0 +mult_eta_loss = 10.0 +mult_total_loss = 1e3 + +def split_indices_to_bins(cmul, nbins, bin_size): + bin_idx = tf.argmax(cmul, axis=-1) + bins_split = tf.reshape(tf.argsort(bin_idx), (nbins, bin_size)) + return bins_split + +def pairwise_dist(A, B): + na = tf.reduce_sum(tf.square(A), -1) + nb = tf.reduce_sum(tf.square(B), -1) + + # na as a row and nb as a column vectors + na = tf.expand_dims(na, -1) + nb = tf.expand_dims(nb, -2) + + # return pairwise euclidead difference matrix + D = tf.sqrt(tf.maximum(na - 2*tf.matmul(A, B, False, True) + nb, 1e-6)) + return D + +""" +sp_a: (nbatch, nelem, nelem) sparse distance matrices +b: (nbatch, nelem, ncol) dense per-element feature matrices +""" +def sparse_dense_matmult_batch(sp_a, b): + + num_batches = tf.shape(b)[0] + def map_function(x): + i, dense_slice = x[0], x[1] + num_points = tf.shape(b)[1] + + sparse_slice = tf.sparse.reshape(tf.sparse.slice( + sp_a, [i, 0, 0], [1, num_points, num_points]), + [num_points, num_points]) + mult_slice = tf.sparse.sparse_dense_matmul(sparse_slice, dense_slice) + return mult_slice + + elems = (tf.range(0, tf.cast(num_batches, tf.int64), delta=1, dtype=tf.int64), b) + ret = tf.map_fn(map_function, elems, fn_output_signature=tf.float32, back_prop=True) + return ret + + +def summarize_dataset(dataset): + yclasses = [] + nev = 0.0 + ntot = 0.0 + sizes = [] + + for X, y, w in dataset: + yclasses += [y[:, 0]] + nev += 1 + ntot += len(y) + sizes += [len(y)] + + yclasses = np.concatenate(yclasses) + values, counts= np.unique(yclasses, return_counts=True) + print("nev={}".format(nev)) + print("sizes={}".format(np.percentile(sizes, [25, 50, 95, 99]))) + for v, c in zip(values, counts): + print("label={} count={} frac={:.6f}".format(class_labels[int(v)], c, c/ntot)) + +#https://arxiv.org/pdf/1901.05555.pdf +beta = 0.9999 #beta -> 1 means weight by inverse frequency, beta -> 0 means no reweighting +def compute_weights_classbalanced(X, y, w): + wn = (1.0 - beta)/(1.0 - tf.pow(beta, w)) + wn /= tf.reduce_sum(wn) + return X, y, wn + +#uniform weights +def compute_weights_uniform(X, y, w): + wn = tf.ones_like(w) + wn /= tf.reduce_sum(wn) + return X, y, wn + +#weight proportional to 1/sqrt(N) +def compute_weights_inverse(X, y, w): + wn = 1.0/tf.sqrt(w) + wn /= tf.reduce_sum(wn) + return X, y, wn + +weight_schemes = { + "uniform": compute_weights_uniform, + "inverse": compute_weights_inverse, + "classbalanced": compute_weights_classbalanced, +} + +def load_one_file(fn): + Xs = [] + ys = [] + ys_cand = [] + dms = [] + + data = pickle.load(open(fn, "rb"), encoding='iso-8859-1') + for event in data: + Xelem = event["Xelem"] + ygen = event["ygen"] + ycand = event["ycand"] + + #remove PS from inputs, they don't seem to be very useful + msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3) + + Xelem = Xelem[~msk_ps] + ygen = ygen[~msk_ps] + ycand = ycand[~msk_ps] + + Xelem = append_fields(Xelem, "typ_idx", np.array([elem_labels.index(int(i)) for i in Xelem["typ"]], dtype=np.float32)) + ygen = append_fields(ygen, "typ_idx", np.array([class_labels.index(abs(int(i))) for i in ygen["typ"]], dtype=np.float32)) + ycand = append_fields(ycand, "typ_idx", np.array([class_labels.index(abs(int(i))) for i in ycand["typ"]], dtype=np.float32)) + + Xelem_flat = np.stack([Xelem[k].view(np.float32).data for k in [ + 'typ_idx', + 'pt', 'eta', 'phi', 'e', + 'layer', 'depth', 'charge', 'trajpoint', + 'eta_ecal', 'phi_ecal', 'eta_hcal', 'phi_hcal', + 
'muon_dt_hits', 'muon_csc_hits']], axis=-1 + ) + ygen_flat = np.stack([ygen[k].view(np.float32).data for k in [ + 'typ_idx', + 'eta', 'phi', 'e', 'charge', + ]], axis=-1 + ) + ycand_flat = np.stack([ycand[k].view(np.float32).data for k in [ + 'typ_idx', + 'eta', 'phi', 'e', 'charge', + ]], axis=-1 + ) + + #take care of outliers + Xelem_flat[np.isnan(Xelem_flat)] = 0 + Xelem_flat[np.abs(Xelem_flat) > 1e4] = 0 + ygen_flat[np.isnan(ygen_flat)] = 0 + ygen_flat[np.abs(ygen_flat) > 1e4] = 0 + ycand_flat[np.isnan(ycand_flat)] = 0 + ycand_flat[np.abs(ycand_flat) > 1e4] = 0 + + Xs += [Xelem_flat[:num_max_elems]] + ys += [ygen_flat[:num_max_elems]] + ys_cand += [ycand_flat[:num_max_elems]] + + print("created {} blocks, max size {}".format(len(Xs), max([len(X) for X in Xs]))) + return Xs, ys, ys_cand + + +class InputEncoding(tf.keras.layers.Layer): + def __init__(self, num_input_classes): + super(InputEncoding, self).__init__() + self.num_input_classes = num_input_classes + + """ + X: [Nbatch, Nelem, Nfeat] array of all the input detector element feature data + """ + def call(self, X): + + #X[:, :, 0] - categorical index of the element type + Xid = tf.cast(tf.one_hot(tf.cast(X[:, :, 0], tf.int32), self.num_input_classes), dtype=tf.float32) + + #X[:, :, 1:] - all the other non-categorical features + Xprop = X[:, :, 1:] + return tf.concat([Xid, Xprop], axis=-1) + +#https://arxiv.org/pdf/2004.04635.pdf +#https://github.com/gcucurull/jax-ghnet/blob/master/models.py +class GHConv(tf.keras.layers.Layer): + def __init__(self, *args, **kwargs): + self.activation = kwargs.pop("activation") + self.hidden_dim = args[0] + + super(GHConv, self).__init__(*args, **kwargs) + + def build(self, input_shape): + self.W_t = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_t", initializer="random_normal") + self.b_t = self.add_weight(shape=(self.hidden_dim, ), name="b_t", initializer="random_normal") + self.W_h = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_h", initializer="random_normal") + self.theta = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="theta", initializer="random_normal") + + def call(self, inputs): + x, adj = inputs + + #compute the normalization of the adjacency matrix + in_degrees = tf.sparse.reduce_sum(adj, axis=-1) + in_degrees = tf.reshape(in_degrees, (tf.shape(x)[0], tf.shape(x)[1])) + + #add epsilon to prevent numerical issues from 1/sqrt(x) + norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) + + f_hom = tf.linalg.matmul(x, self.theta) + f_hom = sparse_dense_matmult_batch(adj, f_hom*norm)*norm + + f_het = tf.linalg.matmul(x, self.W_h) + gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) + + out = gate*f_hom + (1-gate)*f_het + return self.activation(out) + +class GHConvDense(tf.keras.layers.Layer): + def __init__(self, *args, **kwargs): + self.activation = kwargs.pop("activation") + self.hidden_dim = args[0] + super(GHConvDense, self).__init__(*args, **kwargs) + + def build(self, input_shape): + self.W_t = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_t", initializer="random_normal") + self.b_t = self.add_weight(shape=(self.hidden_dim, ), name="b_t", initializer="random_normal") + self.W_h = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="w_h", initializer="random_normal") + self.theta = self.add_weight(shape=(self.hidden_dim, self.hidden_dim), name="theta", initializer="random_normal") + + def call(self, inputs): + x, adj = inputs + + #compute the normalization of the adjacency matrix + 
in_degrees = tf.reduce_sum(adj, axis=-1) + in_degrees = tf.reshape(in_degrees, (tf.shape(x)[0], tf.shape(x)[1])) + + #add epsilon to prevent numerical issues from 1/sqrt(x) + norm = tf.expand_dims(tf.pow(in_degrees + 1e-6, -0.5), -1) + + f_hom = tf.linalg.matmul(x, self.theta) + f_hom = tf.linalg.matmul(adj, f_hom*norm)*norm + + f_het = tf.linalg.matmul(x, self.W_h) + gate = tf.nn.sigmoid(tf.linalg.matmul(x, self.W_t) + self.b_t) + + out = gate*f_hom + (1-gate)*f_het + return self.activation(out) + +class DenseDistance(tf.keras.layers.Layer): + def __init__(self, dist_mult=0.1, **kwargs): + super(DenseDistance, self).__init__(**kwargs) + self.dist_mult = dist_mult + + def call(self, inputs, training=True): + dm = pairwise_dist(inputs, inputs) + dm = tf.exp(-self.dist_mult*dm) + return dm + +class SparseHashedNNDistance(tf.keras.layers.Layer): + def __init__(self, max_num_bins=200, bin_size=500, num_neighbors=5, dist_mult=0.1, cosine_dist=False, **kwargs): + super(SparseHashedNNDistance, self).__init__(**kwargs) + self.num_neighbors = num_neighbors + self.dist_mult = dist_mult + + self.cosine_dist = cosine_dist + + #generate the codebook for LSH hashing at model instantiation for up to this many bins + #set this to a high-enough value at model generation to take into account the largest possible input + self.max_num_bins = max_num_bins + + #each bin will receive this many input elements, in total we can accept max_num_bins*bin_size input elements + #in each bin, we will do a dense top_k evaluation + self.bin_size = bin_size + + def build(self, input_shape): + #(n_batch, n_points, n_features) + + #generate the LSH codebook for random rotations (num_features, num_bins/2) + self.codebook_random_rotations = self.add_weight( + shape=(input_shape[-1], self.max_num_bins//2), initializer="random_normal", trainable=False, name="lsh_projections" + ) + + def call(self, inputs, training=True): + + #(n_batch, n_points, n_features) + point_embedding = inputs + + n_batches = tf.shape(point_embedding)[0] + n_points = tf.shape(point_embedding)[1] + + #cannot concat sparse tensors directly as that incorrectly destroys the gradient, see + #https://github.com/tensorflow/tensorflow/blob/df3a3375941b9e920667acfe72fb4c33a8f45503/tensorflow/python/ops/sparse_grad.py#L33 + #therefore, for training, we implement sparse concatenation by hand + indices_all = [] + values_all = [] + + def func(args): + ibatch, points_batch = args[0], args[1] + dm = self.construct_sparse_dm_batch(points_batch) + inds = tf.concat([tf.expand_dims(tf.cast(ibatch, tf.int64)*tf.ones(tf.shape(dm.indices)[0], dtype=tf.int64), -1), dm.indices], axis=-1) + vals = dm.values + return inds, vals + + elems = (tf.range(0, tf.cast(n_batches, tf.int64), delta=1, dtype=tf.int64), point_embedding) + ret = tf.map_fn(func, elems, fn_output_signature=(tf.int64, tf.float32), parallel_iterations=1) + shp = tf.shape(ret[0]) + # #now create a new SparseTensor that is a concatenation of the previous ones + dms = tf.SparseTensor( + tf.reshape(ret[0], (shp[0]*shp[1], shp[2])), + tf.reshape(ret[1], (shp[0]*shp[1],)), + (n_batches, n_points, n_points) + ) + + return tf.sparse.reorder(dms) + + def subpoints_to_sparse_matrix(self, n_points, subindices, subpoints): + + #find the distance matrix between the given points using dense matrix multiplication + if self.cosine_dist: + normed = tf.nn.l2_normalize(subpoints, axis=-1) + dm = tf.linalg.matmul(subpoints, subpoints, transpose_b=True) + else: + dm = pairwise_dist(subpoints, subpoints) + dm = tf.exp(-self.dist_mult*dm) 
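+ #at this point dm is a dense (nbins, bin_size, bin_size) similarity matrix between the points of each LSH bin
+ #(exp(-dist_mult*distance) in the euclidean case), with larger values for points that are closer together
+ #below, top_k keeps the num_neighbors strongest edges per point, the bin-local indices are mapped back to
+ #global element indices via subindices, and the selected edges are accumulated into an (n_points, n_points) SparseTensor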
+ + dmshape = tf.shape(dm) + nbins = dmshape[0] + nelems = dmshape[1] + + #run KNN in the dense distance matrix, accumulate each index pair into a sparse distance matrix + top_k = tf.nn.top_k(dm, k=self.num_neighbors) + top_k_vals = tf.reshape(top_k.values, (nbins*nelems, self.num_neighbors)) + + indices_gathered = tf.vectorized_map( + lambda i: tf.gather_nd(subindices, top_k.indices[:, :, i:i+1], batch_dims=1), + tf.range(self.num_neighbors, dtype=tf.int64)) + + indices_gathered = tf.transpose(indices_gathered, [1,2,0]) + + #add the neighbors up to a big matrix using dense matrices, then convert to sparse (mainly for testing) + # sp_sum = tf.zeros((n_points, n_points)) + # for i in range(self.num_neighbors): + # dst_ind = indices_gathered[:, :, i] #(nbins, nelems) + # dst_ind = tf.reshape(dst_ind, (nbins*nelems, )) + # src_ind = tf.reshape(tf.stack(subindices), (nbins*nelems, )) + # src_dst_inds = tf.transpose(tf.stack([src_ind, dst_ind])) + # sp_sum += tf.scatter_nd(src_dst_inds, top_k_vals[:, i], (n_points, n_points)) + # spt_this = tf.sparse.from_dense(sp_sum) + # validate that the vectorized ops are doing what we want by hand while debugging + # dm = np.eye(n_points) + # for ibin in range(nbins): + # for ielem in range(nelems): + # idx0 = subindices[ibin][ielem] + # for ineigh in range(self.num_neighbors): + # idx1 = subindices[ibin][top_k.indices[ibin, ielem, ineigh]] + # val = top_k.values[ibin, ielem, ineigh] + # dm[idx0, idx1] += val + # assert(np.all(sp_sum.numpy() == dm)) + + #update the output using intermediate sparse matrices, which may result in some inconsistencies from duplicated indices + sp_sum = tf.sparse.SparseTensor(indices=tf.zeros((0,2), dtype=tf.int64), values=tf.zeros(0, tf.float32), dense_shape=(n_points, n_points)) + for i in range(self.num_neighbors): + dst_ind = indices_gathered[:, :, i] #(nbins, nelems) + dst_ind = tf.reshape(dst_ind, (nbins*nelems, )) + src_ind = tf.reshape(tf.stack(subindices), (nbins*nelems, )) + src_dst_inds = tf.cast(tf.transpose(tf.stack([src_ind, dst_ind])), dtype=tf.int64) + sp_sum = tf.sparse.add( + sp_sum, + tf.sparse.reorder(tf.sparse.SparseTensor(src_dst_inds, top_k_vals[:, i], (n_points, n_points))) + ) + spt_this = tf.sparse.reorder(sp_sum) + + return spt_this + + def construct_sparse_dm_batch(self, points): + + #points: (n_points, n_features) input elements for graph construction + n_points = tf.shape(points)[0] + n_features = tf.shape(points)[1] + + #compute the number of LSH bins to divide the input points into on the fly + #n_points must be divisible by bin_size exactly due to the use of reshape + n_bins = tf.math.floordiv(n_points, self.bin_size) + #tf.debugging.assert_greater(n_bins, 0) + + #put each input item into a bin defined by the softmax output across the LSH embedding + mul = tf.linalg.matmul(points, self.codebook_random_rotations[:, :n_bins//2]) + #tf.debugging.assert_greater(tf.shape(mul)[2], 0) + + cmul = tf.concat([mul, -mul], axis=-1) + + #cmul is now an integer in [0..nbins) for each input point + #bins_split: (n_bins, bin_size) of integer bin indices, which put each input point into a bin of size (n_points/n_bins) + bins_split = split_indices_to_bins(cmul, n_bins, self.bin_size) + + #parts: (n_bins, bin_size, n_features), the input points divided up into bins + parts = tf.gather(points, bins_split) + + #sparse_distance_matrix: (n_points, n_points) sparse distance matrix + #where higher values (closer to 1) are associated with points that are closely related + sparse_distance_matrix = 
self.subpoints_to_sparse_matrix(n_points, bins_split, parts) + + return sparse_distance_matrix + +class EncoderDecoderGNN(tf.keras.layers.Layer): + def __init__(self, encoders, decoders, dropout, activation, conv, **kwargs): + super(EncoderDecoderGNN, self).__init__(**kwargs) + name = kwargs.get("name") + + #assert(encoders[-1] == decoders[0]) + self.encoders = encoders + self.decoders = decoders + + self.encoding_layers = [] + for ilayer, nunits in enumerate(encoders): + self.encoding_layers.append( + tf.keras.layers.Dense(nunits, activation=activation, name="encoding_{}_{}".format(name, ilayer))) + if dropout > 0.0: + self.encoding_layers.append(tf.keras.layers.Dropout(dropout)) + + self.conv = conv + + self.decoding_layers = [] + for ilayer, nunits in enumerate(decoders): + self.decoding_layers.append( + tf.keras.layers.Dense(nunits, activation=activation, name="decoding_{}_{}".format(name, ilayer))) + if dropout > 0.0: + self.decoding_layers.append(tf.keras.layers.Dropout(dropout)) + + def call(self, inputs, distance_matrix, training=True): + x = inputs + + for layer in self.encoding_layers: + x = layer(x) + + for convlayer in self.conv: + x = convlayer([x, distance_matrix]) + + for layer in self.decoding_layers: + x = layer(x) + + return x + +class AddSparse(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super(AddSparse, self).__init__(**kwargs) + + def call(self, matrices): + ret = matrices[0] + for mat in matrices[1:]: + ret = tf.sparse.add(ret, mat) + return ret + +#Simple message passing based on a matrix multiplication +class PFNet(tf.keras.Model): + def __init__(self, + activation=tf.nn.selu, + hidden_dim_id=256, + hidden_dim_reg=256, + distance_dim=256, + convlayer="ghconv", + dropout=0.1, + bin_size=10, + num_convs_id=1, + num_convs_reg=1, + num_hidden_id_enc=1, + num_hidden_id_dec=1, + num_hidden_reg_enc=1, + num_hidden_reg_dec=1, + num_neighbors=5, + dist_mult=0.1, + cosine_dist=False): + + super(PFNet, self).__init__() + self.activation = activation + self.num_dists = 1 + + encoding_id = [] + decoding_id = [] + encoding_reg = [] + decoding_reg = [] + + #the encoder outputs and decoder inputs have to have the hidden dim (convlayer size) + for ihidden in range(num_hidden_id_enc): + encoding_id.append(hidden_dim_id) + + for ihidden in range(num_hidden_id_dec): + decoding_id.append(hidden_dim_id) + + for ihidden in range(num_hidden_reg_enc): + encoding_reg.append(hidden_dim_reg) + + for ihidden in range(num_hidden_reg_dec): + decoding_reg.append(hidden_dim_reg) + + self.enc = InputEncoding(len(elem_labels)) + self.layer_embedding = tf.keras.layers.Dense(distance_dim, name="embedding_attention") + + self.embedding_dropout = None + if dropout > 0.0: + self.embedding_dropout = tf.keras.layers.Dropout(dropout) + + self.dists = [] + for idist in range(self.num_dists): + self.dists.append(SparseHashedNNDistance(bin_size=bin_size, num_neighbors=num_neighbors, dist_mult=dist_mult, cosine_dist=cosine_dist)) + self.addsparse = AddSparse() + #self.dist = DenseDistance(dist_mult=dist_mult) + + convs_id = [] + convs_reg = [] + + for iconv in range(num_convs_id): + convs_id.append(GHConv(26 if len(encoding_id)==0 else hidden_dim_id, activation=activation, name="conv_id{}".format(iconv))) + for iconv in range(num_convs_reg): + convs_reg.append(GHConv(35 if len(encoding_reg)==0 else hidden_dim_reg, activation=activation, name="conv_reg{}".format(iconv))) + + self.gnn_id = EncoderDecoderGNN(encoding_id, decoding_id, dropout, activation, convs_id, name="gnn_id") + self.layer_id = 
tf.keras.layers.Dense(len(class_labels), activation="linear", name="out_id") + self.layer_charge = tf.keras.layers.Dense(1, activation="linear", name="out_charge") + + self.gnn_reg = EncoderDecoderGNN(encoding_reg, decoding_reg, dropout, activation, convs_reg, name="gnn_reg") + self.layer_momentum = tf.keras.layers.Dense(3, activation="linear", name="out_momentum") + + def create_model(self, num_max_elems, training=True): + inputs = tf.keras.Input(shape=(num_max_elems,15,)) + return tf.keras.Model(inputs=[inputs], outputs=self.call(inputs, training), name="MLPFNet") + + def call(self, inputs, training=True): + X = tf.cast(inputs, tf.float32) + msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, tf.float32), -1) + + enc = self.enc(inputs) + + #embed inputs for graph structure prediction + embedding_attention = self.layer_embedding(enc) + if self.embedding_dropout: + embedding_attention = self.embedding_dropout(embedding_attention, training) + + #create graph structure by predicting a sparse distance matrix + dms = [dist(embedding_attention, training) for dist in self.dists] + dm = self.addsparse(dms) + + #run graph net for multiclass id prediction + x_id = self.gnn_id(enc, dm, training) + to_decode = tf.concat([enc, x_id], axis=-1) + out_id_logits = self.layer_id(to_decode) + out_charge = self.layer_charge(to_decode) + + #run graph net for regression output prediction, taking as an additonal input the ID predictions + x_reg = self.gnn_reg(tf.concat([enc, out_id_logits, out_charge], axis=-1), dm, training) + to_decode = tf.concat([enc, x_reg], axis=-1) + pred_corr = self.layer_momentum(to_decode) + + #soft-mask elements for which the id prediction was 0 + probabilistic_mask_good = 1.0 - tf.keras.activations.softmax(out_id_logits)[:, :, 0] + + out_momentum_eta = X[:, :, 2] + pred_corr[:, :, 0] + out_momentum_phi = X[:, :, 3] + pred_corr[:, :, 1] + out_momentum_E = X[:, :, 4] + pred_corr[:, :, 2] + + out_momentum = tf.stack([ + out_momentum_eta * probabilistic_mask_good, + out_momentum_phi * probabilistic_mask_good, + out_momentum_E * probabilistic_mask_good, + ], axis=-1) + + ret = tf.concat([out_id_logits, out_momentum, out_charge], axis=-1)*msk_input + return ret + + def set_trainable_classification(self): + self.gnn_reg.trainable = False + self.layer_momentum.trainable = False + + def set_trainable_regression(self): + for layer in self.layers: + layer.trainable = False + self.gnn_reg.trainable = True + self.layer_momentum.trainable = True + +#Just a dummy elementwise model +class PFNetDummy(tf.keras.Model): + def __init__(self, **kwargs): + super(PFNetDummy, self).__init__() + self.enc = InputEncoding(len(elem_labels)) + + self.flatten = tf.keras.layers.Flatten() + self.layer_hidden0 = tf.keras.layers.Dense(32, activation="elu") + self.layer_hidden1 = tf.keras.layers.Dense(64, activation="elu") + self.layer_hidden2 = tf.keras.layers.Dense(128, activation="elu") + self.layer_hidden3 = tf.keras.layers.Dense(256, activation="elu") + + self.layer_id = tf.keras.layers.Dense(len(class_labels), activation="linear", name="out_id") + self.layer_charge = tf.keras.layers.Dense(1, activation="linear", name="out_charge") + self.layer_momentum = tf.keras.layers.Dense(3, activation="linear", name="out_momentum") + + def call(self, inputs, training=True): + X = tf.cast(inputs, tf.float32) + msk_input = tf.expand_dims(tf.cast(X[:, :, 0] != 0, tf.float32), -1) + enc = self.enc(inputs) + + h = self.layer_hidden0(flat) + h = self.layer_hidden1(h) + h = self.layer_hidden2(h) + h = self.layer_hidden3(h) + + 
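+ #decode the per-element hidden state into class logits, a charge estimate and (eta, phi, E) corrections,
+ #mirroring the output heads of the PFNet model above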
out_id_logits = self.layer_id(h) + out_charge = self.layer_charge(h) + pred_corr = self.layer_momentum(h) + + #soft-mask elements for which the id prediction was 0 + probabilistic_mask_good = 1.0 - tf.keras.activations.softmax(out_id_logits)[:, :, 0] + + out_momentum_eta = X[:, :, 2] + pred_corr[:, :, 0] + out_momentum_phi = X[:, :, 3] + pred_corr[:, :, 1] + out_momentum_E = X[:, :, 4] + pred_corr[:, :, 2] + + out_momentum = tf.stack([ + out_momentum_eta * probabilistic_mask_good, + out_momentum_phi * probabilistic_mask_good, + out_momentum_E * probabilistic_mask_good, + ], axis=-1) + + ret = tf.concat([out_id_logits, out_momentum, out_charge], axis=-1)*msk_input + return ret + + def set_trainable_classification(self): + self.layer_momentum.trainable = False + + def set_trainable_regression(self): + pass + + def create_model(self, num_max_elems, training=True): + inputs = tf.keras.Input(shape=(num_max_elems,15,)) + return tf.keras.Model(inputs=[inputs], outputs=self.call(inputs, training), name="MLPFNet") + +def separate_prediction(y_pred): + N = len(class_labels) + pred_id_logits = y_pred[:, :, :N] + pred_momentum = y_pred[:, :, N:N+3] + pred_charge = y_pred[:, :, N+3:N+4] + return pred_id_logits, pred_charge, pred_momentum + +def separate_truth(y_true): + true_id = tf.cast(y_true[:, :, :1], tf.int32) + true_momentum = y_true[:, :, 1:4] + true_charge = y_true[:, :, 4:5] + return true_id, true_charge, true_momentum + +def mse_unreduced(true, pred): + return tf.math.pow(true-pred,2) + +def msle_unreduced(true, pred): + return tf.math.pow(tf.math.log(tf.math.abs(true) + 1.0) - tf.math.log(tf.math.abs(pred) + 1.0), 2) + +def my_loss_cls(y_true, y_pred): + pred_id_logits, pred_charge, _ = separate_prediction(y_pred) + true_id, true_charge, _ = separate_truth(y_true) + + true_id_onehot = tf.one_hot(tf.cast(true_id, tf.int32), depth=len(class_labels)) + #predict the particle class labels + l1 = mult_classification_loss*tf.nn.softmax_cross_entropy_with_logits(true_id_onehot, pred_id_logits) + l3 = mult_charge_loss*mse_unreduced(true_charge, pred_charge)[:, :, 0] + + loss = l1 + l3 + return mult_total_loss*loss + +def my_loss_reg(y_true, y_pred): + _, _, pred_momentum = separate_prediction(y_pred) + _, true_charge, true_momentum = separate_truth(y_true) + + l2_0 = mult_eta_loss*mse_unreduced(true_momentum[:, :, 0], pred_momentum[:, :, 0]) + l2_1 = mult_phi_loss*mse_unreduced(tf.math.floormod(true_momentum[:, :, 1] - pred_momentum[:, :, 1] + np.pi, 2*np.pi) - np.pi, 0.0) + l2_2 = mult_energy_loss*mse_unreduced(true_momentum[:, :, 2], pred_momentum[:, :, 2]) + + loss = (l2_0 + l2_1 + l2_2) + + return 1e3*loss + +def my_loss_full(y_true, y_pred): + pred_id_logits, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_logits, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + true_id_onehot = tf.one_hot(tf.cast(true_id, tf.int32), depth=len(class_labels)) + + l1 = mult_classification_loss*tf.nn.softmax_cross_entropy_with_logits(true_id_onehot, pred_id_logits) + + l2_0 = mult_eta_loss*mse_unreduced(true_momentum[:, :, 0], pred_momentum[:, :, 0]) + l2_1 = mult_phi_loss*mse_unreduced(tf.math.floormod(true_momentum[:, :, 1] - pred_momentum[:, :, 1] + np.pi, 2*np.pi) - np.pi, 0.0) + l2_2 = mult_energy_loss*mse_unreduced(true_momentum[:, :, 2], pred_momentum[:, :, 2]) + + l2 = (l2_0 + l2_1 + l2_2) + + l3 = mult_charge_loss*mse_unreduced(true_charge, pred_charge)[:, :, 0] + loss = l1 + l2 + l3 + + return mult_total_loss*loss + +#TODO: 
put these in a class +def cls_130(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk_true = true_id[:, :, 0] == class_labels.index(130) + msk_pos = pred_id == class_labels.index(130) + num_true_pos = tf.reduce_sum(tf.cast(msk_true&msk_pos, tf.float32)) + num_true = tf.reduce_sum(tf.cast(msk_true, tf.float32)) + return num_true_pos/num_true + +def cls_211(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk_true = true_id[:, :, 0] == class_labels.index(211) + msk_pos = pred_id == class_labels.index(211) + num_true_pos = tf.reduce_sum(tf.cast(msk_true&msk_pos, tf.float32)) + num_true = tf.reduce_sum(tf.cast(msk_true, tf.float32)) + + return num_true_pos/num_true + +def cls_22(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk_true = true_id[:, :, 0] == class_labels.index(22) + msk_pos = pred_id == class_labels.index(22) + num_true_pos = tf.reduce_sum(tf.cast(msk_true&msk_pos, tf.float32)) + num_true = tf.reduce_sum(tf.cast(msk_true, tf.float32)) + + return num_true_pos/num_true + +def cls_11(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk_true = true_id[:, :, 0] == class_labels.index(11) + msk_pos = pred_id == class_labels.index(11) + num_true_pos = tf.reduce_sum(tf.cast(msk_true&msk_pos, tf.float32)) + num_true = tf.reduce_sum(tf.cast(msk_true, tf.float32)) + + return num_true_pos/num_true + +def cls_13(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk_true = true_id[:, :, 0] == class_labels.index(13) + msk_pos = pred_id == class_labels.index(13) + num_true_pos = tf.reduce_sum(tf.cast(msk_true&msk_pos, tf.float32)) + num_true = tf.reduce_sum(tf.cast(msk_true, tf.float32)) + + return num_true_pos/num_true + +def num_pred(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + ntrue = tf.reduce_sum(tf.cast(true_id[:, :, 0]!=0, tf.int32)) + npred = tf.reduce_sum(tf.cast(pred_id!=0, tf.int32)) + return tf.cast(ntrue - npred, tf.float32) + +def accuracy(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + is_true = true_id[:, :, 0]!=0 + is_same = true_id[:, :, 0] == pred_id + + acc = tf.reduce_sum(tf.cast(is_true&is_same, tf.int32)) / tf.reduce_sum(tf.cast(is_true, tf.int32)) + return tf.cast(acc, tf.float32) + +def eta_resolution(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) 
+ true_id, true_charge, true_momentum = separate_truth(y_true) + + msk = true_id[:, :, 0]!=0 + return tf.reduce_mean(mse_unreduced(true_momentum[msk][:, 0], pred_momentum[msk][:, 0])) + +def phi_resolution(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk = true_id[:, :, 0]!=0 + return tf.reduce_mean(mse_unreduced(tf.math.floormod(true_momentum[msk][:, 1] - pred_momentum[msk][:, 1] + np.pi, 2*np.pi) - np.pi, 0.0)) + +def energy_resolution(y_true, y_pred): + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(y_pred) + pred_id = tf.cast(tf.argmax(pred_id_onehot, axis=-1), tf.int32) + true_id, true_charge, true_momentum = separate_truth(y_true) + + msk = true_id[:, :, 0]!=0 + return tf.reduce_mean(mse_unreduced(true_momentum[msk][:, 2], pred_momentum[msk][:, 2])) + +def get_unique_run(): + previous_runs = os.listdir('experiments') + if len(previous_runs) == 0: + run_number = 1 + else: + run_number = max([int(s.split('run_')[1]) for s in previous_runs]) + 1 + return run_number + +def parse_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="PFNet", help="type of model to train", choices=["PFNet"]) + parser.add_argument("--ntrain", type=int, default=100, help="number of training events") + parser.add_argument("--ntest", type=int, default=100, help="number of testing events") + parser.add_argument("--nepochs", type=int, default=100, help="number of training epochs") + parser.add_argument("--hidden-dim-id", type=int, default=256, help="hidden dimension") + parser.add_argument("--hidden-dim-reg", type=int, default=256, help="hidden dimension") + parser.add_argument("--batch-size", type=int, default=1, help="number of events in training batch") + parser.add_argument("--num-convs-id", type=int, default=3, help="number of convolution layers") + parser.add_argument("--num-convs-reg", type=int, default=3, help="number of convolution layers") + parser.add_argument("--num-hidden-id-enc", type=int, default=2, help="number of encoder layers for multiclass") + parser.add_argument("--num-hidden-id-dec", type=int, default=2, help="number of decoder layers for multiclass") + parser.add_argument("--num-hidden-reg-enc", type=int, default=2, help="number of encoder layers for regression") + parser.add_argument("--num-hidden-reg-dec", type=int, default=2, help="number of decoder layers for regression") + parser.add_argument("--num-neighbors", type=int, default=5, help="number of knn neighbors") + parser.add_argument("--distance-dim", type=int, default=256, help="distance dimension") + parser.add_argument("--bin-size", type=int, default=100, help="number of points per LSH bin") + parser.add_argument("--dropout", type=float, default=0.1, help="Dropout rate") + parser.add_argument("--dist-mult", type=float, default=1.0, help="Exponential multiplier") + parser.add_argument("--target", type=str, choices=["cand", "gen"], help="Regress to PFCandidates or GenParticles", default="cand") + parser.add_argument("--weights", type=str, choices=["uniform", "inverse", "classbalanced"], help="Sample weighting scheme to use", default="inverse") + parser.add_argument("--name", type=str, default=None, help="where to store the output") + parser.add_argument("--convlayer", type=str, default="ghconv", choices=["ghconv"], help="Type of graph convolutional layer") + 
parser.add_argument("--load", type=str, default=None, help="model to load") + parser.add_argument("--datapath", type=str, help="Input data path", required=True) + parser.add_argument("--lr", type=float, default=1e-5, help="learning rate") + parser.add_argument("--lr-decay", type=float, default=0.0, help="learning rate decay") + parser.add_argument("--train-cls", action="store_true", help="Train only the classification part") + parser.add_argument("--train-reg", action="store_true", help="Train only the regression part") + parser.add_argument("--cosine-dist", action="store_true", help="Use cosine distance") + parser.add_argument("--eager", action="store_true", help="Run in eager mode for debugging") + args = parser.parse_args() + return args + +def assign_label(pred_id_onehot_linear): + ret2 = np.argmax(pred_id_onehot_linear, axis=-1) + return ret2 + +def prepare_df(model, data, outdir, target, save_raw=False): + print("prepare_df") + + dfs = [] + for iev, d in enumerate(data): + if iev%50==0: + tf.print(".", end="") + X, y, w = d + pred = model(X, training=False).numpy() + pred_id_onehot, pred_charge, pred_momentum = separate_prediction(pred) + pred_id = assign_label(pred_id_onehot).flatten() + + if save_raw: + np.savez_compressed("ev_{}.npz".format(iev), X=X.numpy(), y=y.numpy(), w=w.numpy(), y_pred=pred) + + pred_charge = pred_charge[:, :, 0].flatten() + pred_momentum = pred_momentum.reshape((pred_momentum.shape[0]*pred_momentum.shape[1], pred_momentum.shape[2])) + + true_id, true_charge, true_momentum = separate_truth(y) + true_id = true_id.numpy()[:, :, 0].flatten() + true_charge = true_charge.numpy()[:, :, 0].flatten() + true_momentum = true_momentum.numpy().reshape((true_momentum.shape[0]*true_momentum.shape[1], true_momentum.shape[2])) + + df = pandas.DataFrame() + df["pred_pid"] = np.array([int(class_labels[p]) for p in pred_id]) + df["pred_eta"] = np.array(pred_momentum[:, 0], dtype=np.float64) + df["pred_phi"] = np.array(pred_momentum[:, 1], dtype=np.float64) + df["pred_e"] = np.array(pred_momentum[:, 2], dtype=np.float64) + + df["{}_pid".format(target)] = np.array([int(class_labels[p]) for p in true_id]) + df["{}_eta".format(target)] = np.array(true_momentum[:, 0], dtype=np.float64) + df["{}_phi".format(target)] = np.array(true_momentum[:, 1], dtype=np.float64) + df["{}_e".format(target)] = np.array(true_momentum[:, 2], dtype=np.float64) + + df["iev"] = iev + dfs += [df] + df = pandas.concat(dfs, ignore_index=True) + fn = outdir + "/df.pkl.bz2" + df.to_pickle(fn) + print("prepare_df done", fn) + +def plot_to_image(figure): + """Converts the matplotlib plot specified by 'figure' to a PNG image and + returns it. The supplied figure is closed and inaccessible after this call.""" + # Save the plot to a PNG in memory. + buf = io.BytesIO() + plt.savefig(buf, format='png') + # Closing the figure prevents it from being displayed directly inside + # the notebook. 
+ buf.seek(0) + # Convert PNG buffer to TF image + image = tf.image.decode_png(buf.getvalue(), channels=4) + # Add the batch dimension + image = tf.expand_dims(image, 0) + return image + +def load_dataset_ttbar(datapath, target): + from tf_data import _parse_tfr_element + path = datapath + "/tfr/{}/*.tfrecords".format(target) + tfr_files = glob.glob(path) + if len(tfr_files) == 0: + raise Exception("Could not find any files in {}".format(path)) + dataset = tf.data.TFRecordDataset(tfr_files).map(_parse_tfr_element, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + +if __name__ == "__main__": + args = parse_args() + print(args) + + if comet_enabled: + experiment = Experiment(project_name="particleflow_tf") + + #tf.debugging.enable_check_numerics() + tf.config.experimental_run_functions_eagerly(args.eager) + + #batch size for loading data must be configured according to the number of distributed GPUs + global_batch_size = args.batch_size + try: + num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) + print("num_gpus=", num_gpus) + if num_gpus > 1: + strategy = tf.distribute.MirroredStrategy() + global_batch_size = num_gpus * args.batch_size + else: + strategy = tf.distribute.OneDeviceStrategy("gpu:0") + except Exception as e: + print("fallback to CPU") + strategy = tf.distribute.OneDeviceStrategy("cpu") + + filelist = sorted(glob.glob(args.datapath + "/raw/*.pkl"))[:args.ntrain+args.ntest] + + dataset = load_dataset_ttbar(args.datapath, args.target) + + #create padded input data + ps = (tf.TensorShape([num_max_elems, 15]), tf.TensorShape([num_max_elems, 5]), tf.TensorShape([num_max_elems, ])) + ds_train = dataset.take(args.ntrain).map(weight_schemes[args.weights]).padded_batch(global_batch_size, padded_shapes=ps) + ds_test = dataset.skip(args.ntrain).take(args.ntest).map(weight_schemes[args.weights]).padded_batch(global_batch_size, padded_shapes=ps) + + #repeat needed for keras api + ds_train_r = ds_train.repeat(args.nepochs) + ds_test_r = ds_test.repeat(args.nepochs) + + if args.lr_decay > 0: + lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + args.lr, + decay_steps=10*int(args.ntrain/global_batch_size), + decay_rate=args.lr_decay + ) + else: + lr_schedule = args.lr + + loss_fn = my_loss_full + + with strategy.scope(): + opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule) + + model = PFNet( + hidden_dim_id=args.hidden_dim_id, + hidden_dim_reg=args.hidden_dim_reg, + num_convs_id=args.num_convs_id, + num_convs_reg=args.num_convs_reg, + num_hidden_id_enc=args.num_hidden_id_enc, + num_hidden_id_dec=args.num_hidden_id_dec, + num_hidden_reg_enc=args.num_hidden_reg_enc, + num_hidden_reg_dec=args.num_hidden_reg_dec, + distance_dim=args.distance_dim, + convlayer=args.convlayer, + dropout=args.dropout, + bin_size=args.bin_size, + num_neighbors=args.num_neighbors, + dist_mult=args.dist_mult + ) + + if args.train_cls: + loss_fn = my_loss_cls + model.set_trainable_classification() + elif args.train_reg: + loss_fn = my_loss_reg + model.set_trainable_regression() + + model(np.random.randn(args.batch_size, num_max_elems, 15).astype(np.float32)) + if not args.eager: + model = model.create_model(num_max_elems) + model.summary() + + if not os.path.isdir("experiments"): + os.makedirs("experiments") + + if args.name is None: + args.name = 'run_{:02}'.format(get_unique_run()) + + outdir = 'experiments/' + args.name + + if os.path.isdir(outdir): + print("Output directory exists: {}".format(outdir), file=sys.stderr) + sys.exit(1) + + print(outdir) + callbacks = [] + 
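+ #training callbacks: TensorBoard logging, termination on NaN loss and per-epoch weight checkpoints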
tb = tf.keras.callbacks.TensorBoard( + log_dir=outdir, histogram_freq=0, write_graph=False, write_images=False, + update_freq='epoch', + #profile_batch=(10,40), + profile_batch=0, + ) + tb.set_model(model) + callbacks += [tb] + + terminate_cb = tf.keras.callbacks.TerminateOnNaN() + callbacks += [terminate_cb] + + cp_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=outdir + "/weights.{epoch:02d}-{val_loss:.6f}.hdf5", + save_weights_only=True, + verbose=0 + ) + cp_callback.set_model(model) + callbacks += [cp_callback] + + with strategy.scope(): + model.compile(optimizer=opt, loss=loss_fn, + metrics=[accuracy, cls_130, cls_211, cls_22, energy_resolution, eta_resolution, phi_resolution], + sample_weight_mode="temporal") + + if args.load: + #ensure model input size is known + for X, y, w in ds_train: + model(X) + break + + model.load_weights(args.load) + + if args.nepochs > 0: + ret = model.fit(ds_train_r, + validation_data=ds_test_r, epochs=args.nepochs, + steps_per_epoch=args.ntrain/global_batch_size, validation_steps=args.ntest/global_batch_size, + verbose=True, + callbacks=callbacks + ) diff --git a/Validation/RecoParticleFlow/plugins/PFAnalysis.cc b/Validation/RecoParticleFlow/plugins/PFAnalysisNtuplizer.cc similarity index 89% rename from Validation/RecoParticleFlow/plugins/PFAnalysis.cc rename to Validation/RecoParticleFlow/plugins/PFAnalysisNtuplizer.cc index e3cb706992090..a7b43bad1ed6b 100644 --- a/Validation/RecoParticleFlow/plugins/PFAnalysis.cc +++ b/Validation/RecoParticleFlow/plugins/PFAnalysisNtuplizer.cc @@ -49,6 +49,7 @@ #include "MagneticField/Engine/interface/MagneticField.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "MagneticField/VolumeGeometry/interface/MagVolumeOutsideValidity.h" +#include "RecoParticleFlow/PFProducer/interface/MLPFModel.h" #include "CommonTools/UtilAlgos/interface/TFileService.h" #include "FWCore/ServiceRegistry/interface/Service.h" @@ -73,40 +74,30 @@ class ElementWithIndex { : orig(_orig), idx_block(_idx_block), idx_elem(_idx_elem){}; }; -int find_element_ref(const vector& vec, const edm::RefToBase& r) { +vector find_element_ref(const vector& vec, const edm::RefToBase& r) { + vector ret; for (unsigned int i = 0; i < vec.size(); i++) { const auto& elem = vec.at(i); if (elem.orig.type() == reco::PFBlockElement::TRACK) { const auto& ref = elem.orig.trackRef(); - assert(ref.isNonnull()); - if (ref.key() == r.key()) { - return i; - } - } else if (elem.orig.type() == reco::PFBlockElement::BREM) { - const auto& ref = elem.orig.trackRefPF(); - if (ref.isNonnull()) { - const auto& ref2 = ref->trackRef(); - assert(ref2.isNonnull()); - if (ref2.key() == r.key()) { - return i; + if (ref.isNonnull() && ref->extra().isNonnull()) { + if (ref.key() == r.key()) { + ret.push_back(i); } } } } - return -1; + return ret; } -double detid_compare(const map& rechits, - const map& simhits, - const map& rechits_energy, - bool print) { +double detid_compare(const map& rechits, const map& simhits) { double ret = 0.0; for (const auto& rh : rechits) { for (const auto& sh : simhits) { if (rh.first == sh.first) { //rechit energy times simhit fraction - ret += rechits_energy.at(rh.first) * sh.second; + ret += rh.second * sh.second; break; } } @@ -236,6 +227,8 @@ class PFAnalysis : public edm::one::EDAnalyzer element_px_; vector element_py_; vector element_pz_; + vector element_deltap_; + vector element_sigmadeltap_; vector element_eta_; vector element_phi_; vector element_energy_; @@ -248,6 +241,8 @@ class PFAnalysis : public 
edm::one::EDAnalyzer element_layer_; vector element_depth_; vector element_trajpoint_; + vector element_muon_dt_hits_; + vector element_muon_csc_hits_; vector element_distance_i_; vector element_distance_j_; @@ -276,6 +271,8 @@ class PFAnalysis : public edm::one::EDAnalyzer>(edm::InputTag("particleFlowBlock")); pfCandidates_ = consumes>(edm::InputTag("particleFlow")); tracks_ = consumes>(edm::InputTag("generalTracks")); + saveHits = iConfig.getUntrackedParameter("saveHits", false); geometryToken_ = esConsumes(edm::ESInputTag{}); topologyToken_ = esConsumes(edm::ESInputTag{}); @@ -330,27 +328,29 @@ PFAnalysis::PFAnalysis(const edm::ParameterSet& iConfig) { t_->Branch("simcluster_idx_trackingparticle", &simcluster_idx_trackingparticle_); t_->Branch("simcluster_nhits", &simcluster_nhits_); - t_->Branch("simhit_frac", &simhit_frac_); - t_->Branch("simhit_x", &simhit_x_); - t_->Branch("simhit_y", &simhit_y_); - t_->Branch("simhit_z", &simhit_z_); - t_->Branch("simhit_det", &simhit_det_); - t_->Branch("simhit_subdet", &simhit_subdet_); - t_->Branch("simhit_eta", &simhit_eta_); - t_->Branch("simhit_phi", &simhit_phi_); - t_->Branch("simhit_idx_simcluster", &simhit_idx_simcluster_); - t_->Branch("simhit_detid", &simhit_detid_); - - t_->Branch("rechit_e", &rechit_e_); - t_->Branch("rechit_x", &rechit_x_); - t_->Branch("rechit_y", &rechit_y_); - t_->Branch("rechit_z", &rechit_z_); - t_->Branch("rechit_det", &rechit_det_); - t_->Branch("rechit_subdet", &rechit_subdet_); - t_->Branch("rechit_eta", &rechit_eta_); - t_->Branch("rechit_phi", &rechit_phi_); - t_->Branch("rechit_idx_element", &rechit_idx_element_); - t_->Branch("rechit_detid", &rechit_detid_); + if (saveHits) { + t_->Branch("simhit_frac", &simhit_frac_); + t_->Branch("simhit_x", &simhit_x_); + t_->Branch("simhit_y", &simhit_y_); + t_->Branch("simhit_z", &simhit_z_); + t_->Branch("simhit_det", &simhit_det_); + t_->Branch("simhit_subdet", &simhit_subdet_); + t_->Branch("simhit_eta", &simhit_eta_); + t_->Branch("simhit_phi", &simhit_phi_); + t_->Branch("simhit_idx_simcluster", &simhit_idx_simcluster_); + t_->Branch("simhit_detid", &simhit_detid_); + + t_->Branch("rechit_e", &rechit_e_); + t_->Branch("rechit_x", &rechit_x_); + t_->Branch("rechit_y", &rechit_y_); + t_->Branch("rechit_z", &rechit_z_); + t_->Branch("rechit_det", &rechit_det_); + t_->Branch("rechit_subdet", &rechit_subdet_); + t_->Branch("rechit_eta", &rechit_eta_); + t_->Branch("rechit_phi", &rechit_phi_); + t_->Branch("rechit_idx_element", &rechit_idx_element_); + t_->Branch("rechit_detid", &rechit_detid_); + } t_->Branch("simtrack_x", &simtrack_x_); t_->Branch("simtrack_y", &simtrack_y_); @@ -375,6 +375,8 @@ PFAnalysis::PFAnalysis(const edm::ParameterSet& iConfig) { t_->Branch("element_px", &element_px_); t_->Branch("element_py", &element_py_); t_->Branch("element_pz", &element_pz_); + t_->Branch("element_deltap", &element_deltap_); + t_->Branch("element_sigmadeltap", &element_sigmadeltap_); t_->Branch("element_eta", &element_eta_); t_->Branch("element_phi", &element_phi_); t_->Branch("element_energy", &element_energy_); @@ -387,6 +389,8 @@ PFAnalysis::PFAnalysis(const edm::ParameterSet& iConfig) { t_->Branch("element_layer", &element_layer_); t_->Branch("element_depth", &element_depth_); t_->Branch("element_trajpoint", &element_trajpoint_); + t_->Branch("element_muon_dt_hits", &element_muon_dt_hits_); + t_->Branch("element_muon_csc_hits", &element_muon_csc_hits_); //Distance matrix between PF elements t_->Branch("element_distance_i", &element_distance_i_); @@ -455,27 +459,29 
@@ void PFAnalysis::clearVariables() { simcluster_idx_trackingparticle_.clear(); simcluster_nhits_.clear(); - simhit_frac_.clear(); - simhit_x_.clear(); - simhit_y_.clear(); - simhit_z_.clear(); - simhit_det_.clear(); - simhit_subdet_.clear(); - simhit_eta_.clear(); - simhit_phi_.clear(); - simhit_idx_simcluster_.clear(); - simhit_detid_.clear(); - - rechit_e_.clear(); - rechit_x_.clear(); - rechit_y_.clear(); - rechit_z_.clear(); - rechit_det_.clear(); - rechit_subdet_.clear(); - rechit_eta_.clear(); - rechit_phi_.clear(); - rechit_idx_element_.clear(); - rechit_detid_.clear(); + if (saveHits) { + simhit_frac_.clear(); + simhit_x_.clear(); + simhit_y_.clear(); + simhit_z_.clear(); + simhit_det_.clear(); + simhit_subdet_.clear(); + simhit_eta_.clear(); + simhit_phi_.clear(); + simhit_idx_simcluster_.clear(); + simhit_detid_.clear(); + + rechit_e_.clear(); + rechit_x_.clear(); + rechit_y_.clear(); + rechit_z_.clear(); + rechit_det_.clear(); + rechit_subdet_.clear(); + rechit_eta_.clear(); + rechit_phi_.clear(); + rechit_idx_element_.clear(); + rechit_detid_.clear(); + } simtrack_x_.clear(); simtrack_y_.clear(); @@ -499,6 +505,8 @@ void PFAnalysis::clearVariables() { element_px_.clear(); element_py_.clear(); element_pz_.clear(); + element_deltap_.clear(); + element_sigmadeltap_.clear(); element_eta_.clear(); element_phi_.clear(); element_energy_.clear(); @@ -511,6 +519,8 @@ void PFAnalysis::clearVariables() { element_layer_.clear(); element_depth_.clear(); element_trajpoint_.clear(); + element_muon_dt_hits_.clear(); + element_muon_csc_hits_.clear(); element_distance_i_.clear(); element_distance_j_.clear(); @@ -616,12 +626,10 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup //We need to use the original reco::Track collection for track association for (unsigned long ntrack = 0; ntrack < tracks.size(); ntrack++) { edm::RefToBase trackref(trackHandle, ntrack); - - //get the index of the track in the 'all_elements' collection that we use later - int idx_in_all_elements = find_element_ref(all_elements, trackref); + const auto vec_idx_in_all_elements = find_element_ref(all_elements, trackref); //track was not used by PF, we skip as well - if (idx_in_all_elements == -1) { + if (vec_idx_in_all_elements.empty()) { continue; } @@ -629,11 +637,9 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup const auto& tps = recotosim[trackref]; for (const auto& tp : tps) { edm::Ref> tpr = tp.first; - trackingparticle_to_element.push_back(make_pair(tpr.key(), idx_in_all_elements)); - //cout << "trackingparticle_to_element " << tpr.key() << " " << idx_in_all_elements << endl; - //cout << "track.eta=" << trackref->eta() << " track.phi=" << trackref->phi() << endl; - //cout << "elem.eta=" << all_elements[idx_in_all_elements].orig.trackRef()->eta(); - //cout << " elem.phi=" << all_elements[idx_in_all_elements].orig.trackRef()->phi() << endl; + for (auto idx_in_all_elements : vec_idx_in_all_elements) { + trackingparticle_to_element.emplace_back(tpr.key(), idx_in_all_elements); + } } } } @@ -676,7 +682,6 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup const float z = pos.z(); const float eta = pos.eta(); const float phi = pos.phi(); - //const float e = simcluster->energy() * hf.second; simhit_frac_.push_back(hf.second); simhit_x_.push_back(x); @@ -727,6 +732,8 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup reco::PFBlockElement::Type type = orig.type(); float pt = 0.0; + float 
deltap = 0.0; + float sigmadeltap = 0.0; float px = 0.0; float py = 0.0; float pz = 0.0; @@ -741,6 +748,8 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup int charge = 0; int layer = 0; float depth = 0; + float muon_dt_hits = 0.0; + float muon_csc_hits = 0.0; if (type == reco::PFBlockElement::TRACK) { const auto& matched_pftrack = orig.trackRefPF(); @@ -763,40 +772,52 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup pz = ref->pz(); eta = ref->eta(); phi = ref->phi(); - energy = ref->pt() * cosh(ref->eta()); + energy = ref->p(); charge = ref->charge(); + + reco::MuonRef muonRef = orig.muonRef(); + if (muonRef.isNonnull()) { + reco::TrackRef standAloneMu = muonRef->standAloneMuon(); + if (standAloneMu.isNonnull()) { + muon_dt_hits = standAloneMu->hitPattern().numberOfValidMuonDTHits(); + muon_csc_hits = standAloneMu->hitPattern().numberOfValidMuonCSCHits(); + } + } + } else if (type == reco::PFBlockElement::BREM) { - //requires to keep GsfPFRecTracks const auto* orig2 = (const reco::PFBlockElementBrem*)&orig; const auto& ref = orig2->GsftrackRef(); if (ref.isNonnull()) { + deltap = orig2->DeltaP(); + sigmadeltap = orig2->SigmaDeltaP(); pt = ref->pt(); px = ref->px(); py = ref->py(); pz = ref->pz(); eta = ref->eta(); phi = ref->phi(); - energy = ref->pt() * cosh(ref->eta()); + energy = ref->p(); trajpoint = orig2->indTrajPoint(); charge = ref->charge(); } } else if (type == reco::PFBlockElement::GSF) { //requires to keep GsfPFRecTracks const auto* orig2 = (const reco::PFBlockElementGsfTrack*)&orig; - const auto& ref = orig2->GsftrackRef(); - if (ref.isNonnull()) { - pt = ref->pt(); - px = ref->px(); - py = ref->py(); - pz = ref->pz(); - eta = ref->eta(); - phi = ref->phi(); - energy = ref->pt() * cosh(ref->eta()); + const auto& vec = orig2->Pin(); + pt = vec.pt(); + px = vec.px(); + py = vec.py(); + pz = vec.pz(); + eta = vec.eta(); + phi = vec.phi(); + energy = vec.energy(); + if (!orig2->GsftrackRefPF().isNull()) { + charge = orig2->GsftrackRefPF()->charge(); } } else if (type == reco::PFBlockElement::ECAL || type == reco::PFBlockElement::PS1 || type == reco::PFBlockElement::PS2 || type == reco::PFBlockElement::HCAL || - type == reco::PFBlockElement::GSF || type == reco::PFBlockElement::HO || - type == reco::PFBlockElement::HFHAD || type == reco::PFBlockElement::HFEM) { + type == reco::PFBlockElement::HO || type == reco::PFBlockElement::HFHAD || + type == reco::PFBlockElement::HFEM) { const auto& ref = ((const reco::PFBlockElementCluster*)&orig)->clusterRef(); if (ref.isNonnull()) { eta = ref->eta(); @@ -836,6 +857,8 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup element_px_.push_back(px); element_py_.push_back(py); element_pz_.push_back(pz); + element_deltap_.push_back(deltap); + element_sigmadeltap_.push_back(sigmadeltap); element_eta_.push_back(eta); element_phi_.push_back(phi); element_energy_.push_back(energy); @@ -848,6 +871,8 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup element_layer_.push_back(layer); element_depth_.push_back(depth); element_trajpoint_.push_back(trajpoint); + element_muon_dt_hits_.push_back(muon_dt_hits); + element_muon_csc_hits_.push_back(muon_csc_hits); } //associate candidates to elements @@ -880,13 +905,6 @@ void PFAnalysis::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup icandidate += 1; } //pfCandidates - cout << "trackingParticles=" << trackingParticles.size() << " caloParticles=" << 
caloParticles.size() << endl; - cout << "all_elements=" << all_elements.size() << endl; - cout << "pfCandidates=" << pfCandidates.size() << endl; - cout << "trackingparticle_to_element=" << trackingparticle_to_element.size() - << " simcluster_to_element=" << simcluster_to_element.size() << endl; - cout << "element_to_candidate=" << element_to_candidate.size() << endl; - ev_event_ = iEvent.id().event(); ev_lumi_ = iEvent.id().luminosityBlock(); ev_run_ = iEvent.id().run(); @@ -957,8 +975,6 @@ pair, vector>> PFAnalysis::proce const auto vecidx = link.first; const auto dist = link.second.distance; const auto& ij = get_triu_vector_index(vecidx, block.elements().size()); - cout << "block " << iblock << " " << ielem << " " << vecidx << " " << ij.first << " " << ij.second << " " << dist - << endl; auto globalindex_i = ij.first + ret.size(); auto globalindex_j = ij.second + ret.size(); distances.push_back(make_tuple(globalindex_i, globalindex_j, dist)); @@ -998,7 +1014,7 @@ void PFAnalysis::associateClusterToSimCluster(const vector& al if (detids.find(pfrh.detId()) != detids.end()) { continue; } - detids[pfrh.detId()] += pfrh.energy(); + detids[pfrh.detId()] += pfrh.energy() * rh.fraction(); const auto id = DetId(pfrh.detId()); float x = 0; float y = 0; @@ -1020,10 +1036,10 @@ void PFAnalysis::associateClusterToSimCluster(const vector& al rechit_subdet_.push_back(id.subdetId()); rechit_eta_.push_back(eta); rechit_phi_.push_back(phi); - rechit_e_.push_back(pfrh.energy()); + rechit_e_.push_back(pfrh.energy() * rh.fraction()); rechit_idx_element_.push_back(idx_element); rechit_detid_.push_back(id.rawId()); - rechits_energy_all[id.rawId()] += pfrh.energy(); + rechits_energy_all[id.rawId()] += pfrh.energy() * rh.fraction(); } //rechit_fracs } else if (type == reco::PFBlockElement::SC) { const auto& clref = ((const reco::PFBlockElementSuperCluster*)&(elem.orig))->superClusterRef(); @@ -1036,7 +1052,7 @@ void PFAnalysis::associateClusterToSimCluster(const vector& al if (detids.find(rh.first.rawId()) != detids.end()) { continue; } - detids[rh.first.rawId()] += rh.second; + detids[rh.first.rawId()] += cluster.energy() * rh.second; const auto id = rh.first; float x = 0; float y = 0; @@ -1061,7 +1077,7 @@ void PFAnalysis::associateClusterToSimCluster(const vector& al rechit_e_.push_back(rh.second); rechit_idx_element_.push_back(idx_element); rechit_detid_.push_back(id.rawId()); - rechits_energy_all[id.rawId()] += rh.second; + rechits_energy_all[id.rawId()] += cluster.energy() * rh.second; } //rechit_fracs } detids_elements.push_back(detids); @@ -1085,7 +1101,7 @@ void PFAnalysis::associateClusterToSimCluster(const vector& al } //get the energy of the simcluster hits that matches detids of the rechits - double cmp = detid_compare(detids, simcluster_detids, rechits_energy_all, false); + double cmp = detid_compare(detids, simcluster_detids); if (cmp > 0) { simcluster_to_element.push_back(make_pair(isimcluster, ielement)); simcluster_to_element_cmp.push_back((float)cmp); diff --git a/Validation/RecoParticleFlow/python/customize_pfanalysis.py b/Validation/RecoParticleFlow/python/customize_pfanalysis.py index c54f0db341417..a38ebc9358555 100644 --- a/Validation/RecoParticleFlow/python/customize_pfanalysis.py +++ b/Validation/RecoParticleFlow/python/customize_pfanalysis.py @@ -32,4 +32,11 @@ def customize_step3(process): process.FEVTDEBUGHLToutput.outputCommands.append('keep recoPFRecHits_*_*_*') process.FEVTDEBUGHLToutput.outputCommands.append('keep recoGsfPFRecTracks_*_*_*') 
process.FEVTDEBUGHLToutput.outputCommands.append('keep *_particleFlowBlock_*_*') + process.FEVTDEBUGHLToutput.outputCommands.append('keep recoTracks_standAloneMuons_*_*') + process.FEVTDEBUGHLToutput.outputCommands.append('keep recoTrackExtras_standAloneMuons_*_*') + process.FEVTDEBUGHLToutput.outputCommands.append('keep recoMuons_*_*_*') + process.FEVTDEBUGHLToutput.outputCommands.append('keep recoTracks_*_*_*') + process.FEVTDEBUGHLToutput.outputCommands.append('keep recoGsfTracks_*_*_*') + process.FEVTDEBUGHLToutput.outputCommands.append('keep recoPFBlocks_*_*_*') + return process diff --git a/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py b/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py index 92facf3efbbc4..e20ce73b2c735 100644 --- a/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py +++ b/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py @@ -32,8 +32,7 @@ duplicateCheckMode = cms.untracked.string("noDuplicateCheck") ) -process.ana = cms.EDAnalyzer('PFAnalysis', -) +process.ana = cms.EDAnalyzer('PFAnalysisNtuplizer') process.TFileService = cms.Service("TFileService", fileName = cms.string("pfntuple.root") From f41b5e56c0c43f3983ef819b3f6931570df616f9 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 1 Dec 2020 11:33:24 +0200 Subject: [PATCH 2/4] switch to const tf pointer --- RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc b/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc index 1ae5c1877052e..add3f1e27ddd1 100644 --- a/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc +++ b/RecoParticleFlow/PFProducer/plugins/MLPFProducer.cc @@ -8,7 +8,7 @@ #include "RecoParticleFlow/PFProducer/interface/MLPFModel.h" struct MLPFCache { - std::atomic graph_def; + const tensorflow::GraphDef* graph_def; }; class MLPFProducer : public edm::stream::EDProducer > { From 228720b1e4352ac2c30f34bca0b2de5c97fa7c6b Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Tue, 1 Dec 2020 12:22:52 +0200 Subject: [PATCH 3/4] revert change --- Configuration/EventContent/python/EventContent_cff.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Configuration/EventContent/python/EventContent_cff.py b/Configuration/EventContent/python/EventContent_cff.py index 9195a125c8ea2..e7bd292dbf241 100644 --- a/Configuration/EventContent/python/EventContent_cff.py +++ b/Configuration/EventContent/python/EventContent_cff.py @@ -227,7 +227,6 @@ def SwapKeepAndDrop(l): outputCommands = RECOEventContent.outputCommands + RecoCTPPSRECO.outputCommands) phase2_hgcal.toModify(RECOEventContent, outputCommands = RECOEventContent.outputCommands + TICL_RECO.outputCommands) - # # # RAWRECO Data Tier definition @@ -401,7 +400,6 @@ def SwapKeepAndDrop(l): outputCommands = RECOSIMEventContent.outputCommands + RecoLocalFastTimeRECO.outputCommands) phase2_timing_layer.toModify(RECOSIMEventContent, outputCommands = RECOSIMEventContent.outputCommands + RecoMTDRECO.outputCommands) - # # # GENRAW Data Tier definition From ff6aece37df9759efd22469ca9817960cc4bc102 Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Wed, 9 Dec 2020 13:46:46 +0200 Subject: [PATCH 4/4] remove sonic triton producer for now --- .../plugins/MLPFProducerSonicTriton.cc | 136 ------------------ 1 file changed, 136 deletions(-) delete mode 100644 RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc diff --git a/RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc 
b/RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc deleted file mode 100644 index 43c3a25b56fd4..0000000000000 --- a/RecoParticleFlow/PFProducer/plugins/MLPFProducerSonicTriton.cc +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/ESHandle.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/MessageLogger/interface/MessageLogger.h" - -#include "HeterogeneousCore/SonicCore/interface/SonicEDProducer.h" -#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h" -#include "RecoParticleFlow/PFProducer/interface/MLPFModel.h" - -class MLPFProducerSonicTriton : public SonicEDProducer<TritonClient> { -public: - explicit MLPFProducerSonicTriton(edm::ParameterSet const& cfg) - : SonicEDProducer<TritonClient>(cfg), - pfCandidatesPutToken_{produces<reco::PFCandidateCollection>()}, - inputTagBlocks_(consumes<reco::PFBlockCollection>(cfg.getParameter<edm::InputTag>("src"))) { - this->setDebugName("MLPFProducerSonic"); - } - - void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, Input& iInput) override { - using namespace reco::mlpf; - - //get the PFElements in the event. Currently use PFBlock for convenience, but we don't need anything - //else the block does, later we can get the events directly from the event. - const auto& blocks = iEvent.get(inputTagBlocks_); - const auto& all_elements = getPFElements(blocks); - - const auto num_elements_total = all_elements.size(); - - //tensor size must be a multiple of the bin size and larger than the number of elements - const auto tensor_size = LSH_BIN_SIZE * (num_elements_total / LSH_BIN_SIZE + 1); - assert(tensor_size <= NUM_MAX_ELEMENTS_BATCH); - - auto& input1 = iInput.at("x"); - - //we ignore Sonic/Triton batching, as it doesn't create a dim-3 input for batch size 1. - //instead, we specify the batch dim as a model dim. - input1.setShape(0, 1); - input1.setShape(1, tensor_size); - - auto data1 = std::make_shared<TritonInput<float>>(1); - auto& vdata1 = (*data1)[0]; - vdata1.reserve(input1.sizeShape()); - - //Fill the input tensor - for (const auto* pelem : all_elements) { - const auto& elem = *pelem; - - //prepare the input array from the PFElement - const auto& props = getElementProperties(elem); - - //copy features to the input array - for (unsigned int iprop = 0; iprop < NUM_ELEMENT_FEATURES; iprop++) { - vdata1.push_back(normalize(props[iprop])); - } - } - input1.toServer(data1); - } - - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, Output const& iOutput) override { - using namespace reco::mlpf; - - //we need the input element list to set the refs on the produced candidate. Currently use PFBlock for convenience, but we don't need anything - //else the block does, later we can get the events directly from the event.
- const auto& blocks = iEvent.get(inputTagBlocks_); - const auto& all_elements = getPFElements(blocks); - - std::vector<reco::PFCandidate> pOutputCandidateCollection; - const auto& output1 = iOutput.at("Identity"); - - //get the data of the first (and only) batch - const auto& out_data = output1.fromServer<float>(); - - //batch size 1 - assert(output1.shape()[0] == 1); - - //model should have the correct number of outputs - assert(output1.shape()[2] == NUM_OUTPUTS); - - //we process only up to the true number of input elements, the prediction is padded to the bin size - const auto num_elem = all_elements.size(); - - for (size_t ielem = 0; ielem < num_elem; ielem++) { - //get the coefficients in the output corresponding to the class probabilities (raw logits) - std::vector<float> pred_id_logits; - for (unsigned int idx_id = 0; idx_id <= NUM_CLASS; idx_id++) { - pred_id_logits.push_back(out_data[0][ielem * NUM_OUTPUTS + idx_id]); - } - - //get the most probable class PDGID - int pred_pid = pdgid_encoding[argMax(pred_id_logits)]; - - //get the predicted momentum components - float pred_eta = out_data[0][ielem * NUM_OUTPUTS + IDX_ETA]; - float pred_phi = out_data[0][ielem * NUM_OUTPUTS + IDX_PHI]; - float pred_e = out_data[0][ielem * NUM_OUTPUTS + IDX_ENERGY]; - float pred_charge = out_data[0][ielem * NUM_OUTPUTS + IDX_CHARGE]; - - //a particle was predicted for this PFElement, otherwise it was a spectator - if (pred_pid != 0) { - auto cand = makeCandidate(pred_pid, pred_charge, pred_e, pred_eta, pred_phi); - setCandidateRefs(cand, all_elements, ielem); - pOutputCandidateCollection.push_back(cand); - } - } //loop over PFElements - - iEvent.emplace(pfCandidatesPutToken_, pOutputCandidateCollection); - } - - ~MLPFProducerSonicTriton() override {} - - //to ensure distinct cfi names - specialized below - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - TritonClient::fillPSetDescription(desc); - desc.add<edm::InputTag>("src", edm::InputTag("particleFlowBlock")); - descriptions.addWithDefaultLabel(desc); - } - -private: - const edm::EDPutTokenT<reco::PFCandidateCollection> pfCandidatesPutToken_; - const edm::EDGetTokenT<reco::PFBlockCollection> inputTagBlocks_; -}; - -DEFINE_FWK_MODULE(MLPFProducerSonicTriton); \ No newline at end of file
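
Note on the output decoding used by the removed Sonic/Triton producer (the same scheme is kept in MLPFProducer.cc): for each input element the network emits NUM_OUTPUTS floats, the first NUM_CLASS + 1 of which are class logits whose argmax selects a PDG id (class 0 meaning no candidate), followed by eta/phi/energy/charge regressions. The standalone C++ sketch below illustrates only that decoding step; the constant values, the pdgid table and the helper names are illustrative assumptions, not the actual contents of MLPFModel.h.

    // Standalone sketch of the per-element output decoding described above.
    // NUM_CLASS, NUM_OUTPUTS, the IDX_* offsets and pdgid_encoding are
    // illustrative placeholders, not the actual values from MLPFModel.h.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    constexpr unsigned int NUM_CLASS = 7;                // assumed: highest class index, so NUM_CLASS + 1 logits
    constexpr unsigned int IDX_ETA = NUM_CLASS + 1;      // assumed layout: regressions follow the logits
    constexpr unsigned int IDX_PHI = NUM_CLASS + 2;
    constexpr unsigned int IDX_ENERGY = NUM_CLASS + 3;
    constexpr unsigned int IDX_CHARGE = NUM_CLASS + 4;
    constexpr unsigned int NUM_OUTPUTS = NUM_CLASS + 5;  // logits + eta, phi, energy, charge
    const int pdgid_encoding[NUM_CLASS + 1] = {0, 211, 130, 22, 11, 13, 1, 2};  // placeholder table

    struct DecodedCandidate {
      int pdgid;
      float eta, phi, energy, charge;
    };

    // out_data holds the flat prediction for one padded batch: at least num_elem * NUM_OUTPUTS floats.
    std::vector<DecodedCandidate> decode(const std::vector<float>& out_data, std::size_t num_elem) {
      std::vector<DecodedCandidate> cands;
      for (std::size_t ielem = 0; ielem < num_elem; ielem++) {
        const float* row = &out_data[ielem * NUM_OUTPUTS];
        // argmax over the class logits selects the predicted particle type
        const float* best = std::max_element(row, row + NUM_CLASS + 1);
        const int pred_pid = pdgid_encoding[best - row];
        if (pred_pid == 0)
          continue;  // class 0 means the element yields no candidate (spectator)
        cands.push_back({pred_pid, row[IDX_ETA], row[IDX_PHI], row[IDX_ENERGY], row[IDX_CHARGE]});
      }
      return cands;
    }

    int main() {
      // one element, fake logits favouring class 1, plus fake regressed momentum values
      std::vector<float> out(NUM_OUTPUTS, 0.f);
      out[1] = 5.f;
      out[IDX_ETA] = 0.1f;
      out[IDX_PHI] = -1.2f;
      out[IDX_ENERGY] = 15.f;
      out[IDX_CHARGE] = 1.f;
      for (const auto& c : decode(out, 1))
        std::cout << "pdgid=" << c.pdgid << " E=" << c.energy << " eta=" << c.eta << " charge=" << c.charge << "\n";
      return 0;
    }

Compiled on its own (e.g. with g++ -std=c++17), this prints one decoded candidate for the single fake element; in the producer the same per-element loop instead builds reco::PFCandidates and sets their element references.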