From db4996112894597751b77edefc35fecd9cecd4ed Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Fri, 23 Oct 2015 12:08:35 +0200 Subject: [PATCH] add CBox-focused LLC monitoring, add C wrapper to intelpcm.so Added pcm-llc.cpp (and code in cpucounters.cpp) for monitoring the individual CBoxes, with a focus on LLC activity. Added (boring) wrapper code for using intelpcm.so in C code. --- .gitignore | 3 + Makefile | 10 +- cpucounters.cpp | 194 ++++++++++++++++++++- cpucounters.h | 58 ++++++- intelpcm.so/Makefile | 12 +- intelpcm.so/c_wrap_intelpcm.cc | 74 ++++++++ intelpcm.so/c_wrap_intelpcm.h | 68 ++++++++ pcm-llc.cpp | 299 +++++++++++++++++++++++++++++++++ pcm-pcie.cpp | 4 +- types.h | 12 +- utils.cpp | 3 +- 11 files changed, 715 insertions(+), 22 deletions(-) create mode 100644 intelpcm.so/c_wrap_intelpcm.cc create mode 100644 intelpcm.so/c_wrap_intelpcm.h create mode 100644 pcm-llc.cpp diff --git a/.gitignore b/.gitignore index ba7a906..42e10d2 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ *.vxd *.exe *.tgz +tags +intelpcm.so/libintelpcm.so +intelpcm.so/libintelpcm.a diff --git a/Makefile b/Makefile index f7a78d6..d0b98ac 100644 --- a/Makefile +++ b/Makefile @@ -3,15 +3,15 @@ # written by Roman Dementiev and Jim Harris # -EXE = pcm-numa.x pcm-power.x pcm.x pcm-sensor.x pcm-msr.x pcm-memory.x pcm-tsx.x pcm-pcie.x +EXE = pcm-numa.x pcm-power.x pcm.x pcm-sensor.x pcm-msr.x pcm-memory.x pcm-tsx.x pcm-pcie.x pcm-llc.x all: $(EXE) -CXXFLAGS += -Wall -g -O3 +CXXFLAGS += -Wall -g -O3 # uncomment if you want to rely on Linux perf support (user needs CAP_SYS_ADMIN privileges) ifneq ($(wildcard /usr/include/linux/perf_event.h),) -#CXXFLAGS += -DPCM_USE_PERF +CXXFLAGS += -DPCM_USE_PERF endif UNAME:=$(shell uname) @@ -20,8 +20,8 @@ ifeq ($(UNAME), Linux) LIB= -pthread -lrt endif ifeq ($(UNAME), Darwin) -LIB= -lpthread /usr/lib/libPcmMsr.dylib -CXXFLAGS += -I/usr/include +LIB= -lpthread /usr/lib/libPcmMsr.dylib +CXXFLAGS += -I/usr/include endif ifeq ($(UNAME), 
FreeBSD) CXX=c++ diff --git a/cpucounters.cpp b/cpucounters.cpp index bbb5e4b..6b891b9 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -1,5 +1,6 @@ /* Copyright (c) 2009-2014, Intel Corporation +some parts Copyright (c) 2015 Marius Hillenbrand, Karlsruhe Institute of Technology All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -16,6 +17,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND // Pat Fay // Austen Ott // Jim Harris (FreeBSD) +// Marius Hillenbrand (LLC-related code for CBoxes) /*! \file cpucounters.cpp \brief The bulk of Intel PCM implementation @@ -437,6 +439,12 @@ void PCM::initL3CacheOccupancyMonitoring() return; } + /* Check if we can access MSRs at all (would segfault otherwise) */ + if(MSR == NULL) + { + return; + } + unsigned maxRMID; const uint64 event = 1; //L3 Occupancy monitoring @@ -4342,12 +4350,31 @@ void PCM::programCboOpcodeFilter(const uint32 opc, const uint32 cbo, SafeMsrHand } } -void PCM::programPCIeMissCounters(const PCM::PCIeEventCode event_, const uint32 tid_) +void PCM::programCboFilter0(const uint32 state, const int32 filterCoreId, const int32 filterThreadId, const uint32 cbo, SafeMsrHandle * msr) +{ + uint32 tid = 0; + + if(filterCoreId != -1 && filterThreadId != -1) { + tid = ( ((filterCoreId & 0xf) << 1) | (filterThreadId & 0x1) ); + } + + if(JAKETOWN == cpu_model || IVYTOWN == cpu_model) + { + // erm, I do not know, do not care. 
+ + } else if(HASWELLX == cpu_model) + { + msr->write(CX_MSR_PMON_BOX_FILTER(cbo), HSX_CBO_MSR_PMON_BOX_FILTER_STATE(state) | tid); + } +} + + +void PCM::programPCIeMissCounters(const PCM::CBoxOpcode event_, const uint32 tid_) { programPCIeCounters(event_,tid_,1); } -void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_, const uint32 miss_) +void PCM::programPCIeCounters(const PCM::CBoxOpcode event_, const uint32 tid_, const uint32 miss_) { for (int32 i = 0; (i < num_sockets) && MSR; ++i) { @@ -4389,6 +4416,124 @@ void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_ } } +// programLLCCounters +void PCM::programLLCCounters(LLCRequestType requestType, CBoxOpcode opcode, int filterCoreId, int filterThreadId) +{ + + // minimal attempt at docs: + // * there are two filter config registers per CBox, which allow + // filtering events for cache line state, opcode of a request, + // and originating core/thread id and node id. + // * the event configuration specifies which filters are active. + // The LLC_LOOKUP event uses cache line state but not request + // opcode, the TOR_INSERT filters for opcode, but not cache line + // state. + // + // Here we use counter 0 for the LLC_LOOKUP event and + // counter 1 for the TOR_INSERT event for tracking incoming + // requests. + // + // We optionally filter LLC_LOOKUPs for cache line state and + // TOR_INSERTs for opcode (and request type). In both cases, the + // Any/AnyOp pseudo-filter causes all events to be counted. + // + // Further, we allow to filter events for the originating core and + // thread. + // tid field in FILTER0: + // * bit 5: non-thread related data + // * bits 4:1 core-id + // * bit 0: thread id on the core + // (someone does not like SMT beyond 2-thread HT?!) 
+ // tid_en field in the counter config registers (bit 19) + + // TODO move this documentation somewhere else + + for (int32 i = 0; (i < num_sockets) && MSR; ++i) + { + uint32 refCore = socketRefCore[i]; + TemporalThreadAffinity tempThreadAffinity(refCore); // speedup trick for Linux + + uint32 tid_en = 0; + + if(filterCoreId != -1 && filterThreadId != -1) + tid_en = CBO_MSR_PMON_CTL_TID_EN; + + for(uint32 cbo = 0; cbo < getMaxNumOfCBoxes(); ++cbo) + { + // freeze enable + MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN); + // freeze + MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN + CBO_MSR_PMON_BOX_CTL_FRZ); + +#ifdef PCM_UNCORE_PMON_BOX_CHECK_STATUS + uint64 val = 0; + MSR[refCore]->read(CX_MSR_PMON_BOX_CTL(cbo), &val); + if ((val & UNCORE_PMON_BOX_CTL_VALID_BITS_MASK) != (CBO_MSR_PMON_BOX_CTL_FRZ_EN + CBO_MSR_PMON_BOX_CTL_FRZ)) + { + std::cerr << "ERROR: CBO counter programming seems not to work. "; + std::cerr << "C" << std::dec << cbo << "_MSR_PMON_BOX_CTL=0x" << std::hex << val << std::endl; + } +#endif + // program filter 0, state to 0x1F + programCboFilter0(0x1F, filterCoreId, filterThreadId, cbo, MSR[refCore]); + + MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 0), CBO_MSR_PMON_CTL_EN); + // LLC_Lookup event, capture any request + MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 0), \ + CBO_MSR_PMON_CTL_EN | CBO_MSR_PMON_CTL_EVENT(0x34) \ + | (CBO_MSR_PMON_CTL_UMASK(requestType)) | tid_en); + + + MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 1), CBO_MSR_PMON_CTL_EN); + // program counter 1 for TOR_INSERTS + // umask: all requests + uint64 umask; + switch(opcode) { + case AnyOp: umask = 0x08; break; + case WB: umask = 0x10; break; + default: umask = 0x01; // filtered by opcode + // program filter 1, opcode + programCboOpcodeFilter(opcode, cbo, MSR[refCore]); + break; + } + + // TODO change umask for filtering for opcodes + MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 1), \ + CBO_MSR_PMON_CTL_EN | 
CBO_MSR_PMON_CTL_EVENT(0x35) \ + | CBO_MSR_PMON_CTL_UMASK(umask) | tid_en ); + + // reset counter values + MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN + CBO_MSR_PMON_BOX_CTL_FRZ + CBO_MSR_PMON_BOX_CTL_RST_COUNTERS); + + // unfreeze counters + MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN); + } + } +} + +LLCCounterState PCM::getLLCCounterState(const uint32 socket_) +{ + uint32 refCore = socketRefCore[socket_]; + TemporalThreadAffinity tempThreadAffinity(refCore); // speedup trick for Linux + + uint32 cbo; + + LLCCounterState res = LLCCounterState(); // zero every slot; only the first min(#CBoxes, 18) are filled below + + for(cbo=0; MSR && cbo < getMaxNumOfCBoxes() && cbo < 18; ++cbo) // MSR is NULL when MSR access is unavailable + { + uint64 ctrVal = 0; + // get number of LLC lookups from first counter + MSR[refCore]->read(CX_MSR_PMON_CTRY(cbo, 0), &ctrVal); + res.lookups[cbo] = ctrVal; + // get number of requests from second counter + MSR[refCore]->read(CX_MSR_PMON_CTRY(cbo, 1), &ctrVal); + res.requests[cbo] = ctrVal; + } + + return res; +} + PCIeCounterState PCM::getPCIeCounterState(const uint32 socket_) { PCIeCounterState result; @@ -4404,3 +4549,48 @@ PCIeCounterState PCM::getPCIeCounterState(const uint32 socket_) } return result; } + +std::ostream& operator<<(std::ostream& out, const PCM::CBoxOpcode opc) { + const char * str = "Unknown"; // fallback: streaming a null char* is undefined +#define STR_CASE(v) case(PCM::v): str = #v; break; + switch(opc) { + STR_CASE(PCIeRdCur); + STR_CASE(PCIeNSRd); + STR_CASE(PCIeWiLF); + STR_CASE(PCIeItoM); + STR_CASE(PCIeNSWr); + STR_CASE(PCIeNSWrF); + STR_CASE(RFO); + STR_CASE(CRd); + STR_CASE(DRd); + STR_CASE(PRd); + STR_CASE(WCiLF); + STR_CASE(WCiL); + STR_CASE(WiL); + STR_CASE(WbMtoI); + STR_CASE(WbMtoE); + STR_CASE(ItoM); + case PCM::AnyOp: str = "Any"; break; + STR_CASE(WB); + } + out << str; + return out; + } +#undef STR_CASE + +std::ostream& operator<<(std::ostream& out, const PCM::LLCRequestType type) { + const char * str = "Unknown"; // fallback: streaming a null char* is undefined +#define STR_CASE(v) case(PCM::v): str = #v; break; + switch(type) { + STR_CASE(DataRead); + STR_CASE(Write); + 
STR_CASE(RemoteSnoop); + STR_CASE(Any); + STR_CASE(Read); + STR_CASE(Nid); + } + out << str; + return out; + } +#undef STR_CASE + diff --git a/cpucounters.h b/cpucounters.h index b89d74b..f6ed970 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -186,14 +186,21 @@ class PCIeCounterState { friend uint64 getNumberOfEvents(PCIeCounterState before, PCIeCounterState after); friend class PCM; - uint64 data; public: + uint64 data; PCIeCounterState(): data(0) { } virtual ~PCIeCounterState() {} }; +struct LLCCounterState { + uint64 lookups[18]; + uint64 requests[18]; +}; + + + #ifndef HACK_TO_REMOVE_DUPLICATE_ERROR template class INTELPCM_API std::allocator; template class INTELPCM_API std::vector; @@ -446,8 +453,8 @@ class INTELPCM_API PCM uint32 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const; uint32 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const; uint32 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const; - uint32 getMaxNumOfCBoxes() const; void programCboOpcodeFilter(const uint32 opc, const uint32 cbo, SafeMsrHandle * msr); + void programCboFilter0(const uint32 state,const int32 filterCoreId, const int32 filterThreadId, const uint32 cbo, SafeMsrHandle * msr); public: /*! @@ -457,6 +464,8 @@ class INTELPCM_API PCM */ bool L3CacheOccupancyMetricAvailable(); + uint32 getMaxNumOfCBoxes() const; + /*! 
* \brief returns the max number of RMID supported by socket * @@ -808,7 +817,8 @@ class INTELPCM_API PCM inline void disableJKTWorkaround() { disable_JKT_workaround = true; } - enum PCIeEventCode + // hijacked for more general opcodes + enum CBoxOpcode { // PCIe read events (PCI devices reading from memory - application writes to disk/network/PCIe device) PCIeRdCur = 0x19E, // PCIe read current (full cache line) @@ -823,10 +833,18 @@ class INTELPCM_API PCM CRd = 0x181, // Demand Code Read DRd = 0x182, // Demand Data Read PRd = 0x187, // Partial Reads (UC) (MMIO Read) + WCiLF = 0x18C, // Full Streaming Store - write invalidate full cache line + WCiL = 0x18D, // Partial Streaming Store - write invalidate for partial cache line WiL = 0x18F, // Write Invalidate Line - partial (MMIO write), PL: Not documented in HSX/IVT + WbMtoI = 0x1C4, // Request writeback and invalidation of modified line + WbMtoE = 0x1C5, // Request writeback of modified line, set to exclusive ItoM = 0x1C8, // Request Invalidate Line; share the same code for CPU, use tid to filter PCIe only traffic + WB, // pseudo-opcode for actual writebacks + AnyOp, // pseudo-opcode for do not filter }; + friend std::ostream& operator<<(std::ostream& out, const CBoxOpcode opc); + enum CBoEventTid { RFOtid = 0x3E, @@ -836,14 +854,44 @@ class INTELPCM_API PCM //! \brief Program uncore PCIe monitoring event(s) //! \param event_ a PCIe event to monitor //! \param tid_ tid filter (PCM supports it only on Haswell server) - void programPCIeCounters(const PCIeEventCode event_, const uint32 tid_ = 0, const uint32 miss_ = 0); - void programPCIeMissCounters(const PCIeEventCode event_, const uint32 tid_ = 0); + void programPCIeCounters(const CBoxOpcode event_, const uint32 tid_ = 0, const uint32 miss_ = 0); + void programPCIeMissCounters(const CBoxOpcode event_, const uint32 tid_ = 0); //! \brief Get the state of PCIe counter(s) //! \param socket_ socket of the PCIe controller //! 
\return State of PCIe counter(s) PCIeCounterState getPCIeCounterState(const uint32 socket_); + enum LLCRequestType + { + DataRead = 0x03, + Write = 0x05, + RemoteSnoop = 0x09, + Any = 0x11, + Read = 0x21, // any read request + Nid = 0x41, // Node Id filter + + }; + + friend std::ostream& operator<<(std::ostream& out, const LLCRequestType type); + + //! \brief Program uncore LLC monitoring event(s) + //! \param requestType which types of requests to monitor + void programLLCCounters(LLCRequestType requestType, \ + CBoxOpcode opcode, int filterCoreId, int filterThreadId); + + // TODO add function with opcode filter ..., add opcode any?! + + //! \brief Program uncore LLC monitoring for any request type + void programLLCCounters() { programLLCCounters(Any, AnyOp, -1, -1); } + + //! \brief Return current counter values of Cboxes + //! \param socket_ socket where to read Cbox counters + //! \return per-CBox counter values: lookups[] (counter 0) and + //! requests[] (counter 1); only the first + //! min(getMaxNumOfCBoxes(), 18) entries of each array are written + LLCCounterState getLLCCounterState(const uint32 socket_); + uint64 extractCoreGenCounterValue(uint64 val); uint64 extractCoreFixedCounterValue(uint64 val); uint64 extractUncoreGenCounterValue(uint64 val); diff --git a/intelpcm.so/Makefile b/intelpcm.so/Makefile index 7ad56a0..928c271 100644 --- a/intelpcm.so/Makefile +++ b/intelpcm.so/Makefile @@ -1,14 +1,20 @@ # Copyright (c) 2014 Intel Corporation -all: libintelpcm.so +all: libintelpcm.so libintelpcm.a OPT= -g -O3 -CXXFLAGS+= -Wall -fPIC $(OPT) +CXXFLAGS+= -Wall -fPIC $(OPT) -I../ vpath %.cpp .. 
-libintelpcm.so: msr.o cpucounters.o pci.o client_bw.o +OBJS=msr.o cpucounters.o pci.o client_bw.o c_wrap_intelpcm.o + + +libintelpcm.so: $(OBJS) $(CXX) $(CXXFLAGS) -shared $^ -lpthread -o $@ +libintelpcm.a: $(OBJS) + ar rcs $@ $^ + clean: rm -rf *.x *.o *~ *.so diff --git a/intelpcm.so/c_wrap_intelpcm.cc b/intelpcm.so/c_wrap_intelpcm.cc new file mode 100644 index 0000000..97f1e78 --- /dev/null +++ b/intelpcm.so/c_wrap_intelpcm.cc @@ -0,0 +1,74 @@ +// wrap_intelpcm.cc +// Intel library is C++ only, so we need to wrap it up nicely +// Copyright 2015 Marius Hillenbrand, Karlsruhe Institute of Technology + +#include "c_wrap_intelpcm.h" +#include + +pcm_handle_t getInstance() { + + PCM * m = PCM::getInstance(); + + m->setBlocked(false); + + return (pcm_handle_t) m; +} + +int getNumSockets(pcm_handle_t instance) { + + PCM * m = (PCM *) instance; + + return m->getNumSockets(); +} + +int getMaxNumOfCBoxes(pcm_handle_t instance) { + + PCM * m = (PCM *) instance; + + return m->getMaxNumOfCBoxes(); +} + + +void programLLCCounters(pcm_handle_t instance) { + + PCM * m = (PCM *) instance; + + m->programLLCCounters(); +} + +// void freezeUncoreCounters(pcm_handle_t instance); +//void unfreezeUncoreCounters(pcm_handle_t instance); + +struct LLCCounters getLLCCounterState(pcm_handle_t instance, int socket) { + + PCM * m = (PCM *) instance; + + LLCCounterState res = m->getLLCCounterState(socket); + struct LLCCounters ret; + + memset(&ret, 0, sizeof(struct LLCCounters)); + + memcpy(&ret, &res, sizeof(ret)); + + return ret; +} + + +void programPCIeCounters(pcm_handle_t instance, enum CBoxOpc opc, uint32 tid, uint32 miss) { + + PCM::CBoxOpcode event = PCM::CBoxOpcode(opc); + + PCM * m = (PCM *) instance; + + m->programPCIeCounters(event, tid, miss); +} + +uint64_t getPCIeCounters(pcm_handle_t instance, int socket) { + + PCM * m = (PCM *) instance; + + PCIeCounterState s = m->getPCIeCounterState(socket); + + return s.data; +} + diff --git a/intelpcm.so/c_wrap_intelpcm.h 
b/intelpcm.so/c_wrap_intelpcm.h new file mode 100644 index 0000000..a65d72b --- /dev/null +++ b/intelpcm.so/c_wrap_intelpcm.h @@ -0,0 +1,68 @@ +// wrap_intelpcm.h +// +// definitions for C wrapper around Intel's (C++-only) PCM library +// Copyright 2015 Marius Hillenbrand, Karlsruhe Institute of Technology + +#include + +typedef void * pcm_handle_t; + +#ifdef __cplusplus +extern "C" { +#endif + +struct LLCCounters { + // only getMaxNumOfCBoxes() values will be set + uint64_t lookups[18]; + uint64_t requests[18]; +}; + +enum CBoxOpc + { + // PCIe read events (PCI devices reading from memory - + // application writes to disk/network/PCIe device) + PCIeRdCur = 0x19E, // PCIe read current (full cache line) + PCIeNSRd = 0x1E4, // PCIe non-snoop read (full cache line) + // PCIe write events (PCI devices writing to memory - + // application reads from disk/network/PCIe device) + PCIeWiLF = 0x194, // PCIe Write (non-allocating) (full cache line) + PCIeItoM = 0x19C, // PCIe Write (allocating) (full cache line) + PCIeNSWr = 0x1E5, // PCIe Non-snoop write (partial cache line) + PCIeNSWrF = 0x1E6, // PCIe Non-snoop write (full cache line) + // events shared by CPU and IO + RFO = 0x180, // Demand Data RFO; share the same code for CPU, use tid to filter PCIe only traffic + CRd = 0x181, // Demand Code Read + DRd = 0x182, // Demand Data Read + PRd = 0x187, // Partial Reads (UC) (MMIO Read) + WCiLF = 0x18C, // Full Streaming Store - write invalidate full cache line + WCiL = 0x18D, // Partial Streaming Store - write invalidate for partial cache line + WiL = 0x18F, // Write Invalidate Line - partial (MMIO write), PL: Not documented in HSX/IVT + WbMtoI = 0x1C4, // Request writeback and invalidation of modified line + WbMtoE = 0x1C5, // Request writeback of modified line, set to exclusive + ItoM = 0x1C8, // Request Invalidate Line; share the same code for CPU, use tid to filter PCIe only traffic + WB, // pseudo-opcode for actual writebacks + AnyOp, // pseudo-opcode for do not filter + 
}; + +pcm_handle_t getInstance(); + +int getNumSockets(pcm_handle_t instance); +int getMaxNumOfCBoxes(pcm_handle_t instance); + +// Last Level Cache events +void programLLCCounters(pcm_handle_t instance); + +struct LLCCounters getLLCCounterState(pcm_handle_t instance, int socket); + +// PCIe related events +// program and reset counters +// track misses (miss=1) or hits (miss=0) +void programPCIeCounters(pcm_handle_t instance, enum CBoxOpc opc, uint32_t tid, uint32_t miss); + +// gets current reading of counters, aggregates over all CBoxes +uint64_t getPCIeCounters(pcm_handle_t instance, int socket); + +#ifdef __cplusplus +} +#endif + diff --git a/pcm-llc.cpp b/pcm-llc.cpp new file mode 100644 index 0000000..93e719b --- /dev/null +++ b/pcm-llc.cpp @@ -0,0 +1,299 @@ +/* + Copyright (c) 2009-2013, Intel Corporation +some parts Copyright (c) 2015 Marius Hillenbrand, Karlsruhe Institute of Technology + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +// written by Patrick Lu +// modified by Marius Hillenbrand + + +/*! \file pcm-llc.cpp + \brief Use CBox counters for monitoring LLC lookups + */ +#define HACK_TO_REMOVE_DUPLICATE_ERROR +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cpucounters.h" +#include "utils.h" + +#define PCM_DELAY_DEFAULT 2.0 // in seconds +#define PCM_DELAY_MIN 0.015 // 15 milliseconds is practical on most modern CPUs +#define PCM_CALIBRATION_INTERVAL 50 // calibrate clock only every 50th iteration + +using namespace std; + +const uint32 max_sockets = 4; + +void printInfoLine(uint64 counters[18], int cboxes, int csv) +{ + // determine hot cboxes (those counting above the average of all cboxes) + uint64 sum = 0; + uint64 avg; + + for(int j=0; j < cboxes; j++) + sum += counters[j]; + + avg = (cboxes > 0) ? (sum / cboxes) : 0; // guard: zero CBoxes would divide by zero + + // print counter values + if(csv) + { + for(int j=0; j < cboxes; j++) + cout << "," << counters[j]; + } + else + { + for(int j=0; j < cboxes; j++) { + // show whether cbox is hot + if( counters[j] > avg ) + cout << " | *" << counters[j] << "*"; + else + cout << " | " << counters[j]; + } + } + + cout << "\n"; +} + +int main(int argc, char * argv[]) +{ + set_signal_handlers(); + + std::cout.flags ( std::ios::showbase ); + +#ifdef PCM_FORCE_SILENT + null_stream nullStream1, nullStream2; + std::cout.rdbuf(&nullStream1); + std::cerr.rdbuf(&nullStream2); +#endif + + cerr << endl; + cerr << " Intel(r) Performance Counter Monitor: LLC Monitoring Utility "<< endl; + cerr 
<< " modified to report LLC activity per CBox (is each CBox a slice?!?!" << endl; + cerr << endl; + cerr << " Copyright (c) 2013-2014 Intel Corporation" << endl; + cerr << " Copyright (c) 2015 Marius Hillenbrand, Karlsruhe Institute of Technology" << endl; + cerr << endl; + + double delay = PCM_DELAY_DEFAULT; + bool csv = false; + string program = string(argv[0]); + + PCM * m = PCM::getInstance(); + + m->disableJKTWorkaround(); + PCM::ErrorCode status = m->program(); + switch (status) + { + case PCM::Success: + break; + case PCM::MSRAccessDenied: + cerr << "Access to Intel(r) Performance Counter Monitor has denied (no MSR or PCI CFG space access)." << endl; + exit(EXIT_FAILURE); + case PCM::PMUBusy: + cerr << "Access to Intel(r) Performance Counter Monitor has denied (Performance Monitoring Unit is occupied by other application). Try to stop the application that uses PMU." << endl; + cerr << "Alternatively you can try to reset PMU configuration at your own risk. Try to reset? (y/n)" << endl; + char yn; + std::cin >> yn; + if ('y' == yn) + { + m->resetPMU(); + cerr << "PMU configuration has been reset. Try to rerun the program again." << endl; + } + exit(EXIT_FAILURE); + default: + cerr << "Access to Intel(r) Performance Counter Monitor has denied (Unknown error)." << endl; + exit(EXIT_FAILURE); + } + + cerr << "\nDetected "<< m->getCPUBrandString() << " \"Intel(r) microarchitecture codename "<getUArchCodename()<<"\""<hasPCICFGUncore())) + { + cerr << "Jaketown, Ivytown, Haswell Server CPU is required for this tool! Program aborted" << endl; + exit(EXIT_FAILURE); + } + + if(m->getNumSockets() > max_sockets) + { + cerr << "Only systems with up to "<getNumCores() != m->getNumOnlineCores()) + { + cerr << "Core offlining is not supported yet. 
Program aborted" << endl; + exit(EXIT_FAILURE); + } + + m->setBlocked(false); + + cerr << "Update every "<getMaxNumOfCBoxes(); + + // setup counters + m->freezeServerUncoreCounters(); + m->programLLCCounters(rqt, opc, coreId, 0); + + //m->unfreezeServerUncoreCounters(); // should not run now + + // wait + MySleepUs(delay_ms*1000); + + + if(m->getCPUModel() == PCM::HASWELLX) // Haswell Server + { + // TODO: completely remodel this stuff + for(i=0;igetNumSockets(); ++i) + { + LLCCounterState cnt; + + m->freezeServerUncoreCounters(); + cnt = m->getLLCCounterState(i); + + // print start of line for LLC lookups + if(csv) + { + cout << i << "," << "lookup" << ","; + cout << coreId << ","; + cout << std::hex << rqt << std::dec; + } + else + { + cout << " " << i << " | " "lkp" << " | "; + cout << " " << coreId << " | "; + cout << std::hex << rqt << std::dec << " "; + } + + // print counter values + printInfoLine(cnt.lookups, cboxes, csv); + + // print start of line for LLC requests + // TODO add opcode, if filtered + if(csv) + { + cout << i << "," << "req" << ","; + cout << coreId << ","; + cout << std::hex << opc << std::dec; + } + else + { + cout << " " << i << " | " << "req" << " | "; + cout << " " << coreId << " | "; + cout << std::hex << opc << std::dec << " "; + } + + // print counter values + printInfoLine(cnt.requests, cboxes, csv); + + } + if(!csv) + { + cout << "-----------------------------------------------------------------------\n"; + cout << " * "; + cout << "\n\n"; + } + } + else // Ivytown and Older Architectures + { + cerr << "We do not support older architectures than Haswell." 
<< endl; + break; + } + + // iterate request type forward + switch(rqt) { + case PCM::DataRead: rqt = PCM::Write; break; + case PCM::Write: rqt = PCM::RemoteSnoop; // do not do break; + case PCM::RemoteSnoop: rqt = PCM::Any; break; + case PCM::Any: rqt = PCM::Read; break; + case PCM::Read: rqt = PCM::DataRead; break; + case PCM::Nid: rqt = PCM::DataRead; break; // unused, wrap over + } + // nope, stick to Read + //rqt = PCM::Read; + + // iterate cbox opcode forward +#define ITER(p) opc = PCM::p; break; \ + case PCM::p: + + switch(opc) { + case PCM::RFO: + ITER(CRd); + ITER(DRd); + ITER(PRd); + ITER(WCiLF); + ITER(WCiL); + ITER(WiL); + ITER(WbMtoI); + ITER(WbMtoE); + ITER(ItoM); + ITER(AnyOp); + ITER(WB); opc=PCM::RFO; break; + default: opc = PCM::RFO; + } +#undef ITER + // nope, stick to RFO + //opc = PCM::RFO; + + // iterate core Id + //coreId++; + //if(coreId > 7) coreId = 0; + + } + + // ================================== End Printing Output ================================== + + exit(EXIT_SUCCESS); +} + diff --git a/pcm-pcie.cpp b/pcm-pcie.cpp index cdd9847..9a5d3a6 100644 --- a/pcm-pcie.cpp +++ b/pcm-pcie.cpp @@ -71,7 +71,7 @@ uint32 num_events = (sizeof(PCIeEvents_t)/sizeof(uint64)); using namespace std; const uint32 max_sockets = 4; -void getPCIeEvents(PCM *m, PCM::PCIeEventCode opcode, uint32 delay_ms, sample_t *sample, const uint32 tid=0); +void getPCIeEvents(PCM *m, PCM::CBoxOpcode opcode, uint32 delay_ms, sample_t *sample, const uint32 tid=0); void print_events() { @@ -784,7 +784,7 @@ int main(int argc, char * argv[]) exit(EXIT_SUCCESS); } -void getPCIeEvents(PCM *m, PCM::PCIeEventCode opcode, uint32 delay_ms, sample_t *sample, const uint32 tid) +void getPCIeEvents(PCM *m, PCM::CBoxOpcode opcode, uint32 delay_ms, sample_t *sample, const uint32 tid) { PCIeCounterState * before = new PCIeCounterState[m->getNumSockets()]; PCIeCounterState * after = new PCIeCounterState[m->getNumSockets()]; diff --git a/types.h b/types.h index 25ef460..1b5b98f 100644 --- 
a/types.h +++ b/types.h @@ -31,10 +31,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #include -typedef unsigned long long uint64; -typedef signed long long int64; -typedef unsigned int uint32; -typedef signed int int32; +#include + +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; /* @@ -680,6 +682,8 @@ struct BecktonUncorePMUCNTCTLRegister #define JKT_CBO_MSR_PMON_BOX_FILTER_OPC(x) (x<<23UL) #define IVTHSX_CBO_MSR_PMON_BOX_FILTER1_OPC(x) (x<<20UL) +#define HSX_CBO_MSR_PMON_BOX_FILTER_STATE(x) ((x)<<17UL) /* argument parenthesized so expression arguments are shifted as a whole */ + #define MSR_PACKAGE_THERM_STATUS (0x01B1) #define MSR_IA32_THERM_STATUS (0x019C) #define PCM_INVALID_THERMAL_HEADROOM ((std::numeric_limits::min)()) diff --git a/utils.cpp b/utils.cpp index db47cd3..a069c6c 100644 --- a/utils.cpp +++ b/utils.cpp @@ -152,7 +152,8 @@ void sigINT_handler(int signum) // in case PCM is blocked just return and summary will be dumped in // calling function, if needed - if( PCM::getInstance()->isBlocked() ) + /* FIX: exit in case of a segmentation fault */ + if( PCM::getInstance()->isBlocked() && signum != SIGSEGV ) return; else exit_cleanup();