Skip to content

Commit

Permalink
Merge branch 'cbox_llc_mods'
Browse files Browse the repository at this point in the history
Conflicts:
	Makefile
	cpucounters.h
	intelpcm.so/Makefile
  • Loading branch information
Marius Hillenbrand committed Oct 30, 2015
2 parents 382e64a + db49961 commit 0ae2000
Show file tree
Hide file tree
Showing 11 changed files with 708 additions and 17 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@
*.vxd
*.exe
*.tgz
tags
intelpcm.so/libintelpcm.so
intelpcm.so/libintelpcm.a
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# written by Roman Dementiev and Jim Harris
#

EXE = pcm-numa.x pcm-power.x pcm.x pcm-sensor.x pcm-msr.x pcm-memory.x pcm-tsx.x pcm-pcie.x pcm-core.x
EXE = pcm-numa.x pcm-power.x pcm.x pcm-sensor.x pcm-msr.x pcm-memory.x pcm-tsx.x pcm-pcie.x pcm-core.x pcm-llc.x

all: $(EXE)

Expand Down
194 changes: 192 additions & 2 deletions cpucounters.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
Copyright (c) 2009-2014, Intel Corporation
some parts Copyright (c) 2015 Marius Hillenbrand, Karlsruhe Institute of Technology
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
Expand All @@ -16,6 +17,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// Pat Fay
// Austen Ott
// Jim Harris (FreeBSD)
// Marius Hillenbrand (LLC-related code for CBoxes)

/*! \file cpucounters.cpp
\brief The bulk of Intel PCM implementation
Expand Down Expand Up @@ -471,6 +473,12 @@ void PCM::initL3CacheOccupancyMonitoring()
return;
}

/* Check if we can access MSRs at all (would segfault otherwise) */
if(MSR == NULL)
{
return;
}

unsigned maxRMID;

const uint64 event = 1; //L3 Occupancy monitoring
Expand Down Expand Up @@ -4240,12 +4248,31 @@ void PCM::programCboOpcodeFilter(const uint32 opc, const uint32 cbo, std::shared
}
}

void PCM::programPCIeMissCounters(const PCM::PCIeEventCode event_, const uint32 tid_)
void PCM::programCboFilter0(const uint32 state, const int32 filterCoreId, const int32 filterThreadId, const uint32 cbo, SafeMsrHandle * msr)
{
uint32 tid = 0;

if(filterCoreId != -1 && filterThreadId != -1) {
tid = ( ((filterCoreId & 0xf) << 1) | (filterThreadId & 0x1) );
}

if(JAKETOWN == cpu_model || IVYTOWN == cpu_model)
{
// erm, I do not know, do not care.

} else if(HASWELLX == cpu_model)
{
msr->write(CX_MSR_PMON_BOX_FILTER(cbo), HSX_CBO_MSR_PMON_BOX_FILTER_STATE(state) | tid);
}
}


//! \brief Convenience wrapper: programs the PCIe counters with the miss_ argument set to 1
//! \param event_ opcode to monitor, forwarded unchanged to programPCIeCounters
//! \param tid_ tid filter, forwarded unchanged to programPCIeCounters
void PCM::programPCIeMissCounters(const PCM::CBoxOpcode event_, const uint32 tid_)
{
    const uint32 missFlag = 1;
    programPCIeCounters(event_, tid_, missFlag);
}

void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_, const uint32 miss_)
void PCM::programPCIeCounters(const PCM::CBoxOpcode event_, const uint32 tid_, const uint32 miss_)
{
for (int32 i = 0; (i < num_sockets) && MSR.size(); ++i)
{
Expand Down Expand Up @@ -4287,6 +4314,124 @@ void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_
}
}

/*! \brief Programs counters 0 and 1 of every CBox on every socket for LLC monitoring

    Counter 0 is set up for the LLC_LOOKUP event (0x34) with requestType as umask,
    counter 1 for the TOR_INSERT event (0x35), which tracks incoming requests.

    Background:
     - There are two filter config registers per CBox, which allow filtering
       events for cache line state, opcode of a request, and originating
       core/thread id and node id.
     - The event configuration specifies which filters are active. The
       LLC_LOOKUP event uses cache line state but not request opcode; the
       TOR_INSERT event filters for opcode, but not cache line state.
     - LLC_LOOKUPs are optionally filtered for cache line state and TOR_INSERTs
       for opcode (and request type). In both cases the Any/AnyOp pseudo-filter
       causes all events to be counted.
     - Events can additionally be filtered for the originating core and thread
       through the tid field in FILTER0 (bit 5: non-thread related data,
       bits 4:1: core id, bit 0: thread id on the core) together with the
       tid_en field in the counter config registers (bit 19).

    \param requestType umask for the LLC_LOOKUP event on counter 0
    \param opcode opcode to count on counter 1; AnyOp and WB select dedicated umasks,
           any other value is programmed into the opcode filter
    \param filterCoreId core id to filter for, -1 for no core/thread filter
    \param filterThreadId thread id to filter for, -1 for no core/thread filter
*/
void PCM::programLLCCounters(LLCRequestType requestType, CBoxOpcode opcode, int filterCoreId, int filterThreadId)
{
    // The core/thread filter is only enabled when both ids are given; the value
    // does not depend on the socket, so compute it once.
    const uint32 tid_en = (filterCoreId != -1 && filterThreadId != -1) ? CBO_MSR_PMON_CTL_TID_EN : 0;

    // MSR.size() guard: same "no MSR access available" check as programPCIeCounters
    for (int32 i = 0; (i < num_sockets) && MSR.size(); ++i)
    {
        uint32 refCore = socketRefCore[i];
        TemporalThreadAffinity tempThreadAffinity(refCore); // speedup trick for Linux

        for (uint32 cbo = 0; cbo < getMaxNumOfCBoxes(); ++cbo)
        {
            // freeze enable
            MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN);
            // freeze
            MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN + CBO_MSR_PMON_BOX_CTL_FRZ);

#ifdef PCM_UNCORE_PMON_BOX_CHECK_STATUS
            uint64 val = 0;
            MSR[refCore]->read(CX_MSR_PMON_BOX_CTL(cbo), &val);
            if ((val & UNCORE_PMON_BOX_CTL_VALID_BITS_MASK) != (CBO_MSR_PMON_BOX_CTL_FRZ_EN + CBO_MSR_PMON_BOX_CTL_FRZ))
            {
                std::cerr << "ERROR: CBO counter programming seems not to work. ";
                std::cerr << "C" << std::dec << cbo << "_MSR_PMON_BOX_CTL=0x" << std::hex << val << std::endl;
            }
#endif
            // program filter 0, state to 0x1F
            programCboFilter0(0x1F, filterCoreId, filterThreadId, cbo, MSR[refCore]);

            // counter 0: LLC_LOOKUP event, request type selected through the umask
            MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 0), CBO_MSR_PMON_CTL_EN);
            MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 0),
                                CBO_MSR_PMON_CTL_EN | CBO_MSR_PMON_CTL_EVENT(0x34)
                                | (CBO_MSR_PMON_CTL_UMASK(requestType)) | tid_en);

            // counter 1: TOR_INSERT event
            MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 1), CBO_MSR_PMON_CTL_EN);

            uint64 umask = 0x01; // default: filtered by opcode
            switch (opcode)
            {
            case AnyOp: // pseudo-opcode: do not filter by opcode
                umask = 0x08;
                break;
            case WB: // pseudo-opcode for actual writebacks
                umask = 0x10;
                break;
            default:
                // program filter 1, opcode
                programCboOpcodeFilter(opcode, cbo, MSR[refCore]);
                break;
            }

            // TODO change umask for filtering for opcodes
            MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 1),
                                CBO_MSR_PMON_CTL_EN | CBO_MSR_PMON_CTL_EVENT(0x35)
                                | CBO_MSR_PMON_CTL_UMASK(umask) | tid_en);

            // reset counter values
            MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN + CBO_MSR_PMON_BOX_CTL_FRZ + CBO_MSR_PMON_BOX_CTL_RST_COUNTERS);

            // unfreeze counters
            MSR[refCore]->write(CX_MSR_PMON_BOX_CTL(cbo), CBO_MSR_PMON_BOX_CTL_FRZ_EN);
        }
    }
}

/*! \brief Reads counters 0 and 1 of every CBox on one socket

    \param socket_ socket whose CBox counters are read (index into socketRefCore)
    \return per-CBox counter values: lookups[cbo] is counter 0 (LLC lookups),
            requests[cbo] is counter 1 (requests). Slots beyond the number of
            CBoxes are zero. All slots are zero when no MSR handles are available.
*/
LLCCounterState PCM::getLLCCounterState(const uint32 socket_)
{
    // Value-initialize so that unused slots are well-defined zeros instead of
    // indeterminate stack contents.
    LLCCounterState res = LLCCounterState();

    // Without MSR access there is nothing to read (and MSR[refCore] would be invalid).
    if (MSR.size() == 0)
    {
        return res;
    }

    uint32 refCore = socketRefCore[socket_];
    TemporalThreadAffinity tempThreadAffinity(refCore); // speedup trick for Linux

    // Never write past the fixed-size arrays of LLCCounterState.
    const uint32 capacity = static_cast<uint32>(sizeof(res.lookups) / sizeof(res.lookups[0]));

    for (uint32 cbo = 0; cbo < getMaxNumOfCBoxes() && cbo < capacity; ++cbo)
    {
        uint64 ctrVal = 0;
        // get number of LLC lookups from first counter
        MSR[refCore]->read(CX_MSR_PMON_CTRY(cbo, 0), &ctrVal);
        res.lookups[cbo] = ctrVal;
        // get number of requests from second counter
        MSR[refCore]->read(CX_MSR_PMON_CTRY(cbo, 1), &ctrVal);
        res.requests[cbo] = ctrVal;
    }

    return res;
}

PCIeCounterState PCM::getPCIeCounterState(const uint32 socket_)
{
PCIeCounterState result;
Expand All @@ -4302,3 +4447,48 @@ PCIeCounterState PCM::getPCIeCounterState(const uint32 socket_)
}
return result;
}

//! \brief Writes the symbolic name of a CBox opcode to a stream
//! \param out stream to write to
//! \param opc opcode to print; AnyOp is printed as "Any"
//! \return out
//!
//! A value that is not one of the enumerators handled below is printed as
//! "unknown(<numeric value>)" instead of streaming a null pointer.
std::ostream& operator<<(std::ostream& out, const PCM::CBoxOpcode opc)
{
    const char * str = NULL;
#define STR_CASE(v) case PCM::v: str = #v; break
    // No default label on purpose: keeps compiler warnings for newly added enumerators.
    switch (opc)
    {
        STR_CASE(PCIeRdCur);
        STR_CASE(PCIeNSRd);
        STR_CASE(PCIeWiLF);
        STR_CASE(PCIeItoM);
        STR_CASE(PCIeNSWr);
        STR_CASE(PCIeNSWrF);
        STR_CASE(RFO);
        STR_CASE(CRd);
        STR_CASE(DRd);
        STR_CASE(PRd);
        STR_CASE(WCiLF);
        STR_CASE(WCiL);
        STR_CASE(WiL);
        STR_CASE(WbMtoI);
        STR_CASE(WbMtoE);
        STR_CASE(ItoM);
        case PCM::AnyOp: str = "Any"; break;
        STR_CASE(WB);
    }
#undef STR_CASE

    if (str == NULL)
    {
        return out << "unknown(" << static_cast<unsigned int>(opc) << ")";
    }
    return out << str;
}

//! \brief Writes the symbolic name of an LLC request type to a stream
//! \param out stream to write to
//! \param type request type to print
//! \return out
//!
//! A value that is not one of the enumerators handled below (e.g. a raw umask
//! cast to the enum) is printed as "unknown(<numeric value>)" instead of
//! streaming a null pointer.
std::ostream& operator<<(std::ostream& out, const PCM::LLCRequestType type)
{
    const char * str = NULL;
#define STR_CASE(v) case PCM::v: str = #v; break
    // No default label on purpose: keeps compiler warnings for newly added enumerators.
    switch (type)
    {
        STR_CASE(DataRead);
        STR_CASE(Write);
        STR_CASE(RemoteSnoop);
        STR_CASE(Any);
        STR_CASE(Read);
        STR_CASE(Nid);
    }
#undef STR_CASE

    if (str == NULL)
    {
        return out << "unknown(" << static_cast<unsigned int>(type) << ")";
    }
    return out << str;
}

57 changes: 53 additions & 4 deletions cpucounters.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,14 +194,21 @@ class PCIeCounterState
{
friend uint64 getNumberOfEvents(PCIeCounterState before, PCIeCounterState after);
friend class PCM;
uint64 data;
public:
uint64 data;
PCIeCounterState(): data(0)
{
}
virtual ~PCIeCounterState() {}
};

//! \brief Per-CBox counter values as returned by PCM::getLLCCounterState
//!
//! Both arrays are indexed by CBox number. The capacity of 18 matches the
//! upper bound getLLCCounterState applies when filling them.
struct LLCCounterState {
//! value of counter 0 of each CBox (number of LLC lookups)
uint64 lookups[18];
//! value of counter 1 of each CBox (number of requests)
uint64 requests[18];
};



#ifndef HACK_TO_REMOVE_DUPLICATE_ERROR
template class INTELPCM_API std::allocator<TopologyEntry>;
template class INTELPCM_API std::vector<TopologyEntry>;
Expand Down Expand Up @@ -460,6 +467,7 @@ class INTELPCM_API PCM
uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const;
uint32 getMaxNumOfCBoxes() const;
void programCboOpcodeFilter(const uint32 opc, const uint32 cbo, std::shared_ptr<SafeMsrHandle> msr);
void programCboFilter0(const uint32 state, const int32 filterCoreId, const int32 filterThreadId, const uint32 cbo, std::shared_ptr<SafeMsrHandle> msr);

public:
/*!
Expand All @@ -469,6 +477,8 @@ class INTELPCM_API PCM
*/
bool L3CacheOccupancyMetricAvailable();

uint32 getMaxNumOfCBoxes() const;

/*!
* \brief returns the max number of RMID supported by socket
*
Expand Down Expand Up @@ -843,7 +853,8 @@ class INTELPCM_API PCM

inline void disableJKTWorkaround() { disable_JKT_workaround = true; }

enum PCIeEventCode
// hijacked for more general opcodes
enum CBoxOpcode
{
// PCIe read events (PCI devices reading from memory - application writes to disk/network/PCIe device)
PCIeRdCur = 0x19E, // PCIe read current (full cache line)
Expand All @@ -858,10 +869,18 @@ class INTELPCM_API PCM
CRd = 0x181, // Demand Code Read
DRd = 0x182, // Demand Data Read
PRd = 0x187, // Partial Reads (UC) (MMIO Read)
WCiLF = 0x18C, // Full Streaming Store - write invalidate full cache line
WCiL = 0x18D, // Partial Streaming Store - write invalidate for partial cache line
WiL = 0x18F, // Write Invalidate Line - partial (MMIO write), PL: Not documented in HSX/IVT
WbMtoI = 0x1C4, // Request writeback and invalidation of modified line
WbMtoE = 0x1C5, // Request writeback of modified line, set to exclusive
ItoM = 0x1C8, // Request Invalidate Line; share the same code for CPU, use tid to filter PCIe only traffic
WB, // pseudo-opcode for actual writebacks
AnyOp, // pseudo-opcode for do not filter
};

friend std::ostream& operator<<(std::ostream& out, const CBoxOpcode opc);

enum CBoEventTid
{
RFOtid = 0x3E,
Expand All @@ -871,14 +890,44 @@ class INTELPCM_API PCM
//! \brief Program uncore PCIe monitoring event(s)
//! \param event_ a PCIe event to monitor
//! \param tid_ tid filter (PCM supports it only on Haswell server)
void programPCIeCounters(const PCIeEventCode event_, const uint32 tid_ = 0, const uint32 miss_ = 0);
void programPCIeMissCounters(const PCIeEventCode event_, const uint32 tid_ = 0);
void programPCIeCounters(const CBoxOpcode event_, const uint32 tid_ = 0, const uint32 miss_ = 0);
void programPCIeMissCounters(const CBoxOpcode event_, const uint32 tid_ = 0);

//! \brief Get the state of PCIe counter(s)
//! \param socket_ socket of the PCIe controller
//! \return State of PCIe counter(s)
PCIeCounterState getPCIeCounterState(const uint32 socket_);

//! \brief Request type selector for programLLCCounters
//!
//! The numeric value is passed as the umask of the LLC_LOOKUP event (0x34)
//! that programLLCCounters sets up on counter 0 of each CBox.
enum LLCRequestType
{
DataRead = 0x03,
Write = 0x05,
RemoteSnoop = 0x09,
Any = 0x11,
Read = 0x21, // any read request
Nid = 0x41, // Node Id filter

};

friend std::ostream& operator<<(std::ostream& out, const LLCRequestType type);

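//! \brief Program uncore LLC monitoring event(s)
//! \param requestType which types of requests to monitor
//! \param opcode request opcode to count (AnyOp: do not filter by opcode)
//! \param filterCoreId core id to filter for, -1 for no core/thread filter
//! \param filterThreadId thread id on that core to filter for, -1 for no core/thread filter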
void programLLCCounters(LLCRequestType requestType, \
CBoxOpcode opcode, int filterCoreId, int filterThreadId);

// TODO add function with opcode filter ..., add opcode any?!

//! \brief Program uncore LLC monitoring for any request type
//!
//! Forwards to the four-argument overload with request type Any, the AnyOp
//! pseudo-opcode (no opcode filter) and -1 for both ids (no core/thread filter).
void programLLCCounters() { programLLCCounters(Any, AnyOp, -1, -1); }

//! \brief Return current counter values of Cboxes
//! \param socket_ socket where to read Cbox counters
//! \return per-CBox counter values (LLC lookups and requests)
LLCCounterState getLLCCounterState(const uint32 socket_);

uint64 extractCoreGenCounterValue(uint64 val);
uint64 extractCoreFixedCounterValue(uint64 val);
uint64 extractUncoreGenCounterValue(uint64 val);
Expand Down
9 changes: 6 additions & 3 deletions intelpcm.so/Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# Copyright (c) 2014 Intel Corporation

all: libintelpcm.so
all: libintelpcm.so libintelpcm.a

OPT= -g -O3
CXXFLAGS+= -Wall -fPIC $(OPT)
CXXFLAGS+= -Wall -fPIC $(OPT) -I../
vpath %.cpp ..

libintelpcm.so: msr.o cpucounters.o pci.o client_bw.o utils.o
libintelpcm.so: msr.o cpucounters.o pci.o client_bw.o utils.o c_wrap_intelpcm.o
$(CXX) $(CXXFLAGS) -shared $^ -lpthread -o $@

libintelpcm.a: $(OBJS)
ar rcs $@ $^

clean:
rm -rf *.x *.o *~ *.so

Expand Down
Loading

0 comments on commit 0ae2000

Please sign in to comment.