From 382e64a9695714d5278d8f61149be652a394c348 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Fri, 23 Oct 2015 12:52:35 +0200 Subject: [PATCH] original source of v2.9 --- .bdsignore.all | 8 + .gitignore | 1 - MacMSRDriver/DriverInterface.c | 6 +- MacMSRDriver/MSRAccessor.cpp | 6 +- MacMSRDriver/Makefile | 2 +- MacMSRDriver/PCIDriverInterface.cpp | 2 +- MacMSRDriver/PcmMsr.xcodeproj/project.pbxproj | 14 +- MacMSRDriver/PcmMsr/PcmMsr-Info.plist | 10 +- MacMSRDriver/PcmMsr/PcmMsr.cpp | 4 +- MacMSRDriver/PcmMsr/PcmMsrClient.cpp | 6 +- Makefile | 12 +- PCM-Core_Win/pcm-core-win.cpp | 6 + PCM-Core_Win/pcm-core-win.vcxproj | 183 +++ PCM-Core_Win/stdafx.cpp | 8 + PCM-Core_Win/stdafx.h | 16 + PCM-MSR_Win/pcm-msr-win.vcproj | 275 ----- PCM-MSR_Win/pcm-msr-win.vcxproj | 183 +++ PCM-Memory_Win/pcm-memory-win.vcproj | 275 ----- PCM-Memory_Win/pcm-memory-win.vcxproj | 183 +++ PCM-NUMA_Win/pcm-numa-win.vcproj | 275 ----- PCM-NUMA_Win/pcm-numa-win.vcxproj | 183 +++ PCM-PCIE_Win/pcm-pcie-win.vcproj | 275 ----- PCM-PCIE_Win/pcm-pcie-win.vcxproj | 183 +++ PCM-Power_Win/pcm-power-win.vcproj | 275 ----- PCM-Power_Win/pcm-power-win.vcxproj | 183 +++ PCM-Service_Win/PCMService.vcproj | 511 -------- PCM-Service_Win/PCMService.vcxproj | 287 +++++ PCM-TSX_Win/pcm-tsx-win.vcproj | 275 ----- PCM-TSX_Win/pcm-tsx-win.vcxproj | 182 +++ PCM_Win/pcm.vcproj | 274 ----- PCM_Win/pcm.vcxproj | 181 +++ PCM_Win/windriver.h | 41 +- build_all.bat | 17 +- check_win_build.sh | 6 - client_bw.cpp | 30 +- client_bw.h | 7 +- cpucounters.cpp | 1074 ++++++++--------- cpucounters.h | 191 ++- gen_new_win_project.sh | 15 - intelpcm.so/Makefile | 2 +- msr.cpp | 18 +- msr.h | 36 +- mutex.h | 56 + pci.cpp | 51 +- pci.h | 3 +- pcm-core.cpp | 418 +++++++ pcm-memory.cpp | 345 ++++-- pcm-msr.cpp | 29 +- pcm-numa.cpp | 7 +- pcm-pcie.cpp | 17 +- pcm-power.cpp | 23 +- pcm-tsx.cpp | 27 +- pcm.cpp | 115 +- pmu-query.py | 94 ++ types.h | 17 +- utils.cpp | 20 +- utils.h | 14 +- width_extender.h | 74 +- winpmem/winpmem.cpp | 23 +- winpmem/winpmem.h | 2 - 60 files changed, 3612 insertions(+), 3444 deletions(-) create mode 100644 PCM-Core_Win/pcm-core-win.cpp create mode 100644 PCM-Core_Win/pcm-core-win.vcxproj create mode 100644 PCM-Core_Win/stdafx.cpp create mode 100644 PCM-Core_Win/stdafx.h delete mode 100644 PCM-MSR_Win/pcm-msr-win.vcproj create mode 100644 PCM-MSR_Win/pcm-msr-win.vcxproj delete mode 100644 PCM-Memory_Win/pcm-memory-win.vcproj create mode 100644 PCM-Memory_Win/pcm-memory-win.vcxproj delete mode 100644 PCM-NUMA_Win/pcm-numa-win.vcproj create mode 100644 PCM-NUMA_Win/pcm-numa-win.vcxproj delete mode 100644 PCM-PCIE_Win/pcm-pcie-win.vcproj create mode 100644 PCM-PCIE_Win/pcm-pcie-win.vcxproj delete mode 100644 PCM-Power_Win/pcm-power-win.vcproj create mode 100644 PCM-Power_Win/pcm-power-win.vcxproj delete mode 100644 PCM-Service_Win/PCMService.vcproj create mode 100644 PCM-Service_Win/PCMService.vcxproj delete mode 100644 PCM-TSX_Win/pcm-tsx-win.vcproj create mode 100644 PCM-TSX_Win/pcm-tsx-win.vcxproj delete mode 100644 PCM_Win/pcm.vcproj create mode 100644 PCM_Win/pcm.vcxproj delete mode 100644 check_win_build.sh delete mode 100644 gen_new_win_project.sh create mode 100644 mutex.h create mode 100644 pcm-core.cpp create mode 100644 pmu-query.py diff --git a/.bdsignore.all b/.bdsignore.all index 827cc63..3a5fa09 100644 --- a/.bdsignore.all +++ b/.bdsignore.all @@ -8,6 +8,8 @@ ChangeLog TODO Debug Release +Release64 +Backup Intel_SSA My Inspector XE Results - pcm .gitignore @@ -22,3 +24,9 @@ My Inspector XE Results - pcm .*\.htm .*\.bat .*\.strings +.*\.sln +.*\.suo +.*\.sdf +.*\.vcxproj +.*\.vcxproj.user +.*\.vcxproj.filters diff --git a/.gitignore b/.gitignore index ba7a906..e7b2777 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,6 @@ *.txt *.patch *.orig -*.vcxproj* *.out *.log *.sys diff --git a/MacMSRDriver/DriverInterface.c b/MacMSRDriver/DriverInterface.c index 7d3d90d..b90d690 100644 --- a/MacMSRDriver/DriverInterface.c +++ b/MacMSRDriver/DriverInterface.c @@ -109,7 +109,7 @@ kern_return_t getNumClients(io_connect_t connect, uint32_t* num_insts) #if !defined(__LP64__) if (IOConnectCallStructMethod != NULL) { #endif - uint32_t num_outputs = 1; + size_t num_outputs = 1; uint64_t knum_insts; kernResult = IOConnectCallStructMethod(connect, kGetNumInstances, NULL, 0, &knum_insts, &num_outputs); *num_insts = (uint32_t)knum_insts; @@ -128,7 +128,7 @@ kern_return_t incrementNumClients(io_connect_t connect, uint32_t* num_insts) #if !defined(__LP64__) if (IOConnectCallStructMethod != NULL) { #endif - uint32_t num_outputs = 1; + size_t num_outputs = 1; uint64_t knum_insts; kernResult = IOConnectCallStructMethod(connect, kIncrementNumInstances, NULL, 0, &knum_insts, &num_outputs); *num_insts = (uint32_t)knum_insts; @@ -148,7 +148,7 @@ kern_return_t decrementNumClients(io_connect_t connect, uint32_t* num_insts) #if !defined(__LP64__) if (IOConnectCallStructMethod != NULL) { #endif - uint32_t num_outputs = 1; + size_t num_outputs = 1; uint64_t knum_insts; kernResult = IOConnectCallStructMethod(connect, kDecrementNumInstances, NULL, 0, &knum_insts, &num_outputs); *num_insts = (uint32_t)knum_insts; diff --git a/MacMSRDriver/MSRAccessor.cpp b/MacMSRDriver/MSRAccessor.cpp index c9de3b5..4a514f5 100644 --- a/MacMSRDriver/MSRAccessor.cpp +++ b/MacMSRDriver/MSRAccessor.cpp @@ -29,7 +29,7 @@ int32_t MSRAccessor::buildTopology(uint32_t num_cores ,void* pTopos){ int32_t MSRAccessor::read(uint32_t core_num, uint64_t msr_num, uint64_t * value){ pcm_msr_data_t idatas, odatas; size_t size = sizeof(pcm_msr_data_t); - idatas.msr_num = msr_num; + idatas.msr_num = (uint32_t)msr_num; idatas.cpu_num = core_num; kern_return_t ret = readMSR(connect, &idatas, &size, &odatas, &size); if(ret == KERN_SUCCESS) @@ -46,7 +46,7 @@ int32_t MSRAccessor::write(uint32_t core_num, uint64_t msr_num, uint64_t value){ pcm_msr_data_t idatas; size_t size = sizeof(pcm_msr_data_t); idatas.value = value; - idatas.msr_num = msr_num; + idatas.msr_num = (uint32_t)msr_num; idatas.cpu_num = core_num; kern_return_t ret = writeMSR(connect, &idatas, &size); if(ret == KERN_SUCCESS) @@ -108,4 +108,4 @@ void MSRAccessor::closeConnection(){ if (kernResult != KERN_SUCCESS) { fprintf(stderr, "IOServiceClose returned 0x%08x\n\n", kernResult); } -} \ No newline at end of file +} diff --git a/MacMSRDriver/Makefile b/MacMSRDriver/Makefile index 728ff6a..4b0ec66 100644 --- a/MacMSRDriver/Makefile +++ b/MacMSRDriver/Makefile @@ -8,7 +8,7 @@ kext: xcodebuild -configuration Release -target PcmMsrDriver clean build library: - xcodebuild -configuration Release -target PcmMsrLibrary clean build + xcodebuild -configuration Release -target PcmMsrLibrary clean build install: kext library sudo sh ./kextload.sh diff --git a/MacMSRDriver/PCIDriverInterface.cpp b/MacMSRDriver/PCIDriverInterface.cpp index 8334f31..d18165b 100644 --- a/MacMSRDriver/PCIDriverInterface.cpp +++ b/MacMSRDriver/PCIDriverInterface.cpp @@ -14,7 +14,7 @@ // #include -#include +#include #include "PCIDriverInterface.h" #include #include "PcmMsr/UserKernelShared.h" diff --git a/MacMSRDriver/PcmMsr.xcodeproj/project.pbxproj b/MacMSRDriver/PcmMsr.xcodeproj/project.pbxproj index 753043f..9eb466a 100644 --- a/MacMSRDriver/PcmMsr.xcodeproj/project.pbxproj +++ b/MacMSRDriver/PcmMsr.xcodeproj/project.pbxproj @@ -296,7 +296,7 @@ EXECUTABLE_PREFIX = lib; GCC_VERSION = com.apple.compilers.llvm.clang.1_0; INSTALL_PATH = /usr/lib; - MACOSX_DEPLOYMENT_TARGET = 10.8; + MACOSX_DEPLOYMENT_TARGET = 10.9; PRODUCT_NAME = PcmMsr; }; name = Debug; @@ -311,7 +311,7 @@ EXECUTABLE_PREFIX = lib; GCC_VERSION = com.apple.compilers.llvm.clang.1_0; INSTALL_PATH = /usr/lib; - MACOSX_DEPLOYMENT_TARGET = 10.8; + MACOSX_DEPLOYMENT_TARGET = 10.9; PRODUCT_NAME = PcmMsr; }; name = Release; @@ -370,14 +370,13 @@ GCC_PRECOMPILE_PREFIX_HEADER = YES; GCC_PREFIX_HEADER = "PcmMsr/PcmMsr-Prefix.pch"; GCC_VERSION = com.apple.compilers.llvm.clang.1_0; - "HEADER_SEARCH_PATHS[arch=*]" = /usr/include; INFOPLIST_FILE = "PcmMsr/PcmMsr-Info.plist"; - MACOSX_DEPLOYMENT_TARGET = 10.8; + MACOSX_DEPLOYMENT_TARGET = 10.9; MODULE_NAME = com.intel.driver.PcmMsrDriver; MODULE_VERSION = 1.0.0d1; ONLY_ACTIVE_ARCH = YES; PRODUCT_NAME = PcmMsrDriver; - SDKROOT = macosx10.8; + SDKROOT = macosx10.9; WRAPPER_EXTENSION = kext; }; name = Debug; @@ -390,14 +389,13 @@ GCC_PRECOMPILE_PREFIX_HEADER = YES; GCC_PREFIX_HEADER = "PcmMsr/PcmMsr-Prefix.pch"; GCC_VERSION = com.apple.compilers.llvm.clang.1_0; - "HEADER_SEARCH_PATHS[arch=*]" = /usr/include; INFOPLIST_FILE = "PcmMsr/PcmMsr-Info.plist"; - MACOSX_DEPLOYMENT_TARGET = 10.8; + MACOSX_DEPLOYMENT_TARGET = 10.9; MODULE_NAME = com.intel.driver.PcmMsrDriver; MODULE_VERSION = 1.0.0d1; ONLY_ACTIVE_ARCH = YES; PRODUCT_NAME = PcmMsrDriver; - SDKROOT = macosx10.8; + SDKROOT = macosx10.9; WRAPPER_EXTENSION = kext; }; name = Release; diff --git a/MacMSRDriver/PcmMsr/PcmMsr-Info.plist b/MacMSRDriver/PcmMsr/PcmMsr-Info.plist index d8c8408..007fba9 100644 --- a/MacMSRDriver/PcmMsr/PcmMsr-Info.plist +++ b/MacMSRDriver/PcmMsr/PcmMsr-Info.plist @@ -43,15 +43,15 @@ OSBundleLibraries com.apple.kpi.bsd - 10.8 + 10.9 com.apple.kpi.mach - 10.8 + 10.9 com.apple.kpi.unsupported - 10.8 + 10.9 com.apple.kpi.iokit - 10.8 + 10.9 com.apple.kpi.libkern - 10.8 + 10.9 diff --git a/MacMSRDriver/PcmMsr/PcmMsr.cpp b/MacMSRDriver/PcmMsr/PcmMsr.cpp index 2d36c99..315ffb0 100644 --- a/MacMSRDriver/PcmMsr/PcmMsr.cpp +++ b/MacMSRDriver/PcmMsr/PcmMsr.cpp @@ -110,9 +110,9 @@ uint32_t PcmMsrDriverClassName::getNumCores() if(sizeof(int) == size) ret = *(int*)pParam; else if(sizeof(long) == size) - ret = *(long*)pParam; + ret = (uint32_t) *(long*)pParam; else if(sizeof(long long) == size) - ret = *(long long*)pParam; + ret = (uint32_t) *(long long*)pParam; else ret = *(int*)pParam; } diff --git a/MacMSRDriver/PcmMsr/PcmMsrClient.cpp b/MacMSRDriver/PcmMsr/PcmMsrClient.cpp index 46bf8f0..a88a96b 100644 --- a/MacMSRDriver/PcmMsr/PcmMsrClient.cpp +++ b/MacMSRDriver/PcmMsr/PcmMsrClient.cpp @@ -110,11 +110,11 @@ IOReturn PcmMsrClientClassName::checkActiveAndOpened (const char* memberFunction { if (fProvider == NULL || isInactive()) { IOLog("%s::%s returned kIOReturnNotAttached.\n", getName(), memberFunction); - return kIOReturnNotAttached; + return (IOReturn)kIOReturnNotAttached; } else if (!fProvider->isOpen(this)) { IOLog("%s::%s returned kIOReturnNotOpen.\n", getName(), memberFunction); - return kIOReturnNotOpen; + return (IOReturn)kIOReturnNotOpen; } return kIOReturnSuccess; } @@ -169,7 +169,7 @@ IOReturn PcmMsrClientClassName::sBuildTopology(PcmMsrClientClassName* target, vo IOReturn PcmMsrClientClassName::buildTopology(topologyEntry* data, size_t output_size) { - uint32_t num_cores = output_size / sizeof(topologyEntry); + uint32_t num_cores = (uint32_t) (output_size / sizeof(topologyEntry) ); IOReturn result = checkActiveAndOpened (__FUNCTION__); if (result == kIOReturnSuccess) diff --git a/Makefile b/Makefile index f7a78d6..7eac253 100644 --- a/Makefile +++ b/Makefile @@ -3,29 +3,33 @@ # written by Roman Dementiev and Jim Harris # -EXE = pcm-numa.x pcm-power.x pcm.x pcm-sensor.x pcm-msr.x pcm-memory.x pcm-tsx.x pcm-pcie.x +EXE = pcm-numa.x pcm-power.x pcm.x pcm-sensor.x pcm-msr.x pcm-memory.x pcm-tsx.x pcm-pcie.x pcm-core.x all: $(EXE) -CXXFLAGS += -Wall -g -O3 +klocwork: $(EXE) + +CXXFLAGS += -Wall -g -O3 -Wno-unknown-pragmas # uncomment if you want to rely on Linux perf support (user needs CAP_SYS_ADMIN privileges) ifneq ($(wildcard /usr/include/linux/perf_event.h),) -#CXXFLAGS += -DPCM_USE_PERF +CXXFLAGS += -DPCM_USE_PERF endif UNAME:=$(shell uname) ifeq ($(UNAME), Linux) LIB= -pthread -lrt +CXXFLAGS += -std=c++0x endif ifeq ($(UNAME), Darwin) LIB= -lpthread /usr/lib/libPcmMsr.dylib -CXXFLAGS += -I/usr/include +CXXFLAGS += -I/usr/include -IMacMSRDriver -std=c++0x endif ifeq ($(UNAME), FreeBSD) CXX=c++ LIB= -lpthread -lc++ +CXXFLAGS += -std=c++0x endif COMMON_OBJS = msr.o cpucounters.o pci.o client_bw.o utils.o diff --git a/PCM-Core_Win/pcm-core-win.cpp b/PCM-Core_Win/pcm-core-win.cpp new file mode 100644 index 0000000..630c332 --- /dev/null +++ b/PCM-Core_Win/pcm-core-win.cpp @@ -0,0 +1,6 @@ +// pcm-core-win.cpp : Defines the entry point for the console application. +// + +#include "stdafx.h" + +#include "../pcm-core.cpp" diff --git a/PCM-Core_Win/pcm-core-win.vcxproj b/PCM-Core_Win/pcm-core-win.vcxproj new file mode 100644 index 0000000..2211567 --- /dev/null +++ b/PCM-Core_Win/pcm-core-win.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {CBF0B26F-19F5-4A76-BAB1-2585212026FC} + pcm-core-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-core.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-core.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-core.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-core.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM-Core_Win/stdafx.cpp b/PCM-Core_Win/stdafx.cpp new file mode 100644 index 0000000..db9b2d1 --- /dev/null +++ b/PCM-Core_Win/stdafx.cpp @@ -0,0 +1,8 @@ +// stdafx.cpp : source file that includes just the standard includes +// pcm.pch will be the pre-compiled header +// stdafx.obj will contain the pre-compiled type information + +#include "stdafx.h" + +// TODO: reference any additional headers you need in STDAFX.H +// and not in this file diff --git a/PCM-Core_Win/stdafx.h b/PCM-Core_Win/stdafx.h new file mode 100644 index 0000000..4360537 --- /dev/null +++ b/PCM-Core_Win/stdafx.h @@ -0,0 +1,16 @@ +// stdafx.h : include file for standard system include files, +// or project specific include files that are used frequently, but +// are changed infrequently +// + +#pragma once + +#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later. +#define _WIN32_WINNT 0x0501 // Change this to the appropriate value to target other versions of Windows. +#endif + +#include +#include + + +// TODO: reference additional headers your program requires here diff --git a/PCM-MSR_Win/pcm-msr-win.vcproj b/PCM-MSR_Win/pcm-msr-win.vcproj deleted file mode 100644 index 538f292..0000000 --- a/PCM-MSR_Win/pcm-msr-win.vcproj +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-MSR_Win/pcm-msr-win.vcxproj b/PCM-MSR_Win/pcm-msr-win.vcxproj new file mode 100644 index 0000000..0101507 --- /dev/null +++ b/PCM-MSR_Win/pcm-msr-win.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E93F94F8-1797-4BF7-BC7F-8EF144FF17D9} + pcm-msr-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-msr.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-msr.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-msr.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-msr.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM-Memory_Win/pcm-memory-win.vcproj b/PCM-Memory_Win/pcm-memory-win.vcproj deleted file mode 100644 index b953a95..0000000 --- a/PCM-Memory_Win/pcm-memory-win.vcproj +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-Memory_Win/pcm-memory-win.vcxproj b/PCM-Memory_Win/pcm-memory-win.vcxproj new file mode 100644 index 0000000..6920640 --- /dev/null +++ b/PCM-Memory_Win/pcm-memory-win.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {66458306-D903-4946-AB5D-AE452DC8B50B} + pcm-memory-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-memory.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-memory.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-memory.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-memory.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM-NUMA_Win/pcm-numa-win.vcproj b/PCM-NUMA_Win/pcm-numa-win.vcproj deleted file mode 100644 index 0cf2f94..0000000 --- a/PCM-NUMA_Win/pcm-numa-win.vcproj +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-NUMA_Win/pcm-numa-win.vcxproj b/PCM-NUMA_Win/pcm-numa-win.vcxproj new file mode 100644 index 0000000..cc95c10 --- /dev/null +++ b/PCM-NUMA_Win/pcm-numa-win.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {45AB82E6-DADE-46E8-84D0-7481D2818849} + pcm-numa-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-numa.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-numa.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-numa.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-numa.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM-PCIE_Win/pcm-pcie-win.vcproj b/PCM-PCIE_Win/pcm-pcie-win.vcproj deleted file mode 100644 index 8d9c84a..0000000 --- a/PCM-PCIE_Win/pcm-pcie-win.vcproj +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-PCIE_Win/pcm-pcie-win.vcxproj b/PCM-PCIE_Win/pcm-pcie-win.vcxproj new file mode 100644 index 0000000..708fcc7 --- /dev/null +++ b/PCM-PCIE_Win/pcm-pcie-win.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {1FAE01E6-5CD0-4C91-8C82-4FBCBB4CC419} + pcm-pcie-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-pcie.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-pcie.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-pcie.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-pcie.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM-Power_Win/pcm-power-win.vcproj b/PCM-Power_Win/pcm-power-win.vcproj deleted file mode 100644 index 8871e23..0000000 --- a/PCM-Power_Win/pcm-power-win.vcproj +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-Power_Win/pcm-power-win.vcxproj b/PCM-Power_Win/pcm-power-win.vcxproj new file mode 100644 index 0000000..7e3d99e --- /dev/null +++ b/PCM-Power_Win/pcm-power-win.vcxproj @@ -0,0 +1,183 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {78F57684-F6F2-4691-B498-C7FC0731F875} + pcm-power-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-power.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-power.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-power.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-power.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM-Service_Win/PCMService.vcproj b/PCM-Service_Win/PCMService.vcproj deleted file mode 100644 index 57772a3..0000000 --- a/PCM-Service_Win/PCMService.vcproj +++ /dev/null @@ -1,511 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-Service_Win/PCMService.vcxproj b/PCM-Service_Win/PCMService.vcxproj new file mode 100644 index 0000000..a8b611d --- /dev/null +++ b/PCM-Service_Win/PCMService.vcxproj @@ -0,0 +1,287 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release64 + Win32 + + + Release64 + x64 + + + Release + Win32 + + + Release + x64 + + + + PCM-Service + {A0DD26AB-CC35-41E5-B676-1E0FBCEF0DDC} + PCMService + ManagedCProj + + + + Application + v120 + + + Application + v120 + Unicode + true + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + false + + + Application + v120 + Unicode + true + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>11.0.61030.0 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + false + + + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + false + false + false + + + + Disabled + ../;%(AdditionalIncludeDirectories) + WIN32;_DEBUG;%(PreprocessorDefinitions) + MultiThreadedDebugDLL + + Level3 + ProgramDatabase + + + advapi32.lib;..\intelpcm.dll\Debug\Intelpcm.lib + true + true + MachineX86 + + + + + X64 + + + Disabled + ../;%(AdditionalIncludeDirectories) + WIN32;_DEBUG;%(PreprocessorDefinitions) + MultiThreadedDebugDLL + + Level3 + ProgramDatabase + + + advapi32.lib;..\intelpcm.dll\x64\Debug\intelpcm.lib + true + true + MachineX64 + + + + + ../;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + advapi32.lib;..\intelpcm.dll\Release\Intelpcm.lib + true + MachineX86 + + + + + X64 + + + AnySuitable + true + Speed + false + ../;%(AdditionalIncludeDirectories) + WIN32;NDEBUG;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + advapi32.lib;..\Intelpcm.dll\x64\Release\Intelpcm.lib + true + MachineX64 + + + + + X64 + + + MaxSpeed + AnySuitable + true + Speed + ../;%(AdditionalIncludeDirectories) + WIN32;_DEBUG;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + advapi32.lib;..\Intelpcm.dll\x64\Release64\Intelpcm.lib + true + true + MachineX64 + + + + + true + true + + + true + true + + + true + true + + + true + true + + + true + true + + + true + true + + + + + + + + + + + + CppClass + Component + + + CppClass + + + + + + + + + + + + + + + + + PCMInstaller.h + + + PCMService.h + + + + + + \ No newline at end of file diff --git a/PCM-TSX_Win/pcm-tsx-win.vcproj b/PCM-TSX_Win/pcm-tsx-win.vcproj deleted file mode 100644 index c94bf08..0000000 --- a/PCM-TSX_Win/pcm-tsx-win.vcproj +++ /dev/null @@ -1,275 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM-TSX_Win/pcm-tsx-win.vcxproj b/PCM-TSX_Win/pcm-tsx-win.vcxproj new file mode 100644 index 0000000..f809963 --- /dev/null +++ b/PCM-TSX_Win/pcm-tsx-win.vcxproj @@ -0,0 +1,182 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {2CDE03E0-BF03-448E-8AA6-D764F8BDE7C1} + pcm-tsx-win + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + $(OutDir)pcm-tsx.exe + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-tsx.exe + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm-tsx.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm-tsx.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM_Win/pcm.vcproj b/PCM_Win/pcm.vcproj deleted file mode 100644 index 7a0fc09..0000000 --- a/PCM_Win/pcm.vcproj +++ /dev/null @@ -1,274 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/PCM_Win/pcm.vcxproj b/PCM_Win/pcm.vcxproj new file mode 100644 index 0000000..7c9fa43 --- /dev/null +++ b/PCM_Win/pcm.vcxproj @@ -0,0 +1,181 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {D919CF99-5D9F-46C9-B6F0-626700E63592} + icpm + Win32Proj + + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + true + + + Application + v120 + Unicode + + + Application + v120 + Unicode + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>12.0.21005.1 + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + true + + + true + + + $(SolutionDir)$(Configuration)\ + $(Configuration)\ + false + + + false + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + true + Console + MachineX86 + + + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + true + Console + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + Level3 + ProgramDatabase + + + $(OutDir)pcm.exe + true + Console + true + true + MachineX86 + + + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + + + Level3 + ProgramDatabase + + + $(OutDir)pcm.exe + true + Console + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/PCM_Win/windriver.h b/PCM_Win/windriver.h index 9a50a40..b13a827 100644 --- a/PCM_Win/windriver.h +++ b/PCM_Win/windriver.h @@ -30,6 +30,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND The driver is required to access hardware Model Specific Registers (MSRs) under Windows. Currently only 64-bit Windows 7 has been tested. */ + class Driver { SC_HANDLE hSCManager; @@ -49,12 +50,12 @@ class Driver hSCManager = OpenSCManager(NULL, NULL, SC_MANAGER_CREATE_SERVICE); if (hSCManager) { - hService = CreateService(hSCManager, L"Test MSR 4", L"Test MSR Driver 4", SERVICE_START | DELETE | SERVICE_STOP, + hService = CreateService(hSCManager, L"PCM Test MSR", L"PCM Test MSR Driver", SERVICE_START | DELETE | SERVICE_STOP, SERVICE_KERNEL_DRIVER, SERVICE_DEMAND_START, SERVICE_ERROR_IGNORE, driverPath, NULL, NULL, NULL, NULL, NULL); if (!hService) { - hService = OpenService(hSCManager, L"Test MSR 4", SERVICE_START | DELETE | SERVICE_STOP); + hService = OpenService(hSCManager, L"PCM Test MSR", SERVICE_START | DELETE | SERVICE_STOP); } if (hService) @@ -66,8 +67,10 @@ class Driver DWORD err = GetLastError(); if (err == ERROR_SERVICE_ALREADY_RUNNING) return true; - _com_error error(err); - std::wcerr << "Starting MSR service failed with error " << err << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Starting MSR service failed with error " << err << " "; + const TCHAR * errorStr = _com_error(err).ErrorMessage(); + if (errorStr) std::wcerr << errorStr; + std::wcerr << std::endl; ControlService(hService, SERVICE_CONTROL_STOP, &ss); @@ -77,16 +80,20 @@ class Driver } else { - _com_error error(GetLastError()); - std::wcerr << "Opening service manager failed with error " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Opening service manager failed with error " << GetLastError() << " "; + const TCHAR * errorStr = _com_error(GetLastError()).ErrorMessage(); + if (errorStr) std::wcerr << errorStr; + std::wcerr << std::endl; } CloseServiceHandle(hSCManager); } else { - _com_error error(GetLastError()); - std::wcerr << "Opening service manager failed with error " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Opening service manager failed with error " << GetLastError() << " "; + const TCHAR * errorStr = _com_error(GetLastError()).ErrorMessage(); + if (errorStr) std::wcerr << errorStr; + std::wcerr << std::endl; } @@ -110,8 +117,7 @@ class Driver hSCManager = OpenSCManager(NULL, NULL, SC_MANAGER_CREATE_SERVICE); if (hSCManager) { - hService = OpenService(hSCManager, L"Test MSR 4", SERVICE_START | DELETE | SERVICE_STOP); - DWORD res = 0; + hService = OpenService(hSCManager, L"PCM Test MSR", SERVICE_START | DELETE | SERVICE_STOP); if (hService) { ControlService(hService, SERVICE_CONTROL_STOP, &ss); @@ -122,8 +128,10 @@ class Driver } else { - _com_error error(GetLastError()); - std::wcerr << "Opening service manager failed with error " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Opening service manager failed with error " << GetLastError() << " "; + const TCHAR * errorStr = _com_error(GetLastError()).ErrorMessage(); + if (errorStr) std::wcerr << errorStr; + std::wcerr << std::endl; } } @@ -136,8 +144,7 @@ class Driver hSCManager = OpenSCManager(NULL, NULL, SC_MANAGER_CREATE_SERVICE); if (hSCManager) { - hService = OpenService(hSCManager, L"Test MSR 4", SERVICE_START | DELETE | SERVICE_STOP); - DWORD res = 0; + hService = OpenService(hSCManager, L"PCM Test MSR", SERVICE_START | DELETE | SERVICE_STOP); if (hService) { ControlService(hService, SERVICE_CONTROL_STOP, &ss); @@ -149,8 +156,10 @@ class Driver } else { - _com_error error(GetLastError()); - std::wcerr << "Opening service manager failed with error " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Opening service manager failed with error " << GetLastError() << " "; + const TCHAR * errorStr = _com_error(GetLastError()).ErrorMessage(); + if (errorStr) std::wcerr << errorStr; + std::wcerr << std::endl; } } }; diff --git a/build_all.bat b/build_all.bat index 28c6f5d..53932f5 100644 --- a/build_all.bat +++ b/build_all.bat @@ -1,35 +1,32 @@ REM change path to your VCVARS.BAT -CALL "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\vcvars32.bat" +CALL "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_amd64\vcvarsx86_amd64.bat" for %%p in (PCM) do ( @echo Building %%p chdir %%p_Win - vcupgrade -overwrite %%p.vcproj - msbuild %%p.vcxproj + msbuild %%p.vcxproj /p:Configuration=Release /t:Clean,Build chdir .. ) @echo Building Intelpcm.dll chdir Intelpcm.dll - vcupgrade -overwrite Intelpcm.dll.vcproj - msbuild Intelpcm.dll.vcxproj + msbuild Intelpcm.dll.vcxproj /p:Configuration=Release /t:Clean,Build chdir .. @echo Building PCM-Service chdir PCM-Service_Win - vcupgrade -overwrite PCMService.vcproj - msbuild PCMService.vcxproj + msbuild PCMService.vcxproj /p:Configuration=Release /t:Clean,Build chdir .. -for %%p in (PCM-MSR PCM-TSX PCM-Memory PCM-NUMA PCM-PCIE PCM-Power) do ( +for %%p in (PCM-MSR PCM-TSX PCM-Memory PCM-NUMA PCM-PCIE PCM-Power PCM-Core) do ( @echo Building %%p chdir %%p_Win - vcupgrade -overwrite %%p-win.vcproj - msbuild %%p-win.vcxproj + msbuild %%p-win.vcxproj /p:Configuration=Release /t:Clean,Build chdir .. ) +exit diff --git a/check_win_build.sh b/check_win_build.sh deleted file mode 100644 index a9086ab..0000000 --- a/check_win_build.sh +++ /dev/null @@ -1,6 +0,0 @@ - -git log | head -1 - - -cmd /k build_all.bat | egrep 'Error' - diff --git a/client_bw.cpp b/client_bw.cpp index d5f351e..ce67cbd 100644 --- a/client_bw.cpp +++ b/client_bw.cpp @@ -15,14 +15,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND // #include -#include -#include +#include +#ifndef _MSC_VER #include +#endif #include #include -#include -#include -#include #include "pci.h" #include "client_bw.h" @@ -89,39 +87,35 @@ ClientBW::ClientBW() : pmem(new PCMPmem()) throw std::exception(); } startAddr = imcbar & (~(4096ULL-1ULL)); // round down to 4K - - Mutex = CreateMutex(NULL,FALSE,NULL); } uint64 ClientBW::getImcReads() { - WaitForSingleObject(Mutex,INFINITE); + mutex.lock(); uint32 res = pmem->read32(startAddr + PCM_CLIENT_IMC_DRAM_DATA_READS); - ReleaseMutex(Mutex); - return res; + mutex.unlock(); + return (uint64)res; } uint64 ClientBW::getImcWrites() { - WaitForSingleObject(Mutex,INFINITE); + mutex.lock(); uint32 res = pmem->read32(startAddr + PCM_CLIENT_IMC_DRAM_DATA_WRITES); - ReleaseMutex(Mutex); - return res; + mutex.unlock(); + return (uint64)res; } uint64 ClientBW::getIoRequests() { - WaitForSingleObject(Mutex,INFINITE); + mutex.lock(); uint32 res = pmem->read32(startAddr + PCM_CLIENT_IMC_DRAM_IO_REQESTS); - ReleaseMutex(Mutex); - return res; + mutex.unlock(); + return (uint64)res; } ClientBW::~ClientBW() { pmem->uninstall_driver(); - delete pmem; - CloseHandle(Mutex); } diff --git a/client_bw.h b/client_bw.h index d433361..060b7b3 100644 --- a/client_bw.h +++ b/client_bw.h @@ -30,6 +30,9 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #endif +#include "mutex.h" +#include + #define PCM_CLIENT_IMC_BAR_OFFSET (0x0048) #define PCM_CLIENT_IMC_DRAM_IO_REQESTS (0x5048) #define PCM_CLIENT_IMC_DRAM_DATA_READS (0x5050) @@ -47,9 +50,9 @@ class ClientBW char * mmapAddr; #endif #ifdef _MSC_VER - WinPmem * pmem; + std::shared_ptr pmem; uint64 startAddr; - HANDLE Mutex; + PCM_Util::Mutex mutex; #endif public: diff --git a/cpucounters.cpp b/cpucounters.cpp index bbb5e4b..89799b0 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -23,8 +23,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND //#define PCM_TEST_FALLBACK_TO_ATOM -#include -#include #include #ifdef INTELPCM_EXPORTS // Intelpcm.h includes cpucounters.h @@ -53,7 +51,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #include #include -#include #include #ifdef __APPLE__ @@ -118,6 +115,7 @@ class InstanceLock { // unlock ReleaseMutex(Mutex); + CloseHandle(Mutex); } }; #else // Linux or Apple @@ -180,18 +178,50 @@ class InstanceLock }; #endif // end of _MSC_VER else + +class TemporalThreadAffinity // speedup trick for Linux +{ +#ifdef __linux__ + cpu_set_t old_affinity; + TemporalThreadAffinity(); // forbiden + +public: + TemporalThreadAffinity(uint32 core_id) + { + pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &old_affinity); + + cpu_set_t new_affinity; + CPU_ZERO(&new_affinity); + CPU_SET(core_id, &new_affinity); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &new_affinity); + } + ~TemporalThreadAffinity() + { + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &old_affinity); + } +#else // not implemented for windows or os x + TemporalThreadAffinity(); // forbiden + +public: + TemporalThreadAffinity(uint32) { } +#endif +}; + + PCM * PCM::instance = NULL; -int bitCount(uint64 n) +/* +static int bitCount(uint64 n) { int count = 0; while (n) { - count += (int)(n & 0x00000001); - n >>= 1; + count += static_cast(n & 0x00000001); + n >>= static_cast(1); } return count; } +*/ PCM * PCM::getInstance() { @@ -204,7 +234,7 @@ PCM * PCM::getInstance() return instance = new PCM(); } -uint32 build_bit_ui(int beg, int end) +uint32 build_bit_ui(uint32 beg, uint32 end) { uint32 myll = 0; if (end == 31) @@ -245,7 +275,7 @@ uint64 build_bit(uint32 beg, uint32 end) uint64 myll = 0; if (end == 63) { - myll = (uint64)(-1); + myll = static_cast(-1); } else { @@ -315,7 +345,7 @@ uint64 PCM::extractL3CacheOccupancy(uint64 val) if(val & (3ULL<<62)) { // invalid reading - return PCM_INVALID_L3_CACHE_OCCUPANCY; + return static_cast(PCM_INVALID_L3_CACHE_OCCUPANCY); } // valid reading @@ -325,11 +355,11 @@ int32 extractThermalHeadroom(uint64 val) { if(val & (1ULL<<31ULL)) { // valid reading - return (int32)extract_bits(val,16,22); + return static_cast(extract_bits(val, 16, 22)); } // invalid reading - return PCM_INVALID_THERMAL_HEADROOM; + return static_cast(PCM_INVALID_THERMAL_HEADROOM); } @@ -337,8 +367,8 @@ uint64 get_frequency_from_cpuid(); union PCM_CPUID_INFO { - int array[4]; - struct { int eax,ebx,ecx,edx; } reg ; + int array[4]; + struct { unsigned int eax, ebx, ecx, edx; } reg; }; void pcm_cpuid(int leaf, PCM_CPUID_INFO & info) @@ -390,6 +420,10 @@ bool PCM::detectModel() cpu_family = (((cpuinfo.array[0]) >> 8) & 0xf) | ((cpuinfo.array[0] & 0xf00000) >> 16); cpu_model = original_cpu_model = (((cpuinfo.array[0]) & 0xf0) >> 4) | ((cpuinfo.array[0] & 0xf0000) >> 12); + if (cpuinfo.reg.ecx & (1UL<<31UL)) { + std::cerr << "Detected a hypervisor/virtualization technology. Some metrics might not be available due to configuration or availability of virtual hardware features." << std::endl; + } + if (max_cpuid >= 0xa) { // get counter related info @@ -447,7 +481,7 @@ void PCM::initL3CacheOccupancyMonitoring() std::vector rmid(num_sockets); - for(int i = 0; i < num_sockets; i ++) + for(int32 i = 0; i < num_sockets; ++i) rmid[i] = maxRMID - 1; /* Associate each core with 1 RMID */ @@ -462,14 +496,14 @@ void PCM::initL3CacheOccupancyMonitoring() MSR[core]->read(IA32_PQR_ASSOC, &msr_pqr_assoc); //std::cout << "Socket Id : " << topology[core].socket; - msr_pqr_assoc &= 0xffffffff00000000ULL; - msr_pqr_assoc |= (uint64)(rmid[topology[core].socket] & ((1ULL<<10)-1ULL)); + msr_pqr_assoc &= static_cast(0xffffffff00000000ULL); + msr_pqr_assoc |= (static_cast(rmid[topology[core].socket]) & static_cast((1ULL << 10ULL) - 1ULL)); //Write 0xC8F MSR with new RMID for each core MSR[core]->write(IA32_PQR_ASSOC,msr_pqr_assoc); //Write MSR 0xC8D , the event id and rmid for each core - msr_qm_evtsel = rmid[topology[core].socket] & ((1ULL<<10)-1ULL) ; - msr_qm_evtsel <<= 32 ; + msr_qm_evtsel = static_cast(rmid[topology[core].socket]) & static_cast((1ULL << 10) - 1ULL); + msr_qm_evtsel <<= static_cast(32ULL); msr_qm_evtsel |= event & ((1ULL<<8)-1ULL); MSR[core]->write(IA32_QM_EVTSEL,msr_qm_evtsel); @@ -487,7 +521,7 @@ void PCM::initCStateSupportTables() #define PCM_CSTATE_ARRAY(array_ , val ) \ { \ static uint64 tmp[] = val; \ - PCM_COMPILE_ASSERT( sizeof(tmp)/sizeof(uint64) == MAX_C_STATE + 1); \ + PCM_COMPILE_ASSERT(sizeof(tmp) / sizeof(uint64) == (static_cast(MAX_C_STATE)+1)); \ array_ = tmp; \ break; \ } @@ -500,6 +534,7 @@ void PCM::initCStateSupportTables() case ATOM_CENTERTON: case ATOM_AVOTON: case ATOM_BAYTRAIL: + case ATOM_CHERRYTRAIL: PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x3F8, 0, 0x3F9, 0, 0x3FA, 0, 0, 0, 0 }) ); case NEHALEM_EP: case NEHALEM: @@ -516,9 +551,12 @@ void PCM::initCStateSupportTables() case HASWELL: case HASWELL_2: case HASWELLX: + case BDX_DE: PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0, 0, 0}) ); case HASWELL_ULT: case BROADWELL: + case SKL: + case BROADWELL_XEON_E3: PCM_CSTATE_ARRAY(pkgCStateMsr, PCM_PARAM_PROTECT({0, 0, 0x60D, 0x3F8, 0, 0, 0x3F9, 0x3FA, 0x630, 0x631, 0x632}) ); default: @@ -548,9 +586,13 @@ void PCM::initCStateSupportTables() case HASWELL_2: case HASWELL_ULT: case HASWELLX: + case BDX_DE: case BROADWELL: + case BROADWELL_XEON_E3: case ATOM_BAYTRAIL: case ATOM_AVOTON: + case ATOM_CHERRYTRAIL: + case SKL: PCM_CSTATE_ARRAY(coreCStateMsr, PCM_PARAM_PROTECT({0, 0, 0, 0x3FC, 0, 0, 0x3FD, 0x3FE, 0, 0, 0}) ); default: std::cerr << "PCM error: core C-states support array is not initialized. Core C-states metrics will not be shown." << std::endl; @@ -563,25 +605,36 @@ bool PCM::discoverSystemTopology() typedef std::map socketIdMap_type; socketIdMap_type socketIdMap; -#ifdef _MSC_VER -// version for Windows - -#ifdef COMPILE_FOR_WINDOWS_7 - DWORD GroupStart[5]; // at most 4 groups on Windows 7 - GroupStart[0] = 0; - GroupStart[1] = GetActiveProcessorCount(0); - GroupStart[2] = GroupStart[1] + GetActiveProcessorCount(1); - GroupStart[3] = GroupStart[2] + GetActiveProcessorCount(2); - GroupStart[4] = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); - if (GroupStart[3] + GetActiveProcessorCount(3) != GetActiveProcessorCount(ALL_PROCESSOR_GROUPS)) - { - std::cerr << "Error in processor group size counting (1)" << std::endl; - std::cerr << "Make sure your binary is compiled for 64-bit: using 'x64' platform configuration." << std::endl; + PCM_CPUID_INFO cpuid_args; + pcm_cpuid(1, cpuid_args); + + int apic_ids_per_package = (cpuid_args.array[1] & 0x00FF0000) >> 16, apic_ids_per_core; + + if (apic_ids_per_package == 0) + { + std::cout << "apic_ids_per_package == 0" << std::endl; + return false; + } + + pcm_cpuid(0xb, 0x0, cpuid_args); + + if ((cpuid_args.array[2] & 0xFF00) == 0x100) + apic_ids_per_core = cpuid_args.array[1] & 0xFFFF; + else + apic_ids_per_core = 1; + + if (apic_ids_per_core == 0) + { + std::cout << "apic_ids_per_core == 0" << std::endl; return false; } + +#ifdef _MSC_VER +// version for Windows 7 and later version + char * slpi = new char[sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)]; - DWORD len = sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX); - DWORD res = GetLogicalProcessorInformationEx(RelationAll, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi, &len); + DWORD len = (DWORD)sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX); + BOOL res = GetLogicalProcessorInformationEx(RelationAll, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi, &len); while (res == FALSE) { @@ -594,9 +647,11 @@ bool PCM::discoverSystemTopology() } else { - _com_error error(GetLastError()); std::wcerr << "Error in Windows function 'GetLogicalProcessorInformationEx': " << - GetLastError() << " " << error.ErrorMessage() << std::endl; + GetLastError() << " "; + const TCHAR * strError = _com_error(GetLastError()).ErrorMessage(); + if (strError) std::wcerr << strError; + std::wcerr << std::endl; return false; } } @@ -604,7 +659,7 @@ bool PCM::discoverSystemTopology() char * base_slpi = slpi; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pi = NULL; - for ( ; slpi < base_slpi + len; slpi += pi->Size) + for ( ; slpi < base_slpi + len; slpi += (DWORD)pi->Size) { pi = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi; if (pi->Relationship == RelationProcessorCore) @@ -623,37 +678,11 @@ bool PCM::discoverSystemTopology() return false; } -#undef PCM_WIN7_USE_CPUID_FOR_TOPOLOGY_ENUMERATION -#ifdef PCM_WIN7_USE_CPUID_FOR_TOPOLOGY_ENUMERATION - PCM_CPUID_INFO cpuid_args; - pcm_cpuid(1, cpuid_args); - - int apic_ids_per_package = (cpuid_args.array[1] & 0x00FF0000) >> 16, apic_ids_per_core; - - if (apic_ids_per_package == 0) - { - std::cout << "apic_ids_per_package == 0" << std::endl; - return false; - } - - pcm_cpuid(0xb, 0x0, cpuid_args); - - if ((cpuid_args.array[2] & 0xFF00) == 0x100) - apic_ids_per_core = cpuid_args.array[1] & 0xFFFF; - else - apic_ids_per_core = 1; - - if (apic_ids_per_core == 0) - { - std::cout << "apic_ids_per_core == 0" << std::endl; - return false; - } - - for (int i = 0; i < num_cores; i++) + for (int i = 0; i < (int)num_cores; i++) { ThreadGroupTempAffinity affinity(i); - pcm_cpuid(0xb, cpuid_args); + pcm_cpuid(0xb, 0x0, cpuid_args); int apic_id = cpuid_args.array[3]; @@ -665,99 +694,9 @@ bool PCM::discoverSystemTopology() topology.push_back(entry); socketIdMap[entry.socket] = 0; } -#else - - topology.resize(num_cores); - - slpi = base_slpi; - pi = NULL; - - for ( ; slpi < base_slpi + len; slpi += pi->Size) - { - pi = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)slpi; - if (pi->Relationship == RelationNumaNode) - { - ++num_sockets; - for (unsigned int c = 0; c < (unsigned int)num_cores; ++c) - { - // std::cout << "c:"<NumaNode.GroupMask.Group]: "<NumaNode.GroupMask.Group]<NumaNode.GroupMask.Group] || c >= GroupStart[(pi->NumaNode.GroupMask.Group) + 1]) - { - //std::cout <<"core "<NumaNode.GroupMask.Group << std::endl; - continue; - } - if ((1LL << (c - GroupStart[pi->NumaNode.GroupMask.Group])) & pi->NumaNode.GroupMask.Mask) - { - topology[c].core_id = c; - topology[c].os_id = c; - topology[c].socket = pi->NumaNode.NodeNumber; - // std::cout << "Core "<< c <<" is in NUMA node "<< topology[c].socket << " and belongs to processor group " << slpi->NumaNode.GroupMask.Group <= 0) - { - topology[entry.os_id] = entry; - ++num_online_cores; - if (entry.socket == 0 && entry.core_id == 0) ++threads_per_core; - } sscanf(buffer, "processor\t: %d", &entry.os_id); //std::cout << "os_core_id: "<= 0) - { - topology[entry.os_id] = entry; - ++num_online_cores; - if (entry.socket == 0 && entry.core_id == 0) ++threads_per_core; - } fclose(f_cpuinfo); #elif defined(__FreeBSD__) size_t size = sizeof(num_cores); - cpuctl_cpuid_args_t cpuid_args; - int fd, apic_ids_per_package, apic_ids_per_core; + cpuctl_cpuid_args_t cpuid_args_freebds; + int fd; if(0 != sysctlbyname("kern.smp.cpus", &num_cores, &size, NULL, 0)) { @@ -859,29 +776,6 @@ bool PCM::discoverSystemTopology() std::cout << "cpuctl(4) not loaded." << std::endl; return false; } - - do_cpuid(1, cpuid_args.data); - - apic_ids_per_package = (cpuid_args.data[1] & 0x00FF0000) >> 16; - - if (apic_ids_per_package == 0) - { - std::cout << "apic_ids_per_package == 0" << std::endl; - return false; - } - - cpuid_count(0xb, 0x0, cpuid_args.data); - - if ((cpuid_args.data[2] & 0xFF00) == 0x100) - apic_ids_per_core = cpuid_args.data[1] & 0xFFFF; - else - apic_ids_per_core = 1; - - if (apic_ids_per_core == 0) - { - std::cout << "apic_ids_per_core == 0" << std::endl; - return false; - } for (int i = 0; i < num_cores; i++) { @@ -891,11 +785,11 @@ bool PCM::discoverSystemTopology() sprintf(cpuctl_name, "/dev/cpuctl%d", i); fd = ::open(cpuctl_name, O_RDWR); - cpuid_args.level = 0xb; + cpuid_args_freebds.level = 0xb; - ::ioctl(fd, CPUCTL_CPUID, &cpuid_args); + ::ioctl(fd, CPUCTL_CPUID, &cpuid_args_freebds); - apic_id = cpuid_args.data[3]; + apic_id = cpuid_args_freebds.data[3]; entry.os_id = i; entry.socket = apic_id / apic_ids_per_package; @@ -939,10 +833,9 @@ bool PCM::discoverSystemTopology() // The OSX version needs the MSR handle earlier so that it can build the CPU topology. // This topology functionality should potentially go into a different KEXT - MSR = new SafeMsrHandle *[num_cores]; for(int i = 0; i < num_cores; i++) { - MSR[i] = new SafeMsrHandle(i); + MSR.push_back(std::shared_ptr(new SafeMsrHandle(i)) ); } TopologyEntry *entries = new TopologyEntry[num_cores]; @@ -962,10 +855,10 @@ bool PCM::discoverSystemTopology() #endif //end of ifdef _MSC_VER if(num_cores == 0) { - num_cores = topology.size(); + num_cores = (int32)topology.size(); } if(num_sockets == 0) { - num_sockets = (std::max)(socketIdMap.size(), (size_t)1); + num_sockets = (int32)(std::max)(socketIdMap.size(), (size_t)1); } socketIdMap_type::iterator s = socketIdMap.begin(); @@ -974,9 +867,9 @@ bool PCM::discoverSystemTopology() s->second = sid++; } - for (int i = 0; (i < num_cores) && (!socketIdMap.empty()); ++i) + for (int i = 0; (i < (int)num_cores) && (!socketIdMap.empty()); ++i) { - if(isCoreOnline(i)) + if(isCoreOnline((int32)i)) topology[i].socket = socketIdMap[topology[i].socket]; } @@ -991,7 +884,7 @@ bool PCM::discoverSystemTopology() if(threads_per_core == 0) { - for (int i = 0; i < num_cores; ++i) + for (int i = 0; i < (int)num_cores; ++i) { if(topology[i].socket == topology[0].socket && topology[i].core_id == topology[0].core_id) ++threads_per_core; @@ -1038,8 +931,8 @@ void PCM::printSystemTopology() const else { std::cerr << "Offlined cores: "; - for (int i = 0; i < num_cores; ++i) - if(isCoreOnline(i) == false) + for (int i = 0; i < (int)num_cores; ++i) + if(isCoreOnline((int32)i) == false) std::cerr << i << " "; std::cerr << std::endl; } @@ -1055,29 +948,23 @@ void PCM::printSystemTopology() const } } -void PCM::initMSR() +bool PCM::initMSR() { - int32 i = 0; - #ifndef __APPLE__ - MSR = new SafeMsrHandle *[num_cores]; try { - for (i = 0; i < num_cores; ++i) + for (int i = 0; i < (int)num_cores; ++i) { - MSR[i] = isCoreOnline(i) ? (new SafeMsrHandle(i)):(new SafeMsrHandle()); // the core is offlined, assign an invalid MSR handle + if (isCoreOnline((int32)i)) + MSR.push_back(std::shared_ptr(new SafeMsrHandle(i))); + else // the core is offlined, assign an invalid MSR handle + MSR.push_back(std::shared_ptr(new SafeMsrHandle())); } } catch (...) { // failed - for (int j = 0; j < i; j++) - { - if(MSR[j]) - delete MSR[j]; - } - delete[] MSR; - MSR = NULL; + MSR.clear(); std::cerr << "Can not access CPUs Model Specific Registers (MSRs)." << std::endl; #ifdef _MSC_VER @@ -1089,14 +976,15 @@ void PCM::initMSR() std::cerr << "Ensure cpuctl module is loaded and that you have read and write" << std::endl; std::cerr << "permissions for /dev/cpuctl* devices (the 'chown' command can help)." << std::endl; #endif - + return false; } #endif + return true; } bool PCM::detectNominalFrequency() { - if (MSR) + if (MSR.size()) { uint64 freq = 0; MSR[socketRefCore[0]]->read(PLATFORM_INFO_ADDR, &freq); @@ -1105,10 +993,12 @@ bool PCM::detectNominalFrequency() || cpu_model == JAKETOWN || cpu_model == IVYTOWN || cpu_model == HASWELLX + || cpu_model == BDX_DE || cpu_model == IVY_BRIDGE || cpu_model == HASWELL || cpu_model == BROADWELL || original_cpu_model == ATOM_AVOTON + || cpu_model == SKL ) ? (100000000ULL) : (133333333ULL); nominal_frequency = ((freq >> 8) & 255) * bus_freq; @@ -1131,21 +1021,24 @@ bool PCM::detectNominalFrequency() void PCM::initEnergyMonitoring() { - if(packageEnergyMetricsAvailable() && MSR) + if(packageEnergyMetricsAvailable() && MSR.size()) { uint64 rapl_power_unit = 0; MSR[socketRefCore[0]]->read(MSR_RAPL_POWER_UNIT,&rapl_power_unit); uint64 energy_status_unit = extract_bits(rapl_power_unit,8,12); - joulesPerEnergyUnit = 1./double(1ULL<read(MSR_PKG_POWER_INFO,&package_power_info); - pkgThermalSpecPower = (uint32) (double(extract_bits(package_power_info, 0, 14))*wattsPerPowerUnit); - pkgMinimumPower = (uint32) (double(extract_bits(package_power_info, 16, 30))*wattsPerPowerUnit); - pkgMaximumPower = (uint32) (double(extract_bits(package_power_info, 32, 46))*wattsPerPowerUnit); + pkgThermalSpecPower = (int32) (double(extract_bits(package_power_info, 0, 14))*wattsPerPowerUnit); + pkgMinimumPower = (int32) (double(extract_bits(package_power_info, 16, 30))*wattsPerPowerUnit); + pkgMaximumPower = (int32) (double(extract_bits(package_power_info, 32, 46))*wattsPerPowerUnit); std::cerr << "Package thermal spec power: "<< pkgThermalSpecPower << " Watt; "; std::cerr << "Package minimum power: "<< pkgMinimumPower << " Watt; "; @@ -1153,39 +1046,35 @@ void PCM::initEnergyMonitoring() int i = 0; - if(snb_energy_status.empty()) - for (i = 0; i < num_sockets; ++i) - snb_energy_status.push_back(new CounterWidthExtender(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[i]],MSR_PKG_ENERGY_STATUS)) ); + if(energy_status.empty()) + for (i = 0; i < (int)num_sockets; ++i) + energy_status.push_back( + std::shared_ptr( + new CounterWidthExtender(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[i]], MSR_PKG_ENERGY_STATUS)))); - if(dramEnergyMetricsAvailable() && jkt_dram_energy_status.empty()) - for (i = 0; i < num_sockets; ++i) - jkt_dram_energy_status.push_back(new CounterWidthExtender(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[i]],MSR_DRAM_ENERGY_STATUS))); + if(dramEnergyMetricsAvailable() && dram_energy_status.empty()) + for (i = 0; i < (int)num_sockets; ++i) + dram_energy_status.push_back( + std::shared_ptr( + new CounterWidthExtender(new CounterWidthExtender::MsrHandleCounter(MSR[socketRefCore[i]], MSR_DRAM_ENERGY_STATUS)))); } } void PCM::initUncoreObjects() { - if (hasPCICFGUncore() && MSR != NULL) + if (hasPCICFGUncore() && MSR.size()) { - server_pcicfg_uncore = new ServerPCICFGUncore *[num_sockets]; int i = 0; - try { - for (i = 0; i < num_sockets; ++i) + for (i = 0; i < (int)num_sockets; ++i) { - server_pcicfg_uncore[i] = new ServerPCICFGUncore(i, this); + server_pcicfg_uncore.push_back(std::shared_ptr(new ServerPCICFGUncore(i, this))); } } catch (...) { - // failed - for (int j = 0; j < i; j++) - delete server_pcicfg_uncore[j]; - delete[] server_pcicfg_uncore; - - server_pcicfg_uncore = NULL; - + server_pcicfg_uncore.clear(); std::cerr << "Can not access Jaketown/Ivytown PCI configuration space. Access to uncore counters (memory and QPI bandwidth) is disabled." << std::endl; #ifdef _MSC_VER std::cerr << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program." << std::endl; @@ -1196,15 +1085,18 @@ void PCM::initUncoreObjects() std::cerr << "You must be root to access these Jaketown/Ivytown counters in PCM. " << std::endl; #endif } - } else if((cpu_model == SANDY_BRIDGE || cpu_model == IVY_BRIDGE || cpu_model == HASWELL || cpu_model == BROADWELL) && MSR != NULL) + } else if((cpu_model == SANDY_BRIDGE || cpu_model == IVY_BRIDGE || cpu_model == HASWELL || cpu_model == BROADWELL || cpu_model == SKL) && MSR.size()) { // initialize memory bandwidth counting try { - clientBW = new ClientBW(); - clientImcReads = new CounterWidthExtender(new CounterWidthExtender::ClientImcReadsCounter(clientBW)); - clientImcWrites = new CounterWidthExtender(new CounterWidthExtender::ClientImcWritesCounter(clientBW)); - clientIoRequests = new CounterWidthExtender(new CounterWidthExtender::ClientIoRequestsCounter(clientBW)); + clientBW = std::shared_ptr(new ClientBW()); + clientImcReads = std::shared_ptr( + new CounterWidthExtender(new CounterWidthExtender::ClientImcReadsCounter(clientBW))); + clientImcWrites = std::shared_ptr( + new CounterWidthExtender(new CounterWidthExtender::ClientImcWritesCounter(clientBW))); + clientIoRequests = std::shared_ptr( + new CounterWidthExtender(new CounterWidthExtender::ClientIoRequestsCounter(clientBW))); } catch(...) { @@ -1228,6 +1120,7 @@ void PCM::initUncoreObjects() PCU_MSR_PMON_CTRX_ADDR[2] = JKTIVT_PCU_MSR_PMON_CTR2_ADDR; PCU_MSR_PMON_CTRX_ADDR[3] = JKTIVT_PCU_MSR_PMON_CTR3_ADDR; break; + case BDX_DE: case HASWELLX: PCU_MSR_PMON_BOX_CTL_ADDR = HSX_PCU_MSR_PMON_BOX_CTL_ADDR; PCU_MSR_PMON_CTRX_ADDR[0] = HSX_PCU_MSR_PMON_CTR0_ADDR; @@ -1236,14 +1129,12 @@ void PCM::initUncoreObjects() PCU_MSR_PMON_CTRX_ADDR[3] = HSX_PCU_MSR_PMON_CTR3_ADDR; break; default: - PCU_MSR_PMON_BOX_CTL_ADDR = 0; - PCU_MSR_PMON_CTRX_ADDR[0] = 0; - PCU_MSR_PMON_CTRX_ADDR[1] = 0; - PCU_MSR_PMON_CTRX_ADDR[2] = 0; - PCU_MSR_PMON_CTRX_ADDR[3] = 0; + PCU_MSR_PMON_BOX_CTL_ADDR = (uint64)0; + PCU_MSR_PMON_CTRX_ADDR[0] = (uint64)0; + PCU_MSR_PMON_CTRX_ADDR[1] = (uint64)0; + PCU_MSR_PMON_CTRX_ADDR[2] = (uint64)0; + PCU_MSR_PMON_CTRX_ADDR[3] = (uint64)0; } - - } #ifdef __linux__ @@ -1304,17 +1195,14 @@ PCM::PCM() : perfmon_config_anythread(1), nominal_frequency(0), max_qpi_speed(0), + L3ScalingFactor(0), pkgThermalSpecPower(-1), pkgMinimumPower(-1), pkgMaximumPower(-1), allow_multiple_instances(false), programmed_pmu(false), - MSR(NULL), - server_pcicfg_uncore(NULL), - clientBW(NULL), - clientImcReads(NULL), - clientImcWrites(NULL), - clientIoRequests(NULL), + PCU_MSR_PMON_BOX_CTL_ADDR(0), + joulesPerEnergyUnit(0), disable_JKT_workaround(false), blocked(false), coreCStateMsr(NULL), @@ -1342,21 +1230,21 @@ PCM::PCM() : } #endif -#ifdef __linux__ - if(isNMIWatchdogEnabled()) return; -#endif - if(!detectModel()) return; if(!checkModel()) return; +#ifdef __linux__ + if (isNMIWatchdogEnabled()) return; +#endif + initCStateSupportTables(); if(!discoverSystemTopology()) return; printSystemTopology(); - initMSR(); + if(!initMSR()) return; if(!detectNominalFrequency()) return; @@ -1379,7 +1267,7 @@ void PCM::enableJKTWorkaround(bool enable) if(disable_JKT_workaround) return; std::cerr << "Using PCM on your system might have a performance impact as per http://software.intel.com/en-us/articles/performance-impact-when-sampling-certain-llc-events-on-snb-ep-with-vtune" << std::endl; std::cerr << "You can avoid the performance impact by using the option --noJKTWA, however the cache metrics might be wrong then." << std::endl; - if(MSR) + if(MSR.size()) { for(int32 i = 0; i < num_cores; ++i) { @@ -1392,12 +1280,9 @@ void PCM::enableJKTWorkaround(bool enable) MSR[i]->write(0x39C, val64); } } - if(server_pcicfg_uncore) + for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i) { - for (int32 i = 0; i < num_sockets; ++i) - { - if(server_pcicfg_uncore[i]) server_pcicfg_uncore[i]->enableJKTWorkaround(enable); - } + if(server_pcicfg_uncore[i].get()) server_pcicfg_uncore[i]->enableJKTWorkaround(enable); } } @@ -1420,17 +1305,27 @@ bool PCM::isCPUModelSupported(int model_) || model_ == HASWELL || model_ == IVYTOWN || model_ == HASWELLX - || model_ == BROADWELL + || model_ == BDX_DE + || model_ == BROADWELL + || model_ == SKL ); } bool PCM::checkModel() { if (cpu_model == NEHALEM) cpu_model = NEHALEM_EP; - if (cpu_model == ATOM_2 || cpu_model == ATOM_CENTERTON || cpu_model == ATOM_BAYTRAIL || cpu_model == ATOM_AVOTON) cpu_model = ATOM; + if ( cpu_model == ATOM_2 + || cpu_model == ATOM_CENTERTON + || cpu_model == ATOM_BAYTRAIL + || cpu_model == ATOM_AVOTON + || cpu_model == ATOM_CHERRYTRAIL + ) { + cpu_model = ATOM; + } if (cpu_model == HASWELL_ULT || cpu_model == HASWELL_2) cpu_model = HASWELL; + if (cpu_model == BROADWELL_XEON_E3) cpu_model = BROADWELL; - if(!isCPUModelSupported(cpu_model)) + if(!isCPUModelSupported((int)cpu_model)) { std::cerr << getUnsupportedMessage() << " CPU model number: " << cpu_model << " Brand: \"" << getCPUBrandString().c_str() <<"\""<< std::endl; /* FOR TESTING PURPOSES ONLY */ @@ -1446,13 +1341,7 @@ bool PCM::checkModel() void PCM::destroyMSR() { - if (MSR) - { - for (int i = 0; i < num_cores; ++i) - if (MSR[i]) delete MSR[i]; - delete[] MSR; - MSR = NULL; - } + MSR.clear(); } PCM::~PCM() @@ -1461,66 +1350,15 @@ PCM::~PCM() if (instance) { destroyMSR(); - - if (server_pcicfg_uncore) - { - for (int i = 0; i < num_sockets; ++i) - if (server_pcicfg_uncore[i]) delete server_pcicfg_uncore[i]; - delete[] server_pcicfg_uncore; - } - for (uint32 i = 0; i < snb_energy_status.size(); ++i) - { - delete snb_energy_status[i]; - } - for (uint32 i = 0; i < jkt_dram_energy_status.size(); ++i) - delete jkt_dram_energy_status[i]; - instance = NULL; - - if(clientImcReads) delete clientImcReads; - clientImcReads = NULL; - if(clientImcWrites) delete clientImcWrites; - clientImcWrites = NULL; - if(clientIoRequests) delete clientIoRequests; - clientIoRequests = NULL; - if(clientBW) delete clientBW; - clientBW = NULL; } } bool PCM::good() { - return MSR != NULL; + return !MSR.empty(); } -class TemporalThreadAffinity // speedup trick for Linux -{ -#ifdef __linux__ - cpu_set_t old_affinity; - TemporalThreadAffinity(); // forbiden - -public: - TemporalThreadAffinity(uint32 core_id) - { - pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &old_affinity); - - cpu_set_t new_affinity; - CPU_ZERO(&new_affinity); - CPU_SET(core_id, &new_affinity); - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &new_affinity); - } - ~TemporalThreadAffinity() - { - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &old_affinity); - } -#else // not implemented for windows or os x - TemporalThreadAffinity(); // forbiden - -public: - TemporalThreadAffinity(uint32) { } -#endif -}; - #ifdef PCM_USE_PERF perf_event_attr PCM_init_perf_event_attr() { @@ -1562,7 +1400,7 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter } InstanceLock lock(allow_multiple_instances); - if (!MSR) return PCM::MSRAccessDenied; + if (MSR.empty()) return PCM::MSRAccessDenied; ExtendedCustomCoreEventDescription * pExtDesc = (ExtendedCustomCoreEventDescription *)parameter_; @@ -1594,22 +1432,29 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter if (!numInstancesSemaphore) { _com_error error(GetLastError()); - std::wcerr << "Error in Windows function 'CreateSemaphore': " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Error in Windows function 'CreateSemaphore': " << GetLastError() << " "; + const TCHAR * strError = _com_error(GetLastError()).ErrorMessage(); + if (strError) std::wcerr << strError; + std::wcerr << std::endl; return PCM::UnknownError; } LONG prevValue = 0; if (!ReleaseSemaphore(numInstancesSemaphore, 1, &prevValue)) { _com_error error(GetLastError()); - std::wcerr << "Error in Windows function 'ReleaseSemaphore': " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Error in Windows function 'ReleaseSemaphore': " << GetLastError() << " "; + const TCHAR * strError = _com_error(GetLastError()).ErrorMessage(); + if (strError) std::wcerr << strError; + std::wcerr << std::endl; return PCM::UnknownError; } if (prevValue > 0) // already programmed since another instance exists { std::cerr << "Number of PCM instances: " << (prevValue + 1) << std::endl; - if (hasPCICFGUncore() && server_pcicfg_uncore && server_pcicfg_uncore[0] && max_qpi_speed==0) - for (int i = 0; i < num_sockets; ++i) - max_qpi_speed = (std::max)(server_pcicfg_uncore[i]->computeQPISpeed(socketRefCore[i],cpu_model), max_qpi_speed); // parenthesis to avoid macro expansion on Windows + if (hasPCICFGUncore() && max_qpi_speed==0) + for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i) + if (server_pcicfg_uncore[i].get()) + max_qpi_speed = (std::max)(server_pcicfg_uncore[i]->computeQPISpeed(socketRefCore[i], cpu_model), max_qpi_speed); // parenthesis to avoid macro expansion on Windows reportQPISpeed(); return PCM::Success; @@ -1635,10 +1480,11 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter if (curValue > 1) // already programmed since another instance exists { std::cerr << "Number of PCM instances: " << curValue << std::endl; - if (hasPCICFGUncore() && server_pcicfg_uncore && server_pcicfg_uncore[0] && max_qpi_speed==0) - for (int i = 0; i < num_sockets; ++i) { + if (hasPCICFGUncore() && max_qpi_speed==0) + for (int i = 0; i < (int)server_pcicfg_uncore.size(); ++i) { + if(server_pcicfg_uncore[i].get()) max_qpi_speed = std::max(server_pcicfg_uncore[i]->computeQPISpeed(socketRefCore[i],cpu_model), max_qpi_speed); - reportQPISpeed(); + reportQPISpeed(); } if(!canUsePerf) return PCM::Success; } @@ -1711,8 +1557,20 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter coreEventDesc[1].event_number = ARCH_LLC_REFERENCE_EVTNR; coreEventDesc[1].umask_value = ARCH_LLC_REFERENCE_UMASK; core_gen_counter_num_used = 2; - } - else if ( + + } else if ( SKL == cpu_model ) + { + coreEventDesc[0].event_number = SKL_MEM_LOAD_RETIRED_L3_MISS_EVTNR; + coreEventDesc[0].umask_value = SKL_MEM_LOAD_RETIRED_L3_MISS_UMASK; + coreEventDesc[1].event_number = SKL_MEM_LOAD_RETIRED_L3_HIT_EVTNR; + coreEventDesc[1].umask_value = SKL_MEM_LOAD_RETIRED_L3_HIT_UMASK; + coreEventDesc[2].event_number = SKL_MEM_LOAD_RETIRED_L2_MISS_EVTNR; + coreEventDesc[2].umask_value = SKL_MEM_LOAD_RETIRED_L2_MISS_UMASK; + coreEventDesc[3].event_number = SKL_MEM_LOAD_RETIRED_L2_HIT_EVTNR; + coreEventDesc[3].umask_value = SKL_MEM_LOAD_RETIRED_L2_HIT_UMASK; + core_gen_counter_num_used = 4; + + } else if ( SANDY_BRIDGE == cpu_model || JAKETOWN == cpu_model || IVYTOWN == cpu_model @@ -1720,6 +1578,7 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter || HASWELL == cpu_model || HASWELLX == cpu_model || BROADWELL == cpu_model + || BDX_DE == cpu_model ) { coreEventDesc[0].event_number = ARCH_LLC_MISS_EVTNR; @@ -1779,7 +1638,7 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter programmed_pmu = true; // Version for linux/windows - for (int i = 0; i < num_cores; ++i) + for (int i = 0; i < (int)num_cores; ++i) { // program core counters @@ -1937,9 +1796,9 @@ PCM::ErrorCode PCM::program(const PCM::ProgramMode mode_, const void * parameter std::cerr << "Successfully programmed on-core PMU using Linux perf"<program(); max_qpi_speed = (std::max)(server_pcicfg_uncore[i]->computeQPISpeed(socketRefCore[i],cpu_model), max_qpi_speed); // parenthesis to avoid macro expansion on Windows @@ -1955,11 +1814,11 @@ void PCM::reportQPISpeed() const { if (!max_qpi_speed) return; - if (hasPCICFGUncore() && server_pcicfg_uncore && server_pcicfg_uncore[0]) { - for (int i = 0; i < num_sockets; ++i) + if (hasPCICFGUncore()) { + for (size_t i = 0; i < (size_t)server_pcicfg_uncore.size(); ++i) { std::cerr << "Socket " << i << std::endl; - server_pcicfg_uncore[i]->reportQPISpeed(); + if(server_pcicfg_uncore[i].get()) server_pcicfg_uncore[i]->reportQPISpeed(); } } else { std::cerr << "Max QPI speed: " << max_qpi_speed / (1e9) << " GBytes/second (" << max_qpi_speed / (2e9) << " GT/second)" << std::endl; @@ -2059,7 +1918,7 @@ void PCM::programNehalemEPUncore(int32 core) void PCM::programBecktonUncore(int32 core) { // program Beckton uncore - if (core == socketRefCore[0]) computeQPISpeedBeckton(core); + if (core == socketRefCore[0]) computeQPISpeedBeckton((int)core); uint64 value = 1 << 29ULL; // reset all counters MSR[core]->write(U_MSR_PMON_GLOBAL_CTL, value); @@ -2175,7 +2034,7 @@ void PCM::computeNominalFrequency() } std::string PCM::getCPUBrandString() { - char buffer[sizeof(int)*4*3+1]; + char buffer[sizeof(int)*4*3+1]; PCM_CPUID_INFO * info = (PCM_CPUID_INFO *) buffer; pcm_cpuid(0x80000002, *info); ++info; @@ -2185,21 +2044,34 @@ std::string PCM::getCPUBrandString() buffer[sizeof(int)*4*3] = 0; std::string result(buffer); while(result[0]==' ') result.erase(0,1); - size_t i = std::string::npos; - while((i=result.find(" "))!=std::string::npos) result.replace(i,2," "); // remove duplicate spaces + std::string::size_type i; + while((i = result.find(" ")) != std::string::npos) result.replace(i,2," "); // remove duplicate spaces return result; } +std::string PCM::getCPUFamilyModelString() +{ + char buffer[sizeof(int)*4*3+6]; + memset(buffer,0,sizeof(buffer)); +#ifdef _MSC_VER + sprintf_s(buffer,sizeof(buffer),"GenuineIntel-%d-%2X",this->cpu_family,this->original_cpu_model); +#else + snprintf(buffer,sizeof(buffer),"GenuineIntel-%d-%2X",this->cpu_family,this->original_cpu_model); +#endif + std::string result(buffer); + return result; +} + uint64 get_frequency_from_cpuid() // from Pat Fay (Intel) { double speed=0; std::string brand = PCM::getCPUBrandString(); - if(brand.length() > 0) + if (brand.length() > std::string::size_type(0)) { - size_t unitsg = brand.find("GHz"); + std::string::size_type unitsg = brand.find("GHz"); if(unitsg != std::string::npos) { - size_t atsign = brand.rfind(' ', unitsg); + std::string::size_type atsign = brand.rfind(' ', unitsg); if(atsign != std::string::npos) { std::istringstream(brand.substr(atsign)) >> speed; @@ -2208,10 +2080,10 @@ uint64 get_frequency_from_cpuid() // from Pat Fay (Intel) } else { - size_t unitsg = brand.find("MHz"); + std::string::size_type unitsg = brand.find("MHz"); if(unitsg != std::string::npos) { - size_t atsign = brand.rfind(' ', unitsg); + std::string::size_type atsign = brand.rfind(' ', unitsg); if(atsign != std::string::npos) { std::istringstream(brand.substr(atsign)) >> speed; @@ -2219,23 +2091,23 @@ uint64 get_frequency_from_cpuid() // from Pat Fay (Intel) } } } - return (uint64)speed * 1000ULL * 1000ULL; + return (uint64)(speed * 1000. * 1000.); } std::string PCM::getSupportedUarchCodenames() const { std::ostringstream ostr; - for(int32 i=0; i < PCM::END_OF_MODEL_LIST ; ++i) - if(isCPUModelSupported(i)) + for(int32 i=0; i < static_cast(PCM::END_OF_MODEL_LIST) ; ++i) + if(isCPUModelSupported((int)i)) ostr << getUArchCodename(i) << ", "; - return ostr.str().substr(0, ostr.str().length() - 2); + return std::string(ostr.str().substr(0, ostr.str().length() - 2)); } std::string PCM::getUnsupportedMessage() const { std::ostringstream ostr; ostr << "Error: unsupported processor. Only Intel(R) processors are supported (Atom(R) and microarchitecture codename "<< getSupportedUarchCodenames() <<")."; - return ostr.str(); + return std::string(ostr.str()); } void PCM::computeQPISpeedBeckton(int core_nr) @@ -2261,11 +2133,11 @@ void PCM::computeQPISpeedBeckton(int core_nr) MSR[core_nr]->read(R_MSR_PMON_CTR0, &startFlits); const uint64 timerGranularity = 1000000ULL; // mks - uint64 startTSC = getTickCount(timerGranularity, core_nr); + uint64 startTSC = getTickCount(timerGranularity, (uint32) core_nr); uint64 endTSC; do { - endTSC = getTickCount(timerGranularity, core_nr); + endTSC = getTickCount(timerGranularity, (uint32) core_nr); } while (endTSC - startTSC < 200000ULL); // spin for 200 ms uint64 endFlits = 0; @@ -2277,7 +2149,7 @@ void PCM::computeQPISpeedBeckton(int core_nr) bool PCM::PMUinUse() { // follow the "Performance Monitoring Unit Sharing Guide" by P. Irelan and Sh. Kuo - for (int i = 0; i < num_cores; ++i) + for (int i = 0; i < (int)num_cores; ++i) { //std::cout << "Core "<write(IA32_CR_PERF_GLOBAL_CTRL, (1ULL << 32) + (1ULL << 33) + (1ULL << 34)); @@ -2394,7 +2270,7 @@ void PCM::cleanupPMU() void PCM::resetPMU() { - for (int i = 0; i < num_cores; ++i) + for (int i = 0; i < (int)num_cores; ++i) { // disable all counters MSR[i]->write(IA32_CR_PERF_GLOBAL_CTRL, 0); @@ -2426,6 +2302,10 @@ void PCM::resetPMU() } void PCM::freeRMID() { + if(!L3CacheOccupancyMetricAvailable()) { + return; + } + for(int32 core = 0; core < num_cores; core ++ ) { if(!isCoreOnline(core)) continue; @@ -2453,6 +2333,7 @@ void PCM::freeRMID() std::cerr << " Freeing up all RMIDs" << std::endl; } + void PCM::setOutput(const std::string filename) { outfile = new std::ofstream(filename.c_str()); @@ -2475,7 +2356,7 @@ void PCM::cleanup() { InstanceLock lock(allow_multiple_instances); - if (!MSR) return; + if (MSR.empty()) return; std::cerr << "Cleaning up" << std::endl; @@ -2485,6 +2366,26 @@ void PCM::cleanup() freeRMID(); } +// hle is only available when cpuid has this: +// HLE: CPUID.07H.EBX.HLE [bit 4] = 1 +bool PCM::supportsHLE() const +{ + PCM_CPUID_INFO info; + pcm_cpuid(7, 0, info); // leaf 7, subleaf 0 + + return (info.reg.ebx & (0x1 << 4)) ? true : false; +} + +// rtm is only available when cpuid has this: +// RTM: CPUID.07H.EBX.RTM [bit 11] = 1 +bool PCM::supportsRTM() const +{ + PCM_CPUID_INFO info; + pcm_cpuid(7, 0, info); // leaf 7, subleaf 0 + + return (info.reg.ebx & (0x1 << 11)) ? true : false; +} + #ifdef __APPLE__ uint32 PCM::getNumInstances() @@ -2614,7 +2515,7 @@ uint64 RDTSC() #ifdef _MSC_VER // Windows #if _MSC_VER>= 1600 - result = __rdtsc(); + result = static_cast(__rdtsc()); #endif #else // Linux @@ -2700,7 +2601,7 @@ void PCM::readPerfData(uint32 core, std::vector & outData) } #endif -void BasicCounterState::readAndAggregate(SafeMsrHandle * msr) +void BasicCounterState::readAndAggregate(std::shared_ptr msr) { uint64 cInstRetiredAny = 0, cCpuClkUnhaltedThread = 0, cCpuClkUnhaltedRef = 0; uint64 cL3Miss = 0; @@ -2710,7 +2611,7 @@ void BasicCounterState::readAndAggregate(SafeMsrHandle * msr) uint64 cInvariantTSC = 0; uint64 cL3Occupancy = 0; uint64 cCStateResidency[PCM::MAX_C_STATE + 1]; - memset(&(cCStateResidency[0]), 0, sizeof(cCStateResidency)); + memset(cCStateResidency, 0, sizeof(cCStateResidency)); uint64 thermStatus = 0; TemporalThreadAffinity tempThreadAffinity(msr->getCoreId()); // speedup trick for Linux @@ -2748,9 +2649,11 @@ void BasicCounterState::readAndAggregate(SafeMsrHandle * msr) case PCM::JAKETOWN: case PCM::IVYTOWN: case PCM::HASWELLX: + case PCM::BDX_DE: case PCM::IVY_BRIDGE: case PCM::HASWELL: case PCM::BROADWELL: + case PCM::SKL: msr->read(IA32_PMC0, &cL3Miss); msr->read(IA32_PMC1, &cL3UnsharedHit); msr->read(IA32_PMC2, &cL2HitM); @@ -2771,7 +2674,7 @@ void BasicCounterState::readAndAggregate(SafeMsrHandle * msr) else { #ifdef _MSC_VER - cInvariantTSC = ((uint64)(GetTickCount()/1000))*m->getNominalFrequency(); + cInvariantTSC = ((static_cast(GetTickCount()/1000ULL)))*m->getNominalFrequency(); #else struct timeval tp; gettimeofday(&tp, NULL); @@ -2780,7 +2683,7 @@ void BasicCounterState::readAndAggregate(SafeMsrHandle * msr) } // reading core C state counters - for(int i=0; i <= PCM::MAX_C_STATE ;++i) + for(int i=0; i <= (int)(PCM::MAX_C_STATE) ;++i) if(m->coreCStateMsr && m->coreCStateMsr[i]) msr->read(m->coreCStateMsr[i], &(cCStateResidency[i])); @@ -2798,14 +2701,26 @@ void BasicCounterState::readAndAggregate(SafeMsrHandle * msr) L2HitM += m->extractCoreGenCounterValue(cL2HitM); L2Hit += m->extractCoreGenCounterValue(cL2Hit); InvariantTSC += cInvariantTSC; - for(int i=0; i <= PCM::MAX_C_STATE ;++i) + for(int i=0; i <= int(PCM::MAX_C_STATE);++i) CStateResidency[i] += cCStateResidency[i]; ThermalHeadroom = extractThermalHeadroom(thermStatus); } +PCM::ErrorCode PCM::programServerUncoreMemoryMetrics(int rankA, int rankB) +{ + if(MSR.empty() || server_pcicfg_uncore.empty()) return PCM::MSRAccessDenied; + + for (int i = 0; (i < (int)server_pcicfg_uncore.size()) && MSR.size(); ++i) + { + server_pcicfg_uncore[i]->programServerUncoreMemoryMetrics(rankA, rankB); + } + + return PCM::Success; +} + PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands) { - if(MSR == NULL || server_pcicfg_uncore == NULL) return PCM::MSRAccessDenied; + if(MSR.empty() || server_pcicfg_uncore.empty()) return PCM::MSRAccessDenied; uint32 PCUCntConf[4] = {0,0,0,0}; @@ -2847,7 +2762,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof { PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x60) + PCU_MSR_PMON_CTL_EDGE_DET ; // number of frequency transitions PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x60) ; // cycles spent changing frequency: FREQ_TRANS_CYCLES - } else if (HASWELLX == cpu_model ) + } else if (HASWELLX == cpu_model || BDX_DE == cpu_model) { PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x74) + PCU_MSR_PMON_CTL_EDGE_DET ; // number of frequency transitions PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x74) ; // cycles spent changing frequency: FREQ_TRANS_CYCLES @@ -2863,7 +2778,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof { PCUCntConf[2] = PCU_MSR_PMON_CTL_EVENT(0x2B) + PCU_MSR_PMON_CTL_EDGE_DET ; // PC2 transitions PCUCntConf[3] = PCU_MSR_PMON_CTL_EVENT(0x2D) + PCU_MSR_PMON_CTL_EDGE_DET ; // PC6 transitions - } else if (HASWELLX == cpu_model ) + } else if (HASWELLX == cpu_model || BDX_DE == cpu_model) { PCUCntConf[0] = PCU_MSR_PMON_CTL_EVENT(0x4E) ; // PC1e residencys PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x4E) + PCU_MSR_PMON_CTL_EDGE_DET ; // PC1 transitions @@ -2875,7 +2790,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof } break; case 7: - if (HASWELLX == cpu_model ) + if (HASWELLX == cpu_model || BDX_DE == cpu_model) { PCUCntConf[0] = PCU_MSR_PMON_CTL_EVENT(0x7E) ; // UFS_TRANSITIONS_PERF_P_LIMIT PCUCntConf[1] = PCU_MSR_PMON_CTL_EVENT(0x7D) ; // UFS_TRANSITIONS_IO_P_LIMIT @@ -2887,7 +2802,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof } break; case 8: - if (HASWELLX == cpu_model ) + if (HASWELLX == cpu_model || BDX_DE == cpu_model) { PCUCntConf[0] = PCU_MSR_PMON_CTL_EVENT(0x7C) ; // UFS_TRANSITIONS_DOWN } else @@ -2899,7 +2814,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof std::cerr << "ERROR: unsupported PCU profile "<< pcu_profile << std::endl; } - uint32 PCU_MSR_PMON_BOX_FILTER_ADDR, PCU_MSR_PMON_CTLX_ADDR[4] = { 0, 0, 0, 0 }; + uint64 PCU_MSR_PMON_BOX_FILTER_ADDR, PCU_MSR_PMON_CTLX_ADDR[4] = { 0, 0, 0, 0 }; switch(cpu_model) { @@ -2912,6 +2827,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof PCU_MSR_PMON_CTLX_ADDR[3] = JKTIVT_PCU_MSR_PMON_CTL3_ADDR; break; case HASWELLX: + case BDX_DE: PCU_MSR_PMON_BOX_FILTER_ADDR = HSX_PCU_MSR_PMON_BOX_FILTER_ADDR; PCU_MSR_PMON_CTLX_ADDR[0] = HSX_PCU_MSR_PMON_CTL0_ADDR; PCU_MSR_PMON_CTLX_ADDR[1] = HSX_PCU_MSR_PMON_CTL1_ADDR; @@ -2923,7 +2839,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof return PCM::UnknownError; } - for (int i = 0; (i < num_sockets) && server_pcicfg_uncore && MSR; ++i) + for (int i = 0; (i < (int)server_pcicfg_uncore.size()) && MSR.size(); ++i) { server_pcicfg_uncore[i]->program_power_metrics(mc_profile); @@ -2973,7 +2889,7 @@ PCM::ErrorCode PCM::programServerUncorePowerMetrics(int mc_profile, int pcu_prof void PCM::freezeServerUncoreCounters() { - for (int i = 0; (i < num_sockets) && server_pcicfg_uncore && MSR; ++i) + for (int i = 0; (i < (int)server_pcicfg_uncore.size()) && MSR.size(); ++i) { server_pcicfg_uncore[i]->freezeCounters(); MSR[socketRefCore[i]]->write(PCU_MSR_PMON_BOX_CTL_ADDR, PCU_MSR_PMON_BOX_CTL_FRZ_EN + PCU_MSR_PMON_BOX_CTL_FRZ); @@ -2981,13 +2897,13 @@ void PCM::freezeServerUncoreCounters() } void PCM::unfreezeServerUncoreCounters() { - for (int i = 0; (i < num_sockets) && server_pcicfg_uncore && MSR; ++i) + for (int i = 0; (i < (int)server_pcicfg_uncore.size()) && MSR.size(); ++i) { server_pcicfg_uncore[i]->unfreezeCounters(); MSR[socketRefCore[i]]->write(PCU_MSR_PMON_BOX_CTL_ADDR, PCU_MSR_PMON_BOX_CTL_FRZ_EN); } } -void UncoreCounterState::readAndAggregate(SafeMsrHandle * msr) +void UncoreCounterState::readAndAggregate(std::shared_ptr msr) { TemporalThreadAffinity tempThreadAffinity(msr->getCoreId()); // speedup trick for Linux @@ -2998,13 +2914,13 @@ void UncoreCounterState::readAndAggregate(SafeMsrHandle * msr) SystemCounterState PCM::getSystemCounterState() { SystemCounterState result; - if (MSR) + if (MSR.size()) { // read core and uncore counter state for (int32 core = 0; core < num_cores; ++core) result.readAndAggregate(MSR[core]); - for (int32 s=0; s < num_sockets; s++) + for (uint32 s = 0; s < (uint32)num_sockets; s++) { readAndAggregateUncoreMCCounters(s, result); readAndAggregateEnergyCounters(s, result); @@ -3012,7 +2928,7 @@ SystemCounterState PCM::getSystemCounterState() readQPICounters(result); - result.ThermalHeadroom = PCM_INVALID_THERMAL_HEADROOM; // not available for system + result.ThermalHeadroom = static_cast(PCM_INVALID_THERMAL_HEADROOM); // not available for system } return result; } @@ -3022,7 +2938,7 @@ void PCM::readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType { if (hasPCICFGUncore()) { - if (server_pcicfg_uncore) + if (server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) { server_pcicfg_uncore[socket]->freezeCounters(); result.UncMCNormalReads += server_pcicfg_uncore[socket]->getImcReads(); @@ -3030,7 +2946,7 @@ void PCM::readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType server_pcicfg_uncore[socket]->unfreezeCounters(); } } - else if(clientBW && socket == 0) + else if(clientBW.get() && socket == 0) { result.UncMCNormalReads += clientImcReads->read(); result.UncMCFullWrites += clientImcWrites->read(); @@ -3038,7 +2954,7 @@ void PCM::readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType } else { - SafeMsrHandle * msr = MSR[socketRefCore[socket]]; + std::shared_ptr msr = MSR[socketRefCore[socket]]; TemporalThreadAffinity tempThreadAffinity(socketRefCore[socket]); // speedup trick for Linux switch (cpu_model) { @@ -3078,30 +2994,27 @@ void PCM::readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType template void PCM::readAndAggregateEnergyCounters(const uint32 socket, CounterStateType & result) { - if(socket < snb_energy_status.size()) - result.PackageEnergyStatus += snb_energy_status[socket]->read(); + if(socket < (uint32)energy_status.size()) + result.PackageEnergyStatus += energy_status[socket]->read(); - if(socket < jkt_dram_energy_status.size()) - result.DRAMEnergyStatus += jkt_dram_energy_status[socket]->read(); + if (socket < (uint32)dram_energy_status.size()) + result.DRAMEnergyStatus += dram_energy_status[socket]->read(); } template -void PCM::readAndAggregatePackageCStateResidencies(SafeMsrHandle * msr, CounterStateType & result) +void PCM::readAndAggregatePackageCStateResidencies(std::shared_ptr msr, CounterStateType & result) { // reading package C state counters uint64 cCStateResidency[PCM::MAX_C_STATE + 1]; - memset(&(cCStateResidency[0]), 0, sizeof(cCStateResidency)); + memset(cCStateResidency, 0, sizeof(cCStateResidency)); - for(int i=0; i <= PCM::MAX_C_STATE ;++i) + for(int i=0; i <= int(PCM::MAX_C_STATE) ;++i) if(pkgCStateMsr && pkgCStateMsr[i]) msr->read(pkgCStateMsr[i], &(cCStateResidency[i])); - if (&result!=NULL) { // to make security check happy - - for(int i=0; i <= PCM::MAX_C_STATE ;++i) + for(int i=0; i <= int(PCM::MAX_C_STATE) ;++i) result.CStateResidency[i] += cCStateResidency[i]; - } } void PCM::readQPICounters(SystemCounterState & result) @@ -3203,11 +3116,10 @@ void PCM::readQPICounters(SystemCounterState & result) } else if (hasPCICFGUncore()) { - if (server_pcicfg_uncore) - for (int32 s = 0; (s < num_sockets); ++s) + for (int32 s = 0; (s < (int32)server_pcicfg_uncore.size()); ++s) { server_pcicfg_uncore[s]->freezeCounters(); - for (uint32 port = 0; port < getQPILinksPerSocket(); ++port) + for (uint32 port = 0; port < (uint32)getQPILinksPerSocket(); ++port) { result.incomingQPIPackets[s][port] = server_pcicfg_uncore[s]->getIncomingDataFlits(port) / 8; result.outgoingQPIDataNonDataFlits[s][port] = server_pcicfg_uncore[s]->getOutgoingDataNonDataFlits(port); @@ -3234,7 +3146,7 @@ void PCM::readPackageThermalHeadroom(const uint32 socket, CounterStateType & res SocketCounterState PCM::getSocketCounterState(uint32 socket) { SocketCounterState result; - if (MSR) + if (MSR.size()) { // reading core and uncore counter states for (int32 core = 0; core < num_cores; ++core) @@ -3271,7 +3183,7 @@ void PCM::getAllCounterStates(SystemCounterState & systemState, std::vectorfreezeCounters(); - for(uint32 port=0;port < server_pcicfg_uncore[socket]->getNumQPIPorts();++port) + for(uint32 port=0;port < (uint32)server_pcicfg_uncore[socket]->getNumQPIPorts();++port) { result.QPIClocks[port] = server_pcicfg_uncore[socket]->getQPIClocks(port); result.QPIL0pTxCycles[port] = server_pcicfg_uncore[socket]->getQPIL0pTxCycles(port); result.QPIL1Cycles[port] = server_pcicfg_uncore[socket]->getQPIL1Cycles(port); } - for(uint32 channel=0;channelgetNumMCChannels();++channel) + for (uint32 channel = 0; channel < (uint32)server_pcicfg_uncore[socket]->getNumMCChannels(); ++channel) { result.DRAMClocks[channel] = server_pcicfg_uncore[socket]->getDRAMClocks(channel); for(uint32 cnt=0;cnt<4;++cnt) @@ -3359,7 +3271,7 @@ ServerUncorePowerState PCM::getServerUncorePowerState(uint32 socket) } server_pcicfg_uncore[socket]->unfreezeCounters(); } - if(MSR) + if(MSR.size()) { uint32 refCore = socketRefCore[socket]; TemporalThreadAffinity tempThreadAffinity(refCore); @@ -3443,7 +3355,15 @@ static const uint32 IMC_DEV_IDS[] = { 0x2fd0, 0x2fd1, 0x2fd4, - 0x2fd5 + 0x2fd5, + 0x6fb0, + 0x6fb1, + 0x6fb4, + 0x6fb5, + 0x6fd0, + 0x6fd1, + 0x6fd4, + 0x6fd5 }; std::vector > ServerPCICFGUncore::socket2bus; @@ -3464,8 +3384,8 @@ void ServerPCICFGUncore::initSocket2Bus() #endif - for(uint32 s = 0; s < mcfg.size(); ++s) - for(uint32 bus = mcfg[s].startBusNumber; bus <= mcfg[s].endBusNumber; ++bus) + for(uint32 s = 0; s < (uint32)mcfg.size(); ++s) + for (uint32 bus = (uint32)mcfg[s].startBusNumber; bus <= (uint32)mcfg[s].endBusNumber; ++bus) { uint32 value = 0; try @@ -3483,7 +3403,7 @@ void ServerPCICFGUncore::initSocket2Bus() if (vendor_id != PCM_INTEL_PCI_VENDOR_ID) continue; - for(uint32 i = 0; i< sizeof(IMC_DEV_IDS)/sizeof(IMC_DEV_IDS[0]) ; ++i) + for (uint32 i = 0; i < (uint32)sizeof(IMC_DEV_IDS) / sizeof(IMC_DEV_IDS[0]); ++i) { // match if(IMC_DEV_IDS[i] == device_id) @@ -3496,7 +3416,6 @@ void ServerPCICFGUncore::initSocket2Bus() } } - int getBusFromSocket(const uint32 socket) { int cur_bus = 0; @@ -3521,9 +3440,9 @@ int getBusFromSocket(const uint32 socket) return -1; } -PciHandleM * ServerPCICFGUncore::createIntelPerfMonDevice(uint32 groupnr_, uint32 bus_, uint32 dev_, uint32 func_, bool checkVendor) +PciHandleM * ServerPCICFGUncore::createIntelPerfMonDevice(uint32 groupnr_, int32 bus_, uint32 dev_, uint32 func_, bool checkVendor) { - if (PciHandleM::exists(bus_, dev_, func_)) + if (PciHandleM::exists((uint32)bus_, dev_, func_)) { PciHandleM * handle = new PciHandleM(groupnr_, bus_, dev_, func_); @@ -3543,10 +3462,6 @@ PciHandleM * ServerPCICFGUncore::createIntelPerfMonDevice(uint32 groupnr_, uint3 ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : bus(-1) , groupnr(0) - , imcHandles(NULL) - , num_imc_channels(0) - , qpiLLHandles(NULL) - , num_qpi_ports(0) , qpi_speed(0) , num_imc(0) { @@ -3568,7 +3483,7 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : PCM_PCICFG_MC_INIT(1, 2, JKTIVT) PCM_PCICFG_MC_INIT(1, 3, JKTIVT) } - else if(cpu_model == PCM::HASWELLX) + else if(cpu_model == PCM::HASWELLX || cpu_model == PCM::BDX_DE) { PCM_PCICFG_MC_INIT(0, 0, HSX) PCM_PCICFG_MC_INIT(0, 1, HSX) @@ -3614,14 +3529,12 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : throw std::exception(); } - imcHandles = new PciHandleM *[8]; - { #define PCM_PCICFG_SETUP_MC_HANDLE(controller,channel) \ { \ PciHandleM * handle = createIntelPerfMonDevice(groupnr, bus, \ MCX_CHY_REGISTER_DEV_ADDR[controller][channel], MCX_CHY_REGISTER_FUNC_ADDR[controller][channel], true); \ - if(handle) imcHandles[num_imc_channels++] = handle; \ + if (handle) imcHandles.push_back(std::shared_ptr(handle)); \ } PCM_PCICFG_SETUP_MC_HANDLE(0,0) @@ -3629,30 +3542,28 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : PCM_PCICFG_SETUP_MC_HANDLE(0,2) PCM_PCICFG_SETUP_MC_HANDLE(0,3) - if(num_imc_channels > 0) ++num_imc; // at least one memory controller - const uint32 num_imc_channels1 = num_imc_channels; + if (!imcHandles.empty()) ++num_imc; // at least one memory controller + const size_t num_imc_channels1 = (size_t)imcHandles.size(); PCM_PCICFG_SETUP_MC_HANDLE(1,0) PCM_PCICFG_SETUP_MC_HANDLE(1,1) PCM_PCICFG_SETUP_MC_HANDLE(1,2) PCM_PCICFG_SETUP_MC_HANDLE(1,3) - if(num_imc_channels > num_imc_channels1 ) ++num_imc; // another memory controller found + if ((size_t)imcHandles.size() > num_imc_channels1) ++num_imc; // another memory controller found #undef PCM_PCICFG_SETUP_MC_HANDLE } - if (num_imc_channels == 0) + if (imcHandles.empty()) { - delete [] imcHandles; - imcHandles = NULL; std::cerr << "PCM error: no memory controllers found." << std::endl; throw std::exception(); } - if (num_imc_channels < 3) + if (imcHandles.size() < 3) { - std::cerr << "Intel PCM: warning only " << num_imc_channels << " memory channels detected, must be >= 3." << std::endl; + std::cerr << "Intel PCM: warning only " << imcHandles.size() << " memory channels detected, must be >= 3." << std::endl; } if (total_sockets_ == 1) { @@ -3663,19 +3574,16 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : * eliminates register programming that is not needed since no QPI traffic * is possible with single socket systems. */ - num_qpi_ports = 0; - std::cerr << num_imc<<" memory controllers detected with total number of "<< num_imc_channels <<" channels. " << std::endl; + qpiLLHandles.clear(); + std::cerr << num_imc << " memory controllers detected with total number of " << imcHandles.size() << " channels. " << std::endl; return; } #ifdef PCM_NOQPI - num_qpi_ports = 0; - std::cerr << num_imc<<" memory controllers detected with total number of "<< num_imc_channels <<" channels. " << std::endl; + qpiLLHandles.clear(); + std::cerr << num_imc<<" memory controllers detected with total number of "<< imcHandles.size() <<" channels. " << std::endl; return; #else - qpiLLHandles = new PciHandleM *[3]; - for(uint32 i=0; i<3; ++i) - qpiLLHandles[i] = NULL; #define PCM_PCICFG_QPI_INIT(port, arch) \ QPI_PORTX_REGISTER_DEV_ADDR[port] = arch##_QPI_PORT##port##_REGISTER_DEV_ADDR; \ @@ -3687,7 +3595,7 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : PCM_PCICFG_QPI_INIT(1, JKTIVT); PCM_PCICFG_QPI_INIT(2, JKTIVT); } - else if(cpu_model == PCM::HASWELLX) + else if(cpu_model == PCM::HASWELLX || cpu_model == PCM::BDX_DE) { PCM_PCICFG_QPI_INIT(0, HSX); PCM_PCICFG_QPI_INIT(1, HSX); @@ -3706,110 +3614,95 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, PCM * pcm) : { PciHandleM * handle = createIntelPerfMonDevice(groupnr, bus, QPI_PORTX_REGISTER_DEV_ADDR[0], QPI_PORTX_REGISTER_FUNC_ADDR[0], true); if (handle) - qpiLLHandles[num_qpi_ports++] = handle; + qpiLLHandles.push_back(std::shared_ptr(handle)); else std::cerr << "ERROR: QPI LL monitoring device ("<< groupnr<<":"<(handle)); else std::cerr << "ERROR: QPI LL monitoring device ("<< groupnr<<":"<(handle)); + else { + + if (pcm->getCPUBrandString().find("E7") != std::string::npos) { // Xeon E7 + std::cerr << "ERROR: QPI LL performance monitoring device for the third QPI link was not found on " << pcm->getCPUBrandString() << + " processor in socket " << socket_ << ". Possibly BIOS hides the device. The QPI statistics will be incomplete or missing." << std::endl; + } + } } } catch (...) { std::cerr << "PCM Error: can not create QPI LL handles." <getCPUModel(); + uint32 MCCntConfig[4] = {0,0,0,0}; + switch(cpu_model) { - for(uint32 i=0; igetCPUBrandString() << " model "<< cpu_model << " does not support the requred performance events "<< std::endl; + return; } + programIMC(MCCntConfig); } - void ServerPCICFGUncore::program() { - for (uint32 i = 0; i < num_imc_channels; ++i) - { - // imc PMU - - // freeze enable - imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, MC_CH_PCI_PMON_BOX_CTL_FRZ_EN); - // freeze - imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, MC_CH_PCI_PMON_BOX_CTL_FRZ_EN + MC_CH_PCI_PMON_BOX_CTL_FRZ); - -#ifdef PCM_UNCORE_PMON_BOX_CHECK_STATUS - uint32 val = 0; - imcHandles[i]->read32(MC_CH_PCI_PMON_BOX_CTL_ADDR, &val); - if ((val & UNCORE_PMON_BOX_CTL_VALID_BITS_MASK) != (MC_CH_PCI_PMON_BOX_CTL_FRZ_EN + MC_CH_PCI_PMON_BOX_CTL_FRZ)) - { - std::cerr << "ERROR: IMC counter programming seems not to work. MC_CH" << i << "_PCI_PMON_BOX_CTL=0x" << std::hex << val << std::endl; - std::cerr << " Please see BIOS options to enable the export of performance monitoring devices." << std::endl; - } -#endif - - // enable counter 0 - imcHandles[i]->write32(MC_CH_PCI_PMON_CTL0_ADDR, MC_CH_PCI_PMON_CTL_EN); - - // monitor reads on counter 0: CAS_COUNT.RD - imcHandles[i]->write32(MC_CH_PCI_PMON_CTL0_ADDR, MC_CH_PCI_PMON_CTL_EN + MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(3)); - - // enable counter 1 - imcHandles[i]->write32(MC_CH_PCI_PMON_CTL1_ADDR, MC_CH_PCI_PMON_CTL_EN); - - // monitor writes on counter 1: CAS_COUNT.WR - imcHandles[i]->write32(MC_CH_PCI_PMON_CTL1_ADDR, MC_CH_PCI_PMON_CTL_EN + MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(12)); - - // enable counter 2 - imcHandles[i]->write32(MC_CH_PCI_PMON_CTL2_ADDR, MC_CH_PCI_PMON_CTL_EN); - - // monitor partial writes on counter 2: CAS_COUNT.RD_UNDERFILL - imcHandles[i]->write32(MC_CH_PCI_PMON_CTL2_ADDR, MC_CH_PCI_PMON_CTL_EN + MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(2)); - - // enable fixed counter (DRAM clocks) - imcHandles[i]->write32(MC_CH_PCI_PMON_FIXED_CTL_ADDR, MC_CH_PCI_PMON_FIXED_CTL_EN); - - // reset it - imcHandles[i]->write32(MC_CH_PCI_PMON_FIXED_CTL_ADDR, MC_CH_PCI_PMON_FIXED_CTL_EN + MC_CH_PCI_PMON_FIXED_CTL_RST); - - // reset counters values - imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, MC_CH_PCI_PMON_BOX_CTL_FRZ_EN + MC_CH_PCI_PMON_BOX_CTL_FRZ + MC_CH_PCI_PMON_BOX_CTL_RST_COUNTERS); - - // unfreeze counters - imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, MC_CH_PCI_PMON_BOX_CTL_FRZ_EN); - } + uint32 MCCntConfig[4] = { + MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(3), // monitor reads on counter 0: CAS_COUNT.RD + MC_CH_PCI_PMON_CTL_EVENT(0x04) + MC_CH_PCI_PMON_CTL_UMASK(12), // monitor writes on counter 1: CAS_COUNT.WR + 0, + 0 + }; + programIMC(MCCntConfig); - for (uint32 i = 0; i < num_qpi_ports; ++i) + for (uint32 i = 0; i < (uint32)qpiLLHandles.size(); ++i) { // QPI LL PMU @@ -3864,7 +3757,7 @@ uint64 ServerPCICFGUncore::getImcReads() { uint64 result = 0; - for (uint32 i = 0; i < num_imc_channels; ++i) + for (uint32 i = 0; i < (uint32)imcHandles.size(); ++i) { uint64 value = 0; imcHandles[i]->read64(MC_CH_PCI_PMON_CTR0_ADDR, &value); @@ -3878,7 +3771,7 @@ uint64 ServerPCICFGUncore::getImcWrites() { uint64 result = 0; - for (uint32 i = 0; i < num_imc_channels; ++i) + for (uint32 i = 0; i < (uint32)imcHandles.size(); ++i) { uint64 value = 0; imcHandles[i]->read64(MC_CH_PCI_PMON_CTR1_ADDR, &value); @@ -3892,7 +3785,7 @@ uint64 ServerPCICFGUncore::getIncomingDataFlits(uint32 port) { uint64 drs = 0, ncb = 0; - if (port >= num_qpi_ports) + if (port >= (uint32)qpiLLHandles.size()) return 0; qpiLLHandles[port]->read64(Q_P_PCI_PMON_CTR0_ADDR, &drs); @@ -3908,7 +3801,7 @@ uint64 ServerPCICFGUncore::getOutgoingDataNonDataFlits(uint32 port) void ServerPCICFGUncore::program_power_metrics(int mc_profile) { - for (uint32 i = 0; i < num_qpi_ports; ++i) + for (uint32 i = 0; i < (uint32)qpiLLHandles.size(); ++i) { // QPI LL PMU @@ -3977,7 +3870,12 @@ void ServerPCICFGUncore::program_power_metrics(int mc_profile) break; } - for (uint32 i = 0; i < num_imc_channels; ++i) + programIMC(MCCntConfig); +} + +void ServerPCICFGUncore::programIMC(const uint32 * MCCntConfig) +{ + for (uint32 i = 0; i < (uint32)imcHandles.size(); ++i) { // imc PMU @@ -3990,10 +3888,10 @@ void ServerPCICFGUncore::program_power_metrics(int mc_profile) uint32 val = 0; imcHandles[i]->read32(MC_CH_PCI_PMON_BOX_CTL_ADDR, &val); if ((val & UNCORE_PMON_BOX_CTL_VALID_BITS_MASK) != (MC_CH_PCI_PMON_BOX_CTL_FRZ_EN + MC_CH_PCI_PMON_BOX_CTL_FRZ)) - { + { std::cerr << "ERROR: IMC counter programming seems not to work. MC_CH" << i << "_PCI_PMON_BOX_CTL=0x" << std::hex << val << std::endl; - std::cerr << " Please see BIOS options to enable the export of performance monitoring devices." << std::endl; - } + std::cerr << " Please see BIOS options to enable the export of performance monitoring devices." << std::endl; + } #endif // enable fixed counter (DRAM clocks) @@ -4002,16 +3900,16 @@ void ServerPCICFGUncore::program_power_metrics(int mc_profile) // reset it imcHandles[i]->write32(MC_CH_PCI_PMON_FIXED_CTL_ADDR, MC_CH_PCI_PMON_FIXED_CTL_EN + MC_CH_PCI_PMON_FIXED_CTL_RST); - + imcHandles[i]->write32(MC_CH_PCI_PMON_CTL0_ADDR, MC_CH_PCI_PMON_CTL_EN); imcHandles[i]->write32(MC_CH_PCI_PMON_CTL0_ADDR, MC_CH_PCI_PMON_CTL_EN + MCCntConfig[0]); - + imcHandles[i]->write32(MC_CH_PCI_PMON_CTL1_ADDR, MC_CH_PCI_PMON_CTL_EN); imcHandles[i]->write32(MC_CH_PCI_PMON_CTL1_ADDR, MC_CH_PCI_PMON_CTL_EN + MCCntConfig[1]); - + imcHandles[i]->write32(MC_CH_PCI_PMON_CTL2_ADDR, MC_CH_PCI_PMON_CTL_EN); imcHandles[i]->write32(MC_CH_PCI_PMON_CTL2_ADDR, MC_CH_PCI_PMON_CTL_EN + MCCntConfig[2]); - + imcHandles[i]->write32(MC_CH_PCI_PMON_CTL3_ADDR, MC_CH_PCI_PMON_CTL_EN); imcHandles[i]->write32(MC_CH_PCI_PMON_CTL3_ADDR, MC_CH_PCI_PMON_CTL_EN + MCCntConfig[3]); @@ -4025,11 +3923,11 @@ void ServerPCICFGUncore::program_power_metrics(int mc_profile) void ServerPCICFGUncore::freezeCounters() { - for (uint32 i = 0; i < num_qpi_ports; ++i) + for (size_t i = 0; i < (size_t)qpiLLHandles.size(); ++i) { qpiLLHandles[i]->write32(Q_P_PCI_PMON_BOX_CTL_ADDR, Q_P_PCI_PMON_BOX_CTL_RST_FRZ_EN + Q_P_PCI_PMON_BOX_CTL_RST_FRZ); } - for (uint32 i = 0; i < num_imc_channels; ++i) + for (size_t i = 0; i < (size_t)imcHandles.size(); ++i) { imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, MC_CH_PCI_PMON_BOX_CTL_FRZ_EN + MC_CH_PCI_PMON_BOX_CTL_FRZ); } @@ -4037,11 +3935,11 @@ void ServerPCICFGUncore::freezeCounters() void ServerPCICFGUncore::unfreezeCounters() { - for (uint32 i = 0; i < num_qpi_ports; ++i) + for (size_t i = 0; i < (size_t)qpiLLHandles.size(); ++i) { qpiLLHandles[i]->write32(Q_P_PCI_PMON_BOX_CTL_ADDR, Q_P_PCI_PMON_BOX_CTL_RST_FRZ_EN); } - for (uint32 i = 0; i < num_imc_channels; ++i) + for (size_t i = 0; i < (size_t)imcHandles.size(); ++i) { imcHandles[i]->write32(MC_CH_PCI_PMON_BOX_CTL_ADDR, MC_CH_PCI_PMON_BOX_CTL_FRZ_EN); } @@ -4051,7 +3949,7 @@ uint64 ServerPCICFGUncore::getQPIClocks(uint32 port) { uint64 res = 0; - if (port >= num_qpi_ports) + if (port >= (uint32)qpiLLHandles.size()) return 0; qpiLLHandles[port]->read64(Q_P_PCI_PMON_CTR3_ADDR, &res); @@ -4063,7 +3961,7 @@ uint64 ServerPCICFGUncore::getQPIL0pTxCycles(uint32 port) { uint64 res = 0; - if (port >= num_qpi_ports) + if (port >= (uint32)qpiLLHandles.size()) return 0; qpiLLHandles[port]->read64(Q_P_PCI_PMON_CTR0_ADDR, &res); @@ -4075,7 +3973,7 @@ uint64 ServerPCICFGUncore::getQPIL1Cycles(uint32 port) { uint64 res = 0; - if (port >= num_qpi_ports) + if (port >= (uint32)qpiLLHandles.size()) return 0; qpiLLHandles[port]->read64(Q_P_PCI_PMON_CTR2_ADDR, &res); @@ -4087,7 +3985,7 @@ uint64 ServerPCICFGUncore::getDRAMClocks(uint32 channel) { uint64 result = 0; - if(channel < num_imc_channels) + if (channel < (uint32)imcHandles.size()) imcHandles[channel]->read64(MC_CH_PCI_PMON_FIXED_CTR_ADDR, &result); return result; @@ -4097,7 +3995,7 @@ uint64 ServerPCICFGUncore::getMCCounter(uint32 channel, uint32 counter) { uint64 result = 0; - if(channel < num_imc_channels) + if (channel < (uint32)imcHandles.size()) { switch(counter) { @@ -4123,7 +4021,7 @@ uint64 ServerPCICFGUncore::getQPILLCounter(uint32 port, uint32 counter) { uint64 result = 0; - if(port < num_qpi_ports) + if (port < (uint32)qpiLLHandles.size()) { switch(counter) { @@ -4182,10 +4080,10 @@ void ServerPCICFGUncore::enableJKTWorkaround(bool enable) uint64 ServerPCICFGUncore::computeQPISpeed(const uint32 core_nr, const int cpumodel) { - if(qpi_speed.size()==0) + if(qpi_speed.empty()) { - qpi_speed.resize(num_qpi_ports); - for (uint32 i=0; i((4000000000ULL + ((uint64)value)*800000000ULL)*2ULL); + if(qpi_speed[i] == 0ULL) { std::cerr << "Warning: QPI_RATE_STATUS register is not available on port "<< i <<". Computing QPI speed using a measurement loop." << std::endl; @@ -4204,7 +4102,7 @@ uint64 ServerPCICFGUncore::computeQPISpeed(const uint32 core_nr, const int cpumo const uint64 timerGranularity = 1000000ULL; // mks PCM * pcm = PCM::getInstance(); - uint64 startClocks = getQPIClocks(i); + uint64 startClocks = getQPIClocks((uint32)i); uint64 startTSC = pcm->getTickCount(timerGranularity, core_nr); uint64 endTSC; do @@ -4212,10 +4110,10 @@ uint64 ServerPCICFGUncore::computeQPISpeed(const uint32 core_nr, const int cpumo endTSC = pcm->getTickCount(timerGranularity, core_nr); } while (endTSC - startTSC < 200000ULL); // spin for 200 ms - uint64 endClocks = getQPIClocks(i); + uint64 endClocks = getQPIClocks((uint32)i); qpi_speed[i] = ((std::max)((endClocks - startClocks) * 16ULL * timerGranularity / (endTSC - startTSC),0ULL)); - if(cpumodel == PCM::HASWELLX) { + if(cpumodel == PCM::HASWELLX || cpumodel == PCM::BDX_DE) /* XXX:BDX_DE does not have QPI. */{ qpi_speed[i] /=2; // HSX runs QPI clocks with doubled speed } } @@ -4232,7 +4130,7 @@ uint64 ServerPCICFGUncore::computeQPISpeed(const uint32 core_nr, const int cpumo } #ifdef _MSC_VER -DWORD WINAPI WatchDogProc(LPVOID state) +static DWORD WINAPI WatchDogProc(LPVOID state) #else void * WatchDogProc(void * state) #endif @@ -4250,26 +4148,26 @@ void * WatchDogProc(void * state) return NULL; } -uint32 PCM::CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const +uint64 PCM::CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const { if(JAKETOWN == cpu_model || IVYTOWN == cpu_model) { return JKT_C0_MSR_PMON_CTR0 + ((JKTIVT_CBO_MSR_STEP)*Cbo) + Ctr; - } else if(HASWELLX == cpu_model) + } else if(HASWELLX == cpu_model || BDX_DE == cpu_model) { return HSX_C0_MSR_PMON_CTR0 + ((HSX_CBO_MSR_STEP)*Cbo) + Ctr; } return 0; } -uint32 PCM::CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const +uint64 PCM::CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const { if(JAKETOWN == cpu_model || IVYTOWN == cpu_model) { return JKT_C0_MSR_PMON_BOX_FILTER + ((JKTIVT_CBO_MSR_STEP)*Cbo); - } else if(HASWELLX == cpu_model) + } else if(HASWELLX == cpu_model || BDX_DE == cpu_model) { return HSX_C0_MSR_PMON_BOX_FILTER + ((HSX_CBO_MSR_STEP)*Cbo); } @@ -4277,39 +4175,39 @@ uint32 PCM::CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const return 0; } -uint32 PCM::CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const +uint64 PCM::CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const { if(IVYTOWN == cpu_model) { return IVT_C0_MSR_PMON_BOX_FILTER1 + ((JKTIVT_CBO_MSR_STEP)*Cbo); - } else if(HASWELLX == cpu_model) + } else if(HASWELLX == cpu_model || BDX_DE == cpu_model) { return HSX_C0_MSR_PMON_BOX_FILTER1 + ((HSX_CBO_MSR_STEP)*Cbo); } return 0; } -uint32 PCM::CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const +uint64 PCM::CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const { if(JAKETOWN == cpu_model || IVYTOWN == cpu_model) { return JKT_C0_MSR_PMON_CTL0 + ((JKTIVT_CBO_MSR_STEP)*Cbo) + Ctl; - } else if(HASWELLX == cpu_model) + } else if(HASWELLX == cpu_model || BDX_DE == cpu_model) { return HSX_C0_MSR_PMON_CTL0 + ((HSX_CBO_MSR_STEP)*Cbo) + Ctl; } return 0; } -uint32 PCM::CX_MSR_PMON_BOX_CTL(uint32 Cbo) const +uint64 PCM::CX_MSR_PMON_BOX_CTL(uint32 Cbo) const { if(JAKETOWN == cpu_model || IVYTOWN == cpu_model) { return JKT_C0_MSR_PMON_BOX_CTL + ((JKTIVT_CBO_MSR_STEP)*Cbo); - } else if(HASWELLX == cpu_model) + } else if(HASWELLX == cpu_model || BDX_DE == cpu_model) { return HSX_C0_MSR_PMON_BOX_CTL + ((HSX_CBO_MSR_STEP)*Cbo); } @@ -4325,18 +4223,18 @@ uint32 PCM::getMaxNumOfCBoxes() const * the number of physical cores per socket which is the expected * value to be returned. */ - return num_phys_cores_per_socket; + return (uint32)num_phys_cores_per_socket; } return 0; } -void PCM::programCboOpcodeFilter(const uint32 opc, const uint32 cbo, SafeMsrHandle * msr) +void PCM::programCboOpcodeFilter(const uint32 opc, const uint32 cbo, std::shared_ptr msr) { if(JAKETOWN == cpu_model) { msr->write(CX_MSR_PMON_BOX_FILTER(cbo), JKT_CBO_MSR_PMON_BOX_FILTER_OPC(opc)); - } else if(IVYTOWN == cpu_model || HASWELLX == cpu_model) + } else if(IVYTOWN == cpu_model || HASWELLX == cpu_model || BDX_DE == cpu_model) { msr->write(CX_MSR_PMON_BOX_FILTER1(cbo), IVTHSX_CBO_MSR_PMON_BOX_FILTER1_OPC(opc)); } @@ -4349,7 +4247,7 @@ void PCM::programPCIeMissCounters(const PCM::PCIeEventCode event_, const uint32 void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_, const uint32 miss_) { - for (int32 i = 0; (i < num_sockets) && MSR; ++i) + for (int32 i = 0; (i < num_sockets) && MSR.size(); ++i) { uint32 refCore = socketRefCore[i]; TemporalThreadAffinity tempThreadAffinity(refCore); // speedup trick for Linux @@ -4371,9 +4269,9 @@ void PCM::programPCIeCounters(const PCM::PCIeEventCode event_, const uint32 tid_ } #endif - programCboOpcodeFilter(event_, cbo, MSR[refCore]); + programCboOpcodeFilter((uint32)event_, cbo, MSR[refCore]); - if(HASWELLX == cpu_model && tid_ != 0) + if((HASWELLX == cpu_model || BDX_DE == cpu_model) && tid_ != 0) MSR[refCore]->write(CX_MSR_PMON_BOX_FILTER(cbo), tid_); MSR[refCore]->write(CX_MSR_PMON_CTLY(cbo, 0), CBO_MSR_PMON_CTL_EN); diff --git a/cpucounters.h b/cpucounters.h index b89d74b..a74062a 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -22,7 +22,9 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND Include this header file if you want to access CPU counters (core and uncore - including memory controller chips and QPI) */ -#define INTEL_PCM_VERSION "V2.8 (2014-12-18 12:52:39 +0100 ID=ba39a89)" +#define INTEL_PCM_VERSION "V2.9 (2015-08-07 10:23:17 +0200 ID=721d9e3)" + +#define INTEL_PCM_COPYRIGHT " Copyright (c) 2009-2015 Intel Corporation" #ifndef INTELPCM_API #define INTELPCM_API @@ -36,6 +38,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #include #include +#include #include #ifdef PCM_USE_PERF @@ -79,12 +82,10 @@ struct INTELPCM_API TopologyEntry // decribes a core //! Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP (Jaketown) or Ivytown-EP or Ivytown-EX class ServerPCICFGUncore { - int bus, groupnr; - PciHandleM ** imcHandles; - uint32 num_imc_channels; - - PciHandleM ** qpiLLHandles; - uint32 num_qpi_ports; + int32 bus; + uint32 groupnr; + std::vector > imcHandles; + std::vector > qpiLLHandles; std::vector qpi_speed; uint32 num_imc; uint32 MCX_CHY_REGISTER_DEV_ADDR[2][4]; @@ -97,7 +98,9 @@ class ServerPCICFGUncore ServerPCICFGUncore(); // forbidden ServerPCICFGUncore(ServerPCICFGUncore &); // forbidden - PciHandleM * createIntelPerfMonDevice(uint32 groupnr, uint32 bus, uint32 dev, uint32 func, bool checkVendor = false); + ServerPCICFGUncore & operator =(const ServerPCICFGUncore &); // forbidden + PciHandleM * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func, bool checkVendor = false); + void programIMC(const uint32 * MCCntConfig); public: //! \brief Initialize access data structures @@ -125,6 +128,11 @@ class ServerPCICFGUncore //! \param mc_profile memory controller measurement profile. See description of profiles in pcm-power.cpp void program_power_metrics(int mc_profile); + //! \brief Program memory counters (disables programming performance counters) + //! \param rankA count DIMM rank1 statistics (disables memory channel monitoring) + //! \param rankB count DIMM rank2 statistics (disables memory channel monitoring) + void programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1); + //! \brief Get number of QPI LL clocks on a QPI port //! \param port QPI port number uint64 getQPIClocks(uint32 port); @@ -159,7 +167,7 @@ class ServerPCICFGUncore void enableJKTWorkaround(bool enable); //! \brief Returns the number of detected QPI ports - uint32 getNumQPIPorts() const { return num_qpi_ports; } + size_t getNumQPIPorts() const { return (size_t)qpiLLHandles.size(); } //! \brief Returns the speed of the QPI link uint64 getQPILinkSpeed(const uint32 linkNr) const { @@ -171,7 +179,7 @@ class ServerPCICFGUncore { std::cerr.precision(1); std::cerr << std::fixed; - for (uint32 i=0; i > MSR; + std::vector > server_pcicfg_uncore; + uint64 PCU_MSR_PMON_BOX_CTL_ADDR, PCU_MSR_PMON_CTRX_ADDR[4]; double joulesPerEnergyUnit; - std::vector snb_energy_status; - std::vector jkt_dram_energy_status; + std::vector > energy_status; + std::vector > dram_energy_status; - ClientBW * clientBW; - CounterWidthExtender * clientImcReads; - CounterWidthExtender * clientImcWrites; - CounterWidthExtender * clientIoRequests; + std::shared_ptr clientBW; + std::shared_ptr clientImcReads; + std::shared_ptr clientImcWrites; + std::shared_ptr clientIoRequests; bool disable_JKT_workaround; bool blocked; // track if time-driven counter update is running or not: PCM is blocked @@ -275,13 +283,13 @@ class INTELPCM_API PCM if (state == 0 || state == 1) return true; - return (coreCStateMsr != NULL && state <= MAX_C_STATE && coreCStateMsr[state] != 0); + return (coreCStateMsr != NULL && state <= ((int)MAX_C_STATE) && coreCStateMsr[state] != 0); } //! \brief Returns true if the specified package C-state residency metric is supported bool isPackageCStateResidencySupported(int state) { - return (pkgCStateMsr != NULL && state <= MAX_C_STATE && pkgCStateMsr[state] != 0); + return (pkgCStateMsr != NULL && state <= ((int)MAX_C_STATE) && pkgCStateMsr[state] != 0); } //! \brief Redirects output destination to provided file, instead of std::cout @@ -354,6 +362,10 @@ class INTELPCM_API PCM uint32 nGPCounters; // number of general purpose counters EventSelectRegister * gpCounterCfg; // general purpose counters, if NULL, then default configuration performed for GP counters uint64 OffcoreResponseMsrValue[2]; + ExtendedCustomCoreEventDescription() : fixedCfg(NULL), nGPCounters(0), gpCounterCfg(NULL) { + OffcoreResponseMsrValue[0] = 0; + OffcoreResponseMsrValue[1] = 0; + } }; private: @@ -417,7 +429,7 @@ class INTELPCM_API PCM void initCStateSupportTables(); bool discoverSystemTopology(); void printSystemTopology() const; - void initMSR(); + bool initMSR(); bool detectNominalFrequency(); void initEnergyMonitoring(); void initUncoreObjects(); @@ -437,17 +449,17 @@ class INTELPCM_API PCM template void readPackageThermalHeadroom(const uint32 socket, CounterStateType & counterState); template - void readAndAggregatePackageCStateResidencies(SafeMsrHandle * msr, CounterStateType & result); + void readAndAggregatePackageCStateResidencies(std::shared_ptr msr, CounterStateType & result); void readQPICounters(SystemCounterState & counterState); void reportQPISpeed() const; - uint32 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const; - uint32 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const; - uint32 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const; - uint32 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const; - uint32 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const; + uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const; + uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const; + uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const; + uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const; + uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const; uint32 getMaxNumOfCBoxes() const; - void programCboOpcodeFilter(const uint32 opc, const uint32 cbo, SafeMsrHandle * msr); + void programCboOpcodeFilter(const uint32 opc, const uint32 cbo, std::shared_ptr msr); public: /*! @@ -508,21 +520,34 @@ class INTELPCM_API PCM */ ErrorCode program(const ProgramMode mode_ = DEFAULT_EVENTS, const void * parameter_ = NULL); // program counters and start counting - /*! \brief Programs uncore power/energy counters on microarchitectures codename SandyBridge-EP and IvyTown + /*! \brief Programs uncore power/energy counters on microarchitectures codename SandyBridge-EP and later Xeon uarch \param mc_profile profile for integrated memory controller PMU. See possible profile values in pcm-power.cpp example \param pcu_profile profile for power control unit PMU. See possible profile values in pcm-power.cpp example \param freq_bands array of three integer values for core frequency band monitoring. See usage in pcm-power.cpp example - Call this method before you start using the power counter routines on microarchitecture codename SandyBridge-EP + Call this method before you start using the power counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch - \warning After this call the memory and QPI bandwidth counters on microarchitecture codename SandyBridge-EP will not work. + \warning After this call the memory and QPI bandwidth counters on microarchitecture codename SandyBridge-EP and later Xeon uarch will not work. \warning Using this routines with other tools that *program* Performance Monitoring Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc. */ ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands = NULL); - + + /*! \brief Programs uncore memory counters on microarchitectures codename SandyBridge-EP and later Xeon uarch + \param rankA count DIMM rank1 statistics (disables memory channel monitoring) + \param rankB count DIMM rank2 statistics (disables memory channel monitoring) + + Call this method before you start using the memory counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch + + \warning Using this routines with other tools that *program* Performance Monitoring + Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to + program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make + VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc. + */ + ErrorCode programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1); + //! \brief Freezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown) void freezeServerUncoreCounters(); @@ -636,6 +661,7 @@ class INTELPCM_API PCM ATOM_CENTERTON = 54, ATOM_BAYTRAIL = 55, ATOM_AVOTON = 77, + ATOM_CHERRYTRAIL = 76, CLARKDALE = 37, WESTMERE_EP = 44, NEHALEM_EX = 46, @@ -649,23 +675,26 @@ class INTELPCM_API PCM IVYTOWN = 62, HASWELLX = 63, BROADWELL = 61, + BROADWELL_XEON_E3 = 71, + BDX_DE = 86, + SKL = 94, END_OF_MODEL_LIST = 0x0ffff }; //! \brief Reads CPU model id //! \return CPU model ID - uint32 getCPUModel() { return cpu_model; } + uint32 getCPUModel() { return (uint32)cpu_model; } //! \brief Reads original CPU model id //! \return CPU model ID - uint32 getOriginalCPUModel() { return original_cpu_model; } + uint32 getOriginalCPUModel() { return (uint32)original_cpu_model; } //! \brief Determines socket of given core //! \param core_id core identifier //! \return socket identifier int32 getSocketId(uint32 core_id) { - return topology[core_id].socket; + return (int32)topology[core_id].socket; } //! \brief Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket @@ -687,7 +716,8 @@ class INTELPCM_API PCM case JAKETOWN: case IVYTOWN: case HASWELLX: - return (server_pcicfg_uncore && server_pcicfg_uncore[0])?(server_pcicfg_uncore[0]->getNumQPIPorts()):0; + case BDX_DE: + return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get())?(server_pcicfg_uncore[0]->getNumQPIPorts()):0; } return 0; } @@ -707,13 +737,14 @@ class INTELPCM_API PCM case JAKETOWN: case IVYTOWN: case HASWELLX: - return (server_pcicfg_uncore && server_pcicfg_uncore[0])?(server_pcicfg_uncore[0]->getNumMC()):0; + case BDX_DE: + return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get())?(server_pcicfg_uncore[0]->getNumMC()):0; } return 0; } //! \brief Returns the total number of detected memory channels on all integrated memory controllers per socket - uint32 getMCChannelsPerSocket() const + size_t getMCChannelsPerSocket() const { switch (cpu_model) { @@ -727,7 +758,8 @@ class INTELPCM_API PCM case JAKETOWN: case IVYTOWN: case HASWELLX: - return (server_pcicfg_uncore && server_pcicfg_uncore[0])?(server_pcicfg_uncore[0]->getNumMCChannels()):0; + case BDX_DE: + return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get())?(server_pcicfg_uncore[0]->getNumMCChannels()):0; } return 0; } @@ -750,6 +782,8 @@ class INTELPCM_API PCM case HASWELL: case HASWELLX: case BROADWELL: + case BDX_DE: + case SKL: return 4; case ATOM: return 2; @@ -766,6 +800,7 @@ class INTELPCM_API PCM case IVYTOWN: return 800000000ULL; // 800 MHz case HASWELLX: + case BDX_DE: return 1000000000ULL; // 1 GHz } return 0; @@ -856,6 +891,7 @@ class INTELPCM_API PCM //! \brief Get Brand string of processor static std::string getCPUBrandString(); + std::string getCPUFamilyModelString(); bool packageEnergyMetricsAvailable() const { @@ -866,8 +902,12 @@ class INTELPCM_API PCM || cpu_model == PCM::IVY_BRIDGE || cpu_model == PCM::HASWELL || original_cpu_model == PCM::ATOM_AVOTON + || original_cpu_model == PCM::ATOM_CHERRYTRAIL + || original_cpu_model == PCM::ATOM_BAYTRAIL || cpu_model == PCM::HASWELLX || cpu_model == PCM::BROADWELL + || cpu_model == PCM::BDX_DE + || cpu_model == PCM::SKL ); } @@ -877,6 +917,7 @@ class INTELPCM_API PCM cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN || cpu_model == PCM::HASWELLX + || cpu_model == PCM::BDX_DE ); } @@ -893,6 +934,7 @@ class INTELPCM_API PCM || cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN || cpu_model == PCM::HASWELLX + || cpu_model == PCM::BDX_DE ); } @@ -916,6 +958,7 @@ class INTELPCM_API PCM || cpu_model == PCM::IVY_BRIDGE || cpu_model == PCM::HASWELL || cpu_model == PCM::BROADWELL + || cpu_model == PCM::SKL ); } @@ -932,9 +975,17 @@ class INTELPCM_API PCM cpu_model == PCM::JAKETOWN || cpu_model == PCM::IVYTOWN || cpu_model == PCM::HASWELLX + || cpu_model == PCM::BDX_DE ); } + bool supportsHLE() const; + bool supportsRTM() const; + + bool useSkylakeEvents() const { + return PCM::SKL == cpu_model; + } + ~PCM(); }; @@ -1008,10 +1059,12 @@ class BasicCounterState uint64 L3UnsharedHit; uint64 Event1; uint64 ArchLLCRef; + uint64 SKLL3Hit; }; union { uint64 L2HitM; uint64 Event2; + uint64 SKLL2Miss; }; union { uint64 L2Hit; @@ -1021,7 +1074,7 @@ class BasicCounterState uint64 CStateResidency[PCM::MAX_C_STATE + 1]; int32 ThermalHeadroom; uint64 L3Occupancy; - void readAndAggregate(SafeMsrHandle *); + void readAndAggregate(std::shared_ptr); public: BasicCounterState() : InstRetiredAny(0) @@ -1035,7 +1088,7 @@ class BasicCounterState , ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM) , L3Occupancy(0) { - memset(&(CStateResidency[0]), 0, sizeof(CStateResidency)); + memset(CStateResidency, 0, sizeof(CStateResidency)); } virtual ~BasicCounterState() { } @@ -1049,7 +1102,7 @@ class BasicCounterState Event2 += o.Event2; Event3 += o.Event3; InvariantTSC += o.InvariantTSC; - for(int i=0; i <= PCM::MAX_C_STATE ;++i) + for(int i=0; i <= (int)PCM::MAX_C_STATE ;++i) CStateResidency[i] += o.CStateResidency[i]; // ThermalHeadroom is not accumulative L3Occupancy += o.L3Occupancy; @@ -1212,7 +1265,9 @@ double getDRAMConsumedJoules(const CounterStateType & before, const CounterState if(!m) return -1.; double dram_joules_per_energy_unit; - if(PCM::HASWELLX == m->getCPUModel()) { + if( PCM::HASWELLX == m->getCPUModel() + || PCM::BDX_DE == m->getCPUModel() + ) { /* as described in sections 5.3.2 (DRAM_POWER_INFO) and 5.3.3 (DRAM_ENERGY_STATUS) of * Volume 2 (Registers) of * Intel Xeon E5-1600 v3 and Intel Xeon E5-2600 v3 (Haswell-EP) Datasheet (Ref 330784-001, Sept.2014) @@ -1254,7 +1309,7 @@ class UncoreCounterState uint64 PackageEnergyStatus; uint64 DRAMEnergyStatus; uint64 CStateResidency[PCM::MAX_C_STATE + 1]; - void readAndAggregate(SafeMsrHandle *); + void readAndAggregate(std::shared_ptr); public: UncoreCounterState() : UncMCFullWrites(0) @@ -1263,7 +1318,7 @@ class UncoreCounterState , PackageEnergyStatus(0) , DRAMEnergyStatus(0) { - memset(&(CStateResidency[0]), 0, sizeof(CStateResidency)); + memset(CStateResidency, 0, sizeof(CStateResidency)); } virtual ~UncoreCounterState() { } @@ -1274,7 +1329,7 @@ class UncoreCounterState UncMCIORequests += o.UncMCIORequests; PackageEnergyStatus += o.PackageEnergyStatus; DRAMEnergyStatus += o.DRAMEnergyStatus; - for(int i=0; i <= PCM::MAX_C_STATE ;++i) + for(int i=0; i <= (int)PCM::MAX_C_STATE ;++i) CStateResidency[i] += o.CStateResidency[i]; return *this; } @@ -1341,7 +1396,7 @@ class SocketCounterState : public BasicCounterState, public UncoreCounterState friend class PCM; protected: - void readAndAggregate(SafeMsrHandle * handle) + void readAndAggregate(std::shared_ptr handle) { BasicCounterState::readAndAggregate(handle); UncoreCounterState::readAndAggregate(handle); @@ -1364,7 +1419,7 @@ class SystemCounterState : public BasicCounterState, public UncoreCounterState uint64 uncoreTSC; protected: - void readAndAggregate(SafeMsrHandle * handle) + void readAndAggregate(std::shared_ptr handle) { BasicCounterState::readAndAggregate(handle); UncoreCounterState::readAndAggregate(handle); @@ -1390,7 +1445,6 @@ class SystemCounterState : public BasicCounterState, public UncoreCounterState void accumulateSocketState(const SocketCounterState & o) { - if (&o != NULL) // security check requirement { BasicCounterState::operator += (o); UncoreCounterState::operator += (o); @@ -1667,7 +1721,7 @@ double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const Coun template double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0 { - if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return -1; + if (PCM::getInstance()->getCPUModel() == PCM::ATOM || PCM::getInstance()->useSkylakeEvents()) return -1; int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread; if (clocks != 0) { @@ -1688,11 +1742,21 @@ double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const Coun template double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0 { + if (PCM::getInstance()->useSkylakeEvents()) { + uint64 L2Hit = after.L2Hit - before.L2Hit; + uint64 L2Ref = L2Hit + after.SKLL2Miss - before.SKLL2Miss; + if (L2Ref) { + return double(L2Hit) / double(L2Ref); + } + return 1; + } if (PCM::getInstance()->getCPUModel() == PCM::ATOM) { uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss; uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef; - if (L2Ref) return 1. - (double(L2Miss) / double(L2Ref)); + if (L2Ref) { + return 1. - (double(L2Miss) / double(L2Ref)); + } return 1; } uint64 L3Miss = after.L3Miss - before.L3Miss; @@ -1716,6 +1780,15 @@ double getL2CacheHitRatio(const CounterStateType & before, const CounterStateTyp template double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0 { + if (PCM::getInstance()->useSkylakeEvents()) { + uint64 L3Hit = after.SKLL3Hit - before.SKLL3Hit; + uint64 L3Ref = L3Hit + after.L3Miss - before.L3Miss; + if (L3Ref) { + return double(L3Hit) / double(L3Ref); + } + return 1; + } + if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return -1; uint64 L3Miss = after.L3Miss - before.L3Miss; @@ -1752,6 +1825,9 @@ uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType template uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after) { + if (PCM::getInstance()->useSkylakeEvents()) { + return after.SKLL2Miss - before.SKLL2Miss; + } if (PCM::getInstance()->getCPUModel() == PCM::ATOM) { return after.ArchLLCMiss - before.ArchLLCMiss; @@ -1787,9 +1863,7 @@ uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & template uint64 getL3CacheOccupancy(const CounterStateType & now) { - return now.L3Occupancy ; - } /*! \brief Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done @@ -1802,7 +1876,7 @@ uint64 getL3CacheOccupancy(const CounterStateType & now) template uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after) { - if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return 0; + if (PCM::getInstance()->getCPUModel() == PCM::ATOM || PCM::getInstance()->useSkylakeEvents()) return 0; return after.L3UnsharedHit - before.L3UnsharedHit; } @@ -1816,6 +1890,9 @@ uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterState template uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after) { + if (PCM::getInstance()->useSkylakeEvents()) { + return after.SKLL3Hit - before.SKLL3Hit; + } if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return 0; return after.L2HitM - before.L2HitM; } diff --git a/gen_new_win_project.sh b/gen_new_win_project.sh deleted file mode 100644 index 6e1115d..0000000 --- a/gen_new_win_project.sh +++ /dev/null @@ -1,15 +0,0 @@ - -DIRNAME=PCM-NUMA_Win -UTILNAME=pcm-numa - -rm -rf $DIRNAME - -mkdir $DIRNAME -cp PCM-Power_Win/stdafx.h $DIRNAME/stdafx.h -cp PCM-Power_Win/stdafx.cpp $DIRNAME/stdafx.cpp - -sed 's/pcm-power/'$UTILNAME'/g' PCM-Power_Win/pcm-power-win.cpp > $DIRNAME/$UTILNAME-win.cpp - -sed 's/pcm-power/'$UTILNAME'/g' PCM-Power_Win/pcm-power-win.vcproj > $DIRNAME/$UTILNAME-win.vcproj - - diff --git a/intelpcm.so/Makefile b/intelpcm.so/Makefile index 7ad56a0..40e0fa4 100644 --- a/intelpcm.so/Makefile +++ b/intelpcm.so/Makefile @@ -6,7 +6,7 @@ OPT= -g -O3 CXXFLAGS+= -Wall -fPIC $(OPT) vpath %.cpp .. -libintelpcm.so: msr.o cpucounters.o pci.o client_bw.o +libintelpcm.so: msr.o cpucounters.o pci.o client_bw.o utils.o $(CXX) $(CXXFLAGS) -shared $^ -lpthread -o $@ clean: diff --git a/msr.cpp b/msr.cpp index cf94be9..7354a61 100644 --- a/msr.cpp +++ b/msr.cpp @@ -14,10 +14,11 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND // Austen Ott // Jim Harris (FreeBSD) +#include +#include #include #include #include -#include #ifndef _MSC_VER #include #endif @@ -30,7 +31,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #include "utils.h" #include "Winmsrdriver\win7\msrstruct.h" -#include "winring0/OlsDef.h" #include "winring0/OlsApiInitExt.h" extern HMODULE hOpenLibSys; @@ -67,12 +67,8 @@ int32 MsrHandle::write(uint64 msr_number, uint64 value) cvt_ds cvt; cvt.ui64 = value; - #ifdef COMPILE_FOR_WINDOWS_7 ThreadGroupTempAffinity affinity(cpu_id); - BOOL status = Wrmsr((DWORD)msr_number, cvt.ui32.low, cvt.ui32.high); - #else - BOOL status = WrmsrTx((DWORD)msr_number, cvt.ui32.low, cvt.ui32.high,(1UL << cpu_id)); - #endif + DWORD status = Wrmsr((DWORD)msr_number, cvt.ui32.low, cvt.ui32.high); return status?sizeof(uint64):0; } @@ -88,18 +84,14 @@ int32 MsrHandle::read(uint64 msr_number, uint64 * value) req.msr_address = msr_number; BOOL status = DeviceIoControl(hDriver, IO_CTL_MSR_READ, &req, sizeof(MSR_Request), value, sizeof(uint64), &reslength, NULL); assert(status && "Error in DeviceIoControl"); - return reslength; + return (int32)reslength; } cvt_ds cvt; cvt.ui64 = 0; - #ifdef COMPILE_FOR_WINDOWS_7 ThreadGroupTempAffinity affinity(cpu_id); - BOOL status = Rdmsr((DWORD)msr_number, &(cvt.ui32.low), &(cvt.ui32.high)); - #else - BOOL status = RdmsrTx((DWORD)msr_number, &(cvt.ui32.low), &(cvt.ui32.high), (1UL << cpu_id)); - #endif + DWORD status = Rdmsr((DWORD)msr_number, &(cvt.ui32.low), &(cvt.ui32.high)); if(status) *value = cvt.ui64; diff --git a/msr.h b/msr.h index fb34238..4e98116 100644 --- a/msr.h +++ b/msr.h @@ -30,6 +30,8 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #endif +#include "mutex.h" +#include class MsrHandle { @@ -43,13 +45,14 @@ class MsrHandle #endif uint32 cpu_id; MsrHandle(); // forbidden - MsrHandle(MsrHandle &); // forbidden + MsrHandle(const MsrHandle &); // forbidden + MsrHandle & operator = (const MsrHandle &); // forbidden public: MsrHandle(uint32 cpu); int32 read(uint64 msr_number, uint64 * value); int32 write(uint64 msr_number, uint64 value); - int32 getCoreId() { return cpu_id; } + int32 getCoreId() { return (int32)cpu_id; } #ifdef __APPLE__ int32 buildTopology(uint32 num_cores, void*); uint32 getNumInstances(); @@ -61,16 +64,17 @@ class MsrHandle class SafeMsrHandle { - MsrHandle * pHandle; + std::shared_ptr pHandle; + PCM_Util::Mutex mutex; - SafeMsrHandle(SafeMsrHandle &); // forbidden + SafeMsrHandle(const SafeMsrHandle &); // forbidden + SafeMsrHandle& operator = (const SafeMsrHandle &); // forbidden public: - SafeMsrHandle() : pHandle(NULL) {} + SafeMsrHandle() {} - SafeMsrHandle(uint32 core_id) + SafeMsrHandle(uint32 core_id) : pHandle(new MsrHandle(core_id)) { - pHandle = new MsrHandle(core_id); } int32 read(uint64 msr_number, uint64 * value) @@ -80,7 +84,7 @@ class SafeMsrHandle *value = 0; - return sizeof(uint64); + return (int32)sizeof(uint64); } int32 write(uint64 msr_number, uint64 value) @@ -88,7 +92,7 @@ class SafeMsrHandle if(pHandle) return pHandle->write(msr_number, value); - return sizeof(uint64); + return (int32)sizeof(uint64); } int32 getCoreId() { @@ -98,6 +102,15 @@ class SafeMsrHandle throw std::exception(); return -1; } + + void lock() { + mutex.lock(); + } + + void unlock() { + mutex.unlock(); + } + #ifdef __APPLE__ int32 buildTopology(uint32 num_cores, void* p) { @@ -134,11 +147,6 @@ class SafeMsrHandle #endif virtual ~SafeMsrHandle() { - if(pHandle) - { - delete pHandle; - pHandle = NULL; - } } }; diff --git a/mutex.h b/mutex.h new file mode 100644 index 0000000..1d4a263 --- /dev/null +++ b/mutex.h @@ -0,0 +1,56 @@ +#ifndef MUTEX_HEADER_ +#define MUTEX_HEADER_ + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include + +namespace PCM_Util { + + class Mutex { + +#ifdef _MSC_VER + HANDLE mutex_; +#else + pthread_mutex_t mutex_; +#endif + public: + Mutex() { +#ifdef _MSC_VER + mutex_ = CreateMutex(NULL, FALSE, NULL); +#else + pthread_mutex_init(&mutex_, NULL); +#endif + } + virtual ~Mutex() { +#ifdef _MSC_VER + CloseHandle(mutex_); +#else + pthread_mutex_destroy(&mutex_); +#endif + } + + void lock() { +#ifdef _MSC_VER + WaitForSingleObject(mutex_, INFINITE); +#else + pthread_mutex_lock(&mutex_); +#endif + } + void unlock() { +#ifdef _MSC_VER + ReleaseMutex(mutex_); +#else + pthread_mutex_unlock(&mutex_); +#endif + } + }; + +}; + +#endif + diff --git a/pci.cpp b/pci.cpp index 76eebb1..593625a 100644 --- a/pci.cpp +++ b/pci.cpp @@ -16,14 +16,10 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND // Jim Harris (FreeBSD) #include -#include -#include +#include #include #include #include -#include -#include -#include #include "pci.h" #ifndef _MSC_VER @@ -80,25 +76,25 @@ int32 PciHandle::read32(uint64 offset, uint32 * value) PCICFG_Request req; ULONG64 result = 0; DWORD reslength = 0; - req.bus = bus; - req.dev = device; - req.func = function; + req.bus = (ULONG)bus; + req.dev = (ULONG)device; + req.func = (ULONG)function; req.bytes = sizeof(uint32); - req.reg = (uint32)offset; + req.reg = (ULONG)offset; - BOOL status = DeviceIoControl(hDriver, IO_CTL_PCICFG_READ, &req, sizeof(PCICFG_Request), &result, sizeof(uint64), &reslength, NULL); + BOOL status = DeviceIoControl(hDriver, IO_CTL_PCICFG_READ, &req, (DWORD)sizeof(PCICFG_Request), &result, (DWORD)sizeof(uint64), &reslength, NULL); *value = (uint32)result; if (!status) { //std::cerr << "Error reading PCI Config space at bus "< +#ifdef _MSC_VER +#pragma warning(disable : 4996) // for sprintf +#define strtok_r strtok_s +#include +#include "../PCM_Win/windriver.h" +#else +#include +#include +#include // for gettimeofday() +#endif +#include +#include +#include +#include +#include +#include +#include +#include "cpucounters.h" +#include "utils.h" +#ifdef _MSC_VER +#include "freegetopt/getopt.h" +#endif + +#include +#define PCM_DELAY_DEFAULT 1.0 // in seconds +#define PCM_DELAY_MIN 0.015 // 15 milliseconds is practical on most modern CPUs +#define PCM_CALIBRATION_INTERVAL 50 // calibrate clock only every 50th iteration + +using namespace std; + +struct CoreEvent +{ + char name[256]; + uint64 value; + uint64 msr_value; + char * description; +} events[4]; + +void print_usage(const string progname) +{ + cerr << endl << " Usage: " << endl << " " << progname + << " --help | [delay] [options] [-- external_program [external_program_options]]" << endl; + cerr << " => time interval to sample performance counters." << endl; + cerr << " If not specified, or 0, with external program given" << endl; + cerr << " will read counters only after external program finishes" << endl; + cerr << " Supported are: " << endl; + cerr << " -h | --help | /h => print this help and exit" << endl; + cerr << " -c | /c => print CPU Model name and exit (used for pmu-query.py)" << endl; + cerr << " -csv[=file.csv] | /csv[=file.csv] => output compact CSV format to screen or" << endl + << " to a file, in case filename is provided" << endl; + cerr << " [-e event1] [-e event2] [-e event3]=> optional list of custom events to monitor (up to 4)." << endl; + cerr << " Examples:" << endl; + cerr << " " << progname << " 1 => print counters every second without core and socket output" << endl; + cerr << " " << progname << " 0.5 -csv=test.log => twice a second save counter values to test.log in CSV format" << endl; + cerr << " " << progname << " /csv 5 2>/dev/null => one sampe every 5 seconds, and discard all diagnostic output" << endl; + cerr << endl; +} + +template +void print_custom_stats(const StateType & BeforeState, const StateType & AfterState ,bool csv) +{ + uint64 cycles = getCycles(BeforeState, AfterState); + uint64 instr = getInstructionsRetired(BeforeState, AfterState); + if(!csv) + { + cout << double(instr)/double(cycles) << " "; + cout << unit_format(instr) << " "; + cout << unit_format(cycles) << " "; + } + else + { + cout << double(instr)/double(cycles) << ","; + cout << instr << ","; + cout << cycles << ","; + } + for(int i=0;i<4;++i) + if(!csv) + cout << unit_format(getNumberOfCustomEvents(i, BeforeState, AfterState)) << " "; + else + cout << getNumberOfCustomEvents(i, BeforeState, AfterState)<<","; + + cout << endl; +} + +#define EVENT_SIZE 256 +void build_event(const char * argv, EventSelectRegister *reg, int idx) +{ + char *token, *subtoken, *saveptr1, *saveptr2; + char name[EVENT_SIZE], *str1, *str2; + int j, tmp; + uint64 tmp2; + reg->value = 0; + reg->fields.usr = 1; + reg->fields.os = 1; + reg->fields.enable = 1; + + memset(name,0,EVENT_SIZE); + strncpy(name,argv,EVENT_SIZE-1); + /* + uint64 apic_int : 1; + + offcore_rsp=2,period=10000 + */ + for (j = 1, str1 = name; ; j++, str1 = NULL) { + token = strtok_r(str1, "/", &saveptr1); + if (token == NULL) + break; + printf("%d: %s\n", j, token); + if(strncmp(token,"cpu",3) == 0) + continue; + + for (str2 = token; ; str2 = NULL) { + tmp = -1; + subtoken = strtok_r(str2, ",", &saveptr2); + if (subtoken == NULL) + break; + if(sscanf(subtoken,"event=%i",&tmp) == 1) + reg->fields.event_select = tmp; + else if(sscanf(subtoken,"umask=%i",&tmp) == 1) + reg->fields.umask = tmp; + else if(strcmp(subtoken,"edge") == 0) + reg->fields.edge = 1; + else if(sscanf(subtoken,"any=%i",&tmp) == 1) + reg->fields.any_thread = tmp; + else if(sscanf(subtoken,"inv=%i",&tmp) == 1) + reg->fields.invert = tmp; + else if(sscanf(subtoken,"cmask=%i",&tmp) == 1) + reg->fields.cmask = tmp; + else if(sscanf(subtoken,"in_tx=%i",&tmp) == 1) + reg->fields.in_tx = tmp; + else if(sscanf(subtoken,"in_tx_cp=%i",&tmp) == 1) + reg->fields.in_txcp = tmp; + else if(sscanf(subtoken,"pc=%i",&tmp) == 1) + reg->fields.pin_control = tmp; + else if(sscanf(subtoken,"offcore_rsp=%llx",&tmp2) == 1) { + if(idx >= 2) + { + cerr << "offcore_rsp must specify in first or second event only. idx=" << idx << endl; + throw idx; + } + events[idx].msr_value = tmp2; + } + else if(sscanf(subtoken,"name=%255s",events[idx].name) == 1) ; + else + { + cerr << "Event '" << subtoken << "' is not supported. See the list of supported events"<< endl; + throw subtoken; + } + + } + } + events[idx].value = reg->value; +} + +int main(int argc, char * argv[]) +{ + set_signal_handlers(); + +#ifdef PCM_FORCE_SILENT + null_stream nullStream1, nullStream2; + std::cout.rdbuf(&nullStream1); + std::cerr.rdbuf(&nullStream2); +#endif + + cerr << endl; + cerr << " Intel(r) Performance Counter Monitor: Core Monitoring Utility "<< endl; + cerr << endl; + cerr << INTEL_PCM_COPYRIGHT << endl; + cerr << endl; + + double delay = -1.0; + char *sysCmd = NULL; + char **sysArgv = NULL; + uint32 cur_event = 0; + bool csv = false; + long diff_usec = 0; // deviation of clock is useconds between measurements + int calibrated = PCM_CALIBRATION_INTERVAL - 2; // keeps track is the clock calibration needed + string program = string(argv[0]); + EventSelectRegister regs[4]; + PCM::ExtendedCustomCoreEventDescription conf; + conf.fixedCfg = NULL; // default + conf.nGPCounters = 4; + conf.gpCounterCfg = regs; + + PCM * m = PCM::getInstance(); + + if(argc > 1) do + { + argv++; + argc--; + if (strncmp(*argv, "--help", 6) == 0 || + strncmp(*argv, "-h", 2) == 0 || + strncmp(*argv, "/h", 2) == 0) + { + print_usage(program); + exit(EXIT_FAILURE); + } + else if (strncmp(*argv, "-csv",4) == 0 || + strncmp(*argv, "/csv",4) == 0) + { + csv = true; + string cmd = string(*argv); + size_t found = cmd.find('=',4); + if (found != string::npos) { + string filename = cmd.substr(found+1); + if (!filename.empty()) { + m->setOutput(filename); + } + } + continue; + } + else if (strncmp(*argv, "-c",2) == 0 || + strncmp(*argv, "/c",2) == 0) + { + cout << m->getCPUFamilyModelString() << endl; + exit(EXIT_SUCCESS); + } + else if (strncmp(*argv, "-e",2) == 0) + { + argv++; + argc--; + if(cur_event >= 4 ) { + cerr << "At most 4 events are allowed"<< endl; + exit(EXIT_FAILURE); + } + try { + build_event(*argv,®s[cur_event],cur_event); + cur_event++; + } catch (const char * /* str */) { + exit(EXIT_FAILURE); + } + + continue; + } + else if (strncmp(*argv, "--", 2) == 0) + { + argv++; + sysCmd = *argv; + sysArgv = argv; + break; + } + else + { + // any other options positional that is a floating point number is treated as , + // while the other options are ignored with a warning issues to stderr + double delay_input; + std::istringstream is_str_stream(*argv); + is_str_stream >> noskipws >> delay_input; + if(is_str_stream.eof() && !is_str_stream.fail()) { + delay = delay_input; + } else { + cerr << "WARNING: unknown command-line option: \"" << *argv << "\". Ignoring it." << endl; + print_usage(program); + exit(EXIT_FAILURE); + } + continue; + } + } while(argc > 1); // end of command line partsing loop + + conf.OffcoreResponseMsrValue[0] = events[0].msr_value; + conf.OffcoreResponseMsrValue[1] = events[1].msr_value; + + PCM::ErrorCode status = m->program(PCM::EXT_CUSTOM_CORE_EVENTS, &conf); + switch (status) + { + case PCM::Success: + break; + case PCM::MSRAccessDenied: + cerr << "Access to Intel(r) Performance Counter Monitor has denied (no MSR or PCI CFG space access)." << endl; + exit(EXIT_FAILURE); + case PCM::PMUBusy: + cerr << "Access to Intel(r) Performance Counter Monitor has denied (Performance Monitoring Unit is occupied by other application). Try to stop the application that uses PMU." << endl; + cerr << "Alternatively you can try to reset PMU configuration at your own risk. Try to reset? (y/n)" << endl; + char yn; + std::cin >> yn; + if ('y' == yn) + { + m->resetPMU(); + cerr << "PMU configuration has been reset. Try to rerun the program again." << endl; + } + exit(EXIT_FAILURE); + default: + cerr << "Access to Intel(r) Performance Counter Monitor has denied (Unknown error)." << endl; + exit(EXIT_FAILURE); + } + + cerr << "\nDetected "<< m->getCPUBrandString() << " \"Intel(r) microarchitecture codename "<getUArchCodename()<<"\""<getNumCores(); + std::vector BeforeState, AfterState; + std::vector DummySocketStates; + + if ( (sysCmd != NULL) && (delay<=0.0) ) { + // in case external command is provided in command line, and + // delay either not provided (-1) or is zero + m->setBlocked(true); + } else { + m->setBlocked(false); + } + + if (csv) { + if( delay<=0.0 ) delay = PCM_DELAY_DEFAULT; + } else { + // for non-CSV mode delay < 1.0 does not make a lot of practical sense: + // hard to read from the screen, or + // in case delay is not provided in command line => set default + if( ((delay<1.0) && (delay>0.0)) || (delay<=0.0) ) delay = PCM_DELAY_DEFAULT; + } + + cerr << "Update every "<getTickCount(); + m->getAllCounterStates(SysBeforeState, DummySocketStates, BeforeState); + + if( sysCmd != NULL ) { + MySystem(sysCmd, sysArgv); delay = 0; + } + + while(1) + { + if(!csv) cout << std::flush; + int delay_ms = int(delay * 1000); + int calibrated_delay_ms = delay_ms; +#ifdef _MSC_VER + // compensate slow Windows console output + if(AfterTime) delay_ms -= (int)(m->getTickCount() - BeforeTime); + if(delay_ms < 0) delay_ms = 0; +#else + // compensation of delay on Linux/UNIX + // to make the samling interval as monotone as possible + struct timeval start_ts, end_ts; + if(calibrated == 0) { + gettimeofday(&end_ts, NULL); + diff_usec = (end_ts.tv_sec-start_ts.tv_sec)*1000000.0+(end_ts.tv_usec-start_ts.tv_usec); + calibrated_delay_ms = delay_ms - diff_usec/1000.0; + } +#endif + + MySleepMs(calibrated_delay_ms); + +#ifndef _MSC_VER + calibrated = (calibrated + 1) % PCM_CALIBRATION_INTERVAL; + if(calibrated == 0) { + gettimeofday(&start_ts, NULL); + } +#endif + AfterTime = m->getTickCount(); + m->getAllCounterStates(SysAfterState, DummySocketStates, AfterState); + + cout << "Time elapsed: "<isBlocked() ) { + // in case PCM was blocked after spawning child application: break monitoring loop here + break; + } + } + exit(EXIT_SUCCESS); +} diff --git a/pcm-memory.cpp b/pcm-memory.cpp index be02053..86a18c7 100644 --- a/pcm-memory.cpp +++ b/pcm-memory.cpp @@ -16,7 +16,7 @@ /*! \file pcm-memory.cpp - \brief Example of using CPU counters: implements a performance counter monitoring utility for memory controller channels + \brief Example of using CPU counters: implements a performance counter monitoring utility for memory controller channels and DIMMs (ranks) */ #define HACK_TO_REMOVE_DUPLICATE_ERROR #include @@ -42,11 +42,17 @@ //Programmable iMC counter #define READ 0 #define WRITE 1 +#define READ_RANK_A 0 +#define WRITE_RANK_A 1 +#define READ_RANK_B 2 +#define WRITE_RANK_B 3 #define PARTIAL 2 #define PCM_DELAY_DEFAULT 1.0 // in seconds #define PCM_DELAY_MIN 0.015 // 15 milliseconds is practical on most modern CPUs #define PCM_CALIBRATION_INTERVAL 50 // calibrate clock only every 50th iteration +#define DEFAULT_DISPLAY_COLUMNS 2 + using namespace std; void print_help(const string prog_name) @@ -58,8 +64,10 @@ void print_help(const string prog_name) cerr << " will read counters only after external program finishes" << endl; cerr << " Supported are: " << endl; cerr << " -h | --help | /h => print this help and exit" << endl; + cerr << " -rank=X | /rank=X => monitor DIMM rank X. At most 2 out of 8 total ranks can be monitored simultaneously." << endl; cerr << " -csv[=file.csv] | /csv[=file.csv] => output compact CSV format to screen or" << endl << " to a file, in case filename is provided" << endl; + cerr << " -columns=X | /columns=X => Number of columns to display the NUMA Nodes, defaults to 2." << endl; #ifdef _MSC_VER cerr << " --uninstallDriver | --installDriver=> (un)install driver" << endl; #endif @@ -70,7 +78,128 @@ void print_help(const string prog_name) cerr << endl; } -void display_bandwidth(float *iMC_Rd_socket_chan, float *iMC_Wr_socket_chan, float *iMC_Rd_socket, float *iMC_Wr_socket, uint32 numSockets, uint32 num_imc_channels, uint64 *partial_write) +void printSocketBWHeader(uint32 no_columns, uint32 skt) +{ + for (uint32 i=skt; i<(no_columns+skt); ++i) { + cout << "|---------------------------------------|"; + } + cout << endl; + for (uint32 i=skt; i<(no_columns+skt); ++i) { + cout << "|-- Socket "<= 0) { + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- Mem Ch "<= 0) { + for (uint32 i=skt; i<(skt+no_columns); ++i) { + cout << "|-- Mem Ch "<getMCChannelsPerSocket(); float iMC_Rd_socket_chan[max_sockets][max_imc_channels]; @@ -292,7 +384,70 @@ void calculate_bandwidth(PCM *m, const ServerUncorePowerState uncState1[], const } display_bandwidth_csv(iMC_Rd_socket_chan[0], iMC_Wr_socket_chan[0], iMC_Rd_socket, iMC_Wr_socket, m->getNumSockets(), max_imc_channels, partial_write, elapsedTime); } else { - display_bandwidth(iMC_Rd_socket_chan[0], iMC_Wr_socket_chan[0], iMC_Rd_socket, iMC_Wr_socket, m->getNumSockets(), max_imc_channels, partial_write); + display_bandwidth(iMC_Rd_socket_chan[0], iMC_Wr_socket_chan[0], iMC_Rd_socket, iMC_Wr_socket, m->getNumSockets(), max_imc_channels, partial_write, no_columns); + } +} + +void calculate_bandwidth(PCM *m, const ServerUncorePowerState uncState1[], const ServerUncorePowerState uncState2[], uint64 elapsedTime, bool csv, bool & csvheader, uint32 no_columns, int rankA, int rankB) +{ + uint32 skt = 0; + cout.setf(ios::fixed); + cout.precision(2); + uint32 numSockets = m->getNumSockets(); + + while(skt < numSockets) + { + // Full row + if ( (skt+no_columns) <= numSockets ) + { + printSocketRankBWHeader(no_columns, skt); + printSocketChannelBW(no_columns, skt, max_imc_channels, uncState1, uncState2, elapsedTime, rankA, rankB); + for (uint32 i=skt; i<(no_columns+skt); ++i) { + cout << "|-------------------------------------------|"; + } + cout << endl; + skt += no_columns; + } + else //Display one socket in this row + { + cout << "\ + \r|-------------------------------------------|\n\ + \r|-- Socket "<=0) + cout << "|-- Mem Ch " + << setw(2) << channel + << " R " << setw(1) << rankA + <<": Reads (MB/s):" + <=0) + cout << "|-- Mem Ch " + << setw(2) << channel + << " R " << setw(1) << rankB + <<": Reads (MB/s):" + < m->getNumSockets()) + no_columns = m->getNumSockets(); + } + continue; + } + if (strncmp(*argv, "-rank", 5) == 0 || + strncmp(*argv, "/rank", 5) == 0) + { + string cmd = string(*argv); + size_t found = cmd.find('=',2); + if (found != string::npos) { + int rank = atoi(cmd.substr(found+1).c_str()); + if (rankA >= 0 && rankB >= 0) + { + std::cerr << "At most two DIMM ranks can be monitored "<< std::endl; + exit(EXIT_FAILURE); + } + else + { + if(rank > 7) { + std::cerr << "Invalid rank number "< 1); // end of command line partsing loop m->disableJKTWorkaround(); - PCM::ErrorCode status = m->program(); + PCM::ErrorCode status = m->programServerUncoreMemoryMetrics(rankA, rankB); switch (status) { case PCM::Success: @@ -473,7 +671,7 @@ int main(int argc, char * argv[]) BeforeTime = m->getTickCount(); if( sysCmd != NULL ) { - MySystem(sysCmd, sysArgv); + MySystem(sysCmd, sysArgv); delay = 0; } while(1) @@ -514,7 +712,10 @@ int main(int argc, char * argv[]) cout << "Called sleep function for "<= 0 || rankB >= 0) + calculate_bandwidth(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns, rankA, rankB); + else + calculate_bandwidth(m,BeforeState,AfterState,AfterTime-BeforeTime,csv,csvheader, no_columns); swap(BeforeTime, AfterTime); swap(BeforeState, AfterState); diff --git a/pcm-msr.cpp b/pcm-msr.cpp index 61b449b..59acce5 100644 --- a/pcm-msr.cpp +++ b/pcm-msr.cpp @@ -51,6 +51,7 @@ void print_usage(const char * progname) int main(int argc, char * argv[]) { std::cout << "\n Intel(r) Performance Counter Monitor " << INTEL_PCM_VERSION << std::endl; + std::cout << INTEL_PCM_COPYRIGHT << std::endl; std::cout << "\n MSR read/write utility\n\n"; uint64 value = 0; @@ -101,20 +102,26 @@ int main(int argc, char * argv[]) // drv.stop(); // restart driver (usually not needed) if (!drv.start(driverPath)) { - std::cout << "Can not load MSR driver." << std::endl; - std::cout << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program" << std::endl; + std::cerr << "Can not load MSR driver." << std::endl; + std::cerr << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program" << std::endl; return -1; } #endif - - MsrHandle h(core); - if(!dec) std::cout << std::hex << std::showbase; - if(write) + try { + MsrHandle h(core); + if (!dec) std::cout << std::hex << std::showbase; + if (write) + { + std::cout << " Writing " << value << " to MSR " << msr << " on core " << core << std::endl; + h.write(msr, value); + } + value = 0; + h.read(msr, &value); + std::cout << " Read value " << value << " from MSR " << msr << " on core " << core << "\n" << std::endl; + } + catch (std::exception & e) { - std::cout << " Writing "<< value << " to MSR "<< msr << " on core "<< core << std::endl; - h.write(msr,value); + std::cerr << "Error accessing MSRs: " << e.what() << std::endl; + std::cerr << "Please check if the program can access MSR drivers." << std::endl; } - value = 0; - h.read(msr,&value); - std::cout << " Read value "<< value << " from MSR "<< msr << " on core "<< core << "\n" << std::endl; } diff --git a/pcm-numa.cpp b/pcm-numa.cpp index 7262384..58cfb9c 100644 --- a/pcm-numa.cpp +++ b/pcm-numa.cpp @@ -61,7 +61,7 @@ void print_usage(const string progname) cerr << " Examples:" << endl; cerr << " " << progname << " 1 => print counters every second without core and socket output" << endl; cerr << " " << progname << " 0.5 -csv=test.log => twice a second save counter values to test.log in CSV format" << endl; - cerr << " " << progname << " /csv 5 2>/dev/null => one sampe every 5 seconds, and discard all diagnostic output" << endl; + cerr << " " << progname << " /csv 5 2>/dev/null => one sample every 5 seconds, and discard all diagnostic output" << endl; cerr << endl; } @@ -106,8 +106,7 @@ int main(int argc, char * argv[]) cerr << endl; cerr << " Intel(r) Performance Counter Monitor: NUMA monitoring utility "<< endl; - cerr << endl; - cerr << " Copyright (c) 2013-2014 Intel Corporation" << endl; + cerr << INTEL_PCM_COPYRIGHT << std::endl; cerr << endl; double delay = -1.0; @@ -268,7 +267,7 @@ int main(int argc, char * argv[]) m->getAllCounterStates(SysBeforeState, DummySocketStates, BeforeState); if( sysCmd != NULL ) { - MySystem(sysCmd, sysArgv); + MySystem(sysCmd, sysArgv); delay = 0; } while(1) diff --git a/pcm-pcie.cpp b/pcm-pcie.cpp index cdd9847..4f3ccee 100644 --- a/pcm-pcie.cpp +++ b/pcm-pcie.cpp @@ -84,7 +84,6 @@ void print_events() cerr << " CRd* - Demand Code Read\n"; cerr << " DRd - Demand Data Read\n"; cerr << " PCIeNSWr - PCIe Non-snoop write transfer (partial cache line)\n"; - cerr << " PRd - MMIO Read [Haswell Server only: PL verify this on IVT] (Partial Cache Line)\n"; cerr << " PCIe write events (PCI devices writing to memory - application reads from disk/network/PCIe device):\n"; cerr << " PCIeWiLF - PCIe Write transfer (non-allocating) (full cache line)\n"; cerr << " PCIeItoM - PCIe Write transfer (allocating) (full cache line)\n"; @@ -92,6 +91,8 @@ void print_events() cerr << " PCIeNSWrF - PCIe Non-snoop write transfer (full cache line)\n"; cerr << " ItoM - PCIe write full cache line\n"; cerr << " RFO - PCIe parial Write\n"; + cerr << " CPU MMIO events (CPU reading/writing to PCIe devices):\n"; + cerr << " PRd - MMIO Read [Haswell Server only] (Partial Cache Line)\n"; cerr << " WiL - MMIO Write (Full/Partial)\n\n"; cerr << " * - NOTE: Depending on the configuration of your BIOS, this tool may report '0' if the message\n"; cerr << " has not been selected.\n\n"; @@ -135,7 +136,7 @@ int main(int argc, char * argv[]) cerr << endl; cerr << " Intel(r) Performance Counter Monitor: PCIe Bandwidth Monitoring Utility "<< endl; cerr << endl; - cerr << " Copyright (c) 2013-2014 Intel Corporation" << endl; + cerr << INTEL_PCM_COPYRIGHT << std::endl; cerr << " This utility measures PCIe bandwidth in real-time" << endl; cerr << endl; print_events(); @@ -243,7 +244,7 @@ int main(int argc, char * argv[]) cerr << "\nDetected "<< m->getCPUBrandString() << " \"Intel(r) microarchitecture codename "<getUArchCodename()<<"\""<hasPCICFGUncore())) { - cerr << "Jaketown, Ivytown, Haswell Server CPU is required for this tool! Program aborted" << endl; + cerr << "Jaketown, Ivytown, Haswell, Broadwell-DE Server CPU is required for this tool! Program aborted" << endl; exit(EXIT_FAILURE); } @@ -282,12 +283,12 @@ int main(int argc, char * argv[]) uint32 i; uint32 delay_ms = uint32(delay * 1000 / num_events / NUM_SAMPLES); - delay_ms * num_events * NUM_SAMPLES < delay * 1000 ? delay_ms += 1 : delay_ms = delay_ms; //Adjust the sleep if it's less than delay time + if(delay_ms * num_events * NUM_SAMPLES < delay * 1000) ++delay_ms; //Adjust the delay_ms if it's less than delay time sample_t sample[max_sockets]; cerr << "delay_ms: " << delay_ms << endl; if( sysCmd != NULL ) { - MySystem(sysCmd, sysArgv); + MySystem(sysCmd, sysArgv); delay = 0; } // ================================== Begin Printing Output ================================== @@ -302,7 +303,7 @@ int main(int argc, char * argv[]) memset(sample,0,sizeof(sample)); memset(&aggregate_sample,0,sizeof(aggregate_sample)); - if(m->getCPUModel() == PCM::HASWELLX) // Haswell Server + if(m->getCPUModel() == PCM::HASWELLX || m->getCPUModel() == PCM::BDX_DE) // Haswell Server { for(i=0;igetCPUModel() == PCM::HASWELLX) // Haswell Server + if(m->getCPUModel() == PCM::HASWELLX || m->getCPUModel() == PCM::BDX_DE) // Haswell Server { for(i=0;i sample[i].miss.PCIeNSWr) ? sample[i].total.PCIeNSWr - sample[i].miss.PCIeNSWr : 0; - aggregate_sample.PCIeItoM += sample[i].total.PCIeItoM; + aggregate_sample.PCIeNSWr += sample[i].total.PCIeNSWr; break; case PCM::PCIeNSWrF: sample[i].total.PCIeNSWrF += (sizeof(PCIeEvents_t)/sizeof(uint64)) * getNumberOfEvents(before[i], after[i]); diff --git a/pcm-power.cpp b/pcm-power.cpp index 5393eda..bdfbf67 100644 --- a/pcm-power.cpp +++ b/pcm-power.cpp @@ -146,7 +146,8 @@ int main(int argc, char * argv[]) set_signal_handlers(); std::cerr << "\n Intel(r) Performance Counter Monitor " << INTEL_PCM_VERSION << std::endl; - std::cerr << "\n Power Monitoring Utility\n Copyright (c) 2011-2014 Intel Corporation\n"; + std::cerr << "\n Power Monitoring Utility\n"; + std::cerr << INTEL_PCM_COPYRIGHT << std::endl; int imc_profile = 0; int pcu_profile = 0; @@ -287,8 +288,12 @@ int main(int argc, char * argv[]) std::cout << std::fixed; std::cerr << "\nMC counter group: "<getTickCount(); if( sysCmd != NULL ) { - MySystem(sysCmd, sysArgv); + MySystem(sysCmd, sysArgv); delay = 0; } while(1) @@ -395,6 +400,8 @@ int main(int argc, char * argv[]) switch(pcu_profile) { case 0: + if (cpu_model == PCM::HASWELLX || cpu_model == PCM::BDX_DE) + break; std::cout << "S"< are: " << endl; cerr << " -h | --help | /h => print this help and exit" << endl; + cerr << " -F | -force => force running this program despite lack of HW RTM support (optional)" << endl; cerr << " -csv[=file.csv] | /csv[=file.csv] => output compact CSV format to screen or" << endl << " to a file, in case filename is provided" << endl; cerr << " [-e event1] [-e event2] [-e event3]=> optional list of custom TSX events to monitor (up to 4)." @@ -198,7 +199,7 @@ int main(int argc, char * argv[]) cerr << endl; cerr << " Intel(r) Performance Counter Monitor: Intel(r) Transactional Synchronization Extensions Monitoring Utility "<< endl; cerr << endl; - cerr << " Copyright (c) 2013-2014 Intel Corporation" << endl; + cerr << INTEL_PCM_COPYRIGHT << std::endl; cerr << endl; double delay = -1.0; @@ -208,6 +209,7 @@ int main(int argc, char * argv[]) int cur_event; bool csv = false; long diff_usec = 0; // deviation of clock is useconds between measurements + bool force = false; int calibrated = PCM_CALIBRATION_INTERVAL - 2; // keeps track is the clock calibration needed string program = string(argv[0]); @@ -258,6 +260,13 @@ int main(int argc, char * argv[]) continue; } else + if ( (strncmp(*argv, "-F", 2) == 0) || + (strncmp(*argv, "-f", 2) == 0) || + (strncmp(*argv, "-force", 6) == 0) ) + { + force = true; + } + else if (strncmp(*argv, "--", 2) == 0) { argv++; @@ -344,6 +353,16 @@ int main(int argc, char * argv[]) cerr << "\nDetected "<< m->getCPUBrandString() << " \"Intel(r) microarchitecture codename "<getUArchCodename()<<"\""<supportsRTM(); + + if (!rtm_support) { + if (!force) { + cerr << "No RTM support detected, use -F if you still want to run this program." << endl; + exit(EXIT_FAILURE); + } + cerr << "No RTM support detected, but -F found as argument, running anyway." << endl; + } + uint64 BeforeTime = 0, AfterTime = 0; SystemCounterState SysBeforeState, SysAfterState; const uint32 ncores = m->getNumCores(); @@ -376,7 +395,7 @@ int main(int argc, char * argv[]) m->getAllCounterStates(SysBeforeState, DummySocketStates, BeforeState); if( sysCmd != NULL ) { - MySystem(sysCmd, sysArgv); + MySystem(sysCmd, sysArgv); delay = 0; } while(1) diff --git a/pcm.cpp b/pcm.cpp index 319e5a6..2d085ee 100644 --- a/pcm.cpp +++ b/pcm.cpp @@ -126,8 +126,8 @@ void print_output(PCM * m, cout << " L2MISS: L2 cache misses (including other core's L2 cache *hits*) " << "\n"; if (cpu_model != PCM::ATOM) cout << " L3HIT : L3 cache hit ratio (0.00-1.00)" << "\n"; cout << " L2HIT : L2 cache hit ratio (0.00-1.00)" << "\n"; - if (cpu_model != PCM::ATOM) cout << " L3CLK : ratio of CPU cycles lost due to L3 cache misses (0.00-1.00), in some cases could be >1.0 due to a higher memory latency" << "\n"; - if (cpu_model != PCM::ATOM) cout << " L2CLK : ratio of CPU cycles lost due to missing L2 cache but still hitting L3 cache (0.00-1.00)" << "\n"; + if (cpu_model != PCM::ATOM) cout << " L3MPI : number of L3 cache misses per instruction\n"; + if (cpu_model != PCM::ATOM) cout << " L2MPI : number of L2 cache misses per instruction\n"; if (cpu_model != PCM::ATOM) cout << " READ : bytes read from memory controller (in GBytes)" << "\n"; if (cpu_model != PCM::ATOM) cout << " WRITE : bytes written to memory controller (in GBytes)" << "\n"; if (m->memoryIOTrafficMetricAvailable()) cout << " IO : bytes read/written due to IO requests to memory controller (in GBytes); this may be an over estimate due to same-cache-line partial requests" << "\n"; @@ -141,7 +141,7 @@ void print_output(PCM * m, cout << " Core (SKT) | EXEC | IPC | FREQ | L2MISS | L2HIT | TEMP" << "\n" << "\n"; else { - cout << " Core (SKT) | EXEC | IPC | FREQ | AFREQ | L3MISS | L2MISS | L3HIT | L2HIT | L3CLK | L2CLK |"; + cout << " Core (SKT) | EXEC | IPC | FREQ | AFREQ | L3MISS | L2MISS | L3HIT | L2HIT | L3MPI | L2MPI |"; if (m->L3CacheOccupancyMetricAvailable()) cout << " L3OCC | READ | WRITE |"; @@ -174,8 +174,8 @@ void print_output(PCM * m, " " << unit_format(getL2CacheMisses(cstates1[i], cstates2[i])) << " " << getL3CacheHitRatio(cstates1[i], cstates2[i]) << " " << getL2CacheHitRatio(cstates1[i], cstates2[i]) << - " " << getCyclesLostDueL3CacheMisses(cstates1[i], cstates2[i]) << - " " << getCyclesLostDueL2CacheMisses(cstates1[i], cstates2[i]) ; + " " << double(getL3CacheMisses(cstates1[i], cstates2[i])) / getInstructionsRetired(cstates1[i], cstates2[i]) << + " " << double(getL2CacheMisses(cstates1[i], cstates2[i])) / getInstructionsRetired(cstates1[i], cstates2[i]) ; if (m->L3CacheOccupancyMetricAvailable()) cout << " " << setw(6) << l3cache_occ_format(getL3CacheOccupancy(cstates2[i])) ; if (m->memoryIOTrafficMetricAvailable()) @@ -212,8 +212,8 @@ void print_output(PCM * m, " " << unit_format(getL2CacheMisses(sktstate1[i], sktstate2[i])) << " " << getL3CacheHitRatio(sktstate1[i], sktstate2[i]) << " " << getL2CacheHitRatio(sktstate1[i], sktstate2[i]) << - " " << getCyclesLostDueL3CacheMisses(sktstate1[i], sktstate2[i]) << - " " << getCyclesLostDueL2CacheMisses(sktstate1[i], sktstate2[i]); + " " << double(getL3CacheMisses(sktstate1[i], sktstate2[i])) / getInstructionsRetired(sktstate1[i], sktstate2[i]) << + " " << double(getL2CacheMisses(sktstate1[i], sktstate2[i])) / getInstructionsRetired(sktstate1[i], sktstate2[i]); if (m->L3CacheOccupancyMetricAvailable()) cout << " " << setw(6) << l3cache_occ_format(getL3CacheOccupancy(sktstate2[i])) ; if (m->memoryTrafficMetricsAvailable()) @@ -243,8 +243,8 @@ void print_output(PCM * m, " " << unit_format(getL2CacheMisses(sstate1, sstate2)) << " " << getL3CacheHitRatio(sstate1, sstate2) << " " << getL2CacheHitRatio(sstate1, sstate2) << - " " << getCyclesLostDueL3CacheMisses(sstate1, sstate2) << - " " << getCyclesLostDueL2CacheMisses(sstate1, sstate2) ; + " " << double(getL3CacheMisses(sstate1, sstate2)) / getInstructionsRetired(sstate1, sstate2) << + " " << double(getL2CacheMisses(sstate1, sstate2)) / getInstructionsRetired(sstate1, sstate2); if (m->L3CacheOccupancyMetricAvailable()) cout << " " << " N/A "; if (m->memoryTrafficMetricsAvailable()) @@ -376,27 +376,42 @@ void print_output(PCM * m, } if (show_socket_output) { - if (m->packageEnergyMetricsAvailable()) + cout << "\n"; + cout << " package/CPU energy (Joules) DIMM energy (Joules)\n"; + cout << "----------------------------------------------------------------------------------------------" << "\n"; + for (uint32 i = 0; i < m->getNumSockets(); ++i) { - cout << "\n"; - cout << "----------------------------------------------------------------------------------------------" << "\n"; - for (uint32 i = 0; i < m->getNumSockets(); ++i) - { - cout << " SKT " << setw(2) << i << " package consumed " << getConsumedJoules(sktstate1[i], sktstate2[i]) << " Joules\n"; - } - cout << "----------------------------------------------------------------------------------------------" << "\n"; - cout << " TOTAL: " << getConsumedJoules(sstate1, sstate2) << " Joules\n"; + cout << " SKT " << setw(2) << i << " "; + if(m->packageEnergyMetricsAvailable()) { + cout << setw(10) << getConsumedJoules(sktstate1[i], sktstate2[i]); + } else { + cout << " N/A "; + } + cout << " "; + if(m->dramEnergyMetricsAvailable()) { + cout << setw(10) << getDRAMConsumedJoules(sktstate1[i], sktstate2[i]); + } else { + cout << " N/A "; + } + cout << "\n"; } - if (m->dramEnergyMetricsAvailable()) - { - cout << "\n"; - cout << "----------------------------------------------------------------------------------------------" << "\n"; - for (uint32 i = 0; i < m->getNumSockets(); ++i) - { - cout << " SKT " << setw(2) << i << " DIMMs consumed " << getDRAMConsumedJoules(sktstate1[i], sktstate2[i]) << " Joules\n"; + cout << "----------------------------------------------------------------------------------------------" << "\n"; + if (m->getNumSockets() > 1) { + cout << " * "; + if (m->packageEnergyMetricsAvailable()) { + cout << setw(10) << getConsumedJoules(sstate1, sstate2); } - cout << "----------------------------------------------------------------------------------------------" << "\n"; - cout << " TOTAL: " << getDRAMConsumedJoules(sstate1, sstate2) << " Joules\n"; + else { + cout << " N/A "; + } + cout << " "; + if (m->dramEnergyMetricsAvailable()) { + cout << setw(10) << getDRAMConsumedJoules(sstate1, sstate2); + } + else { + cout << " N/A "; + } + cout << "\n"; } } @@ -541,9 +556,9 @@ void print_csv_header(PCM * m, if (cpu_model != PCM::ATOM) { if (m->L3CacheOccupancyMetricAvailable()) - cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3CLK;L2CLK;L3OCC;READ;WRITE;"; + cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3MPI;L2MPI;L3OCC;READ;WRITE;"; else - cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3CLK;L2CLK;READ;WRITE;"; + cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3MPI;L2MPI;READ;WRITE;"; } else { @@ -581,9 +596,9 @@ void print_csv_header(PCM * m, else { if (m->L3CacheOccupancyMetricAvailable()) - cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3CLK;L2CLK;L3OCC;READ;WRITE;TEMP;"; + cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3MPI;L2MPI;L3OCC;READ;WRITE;TEMP;"; else - cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3CLK;L2CLK;READ;WRITE;TEMP;"; + cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3MPI;L2MPI;READ;WRITE;TEMP;"; } } @@ -650,9 +665,9 @@ void print_csv_header(PCM * m, else { if (m->L3CacheOccupancyMetricAvailable()) - cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3CLK;L2CLK;L3OCC;"; + cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3MPI;L2MPI;L3OCC;"; else - cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3CLK;L2CLK;"; + cout << "EXEC;IPC;FREQ;AFREQ;L3MISS;L2MISS;L3HIT;L2HIT;L3MPI;L2MPI;"; } @@ -711,10 +726,10 @@ void print_csv(PCM * m, ';' << float_format(getL2CacheMisses(sstate1, sstate2)) << ';' << getL3CacheHitRatio(sstate1, sstate2) << ';' << getL2CacheHitRatio(sstate1, sstate2) << - ';' << getCyclesLostDueL3CacheMisses(sstate1, sstate2) << - ';' << getCyclesLostDueL2CacheMisses(sstate1, sstate2) << ";"; - if (m->L3CacheOccupancyMetricAvailable()) - cout << "N/A;"; + ';' << double(getL3CacheMisses(sstate1, sstate2)) / getInstructionsRetired(sstate1, sstate2) << + ';' << double(getL2CacheMisses(sstate1, sstate2)) / getInstructionsRetired(sstate1, sstate2) << ";"; + if (m->L3CacheOccupancyMetricAvailable()) + cout << "N/A;"; if (!(m->memoryTrafficMetricsAvailable())) cout << "N/A;N/A;"; else @@ -758,7 +773,6 @@ void print_csv(PCM * m, } - if (show_socket_output) { { @@ -779,9 +793,9 @@ void print_csv(PCM * m, ';' << float_format(getL2CacheMisses(sktstate1[i], sktstate2[i])) << ';' << getL3CacheHitRatio(sktstate1[i], sktstate2[i]) << ';' << getL2CacheHitRatio(sktstate1[i], sktstate2[i]) << - ';' << getCyclesLostDueL3CacheMisses(sktstate1[i], sktstate2[i]) << - ';' << getCyclesLostDueL2CacheMisses(sktstate1[i], sktstate2[i]); - if (m->L3CacheOccupancyMetricAvailable()) + ';' << double(getL3CacheMisses(sktstate1[i], sktstate2[i])) / getInstructionsRetired(sktstate1[i], sktstate2[i]) << + ';' << double(getL2CacheMisses(sktstate1[i], sktstate2[i])) / getInstructionsRetired(sktstate1[i], sktstate2[i]) ; + if (m->L3CacheOccupancyMetricAvailable()) cout << ';' << l3cache_occ_format(getL3CacheOccupancy(sktstate2[i])); if (!(m->memoryTrafficMetricsAvailable())) cout << ";N/A;N/A"; @@ -859,10 +873,11 @@ void print_csv(PCM * m, ';' << float_format(getL2CacheMisses(cstates1[i], cstates2[i])) << ';' << getL3CacheHitRatio(cstates1[i], cstates2[i]) << ';' << getL2CacheHitRatio(cstates1[i], cstates2[i]) << - ';' << getCyclesLostDueL3CacheMisses(cstates1[i], cstates2[i]) << - ';' << getCyclesLostDueL2CacheMisses(cstates1[i], cstates2[i]) ; + ';' << double(getL3CacheMisses(cstates1[i], cstates2[i])) / getInstructionsRetired(cstates1[i], cstates2[i]) << + ';' << double(getL2CacheMisses(cstates1[i], cstates2[i])) / getInstructionsRetired(cstates1[i], cstates2[i]); if (m->L3CacheOccupancyMetricAvailable()) cout << ';' << l3cache_occ_format(getL3CacheOccupancy(cstates2[i])) ; + cout << ';'; } else cout << getExecUsage(cstates1[i], cstates2[i]) << @@ -896,7 +911,7 @@ int main(int argc, char * argv[]) cerr << endl; cerr << " Intel(r) Performance Counter Monitor " << INTEL_PCM_VERSION << endl; cerr << endl; - cerr << " Copyright (c) 2009-2014 Intel Corporation" << endl; + cerr << INTEL_PCM_COPYRIGHT << endl; cerr << endl; // if delay is not specified: use either default (1 second), @@ -1124,7 +1139,7 @@ int main(int argc, char * argv[]) m->getAllCounterStates(sstate1, sktstate1, cstates1); if (sysCmd != NULL) { - MySystem(sysCmd, sysArgv); + MySystem(sysCmd, sysArgv); delay = 0; } unsigned int i = 1; @@ -1177,8 +1192,14 @@ int main(int argc, char * argv[]) else { assert(getNumberOfCustomEvents(0, sstate1, sstate2) == getL3CacheMisses(sstate1, sstate2)); - assert(getNumberOfCustomEvents(1, sstate1, sstate2) == getL3CacheHitsNoSnoop(sstate1, sstate2)); - assert(getNumberOfCustomEvents(2, sstate1, sstate2) == getL3CacheHitsSnoop(sstate1, sstate2)); + if (m->useSkylakeEvents()) { + assert(getNumberOfCustomEvents(1, sstate1, sstate2) == getL3CacheHits(sstate1, sstate2)); + assert(getNumberOfCustomEvents(2, sstate1, sstate2) == getL2CacheMisses(sstate1, sstate2)); + } + else { + assert(getNumberOfCustomEvents(1, sstate1, sstate2) == getL3CacheHitsNoSnoop(sstate1, sstate2)); + assert(getNumberOfCustomEvents(2, sstate1, sstate2) == getL3CacheHitsSnoop(sstate1, sstate2)); + } assert(getNumberOfCustomEvents(3, sstate1, sstate2) == getL2CacheHits(sstate1, sstate2)); } diff --git a/pmu-query.py b/pmu-query.py new file mode 100644 index 0000000..4ced0fd --- /dev/null +++ b/pmu-query.py @@ -0,0 +1,94 @@ +#!/usr/bin/python +import urllib2 +import json, csv +import subprocess +import sys +import platform +import getopt + +all_flag = False +download_flag = False +filename=None +offcore_events=[] + +try: + opts, args = getopt.getopt(sys.argv[1:],'a,f:,d',['all','file=','download']) + for o, a in opts: + if o in ('-a','--all'): + all_flag=True + if o in ('-f','--file'): + filename=a + if o in ('-d','--download'): + download_flag=True +except getopt.GetoptError, err: + print("parse error: %s\n" %(str(err))) + exit(-2) + +if filename == None: + map_file_raw=urllib2.urlopen('https://download.01.org/perfmon/mapfile.csv') + map_dict = csv.DictReader(map_file_raw) + map_file = [] + core_path = '' + offcore_path = '' + + while True: + try: + map_file.append(map_dict.next()) + except StopIteration: + break + + if platform.system() == 'CYGWIN_NT-6.1': + p = subprocess.Popen(['./pcm-core.exe -c'],stdout=subprocess.PIPE,shell=True) + elif platform.system() == 'Windows': + p = subprocess.Popen(['pcm-core.exe -c'],stdout=subprocess.PIPE,shell=True) + else: + p = subprocess.Popen(['./pcm-core.x -c'],stdout=subprocess.PIPE,shell=True) + + (output, err) = p.communicate() + p_status = p.wait() + for model in map_file: + if model['Family-model'] in output: + if(model['EventType'] == 'core'): + core_path = model['Filename'] + elif(model['EventType'] == 'offcore'): + offcore_path = model['Filename'] + print (model) + + if core_path != '': + json_core_data=urllib2.urlopen('https://download.01.org/perfmon'+core_path) + core_events=json.load(json_core_data) + if(download_flag == True): + with open(core_path.split('/')[-1],'w') as outfile: + json.dump(core_events, outfile, sort_keys=True, indent=4) + else: + print ('no core event found for %s CPU, program abort...' % (output)) + exit(-1) + + if offcore_path != '': + json_offcore_data=urllib2.urlopen('https://download.01.org/perfmon'+offcore_path) + offcore_events=json.load(json_offcore_data) + if(download_flag == True): + with open(offcore_path.split('/')[-1],'w') as outfile: + json.dump(offcore_events, outfile, sort_keys=True, indent=4) +else: + core_events=json.load(open(filename)) + +if all_flag == True: + for event in core_events+offcore_events: + if event.has_key('EventName') and event.has_key('BriefDescription'): + print (event['EventName']+':'+event['BriefDescription']) + sys.exit(0) + +name=raw_input("Event to query (empty enter to quit):") +while(name != ''): + for event in core_events+offcore_events: + if event.has_key('EventName') and name.lower() in event['EventName'].lower(): + print (event['EventName']+':'+event['BriefDescription']) + for ev_code in event['EventCode'].split(', '): + print ('cpu/umask=%s,event=%s,name=%s%s%s%s%s/' % ( + event['UMask'], ev_code, event['EventName'], + (',offcore_rsp=%s' % (event['MSRValue'])) if event['MSRValue'] != '0' else '', + (',inv=%s' % (event['Invert'])) if event['Invert'] != '0' else '', + (',any=%s' % (event['AnyThread'])) if event['AnyThread'] != '0' else '', + (',edge') if event['EdgeDetect'] != '0' else '')) + name=raw_input("Event to query (empty enter to quit):") diff --git a/types.h b/types.h index 25ef460..a185969 100644 --- a/types.h +++ b/types.h @@ -21,9 +21,6 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND \brief Internal type and constant definitions */ -// compile for Windows 7 or Windows Server 2008 R2 (processor group support needed for systems with high core count) -#define COMPILE_FOR_WINDOWS_7 - #undef PCM_DEBUG #include @@ -102,6 +99,20 @@ typedef signed int int32; #define MEM_LOAD_UOPS_RETIRED_L2_HIT_EVTNR (0xD1) #define MEM_LOAD_UOPS_RETIRED_L2_HIT_UMASK (0x02) +// Skylake on-core events + +#define SKL_MEM_LOAD_RETIRED_L3_MISS_EVTNR (0xD1) +#define SKL_MEM_LOAD_RETIRED_L3_MISS_UMASK (0x20) + +#define SKL_MEM_LOAD_RETIRED_L3_HIT_EVTNR (0xD1) +#define SKL_MEM_LOAD_RETIRED_L3_HIT_UMASK (0x04) + +#define SKL_MEM_LOAD_RETIRED_L2_MISS_EVTNR (0xD1) +#define SKL_MEM_LOAD_RETIRED_L2_MISS_UMASK (0x10) + +#define SKL_MEM_LOAD_RETIRED_L2_HIT_EVTNR (0xD1) +#define SKL_MEM_LOAD_RETIRED_L2_HIT_UMASK (0x02) + // architectural on-core events #define ARCH_LLC_REFERENCE_EVTNR (0x2E) diff --git a/utils.cpp b/utils.cpp index db47cd3..70d85e4 100644 --- a/utils.cpp +++ b/utils.cpp @@ -17,10 +17,8 @@ CT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT // written by Andrey Semin #include -#include -#include -#include #include +#include #ifdef _MSC_VER #include #include @@ -47,17 +45,16 @@ void exit_cleanup(void) #ifdef _MSC_VER -#ifdef COMPILE_FOR_WINDOWS_7 ThreadGroupTempAffinity::ThreadGroupTempAffinity(uint32 core_id) { GROUP_AFFINITY NewGroupAffinity; memset(&NewGroupAffinity, 0, sizeof(GROUP_AFFINITY)); memset(&PreviousGroupAffinity, 0, sizeof(GROUP_AFFINITY)); - uint32 currentGroupSize = 0; + DWORD currentGroupSize = 0; - while (core_id >= (currentGroupSize = GetMaximumProcessorCount(NewGroupAffinity.Group))) + while ((DWORD)core_id >= (currentGroupSize = GetActiveProcessorCount(NewGroupAffinity.Group))) { - core_id -= currentGroupSize; + core_id -= (uint32)currentGroupSize; ++NewGroupAffinity.Group; } NewGroupAffinity.Mask = 1ULL << core_id; @@ -67,7 +64,6 @@ ThreadGroupTempAffinity::~ThreadGroupTempAffinity() { SetThreadGroupAffinity(GetCurrentThread(),&PreviousGroupAffinity,NULL); } -#endif LONG unhandled_exception_handler(LPEXCEPTION_POINTERS p) { @@ -235,8 +231,10 @@ void set_signal_handlers(void) // to fix Cygwin/BASH setting Ctrl+C handler need first to restore the default one handlerStatus = SetConsoleCtrlHandler(NULL, FALSE); // restores normal processing of CTRL+C input if(handlerStatus == 0) { - _com_error error(GetLastError()); - std::wcerr << "Failed to set Ctrl+C hanlder. Error code: " << GetLastError() << " " << error.ErrorMessage() << std::endl; + std::wcerr << "Failed to set Ctrl+C hanlder. Error code: " << GetLastError() << " "; + const TCHAR * errorStr = _com_error(GetLastError()).ErrorMessage(); + if (errorStr) std::wcerr << errorStr; + std::wcerr << std::endl; _exit(EXIT_FAILURE); } SetConsoleCtrlHandler((PHANDLER_ROUTINE)sigINT_handler, TRUE); @@ -256,7 +254,7 @@ void set_signal_handlers(void) sigaction(SIGTERM, &saINT, NULL); sigaction(SIGSEGV, &saINT, NULL); - saINT.sa_flags = SA_RESTART || SA_NOCLDSTOP; + saINT.sa_flags = SA_RESTART | SA_NOCLDSTOP; sigaction(SIGCHLD, &saINT, NULL); // get there is our child exits. do nothing if it stoped/continued // install SIGHUP handler to restart diff --git a/utils.h b/utils.h index 5685682..6a6ccc6 100644 --- a/utils.h +++ b/utils.h @@ -56,7 +56,7 @@ inline void win_usleep(int delay_us) QueryPerformanceCounter((LARGE_INTEGER *) &t1); do { QueryPerformanceCounter((LARGE_INTEGER *) &t2); - YieldProcessor(); + _mm_pause(); } while ((t2-t1) < wait_tick); } #endif @@ -73,7 +73,7 @@ inline void MySleep(int delay) inline void MySleepMs(int delay_ms) { #ifdef _MSC_VER - if(delay_ms) Sleep(delay_ms); + if(delay_ms) Sleep((DWORD)delay_ms); #else struct timespec sleep_intrval; double complete_seconds; @@ -95,10 +95,17 @@ inline void MySleepUs(int delay_us) void MySystem(char * sysCmd, char ** argc); +#ifdef _MSC_VER +#pragma warning (disable : 4068 ) // disable unknown pragma warning +#endif + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Woverloaded-virtual" struct null_stream : public std::streambuf { void overflow(char) { } }; +#pragma clang diagnostic pop template inline std::string unit_format(IntType n) @@ -137,7 +144,6 @@ inline std::string unit_format(IntType n) pcm_compile_assert_failed pcm_compile_assert_failed_; \ PCM_UNUSED(pcm_compile_assert_failed_); -#ifdef COMPILE_FOR_WINDOWS_7 #ifdef _MSC_VER class ThreadGroupTempAffinity { @@ -145,12 +151,12 @@ class ThreadGroupTempAffinity ThreadGroupTempAffinity(); // forbidden ThreadGroupTempAffinity(const ThreadGroupTempAffinity &); // forbidden + ThreadGroupTempAffinity& operator = (const ThreadGroupTempAffinity &); // forbidden public: ThreadGroupTempAffinity(uint32 core_id); ~ThreadGroupTempAffinity(); }; #endif -#endif #endif diff --git a/width_extender.h b/width_extender.h index 2775ccc..cd519e9 100644 --- a/width_extender.h +++ b/width_extender.h @@ -29,6 +29,8 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include #include "cpucounters.h" #include "client_bw.h" +#include "mutex.h" +#include #ifdef _MSC_VER DWORD WINAPI WatchDogProc(LPVOID state); @@ -47,9 +49,9 @@ class CounterWidthExtender struct MsrHandleCounter : public AbstractRawCounter { - SafeMsrHandle * msr; + std::shared_ptr msr; uint64 msr_addr; - MsrHandleCounter(SafeMsrHandle * msr_, uint64 msr_addr_): msr(msr_), msr_addr(msr_addr_) {} + MsrHandleCounter(std::shared_ptr msr_, uint64 msr_addr_) : msr(msr_), msr_addr(msr_addr_) {} uint64 operator() () { uint64 value = 0; @@ -60,51 +62,78 @@ class CounterWidthExtender struct ClientImcReadsCounter : public AbstractRawCounter { - ClientBW * clientBW; - ClientImcReadsCounter(ClientBW * clientBW_): clientBW(clientBW_) {} + std::shared_ptr clientBW; + ClientImcReadsCounter(std::shared_ptr clientBW_) : clientBW(clientBW_) {} uint64 operator() () { return clientBW->getImcReads(); } }; struct ClientImcWritesCounter : public AbstractRawCounter { - ClientBW * clientBW; - ClientImcWritesCounter(ClientBW * clientBW_): clientBW(clientBW_) {} + std::shared_ptr clientBW; + ClientImcWritesCounter(std::shared_ptr clientBW_) : clientBW(clientBW_) {} uint64 operator() () { return clientBW->getImcWrites(); } }; struct ClientIoRequestsCounter : public AbstractRawCounter { - ClientBW * clientBW; - ClientIoRequestsCounter(ClientBW * clientBW_): clientBW(clientBW_) {} + std::shared_ptr clientBW; + ClientIoRequestsCounter(std::shared_ptr clientBW_) : clientBW(clientBW_) {} uint64 operator() () { return clientBW->getIoRequests(); } }; + struct MBLCounter : public AbstractRawCounter + { + std::shared_ptr msr; + MBLCounter(std::shared_ptr msr_) : msr(msr_){} + uint64 operator() () + { + msr->lock(); + msr->write(IA32_QM_EVTSEL, 0xdead); // TODO: change 0xdead to MBL event value + uint64 value = 0; + msr->read(IA32_PQR_ASSOC, &value); + msr->unlock(); + return value; + } + }; + + struct MBRCounter : public AbstractRawCounter + { + SafeMsrHandle * msr; + MBRCounter(SafeMsrHandle * msr_) : msr(msr_){} + uint64 operator() () + { + msr->lock(); + msr->write(IA32_QM_EVTSEL, 0xdead); // TODO: change 0xdead to MBR event value + uint64 value = 0; + msr->read(IA32_PQR_ASSOC, &value); + msr->unlock(); + return value; + } + }; + private: #ifdef _MSC_VER HANDLE UpdateThread; - HANDLE CounterMutex; #else pthread_t UpdateThread; - pthread_mutex_t CounterMutex; #endif + PCM_Util::Mutex CounterMutex; + AbstractRawCounter * raw_counter; uint64 extended_value; uint64 last_raw_value; CounterWidthExtender(); // forbidden CounterWidthExtender(CounterWidthExtender&); // forbidden + CounterWidthExtender & operator = (const CounterWidthExtender &); // forbidden uint64 internal_read() { - if (this==NULL) return 0; // to make security check happy uint64 result = 0, new_raw_value = 0; -#ifdef _MSC_VER - WaitForSingleObject(CounterMutex,INFINITE); -#else - pthread_mutex_lock(&CounterMutex); -#endif + CounterMutex.lock(); + new_raw_value = (*raw_counter)(); if(new_raw_value < last_raw_value) { @@ -118,11 +147,8 @@ class CounterWidthExtender last_raw_value = new_raw_value; result = extended_value; -#ifdef _MSC_VER - ReleaseMutex(CounterMutex); -#else - pthread_mutex_unlock(&CounterMutex); -#endif + + CounterMutex.unlock(); return result; } @@ -133,22 +159,18 @@ class CounterWidthExtender extended_value = last_raw_value; #ifdef _MSC_VER - CounterMutex = CreateMutex(NULL,FALSE,NULL); UpdateThread = CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)WatchDogProc,this,0,NULL); #else - pthread_mutex_init(&CounterMutex, NULL); pthread_create(&UpdateThread, NULL, WatchDogProc, this); #endif } - ~CounterWidthExtender() + virtual ~CounterWidthExtender() { #ifdef _MSC_VER TerminateThread(UpdateThread,0); CloseHandle(UpdateThread); - CloseHandle(CounterMutex); #else pthread_cancel(UpdateThread); - pthread_mutex_destroy(&CounterMutex); #endif if(raw_counter) delete raw_counter; } diff --git a/winpmem/winpmem.cpp b/winpmem/winpmem.cpp index cfd00a9..aa499e6 100644 --- a/winpmem/winpmem.cpp +++ b/winpmem/winpmem.cpp @@ -45,7 +45,7 @@ unsigned int WinPmem::read32(__int64 start) DWORD bytes_read = 0; large_start.QuadPart = start; - if(0xFFFFFFFF == SetFilePointer(fd_, large_start.LowPart, + if(0xFFFFFFFF == SetFilePointer(fd_, (LONG)large_start.LowPart, &large_start.HighPart, FILE_BEGIN)) { LogError(TEXT("Failed to seek in the pmem device.\n")); @@ -54,7 +54,7 @@ unsigned int WinPmem::read32(__int64 start) unsigned int result = 0; - if(!ReadFile(fd_, &result, sizeof(unsigned int), &bytes_read, NULL)) + if(!ReadFile(fd_, &result, (DWORD)sizeof(unsigned int), &bytes_read, NULL)) { LogError(TEXT("Failed to Read memory.")); goto error; @@ -79,12 +79,10 @@ int WinPmem::set_acquisition_mode(__int32 mode) { }; WinPmem::WinPmem(): - fd_(INVALID_HANDLE_VALUE), - buffer_size_(1024*1024), - buffer_(NULL), suppress_output(FALSE), + fd_(INVALID_HANDLE_VALUE), + out_fd_(INVALID_HANDLE_VALUE), service_name(PMEM_SERVICE_NAME) { - buffer_ = new char[buffer_size_]; _tcscpy_s(last_error, TEXT("")); max_physical_memory_ = 0; } @@ -92,10 +90,6 @@ WinPmem::WinPmem(): WinPmem::~WinPmem() { if (fd_ != INVALID_HANDLE_VALUE) { CloseHandle(fd_); - }; - - if (buffer_) { - delete [] buffer_; } } @@ -174,9 +168,9 @@ int WinPmem::install_driver(bool delete_driver) { if(fd_ == INVALID_HANDLE_VALUE) { LogError(TEXT("Can not open raw device.")); status = -1; - }; - - status = 1; + } + else + status = 1; service_error: CloseServiceHandle(service); @@ -206,8 +200,5 @@ int WinPmem::uninstall_driver() { Log(TEXT("Driver Unloaded.\n")); return 1; - - CloseServiceHandle(scm); - return 0; } diff --git a/winpmem/winpmem.h b/winpmem/winpmem.h index 4f06aa1..93fb89a 100644 --- a/winpmem/winpmem.h +++ b/winpmem/winpmem.h @@ -37,8 +37,6 @@ class WinPmem { // The file handle to the image file. HANDLE out_fd_; TCHAR *service_name; - char *buffer_; - size_t buffer_size_; TCHAR driver_filename[MAX_PATH]; // This is the maximum size of memory calculated.