6 #ifndef CPUCOUNTERS_HEADER 7 #define CPUCOUNTERS_HEADER 21 #undef PCM_HA_REQUESTS_READS_ONLY 22 #undef PCM_DEBUG_TOPOLOGY // debug of topology enumeration routine 23 #undef PCM_UNCORE_PMON_BOX_CHECK_STATUS // debug only 30 #include "exceptions/unsupported_processor_exception.hpp" 38 #include <unordered_map> 43 #include <linux/perf_event.h> 45 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9) 50 #include <semaphore.h> 51 #include <sys/types.h> 54 #include <sys/syscall.h> 69 void PCM_API restrictDriverAccess(LPCTSTR path);
// Forward declarations: only the class names are introduced here so that
// later declarations can refer to them; the class definitions themselves are
// not part of this section of the header.
72 class SystemCounterState;
73 class SocketCounterState;
74 class CoreCounterState;
75 class BasicCounterState;
76 class ServerUncoreCounterState;
94 int32 native_cpu_model = -1;
101 CoreType core_type = Invalid;
103 TopologyEntry() : os_id(-1), thread_id (-1), core_id(-1), tile_id(-1), socket(-1) { }
104 const char* getCoreTypeStr()
122 virtual void operator = (uint64 val) = 0;
123 virtual operator uint64 () = 0;
129 std::shared_ptr<PciHandleType> handle;
132 PCICFGRegister64(
const std::shared_ptr<PciHandleType> & handle_,
size_t offset_) :
137 void operator = (uint64 val)
override 141 handle->write32(offset, cvt.ui32.low);
142 handle->write32(offset +
sizeof(uint32), cvt.ui32.high);
144 operator uint64 ()
override 147 handle->read64(offset, &result);
154 std::shared_ptr<PciHandleType> handle;
157 PCICFGRegister32(
const std::shared_ptr<PciHandleType> & handle_,
size_t offset_) :
162 void operator = (uint64 val)
override 164 handle->write32(offset, (uint32)val);
166 operator uint64 ()
override 169 handle->read32(offset, &result);
176 std::shared_ptr<MMIORange> handle;
179 MMIORegister64(
const std::shared_ptr<MMIORange> & handle_,
size_t offset_) :
184 void operator = (uint64 val)
override 186 handle->write64(offset, val);
188 operator uint64 ()
override 190 return handle->read64(offset);
196 std::shared_ptr<MMIORange> handle;
199 MMIORegister32(
const std::shared_ptr<MMIORange> & handle_,
size_t offset_) :
204 void operator = (uint64 val)
override 206 handle->write32(offset, (uint32)val);
208 operator uint64 ()
override 210 return (uint64)handle->read32(offset);
216 std::shared_ptr<SafeMsrHandle> handle;
219 MSRRegister(
const std::shared_ptr<SafeMsrHandle> & handle_,
size_t offset_) :
224 void operator = (uint64 val)
override 226 handle->write(offset, val);
228 operator uint64 ()
override 231 handle->read(offset, &value);
238 std::shared_ptr<CounterWidthExtender> handle;
244 void operator = (uint64 val)
override 252 std::cerr <<
"ERROR: writing non-zero values to CounterWidthExtenderRegister is not supported\n";
253 throw std::exception();
256 operator uint64 ()
override 258 return handle->read();;
264 typedef std::shared_ptr<HWRegister> HWRegisterPtr;
265 HWRegisterPtr unitControl;
267 HWRegisterPtr counterControl[4];
268 HWRegisterPtr counterValue[4];
269 HWRegisterPtr fixedCounterControl;
270 HWRegisterPtr fixedCounterValue;
271 HWRegisterPtr filter[2];
273 UncorePMU(
const HWRegisterPtr & unitControl_,
274 const HWRegisterPtr & counterControl0,
275 const HWRegisterPtr & counterControl1,
276 const HWRegisterPtr & counterControl2,
277 const HWRegisterPtr & counterControl3,
278 const HWRegisterPtr & counterValue0,
279 const HWRegisterPtr & counterValue1,
280 const HWRegisterPtr & counterValue2,
281 const HWRegisterPtr & counterValue3,
282 const HWRegisterPtr & fixedCounterControl_ = HWRegisterPtr(),
283 const HWRegisterPtr & fixedCounterValue_ = HWRegisterPtr(),
284 const HWRegisterPtr & filter0 = HWRegisterPtr(),
285 const HWRegisterPtr & filter1 = HWRegisterPtr()
287 unitControl(unitControl_),
288 counterControl{ counterControl0, counterControl1, counterControl2, counterControl3 },
289 counterValue{ counterValue0, counterValue1, counterValue2, counterValue3 },
290 fixedCounterControl(fixedCounterControl_),
291 fixedCounterValue(fixedCounterValue_),
292 filter{ filter0 , filter1 }
299 return unitControl.get() !=
nullptr;
301 void writeUnitControl(
const uint32 value)
303 *unitControl = value;
306 void freeze(
const uint32 extra);
307 bool initFreeze(
const uint32 extra,
const char* xPICheckMsg =
nullptr);
308 void unfreeze(
const uint32 extra);
309 void resetUnfreeze(
const uint32 extra);
312 enum ServerUncoreMemoryMetrics
324 int32 iMCbus,UPIbus,M2Mbus;
327 typedef std::vector<UncorePMU> UncorePMUVector;
328 UncorePMUVector imcPMUs;
329 UncorePMUVector edcPMUs;
330 UncorePMUVector xpiPMUs;
331 UncorePMUVector m3upiPMUs;
332 UncorePMUVector m2mPMUs;
333 UncorePMUVector haPMUs;
334 std::vector<UncorePMUVector*> allPMUs{ &imcPMUs, &edcPMUs, &xpiPMUs, &m3upiPMUs , &m2mPMUs, &haPMUs };
335 std::vector<uint64> qpi_speed;
336 std::vector<uint32> num_imc_channels;
337 std::vector<std::pair<uint32, uint32> > XPIRegisterLocation;
338 std::vector<std::pair<uint32, uint32> > M3UPIRegisterLocation;
339 std::vector<std::vector< std::pair<uint32, uint32> > > MCRegisterLocation;
340 std::vector<std::pair<uint32, uint32> > EDCRegisterLocation;
341 std::vector<std::pair<uint32, uint32> > M2MRegisterLocation;
342 std::vector<std::pair<uint32, uint32> > HARegisterLocation;
344 static std::vector<std::pair<uint32, uint32> > socket2iMCbus;
345 static std::vector<std::pair<uint32, uint32> > socket2UPIbus;
346 static std::vector<std::pair<uint32, uint32> > socket2M2Mbus;
// Helper declarations only; none of the definitions are visible in this
// section, so the notes below describe the signatures, not the behaviour.

// Returns a raw PciHandleType pointer. checkVendor defaults to false.
// NOTE(review): presumably (groupnr, bus, dev, func) address a PCI function;
// who owns the returned pointer is not visible here - confirm at the definition.
351 PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func,
bool checkVendor =
false);
// Per-unit programming entry points. Each takes a pointer to read-only
// configuration words (counter configs, going by the parameter names).
// Note that programM2M takes 64-bit words while the others take 32-bit words.
352 void programIMC(
const uint32 * MCCntConfig);
353 void programEDC(
const uint32 * EDCCntConfig);
354 void programM2M(
const uint64 * M2MCntConfig);
356 void programHA(
const uint32 * config);
358 void programXPI(
const uint32 * XPICntConfig);
359 void programM3UPI(
const uint32* M3UPICntConfig);
// MemTestParam pairs a size_t with a vector of uint64 pointers.
// initMemTest receives it by mutable reference; doMemTest and cleanupMemTest
// receive it by const reference.
// NOTE(review): the meaning of the size_t and of the pointers is defined by
// the mem-test helpers, which are not visible here.
360 typedef std::pair<size_t, std::vector<uint64 *> > MemTestParam;
361 void initMemTest(MemTestParam & param);
362 void doMemTest(
const MemTestParam & param);
363 void cleanupMemTest(
const MemTestParam & param);
364 void cleanupQPIHandles();
// Takes a single 32-bit value (presumably applied to every PMU's unit control
// register, going by the name - confirm at the definition).
366 void writeAllUnitControl(
const uint32 value);
// Initialisation helpers. The socket-specific ones take the socket number and
// a pointer to a const PCM; initRegisterLocations takes only the PCM pointer.
367 void initDirect(uint32 socket_,
const PCM *
pcm);
368 void initPerf(uint32 socket_,
const PCM * pcm);
369 void initBuses(uint32 socket_,
const PCM * pcm);
370 void initRegisterLocations(
const PCM * pcm);
// Returns a 64-bit value for the given PMU vector, selected by `id` and
// `counter`. The vector is taken by non-const reference.
// NOTE(review): presumably `id` indexes into `pmu` and `counter` selects one
// of that unit's counters - confirm at the definition.
371 uint64 getPMUCounter(std::vector<UncorePMU> & pmu,
const uint32
id,
const uint32
counter);
396 uint64 getImcReads();
399 uint64 getImcReadsForController(uint32 controller);
403 uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel);
405 uint64 getImcWrites();
407 uint64 getHALocalRequests();
409 uint64 getHARequests();
412 uint64 getPMMReads();
414 uint64 getPMMWrites();
417 uint64 getEdcReads();
419 uint64 getEdcWrites();
423 uint64 getIncomingDataFlits(uint32 port);
427 uint64 getOutgoingFlits(uint32 port);
433 void program_power_metrics(
int mc_profile);
439 void programServerUncoreMemoryMetrics(
const ServerUncoreMemoryMetrics & metrics,
const int rankA = -1,
const int rankB = -1);
450 uint64 getUPIL0TxCycles(uint32 port);
471 uint64 getQPILLCounter(uint32 port, uint32 counter);
482 void freezeCounters();
484 void unfreezeCounters();
487 uint64 computeQPISpeed(
const uint32 ref_core,
const int cpumodel);
490 void enableJKTWorkaround(
bool enable);
498 return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
502 void reportQPISpeed()
const;
505 uint32
getNumMC()
const {
return (uint32)num_imc_channels.size(); }
512 size_t getNumMCChannels(
const uint32 controller)
const;
533 typedef std::vector<uint64> eventGroup_t;
535 class PerfVirtualControlRegister;
549 friend class PerfVirtualControlRegister;
553 PCM(
const PCM &) =
delete;
554 PCM & operator = (
const PCM &) =
delete;
560 int64 cpu_microcode_level;
562 int32 threads_per_core;
565 int32 num_phys_cores_per_socket;
566 int32 num_online_cores;
567 int32 num_online_sockets;
568 uint32 core_gen_counter_num_max;
569 uint32 core_gen_counter_num_used;
570 uint32 core_gen_counter_width;
571 uint32 core_fixed_counter_num_max;
572 uint32 core_fixed_counter_num_used;
573 uint32 core_fixed_counter_width;
574 uint64 core_global_ctrl_value{0ULL};
575 uint32 uncore_gen_counter_num_max;
576 uint32 uncore_gen_counter_num_used;
577 uint32 uncore_gen_counter_width;
578 uint32 uncore_fixed_counter_num_max;
579 uint32 uncore_fixed_counter_num_used;
580 uint32 uncore_fixed_counter_width;
581 uint32 perfmon_version;
582 int32 perfmon_config_anythread;
583 uint64 nominal_frequency;
584 uint64 max_qpi_speed;
585 uint32 L3ScalingFactor;
586 int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
588 std::vector<TopologyEntry> topology;
590 std::string errorMessage;
592 static PCM * instance;
593 bool programmed_core_pmu{
false};
594 std::vector<std::shared_ptr<SafeMsrHandle> > MSR;
595 std::vector<std::shared_ptr<ServerPCICFGUncore> > server_pcicfg_uncore;
596 std::vector<UncorePMU> pcuPMUs;
597 std::vector<std::map<int32, UncorePMU> > iioPMUs;
598 std::vector<std::map<int32, UncorePMU> > irpPMUs;
599 std::vector<UncorePMU> uboxPMUs;
600 double joulesPerEnergyUnit;
601 std::vector<std::shared_ptr<CounterWidthExtender> > energy_status;
602 std::vector<std::shared_ptr<CounterWidthExtender> > dram_energy_status;
603 std::vector<std::vector<UncorePMU> > cboPMUs;
605 std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_local;
606 std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_total;
612 std::shared_ptr<FreeRunningBWCounters> clientBW;
613 std::shared_ptr<CounterWidthExtender> clientImcReads;
614 std::shared_ptr<CounterWidthExtender> clientImcWrites;
615 std::shared_ptr<CounterWidthExtender> clientGtRequests;
616 std::shared_ptr<CounterWidthExtender> clientIaRequests;
617 std::shared_ptr<CounterWidthExtender> clientIoRequests;
619 std::vector<std::shared_ptr<ServerBW> > serverBW;
621 bool disable_JKT_workaround;
624 uint64 * coreCStateMsr;
625 uint64 * pkgCStateMsr;
627 std::vector<std::shared_ptr<CoreTaskQueue> > coreTaskQueues;
629 bool L2CacheHitRatioAvailable;
630 bool L3CacheHitRatioAvailable;
631 bool L3CacheMissesAvailable;
632 bool L2CacheMissesAvailable;
633 bool L2CacheHitsAvailable;
634 bool L3CacheHitsNoSnoopAvailable;
635 bool L3CacheHitsSnoopAvailable;
636 bool L3CacheHitsAvailable;
638 bool forceRTMAbortMode;
640 std::vector<uint64> FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
641 bool isFixedCounterSupported(
unsigned c);
643 bool linux_arch_perfmon =
false;
646 enum { MAX_C_STATE = 10 };
651 if (state == 0 || state == 1)
654 return (coreCStateMsr != NULL && state <= ((
int)MAX_C_STATE) && coreCStateMsr[state] != 0);
664 return (pkgCStateMsr != NULL && state <= ((
int)MAX_C_STATE) && pkgCStateMsr[state] != 0);
668 static void setOutput(
const std::string filename,
const bool cerrToo =
false);
671 void restoreOutput();
685 bool isBlocked(
void) {
return blocked; }
686 void setBlocked(
const bool new_blocked) { blocked = new_blocked; }
691 CUSTOM_CORE_EVENTS = 1,
692 EXT_CUSTOM_CORE_EVENTS = 2,
745 enum SkylakeIIOStacks {
746 SKX_IIO_CBDMA_DMI = 0,
752 SKX_IIO_STACK_COUNT = 6
756 enum IcelakeIIOStacks {
762 ICX_IIO_CBDMA_DMI = 5,
763 ICX_IIO_STACK_COUNT = 6
767 enum SnowridgeIIOStacks {
769 SNR_IIO_CBDMA_DMI = 1,
773 SNR_IIO_STACK_COUNT = 5
778 enum PCIeWidthMode width;
779 std::string pciDevName;
780 std::string busNumber;
794 int32 event_number = 0, umask_value = 0;
812 uint64 OffcoreResponseMsrValue[2];
813 uint64 LoadLatencyMsrValue, FrontendMsrValue;
814 bool defaultUncoreProgramming{
true};
815 static uint64 invalidMsrValue() {
return ~0ULL; }
816 ExtendedCustomCoreEventDescription() : fixedCfg(NULL), nGPCounters(0), gpCounterCfg(
nullptr), gpCounterHybridAtomCfg(
nullptr), LoadLatencyMsrValue(invalidMsrValue()), FrontendMsrValue(invalidMsrValue())
818 OffcoreResponseMsrValue[0] = 0;
819 OffcoreResponseMsrValue[1] = 0;
826 std::string eventNames[4];
832 enum MSREventPosition
848 std::vector<int32> socketRefCore;
852 typedef std::vector<std::vector<int> > PerfEventHandleContainer;
853 PerfEventHandleContainer perfEventHandle;
854 std::vector<PerfEventHandleContainer> perfEventTaskHandle;
855 void readPerfData(uint32 core, std::vector<uint64> &
data);
856 void closePerfHandles(
const bool silent =
false);
859 PERF_INST_RETIRED_POS = 0,
860 PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
861 PERF_CPU_CLK_UNHALTED_REF_POS = 2,
862 PERF_GEN_EVENT_0_POS = 3,
863 PERF_GEN_EVENT_1_POS = 4,
864 PERF_GEN_EVENT_2_POS = 5,
865 PERF_GEN_EVENT_3_POS = 6,
866 PERF_TOPDOWN_SLOTS_POS = PERF_GEN_EVENT_0_POS + PERF_MAX_CUSTOM_COUNTERS,
867 PERF_TOPDOWN_FRONTEND_POS = PERF_TOPDOWN_SLOTS_POS + 1,
868 PERF_TOPDOWN_BADSPEC_POS = PERF_TOPDOWN_SLOTS_POS + 2,
869 PERF_TOPDOWN_BACKEND_POS = PERF_TOPDOWN_SLOTS_POS + 3,
870 PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4
873 std::array<int, (PERF_TOPDOWN_RETIRING_POS + 1)> perfTopDownPos;
876 PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_POS,
877 PERF_TOPDOWN_GROUP_LEADER_COUNTER = PERF_TOPDOWN_SLOTS_POS
880 static std::ofstream * outfile;
881 static std::streambuf * backup_ofile;
882 static std::streambuf * backup_ofile_cerr;
885 bool needToRestoreNMIWatchdog;
887 std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters;
888 uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
890 std::vector<EventSelectRegister> & programmedCustomCounters,
const std::vector<int> & tids);
893 void cleanupPMU(
const bool silent =
false);
894 void cleanupRDT(
const bool silent =
false);
896 void computeQPISpeedBeckton(
int core_nr);
898 void computeNominalFrequency();
899 static bool isCPUModelSupported(
const int model_);
900 std::string getSupportedUarchCodenames()
const;
901 std::string getUnsupportedMessage()
const;
905 void initCStateSupportTables();
906 bool discoverSystemTopology();
907 void printSystemTopology()
const;
909 bool detectNominalFrequency();
910 void showSpecControlMSRs();
911 void initEnergyMonitoring();
912 void initUncoreObjects();
926 void initQOSevent(
const uint64 event,
const int32 core);
927 void programBecktonUncore(
int core);
928 void programNehalemEPUncore(
int core);
929 void enableJKTWorkaround(
bool enable);
930 template <
class CounterStateType>
931 void readAndAggregateMemoryBWCounters(
const uint32 core, CounterStateType & counterState);
932 template <
class CounterStateType>
933 void readAndAggregateUncoreMCCounters(
const uint32 socket, CounterStateType & counterState);
934 template <
class CounterStateType>
935 void readAndAggregateEnergyCounters(
const uint32 socket, CounterStateType & counterState);
936 template <
class CounterStateType>
937 void readPackageThermalHeadroom(
const uint32 socket, CounterStateType & counterState);
938 template <
class CounterStateType>
939 void readAndAggregatePackageCStateResidencies(std::shared_ptr<SafeMsrHandle> msr, CounterStateType & result);
943 template <
class CounterStateType>
944 void readMSRs(std::shared_ptr<SafeMsrHandle> msr,
const RawPMUConfig & msrConfig, CounterStateType & result);
946 void reportQPISpeed()
const;
947 void readCoreCounterConfig(
const bool complainAboutMSR =
false);
948 void readCPUMicrocodeLevel();
950 uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr)
const;
951 uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo)
const;
952 uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo)
const;
953 uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl)
const;
954 uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo)
const;
955 void programCboOpcodeFilter(
const uint32 opc0,
UncorePMU & pmu,
const uint32 nc_,
const uint32 opc1,
const uint32 loc,
const uint32 rem);
956 void initLLCReadMissLatencyEvents(uint64 * events, uint32 & opCode);
957 void initCHARequestEvents(uint64 * events);
959 uint64 getCBOCounterState(
const uint32 socket,
const uint32 ctr_);
960 template <
class Iterator>
961 static void program(
UncorePMU& pmu,
const Iterator& eventsBegin,
const Iterator& eventsEnd,
const uint32 extra)
963 if (!eventsBegin)
return;
964 Iterator curEvent = eventsBegin;
965 for (
int c = 0; curEvent != eventsEnd; ++c, ++curEvent)
967 auto ctrl = pmu.counterControl[c];
968 if (ctrl.get() !=
nullptr)
970 *ctrl = MC_CH_PCI_PMON_CTL_EN;
971 *ctrl = MC_CH_PCI_PMON_CTL_EN | *curEvent;
976 pmu.resetUnfreeze(extra);
979 void programPCU(uint32 * events,
const uint64 filter);
980 void programUBOX(
const uint64* events);
982 void cleanupUncorePMUs(
const bool silent =
false);
986 return (PCM::SKX == cpu_model) && (cpu_stepping > 4 && cpu_stepping < 8);
989 static bool isCPX(
int cpu_model_,
int cpu_stepping_)
991 return (PCM::SKX == cpu_model_) && (cpu_stepping_ >= 10);
996 return isCPX(cpu_model, cpu_stepping);
999 void initUncorePMUsDirect();
1000 void initUncorePMUsPerf();
1001 bool isRDTDisabled()
const;
1005 bool isHWTMAL1Supported()
const;
1015 bool isSecureBoot()
const;
1018 bool useLinuxPerfForUncore()
const;
1026 return *systemTopology;
1030 void printDetailedSystemTopology();
1037 bool QOSMetricAvailable()
const;
1043 bool L3QOSMetricAvailable()
const;
1049 bool L3CacheOccupancyMetricAvailable()
const;
1055 bool CoreLocalMemoryBWMetricAvailable()
const;
1061 bool CoreRemoteMemoryBWMetricAvailable()
const;
1067 unsigned getMaxRMID()
const;
1070 uint32 getMaxNumOfCBoxes()
const;
1073 uint32 getMaxNumOfIIOStacks()
const;
1083 static PCM * getInstance();
1100 return errorMessage;
1116 ErrorCode program(
const ProgramMode mode_ = DEFAULT_EVENTS,
const void * parameter_ = NULL,
const bool silent =
false,
const int pid = -1);
1134 ErrorCode programServerUncoreLatencyMetrics(
bool enable_pmm);
1149 ErrorCode programServerUncorePowerMetrics(
int mc_profile,
int pcu_profile,
int * freq_bands = NULL);
1163 ErrorCode programServerUncoreMemoryMetrics(
const ServerUncoreMemoryMetrics & metrics,
int rankA = -1,
int rankB = -1);
1168 typedef std::pair<std::array<uint64, 5>, std::string> RawEventConfig;
1171 std::vector<RawEventConfig> programmable;
1172 std::vector<RawEventConfig> fixed;
1180 typedef std::map<std::string, RawPMUConfig> RawPMUConfigs;
1181 ErrorCode program(
const RawPMUConfigs& curPMUConfigs,
const bool silent =
false,
const int pid = -1);
1183 std::pair<unsigned, unsigned> getOCREventNr(
const int event,
const unsigned coreID)
const 1185 assert (coreID < topology.size());
1191 if (topology[coreID].core_type == TopologyEntry::Atom)
1193 return std::make_pair(OFFCORE_RESPONSE_0_EVTNR, event + 1);
1198 bool useGLCOCREvent =
false;
1202 useGLCOCREvent =
true;
1208 return std::make_pair(useGLCOCREvent ? GLC_OFFCORE_RESPONSE_0_EVTNR : OFFCORE_RESPONSE_0_EVTNR, OFFCORE_RESPONSE_0_UMASK);
1210 return std::make_pair(useGLCOCREvent ? GLC_OFFCORE_RESPONSE_1_EVTNR : OFFCORE_RESPONSE_1_EVTNR, OFFCORE_RESPONSE_1_UMASK);
1212 assert (
false &&
"wrong event nr in getOCREventNr");
1213 return std::make_pair(0U, 0U);
1217 void freezeServerUncoreCounters();
1220 void unfreezeServerUncoreCounters();
1233 void cleanup(
const bool silent =
false);
1249 void getAllCounterStates(
SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates,
const bool readAndAggregateSocketUncoreCounters =
true);
1257 void getUncoreCounterStates(
SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates);
1263 bool isCoreOnline(int32 os_core_id)
const;
1269 bool isSocketOnline(int32 socket_id)
const;
1299 uint32 getNumCores()
const;
1304 uint32 getNumOnlineCores()
const;
1309 uint32 getNumSockets()
const;
1314 uint32 getNumOnlineSockets()
const;
1321 uint32 getThreadsPerCore()
const;
1326 bool getSMT()
const;
1331 uint64 getNominalFrequency()
const;
1337 uint32 getL3ScalingFactor()
const;
1344 bool isSomeCoreOfflined();
1348 int32 getMaxCustomCoreEvents();
1352 static int getCPUModelFromCPUID();
1381 BROADWELL_XEON_E3 = 71,
1401 END_OF_MODEL_LIST = 0x0ffff
1404 #define PCM_SKL_PATH_CASES \ 1415 bool useSKLPath()
const 1438 int32
getThreadId(uint32 os_id)
const {
return (int32)topology[os_id].thread_id; }
1443 int32
getCoreId(uint32 os_id)
const {
return (int32)topology[os_id].core_id; }
1448 int32
getTileId(uint32 os_id)
const {
return (int32)topology[os_id].tile_id; }
1453 int32
getSocketId(uint32 core_id)
const {
return (int32)topology[core_id].socket; }
1464 if (num_sockets == 2)
1478 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumQPIPorts()) : 0;
1503 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMC()) : 0;
1529 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMCChannels()) : 0;
1557 return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0;
1569 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumEDCChannels()) : 0;
1579 if (ICL == cpu_model || TGL == cpu_model || RKL == cpu_model)
return 5;
1614 std::cerr <<
"MaxIPC is not defined for your cpu model " << cpu_model <<
'\n';
1626 return 800000000ULL;
1631 return 1000000000ULL;
1635 return 1100000000ULL;
1667 return !isServerCPU();
1673 uint64 getTickCount(uint64 multiplier = 1000 , uint32 core = 0);
1675 uint64 getInvariantTSC_Fast(uint32 core = 0);
1685 return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed;
1700 #ifndef NO_WINRING // In cases where loading the WinRing0 driver is not desirable as a fallback to MSR.sys, add -DNO_WINRING to compile command to remove ability to load driver 1701 static bool initWinRing0Lib();
1704 #endif // NO_WINRING 1706 inline void disableJKTWorkaround() { disable_JKT_workaround =
true; }
1735 enum ChaPipelineQueue
1750 void programPCIeEventGroup(eventGroup_t &eventGroup);
1751 uint64 getPCIeCounterData(
const uint32 socket_,
const uint32 ctr_);
1760 void programCbo(
const uint64 * events,
const uint32 opCode = 0,
const uint32 nc_ = 0,
const uint32 llc_lookup_tid_filter = 0,
const uint32 loc = 1,
const uint32 rem = 1);
1766 void programCboRaw(
const uint64* events,
const uint64 filter0,
const uint64 filter1);
1771 PCIeCounterState getPCIeCounterState(
const uint32 socket_,
const uint32 ctr_ = 0);
1776 void programIIOCounters(uint64 rawEvents[4],
int IIOStack = -1);
1781 void programIRPCounters(uint64 rawEvents[4],
int IIOStack = -1);
1787 IIOCounterState getIIOCounterState(
int socket,
int IIOStack,
int counter);
1793 void getIIOCounterStates(
int socket,
int IIOStack, IIOCounterState * result);
1795 uint64 extractCoreGenCounterValue(uint64 val);
1796 uint64 extractCoreFixedCounterValue(uint64 val);
1797 uint64 extractUncoreGenCounterValue(uint64 val);
1798 uint64 extractUncoreFixedCounterValue(uint64 val);
1799 uint64 extractQOSMonitoring(uint64 val);
1803 const char * getUArchCodename(
const int32 cpu_model_ = -1)
const;
1806 static std::string getCPUBrandString();
1807 std::string getCPUFamilyModelString();
1811 void enableForceRTMAbortMode(
const bool silent =
false);
1814 bool isForceRTMAbortModeEnabled()
const;
1817 void disableForceRTMAbortMode(
const bool silent =
false);
1820 bool isForceRTMAbortModeAvailable()
const;
1828 return cpu_model_ == ATOM
1829 || cpu_model_ == ATOM_2
1830 || cpu_model_ == CENTERTON
1831 || cpu_model_ == BAYTRAIL
1832 || cpu_model_ == AVOTON
1833 || cpu_model_ == CHERRYTRAIL
1834 || cpu_model_ == APOLLO_LAKE
1835 || cpu_model_ == DENVERTON
1843 return isAtom(cpu_model);
1846 bool packageEnergyMetricsAvailable()
const 1849 cpu_model == PCM::JAKETOWN
1850 || cpu_model == PCM::IVYTOWN
1851 || cpu_model == PCM::SANDY_BRIDGE
1852 || cpu_model == PCM::IVY_BRIDGE
1853 || cpu_model == PCM::HASWELL
1854 || cpu_model == PCM::AVOTON
1855 || cpu_model == PCM::CHERRYTRAIL
1856 || cpu_model == PCM::BAYTRAIL
1857 || cpu_model == PCM::APOLLO_LAKE
1858 || cpu_model == PCM::DENVERTON
1859 || cpu_model == PCM::SNOWRIDGE
1860 || cpu_model == PCM::HASWELLX
1861 || cpu_model == PCM::BROADWELL
1862 || cpu_model == PCM::BDX_DE
1863 || cpu_model == PCM::BDX
1864 || cpu_model == PCM::KNL
1866 || cpu_model == PCM::SKX
1867 || cpu_model == PCM::ICX
1868 || cpu_model == PCM::ADL
1872 bool dramEnergyMetricsAvailable()
const 1875 cpu_model == PCM::JAKETOWN
1876 || cpu_model == PCM::IVYTOWN
1877 || cpu_model == PCM::HASWELLX
1878 || cpu_model == PCM::BDX_DE
1879 || cpu_model == PCM::BDX
1880 || cpu_model == PCM::KNL
1881 || cpu_model == PCM::SKX
1882 || cpu_model == PCM::ICX
1886 bool packageThermalMetricsAvailable()
const 1888 return packageEnergyMetricsAvailable();
1891 bool outgoingQPITrafficMetricsAvailable()
const 1893 return getQPILinksPerSocket() > 0 &&
1895 cpu_model == PCM::NEHALEM_EX
1896 || cpu_model == PCM::WESTMERE_EX
1897 || cpu_model == PCM::JAKETOWN
1898 || cpu_model == PCM::IVYTOWN
1899 || cpu_model == PCM::HASWELLX
1900 || cpu_model == PCM::BDX
1901 || cpu_model == PCM::SKX
1902 || cpu_model == PCM::ICX
1906 bool incomingQPITrafficMetricsAvailable()
const 1908 return getQPILinksPerSocket() > 0 &&
1910 cpu_model == PCM::NEHALEM_EX
1911 || cpu_model == PCM::WESTMERE_EX
1912 || cpu_model == PCM::JAKETOWN
1913 || cpu_model == PCM::IVYTOWN
1914 || (cpu_model == PCM::SKX && cpu_stepping > 1)
1915 || cpu_model == PCM::ICX
1919 bool localMemoryRequestRatioMetricAvailable()
const 1921 return cpu_model == PCM::HASWELLX
1922 || cpu_model == PCM::BDX
1923 || cpu_model == PCM::SKX
1924 || cpu_model == PCM::ICX
1928 bool qpiUtilizationMetricsAvailable()
const 1930 return outgoingQPITrafficMetricsAvailable();
1933 bool memoryTrafficMetricsAvailable()
const 1935 return (!(isAtom() || cpu_model == PCM::CLARKDALE))
1939 bool MCDRAMmemoryTrafficMetricsAvailable()
const 1941 return (cpu_model == PCM::KNL);
1944 bool memoryIOTrafficMetricAvailable()
const 1946 if (cpu_model == TGL)
return false;
1948 cpu_model == PCM::SANDY_BRIDGE
1949 || cpu_model == PCM::IVY_BRIDGE
1950 || cpu_model == PCM::HASWELL
1951 || cpu_model == PCM::BROADWELL
1956 bool IIOEventsAvailable()
const 1959 cpu_model == PCM::SKX
1960 || cpu_model == PCM::ICX
1961 || cpu_model == PCM::SNOWRIDGE
1965 bool uncoreFrequencyMetricAvailable()
const 1967 return MSR.empty() ==
false && uboxPMUs.size() == getNumSockets() && getNumCores() == getNumOnlineCores();
1970 bool LatencyMetricsAvailable()
const 1973 cpu_model == PCM::HASWELLX
1974 || cpu_model == PCM::BDX
1975 || cpu_model == PCM::SKX
1976 || cpu_model == PCM::ICX
1981 bool DDRLatencyMetricsAvailable()
const 1984 cpu_model == PCM::SKX
1985 || cpu_model == PCM::ICX
1989 bool PMMTrafficMetricsAvailable()
const 1994 || cpu_model == PCM::ICX
1995 || cpu_model == PCM::SNOWRIDGE
1999 bool LLCReadMissLatencyMetricsAvailable()
const 2002 HASWELLX == cpu_model
2003 || BDX_DE == cpu_model
2007 #ifdef PCM_ENABLE_LLCRDLAT_SKX_MP
2010 || ((SKX == cpu_model) && (num_sockets == 1))
2013 || SNOWRIDGE == cpu_model
2017 bool hasBecktonUncore()
const 2020 cpu_model == PCM::NEHALEM_EX
2021 || cpu_model == PCM::WESTMERE_EX
2024 bool hasPCICFGUncore()
const 2027 cpu_model == PCM::JAKETOWN
2028 || cpu_model == PCM::SNOWRIDGE
2029 || cpu_model == PCM::IVYTOWN
2030 || cpu_model == PCM::HASWELLX
2031 || cpu_model == PCM::BDX_DE
2032 || cpu_model == PCM::SKX
2033 || cpu_model == PCM::ICX
2034 || cpu_model == PCM::BDX
2035 || cpu_model == PCM::KNL
2039 bool isSkxCompatible()
const 2042 cpu_model == PCM::SKX
2046 static bool hasUPI(
const int32 cpu_model_)
2049 cpu_model_ == PCM::SKX
2050 || cpu_model_ == PCM::ICX
2056 return hasUPI(cpu_model);
2059 const char * xPI()
const 2070 cpu_model == PCM::SKX
2071 || cpu_model == PCM::ICX
2075 bool supportsHLE()
const;
2076 bool supportsRTM()
const;
2077 bool supportsRDTSCP()
const;
2079 bool useSkylakeEvents()
const 2082 || PCM::SKX == cpu_model
2083 || PCM::ICX == cpu_model
2087 bool hasClientMCCounters()
const 2089 return cpu_model == SANDY_BRIDGE
2090 || cpu_model == IVY_BRIDGE
2091 || cpu_model == HASWELL
2092 || cpu_model == BROADWELL
2097 static double getBytesPerFlit(int32 cpu_model_)
2099 if (hasUPI(cpu_model_))
2108 double getBytesPerFlit()
const 2110 return getBytesPerFlit(cpu_model);
2113 static double getDataBytesPerFlit(int32 cpu_model_)
2115 if (hasUPI(cpu_model_))
2124 double getDataBytesPerFlit()
const 2126 return getDataBytesPerFlit(cpu_model);
2129 static double getFlitsPerLinkCycle(int32 cpu_model_)
2131 if (hasUPI(cpu_model_))
2139 static double getBytesPerLinkCycle(int32 cpu_model_)
2141 return getBytesPerFlit(cpu_model_) * getFlitsPerLinkCycle(cpu_model_);
2144 double getBytesPerLinkCycle()
const 2146 return getBytesPerLinkCycle(cpu_model);
2149 static double getLinkTransfersPerLinkCycle()
2154 double getBytesPerLinkTransfer()
const 2156 return getBytesPerLinkCycle() / getLinkTransfersPerLinkCycle();
2163 #define PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(m) bool is##m() const { return m; } 2165 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitRatioAvailable)
2166 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)
2167 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheMissesAvailable)
2168 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheMissesAvailable)
2169 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitsAvailable)
2170 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
2171 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
2172 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
2174 #undef PCM_GEN_METRIC_AVAILABLE_FUNCTION 2176 bool isActiveRelativeFrequencyAvailable()
const 2191 template <
class CounterStateType>
2192 friend double getExecUsage(
const CounterStateType & before,
const CounterStateType & after);
2193 template <
class CounterStateType>
2194 friend double getIPC(
const CounterStateType & before,
const CounterStateType & after);
2195 template <
class CounterStateType>
2196 friend double getAverageFrequency(
const CounterStateType & before,
const CounterStateType & after);
2197 template <
class CounterStateType>
2198 friend double getAverageFrequencyFromClocks(
const int64 clocks,
const CounterStateType& before,
const CounterStateType& after);
2199 template <
class CounterStateType>
2201 template <
class CounterStateType>
2202 friend double getRelativeFrequency(
const CounterStateType & before,
const CounterStateType & after);
2203 template <
class CounterStateType>
2205 template <
class CounterStateType>
2206 friend double getL2CacheHitRatio(
const CounterStateType & before,
const CounterStateType & after);
2207 template <
class CounterStateType>
2208 friend double getL3CacheHitRatio(
const CounterStateType & before,
const CounterStateType & after);
2209 template <
class CounterStateType>
2210 friend uint64
getL3CacheMisses(
const CounterStateType & before,
const CounterStateType & after);
2211 template <
class CounterStateType>
2212 friend uint64
getL2CacheMisses(
const CounterStateType & before,
const CounterStateType & after);
2213 template <
class CounterStateType>
2214 friend uint64
getL2CacheHits(
const CounterStateType & before,
const CounterStateType & after);
2215 template <
class CounterStateType>
2216 friend uint64
getL3CacheHitsNoSnoop(
const CounterStateType & before,
const CounterStateType & after);
2217 template <
class CounterStateType>
2218 friend uint64
getL3CacheHitsSnoop(
const CounterStateType & before,
const CounterStateType & after);
2219 template <
class CounterStateType>
2220 friend uint64
getL3CacheHits(
const CounterStateType & before,
const CounterStateType & after);
2221 template <
class CounterStateType>
2223 template <
class CounterStateType>
2224 friend uint64
getLocalMemoryBW(
const CounterStateType & before,
const CounterStateType & after);
2225 template <
class CounterStateType>
2226 friend uint64
getRemoteMemoryBW(
const CounterStateType & before,
const CounterStateType & after);
2227 template <
class CounterStateType>
2228 friend uint64
getCycles(
const CounterStateType & before,
const CounterStateType & after);
2229 template <
class CounterStateType>
2231 template <
class CounterStateType>
2232 friend uint64 getCycles(
const CounterStateType & now);
2233 template <
class CounterStateType>
2234 friend uint64 getInstructionsRetired(
const CounterStateType & now);
2235 template <
class CounterStateType>
2236 friend uint64
getNumberOfCustomEvents(int32 eventCounterNr,
const CounterStateType & before,
const CounterStateType & after);
2237 template <
class CounterStateType>
2238 friend uint64
getInvariantTSC(
const CounterStateType & before,
const CounterStateType & after);
2239 template <
class CounterStateType>
2240 friend uint64
getRefCycles(
const CounterStateType & before,
const CounterStateType & after);
2241 template <
class CounterStateType>
2242 friend double getCoreCStateResidency(
int state,
const CounterStateType & before,
const CounterStateType & after);
2243 template <
class CounterStateType>
2244 friend uint64 getCoreCStateResidency(
int state,
const CounterStateType& now);
2245 template <
class CounterStateType>
2246 friend uint64
getSMICount(
const CounterStateType & before,
const CounterStateType & after);
2247 template <
class CounterStateType>
2248 friend uint64 getAllSlotsRaw(
const CounterStateType& before,
const CounterStateType& after);
2249 template <
class CounterStateType>
2250 friend uint64 getAllSlots(
const CounterStateType & before,
const CounterStateType & after);
2251 template <
class CounterStateType>
2252 friend double getBackendBound(
const CounterStateType & before,
const CounterStateType & after);
2253 template <
class CounterStateType>
2254 friend double getFrontendBound(
const CounterStateType & before,
const CounterStateType & after);
2255 template <
class CounterStateType>
2256 friend double getBadSpeculation(
const CounterStateType & before,
const CounterStateType & after);
2257 template <
class CounterStateType>
2258 friend double getRetiring(
const CounterStateType & before,
const CounterStateType & after);
2259 template <
class CounterStateType>
2260 friend uint64 getMSREvent(
const uint64 & index,
const PCM::MSRType & type,
const CounterStateType& before,
const CounterStateType& after);
2270 L3UnsharedHitPos = 1,
2277 uint64 InvariantTSC;
2278 uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2279 int32 ThermalHeadroom;
2281 uint64 MemoryBWLocal;
2282 uint64 MemoryBWTotal;
2284 uint64 FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
2285 std::unordered_map<uint64, uint64> MSRValues;
2290 ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM),
2295 FrontendBoundSlots(0),
2296 BadSpeculationSlots(0),
2297 BackendBoundSlots(0),
2301 std::fill(CStateResidency, CStateResidency + PCM::MAX_C_STATE + 1, 0);
2311 InstRetiredAny += o.InstRetiredAny;
2312 CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
2313 CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
2314 for (
int i = 0; i < PERF_MAX_CUSTOM_COUNTERS; ++i)
2316 Event[i] += o.Event[i];
2318 InvariantTSC += o.InvariantTSC;
2319 for (
int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2320 CStateResidency[i] += o.CStateResidency[i];
2322 L3Occupancy += o.L3Occupancy;
2323 MemoryBWLocal += o.MemoryBWLocal;
2324 MemoryBWTotal += o.MemoryBWTotal;
2325 SMICount += o.SMICount;
2328 FrontendBoundSlots += o.FrontendBoundSlots;
2329 BadSpeculationSlots += o.BadSpeculationSlots;
2330 BackendBoundSlots += o.BackendBoundSlots;
2331 RetiringSlots += o.RetiringSlots;
2332 AllSlotsRaw += o.AllSlotsRaw;
2334 assert(FrontendBoundSlots >= old.FrontendBoundSlots);
2335 assert(BadSpeculationSlots >= old.BadSpeculationSlots);
2336 assert(BackendBoundSlots >= old.BackendBoundSlots);
2337 assert(RetiringSlots >= old.RetiringSlots);
2341 void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2342 void readAndAggregateTSC(std::shared_ptr<SafeMsrHandle>);
// Reads the processor time-stamp counter and yields it as a 64-bit value.
// Two alternative implementations are visible in this fragment; the preprocessor
// conditions that choose between them are on listing lines that are not shown.
2348 inline uint64 RDTSC()
// Variant 1: use the __rdtsc() compiler intrinsic and widen the result to uint64.
2354 result =
static_cast<uint64
>(__rdtsc());
// Variant 2: issue the rdtsc instruction via inline assembly. The "=a" constraint
// places the A register into `low`, the "=d" constraint places the D register into `high`.
2358 uint32 high = 0, low = 0;
2359 asm volatile(
"rdtsc" :
"=a" (low),
"=d" (high));
// Combine the two 32-bit halves: `high` supplies bits 63..32, `low` bits 31..0.
2360 result = low + (uint64(high)<<32ULL);
// Reads the time-stamp counter through the RDTSCP path and yields a 64-bit value.
// Two alternative implementations are visible; the conditions selecting between them
// and the opening lines of the inline-assembly statement are not shown in this listing.
2366 inline uint64 RDTSCP()
// Variant 1: __rdtscp() intrinsic; `Aux` (declared on a line not shown) receives the
// auxiliary output of the intrinsic and is not used in the visible code.
2373 result = __rdtscp(&Aux);
// Variant 2: inline assembly. The visible tail copies EAX into operand 1 (`low`), writes
// both outputs through general-register constraints, and declares rax, rcx and rdx clobbered.
// NOTE(review): the instruction that fills operand 0 (`high`) is on an elided line — confirm.
2377 uint32 high = 0, low = 0;
2381 "mov %%eax, %1\n\t":
2382 "=r" (high),
"=r" (low) ::
"%rax",
"%rcx",
"%rdx");
// Combine the two 32-bit halves: `high` supplies bits 63..32, `low` bits 31..0.
2383 result = low + (uint64(high)<<32ULL);
// Reports the thermal headroom held by the later of two counter snapshots.
// The first snapshot parameter is unnamed and never read; it only keeps the
// (before, after) calling convention used by the other accessors.
// \param after  snapshot whose getThermalHeadroom() value is returned
// \return       after.getThermalHeadroom(), unchanged
2388 template <
class CounterStateType>
2389 int32 getThermalHeadroom(
const CounterStateType & ,
const CounterStateType & after)
2391 return after.getThermalHeadroom();
2400 template <
class CounterStateType>
2412 template <
class CounterStateType>
2423 template <
class CounterStateType>
2424 uint64
getDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after)
2426 const auto clk = after.DRAMClocks[channel] - before.DRAMClocks[channel];
2428 if (cpu_model == PCM::ICX || cpu_model == PCM::SNOWRIDGE)
// Returns the MCDRAM clock ticks accumulated on one channel between two snapshots.
// \param channel  index into the MCDRAMClocks array of each snapshot (no range check is visible here)
// \param before   snapshot taken at the start of the interval
// \param after    snapshot taken at the end of the interval
// \return         after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel]
2440 template <
class CounterStateType>
2441 uint64
getMCDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after)
2443 return after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel];
// Returns how far one memory-controller counter advanced between two snapshots.
// \param channel  first index into MCCounter
// \param counter  second index into MCCounter
// \param before   snapshot taken at the start of the interval
// \param after    snapshot taken at the end of the interval
// \return         after.MCCounter[channel][counter] - before.MCCounter[channel][counter]
// Neither index is range-checked in the visible code.
2453 template <
class CounterStateType>
2454 uint64
getMCCounter(uint32 channel, uint32
counter,
const CounterStateType & before,
const CounterStateType & after)
2456 return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
2465 template <
class CounterStateType>
2468 return after.M3UPICounter[port][counter] - before.M3UPICounter[port][counter];
2477 template <
class CounterStateType>
2480 return after.CBOCounter[cbo][counter] - before.CBOCounter[cbo][counter];
2488 template <
class CounterStateType>
2491 return after.UBOXCounter[counter] - before.UBOXCounter[counter];
2500 template <
class CounterStateType>
2503 return after.IIOCounter[stack][counter] - before.IIOCounter[stack][counter];
2512 template <
class CounterStateType>
2515 return after.IRPCounter[stack][counter] - before.IRPCounter[stack][counter];
2524 template <
class CounterStateType>
2527 return after.xPICounter[port][counter] - before.xPICounter[port][counter];
// Returns how far one M2M (Memory2Mesh) controller counter advanced between two snapshots.
// \param controller  first index into M2MCounter
// \param counter     second index into M2MCounter
// \param before      snapshot taken at the start of the interval
// \param after       snapshot taken at the end of the interval
// \return            after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter]
// Neither index is range-checked in the visible code.
2536 template <
class CounterStateType>
2537 uint64
getM2MCounter(uint32 controller, uint32
counter,
const CounterStateType & before,
const CounterStateType & after)
2539 return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter];
// Returns how far one EDC counter advanced between two snapshots.
// \param channel  first index into EDCCounter
// \param counter  second index into EDCCounter
// \param before   snapshot taken at the start of the interval
// \param after    snapshot taken at the end of the interval
// \return         after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter]
// NOTE(review): the listing skips from 2550 to 2553, so a statement between the
// signature and this return (possibly an availability guard) is not shown — confirm
// against the full source before relying on this being unconditional.
2549 template <
class CounterStateType>
2550 uint64
getEDCCounter(uint32 channel, uint32
counter,
const CounterStateType & before,
const CounterStateType & after)
2553 return after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter];
2562 template <
class CounterStateType>
2565 return after.PCUCounter[counter] - before.PCUCounter[counter];
2572 template <
class CounterStateType>
2573 uint64
getPCUClocks(
const CounterStateType & before,
const CounterStateType & after)
2582 template <
class CounterStateType>
2585 return after.PackageEnergyStatus - before.PackageEnergyStatus;
2592 template <
class CounterStateType>
2595 return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
// Returns the change of one free-running counter between two snapshots.
// \param counter  identifier used as the key into each snapshot's freeRunningCounter map
// \param before   snapshot taken at the start of the interval
// \param after    snapshot taken at the end of the interval
// \return         difference of the mapped values when BOTH snapshots contain the key
// NOTE(review): the value returned when either snapshot lacks the key is on listing
// lines that are not shown here — confirm against the full source.
2604 template <
class CounterStateType>
2605 int64
getFreeRunningCounter(
const typename CounterStateType::FreeRunningCounterID &
counter,
const CounterStateType & before,
const CounterStateType & after)
// Look the key up once per snapshot; find() avoids inserting a default entry.
2607 const auto beforeIt = before.freeRunningCounter.find(counter);
2608 const auto afterIt = after.freeRunningCounter.find(counter);
// Subtract only if the counter was captured in both snapshots.
2609 if (beforeIt != before.freeRunningCounter.end() &&
2610 afterIt != after.freeRunningCounter.end())
2612 return afterIt->second - beforeIt->second;
2622 template <
class CounterStateType>
2625 return after.UncClocks - before.UncClocks;
2632 template <
class CounterStateType>
2645 template <
class CounterStateType>
2650 double dram_joules_per_energy_unit = 0.;
2653 if (PCM::HASWELLX == cpu_model
2654 || PCM::BDX_DE == cpu_model
2655 || PCM::BDX == cpu_model
2656 || PCM::SKX == cpu_model
2657 || PCM::ICX == cpu_model
2658 || PCM::KNL == cpu_model
2665 dram_joules_per_energy_unit = 0.0000153;
2682 template <
class CounterStateType>
2683 friend uint64
getBytesReadFromMC(
const CounterStateType & before,
const CounterStateType & after);
2684 template <
class CounterStateType>
2685 friend uint64
getBytesWrittenToMC(
const CounterStateType & before,
const CounterStateType & after);
2686 template <
class CounterStateType>
2687 friend uint64
getBytesReadFromPMM(
const CounterStateType & before,
const CounterStateType & after);
2688 template <
class CounterStateType>
2689 friend uint64
getBytesWrittenToPMM(
const CounterStateType & before,
const CounterStateType & after);
2690 template <
class CounterStateType>
2691 friend uint64
getBytesReadFromEDC(
const CounterStateType & before,
const CounterStateType & after);
2692 template <
class CounterStateType>
2693 friend uint64
getBytesWrittenToEDC(
const CounterStateType & before,
const CounterStateType & after);
2694 template <
class CounterStateType>
2696 template <
class CounterStateType>
2698 template <
class CounterStateType>
2700 template <
class CounterStateType>
2701 friend uint64
getConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2702 template <
class CounterStateType>
2703 friend uint64
getDRAMConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2704 template <
class CounterStateType>
2705 friend uint64
getUncoreClocks(
const CounterStateType& before,
const CounterStateType& after);
2706 template <
class CounterStateType>
2708 template <
class CounterStateType>
2709 friend uint64 getPackageCStateResidency(
int state,
const CounterStateType& now);
2710 template <
class CounterStateType>
2711 friend double getLLCReadMissLatency(
const CounterStateType & before,
const CounterStateType & after);
2712 template <
class CounterStateType>
2714 template <
class CounterStateType>
2716 template <
class CounterStateType>
2717 friend double getAverageFrequencyFromClocks(
const int64 clocks,
const CounterStateType& before,
const CounterStateType& after);
2720 uint64 UncMCFullWrites;
2721 uint64 UncMCNormalReads;
2722 uint64 UncHARequests;
2723 uint64 UncHALocalRequests;
2724 uint64 UncPMMWrites;
2726 uint64 UncEDCFullWrites;
2727 uint64 UncEDCNormalReads;
2728 uint64 UncMCGTRequests;
2729 uint64 UncMCIARequests;
2730 uint64 UncMCIORequests;
2731 uint64 PackageEnergyStatus;
2732 uint64 DRAMEnergyStatus;
2733 uint64 TOROccupancyIAMiss;
2734 uint64 TORInsertsIAMiss;
2736 uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2737 void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2742 UncMCNormalReads(0),
2744 UncHALocalRequests(0),
2747 UncEDCFullWrites(0),
2748 UncEDCNormalReads(0),
2752 PackageEnergyStatus(0),
2753 DRAMEnergyStatus(0),
2754 TOROccupancyIAMiss(0),
2755 TORInsertsIAMiss(0),
2758 std::fill(CStateResidency, CStateResidency + PCM::MAX_C_STATE + 1, 0);
2768 UncMCFullWrites += o.UncMCFullWrites;
2769 UncMCNormalReads += o.UncMCNormalReads;
2770 UncHARequests += o.UncHARequests;
2771 UncHALocalRequests += o.UncHALocalRequests;
2772 UncPMMReads += o.UncPMMReads;
2773 UncPMMWrites += o.UncPMMWrites;
2774 UncEDCFullWrites += o.UncEDCFullWrites;
2775 UncEDCNormalReads += o.UncEDCNormalReads;
2776 UncMCGTRequests += o.UncMCGTRequests;
2777 UncMCIARequests += o.UncMCIARequests;
2778 UncMCIORequests += o.UncMCIORequests;
2779 PackageEnergyStatus += o.PackageEnergyStatus;
2780 DRAMEnergyStatus += o.DRAMEnergyStatus;
2781 TOROccupancyIAMiss += o.TOROccupancyIAMiss;
2782 TORInsertsIAMiss += o.TORInsertsIAMiss;
2783 UncClocks += o.UncClocks;
2784 for (
int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2785 CStateResidency[i] += o.CStateResidency[i];
2806 xPI_TxL0P_POWER_CYCLES = 0,
2807 xPI_L1_POWER_CYCLES = 2,
2810 enum FreeRunningCounterID
2818 std::array<std::array<uint64, maxCounters>, maxXPILinks> xPICounter;
2819 std::array<std::array<uint64, maxCounters>, maxXPILinks> M3UPICounter;
2820 std::array<std::array<uint64, maxCounters>, maxCBOs> CBOCounter;
2821 std::array<std::array<uint64, maxCounters>, maxIIOStacks> IIOCounter;
2822 std::array<std::array<uint64, maxCounters>, maxIIOStacks> IRPCounter;
2823 std::array<uint64, maxCounters> UBOXCounter;
2824 std::array<uint64, maxChannels> DRAMClocks;
2825 std::array<uint64, maxChannels> MCDRAMClocks;
2826 std::array<std::array<uint64, maxCounters>, maxChannels> MCCounter;
2827 std::array<std::array<uint64, maxCounters>, maxControllers> M2MCounter;
2828 std::array<std::array<uint64, maxCounters>, maxChannels> EDCCounter;
2829 std::array<uint64, maxCounters> PCUCounter;
2830 std::unordered_map<int, uint64> freeRunningCounter;
2831 int32 PackageThermalHeadroom;
2832 uint64 InvariantTSC;
2834 template <
class CounterStateType>
2835 friend uint64
getDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after);
2836 template <
class CounterStateType>
2837 friend uint64
getMCDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after);
2838 template <
class CounterStateType>
2839 friend uint64
getMCCounter(uint32 channel, uint32
counter,
const CounterStateType & before,
const CounterStateType & after);
2840 template <
class CounterStateType>
2841 friend uint64
getM3UPICounter(uint32 port, uint32 counter,
const CounterStateType& before,
const CounterStateType& after);
2842 template <
class CounterStateType>
2843 friend uint64
getCBOCounter(uint32 cbo, uint32 counter,
const CounterStateType& before,
const CounterStateType& after);
2844 template <
class CounterStateType>
2845 friend uint64
getUBOXCounter(uint32 counter,
const CounterStateType& before,
const CounterStateType& after);
2846 template <
class CounterStateType>
2847 friend uint64
getIIOCounter(uint32 stack, uint32 counter,
const CounterStateType& before,
const CounterStateType& after);
2848 template <
class CounterStateType>
2849 friend uint64
getIRPCounter(uint32 stack, uint32 counter,
const CounterStateType& before,
const CounterStateType& after);
2850 template <
class CounterStateType>
2851 friend uint64
getXPICounter(uint32 port, uint32 counter,
const CounterStateType& before,
const CounterStateType& after);
2852 template <
class CounterStateType>
2853 friend uint64
getM2MCounter(uint32 controller, uint32 counter,
const CounterStateType & before,
const CounterStateType & after);
2854 template <
class CounterStateType>
2855 friend uint64
getEDCCounter(uint32 channel, uint32 counter,
const CounterStateType & before,
const CounterStateType & after);
2856 template <
class CounterStateType>
2857 friend uint64
getPCUCounter(uint32 counter,
const CounterStateType & before,
const CounterStateType & after);
2858 template <
class CounterStateType>
2859 friend uint64
getConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2860 template <
class CounterStateType>
2861 friend uint64
getDRAMConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2862 template <
class CounterStateType>
2863 friend uint64
getInvariantTSC(
const CounterStateType & before,
const CounterStateType & after);
2864 template <
class CounterStateType>
2865 friend int64
getFreeRunningCounter(
const typename CounterStateType::FreeRunningCounterID &,
const CounterStateType & before,
const CounterStateType & after);
2866 template <
class CounterStateType>
2867 friend double getAverageFrequencyFromClocks(
const int64 clocks,
const CounterStateType& before,
const CounterStateType& after);
2885 PackageThermalHeadroom(0),
// Convenience wrapper: returns the xPI counter stored at position
// ServerUncoreCounterState::EventPosition::xPI_CLOCKTICKS for the given port,
// as computed by getXPICounter over the (before, after) interval.
// \param port    link/port index forwarded to getXPICounter
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
2896 template <
class CounterStateType>
2897 uint64
getQPIClocks(uint32 port,
const CounterStateType& before,
const CounterStateType& after)
2899 return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_CLOCKTICKS, before, after);
// Convenience wrapper: returns the xPI counter stored at position
// ServerUncoreCounterState::EventPosition::xPI_TxL0P_POWER_CYCLES for the given port,
// as computed by getXPICounter over the (before, after) interval.
// \param port    link/port index forwarded to getXPICounter
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
2907 template <
class CounterStateType>
2908 uint64
getQPIL0pTxCycles(uint32 port,
const CounterStateType& before,
const CounterStateType& after)
2910 return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_TxL0P_POWER_CYCLES, before, after);
// Convenience wrapper: returns the xPI counter stored at position
// ServerUncoreCounterState::EventPosition::xPI_L1_POWER_CYCLES for the given port,
// as computed by getXPICounter over the (before, after) interval.
// \param port    link/port index forwarded to getXPICounter
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
2918 template <
class CounterStateType>
2919 uint64
getQPIL1Cycles(uint32 port,
const CounterStateType& before,
const CounterStateType& after)
2921 return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_L1_POWER_CYCLES, before, after);
2943 void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2945 BasicCounterState::readAndAggregate(handle);
2946 UncoreCounterState::readAndAggregate(handle);
2952 BasicCounterState::operator += ( ccs );
2959 UncoreCounterState::operator += ( ucs );
2970 UncoreCounterState::operator = ( std::move(ucs) );
2982 std::vector<std::vector<uint64> > incomingQPIPackets;
2983 std::vector<std::vector<uint64> > outgoingQPIFlits;
2984 std::vector<std::vector<uint64> > TxL0Cycles;
2988 void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2990 BasicCounterState::readAndAggregate(handle);
2991 UncoreCounterState::readAndAggregate(handle);
2996 friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr,
const SystemCounterState & now);
2999 friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr,
const SystemCounterState & now);
3019 BasicCounterState::operator += ( scs );
3020 UncoreCounterState::operator += ( scs );
3027 UncoreCounterState::operator += ( ucs );
// Computes instructions per clock over an interval: the growth of InstRetiredAny
// divided by the growth of CpuClkUnhaltedThread between the two snapshots.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        the ratio as a double
// NOTE(review): the listing skips line 3074 (between the clock delta and the return)
// and the lines after 3075; presumably these guard clocks == 0 and provide a
// fallback value — confirm against the full source.
3070 template <
class CounterStateType>
3071 double getIPC(
const CounterStateType & before,
const CounterStateType & after)
// Stored as signed 64-bit so it can be compared/converted before the division.
3073 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3075 return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
3086 template <
class CounterStateType>
3089 return after.InstRetiredAny - before.InstRetiredAny;
// Computes instructions retired per invariant-TSC tick over an interval.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        (InstRetiredAny delta) / (InvariantTSC delta) when the TSC delta is non-zero
// NOTE(review): the value returned when the TSC delta is zero is on a listing line
// that is not shown here — confirm against the full source.
3098 template <
class CounterStateType>
3099 double getExecUsage(
const CounterStateType & before,
const CounterStateType & after)
3101 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
// Avoid dividing by zero when no TSC ticks elapsed between the snapshots.
3102 if (timer_clocks != 0)
3103 return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
3112 template <
class CounterStateType>
3115 return now.InstRetiredAny.getRawData_NoOverflowProtection();
// Returns the number of unhalted thread clock cycles elapsed between two snapshots.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread
3135 template <
class CounterStateType>
3136 uint64
getCycles(
const CounterStateType & before,
const CounterStateType & after)
3138 return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
// Returns the number of unhalted reference clock cycles elapsed between two snapshots.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef
3151 template <
class CounterStateType>
3152 uint64
getRefCycles(
const CounterStateType & before,
const CounterStateType & after)
3154 return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3164 template <
class CounterStateType>
3167 return now.CpuClkUnhaltedThread.getRawData_NoOverflowProtection();
3178 template <
class CounterStateType>
3179 inline double getCoreIPC(
const CounterStateType & before,
const CounterStateType & after)
3181 double ipc =
getIPC(before, after);
3196 template <
class CounterStateType>
3206 template <
class StateType>
3207 double getAverageFrequencyFromClocks(
const int64 clocks,
const StateType& before,
const StateType& after)
3209 const int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3211 if (timer_clocks != 0 && m)
3222 template <
class CounterStateType>
3225 return getAverageFrequencyFromClocks(after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread, before, after);
3234 template <
class UncoreStateType>
3239 return double(m->getNumOnlineCores()) * getAverageFrequencyFromClocks(after.UncClocks - before.UncClocks, before, after) / double(m->getNumOnlineSockets());
3248 template <
class CounterStateType>
3251 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3252 int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3254 if (ref_clocks != 0 && m)
3265 template <
class CounterStateType>
3268 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3269 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3270 if (timer_clocks != 0)
3271 return double(clocks) / double(timer_clocks);
3281 template <
class CounterStateType>
3285 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3286 int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3287 if (ref_clocks != 0)
3288 return double(clocks) / double(ref_clocks);
3299 template <
class CounterStateType>
3305 return double(hits) / double(hits + misses);
3315 template <
class CounterStateType>
3321 return double(hits) / double(hits + misses);
3331 template <
class CounterStateType>
3335 return after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3345 template <
class CounterStateType>
3349 if (
pcm->isL2CacheMissesAvailable() ==
false)
return 0ULL;
3350 const auto cpu_model =
pcm->getCPUModel();
3351 if (
pcm->useSkylakeEvents() || cpu_model == PCM::SNOWRIDGE || cpu_model == PCM::ADL) {
3352 return after.Event[BasicCounterState::SKLL2MissPos] - before.Event[BasicCounterState::SKLL2MissPos];
3354 if (
pcm->isAtom() || cpu_model == PCM::KNL)
3356 return after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3358 uint64 L3Miss = after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3359 uint64 L3UnsharedHit = after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3360 uint64 L2HitM = after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3361 return L2HitM + L3UnsharedHit + L3Miss;
// Computes the number of L2 cache hits between two snapshots.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        0 when the metric is unavailable; otherwise a hit count derived from
//                the programmed events as described below
// `pcm` is declared on a listing line that is not shown in this fragment.
3371 template <
class CounterStateType>
3372 uint64
getL2CacheHits(
const CounterStateType & before,
const CounterStateType & after)
// Metric not supported on this configuration: report zero rather than a bogus delta.
3375 if (
pcm->isL2CacheHitsAvailable() ==
false)
return 0ULL;
// Atom and KNL path: hits are derived as references minus misses, using the
// ArchLLCRefPos / ArchLLCMissPos event slots.
3376 if (
pcm->isAtom() ||
pcm->getCPUModel() == PCM::KNL)
3378 uint64 L2Miss = after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3379 uint64 L2Ref = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
// NOTE(review): both operands are uint64, so if the miss delta ever exceeds the
// reference delta this subtraction wraps to a very large value — confirm whether
// that can happen and whether clamping is wanted.
3380 return L2Ref - L2Miss;
// All other models: a dedicated hit event is read from the L2HitPos slot.
3382 return after.Event[BasicCounterState::L2HitPos] - before.Event[BasicCounterState::L2HitPos];
3388 template <
class CounterStateType>
3391 if (
PCM::getInstance()->L3CacheOccupancyMetricAvailable() ==
false)
return 0ULL;
3392 return now.L3Occupancy;
3397 template <
class CounterStateType>
3400 if (
PCM::getInstance()->CoreLocalMemoryBWMetricAvailable() ==
false)
return 0ULL;
3401 return after.MemoryBWLocal - before.MemoryBWLocal;
3407 template <
class CounterStateType>
3410 if (
PCM::getInstance()->CoreRemoteMemoryBWMetricAvailable() ==
false)
return 0ULL;
3411 const uint64 total = after.MemoryBWTotal - before.MemoryBWTotal;
3414 return total - local;
3426 template <
class CounterStateType>
3430 return after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3440 template <
class CounterStateType>
3444 if (!
pcm->isL3CacheHitsSnoopAvailable())
return 0;
3445 const auto cpu_model =
pcm->getCPUModel();
3446 if (cpu_model == PCM::SNOWRIDGE || cpu_model == PCM::ADL)
3449 const int64 refs = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3450 const int64 hits = refs - misses;
3451 return (hits > 0)? hits : 0;
3453 if (
pcm->useSkylakeEvents()) {
3454 return after.Event[BasicCounterState::SKLL3HitPos] - before.Event[BasicCounterState::SKLL3HitPos];
3456 return after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3467 template <
class CounterStateType>
3468 uint64
getL3CacheHits(
const CounterStateType & before,
const CounterStateType & after)
3482 template <
class CounterStateType>
3485 return after.InvariantTSC - before.InvariantTSC;
3495 template <
class CounterStateType>
3500 if (state == 0)
return double(
getRefCycles(before, after)) / tsc;
3505 double result = 1.0 - double(
getRefCycles(before, after)) / tsc;
3506 for (
int i = 2; i <= PCM::MAX_C_STATE; ++i)
3508 result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i]) / tsc;
3510 if (result < 0.) result = 0.;
3511 else if (result > 1.) result = 1.;
3515 return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state]) / tsc;
3524 template <
class CounterStateType>
3527 if (state == 0)
return now.CpuClkUnhaltedRef.getRawData_NoOverflowProtection();
3529 return now.BasicCounterState::CStateResidency[state];
3539 template <
class CounterStateType>
3546 double result = 1.0;
3547 for (
int i = 1; i <= PCM::MAX_C_STATE; ++i)
3549 result -= (after.UncoreCounterState::CStateResidency[i] - before.UncoreCounterState::CStateResidency[i]) / tsc;
3551 if (result < 0.) result = 0.;
3552 else if (result > 1.) result = 1.;
3556 return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state]) / tsc;
3565 template <
class CounterStateType>
3568 return now.UncoreCounterState::CStateResidency[state];
3577 template <
class CounterStateType>
3581 return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
3591 template <
class CounterStateType>
3595 return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
3605 template <
class CounterStateType>
3609 return (after.UncPMMReads - before.UncPMMReads) * 64;
3619 template <
class CounterStateType>
3623 return (after.UncPMMWrites - before.UncPMMWrites) * 64;
3633 template <
class CounterStateType>
3637 return (after.UncEDCNormalReads - before.UncEDCNormalReads) * 64;
3647 template <
class CounterStateType>
3651 return (after.UncEDCFullWrites - before.UncEDCFullWrites) * 64;
3661 template <
class CounterStateType>
3665 return (after.UncMCGTRequests - before.UncMCGTRequests) * 64;
3675 template <
class CounterStateType>
3679 return (after.UncMCIARequests - before.UncMCIARequests) * 64;
3689 template <
class CounterStateType>
3693 return (after.UncMCIORequests - before.UncMCIORequests) * 64;
// Returns how much the SMICount field grew between two snapshots.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        after.SMICount - before.SMICount
3703 template <
class CounterStateType>
3704 uint64
getSMICount(
const CounterStateType & before,
const CounterStateType & after)
3706 return after.SMICount - before.SMICount;
3718 template <
class CounterStateType>
3721 return after.Event[eventCounterNr] - before.Event[eventCounterNr];
3736 if (!
PCM::getInstance()->incomingQPITrafficMetricsAvailable())
return 0ULL;
3737 uint64 b = before.incomingQPIPackets[socketNr][linkNr];
3738 uint64 a = after.incomingQPIPackets[socketNr][linkNr];
3740 return (a > b) ? (64 * (a - b)) : 0;
3756 if (!(m->qpiUtilizationMetricsAvailable()))
return 0.;
3761 return bytes / max_bytes;
3778 if (m->outgoingQPITrafficMetricsAvailable() ==
false)
return 0.;
3780 if (m->hasBecktonUncore())
3782 const uint64 b = before.outgoingQPIFlits[socketNr][linkNr];
3783 const uint64 a = after.outgoingQPIFlits[socketNr][linkNr];
3785 const double idle_flits = (double)((a > b) ? (a - b) : 0);
3786 const uint64 bTSC = before.uncoreTSC;
3787 const uint64 aTSC = after.uncoreTSC;
3788 const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
3789 if (idle_flits >= tsc)
return 0.;
3791 return (1. - (idle_flits / tsc));
3792 }
else if (m->hasPCICFGUncore())
3794 const uint64 b = before.outgoingQPIFlits[socketNr][linkNr];
3795 const uint64 a = after.outgoingQPIFlits[socketNr][linkNr];
3797 double flits = (double)((a > b) ? (a - b) : 0);
3803 if (flits > max_flits)
return 1.;
3804 return (flits / max_flits);
3823 if (!(m->outgoingQPITrafficMetricsAvailable()))
return 0ULL;
3828 return (uint64)(max_bytes * util);
3847 for (uint32 s = 0; s < ns; ++s)
3848 for (uint32 q = 0; q < qpiLinks; ++q)
3869 for (uint32 s = 0; s < ns; ++s)
3870 for (uint32 q = 0; q < qpiLinks; ++q)
3889 return 64 * now.incomingQPIPackets[socketNr][linkNr];
3907 for (uint32 q = 0; q < qpiLinks; ++q)
3926 for (uint32 s = 0; s < ns; ++s)
3949 return double(totalQPI) / double(memTraffic);
3958 template <
class CounterStateType>
3961 if (
PCM::getInstance()->localMemoryRequestRatioMetricAvailable() ==
false)
return -1.;
3962 const auto all = after.UncHARequests - before.UncHARequests;
3963 const auto local = after.UncHALocalRequests - before.UncHALocalRequests;
3965 return double(local)/double(all);
3971 template <
class CounterType>
3974 return after.data - before.data;
3978 template <
class CounterStateType>
3982 if (m->LLCReadMissLatencyMetricsAvailable() ==
false)
return -1.;
3983 const double occupancy = double(after.TOROccupancyIAMiss) - double(before.TOROccupancyIAMiss);
3984 const double inserts = double(after.TORInsertsIAMiss) - double(before.TORInsertsIAMiss);
3985 const double unc_clocks = double(after.UncClocks) - double(before.UncClocks);
3986 const double seconds = double(
getInvariantTSC(before, after)) / double(m->getNumOnlineCores()/m->getNumSockets()) /
double(m->getNominalFrequency());
3987 return 1e9*seconds*(occupancy/inserts)/unc_clocks;
// Returns the total number of pipeline slots in an interval, computed as the sum of
// the per-category slot deltas (backend-bound, frontend-bound, bad-speculation, retiring).
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        a + b + c + d, converted to uint64
// NOTE(review): listing lines 3997-4002 are not shown; they may validate the four
// deltas (e.g. non-negativity) before the sum — confirm against the full source.
3990 template <
class CounterStateType>
3991 inline uint64 getAllSlots(
const CounterStateType & before,
const CounterStateType & after)
// Each delta is held as a signed 64-bit value.
3993 const int64 a = after.BackendBoundSlots - before.BackendBoundSlots;
3994 const int64 b = after.FrontendBoundSlots - before.FrontendBoundSlots;
3995 const int64 c = after.BadSpeculationSlots - before.BadSpeculationSlots;
3996 const int64 d = after.RetiringSlots - before.RetiringSlots;
4003 return a + b + c + d;
// Returns how much the AllSlotsRaw field grew between two snapshots, without
// deriving it from the per-category slot counters.
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// \return        after.AllSlotsRaw - before.AllSlotsRaw
4006 template <
class CounterStateType>
4007 inline uint64 getAllSlotsRaw(
const CounterStateType& before,
const CounterStateType& after)
4009 return after.AllSlotsRaw - before.AllSlotsRaw;
// Returns the fraction of all pipeline slots in the interval that were backend-bound:
// (BackendBoundSlots delta) / getAllSlots(before, after).
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// NOTE(review): the listing skips lines 4015-4017 and the lines after 4018, so any
// availability check or fallback return is not shown. In the visible expression a
// zero getAllSlots() result would make the division yield inf/NaN — confirm that a
// guard exists in the full source.
4013 template <
class CounterStateType>
4014 inline double getBackendBound(
const CounterStateType & before,
const CounterStateType & after)
4018 return double(after.BackendBoundSlots - before.BackendBoundSlots)/double(getAllSlots(before, after));
// Returns the fraction of all pipeline slots in the interval that were frontend-bound:
// (FrontendBoundSlots delta) / getAllSlots(before, after).
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// NOTE(review): the listing skips lines 4025-4027 and the lines after 4028, so any
// availability check or fallback return is not shown. In the visible expression a
// zero getAllSlots() result would make the division yield inf/NaN — confirm that a
// guard exists in the full source.
4023 template <
class CounterStateType>
4024 inline double getFrontendBound(
const CounterStateType & before,
const CounterStateType & after)
4028 return double(after.FrontendBoundSlots - before.FrontendBoundSlots)/double(getAllSlots(before, after));
4033 template <
class CounterStateType>
4038 return double(after.BadSpeculationSlots - before.BadSpeculationSlots)/double(getAllSlots(before, after));
// Returns the fraction of all pipeline slots in the interval that retired:
// (RetiringSlots delta) / getAllSlots(before, after).
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// NOTE(review): the listing skips lines 4045-4047 and the lines after 4048, so any
// availability check or fallback return is not shown. In the visible expression a
// zero getAllSlots() result would make the division yield inf/NaN — confirm that a
// guard exists in the full source.
4043 template <
class CounterStateType>
4044 inline double getRetiring(
const CounterStateType & before,
const CounterStateType & after)
4048 return double(after.RetiringSlots - before.RetiringSlots)/double(getAllSlots(before, after));
// Returns the value of an MSR-based event identified by `index`, interpreted
// according to `type`.
// \param index   key into each snapshot's MSRValues map
// \param type    PCM::MSRType selecting how the value is derived (see cases below)
// \param before  snapshot taken at the start of the interval
// \param after   snapshot taken at the end of the interval
// NOTE(review): the switch header, the braces and the value returned when a lookup
// fails are on listing lines that are not shown here — confirm against the full source.
4052 template <
class CounterStateType>
4053 uint64 getMSREvent(
const uint64& index,
const PCM::MSRType& type,
const CounterStateType& before,
const CounterStateType& after)
// Freerun: the result is the difference between the two snapshots, and is produced
// only when the index is present in both maps.
4057 case PCM::MSRType::Freerun:
4059 const auto beforeIt = before.MSRValues.find(index);
4060 const auto afterIt = after.MSRValues.find(index);
4061 if (beforeIt != before.MSRValues.end() && afterIt != after.MSRValues.end())
4063 return afterIt->second - beforeIt->second;
// Static: no subtraction; the value recorded in the `after` snapshot is returned
// as-is and `before` is not consulted.
4067 case PCM::MSRType::Static:
4069 const auto result = after.MSRValues.find(index);
4070 if (result != after.MSRValues.end())
4072 return result->second;
uint64 getLocalMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Local Memory Bandwidth.
Definition: cpucounters.h:3398
uint64 getNominalFrequency() const
Reads the nominal core frequency.
Definition: cpucounters.cpp:5479
uint32 getNumCores() const
Reads number of logical cores in the system.
Definition: cpucounters.cpp:5448
double getConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by processor (excluding DRAM)
Definition: cpucounters.h:2633
Definition: memoptest.cpp:24
uint32 getNumMC() const
Returns the number of detected integrated memory controllers.
Definition: cpucounters.h:505
uint64 getQPILinksPerSocket() const
Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket.
Definition: cpucounters.h:1457
uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/...
Definition: cpucounters.h:2537
uint64 getMCDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns MCDRAM clock ticks.
Definition: cpucounters.h:2441
int getRunState(void)
Returns program's Run State.
Definition: cpucounters.h:683
uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:3820
uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2466
uint32 getMaxIPC() const
Returns the max number of instructions per cycle.
Definition: cpucounters.h:1577
uint32 getThreadsPerCore() const
Reads how many hardware threads a physical core has. "Hardware thread" is a logical core in a differen...
Definition: cpucounters.cpp:5469
uint32 getMCPerSocket() const
Returns the number of detected integrated memory controllers per socket.
Definition: cpucounters.h:1484
double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:3266
Internal type and constant definitions.
uint32 getCPUStepping() const
Reads CPU stepping id.
Definition: cpucounters.h:1433
bool isServerCPU() const
Returns whether it is a server part.
Definition: cpucounters.h:1641
double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:3316
(Logical) core-wide counter state
Definition: cpucounters.h:2925
uint64 getRemoteMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Remote Memory Bandwidth.
Definition: cpucounters.h:3408
SystemRoot const & getSystemTopology() const
The system, sockets, uncores, cores and threads are structured like a tree.
Definition: cpucounters.h:1025
uint64 getBytesWrittenToPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to PMM memory.
Definition: cpucounters.h:3620
uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: p...
Definition: cpucounters.h:2550
double getAverageUncoreFrequency(const UncoreStateType &before, const UncoreStateType &after)
Computes average uncore frequency.
Definition: cpucounters.h:3235
uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:3690
Definition: cpucounters.h:776
ProgramMode
Mode of programming (parameter in the program() method)
Definition: cpucounters.h:689
uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:2424
double getDRAMConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by DRAM.
Definition: cpucounters.h:2646
uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:3372
uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when the clock signal on a specific core is running (not halted) ...
Definition: cpucounters.h:3136
Definition: cpucounters.h:823
uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:2454
Definition: topology.h:349
int32 getPackageThermalSpecPower() const
Returns thermal specification power of the package domain in Watt.
Definition: cpucounters.h:1692
size_t getMCChannels(uint32 socket, uint32 controller) const
Returns the number of detected memory channels on given integrated memory controllers.
Definition: cpucounters.h:1537
double getRetiring(const CounterStateType &before, const CounterStateType &after)
Returns pipeline slots utilized by uops that eventually retire (commit)
Definition: cpucounters.h:4044
double getLocalMemoryRequestRatio(const CounterStateType &before, const CounterStateType &after)
Get local memory access ratio measured in home agent.
Definition: cpucounters.h:3959
double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:3496
Socket-wide counter state.
Definition: cpucounters.h:2938
double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get data utilization of incoming QPI link (0..1)
Definition: cpucounters.h:3753
interface to MBM and CMT using Linux resctrl
Definition: cpucounters.h:119
int32 getThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:2345
uint64 getBytesReadFromPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from PMM memory.
Definition: cpucounters.h:3606
uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
Return QPI Link Speed in GBytes/second.
Definition: cpucounters.h:1683
Definition: cpucounters.h:518
double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:3223
uint32 getNumOnlineCores() const
Reads number of online logical cores in the system.
Definition: cpucounters.cpp:5453
Custom Core event description.
Definition: cpucounters.h:792
double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:3249
Definition: pcm-sensor-server.cpp:247
uint32 getCPUModel() const
Reads CPU model id.
Definition: cpucounters.h:1429
uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of IIO PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2501
double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:3282
uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:2908
int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID &counter, const CounterStateType &before, const CounterStateType &after)
Returns free running counter if it exists, -1 otherwise.
Definition: cpucounters.h:2605
bool isPackageCStateResidencySupported(int state)
Returns true if the specified package C-state residency metric is supported.
Definition: cpucounters.h:658
uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:3441
Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP...
Definition: cpucounters.h:321
size_t getMCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:1509
Definition: cpucounters.h:1169
SupportedCPUModels
Identifiers of supported CPU models.
Definition: cpucounters.h:1355
double getTotalExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval for the entire system combining ins...
Definition: cpucounters.h:3197
uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:3346
const std::string & getErrorMessage() const
Returns the error message.
Definition: cpucounters.h:1098
ErrorCode
Return codes (e.g. for program(..) method)
Definition: cpucounters.h:697
double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:2401
Basic core counter state.
Definition: cpucounters.h:2187
uint64 getUncoreClocks(const CounterStateType &before, const CounterStateType &after)
Returns uncore clock ticks.
Definition: cpucounters.h:2623
Definition: cpucounters.h:236
int32 getPackageMinimumPower() const
Returns minimum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1695
Extended custom core event description.
Definition: cpucounters.h:806
void setRunState(int new_state)
Set Run State.
Definition: cpucounters.h:677
uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:3087
CPU Performance Monitor.
Definition: cpucounters.h:543
double getCoreIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle for the entire system combining instru...
Definition: cpucounters.h:3179
double getBadSpeculation(const CounterStateType &before, const CounterStateType &after)
Returns wasted pipeline slots due to incorrect speculation, covering whole penalty: Utilized by uops ...
Definition: cpucounters.h:4034
static PCM * getInstance()
Returns PCM object.
Definition: cpucounters.cpp:239
uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:2563
uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:3734
uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:3578
SystemCounterState getSystemCounterState()
Reads the counter state of the system.
Definition: cpucounters.cpp:4232
Definition: cpucounters.h:174
uint64 getBytesWrittenToEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to MCDRAM memory controllers.
Definition: cpucounters.h:3648
uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:3719
uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:3332
uint64 getGTRequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from GT engine.
Definition: cpucounters.h:3662
uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:3389
Low level interface to access PCI configuration space.
double getBackendBound(const CounterStateType &before, const CounterStateType &after)
Returns unutilized pipeline slots where no uop was delivered due to lack of back-end resources as ran...
Definition: cpucounters.h:4014
bool isCoreCStateResidencySupported(int state)
Returns true if the specified core C-state residency metric is supported.
Definition: cpucounters.h:649
double getLLCReadMissLatency(const CounterStateType &before, const CounterStateType &after)
Returns average last level cache read+prefetch miss latency in ns.
Definition: cpucounters.h:3979
uint64 getAllIncomingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data traffic.
Definition: cpucounters.h:3840
Provides 64-bit "virtual" counters from underlying 32-bit HW counters.
double getQPItoMCTrafficRatio(const SystemCounterState &before, const SystemCounterState &after)
Get QPI data to Memory Controller traffic ratio.
Definition: cpucounters.h:3941
Definition: topology.h:241
int32 getPackageMaximumPower() const
Returns maximum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1698
uint64 getPCUFrequency() const
Returns the frequency of Power Control Unit.
Definition: cpucounters.h:1620
Server uncore power counter state.
Definition: cpucounters.h:2793
uint64 getQPILinkSpeed(const uint32 linkNr) const
Returns the speed of the QPI link.
Definition: cpucounters.h:496
Definition: cpucounters.h:262
uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:3483
Definition: topology.h:455
size_t getNumQPIPorts() const
Returns the number of detected QPI ports.
Definition: cpucounters.h:493
System-wide counter state.
Definition: cpucounters.h:2978
uint32 getNumSockets() const
Reads number of sockets (CPUs) in the system.
Definition: cpucounters.cpp:5458
Definition: topology.h:271
uint64 getSMICount(const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred system management interrupts.
Definition: cpucounters.h:3704
int32 getThreadId(uint32 os_id) const
Determines physical thread of given processor ID within a core.
Definition: cpucounters.h:1438
Definition: cpucounters.h:194
uint64 getBytesReadFromEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from MCDRAM memory controllers.
Definition: cpucounters.h:3634
uint64 getNumberOfEvents(const CounterType &before, const CounterType &after)
Returns the raw count of events.
Definition: cpucounters.h:3972
uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:2919
Definition: topology.h:109
Definition: cpucounters.h:214
bool isAtom() const
Returns true if CPU is Atom-based.
Definition: cpucounters.h:1841
int32 getPackageThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:2871
SocketCounterState getSocketCounterState(uint32 socket)
Reads the counter state of a socket.
Definition: cpucounters.cpp:4240
uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:3592
bool isClientCPU() const
Returns whether it is a client part.
Definition: cpucounters.h:1665
double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:3540
Definition: cpucounters.h:152
uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:3152
int32 getCoreId(uint32 os_id) const
Determines physical core of given processor ID within a socket.
Definition: cpucounters.h:1443
uint64 getPCUClocks(const CounterStateType &before, const CounterStateType &after)
Returns clock ticks of power control unit.
Definition: cpucounters.h:2573
double getFrontendBound(const CounterStateType &before, const CounterStateType &after)
Returns unutilized pipeline slots where Front-end did not deliver a uop while back-end is ready as ra...
Definition: cpucounters.h:4024
uint64 getIRPCounter(uint32 stack, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of IRP PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2513
uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState &now)
Get estimation of total QPI data traffic for this socket.
Definition: cpucounters.h:3901
uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:2897
size_t getNumEDCChannels() const
Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) ...
Definition: cpucounters.h:515
uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of CHA or CBO PMU counter (counter meaning depends on the programming: power/performance/...
Definition: cpucounters.h:2478
CoreCounterState getCoreCounterState(uint32 core)
Reads the counter state of a (logical) core.
Definition: cpucounters.cpp:4248
int32 getTileId(uint32 os_id) const
Determines physical tile (cores sharing L2 cache) of given processor ID.
Definition: cpucounters.h:1448
uint64 getIARequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IA.
Definition: cpucounters.h:3676
double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:3071
Basic uncore counter state.
Definition: cpucounters.h:2678
uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of UPI or QPI PMU counter (counter meaning depends on the programming: power/performance/...
Definition: cpucounters.h:2525
uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2593
uint64 getUBOXCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of UBOX PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2489
double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:3300
double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:3099
static bool isAtom(const int32 cpu_model_)
Returns true if CPU model is Atom-based.
Definition: cpucounters.h:1826
Definition: cpucounters.h:127
uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:3468
uint64 getAllOutgoingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data+nondata traffic.
Definition: cpucounters.h:3862
double getJoulesPerEnergyUnit() const
Returns how many joules are in an internal processor energy unit.
Definition: cpucounters.h:1689
size_t getEDCChannelsPerSocket() const
Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) per socket...
Definition: cpucounters.h:1564
uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:3427
int32 getSocketId(uint32 core_id) const
Determines socket of given core.
Definition: cpucounters.h:1453
double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:3774
int64 getCPUMicrocodeLevel() const
Get microcode level (returns -1 if retrieval not supported due to some restrictions) ...
Definition: cpucounters.h:1823
size_t getNumMCChannels() const
Returns the total number of detected memory channels on all integrated memory controllers.
Definition: cpucounters.h:508
Definition: pcm-iio.cpp:146
Interfaces to access free-running bandwidth counters.
double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:2413
Definition: cpucounters.h:87
uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2583