Processor Counter Monitor
cpucounters.h
Go to the documentation of this file.
1 // SPDX-License-Identifier: BSD-3-Clause
2 // Copyright (c) 2009-2020, Intel Corporation
3 // written by Roman Dementiev
4 // Thomas Willhalm
5 
6 #ifndef CPUCOUNTERS_HEADER
7 #define CPUCOUNTERS_HEADER
8 
15 #include "version.h"
16 
17 #ifndef PCM_API
18 #define PCM_API
19 #endif
20 
21 #undef PCM_HA_REQUESTS_READS_ONLY
22 #undef PCM_DEBUG_TOPOLOGY // debug of topology enumeration routine
23 #undef PCM_UNCORE_PMON_BOX_CHECK_STATUS // debug only
24 
25 #include "types.h"
26 #include "msr.h"
27 #include "pci.h"
28 #include "bw.h"
29 #include "width_extender.h"
30 #include "exceptions/unsupported_processor_exception.hpp"
31 
32 #include <vector>
33 #include <array>
34 #include <limits>
35 #include <string>
36 #include <memory>
37 #include <map>
38 #include <unordered_map>
39 #include <string.h>
40 #include <assert.h>
41 
42 #ifdef PCM_USE_PERF
43 #include <linux/perf_event.h>
44 #include <errno.h>
45 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9)
46 #endif
47 
48 #ifndef _MSC_VER
49 #define NOMINMAX
50 #include <semaphore.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 #include <fcntl.h>
54 #include <sys/syscall.h>
55 #include <unistd.h>
56 #endif
57 
58 #ifdef _MSC_VER
59 #if _MSC_VER>= 1600
60 #include <intrin.h>
61 #endif
62 #endif
63 
64 #include "resctrl.h"
65 
66 namespace pcm {
67 
68 #ifdef _MSC_VER
69 void PCM_API restrictDriverAccess(LPCTSTR path);
70 #endif
71 
72 class SystemCounterState;
73 class SocketCounterState;
74 class CoreCounterState;
75 class BasicCounterState;
76 class ServerUncoreCounterState;
77 class PCM;
78 class CoreTaskQueue;
79 class SystemRoot;
80 
81 /*
82  CPU performance monitoring routines
83 
84  A set of performance monitoring routines for recent Intel CPUs
85 */
86 
// Topology record for one core: OS id plus thread/core/tile/socket position.
// The position fields are set to -1 by the constructor; native_cpu_model and core_type
// default to -1 / Invalid through their in-class initializers.
87 struct PCM_API TopologyEntry // describes a core
88 {
89  int32 os_id;
90  int32 thread_id;
91  int32 core_id;
92  int32 tile_id; // tile is a constellation of 1 or more cores sharing the same L2 cache. Unique for entire system
93  int32 socket;
94  int32 native_cpu_model = -1;
95  enum CoreType
96  {
97  Atom = 0x20,
98  Core = 0x40,
99  Invalid = -1
100  };
101  CoreType core_type = Invalid;
102 
103  TopologyEntry() : os_id(-1), thread_id (-1), core_id(-1), tile_id(-1), socket(-1) { }
// Returns a string literal naming core_type; "unknown" for any value outside the enumerators.
// Declared const because it only reads core_type, so it is usable on const entries too.
104  const char* getCoreTypeStr() const
105  {
106  switch (core_type)
107  {
108  case Atom:
109  return "Atom";
110  case Core:
111  return "Core";
112  case Invalid:
113  return "invalid";
114  }
115  return "unknown";
116  }
117 };
118 
// Abstract interface to one hardware register: assigning a uint64 writes it,
// converting the object to uint64 reads it. Concrete register types override both.
// (The class head on listing line 119 was missing; the name is the one used by the destructor below.)
119 class HWRegister
120 {
121 public:
122  virtual void operator = (uint64 val) = 0; // write operation
123  virtual operator uint64 () = 0; //read operation
124  virtual ~HWRegister() {}
125 };
126 
// 64-bit register accessed through a shared PciHandleType at a fixed offset.
// (The class head on listing line 127 was missing; name taken from the constructor,
// base class from the `override` specifiers and the sibling MSRRegister.)
127 class PCICFGRegister64 : public HWRegister
128 {
129  std::shared_ptr<PciHandleType> handle;
130  size_t offset;
131 public:
132  PCICFGRegister64(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
133  handle(handle_),
134  offset(offset_)
135  {
136  }
// Write: the value is split via cvt_ds and issued as two 32-bit writes,
// low half at `offset`, high half at `offset + sizeof(uint32)`.
137  void operator = (uint64 val) override
138  {
139  cvt_ds cvt;
140  cvt.ui64 = val;
141  handle->write32(offset, cvt.ui32.low);
142  handle->write32(offset + sizeof(uint32), cvt.ui32.high);
143  }
// Read: a single read64 call; `result` stays 0 if read64 does not fill it.
144  operator uint64 () override
145  {
146  uint64 result = 0;
147  handle->read64(offset, &result);
148  return result;
149  }
150 };
151 
// 32-bit register accessed through a shared PciHandleType at a fixed offset.
// (The class head on listing line 152 was missing; name taken from the constructor,
// base class from the `override` specifiers and the sibling MSRRegister.)
152 class PCICFGRegister32 : public HWRegister
153 {
154  std::shared_ptr<PciHandleType> handle;
155  size_t offset;
156 public:
157  PCICFGRegister32(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
158  handle(handle_),
159  offset(offset_)
160  {
161  }
// Write: only the low 32 bits of `val` are written (explicit narrowing cast).
162  void operator = (uint64 val) override
163  {
164  handle->write32(offset, (uint32)val);
165  }
// Read: the 32-bit result is widened to uint64; it stays 0 if read32 does not fill it.
166  operator uint64 () override
167  {
168  uint32 result = 0;
169  handle->read32(offset, &result);
170  return result;
171  }
172 };
173 
// 64-bit register accessed through a shared MMIORange at a fixed offset.
// (The class head on listing line 174 was missing; name taken from the constructor,
// base class from the `override` specifiers and the sibling MSRRegister.)
174 class MMIORegister64 : public HWRegister
175 {
176  std::shared_ptr<MMIORange> handle;
177  size_t offset;
178 public:
179  MMIORegister64(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
180  handle(handle_),
181  offset(offset_)
182  {
183  }
// Write: forwards the full 64-bit value to MMIORange::write64.
184  void operator = (uint64 val) override
185  {
186  handle->write64(offset, val);
187  }
// Read: returns whatever MMIORange::read64 yields for this offset.
188  operator uint64 () override
189  {
190  return handle->read64(offset);
191  }
192 };
193 
// 32-bit register accessed through a shared MMIORange at a fixed offset.
// (The class head on listing line 194 was missing; name taken from the constructor,
// base class from the `override` specifiers and the sibling MSRRegister.)
194 class MMIORegister32 : public HWRegister
195 {
196  std::shared_ptr<MMIORange> handle;
197  size_t offset;
198 public:
199  MMIORegister32(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
200  handle(handle_),
201  offset(offset_)
202  {
203  }
// Write: only the low 32 bits of `val` are written (explicit narrowing cast).
204  void operator = (uint64 val) override
205  {
206  handle->write32(offset, (uint32)val);
207  }
// Read: the 32-bit value from MMIORange::read32 is widened to uint64.
208  operator uint64 () override
209  {
210  return (uint64)handle->read32(offset);
211  }
212 };
213 
// HWRegister implementation backed by a shared SafeMsrHandle; `offset` is the value
// passed as the first argument to the handle's read/write calls.
214 class MSRRegister : public HWRegister
215 {
216  std::shared_ptr<SafeMsrHandle> handle;
217  size_t offset;
218 public:
219  MSRRegister(const std::shared_ptr<SafeMsrHandle> & handle_, size_t offset_) :
220  handle(handle_),
221  offset(offset_)
222  {
223  }
// Write: forwards `val` to handle->write; anything write() returns is not inspected here.
224  void operator = (uint64 val) override
225  {
226  handle->write(offset, val);
227  }
// Read: `value` starts at 0 and is returned as-is, so 0 is the result if read() leaves it untouched.
228  operator uint64 () override
229  {
230  uint64 value = 0;
231  handle->read(offset, &value);
232  return value;
233  }
234 };
235 
// HWRegister view of a shared CounterWidthExtender: reads return the extender's value,
// and the only supported write is 0, which resets the extender.
// (The class head on listing line 236 was missing; name taken from the constructor,
// base class from the `override` specifiers and the sibling MSRRegister.)
236 class CounterWidthExtenderRegister : public HWRegister
237 {
238  std::shared_ptr<CounterWidthExtender> handle;
239 public:
240  CounterWidthExtenderRegister(const std::shared_ptr<CounterWidthExtender> & handle_) :
241  handle(handle_)
242  {
243  }
// Write: 0 calls handle->reset(); any other value prints an error to std::cerr
// and throws std::exception.
244  void operator = (uint64 val) override
245  {
246  if (val == 0)
247  {
248  handle->reset();
249  }
250  else
251  {
252  std::cerr << "ERROR: writing non-zero values to CounterWidthExtenderRegister is not supported\n";
253  throw std::exception();
254  }
255  }
// Read: returns handle->read() (a stray second ';' was removed from this statement).
256  operator uint64 () override
257  {
258  return handle->read();
259  }
260 };
261 
// Bundle of HWRegister handles that make up one uncore PMU: a unit control register,
// four counter control/value pairs, an optional fixed counter pair and two optional filters.
// (The class head on listing line 262 was missing; name taken from the constructors below.)
262 class UncorePMU
263 {
264  typedef std::shared_ptr<HWRegister> HWRegisterPtr;
265  HWRegisterPtr unitControl;
266 public:
267  HWRegisterPtr counterControl[4];
268  HWRegisterPtr counterValue[4];
269  HWRegisterPtr fixedCounterControl;
270  HWRegisterPtr fixedCounterValue;
271  HWRegisterPtr filter[2];
272 
// Stores the given handles; the fixed-counter and filter handles default to empty pointers.
273  UncorePMU(const HWRegisterPtr & unitControl_,
274  const HWRegisterPtr & counterControl0,
275  const HWRegisterPtr & counterControl1,
276  const HWRegisterPtr & counterControl2,
277  const HWRegisterPtr & counterControl3,
278  const HWRegisterPtr & counterValue0,
279  const HWRegisterPtr & counterValue1,
280  const HWRegisterPtr & counterValue2,
281  const HWRegisterPtr & counterValue3,
282  const HWRegisterPtr & fixedCounterControl_ = HWRegisterPtr(),
283  const HWRegisterPtr & fixedCounterValue_ = HWRegisterPtr(),
284  const HWRegisterPtr & filter0 = HWRegisterPtr(),
285  const HWRegisterPtr & filter1 = HWRegisterPtr()
286  ) :
287  unitControl(unitControl_),
288  counterControl{ counterControl0, counterControl1, counterControl2, counterControl3 },
289  counterValue{ counterValue0, counterValue1, counterValue2, counterValue3 },
290  fixedCounterControl(fixedCounterControl_),
291  fixedCounterValue(fixedCounterValue_),
292  filter{ filter0 , filter1 }
293  {
294  }
// Default construction leaves every handle empty, so valid() returns false.
295  UncorePMU() {}
296  virtual ~UncorePMU() {}
// True when a unit control register handle is present.
297  bool valid() const
298  {
299  return unitControl.get() != nullptr;
300  }
// Writes `value` to the unit control register. unitControl is dereferenced without a
// null check, so this must only be called when valid() is true.
301  void writeUnitControl(const uint32 value)
302  {
303  *unitControl = value;
304  }
305  void cleanup();
306  void freeze(const uint32 extra);
307  bool initFreeze(const uint32 extra, const char* xPICheckMsg = nullptr);
308  void unfreeze(const uint32 extra);
309  void resetUnfreeze(const uint32 extra);
310 };
311 
// Metric-set selector passed to the programServerUncoreMemoryMetrics() member functions
// declared later in this header. Enumerators take the implicit values 0..3.
// NOTE(review): "Pmem" presumably stands for persistent memory — confirm against the implementation.
312 enum ServerUncoreMemoryMetrics
313 {
314  PartialWrites,
315  Pmem,
316  PmemMemoryMode,
317  PmemMixedMode
318 };
319 
322 {
323  friend class PCM;
324  int32 iMCbus,UPIbus,M2Mbus;
325  uint32 groupnr;
326  int32 cpu_model;
327  typedef std::vector<UncorePMU> UncorePMUVector;
328  UncorePMUVector imcPMUs;
329  UncorePMUVector edcPMUs;
330  UncorePMUVector xpiPMUs;
331  UncorePMUVector m3upiPMUs;
332  UncorePMUVector m2mPMUs;
333  UncorePMUVector haPMUs;
334  std::vector<UncorePMUVector*> allPMUs{ &imcPMUs, &edcPMUs, &xpiPMUs, &m3upiPMUs , &m2mPMUs, &haPMUs };
335  std::vector<uint64> qpi_speed;
336  std::vector<uint32> num_imc_channels; // number of memory channels in each memory controller
337  std::vector<std::pair<uint32, uint32> > XPIRegisterLocation; // (device, function)
338  std::vector<std::pair<uint32, uint32> > M3UPIRegisterLocation; // (device, function)
339  std::vector<std::vector< std::pair<uint32, uint32> > > MCRegisterLocation; // MCRegisterLocation[controller]: (device, function)
340  std::vector<std::pair<uint32, uint32> > EDCRegisterLocation; // EDCRegisterLocation: (device, function)
341  std::vector<std::pair<uint32, uint32> > M2MRegisterLocation; // M2MRegisterLocation: (device, function)
342  std::vector<std::pair<uint32, uint32> > HARegisterLocation; // HARegisterLocation: (device, function)
343 
344  static std::vector<std::pair<uint32, uint32> > socket2iMCbus;
345  static std::vector<std::pair<uint32, uint32> > socket2UPIbus;
346  static std::vector<std::pair<uint32, uint32> > socket2M2Mbus;
347 
348  ServerPCICFGUncore(); // forbidden
349  ServerPCICFGUncore(ServerPCICFGUncore &); // forbidden
350  ServerPCICFGUncore & operator = (const ServerPCICFGUncore &); // forbidden
351  PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func, bool checkVendor = false);
352  void programIMC(const uint32 * MCCntConfig);
353  void programEDC(const uint32 * EDCCntConfig);
354  void programM2M(const uint64 * M2MCntConfig);
355  void programM2M();
356  void programHA(const uint32 * config);
357  void programHA();
358  void programXPI(const uint32 * XPICntConfig);
359  void programM3UPI(const uint32* M3UPICntConfig);
360  typedef std::pair<size_t, std::vector<uint64 *> > MemTestParam;
361  void initMemTest(MemTestParam & param);
362  void doMemTest(const MemTestParam & param);
363  void cleanupMemTest(const MemTestParam & param);
364  void cleanupQPIHandles();
365  void cleanupPMUs();
366  void writeAllUnitControl(const uint32 value);
367  void initDirect(uint32 socket_, const PCM * pcm);
368  void initPerf(uint32 socket_, const PCM * pcm);
369  void initBuses(uint32 socket_, const PCM * pcm);
370  void initRegisterLocations(const PCM * pcm);
371  uint64 getPMUCounter(std::vector<UncorePMU> & pmu, const uint32 id, const uint32 counter);
372 
373 public:
374  enum EventPosition {
375  READ=0,
376  WRITE=1,
377  READ_RANK_A=0,
378  WRITE_RANK_A=1,
379  READ_RANK_B=2,
380  WRITE_RANK_B=3,
381  PARTIAL=2,
382  PMM_READ=2,
383  PMM_WRITE=3,
384  PMM_MM_MISS_CLEAN=2,
385  PMM_MM_MISS_DIRTY=3,
386  NM_HIT=0, // NM : Near Memory (DRAM cache) in Memory Mode
387  M2M_CLOCKTICKS=1
388  };
392  ServerPCICFGUncore(uint32 socket_, const PCM * pcm);
394  void program();
396  uint64 getImcReads();
399  uint64 getImcReadsForController(uint32 controller);
403  uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel);
405  uint64 getImcWrites();
407  uint64 getHALocalRequests();
409  uint64 getHARequests();
410 
412  uint64 getPMMReads();
414  uint64 getPMMWrites();
415 
417  uint64 getEdcReads();
419  uint64 getEdcWrites();
420 
423  uint64 getIncomingDataFlits(uint32 port);
424 
427  uint64 getOutgoingFlits(uint32 port);
428 
430 
433  void program_power_metrics(int mc_profile);
434 
439  void programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, const int rankA = -1, const int rankB = -1);
440 
443  uint64 getQPIClocks(uint32 port);
444 
447  uint64 getQPIL0pTxCycles(uint32 port);
450  uint64 getUPIL0TxCycles(uint32 port);
453  uint64 getQPIL1Cycles(uint32 port);
456  uint64 getDRAMClocks(uint32 channel);
459  uint64 getMCDRAMClocks(uint32 channel);
463  uint64 getMCCounter(uint32 channel, uint32 counter);
467  uint64 getEDCCounter(uint32 channel, uint32 counter);
471  uint64 getQPILLCounter(uint32 port, uint32 counter);
475  uint64 getM3UPICounter(uint32 port, uint32 counter);
479  uint64 getM2MCounter(uint32 box, uint32 counter);
480 
482  void freezeCounters();
484  void unfreezeCounters();
485 
487  uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel);
488 
490  void enableJKTWorkaround(bool enable);
491 
493  size_t getNumQPIPorts() const { return xpiPMUs.size(); }
494 
496  uint64 getQPILinkSpeed(const uint32 linkNr) const
497  {
498  return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
499  }
500 
502  void reportQPISpeed() const;
503 
505  uint32 getNumMC() const { return (uint32)num_imc_channels.size(); }
506 
508  size_t getNumMCChannels() const { return (size_t)imcPMUs.size(); }
509 
512  size_t getNumMCChannels(const uint32 controller) const;
513 
515  size_t getNumEDCChannels() const { return edcPMUs.size(); }
516 };
517 
// Holder for a single 64-bit counter value, zero-initialized. The value is private and
// reachable only through the befriended getNumberOfEvents() template and the PCM class.
// (The class head on listing line 518 was missing; name taken from the constructor below.)
518 class SimpleCounterState
519 {
520  template <class T>
521  friend uint64 getNumberOfEvents(const T & before, const T & after);
522  friend class PCM;
523  uint64 data;
524 
525 public:
526  SimpleCounterState() : data(0)
527  { }
528  virtual ~SimpleCounterState() { }
529 };
530 
533 typedef std::vector<uint64> eventGroup_t;
534 
535 class PerfVirtualControlRegister;
536 
543 class PCM_API PCM
544 {
545  friend class BasicCounterState;
546  friend class UncoreCounterState;
547  friend class Socket;
548  friend class ServerUncore;
549  friend class PerfVirtualControlRegister;
550  friend class Aggregator;
551  friend class ServerPCICFGUncore;
552  PCM(); // forbidden to call directly because it is a singleton
553  PCM(const PCM &) = delete;
554  PCM & operator = (const PCM &) = delete;
555 
556  int32 cpu_family;
557  int32 cpu_model;
558  bool hybrid = false;
559  int32 cpu_stepping;
560  int64 cpu_microcode_level;
561  uint32 max_cpuid;
562  int32 threads_per_core;
563  int32 num_cores;
564  int32 num_sockets;
565  int32 num_phys_cores_per_socket;
566  int32 num_online_cores;
567  int32 num_online_sockets;
568  uint32 core_gen_counter_num_max;
569  uint32 core_gen_counter_num_used;
570  uint32 core_gen_counter_width;
571  uint32 core_fixed_counter_num_max;
572  uint32 core_fixed_counter_num_used;
573  uint32 core_fixed_counter_width;
574  uint64 core_global_ctrl_value{0ULL};
575  uint32 uncore_gen_counter_num_max;
576  uint32 uncore_gen_counter_num_used;
577  uint32 uncore_gen_counter_width;
578  uint32 uncore_fixed_counter_num_max;
579  uint32 uncore_fixed_counter_num_used;
580  uint32 uncore_fixed_counter_width;
581  uint32 perfmon_version;
582  int32 perfmon_config_anythread;
583  uint64 nominal_frequency;
584  uint64 max_qpi_speed; // in GBytes/second
585  uint32 L3ScalingFactor;
586  int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
587 
588  std::vector<TopologyEntry> topology;
589  SystemRoot* systemTopology;
590  std::string errorMessage;
591 
592  static PCM * instance;
593  bool programmed_core_pmu{false};
594  std::vector<std::shared_ptr<SafeMsrHandle> > MSR;
595  std::vector<std::shared_ptr<ServerPCICFGUncore> > server_pcicfg_uncore;
596  std::vector<UncorePMU> pcuPMUs;
597  std::vector<std::map<int32, UncorePMU> > iioPMUs;
598  std::vector<std::map<int32, UncorePMU> > irpPMUs;
599  std::vector<UncorePMU> uboxPMUs;
600  double joulesPerEnergyUnit;
601  std::vector<std::shared_ptr<CounterWidthExtender> > energy_status;
602  std::vector<std::shared_ptr<CounterWidthExtender> > dram_energy_status;
603  std::vector<std::vector<UncorePMU> > cboPMUs;
604 
605  std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_local;
606  std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_total;
607 #ifdef __linux__
608  Resctrl resctrl;
609 #endif
610  bool useResctrl;
611 
612  std::shared_ptr<FreeRunningBWCounters> clientBW;
613  std::shared_ptr<CounterWidthExtender> clientImcReads;
614  std::shared_ptr<CounterWidthExtender> clientImcWrites;
615  std::shared_ptr<CounterWidthExtender> clientGtRequests;
616  std::shared_ptr<CounterWidthExtender> clientIaRequests;
617  std::shared_ptr<CounterWidthExtender> clientIoRequests;
618 
619  std::vector<std::shared_ptr<ServerBW> > serverBW;
620 
621  bool disable_JKT_workaround;
622  bool blocked; // track if time-driven counter update is running or not: PCM is blocked
623 
624  uint64 * coreCStateMsr; // MSR addresses of core C-state free-running counters
625  uint64 * pkgCStateMsr; // MSR addresses of package C-state free-running counters
626 
627  std::vector<std::shared_ptr<CoreTaskQueue> > coreTaskQueues;
628 
629  bool L2CacheHitRatioAvailable;
630  bool L3CacheHitRatioAvailable;
631  bool L3CacheMissesAvailable;
632  bool L2CacheMissesAvailable;
633  bool L2CacheHitsAvailable;
634  bool L3CacheHitsNoSnoopAvailable;
635  bool L3CacheHitsSnoopAvailable;
636  bool L3CacheHitsAvailable;
637 
638  bool forceRTMAbortMode;
639 
640  std::vector<uint64> FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
641  bool isFixedCounterSupported(unsigned c);
642  bool vm = false;
643  bool linux_arch_perfmon = false;
644 
645 public:
646  enum { MAX_C_STATE = 10 }; // max C-state on Intel architecture
647 
// NOTE(review): the declaration line of this member function (listing line 649) is missing
// from this listing; the body implies an integer parameter `state` and a bool result —
// confirm the exact signature against the upstream header.
// States 0 and 1 are always reported as supported. Any other state is supported only when
// coreCStateMsr is set, `state` does not exceed MAX_C_STATE, and the table entry is non-zero.
// NOTE(review): a negative `state` is not rejected before coreCStateMsr is indexed — confirm
// that callers never pass one.
650  {
651  if (state == 0 || state == 1)
652  return true;
653 
654  return (coreCStateMsr != NULL && state <= ((int)MAX_C_STATE) && coreCStateMsr[state] != 0);
655  }
656 
// NOTE(review): the declaration line of this member function (listing line 658) is missing
// from this listing; the body implies an integer parameter `state` and a bool result —
// confirm the exact signature against the upstream header.
// State 0 is always reported as supported. Any other state is supported only when
// pkgCStateMsr is set, `state` does not exceed MAX_C_STATE, and the table entry is non-zero.
// NOTE(review): a negative `state` is not rejected before pkgCStateMsr is indexed — confirm
// that callers never pass one.
659  {
660  if (state == 0)
661  {
662  return true;
663  }
664  return (pkgCStateMsr != NULL && state <= ((int)MAX_C_STATE) && pkgCStateMsr[state] != 0);
665  }
666 
668  static void setOutput(const std::string filename, const bool cerrToo = false);
669 
671  void restoreOutput();
672 
// Stores the run state in run_state.
674  // Arguments:
675  // -- 1 - program is running
676  // -- 0 - program is sleeping
677  void setRunState(int new_state) { run_state = new_state; }
678 
// Returns the stored run state; const because it only reads run_state.
680  // Results:
681  // -- 1 - program is running
682  // -- 0 - program is sleeping
683  int getRunState(void) const { return run_state; }
684 
// Accessors for the `blocked` flag; the getter is const because it only reads the flag.
685  bool isBlocked(void) const { return blocked; }
686  void setBlocked(const bool new_blocked) { blocked = new_blocked; }
687 
// Mode selector for program(); DEFAULT_EVENTS is that function's default argument.
// INVALID_MODE takes the next implicit value (3).
689  enum ProgramMode {
690  DEFAULT_EVENTS = 0,
691  CUSTOM_CORE_EVENTS = 1,
692  EXT_CUSTOM_CORE_EVENTS = 2,
693  INVALID_MODE
694  };
695 
// Status codes returned by the program*() member functions of this class.
// UnknownError takes the next implicit value (3).
697  enum ErrorCode {
698  Success = 0,
699  MSRAccessDenied = 1,
700  PMUBusy = 2,
701  UnknownError
702  };
703 
// Identifiers for the fields of an event description. Values are implicit, starting at 0
// with INVALID. Per the inline comment, the entries from H_EVENT_NAME onward are not part
// of the perfmon definition itself.
704  enum PerfmonField {
705  INVALID, /* Use to parse invalid field */
706  OPCODE,
707  EVENT_SELECT,
708  UMASK,
709  RESET,
710  EDGE_DET,
711  IGNORED,
712  OVERFLOW_ENABLE,
713  ENABLE,
714  INVERT,
715  THRESH,
716  CH_MASK,
717  FC_MASK,
718  /* Below are not part of perfmon definition */
719  H_EVENT_NAME,
720  V_EVENT_NAME,
721  MULTIPLIER,
722  DIVIDER,
723  COUNTER_INDEX
724  };
725 
// PCIe width selector with implicit values 0..4. XFF is the value SimplePCIeDevInfo's
// constructor assigns by default.
// NOTE(review): X1..X16 presumably denote lane counts — confirm against the users of this enum.
726  enum PCIeWidthMode {
727  X1,
728  X4,
729  X8,
730  X16,
731  XFF
732  };
733 
// Unnamed enumeration of IIO stack indices; IIO_STACK_COUNT equals the number of stacks
// listed (6). The values match those of SkylakeIIOStacks declared next.
734  enum { // offsets/enumeration of IIO stacks
735  IIO_CBDMA = 0, // shared with DMI
736  IIO_PCIe0 = 1,
737  IIO_PCIe1 = 2,
738  IIO_PCIe2 = 3,
739  IIO_MCP0 = 4,
740  IIO_MCP1 = 5,
741  IIO_STACK_COUNT = 6
742  };
743 
744  // Offsets/enumeration of IIO stacks for Skylake server.
// SKX_IIO_STACK_COUNT equals the number of stacks listed (6).
745  enum SkylakeIIOStacks {
746  SKX_IIO_CBDMA_DMI = 0,
747  SKX_IIO_PCIe0 = 1,
748  SKX_IIO_PCIe1 = 2,
749  SKX_IIO_PCIe2 = 3,
750  SKX_IIO_MCP0 = 4,
751  SKX_IIO_MCP1 = 5,
752  SKX_IIO_STACK_COUNT = 6
753  };
754 
755  // Offsets/enumeration of IIO stacks for IceLake server.
// The stack order differs from the Skylake enumeration (e.g. CBDMA/DMI is index 5 here).
// ICX_IIO_STACK_COUNT equals the number of stacks listed (6).
756  enum IcelakeIIOStacks {
757  ICX_IIO_PCIe0 = 0,
758  ICX_IIO_PCIe1 = 1,
759  ICX_IIO_MCP0 = 2,
760  ICX_IIO_PCIe2 = 3,
761  ICX_IIO_PCIe3 = 4,
762  ICX_IIO_CBDMA_DMI = 5,
763  ICX_IIO_STACK_COUNT = 6
764  };
765 
766  // Offsets/enumeration of IIO stacks for Snowridge.
// SNR_IIO_STACK_COUNT equals the number of stacks listed (5).
767  enum SnowridgeIIOStacks {
768  SNR_IIO_QAT = 0,
769  SNR_IIO_CBDMA_DMI = 1,
770  SNR_IIO_NIS = 2,
771  SNR_IIO_HQM = 3,
772  SNR_IIO_PCIe0 = 4,
773  SNR_IIO_STACK_COUNT = 5
774  };
775 
// Plain record describing a PCIe device: width mode plus device name and bus number strings.
// The width defaults to XFF; the strings default to empty.
// (The struct head on listing line 776 was missing; the name is taken from the constructor.
// `struct` is used because the constructor and members carry no access specifier.)
776  struct SimplePCIeDevInfo
777  {
778  enum PCIeWidthMode width;
779  std::string pciDevName;
780  std::string busNumber;
781 
782  SimplePCIeDevInfo() : width(XFF) { }
783  };
784 
// Event number / umask pair describing one custom core event; both default to 0.
// (The struct head on listing line 792 was missing. The name is the type used for the
// coreEventDesc / hybridAtomEventDesc arrays later in this class, which has no other
// visible definition — NOTE(review): confirm against the upstream header.)
792  struct CustomCoreEventDescription
793  {
794  int32 event_number = 0, umask_value = 0;
795  };
796 
// Extended description of a custom core counter configuration. The pointer members are
// optional: per the field comments, a null pointer requests the default configuration.
// The constructor nulls all pointers, zeroes nGPCounters and both OffcoreResponseMsrValue
// entries, and sets the load-latency and frontend MSR values to invalidMsrValue() (~0ULL).
// (The struct head on listing line 806 was missing; the name is taken from the constructor.)
806  struct ExtendedCustomCoreEventDescription
807  {
808  FixedEventControlRegister * fixedCfg; // if NULL, then default configuration performed for fixed counters
809  uint32 nGPCounters; // number of general purpose counters
810  EventSelectRegister * gpCounterCfg; // general purpose counters, if NULL, then default configuration performed for GP counters
811  EventSelectRegister * gpCounterHybridAtomCfg; // general purpose counters for Atom cores in hybrid processors
812  uint64 OffcoreResponseMsrValue[2];
813  uint64 LoadLatencyMsrValue, FrontendMsrValue;
814  bool defaultUncoreProgramming{true};
// Sentinel meaning "no MSR value supplied".
815  static uint64 invalidMsrValue() { return ~0ULL; }
// All pointer members are initialized with nullptr for consistency (fixedCfg previously used NULL).
816  ExtendedCustomCoreEventDescription() : fixedCfg(nullptr), nGPCounters(0), gpCounterCfg(nullptr), gpCounterHybridAtomCfg(nullptr), LoadLatencyMsrValue(invalidMsrValue()), FrontendMsrValue(invalidMsrValue())
817  {
818  OffcoreResponseMsrValue[0] = 0;
819  OffcoreResponseMsrValue[1] = 0;
820  }
821  };
822 
// NOTE(review): the struct/class head for this block (listing line 823) is missing from
// this listing and its name cannot be determined from the visible code — restore it from
// the upstream header.
// Holds four parallel per-counter arrays: event name, event opcode register, multiplier
// and divider.
824  {
825  /* We program the same counters to every IIO Stacks */
826  std::string eventNames[4];
827  IIOPMUCNTCTLRegister eventOpcodes[4];
828  int multiplier[4]; //Some IIO event requires transformation to get meaningful output (i.e. DWord to bytes)
829  int divider[4]; //We usually like to have some kind of divider (i.e. /10e6 )
830  };
831 
// Positions of the "index" and "type" entries within an MSR event description.
// NOTE(review): presumably these index into a RawEventConfig's value array — confirm against the users.
832  enum MSREventPosition
833  {
834  index = 0,
835  type = 1
836  };
// Kind of MSR event: Static (0) or Freerun (1).
// NOTE(review): presumably the value stored at MSREventPosition::type — confirm against the users.
837  enum MSRType
838  {
839  Static = 0,
840  Freerun = 1
841  };
842 
843 private:
844  ProgramMode mode;
845  CustomCoreEventDescription coreEventDesc[PERF_MAX_CUSTOM_COUNTERS];
846  CustomCoreEventDescription hybridAtomEventDesc[PERF_MAX_CUSTOM_COUNTERS];
847 
848  std::vector<int32> socketRefCore;
849 
850  bool canUsePerf;
851 #ifdef PCM_USE_PERF
852  typedef std::vector<std::vector<int> > PerfEventHandleContainer;
853  PerfEventHandleContainer perfEventHandle;
854  std::vector<PerfEventHandleContainer> perfEventTaskHandle;
855  void readPerfData(uint32 core, std::vector<uint64> & data);
856  void closePerfHandles(const bool silent = false);
857 
858  enum {
859  PERF_INST_RETIRED_POS = 0,
860  PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
861  PERF_CPU_CLK_UNHALTED_REF_POS = 2,
862  PERF_GEN_EVENT_0_POS = 3,
863  PERF_GEN_EVENT_1_POS = 4,
864  PERF_GEN_EVENT_2_POS = 5,
865  PERF_GEN_EVENT_3_POS = 6,
866  PERF_TOPDOWN_SLOTS_POS = PERF_GEN_EVENT_0_POS + PERF_MAX_CUSTOM_COUNTERS,
867  PERF_TOPDOWN_FRONTEND_POS = PERF_TOPDOWN_SLOTS_POS + 1,
868  PERF_TOPDOWN_BADSPEC_POS = PERF_TOPDOWN_SLOTS_POS + 2,
869  PERF_TOPDOWN_BACKEND_POS = PERF_TOPDOWN_SLOTS_POS + 3,
870  PERF_TOPDOWN_RETIRING_POS = PERF_TOPDOWN_SLOTS_POS + 4
871  };
872 
873  std::array<int, (PERF_TOPDOWN_RETIRING_POS + 1)> perfTopDownPos;
874 
875  enum {
876  PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_POS,
877  PERF_TOPDOWN_GROUP_LEADER_COUNTER = PERF_TOPDOWN_SLOTS_POS
878  };
879 #endif
880  static std::ofstream * outfile; // output file stream
881  static std::streambuf * backup_ofile; // backup of original output = cout
882  static std::streambuf * backup_ofile_cerr; // backup of original output = cerr
883  int run_state; // either running (1) or sleeping (0)
884 
885  bool needToRestoreNMIWatchdog;
886 
887  std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters;
888  uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
889  ErrorCode programCoreCounters(int core, const PCM::ProgramMode mode, const ExtendedCustomCoreEventDescription * pExtDesc,
890  std::vector<EventSelectRegister> & programmedCustomCounters, const std::vector<int> & tids);
891 
892  bool PMUinUse();
893  void cleanupPMU(const bool silent = false);
894  void cleanupRDT(const bool silent = false);
895 
896  void computeQPISpeedBeckton(int core_nr);
897  void destroyMSR();
898  void computeNominalFrequency();
899  static bool isCPUModelSupported(const int model_);
900  std::string getSupportedUarchCodenames() const;
901  std::string getUnsupportedMessage() const;
902  bool detectModel();
903  bool checkModel();
904 
905  void initCStateSupportTables();
906  bool discoverSystemTopology();
907  void printSystemTopology() const;
908  bool initMSR();
909  bool detectNominalFrequency();
910  void showSpecControlMSRs();
911  void initEnergyMonitoring();
912  void initUncoreObjects();
918  void initRDT();
926  void initQOSevent(const uint64 event, const int32 core);
927  void programBecktonUncore(int core);
928  void programNehalemEPUncore(int core);
929  void enableJKTWorkaround(bool enable);
930  template <class CounterStateType>
931  void readAndAggregateMemoryBWCounters(const uint32 core, CounterStateType & counterState);
932  template <class CounterStateType>
933  void readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType & counterState);
934  template <class CounterStateType>
935  void readAndAggregateEnergyCounters(const uint32 socket, CounterStateType & counterState);
936  template <class CounterStateType>
937  void readPackageThermalHeadroom(const uint32 socket, CounterStateType & counterState);
938  template <class CounterStateType>
939  void readAndAggregatePackageCStateResidencies(std::shared_ptr<SafeMsrHandle> msr, CounterStateType & result);
940 public:
941  struct RawPMUConfig;
942 private:
943  template <class CounterStateType>
944  void readMSRs(std::shared_ptr<SafeMsrHandle> msr, const RawPMUConfig & msrConfig, CounterStateType & result);
945  void readQPICounters(SystemCounterState & counterState);
946  void reportQPISpeed() const;
947  void readCoreCounterConfig(const bool complainAboutMSR = false);
948  void readCPUMicrocodeLevel();
949 
950  uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const;
951  uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const;
952  uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const;
953  uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const;
954  uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const;
955  void programCboOpcodeFilter(const uint32 opc0, UncorePMU & pmu, const uint32 nc_, const uint32 opc1, const uint32 loc, const uint32 rem);
956  void initLLCReadMissLatencyEvents(uint64 * events, uint32 & opCode);
957  void initCHARequestEvents(uint64 * events);
958  void programCbo();
959  uint64 getCBOCounterState(const uint32 socket, const uint32 ctr_);
// Programs the counter control registers of `pmu` from the event range [eventsBegin, eventsEnd).
// \param pmu          PMU whose counterControl registers are written
// \param eventsBegin  first event value; the function returns immediately if `!eventsBegin`
//                     is true (e.g. a null pointer)
// \param eventsEnd    one past the last event value
// \param extra        if non-zero, passed to pmu.resetUnfreeze() after programming
// Each present control register is written twice: first with the enable bit alone, then with
// the enable bit OR-ed with the event value.
// At most as many events as pmu.counterControl has slots are consumed; surplus events are
// ignored rather than indexing past the end of the array.
960  template <class Iterator>
961  static void program(UncorePMU& pmu, const Iterator& eventsBegin, const Iterator& eventsEnd, const uint32 extra)
962  {
963  if (!eventsBegin) return;
964  Iterator curEvent = eventsBegin;
965  for (size_t c = 0; c < (sizeof(pmu.counterControl) / sizeof(pmu.counterControl[0])) && curEvent != eventsEnd; ++c, ++curEvent)
966  {
// bind by reference: no shared_ptr copy (and no refcount traffic) is needed here
967  const auto & ctrl = pmu.counterControl[c];
968  if (ctrl.get() != nullptr)
969  {
970  *ctrl = MC_CH_PCI_PMON_CTL_EN;
971  *ctrl = MC_CH_PCI_PMON_CTL_EN | *curEvent;
972  }
973  }
974  if (extra)
975  {
976  pmu.resetUnfreeze(extra);
977  }
978  }
979  void programPCU(uint32 * events, const uint64 filter);
980  void programUBOX(const uint64* events);
981 
982  void cleanupUncorePMUs(const bool silent = false);
983 
// True when cpu_model is SKX and cpu_stepping is 5, 6 or 7.
984  bool isCLX() const // Cascade Lake-SP
985  {
986  return (PCM::SKX == cpu_model) && (cpu_stepping > 4 && cpu_stepping < 8);
987  }
988 
// True when the given model is SKX and the given stepping is 10 or higher.
989  static bool isCPX(int cpu_model_, int cpu_stepping_) // Cooper Lake
990  {
991  return (PCM::SKX == cpu_model_) && (cpu_stepping_ >= 10);
992  }
993 
// Applies the static isCPX(model, stepping) check to this object's cpu_model and cpu_stepping.
994  bool isCPX() const
995  {
996  return isCPX(cpu_model, cpu_stepping);
997  }
998 
999  void initUncorePMUsDirect();
1000  void initUncorePMUsPerf();
1001  bool isRDTDisabled() const;
1002 
1003 public:
1005  bool isHWTMAL1Supported() const;
1006 
1007  enum EventPosition
1008  {
1009  TOR_OCCUPANCY = 0,
1010  TOR_INSERTS = 1,
1011  REQUESTS_ALL = 2,
1012  REQUESTS_LOCAL = 3
1013  };
1015  bool isSecureBoot() const;
1016 
1018  bool useLinuxPerfForUncore() const;
1019 
// Returns a const reference to the object systemTopology points to. The pointer is
// dereferenced without a null check, so it must have been set before this is called.
1025  SystemRoot const & getSystemTopology() const {
1026  return *systemTopology;
1027  }
1028 
1030  void printDetailedSystemTopology();
1031 
1037  bool QOSMetricAvailable() const;
1043  bool L3QOSMetricAvailable() const;
1049  bool L3CacheOccupancyMetricAvailable() const;
1055  bool CoreLocalMemoryBWMetricAvailable() const;
1061  bool CoreRemoteMemoryBWMetricAvailable() const;
1067  unsigned getMaxRMID() const;
1068 
1070  uint32 getMaxNumOfCBoxes() const;
1071 
1073  uint32 getMaxNumOfIIOStacks() const;
1074 
1083  static PCM * getInstance(); // the only way to get access
1084 
1092  bool good(); // true if access to CPU counters works
1093 
// Returns a reference to the stored errorMessage string; the reference stays valid only
// as long as this PCM object does.
1098  const std::string & getErrorMessage() const
1099  {
1100  return errorMessage;
1101  }
1102 
1116  ErrorCode program(const ProgramMode mode_ = DEFAULT_EVENTS, const void * parameter_ = NULL, const bool silent = false, const int pid = -1); // program counters and start counting
1117 
1121  void checkError(const ErrorCode code);
1122 
1134  ErrorCode programServerUncoreLatencyMetrics(bool enable_pmm);
1135 
1149  ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands = NULL);
1150 
1151  /* \brief Program memory counters (disables programming performance counters)
1152  \param rankA count DIMM rank1 statistics (disables memory channel monitoring)
1153  \param rankB count DIMM rank2 statistics (disables memory channel monitoring)
1154  \param metrics metric set (see the ServerUncoreMemoryMetrics enum)
1155 
1156  Call this method before you start using the memory counter routines on microarchitecture codename SandyBridge-EP and later Xeon uarch
1157 
1158  \warning Using these routines with other tools that *program* Performance Monitoring
1159  Units (PMUs) on CPUs is not recommended because PMU can not be shared. Tools that are known to
1160  program PMUs: Intel(r) VTune(tm), Intel(r) Performance Tuning Utility (PTU). This code may make
1161  VTune or PTU measurements invalid. VTune or PTU measurement may make measurement with this code invalid. Please enable either usage of these routines or VTune/PTU/etc.
1162  */
1163  ErrorCode programServerUncoreMemoryMetrics(const ServerUncoreMemoryMetrics & metrics, int rankA = -1, int rankB = -1);
1164 
1165  // vector of IDs. E.g. for core {raw event} or {raw event, offcore response1 msr value, } or {raw event, offcore response1 msr value, offcore response2}
1166  // or for cha/cbo {raw event, filter value}, etc
1167  // + user-supplied name
1168  typedef std::pair<std::array<uint64, 5>, std::string> RawEventConfig;
// Raw event configuration for one PMU: a list of programmable-counter events and a list
// of fixed-counter events, each entry being a RawEventConfig (value array plus name).
// (The struct head on listing line 1169 was missing; the name comes from the forward
// declaration `struct RawPMUConfig;` earlier in this class and the RawPMUConfigs typedef below.)
1169  struct RawPMUConfig
1170  {
1171  std::vector<RawEventConfig> programmable;
1172  std::vector<RawEventConfig> fixed;
1173  };
1174  enum {
1175  OCR0Pos = 1,
1176  OCR1Pos = 2,
1177  LoadLatencyPos = 3,
1178  FrontendPos = 4
1179  };
1180  typedef std::map<std::string, RawPMUConfig> RawPMUConfigs;
1181  ErrorCode program(const RawPMUConfigs& curPMUConfigs, const bool silent = false, const int pid = -1);
1182 
// Returns the (event number, umask) pair for offcore-response event `event` on core `coreID`.
// \param event   0 or 1; any other value trips the assert at the end and, in builds without
//                asserts, yields (0, 0)
// \param coreID  index into `topology`; asserted to be in range
1183  std::pair<unsigned, unsigned> getOCREventNr(const int event, const unsigned coreID) const
1184  {
1185  assert (coreID < topology.size());
// Hybrid ADL parts: Atom cores use OFFCORE_RESPONSE_0_EVTNR with `event + 1` as the second element.
1186  if (hybrid)
1187  {
1188  switch (cpu_model)
1189  {
1190  case ADL:
1191  if (topology[coreID].core_type == TopologyEntry::Atom)
1192  {
1193  return std::make_pair(OFFCORE_RESPONSE_0_EVTNR, event + 1);
1194  }
1195  break;
1196  }
1197  }
// All remaining cores on ADL use the GLC_* event numbers; other models use the generic ones.
1198  bool useGLCOCREvent = false;
1199  switch (cpu_model)
1200  {
1201  case ADL: // ADL big core (GLC)
1202  useGLCOCREvent = true;
1203  break;
1204  }
1205  switch (event)
1206  {
1207  case 0:
1208  return std::make_pair(useGLCOCREvent ? GLC_OFFCORE_RESPONSE_0_EVTNR : OFFCORE_RESPONSE_0_EVTNR, OFFCORE_RESPONSE_0_UMASK);
1209  case 1:
1210  return std::make_pair(useGLCOCREvent ? GLC_OFFCORE_RESPONSE_1_EVTNR : OFFCORE_RESPONSE_1_EVTNR, OFFCORE_RESPONSE_1_UMASK);
1211  }
1212  assert (false && "wrong event nr in getOCREventNr");
1213  return std::make_pair(0U, 0U);
1214  }
1215 
1217  void freezeServerUncoreCounters();
1218 
1220  void unfreezeServerUncoreCounters();
1221 
1226  ServerUncoreCounterState getServerUncoreCounterState(uint32 socket);
1227 
1233  void cleanup(const bool silent = false);
1234 
1239  void resetPMU();
1240 
1249  void getAllCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates, const bool readAndAggregateSocketUncoreCounters = true);
1250 
1257  void getUncoreCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates);
1258 
1263  bool isCoreOnline(int32 os_core_id) const;
1264 
1269  bool isSocketOnline(int32 socket_id) const;
1270 
1279 
1285 
1295 
1299  uint32 getNumCores() const;
1300 
1304  uint32 getNumOnlineCores() const;
1305 
1309  uint32 getNumSockets() const;
1310 
1314  uint32 getNumOnlineSockets() const;
1315 
1321  uint32 getThreadsPerCore() const;
1322 
1326  bool getSMT() const; // returns true iff SMT ("Hyperthreading") is on
1327 
1331  uint64 getNominalFrequency() const; // in Hz
1332 
1337  uint32 getL3ScalingFactor() const;
1338 
1344  bool isSomeCoreOfflined();
1345 
1348  int32 getMaxCustomCoreEvents();
1349 
1352  static int getCPUModelFromCPUID();
1353 
1356  {
1357  NEHALEM_EP = 26,
1358  NEHALEM = 30,
1359  ATOM = 28,
1360  ATOM_2 = 53,
1361  CENTERTON = 54,
1362  BAYTRAIL = 55,
1363  AVOTON = 77,
1364  CHERRYTRAIL = 76,
1365  APOLLO_LAKE = 92,
1366  DENVERTON = 95,
1367  SNOWRIDGE = 134,
1368  CLARKDALE = 37,
1369  WESTMERE_EP = 44,
1370  NEHALEM_EX = 46,
1371  WESTMERE_EX = 47,
1372  SANDY_BRIDGE = 42,
1373  JAKETOWN = 45,
1374  IVY_BRIDGE = 58,
1375  HASWELL = 60,
1376  HASWELL_ULT = 69,
1377  HASWELL_2 = 70,
1378  IVYTOWN = 62,
1379  HASWELLX = 63,
1380  BROADWELL = 61,
1381  BROADWELL_XEON_E3 = 71,
1382  BDX_DE = 86,
1383  SKL_UY = 78,
1384  KBL = 158,
1385  KBL_1 = 142,
1386  CML = 166,
1387  CML_1 = 165,
1388  ICL = 126,
1389  ICL_1 = 125,
1390  RKL = 167,
1391  TGL = 140,
1392  TGL_1 = 141,
1393  ADL = 151,
1394  ADL_1 = 154,
1395  BDX = 79,
1396  KNL = 87,
1397  SKL = 94,
1398  SKX = 85,
1399  ICX_D = 108,
1400  ICX = 106,
1401  END_OF_MODEL_LIST = 0x0ffff
1402  };
1403 
1404 #define PCM_SKL_PATH_CASES \
1405  case PCM::SKL_UY: \
1406  case PCM::KBL: \
1407  case PCM::KBL_1: \
1408  case PCM::CML: \
1409  case PCM::ICL: \
1410  case PCM::RKL: \
1411  case PCM::TGL: \
1412  case PCM::SKL:
1413 
1414 private:
1415  bool useSKLPath() const
1416  {
1417  switch (cpu_model)
1418  {
1419  PCM_SKL_PATH_CASES
1420  return true;
1421  }
1422  return false;
1423  }
1424  RawPMUConfig threadMSRConfig{}, packageMSRConfig{};
1425 public:
1426 
1429  uint32 getCPUModel() const { return (uint32)cpu_model; }
1430 
1433  uint32 getCPUStepping() const { return (uint32)cpu_stepping; }
1434 
1438  int32 getThreadId(uint32 os_id) const { return (int32)topology[os_id].thread_id; }
1439 
1443  int32 getCoreId(uint32 os_id) const { return (int32)topology[os_id].core_id; }
1444 
1448  int32 getTileId(uint32 os_id) const { return (int32)topology[os_id].tile_id; }
1449 
1453  int32 getSocketId(uint32 core_id) const { return (int32)topology[core_id].socket; }
1454 
1457  uint64 getQPILinksPerSocket() const
1458  {
1459  switch (cpu_model)
1460  {
1461  case NEHALEM_EP:
1462  case WESTMERE_EP:
1463  case CLARKDALE:
1464  if (num_sockets == 2)
1465  return 2;
1466  else
1467  return 1;
1468  case NEHALEM_EX:
1469  case WESTMERE_EX:
1470  return 4;
1471  case JAKETOWN:
1472  case IVYTOWN:
1473  case HASWELLX:
1474  case BDX_DE:
1475  case BDX:
1476  case SKX:
1477  case ICX:
1478  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumQPIPorts()) : 0;
1479  }
1480  return 0;
1481  }
1482 
1484  uint32 getMCPerSocket() const
1485  {
1486  switch (cpu_model)
1487  {
1488  case NEHALEM_EP:
1489  case WESTMERE_EP:
1490  case CLARKDALE:
1491  return 1;
1492  case NEHALEM_EX:
1493  case WESTMERE_EX:
1494  return 2;
1495  case JAKETOWN:
1496  case IVYTOWN:
1497  case HASWELLX:
1498  case BDX_DE:
1499  case SKX:
1500  case ICX:
1501  case BDX:
1502  case KNL:
1503  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMC()) : 0;
1504  }
1505  return 0;
1506  }
1507 
1509  size_t getMCChannelsPerSocket() const
1510  {
1511  switch (cpu_model)
1512  {
1513  case NEHALEM_EP:
1514  case WESTMERE_EP:
1515  case CLARKDALE:
1516  return 3;
1517  case NEHALEM_EX:
1518  case WESTMERE_EX:
1519  return 4;
1520  case JAKETOWN:
1521  case IVYTOWN:
1522  case HASWELLX:
1523  case BDX_DE:
1524  case SKX:
1525  case ICX:
1526  case BDX:
1527  case KNL:
1528  case SNOWRIDGE:
1529  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMCChannels()) : 0;
1530  }
1531  return 0;
1532  }
1533 
1537  size_t getMCChannels(uint32 socket, uint32 controller) const
1538  {
1539  switch (cpu_model)
1540  {
1541  case NEHALEM_EP:
1542  case WESTMERE_EP:
1543  case CLARKDALE:
1544  return 3;
1545  case NEHALEM_EX:
1546  case WESTMERE_EX:
1547  return 4;
1548  case JAKETOWN:
1549  case IVYTOWN:
1550  case HASWELLX:
1551  case BDX_DE:
1552  case SKX:
1553  case ICX:
1554  case BDX:
1555  case KNL:
1556  case SNOWRIDGE:
1557  return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0;
1558  }
1559  return 0;
1560  }
1561 
1562 
1565  {
1566  switch (cpu_model)
1567  {
1568  case KNL:
1569  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumEDCChannels()) : 0;
1570  }
1571  return 0;
1572  }
1573 
1574 
1577  uint32 getMaxIPC() const
1578  {
1579  if (ICL == cpu_model || TGL == cpu_model || RKL == cpu_model) return 5;
1580  switch (cpu_model)
1581  {
1582  case ADL:
1583  return 6;
1584  case SNOWRIDGE:
1585  return 4;
1586  case DENVERTON:
1587  return 3;
1588  case NEHALEM_EP:
1589  case WESTMERE_EP:
1590  case NEHALEM_EX:
1591  case WESTMERE_EX:
1592  case CLARKDALE:
1593  case SANDY_BRIDGE:
1594  case JAKETOWN:
1595  case IVYTOWN:
1596  case IVY_BRIDGE:
1597  case HASWELL:
1598  case HASWELLX:
1599  case BROADWELL:
1600  case BDX_DE:
1601  case BDX:
1602  PCM_SKL_PATH_CASES
1603  case SKX:
1604  return 4;
1605  case KNL:
1606  return 2;
1607  case ICX:
1608  return 5;
1609  }
1610  if (isAtom())
1611  {
1612  return 2;
1613  }
1614  std::cerr << "MaxIPC is not defined for your cpu model " << cpu_model << '\n';
1615  assert (0);
1616  return 0;
1617  }
1618 
1620  uint64 getPCUFrequency() const
1621  {
1622  switch (cpu_model)
1623  {
1624  case JAKETOWN:
1625  case IVYTOWN:
1626  return 800000000ULL; // 800 MHz
1627  case HASWELLX:
1628  case BDX_DE:
1629  case BDX:
1630  case KNL:
1631  return 1000000000ULL; // 1 GHz
1632  case SKX:
1633  case ICX:
1634  case SNOWRIDGE:
1635  return 1100000000ULL; // 1.1 GHz
1636  }
1637  return 0;
1638  }
1639 
1641  bool isServerCPU() const
1642  {
1643  switch (cpu_model)
1644  {
1645  case NEHALEM_EP:
1646  case NEHALEM_EX:
1647  case WESTMERE_EP:
1648  case WESTMERE_EX:
1649  case JAKETOWN:
1650  case IVYTOWN:
1651  case HASWELLX:
1652  case BDX:
1653  case BDX_DE:
1654  case SKX:
1655  case ICX:
1656  case SNOWRIDGE:
1657  case KNL:
1658  return true;
1659  default:
1660  return false;
1661  };
1662  }
1663 
1665  bool isClientCPU() const
1666  {
1667  return !isServerCPU();
1668  }
1673  uint64 getTickCount(uint64 multiplier = 1000 /* ms */, uint32 core = 0);
1674 
1675  uint64 getInvariantTSC_Fast(uint32 core = 0);
1676 
1678  uint64 getUncoreClocks(const uint32 socket_);
1679 
1683  uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
1684  {
1685  return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed;
1686  }
1687 
1689  double getJoulesPerEnergyUnit() const { return joulesPerEnergyUnit; }
1690 
1692  int32 getPackageThermalSpecPower() const { return pkgThermalSpecPower; }
1693 
1695  int32 getPackageMinimumPower() const { return pkgMinimumPower; }
1696 
1698  int32 getPackageMaximumPower() const { return pkgMaximumPower; }
1699 
1700  #ifndef NO_WINRING // In cases where loading the WinRing0 driver is not desirable as a fallback to MSR.sys, add -DNO_WINRING to compile command to remove ability to load driver
1701  static bool initWinRing0Lib();
1704  #endif // NO_WINRING
1705 
1706  inline void disableJKTWorkaround() { disable_JKT_workaround = true; }
1707 
// Numeric event codes grouped by traffic direction; see the per-entry comments.
// NOTE(review): where these codes are consumed is not visible in this part of the file.
1708  enum PCIeEventCode
1709  {
1710  // PCIe read events (PCI devices reading from memory - application writes to disk/network/PCIe device)
1711  PCIeRdCur = 0x19E, // PCIe read current (full cache line)
1712  PCIeNSRd = 0x1E4, // PCIe non-snoop read (full cache line)
1713  // PCIe write events (PCI devices writing to memory - application reads from disk/network/PCIe device)
1714  PCIeWiLF = 0x194, // PCIe Write (non-allocating) (full cache line)
1715  PCIeItoM = 0x19C, // PCIe Write (allocating) (full cache line)
1716  PCIeNSWr = 0x1E5, // PCIe Non-snoop write (partial cache line)
1717  PCIeNSWrF = 0x1E6, // PCIe Non-snoop write (full cache line)
1718  // events shared by CPU and IO
1719  RFO = 0x180, // Demand Data RFO; share the same code for CPU, use tid to filter PCIe only traffic
1720  CRd = 0x181, // Demand Code Read
1721  DRd = 0x182, // Demand Data Read
1722  PRd = 0x187, // Partial Reads (UC) (MMIO Read)
1723  WiL = 0x18F, // Write Invalidate Line - partial (MMIO write), PL: Not documented in HSX/IVT
1724  ItoM = 0x1C8, // Request Invalidate Line; share the same code for CPU, use tid to filter PCIe only traffic
1725 
// SKX_-prefixed counterparts of the shared events above, using a different code range
// (presumably the encoding for the SKX model - confirm against the users of these values)
1726  SKX_RFO = 0x200,
1727  SKX_CRd = 0x201,
1728  SKX_DRd = 0x202,
1729  SKX_PRd = 0x207,
1730  SKX_WiL = 0x20F,
1731  SKX_RdCur = 0x21E,
1732  SKX_ItoM = 0x248,
1733  };
1734 
// Identifies a CHA pipeline queue; the enumerators take the implicit values None = 0, IRQ = 1, PRQ = 2.
// NOTE(review): the meaning of each queue is not documented in the visible code.
1735  enum ChaPipelineQueue
1736  {
1737  None,
1738  IRQ,
1739  PRQ,
1740  };
1741 
// Tid values associated with the RFO and ItoM events; both enumerators share the value 0x3E.
1742  enum CBoEventTid
1743  {
1744  RFOtid = 0x3E,
1745  ItoMtid = 0x3E,
1746  };
1747 
1750  void programPCIeEventGroup(eventGroup_t &eventGroup);
1751  uint64 getPCIeCounterData(const uint32 socket_, const uint32 ctr_);
1752 
1760  void programCbo(const uint64 * events, const uint32 opCode = 0, const uint32 nc_ = 0, const uint32 llc_lookup_tid_filter = 0, const uint32 loc = 1, const uint32 rem = 1);
1761 
1766  void programCboRaw(const uint64* events, const uint64 filter0, const uint64 filter1);
1767 
1771  PCIeCounterState getPCIeCounterState(const uint32 socket_, const uint32 ctr_ = 0);
1772 
1776  void programIIOCounters(uint64 rawEvents[4], int IIOStack = -1);
1777 
1781  void programIRPCounters(uint64 rawEvents[4], int IIOStack = -1);
1782 
1787  IIOCounterState getIIOCounterState(int socket, int IIOStack, int counter);
1788 
1793  void getIIOCounterStates(int socket, int IIOStack, IIOCounterState * result);
1794 
1795  uint64 extractCoreGenCounterValue(uint64 val);
1796  uint64 extractCoreFixedCounterValue(uint64 val);
1797  uint64 extractUncoreGenCounterValue(uint64 val);
1798  uint64 extractUncoreFixedCounterValue(uint64 val);
1799  uint64 extractQOSMonitoring(uint64 val);
1800 
1803  const char * getUArchCodename(const int32 cpu_model_ = -1) const;
1804 
1806  static std::string getCPUBrandString();
1807  std::string getCPUFamilyModelString();
1808 
1809 
1811  void enableForceRTMAbortMode(const bool silent = false);
1812 
1814  bool isForceRTMAbortModeEnabled() const;
1815 
1817  void disableForceRTMAbortMode(const bool silent = false);
1818 
1820  bool isForceRTMAbortModeAvailable() const;
1821 
1823  int64 getCPUMicrocodeLevel() const { return cpu_microcode_level; }
1824 
1826  static bool isAtom(const int32 cpu_model_)
1827  {
1828  return cpu_model_ == ATOM
1829  || cpu_model_ == ATOM_2
1830  || cpu_model_ == CENTERTON
1831  || cpu_model_ == BAYTRAIL
1832  || cpu_model_ == AVOTON
1833  || cpu_model_ == CHERRYTRAIL
1834  || cpu_model_ == APOLLO_LAKE
1835  || cpu_model_ == DENVERTON
1836  // || cpu_model_ == SNOWRIDGE do not use Atom code for SNOWRIDGE
1837  ;
1838  }
1839 
1841  bool isAtom() const
1842  {
1843  return isAtom(cpu_model);
1844  }
1845 
1846  bool packageEnergyMetricsAvailable() const
1847  {
1848  return (
1849  cpu_model == PCM::JAKETOWN
1850  || cpu_model == PCM::IVYTOWN
1851  || cpu_model == PCM::SANDY_BRIDGE
1852  || cpu_model == PCM::IVY_BRIDGE
1853  || cpu_model == PCM::HASWELL
1854  || cpu_model == PCM::AVOTON
1855  || cpu_model == PCM::CHERRYTRAIL
1856  || cpu_model == PCM::BAYTRAIL
1857  || cpu_model == PCM::APOLLO_LAKE
1858  || cpu_model == PCM::DENVERTON
1859  || cpu_model == PCM::SNOWRIDGE
1860  || cpu_model == PCM::HASWELLX
1861  || cpu_model == PCM::BROADWELL
1862  || cpu_model == PCM::BDX_DE
1863  || cpu_model == PCM::BDX
1864  || cpu_model == PCM::KNL
1865  || useSKLPath()
1866  || cpu_model == PCM::SKX
1867  || cpu_model == PCM::ICX
1868  || cpu_model == PCM::ADL
1869  );
1870  }
1871 
1872  bool dramEnergyMetricsAvailable() const
1873  {
1874  return (
1875  cpu_model == PCM::JAKETOWN
1876  || cpu_model == PCM::IVYTOWN
1877  || cpu_model == PCM::HASWELLX
1878  || cpu_model == PCM::BDX_DE
1879  || cpu_model == PCM::BDX
1880  || cpu_model == PCM::KNL
1881  || cpu_model == PCM::SKX
1882  || cpu_model == PCM::ICX
1883  );
1884  }
1885 
1886  bool packageThermalMetricsAvailable() const
1887  {
1888  return packageEnergyMetricsAvailable();
1889  }
1890 
1891  bool outgoingQPITrafficMetricsAvailable() const
1892  {
1893  return getQPILinksPerSocket() > 0 &&
1894  (
1895  cpu_model == PCM::NEHALEM_EX
1896  || cpu_model == PCM::WESTMERE_EX
1897  || cpu_model == PCM::JAKETOWN
1898  || cpu_model == PCM::IVYTOWN
1899  || cpu_model == PCM::HASWELLX
1900  || cpu_model == PCM::BDX
1901  || cpu_model == PCM::SKX
1902  || cpu_model == PCM::ICX
1903  );
1904  }
1905 
1906  bool incomingQPITrafficMetricsAvailable() const
1907  {
1908  return getQPILinksPerSocket() > 0 &&
1909  (
1910  cpu_model == PCM::NEHALEM_EX
1911  || cpu_model == PCM::WESTMERE_EX
1912  || cpu_model == PCM::JAKETOWN
1913  || cpu_model == PCM::IVYTOWN
1914  || (cpu_model == PCM::SKX && cpu_stepping > 1)
1915  || cpu_model == PCM::ICX
1916  );
1917  }
1918 
1919  bool localMemoryRequestRatioMetricAvailable() const
1920  {
1921  return cpu_model == PCM::HASWELLX
1922  || cpu_model == PCM::BDX
1923  || cpu_model == PCM::SKX
1924  || cpu_model == PCM::ICX
1925  ;
1926  }
1927 
1928  bool qpiUtilizationMetricsAvailable() const
1929  {
1930  return outgoingQPITrafficMetricsAvailable();
1931  }
1932 
1933  bool memoryTrafficMetricsAvailable() const
1934  {
1935  return (!(isAtom() || cpu_model == PCM::CLARKDALE))
1936  ;
1937  }
1938 
1939  bool MCDRAMmemoryTrafficMetricsAvailable() const
1940  {
1941  return (cpu_model == PCM::KNL);
1942  }
1943 
1944  bool memoryIOTrafficMetricAvailable() const
1945  {
1946  if (cpu_model == TGL) return false;
1947  return (
1948  cpu_model == PCM::SANDY_BRIDGE
1949  || cpu_model == PCM::IVY_BRIDGE
1950  || cpu_model == PCM::HASWELL
1951  || cpu_model == PCM::BROADWELL
1952  || useSKLPath()
1953  );
1954  }
1955 
1956  bool IIOEventsAvailable() const
1957  {
1958  return (
1959  cpu_model == PCM::SKX
1960  || cpu_model == PCM::ICX
1961  || cpu_model == PCM::SNOWRIDGE
1962  );
1963  }
1964 
1965  bool uncoreFrequencyMetricAvailable() const
1966  {
1967  return MSR.empty() == false && uboxPMUs.size() == getNumSockets() && getNumCores() == getNumOnlineCores();
1968  }
1969 
1970  bool LatencyMetricsAvailable() const
1971  {
1972  return (
1973  cpu_model == PCM::HASWELLX
1974  || cpu_model == PCM::BDX
1975  || cpu_model == PCM::SKX
1976  || cpu_model == PCM::ICX
1977  || useSKLPath()
1978  );
1979  }
1980 
1981  bool DDRLatencyMetricsAvailable() const
1982  {
1983  return (
1984  cpu_model == PCM::SKX
1985  || cpu_model == PCM::ICX
1986  );
1987  }
1988 
1989  bool PMMTrafficMetricsAvailable() const
1990  {
1991  return (
1992  isCLX()
1993  || isCPX()
1994  || cpu_model == PCM::ICX
1995  || cpu_model == PCM::SNOWRIDGE
1996  );
1997  }
1998 
1999  bool LLCReadMissLatencyMetricsAvailable() const
2000  {
2001  return (
2002  HASWELLX == cpu_model
2003  || BDX_DE == cpu_model
2004  || BDX == cpu_model
2005  || isCLX()
2006  || isCPX()
2007 #ifdef PCM_ENABLE_LLCRDLAT_SKX_MP
2008  || SKX == cpu_model
2009 #else
2010  || ((SKX == cpu_model) && (num_sockets == 1))
2011 #endif
2012  || ICX == cpu_model
2013  || SNOWRIDGE == cpu_model
2014  );
2015  }
2016 
2017  bool hasBecktonUncore() const
2018  {
2019  return (
2020  cpu_model == PCM::NEHALEM_EX
2021  || cpu_model == PCM::WESTMERE_EX
2022  );
2023  }
2024  bool hasPCICFGUncore() const // has PCICFG uncore PMON
2025  {
2026  return (
2027  cpu_model == PCM::JAKETOWN
2028  || cpu_model == PCM::SNOWRIDGE
2029  || cpu_model == PCM::IVYTOWN
2030  || cpu_model == PCM::HASWELLX
2031  || cpu_model == PCM::BDX_DE
2032  || cpu_model == PCM::SKX
2033  || cpu_model == PCM::ICX
2034  || cpu_model == PCM::BDX
2035  || cpu_model == PCM::KNL
2036  );
2037  }
2038 
2039  bool isSkxCompatible() const
2040  {
2041  return (
2042  cpu_model == PCM::SKX
2043  );
2044  }
2045 
2046  static bool hasUPI(const int32 cpu_model_) // Intel(r) Ultra Path Interconnect
2047  {
2048  return (
2049  cpu_model_ == PCM::SKX
2050  || cpu_model_ == PCM::ICX
2051  );
2052  }
2053 
2054  bool hasUPI() const
2055  {
2056  return hasUPI(cpu_model);
2057  }
2058 
2059  const char * xPI() const
2060  {
2061  if (hasUPI())
2062  return "UPI";
2063 
2064  return "QPI";
2065  }
2066 
2067  bool hasCHA() const
2068  {
2069  return (
2070  cpu_model == PCM::SKX
2071  || cpu_model == PCM::ICX
2072  );
2073  }
2074 
2075  bool supportsHLE() const;
2076  bool supportsRTM() const;
2077  bool supportsRDTSCP() const;
2078 
2079  bool useSkylakeEvents() const
2080  {
2081  return useSKLPath()
2082  || PCM::SKX == cpu_model
2083  || PCM::ICX == cpu_model
2084  ;
2085  }
2086 
2087  bool hasClientMCCounters() const
2088  {
2089  return cpu_model == SANDY_BRIDGE
2090  || cpu_model == IVY_BRIDGE
2091  || cpu_model == HASWELL
2092  || cpu_model == BROADWELL
2093  || useSKLPath()
2094  ;
2095  }
2096 
2097  static double getBytesPerFlit(int32 cpu_model_)
2098  {
2099  if (hasUPI(cpu_model_))
2100  {
2101  // 172 bits per UPI flit
2102  return 172./8.;
2103  }
2104  // 8 bytes per QPI flit
2105  return 8.;
2106  }
2107 
2108  double getBytesPerFlit() const
2109  {
2110  return getBytesPerFlit(cpu_model);
2111  }
2112 
2113  static double getDataBytesPerFlit(int32 cpu_model_)
2114  {
2115  if (hasUPI(cpu_model_))
2116  {
2117  // 9 UPI flits to transfer 64 bytes
2118  return 64./9.;
2119  }
2120  // 8 bytes per QPI flit
2121  return 8.;
2122  }
2123 
2124  double getDataBytesPerFlit() const
2125  {
2126  return getDataBytesPerFlit(cpu_model);
2127  }
2128 
2129  static double getFlitsPerLinkCycle(int32 cpu_model_)
2130  {
2131  if (hasUPI(cpu_model_))
2132  {
2133  // 5 UPI flits sent every 6 link cycles
2134  return 5./6.;
2135  }
2136  return 2.;
2137  }
2138 
2139  static double getBytesPerLinkCycle(int32 cpu_model_)
2140  {
2141  return getBytesPerFlit(cpu_model_) * getFlitsPerLinkCycle(cpu_model_);
2142  }
2143 
2144  double getBytesPerLinkCycle() const
2145  {
2146  return getBytesPerLinkCycle(cpu_model);
2147  }
2148 
//! \brief Returns the number of link transfers per link cycle
//! \return the constant 8
static double getLinkTransfersPerLinkCycle()
{
    const double transfersPerCycle = 8.;
    return transfersPerCycle;
}
2153 
2154  double getBytesPerLinkTransfer() const
2155  {
2156  return getBytesPerLinkCycle() / getLinkTransfersPerLinkCycle();
2157  }
2158 
2161  void setupCustomCoreEventsForNuma(PCM::ExtendedCustomCoreEventDescription& conf) const;
2162 
2163  #define PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(m) bool is##m() const { return m; }
2164 
2165  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitRatioAvailable)
2166  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)
2167  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheMissesAvailable)
2168  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheMissesAvailable)
2169  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitsAvailable)
2170  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
2171  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
2172  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
2173 
2174  #undef PCM_GEN_METRIC_AVAILABLE_FUNCTION
2175 
2176  bool isActiveRelativeFrequencyAvailable() const
2177  {
2178  return !isAtom();
2179  }
2180 
2181  ~PCM();
2182 };
2183 
2188 {
2189  friend class PCM;
2190  friend class JSONPrinter;
2191  template <class CounterStateType>
2192  friend double getExecUsage(const CounterStateType & before, const CounterStateType & after);
2193  template <class CounterStateType>
2194  friend double getIPC(const CounterStateType & before, const CounterStateType & after);
2195  template <class CounterStateType>
2196  friend double getAverageFrequency(const CounterStateType & before, const CounterStateType & after);
2197  template <class CounterStateType>
2198  friend double getAverageFrequencyFromClocks(const int64 clocks, const CounterStateType& before, const CounterStateType& after);
2199  template <class CounterStateType>
2200  friend double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after);
2201  template <class CounterStateType>
2202  friend double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
2203  template <class CounterStateType>
2204  friend double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
2205  template <class CounterStateType>
2206  friend double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
2207  template <class CounterStateType>
2208  friend double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
2209  template <class CounterStateType>
2210  friend uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
2211  template <class CounterStateType>
2212  friend uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
2213  template <class CounterStateType>
2214  friend uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after);
2215  template <class CounterStateType>
2216  friend uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after);
2217  template <class CounterStateType>
2218  friend uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after);
2219  template <class CounterStateType>
2220  friend uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after);
2221  template <class CounterStateType>
2222  friend uint64 getL3CacheOccupancy(const CounterStateType & now);
2223  template <class CounterStateType>
2224  friend uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after);
2225  template <class CounterStateType>
2226  friend uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after);
2227  template <class CounterStateType>
2228  friend uint64 getCycles(const CounterStateType & before, const CounterStateType & after);
2229  template <class CounterStateType>
2230  friend uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after);
2231  template <class CounterStateType>
2232  friend uint64 getCycles(const CounterStateType & now);
2233  template <class CounterStateType>
2234  friend uint64 getInstructionsRetired(const CounterStateType & now);
2235  template <class CounterStateType>
2236  friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after);
2237  template <class CounterStateType>
2238  friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2239  template <class CounterStateType>
2240  friend uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after);
2241  template <class CounterStateType>
2242  friend double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2243  template <class CounterStateType>
2244  friend uint64 getCoreCStateResidency(int state, const CounterStateType& now);
2245  template <class CounterStateType>
2246  friend uint64 getSMICount(const CounterStateType & before, const CounterStateType & after);
2247  template <class CounterStateType>
2248  friend uint64 getAllSlotsRaw(const CounterStateType& before, const CounterStateType& after);
2249  template <class CounterStateType>
2250  friend uint64 getAllSlots(const CounterStateType & before, const CounterStateType & after);
2251  template <class CounterStateType>
2252  friend double getBackendBound(const CounterStateType & before, const CounterStateType & after);
2253  template <class CounterStateType>
2254  friend double getFrontendBound(const CounterStateType & before, const CounterStateType & after);
2255  template <class CounterStateType>
2256  friend double getBadSpeculation(const CounterStateType & before, const CounterStateType & after);
2257  template <class CounterStateType>
2258  friend double getRetiring(const CounterStateType & before, const CounterStateType & after);
2259  template <class CounterStateType>
2260  friend uint64 getMSREvent(const uint64 & index, const PCM::MSRType & type, const CounterStateType& before, const CounterStateType& after);
2261 protected:
2262  checked_uint64 InstRetiredAny{};
2263  checked_uint64 CpuClkUnhaltedThread{};
2264  checked_uint64 CpuClkUnhaltedRef{};
2265  checked_uint64 Event[PERF_MAX_CUSTOM_COUNTERS];
2266  enum
2267  {
2268  L3MissPos = 0,
2269  ArchLLCMissPos = 0,
2270  L3UnsharedHitPos = 1,
2271  ArchLLCRefPos = 1,
2272  SKLL3HitPos = 1,
2273  L2HitMPos = 2,
2274  SKLL2MissPos = 2,
2275  L2HitPos = 3
2276  };
2277  uint64 InvariantTSC; // invariant time stamp counter
2278  uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2279  int32 ThermalHeadroom;
2280  uint64 L3Occupancy;
2281  uint64 MemoryBWLocal;
2282  uint64 MemoryBWTotal;
2283  uint64 SMICount;
2284  uint64 FrontendBoundSlots, BadSpeculationSlots, BackendBoundSlots, RetiringSlots, AllSlotsRaw;
2285  std::unordered_map<uint64, uint64> MSRValues;
2286 
2287 public:
2288  BasicCounterState() :
2289  InvariantTSC(0),
2290  ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM),
2291  L3Occupancy(0),
2292  MemoryBWLocal(0),
2293  MemoryBWTotal(0),
2294  SMICount(0),
2295  FrontendBoundSlots(0),
2296  BadSpeculationSlots(0),
2297  BackendBoundSlots(0),
2298  RetiringSlots(0),
2299  AllSlotsRaw(0)
2300  {
2301  std::fill(CStateResidency, CStateResidency + PCM::MAX_C_STATE + 1, 0);
2302  }
2303  virtual ~BasicCounterState() { }
2304 
// Copy construction, move construction and move assignment use the compiler-generated implementations.
// NOTE(review): no copy assignment operator is declared; because a move assignment operator is
// user-declared, the implicit copy assignment is defined as deleted - confirm this is intended.
2305  BasicCounterState( const BasicCounterState& ) = default;
2306  BasicCounterState( BasicCounterState&& ) = default;
2307  BasicCounterState & operator = ( BasicCounterState&& ) = default;
2308 
2309  BasicCounterState & operator += (const BasicCounterState & o)
2310  {
2311  InstRetiredAny += o.InstRetiredAny;
2312  CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
2313  CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
2314  for (int i = 0; i < PERF_MAX_CUSTOM_COUNTERS; ++i)
2315  {
2316  Event[i] += o.Event[i];
2317  }
2318  InvariantTSC += o.InvariantTSC;
2319  for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2320  CStateResidency[i] += o.CStateResidency[i];
2321  // ThermalHeadroom is not accumulative
2322  L3Occupancy += o.L3Occupancy;
2323  MemoryBWLocal += o.MemoryBWLocal;
2324  MemoryBWTotal += o.MemoryBWTotal;
2325  SMICount += o.SMICount;
2326  // std::cout << "before PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
2327  BasicCounterState old = *this;
2328  FrontendBoundSlots += o.FrontendBoundSlots;
2329  BadSpeculationSlots += o.BadSpeculationSlots;
2330  BackendBoundSlots += o.BackendBoundSlots;
2331  RetiringSlots += o.RetiringSlots;
2332  AllSlotsRaw += o.AllSlotsRaw;
2333  //std::cout << "after PCM debug aggregate "<< FrontendBoundSlots << " " << BadSpeculationSlots << " " << BackendBoundSlots << " " <<RetiringSlots << std::endl;
2334  assert(FrontendBoundSlots >= old.FrontendBoundSlots);
2335  assert(BadSpeculationSlots >= old.BadSpeculationSlots);
2336  assert(BackendBoundSlots >= old.BackendBoundSlots);
2337  assert(RetiringSlots >= old.RetiringSlots);
2338  return *this;
2339  }
2340 
2341  void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2342  void readAndAggregateTSC(std::shared_ptr<SafeMsrHandle>);
2343 
2345  int32 getThermalHeadroom() const { return ThermalHeadroom; }
2346 };
2347 
2348 inline uint64 RDTSC()
2349 {
2350  uint64 result = 0;
2351 #ifdef _MSC_VER
2352  // Windows
2353  #if _MSC_VER>= 1600
2354  result = static_cast<uint64>(__rdtsc());
2355  #endif
2356 #else
2357  // Linux
2358  uint32 high = 0, low = 0;
2359  asm volatile("rdtsc" : "=a" (low), "=d" (high));
2360  result = low + (uint64(high)<<32ULL);
2361 #endif
2362  return result;
2363 
2364 }
2365 
2366 inline uint64 RDTSCP()
2367 {
2368  uint64 result = 0;
2369 #ifdef _MSC_VER
2370  // Windows
2371  #if _MSC_VER>= 1600
2372  unsigned int Aux;
2373  result = __rdtscp(&Aux);
2374  #endif
2375 #else
2376  // Linux and OS X
2377  uint32 high = 0, low = 0;
2378  asm volatile (
2379  "rdtscp\n\t"
2380  "mov %%edx, %0\n\t"
2381  "mov %%eax, %1\n\t":
2382  "=r" (high), "=r" (low) :: "%rax", "%rcx", "%rdx");
2383  result = low + (uint64(high)<<32ULL);
2384 #endif
2385  return result;
2386 }
2387 
2388 template <class CounterStateType>
2389 int32 getThermalHeadroom(const CounterStateType & /* before */, const CounterStateType & after)
2390 {
2391  return after.getThermalHeadroom();
2392 }
2393 
2400 template <class CounterStateType>
2401 double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
2402 {
2403  return double(getQPIL0pTxCycles(port, before, after)) / double(getQPIClocks(port, before, after));
2404 }
2405 
2412 template <class CounterStateType>
2413 double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
2414 {
2415  return double(getQPIL1Cycles(port, before, after)) / double(getQPIClocks(port, before, after));
2416 }
2417 
2423 template <class CounterStateType>
2424 uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2425 {
2426  const auto clk = after.DRAMClocks[channel] - before.DRAMClocks[channel];
2427  const auto cpu_model = PCM::getInstance()->getCPUModel();
2428  if (cpu_model == PCM::ICX || cpu_model == PCM::SNOWRIDGE)
2429  {
2430  return 2 * clk;
2431  }
2432  return clk;
2433 }
2434 
2440 template <class CounterStateType>
2441 uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2442 {
2443  return after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel];
2444 }
2445 
2446 
2453 template <class CounterStateType>
2454 uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2455 {
2456  return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
2457 }
2458 
2465 template <class CounterStateType>
2466 uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2467 {
2468  return after.M3UPICounter[port][counter] - before.M3UPICounter[port][counter];
2469 }
2470 
2477 template <class CounterStateType>
2478 uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2479 {
2480  return after.CBOCounter[cbo][counter] - before.CBOCounter[cbo][counter];
2481 }
2482 
2488 template <class CounterStateType>
2489 uint64 getUBOXCounter(uint32 counter, const CounterStateType& before, const CounterStateType& after)
2490 {
2491  return after.UBOXCounter[counter] - before.UBOXCounter[counter];
2492 }
2493 
2500 template <class CounterStateType>
2501 uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2502 {
2503  return after.IIOCounter[stack][counter] - before.IIOCounter[stack][counter];
2504 }
2505 
2512 template <class CounterStateType>
2513 uint64 getIRPCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2514 {
2515  return after.IRPCounter[stack][counter] - before.IRPCounter[stack][counter];
2516 }
2517 
2524 template <class CounterStateType>
2525 uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after)
2526 {
2527  return after.xPICounter[port][counter] - before.xPICounter[port][counter];
2528 }
2529 
2536 template <class CounterStateType>
2537 uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2538 {
2539  return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter];
2540 }
2541 
2542 
2549 template <class CounterStateType>
2550 uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2551 {
2552  if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
2553  return after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter];
2554  return 0ULL;
2555 }
2556 
2562 template <class CounterStateType>
2563 uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after)
2564 {
2565  return after.PCUCounter[counter] - before.PCUCounter[counter];
2566 }
2567 
2572 template <class CounterStateType>
2573 uint64 getPCUClocks(const CounterStateType & before, const CounterStateType & after)
2574 {
2575  return getPCUCounter(0, before, after);
2576 }
2577 
2582 template <class CounterStateType>
2583 uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2584 {
2585  return after.PackageEnergyStatus - before.PackageEnergyStatus;
2586 }
2587 
2592 template <class CounterStateType>
2593 uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2594 {
2595  return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
2596 }
2597 
2598 
2604 template <class CounterStateType>
2605 int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID & counter, const CounterStateType & before, const CounterStateType & after)
2606 {
2607  const auto beforeIt = before.freeRunningCounter.find(counter);
2608  const auto afterIt = after.freeRunningCounter.find(counter);
2609  if (beforeIt != before.freeRunningCounter.end() &&
2610  afterIt != after.freeRunningCounter.end())
2611  {
2612  return afterIt->second - beforeIt->second;
2613  }
2614  return -1;
2615 }
2616 
2617 
2622 template <class CounterStateType>
2623 uint64 getUncoreClocks(const CounterStateType& before, const CounterStateType& after)
2624 {
2625  return after.UncClocks - before.UncClocks;
2626 }
2627 
2632 template <class CounterStateType>
2633 double getConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2634 {
2635  PCM * m = PCM::getInstance();
2636  if (!m) return -1.;
2637 
2638  return double(getConsumedEnergy(before, after)) * m->getJoulesPerEnergyUnit();
2639 }
2640 
2645 template <class CounterStateType>
2646 double getDRAMConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2647 {
2648  PCM * m = PCM::getInstance();
2649  if (!m) return -1.;
2650  double dram_joules_per_energy_unit = 0.;
2651  const auto cpu_model = m->getCPUModel();
2652 
2653  if (PCM::HASWELLX == cpu_model
2654  || PCM::BDX_DE == cpu_model
2655  || PCM::BDX == cpu_model
2656  || PCM::SKX == cpu_model
2657  || PCM::ICX == cpu_model
2658  || PCM::KNL == cpu_model
2659  ) {
2660 /* as described in sections 5.3.2 (DRAM_POWER_INFO) and 5.3.3 (DRAM_ENERGY_STATUS) of
2661  * Volume 2 (Registers) of
2662  * Intel Xeon E5-1600 v3 and Intel Xeon E5-2600 v3 (Haswell-EP) Datasheet (Ref 330784-001, Sept.2014)
2663  * ENERGY_UNIT for DRAM domain is fixed to 15.3 uJ for server HSX, BDW and KNL processors.
2664  */
2665  dram_joules_per_energy_unit = 0.0000153;
2666  } else {
2667 /* for all other processors (including Haswell client/mobile SKUs) the ENERGY_UNIT for DRAM domain
2668  * should be read from PACKAGE_POWER_SKU register (usually value around ~61uJ)
2669  */
2670  dram_joules_per_energy_unit = m->getJoulesPerEnergyUnit();
2671  }
2672  return double(getDRAMConsumedEnergy(before, after)) * dram_joules_per_energy_unit;
2673 }
2674 
2679 {
2680  friend class PCM;
2681  friend class JSONPrinter;
2682  template <class CounterStateType>
2683  friend uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after);
2684  template <class CounterStateType>
2685  friend uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after);
2686  template <class CounterStateType>
2687  friend uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after);
2688  template <class CounterStateType>
2689  friend uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after);
2690  template <class CounterStateType>
2691  friend uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after);
2692  template <class CounterStateType>
2693  friend uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after);
2694  template <class CounterStateType>
2695  friend uint64 getGTRequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
2696  template <class CounterStateType>
2697  friend uint64 getIARequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
2698  template <class CounterStateType>
2699  friend uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
2700  template <class CounterStateType>
2701  friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2702  template <class CounterStateType>
2703  friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2704  template <class CounterStateType>
2705  friend uint64 getUncoreClocks(const CounterStateType& before, const CounterStateType& after);
2706  template <class CounterStateType>
2707  friend double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2708  template <class CounterStateType>
2709  friend uint64 getPackageCStateResidency(int state, const CounterStateType& now);
2710  template <class CounterStateType>
2711  friend double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after);
2712  template <class CounterStateType>
2713  friend double getLocalMemoryRequestRatio(const CounterStateType & before, const CounterStateType & after);
2714  template <class CounterStateType>
2715  friend double getAverageUncoreFrequency(const CounterStateType& before, const CounterStateType& after);
2716  template <class CounterStateType>
2717  friend double getAverageFrequencyFromClocks(const int64 clocks, const CounterStateType& before, const CounterStateType& after);
2718 
2719 protected:
2720  uint64 UncMCFullWrites;
2721  uint64 UncMCNormalReads;
2722  uint64 UncHARequests;
2723  uint64 UncHALocalRequests;
2724  uint64 UncPMMWrites;
2725  uint64 UncPMMReads;
2726  uint64 UncEDCFullWrites;
2727  uint64 UncEDCNormalReads;
2728  uint64 UncMCGTRequests;
2729  uint64 UncMCIARequests;
2730  uint64 UncMCIORequests;
2731  uint64 PackageEnergyStatus;
2732  uint64 DRAMEnergyStatus;
2733  uint64 TOROccupancyIAMiss;
2734  uint64 TORInsertsIAMiss;
2735  uint64 UncClocks;
2736  uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2737  void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2738 
2739 public:
2740  UncoreCounterState() :
2741  UncMCFullWrites(0),
2742  UncMCNormalReads(0),
2743  UncHARequests(0),
2744  UncHALocalRequests(0),
2745  UncPMMWrites(0),
2746  UncPMMReads(0),
2747  UncEDCFullWrites(0),
2748  UncEDCNormalReads(0),
2749  UncMCGTRequests(0),
2750  UncMCIARequests(0),
2751  UncMCIORequests(0),
2752  PackageEnergyStatus(0),
2753  DRAMEnergyStatus(0),
2754  TOROccupancyIAMiss(0),
2755  TORInsertsIAMiss(0),
2756  UncClocks(0)
2757  {
2758  std::fill(CStateResidency, CStateResidency + PCM::MAX_C_STATE + 1, 0);
2759  }
2760  virtual ~UncoreCounterState() { }
2761 
2762  UncoreCounterState( const UncoreCounterState& ) = default;
2763  UncoreCounterState( UncoreCounterState&& ) = default;
2764  UncoreCounterState & operator = ( UncoreCounterState&& ) = default;
2765 
2766  UncoreCounterState & operator += (const UncoreCounterState & o)
2767  {
2768  UncMCFullWrites += o.UncMCFullWrites;
2769  UncMCNormalReads += o.UncMCNormalReads;
2770  UncHARequests += o.UncHARequests;
2771  UncHALocalRequests += o.UncHALocalRequests;
2772  UncPMMReads += o.UncPMMReads;
2773  UncPMMWrites += o.UncPMMWrites;
2774  UncEDCFullWrites += o.UncEDCFullWrites;
2775  UncEDCNormalReads += o.UncEDCNormalReads;
2776  UncMCGTRequests += o.UncMCGTRequests;
2777  UncMCIARequests += o.UncMCIARequests;
2778  UncMCIORequests += o.UncMCIORequests;
2779  PackageEnergyStatus += o.PackageEnergyStatus;
2780  DRAMEnergyStatus += o.DRAMEnergyStatus;
2781  TOROccupancyIAMiss += o.TOROccupancyIAMiss;
2782  TORInsertsIAMiss += o.TORInsertsIAMiss;
2783  UncClocks += o.UncClocks;
2784  for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2785  CStateResidency[i] += o.CStateResidency[i];
2786  return *this;
2787  }
2788 };
2789 
2790 
2794 {
2795 public:
2796  enum {
2797  maxControllers = 4,
2798  maxChannels = 12,
2799  maxXPILinks = 6,
2800  maxCBOs = 128,
2801  maxIIOStacks = 16,
2802  maxCounters = 4
2803  };
2804  enum EventPosition
2805  {
2806  xPI_TxL0P_POWER_CYCLES = 0,
2807  xPI_L1_POWER_CYCLES = 2,
2808  xPI_CLOCKTICKS = 3
2809  };
2810  enum FreeRunningCounterID
2811  {
2812  ImcReads,
2813  ImcWrites,
2814  PMMReads,
2815  PMMWrites
2816  };
2817 private:
2818  std::array<std::array<uint64, maxCounters>, maxXPILinks> xPICounter;
2819  std::array<std::array<uint64, maxCounters>, maxXPILinks> M3UPICounter;
2820  std::array<std::array<uint64, maxCounters>, maxCBOs> CBOCounter;
2821  std::array<std::array<uint64, maxCounters>, maxIIOStacks> IIOCounter;
2822  std::array<std::array<uint64, maxCounters>, maxIIOStacks> IRPCounter;
2823  std::array<uint64, maxCounters> UBOXCounter;
2824  std::array<uint64, maxChannels> DRAMClocks;
2825  std::array<uint64, maxChannels> MCDRAMClocks;
2826  std::array<std::array<uint64, maxCounters>, maxChannels> MCCounter; // channel X counter
2827  std::array<std::array<uint64, maxCounters>, maxControllers> M2MCounter; // M2M/iMC boxes x counter
2828  std::array<std::array<uint64, maxCounters>, maxChannels> EDCCounter; // EDC controller X counter
2829  std::array<uint64, maxCounters> PCUCounter;
2830  std::unordered_map<int, uint64> freeRunningCounter;
2831  int32 PackageThermalHeadroom;
2832  uint64 InvariantTSC; // invariant time stamp counter
2833  friend class PCM;
2834  template <class CounterStateType>
2835  friend uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2836  template <class CounterStateType>
2837  friend uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2838  template <class CounterStateType>
2839  friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2840  template <class CounterStateType>
2841  friend uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2842  template <class CounterStateType>
2843  friend uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2844  template <class CounterStateType>
2845  friend uint64 getUBOXCounter(uint32 counter, const CounterStateType& before, const CounterStateType& after);
2846  template <class CounterStateType>
2847  friend uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2848  template <class CounterStateType>
2849  friend uint64 getIRPCounter(uint32 stack, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2850  template <class CounterStateType>
2851  friend uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType& before, const CounterStateType& after);
2852  template <class CounterStateType>
2853  friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2854  template <class CounterStateType>
2855  friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2856  template <class CounterStateType>
2857  friend uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after);
2858  template <class CounterStateType>
2859  friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2860  template <class CounterStateType>
2861  friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2862  template <class CounterStateType>
2863  friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2864  template <class CounterStateType>
2865  friend int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID &, const CounterStateType & before, const CounterStateType & after);
2866  template <class CounterStateType>
2867  friend double getAverageFrequencyFromClocks(const int64 clocks, const CounterStateType& before, const CounterStateType& after);
2868 
2869 public:
2871  int32 getPackageThermalHeadroom() const { return PackageThermalHeadroom; }
2873  xPICounter{{}},
2874  M3UPICounter{{}},
2875  CBOCounter{{}},
2876  IIOCounter{{}},
2877  IRPCounter{{}},
2878  UBOXCounter{{}},
2879  DRAMClocks{{}},
2880  MCDRAMClocks{{}},
2881  MCCounter{{}},
2882  M2MCounter{{}},
2883  EDCCounter{{}},
2884  PCUCounter{{}},
2885  PackageThermalHeadroom(0),
2886  InvariantTSC(0)
2887  {
2888  }
2889 };
2890 
2896 template <class CounterStateType>
2897 uint64 getQPIClocks(uint32 port, const CounterStateType& before, const CounterStateType& after)
2898 {
2899  return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_CLOCKTICKS, before, after);
2900 }
2901 
2907 template <class CounterStateType>
2908 uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType& before, const CounterStateType& after)
2909 {
2910  return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_TxL0P_POWER_CYCLES, before, after);
2911 }
2912 
2918 template <class CounterStateType>
2919 uint64 getQPIL1Cycles(uint32 port, const CounterStateType& before, const CounterStateType& after)
2920 {
2921  return getXPICounter(port, ServerUncoreCounterState::EventPosition::xPI_L1_POWER_CYCLES, before, after);
2922 }
2923 
2926 {
2927  friend class PCM;
2928 
2929 public:
2930  CoreCounterState() = default;
2931  CoreCounterState( const CoreCounterState& ) = default;
2932  CoreCounterState( CoreCounterState&& ) = default;
2933  CoreCounterState & operator= ( CoreCounterState&& ) = default;
2934  virtual ~ CoreCounterState() {}
2935 };
2936 
2939 {
2940  friend class PCM;
2941 
2942 protected:
2943  void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2944  {
2945  BasicCounterState::readAndAggregate(handle);
2946  UncoreCounterState::readAndAggregate(handle);
2947  }
2948 
2949 public:
2950  SocketCounterState& operator += ( const BasicCounterState& ccs )
2951  {
2952  BasicCounterState::operator += ( ccs );
2953 
2954  return *this;
2955  }
2956 
2957  SocketCounterState& operator += ( const UncoreCounterState& ucs )
2958  {
2959  UncoreCounterState::operator += ( ucs );
2960 
2961  return *this;
2962  }
2963 
2964  SocketCounterState() = default;
2965  SocketCounterState( const SocketCounterState& ) = default;
2966  SocketCounterState( SocketCounterState&& ) = default;
2967  SocketCounterState & operator = ( SocketCounterState&& ) = default;
2968 
2969  SocketCounterState & operator = ( UncoreCounterState&& ucs ) {
2970  UncoreCounterState::operator = ( std::move(ucs) );
2971  return *this;
2972  }
2973 
2974  virtual ~ SocketCounterState() {}
2975 };
2976 
2979 {
2980  friend class PCM;
2981 
2982  std::vector<std::vector<uint64> > incomingQPIPackets; // each 64 byte
2983  std::vector<std::vector<uint64> > outgoingQPIFlits; // idle or data/non-data flits depending on the architecture
2984  std::vector<std::vector<uint64> > TxL0Cycles;
2985  uint64 uncoreTSC;
2986 
2987 protected:
2988  void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2989  {
2990  BasicCounterState::readAndAggregate(handle);
2991  UncoreCounterState::readAndAggregate(handle);
2992  }
2993 
2994 public:
2995  friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2996  friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2997  friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2998  friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2999  friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
3000 
3001  SystemCounterState() :
3002  uncoreTSC(0)
3003  {
3004  PCM * m = PCM::getInstance();
3005  incomingQPIPackets.resize(m->getNumSockets(),
3006  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
3007  outgoingQPIFlits.resize(m->getNumSockets(),
3008  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
3009  TxL0Cycles.resize(m->getNumSockets(),
3010  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
3011  }
3012 
3013  SystemCounterState( const SystemCounterState& ) = default;
3014  SystemCounterState( SystemCounterState&& ) = default;
3015  SystemCounterState & operator = ( SystemCounterState&& ) = default;
3016 
3017  SystemCounterState & operator += ( const SocketCounterState& scs )
3018  {
3019  BasicCounterState::operator += ( scs );
3020  UncoreCounterState::operator += ( scs );
3021 
3022  return *this;
3023  }
3024 
3025  SystemCounterState & operator += ( const UncoreCounterState& ucs )
3026  {
3027  UncoreCounterState::operator += ( ucs );
3028 
3029  return *this;
3030  }
3031  virtual ~ SystemCounterState() {}
3032 };
3033 
3044 
3052 PCM_API SocketCounterState getSocketCounterState(uint32 socket);
3053 
3061 PCM_API CoreCounterState getCoreCounterState(uint32 core);
3062 
3063 
3070 template <class CounterStateType>
3071 double getIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
3072 {
3073  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3074  if (clocks != 0)
3075  return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
3076  return -1;
3077 }
3078 
3079 
3086 template <class CounterStateType>
3087 uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after) // instructions
3088 {
3089  return after.InstRetiredAny - before.InstRetiredAny;
3090 }
3091 
3098 template <class CounterStateType>
3099 double getExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
3100 {
3101  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3102  if (timer_clocks != 0)
3103  return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
3104  return -1;
3105 }
3106 
3112 template <class CounterStateType>
3113 uint64 getInstructionsRetired(const CounterStateType & now) // instructions
3114 {
3115  return now.InstRetiredAny.getRawData_NoOverflowProtection();
3116 }
3117 
3135 template <class CounterStateType>
3136 uint64 getCycles(const CounterStateType & before, const CounterStateType & after) // clocks
3137 {
3138  return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3139 }
3140 
3151 template <class CounterStateType>
3152 uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after) // clocks
3153 {
3154  return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3155 }
3156 
3164 template <class CounterStateType>
3165 uint64 getCycles(const CounterStateType & now) // clocks
3166 {
3167  return now.CpuClkUnhaltedThread.getRawData_NoOverflowProtection();
3168 }
3169 
3178 template <class CounterStateType>
3179 inline double getCoreIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
3180 {
3181  double ipc = getIPC(before, after);
3182  PCM * m = PCM::getInstance();
3183  if (ipc >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
3184  return ipc * double(m->getThreadsPerCore());
3185  return -1;
3186 }
3187 
3196 template <class CounterStateType>
3197 inline double getTotalExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
3198 {
3199  double usage = getExecUsage(before, after);
3200  PCM * m = PCM::getInstance();
3201  if (usage >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
3202  return usage * double(m->getThreadsPerCore());
3203  return -1;
3204 }
3205 
3206 template <class StateType>
3207 double getAverageFrequencyFromClocks(const int64 clocks, const StateType& before, const StateType& after) // in Hz
3208 {
3209  const int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3210  PCM* m = PCM::getInstance();
3211  if (timer_clocks != 0 && m)
3212  return double(m->getNominalFrequency()) * double(clocks) / double(timer_clocks);
3213  return -1;
3214 }
3215 
3222 template <class CounterStateType>
3223 double getAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
3224 {
3225  return getAverageFrequencyFromClocks(after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread, before, after);
3226 }
3227 
3234 template <class UncoreStateType>
3235 double getAverageUncoreFrequency(const UncoreStateType& before, const UncoreStateType & after) // in Hz
3236 {
3237  auto m = PCM::getInstance();
3238  assert(m);
3239  return double(m->getNumOnlineCores()) * getAverageFrequencyFromClocks(after.UncClocks - before.UncClocks, before, after) / double(m->getNumOnlineSockets());
3240 }
3241 
3248 template <class CounterStateType>
3249 double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
3250 {
3251  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3252  int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3253  PCM * m = PCM::getInstance();
3254  if (ref_clocks != 0 && m)
3255  return double(m->getNominalFrequency()) * double(clocks) / double(ref_clocks);
3256  return -1;
3257 }
3258 
3265 template <class CounterStateType>
3266 double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
3267 {
3268  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3269  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
3270  if (timer_clocks != 0)
3271  return double(clocks) / double(timer_clocks);
3272  return -1;
3273 }
3274 
3281 template <class CounterStateType>
3282 double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
3283 {
3284  if (!PCM::getInstance()->isActiveRelativeFrequencyAvailable()) return -1.;
3285  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
3286  int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
3287  if (ref_clocks != 0)
3288  return double(clocks) / double(ref_clocks);
3289  return -1;
3290 }
3291 
3299 template <class CounterStateType>
3300 double getL2CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
3301 {
3302  if (!PCM::getInstance()->isL2CacheHitRatioAvailable()) return 0;
3303  const auto hits = getL2CacheHits(before, after);
3304  const auto misses = getL2CacheMisses(before, after);
3305  return double(hits) / double(hits + misses);
3306 }
3307 
3315 template <class CounterStateType>
3316 double getL3CacheHitRatio(const CounterStateType& before, const CounterStateType& after) // 0.0 - 1.0
3317 {
3318  if (!PCM::getInstance()->isL3CacheHitRatioAvailable()) return 0;
3319  const auto hits = getL3CacheHits(before, after);
3320  const auto misses = getL3CacheMisses(before, after);
3321  return double(hits) / double(hits + misses);
3322 }
3323 
3331 template <class CounterStateType>
3332 uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after)
3333 {
3334  if (!PCM::getInstance()->isL3CacheMissesAvailable()) return 0;
3335  return after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3336 }
3337 
3345 template <class CounterStateType>
3346 uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after)
3347 {
3348  auto pcm = PCM::getInstance();
3349  if (pcm->isL2CacheMissesAvailable() == false) return 0ULL;
3350  const auto cpu_model = pcm->getCPUModel();
3351  if (pcm->useSkylakeEvents() || cpu_model == PCM::SNOWRIDGE || cpu_model == PCM::ADL) {
3352  return after.Event[BasicCounterState::SKLL2MissPos] - before.Event[BasicCounterState::SKLL2MissPos];
3353  }
3354  if (pcm->isAtom() || cpu_model == PCM::KNL)
3355  {
3356  return after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3357  }
3358  uint64 L3Miss = after.Event[BasicCounterState::L3MissPos] - before.Event[BasicCounterState::L3MissPos];
3359  uint64 L3UnsharedHit = after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3360  uint64 L2HitM = after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3361  return L2HitM + L3UnsharedHit + L3Miss;
3362 }
3363 
3371 template <class CounterStateType>
3372 uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after)
3373 {
3374  auto pcm = PCM::getInstance();
3375  if (pcm->isL2CacheHitsAvailable() == false) return 0ULL;
3376  if (pcm->isAtom() || pcm->getCPUModel() == PCM::KNL)
3377  {
3378  uint64 L2Miss = after.Event[BasicCounterState::ArchLLCMissPos] - before.Event[BasicCounterState::ArchLLCMissPos];
3379  uint64 L2Ref = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3380  return L2Ref - L2Miss;
3381  }
3382  return after.Event[BasicCounterState::L2HitPos] - before.Event[BasicCounterState::L2HitPos];
3383 }
3384 
3388 template <class CounterStateType>
3389 uint64 getL3CacheOccupancy(const CounterStateType & now)
3390 {
3391  if (PCM::getInstance()->L3CacheOccupancyMetricAvailable() == false) return 0ULL;
3392  return now.L3Occupancy;
3393 }
3397 template <class CounterStateType>
3398 uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after)
3399 {
3400  if (PCM::getInstance()->CoreLocalMemoryBWMetricAvailable() == false) return 0ULL;
3401  return after.MemoryBWLocal - before.MemoryBWLocal;
3402 }
3403 
3407 template <class CounterStateType>
3408 uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after)
3409 {
3410  if (PCM::getInstance()->CoreRemoteMemoryBWMetricAvailable() == false) return 0ULL;
3411  const uint64 total = after.MemoryBWTotal - before.MemoryBWTotal;
3412  const uint64 local = getLocalMemoryBW(before, after);
3413  if (total > local)
3414  return total - local;
3415 
3416  return 0;
3417 }
3418 
3426 template <class CounterStateType>
3427 uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after)
3428 {
3429  if (!PCM::getInstance()->isL3CacheHitsNoSnoopAvailable()) return 0;
3430  return after.Event[BasicCounterState::L3UnsharedHitPos] - before.Event[BasicCounterState::L3UnsharedHitPos];
3431 }
3432 
3440 template <class CounterStateType>
3441 uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after)
3442 {
3443  auto pcm = PCM::getInstance();
3444  if (!pcm->isL3CacheHitsSnoopAvailable()) return 0;
3445  const auto cpu_model = pcm->getCPUModel();
3446  if (cpu_model == PCM::SNOWRIDGE || cpu_model == PCM::ADL)
3447  {
3448  const int64 misses = getL3CacheMisses(before, after);
3449  const int64 refs = after.Event[BasicCounterState::ArchLLCRefPos] - before.Event[BasicCounterState::ArchLLCRefPos];
3450  const int64 hits = refs - misses;
3451  return (hits > 0)? hits : 0;
3452  }
3453  if (pcm->useSkylakeEvents()) {
3454  return after.Event[BasicCounterState::SKLL3HitPos] - before.Event[BasicCounterState::SKLL3HitPos];
3455  }
3456  return after.Event[BasicCounterState::L2HitMPos] - before.Event[BasicCounterState::L2HitMPos];
3457 }
3458 
3459 
3467 template <class CounterStateType>
3468 uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after)
3469 {
3470  if (!PCM::getInstance()->isL3CacheHitsAvailable()) return 0;
3471  return getL3CacheHitsSnoop(before, after) + getL3CacheHitsNoSnoop(before, after);
3472 }
3473 
3482 template <class CounterStateType>
3483 uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after)
3484 {
3485  return after.InvariantTSC - before.InvariantTSC;
3486 }
3487 
3495 template <class CounterStateType>
3496 inline double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
3497 {
3498  const double tsc = double(getInvariantTSC(before, after));
3499 
3500  if (state == 0) return double(getRefCycles(before, after)) / tsc;
3501 
3502  if (state == 1)
3503  {
3504  PCM * m = PCM::getInstance();
3505  double result = 1.0 - double(getRefCycles(before, after)) / tsc; // 1.0 - cC0
3506  for (int i = 2; i <= PCM::MAX_C_STATE; ++i)
3507  if (m->isCoreCStateResidencySupported(state))
3508  result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i]) / tsc;
3509 
3510  if (result < 0.) result = 0.; // fix counter dissynchronization
3511  else if (result > 1.) result = 1.; // fix counter dissynchronization
3512 
3513  return result;
3514  }
3515  return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state]) / tsc;
3516 }
3517 
3524 template <class CounterStateType>
3525 inline uint64 getCoreCStateResidency(int state, const CounterStateType& now)
3526 {
3527  if (state == 0) return now.CpuClkUnhaltedRef.getRawData_NoOverflowProtection();
3528 
3529  return now.BasicCounterState::CStateResidency[state];
3530 }
3531 
3539 template <class CounterStateType>
3540 inline double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
3541 {
3542  const double tsc = double(getInvariantTSC(before, after));
3543  if (state == 0)
3544  {
3545  PCM * m = PCM::getInstance();
3546  double result = 1.0;
3547  for (int i = 1; i <= PCM::MAX_C_STATE; ++i)
3548  if (m->isPackageCStateResidencySupported(state))
3549  result -= (after.UncoreCounterState::CStateResidency[i] - before.UncoreCounterState::CStateResidency[i]) / tsc;
3550 
3551  if (result < 0.) result = 0.; // fix counter dissynchronization
3552  else if (result > 1.) result = 1.; // fix counter dissynchronization
3553 
3554  return result;
3555  }
3556  return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state]) / tsc;
3557 }
3558 
3565 template <class CounterStateType>
3566 inline uint64 getPackageCStateResidency(int state, const CounterStateType& now)
3567 {
3568  return now.UncoreCounterState::CStateResidency[state];
3569 }
3570 
3577 template <class CounterStateType>
3578 uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after)
3579 {
3580  if (PCM::getInstance()->memoryTrafficMetricsAvailable())
3581  return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
3582  return 0ULL;
3583 }
3584 
3591 template <class CounterStateType>
3592 uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after)
3593 {
3594  if (PCM::getInstance()->memoryTrafficMetricsAvailable())
3595  return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
3596  return 0ULL;
3597 }
3598 
3605 template <class CounterStateType>
3606 uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after)
3607 {
3608  if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3609  return (after.UncPMMReads - before.UncPMMReads) * 64;
3610  return 0ULL;
3611 }
3612 
3619 template <class CounterStateType>
3620 uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after)
3621 {
3622  if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3623  return (after.UncPMMWrites - before.UncPMMWrites) * 64;
3624  return 0ULL;
3625 }
3626 
3633 template <class CounterStateType>
3634 uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after)
3635 {
3636  if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
3637  return (after.UncEDCNormalReads - before.UncEDCNormalReads) * 64;
3638  return 0ULL;
3639 }
3640 
3647 template <class CounterStateType>
3648 uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after)
3649 {
3650  if (PCM::getInstance()->MCDRAMmemoryTrafficMetricsAvailable())
3651  return (after.UncEDCFullWrites - before.UncEDCFullWrites) * 64;
3652  return 0ULL;
3653 }
3654 
3661 template <class CounterStateType>
3662 uint64 getGTRequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
3663 {
3664  if (PCM::getInstance()->memoryIOTrafficMetricAvailable())
3665  return (after.UncMCGTRequests - before.UncMCGTRequests) * 64;
3666  return 0ULL;
3667 }
3668 
3675 template <class CounterStateType>
3676 uint64 getIARequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
3677 {
3678  if (PCM::getInstance()->memoryIOTrafficMetricAvailable())
3679  return (after.UncMCIARequests - before.UncMCIARequests) * 64;
3680  return 0ULL;
3681 }
3682 
3689 template <class CounterStateType>
3690 uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
3691 {
3692  if (PCM::getInstance()->memoryIOTrafficMetricAvailable())
3693  return (after.UncMCIORequests - before.UncMCIORequests) * 64;
3694  return 0ULL;
3695 }
3696 
3703 template <class CounterStateType>
3704 uint64 getSMICount(const CounterStateType & before, const CounterStateType & after)
3705 {
3706  return after.SMICount - before.SMICount;
3707 }
3708 
3718 template <class CounterStateType>
3719 uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after)
3720 {
3721  return after.Event[eventCounterNr] - before.Event[eventCounterNr];
3722 }
3723 
3734 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3735 {
3736  if (!PCM::getInstance()->incomingQPITrafficMetricsAvailable()) return 0ULL;
3737  uint64 b = before.incomingQPIPackets[socketNr][linkNr];
3738  uint64 a = after.incomingQPIPackets[socketNr][linkNr];
3739  // prevent overflows due to counter dissynchronisation
3740  return (a > b) ? (64 * (a - b)) : 0;
3741 }
3742 
3753 inline double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3754 {
3755  PCM * m = PCM::getInstance();
3756  if (!(m->qpiUtilizationMetricsAvailable())) return 0.;
3757 
3758  const double bytes = (double)getIncomingQPILinkBytes(socketNr, linkNr, before, after);
3759  const uint64 max_speed = m->getQPILinkSpeed(socketNr, linkNr);
3760  const double max_bytes = (double)(double(max_speed) * double(getInvariantTSC(before, after) / double(m->getNumOnlineCores())) / double(m->getNominalFrequency()));
3761  return bytes / max_bytes;
3762 }
3763 
3774 inline double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3775 {
3776  PCM * m = PCM::getInstance();
3777 
3778  if (m->outgoingQPITrafficMetricsAvailable() == false) return 0.;
3779 
3780  if (m->hasBecktonUncore())
3781  {
3782  const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3783  const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3784  // prevent overflows due to counter dissynchronisation
3785  const double idle_flits = (double)((a > b) ? (a - b) : 0);
3786  const uint64 bTSC = before.uncoreTSC;
3787  const uint64 aTSC = after.uncoreTSC;
3788  const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
3789  if (idle_flits >= tsc) return 0.; // prevent overflows due to potential counter dissynchronization
3790 
3791  return (1. - (idle_flits / tsc));
3792  } else if (m->hasPCICFGUncore())
3793  {
3794  const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3795  const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3796  // prevent overflows due to counter dissynchronisation
3797  double flits = (double)((a > b) ? (a - b) : 0);
3798  const double max_flits = ((double(getInvariantTSC(before, after)) * double(m->getQPILinkSpeed(socketNr, linkNr)) / m->getBytesPerFlit()) / double(m->getNominalFrequency())) / double(m->getNumOnlineCores());
3799  if(m->hasUPI())
3800  {
3801  flits = flits/3.;
3802  }
3803  if (flits > max_flits) return 1.; // prevent overflows due to potential counter dissynchronization
3804  return (flits / max_flits);
3805  }
3806 
3807  return 0;
3808 }
3809 
3820 inline uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3821 {
3822  PCM * m = PCM::getInstance();
3823  if (!(m->outgoingQPITrafficMetricsAvailable())) return 0ULL;
3824 
3825  const double util = getOutgoingQPILinkUtilization(socketNr, linkNr, before, after);
3826  const double max_bytes = (double(m->getQPILinkSpeed(socketNr, linkNr)) * double(getInvariantTSC(before, after) / double(m->getNumOnlineCores())) / double(m->getNominalFrequency()));
3827 
3828  return (uint64)(max_bytes * util);
3829 }
3830 
3831 
3840 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3841 {
3842  PCM * m = PCM::getInstance();
3843  const uint32 ns = m->getNumSockets();
3844  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3845  uint64 sum = 0;
3846 
3847  for (uint32 s = 0; s < ns; ++s)
3848  for (uint32 q = 0; q < qpiLinks; ++q)
3849  sum += getIncomingQPILinkBytes(s, q, before, after);
3850 
3851  return sum;
3852 }
3853 
3862 inline uint64 getAllOutgoingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3863 {
3864  PCM * m = PCM::getInstance();
3865  const uint32 ns = m->getNumSockets();
3866  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3867  uint64 sum = 0;
3868 
3869  for (uint32 s = 0; s < ns; ++s)
3870  for (uint32 q = 0; q < qpiLinks; ++q)
3871  sum += getOutgoingQPILinkBytes(s, q, before, after);
3872 
3873  return sum;
3874 }
3875 
3876 
3886 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now)
3887 {
3888  if (PCM::getInstance()->incomingQPITrafficMetricsAvailable())
3889  return 64 * now.incomingQPIPackets[socketNr][linkNr];
3890  return 0ULL;
3891 }
3892 
3901 inline uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState & now)
3902 {
3903  PCM * m = PCM::getInstance();
3904  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3905  uint64 sum = 0;
3906 
3907  for (uint32 q = 0; q < qpiLinks; ++q)
3908  sum += getIncomingQPILinkBytes(socketNr, q, now);
3909 
3910  return sum;
3911 }
3912 
3921 {
3922  PCM * m = PCM::getInstance();
3923  const uint32 ns = m->getNumSockets();
3924  uint64 sum = 0;
3925 
3926  for (uint32 s = 0; s < ns; ++s)
3927  sum += getSocketIncomingQPILinkBytes(s, now);
3928  return sum;
3929 }
3930 
3931 
3941 inline double getQPItoMCTrafficRatio(const SystemCounterState & before, const SystemCounterState & after)
3942 {
3943  const uint64 totalQPI = getAllIncomingQPILinkBytes(before, after);
3944  uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after);
3945  if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3946  {
3947  memTraffic += getBytesReadFromPMM(before, after) + getBytesWrittenToPMM(before, after);
3948  }
3949  return double(totalQPI) / double(memTraffic);
3950 }
3951 
3958 template <class CounterStateType>
3959 inline double getLocalMemoryRequestRatio(const CounterStateType & before, const CounterStateType & after)
3960 {
3961  if (PCM::getInstance()->localMemoryRequestRatioMetricAvailable() == false) return -1.;
3962  const auto all = after.UncHARequests - before.UncHARequests;
3963  const auto local = after.UncHALocalRequests - before.UncHALocalRequests;
3964  // std::cout << "PCM DEBUG "<< 64*all/1e6 << " " << 64*local/1e6 << "\n";
3965  return double(local)/double(all);
3966 }
3967 
3971 template <class CounterType>
3972 inline uint64 getNumberOfEvents(const CounterType & before, const CounterType & after)
3973 {
3974  return after.data - before.data;
3975 }
3977 
3978 template <class CounterStateType>
3979 inline double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after)
3980 {
3981  auto * m = PCM::getInstance();
3982  if (m->LLCReadMissLatencyMetricsAvailable() == false) return -1.;
3983  const double occupancy = double(after.TOROccupancyIAMiss) - double(before.TOROccupancyIAMiss);
3984  const double inserts = double(after.TORInsertsIAMiss) - double(before.TORInsertsIAMiss);
3985  const double unc_clocks = double(after.UncClocks) - double(before.UncClocks);
3986  const double seconds = double(getInvariantTSC(before, after)) / double(m->getNumOnlineCores()/m->getNumSockets()) / double(m->getNominalFrequency());
3987  return 1e9*seconds*(occupancy/inserts)/unc_clocks;
3988 }
3989 
3990 template <class CounterStateType>
3991 inline uint64 getAllSlots(const CounterStateType & before, const CounterStateType & after)
3992 {
3993  const int64 a = after.BackendBoundSlots - before.BackendBoundSlots;
3994  const int64 b = after.FrontendBoundSlots - before.FrontendBoundSlots;
3995  const int64 c = after.BadSpeculationSlots - before.BadSpeculationSlots;
3996  const int64 d = after.RetiringSlots - before.RetiringSlots;
3997  // std::cout << "before DEBUG: " << before.FrontendBoundSlots << " " << before.BadSpeculationSlots << " "<< before.BackendBoundSlots << " " << before.RetiringSlots << std::endl;
3998  // std::cout << "after DEBUG: " << after.FrontendBoundSlots << " " << after.BadSpeculationSlots << " " << after.BackendBoundSlots << " " << after.RetiringSlots << std::endl;
3999  assert(a >= 0);
4000  assert(b >= 0);
4001  assert(c >= 0);
4002  assert(d >= 0);
4003  return a + b + c + d;
4004 }
4005 
4006 template <class CounterStateType>
4007 inline uint64 getAllSlotsRaw(const CounterStateType& before, const CounterStateType& after)
4008 {
4009  return after.AllSlotsRaw - before.AllSlotsRaw;
4010 }
4011 
4013 template <class CounterStateType>
4014 inline double getBackendBound(const CounterStateType & before, const CounterStateType & after)
4015 {
4016 // std::cout << "DEBUG: "<< after.BackendBoundSlots - before.BackendBoundSlots << " " << getAllSlots(before, after) << std::endl;
4017  if (PCM::getInstance()->isHWTMAL1Supported())
4018  return double(after.BackendBoundSlots - before.BackendBoundSlots)/double(getAllSlots(before, after));
4019  return 0.;
4020 }
4021 
4023 template <class CounterStateType>
4024 inline double getFrontendBound(const CounterStateType & before, const CounterStateType & after)
4025 {
4026 // std::cout << "DEBUG: "<< after.FrontendBoundSlots - before.FrontendBoundSlots << " " << getAllSlots(before, after) << std::endl;
4027  if (PCM::getInstance()->isHWTMAL1Supported())
4028  return double(after.FrontendBoundSlots - before.FrontendBoundSlots)/double(getAllSlots(before, after));
4029  return 0.;
4030 }
4031 
4033 template <class CounterStateType>
4034 inline double getBadSpeculation(const CounterStateType & before, const CounterStateType & after)
4035 {
4036 // std::cout << "DEBUG: "<< after.BadSpeculationSlots - before.BadSpeculationSlots << " " << getAllSlots(before, after) << std::endl;
4037  if (PCM::getInstance()->isHWTMAL1Supported())
4038  return double(after.BadSpeculationSlots - before.BadSpeculationSlots)/double(getAllSlots(before, after));
4039  return 0.;
4040 }
4041 
4043 template <class CounterStateType>
4044 inline double getRetiring(const CounterStateType & before, const CounterStateType & after)
4045 {
4046 // std::cout << "DEBUG: "<< after.RetiringSlots - before.RetiringSlots << " " << getAllSlots(before, after) << std::endl;
4047  if (PCM::getInstance()->isHWTMAL1Supported())
4048  return double(after.RetiringSlots - before.RetiringSlots)/double(getAllSlots(before, after));
4049  return 0.;
4050 }
4051 
4052 template <class CounterStateType>
4053 uint64 getMSREvent(const uint64& index, const PCM::MSRType& type, const CounterStateType& before, const CounterStateType& after)
4054 {
4055  switch (type)
4056  {
4057  case PCM::MSRType::Freerun:
4058  {
4059  const auto beforeIt = before.MSRValues.find(index);
4060  const auto afterIt = after.MSRValues.find(index);
4061  if (beforeIt != before.MSRValues.end() && afterIt != after.MSRValues.end())
4062  {
4063  return afterIt->second - beforeIt->second;
4064  }
4065  break;
4066  }
4067  case PCM::MSRType::Static:
4068  {
4069  const auto result = after.MSRValues.find(index);
4070  if (result != after.MSRValues.end())
4071  {
4072  return result->second;
4073  }
4074  break;
4075  }
4076  }
4077  return 0ULL;
4078 }
4079 
4080 } // namespace pcm
4081 
4082 #endif
uint64 getLocalMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Local Memory Bandwidth.
Definition: cpucounters.h:3398
uint64 getNominalFrequency() const
Reads the nominal core frequency.
Definition: cpucounters.cpp:5479
uint32 getNumCores() const
Reads number of logical cores in the system.
Definition: cpucounters.cpp:5448
double getConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by processor (excluding DRAM)
Definition: cpucounters.h:2633
Definition: memoptest.cpp:24
uint32 getNumMC() const
Returns the number of detected integrated memory controllers.
Definition: cpucounters.h:505
uint64 getQPILinksPerSocket() const
Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket.
Definition: cpucounters.h:1457
uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/...
Definition: cpucounters.h:2537
uint64 getMCDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns MCDRAM clock ticks.
Definition: cpucounters.h:2441
int getRunState(void)
Returns program's Run State.
Definition: cpucounters.h:683
uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:3820
uint64 getM3UPICounter(uint32 port, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of M3UPI PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2466
uint32 getMaxIPC() const
Returns the max number of instructions per cycle.
Definition: cpucounters.h:1577
uint32 getThreadsPerCore() const
Reads how many hardware threads a physical core has. "Hardware thread" is a logical core in a differen...
Definition: cpucounters.cpp:5469
uint32 getMCPerSocket() const
Returns the number of detected integrated memory controllers per socket.
Definition: cpucounters.h:1484
double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:3266
Internal type and constant definitions.
uint32 getCPUStepping() const
Reads CPU stepping id.
Definition: cpucounters.h:1433
bool isServerCPU() const
Returns whether it is a server part.
Definition: cpucounters.h:1641
double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:3316
(Logical) core-wide counter state
Definition: cpucounters.h:2925
uint64 getRemoteMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Remote Memory Bandwidth.
Definition: cpucounters.h:3408
SystemRoot const & getSystemTopology() const
The system, sockets, uncores, cores and threads are structured like a tree.
Definition: cpucounters.h:1025
uint64 getBytesWrittenToPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to PMM memory.
Definition: cpucounters.h:3620
uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: p...
Definition: cpucounters.h:2550
double getAverageUncoreFrequency(const UncoreStateType &before, const UncoreStateType &after)
Computes average uncore frequency.
Definition: cpucounters.h:3235
uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:3690
Definition: types.h:352
Definition: cpucounters.h:776
ProgramMode
Mode of programming (parameter in the program() method)
Definition: cpucounters.h:689
uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:2424
double getDRAMConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by DRAM.
Definition: cpucounters.h:2646
uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:3372
uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles while the clock signal on a specific core is running (not halted) ...
Definition: cpucounters.h:3136
Definition: cpucounters.h:823
uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:2454
Definition: topology.h:349
int32 getPackageThermalSpecPower() const
Returns thermal specification power of the package domain in Watt.
Definition: cpucounters.h:1692
size_t getMCChannels(uint32 socket, uint32 controller) const
Returns the number of detected memory channels on given integrated memory controllers.
Definition: cpucounters.h:1537
double getRetiring(const CounterStateType &before, const CounterStateType &after)
Returns pipeline slots utilized by uops that eventually retire (commit)
Definition: cpucounters.h:4044
double getLocalMemoryRequestRatio(const CounterStateType &before, const CounterStateType &after)
Get local memory access ratio measured in home agent.
Definition: cpucounters.h:3959
double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:3496
Socket-wide counter state.
Definition: cpucounters.h:2938
double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get data utilization of incoming QPI link (0..1)
Definition: cpucounters.h:3753
interface to MBM and CMT using Linux resctrl
Definition: cpucounters.h:119
int32 getThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:2345
uint64 getBytesReadFromPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from PMM memory.
Definition: cpucounters.h:3606
uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
Return QPI Link Speed in GBytes/second.
Definition: cpucounters.h:1683
Definition: cpucounters.h:518
Definition: lspci.h:258
double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:3223
uint32 getNumOnlineCores() const
Reads number of online logical cores in the system.
Definition: cpucounters.cpp:5453
Custom Core event description.
Definition: cpucounters.h:792
double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:3249
Definition: pcm-sensor-server.cpp:247
uint32 getCPUModel() const
Reads CPU model id.
Definition: cpucounters.h:1429
uint64 getIIOCounter(uint32 stack, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of IIO PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2501
double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:3282
uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:2908
int64 getFreeRunningCounter(const typename CounterStateType::FreeRunningCounterID &counter, const CounterStateType &before, const CounterStateType &after)
Returns free running counter if it exists, -1 otherwise.
Definition: cpucounters.h:2605
bool isPackageCStateResidencySupported(int state)
Returns true if the specified package C-state residency metric is supported.
Definition: cpucounters.h:658
uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:3441
Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP...
Definition: cpucounters.h:321
size_t getMCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:1509
Definition: cpucounters.h:1169
SupportedCPUModels
Identifiers of supported CPU models.
Definition: cpucounters.h:1355
double getTotalExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval for the entire system combining ins...
Definition: cpucounters.h:3197
uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:3346
const std::string & getErrorMessage() const
Returns the error message.
Definition: cpucounters.h:1098
ErrorCode
Return codes (e.g. for program(..) method)
Definition: cpucounters.h:697
double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:2401
Basic core counter state.
Definition: cpucounters.h:2187
uint64 getUncoreClocks(const CounterStateType &before, const CounterStateType &after)
Returns uncore clock ticks.
Definition: cpucounters.h:2623
Definition: cpucounters.h:236
int32 getPackageMinimumPower() const
Returns minimum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1695
Extended custom core event description.
Definition: cpucounters.h:806
void setRunState(int new_state)
Set Run State.
Definition: cpucounters.h:677
uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:3087
CPU Performance Monitor.
Definition: cpucounters.h:543
double getCoreIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle for the entire system combining instru...
Definition: cpucounters.h:3179
double getBadSpeculation(const CounterStateType &before, const CounterStateType &after)
Returns wasted pipeline slots due to incorrect speculation, covering whole penalty: Utilized by uops ...
Definition: cpucounters.h:4034
static PCM * getInstance()
Returns PCM object.
Definition: cpucounters.cpp:239
uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:2563
Definition: bw.cpp:12
uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:3734
uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:3578
SystemCounterState getSystemCounterState()
Reads the counter state of the system.
Definition: cpucounters.cpp:4232
Definition: cpucounters.h:174
uint64 getBytesWrittenToEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to MCDRAM memory controllers.
Definition: cpucounters.h:3648
uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:3719
uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:3332
uint64 getGTRequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from GT engine.
Definition: cpucounters.h:3662
uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:3389
Low level interface to access PCI configuration space.
double getBackendBound(const CounterStateType &before, const CounterStateType &after)
Returns unutilized pipeline slots where no uop was delivered due to lack of back-end resources as ran...
Definition: cpucounters.h:4014
bool isCoreCStateResidencySupported(int state)
Returns true if the specified core C-state residency metric is supported.
Definition: cpucounters.h:649
double getLLCReadMissLatency(const CounterStateType &before, const CounterStateType &after)
Returns average last level cache read+prefetch miss latency in ns.
Definition: cpucounters.h:3979
uint64 getAllIncomingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data traffic.
Definition: cpucounters.h:3840
Provides 64-bit "virtual" counters from underlying 32-bit HW counters.
double getQPItoMCTrafficRatio(const SystemCounterState &before, const SystemCounterState &after)
Get QPI data to Memory Controller traffic ratio.
Definition: cpucounters.h:3941
Definition: topology.h:241
Definition: types.h:1245
int32 getPackageMaximumPower() const
Returns maximum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1698
uint64 getPCUFrequency() const
Returns the frequency of Power Control Unit.
Definition: cpucounters.h:1620
Server uncore power counter state.
Definition: cpucounters.h:2793
uint64 getQPILinkSpeed(const uint32 linkNr) const
Returns the speed of the QPI link.
Definition: cpucounters.h:496
Definition: cpucounters.h:262
uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:3483
Definition: topology.h:455
size_t getNumQPIPorts() const
Returns the number of detected QPI ports.
Definition: cpucounters.h:493
System-wide counter state.
Definition: cpucounters.h:2978
uint32 getNumSockets() const
Reads number of sockets (CPUs) in the system.
Definition: cpucounters.cpp:5458
Definition: topology.h:271
uint64 getSMICount(const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred system management interrupts.
Definition: cpucounters.h:3704
int32 getThreadId(uint32 os_id) const
Determines physical thread of given processor ID within a core.
Definition: cpucounters.h:1438
Definition: cpucounters.h:194
uint64 getBytesReadFromEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from MCDRAM memory controllers.
Definition: cpucounters.h:3634
uint64 getNumberOfEvents(const CounterType &before, const CounterType &after)
Returns the raw count of events.
Definition: cpucounters.h:3972
uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:2919
Definition: types.h:1165
Definition: topology.h:109
Definition: cpucounters.h:214
bool isAtom() const
returns true if CPU is Atom-based
Definition: cpucounters.h:1841
int32 getPackageThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:2871
SocketCounterState getSocketCounterState(uint32 socket)
Reads the counter state of a socket.
Definition: cpucounters.cpp:4240
uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:3592
bool isClientCPU() const
Returns whether it is a client part.
Definition: cpucounters.h:1665
double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:3540
Definition: cpucounters.h:152
uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:3152
int32 getCoreId(uint32 os_id) const
Determines physical core of given processor ID within a socket.
Definition: cpucounters.h:1443
uint64 getPCUClocks(const CounterStateType &before, const CounterStateType &after)
Returns clock ticks of power control unit.
Definition: cpucounters.h:2573
double getFrontendBound(const CounterStateType &before, const CounterStateType &after)
Returns unutilized pipeline slots where Front-end did not deliver a uop while back-end is ready as ra...
Definition: cpucounters.h:4024
uint64 getIRPCounter(uint32 stack, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of IRP PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2513
uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState &now)
Get estimation of total QPI data traffic for this socket.
Definition: cpucounters.h:3901
uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:2897
size_t getNumEDCChannels() const
Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) ...
Definition: cpucounters.h:515
uint64 getCBOCounter(uint32 cbo, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of CHA or CBO PMU counter (counter meaning depends on the programming: power/performance/...
Definition: cpucounters.h:2478
CoreCounterState getCoreCounterState(uint32 core)
Reads the counter state of a (logical) core.
Definition: cpucounters.cpp:4248
int32 getTileId(uint32 os_id) const
Determines physical tile (cores sharing L2 cache) of given processor ID.
Definition: cpucounters.h:1448
uint64 getIARequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IA.
Definition: cpucounters.h:3676
double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:3071
Basic uncore counter state.
Definition: cpucounters.h:2678
uint64 getXPICounter(uint32 port, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of UPI or QPI PMU counter (counter meaning depends on the programming: power/performance/...
Definition: cpucounters.h:2525
uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2593
uint64 getUBOXCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of UBOX PMU counter (counter meaning depends on the programming: power/performance/etc) ...
Definition: cpucounters.h:2489
double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:3300
double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:3099
static bool isAtom(const int32 cpu_model_)
Returns true if the CPU model is Atom-based.
Definition: cpucounters.h:1826
Definition: cpucounters.h:127
uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:3468
Definition: utils.h:159
uint64 getAllOutgoingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data+nondata traffic.
Definition: cpucounters.h:3862
double getJoulesPerEnergyUnit() const
Returns how many joules are in an internal processor energy unit.
Definition: cpucounters.h:1689
size_t getEDCChannelsPerSocket() const
Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) per socket...
Definition: cpucounters.h:1564
uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:3427
int32 getSocketId(uint32 core_id) const
Determines socket of given core.
Definition: cpucounters.h:1453
double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:3774
int64 getCPUMicrocodeLevel() const
Get microcode level (returns -1 if retrieval is not supported due to some restrictions) ...
Definition: cpucounters.h:1823
Definition: types.h:317
size_t getNumMCChannels() const
Returns the total number of detected memory channels on all integrated memory controllers.
Definition: cpucounters.h:508
Definition: pcm-iio.cpp:146
Interfaces to access free-running bandwidth counters.
double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:2413
Definition: cpucounters.h:87
uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2583