numeric-linalg

Educational material on the SciPy implementation of numerical linear algebra algorithms

File Name Size Mode
perf.h 3428B -rw-r--r--
  1 #ifndef PERF_H_
  2 #define PERF_H_
  3 
  4 #ifndef __linux__
  5 #error "The perf.h library can only be compiled in the Linux platform"
  6 #endif // __linux__
  7 
  #include <assert.h>
  #include <errno.h>
  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #include <linux/hw_breakpoint.h>
  #include <linux/perf_event.h>
 16 
 // Indices of the hardware events sampled by this library. They are used to
 // index the per-event tables and the file descriptors of a recorder.
 //
 // The L1 data-cache events (L1-dcache-loads & L1-dcache-load-misses) are
 // used instead of cache-references & cache-misses because, per the original
 // author's note, the L1 data cache is the only CPU-specific cache accessible
 // to perf: the other cache level is shared between cores.
 enum {
   // L1-dcache-loads
   CACHE_LOADS,
   // L1-dcache-load-misses
   CACHE_MISSES,
   // cpu-cycles
   CPU_CYCLES,

   // Number of events listed above; must stay the last enumerator
   PERF_EVENT_COUNT
 };
 28 
 29 static_assert(PERF_EVENT_COUNT == 3,
 30               "We should update this array if we add more events");
 31 uint32_t perf_event_types[PERF_EVENT_COUNT] = {
 32   [CACHE_LOADS]  = PERF_TYPE_HW_CACHE,
 33   [CACHE_MISSES] = PERF_TYPE_HW_CACHE,
 34   [CPU_CYCLES]   = PERF_TYPE_HARDWARE,
 35 };
 36 
 37 static_assert(PERF_EVENT_COUNT == 3,
 38               "We should update this array if we add more events");
 39 uint64_t perf_event_configs[PERF_EVENT_COUNT] = {
 40   [CACHE_LOADS]  = PERF_COUNT_HW_CACHE_L1D
 41                  | (PERF_COUNT_HW_CACHE_OP_READ << 8)
 42                  | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
 43   [CACHE_MISSES] = PERF_COUNT_HW_CACHE_L1D
 44                  | (PERF_COUNT_HW_CACHE_OP_READ << 8)
 45                  | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
 46   [CPU_CYCLES]   = PERF_COUNT_HW_CPU_CYCLES,
 47 };
 48 
 49 static_assert(PERF_EVENT_COUNT == 3,
 50               "We should update this array if we add more events");
 51 const char *perf_event_str[PERF_EVENT_COUNT] = {
 52   [CACHE_LOADS]  = "CACHE_LOADS",
 53   [CACHE_MISSES] = "CACHE_MISSES",
 54   [CPU_CYCLES]   = "CPU_CYCLES",
 55 };
 56 
 57 static_assert(PERF_EVENT_COUNT == 3,
 58               "We should add more filds for this structure "
 59               "if we add more events");
 60 typedef struct {
 61   uint64_t cache_loads;
 62   uint64_t cache_misses;
 63   uint64_t cpu_cycles;
 64 } PerfResult;
 65 
 66 typedef struct {
 67   int fds[PERF_EVENT_COUNT];
 68 } PerfRecorder;
 69 
 70 void perf_start_recording(PerfRecorder *pr, size_t cpu_id)
 71 {
 72   struct perf_event_attr pe = {0};
 73 
 74   for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
 75     pe.type = perf_event_types[i];
 76     pe.size = sizeof(struct perf_event_attr);
 77     pe.config = perf_event_configs[i];
 78     pe.disabled = 1;       // start disabled
 79     pe.exclude_kernel = 1; // exclude kernel events
 80     pe.exclude_hv = 1;     // exclude hypervisor events
 81 
 82     int fd = syscall(SYS_perf_event_open, &pe, 0,
 83                      (int)cpu_id /* only count events in this specific CPU */,
 84                      -1, 0);
 85     if (fd == -1) {
 86       fprintf(stderr, "ERROR: Couldn't open perf event %s!",
 87               perf_event_str[i]);
 88       exit(EXIT_FAILURE);
 89     }
 90 
 91     pr->fds[i] = fd;
 92     ioctl(fd, PERF_EVENT_IOC_RESET,  0); // reset the counter
 93     ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); // start counting
 94   }
 95 }
 96 
 97 PerfResult perf_stop_recording(PerfRecorder *pr)
 98 {
 99   union { uint64_t raw_result[PERF_EVENT_COUNT]; PerfResult result; } r;
100 
101   for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
102     ioctl(pr->fds[i], PERF_EVENT_IOC_DISABLE, 0); // stop counting
103     if (!read(pr->fds[i], &r.raw_result[i], sizeof(uint64_t))) {
104       fprintf(stderr, "ERROR: Coulnd't read perf counter for %s!\n",
105               perf_event_str[i]);
106       exit(EXIT_FAILURE);
107     }
108     close(pr->fds[i]);
109   }
110 
111   return r.result;
112 }
113 
114 #endif // PERF_H_