numeric-linalg

Educational material on the SciPy implementation of numerical linear algebra algorithms

Name | Size | Mode
..
getrf/benchmarks/src/perf.h 3428B -rw-r--r--
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#ifndef PERF_H_
#define PERF_H_

#ifndef __linux__
#error "The perf.h library can only be compiled in the Linux platform"
#endif // __linux__

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Indices of the hardware events we record, used to index every
// perf_event_* table below and the fds array in PerfRecorder.
//
// We count L1-dcache-loads / L1-dcache-loads-misses rather than the
// generic cache-references / cache-misses pair because the L1 data cache
// is the only per-core cache perf exposes; the last-level cache is shared
// between cores and would mix in other cores' traffic.
enum {
  CACHE_LOADS  = 0, // L1-dcache-loads
  CACHE_MISSES = 1, // L1-dcache-loads-misses
  CPU_CYCLES   = 2, // cpu-cycles

  PERF_EVENT_COUNT = 3,
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
// perf_event_attr.type for each event. `static const`: a plain global
// definition in a header produces duplicate-symbol link errors as soon as
// the header is included from more than one translation unit.
static const uint32_t perf_event_types[PERF_EVENT_COUNT] = {
  [CACHE_LOADS]  = PERF_TYPE_HW_CACHE,
  [CACHE_MISSES] = PERF_TYPE_HW_CACHE,
  [CPU_CYCLES]   = PERF_TYPE_HARDWARE,
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
// perf_event_attr.config for each event. HW_CACHE configs are encoded as
// cache_id | (op_id << 8) | (result_id << 16), per perf_event_open(2).
// `static const`: a plain global definition in a header produces
// duplicate-symbol link errors when included from more than one TU.
static const uint64_t perf_event_configs[PERF_EVENT_COUNT] = {
  [CACHE_LOADS]  = PERF_COUNT_HW_CACHE_L1D
                 | (PERF_COUNT_HW_CACHE_OP_READ << 8)
                 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
  [CACHE_MISSES] = PERF_COUNT_HW_CACHE_L1D
                 | (PERF_COUNT_HW_CACHE_OP_READ << 8)
                 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
  [CPU_CYCLES]   = PERF_COUNT_HW_CPU_CYCLES,
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
// Human-readable event names for error messages. `static const`: a plain
// global definition in a header produces duplicate-symbol link errors
// when the header is included from more than one translation unit.
static const char *const perf_event_str[PERF_EVENT_COUNT] = {
  [CACHE_LOADS]  = "CACHE_LOADS",
  [CACHE_MISSES] = "CACHE_MISSES",
  [CPU_CYCLES]   = "CPU_CYCLES",
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should add more fields for this structure "
              "if we add more events");
// Counter values collected between perf_start_recording() and
// perf_stop_recording(). Field order must match the event enum:
// perf_stop_recording() fills this struct through a union with a
// uint64_t[PERF_EVENT_COUNT] array indexed by that enum.
typedef struct {
  uint64_t cache_loads;  // CACHE_LOADS
  uint64_t cache_misses; // CACHE_MISSES
  uint64_t cpu_cycles;   // CPU_CYCLES
} PerfResult;

// One perf event fd per tracked event, indexed by the event enum.
// Populated by perf_start_recording(); read and closed by
// perf_stop_recording(), after which the recorder must not be reused.
typedef struct {
  int fds[PERF_EVENT_COUNT];
} PerfRecorder;

// Open, reset, and enable one perf counter per tracked event, storing the
// fds in `pr`. Counts user-space events of the calling process (pid 0)
// only while it runs on CPU `cpu_id`. Exits the process if any counter
// cannot be opened (e.g. insufficient perf_event_paranoid permissions).
//
// `static`: this function is defined in a header, so without internal
// linkage it would cause duplicate-symbol errors across translation units.
static void perf_start_recording(PerfRecorder *pr, size_t cpu_id)
{
  struct perf_event_attr pe = {0};

  for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
    pe.type = perf_event_types[i];
    pe.size = sizeof(struct perf_event_attr);
    pe.config = perf_event_configs[i];
    pe.disabled = 1;       // start disabled
    pe.exclude_kernel = 1; // exclude kernel events
    pe.exclude_hv = 1;     // exclude hypervisor events

    int fd = syscall(SYS_perf_event_open, &pe, 0 /* this process */,
                     (int)cpu_id /* only count events in this specific CPU */,
                     -1 /* no group leader */, 0 /* no flags */);
    if (fd == -1) {
      // '\n' added for consistency with perf_stop_recording's message.
      fprintf(stderr, "ERROR: Couldn't open perf event %s!\n",
              perf_event_str[i]);
      exit(EXIT_FAILURE);
    }

    pr->fds[i] = fd;
    ioctl(fd, PERF_EVENT_IOC_RESET,  0); // reset the counter
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); // start counting
  }
}

// Disable every counter opened by perf_start_recording(), read its final
// value, close the fds, and return the counts. Exits the process on a
// short or failed read. `pr` must not be reused afterwards.
//
// `static`: defined in a header, so internal linkage is required to avoid
// duplicate-symbol errors across translation units.
static PerfResult perf_stop_recording(PerfRecorder *pr)
{
  // The union lets us fill counters by event index and return the
  // named-field view; PerfResult's field order matches the event enum.
  union { uint64_t raw_result[PERF_EVENT_COUNT]; PerfResult result; } r;

  for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
    ioctl(pr->fds[i], PERF_EVENT_IOC_DISABLE, 0); // stop counting
    // read() returns -1 on error (which is truthy, so `!read(...)` would
    // miss it); require a full 8-byte counter read instead.
    ssize_t nread = read(pr->fds[i], &r.raw_result[i], sizeof(uint64_t));
    if (nread != (ssize_t)sizeof(uint64_t)) {
      fprintf(stderr, "ERROR: Couldn't read perf counter for %s!\n",
              perf_event_str[i]);
      exit(EXIT_FAILURE);
    }
    close(pr->fds[i]);
  }

  return r.result;
}

#endif // PERF_H_