numeric-linalg

Educational material on the SciPy implementation of numerical linear algebra algorithms

Name | Size | Mode
..
getrf/benchmarks/src/perf.h 3428B -rw-r--r--
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#ifndef PERF_H_
#define PERF_H_

#ifndef __linux__
#error "The perf.h library can only be compiled in the Linux platform"
#endif // __linux__

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Indices of the hardware events we record, used to index every
// perf_event_* table below and the fds array in PerfRecorder.
//
// We count L1-dcache-loads / L1-dcache-loads-misses rather than the
// generic cache-references / cache-misses pair because the L1 data cache
// is the only per-core cache perf exposes; the last-level cache is shared
// between cores and would mix in other cores' traffic.
enum {
  CACHE_LOADS  = 0, // L1-dcache-loads
  CACHE_MISSES = 1, // L1-dcache-loads-misses
  CPU_CYCLES   = 2, // cpu-cycles

  PERF_EVENT_COUNT = 3,
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
// perf_event_attr.type for each event. `static const`: a plain global
// definition in a header produces duplicate-symbol link errors as soon as
// the header is included from more than one translation unit.
static const uint32_t perf_event_types[PERF_EVENT_COUNT] = {
  [CACHE_LOADS]  = PERF_TYPE_HW_CACHE,
  [CACHE_MISSES] = PERF_TYPE_HW_CACHE,
  [CPU_CYCLES]   = PERF_TYPE_HARDWARE,
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
// perf_event_attr.config for each event. HW_CACHE configs are encoded as
// cache_id | (op_id << 8) | (result_id << 16), per perf_event_open(2).
// `static const`: a plain global definition in a header produces
// duplicate-symbol link errors when included from more than one TU.
static const uint64_t perf_event_configs[PERF_EVENT_COUNT] = {
  [CACHE_LOADS]  = PERF_COUNT_HW_CACHE_L1D
                 | (PERF_COUNT_HW_CACHE_OP_READ << 8)
                 | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
  [CACHE_MISSES] = PERF_COUNT_HW_CACHE_L1D
                 | (PERF_COUNT_HW_CACHE_OP_READ << 8)
                 | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
  [CPU_CYCLES]   = PERF_COUNT_HW_CPU_CYCLES,
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
// Human-readable event names for error messages. `static const`: a plain
// global definition in a header produces duplicate-symbol link errors
// when the header is included from more than one translation unit.
static const char *const perf_event_str[PERF_EVENT_COUNT] = {
  [CACHE_LOADS]  = "CACHE_LOADS",
  [CACHE_MISSES] = "CACHE_MISSES",
  [CPU_CYCLES]   = "CPU_CYCLES",
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should add more fields for this structure "
              "if we add more events");
// Counter values collected between perf_start_recording() and
// perf_stop_recording(). Field order must match the event enum:
// perf_stop_recording() fills this struct through a union with a
// uint64_t[PERF_EVENT_COUNT] array indexed by that enum.
typedef struct {
  uint64_t cache_loads;  // CACHE_LOADS
  uint64_t cache_misses; // CACHE_MISSES
  uint64_t cpu_cycles;   // CPU_CYCLES
} PerfResult;

// One perf event fd per tracked event, indexed by the event enum.
// Populated by perf_start_recording(); read and closed by
// perf_stop_recording(), after which the recorder must not be reused.
typedef struct {
  int fds[PERF_EVENT_COUNT];
} PerfRecorder;

// Open, reset, and enable one perf counter per tracked event, storing the
// fds in `pr`. Counts user-space events of the calling process (pid 0)
// only while it runs on CPU `cpu_id`. Exits the process if any counter
// cannot be opened (e.g. insufficient perf_event_paranoid permissions).
//
// `static`: this function is defined in a header, so without internal
// linkage it would cause duplicate-symbol errors across translation units.
static void perf_start_recording(PerfRecorder *pr, size_t cpu_id)
{
  struct perf_event_attr pe = {0};

  for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
    pe.type = perf_event_types[i];
    pe.size = sizeof(struct perf_event_attr);
    pe.config = perf_event_configs[i];
    pe.disabled = 1;       // start disabled
    pe.exclude_kernel = 1; // exclude kernel events
    pe.exclude_hv = 1;     // exclude hypervisor events

    int fd = syscall(SYS_perf_event_open, &pe, 0 /* this process */,
                     (int)cpu_id /* only count events in this specific CPU */,
                     -1 /* no group leader */, 0 /* no flags */);
    if (fd == -1) {
      // '\n' added for consistency with perf_stop_recording's message.
      fprintf(stderr, "ERROR: Couldn't open perf event %s!\n",
              perf_event_str[i]);
      exit(EXIT_FAILURE);
    }

    pr->fds[i] = fd;
    ioctl(fd, PERF_EVENT_IOC_RESET,  0); // reset the counter
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); // start counting
  }
}

// Disable every counter opened by perf_start_recording(), read its final
// value, close the fds, and return the counts. Exits the process on a
// short or failed read. `pr` must not be reused afterwards.
//
// `static`: defined in a header, so internal linkage is required to avoid
// duplicate-symbol errors across translation units.
static PerfResult perf_stop_recording(PerfRecorder *pr)
{
  // The union lets us fill counters by event index and return the
  // named-field view; PerfResult's field order matches the event enum.
  union { uint64_t raw_result[PERF_EVENT_COUNT]; PerfResult result; } r;

  for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
    ioctl(pr->fds[i], PERF_EVENT_IOC_DISABLE, 0); // stop counting
    // read() returns -1 on error (which is truthy, so `!read(...)` would
    // miss it); require a full 8-byte counter read instead.
    ssize_t nread = read(pr->fds[i], &r.raw_result[i], sizeof(uint64_t));
    if (nread != (ssize_t)sizeof(uint64_t)) {
      fprintf(stderr, "ERROR: Couldn't read perf counter for %s!\n",
              perf_event_str[i]);
      exit(EXIT_FAILURE);
    }
    close(pr->fds[i]);
  }

  return r.result;
}

#endif // PERF_H_