Educational material on the SciPy implementation of numerical linear algebra algorithms
diff --git a/getrf/benchmark/src/perf.h b/getrf/benchmark/src/perf.h
@@ -10,20 +10,22 @@
#include <assert.h>
-typedef enum {
+enum {
// Here we use L1-dcache-loads & L1-dcache-loads-misses instead of
// cache-misses & cache-references because the L1 data cache is the only
// CPU-specific cache accessible to perf: the LD cache is shared between
// cores
CACHE_LOADS = 0, /* L1-dcache-loads */
CACHE_MISSES, /* L1-dcache-loads-misses */
+ CPU_CYCLES, /* cpu-cycles */
PERF_EVENT_COUNT
-} PerfEvent;
+};
uint32_t perf_event_types[PERF_EVENT_COUNT] = {
[CACHE_LOADS] = PERF_TYPE_HW_CACHE,
[CACHE_MISSES] = PERF_TYPE_HW_CACHE,
+ [CPU_CYCLES] = PERF_TYPE_HARDWARE,
};
uint64_t perf_event_configs[PERF_EVENT_COUNT] = {
@@ -33,19 +35,22 @@ uint64_t perf_event_configs[PERF_EVENT_COUNT] = {
[CACHE_MISSES] = PERF_COUNT_HW_CACHE_L1D
| (PERF_COUNT_HW_CACHE_OP_READ << 8)
| (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
+ [CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
};
const char *perf_event_str[PERF_EVENT_COUNT] = {
[CACHE_LOADS] = "CACHE_LOADS",
[CACHE_MISSES] = "CACHE_MISSES",
+ [CPU_CYCLES] = "CPU_CYCLES",
};
-static_assert(PERF_EVENT_COUNT == 2,
- "We should add more filds for this structure"
+static_assert(PERF_EVENT_COUNT == 3,
+ "We should add more fields for this structure "
"if we add more events");
typedef struct {
uint64_t cache_loads;
uint64_t cache_misses;
+ uint64_t cpu_cycles;
} PerfResult;
typedef struct {
@@ -83,7 +88,7 @@ void perf_start_recording(PerfRecorder *pr, size_t cpu_id)
}
}
-void perf_stop_recording(PerfRecorder *pr)
+PerfResult perf_stop_recording(PerfRecorder *pr)
{
for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
ioctl(pr->fds[i], PERF_EVENT_IOC_DISABLE, 0); // stop counting
@@ -94,6 +99,8 @@ void perf_stop_recording(PerfRecorder *pr)
}
close(pr->fds[i]);
}
+
+ return pr->result;
}
#endif // PERF_H_