numeric-linalg
Educational material on the SciPy implementation of numerical linear algebra algorithms
Name | Size | Mode | |
.. | |||
getrf/benchmarks/src/perf.h | 3428B | -rw-r--r-- |
001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
#ifndef PERF_H_
#define PERF_H_

// Tiny header-only wrapper around Linux perf_event_open(2) that records
// L1-dcache loads, L1-dcache load misses and CPU cycles around a benchmarked
// region.  Usage: perf_start_recording(&pr, cpu); ...work...; PerfResult r =
// perf_stop_recording(&pr);

#ifndef __linux__
#error "The perf.h library can only be compiled in the Linux platform"
#endif // __linux__

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <assert.h>
// BUG FIX: the following headers were missing although their names are used
// below (uint32_t/uint64_t, fprintf/stderr, exit/EXIT_FAILURE, size_t); the
// header previously compiled only via transitive includes.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

enum {
    // here we use L1-dcache-loads & L1-dcache-loads-misses instead of
    // cache-misses & cache-references because the L1 data cache is the only
    // CPU-specific cache accessible to perf: the LD cache is shared between
    // cores
    CACHE_LOADS = 0, /* L1-dcache-loads */
    CACHE_MISSES,    /* L1-dcache-loads-misses */
    CPU_CYCLES,      /* cpu-cycles */
    PERF_EVENT_COUNT
};

// perf_event_attr.type for each event.
// NOTE: all tables below are `static const` (they were plain globals before),
// otherwise including this header from more than one translation unit causes
// multiple-definition link errors.
static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
static const uint32_t perf_event_types[PERF_EVENT_COUNT] = {
    [CACHE_LOADS] = PERF_TYPE_HW_CACHE,
    [CACHE_MISSES] = PERF_TYPE_HW_CACHE,
    [CPU_CYCLES] = PERF_TYPE_HARDWARE,
};

// perf_event_attr.config for each event.  Cache events are encoded as
// cache-id | (op << 8) | (result << 16), see perf_event_open(2).
static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
static const uint64_t perf_event_configs[PERF_EVENT_COUNT] = {
    [CACHE_LOADS] = PERF_COUNT_HW_CACHE_L1D |
                    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
    [CACHE_MISSES] = PERF_COUNT_HW_CACHE_L1D |
                     (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                     (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
    [CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
};

// Human-readable event names, used only in error messages.
static_assert(PERF_EVENT_COUNT == 3,
              "We should update this array if we add more events");
static const char *perf_event_str[PERF_EVENT_COUNT] = {
    [CACHE_LOADS] = "CACHE_LOADS",
    [CACHE_MISSES] = "CACHE_MISSES",
    [CPU_CYCLES] = "CPU_CYCLES",
};

static_assert(PERF_EVENT_COUNT == 3,
              "We should add more fields for this structure "
              "if we add more events");
// Counter values read back after a recording session.  Field order MUST
// match the enum above: perf_stop_recording fills this struct through a
// union with a uint64_t[PERF_EVENT_COUNT] indexed by the enum.
typedef struct {
    uint64_t cache_loads;
    uint64_t cache_misses;
    uint64_t cpu_cycles;
} PerfResult;

// One open perf fd per event; valid between start and stop.
typedef struct {
    int fds[PERF_EVENT_COUNT];
} PerfRecorder;

// Opens, resets and enables one counter per event.  Counts this process
// (pid = 0) only while it runs on `cpu_id`.  Exits the process on failure.
// `static` (was extern): header-defined functions must not have external
// linkage or two including TUs fail to link.
static void perf_start_recording(PerfRecorder *pr, size_t cpu_id) {
    for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
        // Fully re-initialized each iteration so no field can leak from the
        // previous event's configuration; unset fields are zero.
        struct perf_event_attr pe = {
            .type = perf_event_types[i],
            .size = sizeof(struct perf_event_attr),
            .config = perf_event_configs[i],
            .disabled = 1,       // start disabled
            .exclude_kernel = 1, // exclude kernel events
            .exclude_hv = 1,     // exclude hypervisor events
        };
        int fd = syscall(SYS_perf_event_open, &pe, 0,
                         (int)cpu_id /* only count events in this specific
                                        CPU */,
                         -1, 0);
        if (fd == -1) {
            // BUG FIX: message previously lacked the trailing newline.
            fprintf(stderr, "ERROR: Couldn't open perf event %s!\n",
                    perf_event_str[i]);
            exit(EXIT_FAILURE);
        }
        pr->fds[i] = fd;
        ioctl(fd, PERF_EVENT_IOC_RESET, 0);  // reset the counter
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); // start counting
    }
}

// Disables the counters, reads their final values and closes the fds.
// Exits the process on a failed read.  Returns the three counts.
static PerfResult perf_stop_recording(PerfRecorder *pr) {
    union {
        uint64_t raw_result[PERF_EVENT_COUNT];
        PerfResult result;
    } r;
    for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
        ioctl(pr->fds[i], PERF_EVENT_IOC_DISABLE, 0); // stop counting
        // BUG FIX: the old check was `!read(...)`, but read() returns -1 on
        // error — nonzero, hence treated as success, and the loop went on
        // with an uninitialized counter value.  Require the full 8 bytes.
        if (read(pr->fds[i], &r.raw_result[i], sizeof(uint64_t)) !=
            (ssize_t)sizeof(uint64_t)) {
            // Also fixes the "Coulnd't" typo in the original message.
            fprintf(stderr, "ERROR: Couldn't read perf counter for %s!\n",
                    perf_event_str[i]);
            exit(EXIT_FAILURE);
        }
        close(pr->fds[i]);
    }
    return r.result;
}

#endif // PERF_H_