numeric-linalg
Educational material on the SciPy implementation of numerical linear algebra algorithms
Name | Size | Mode | |
.. | |||
getrf/benchmarks/src/main.c | 6714B | -rw-r--r-- |
001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
#include <stdio.h> #include <stdlib.h> #define __USE_GNU #include <sched.h> #include <pthread.h> #include <stdint.h> #include <stdbool.h> #include <string.h> #include <errno.h> #include "progress-bar.h" #include "perf.h" #include "config.h" #define HISTOGRAM_SIZE (MAX_N/STEP) #define LOGISTICS_INITIAL_CONDITION (-800.) uint64_t histograms[PERF_EVENT_COUNT][HISTOGRAM_SIZE]; void (*getrf)(int *m, int *n, double *A, int *lda, int *ipiv, int *info); ProgressBar progress = { .total = HISTOGRAM_SIZE, .count = 0, .mutex = PTHREAD_MUTEX_INITIALIZER, }; // .data is a pointer because it should be allocated in the heap // (.data DOES NOT fit in the stack 🤡) typedef struct { double *ref_data; // input parameters for dgetrf double *data; int32_t ipiv[MAX_N]; int n, m, lda, info; size_t id; PerfRecorder recorder; } Thread; typedef struct { Thread threads[N_THREADS]; } Benchmarker; /* * LAPACK functions */ extern void dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info); extern void dgetrfnaive_(int *m, int *n, double *A, int *lda, int *ipiv, int *info); extern void dgetrf2_(int *m, int *n, double *A, int *lda, int *ipiv, int *info); PerfResult thread_run_benchmark(Thread *thread, size_t n) { // reinitializes the values in the .data array: this avoids progressively // moving larger values to the beginning of the array, which would decrease // the number of row interchanges required for computations memcpy(thread->data, thread->ref_data, sizeof(double)*n*n); thread->n = n; thread->m = n; thread->lda = n; perf_start_recording(&thread->recorder, thread->id); getrf(&thread->n, &thread->m, thread->data, &thread->lda, thread->ipiv, &thread->info); return perf_stop_recording(&thread->recorder); } void *thread_benchmark(void *arg) { Thread *thread = (Thread*) arg; // we need to lock the running thread to some CPU so that we can mask // perf_event_open with this specific CPU cpu_set_t set; CPU_ZERO(&set); CPU_SET(thread->id, &set); if (sched_setaffinity(0, sizeof(set), &set) == -1) { fprintf(stderr, "ERROR: Could not lock CPU affinity for thread %lu\n", thread->id); } // computations are distributed evenly across threads for (size_t n = 1 + thread->id*STEP; n < MAX_N; n += STEP*N_THREADS) { PerfResult result = thread_run_benchmark(thread, n); size_t i = (n - 1)/STEP; static_assert(PERF_EVENT_COUNT == 3, "We need to update the assignements " "if we add more perf events"); histograms[CACHE_LOADS][i] = result.cache_loads; histograms[CACHE_MISSES][i] = result.cache_misses; histograms[CPU_CYCLES][i] = result.cpu_cycles; progress_bar_inc(&progress); } return NULL; } Benchmarker benchmarker_new(double *ref_data) { Benchmarker bench = {0}; // this array will live for the entire duration of the program, // so we might as well leak it 🤡 double *data = malloc(sizeof(double)*MAX_N*MAX_N*N_THREADS); if (data == NULL) { fprintf(stderr, "ERROR: Buy more RAM!\n"); exit(EXIT_FAILURE); } for (size_t i = 0; i < N_THREADS; i++) { bench.threads[i].data = data + i*MAX_N*MAX_N; bench.threads[i].ref_data = ref_data; bench.threads[i].id = i; } return bench; } void benchmarker_run(Benchmarker *bench) { pthread_t handles[N_THREADS] = {0}; for (size_t i = 0; i < N_THREADS; i++) { if (pthread_create(&handles[i], NULL, thread_benchmark, &bench->threads[i]) != 0) { fprintf(stderr, "ERROR: Failed to spawn thread %lu!\n", i); exit(EXIT_FAILURE); } } for (size_t i = 0; i < N_THREADS; i++) { pthread_join(handles[i], NULL); } } int main(int argc, char **argv) { const char *output_paths[PERF_EVENT_COUNT]; static_assert(PERF_EVENT_COUNT == 3, "We need to update argv parsing if we add more perf events"); if (argc < 2 || strcmp(argv[1], "standard") == 0) { getrf = dgetrf_; output_paths[CACHE_LOADS] = OUTPUT_DIR "cache-loads.bin"; output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses.bin"; output_paths[CPU_CYCLES] = OUTPUT_DIR "cpu-cycles.bin"; } else if (strcmp(argv[1], "naive") == 0) { getrf = dgetrfnaive_; output_paths[CACHE_LOADS] = OUTPUT_DIR "cache-loads-naive.bin"; output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-naive.bin"; output_paths[CPU_CYCLES] = OUTPUT_DIR "cpu-cycles-naive.bin"; } else if (strcmp(argv[1], "unblocked") == 0) { getrf = dgetrf2_; output_paths[CACHE_LOADS] = OUTPUT_DIR "cache-loads-unblocked.bin"; output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-unblocked.bin"; output_paths[CPU_CYCLES] = OUTPUT_DIR "cpu-cycles-unblocked.bin"; } else { fprintf(stderr, "ERROR: unknown command \"%s\"\n", argv[1]); fprintf(stderr, "USAGE: %s [standard|naive|unblocked]\n", argv[0]); exit(EXIT_FAILURE); } // ======================================================================== printf("INFO: Initializing random input data... "); // this array will live for the entire duration of the program, // so we might as well leak it 🤡 double *ref_data = malloc(sizeof(double)*MAX_N*MAX_N); if (ref_data == NULL) { fprintf(stderr, "ERROR: Buy more RAM!\n"); exit(EXIT_FAILURE); } // pseudorandom data given by the logistic map double acc = LOGISTICS_INITIAL_CONDITION; for (size_t i = 0; i < MAX_N*MAX_N; i++) { ref_data[i] = acc; acc = 1000. - acc*acc/500.; } printf("done!\n"); // ======================================================================== printf("INFO: Benchmarking the dgetrf on %ux%u matrices... (using %u threads)\n", MAX_N, MAX_N, N_THREADS); Benchmarker bench = benchmarker_new(ref_data); benchmarker_run(&bench); // ======================================================================== printf("INFO: Done with benchmarking! Saving histograms to disk...\n"); for (size_t i = 0; i < PERF_EVENT_COUNT; i++) { FILE *output = fopen(output_paths[i], "w"); if (output == NULL) { fprintf(stderr, "ERROR: Coundn't open output file \"%s\": %s", output_paths[i], strerror(errno)); return EXIT_FAILURE; } size_t written = fwrite(&histograms[i], sizeof(uint64_t), HISTOGRAM_SIZE, output); if (written < HISTOGRAM_SIZE) { fprintf(stderr, "ERROR: Failed to write histogram to output file \"%s\": %s\n", output_paths[i], strerror(errno)); return EXIT_FAILURE; } }; printf("INFO: Done!\n"); return EXIT_SUCCESS; }