numeric-linalg
Educational material on the SciPy implementation of numerical linear algebra algorithms
File Name | Size | Mode |
main.c | 6722B | -rw-r--r-- |
1 #include <stdio.h> 2 #include <stdlib.h> 3 4 #define __USE_GNU 5 #include <sched.h> 6 #include <pthread.h> 7 8 #include <stdint.h> 9 #include <stdbool.h> 10 11 #include <string.h> 12 #include <errno.h> 13 14 #include "progress-bar.h" 15 #include "perf.h" 16 #include "config.h" 17 18 #define HISTOGRAM_SIZE (MAX_N/STEP) 19 #define LOGISTICS_INITIAL_CONDITION (-800.) 20 21 uint64_t histograms[PERF_EVENT_COUNT][HISTOGRAM_SIZE]; 22 void (*getrf)(int *m, int *n, double *A, int *lda, int *ipiv, int *info); 23 24 ProgressBar progress = { 25 .total = HISTOGRAM_SIZE, 26 .count = 0, 27 .mutex = PTHREAD_MUTEX_INITIALIZER, 28 }; 29 30 // .data is a pointer because it should be allocated in the heap 31 // (.data DOES NOT fit in the stack 🤡) 32 typedef struct { 33 double *ref_data; 34 35 // Input parameters for dgetrf 36 double *data; 37 int32_t ipiv[MAX_N]; 38 int n, m, lda, info; 39 40 size_t id; 41 PerfRecorder recorder; 42 } Thread; 43 44 typedef struct { 45 Thread threads[N_THREADS]; 46 } Benchmarker; 47 48 /* 49 * LAPACK functions 50 */ 51 extern void dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info); 52 extern void dgetrfnaive_(int *m, int *n, double *A, int *lda, int *ipiv, int *info); 53 extern void dgetrf2_(int *m, int *n, double *A, int *lda, int *ipiv, int *info); 54 55 PerfResult thread_run_benchmark(Thread *thread, size_t n) 56 { 57 // Reinitializes the values in the .data array: this avoids progressively 58 // moving larger values to the beginning of the array, which would decrise 59 // decrease the number of row interchanges required for computations 60 memcpy(thread->data, thread->ref_data, sizeof(double)*n*n); 61 thread->n = n; thread->m = n; thread->lda = n; 62 63 perf_start_recording(&thread->recorder, thread->id); 64 getrf(&thread->n, &thread->m, thread->data, &thread->lda, 65 thread->ipiv, &thread->info); 66 return perf_stop_recording(&thread->recorder); 67 } 68 69 void *thread_benchmark(void *arg) 70 { 71 Thread *thread = (Thread*) arg; 72 73 // We need to lock the running thread to some CPU so that we can mask 74 // perf_event_open with this specific CPU 75 cpu_set_t set; 76 CPU_ZERO(&set); 77 CPU_SET(thread->id, &set); 78 if (sched_setaffinity(0, sizeof(set), &set) == -1) { 79 fprintf(stderr, "ERROR: Could not lock CPU affinity for thread %lu\n", 80 thread->id); 81 } 82 83 // Computations are distributed evenly across threads 84 for (size_t n = 1 + thread->id*STEP; n < MAX_N; n += STEP*N_THREADS) { 85 PerfResult result = thread_run_benchmark(thread, n); 86 size_t i = (n - 1)/STEP; 87 88 static_assert(PERF_EVENT_COUNT == 3, 89 "We need to update the assignements " 90 "if we add more perf events"); 91 histograms[CACHE_LOADS][i] = result.cache_loads; 92 histograms[CACHE_MISSES][i] = result.cache_misses; 93 histograms[CPU_CYCLES][i] = result.cpu_cycles; 94 95 progress_bar_inc(&progress); 96 } 97 98 return NULL; 99 } 100 101 Benchmarker benchmarker_new(double *ref_data) 102 { 103 Benchmarker bench = {0}; 104 105 // This array will live for the entire duration of the program, 106 // so we might as well leak it 🤡 107 double *data = malloc(sizeof(double)*MAX_N*MAX_N*N_THREADS); 108 if (data == NULL) { 109 fprintf(stderr, "ERROR: Buy more RAM!\n"); 110 exit(EXIT_FAILURE); 111 } 112 113 for (size_t i = 0; i < N_THREADS; i++) { 114 bench.threads[i].data = data + i*MAX_N*MAX_N; 115 bench.threads[i].ref_data = ref_data; 116 bench.threads[i].id = i; 117 } 118 119 return bench; 120 } 121 122 void benchmarker_run(Benchmarker *bench) 123 { 124 pthread_t handles[N_THREADS] = {0}; 125 for (size_t i = 0; i < N_THREADS; i++) { 126 if (pthread_create(&handles[i], NULL, 127 thread_benchmark, 128 &bench->threads[i]) != 0) { 129 fprintf(stderr, "ERROR: Failed to spawn thread %lu!\n", i); 130 exit(EXIT_FAILURE); 131 } 132 } 133 134 for (size_t i = 0; i < N_THREADS; i++) { 135 pthread_join(handles[i], NULL); 136 } 137 } 138 139 140 int main(int argc, char **argv) 141 { 142 const char *output_paths[PERF_EVENT_COUNT]; 143 144 static_assert(PERF_EVENT_COUNT == 3, 145 "We need to update argv parsing if we add more perf events"); 146 if (argc < 2 || strcmp(argv[1], "standard") == 0) { 147 getrf = dgetrf_; 148 output_paths[CACHE_LOADS] = OUTPUT_DIR "cache-loads.bin"; 149 output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses.bin"; 150 output_paths[CPU_CYCLES] = OUTPUT_DIR "cpu-cycles.bin"; 151 } else if (strcmp(argv[1], "naive") == 0) { 152 getrf = dgetrfnaive_; 153 output_paths[CACHE_LOADS] = OUTPUT_DIR "cache-loads-naive.bin"; 154 output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-naive.bin"; 155 output_paths[CPU_CYCLES] = OUTPUT_DIR "cpu-cycles-naive.bin"; 156 } else if (strcmp(argv[1], "unblocked") == 0) { 157 getrf = dgetrf2_; 158 output_paths[CACHE_LOADS] = OUTPUT_DIR "cache-loads-unblocked.bin"; 159 output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-unblocked.bin"; 160 output_paths[CPU_CYCLES] = OUTPUT_DIR "cpu-cycles-unblocked.bin"; 161 } else { 162 fprintf(stderr, "ERROR: unknown command \"%s\"\n", argv[1]); 163 fprintf(stderr, "USAGE: %s [standard|naive|unblocked]\n", argv[0]); 164 exit(EXIT_FAILURE); 165 } 166 167 // ======================================================================== 168 printf("INFO: Initializing random input data... "); 169 170 // This array will live for the entire duration of the program, 171 // so we might as well leak it 🤡 172 double *ref_data = malloc(sizeof(double)*MAX_N*MAX_N); 173 if (ref_data == NULL) { 174 fprintf(stderr, "ERROR: Buy more RAM!\n"); 175 exit(EXIT_FAILURE); 176 } 177 178 // Pseudorandom data given by the logistic map 179 double acc = LOGISTICS_INITIAL_CONDITION; 180 for (size_t i = 0; i < MAX_N*MAX_N; i++) { 181 ref_data[i] = acc; 182 acc = 1000. - acc*acc/500.; 183 } 184 185 printf("done!\n"); 186 187 // ======================================================================== 188 printf("INFO: Benchmarking the dgetrf on %ux%u matrices... (using %u threads)\n", 189 MAX_N, MAX_N, N_THREADS); 190 191 Benchmarker bench = benchmarker_new(ref_data); 192 benchmarker_run(&bench); 193 194 // ======================================================================== 195 printf("INFO: Done with benchmarking! Saving histograms to disk...\n"); 196 197 for (size_t i = 0; i < PERF_EVENT_COUNT; i++) { 198 FILE *output = fopen(output_paths[i], "w"); 199 if (output == NULL) { 200 fprintf(stderr, 201 "ERROR: Coundn't open output file \"%s\": %s", 202 output_paths[i], strerror(errno)); 203 return EXIT_FAILURE; 204 } 205 206 size_t written = fwrite(&histograms[i], sizeof(uint64_t), HISTOGRAM_SIZE, output); 207 if (written < HISTOGRAM_SIZE) { 208 fprintf(stderr, 209 "ERROR: Failed to write histogram to output file \"%s\": %s\n", 210 output_paths[i], strerror(errno)); 211 return EXIT_FAILURE; 212 } 213 }; 214 215 printf("INFO: Done!\n"); 216 return EXIT_SUCCESS; 217 }