numeric-linalg

Educational material on the SciPy implementation of numerical linear algebra algorithms

File Name Size Mode
cache-ratio.c 6076B -rw-r--r--
  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 
  4 #define __USE_GNU
  5 #include <sched.h>
  6 #include <pthread.h>
  7 
  8 #include <stdint.h>
  9 #include <stdbool.h>
 10 
 11 #include <string.h>
 12 #include <errno.h>
 13 
 14 #include "progress-bar.h"
 15 #include "perf.h"
 16 #include "config.h"
 17 
 18 #define HISTOGRAM_SIZE              (MAX_N/STEP)
 19 #define LOGISTICS_INITIAL_CONDITION (-800.)
 20 
 21 // X-macros for defining variables for each perf event
 22 #define EVENTS  EVENT(loads) EVENT(misses)
 23 
 24 #define EVENT(name) uint64_t histogram_##name[HISTOGRAM_SIZE];
 25 EVENTS
 26 #undef  EVENT
 27 
 28 void (*getrf)(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 29 
 30 ProgressBar progress = {
 31   .total = HISTOGRAM_SIZE,
 32   .count = 0,
 33   .mutex = PTHREAD_MUTEX_INITIALIZER,
 34 };
 35 
 36 // .data is a pointer because it should be allocated in the heap
 37 // (.data DOES NOT fit in the stack 🤡)
 38 typedef struct {
 39   double       *ref_data;
 40 
 41   // Input parameters for dgetrf
 42   double       *data;
 43   int32_t      ipiv[MAX_N];
 44   int          n, m, lda, info;
 45 
 46   size_t       id;
 47   PerfRecorder recorder;
 48 } Thread;
 49 
 50 typedef struct {
 51   Thread threads[N_THREADS];
 52 } Benchmarker;
 53 
 54 /*
 55  * LAPACK functions
 56  */
 57 extern void dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 58 extern void dgetrf2_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 59 
 60 PerfResult thread_run_benchmark(Thread *thread, size_t n)
 61 {
 62   // Reinitializes the values in the .data array: this avoids progressively
 63   // moving larger values to the beginning of the array, which would decrise
 64   // decrease the number of row interchanges required for computations
 65   memcpy(thread->data, thread->ref_data, sizeof(double)*n*n);
 66   thread->n = n; thread->m = n; thread->lda = n;
 67 
 68   perf_start_recording(&thread->recorder, thread->id);
 69   getrf(&thread->n, &thread->m, thread->data, &thread->lda,
 70         thread->ipiv, &thread->info);
 71   perf_stop_recording(&thread->recorder);
 72 
 73   return thread->recorder.result;
 74 }
 75 
 76 void *thread_benchmark(void *arg)
 77 {
 78   Thread *thread = (Thread*) arg;
 79 
 80  // We need to lock the running thread to some CPU so that we can mask
 81  // perf_event_open with this specific CPU
 82   cpu_set_t set;
 83   CPU_SET(thread->id, &set);
 84   if (sched_setaffinity(0, sizeof(set), &set) == -1) {
 85     fprintf(stderr, "ERROR: Could not lock CPU affinity for thread %lu\n",
 86             thread->id);
 87   }
 88 
 89   // Computations are distributed evenly across threads
 90   for (size_t n = 1 + thread->id*STEP; n < MAX_N; n += STEP*N_THREADS) {
 91     PerfResult result = thread_run_benchmark(thread, n);
 92     size_t i = (n - 1)/STEP;
 93 
 94     histogram_loads[i]  = result.cache_loads;
 95     histogram_misses[i] = result.cache_misses;
 96 
 97     progress_bar_inc(&progress);
 98   }
 99 
100   return NULL;
101 }
102 
103 Benchmarker benchmarker_new(double *ref_data)
104 {
105   Benchmarker bench = {0};
106 
107   // This array will live for the entire duration of the program,
108   // so we might as well leak it 🤡
109   double *data = malloc(sizeof(double)*MAX_N*MAX_N*N_THREADS);
110   if (data == NULL) {
111     fprintf(stderr, "ERROR: Buy more RAM!\n");
112     exit(EXIT_FAILURE);
113   }
114 
115   for (size_t i = 0; i < N_THREADS; i++) {
116     bench.threads[i].data     = data + i*MAX_N*MAX_N;
117     bench.threads[i].ref_data = ref_data;
118     bench.threads[i].id       = i;
119   }
120 
121   return bench;
122 }
123 
124 void benchmarker_run(Benchmarker *bench)
125 {
126   pthread_t handles[N_THREADS] = {0};
127   for (size_t i = 0; i < N_THREADS; i++) {
128     if (pthread_create(&handles[i], NULL,
129                        thread_benchmark,
130                        &bench->threads[i]) != 0) {
131       fprintf(stderr, "ERROR: Failed to spawn thread %lu!\n", i);
132       exit(EXIT_FAILURE);
133     }
134   }
135 
136   for (size_t i = 0; i < N_THREADS; i++) {
137     pthread_join(handles[i], NULL);
138   }
139 }
140 
141 
142 int main(int argc, char **argv)
143 {
144 #define EVENT(name) const char *output_path_##name;
145   EVENTS
146 #undef  EVENT
147 
148   if (argc < 2 || strcmp(argv[1], "standard") == 0) {
149     getrf = dgetrf_;
150 #define EVENT(name) \
151     output_path_##name = OUTPUT_DIR "cache-" #name ".bin";
152     EVENTS
153 #undef EVENT
154   } else if (strcmp(argv[1], "unblocked") == 0) {
155     getrf = dgetrf2_;
156 #define EVENT(name) \
157     output_path_##name = OUTPUT_DIR "unblocked-cache-" #name ".bin";
158     EVENTS
159 #undef EVENT
160   } else {
161     fprintf(stderr, "ERROR: unknown command \"%s\"\n", argv[1]);
162     fprintf(stderr, "USAGE: %s [standard|naive|unblocked]\n", argv[0]);
163     exit(EXIT_FAILURE);
164   }
165 
166   // ========================================================================
167   printf("INFO: Initializing random input data... ");
168 
169   // This array will live for the entire duration of the program,
170   // so we might as well leak it 🤡
171   double *ref_data = malloc(sizeof(double)*MAX_N*MAX_N);
172   if (ref_data == NULL) {
173     fprintf(stderr, "ERROR: Buy more RAM!\n");
174     exit(EXIT_FAILURE);
175   }
176 
177   // Pseudorandom data given by the logistic map
178   double acc = LOGISTICS_INITIAL_CONDITION;
179   for (size_t i = 0; i < MAX_N*MAX_N; i++) {
180     ref_data[i] = acc;
181     acc = 1000. - acc*acc/500.;
182   }
183 
184   printf("done!\n");
185 
186   // ========================================================================
187   printf("INFO: Benchmarking the cache miss ratio of dgetrf on %ux%u matrices... (using %u threads)\n",
188          MAX_N, MAX_N, N_THREADS);
189 
190   Benchmarker bench = benchmarker_new(ref_data);
191   benchmarker_run(&bench);
192 
193   // ========================================================================
194   printf("INFO: Done with benchmarking! Saving histograms to disk...\n");
195 
196 #define EVENT(name) \
197   do { \
198     FILE *output = fopen(output_path_##name, "w"); \
199     if (output == NULL) { \
200       fprintf(stderr, \
201               "ERROR: Coundn't open output file \"%s\": %s", \
202               output_path_##name, strerror(errno)); \
203       return EXIT_FAILURE; \
204     } \
205     size_t written = fwrite(&histogram_##name, sizeof(uint64_t), HISTOGRAM_SIZE, output); \
206     if (written < HISTOGRAM_SIZE) { \
207       fprintf(stderr, \
208               "ERROR: Failed to write histogram to output file \"%s\": %s\n", \
209               output_path_##name, strerror(errno)); \
210       return EXIT_FAILURE; \
211     } \
212   } while(0);
213   EVENTS
214 #undef EVENT
215 
216   printf("INFO: Done!\n");
217 
218   return EXIT_SUCCESS;
219 }