numeric-linalg

Educational material on the SciPy implementation of numerical linear algebra algorithms

File Name Size Mode
main.c 6722B -rw-r--r--
  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 
  4 #define __USE_GNU
  5 #include <sched.h>
  6 #include <pthread.h>
  7 
  8 #include <stdint.h>
  9 #include <stdbool.h>
 10 
 11 #include <string.h>
 12 #include <errno.h>
 13 
 14 #include "progress-bar.h"
 15 #include "perf.h"
 16 #include "config.h"
 17 
 18 #define HISTOGRAM_SIZE              (MAX_N/STEP)
 19 #define LOGISTICS_INITIAL_CONDITION (-800.)
 20 
 21 uint64_t histograms[PERF_EVENT_COUNT][HISTOGRAM_SIZE];
 22 void (*getrf)(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 23 
 24 ProgressBar progress = {
 25   .total = HISTOGRAM_SIZE,
 26   .count = 0,
 27   .mutex = PTHREAD_MUTEX_INITIALIZER,
 28 };
 29 
 30 // .data is a pointer because it should be allocated in the heap
 31 // (.data DOES NOT fit in the stack 🤡)
 32 typedef struct {
 33   double       *ref_data;
 34 
 35   // Input parameters for dgetrf
 36   double       *data;
 37   int32_t      ipiv[MAX_N];
 38   int          n, m, lda, info;
 39 
 40   size_t       id;
 41   PerfRecorder recorder;
 42 } Thread;
 43 
 44 typedef struct {
 45   Thread threads[N_THREADS];
 46 } Benchmarker;
 47 
 48 /*
 49  * LAPACK functions
 50  */
 51 extern void dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 52 extern void dgetrfnaive_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 53 extern void dgetrf2_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
 54 
 55 PerfResult thread_run_benchmark(Thread *thread, size_t n)
 56 {
 57   // Reinitializes the values in the .data array: this avoids progressively
 58   // moving larger values to the beginning of the array, which would decrise
 59   // decrease the number of row interchanges required for computations
 60   memcpy(thread->data, thread->ref_data, sizeof(double)*n*n);
 61   thread->n = n; thread->m = n; thread->lda = n;
 62 
 63   perf_start_recording(&thread->recorder, thread->id);
 64   getrf(&thread->n, &thread->m, thread->data, &thread->lda,
 65         thread->ipiv, &thread->info);
 66   return perf_stop_recording(&thread->recorder);
 67 }
 68 
 69 void *thread_benchmark(void *arg)
 70 {
 71   Thread *thread = (Thread*) arg;
 72 
 73   // We need to lock the running thread to some CPU so that we can mask
 74   // perf_event_open with this specific CPU
 75   cpu_set_t set;
 76   CPU_ZERO(&set);
 77   CPU_SET(thread->id, &set);
 78   if (sched_setaffinity(0, sizeof(set), &set) == -1) {
 79     fprintf(stderr, "ERROR: Could not lock CPU affinity for thread %lu\n",
 80             thread->id);
 81   }
 82 
 83   // Computations are distributed evenly across threads
 84   for (size_t n = 1 + thread->id*STEP; n < MAX_N; n += STEP*N_THREADS) {
 85     PerfResult result = thread_run_benchmark(thread, n);
 86     size_t i = (n - 1)/STEP;
 87 
 88     static_assert(PERF_EVENT_COUNT == 3,
 89                   "We need to update the assignements "
 90                   "if we add more perf events");
 91     histograms[CACHE_LOADS][i]  = result.cache_loads;
 92     histograms[CACHE_MISSES][i] = result.cache_misses;
 93     histograms[CPU_CYCLES][i]   = result.cpu_cycles;
 94 
 95     progress_bar_inc(&progress);
 96   }
 97 
 98   return NULL;
 99 }
100 
101 Benchmarker benchmarker_new(double *ref_data)
102 {
103   Benchmarker bench = {0};
104 
105   // This array will live for the entire duration of the program,
106   // so we might as well leak it 🤡
107   double *data = malloc(sizeof(double)*MAX_N*MAX_N*N_THREADS);
108   if (data == NULL) {
109     fprintf(stderr, "ERROR: Buy more RAM!\n");
110     exit(EXIT_FAILURE);
111   }
112 
113   for (size_t i = 0; i < N_THREADS; i++) {
114     bench.threads[i].data     = data + i*MAX_N*MAX_N;
115     bench.threads[i].ref_data = ref_data;
116     bench.threads[i].id       = i;
117   }
118 
119   return bench;
120 }
121 
122 void benchmarker_run(Benchmarker *bench)
123 {
124   pthread_t handles[N_THREADS] = {0};
125   for (size_t i = 0; i < N_THREADS; i++) {
126     if (pthread_create(&handles[i], NULL,
127                        thread_benchmark,
128                        &bench->threads[i]) != 0) {
129       fprintf(stderr, "ERROR: Failed to spawn thread %lu!\n", i);
130       exit(EXIT_FAILURE);
131     }
132   }
133 
134   for (size_t i = 0; i < N_THREADS; i++) {
135     pthread_join(handles[i], NULL);
136   }
137 }
138 
139 
140 int main(int argc, char **argv)
141 {
142   const char *output_paths[PERF_EVENT_COUNT];
143 
144   static_assert(PERF_EVENT_COUNT == 3,
145                 "We need to update argv parsing if we add more perf events");
146   if (argc < 2 || strcmp(argv[1], "standard") == 0) {
147     getrf = dgetrf_;
148     output_paths[CACHE_LOADS]  = OUTPUT_DIR "cache-loads.bin";
149     output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses.bin";
150     output_paths[CPU_CYCLES]   = OUTPUT_DIR "cpu-cycles.bin";
151   } else if (strcmp(argv[1], "naive") == 0) {
152     getrf = dgetrfnaive_;
153     output_paths[CACHE_LOADS]  = OUTPUT_DIR "cache-loads-naive.bin";
154     output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-naive.bin";
155     output_paths[CPU_CYCLES]   = OUTPUT_DIR "cpu-cycles-naive.bin";
156   } else if (strcmp(argv[1], "unblocked") == 0) {
157     getrf = dgetrf2_;
158     output_paths[CACHE_LOADS]  = OUTPUT_DIR "cache-loads-unblocked.bin";
159     output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-unblocked.bin";
160     output_paths[CPU_CYCLES]   = OUTPUT_DIR "cpu-cycles-unblocked.bin";
161   } else {
162     fprintf(stderr, "ERROR: unknown command \"%s\"\n", argv[1]);
163     fprintf(stderr, "USAGE: %s [standard|naive|unblocked]\n", argv[0]);
164     exit(EXIT_FAILURE);
165   }
166 
167   // ========================================================================
168   printf("INFO: Initializing random input data... ");
169 
170   // This array will live for the entire duration of the program,
171   // so we might as well leak it 🤡
172   double *ref_data = malloc(sizeof(double)*MAX_N*MAX_N);
173   if (ref_data == NULL) {
174     fprintf(stderr, "ERROR: Buy more RAM!\n");
175     exit(EXIT_FAILURE);
176   }
177 
178   // Pseudorandom data given by the logistic map
179   double acc = LOGISTICS_INITIAL_CONDITION;
180   for (size_t i = 0; i < MAX_N*MAX_N; i++) {
181     ref_data[i] = acc;
182     acc = 1000. - acc*acc/500.;
183   }
184 
185   printf("done!\n");
186 
187   // ========================================================================
188   printf("INFO: Benchmarking the dgetrf on %ux%u matrices... (using %u threads)\n",
189          MAX_N, MAX_N, N_THREADS);
190 
191   Benchmarker bench = benchmarker_new(ref_data);
192   benchmarker_run(&bench);
193 
194   // ========================================================================
195   printf("INFO: Done with benchmarking! Saving histograms to disk...\n");
196 
197   for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
198     FILE *output = fopen(output_paths[i], "w");
199     if (output == NULL) {
200       fprintf(stderr,
201               "ERROR: Coundn't open output file \"%s\": %s",
202               output_paths[i], strerror(errno));
203       return EXIT_FAILURE;
204     }
205 
206     size_t written = fwrite(&histograms[i], sizeof(uint64_t), HISTOGRAM_SIZE, output);
207     if (written < HISTOGRAM_SIZE) {
208       fprintf(stderr,
209               "ERROR: Failed to write histogram to output file \"%s\": %s\n",
210               output_paths[i], strerror(errno));
211       return EXIT_FAILURE;
212     }
213   };
214 
215   printf("INFO: Done!\n");
216   return EXIT_SUCCESS;
217 }