numeric-linalg

Educational material on the SciPy implementation of numerical linear algebra algorithms

Name	Size	Mode
..
getrf/benchmarks/src/main.c	6K	-rw-r--r--

#include <stdio.h>
#include <stdlib.h>

#define __USE_GNU
#include <sched.h>
#include <pthread.h>

#include <stdint.h>
#include <stdbool.h>

#include <string.h>
#include <errno.h>

#include "progress-bar.h"
#include "perf.h"
#include "config.h"

// number of slots in each histogram: slot i receives the measurements for
// the matrix of order n = 1 + i*STEP (see thread_benchmark)
#define HISTOGRAM_SIZE              (MAX_N/STEP)
// first term of the logistic-map sequence that main uses to fill the
// reference input matrix
#define LOGISTICS_INITIAL_CONDITION (-800.)

// measurements gathered by the worker threads: one row per perf event
// (indexed by CACHE_LOADS, CACHE_MISSES and CPU_CYCLES) and one column per
// benchmarked matrix size. Every row is dumped to its own file by main
uint64_t histograms[PERF_EVENT_COUNT][HISTOGRAM_SIZE];
// factorization routine under test: assigned once in main according to
// argv[1], then called by thread_run_benchmark
void (*getrf)(int *m, int *n, double *A, int *lda, int *ipiv, int *info);

// progress indicator shared by all worker threads: thread_benchmark calls
// progress_bar_inc on it after each matrix size it finishes
// NOTE(review): .mutex presumably serializes those increments -- confirm
// in progress-bar.h
ProgressBar progress = {
  .total = HISTOGRAM_SIZE,
  .count = 0,
  .mutex = PTHREAD_MUTEX_INITIALIZER,
};

// Per-thread benchmark state.
//
// .data and .ref_data are pointers because the matrices they refer to are
// allocated on the heap (they DO NOT fit in the stack)
typedef struct {
  // pristine input matrix: only ever read, it is copied into .data before
  // each run
  double       *ref_data;

  // input parameters for dgetrf
  double       *data;        // working copy of the matrix handed to getrf
  // pivot indices: declared as plain int so that the array matches the
  // `int *ipiv` parameter of the getrf prototypes used in this file
  int          ipiv[MAX_N];
  int          n, m, lda, info;

  // index of this thread in Benchmarker.threads; it is also the CPU the
  // thread pins itself to and the value handed to perf_start_recording
  size_t       id;
  PerfRecorder recorder;
} Thread;

// the whole benchmark: one Thread state per worker thread, indexed by
// thread id
typedef struct {
  Thread threads[N_THREADS];
} Benchmarker;

/*
 * LAPACK functions
 *
 * Externally defined factorization routines, all sharing the signature of
 * the `getrf` function pointer. main selects one of them according to
 * argv[1]: "standard", "naive" and "unblocked" respectively, in the order
 * of the declarations below.
 */
extern void dgetrf_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
extern void dgetrfnaive_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);
extern void dgetrf2_(int *m, int *n, double *A, int *lda, int *ipiv, int *info);

/*
 * Runs the selected getrf routine once on an n-by-n matrix and returns the
 * perf counters recorded around that single call.
 *
 * thread: per-thread state; .data must have room for n*n doubles and .ipiv
 *         for n entries
 * n:      order of the (square) matrix to factorize
 */
PerfResult thread_run_benchmark(Thread *thread, size_t n)
{
  // reinitializes the values in the .data array: this avoids progressively
  // moving larger values to the beginning of the array, which would decrease
  // the number of row interchanges required for computations
  memcpy(thread->data, thread->ref_data, sizeof(double)*n*n);
  thread->m = (int) n; thread->n = (int) n; thread->lda = (int) n;

  perf_start_recording(&thread->recorder, thread->id);
  // arguments follow the order of the prototype: m first, then n
  getrf(&thread->m, &thread->n, thread->data, &thread->lda,
        thread->ipiv, &thread->info);
  PerfResult result = perf_stop_recording(&thread->recorder);

  // by LAPACK convention a negative info means that argument number -info
  // was rejected, i.e. nothing was actually factorized. The check is made
  // after the recording stopped so that it cannot affect the counters
  // NOTE(review): assumes all three routines follow this convention
  if (thread->info < 0) {
    fprintf(stderr,
            "WARNING: getrf rejected argument %d for n = %zu, "
            "the measurement is meaningless\n",
            -thread->info, n);
  }

  return result;
}

void *thread_benchmark(void *arg)
{
  Thread *thread = (Thread*) arg;

  // we need to lock the running thread to some CPU so that we can mask
  // perf_event_open with this specific CPU
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(thread->id, &set);
  if (sched_setaffinity(0, sizeof(set), &set) == -1) {
    fprintf(stderr, "ERROR: Could not lock CPU affinity for thread %lu\n",
            thread->id);
  }

  // computations are distributed evenly across threads
  for (size_t n = 1 + thread->id*STEP; n < MAX_N; n += STEP*N_THREADS) {
    PerfResult result = thread_run_benchmark(thread, n);
    size_t i = (n - 1)/STEP;

    static_assert(PERF_EVENT_COUNT == 3,
                  "We need to update the assignements "
                  "if we add more perf events");
    histograms[CACHE_LOADS][i]  = result.cache_loads;
    histograms[CACHE_MISSES][i] = result.cache_misses;
    histograms[CPU_CYCLES][i]   = result.cpu_cycles;

    progress_bar_inc(&progress);
  }

  return NULL;
}

Benchmarker benchmarker_new(double *ref_data)
{
  Benchmarker bench = {0};

  // this array will live for the entire duration of the program,
  // so we might as well leak it 🤡
  double *data = malloc(sizeof(double)*MAX_N*MAX_N*N_THREADS);
  if (data == NULL) {
    fprintf(stderr, "ERROR: Buy more RAM!\n");
    exit(EXIT_FAILURE);
  }

  for (size_t i = 0; i < N_THREADS; i++) {
    bench.threads[i].data     = data + i*MAX_N*MAX_N;
    bench.threads[i].ref_data = ref_data;
    bench.threads[i].id       = i;
  }

  return bench;
}

/*
 * Runs the whole benchmark: spawns one worker per entry of bench->threads
 * and returns once all of them have finished.
 *
 * Terminates the program if a thread cannot be spawned or joined.
 */
void benchmarker_run(Benchmarker *bench)
{
  pthread_t handles[N_THREADS] = {0};
  for (size_t i = 0; i < N_THREADS; i++) {
    if (pthread_create(&handles[i], NULL,
                       thread_benchmark,
                       &bench->threads[i]) != 0) {
      fprintf(stderr, "ERROR: Failed to spawn thread %zu!\n", i);
      exit(EXIT_FAILURE);
    }
  }

  for (size_t i = 0; i < N_THREADS; i++) {
    // pthread_join returns the error number instead of setting errno
    int err = pthread_join(handles[i], NULL);
    if (err != 0) {
      // without a successful join we cannot know whether the thread has
      // finished filling its part of the histograms
      fprintf(stderr, "ERROR: Failed to join thread %zu: %s\n",
              i, strerror(err));
      exit(EXIT_FAILURE);
    }
  }
}


/*
 * Writes one histogram (HISTOGRAM_SIZE uint64_t counters, in their
 * in-memory representation) to the file at `path`.
 *
 * Returns true if every counter was written and the file was closed without
 * error; otherwise reports the failure on stderr and returns false.
 */
static bool save_histogram(const char *path, const uint64_t *histogram)
{
  // "wb": the payload is binary, no newline translation must happen
  FILE *output = fopen(path, "wb");
  if (output == NULL) {
    fprintf(stderr,
            "ERROR: Couldn't open output file \"%s\": %s\n",
            path, strerror(errno));
    return false;
  }

  bool ok = true;
  int saved_errno = 0;

  size_t written = fwrite(histogram, sizeof(uint64_t), HISTOGRAM_SIZE, output);
  if (written < HISTOGRAM_SIZE) {
    ok = false;
    saved_errno = errno; // fclose below may overwrite errno
  }

  // fclose flushes the buffered data, so a failure here also means that
  // the file is incomplete
  if (fclose(output) != 0 && ok) {
    ok = false;
    saved_errno = errno;
  }

  if (!ok) {
    fprintf(stderr,
            "ERROR: Failed to write histogram to output file \"%s\": %s\n",
            path, strerror(saved_errno));
  }
  return ok;
}

/*
 * Usage: <program> [standard|naive|unblocked]
 *
 * Benchmarks the selected getrf routine (defaults to "standard") and saves
 * one histogram file per perf event under OUTPUT_DIR.
 *
 * Returns EXIT_SUCCESS if all the histograms were saved, EXIT_FAILURE
 * otherwise.
 */
int main(int argc, char **argv)
{
  const char *output_paths[PERF_EVENT_COUNT];

  static_assert(PERF_EVENT_COUNT == 3,
                "We need to update argv parsing if we add more perf events");
  if (argc < 2 || strcmp(argv[1], "standard") == 0) {
    getrf = dgetrf_;
    output_paths[CACHE_LOADS]  = OUTPUT_DIR "cache-loads.bin";
    output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses.bin";
    output_paths[CPU_CYCLES]   = OUTPUT_DIR "cpu-cycles.bin";
  } else if (strcmp(argv[1], "naive") == 0) {
    getrf = dgetrfnaive_;
    output_paths[CACHE_LOADS]  = OUTPUT_DIR "cache-loads-naive.bin";
    output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-naive.bin";
    output_paths[CPU_CYCLES]   = OUTPUT_DIR "cpu-cycles-naive.bin";
  } else if (strcmp(argv[1], "unblocked") == 0) {
    getrf = dgetrf2_;
    output_paths[CACHE_LOADS]  = OUTPUT_DIR "cache-loads-unblocked.bin";
    output_paths[CACHE_MISSES] = OUTPUT_DIR "cache-misses-unblocked.bin";
    output_paths[CPU_CYCLES]   = OUTPUT_DIR "cpu-cycles-unblocked.bin";
  } else {
    fprintf(stderr, "ERROR: unknown command \"%s\"\n", argv[1]);
    fprintf(stderr, "USAGE: %s [standard|naive|unblocked]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  // ========================================================================
  printf("INFO: Initializing random input data... ");
  // no newline yet: flush so that the message shows up before the work
  fflush(stdout);

  // computed in size_t so that the product cannot overflow a narrower type
  const size_t ref_len = (size_t) (MAX_N) * (MAX_N);

  // this array will live for the entire duration of the program,
  // so we might as well leak it
  double *ref_data = malloc(sizeof(double)*ref_len);
  if (ref_data == NULL) {
    fprintf(stderr, "ERROR: Buy more RAM!\n");
    exit(EXIT_FAILURE);
  }

  // pseudorandom data given by the logistic map
  double acc = LOGISTICS_INITIAL_CONDITION;
  for (size_t i = 0; i < ref_len; i++) {
    ref_data[i] = acc;
    acc = 1000. - acc*acc/500.;
  }

  printf("done!\n");

  // ========================================================================
  // the casts make the arguments match %u whatever the type of the macros
  printf("INFO: Benchmarking the dgetrf on %ux%u matrices... (using %u threads)\n",
         (unsigned) (MAX_N), (unsigned) (MAX_N), (unsigned) (N_THREADS));

  Benchmarker bench = benchmarker_new(ref_data);
  benchmarker_run(&bench);

  // ========================================================================
  printf("INFO: Done with benchmarking! Saving histograms to disk...\n");

  for (size_t i = 0; i < PERF_EVENT_COUNT; i++) {
    if (!save_histogram(output_paths[i], histograms[i])) {
      return EXIT_FAILURE;
    }
  }

  printf("INFO: Done!\n");
  return EXIT_SUCCESS;
}