matmul.c

Matrix multiplication helper library
git clone git://git.finwo.net/lib/matmul.c
Log | Files | Refs | README | LICENSE

benchmark.c (7464B)


      1 #define _POSIX_C_SOURCE 199309L
      2 #include <stdint.h>
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 #include <time.h>
      7 
      8 #include "../src/matmul.h"
      9 
     10 #define RUNS 100
     11 
     12 static struct timespec timespec_now(void) {
     13   struct timespec ts;
     14   clock_gettime(CLOCK_MONOTONIC, &ts);
     15   return ts;
     16 }
     17 
     18 static double timespec_diff_ms(struct timespec start, struct timespec end) {
     19   return (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_nsec - start.tv_nsec) / 1e6;
     20 }
     21 
     22 static int compare_double(const void *a, const void *b) {
     23   double da = *(const double *)a;
     24   double db = *(const double *)b;
     25   return (da > db) - (da < db);
     26 }
     27 
     28 static double percentile(double *sorted, double p, int n) {
     29   double idx  = p / 100.0 * (n - 1);
     30   int    lo   = (int)idx;
     31   int    hi   = lo + 1;
     32   double frac = idx - lo;
     33   if (hi >= n) hi = n - 1;
     34   return sorted[lo] * (1.0 - frac) + sorted[hi] * frac;
     35 }
     36 
     37 static void bench_u8_i8_u8(size_t m, size_t n, size_t p, int runs) {
     38   uint8_t *A     = malloc(m * n);
     39   int8_t  *B     = malloc(n * p);
     40   uint8_t *C     = malloc(m * p);
     41   uint8_t *Cwarm = malloc(m * p);
     42   double   times[RUNS];
     43 
     44   if (!A || !B || !C || !Cwarm) {
     45     fprintf(stderr, "OOM for %zu x %zu\n", m, n);
     46     free(A);
     47     free(B);
     48     free(C);
     49     free(Cwarm);
     50     return;
     51   }
     52 
     53   for (size_t i = 0; i < m * n; i++) A[i] = (uint8_t)(rand() % 256);
     54   for (size_t i = 0; i < n * p; i++) B[i] = (int8_t)(rand() % 256);
     55   memset(C, 0, m * p);
     56   memset(Cwarm, 0, m * p);
     57 
     58   matmul_u8_i8_u8(m, n, p, A, B, Cwarm, 0.0);
     59 
     60   int actual_runs = runs;
     61   if (m >= 4096) actual_runs = 3;
     62 
     63   for (int r = 0; r < actual_runs; r++) {
     64     memset(C, 0, m * p);
     65     struct timespec start = timespec_now();
     66     matmul_u8_i8_u8(m, n, p, A, B, C, 0.0);
     67     struct timespec end = timespec_now();
     68     times[r]            = timespec_diff_ms(start, end);
     69   }
     70 
     71   qsort(times, actual_runs, sizeof(double), compare_double);
     72 
     73   double gflops = 2.0 * m * n * p / (percentile(times, 50, actual_runs) * 1e6);
     74 
     75   printf("%8zu x %8zu | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f | %8.1f\n", m, n, percentile(times, 1, actual_runs),
     76          percentile(times, 5, actual_runs), percentile(times, 50, actual_runs), percentile(times, 95, actual_runs),
     77          percentile(times, 99, actual_runs), gflops);
     78 
     79   free(A);
     80   free(B);
     81   free(C);
     82   free(Cwarm);
     83 }
     84 
     85 static void bench_f32_f32_f32(size_t m, size_t n, size_t p, int runs) {
     86   float *A     = malloc(m * n * sizeof(float));
     87   float *B     = malloc(n * p * sizeof(float));
     88   float *C     = malloc(m * p * sizeof(float));
     89   float *Cwarm = malloc(m * p * sizeof(float));
     90   double times[RUNS];
     91 
     92   if (!A || !B || !C || !Cwarm) {
     93     fprintf(stderr, "OOM for %zu x %zu\n", m, n);
     94     free(A);
     95     free(B);
     96     free(C);
     97     free(Cwarm);
     98     return;
     99   }
    100 
    101   for (size_t i = 0; i < m * n; i++) A[i] = (float)(rand() % 256);
    102   for (size_t i = 0; i < n * p; i++) B[i] = (float)(rand() % 256);
    103   memset(C, 0, m * p * sizeof(float));
    104   memset(Cwarm, 0, m * p * sizeof(float));
    105 
    106   matmul_f32_f32_f32(m, n, p, A, B, Cwarm, 0.0);
    107 
    108   int actual_runs = runs;
    109   if (m >= 4096) actual_runs = 3;
    110 
    111   for (int r = 0; r < actual_runs; r++) {
    112     memset(C, 0, m * p * sizeof(float));
    113     struct timespec start = timespec_now();
    114     matmul_f32_f32_f32(m, n, p, A, B, C, 0.0);
    115     struct timespec end = timespec_now();
    116     times[r]            = timespec_diff_ms(start, end);
    117   }
    118 
    119   qsort(times, actual_runs, sizeof(double), compare_double);
    120 
    121   double gflops = 2.0 * m * n * p / (percentile(times, 50, actual_runs) * 1e6);
    122 
    123   printf("%8zu x %8zu | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f | %8.1f\n", m, n, percentile(times, 1, actual_runs),
    124          percentile(times, 5, actual_runs), percentile(times, 50, actual_runs), percentile(times, 95, actual_runs),
    125          percentile(times, 99, actual_runs), gflops);
    126 
    127   free(A);
    128   free(B);
    129   free(C);
    130   free(Cwarm);
    131 }
    132 
    133 static void bench_f64_f64_f64(size_t m, size_t n, size_t p, int runs) {
    134   double *A     = malloc(m * n * sizeof(double));
    135   double *B     = malloc(n * p * sizeof(double));
    136   double *C     = malloc(m * p * sizeof(double));
    137   double *Cwarm = malloc(m * p * sizeof(double));
    138   double  times[RUNS];
    139 
    140   if (!A || !B || !C || !Cwarm) {
    141     fprintf(stderr, "OOM for %zu x %zu\n", m, n);
    142     free(A);
    143     free(B);
    144     free(C);
    145     free(Cwarm);
    146     return;
    147   }
    148 
    149   for (size_t i = 0; i < m * n; i++) A[i] = (double)(rand() % 256);
    150   for (size_t i = 0; i < n * p; i++) B[i] = (double)(rand() % 256);
    151   memset(C, 0, m * p * sizeof(double));
    152   memset(Cwarm, 0, m * p * sizeof(double));
    153 
    154   matmul_f64_f64_f64(m, n, p, A, B, Cwarm, 0.0);
    155 
    156   int actual_runs = runs;
    157   if (m >= 4096) actual_runs = 3;
    158 
    159   for (int r = 0; r < actual_runs; r++) {
    160     memset(C, 0, m * p * sizeof(double));
    161     struct timespec start = timespec_now();
    162     matmul_f64_f64_f64(m, n, p, A, B, C, 0.0);
    163     struct timespec end = timespec_now();
    164     times[r]            = timespec_diff_ms(start, end);
    165   }
    166 
    167   qsort(times, actual_runs, sizeof(double), compare_double);
    168 
    169   double gflops = 2.0 * m * n * p / (percentile(times, 50, actual_runs) * 1e6);
    170 
    171   printf("%8zu x %8zu | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f | %8.1f\n", m, n, percentile(times, 1, actual_runs),
    172          percentile(times, 5, actual_runs), percentile(times, 50, actual_runs), percentile(times, 95, actual_runs),
    173          percentile(times, 99, actual_runs), gflops);
    174 
    175   free(A);
    176   free(B);
    177   free(C);
    178   free(Cwarm);
    179 }
    180 
    181 int main(void) {
    182   srand(42);
    183 
    184   const size_t sizes[][3] = {
    185       {16, 16, 16}, {64, 64, 64}, {256, 256, 256}, {1024, 1024, 1024}, {4096, 4096, 4096},
    186   };
    187   const int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    188 
    189   printf("Benchmark: u8_i8_u8 matmul, %d runs per size\n", RUNS);
    190   printf("--------------------------------------------------------------------------------\n");
    191   printf("%8s | %8s | %8s | %8s | %8s | %8s | %8s\n", "M x N", "1% (ms)", "5% (ms)", "50% (ms)", "95% (ms)", "99% (ms)",
    192          "GFLOPS");
    193   printf("--------------------------------------------------------------------------------\n");
    194   for (int i = 0; i < num_sizes; i++) {
    195     bench_u8_i8_u8(sizes[i][0], sizes[i][1], sizes[i][2], RUNS);
    196   }
    197   printf("--------------------------------------------------------------------------------\n");
    198   printf("\n");
    199 
    200   printf("Benchmark: f32_f32_f32 matmul, %d runs per size\n", RUNS);
    201   printf("--------------------------------------------------------------------------------\n");
    202   printf("%8s | %8s | %8s | %8s | %8s | %8s | %8s\n", "M x N", "1% (ms)", "5% (ms)", "50% (ms)", "95% (ms)", "99% (ms)",
    203          "GFLOPS");
    204   printf("--------------------------------------------------------------------------------\n");
    205   for (int i = 0; i < num_sizes; i++) {
    206     bench_f32_f32_f32(sizes[i][0], sizes[i][1], sizes[i][2], RUNS);
    207   }
    208   printf("--------------------------------------------------------------------------------\n");
    209   printf("\n");
    210 
    211   printf("Benchmark: f64_f64_f64 matmul, %d runs per size\n", RUNS);
    212   printf("--------------------------------------------------------------------------------\n");
    213   printf("%8s | %8s | %8s | %8s | %8s | %8s | %8s\n", "M x N", "1% (ms)", "5% (ms)", "50% (ms)", "95% (ms)", "99% (ms)",
    214          "GFLOPS");
    215   printf("--------------------------------------------------------------------------------\n");
    216   for (int i = 0; i < num_sizes; i++) {
    217     bench_f64_f64_f64(sizes[i][0], sizes[i][1], sizes[i][2], RUNS);
    218   }
    219   printf("--------------------------------------------------------------------------------\n");
    220 
    221   return 0;
    222 }