benchmark.c (7464B)
1 #define _POSIX_C_SOURCE 199309L 2 #include <stdint.h> 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 #include <time.h> 7 8 #include "../src/matmul.h" 9 10 #define RUNS 100 11 12 static struct timespec timespec_now(void) { 13 struct timespec ts; 14 clock_gettime(CLOCK_MONOTONIC, &ts); 15 return ts; 16 } 17 18 static double timespec_diff_ms(struct timespec start, struct timespec end) { 19 return (end.tv_sec - start.tv_sec) * 1000.0 + (end.tv_nsec - start.tv_nsec) / 1e6; 20 } 21 22 static int compare_double(const void *a, const void *b) { 23 double da = *(const double *)a; 24 double db = *(const double *)b; 25 return (da > db) - (da < db); 26 } 27 28 static double percentile(double *sorted, double p, int n) { 29 double idx = p / 100.0 * (n - 1); 30 int lo = (int)idx; 31 int hi = lo + 1; 32 double frac = idx - lo; 33 if (hi >= n) hi = n - 1; 34 return sorted[lo] * (1.0 - frac) + sorted[hi] * frac; 35 } 36 37 static void bench_u8_i8_u8(size_t m, size_t n, size_t p, int runs) { 38 uint8_t *A = malloc(m * n); 39 int8_t *B = malloc(n * p); 40 uint8_t *C = malloc(m * p); 41 uint8_t *Cwarm = malloc(m * p); 42 double times[RUNS]; 43 44 if (!A || !B || !C || !Cwarm) { 45 fprintf(stderr, "OOM for %zu x %zu\n", m, n); 46 free(A); 47 free(B); 48 free(C); 49 free(Cwarm); 50 return; 51 } 52 53 for (size_t i = 0; i < m * n; i++) A[i] = (uint8_t)(rand() % 256); 54 for (size_t i = 0; i < n * p; i++) B[i] = (int8_t)(rand() % 256); 55 memset(C, 0, m * p); 56 memset(Cwarm, 0, m * p); 57 58 matmul_u8_i8_u8(m, n, p, A, B, Cwarm, 0.0); 59 60 int actual_runs = runs; 61 if (m >= 4096) actual_runs = 3; 62 63 for (int r = 0; r < actual_runs; r++) { 64 memset(C, 0, m * p); 65 struct timespec start = timespec_now(); 66 matmul_u8_i8_u8(m, n, p, A, B, C, 0.0); 67 struct timespec end = timespec_now(); 68 times[r] = timespec_diff_ms(start, end); 69 } 70 71 qsort(times, actual_runs, sizeof(double), compare_double); 72 73 double gflops = 2.0 * m * n * p / (percentile(times, 50, actual_runs) * 1e6); 74 75 printf("%8zu x %8zu | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f | %8.1f\n", m, n, percentile(times, 1, actual_runs), 76 percentile(times, 5, actual_runs), percentile(times, 50, actual_runs), percentile(times, 95, actual_runs), 77 percentile(times, 99, actual_runs), gflops); 78 79 free(A); 80 free(B); 81 free(C); 82 free(Cwarm); 83 } 84 85 static void bench_f32_f32_f32(size_t m, size_t n, size_t p, int runs) { 86 float *A = malloc(m * n * sizeof(float)); 87 float *B = malloc(n * p * sizeof(float)); 88 float *C = malloc(m * p * sizeof(float)); 89 float *Cwarm = malloc(m * p * sizeof(float)); 90 double times[RUNS]; 91 92 if (!A || !B || !C || !Cwarm) { 93 fprintf(stderr, "OOM for %zu x %zu\n", m, n); 94 free(A); 95 free(B); 96 free(C); 97 free(Cwarm); 98 return; 99 } 100 101 for (size_t i = 0; i < m * n; i++) A[i] = (float)(rand() % 256); 102 for (size_t i = 0; i < n * p; i++) B[i] = (float)(rand() % 256); 103 memset(C, 0, m * p * sizeof(float)); 104 memset(Cwarm, 0, m * p * sizeof(float)); 105 106 matmul_f32_f32_f32(m, n, p, A, B, Cwarm, 0.0); 107 108 int actual_runs = runs; 109 if (m >= 4096) actual_runs = 3; 110 111 for (int r = 0; r < actual_runs; r++) { 112 memset(C, 0, m * p * sizeof(float)); 113 struct timespec start = timespec_now(); 114 matmul_f32_f32_f32(m, n, p, A, B, C, 0.0); 115 struct timespec end = timespec_now(); 116 times[r] = timespec_diff_ms(start, end); 117 } 118 119 qsort(times, actual_runs, sizeof(double), compare_double); 120 121 double gflops = 2.0 * m * n * p / (percentile(times, 50, actual_runs) * 1e6); 122 123 printf("%8zu x %8zu | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f | %8.1f\n", m, n, percentile(times, 1, actual_runs), 124 percentile(times, 5, actual_runs), percentile(times, 50, actual_runs), percentile(times, 95, actual_runs), 125 percentile(times, 99, actual_runs), gflops); 126 127 free(A); 128 free(B); 129 free(C); 130 free(Cwarm); 131 } 132 133 static void bench_f64_f64_f64(size_t m, size_t n, size_t p, int runs) { 134 double *A = malloc(m * n * sizeof(double)); 135 double *B = malloc(n * p * sizeof(double)); 136 double *C = malloc(m * p * sizeof(double)); 137 double *Cwarm = malloc(m * p * sizeof(double)); 138 double times[RUNS]; 139 140 if (!A || !B || !C || !Cwarm) { 141 fprintf(stderr, "OOM for %zu x %zu\n", m, n); 142 free(A); 143 free(B); 144 free(C); 145 free(Cwarm); 146 return; 147 } 148 149 for (size_t i = 0; i < m * n; i++) A[i] = (double)(rand() % 256); 150 for (size_t i = 0; i < n * p; i++) B[i] = (double)(rand() % 256); 151 memset(C, 0, m * p * sizeof(double)); 152 memset(Cwarm, 0, m * p * sizeof(double)); 153 154 matmul_f64_f64_f64(m, n, p, A, B, Cwarm, 0.0); 155 156 int actual_runs = runs; 157 if (m >= 4096) actual_runs = 3; 158 159 for (int r = 0; r < actual_runs; r++) { 160 memset(C, 0, m * p * sizeof(double)); 161 struct timespec start = timespec_now(); 162 matmul_f64_f64_f64(m, n, p, A, B, C, 0.0); 163 struct timespec end = timespec_now(); 164 times[r] = timespec_diff_ms(start, end); 165 } 166 167 qsort(times, actual_runs, sizeof(double), compare_double); 168 169 double gflops = 2.0 * m * n * p / (percentile(times, 50, actual_runs) * 1e6); 170 171 printf("%8zu x %8zu | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f | %8.1f\n", m, n, percentile(times, 1, actual_runs), 172 percentile(times, 5, actual_runs), percentile(times, 50, actual_runs), percentile(times, 95, actual_runs), 173 percentile(times, 99, actual_runs), gflops); 174 175 free(A); 176 free(B); 177 free(C); 178 free(Cwarm); 179 } 180 181 int main(void) { 182 srand(42); 183 184 const size_t sizes[][3] = { 185 {16, 16, 16}, {64, 64, 64}, {256, 256, 256}, {1024, 1024, 1024}, {4096, 4096, 4096}, 186 }; 187 const int num_sizes = sizeof(sizes) / sizeof(sizes[0]); 188 189 printf("Benchmark: u8_i8_u8 matmul, %d runs per size\n", RUNS); 190 printf("--------------------------------------------------------------------------------\n"); 191 printf("%8s | %8s | %8s | %8s | %8s | %8s | %8s\n", "M x N", "1% (ms)", "5% (ms)", "50% (ms)", "95% (ms)", "99% (ms)", 192 "GFLOPS"); 193 printf("--------------------------------------------------------------------------------\n"); 194 for (int i = 0; i < num_sizes; i++) { 195 bench_u8_i8_u8(sizes[i][0], sizes[i][1], sizes[i][2], RUNS); 196 } 197 printf("--------------------------------------------------------------------------------\n"); 198 printf("\n"); 199 200 printf("Benchmark: f32_f32_f32 matmul, %d runs per size\n", RUNS); 201 printf("--------------------------------------------------------------------------------\n"); 202 printf("%8s | %8s | %8s | %8s | %8s | %8s | %8s\n", "M x N", "1% (ms)", "5% (ms)", "50% (ms)", "95% (ms)", "99% (ms)", 203 "GFLOPS"); 204 printf("--------------------------------------------------------------------------------\n"); 205 for (int i = 0; i < num_sizes; i++) { 206 bench_f32_f32_f32(sizes[i][0], sizes[i][1], sizes[i][2], RUNS); 207 } 208 printf("--------------------------------------------------------------------------------\n"); 209 printf("\n"); 210 211 printf("Benchmark: f64_f64_f64 matmul, %d runs per size\n", RUNS); 212 printf("--------------------------------------------------------------------------------\n"); 213 printf("%8s | %8s | %8s | %8s | %8s | %8s | %8s\n", "M x N", "1% (ms)", "5% (ms)", "50% (ms)", "95% (ms)", "99% (ms)", 214 "GFLOPS"); 215 printf("--------------------------------------------------------------------------------\n"); 216 for (int i = 0; i < num_sizes; i++) { 217 bench_f64_f64_f64(sizes[i][0], sizes[i][1], sizes[i][2], RUNS); 218 } 219 printf("--------------------------------------------------------------------------------\n"); 220 221 return 0; 222 }