matmul.c

Matrix multiplication helper library
git clone git://git.finwo.net/lib/matmul.c
Log | Files | Refs | README | LICENSE

commit 176fed04bc21ee1573d28992ec8f3205f3d1c73a
parent e8df912dce76b1125439ba6f62e722e42490b47a
Author: finwo <finwo@pm.me>
Date:   Thu, 16 Apr 2026 12:43:38 +0200

Add OpenMP support

Diffstat:
M src/matmul.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 196 insertions(+), 0 deletions(-)

diff --git a/src/matmul.c b/src/matmul.c @@ -42,6 +42,10 @@ #include <stdlib.h> #include <string.h> +#ifdef _OPENMP +#include <omp.h> +#endif + static matmul_feature_t g_feature = MATMUL_SCALAR; matmul_feature_t matmul_get_feature(void) { @@ -65,6 +69,9 @@ const char *matmul_get_feature_name(matmul_feature_t feat) { } int matmul_scalar_f32_f32_f32(size_t m, size_t n, size_t p, const float *A, const float *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -79,6 +86,9 @@ int matmul_scalar_f32_f32_f32(size_t m, size_t n, size_t p, const float *A, cons } int matmul_scalar_f32_f32_f64(size_t m, size_t n, size_t p, const float *A, const float *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -93,6 +103,9 @@ int matmul_scalar_f32_f32_f64(size_t m, size_t n, size_t p, const float *A, cons } int matmul_scalar_f32_f32_i8(size_t m, size_t n, size_t p, const float *A, const float *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -111,6 +124,9 @@ int matmul_scalar_f32_f32_i8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_f32_u8(size_t m, size_t n, size_t p, const float *A, const float *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -129,6 +145,9 @@ int matmul_scalar_f32_f32_u8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_f64_f32(size_t m, size_t n, size_t p, const float *A, const double *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i 
< m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -143,6 +162,9 @@ int matmul_scalar_f32_f64_f32(size_t m, size_t n, size_t p, const float *A, cons } int matmul_scalar_f32_f64_f64(size_t m, size_t n, size_t p, const float *A, const double *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -157,6 +179,9 @@ int matmul_scalar_f32_f64_f64(size_t m, size_t n, size_t p, const float *A, cons } int matmul_scalar_f32_f64_i8(size_t m, size_t n, size_t p, const float *A, const double *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -175,6 +200,9 @@ int matmul_scalar_f32_f64_i8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_f64_u8(size_t m, size_t n, size_t p, const float *A, const double *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -193,6 +221,9 @@ int matmul_scalar_f32_f64_u8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_i8_f32(size_t m, size_t n, size_t p, const float *A, const int8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -207,6 +238,9 @@ int matmul_scalar_f32_i8_f32(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_i8_f64(size_t m, size_t n, size_t p, const float *A, const int8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -221,6 +255,9 @@ int matmul_scalar_f32_i8_f64(size_t m, size_t n, size_t p, const float 
*A, const } int matmul_scalar_f32_i8_i8(size_t m, size_t n, size_t p, const float *A, const int8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -239,6 +276,9 @@ int matmul_scalar_f32_i8_i8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_i8_u8(size_t m, size_t n, size_t p, const float *A, const int8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -257,6 +297,9 @@ int matmul_scalar_f32_i8_u8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_u8_f32(size_t m, size_t n, size_t p, const float *A, const uint8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -271,6 +314,9 @@ int matmul_scalar_f32_u8_f32(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_u8_f64(size_t m, size_t n, size_t p, const float *A, const uint8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -285,6 +331,9 @@ int matmul_scalar_f32_u8_f64(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_u8_i8(size_t m, size_t n, size_t p, const float *A, const uint8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -303,6 +352,9 @@ int matmul_scalar_f32_u8_i8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f32_u8_u8(size_t m, size_t n, size_t p, const float *A, const uint8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma 
omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -321,6 +373,9 @@ int matmul_scalar_f32_u8_u8(size_t m, size_t n, size_t p, const float *A, const } int matmul_scalar_f64_f32_f32(size_t m, size_t n, size_t p, const double *A, const float *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -335,6 +390,9 @@ int matmul_scalar_f64_f32_f32(size_t m, size_t n, size_t p, const double *A, con } int matmul_scalar_f64_f32_f64(size_t m, size_t n, size_t p, const double *A, const float *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -349,6 +407,9 @@ int matmul_scalar_f64_f32_f64(size_t m, size_t n, size_t p, const double *A, con } int matmul_scalar_f64_f32_i8(size_t m, size_t n, size_t p, const double *A, const float *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -367,6 +428,9 @@ int matmul_scalar_f64_f32_i8(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_f32_u8(size_t m, size_t n, size_t p, const double *A, const float *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -385,6 +449,9 @@ int matmul_scalar_f64_f32_u8(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_f64_f32(size_t m, size_t n, size_t p, const double *A, const double *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -399,6 +466,9 @@ int 
matmul_scalar_f64_f64_f32(size_t m, size_t n, size_t p, const double *A, con } int matmul_scalar_f64_f64_f64(size_t m, size_t n, size_t p, const double *A, const double *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -413,6 +483,9 @@ int matmul_scalar_f64_f64_f64(size_t m, size_t n, size_t p, const double *A, con } int matmul_scalar_f64_f64_i8(size_t m, size_t n, size_t p, const double *A, const double *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -431,6 +504,9 @@ int matmul_scalar_f64_f64_i8(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_f64_u8(size_t m, size_t n, size_t p, const double *A, const double *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -449,6 +525,9 @@ int matmul_scalar_f64_f64_u8(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_i8_f32(size_t m, size_t n, size_t p, const double *A, const int8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -463,6 +542,9 @@ int matmul_scalar_f64_i8_f32(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_i8_f64(size_t m, size_t n, size_t p, const double *A, const int8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -477,6 +559,9 @@ int matmul_scalar_f64_i8_f64(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_i8_i8(size_t m, size_t n, size_t p, const 
double *A, const int8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -495,6 +580,9 @@ int matmul_scalar_f64_i8_i8(size_t m, size_t n, size_t p, const double *A, const } int matmul_scalar_f64_i8_u8(size_t m, size_t n, size_t p, const double *A, const int8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -513,6 +601,9 @@ int matmul_scalar_f64_i8_u8(size_t m, size_t n, size_t p, const double *A, const } int matmul_scalar_f64_u8_f32(size_t m, size_t n, size_t p, const double *A, const uint8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -527,6 +618,9 @@ int matmul_scalar_f64_u8_f32(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_u8_f64(size_t m, size_t n, size_t p, const double *A, const uint8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -541,6 +635,9 @@ int matmul_scalar_f64_u8_f64(size_t m, size_t n, size_t p, const double *A, cons } int matmul_scalar_f64_u8_i8(size_t m, size_t n, size_t p, const double *A, const uint8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -559,6 +656,9 @@ int matmul_scalar_f64_u8_i8(size_t m, size_t n, size_t p, const double *A, const } int matmul_scalar_f64_u8_u8(size_t m, size_t n, size_t p, const double *A, const uint8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; 
i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -577,6 +677,9 @@ int matmul_scalar_f64_u8_u8(size_t m, size_t n, size_t p, const double *A, const } int matmul_scalar_i8_f32_f32(size_t m, size_t n, size_t p, const int8_t *A, const float *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -591,6 +694,9 @@ int matmul_scalar_i8_f32_f32(size_t m, size_t n, size_t p, const int8_t *A, cons } int matmul_scalar_i8_f32_f64(size_t m, size_t n, size_t p, const int8_t *A, const float *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -605,6 +711,9 @@ int matmul_scalar_i8_f32_f64(size_t m, size_t n, size_t p, const int8_t *A, cons } int matmul_scalar_i8_f32_i8(size_t m, size_t n, size_t p, const int8_t *A, const float *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -623,6 +732,9 @@ int matmul_scalar_i8_f32_i8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_f32_u8(size_t m, size_t n, size_t p, const int8_t *A, const float *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -641,6 +753,9 @@ int matmul_scalar_i8_f32_u8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_f64_f32(size_t m, size_t n, size_t p, const int8_t *A, const double *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -655,6 +770,9 @@ int matmul_scalar_i8_f64_f32(size_t m, size_t n, size_t p, const int8_t *A, cons 
} int matmul_scalar_i8_f64_f64(size_t m, size_t n, size_t p, const int8_t *A, const double *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -669,6 +787,9 @@ int matmul_scalar_i8_f64_f64(size_t m, size_t n, size_t p, const int8_t *A, cons } int matmul_scalar_i8_f64_i8(size_t m, size_t n, size_t p, const int8_t *A, const double *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -687,6 +808,9 @@ int matmul_scalar_i8_f64_i8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_f64_u8(size_t m, size_t n, size_t p, const int8_t *A, const double *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -705,6 +829,9 @@ int matmul_scalar_i8_f64_u8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_i8_f32(size_t m, size_t n, size_t p, const int8_t *A, const int8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -719,6 +846,9 @@ int matmul_scalar_i8_i8_f32(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_i8_f64(size_t m, size_t n, size_t p, const int8_t *A, const int8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -733,6 +863,9 @@ int matmul_scalar_i8_i8_f64(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_i8_i8(size_t m, size_t n, size_t p, const int8_t *A, const int8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp 
parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -751,6 +884,9 @@ int matmul_scalar_i8_i8_i8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_i8_u8(size_t m, size_t n, size_t p, const int8_t *A, const int8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -769,6 +905,9 @@ int matmul_scalar_i8_i8_u8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_u8_f32(size_t m, size_t n, size_t p, const int8_t *A, const uint8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -783,6 +922,9 @@ int matmul_scalar_i8_u8_f32(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_u8_f64(size_t m, size_t n, size_t p, const int8_t *A, const uint8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -797,6 +939,9 @@ int matmul_scalar_i8_u8_f64(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_u8_i8(size_t m, size_t n, size_t p, const int8_t *A, const uint8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -815,6 +960,9 @@ int matmul_scalar_i8_u8_i8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_i8_u8_u8(size_t m, size_t n, size_t p, const int8_t *A, const uint8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -833,6 +981,9 @@ int 
matmul_scalar_i8_u8_u8(size_t m, size_t n, size_t p, const int8_t *A, const } int matmul_scalar_u8_f32_f32(size_t m, size_t n, size_t p, const uint8_t *A, const float *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -847,6 +998,9 @@ int matmul_scalar_u8_f32_f32(size_t m, size_t n, size_t p, const uint8_t *A, con } int matmul_scalar_u8_f32_f64(size_t m, size_t n, size_t p, const uint8_t *A, const float *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -861,6 +1015,9 @@ int matmul_scalar_u8_f32_f64(size_t m, size_t n, size_t p, const uint8_t *A, con } int matmul_scalar_u8_f32_i8(size_t m, size_t n, size_t p, const uint8_t *A, const float *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -879,6 +1036,9 @@ int matmul_scalar_u8_f32_i8(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_f32_u8(size_t m, size_t n, size_t p, const uint8_t *A, const float *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -897,6 +1057,9 @@ int matmul_scalar_u8_f32_u8(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_f64_f32(size_t m, size_t n, size_t p, const uint8_t *A, const double *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -911,6 +1074,9 @@ int matmul_scalar_u8_f64_f32(size_t m, size_t n, size_t p, const uint8_t *A, con } int matmul_scalar_u8_f64_f64(size_t m, size_t n, size_t p, const 
uint8_t *A, const double *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -925,6 +1091,9 @@ int matmul_scalar_u8_f64_f64(size_t m, size_t n, size_t p, const uint8_t *A, con } int matmul_scalar_u8_f64_i8(size_t m, size_t n, size_t p, const uint8_t *A, const double *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -943,6 +1112,9 @@ int matmul_scalar_u8_f64_i8(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_f64_u8(size_t m, size_t n, size_t p, const uint8_t *A, const double *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -961,6 +1133,9 @@ int matmul_scalar_u8_f64_u8(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_i8_f32(size_t m, size_t n, size_t p, const uint8_t *A, const int8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -975,6 +1150,9 @@ int matmul_scalar_u8_i8_f32(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_i8_f64(size_t m, size_t n, size_t p, const uint8_t *A, const int8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -989,6 +1167,9 @@ int matmul_scalar_u8_i8_f64(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_i8_i8(size_t m, size_t n, size_t p, const uint8_t *A, const int8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; 
i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -1007,6 +1188,9 @@ int matmul_scalar_u8_i8_i8(size_t m, size_t n, size_t p, const uint8_t *A, const } int matmul_scalar_u8_i8_u8(size_t m, size_t n, size_t p, const uint8_t *A, const int8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -1025,6 +1209,9 @@ int matmul_scalar_u8_i8_u8(size_t m, size_t n, size_t p, const uint8_t *A, const } int matmul_scalar_u8_u8_f32(size_t m, size_t n, size_t p, const uint8_t *A, const uint8_t *B, float *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { float sum = 0; @@ -1039,6 +1226,9 @@ int matmul_scalar_u8_u8_f32(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_u8_f64(size_t m, size_t n, size_t p, const uint8_t *A, const uint8_t *B, double *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { double sum = 0; @@ -1053,6 +1243,9 @@ int matmul_scalar_u8_u8_f64(size_t m, size_t n, size_t p, const uint8_t *A, cons } int matmul_scalar_u8_u8_i8(size_t m, size_t n, size_t p, const uint8_t *A, const uint8_t *B, int8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0; @@ -1071,6 +1264,9 @@ int matmul_scalar_u8_u8_i8(size_t m, size_t n, size_t p, const uint8_t *A, const } int matmul_scalar_u8_u8_u8(size_t m, size_t n, size_t p, const uint8_t *A, const uint8_t *B, uint8_t *C, double scale) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (size_t i = 0; i < m; i++) { for (size_t j = 0; j < p; j++) { int sum = 0;