Skip to content

Commit ccb5be1

Browse files
committed
feat: naive benchmarks
1 parent d4c70a9 commit ccb5be1

File tree

10 files changed

+446
-0
lines changed

10 files changed

+446
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Running NaiveMatmulMNKBench 2048x2048x2048 benchmark
2+
Total time (s): 7.9635
3+
Total reps: 1
4+
Total floating point operations: 17179869184
5+
Estimated GFLOPS/sec: 2.15733
6+
--------------------------------------------------
7+
Running NaiveMatmulBrMNKBench 1024x1024x1024 br=16 benchmark
8+
Total time (s): 45.542
9+
Total reps: 1
10+
Total floating point operations: 34359738368
11+
Estimated GFLOPS/sec: 0.754462
12+
--------------------------------------------------
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Running NaiveSigmoidPrimitiveBench 50x50 benchmark
2+
Total time (s): 3
3+
Total reps: 968308
4+
Total number of elements: 2420770000
5+
Total amount of processed data (GiB): 18.0361
6+
Bandwidth (GiB/s) 6.01205
7+
--------------------------------------------------
8+
Running NaiveSigmoidPrimitiveBench 64x64 benchmark
9+
Total time (s): 3
10+
Total reps: 593757
11+
Total number of elements: 2432028672
12+
Total amount of processed data (GiB): 18.12
13+
Bandwidth (GiB/s) 6.04001
14+
--------------------------------------------------
15+
Running NaiveSigmoidPrimitiveBench 512x512 benchmark
16+
Total time (s): 3.00025
17+
Total reps: 9206
18+
Total number of elements: 2413297664
19+
Total amount of processed data (GiB): 17.9805
20+
Bandwidth (GiB/s) 5.99299
21+
--------------------------------------------------
22+
Running NaiveSigmoidPrimitiveBench 2048x2048 benchmark
23+
Total time (s): 3.00451
24+
Total reps: 567
25+
Total number of elements: 2378170368
26+
Total amount of processed data (GiB): 17.7188
27+
Bandwidth (GiB/s) 5.89738
28+
--------------------------------------------------

include/mlc/benchmarks/all_benchmarks.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
#include <mlc/benchmarks/TensorOperation.bench.h>
66
#include <mlc/benchmarks/matmul/Matmul_br_m_n_k.bench.h>
77
#include <mlc/benchmarks/matmul/Matmul_m_n_k.bench.h>
8+
#include <mlc/benchmarks/matmul/naive_matmul_br_m_n_k.bench.h>
9+
#include <mlc/benchmarks/matmul/naive_matmul_m_n_k.bench.h>
810
#include <mlc/benchmarks/unary/fast_sigmoid_primitive.bench.h>
911
#include <mlc/benchmarks/unary/identity_primitive.bench.h>
1012
#include <mlc/benchmarks/unary/identity_trans_primitive.bench.h>
13+
#include <mlc/benchmarks/unary/naive_sigmoid_primitive.bench.h>
1114
#include <mlc/benchmarks/unary/reciprocal_primitive.bench.h>
1215
#include <mlc/benchmarks/unary/relu_primitive.bench.h>
1316
#include <mlc/benchmarks/unary/relu_trans_primitive.bench.h>
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#ifndef NAIVE_MATMUL_BR_M_N_K_BENCH_H
2+
#define NAIVE_MATMUL_BR_M_N_K_BENCH_H
3+
#include <mlc/benchmarks/Benchmark.h>
4+
5+
namespace mini_jit
6+
{
7+
namespace benchmarks
8+
{
9+
/**
10+
* @brief Benchmark for naive matrix multiplication using BRGEMM.
11+
*/
12+
class NaiveMatmulBrMNKBench : public Benchmark
13+
{
14+
public:
15+
/**
16+
* @brief Constructor for the benchmark for naive matrix multiplication using BRGEMM.
17+
* @param run_time The time to run the benchmark in seconds.
18+
* @param m number of rows in A and C.
19+
* @param n number of columns in B and C.
20+
* @param k number of columns in A and rows in B.
21+
* @param br_size The size of the batch-reduce.
22+
*/
23+
NaiveMatmulBrMNKBench(double run_time,
24+
int m,
25+
int n,
26+
int k,
27+
int br_size);
28+
//! Destructor
29+
~NaiveMatmulBrMNKBench() override = default;
30+
//! Runs the benchmark.
31+
void run() override;
32+
33+
private:
34+
int m_M;
35+
int m_N;
36+
int m_K;
37+
int m_br_size;
38+
double m_run_time;
39+
float* m_A;
40+
float* m_B;
41+
float* m_C;
42+
};
43+
44+
} // namespace benchmarks
45+
} // namespace mini_jit
46+
47+
#endif // NAIVE_MATMUL_BR_M_N_K_BENCH_H
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#ifndef NAIVE_MATMUL_M_N_K_BENCH_H
2+
#define NAIVE_MATMUL_M_N_K_BENCH_H
3+
#include <mlc/benchmarks/Benchmark.h>
4+
5+
namespace mini_jit
6+
{
7+
namespace benchmarks
8+
{
9+
class NaiveMatmulMNKBench : public Benchmark
10+
{
11+
public:
12+
/**
13+
* @brief Constructor for the benchmark for naive matrix multiplication using GEMM.
14+
* @param run_time The time to run the benchmark in seconds.
15+
* @param m number of rows in A and C.
16+
* @param n number of columns in B and C.
17+
* @param k number of columns in A and rows in B.
18+
*/
19+
NaiveMatmulMNKBench(double run_time,
20+
int m,
21+
int n,
22+
int k);
23+
//! Destructor
24+
~NaiveMatmulMNKBench() override = default;
25+
//! Runs the benchmark.
26+
void run() override;
27+
28+
private:
29+
int m_M;
30+
int m_N;
31+
int m_K;
32+
double m_run_time;
33+
float* m_A;
34+
float* m_B;
35+
float* m_C;
36+
};
37+
38+
} // namespace benchmarks
39+
} // namespace mini_jit
40+
41+
#endif // NAIVE_MATMUL_M_N_K_BENCH_H
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#ifndef NAIVE_SIGMOID_PRIMITIVE_BENCH_H
2+
#define NAIVE_SIGMOID_PRIMITIVE_BENCH_H
3+
#include <cstdint>
4+
#include <mlc/benchmarks/Benchmark.h>
5+
6+
namespace mini_jit
7+
{
8+
namespace benchmarks
9+
{
10+
class NaiveSigmoidPrimitiveBench : public Benchmark
11+
{
12+
public:
13+
/**
14+
* @brief Constructor for the benchmark for the naive sigmoid primitive.
15+
* @param runTime The time to run the benchmark in seconds.
16+
* @param m number of rows in A and B.
17+
* @param n number of columns in A and B.
18+
*/
19+
NaiveSigmoidPrimitiveBench(double runTime,
20+
uint32_t m,
21+
uint32_t n);
22+
//! Destructor
23+
~NaiveSigmoidPrimitiveBench() override = default;
24+
//! Runs the benchmark.
25+
void run() override;
26+
27+
private:
28+
uint32_t m_M;
29+
uint32_t m_N;
30+
double m_runTime;
31+
float* m_A;
32+
float* m_B;
33+
};
34+
35+
} // namespace benchmarks
36+
} // namespace mini_jit
37+
38+
#endif // NAIVE_SIGMOID_PRIMITIVE_BENCH_H

src/benchmarks.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,7 @@ int main(int argc, char* argv[])
621621
bool has_gemm = false;
622622
bool has_brgemm = false;
623623
bool has_matmul = false;
624+
bool has_naive_matmul = false;
624625
bool has_unary = false;
625626
bool has_tensor_operations = false;
626627
bool has_shared_tensor_operations = false;
@@ -629,6 +630,7 @@ int main(int argc, char* argv[])
629630
bool has_opt_einsum_benchmark = false;
630631
bool has_reciprocal = false;
631632
bool has_sigmoid = false;
633+
bool has_naive_sigmoid = false;
632634
for (int i = 1; i < argc; ++i)
633635
{
634636
if (strcmp(argv[i], "gemm") == 0)
@@ -637,6 +639,8 @@ int main(int argc, char* argv[])
637639
has_brgemm = true;
638640
else if (strcmp(argv[i], "matmul") == 0)
639641
has_matmul = true;
642+
else if (strcmp(argv[i], "naive-matmul") == 0)
643+
has_naive_matmul = true;
640644
else if (strcmp(argv[i], "unary") == 0)
641645
has_unary = true;
642646
else if (strcmp(argv[i], "top") == 0)
@@ -653,6 +657,8 @@ int main(int argc, char* argv[])
653657
has_reciprocal = true;
654658
else if (strcmp(argv[i], "sigmoid") == 0)
655659
has_sigmoid = true;
660+
else if (strcmp(argv[i], "naive-sigmoid") == 0)
661+
has_naive_sigmoid = true;
656662
else if (strcmp(argv[i], "help") == 0)
657663
// Keep this list in sync with the strcmp() chain above: every accepted mode must be advertised here.
std::cout << "Usage: " << argv[0] << " [gemm|brgemm|matmul|naive-matmul|unary|top|top-shared|top-opt|einsum|opt-einsum|reciprocal|sigmoid|naive-sigmoid]" << std::endl;
658664
else
@@ -683,6 +689,16 @@ int main(int argc, char* argv[])
683689
matmul_bm.close();
684690
}
685691

692+
if (has_naive_matmul)
693+
{
694+
mini_jit::benchmarks::NaiveMatmulMNKBench bench_mnk(3.0, 2048, 2048, 2048);
695+
mini_jit::benchmarks::NaiveMatmulBrMNKBench bench_brmnk(3.0, 1024, 1024, 1024, 16);
696+
std::ofstream matmul_bm("benchmarks/naive_matmul_benchmarks.txt");
697+
print_throughput(bench_mnk, matmul_bm, "NaiveMatmulMNKBench 2048x2048x2048");
698+
print_throughput(bench_brmnk, matmul_bm, "NaiveMatmulBrMNKBench 1024x1024x1024 br=16");
699+
matmul_bm.close();
700+
}
701+
686702
if (has_unary)
687703
{
688704
const double RUN_TIME = 3.0;
@@ -1024,5 +1040,22 @@ int main(int argc, char* argv[])
10241040
sigmoid_bm.close();
10251041
}
10261042

1043+
if (has_naive_sigmoid)
1044+
{
1045+
const double RUN_TIME = 3.0;
1046+
std::ofstream sigmoid_bm("benchmarks/naive_sigmoid_benchmark.txt");
1047+
1048+
mini_jit::benchmarks::NaiveSigmoidPrimitiveBench bench_naive_sigmoid_50_50(RUN_TIME, 50, 50);
1049+
mini_jit::benchmarks::NaiveSigmoidPrimitiveBench bench_naive_sigmoid_64_64(RUN_TIME, 64, 64);
1050+
mini_jit::benchmarks::NaiveSigmoidPrimitiveBench bench_naive_sigmoid_512_512(RUN_TIME, 512, 512);
1051+
mini_jit::benchmarks::NaiveSigmoidPrimitiveBench bench_naive_sigmoid_2048_2048(RUN_TIME, 2048, 2048);
1052+
print_bandwidth(bench_naive_sigmoid_50_50, sigmoid_bm, "NaiveSigmoidPrimitiveBench 50x50");
1053+
print_bandwidth(bench_naive_sigmoid_64_64, sigmoid_bm, "NaiveSigmoidPrimitiveBench 64x64");
1054+
print_bandwidth(bench_naive_sigmoid_512_512, sigmoid_bm, "NaiveSigmoidPrimitiveBench 512x512");
1055+
print_bandwidth(bench_naive_sigmoid_2048_2048, sigmoid_bm, "NaiveSigmoidPrimitiveBench 2048x2048");
1056+
1057+
sigmoid_bm.close();
1058+
}
1059+
10271060
return 0;
10281061
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#include <chrono>
#include <cstddef>
#include <mlc/Brgemm.h>
#include <mlc/Kernel.h>
#include <mlc/benchmarks/Benchmark.h>
#include <mlc/benchmarks/matmul/naive_matmul_br_m_n_k.bench.h>
#include <random>
#include <vector>
7+
8+
/**
 * @brief Stores the problem description; no matrix memory is allocated here.
 *
 * The three buffer pointers start out null and are only given storage for the
 * duration of run().
 *
 * @param run_time The time to run the benchmark in seconds.
 * @param m number of rows in A and C.
 * @param n number of columns in B and C.
 * @param k number of columns in A and rows in B.
 * @param br_size The size of the batch-reduce.
 */
mini_jit::benchmarks::NaiveMatmulBrMNKBench::NaiveMatmulBrMNKBench(double run_time,
                                                                   int m,
                                                                   int n,
                                                                   int k,
                                                                   int br_size)
    : Benchmark(),
      // Initializers are listed in member declaration order.
      m_M(m),
      m_N(n),
      m_K(k),
      m_br_size(br_size),
      m_run_time(run_time),
      m_A(nullptr),
      m_B(nullptr),
      m_C(nullptr)
{
}
21+
22+
void mini_jit::benchmarks::NaiveMatmulBrMNKBench::run()
23+
{
24+
m_A = new float[m_M * m_K * m_br_size];
25+
m_B = new float[m_K * m_N * m_br_size];
26+
m_C = new float[m_M * m_N];
27+
28+
// Initialize matrices A and B with random values
29+
std::random_device rd;
30+
std::mt19937 gen(rd());
31+
std::uniform_real_distribution<float> dist(-10.0f, 10.0f);
32+
33+
for (int i = 0; i < m_M * m_K * m_br_size; i++)
34+
{
35+
m_A[i] = dist(gen);
36+
}
37+
for (int i = 0; i < m_K * m_N * m_br_size; i++)
38+
{
39+
m_B[i] = dist(gen);
40+
}
41+
// Initialize matrix C with zeros
42+
for (int i = 0; i < m_M * m_N; ++i)
43+
{
44+
m_C[i] = 0.0f;
45+
}
46+
47+
// RUN
48+
long l_num_reps = 0;
49+
auto l_start_time = std::chrono::high_resolution_clock::now();
50+
double l_elapsed = 0.0;
51+
double l_runTimeMs = m_run_time * 1e6;
52+
do
53+
{
54+
for (int col = 0; col < m_N; ++col)
55+
{
56+
for (int row = 0; row < m_M; ++row)
57+
{
58+
float sum = 0.0f;
59+
for (int br = 0; br < m_br_size; ++br)
60+
{
61+
for (int k = 0; k < m_K; ++k)
62+
{
63+
sum += m_A[br * m_M * m_K + row + k * m_M] * m_B[br * m_K * m_N + k + col * m_K];
64+
}
65+
}
66+
m_C[row + col * m_M] += sum;
67+
}
68+
}
69+
++l_num_reps;
70+
auto l_now = std::chrono::high_resolution_clock::now();
71+
l_elapsed = std::chrono::duration_cast<std::chrono::microseconds>(
72+
l_now - l_start_time)
73+
.count();
74+
} while (l_elapsed < l_runTimeMs);
75+
l_elapsed /= 1e6; // Convert to seconds
76+
// END RUN
77+
78+
// Calculate metrics
79+
long l_totalOperations = 2.0 * m_M * m_N * m_K * l_num_reps * m_br_size;
80+
double l_gflops = ((double)l_totalOperations) / (l_elapsed * 1e9);
81+
82+
// Store the results
83+
m_benchmarkResult.numReps = l_num_reps;
84+
m_benchmarkResult.elapsedSeconds = l_elapsed;
85+
m_benchmarkResult.totalNumberElements = m_M * m_N * m_K * l_num_reps;
86+
m_benchmarkResult.totalOperations = l_totalOperations;
87+
m_benchmarkResult.gflops = l_gflops;
88+
89+
delete[] m_A;
90+
delete[] m_B;
91+
delete[] m_C;
92+
}

0 commit comments

Comments
 (0)