test-barrier.cpp source code [llama.cpp/tests/test-barrier.cpp]

#include "ggml.h"
#include "ggml-cpu.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
11
12	#define MAX_NARGS 2
13
14	int main(int argc, char *argv[]) {
15
16	int n_threads = std::max(a: `1`, b: std::min(a: `4`, b: (int) std::thread::hardware_concurrency()));
17	int n_rounds = `100`;
18
19	if (argc > `1`) {
20	n_threads = std::atoi(nptr: argv[`1`]);
21	}
22
23	if (argc > `2`) {
24	n_rounds = std::atoi(nptr: argv[`2`]);
25	}
26
27	struct ggml_init_params params = {
28	/ .mem_size = / `1024``1024``1024`,
29	/ .mem_buffer = / NULL,
30	/ .no_alloc = / false,
31	};
32
33	struct ggml_context * ctx = ggml_init(params);
34
35	// Create graph
36	struct ggml_cgraph * gf = ggml_new_graph(ctx);
37
38	// Lots of small, parallel ops where barriers in between will dominate
39	struct ggml_tensor * out = ggml_new_tensor_1d(ctx, type: GGML_TYPE_F32, ne0: `64`);
40	for (int i = `0`; i < `1000`; i++) {
41	struct ggml_tensor * a = ggml_new_tensor_2d(ctx, type: GGML_TYPE_Q4_0, ne0: `64`, ne1: `128`);
42	out = ggml_mul_mat(ctx, a, b: out);
43
44	struct ggml_tensor * d = ggml_new_tensor_2d(ctx, type: GGML_TYPE_Q4_0, ne0: `128`, ne1: `64`);
45	out = ggml_mul_mat(ctx, a: d, b: out);
46	}
47
48	ggml_build_forward_expand(cgraph: gf, tensor: out);
49	int n_nodes = ggml_graph_n_nodes(cgraph: gf);
50
51	// Create threadpool
52	struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
53	struct ggml_threadpool* threadpool = ggml_threadpool_new(params: &tpp);
54	if (!threadpool) {
55	fprintf(stderr, format: "threadpool create failed : n_threads %d\n", n_threads);
56	exit(status: `1`);
57	}
58
59	// Create compute plan
60	struct ggml_cplan cplan = ggml_graph_plan(cgraph: gf, n_threads, threadpool);
61
62	std::vector<uint8_t> work_data(cplan.work_size);
63	cplan.work_data = work_data.data();
64
65	std::cerr << "graph-compute with"
66	<< "\n n_threads: " << n_threads
67	<< "\n n_nodes: " << n_nodes
68	<< "\n n_rounds: " << n_rounds
69	<< "\n";
70	// ggml_graph_print(gf);
71
72	// Warmup
73	ggml_graph_compute(cgraph: gf, cplan: &cplan);
74
75	auto t0 = std::chrono::high_resolution_clock::now();
76
77	for (int i=`0`; i < n_rounds; i++) {
78	ggml_graph_compute(cgraph: gf, cplan: &cplan);
79	}
80
81	auto t1 = std::chrono::high_resolution_clock::now();
82
83	auto usec = std::chrono::duration_cast<std::chrono::microseconds>(d: t1 -t0).count();
84	auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(d: t1 -t0).count();
85	std::cerr << "graph-compute took " << usec << " usec "
86	<< "\n " << (float) usec / n_rounds << " usec per-iter"
87	<< "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
88	<< "\n";
89
90	ggml_threadpool_free(threadpool);
91	ggml_free(ctx);
92
93	return `0`;
94	}
95

Browse the source code of llama.cpp/tests/test-barrier.cpp