#include "common.h"
#include "llama.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cmath>
#include <cstdio>
#include <ctime>
#include <random>
#include <string>
#include <vector>
#define DEBUG_POS 5

static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
    if (!with_data) return;
    printf("%s: %s[0] = [", __func__, t->name);
    for (size_t i = 0; i <= DEBUG_POS; i++) {
        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
    }
    printf(" ... ]\n");
}

namespace PCA {

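// PCA implemented with the power-iteration method, expressed as ggml compute
// graphs so the work can run either on the CPU backend or on a GPU backend.
// The idea: square the input matrix once, then repeatedly multiply a candidate
// eigenvector by that square matrix, renormalizing after every step; the
// estimate converges towards the dominant eigenvector, which is used as the
// principal direction of the input.
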
// input params for PCA computations
struct pca_params {
    int n_threads = 1;
    int n_batch = 20; // number of iterations to do in one batch; the larger the batch, the more memory is used
    int n_iterations = 1000;
    float tolerance = 1e-7;

    // for debugging
    int i_layer = 0;
    int n_layers = 0;
};

// result from each iteration
struct pca_result {
    struct ggml_tensor * calculated_square = NULL;  // the square matrix, only computed on the first batch
    std::vector<struct ggml_tensor *> eigenvectors; // eigenvector estimate after each iteration (device tensors)
    std::vector<float> distances;                   // distance between consecutive estimates, per iteration
};

struct pca_model {
    ggml_backend_t backend = NULL;
    ggml_backend_buffer_t buffer;
    struct ggml_context * ctx;      // context to compute graph on target device
    struct ggml_context * ctx_host; // host context to store results

    // tensors on target device
    struct ggml_tensor * dev_input;
    struct ggml_tensor * dev_square;
    struct ggml_tensor * dev_eigenvector;

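    // select a backend (CUDA if compiled in and usable, otherwise CPU), create
    // the device tensors, upload the input data and seed dev_eigenvector with
    // a random unit-length vector as the starting point for power iteration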
    pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
        fprintf(stderr, "%s: using CUDA backend\n", __func__);
        backend = ggml_backend_cuda_init(0); // init device 0
        if (!backend) {
            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
        }
#endif

// TODO: enable Metal support when support for GGML_OP_SQRT is added
// #ifdef GGML_USE_METAL
//         fprintf(stderr, "%s: using Metal backend\n", __func__);
//         backend = ggml_backend_metal_init();
//         if (!backend) {
//             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
//         }
// #endif

        // if there is no GPU backend, fall back to the CPU backend
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        const int num_tensors = 4;
        struct ggml_init_params params {
            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx = ggml_init(params);

        auto n_samples = t_input->ne[0];
        auto n_embd    = t_input->ne[1];

        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,    n_embd);
        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        ggml_set_name(dev_input,       "dev_input");
        ggml_set_name(dev_square,      "dev_square");
        ggml_set_name(dev_eigenvector, "dev_eigenvector");
        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));

        // initialize eigenvector to random normalized vector
        {
            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
            std::uniform_real_distribution<float> distribution(0.0, 1.0);
            float sum_sqr = 0.0; // for normalizing random_vec
            for (size_t i = 0; i < random_vec.size(); ++i) {
                float f = distribution(generator);
                sum_sqr += f * f;
                random_vec[i] = f;
            }
            // normalize it
            float random_vec_norm = std::sqrt(sum_sqr);
            for (size_t i = 0; i < random_vec.size(); ++i) {
                random_vec[i] /= random_vec_norm;
            }
            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
        }
    }

    ~pca_model() {
        ggml_free(ctx);
        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
    }
};

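// build a compute graph containing params.n_batch power-iteration steps:
//   b = square * v;  b = b / |b|;  distance = |v - b|
// each step is named ("b_tensor_norm_%d", "distance_%d") so compute_piter()
// can find its outputs. when calc_square is set (first batch only), the graph
// also computes the square of the input and exposes it as "tmp_square"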
static struct ggml_cgraph * build_graph_piter(
        const struct pca_params & params,
        const pca_model & model,
        bool calc_square = false) {
    GGML_ASSERT(params.n_batch > 0);
    // TODO: buf_size must be able to scale with params.n_batch
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };
    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // turn v_diff_original into square matrix if needed
    struct ggml_tensor * tmp_square;
    if (calc_square) {
        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
        ggml_set_name(tmp_square, "tmp_square");
    }

    struct ggml_tensor * b_tensor;
    struct ggml_tensor * distance;
    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;

    for (int i = 0; i < params.n_batch; ++i) {
        // b_tensor = square * eigenvector^T
        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
        ggml_set_name(b_tensor, "b_tensor");

        // normalize
        b_tensor = ggml_div_inplace(ctx0,
            b_tensor,
            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
        );
        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);

        // calculate distance(new eigenvector - old eigenvector)
        // we don't use ggml_sub because it may not be implemented on GPU backend
        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
        distance = ggml_sqrt_inplace(ctx0,
            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
        ggml_format_name(distance, "distance_%d", i);

        old_eigen = b_tensor;

        // build operations nodes
        ggml_build_forward_expand(gf, distance);
    }

    // delete the temporary context used to build the graph
    ggml_free(ctx0);
    return gf;
}

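// allocate and run a graph built by build_graph_piter(), then collect its
// outputs by node name: "b_tensor_norm_%d" nodes are kept as device-side
// eigenvector estimates, "distance_%d" scalars are copied back to the host,
// and "tmp_square" (if present) is saved so the caller can cache it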
static ggml_status compute_piter(
        const struct pca_params & params,
        const pca_model & model,
        struct ggml_cgraph * gf,
        ggml_gallocr_t allocr,
        struct pca_result & result) {
    // allocate tensors
    ggml_gallocr_alloc_graph(allocr, gf);

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res == GGML_STATUS_SUCCESS) {
        auto extract_i = [](std::string prefix, std::string str) -> int {
            int i = -1;
            if (str.rfind(prefix, 0) == 0) {
                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
            }
            return i;
        };
        result.calculated_square = NULL;
        result.eigenvectors.clear();
        result.distances.clear();
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            auto node = ggml_graph_node(gf, i);
            int iter = -1;
            // find b_tensor (without copying data from device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
                result.eigenvectors[iter] = node;
            }
            // find distances, then copy data from device
            if ((iter = extract_i("distance_", node->name)) > -1) {
                float d;
                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
                result.distances[iter] = d;
                // std::cout << node->name << " = " << d << "\n";
            }
            // find tmp_square if it exists (without copying data from device)
            if (std::string(node->name) == "tmp_square") {
                result.calculated_square = node;
            }
        }
    }
    return res;
}

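// run power iteration on a single input matrix: execute params.n_iterations
// steps, grouped into batches of params.n_batch graph nodes. within a batch,
// the estimate at the first step whose distance drops below params.tolerance
// is kept as that batch's result. the square matrix is computed during the
// first batch and cached in dev_square; the chosen eigenvector seeds the next
// batch, and the final one is copied into `output` on the host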
static void power_iteration(
        const struct pca_params & params,
        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
        struct ggml_tensor * output) {
    //printf("in power iteration\n");
    struct pca_model model(input);

    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
    struct pca_result result;
    struct ggml_tensor * last_eigenvector = NULL;

    int n_iters = params.n_iterations / params.n_batch; // a larger batch means fewer outer iterations
    for (int iter = 0; iter < n_iters; ++iter) {
        bool calc_square = (iter == 0); // only need to calculate square for first iteration
        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
        compute_piter(params, model, gf, allocr, result);

        for (size_t k = 0; k < result.distances.size(); ++k) {
            last_eigenvector = result.eigenvectors[k];
            if (result.distances[k] < params.tolerance) {
                break; // done
            }
        }

        if (calc_square) {
            // copy and store the square matrix if needed
            GGML_ASSERT(result.calculated_square != NULL);
            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
        }

        {
            // copy the last eigenvector and store it as the input for the next iteration
            GGML_ASSERT(last_eigenvector != NULL);
            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
        }

        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
    }

    // get output tensor
    GGML_ASSERT(last_eigenvector);
    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
    //print_debug_tensor(output);
    ggml_gallocr_free(allocr);

    // TODO @ngxson : The output vector is randomly inverted
    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}

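// entry point: run power iteration independently on each layer's input matrix
// and write one direction vector per layer into v_output. i_layer / n_layers
// are only set for progress logging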
static void run_pca(
        struct pca_params & params,
        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
        const std::vector<struct ggml_tensor *> & v_output) {
    printf("%s: Running PCA...\n", __func__);
    for (size_t il = 0; il < v_input.size(); ++il) {

        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
        ggml_format_name(ctrl_out, "direction.%zu", il+1);

        // run power_iteration
        params.i_layer = il;
        params.n_layers = v_input.size();
        power_iteration(params, v_input[il], ctrl_out);
        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
    }
}

} // namespace PCA