#include "common.h"
#include "llama.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cmath>
#include <cstdio>
#include <ctime>
#include <random>
#include <string>
#include <vector>

#define DEBUG_POS 5

static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
    if (!with_data) return;
    printf("%s: %s[0] = [", __func__, t->name);
    for (size_t i = 0; i <= DEBUG_POS; i++) {
        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
    }
    printf(" ... ]\n");
}

namespace PCA {

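// This namespace implements PCA via the power-iteration method: the dominant
// eigenvector of the n_embd x n_embd square matrix (input multiplied with its
// own transpose) is approximated by repeatedly computing v <- A*v and
// re-normalizing, until the distance between consecutive estimates falls below
// `tolerance` or `n_iterations` steps have been done. The heavy math runs as a
// ggml compute graph on the selected backend (CUDA if available, else CPU).
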
// input params for PCA computations
struct pca_params {
    int n_threads = 1;
    int n_batch = 20; // number of iterations to do in one batch. the larger the batch, the more memory is used
    int n_iterations = 1000;
    float tolerance = 1e-7;

    // for debugging
    int i_layer = 0;
    int n_layers = 0;
};

// result from each iteration
struct pca_result {
    struct ggml_tensor * calculated_square = NULL;
    std::vector<struct ggml_tensor *> eigenvectors;
    std::vector<float> distances;
};

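// device-side state for one PCA run: the input matrix, its square, and the
// current eigenvector estimate, all allocated on the chosen ggml backend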
struct pca_model {
    ggml_backend_t backend = NULL;
    ggml_backend_buffer_t buffer;
    struct ggml_context * ctx;      // context to compute graph on target device
    struct ggml_context * ctx_host; // host context to store results

    // tensors on target device
    struct ggml_tensor * dev_input;
    struct ggml_tensor * dev_square;
    struct ggml_tensor * dev_eigenvector;

    pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
        fprintf(stderr, "%s: using CUDA backend\n", __func__);
        backend = ggml_backend_cuda_init(0); // init device 0
        if (!backend) {
            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
        }
#endif

        // TODO: enable Metal support when support for GGML_OP_SQRT is added
        // #ifdef GGML_USE_METAL
        //     fprintf(stderr, "%s: using Metal backend\n", __func__);
        //     backend = ggml_backend_metal_init();
        //     if (!backend) {
        //         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
        //     }
        // #endif

        // if there is no GPU backend, fall back to the CPU backend
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        const int num_tensors = 4;
        struct ggml_init_params params {
            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx = ggml_init(params);

        auto n_samples = t_input->ne[0];
        auto n_embd    = t_input->ne[1];

        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        ggml_set_name(dev_input,       "dev_input");
        ggml_set_name(dev_square,      "dev_square");
        ggml_set_name(dev_eigenvector, "dev_eigenvector");
        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));

        // initialize eigenvector to random normalized vector
        {
            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
            std::uniform_real_distribution<float> distribution(0.0, 1.0);
            float sum_sqr = 0.0; // for normalizing random_vec
            for (size_t i = 0; i < random_vec.size(); ++i) {
                float f = distribution(generator);
                sum_sqr += f * f;
                random_vec[i] = f;
            }
            // normalize it
            float random_vec_norm = std::sqrt(sum_sqr);
            for (size_t i = 0; i < random_vec.size(); ++i) {
                random_vec[i] /= random_vec_norm;
            }
            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
        }
    }

    ~pca_model() {
        ggml_free(ctx);
        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
    }
};

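// build one compute graph containing params.n_batch chained power-iteration
// steps: each step computes b = square * v, normalizes b, and records the
// distance ||b - v|| under the name "distance_%d" so the host can check
// convergence later; on the first call the square matrix itself is also
// computed and exposed as "tmp_square"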
static struct ggml_cgraph * build_graph_piter(
        const struct pca_params & params,
        const pca_model & model,
        bool calc_square = false) {
    GGML_ASSERT(params.n_batch > 0);
    // TODO: buf_size must be able to scale with params.n_batch
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
    };
    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // turn v_diff_original into square matrix if needed
    struct ggml_tensor * tmp_square;
    if (calc_square) {
        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
        ggml_set_name(tmp_square, "tmp_square");
    }

    struct ggml_tensor * b_tensor;
    struct ggml_tensor * distance;
    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;

    for (int i = 0; i < params.n_batch; ++i) {
        // b_tensor = square * eigenvector^T
        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
        ggml_set_name(b_tensor, "b_tensor");

        // normalize
        b_tensor = ggml_div_inplace(ctx0,
            b_tensor,
            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
        );
        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);

        // calculate distance(new eigenvector - old eigenvector)
        // we don't use ggml_sub because it may not be implemented on GPU backend
        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
        distance = ggml_sqrt_inplace(ctx0,
            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
        ggml_format_name(distance, "distance_%d", i);

        old_eigen = b_tensor;

        // build operations nodes
        ggml_build_forward_expand(gf, distance);
    }

    // delete the temporary context used to build the graph
    ggml_free(ctx0);
    return gf;
}

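// run one batch of power iterations on the backend and collect the results:
// eigenvector estimates and tmp_square stay on the device (only node pointers
// are stored), while the scalar distances are copied back to the host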
static ggml_status compute_piter(
        const struct pca_params & params,
        const pca_model & model,
        struct ggml_cgraph * gf,
        ggml_gallocr_t allocr,
        struct pca_result & result) {
    // allocate tensors
    ggml_gallocr_alloc_graph(allocr, gf);

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res == GGML_STATUS_SUCCESS) {
        // extract the iteration index from a node name like "distance_12"
        auto extract_i = [](std::string prefix, std::string str) -> int {
            int i = -1;
            if (str.rfind(prefix, 0) == 0) {
                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
            }
            return i;
        };
        result.calculated_square = NULL;
        result.eigenvectors.clear();
        result.distances.clear();
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            auto node = ggml_graph_node(gf, i);
            int iter = -1;
            // find b_tensor (without copying data from device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
                result.eigenvectors[iter] = node;
            }
            // find distances, then copy data from device
            if ((iter = extract_i("distance_", node->name)) > -1) {
                float d;
                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
                result.distances[iter] = d;
                // std::cout << node->name << " = " << d << "\n";
            }
            // find tmp_square if it exists (without copying data from device)
            if (std::string(node->name) == "tmp_square") {
                result.calculated_square = node;
            }
        }
    }
    return res;
}

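// run the full power iteration for one input matrix: batches of graph builds
// and executions are repeated until n_iterations is exhausted; the square
// matrix is computed once in the first batch and reused afterwards, and the
// last eigenvector of each batch is fed back as the starting vector of the next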
static void power_iteration(
        const struct pca_params & params,
        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
        struct ggml_tensor * output) {
    //printf("in power iteration\n");
    struct pca_model model(input);

    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
    struct pca_result result;
    struct ggml_tensor * last_eigenvector = NULL;

    int n_iters = params.n_iterations / params.n_batch; // larger batch, fewer iterations
    for (int iter = 0; iter < n_iters; ++iter) {
        bool calc_square = (iter == 0); // only need to calculate square for first iteration
        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
        compute_piter(params, model, gf, allocr, result);

        for (size_t k = 0; k < result.distances.size(); ++k) {
            last_eigenvector = result.eigenvectors[k];
            if (result.distances[k] < params.tolerance) {
                break; // done
            }
        }

        if (calc_square) {
            // copy and store the square matrix if needed
            GGML_ASSERT(result.calculated_square != NULL);
            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
        }

        {
            // copy last eigenvector and store as input for next iteration
            GGML_ASSERT(last_eigenvector != NULL);
            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
        }

        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
    }

    // get output tensor
    GGML_ASSERT(last_eigenvector);
    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
    //print_debug_tensor(output);
    ggml_gallocr_free(allocr);

    // TODO @ngxson : The output vector is randomly inverted
    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}

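// run PCA on a set of per-layer input matrices; the principal direction of
// v_input[il] is written into the pre-allocated output tensor v_output[il]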
static void run_pca(
        struct pca_params & params,
        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
        const std::vector<struct ggml_tensor *> & v_output) {
    printf("%s: Running PCA...\n", __func__);
    for (size_t il = 0; il < v_input.size(); ++il) {

        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
        ggml_format_name(ctrl_out, "direction.%zu", il+1);

        // run power_iteration
        params.i_layer = il;
        params.n_layers = v_input.size();
        power_iteration(params, v_input[il], ctrl_out);
        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
    }
}

}
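
// Example usage (illustrative sketch only; `v_diff` and `v_final` are
// placeholder names for per-layer tensors prepared by the caller, with
// v_diff[il] shaped [n_samples, n_embd] and v_final[il] shaped [n_embd]):
//
//   PCA::pca_params pca_params;
//   pca_params.n_threads    = 8;
//   pca_params.n_batch      = 20;
//   pca_params.n_iterations = 1000;
//   pca_params.tolerance    = 1e-7;
//   PCA::run_pca(pca_params, v_diff, v_final);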