| 1 | #include "common.cuh" |
| 2 | #include "ggml.h" |
| 3 | |
| 4 | #include <initializer_list> |
| 5 | |
| 6 | void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx, |
| 7 | const ggml_tensor * logits, |
| 8 | ggml_tensor * weights, |
| 9 | ggml_tensor * ids, |
| 10 | const bool with_norm, |
| 11 | const bool delayed_softmax = false, |
| 12 | ggml_tensor * weight_clamp = nullptr); |
| 13 | |
| 14 | bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr); |
| 15 | |
| 16 | std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false); |
| 17 | |