#include "unary.cuh"
#include "convert.cuh"

static __device__ __forceinline__ float op_abs(float x) {
    return fabsf(x);
}

static __device__ __forceinline__ float op_sgn(float x) {
    return x > 0.f ? 1.f : ((x < 0.f ? -1.f : 0.f));
}

static __device__ __forceinline__ float op_neg(float x) {
    return -x;
}

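// Heaviside step: 1.0f for x > 0, 0.0f otherwise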
static __device__ __forceinline__ float op_step(float x) {
    return x > 0.0f;
}

static __device__ __forceinline__ float op_gelu(float x) {
    return ggml_cuda_op_gelu_single(x);
}

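// exact GELU using the error function: 0.5*x*(1 + erf(x/sqrt(2)))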
static __device__ __forceinline__ float op_gelu_erf(float x) {
    const float SQRT_2_INV = 0.70710678118654752440084436210484f;

    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
}

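// sigmoid approximation of GELU: x * sigmoid(1.702*x)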
static __device__ __forceinline__ float op_gelu_quick(float x) {
    const float GELU_QUICK_COEF = -1.702f;

    return x * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x)));
}

static __device__ __forceinline__ float op_silu(float x) {
    return ggml_cuda_op_silu_single(x);
}

static __device__ __forceinline__ float op_tanh(float x) {
    return tanhf(x);
}

static __device__ __forceinline__ float op_relu(float x) {
    return fmaxf(x, 0.0f);
}

static __device__ __forceinline__ float op_sigmoid(float x) {
    return 1.0f / (1.0f + expf(-x));
}

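// hard sigmoid: clamp((x + 3)/6, 0, 1)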
static __device__ __forceinline__ float op_hardsigmoid(float x) {
    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

static __device__ __forceinline__ float op_hardswish(float x) {
    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
}

static __device__ __forceinline__ float op_exp(float x) {
    return expf(x);
}

static __device__ __forceinline__ float op_sqr(float x) {
    return x * x;
}

static __device__ __forceinline__ float op_sqrt(float x) {
    return sqrtf(x);
}

static __device__ __forceinline__ float op_sin(float x) {
    return sinf(x);
}

static __device__ __forceinline__ float op_cos(float x) {
    return cosf(x);
}

static __device__ __forceinline__ float op_log(float x) {
    return logf(x);
}

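// ELU (alpha = 1): x for x > 0, exp(x) - 1 otherwise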
static __device__ __forceinline__ float op_elu(float x) {
    return (x > 0.f) ? x : expm1f(x);
}

static __device__ __forceinline__ float op_floor(float x) {
    return floorf(x);
}

static __device__ __forceinline__ float op_ceil(float x) {
    return ceilf(x);
}

static __device__ __forceinline__ float op_round(float x) {
    return roundf(x);
}

static __device__ __forceinline__ float op_trunc(float x) {
    return truncf(x);
}

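// generic element-wise kernel: one thread per element, op applied in float precision regardless of T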
template <float (*op)(float), typename T>
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = (T)op((float)x[i]);
}

template <float (*op)(float), typename T>
static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

template <float (*op)(float)>
void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const void * src0_d = src0->data;
    void * dst_d = dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    if (src0->type == GGML_TYPE_F16) {
        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
    } else {
        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
    }
}

void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_abs>(ctx, dst);
}

void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_sgn>(ctx, dst);
}

void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_neg>(ctx, dst);
}

void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_step>(ctx, dst);
}

void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu>(ctx, dst);
}

void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
}

void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
}

void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_silu>(ctx, dst);
}

void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_tanh>(ctx, dst);
}

void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_relu>(ctx, dst);
}

void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_sigmoid>(ctx, dst);
}

void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_hardsigmoid>(ctx, dst);
}

void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_hardswish>(ctx, dst);
}

void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_exp>(ctx, dst);
}

void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_sqr>(ctx, dst);
}

void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_sqrt>(ctx, dst);
}

void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_sin>(ctx, dst);
}

void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_cos>(ctx, dst);
}

void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_log>(ctx, dst);
}

void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_elu>(ctx, dst);
}

void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_floor>(ctx, dst);
}

void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_ceil>(ctx, dst);
}

void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_round>(ctx, dst);
}

void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_trunc>(ctx, dst);
}

/* gated ops */

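// the gate values either come from a separate tensor (src1) or from the second half of each src0 row;
// n is the row length and o0/o1 are the row strides (in elements) of the value and gate views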
template <float (*op)(float), typename T>
static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) {
    const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    // perform base op and multiply with gate (either offset in same tensor or a separate one)
    const int64_t j0 = (i / n) * o0 + (i % n);
    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);

    dst[i] = (T)(op((float)x[j0]) * (float)g[j1]);
}

template <float (*op)(float), typename T>
static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) {
    const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
    unary_gated_op_kernel<op><<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o0, o1);
}

template <float (*op)(float)>
void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    void * src0_d = src0->data;
    void * src1_d = src1 ? src1->data : src0->data;
    const int64_t src0_o = src0->nb[1];
    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
    void * dst_d = dst->data;
    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
    GGML_ASSERT(ggml_is_contiguous(dst));

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);
    GGML_ASSERT(dst->ne[0] == nc);
    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));

    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
        GGML_ASSERT(src1->ne[0] == nc);
        GGML_ASSERT(src0->type == src1->type);
    }

    const int32_t swapped = ((const int32_t *) dst->op_params)[1];

    if (src0->type == GGML_TYPE_F16) {
        half * src0_p = (half *) src0_d;
        half * src1_p = (half *) src1_d;

        if (!src1) {
            src0_p += swapped ? nc : 0;
            src1_p += swapped ? 0 : nc;
        }

        unary_gated_cuda<op>(src0_p, src1_p, (half *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(half), src1_o / sizeof(half), stream);
    } else {
        float * src0_p = (float *) src0_d;
        float * src1_p = (float *) src1_d;

        if (!src1) {
            src0_p += swapped ? nc : 0;
            src1_p += swapped ? 0 : nc;
        }

        unary_gated_cuda<op>(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), stream);
    }
}

void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated<op_relu>(ctx, dst);
}

void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated<op_gelu>(ctx, dst);
}

void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated<op_silu>(ctx, dst);
}

void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated<op_gelu_erf>(ctx, dst);
}

void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary_gated<op_gelu_quick>(ctx, dst);
}

// swiglu_oai

template <typename T>
static __global__ void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, float alpha, float limit) {
    const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    // perform base op and multiply with gate (either offset in same tensor or a separate one)
    const int64_t j0 = (i / n) * o0 + (i % n);
    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);

    float xi = x[j0];
    float gi = g[j1];

    dst[i] = ggml_cuda_op_swiglu_oai_single(xi, gi, alpha, limit);
}

template <typename T>
static void swiglu_oai_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, const float alpha, const float limit, cudaStream_t stream) {
    const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
    swiglu_oai_kernel<<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o0, o1, alpha, limit);
}

void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    void * src0_d = src0->data;
    void * src1_d = src1 ? src1->data : src0->data;
    const int64_t src0_o = src0->nb[1];
    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
    void * dst_d = dst->data;
    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
    GGML_ASSERT(ggml_is_contiguous(dst));

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == dst->type);
    GGML_ASSERT(dst->ne[0] == nc);
    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));

    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
        GGML_ASSERT(src1->ne[0] == nc);
        GGML_ASSERT(src0->type == src1->type);
    }

    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
    const float alpha = ggml_get_op_params_f32(dst, 2);
    const float limit = ggml_get_op_params_f32(dst, 3);

    float * src0_p = (float *) src0_d;
    float * src1_p = (float *) src1_d;

    if (!src1) {
        src0_p += swapped ? nc : 0;
        src1_p += swapped ? 0 : nc;
    }

    swiglu_oai_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
}

/* CUDA kernel + launcher for xIELU */

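// xIELU: alpha_p*x^2 + beta*x for x > 0, alpha_n*(expm1(min(x, eps)) - x) + beta*x otherwise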
template <typename T>
static __global__ void xielu_kernel(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    const float xi = ggml_cuda_cast<float>(x[i]);

    const float gate_pos = (xi > 0.0f);
    const float y_pos = alpha_p * xi * xi + beta * xi;
    const float min_v_eps = fminf(xi, eps);
    const float y_neg = (expm1f(min_v_eps) - xi) * alpha_n + beta * xi;
    const float out = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;

    dst[i] = ggml_cuda_cast<T>(out);
}

template <typename T>
static void xielu_cuda(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_XIELU_BLOCK_SIZE - 1) / CUDA_XIELU_BLOCK_SIZE;
    xielu_kernel<<<num_blocks, CUDA_XIELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, alpha_n, alpha_p, beta, eps);
}

void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const void * src0_d = src0->data;
    void * dst_d = dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    const float alpha_n = ggml_get_op_params_f32(dst, 1);
    const float alpha_p = ggml_get_op_params_f32(dst, 2);
    const float beta = ggml_get_op_params_f32(dst, 3);
    const float eps = ggml_get_op_params_f32(dst, 4);

    if (src0->type == GGML_TYPE_F16) {
        xielu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
    } else {
        xielu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
    }
}

/* silu_back */

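// gradient of silu(x) = x*sigmoid(x): d/dx = s*(1 + x*(1 - s)) with s = sigmoid(x), scaled by the incoming grad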
static __device__ __forceinline__ float op_silu_back(float grad, float x) {
    const float s = 1.0f / (1.0f + expf(-x));
    return grad * s * (1.0f + x * (1.0f - s));
}

template <class T>
static __global__ void silu_back_kernel(const T * grad, const T * xf, T * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = (T)op_silu_back((float)grad[i], (float)xf[i]);
}

template <class T>
static void silu_back_cuda(const T * grad, const T * x, T * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BACK_BLOCK_SIZE;
    silu_back_kernel<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
}

void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0]; // input from forward pass
    const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output

    const float * src0_d = (const float *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float * dst_d = (float *) dst->data;

    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    if (src0->type == GGML_TYPE_F16) {
        silu_back_cuda((const half *)src0_d, (const half *)src1_d, (half *)dst_d, ggml_nelements(src0), stream);
    } else {
        silu_back_cuda((const float *)src0_d, (const float *)src1_d, (float *)dst_d, ggml_nelements(src0), stream);
    }
}

/* leaky relu */

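// leaky ReLU: x for x >= 0, negative_slope*x for x < 0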
static __device__ __forceinline__ float op_leaky_relu(float x, const float negative_slope) {
    return fmaxf(x, 0.0f) + fminf(x, 0.0f) * negative_slope;
}

template <class T>
static __global__ void leaky_relu_kernel(const T * x, T * dst, const int k, const float negative_slope) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    dst[i] = (T)op_leaky_relu((float)x[i], negative_slope);
}

template <class T>
static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negative_slope, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
    leaky_relu_kernel<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
}

void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const void * src0_d = src0->data;
    void * dst_d = dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    float negative_slope;
    memcpy(&negative_slope, dst->op_params, sizeof(float));

    if (src0->type == GGML_TYPE_F16) {
        leaky_relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), negative_slope, stream);
    } else {
        leaky_relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), negative_slope, stream);
    }
}