#include "scale.cuh"

// 0x7FFFFFFF = 2^31 - 1, the maximum x-dimension of a CUDA grid of thread blocks
#define MAX_GRIDDIM_X 0x7FFFFFFF

// Computes dst[i] = scale * x[i] + bias over all elements with a grid-stride loop,
// so element counts larger than the launched grid are still covered.
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
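    // tid: global thread index; stride: total number of threads in the grid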
    int64_t tid    = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;

    for (int64_t i = tid; i < nelements; i += stride) {
        dst[i] = scale * x[i] + bias;
    }
}

// Host-side launcher: the block count is clamped to MAX_GRIDDIM_X; the kernel's
// grid-stride loop picks up any elements beyond the launched grid.
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
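    // one thread per element, rounded up to whole blocks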
    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}

void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
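    // src0 is the only source tensor; both src0 and dst hold F32 data on the device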
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    // scale and bias are packed as two consecutive floats in op_params
    float scale;
    float bias;
    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
    memcpy(&bias,  (float *) dst->op_params + 1, sizeof(float));

    scale_f32_cuda(src0_d, dst_d, scale, bias, ggml_nelements(src0), stream);
}