#include "pool2d.cuh"

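// Generic 2D pooling kernel over NCHW data: each thread computes one output
// element, i.e. one (n, c, oh, ow) position of dst.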
template <typename Ti, typename To>
static __global__ void pool2d_nchw_kernel(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const Ti* src, To* dst, const enum ggml_op_pool op) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= parallel_elements) {
        return;
    }

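    // map the flat thread index to (batch*channel, output row, output column)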
    const int I_HW = ih * iw;
    const int O_HW = oh * ow;
    const int nc = idx / O_HW;
    const int cur_oh = idx % O_HW / ow;
    const int cur_ow = idx % O_HW % ow;
    const Ti* i_ptr = src + nc * I_HW;
    To* o_ptr = dst + nc * O_HW;
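    // pooling window [bh, eh) x [bw, ew) in input coordinates, clamped to the image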
    const int start_h = cur_oh * sh - ph;
    const int bh = max(0, start_h);
    const int eh = min(ih, start_h + kh);
    const int start_w = cur_ow * sw - pw;
    const int bw = max(0, start_w);
    const int ew = min(iw, start_w + kw);
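    // average-pool scale: divides by the full kernel area, so padded positions count as zeros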
    const To scale = 1. / (kh * kw);
    To res = 0;

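    // initialise the reduction: 0 for average pooling, -FLT_MAX for max pooling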
    switch (op) {
        case GGML_OP_POOL_AVG: res = 0; break;
        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
        default: assert(false);
    }

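    // accumulate over the clamped pooling window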
    for (int i = bh; i < eh; i += 1) {
        for (int j = bw; j < ew; j += 1) {
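            // load through the read-only data cache where __ldg is available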
#if __CUDA_ARCH__ >= 350
            Ti cur = __ldg(i_ptr + i * iw + j);
#else
            Ti cur = i_ptr[i * iw + j];
#endif
            switch (op) {
                case GGML_OP_POOL_AVG: res += cur * scale; break;
                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
                default: assert(false);
            }
        }
    }
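    // write the pooled result for this output position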
    o_ptr[cur_oh * ow + cur_ow] = res;
}

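// Launch helper for the f32 -> f32 kernel: one thread per output element,
// CUDA_POOL2D_BLOCK_SIZE threads per block.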
static void pool2d_nchw_kernel_f32_f32_cuda(
        const int ih, const int iw, const int oh, const int ow,
        const int kh, const int kw, const int sh, const int sw,
        const int ph, const int pw, const int parallel_elements,
        const float * src, float * dst, const enum ggml_op_pool op,
        cudaStream_t stream) {

    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
    dim3 block_nums(num_blocks);
    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
}

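// Backend entry point for GGML_OP_POOL_2D (F32 input, F32 output, NCHW layout).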
void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

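    // op_params layout: [op, k0, k1, s0, s1, p0, p1] = pooling op, kernel size,
    // stride and padding along dims 0 and 1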
    const int32_t * opts = (const int32_t *)dst->op_params;
    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
    const int k0 = opts[1];
    const int k1 = opts[2];
    const int s0 = opts[3];
    const int s1 = opts[4];
    const int p0 = opts[5];
    const int p1 = opts[6];

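    // ggml dimension order: ne[0] = width, ne[1] = height, ne[2] = channels, ne[3] = batch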
    const int64_t IH = src0->ne[1];
    const int64_t IW = src0->ne[0];

    const int64_t N = dst->ne[3];
    const int64_t OC = dst->ne[2];
    const int64_t OH = dst->ne[1];
    const int64_t OW = dst->ne[0];

    const int parallel_elements = N * OC * OH * OW;

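    // the *0 params apply along ne[0] (width) and the *1 params along ne[1] (height);
    // the kernel expects height-first arguments, hence the swapped order below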
    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
}