#include "quantize.cuh"
#include <cstdint>

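// Quantize a float tensor to q8_1: each group of QK8_1 consecutive values becomes one
// block_q8_1 holding the int8 quants plus a half2 with the scale d and the block sum.
// One thread handles one value; ne00 is the logical row length, ne0 the (padded) row
// length covered by the grid, and s01/s02/s03 are the source strides in elements.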
__launch_bounds__(CUDA_QUANTIZE_BLOCK_SIZE, 1)
static __global__ void quantize_q8_1(
        const float * __restrict__ x, void * __restrict__ vy,
        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
        const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
    const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

    if (i0 >= ne0) {
        return;
    }

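    // Decompose blockIdx.z into the two outermost tensor indices; ne2 carries the
    // precomputed fastdiv constants, with the original divisor in ne2.z.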
    const int64_t i3 = fastdiv(blockIdx.z, ne2);
    const int64_t i2 = blockIdx.z - i3*ne2.z;
    const int64_t i1 = blockIdx.y;

    const int64_t & i00 = i0;
    const int64_t & i01 = i1;
    const int64_t & i02 = i2;
    const int64_t & i03 = i3;

    const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0;

    block_q8_1 * y = (block_q8_1 *) vy;

    const int64_t ib  = i_cont / QK8_1; // block index
    const int64_t iqs = i_cont % QK8_1; // quant index

    const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f;
    float amax = fabsf(xi);
    float sum  = xi;

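    // Each group of QK8_1 threads covers one q8_1 block; reduce to get the
    // block-wide absolute maximum and the block sum.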
    amax = warp_reduce_max<QK8_1>(amax);
    sum  = warp_reduce_sum<QK8_1>(sum);

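    // d is the per-block scale; a block that is all zeros quantizes to 0 to avoid
    // dividing by zero.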
    const float  d = amax / 127.0f;
    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);

    y[ib].qs[iqs] = q;

    if (iqs > 0) {
        return;
    }

    y[ib].ds = make_half2(d, sum);
}

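// Quantize a float tensor into the block_q8_1_mmq layout consumed by the MMQ kernels.
// Each thread quantizes 4 consecutive values; one block_q8_1_mmq covers 4*QK8_1 = 128 values.
// The ds_layout template parameter selects how scales and sums are stored per block:
//   - MMQ_Q8_1_DS_LAYOUT_D4:   4 scales, one per 32 values, no sums.
//   - MMQ_Q8_1_DS_LAYOUT_DS4:  4 half2 (scale, sum) pairs, one per 32 values.
//   - MMQ_Q8_1_DS_LAYOUT_D2S6: 2 scales (one per 64 values) followed by 6 partial sums
//                              (one per 16 values, covering only the first 96 values).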
template <mmq_q8_1_ds_layout ds_layout>
static __global__ void quantize_mmq_q8_1(
        const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
        const int64_t ne0, const int ne1, const int ne2) {

    constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
    constexpr int vals_per_sum   = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;

    const int64_t i0 = ((int64_t)blockDim.x*blockIdx.y + threadIdx.x)*4;

    if (i0 >= ne0) {
        return;
    }

    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.z % ne2;
    const int64_t i3 = blockIdx.z / ne2;

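    // If ids is provided it remaps destination row i1 to a source row (indirect row
    // selection); otherwise rows map 1:1.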
    const int64_t i00 = i0;
    const int64_t i01 = ids ? ids[i1] : i1;
    const int64_t i02 = i2;
    const int64_t i03 = i3;

    const float4 * x4 = (const float4 *) x;

    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;

    const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel
    const int64_t ib  = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x;                    // block index in channel
    const int64_t iqs = i0 % (4*QK8_1);                                             // quant index in block

    // Load 4 floats per thread and calculate max. abs. value between them:
    const float4 xi = i0 < ne00 ? x4[(i03*s03 + i02*s02 + i01*s01 + i00)/4] : make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    float amax = fabsf(xi.x);
    amax = fmaxf(amax, fabsf(xi.y));
    amax = fmaxf(amax, fabsf(xi.z));
    amax = fmaxf(amax, fabsf(xi.w));

    // Exchange max. abs. value between vals_per_scale/4 threads.
#pragma unroll
    for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
        amax = fmaxf(amax, __shfl_xor_sync(0xFFFFFFFF, amax, offset, WARP_SIZE));
    }

    float sum;
    if (ds_layout != MMQ_Q8_1_DS_LAYOUT_D4) {
        sum = xi.x + xi.y + xi.z + xi.w;

        // Calculate sums across vals_per_sum/4 threads.
#pragma unroll
        for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
            sum += __shfl_xor_sync(0xFFFFFFFF, sum, offset, WARP_SIZE);
        }
    }

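    // Quantize the 4 loaded values with the shared inverse scale d_inv = 127/amax.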
    const float d_inv = 127.0f / amax;
    char4 q;
    q.x = roundf(xi.x*d_inv);
    q.y = roundf(xi.y*d_inv);
    q.z = roundf(xi.z*d_inv);
    q.w = roundf(xi.w*d_inv);

    // Write back 4 int8 values as a single 32 bit value for better memory bandwidth:
    char4 * yqs4 = (char4 *) y[ib].qs;
    yqs4[iqs/4] = q;

    if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6) {
        if (iqs % 16 != 0 || iqs >= 96) {
            return;
        }

        y[ib].d2s6[2 + iqs/16] = sum;

        if (iqs % 64 != 0) {
            return;
        }

        const float d = 1.0f / d_inv;

        y[ib].d2s6[iqs/64] = d;

        return;
    }

    if (iqs % 32 != 0) {
        return;
    }

    const float d = 1.0f / d_inv;

    if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
        y[ib].ds4[iqs/32] = make_half2(d, sum);
    } else {
        y[ib].d4[iqs/32] = d;
    }
}

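// Host-side launcher for quantize_q8_1: one thread per value along ne0, with the ne1 rows
// on grid.y and the ne2*ne3 outer slices flattened onto grid.z. Row indirection via ids
// is not supported here.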
void quantize_row_q8_1_cuda(
        const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
    GGML_ASSERT(!ids);
    GGML_ASSERT(ne0 % QK8_1 == 0);

    const uint3 ne2_fastdiv = init_fastdiv_values(ne2);

    const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
    const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
    GGML_UNUSED(type_src0);
}

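// Host-side launcher for quantize_mmq_q8_1: each thread handles 4 values, rows go on
// grid.x (ne1 tends to be largest), the row length on grid.y, and ne2*ne3 on grid.z.
// The scale/sum layout is chosen from the src0 type via mmq_get_q8_1_ds_layout.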
void quantize_mmq_q8_1_cuda(
        const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
    GGML_ASSERT(ne00 % 4 == 0);
    GGML_ASSERT(ne0 % (4*QK8_1) == 0);

    // ne1 tends to assume the highest values, therefore use it as the "x" dimension of the CUDA grid:
    const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
    const dim3 num_blocks(ne1, block_num_y, ne2*ne3);
    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
    switch (mmq_get_q8_1_ds_layout(type_src0)) {
        case MMQ_Q8_1_DS_LAYOUT_D4:
            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
            break;
        case MMQ_Q8_1_DS_LAYOUT_DS4:
            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_DS4>
                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
            break;
        case MMQ_Q8_1_DS_LAYOUT_D2S6:
            quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D2S6>
                <<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}