1#pragma once
2
3#include <IO/ReadBuffer.h>
4#include <IO/WriteBuffer.h>
5#include <IO/ReadHelpers.h>
6#include <IO/WriteHelpers.h>
7#include <Core/Defines.h>
8
9namespace DB
10{
11
12namespace ErrorCodes
13{
14 extern const int NO_AVAILABLE_DATA;
15}
16
17
18/** Compact array for data storage, size `content_width`, in bits, of which is
19 * less than one byte. Instead of storing each value in a separate
20 * bytes, which leads to a waste of 37.5% of the space for content_width = 5, CompactArray stores
21 * adjacent `content_width`-bit values in the byte array, that is actually CompactArray
22 * simulates an array of `content_width`-bit values.
23 */
24template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
25class CompactArray final
26{
27public:
28 class Reader;
29 class Locus;
30
31public:
32 CompactArray() = default;
33
34 UInt8 ALWAYS_INLINE operator[](BucketIndex bucket_index) const
35 {
36 Locus locus(bucket_index);
37
38 if (locus.index_l == locus.index_r)
39 return locus.read(bitset[locus.index_l]);
40 else
41 return locus.read(bitset[locus.index_l], bitset[locus.index_r]);
42 }
43
44 Locus ALWAYS_INLINE operator[](BucketIndex bucket_index)
45 {
46 Locus locus(bucket_index);
47
48 locus.content_l = &bitset[locus.index_l];
49
50 if (locus.index_l == locus.index_r)
51 locus.content_r = locus.content_l;
52 else
53 locus.content_r = &bitset[locus.index_r];
54
55 return locus;
56 }
57
58 /// Used only in arcadia/metrika
59 void readText(ReadBuffer & in)
60 {
61 for (size_t i = 0; i < BITSET_SIZE; ++i)
62 {
63 if (i != 0)
64 assertChar(',', in);
65 readIntText(bitset[i], in);
66 }
67 }
68
69 /// Used only in arcadia/metrika
70 void writeText(WriteBuffer & out) const
71 {
72 for (size_t i = 0; i < BITSET_SIZE; ++i)
73 {
74 if (i != 0)
75 writeCString(",", out);
76 writeIntText(bitset[i], out);
77 }
78 }
79
80private:
81 /// number of bytes in bitset
82 static constexpr size_t BITSET_SIZE = (static_cast<size_t>(bucket_count) * content_width + 7) / 8;
83 UInt8 bitset[BITSET_SIZE] = { 0 };
84};
85
86/** A class for sequentially reading cells from a compact array on a disk.
87 */
88template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
89class CompactArray<BucketIndex, content_width, bucket_count>::Reader final
90{
91public:
92 Reader(ReadBuffer & in_)
93 : in(in_)
94 {
95 }
96
97 Reader(const Reader &) = delete;
98 Reader & operator=(const Reader &) = delete;
99
100 bool next()
101 {
102 if (current_bucket_index == bucket_count)
103 {
104 is_eof = true;
105 return false;
106 }
107
108 locus.init(current_bucket_index);
109
110 if (current_bucket_index == 0)
111 {
112 in.readStrict(reinterpret_cast<char *>(&value_l), 1);
113 ++read_count;
114 }
115 else
116 value_l = value_r;
117
118 if (locus.index_l != locus.index_r)
119 {
120 if (read_count == BITSET_SIZE)
121 fits_in_byte = true;
122 else
123 {
124 fits_in_byte = false;
125 in.readStrict(reinterpret_cast<char *>(&value_r), 1);
126 ++read_count;
127 }
128 }
129 else
130 {
131 fits_in_byte = true;
132 value_r = value_l;
133 }
134
135 ++current_bucket_index;
136
137 return true;
138 }
139
140 /** Return the current cell number and the corresponding content.
141 */
142 inline std::pair<BucketIndex, UInt8> get() const
143 {
144 if ((current_bucket_index == 0) || is_eof)
145 throw Exception("No available data.", ErrorCodes::NO_AVAILABLE_DATA);
146
147 if (fits_in_byte)
148 return std::make_pair(current_bucket_index - 1, locus.read(value_l));
149 else
150 return std::make_pair(current_bucket_index - 1, locus.read(value_l, value_r));
151 }
152
153private:
154 ReadBuffer & in;
155 /// The physical location of the current cell.
156 Locus locus;
157 /// The current position in the file as a cell number.
158 BucketIndex current_bucket_index = 0;
159 /// The number of bytes read.
160 size_t read_count = 0;
161 /// The content in the current position.
162 UInt8 value_l;
163 UInt8 value_r;
164 ///
165 bool is_eof = false;
166 /// Does the cell fully fit into one byte?
167 bool fits_in_byte;
168};
169
170/** TODO This code looks very suboptimal.
171 *
172 * The `Locus` structure contains the necessary information to find for each cell
173 * the corresponding byte and offset, in bits, from the beginning of the cell. Since in general
174 * case the size of one byte is not divisible by the size of one cell, cases possible
175 * when one cell overlaps two bytes. Therefore, the `Locus` structure contains two
176 * pairs (index, offset).
177 */
178template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
179class CompactArray<BucketIndex, content_width, bucket_count>::Locus final
180{
181 friend class CompactArray;
182 friend class CompactArray::Reader;
183
184public:
185 ALWAYS_INLINE operator UInt8() const
186 {
187 if (content_l == content_r)
188 return read(*content_l);
189 else
190 return read(*content_l, *content_r);
191 }
192
193 Locus ALWAYS_INLINE & operator=(UInt8 content)
194 {
195 if ((index_l == index_r) || (index_l == (BITSET_SIZE - 1)))
196 {
197 /// The cell completely fits into one byte.
198 *content_l &= ~(((1 << content_width) - 1) << offset_l);
199 *content_l |= content << offset_l;
200 }
201 else
202 {
203 /// The cell overlaps two bytes.
204 size_t left = 8 - offset_l;
205
206 *content_l &= ~(((1 << left) - 1) << offset_l);
207 *content_l |= (content & ((1 << left) - 1)) << offset_l;
208
209 *content_r &= ~((1 << offset_r) - 1);
210 *content_r |= content >> left;
211 }
212
213 return *this;
214 }
215
216private:
217 Locus() = default;
218
219 Locus(BucketIndex bucket_index)
220 {
221 init(bucket_index);
222 }
223
224 void ALWAYS_INLINE init(BucketIndex bucket_index)
225 {
226 /// offset in bits to the leftmost bit
227 size_t l = static_cast<size_t>(bucket_index) * content_width;
228
229 /// offset of byte that contains the leftmost bit
230 index_l = l / 8;
231
232 /// offset in bits to the leftmost bit at that byte
233 offset_l = l % 8;
234
235 /// offset of byte that contains the rightmost bit
236 index_r = (l + content_width - 1) / 8;
237
238 /// offset in bits to the next to the rightmost bit at that byte; or zero if the rightmost bit is the rightmost bit in that byte.
239 offset_r = (l + content_width) % 8;
240 }
241
242 UInt8 ALWAYS_INLINE read(UInt8 value_l) const
243 {
244 /// The cell completely fits into one byte.
245 return (value_l >> offset_l) & ((1 << content_width) - 1);
246 }
247
248 UInt8 ALWAYS_INLINE read(UInt8 value_l, UInt8 value_r) const
249 {
250 /// The cell overlaps two bytes.
251 return ((value_l >> offset_l) & ((1 << (8 - offset_l)) - 1))
252 | ((value_r & ((1 << offset_r) - 1)) << (8 - offset_l));
253 }
254
255private:
256 size_t index_l;
257 size_t offset_l;
258 size_t index_r;
259 size_t offset_r;
260
261 UInt8 * content_l;
262 UInt8 * content_r;
263
264 /// Checks
265 static_assert((content_width > 0) && (content_width < 8), "Invalid parameter value");
266 static_assert(bucket_count <= (std::numeric_limits<size_t>::max() / content_width), "Invalid parameter value");
267};
268
269}
270
271