RLEv1.cc source code [ClickHouse/contrib/orc/c++/src/RLEv1.cc]

1	/**
2	* Licensed to the Apache Software Foundation (ASF) under one
3	* or more contributor license agreements. See the NOTICE file
4	* distributed with this work for additional information
5	* regarding copyright ownership. The ASF licenses this file
6	* to you under the Apache License, Version 2.0 (the
7	* "License"); you may not use this file except in compliance
8	* with the License. You may obtain a copy of the License at
9	*
10	* http://www.apache.org/licenses/LICENSE-2.0
11	*
12	* Unless required by applicable law or agreed to in writing, software
13	* distributed under the License is distributed on an "AS IS" BASIS,
14	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	* See the License for the specific language governing permissions and
16	* limitations under the License.
17	*/
18
19	#include "Adaptor.hh"
20	#include "Compression.hh"
21	#include "orc/Exceptions.hh"
22	#include "RLEv1.hh"
23
24	#include <algorithm>
25
26	namespace orc {
27
// Run-length limits. writeValues() stores (run length - MINIMUM_REPEAT) in a
// single header byte, so the longest run is MINIMUM_REPEAT + 127 values.
constexpr int MINIMUM_REPEAT = 3;
constexpr int MAXIMUM_REPEAT = MINIMUM_REPEAT + 127;

// Selects the seven payload bits of one base-128 varint byte.
constexpr int64_t BASE_128_MASK = 0x7f;

// A run's delta is written as one signed byte, hence this range.
constexpr int MIN_DELTA = -128;
constexpr int MAX_DELTA = 127;

// Capacity of the encoder's literal buffer.
constexpr int MAX_LITERAL_SIZE = 128;
36
37	RleEncoderV1::RleEncoderV1(
38	std::unique_ptr<BufferedOutputStream> outStream,
39	bool hasSigned):
40	outputStream (std::move(outStream)) {
41	isSigned = hasSigned;
42	literals = new int64_t[MAX_LITERAL_SIZE];
43	numLiterals = `0`;
44	delta = `0`;
45	repeat = false;
46	tailRunLength = `0`;
47	bufferPosition = `0`;
48	bufferLength = `0`;
49	buffer = nullptr;
50	}
51
52	RleEncoderV1::~RleEncoderV1() {
53	delete [] literals;
54	}
55
56	void RleEncoderV1::add(const int64_t* data, uint64_t numValues,
57	const char* notNull) {
58	for (uint64_t i = `0`; i < numValues; ++i) {
59	if (!notNull \|\| notNull[i]) {
60	write(data[i]);
61	}
62	}
63	}
64
65	void RleEncoderV1::writeByte(char c) {
66	if (bufferPosition == bufferLength) {
67	int addedSize = `0`;
68	if (!outputStream ->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
69	throw std::bad_alloc ();
70	}
71	bufferPosition = `0`;
72	bufferLength = addedSize;
73	}
74	buffer[bufferPosition++] = c;
75	}
76
77	void RleEncoderV1::writeValues() {
78	if (numLiterals != `0`) {
79	if (repeat) {
80	writeByte(static_cast<char>
81	(static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT));
82	writeByte(static_cast<char>(delta));
83	if (isSigned) {
84	writeVslong(literals[`0`]);
85	} else {
86	writeVulong(literals[`0`]);
87	}
88	} else {
89	writeByte(static_cast<char>(-numLiterals));
90	for(int i=`0`; i < numLiterals; ++i) {
91	if (isSigned) {
92	writeVslong(literals[i]);
93	} else {
94	writeVulong(literals[i]);
95	}
96	}
97	}
98	repeat = false;
99	numLiterals = `0`;
100	tailRunLength = `0`;
101	}
102	}
103
104	uint64_t RleEncoderV1::flush() {
105	writeValues();
106	outputStream ->BackUp(bufferLength - bufferPosition);
107	uint64_t dataSize = outputStream ->flush();
108	bufferLength = bufferPosition = `0`;
109	return dataSize;
110	}
111
112	void RleEncoderV1::write(int64_t value) {
113	if (numLiterals == `0`) {
114	literals[numLiterals++] = value;
115	tailRunLength = `1`;
116	} else if (repeat) {
117	if (value == literals[`0`] + delta * numLiterals) {
118	numLiterals += `1`;
119	if (numLiterals == MAXIMUM_REPEAT) {
120	writeValues();
121	}
122	} else {
123	writeValues();
124	literals[numLiterals++] = value;
125	tailRunLength = `1`;
126	}
127	} else {
128	if (tailRunLength == `1`) {
129	delta = value - literals[numLiterals - `1`];
130	if (delta < MIN_DELTA \|\| delta > MAX_DELTA) {
131	tailRunLength = `1`;
132	} else {
133	tailRunLength = `2`;
134	}
135	} else if (value == literals[numLiterals - `1`] + delta) {
136	tailRunLength += `1`;
137	} else {
138	delta = value - literals[numLiterals - `1`];
139	if (delta < MIN_DELTA \|\| delta > MAX_DELTA) {
140	tailRunLength = `1`;
141	} else {
142	tailRunLength = `2`;
143	}
144	}
145	if (tailRunLength == MINIMUM_REPEAT) {
146	if (numLiterals + `1` == MINIMUM_REPEAT) {
147	repeat = true;
148	numLiterals += `1`;
149	} else {
150	numLiterals -= static_cast<int>(MINIMUM_REPEAT - `1`);
151	int64_t base = literals[numLiterals];
152	writeValues();
153	literals[`0`] = base;
154	repeat = true;
155	numLiterals = MINIMUM_REPEAT;
156	}
157	} else {
158	literals[numLiterals++] = value;
159	if (numLiterals == MAX_LITERAL_SIZE) {
160	writeValues();
161	}
162	}
163	}
164	}
165
166	void RleEncoderV1::writeVslong(int64_t val) {
167	writeVulong((val << `1`) ^ (val >> `63`));
168	}
169
170	void RleEncoderV1::writeVulong(int64_t val) {
171	while (true) {
172	if ((val & ~BASE_128_MASK) == `0`) {
173	writeByte(static_cast<char>(val));
174	return;
175	} else {
176	writeByte(static_cast<char>(`0x80` \| (val & BASE_128_MASK)));
177	// cast val to unsigned so as to force 0-fill right shift
178	val = (static_cast<uint64_t>(val) >> `7`);
179	}
180	}
181	}
182
183	void RleEncoderV1::recordPosition(PositionRecorder* recorder) const {
184	uint64_t flushedSize = outputStream ->getSize();
185	uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
186	if (outputStream ->isCompressed()) {
187	recorder->add(flushedSize);
188	recorder->add(unflushedSize);
189	} else {
190	flushedSize -= static_cast<uint64_t>(bufferLength);
191	recorder->add(flushedSize + unflushedSize);
192	}
193	recorder->add(static_cast<uint64_t>(numLiterals));
194	}
195
196	signed char RleDecoderV1::readByte() {
197	if (bufferStart == bufferEnd) {
198	int bufferLength;
199	const void* bufferPointer;
200	if (!inputStream ->Next(&bufferPointer, &bufferLength)) {
201	throw ParseError ("bad read in readByte");
202	}
203	bufferStart = static_cast<const char*>(bufferPointer);
204	bufferEnd = bufferStart + bufferLength;
205	}
206	return *(bufferStart++);
207	}
208
209	uint64_t RleDecoderV1::readLong() {
210	uint64_t result = `0`;
211	int64_t offset = `0`;
212	signed char ch = readByte();
213	if (ch >= `0`) {
214	result = static_cast<uint64_t>(ch);
215	} else {
216	result = static_cast<uint64_t>(ch) & BASE_128_MASK;
217	while ((ch = readByte()) < `0`) {
218	offset += `7`;
219	result \|= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
220	}
221	result \|= static_cast<uint64_t>(ch) << (offset + `7`);
222	}
223	return result;
224	}
225
226	void RleDecoderV1::skipLongs(uint64_t numValues) {
227	while (numValues > `0`) {
228	if (readByte() >= `0`) {
229	--numValues;
230	}
231	}
232	}
233
234	void RleDecoderV1::readHeader() {
235	signed char ch = readByte();
236	if (ch < `0`) {
237	remainingValues = static_cast<uint64_t>(-ch);
238	repeating = false;
239	} else {
240	remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
241	repeating = true;
242	delta = readByte();
243	value = isSigned
244	? unZigZag(readLong())
245	: static_cast<int64_t>(readLong());
246	}
247	}
248
249	RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
250	bool hasSigned)
251	: inputStream (std::move(input)),
252	isSigned(hasSigned),
253	remainingValues(`0`),
254	value(`0`),
255	bufferStart(nullptr),
256	bufferEnd(bufferStart),
257	delta(`0`),
258	repeating(false) {
259	}
260
261	void RleDecoderV1::seek(PositionProvider& location) {
262	// move the input stream
263	inputStream ->seek(location);
264	// force a re-read from the stream
265	bufferEnd = bufferStart;
266	// read a new header
267	readHeader();
268	// skip ahead the given number of records
269	skip(location.next());
270	}
271
272	void RleDecoderV1::skip(uint64_t numValues) {
273	while (numValues > `0`) {
274	if (remainingValues == `0`) {
275	readHeader();
276	}
277	uint64_t count = std::min(numValues, remainingValues);
278	remainingValues -= count;
279	numValues -= count;
280	if (repeating) {
281	value += delta * static_cast<int64_t>(count);
282	} else {
283	skipLongs(count);
284	}
285	}
286	}
287
288	void RleDecoderV1::next(int64_t* const data,
289	const uint64_t numValues,
290	const char* const notNull) {
291	uint64_t position = `0`;
292	// skipNulls()
293	if (notNull) {
294	// Skip over null values.
295	while (position < numValues && !notNull[position]) {
296	++position;
297	}
298	}
299	while (position < numValues) {
300	// If we are out of values, read more.
301	if (remainingValues == `0`) {
302	readHeader();
303	}
304	// How many do we read out of this block?
305	uint64_t count = std::min(numValues - position, remainingValues);
306	uint64_t consumed = `0`;
307	if (repeating) {
308	if (notNull) {
309	for (uint64_t i = `0`; i < count; ++i) {
310	if (notNull[position + i]) {
311	data[position + i] = value + static_cast<int64_t>(consumed) * delta;
312	consumed += `1`;
313	}
314	}
315	} else {
316	for (uint64_t i = `0`; i < count; ++i) {
317	data[position + i] = value + static_cast<int64_t>(i) * delta;
318	}
319	consumed = count;
320	}
321	value += static_cast<int64_t>(consumed) * delta;
322	} else {
323	if (notNull) {
324	for (uint64_t i = `0` ; i < count; ++i) {
325	if (notNull[position + i]) {
326	data[position + i] = isSigned
327	? unZigZag(readLong())
328	: static_cast<int64_t>(readLong());
329	++consumed;
330	}
331	}
332	} else {
333	if (isSigned) {
334	for (uint64_t i = `0`; i < count; ++i) {
335	data[position + i] = unZigZag(readLong());
336	}
337	} else {
338	for (uint64_t i = `0`; i < count; ++i) {
339	data[position + i] = static_cast<int64_t>(readLong());
340	}
341	}
342	consumed = count;
343	}
344	}
345	remainingValues -= consumed;
346	position += count;
347
348	// skipNulls()
349	if (notNull) {
350	// Skip over null values.
351	while (position < numValues && !notNull[position]) {
352	++position;
353	}
354	}
355	}
356	}
357
358	} // namespace orc
359

Browse the source code of ClickHouse/contrib/orc/c++/src/RLEv1.cc