1/**
2* Licensed to the Apache Software Foundation (ASF) under one
3* or more contributor license agreements. See the NOTICE file
4* distributed with this work for additional information
5* regarding copyright ownership. The ASF licenses this file
6* to you under the Apache License, Version 2.0 (the
7* "License"); you may not use this file except in compliance
8* with the License. You may obtain a copy of the License at
9*
10* http://www.apache.org/licenses/LICENSE-2.0
11*
12* Unless required by applicable law or agreed to in writing, software
13* distributed under the License is distributed on an "AS IS" BASIS,
14* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15* See the License for the specific language governing permissions and
16* limitations under the License.
17*/
18
19#ifndef ORC_RLEV2_HH
20#define ORC_RLEV2_HH
21
22#include "Adaptor.hh"
23#include "orc/Exceptions.hh"
24#include "RLE.hh"
25
26#include <vector>
27
28namespace orc {
29
30class RleDecoderV2 : public RleDecoder {
31public:
32
33 enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
34
35 RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
36 bool isSigned, MemoryPool& pool);
37
38 /**
39 * Seek to a particular spot.
40 */
41 void seek(PositionProvider&) override;
42
43 /**
44 * Seek over a given number of values.
45 */
46 void skip(uint64_t numValues) override;
47
48 /**
49 * Read a number of values into the batch.
50 */
51 void next(int64_t* data, uint64_t numValues,
52 const char* notNull) override;
53
54private:
55
56 // Used by PATCHED_BASE
57 void adjustGapAndPatch() {
58 curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
59 patchBitSize;
60 curPatch = unpackedPatch[patchIdx] & patchMask;
61 actualGap = 0;
62
63 // special case: gap is >255 then patch value will be 0.
64 // if gap is <=255 then patch value cannot be 0
65 while (curGap == 255 && curPatch == 0) {
66 actualGap += 255;
67 ++patchIdx;
68 curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
69 patchBitSize;
70 curPatch = unpackedPatch[patchIdx] & patchMask;
71 }
72 // add the left over gap
73 actualGap += curGap;
74 }
75
76 void resetReadLongs() {
77 bitsLeft = 0;
78 curByte = 0;
79 }
80
81 void resetRun() {
82 resetReadLongs();
83 bitSize = 0;
84 }
85
86 unsigned char readByte() {
87 if (bufferStart == bufferEnd) {
88 int bufferLength;
89 const void* bufferPointer;
90 if (!inputStream->Next(&bufferPointer, &bufferLength)) {
91 throw ParseError("bad read in RleDecoderV2::readByte");
92 }
93 bufferStart = static_cast<const char*>(bufferPointer);
94 bufferEnd = bufferStart + bufferLength;
95 }
96
97 unsigned char result = static_cast<unsigned char>(*bufferStart++);
98 return result;
99}
100
101 int64_t readLongBE(uint64_t bsz);
102 int64_t readVslong();
103 uint64_t readVulong();
104 uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
105 uint64_t fb, const char* notNull = nullptr) {
106 uint64_t ret = 0;
107
108 // TODO: unroll to improve performance
109 for(uint64_t i = offset; i < (offset + len); i++) {
110 // skip null positions
111 if (notNull && !notNull[i]) {
112 continue;
113 }
114 uint64_t result = 0;
115 uint64_t bitsLeftToRead = fb;
116 while (bitsLeftToRead > bitsLeft) {
117 result <<= bitsLeft;
118 result |= curByte & ((1 << bitsLeft) - 1);
119 bitsLeftToRead -= bitsLeft;
120 curByte = readByte();
121 bitsLeft = 8;
122 }
123
124 // handle the left over bits
125 if (bitsLeftToRead > 0) {
126 result <<= bitsLeftToRead;
127 bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
128 result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
129 }
130 data[i] = static_cast<int64_t>(result);
131 ++ret;
132 }
133
134 return ret;
135}
136
137
138 uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
139 const char* notNull);
140 uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
141 const char* notNull);
142 uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
143 const char* notNull);
144 uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
145 const char* notNull);
146
147 const std::unique_ptr<SeekableInputStream> inputStream;
148 const bool isSigned;
149
150 unsigned char firstByte;
151 uint64_t runLength;
152 uint64_t runRead;
153 const char *bufferStart;
154 const char *bufferEnd;
155 int64_t deltaBase; // Used by DELTA
156 uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE
157 int64_t firstValue; // Used by SHORT_REPEAT and DELTA
158 int64_t prevValue; // Used by DELTA
159 uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
160 uint32_t bitsLeft; // Used by anything that uses readLongs
161 uint32_t curByte; // Used by anything that uses readLongs
162 uint32_t patchBitSize; // Used by PATCHED_BASE
163 uint64_t unpackedIdx; // Used by PATCHED_BASE
164 uint64_t patchIdx; // Used by PATCHED_BASE
165 int64_t base; // Used by PATCHED_BASE
166 uint64_t curGap; // Used by PATCHED_BASE
167 int64_t curPatch; // Used by PATCHED_BASE
168 int64_t patchMask; // Used by PATCHED_BASE
169 int64_t actualGap; // Used by PATCHED_BASE
170 DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
171 DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
172};
173} // namespace orc
174
175#endif // ORC_RLEV2_HH
176