1/**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#ifndef ORC_READER_IMPL_HH
20#define ORC_READER_IMPL_HH
21
22#include "orc/Int128.hh"
23#include "orc/OrcFile.hh"
24#include "orc/Reader.hh"
25
26#include "ColumnReader.hh"
27#include "orc/Exceptions.hh"
28#include "RLE.hh"
29#include "TypeImpl.hh"
30
31namespace orc {
32
 // Evaluates to 16 * 1024 = 16384.
 // NOTE(review): presumably a byte count used as the initial guess for how
 // much of the file tail to read -- confirm against the users of this
 // constant, which are not visible in this header.
 static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
34
35 /**
36 * WriterVersion Implementation
37 */
38 class WriterVersionImpl {
39 private:
40 WriterVersion version;
41 public:
42 // Known Versions with issues resolved
43 // The static method below is to fix global constructors Clang warning
44 static const WriterVersionImpl& VERSION_HIVE_8732();
45
46 WriterVersionImpl(WriterVersion ver) : version(ver) {}
47
48 bool compareGT(const WriterVersion other) const {
49 return version > other;
50 }
51 };
52
53 /**
54 * State shared between Reader and Row Reader
55 */
56 struct FileContents {
57 std::unique_ptr<InputStream> stream;
58 std::unique_ptr<proto::PostScript> postscript;
59 std::unique_ptr<proto::Footer> footer;
60 std::unique_ptr<Type> schema;
61 uint64_t blockSize;
62 CompressionKind compression;
63 MemoryPool *pool;
64 std::ostream *errorStream;
65 };
66
 /**
  * Obtain the StripeFooter message for one stripe.
  * NOTE(review): the definition is not in this header; presumably it reads
  * and decodes the footer from contents.stream -- confirm in the
  * implementation file.
  * @param info the stripe whose footer is requested
  * @param contents the shared file state
  * @return the stripe footer, returned by value
  */
 proto::StripeFooter getStripeFooter(const proto::StripeInformation& info,
 const FileContents& contents);
69
70 class ReaderImpl;
71
72 class ColumnSelector {
73 private:
74 std::map<std::string, uint64_t> nameIdMap;
75 std::map<uint64_t, const Type*> idTypeMap;
76 const FileContents* contents;
77 std::vector<std::string> columns;
78
79 // build map from type name and id, id to Type
80 void buildTypeNameIdMap(const Type* type);
81 std::string toDotColumnPath();
82
83 public:
84 // Select a field by name
85 void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name);
86 // Select a field by id
87 void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId);
88 // Select a type by id
89 void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId);
90
91 // Select all of the recursive children of the given type.
92 void selectChildren(std::vector<bool>& selectedColumns, const Type& type);
93
94 // For each child of type, select it if one of its children
95 // is selected.
96 bool selectParents(std::vector<bool>& selectedColumns, const Type& type);
97 /**
98 * Constructor that selects columns.
99 * @param contents of the file
100 */
101 ColumnSelector(const FileContents* contents);
102
103 // Select the columns from the RowReaderoptions object
104 void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options);
105
106 // Select the columns from the Readeroptions object
107 void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options);
108 };
109
110
111 class RowReaderImpl : public RowReader {
112 private:
113 const Timezone& localTimezone;
114
115 // contents
116 std::shared_ptr<FileContents> contents;
117 const bool throwOnHive11DecimalOverflow;
118 const int32_t forcedScaleOnHive11Decimal;
119
120 // inputs
121 std::vector<bool> selectedColumns;
122
123 // footer
124 proto::Footer* footer;
125 DataBuffer<uint64_t> firstRowOfStripe;
126 mutable std::unique_ptr<Type> selectedSchema;
127
128 // reading state
129 uint64_t previousRow;
130 uint64_t firstStripe;
131 uint64_t currentStripe;
132 uint64_t lastStripe; // the stripe AFTER the last one
133 uint64_t currentRowInStripe;
134 uint64_t rowsInCurrentStripe;
135 proto::StripeInformation currentStripeInfo;
136 proto::StripeFooter currentStripeFooter;
137 std::unique_ptr<ColumnReader> reader;
138
139 // internal methods
140 void startNextStripe();
141
142 public:
143 /**
144 * Constructor that lets the user specify additional options.
145 * @param contents of the file
146 * @param options options for reading
147 */
148 RowReaderImpl(std::shared_ptr<FileContents> contents,
149 const RowReaderOptions& options);
150
151 // Select the columns from the options object
152 void updateSelected();
153 const std::vector<bool> getSelectedColumns() const override;
154
155 const Type& getSelectedType() const override;
156
157 std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size
158 ) const override;
159
160 bool next(ColumnVectorBatch& data) override;
161
162 CompressionKind getCompression() const;
163
164 uint64_t getCompressionSize() const;
165
166 uint64_t getRowNumber() const override;
167
168 void seekToRow(uint64_t rowNumber) override;
169
170 const FileContents& getFileContents() const;
171 bool getThrowOnHive11DecimalOverflow() const;
172 int32_t getForcedScaleOnHive11Decimal() const;
173 };
174
175 class ReaderImpl : public Reader {
176 private:
177 // FileContents
178 std::shared_ptr<FileContents> contents;
179
180 // inputs
181 const ReaderOptions options;
182 const uint64_t fileLength;
183 const uint64_t postscriptLength;
184
185 // footer
186 proto::Footer* footer;
187 uint64_t numberOfStripes;
188 uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns);
189
190 // internal methods
191 void readMetadata() const;
192 void checkOrcVersion();
193 void getRowIndexStatistics(uint64_t stripeOffset,
194 const proto::StripeFooter& currentStripeFooter,
195 std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const;
196
197 // metadata
198 mutable std::unique_ptr<proto::Metadata> metadata;
199 mutable bool isMetadataLoaded;
200 public:
201 /**
202 * Constructor that lets the user specify additional options.
203 * @param contents of the file
204 * @param options options for reading
205 * @param fileLength the length of the file in bytes
206 * @param postscriptLength the length of the postscript in bytes
207 */
208 ReaderImpl(std::shared_ptr<FileContents> contents,
209 const ReaderOptions& options,
210 uint64_t fileLength,
211 uint64_t postscriptLength);
212
213 const ReaderOptions& getReaderOptions() const;
214
215 CompressionKind getCompression() const override;
216
217 FileVersion getFormatVersion() const override;
218
219 WriterId getWriterId() const override;
220
221 uint32_t getWriterIdValue() const override;
222
223 WriterVersion getWriterVersion() const override;
224
225 uint64_t getNumberOfRows() const override;
226
227 uint64_t getRowIndexStride() const override;
228
229 std::list<std::string> getMetadataKeys() const override;
230
231 std::string getMetadataValue(const std::string& key) const override;
232
233 bool hasMetadataValue(const std::string& key) const override;
234
235 uint64_t getCompressionSize() const override;
236
237 uint64_t getNumberOfStripes() const override;
238
239 std::unique_ptr<StripeInformation> getStripe(uint64_t
240 ) const override;
241
242 uint64_t getNumberOfStripeStatistics() const override;
243
244 const std::string& getStreamName() const override;
245
246 std::unique_ptr<StripeStatistics>
247 getStripeStatistics(uint64_t stripeIndex) const override;
248
249 std::unique_ptr<RowReader> createRowReader() const override;
250
251 std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options
252 ) const override;
253
254 uint64_t getContentLength() const override;
255 uint64_t getStripeStatisticsLength() const override;
256 uint64_t getFileFooterLength() const override;
257 uint64_t getFilePostscriptLength() const override;
258 uint64_t getFileLength() const override;
259
260 std::unique_ptr<Statistics> getStatistics() const override;
261
262 std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId
263 ) const override;
264
265 std::string getSerializedFileTail() const override;
266
267 const Type& getType() const override;
268
269 bool hasCorrectStatistics() const override;
270
271 const proto::PostScript* getPostscript() const {return contents->postscript.get();}
272
273 uint64_t getBlockSize() const {return contents->blockSize;}
274
275 const proto::Footer* getFooter() const {return contents->footer.get();}
276
277 const Type* getSchema() const {return contents->schema.get();}
278
279 InputStream* getStream() const {return contents->stream.get();}
280
281 uint64_t getMemoryUse(int stripeIx = -1) override;
282
283 uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override;
284
285 uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override;
286
287 uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override;
288 };
289
290}// namespace
291
292#endif
293