1 | /** |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, software |
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | * See the License for the specific language governing permissions and |
16 | * limitations under the License. |
17 | */ |
18 | |
19 | #ifndef ORC_READER_IMPL_HH |
20 | #define ORC_READER_IMPL_HH |
21 | |
22 | #include "orc/Int128.hh" |
23 | #include "orc/OrcFile.hh" |
24 | #include "orc/Reader.hh" |
25 | |
26 | #include "ColumnReader.hh" |
27 | #include "orc/Exceptions.hh" |
28 | #include "RLE.hh" |
29 | #include "TypeImpl.hh" |
30 | |
31 | namespace orc { |
32 | |
// Size guess of 16 KiB (16 * 1024 bytes).
// NOTE(review): presumably the number of bytes speculatively read from the
// file tail; the use site is not visible in this header -- confirm.
static const uint64_t DIRECTORY_SIZE_GUESS = 16384;
34 | |
35 | /** |
36 | * WriterVersion Implementation |
37 | */ |
38 | class WriterVersionImpl { |
39 | private: |
40 | WriterVersion version; |
41 | public: |
42 | // Known Versions with issues resolved |
43 | // The static method below is to fix global constructors Clang warning |
44 | static const WriterVersionImpl& VERSION_HIVE_8732(); |
45 | |
46 | WriterVersionImpl(WriterVersion ver) : version(ver) {} |
47 | |
48 | bool compareGT(const WriterVersion other) const { |
49 | return version > other; |
50 | } |
51 | }; |
52 | |
53 | /** |
54 | * State shared between Reader and Row Reader |
55 | */ |
56 | struct FileContents { |
57 | std::unique_ptr<InputStream> stream; |
58 | std::unique_ptr<proto::PostScript> postscript; |
59 | std::unique_ptr<proto::Footer> ; |
60 | std::unique_ptr<Type> schema; |
61 | uint64_t blockSize; |
62 | CompressionKind compression; |
63 | MemoryPool *pool; |
64 | std::ostream *errorStream; |
65 | }; |
66 | |
67 | proto::StripeFooter (const proto::StripeInformation& info, |
68 | const FileContents& contents); |
69 | |
70 | class ReaderImpl; |
71 | |
72 | class ColumnSelector { |
73 | private: |
74 | std::map<std::string, uint64_t> nameIdMap; |
75 | std::map<uint64_t, const Type*> idTypeMap; |
76 | const FileContents* contents; |
77 | std::vector<std::string> columns; |
78 | |
79 | // build map from type name and id, id to Type |
80 | void buildTypeNameIdMap(const Type* type); |
81 | std::string toDotColumnPath(); |
82 | |
83 | public: |
84 | // Select a field by name |
85 | void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); |
86 | // Select a field by id |
87 | void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); |
88 | // Select a type by id |
89 | void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); |
90 | |
91 | // Select all of the recursive children of the given type. |
92 | void selectChildren(std::vector<bool>& selectedColumns, const Type& type); |
93 | |
94 | // For each child of type, select it if one of its children |
95 | // is selected. |
96 | bool selectParents(std::vector<bool>& selectedColumns, const Type& type); |
97 | /** |
98 | * Constructor that selects columns. |
99 | * @param contents of the file |
100 | */ |
101 | ColumnSelector(const FileContents* contents); |
102 | |
103 | // Select the columns from the RowReaderoptions object |
104 | void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); |
105 | |
106 | // Select the columns from the Readeroptions object |
107 | void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); |
108 | }; |
109 | |
110 | |
111 | class RowReaderImpl : public RowReader { |
112 | private: |
113 | const Timezone& localTimezone; |
114 | |
115 | // contents |
116 | std::shared_ptr<FileContents> contents; |
117 | const bool throwOnHive11DecimalOverflow; |
118 | const int32_t forcedScaleOnHive11Decimal; |
119 | |
120 | // inputs |
121 | std::vector<bool> selectedColumns; |
122 | |
123 | // footer |
124 | proto::Footer* ; |
125 | DataBuffer<uint64_t> firstRowOfStripe; |
126 | mutable std::unique_ptr<Type> selectedSchema; |
127 | |
128 | // reading state |
129 | uint64_t previousRow; |
130 | uint64_t firstStripe; |
131 | uint64_t currentStripe; |
132 | uint64_t lastStripe; // the stripe AFTER the last one |
133 | uint64_t currentRowInStripe; |
134 | uint64_t rowsInCurrentStripe; |
135 | proto::StripeInformation currentStripeInfo; |
136 | proto::StripeFooter ; |
137 | std::unique_ptr<ColumnReader> reader; |
138 | |
139 | // internal methods |
140 | void startNextStripe(); |
141 | |
142 | public: |
143 | /** |
144 | * Constructor that lets the user specify additional options. |
145 | * @param contents of the file |
146 | * @param options options for reading |
147 | */ |
148 | RowReaderImpl(std::shared_ptr<FileContents> contents, |
149 | const RowReaderOptions& options); |
150 | |
151 | // Select the columns from the options object |
152 | void updateSelected(); |
153 | const std::vector<bool> getSelectedColumns() const override; |
154 | |
155 | const Type& getSelectedType() const override; |
156 | |
157 | std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size |
158 | ) const override; |
159 | |
160 | bool next(ColumnVectorBatch& data) override; |
161 | |
162 | CompressionKind getCompression() const; |
163 | |
164 | uint64_t getCompressionSize() const; |
165 | |
166 | uint64_t getRowNumber() const override; |
167 | |
168 | void seekToRow(uint64_t rowNumber) override; |
169 | |
170 | const FileContents& getFileContents() const; |
171 | bool getThrowOnHive11DecimalOverflow() const; |
172 | int32_t getForcedScaleOnHive11Decimal() const; |
173 | }; |
174 | |
175 | class ReaderImpl : public Reader { |
176 | private: |
177 | // FileContents |
178 | std::shared_ptr<FileContents> contents; |
179 | |
180 | // inputs |
181 | const ReaderOptions options; |
182 | const uint64_t fileLength; |
183 | const uint64_t postscriptLength; |
184 | |
185 | // footer |
186 | proto::Footer* ; |
187 | uint64_t numberOfStripes; |
188 | uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); |
189 | |
190 | // internal methods |
191 | void readMetadata() const; |
192 | void checkOrcVersion(); |
193 | void getRowIndexStatistics(uint64_t stripeOffset, |
194 | const proto::StripeFooter& , |
195 | std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; |
196 | |
197 | // metadata |
198 | mutable std::unique_ptr<proto::Metadata> metadata; |
199 | mutable bool isMetadataLoaded; |
200 | public: |
201 | /** |
202 | * Constructor that lets the user specify additional options. |
203 | * @param contents of the file |
204 | * @param options options for reading |
205 | * @param fileLength the length of the file in bytes |
206 | * @param postscriptLength the length of the postscript in bytes |
207 | */ |
208 | ReaderImpl(std::shared_ptr<FileContents> contents, |
209 | const ReaderOptions& options, |
210 | uint64_t fileLength, |
211 | uint64_t postscriptLength); |
212 | |
213 | const ReaderOptions& getReaderOptions() const; |
214 | |
215 | CompressionKind getCompression() const override; |
216 | |
217 | FileVersion getFormatVersion() const override; |
218 | |
219 | WriterId getWriterId() const override; |
220 | |
221 | uint32_t getWriterIdValue() const override; |
222 | |
223 | WriterVersion getWriterVersion() const override; |
224 | |
225 | uint64_t getNumberOfRows() const override; |
226 | |
227 | uint64_t getRowIndexStride() const override; |
228 | |
229 | std::list<std::string> getMetadataKeys() const override; |
230 | |
231 | std::string getMetadataValue(const std::string& key) const override; |
232 | |
233 | bool hasMetadataValue(const std::string& key) const override; |
234 | |
235 | uint64_t getCompressionSize() const override; |
236 | |
237 | uint64_t getNumberOfStripes() const override; |
238 | |
239 | std::unique_ptr<StripeInformation> getStripe(uint64_t |
240 | ) const override; |
241 | |
242 | uint64_t getNumberOfStripeStatistics() const override; |
243 | |
244 | const std::string& getStreamName() const override; |
245 | |
246 | std::unique_ptr<StripeStatistics> |
247 | getStripeStatistics(uint64_t stripeIndex) const override; |
248 | |
249 | std::unique_ptr<RowReader> createRowReader() const override; |
250 | |
251 | std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options |
252 | ) const override; |
253 | |
254 | uint64_t getContentLength() const override; |
255 | uint64_t getStripeStatisticsLength() const override; |
256 | uint64_t () const override; |
257 | uint64_t getFilePostscriptLength() const override; |
258 | uint64_t getFileLength() const override; |
259 | |
260 | std::unique_ptr<Statistics> getStatistics() const override; |
261 | |
262 | std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId |
263 | ) const override; |
264 | |
265 | std::string getSerializedFileTail() const override; |
266 | |
267 | const Type& getType() const override; |
268 | |
269 | bool hasCorrectStatistics() const override; |
270 | |
271 | const proto::PostScript* getPostscript() const {return contents->postscript.get();} |
272 | |
273 | uint64_t getBlockSize() const {return contents->blockSize;} |
274 | |
275 | const proto::Footer* () const {return contents->footer.get();} |
276 | |
277 | const Type* getSchema() const {return contents->schema.get();} |
278 | |
279 | InputStream* getStream() const {return contents->stream.get();} |
280 | |
281 | uint64_t getMemoryUse(int stripeIx = -1) override; |
282 | |
283 | uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; |
284 | |
285 | uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; |
286 | |
287 | uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; |
288 | }; |
289 | |
290 | }// namespace |
291 | |
292 | #endif |
293 | |