| 1 | /** |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | * or more contributor license agreements. See the NOTICE file |
| 4 | * distributed with this work for additional information |
| 5 | * regarding copyright ownership. The ASF licenses this file |
| 6 | * to you under the Apache License, Version 2.0 (the |
| 7 | * "License"); you may not use this file except in compliance |
| 8 | * with the License. You may obtain a copy of the License at |
| 9 | * |
| 10 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | * |
| 12 | * Unless required by applicable law or agreed to in writing, software |
| 13 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | * See the License for the specific language governing permissions and |
| 16 | * limitations under the License. |
| 17 | */ |
| 18 | |
| 19 | #ifndef ORC_READER_IMPL_HH |
| 20 | #define ORC_READER_IMPL_HH |
| 21 | |
| 22 | #include "orc/Int128.hh" |
| 23 | #include "orc/OrcFile.hh" |
| 24 | #include "orc/Reader.hh" |
| 25 | |
| 26 | #include "ColumnReader.hh" |
| 27 | #include "orc/Exceptions.hh" |
| 28 | #include "RLE.hh" |
| 29 | #include "TypeImpl.hh" |
| 30 | |
| 31 | namespace orc { |
| 32 | |
// 16 KiB (16 * 1024).
// NOTE(review): presumably a size guess in bytes for reading the file's
// directory/tail; its use is not visible in this header -- confirm.
static const uint64_t DIRECTORY_SIZE_GUESS = 16384;
| 34 | |
| 35 | /** |
| 36 | * WriterVersion Implementation |
| 37 | */ |
| 38 | class WriterVersionImpl { |
| 39 | private: |
| 40 | WriterVersion version; |
| 41 | public: |
| 42 | // Known Versions with issues resolved |
| 43 | // The static method below is to fix global constructors Clang warning |
| 44 | static const WriterVersionImpl& VERSION_HIVE_8732(); |
| 45 | |
| 46 | WriterVersionImpl(WriterVersion ver) : version(ver) {} |
| 47 | |
| 48 | bool compareGT(const WriterVersion other) const { |
| 49 | return version > other; |
| 50 | } |
| 51 | }; |
| 52 | |
| 53 | /** |
| 54 | * State shared between Reader and Row Reader |
| 55 | */ |
| 56 | struct FileContents { |
| 57 | std::unique_ptr<InputStream> stream; |
| 58 | std::unique_ptr<proto::PostScript> postscript; |
| 59 | std::unique_ptr<proto::Footer> ; |
| 60 | std::unique_ptr<Type> schema; |
| 61 | uint64_t blockSize; |
| 62 | CompressionKind compression; |
| 63 | MemoryPool *pool; |
| 64 | std::ostream *errorStream; |
| 65 | }; |
| 66 | |
| 67 | proto::StripeFooter (const proto::StripeInformation& info, |
| 68 | const FileContents& contents); |
| 69 | |
| 70 | class ReaderImpl; |
| 71 | |
| 72 | class ColumnSelector { |
| 73 | private: |
| 74 | std::map<std::string, uint64_t> nameIdMap; |
| 75 | std::map<uint64_t, const Type*> idTypeMap; |
| 76 | const FileContents* contents; |
| 77 | std::vector<std::string> columns; |
| 78 | |
| 79 | // build map from type name and id, id to Type |
| 80 | void buildTypeNameIdMap(const Type* type); |
| 81 | std::string toDotColumnPath(); |
| 82 | |
| 83 | public: |
| 84 | // Select a field by name |
| 85 | void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); |
| 86 | // Select a field by id |
| 87 | void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); |
| 88 | // Select a type by id |
| 89 | void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); |
| 90 | |
| 91 | // Select all of the recursive children of the given type. |
| 92 | void selectChildren(std::vector<bool>& selectedColumns, const Type& type); |
| 93 | |
| 94 | // For each child of type, select it if one of its children |
| 95 | // is selected. |
| 96 | bool selectParents(std::vector<bool>& selectedColumns, const Type& type); |
| 97 | /** |
| 98 | * Constructor that selects columns. |
| 99 | * @param contents of the file |
| 100 | */ |
| 101 | ColumnSelector(const FileContents* contents); |
| 102 | |
| 103 | // Select the columns from the RowReaderoptions object |
| 104 | void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); |
| 105 | |
| 106 | // Select the columns from the Readeroptions object |
| 107 | void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); |
| 108 | }; |
| 109 | |
| 110 | |
| 111 | class RowReaderImpl : public RowReader { |
| 112 | private: |
| 113 | const Timezone& localTimezone; |
| 114 | |
| 115 | // contents |
| 116 | std::shared_ptr<FileContents> contents; |
| 117 | const bool throwOnHive11DecimalOverflow; |
| 118 | const int32_t forcedScaleOnHive11Decimal; |
| 119 | |
| 120 | // inputs |
| 121 | std::vector<bool> selectedColumns; |
| 122 | |
| 123 | // footer |
| 124 | proto::Footer* ; |
| 125 | DataBuffer<uint64_t> firstRowOfStripe; |
| 126 | mutable std::unique_ptr<Type> selectedSchema; |
| 127 | |
| 128 | // reading state |
| 129 | uint64_t previousRow; |
| 130 | uint64_t firstStripe; |
| 131 | uint64_t currentStripe; |
| 132 | uint64_t lastStripe; // the stripe AFTER the last one |
| 133 | uint64_t currentRowInStripe; |
| 134 | uint64_t rowsInCurrentStripe; |
| 135 | proto::StripeInformation currentStripeInfo; |
| 136 | proto::StripeFooter ; |
| 137 | std::unique_ptr<ColumnReader> reader; |
| 138 | |
| 139 | // internal methods |
| 140 | void startNextStripe(); |
| 141 | |
| 142 | public: |
| 143 | /** |
| 144 | * Constructor that lets the user specify additional options. |
| 145 | * @param contents of the file |
| 146 | * @param options options for reading |
| 147 | */ |
| 148 | RowReaderImpl(std::shared_ptr<FileContents> contents, |
| 149 | const RowReaderOptions& options); |
| 150 | |
| 151 | // Select the columns from the options object |
| 152 | void updateSelected(); |
| 153 | const std::vector<bool> getSelectedColumns() const override; |
| 154 | |
| 155 | const Type& getSelectedType() const override; |
| 156 | |
| 157 | std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size |
| 158 | ) const override; |
| 159 | |
| 160 | bool next(ColumnVectorBatch& data) override; |
| 161 | |
| 162 | CompressionKind getCompression() const; |
| 163 | |
| 164 | uint64_t getCompressionSize() const; |
| 165 | |
| 166 | uint64_t getRowNumber() const override; |
| 167 | |
| 168 | void seekToRow(uint64_t rowNumber) override; |
| 169 | |
| 170 | const FileContents& getFileContents() const; |
| 171 | bool getThrowOnHive11DecimalOverflow() const; |
| 172 | int32_t getForcedScaleOnHive11Decimal() const; |
| 173 | }; |
| 174 | |
| 175 | class ReaderImpl : public Reader { |
| 176 | private: |
| 177 | // FileContents |
| 178 | std::shared_ptr<FileContents> contents; |
| 179 | |
| 180 | // inputs |
| 181 | const ReaderOptions options; |
| 182 | const uint64_t fileLength; |
| 183 | const uint64_t postscriptLength; |
| 184 | |
| 185 | // footer |
| 186 | proto::Footer* ; |
| 187 | uint64_t numberOfStripes; |
| 188 | uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); |
| 189 | |
| 190 | // internal methods |
| 191 | void readMetadata() const; |
| 192 | void checkOrcVersion(); |
| 193 | void getRowIndexStatistics(uint64_t stripeOffset, |
| 194 | const proto::StripeFooter& , |
| 195 | std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; |
| 196 | |
| 197 | // metadata |
| 198 | mutable std::unique_ptr<proto::Metadata> metadata; |
| 199 | mutable bool isMetadataLoaded; |
| 200 | public: |
| 201 | /** |
| 202 | * Constructor that lets the user specify additional options. |
| 203 | * @param contents of the file |
| 204 | * @param options options for reading |
| 205 | * @param fileLength the length of the file in bytes |
| 206 | * @param postscriptLength the length of the postscript in bytes |
| 207 | */ |
| 208 | ReaderImpl(std::shared_ptr<FileContents> contents, |
| 209 | const ReaderOptions& options, |
| 210 | uint64_t fileLength, |
| 211 | uint64_t postscriptLength); |
| 212 | |
| 213 | const ReaderOptions& getReaderOptions() const; |
| 214 | |
| 215 | CompressionKind getCompression() const override; |
| 216 | |
| 217 | FileVersion getFormatVersion() const override; |
| 218 | |
| 219 | WriterId getWriterId() const override; |
| 220 | |
| 221 | uint32_t getWriterIdValue() const override; |
| 222 | |
| 223 | WriterVersion getWriterVersion() const override; |
| 224 | |
| 225 | uint64_t getNumberOfRows() const override; |
| 226 | |
| 227 | uint64_t getRowIndexStride() const override; |
| 228 | |
| 229 | std::list<std::string> getMetadataKeys() const override; |
| 230 | |
| 231 | std::string getMetadataValue(const std::string& key) const override; |
| 232 | |
| 233 | bool hasMetadataValue(const std::string& key) const override; |
| 234 | |
| 235 | uint64_t getCompressionSize() const override; |
| 236 | |
| 237 | uint64_t getNumberOfStripes() const override; |
| 238 | |
| 239 | std::unique_ptr<StripeInformation> getStripe(uint64_t |
| 240 | ) const override; |
| 241 | |
| 242 | uint64_t getNumberOfStripeStatistics() const override; |
| 243 | |
| 244 | const std::string& getStreamName() const override; |
| 245 | |
| 246 | std::unique_ptr<StripeStatistics> |
| 247 | getStripeStatistics(uint64_t stripeIndex) const override; |
| 248 | |
| 249 | std::unique_ptr<RowReader> createRowReader() const override; |
| 250 | |
| 251 | std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options |
| 252 | ) const override; |
| 253 | |
| 254 | uint64_t getContentLength() const override; |
| 255 | uint64_t getStripeStatisticsLength() const override; |
| 256 | uint64_t () const override; |
| 257 | uint64_t getFilePostscriptLength() const override; |
| 258 | uint64_t getFileLength() const override; |
| 259 | |
| 260 | std::unique_ptr<Statistics> getStatistics() const override; |
| 261 | |
| 262 | std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId |
| 263 | ) const override; |
| 264 | |
| 265 | std::string getSerializedFileTail() const override; |
| 266 | |
| 267 | const Type& getType() const override; |
| 268 | |
| 269 | bool hasCorrectStatistics() const override; |
| 270 | |
| 271 | const proto::PostScript* getPostscript() const {return contents->postscript.get();} |
| 272 | |
| 273 | uint64_t getBlockSize() const {return contents->blockSize;} |
| 274 | |
| 275 | const proto::Footer* () const {return contents->footer.get();} |
| 276 | |
| 277 | const Type* getSchema() const {return contents->schema.get();} |
| 278 | |
| 279 | InputStream* getStream() const {return contents->stream.get();} |
| 280 | |
| 281 | uint64_t getMemoryUse(int stripeIx = -1) override; |
| 282 | |
| 283 | uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; |
| 284 | |
| 285 | uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; |
| 286 | |
| 287 | uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; |
| 288 | }; |
| 289 | |
| 290 | }// namespace |
| 291 | |
| 292 | #endif |
| 293 | |