| 1 | /** | 
|---|
| 2 | * Licensed to the Apache Software Foundation (ASF) under one | 
|---|
| 3 | * or more contributor license agreements.  See the NOTICE file | 
|---|
| 4 | * distributed with this work for additional information | 
|---|
| 5 | * regarding copyright ownership.  The ASF licenses this file | 
|---|
| 6 | * to you under the Apache License, Version 2.0 (the | 
|---|
| 7 | * "License"); you may not use this file except in compliance | 
|---|
| 8 | * with the License.  You may obtain a copy of the License at | 
|---|
| 9 | * | 
|---|
| 10 | *     http://www.apache.org/licenses/LICENSE-2.0 | 
|---|
| 11 | * | 
|---|
| 12 | * Unless required by applicable law or agreed to in writing, software | 
|---|
| 13 | * distributed under the License is distributed on an "AS IS" BASIS, | 
|---|
| 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|---|
| 15 | * See the License for the specific language governing permissions and | 
|---|
| 16 | * limitations under the License. | 
|---|
| 17 | */ | 
|---|
| 18 |  | 
|---|
| 19 | #ifndef ORC_READER_IMPL_HH | 
|---|
| 20 | #define ORC_READER_IMPL_HH | 
|---|
| 21 |  | 
|---|
| 22 | #include "orc/Int128.hh" | 
|---|
| 23 | #include "orc/OrcFile.hh" | 
|---|
| 24 | #include "orc/Reader.hh" | 
|---|
| 25 |  | 
|---|
| 26 | #include "ColumnReader.hh" | 
|---|
| 27 | #include "orc/Exceptions.hh" | 
|---|
| 28 | #include "RLE.hh" | 
|---|
| 29 | #include "TypeImpl.hh" | 
|---|
| 30 |  | 
|---|
| 31 | namespace orc { | 
|---|
| 32 |  | 
|---|
/**
 * Size guess used for a "directory": 16 * 1024 (= 16384).
 * NOTE(review): the consumer of this constant is not visible in this header;
 * presumably a byte count -- confirm at the point of use.
 */
static const uint64_t DIRECTORY_SIZE_GUESS = 16384;
|---|
| 34 |  | 
|---|
| 35 | /** | 
|---|
| 36 | * WriterVersion Implementation | 
|---|
| 37 | */ | 
|---|
| 38 | class WriterVersionImpl { | 
|---|
| 39 | private: | 
|---|
| 40 | WriterVersion version; | 
|---|
| 41 | public: | 
|---|
| 42 | // Known Versions with issues resolved | 
|---|
| 43 | // The static method below is to fix global constructors Clang warning | 
|---|
| 44 | static const WriterVersionImpl& VERSION_HIVE_8732(); | 
|---|
| 45 |  | 
|---|
| 46 | WriterVersionImpl(WriterVersion ver) : version(ver) {} | 
|---|
| 47 |  | 
|---|
| 48 | bool compareGT(const WriterVersion other) const { | 
|---|
| 49 | return version > other; | 
|---|
| 50 | } | 
|---|
| 51 | }; | 
|---|
| 52 |  | 
|---|
| 53 | /** | 
|---|
| 54 | * State shared between Reader and Row Reader | 
|---|
| 55 | */ | 
|---|
| 56 | struct FileContents { | 
|---|
| 57 | std::unique_ptr<InputStream> stream; | 
|---|
| 58 | std::unique_ptr<proto::PostScript> postscript; | 
|---|
| 59 | std::unique_ptr<proto::Footer> ; | 
|---|
| 60 | std::unique_ptr<Type> schema; | 
|---|
| 61 | uint64_t blockSize; | 
|---|
| 62 | CompressionKind compression; | 
|---|
| 63 | MemoryPool *pool; | 
|---|
| 64 | std::ostream *errorStream; | 
|---|
| 65 | }; | 
|---|
| 66 |  | 
|---|
| 67 | proto::StripeFooter (const proto::StripeInformation& info, | 
|---|
| 68 | const FileContents& contents); | 
|---|
| 69 |  | 
|---|
| 70 | class ReaderImpl; | 
|---|
| 71 |  | 
|---|
| 72 | class ColumnSelector { | 
|---|
| 73 | private: | 
|---|
| 74 | std::map<std::string, uint64_t> nameIdMap; | 
|---|
| 75 | std::map<uint64_t, const Type*> idTypeMap; | 
|---|
| 76 | const FileContents* contents; | 
|---|
| 77 | std::vector<std::string> columns; | 
|---|
| 78 |  | 
|---|
| 79 | // build map from type name and id, id to Type | 
|---|
| 80 | void buildTypeNameIdMap(const Type* type); | 
|---|
| 81 | std::string toDotColumnPath(); | 
|---|
| 82 |  | 
|---|
| 83 | public: | 
|---|
| 84 | // Select a field by name | 
|---|
| 85 | void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); | 
|---|
| 86 | // Select a field by id | 
|---|
| 87 | void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); | 
|---|
| 88 | // Select a type by id | 
|---|
| 89 | void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); | 
|---|
| 90 |  | 
|---|
| 91 | // Select all of the recursive children of the given type. | 
|---|
| 92 | void selectChildren(std::vector<bool>& selectedColumns, const Type& type); | 
|---|
| 93 |  | 
|---|
| 94 | // For each child of type, select it if one of its children | 
|---|
| 95 | // is selected. | 
|---|
| 96 | bool selectParents(std::vector<bool>& selectedColumns, const Type& type); | 
|---|
| 97 | /** | 
|---|
| 98 | * Constructor that selects columns. | 
|---|
| 99 | * @param contents of the file | 
|---|
| 100 | */ | 
|---|
| 101 | ColumnSelector(const FileContents* contents); | 
|---|
| 102 |  | 
|---|
| 103 | // Select the columns from the RowReaderoptions object | 
|---|
| 104 | void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); | 
|---|
| 105 |  | 
|---|
| 106 | // Select the columns from the Readeroptions object | 
|---|
| 107 | void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); | 
|---|
| 108 | }; | 
|---|
| 109 |  | 
|---|
| 110 |  | 
|---|
| 111 | class RowReaderImpl : public RowReader { | 
|---|
| 112 | private: | 
|---|
| 113 | const Timezone& localTimezone; | 
|---|
| 114 |  | 
|---|
| 115 | // contents | 
|---|
| 116 | std::shared_ptr<FileContents> contents; | 
|---|
| 117 | const bool throwOnHive11DecimalOverflow; | 
|---|
| 118 | const int32_t forcedScaleOnHive11Decimal; | 
|---|
| 119 |  | 
|---|
| 120 | // inputs | 
|---|
| 121 | std::vector<bool> selectedColumns; | 
|---|
| 122 |  | 
|---|
| 123 | // footer | 
|---|
| 124 | proto::Footer* ; | 
|---|
| 125 | DataBuffer<uint64_t> firstRowOfStripe; | 
|---|
| 126 | mutable std::unique_ptr<Type> selectedSchema; | 
|---|
| 127 |  | 
|---|
| 128 | // reading state | 
|---|
| 129 | uint64_t previousRow; | 
|---|
| 130 | uint64_t firstStripe; | 
|---|
| 131 | uint64_t currentStripe; | 
|---|
| 132 | uint64_t lastStripe; // the stripe AFTER the last one | 
|---|
| 133 | uint64_t currentRowInStripe; | 
|---|
| 134 | uint64_t rowsInCurrentStripe; | 
|---|
| 135 | proto::StripeInformation currentStripeInfo; | 
|---|
| 136 | proto::StripeFooter ; | 
|---|
| 137 | std::unique_ptr<ColumnReader> reader; | 
|---|
| 138 |  | 
|---|
| 139 | // internal methods | 
|---|
| 140 | void startNextStripe(); | 
|---|
| 141 |  | 
|---|
| 142 | public: | 
|---|
| 143 | /** | 
|---|
| 144 | * Constructor that lets the user specify additional options. | 
|---|
| 145 | * @param contents of the file | 
|---|
| 146 | * @param options options for reading | 
|---|
| 147 | */ | 
|---|
| 148 | RowReaderImpl(std::shared_ptr<FileContents> contents, | 
|---|
| 149 | const RowReaderOptions& options); | 
|---|
| 150 |  | 
|---|
| 151 | // Select the columns from the options object | 
|---|
| 152 | void updateSelected(); | 
|---|
| 153 | const std::vector<bool> getSelectedColumns() const override; | 
|---|
| 154 |  | 
|---|
| 155 | const Type& getSelectedType() const override; | 
|---|
| 156 |  | 
|---|
| 157 | std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size | 
|---|
| 158 | ) const override; | 
|---|
| 159 |  | 
|---|
| 160 | bool next(ColumnVectorBatch& data) override; | 
|---|
| 161 |  | 
|---|
| 162 | CompressionKind getCompression() const; | 
|---|
| 163 |  | 
|---|
| 164 | uint64_t getCompressionSize() const; | 
|---|
| 165 |  | 
|---|
| 166 | uint64_t getRowNumber() const override; | 
|---|
| 167 |  | 
|---|
| 168 | void seekToRow(uint64_t rowNumber) override; | 
|---|
| 169 |  | 
|---|
| 170 | const FileContents& getFileContents() const; | 
|---|
| 171 | bool getThrowOnHive11DecimalOverflow() const; | 
|---|
| 172 | int32_t getForcedScaleOnHive11Decimal() const; | 
|---|
| 173 | }; | 
|---|
| 174 |  | 
|---|
| 175 | class ReaderImpl : public Reader { | 
|---|
| 176 | private: | 
|---|
| 177 | // FileContents | 
|---|
| 178 | std::shared_ptr<FileContents> contents; | 
|---|
| 179 |  | 
|---|
| 180 | // inputs | 
|---|
| 181 | const ReaderOptions options; | 
|---|
| 182 | const uint64_t fileLength; | 
|---|
| 183 | const uint64_t postscriptLength; | 
|---|
| 184 |  | 
|---|
| 185 | // footer | 
|---|
| 186 | proto::Footer* ; | 
|---|
| 187 | uint64_t numberOfStripes; | 
|---|
| 188 | uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); | 
|---|
| 189 |  | 
|---|
| 190 | // internal methods | 
|---|
| 191 | void readMetadata() const; | 
|---|
| 192 | void checkOrcVersion(); | 
|---|
| 193 | void getRowIndexStatistics(uint64_t stripeOffset, | 
|---|
| 194 | const proto::StripeFooter& , | 
|---|
| 195 | std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; | 
|---|
| 196 |  | 
|---|
| 197 | // metadata | 
|---|
| 198 | mutable std::unique_ptr<proto::Metadata> metadata; | 
|---|
| 199 | mutable bool isMetadataLoaded; | 
|---|
| 200 | public: | 
|---|
| 201 | /** | 
|---|
| 202 | * Constructor that lets the user specify additional options. | 
|---|
| 203 | * @param contents of the file | 
|---|
| 204 | * @param options options for reading | 
|---|
| 205 | * @param fileLength the length of the file in bytes | 
|---|
| 206 | * @param postscriptLength the length of the postscript in bytes | 
|---|
| 207 | */ | 
|---|
| 208 | ReaderImpl(std::shared_ptr<FileContents> contents, | 
|---|
| 209 | const ReaderOptions& options, | 
|---|
| 210 | uint64_t fileLength, | 
|---|
| 211 | uint64_t postscriptLength); | 
|---|
| 212 |  | 
|---|
| 213 | const ReaderOptions& getReaderOptions() const; | 
|---|
| 214 |  | 
|---|
| 215 | CompressionKind getCompression() const override; | 
|---|
| 216 |  | 
|---|
| 217 | FileVersion getFormatVersion() const override; | 
|---|
| 218 |  | 
|---|
| 219 | WriterId getWriterId() const override; | 
|---|
| 220 |  | 
|---|
| 221 | uint32_t getWriterIdValue() const override; | 
|---|
| 222 |  | 
|---|
| 223 | WriterVersion getWriterVersion() const override; | 
|---|
| 224 |  | 
|---|
| 225 | uint64_t getNumberOfRows() const override; | 
|---|
| 226 |  | 
|---|
| 227 | uint64_t getRowIndexStride() const override; | 
|---|
| 228 |  | 
|---|
| 229 | std::list<std::string> getMetadataKeys() const override; | 
|---|
| 230 |  | 
|---|
| 231 | std::string getMetadataValue(const std::string& key) const override; | 
|---|
| 232 |  | 
|---|
| 233 | bool hasMetadataValue(const std::string& key) const override; | 
|---|
| 234 |  | 
|---|
| 235 | uint64_t getCompressionSize() const override; | 
|---|
| 236 |  | 
|---|
| 237 | uint64_t getNumberOfStripes() const override; | 
|---|
| 238 |  | 
|---|
| 239 | std::unique_ptr<StripeInformation> getStripe(uint64_t | 
|---|
| 240 | ) const override; | 
|---|
| 241 |  | 
|---|
| 242 | uint64_t getNumberOfStripeStatistics() const override; | 
|---|
| 243 |  | 
|---|
| 244 | const std::string& getStreamName() const override; | 
|---|
| 245 |  | 
|---|
| 246 | std::unique_ptr<StripeStatistics> | 
|---|
| 247 | getStripeStatistics(uint64_t stripeIndex) const override; | 
|---|
| 248 |  | 
|---|
| 249 | std::unique_ptr<RowReader> createRowReader() const override; | 
|---|
| 250 |  | 
|---|
| 251 | std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options | 
|---|
| 252 | ) const override; | 
|---|
| 253 |  | 
|---|
| 254 | uint64_t getContentLength() const override; | 
|---|
| 255 | uint64_t getStripeStatisticsLength() const override; | 
|---|
| 256 | uint64_t () const override; | 
|---|
| 257 | uint64_t getFilePostscriptLength() const override; | 
|---|
| 258 | uint64_t getFileLength() const override; | 
|---|
| 259 |  | 
|---|
| 260 | std::unique_ptr<Statistics> getStatistics() const override; | 
|---|
| 261 |  | 
|---|
| 262 | std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId | 
|---|
| 263 | ) const override; | 
|---|
| 264 |  | 
|---|
| 265 | std::string getSerializedFileTail() const override; | 
|---|
| 266 |  | 
|---|
| 267 | const Type& getType() const override; | 
|---|
| 268 |  | 
|---|
| 269 | bool hasCorrectStatistics() const override; | 
|---|
| 270 |  | 
|---|
| 271 | const proto::PostScript* getPostscript() const {return contents->postscript.get();} | 
|---|
| 272 |  | 
|---|
| 273 | uint64_t getBlockSize() const {return contents->blockSize;} | 
|---|
| 274 |  | 
|---|
| 275 | const proto::Footer* () const {return contents->footer.get();} | 
|---|
| 276 |  | 
|---|
| 277 | const Type* getSchema() const {return contents->schema.get();} | 
|---|
| 278 |  | 
|---|
| 279 | InputStream* getStream() const {return contents->stream.get();} | 
|---|
| 280 |  | 
|---|
| 281 | uint64_t getMemoryUse(int stripeIx = -1) override; | 
|---|
| 282 |  | 
|---|
| 283 | uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; | 
|---|
| 284 |  | 
|---|
| 285 | uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; | 
|---|
| 286 |  | 
|---|
| 287 | uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; | 
|---|
| 288 | }; | 
|---|
| 289 |  | 
|---|
| 290 | }// namespace | 
|---|
| 291 |  | 
|---|
| 292 | #endif | 
|---|
| 293 |  | 
|---|