| 1 | // Licensed to the Apache Software Foundation (ASF) under one |
| 2 | // or more contributor license agreements. See the NOTICE file |
| 3 | // distributed with this work for additional information |
| 4 | // regarding copyright ownership. The ASF licenses this file |
| 5 | // to you under the Apache License, Version 2.0 (the |
| 6 | // "License"); you may not use this file except in compliance |
| 7 | // with the License. You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, |
| 12 | // software distributed under the License is distributed on an |
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | // KIND, either express or implied. See the License for the |
| 15 | // specific language governing permissions and limitations |
| 16 | // under the License. |
| 17 | |
| 18 | #ifndef PARQUET_FILE_READER_H |
| 19 | #define PARQUET_FILE_READER_H |
| 20 | |
| 21 | #include <cstdint> |
| 22 | #include <memory> |
| 23 | #include <string> |
| 24 | #include <vector> |
| 25 | |
| 26 | #include "parquet/metadata.h" // IWYU pragma: keep |
| 27 | #include "parquet/platform.h" |
| 28 | #include "parquet/properties.h" |
| 29 | |
| 30 | namespace parquet { |
| 31 | |
// Forward declarations of the types that the declarations in this header
// refer to only through raw pointers or smart pointers.
class ColumnReader;
class FileMetaData;
// PageReader is returned (by std::unique_ptr) from
// RowGroupReader::GetColumnPageReader and RowGroupReader::Contents.
class PageReader;
class RandomAccessSource;
class RowGroupMetaData;
| 37 | |
| 38 | class PARQUET_EXPORT RowGroupReader { |
| 39 | public: |
| 40 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
| 41 | // easily create test fixtures |
| 42 | // An implementation of the Contents class is defined in the .cc file |
| 43 | struct Contents { |
| 44 | virtual ~Contents() {} |
| 45 | virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; |
| 46 | virtual const RowGroupMetaData* metadata() const = 0; |
| 47 | virtual const ReaderProperties* properties() const = 0; |
| 48 | }; |
| 49 | |
| 50 | explicit RowGroupReader(std::unique_ptr<Contents> contents); |
| 51 | |
| 52 | // Returns the rowgroup metadata |
| 53 | const RowGroupMetaData* metadata() const; |
| 54 | |
| 55 | // Construct a ColumnReader for the indicated row group-relative |
| 56 | // column. Ownership is shared with the RowGroupReader. |
| 57 | std::shared_ptr<ColumnReader> Column(int i); |
| 58 | |
| 59 | std::unique_ptr<PageReader> GetColumnPageReader(int i); |
| 60 | |
| 61 | private: |
| 62 | // Holds a pointer to an instance of Contents implementation |
| 63 | std::unique_ptr<Contents> contents_; |
| 64 | }; |
| 65 | |
| 66 | class PARQUET_EXPORT ParquetFileReader { |
| 67 | public: |
| 68 | // Declare a virtual class 'Contents' to aid dependency injection and more |
| 69 | // easily create test fixtures |
| 70 | // An implementation of the Contents class is defined in the .cc file |
| 71 | struct PARQUET_EXPORT Contents { |
| 72 | static std::unique_ptr<Contents> Open( |
| 73 | const std::shared_ptr<::arrow::io::RandomAccessFile>& source, |
| 74 | const ReaderProperties& props = default_reader_properties(), |
| 75 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
| 76 | |
| 77 | virtual ~Contents() = default; |
| 78 | // Perform any cleanup associated with the file contents |
| 79 | virtual void Close() = 0; |
| 80 | virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; |
| 81 | virtual std::shared_ptr<FileMetaData> metadata() const = 0; |
| 82 | }; |
| 83 | |
| 84 | ParquetFileReader(); |
| 85 | ~ParquetFileReader(); |
| 86 | |
| 87 | // Create a reader from some implementation of parquet-cpp's generic file |
| 88 | // input interface |
| 89 | // |
| 90 | // If you cannot provide exclusive access to your file resource, create a |
| 91 | // subclass of RandomAccessSource that wraps the shared resource |
| 92 | ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version" ) |
| 93 | static std::unique_ptr<ParquetFileReader> Open( |
| 94 | std::unique_ptr<RandomAccessSource> source, |
| 95 | const ReaderProperties& props = default_reader_properties(), |
| 96 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
| 97 | |
| 98 | // Create a file reader instance from an Arrow file object. Thread-safety is |
| 99 | // the responsibility of the file implementation |
| 100 | static std::unique_ptr<ParquetFileReader> Open( |
| 101 | const std::shared_ptr<::arrow::io::RandomAccessFile>& source, |
| 102 | const ReaderProperties& props = default_reader_properties(), |
| 103 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
| 104 | |
| 105 | // API Convenience to open a serialized Parquet file on disk, using Arrow IO |
| 106 | // interfaces. |
| 107 | static std::unique_ptr<ParquetFileReader> OpenFile( |
| 108 | const std::string& path, bool memory_map = true, |
| 109 | const ReaderProperties& props = default_reader_properties(), |
| 110 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
| 111 | |
| 112 | void Open(std::unique_ptr<Contents> contents); |
| 113 | void Close(); |
| 114 | |
| 115 | // The RowGroupReader is owned by the FileReader |
| 116 | std::shared_ptr<RowGroupReader> RowGroup(int i); |
| 117 | |
| 118 | // Returns the file metadata. Only one instance is ever created |
| 119 | std::shared_ptr<FileMetaData> metadata() const; |
| 120 | |
| 121 | private: |
| 122 | // Holds a pointer to an instance of Contents implementation |
| 123 | std::unique_ptr<Contents> contents_; |
| 124 | }; |
| 125 | |
| 126 | // Read only Parquet file metadata |
| 127 | std::shared_ptr<FileMetaData> PARQUET_EXPORT |
| 128 | ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); |
| 129 | |
| 130 | /// \brief Scan all values in file. Useful for performance testing |
| 131 | /// \param[in] columns the column numbers to scan. If empty scans all |
| 132 | /// \param[in] column_batch_size number of values to read at a time when scanning column |
| 133 | /// \param[in] reader a ParquetFileReader instance |
| 134 | /// \return number of semantic rows in file |
| 135 | PARQUET_EXPORT |
| 136 | int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, |
| 137 | ParquetFileReader* reader); |
| 138 | |
| 139 | } // namespace parquet |
| 140 | |
| 141 | #endif // PARQUET_FILE_READER_H |
| 142 | |