1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_FILE_READER_H |
19 | #define PARQUET_FILE_READER_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | #include <string> |
24 | #include <vector> |
25 | |
26 | #include "parquet/metadata.h" // IWYU pragma: keep |
27 | #include "parquet/platform.h" |
28 | #include "parquet/properties.h" |
29 | |
30 | namespace parquet { |
31 | |
// Forward declarations of types this header only names through pointers or
// smart pointers, so their full definitions are not required here.
class ColumnReader;
class FileMetaData;
class PageReader;
class RandomAccessSource;
class RowGroupMetaData;
37 | |
38 | class PARQUET_EXPORT RowGroupReader { |
39 | public: |
40 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
41 | // easily create test fixtures |
42 | // An implementation of the Contents class is defined in the .cc file |
43 | struct Contents { |
44 | virtual ~Contents() {} |
45 | virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; |
46 | virtual const RowGroupMetaData* metadata() const = 0; |
47 | virtual const ReaderProperties* properties() const = 0; |
48 | }; |
49 | |
50 | explicit RowGroupReader(std::unique_ptr<Contents> contents); |
51 | |
52 | // Returns the rowgroup metadata |
53 | const RowGroupMetaData* metadata() const; |
54 | |
55 | // Construct a ColumnReader for the indicated row group-relative |
56 | // column. Ownership is shared with the RowGroupReader. |
57 | std::shared_ptr<ColumnReader> Column(int i); |
58 | |
59 | std::unique_ptr<PageReader> GetColumnPageReader(int i); |
60 | |
61 | private: |
62 | // Holds a pointer to an instance of Contents implementation |
63 | std::unique_ptr<Contents> contents_; |
64 | }; |
65 | |
66 | class PARQUET_EXPORT ParquetFileReader { |
67 | public: |
68 | // Declare a virtual class 'Contents' to aid dependency injection and more |
69 | // easily create test fixtures |
70 | // An implementation of the Contents class is defined in the .cc file |
71 | struct PARQUET_EXPORT Contents { |
72 | static std::unique_ptr<Contents> Open( |
73 | const std::shared_ptr<::arrow::io::RandomAccessFile>& source, |
74 | const ReaderProperties& props = default_reader_properties(), |
75 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
76 | |
77 | virtual ~Contents() = default; |
78 | // Perform any cleanup associated with the file contents |
79 | virtual void Close() = 0; |
80 | virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; |
81 | virtual std::shared_ptr<FileMetaData> metadata() const = 0; |
82 | }; |
83 | |
84 | ParquetFileReader(); |
85 | ~ParquetFileReader(); |
86 | |
87 | // Create a reader from some implementation of parquet-cpp's generic file |
88 | // input interface |
89 | // |
90 | // If you cannot provide exclusive access to your file resource, create a |
91 | // subclass of RandomAccessSource that wraps the shared resource |
92 | ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version" ) |
93 | static std::unique_ptr<ParquetFileReader> Open( |
94 | std::unique_ptr<RandomAccessSource> source, |
95 | const ReaderProperties& props = default_reader_properties(), |
96 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
97 | |
98 | // Create a file reader instance from an Arrow file object. Thread-safety is |
99 | // the responsibility of the file implementation |
100 | static std::unique_ptr<ParquetFileReader> Open( |
101 | const std::shared_ptr<::arrow::io::RandomAccessFile>& source, |
102 | const ReaderProperties& props = default_reader_properties(), |
103 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
104 | |
105 | // API Convenience to open a serialized Parquet file on disk, using Arrow IO |
106 | // interfaces. |
107 | static std::unique_ptr<ParquetFileReader> OpenFile( |
108 | const std::string& path, bool memory_map = true, |
109 | const ReaderProperties& props = default_reader_properties(), |
110 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
111 | |
112 | void Open(std::unique_ptr<Contents> contents); |
113 | void Close(); |
114 | |
115 | // The RowGroupReader is owned by the FileReader |
116 | std::shared_ptr<RowGroupReader> RowGroup(int i); |
117 | |
118 | // Returns the file metadata. Only one instance is ever created |
119 | std::shared_ptr<FileMetaData> metadata() const; |
120 | |
121 | private: |
122 | // Holds a pointer to an instance of Contents implementation |
123 | std::unique_ptr<Contents> contents_; |
124 | }; |
125 | |
126 | // Read only Parquet file metadata |
127 | std::shared_ptr<FileMetaData> PARQUET_EXPORT |
128 | ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); |
129 | |
130 | /// \brief Scan all values in file. Useful for performance testing |
131 | /// \param[in] columns the column numbers to scan. If empty scans all |
132 | /// \param[in] column_batch_size number of values to read at a time when scanning column |
133 | /// \param[in] reader a ParquetFileReader instance |
134 | /// \return number of semantic rows in file |
135 | PARQUET_EXPORT |
136 | int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, |
137 | ParquetFileReader* reader); |
138 | |
139 | } // namespace parquet |
140 | |
141 | #endif // PARQUET_FILE_READER_H |
142 | |