1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_FILE_READER_H |
19 | #define PARQUET_FILE_READER_H |
20 | |
21 | #include <cstdint> |
22 | #include <iosfwd> |
23 | #include <list> |
24 | #include <memory> |
25 | #include <string> |
26 | #include <vector> |
27 | |
28 | #include "parquet/column_reader.h" |
29 | #include "parquet/metadata.h" |
30 | #include "parquet/properties.h" |
31 | #include "parquet/schema.h" |
32 | #include "parquet/statistics.h" |
33 | #include "parquet/util/macros.h" |
34 | #include "parquet/util/memory.h" |
35 | #include "parquet/util/visibility.h" |
36 | |
37 | namespace parquet { |
38 | |
39 | class ColumnReader; |
40 | |
41 | class PARQUET_EXPORT RowGroupReader { |
42 | public: |
43 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
44 | // easily create test fixtures |
45 | // An implementation of the Contents class is defined in the .cc file |
46 | struct Contents { |
47 | virtual ~Contents() {} |
48 | virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; |
49 | virtual const RowGroupMetaData* metadata() const = 0; |
50 | virtual const ReaderProperties* properties() const = 0; |
51 | }; |
52 | |
53 | explicit RowGroupReader(std::unique_ptr<Contents> contents); |
54 | |
55 | // Returns the rowgroup metadata |
56 | const RowGroupMetaData* metadata() const; |
57 | |
58 | // Construct a ColumnReader for the indicated row group-relative |
59 | // column. Ownership is shared with the RowGroupReader. |
60 | std::shared_ptr<ColumnReader> Column(int i); |
61 | |
62 | std::unique_ptr<PageReader> GetColumnPageReader(int i); |
63 | |
64 | private: |
65 | // Holds a pointer to an instance of Contents implementation |
66 | std::unique_ptr<Contents> contents_; |
67 | }; |
68 | |
69 | class PARQUET_EXPORT ParquetFileReader { |
70 | public: |
71 | // Declare a virtual class 'Contents' to aid dependency injection and more |
72 | // easily create test fixtures |
73 | // An implementation of the Contents class is defined in the .cc file |
74 | struct PARQUET_EXPORT Contents { |
75 | static std::unique_ptr<Contents> Open( |
76 | std::unique_ptr<RandomAccessSource> source, |
77 | const ReaderProperties& props = default_reader_properties(), |
78 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
79 | |
80 | virtual ~Contents() {} |
81 | // Perform any cleanup associated with the file contents |
82 | virtual void Close() = 0; |
83 | virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; |
84 | virtual std::shared_ptr<FileMetaData> metadata() const = 0; |
85 | }; |
86 | |
87 | ParquetFileReader(); |
88 | ~ParquetFileReader(); |
89 | |
90 | // Create a reader from some implementation of parquet-cpp's generic file |
91 | // input interface |
92 | // |
93 | // If you cannot provide exclusive access to your file resource, create a |
94 | // subclass of RandomAccessSource that wraps the shared resource |
95 | static std::unique_ptr<ParquetFileReader> Open( |
96 | std::unique_ptr<RandomAccessSource> source, |
97 | const ReaderProperties& props = default_reader_properties(), |
98 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
99 | |
100 | // Create a file reader instance from an Arrow file object. Thread-safety is |
101 | // the responsibility of the file implementation |
102 | static std::unique_ptr<ParquetFileReader> Open( |
103 | const std::shared_ptr<::arrow::io::ReadableFileInterface>& source, |
104 | const ReaderProperties& props = default_reader_properties(), |
105 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
106 | |
107 | // API Convenience to open a serialized Parquet file on disk, using Arrow IO |
108 | // interfaces. |
109 | static std::unique_ptr<ParquetFileReader> OpenFile( |
110 | const std::string& path, bool memory_map = true, |
111 | const ReaderProperties& props = default_reader_properties(), |
112 | const std::shared_ptr<FileMetaData>& metadata = NULLPTR); |
113 | |
114 | void Open(std::unique_ptr<Contents> contents); |
115 | void Close(); |
116 | |
117 | // The RowGroupReader is owned by the FileReader |
118 | std::shared_ptr<RowGroupReader> RowGroup(int i); |
119 | |
120 | // Returns the file metadata. Only one instance is ever created |
121 | std::shared_ptr<FileMetaData> metadata() const; |
122 | |
123 | private: |
124 | // Holds a pointer to an instance of Contents implementation |
125 | std::unique_ptr<Contents> contents_; |
126 | }; |
127 | |
128 | // Read only Parquet file metadata |
129 | std::shared_ptr<FileMetaData> PARQUET_EXPORT |
130 | ReadMetaData(const std::shared_ptr<::arrow::io::ReadableFileInterface>& source); |
131 | |
132 | /// \brief Scan all values in file. Useful for performance testing |
133 | /// \param[in] columns the column numbers to scan. If empty scans all |
134 | /// \param[in] column_batch_size number of values to read at a time when scanning column |
135 | /// \param[in] reader a ParquetFileReader instance |
136 | /// \return number of semantic rows in file |
137 | PARQUET_EXPORT |
138 | int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, |
139 | ParquetFileReader* reader); |
140 | |
141 | } // namespace parquet |
142 | |
143 | #endif // PARQUET_FILE_READER_H |
144 | |