1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_FILE_READER_H
19#define PARQUET_FILE_READER_H
20
21#include <cstdint>
22#include <memory>
23#include <string>
24#include <vector>
25
26#include "parquet/metadata.h" // IWYU pragma: keep
27#include "parquet/platform.h"
28#include "parquet/properties.h"
29
30namespace parquet {
31
32class ColumnReader;
33class FileMetaData;
34class PageReader;
35class RandomAccessSource;
36class RowGroupMetaData;
37
38class PARQUET_EXPORT RowGroupReader {
39 public:
40 // Forward declare a virtual class 'Contents' to aid dependency injection and more
41 // easily create test fixtures
42 // An implementation of the Contents class is defined in the .cc file
43 struct Contents {
44 virtual ~Contents() {}
45 virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
46 virtual const RowGroupMetaData* metadata() const = 0;
47 virtual const ReaderProperties* properties() const = 0;
48 };
49
50 explicit RowGroupReader(std::unique_ptr<Contents> contents);
51
52 // Returns the rowgroup metadata
53 const RowGroupMetaData* metadata() const;
54
55 // Construct a ColumnReader for the indicated row group-relative
56 // column. Ownership is shared with the RowGroupReader.
57 std::shared_ptr<ColumnReader> Column(int i);
58
59 std::unique_ptr<PageReader> GetColumnPageReader(int i);
60
61 private:
62 // Holds a pointer to an instance of Contents implementation
63 std::unique_ptr<Contents> contents_;
64};
65
66class PARQUET_EXPORT ParquetFileReader {
67 public:
68 // Declare a virtual class 'Contents' to aid dependency injection and more
69 // easily create test fixtures
70 // An implementation of the Contents class is defined in the .cc file
71 struct PARQUET_EXPORT Contents {
72 static std::unique_ptr<Contents> Open(
73 const std::shared_ptr<::arrow::io::RandomAccessFile>& source,
74 const ReaderProperties& props = default_reader_properties(),
75 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
76
77 virtual ~Contents() = default;
78 // Perform any cleanup associated with the file contents
79 virtual void Close() = 0;
80 virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
81 virtual std::shared_ptr<FileMetaData> metadata() const = 0;
82 };
83
84 ParquetFileReader();
85 ~ParquetFileReader();
86
87 // Create a reader from some implementation of parquet-cpp's generic file
88 // input interface
89 //
90 // If you cannot provide exclusive access to your file resource, create a
91 // subclass of RandomAccessSource that wraps the shared resource
92 ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version")
93 static std::unique_ptr<ParquetFileReader> Open(
94 std::unique_ptr<RandomAccessSource> source,
95 const ReaderProperties& props = default_reader_properties(),
96 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
97
98 // Create a file reader instance from an Arrow file object. Thread-safety is
99 // the responsibility of the file implementation
100 static std::unique_ptr<ParquetFileReader> Open(
101 const std::shared_ptr<::arrow::io::RandomAccessFile>& source,
102 const ReaderProperties& props = default_reader_properties(),
103 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
104
105 // API Convenience to open a serialized Parquet file on disk, using Arrow IO
106 // interfaces.
107 static std::unique_ptr<ParquetFileReader> OpenFile(
108 const std::string& path, bool memory_map = true,
109 const ReaderProperties& props = default_reader_properties(),
110 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
111
112 void Open(std::unique_ptr<Contents> contents);
113 void Close();
114
115 // The RowGroupReader is owned by the FileReader
116 std::shared_ptr<RowGroupReader> RowGroup(int i);
117
118 // Returns the file metadata. Only one instance is ever created
119 std::shared_ptr<FileMetaData> metadata() const;
120
121 private:
122 // Holds a pointer to an instance of Contents implementation
123 std::unique_ptr<Contents> contents_;
124};
125
126// Read only Parquet file metadata
127std::shared_ptr<FileMetaData> PARQUET_EXPORT
128ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
129
130/// \brief Scan all values in file. Useful for performance testing
131/// \param[in] columns the column numbers to scan. If empty scans all
132/// \param[in] column_batch_size number of values to read at a time when scanning column
133/// \param[in] reader a ParquetFileReader instance
134/// \return number of semantic rows in file
135PARQUET_EXPORT
136int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
137 ParquetFileReader* reader);
138
139} // namespace parquet
140
141#endif // PARQUET_FILE_READER_H
142