1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_FILE_READER_H
19#define PARQUET_FILE_READER_H
20
21#include <cstdint>
22#include <iosfwd>
23#include <list>
24#include <memory>
25#include <string>
26#include <vector>
27
28#include "parquet/column_reader.h"
29#include "parquet/metadata.h"
30#include "parquet/properties.h"
31#include "parquet/schema.h"
32#include "parquet/statistics.h"
33#include "parquet/util/macros.h"
34#include "parquet/util/memory.h"
35#include "parquet/util/visibility.h"
36
37namespace parquet {
38
39class ColumnReader;
40
41class PARQUET_EXPORT RowGroupReader {
42 public:
43 // Forward declare a virtual class 'Contents' to aid dependency injection and more
44 // easily create test fixtures
45 // An implementation of the Contents class is defined in the .cc file
46 struct Contents {
47 virtual ~Contents() {}
48 virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
49 virtual const RowGroupMetaData* metadata() const = 0;
50 virtual const ReaderProperties* properties() const = 0;
51 };
52
53 explicit RowGroupReader(std::unique_ptr<Contents> contents);
54
55 // Returns the rowgroup metadata
56 const RowGroupMetaData* metadata() const;
57
58 // Construct a ColumnReader for the indicated row group-relative
59 // column. Ownership is shared with the RowGroupReader.
60 std::shared_ptr<ColumnReader> Column(int i);
61
62 std::unique_ptr<PageReader> GetColumnPageReader(int i);
63
64 private:
65 // Holds a pointer to an instance of Contents implementation
66 std::unique_ptr<Contents> contents_;
67};
68
69class PARQUET_EXPORT ParquetFileReader {
70 public:
71 // Declare a virtual class 'Contents' to aid dependency injection and more
72 // easily create test fixtures
73 // An implementation of the Contents class is defined in the .cc file
74 struct PARQUET_EXPORT Contents {
75 static std::unique_ptr<Contents> Open(
76 std::unique_ptr<RandomAccessSource> source,
77 const ReaderProperties& props = default_reader_properties(),
78 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
79
80 virtual ~Contents() {}
81 // Perform any cleanup associated with the file contents
82 virtual void Close() = 0;
83 virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
84 virtual std::shared_ptr<FileMetaData> metadata() const = 0;
85 };
86
87 ParquetFileReader();
88 ~ParquetFileReader();
89
90 // Create a reader from some implementation of parquet-cpp's generic file
91 // input interface
92 //
93 // If you cannot provide exclusive access to your file resource, create a
94 // subclass of RandomAccessSource that wraps the shared resource
95 static std::unique_ptr<ParquetFileReader> Open(
96 std::unique_ptr<RandomAccessSource> source,
97 const ReaderProperties& props = default_reader_properties(),
98 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
99
100 // Create a file reader instance from an Arrow file object. Thread-safety is
101 // the responsibility of the file implementation
102 static std::unique_ptr<ParquetFileReader> Open(
103 const std::shared_ptr<::arrow::io::ReadableFileInterface>& source,
104 const ReaderProperties& props = default_reader_properties(),
105 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
106
107 // API Convenience to open a serialized Parquet file on disk, using Arrow IO
108 // interfaces.
109 static std::unique_ptr<ParquetFileReader> OpenFile(
110 const std::string& path, bool memory_map = true,
111 const ReaderProperties& props = default_reader_properties(),
112 const std::shared_ptr<FileMetaData>& metadata = NULLPTR);
113
114 void Open(std::unique_ptr<Contents> contents);
115 void Close();
116
117 // The RowGroupReader is owned by the FileReader
118 std::shared_ptr<RowGroupReader> RowGroup(int i);
119
120 // Returns the file metadata. Only one instance is ever created
121 std::shared_ptr<FileMetaData> metadata() const;
122
123 private:
124 // Holds a pointer to an instance of Contents implementation
125 std::unique_ptr<Contents> contents_;
126};
127
128// Read only Parquet file metadata
129std::shared_ptr<FileMetaData> PARQUET_EXPORT
130ReadMetaData(const std::shared_ptr<::arrow::io::ReadableFileInterface>& source);
131
132/// \brief Scan all values in file. Useful for performance testing
133/// \param[in] columns the column numbers to scan. If empty scans all
134/// \param[in] column_batch_size number of values to read at a time when scanning column
135/// \param[in] reader a ParquetFileReader instance
136/// \return number of semantic rows in file
137PARQUET_EXPORT
138int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
139 ParquetFileReader* reader);
140
141} // namespace parquet
142
143#endif // PARQUET_FILE_READER_H
144