// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18#ifndef PARQUET_FILE_WRITER_H
19#define PARQUET_FILE_WRITER_H
20
21#include <cstdint>
22#include <memory>
23
24#include "parquet/metadata.h"
25#include "parquet/platform.h"
26#include "parquet/properties.h"
27#include "parquet/schema.h"
28
29namespace parquet {
30
// Forward declarations. Within this header these types are only referred to
// through raw pointers or std::shared_ptr, so incomplete types are sufficient.
class ColumnWriter;
class OutputStream;
33
34class PARQUET_EXPORT RowGroupWriter {
35 public:
36 // Forward declare a virtual class 'Contents' to aid dependency injection and more
37 // easily create test fixtures
38 // An implementation of the Contents class is defined in the .cc file
39 struct Contents {
40 virtual ~Contents() = default;
41 virtual int num_columns() const = 0;
42 virtual int64_t num_rows() const = 0;
43
44 // to be used only with ParquetFileWriter::AppendRowGroup
45 virtual ColumnWriter* NextColumn() = 0;
46 // to be used only with ParquetFileWriter::AppendBufferedRowGroup
47 virtual ColumnWriter* column(int i) = 0;
48
49 virtual int current_column() const = 0;
50 virtual void Close() = 0;
51
52 // total bytes written by the page writer
53 virtual int64_t total_bytes_written() const = 0;
54 // total bytes still compressed but not written
55 virtual int64_t total_compressed_bytes() const = 0;
56 };
57
58 explicit RowGroupWriter(std::unique_ptr<Contents> contents);
59
60 /// Construct a ColumnWriter for the indicated row group-relative column.
61 ///
62 /// To be used only with ParquetFileWriter::AppendRowGroup
63 /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
64 /// valid until the next call to NextColumn or Close. As the contents are
65 /// directly written to the sink, once a new column is started, the contents
66 /// of the previous one cannot be modified anymore.
67 ColumnWriter* NextColumn();
68 /// Index of currently written column
69 int current_column();
70 void Close();
71
72 int num_columns() const;
73
74 /// Construct a ColumnWriter for the indicated row group column.
75 ///
76 /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
77 /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
78 /// valid until Close. The contents are buffered in memory and written to sink
79 /// on Close
80 ColumnWriter* column(int i);
81
82 /**
83 * Number of rows that shall be written as part of this RowGroup.
84 */
85 int64_t num_rows() const;
86
87 int64_t total_bytes_written() const;
88 int64_t total_compressed_bytes() const;
89
90 private:
91 // Holds a pointer to an instance of Contents implementation
92 std::unique_ptr<Contents> contents_;
93};
94
95ARROW_DEPRECATED("Use version with arrow::io::OutputStream*")
96PARQUET_EXPORT
97void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink);
98
99PARQUET_EXPORT
100void WriteFileMetaData(const FileMetaData& file_metadata,
101 ::arrow::io::OutputStream* sink);
102
103PARQUET_EXPORT
104void WriteMetaDataFile(const FileMetaData& file_metadata,
105 ::arrow::io::OutputStream* sink);
106
107class PARQUET_EXPORT ParquetFileWriter {
108 public:
109 // Forward declare a virtual class 'Contents' to aid dependency injection and more
110 // easily create test fixtures
111 // An implementation of the Contents class is defined in the .cc file
112 struct Contents {
113 Contents(const std::shared_ptr<::parquet::schema::GroupNode>& schema,
114 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata)
115 : schema_(), key_value_metadata_(key_value_metadata) {
116 schema_.Init(schema);
117 }
118 virtual ~Contents() {}
119 // Perform any cleanup associated with the file contents
120 virtual void Close() = 0;
121
122 /// \note Deprecated since 1.3.0
123 RowGroupWriter* AppendRowGroup(int64_t num_rows);
124
125 virtual RowGroupWriter* AppendRowGroup() = 0;
126 virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
127
128 virtual int64_t num_rows() const = 0;
129 virtual int num_columns() const = 0;
130 virtual int num_row_groups() const = 0;
131
132 virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
133
134 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
135 return key_value_metadata_;
136 }
137
138 // Return const-pointer to make it clear that this object is not to be copied
139 const SchemaDescriptor* schema() const { return &schema_; }
140
141 SchemaDescriptor schema_;
142
143 /// This should be the only place this is stored. Everything else is a const reference
144 std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
145
146 const std::shared_ptr<FileMetaData> metadata() const { return file_metadata_; }
147 std::shared_ptr<FileMetaData> file_metadata_;
148 };
149
150 ParquetFileWriter();
151 ~ParquetFileWriter();
152
153 static std::unique_ptr<ParquetFileWriter> Open(
154 const std::shared_ptr<::arrow::io::OutputStream>& sink,
155 const std::shared_ptr<schema::GroupNode>& schema,
156 const std::shared_ptr<WriterProperties>& properties = default_writer_properties(),
157 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
158
159 ARROW_DEPRECATED("Use version with arrow::io::OutputStream")
160 static std::unique_ptr<ParquetFileWriter> Open(
161 const std::shared_ptr<OutputStream>& sink,
162 const std::shared_ptr<schema::GroupNode>& schema,
163 const std::shared_ptr<WriterProperties>& properties = default_writer_properties(),
164 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
165
166 void Open(std::unique_ptr<Contents> contents);
167 void Close();
168
169 // Construct a RowGroupWriter for the indicated number of rows.
170 //
171 // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
172 // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
173 // @param num_rows The number of rows that are stored in the new RowGroup
174 //
175 // \deprecated Since 1.3.0
176 RowGroupWriter* AppendRowGroup(int64_t num_rows);
177
178 /// Construct a RowGroupWriter with an arbitrary number of rows.
179 ///
180 /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
181 /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
182 RowGroupWriter* AppendRowGroup();
183
184 /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
185 /// Use this if you want to write a RowGroup based on a certain size
186 ///
187 /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
188 /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
189 RowGroupWriter* AppendBufferedRowGroup();
190
191 /// Number of columns.
192 ///
193 /// This number is fixed during the lifetime of the writer as it is determined via
194 /// the schema.
195 int num_columns() const;
196
197 /// Number of rows in the yet started RowGroups.
198 ///
199 /// Changes on the addition of a new RowGroup.
200 int64_t num_rows() const;
201
202 /// Number of started RowGroups.
203 int num_row_groups() const;
204
205 /// Configuration passed to the writer, e.g. the used Parquet format version.
206 const std::shared_ptr<WriterProperties>& properties() const;
207
208 /// Returns the file schema descriptor
209 const SchemaDescriptor* schema() const;
210
211 /// Returns a column descriptor in schema
212 const ColumnDescriptor* descr(int i) const;
213
214 /// Returns the file custom metadata
215 const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
216
217 /// Returns the file metadata, only available after calling Close().
218 const std::shared_ptr<FileMetaData> metadata() const;
219
220 private:
221 // Holds a pointer to an instance of Contents implementation
222 std::unique_ptr<Contents> contents_;
223 std::shared_ptr<FileMetaData> file_metadata_;
224};
225
226} // namespace parquet
227
228#endif // PARQUET_FILE_WRITER_H
229