1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_FILE_WRITER_H |
19 | #define PARQUET_FILE_WRITER_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | |
24 | #include "parquet/metadata.h" |
25 | #include "parquet/platform.h" |
26 | #include "parquet/properties.h" |
27 | #include "parquet/schema.h" |
28 | |
29 | namespace parquet { |
30 | |
// Forward declarations: within this header these types are only used through
// raw pointers and std::shared_ptr, so their full definitions are not needed.
class OutputStream;
class ColumnWriter;
33 | |
34 | class PARQUET_EXPORT RowGroupWriter { |
35 | public: |
36 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
37 | // easily create test fixtures |
38 | // An implementation of the Contents class is defined in the .cc file |
39 | struct Contents { |
40 | virtual ~Contents() = default; |
41 | virtual int num_columns() const = 0; |
42 | virtual int64_t num_rows() const = 0; |
43 | |
44 | // to be used only with ParquetFileWriter::AppendRowGroup |
45 | virtual ColumnWriter* NextColumn() = 0; |
46 | // to be used only with ParquetFileWriter::AppendBufferedRowGroup |
47 | virtual ColumnWriter* column(int i) = 0; |
48 | |
49 | virtual int current_column() const = 0; |
50 | virtual void Close() = 0; |
51 | |
52 | // total bytes written by the page writer |
53 | virtual int64_t total_bytes_written() const = 0; |
54 | // total bytes still compressed but not written |
55 | virtual int64_t total_compressed_bytes() const = 0; |
56 | }; |
57 | |
58 | explicit RowGroupWriter(std::unique_ptr<Contents> contents); |
59 | |
60 | /// Construct a ColumnWriter for the indicated row group-relative column. |
61 | /// |
62 | /// To be used only with ParquetFileWriter::AppendRowGroup |
63 | /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only |
64 | /// valid until the next call to NextColumn or Close. As the contents are |
65 | /// directly written to the sink, once a new column is started, the contents |
66 | /// of the previous one cannot be modified anymore. |
67 | ColumnWriter* NextColumn(); |
68 | /// Index of currently written column |
69 | int current_column(); |
70 | void Close(); |
71 | |
72 | int num_columns() const; |
73 | |
74 | /// Construct a ColumnWriter for the indicated row group column. |
75 | /// |
76 | /// To be used only with ParquetFileWriter::AppendBufferedRowGroup |
77 | /// Ownership is solely within the RowGroupWriter. The ColumnWriter is |
78 | /// valid until Close. The contents are buffered in memory and written to sink |
79 | /// on Close |
80 | ColumnWriter* column(int i); |
81 | |
82 | /** |
83 | * Number of rows that shall be written as part of this RowGroup. |
84 | */ |
85 | int64_t num_rows() const; |
86 | |
87 | int64_t total_bytes_written() const; |
88 | int64_t total_compressed_bytes() const; |
89 | |
90 | private: |
91 | // Holds a pointer to an instance of Contents implementation |
92 | std::unique_ptr<Contents> contents_; |
93 | }; |
94 | |
95 | ARROW_DEPRECATED("Use version with arrow::io::OutputStream*" ) |
96 | PARQUET_EXPORT |
97 | void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); |
98 | |
99 | PARQUET_EXPORT |
100 | void WriteFileMetaData(const FileMetaData& file_metadata, |
101 | ::arrow::io::OutputStream* sink); |
102 | |
103 | PARQUET_EXPORT |
104 | void WriteMetaDataFile(const FileMetaData& file_metadata, |
105 | ::arrow::io::OutputStream* sink); |
106 | |
107 | class PARQUET_EXPORT ParquetFileWriter { |
108 | public: |
109 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
110 | // easily create test fixtures |
111 | // An implementation of the Contents class is defined in the .cc file |
112 | struct Contents { |
113 | Contents(const std::shared_ptr<::parquet::schema::GroupNode>& schema, |
114 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) |
115 | : schema_(), key_value_metadata_(key_value_metadata) { |
116 | schema_.Init(schema); |
117 | } |
118 | virtual ~Contents() {} |
119 | // Perform any cleanup associated with the file contents |
120 | virtual void Close() = 0; |
121 | |
122 | /// \note Deprecated since 1.3.0 |
123 | RowGroupWriter* AppendRowGroup(int64_t num_rows); |
124 | |
125 | virtual RowGroupWriter* AppendRowGroup() = 0; |
126 | virtual RowGroupWriter* AppendBufferedRowGroup() = 0; |
127 | |
128 | virtual int64_t num_rows() const = 0; |
129 | virtual int num_columns() const = 0; |
130 | virtual int num_row_groups() const = 0; |
131 | |
132 | virtual const std::shared_ptr<WriterProperties>& properties() const = 0; |
133 | |
134 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const { |
135 | return key_value_metadata_; |
136 | } |
137 | |
138 | // Return const-pointer to make it clear that this object is not to be copied |
139 | const SchemaDescriptor* schema() const { return &schema_; } |
140 | |
141 | SchemaDescriptor schema_; |
142 | |
143 | /// This should be the only place this is stored. Everything else is a const reference |
144 | std::shared_ptr<const KeyValueMetadata> key_value_metadata_; |
145 | |
146 | const std::shared_ptr<FileMetaData> metadata() const { return file_metadata_; } |
147 | std::shared_ptr<FileMetaData> file_metadata_; |
148 | }; |
149 | |
150 | ParquetFileWriter(); |
151 | ~ParquetFileWriter(); |
152 | |
153 | static std::unique_ptr<ParquetFileWriter> Open( |
154 | const std::shared_ptr<::arrow::io::OutputStream>& sink, |
155 | const std::shared_ptr<schema::GroupNode>& schema, |
156 | const std::shared_ptr<WriterProperties>& properties = default_writer_properties(), |
157 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
158 | |
159 | ARROW_DEPRECATED("Use version with arrow::io::OutputStream" ) |
160 | static std::unique_ptr<ParquetFileWriter> Open( |
161 | const std::shared_ptr<OutputStream>& sink, |
162 | const std::shared_ptr<schema::GroupNode>& schema, |
163 | const std::shared_ptr<WriterProperties>& properties = default_writer_properties(), |
164 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
165 | |
166 | void Open(std::unique_ptr<Contents> contents); |
167 | void Close(); |
168 | |
169 | // Construct a RowGroupWriter for the indicated number of rows. |
170 | // |
171 | // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
172 | // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
173 | // @param num_rows The number of rows that are stored in the new RowGroup |
174 | // |
175 | // \deprecated Since 1.3.0 |
176 | RowGroupWriter* AppendRowGroup(int64_t num_rows); |
177 | |
178 | /// Construct a RowGroupWriter with an arbitrary number of rows. |
179 | /// |
180 | /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
181 | /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
182 | RowGroupWriter* AppendRowGroup(); |
183 | |
184 | /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready. |
185 | /// Use this if you want to write a RowGroup based on a certain size |
186 | /// |
187 | /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
188 | /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
189 | RowGroupWriter* AppendBufferedRowGroup(); |
190 | |
191 | /// Number of columns. |
192 | /// |
193 | /// This number is fixed during the lifetime of the writer as it is determined via |
194 | /// the schema. |
195 | int num_columns() const; |
196 | |
197 | /// Number of rows in the yet started RowGroups. |
198 | /// |
199 | /// Changes on the addition of a new RowGroup. |
200 | int64_t num_rows() const; |
201 | |
202 | /// Number of started RowGroups. |
203 | int num_row_groups() const; |
204 | |
205 | /// Configuration passed to the writer, e.g. the used Parquet format version. |
206 | const std::shared_ptr<WriterProperties>& properties() const; |
207 | |
208 | /// Returns the file schema descriptor |
209 | const SchemaDescriptor* schema() const; |
210 | |
211 | /// Returns a column descriptor in schema |
212 | const ColumnDescriptor* descr(int i) const; |
213 | |
214 | /// Returns the file custom metadata |
215 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const; |
216 | |
217 | /// Returns the file metadata, only available after calling Close(). |
218 | const std::shared_ptr<FileMetaData> metadata() const; |
219 | |
220 | private: |
221 | // Holds a pointer to an instance of Contents implementation |
222 | std::unique_ptr<Contents> contents_; |
223 | std::shared_ptr<FileMetaData> file_metadata_; |
224 | }; |
225 | |
226 | } // namespace parquet |
227 | |
228 | #endif // PARQUET_FILE_WRITER_H |
229 | |