1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_FILE_WRITER_H |
19 | #define PARQUET_FILE_WRITER_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | |
24 | #include "parquet/metadata.h" |
25 | #include "parquet/properties.h" |
26 | #include "parquet/schema.h" |
27 | #include "parquet/util/macros.h" |
28 | #include "parquet/util/memory.h" |
29 | #include "parquet/util/visibility.h" |
30 | |
31 | namespace parquet { |
32 | |
// Forward declarations (definitions are provided elsewhere).
class ColumnWriter;
class PageWriter;
class OutputStream;

namespace schema {
class GroupNode;
}  // namespace schema
42 | |
43 | class PARQUET_EXPORT RowGroupWriter { |
44 | public: |
45 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
46 | // easily create test fixtures |
47 | // An implementation of the Contents class is defined in the .cc file |
48 | struct Contents { |
49 | virtual ~Contents() = default; |
50 | virtual int num_columns() const = 0; |
51 | virtual int64_t num_rows() const = 0; |
52 | |
53 | // to be used only with ParquetFileWriter::AppendRowGroup |
54 | virtual ColumnWriter* NextColumn() = 0; |
55 | // to be used only with ParquetFileWriter::AppendBufferedRowGroup |
56 | virtual ColumnWriter* column(int i) = 0; |
57 | |
58 | virtual int current_column() const = 0; |
59 | virtual void Close() = 0; |
60 | |
61 | // total bytes written by the page writer |
62 | virtual int64_t total_bytes_written() const = 0; |
63 | // total bytes still compressed but not written |
64 | virtual int64_t total_compressed_bytes() const = 0; |
65 | }; |
66 | |
67 | explicit RowGroupWriter(std::unique_ptr<Contents> contents); |
68 | |
69 | /// Construct a ColumnWriter for the indicated row group-relative column. |
70 | /// |
71 | /// To be used only with ParquetFileWriter::AppendRowGroup |
72 | /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only |
73 | /// valid until the next call to NextColumn or Close. As the contents are |
74 | /// directly written to the sink, once a new column is started, the contents |
75 | /// of the previous one cannot be modified anymore. |
76 | ColumnWriter* NextColumn(); |
77 | /// Index of currently written column |
78 | int current_column(); |
79 | void Close(); |
80 | |
81 | int num_columns() const; |
82 | |
83 | /// Construct a ColumnWriter for the indicated row group column. |
84 | /// |
85 | /// To be used only with ParquetFileWriter::AppendBufferedRowGroup |
86 | /// Ownership is solely within the RowGroupWriter. The ColumnWriter is |
87 | /// valid until Close. The contents are buffered in memory and written to sink |
88 | /// on Close |
89 | ColumnWriter* column(int i); |
90 | |
91 | /** |
92 | * Number of rows that shall be written as part of this RowGroup. |
93 | */ |
94 | int64_t num_rows() const; |
95 | |
96 | int64_t total_bytes_written() const; |
97 | int64_t total_compressed_bytes() const; |
98 | |
99 | private: |
100 | // Holds a pointer to an instance of Contents implementation |
101 | std::unique_ptr<Contents> contents_; |
102 | }; |
103 | |
104 | PARQUET_EXPORT |
105 | void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); |
106 | |
107 | class PARQUET_EXPORT ParquetFileWriter { |
108 | public: |
109 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
110 | // easily create test fixtures |
111 | // An implementation of the Contents class is defined in the .cc file |
112 | struct Contents { |
113 | Contents(const std::shared_ptr<::parquet::schema::GroupNode>& schema, |
114 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) |
115 | : schema_(), key_value_metadata_(key_value_metadata) { |
116 | schema_.Init(schema); |
117 | } |
118 | virtual ~Contents() {} |
119 | // Perform any cleanup associated with the file contents |
120 | virtual void Close() = 0; |
121 | |
122 | /// \note Deprecated since 1.3.0 |
123 | RowGroupWriter* AppendRowGroup(int64_t num_rows); |
124 | |
125 | virtual RowGroupWriter* AppendRowGroup() = 0; |
126 | virtual RowGroupWriter* AppendBufferedRowGroup() = 0; |
127 | |
128 | virtual int64_t num_rows() const = 0; |
129 | virtual int num_columns() const = 0; |
130 | virtual int num_row_groups() const = 0; |
131 | |
132 | virtual const std::shared_ptr<WriterProperties>& properties() const = 0; |
133 | |
134 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const { |
135 | return key_value_metadata_; |
136 | } |
137 | |
138 | // Return const-pointer to make it clear that this object is not to be copied |
139 | const SchemaDescriptor* schema() const { return &schema_; } |
140 | |
141 | SchemaDescriptor schema_; |
142 | |
143 | /// This should be the only place this is stored. Everything else is a const reference |
144 | std::shared_ptr<const KeyValueMetadata> key_value_metadata_; |
145 | }; |
146 | |
147 | ParquetFileWriter(); |
148 | ~ParquetFileWriter(); |
149 | |
150 | static std::unique_ptr<ParquetFileWriter> Open( |
151 | const std::shared_ptr<::arrow::io::OutputStream>& sink, |
152 | const std::shared_ptr<schema::GroupNode>& schema, |
153 | const std::shared_ptr<WriterProperties>& properties = default_writer_properties(), |
154 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
155 | |
156 | static std::unique_ptr<ParquetFileWriter> Open( |
157 | const std::shared_ptr<OutputStream>& sink, |
158 | const std::shared_ptr<schema::GroupNode>& schema, |
159 | const std::shared_ptr<WriterProperties>& properties = default_writer_properties(), |
160 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
161 | |
162 | void Open(std::unique_ptr<Contents> contents); |
163 | void Close(); |
164 | |
165 | // Construct a RowGroupWriter for the indicated number of rows. |
166 | // |
167 | // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
168 | // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
169 | // @param num_rows The number of rows that are stored in the new RowGroup |
170 | // |
171 | // \deprecated Since 1.3.0 |
172 | RowGroupWriter* AppendRowGroup(int64_t num_rows); |
173 | |
174 | /// Construct a RowGroupWriter with an arbitrary number of rows. |
175 | /// |
176 | /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
177 | /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
178 | RowGroupWriter* AppendRowGroup(); |
179 | |
180 | /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready. |
181 | /// Use this if you want to write a RowGroup based on a certain size |
182 | /// |
183 | /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
184 | /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
185 | RowGroupWriter* AppendBufferedRowGroup(); |
186 | |
187 | /// Number of columns. |
188 | /// |
189 | /// This number is fixed during the lifetime of the writer as it is determined via |
190 | /// the schema. |
191 | int num_columns() const; |
192 | |
193 | /// Number of rows in the yet started RowGroups. |
194 | /// |
195 | /// Changes on the addition of a new RowGroup. |
196 | int64_t num_rows() const; |
197 | |
198 | /// Number of started RowGroups. |
199 | int num_row_groups() const; |
200 | |
201 | /// Configuration passed to the writer, e.g. the used Parquet format version. |
202 | const std::shared_ptr<WriterProperties>& properties() const; |
203 | |
204 | /// Returns the file schema descriptor |
205 | const SchemaDescriptor* schema() const; |
206 | |
207 | /// Returns a column descriptor in schema |
208 | const ColumnDescriptor* descr(int i) const; |
209 | |
210 | /// Returns the file custom metadata |
211 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const; |
212 | |
213 | private: |
214 | // Holds a pointer to an instance of Contents implementation |
215 | std::unique_ptr<Contents> contents_; |
216 | }; |
217 | |
218 | } // namespace parquet |
219 | |
220 | #endif // PARQUET_FILE_WRITER_H |
221 | |