| 1 | // Licensed to the Apache Software Foundation (ASF) under one |
| 2 | // or more contributor license agreements. See the NOTICE file |
| 3 | // distributed with this work for additional information |
| 4 | // regarding copyright ownership. The ASF licenses this file |
| 5 | // to you under the Apache License, Version 2.0 (the |
| 6 | // "License"); you may not use this file except in compliance |
| 7 | // with the License. You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, |
| 12 | // software distributed under the License is distributed on an |
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | // KIND, either express or implied. See the License for the |
| 15 | // specific language governing permissions and limitations |
| 16 | // under the License. |
| 17 | |
| 18 | #ifndef PARQUET_FILE_WRITER_H |
| 19 | #define PARQUET_FILE_WRITER_H |
| 20 | |
| 21 | #include <cstdint> |
| 22 | #include <memory> |
| 23 | |
| 24 | #include "parquet/metadata.h" |
| 25 | #include "parquet/platform.h" |
| 26 | #include "parquet/properties.h" |
| 27 | #include "parquet/schema.h" |
| 28 | |
| 29 | namespace parquet { |
| 30 | |
| 31 | class ColumnWriter; |
| 32 | class OutputStream; |
| 33 | |
| 34 | class PARQUET_EXPORT RowGroupWriter { |
| 35 | public: |
| 36 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
| 37 | // easily create test fixtures |
| 38 | // An implementation of the Contents class is defined in the .cc file |
| 39 | struct Contents { |
| 40 | virtual ~Contents() = default; |
| 41 | virtual int num_columns() const = 0; |
| 42 | virtual int64_t num_rows() const = 0; |
| 43 | |
| 44 | // to be used only with ParquetFileWriter::AppendRowGroup |
| 45 | virtual ColumnWriter* NextColumn() = 0; |
| 46 | // to be used only with ParquetFileWriter::AppendBufferedRowGroup |
| 47 | virtual ColumnWriter* column(int i) = 0; |
| 48 | |
| 49 | virtual int current_column() const = 0; |
| 50 | virtual void Close() = 0; |
| 51 | |
| 52 | // total bytes written by the page writer |
| 53 | virtual int64_t total_bytes_written() const = 0; |
| 54 | // total bytes still compressed but not written |
| 55 | virtual int64_t total_compressed_bytes() const = 0; |
| 56 | }; |
| 57 | |
| 58 | explicit RowGroupWriter(std::unique_ptr<Contents> contents); |
| 59 | |
| 60 | /// Construct a ColumnWriter for the indicated row group-relative column. |
| 61 | /// |
| 62 | /// To be used only with ParquetFileWriter::AppendRowGroup |
| 63 | /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only |
| 64 | /// valid until the next call to NextColumn or Close. As the contents are |
| 65 | /// directly written to the sink, once a new column is started, the contents |
| 66 | /// of the previous one cannot be modified anymore. |
| 67 | ColumnWriter* NextColumn(); |
| 68 | /// Index of currently written column |
| 69 | int current_column(); |
| 70 | void Close(); |
| 71 | |
| 72 | int num_columns() const; |
| 73 | |
| 74 | /// Construct a ColumnWriter for the indicated row group column. |
| 75 | /// |
| 76 | /// To be used only with ParquetFileWriter::AppendBufferedRowGroup |
| 77 | /// Ownership is solely within the RowGroupWriter. The ColumnWriter is |
| 78 | /// valid until Close. The contents are buffered in memory and written to sink |
| 79 | /// on Close |
| 80 | ColumnWriter* column(int i); |
| 81 | |
| 82 | /** |
| 83 | * Number of rows that shall be written as part of this RowGroup. |
| 84 | */ |
| 85 | int64_t num_rows() const; |
| 86 | |
| 87 | int64_t total_bytes_written() const; |
| 88 | int64_t total_compressed_bytes() const; |
| 89 | |
| 90 | private: |
| 91 | // Holds a pointer to an instance of Contents implementation |
| 92 | std::unique_ptr<Contents> contents_; |
| 93 | }; |
| 94 | |
| 95 | ARROW_DEPRECATED("Use version with arrow::io::OutputStream*" ) |
| 96 | PARQUET_EXPORT |
| 97 | void WriteFileMetaData(const FileMetaData& file_metadata, OutputStream* sink); |
| 98 | |
| 99 | PARQUET_EXPORT |
| 100 | void WriteFileMetaData(const FileMetaData& file_metadata, |
| 101 | ::arrow::io::OutputStream* sink); |
| 102 | |
| 103 | PARQUET_EXPORT |
| 104 | void WriteMetaDataFile(const FileMetaData& file_metadata, |
| 105 | ::arrow::io::OutputStream* sink); |
| 106 | |
| 107 | class PARQUET_EXPORT ParquetFileWriter { |
| 108 | public: |
| 109 | // Forward declare a virtual class 'Contents' to aid dependency injection and more |
| 110 | // easily create test fixtures |
| 111 | // An implementation of the Contents class is defined in the .cc file |
| 112 | struct Contents { |
| 113 | Contents(const std::shared_ptr<::parquet::schema::GroupNode>& schema, |
| 114 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) |
| 115 | : schema_(), key_value_metadata_(key_value_metadata) { |
| 116 | schema_.Init(schema); |
| 117 | } |
| 118 | virtual ~Contents() {} |
| 119 | // Perform any cleanup associated with the file contents |
| 120 | virtual void Close() = 0; |
| 121 | |
| 122 | /// \note Deprecated since 1.3.0 |
| 123 | RowGroupWriter* AppendRowGroup(int64_t num_rows); |
| 124 | |
| 125 | virtual RowGroupWriter* AppendRowGroup() = 0; |
| 126 | virtual RowGroupWriter* AppendBufferedRowGroup() = 0; |
| 127 | |
| 128 | virtual int64_t num_rows() const = 0; |
| 129 | virtual int num_columns() const = 0; |
| 130 | virtual int num_row_groups() const = 0; |
| 131 | |
| 132 | virtual const std::shared_ptr<WriterProperties>& properties() const = 0; |
| 133 | |
| 134 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const { |
| 135 | return key_value_metadata_; |
| 136 | } |
| 137 | |
| 138 | // Return const-pointer to make it clear that this object is not to be copied |
| 139 | const SchemaDescriptor* schema() const { return &schema_; } |
| 140 | |
| 141 | SchemaDescriptor schema_; |
| 142 | |
| 143 | /// This should be the only place this is stored. Everything else is a const reference |
| 144 | std::shared_ptr<const KeyValueMetadata> key_value_metadata_; |
| 145 | |
| 146 | const std::shared_ptr<FileMetaData> metadata() const { return file_metadata_; } |
| 147 | std::shared_ptr<FileMetaData> file_metadata_; |
| 148 | }; |
| 149 | |
| 150 | ParquetFileWriter(); |
| 151 | ~ParquetFileWriter(); |
| 152 | |
| 153 | static std::unique_ptr<ParquetFileWriter> Open( |
| 154 | const std::shared_ptr<::arrow::io::OutputStream>& sink, |
| 155 | const std::shared_ptr<schema::GroupNode>& schema, |
| 156 | const std::shared_ptr<WriterProperties>& properties = default_writer_properties(), |
| 157 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
| 158 | |
| 159 | ARROW_DEPRECATED("Use version with arrow::io::OutputStream" ) |
| 160 | static std::unique_ptr<ParquetFileWriter> Open( |
| 161 | const std::shared_ptr<OutputStream>& sink, |
| 162 | const std::shared_ptr<schema::GroupNode>& schema, |
| 163 | const std::shared_ptr<WriterProperties>& properties = default_writer_properties(), |
| 164 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR); |
| 165 | |
| 166 | void Open(std::unique_ptr<Contents> contents); |
| 167 | void Close(); |
| 168 | |
| 169 | // Construct a RowGroupWriter for the indicated number of rows. |
| 170 | // |
| 171 | // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
| 172 | // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
| 173 | // @param num_rows The number of rows that are stored in the new RowGroup |
| 174 | // |
| 175 | // \deprecated Since 1.3.0 |
| 176 | RowGroupWriter* AppendRowGroup(int64_t num_rows); |
| 177 | |
| 178 | /// Construct a RowGroupWriter with an arbitrary number of rows. |
| 179 | /// |
| 180 | /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
| 181 | /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
| 182 | RowGroupWriter* AppendRowGroup(); |
| 183 | |
| 184 | /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready. |
| 185 | /// Use this if you want to write a RowGroup based on a certain size |
| 186 | /// |
| 187 | /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid |
| 188 | /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close. |
| 189 | RowGroupWriter* AppendBufferedRowGroup(); |
| 190 | |
| 191 | /// Number of columns. |
| 192 | /// |
| 193 | /// This number is fixed during the lifetime of the writer as it is determined via |
| 194 | /// the schema. |
| 195 | int num_columns() const; |
| 196 | |
| 197 | /// Number of rows in the yet started RowGroups. |
| 198 | /// |
| 199 | /// Changes on the addition of a new RowGroup. |
| 200 | int64_t num_rows() const; |
| 201 | |
| 202 | /// Number of started RowGroups. |
| 203 | int num_row_groups() const; |
| 204 | |
| 205 | /// Configuration passed to the writer, e.g. the used Parquet format version. |
| 206 | const std::shared_ptr<WriterProperties>& properties() const; |
| 207 | |
| 208 | /// Returns the file schema descriptor |
| 209 | const SchemaDescriptor* schema() const; |
| 210 | |
| 211 | /// Returns a column descriptor in schema |
| 212 | const ColumnDescriptor* descr(int i) const; |
| 213 | |
| 214 | /// Returns the file custom metadata |
| 215 | const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const; |
| 216 | |
| 217 | /// Returns the file metadata, only available after calling Close(). |
| 218 | const std::shared_ptr<FileMetaData> metadata() const; |
| 219 | |
| 220 | private: |
| 221 | // Holds a pointer to an instance of Contents implementation |
| 222 | std::unique_ptr<Contents> contents_; |
| 223 | std::shared_ptr<FileMetaData> file_metadata_; |
| 224 | }; |
| 225 | |
| 226 | } // namespace parquet |
| 227 | |
| 228 | #endif // PARQUET_FILE_WRITER_H |
| 229 | |