// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
17
18#ifndef ARROW_IO_HDFS
19#define ARROW_IO_HDFS
20
21#include <cstdint>
22#include <memory>
23#include <string>
24#include <unordered_map>
25#include <vector>
26
27#include "arrow/io/interfaces.h"
28#include "arrow/util/macros.h"
29#include "arrow/util/visibility.h"
30
31namespace arrow {
32
// Forward declarations of arrow types that this header only uses through
// pointers, references or return types.
class Buffer;
class MemoryPool;
class Status;
36
37namespace io {
38
// Forward declarations: HadoopFileSystem names these classes (as friends and
// in its Open* signatures) before their definitions further down.
class HdfsReadableFile;
class HdfsOutputStream;
41
42struct HdfsPathInfo {
43 ObjectType::type kind;
44
45 std::string name;
46 std::string owner;
47 std::string group;
48
49 // Access times in UNIX timestamps (seconds)
50 int64_t size;
51 int64_t block_size;
52
53 int32_t last_modified_time;
54 int32_t last_access_time;
55
56 int16_t replication;
57 int16_t permissions;
58};
59
// Identifies the HDFS driver selected for a connection (see
// HdfsConnectionConfig::driver). The underlying type is char.
enum class HdfsDriver : char { LIBHDFS, LIBHDFS3 };
61
62struct HdfsConnectionConfig {
63 std::string host;
64 int port;
65 std::string user;
66 std::string kerb_ticket;
67 std::unordered_map<std::string, std::string> extra_conf;
68 HdfsDriver driver;
69};
70
71class ARROW_EXPORT HadoopFileSystem : public FileSystem {
72 public:
73 ~HadoopFileSystem() override;
74
75 // Connect to an HDFS cluster given a configuration
76 //
77 // @param config (in): configuration for connecting
78 // @param fs (out): the created client
79 // @returns Status
80 static Status Connect(const HdfsConnectionConfig* config,
81 std::shared_ptr<HadoopFileSystem>* fs);
82
83 // Create directory and all parents
84 //
85 // @param path (in): absolute HDFS path
86 // @returns Status
87 Status MakeDirectory(const std::string& path) override;
88
89 // Delete file or directory
90 // @param path: absolute path to data
91 // @param recursive: if path is a directory, delete contents as well
92 // @returns error status on failure
93 Status Delete(const std::string& path, bool recursive = false);
94
95 Status DeleteDirectory(const std::string& path) override;
96
97 // Disconnect from cluster
98 //
99 // @returns Status
100 Status Disconnect();
101
102 // @param path (in): absolute HDFS path
103 // @returns bool, true if the path exists, false if not (or on error)
104 bool Exists(const std::string& path);
105
106 // @param path (in): absolute HDFS path
107 // @param info (out)
108 // @returns Status
109 Status GetPathInfo(const std::string& path, HdfsPathInfo* info);
110
111 // @param nbytes (out): total capacity of the filesystem
112 // @returns Status
113 Status GetCapacity(int64_t* nbytes);
114
115 // @param nbytes (out): total bytes used of the filesystem
116 // @returns Status
117 Status GetUsed(int64_t* nbytes);
118
119 Status GetChildren(const std::string& path, std::vector<std::string>* listing) override;
120
121 Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing);
122
123 /// Change
124 ///
125 /// @param path file path to change
126 /// @param owner pass null for no change
127 /// @param group pass null for no change
128 Status Chown(const std::string& path, const char* owner, const char* group);
129
130 /// Change path permissions
131 ///
132 /// \param path Absolute path in file system
133 /// \param mode Mode bitset
134 /// \return Status
135 Status Chmod(const std::string& path, int mode);
136
137 // Move file or directory from source path to destination path within the
138 // current filesystem
139 Status Rename(const std::string& src, const std::string& dst) override;
140
141 Status Stat(const std::string& path, FileStatistics* stat) override;
142
143 // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory
144
145 // Open an HDFS file in READ mode. Returns error
146 // status if the file is not found.
147 //
148 // @param path complete file path
149 Status OpenReadable(const std::string& path, int32_t buffer_size,
150 std::shared_ptr<HdfsReadableFile>* file);
151
152 Status OpenReadable(const std::string& path, std::shared_ptr<HdfsReadableFile>* file);
153
154 // FileMode::WRITE options
155 // @param path complete file path
156 // @param buffer_size, 0 for default
157 // @param replication, 0 for default
158 // @param default_block_size, 0 for default
159 Status OpenWritable(const std::string& path, bool append, int32_t buffer_size,
160 int16_t replication, int64_t default_block_size,
161 std::shared_ptr<HdfsOutputStream>* file);
162
163 Status OpenWritable(const std::string& path, bool append,
164 std::shared_ptr<HdfsOutputStream>* file);
165
166 ARROW_DEPRECATED("Use OpenWritable")
167 Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size,
168 int16_t replication, int64_t default_block_size,
169 std::shared_ptr<HdfsOutputStream>* file);
170
171 ARROW_DEPRECATED("Use OpenWritable")
172 Status OpenWriteable(const std::string& path, bool append,
173 std::shared_ptr<HdfsOutputStream>* file);
174
175 private:
176 friend class HdfsReadableFile;
177 friend class HdfsOutputStream;
178
179 class ARROW_NO_EXPORT HadoopFileSystemImpl;
180 std::unique_ptr<HadoopFileSystemImpl> impl_;
181
182 HadoopFileSystem();
183 ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem);
184};
185
186class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile {
187 public:
188 ~HdfsReadableFile() override;
189
190 Status Close() override;
191
192 bool closed() const override;
193
194 Status GetSize(int64_t* size) override;
195
196 // NOTE: If you wish to read a particular range of a file in a multithreaded
197 // context, you may prefer to use ReadAt to avoid locking issues
198 Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override;
199
200 Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override;
201
202 Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read,
203 void* buffer) override;
204
205 Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr<Buffer>* out) override;
206
207 Status Seek(int64_t position) override;
208 Status Tell(int64_t* position) const override;
209
210 void set_memory_pool(MemoryPool* pool);
211
212 private:
213 explicit HdfsReadableFile(MemoryPool* pool = NULLPTR);
214
215 class ARROW_NO_EXPORT HdfsReadableFileImpl;
216 std::unique_ptr<HdfsReadableFileImpl> impl_;
217
218 friend class HadoopFileSystem::HadoopFileSystemImpl;
219
220 ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile);
221};
222
223// Naming this file OutputStream because it does not support seeking (like the
224// WritableFile interface)
225class ARROW_EXPORT HdfsOutputStream : public OutputStream {
226 public:
227 ~HdfsOutputStream() override;
228
229 Status Close() override;
230
231 bool closed() const override;
232
233 Status Write(const void* buffer, int64_t nbytes) override;
234
235 Status Write(const void* buffer, int64_t nbytes, int64_t* bytes_written);
236
237 Status Flush() override;
238
239 Status Tell(int64_t* position) const override;
240
241 private:
242 class ARROW_NO_EXPORT HdfsOutputStreamImpl;
243 std::unique_ptr<HdfsOutputStreamImpl> impl_;
244
245 friend class HadoopFileSystem::HadoopFileSystemImpl;
246
247 HdfsOutputStream();
248
249 ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream);
250};
251
252Status ARROW_EXPORT HaveLibHdfs();
253Status ARROW_EXPORT HaveLibHdfs3();
254
255} // namespace io
256} // namespace arrow
257
258#endif // ARROW_IO_HDFS
259