1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_IO_HDFS |
19 | #define ARROW_IO_HDFS |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | #include <string> |
24 | #include <unordered_map> |
25 | #include <vector> |
26 | |
27 | #include "arrow/io/interfaces.h" |
28 | #include "arrow/util/macros.h" |
29 | #include "arrow/util/visibility.h" |
30 | |
31 | namespace arrow { |
32 | |
33 | class Buffer; |
34 | class MemoryPool; |
35 | class Status; |
36 | |
37 | namespace io { |
38 | |
39 | class HdfsReadableFile; |
40 | class HdfsOutputStream; |
41 | |
// Metadata describing a single path in HDFS. Filled in by
// HadoopFileSystem::GetPathInfo and, one record per entry, by
// HadoopFileSystem::ListDirectory.
struct HdfsPathInfo {
  // Kind of object the path refers to (an ObjectType value)
  ObjectType::type kind;

  std::string name;
  std::string owner;
  std::string group;

  // Sizes -- NOTE(review): presumably expressed in bytes; confirm against the
  // implementation
  int64_t size;
  int64_t block_size;

  // Access times in UNIX timestamps (seconds)
  int32_t last_modified_time;
  int32_t last_access_time;

  // NOTE(review): presumably the HDFS replication factor and the permission
  // bits of the path; confirm against the implementation
  int16_t replication;
  int16_t permissions;
};
59 | |
// Client library used to talk to HDFS. The underlying type is char, so a
// value occupies a single byte.
enum class HdfsDriver : char { LIBHDFS, LIBHDFS3 };

// Parameters passed to HadoopFileSystem::Connect.
//
// NOTE: members have no default initializers; callers are expected to set
// every field they rely on (in particular `port` and `driver`).
struct HdfsConnectionConfig {
  // Host name (or address) of the cluster to connect to
  std::string host;
  // Port on `host`
  int port;
  // User to connect as
  std::string user;
  // NOTE(review): presumably the path to a Kerberos ticket cache; confirm
  // against the implementation
  std::string kerb_ticket;
  // Additional configuration entries as key/value pairs.
  // NOTE(review): presumably forwarded to the selected driver when
  // connecting; confirm against the implementation
  std::unordered_map<std::string, std::string> extra_conf;
  // Which client library to use for the connection
  HdfsDriver driver;
};
70 | |
// Client for an HDFS cluster, exposed through the generic FileSystem
// interface plus a number of HDFS-specific operations.
//
// The constructor is private: obtain an instance through the static
// Connect() factory. Implementation details are hidden behind the
// forward-declared HadoopFileSystemImpl. The class is not copyable.
class ARROW_EXPORT HadoopFileSystem : public FileSystem {
 public:
  ~HadoopFileSystem() override;

  // Connect to an HDFS cluster given a configuration
  //
  // @param config (in): configuration for connecting
  // @param fs (out): the created client
  // @returns Status
  static Status Connect(const HdfsConnectionConfig* config,
                        std::shared_ptr<HadoopFileSystem>* fs);

  // Create directory and all parents
  //
  // @param path (in): absolute HDFS path
  // @returns Status
  Status MakeDirectory(const std::string& path) override;

  // Delete file or directory
  //
  // @param path (in): absolute path to data
  // @param recursive (in): if path is a directory, delete contents as well
  // @returns error status on failure
  Status Delete(const std::string& path, bool recursive = false);

  // Delete a directory (FileSystem interface)
  //
  // @param path (in): absolute HDFS path of the directory
  // @returns Status
  Status DeleteDirectory(const std::string& path) override;

  // Disconnect from cluster
  //
  // @returns Status
  Status Disconnect();

  // Check whether a path exists. Because the result is a plain bool, a
  // failure of the check itself cannot be told apart from "does not exist".
  //
  // @param path (in): absolute HDFS path
  // @returns bool, true if the path exists, false if not (or on error)
  bool Exists(const std::string& path);

  // Retrieve metadata for a single path
  //
  // @param path (in): absolute HDFS path
  // @param info (out)
  // @returns Status
  Status GetPathInfo(const std::string& path, HdfsPathInfo* info);

  // @param nbytes (out): total capacity of the filesystem
  // @returns Status
  Status GetCapacity(int64_t* nbytes);

  // @param nbytes (out): total bytes used of the filesystem
  // @returns Status
  Status GetUsed(int64_t* nbytes);

  // List the children of a directory (FileSystem interface)
  //
  // @param path (in): absolute HDFS path
  // @param listing (out): one string per child entry
  // @returns Status
  Status GetChildren(const std::string& path, std::vector<std::string>* listing) override;

  // List a directory, producing an HdfsPathInfo record per entry instead of
  // a plain string
  //
  // @param path (in): absolute HDFS path
  // @param listing (out): one HdfsPathInfo per entry
  // @returns Status
  Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing);

  /// Change the owner and/or the group of a path
  ///
  /// @param path file path to change
  /// @param owner pass null for no change
  /// @param group pass null for no change
  /// @returns Status
  Status Chown(const std::string& path, const char* owner, const char* group);

  /// Change path permissions
  ///
  /// \param path Absolute path in file system
  /// \param mode Mode bitset
  /// \return Status
  Status Chmod(const std::string& path, int mode);

  // Move file or directory from source path to destination path within the
  // current filesystem
  Status Rename(const std::string& src, const std::string& dst) override;

  // Retrieve statistics for a path (FileSystem interface)
  //
  // @param path (in): absolute HDFS path
  // @param stat (out)
  // @returns Status
  Status Stat(const std::string& path, FileStatistics* stat) override;

  // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory

  // Open an HDFS file in READ mode. Returns error
  // status if the file is not found.
  //
  // @param path complete file path
  // @param buffer_size NOTE(review): presumably the read buffer size handed
  //   to the driver; confirm against the implementation
  // @param file (out): the opened file
  Status OpenReadable(const std::string& path, int32_t buffer_size,
                      std::shared_ptr<HdfsReadableFile>* file);

  // Overload of OpenReadable without an explicit buffer size
  Status OpenReadable(const std::string& path, std::shared_ptr<HdfsReadableFile>* file);

  // FileMode::WRITE options
  // @param path complete file path
  // @param append NOTE(review): presumably true to append to an existing
  //   file; confirm against the implementation
  // @param buffer_size, 0 for default
  // @param replication, 0 for default
  // @param default_block_size, 0 for default
  // @param file (out): the opened output stream
  Status OpenWritable(const std::string& path, bool append, int32_t buffer_size,
                      int16_t replication, int64_t default_block_size,
                      std::shared_ptr<HdfsOutputStream>* file);

  // Overload of OpenWritable without explicit buffer size, replication and
  // block size
  Status OpenWritable(const std::string& path, bool append,
                      std::shared_ptr<HdfsOutputStream>* file);

  // Deprecated spelling ("Writeable") kept for existing callers; use
  // OpenWritable in new code.
  ARROW_DEPRECATED("Use OpenWritable" )
  Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size,
                       int16_t replication, int64_t default_block_size,
                       std::shared_ptr<HdfsOutputStream>* file);

  // Deprecated spelling ("Writeable") kept for existing callers; use
  // OpenWritable in new code.
  ARROW_DEPRECATED("Use OpenWritable" )
  Status OpenWriteable(const std::string& path, bool append,
                       std::shared_ptr<HdfsOutputStream>* file);

 private:
  // The file classes are granted access to the private members of the
  // filesystem they were opened from.
  friend class HdfsReadableFile;
  friend class HdfsOutputStream;

  // Implementation class, defined outside this header and not exported
  class ARROW_NO_EXPORT HadoopFileSystemImpl;
  std::unique_ptr<HadoopFileSystemImpl> impl_;

  // Private: instances are created through Connect()
  HadoopFileSystem();
  ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem);
};
185 | |
// A file in HDFS opened for reading, exposed through the RandomAccessFile
// interface.
//
// The constructor is private and HadoopFileSystemImpl is a friend: instances
// are obtained through HadoopFileSystem::OpenReadable. The class is not
// copyable.
class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile {
 public:
  ~HdfsReadableFile() override;

  Status Close() override;

  // @returns true once the file has been closed
  bool closed() const override;

  // @param size (out): size of the file
  Status GetSize(int64_t* size) override;

  // NOTE: If you wish to read a particular range of a file in a multithreaded
  // context, you may prefer to use ReadAt to avoid locking issues
  //
  // @param nbytes (in): number of bytes requested
  // @param bytes_read (out): number of bytes actually read
  // @param buffer (out): caller-provided destination memory
  Status Read(int64_t nbytes, int64_t* bytes_read, void* buffer) override;

  // Variant of Read that returns the data in a Buffer instead of writing
  // into caller-provided memory
  Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override;

  // Read starting at an explicit position rather than at the current one
  //
  // @param position (in): offset in the file at which to start reading
  // @param nbytes (in): number of bytes requested
  // @param bytes_read (out): number of bytes actually read
  // @param buffer (out): caller-provided destination memory
  Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read,
                void* buffer) override;

  // Variant of ReadAt that returns the data in a Buffer
  Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr<Buffer>* out) override;

  // Move the current position / report the current position
  Status Seek(int64_t position) override;
  Status Tell(int64_t* position) const override;

  // Replace the memory pool associated with this file.
  // NOTE(review): presumably the pool used to allocate the Buffers returned
  // by Read/ReadAt; confirm against the implementation
  void set_memory_pool(MemoryPool* pool);

 private:
  // Private: instances are created by the filesystem implementation
  explicit HdfsReadableFile(MemoryPool* pool = NULLPTR);

  // Implementation class, defined outside this header and not exported
  class ARROW_NO_EXPORT HdfsReadableFileImpl;
  std::unique_ptr<HdfsReadableFileImpl> impl_;

  friend class HadoopFileSystem::HadoopFileSystemImpl;

  ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile);
};
222 | |
// A file in HDFS opened for writing.
//
// This class is named OutputStream, and derives from OutputStream rather than
// from the WritableFile interface, because it does not support seeking.
//
// The constructor is private and HadoopFileSystemImpl is a friend: instances
// are obtained through HadoopFileSystem::OpenWritable. The class is not
// copyable.
class ARROW_EXPORT HdfsOutputStream : public OutputStream {
 public:
  ~HdfsOutputStream() override;

  Status Close() override;

  // @returns true once the stream has been closed
  bool closed() const override;

  // @param buffer (in): data to write
  // @param nbytes (in): number of bytes to write from buffer
  Status Write(const void* buffer, int64_t nbytes) override;

  // HDFS-specific overload (not part of the OutputStream interface)
  //
  // @param buffer (in): data to write
  // @param nbytes (in): number of bytes to write from buffer
  // @param bytes_written (out): NOTE(review): presumably the number of bytes
  //   actually written; confirm against the implementation
  Status Write(const void* buffer, int64_t nbytes, int64_t* bytes_written);

  Status Flush() override;

  // @param position (out): current position in the stream
  Status Tell(int64_t* position) const override;

 private:
  // Implementation class, defined outside this header and not exported
  class ARROW_NO_EXPORT HdfsOutputStreamImpl;
  std::unique_ptr<HdfsOutputStreamImpl> impl_;

  friend class HadoopFileSystem::HadoopFileSystemImpl;

  // Private: instances are created by the filesystem implementation
  HdfsOutputStream();

  ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream);
};
251 | |
// Report, as a Status, whether the driver matching HdfsDriver::LIBHDFS /
// HdfsDriver::LIBHDFS3 respectively can be used.
// NOTE(review): presumably an OK status means the corresponding shared
// library was found and loaded; confirm against the implementation
Status ARROW_EXPORT HaveLibHdfs();
Status ARROW_EXPORT HaveLibHdfs3();
254 | |
255 | } // namespace io |
256 | } // namespace arrow |
257 | |
258 | #endif // ARROW_IO_HDFS |
259 | |