| 1 | #include <Storages/StorageFile.h> |
| 2 | #include <Storages/StorageFactory.h> |
| 3 | |
| 4 | #include <Interpreters/Context.h> |
| 5 | #include <Interpreters/evaluateConstantExpression.h> |
| 6 | |
| 7 | #include <Parsers/ASTLiteral.h> |
| 8 | #include <Parsers/ASTIdentifier.h> |
| 9 | |
| 10 | #include <IO/ReadBufferFromFile.h> |
| 11 | #include <IO/ReadHelpers.h> |
| 12 | #include <IO/WriteBufferFromFile.h> |
| 13 | #include <IO/WriteHelpers.h> |
| 14 | |
| 15 | #include <Formats/FormatFactory.h> |
| 16 | #include <DataStreams/IBlockInputStream.h> |
| 17 | #include <DataStreams/IBlockOutputStream.h> |
| 18 | #include <DataStreams/AddingDefaultsBlockInputStream.h> |
| 19 | #include <DataStreams/narrowBlockInputStreams.h> |
| 20 | |
| 21 | #include <Common/escapeForFileName.h> |
| 22 | #include <Common/typeid_cast.h> |
| 23 | #include <Common/parseGlobs.h> |
| 24 | |
| 25 | #include <fcntl.h> |
| 26 | |
| 27 | #include <Poco/Path.h> |
| 28 | #include <Poco/File.h> |
| 29 | |
| 30 | #include <re2/re2.h> |
| 31 | #include <filesystem> |
| 32 | |
| 33 | namespace fs = std::filesystem; |
| 34 | |
| 35 | namespace DB |
| 36 | { |
| 37 | |
/// Error codes referenced from this translation unit. These are declarations only;
/// the constants themselves are defined elsewhere.
namespace ErrorCodes
{
    /// Used by registerStorageFile when the second engine argument has an unsupported literal type.
    extern const int BAD_ARGUMENTS;
    extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR;
    extern const int CANNOT_SEEK_THROUGH_FILE;
    extern const int DATABASE_ACCESS_DENIED;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int UNKNOWN_IDENTIFIER;
    extern const int INCORRECT_FILE_NAME;
    extern const int FILE_DOESNT_EXIST;
    extern const int EMPTY_LIST_OF_COLUMNS_PASSED;
}
| 49 | |
| 50 | namespace |
| 51 | { |
| 52 | |
| 53 | /* Recursive directory listing with matched paths as a result. |
| 54 | * Have the same method in StorageHDFS. |
| 55 | */ |
| 56 | static std::vector<std::string> listFilesWithRegexpMatching(const std::string & path_for_ls, const std::string & for_match) |
| 57 | { |
| 58 | const size_t first_glob = for_match.find_first_of("*?{" ); |
| 59 | |
| 60 | const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/'); |
| 61 | const std::string suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' |
| 62 | |
| 63 | const size_t next_slash = suffix_with_globs.find('/', 1); |
| 64 | re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash))); |
| 65 | |
| 66 | std::vector<std::string> result; |
| 67 | const std::string prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); |
| 68 | if (!fs::exists(fs::path(prefix_without_globs.data()))) |
| 69 | { |
| 70 | return result; |
| 71 | } |
| 72 | const fs::directory_iterator end; |
| 73 | for (fs::directory_iterator it(prefix_without_globs); it != end; ++it) |
| 74 | { |
| 75 | const std::string full_path = it->path().string(); |
| 76 | const size_t last_slash = full_path.rfind('/'); |
| 77 | const String file_name = full_path.substr(last_slash); |
| 78 | const bool looking_for_directory = next_slash != std::string::npos; |
| 79 | /// Condition is_directory means what kind of path is it in current iteration of ls |
| 80 | if (!fs::is_directory(it->path()) && !looking_for_directory) |
| 81 | { |
| 82 | if (re2::RE2::FullMatch(file_name, matcher)) |
| 83 | { |
| 84 | result.push_back(it->path().string()); |
| 85 | } |
| 86 | } |
| 87 | else if (fs::is_directory(it->path()) && looking_for_directory) |
| 88 | { |
| 89 | if (re2::RE2::FullMatch(file_name, matcher)) |
| 90 | { |
| 91 | /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. |
| 92 | Strings result_part = listFilesWithRegexpMatching(full_path + "/" , suffix_with_globs.substr(next_slash)); |
| 93 | std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | return result; |
| 98 | } |
| 99 | |
| 100 | static std::string getTablePath(const std::string & table_dir_path, const std::string & format_name) |
| 101 | { |
| 102 | return table_dir_path + "/data." + escapeForFileName(format_name); |
| 103 | } |
| 104 | |
| 105 | /// Both db_dir_path and table_path must be converted to absolute paths (in particular, path cannot contain '..'). |
| 106 | static void checkCreationIsAllowed(const Context & context_global, const std::string & db_dir_path, const std::string & table_path) |
| 107 | { |
| 108 | if (context_global.getApplicationType() != Context::ApplicationType::SERVER) |
| 109 | return; |
| 110 | |
| 111 | /// "/dev/null" is allowed for perf testing |
| 112 | if (!startsWith(table_path, db_dir_path) && table_path != "/dev/null" ) |
| 113 | throw Exception("Part path " + table_path + " is not inside " + db_dir_path, ErrorCodes::DATABASE_ACCESS_DENIED); |
| 114 | |
| 115 | Poco::File table_path_poco_file = Poco::File(table_path); |
| 116 | if (table_path_poco_file.exists() && table_path_poco_file.isDirectory()) |
| 117 | throw Exception("File " + table_path + " must not be a directory" , ErrorCodes::INCORRECT_FILE_NAME); |
| 118 | } |
| 119 | } |
| 120 | |
| 121 | |
| 122 | StorageFile::StorageFile(int table_fd_, CommonArguments args) |
| 123 | : StorageFile(args) |
| 124 | { |
| 125 | if (args.context.getApplicationType() == Context::ApplicationType::SERVER) |
| 126 | throw Exception("Using file descriptor as source of storage isn't allowed for server daemons" , ErrorCodes::DATABASE_ACCESS_DENIED); |
| 127 | |
| 128 | is_db_table = false; |
| 129 | use_table_fd = true; |
| 130 | table_fd = table_fd_; |
| 131 | |
| 132 | /// Save initial offset, it will be used for repeating SELECTs |
| 133 | /// If FD isn't seekable (lseek returns -1), then the second and subsequent SELECTs will fail. |
| 134 | table_fd_init_offset = lseek(table_fd, 0, SEEK_CUR); |
| 135 | } |
| 136 | |
| 137 | StorageFile::StorageFile(const std::string & table_path_, const std::string & user_files_path, CommonArguments args) |
| 138 | : StorageFile(args) |
| 139 | { |
| 140 | is_db_table = false; |
| 141 | std::string user_files_absolute_path = Poco::Path(user_files_path).makeAbsolute().makeDirectory().toString(); |
| 142 | Poco::Path poco_path = Poco::Path(table_path_); |
| 143 | if (poco_path.isRelative()) |
| 144 | poco_path = Poco::Path(user_files_absolute_path, poco_path); |
| 145 | |
| 146 | const std::string path = poco_path.absolute().toString(); |
| 147 | if (path.find_first_of("*?{" ) == std::string::npos) |
| 148 | { |
| 149 | paths.push_back(path); |
| 150 | } |
| 151 | else |
| 152 | paths = listFilesWithRegexpMatching("/" , path); |
| 153 | for (const auto & cur_path : paths) |
| 154 | checkCreationIsAllowed(args.context, user_files_absolute_path, cur_path); |
| 155 | } |
| 156 | |
| 157 | StorageFile::StorageFile(const std::string & relative_table_dir_path, CommonArguments args) |
| 158 | : StorageFile(args) |
| 159 | { |
| 160 | if (relative_table_dir_path.empty()) |
| 161 | throw Exception("Storage " + getName() + " requires data path" , ErrorCodes::INCORRECT_FILE_NAME); |
| 162 | |
| 163 | String table_dir_path = base_path + relative_table_dir_path + "/" ; |
| 164 | Poco::File(table_dir_path).createDirectories(); |
| 165 | paths = {getTablePath(table_dir_path, format_name)}; |
| 166 | } |
| 167 | |
| 168 | StorageFile::StorageFile(CommonArguments args) |
| 169 | : table_name(args.table_name), database_name(args.database_name), format_name(args.format_name) |
| 170 | , compression_method(args.compression_method), base_path(args.context.getPath()) |
| 171 | { |
| 172 | setColumns(args.columns); |
| 173 | setConstraints(args.constraints); |
| 174 | } |
| 175 | |
| 176 | class StorageFileBlockInputStream : public IBlockInputStream |
| 177 | { |
| 178 | public: |
| 179 | StorageFileBlockInputStream(std::shared_ptr<StorageFile> storage_, |
| 180 | const Context & context, UInt64 max_block_size, |
| 181 | std::string file_path, |
| 182 | const CompressionMethod compression_method) |
| 183 | : storage(std::move(storage_)) |
| 184 | { |
| 185 | if (storage->use_table_fd) |
| 186 | { |
| 187 | unique_lock = std::unique_lock(storage->rwlock); |
| 188 | |
| 189 | /// We could use common ReadBuffer and WriteBuffer in storage to leverage cache |
| 190 | /// and add ability to seek unseekable files, but cache sync isn't supported. |
| 191 | |
| 192 | if (storage->table_fd_was_used) /// We need seek to initial position |
| 193 | { |
| 194 | if (storage->table_fd_init_offset < 0) |
| 195 | throw Exception("File descriptor isn't seekable, inside " + storage->getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); |
| 196 | |
| 197 | /// ReadBuffer's seek() doesn't make sense, since cache is empty |
| 198 | if (lseek(storage->table_fd, storage->table_fd_init_offset, SEEK_SET) < 0) |
| 199 | throwFromErrno("Cannot seek file descriptor, inside " + storage->getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); |
| 200 | } |
| 201 | |
| 202 | storage->table_fd_was_used = true; |
| 203 | read_buf = getReadBuffer<ReadBufferFromFileDescriptor>(compression_method, storage->table_fd); |
| 204 | } |
| 205 | else |
| 206 | { |
| 207 | shared_lock = std::shared_lock(storage->rwlock); |
| 208 | read_buf = getReadBuffer<ReadBufferFromFile>(compression_method, file_path); |
| 209 | } |
| 210 | |
| 211 | reader = FormatFactory::instance().getInput(storage->format_name, *read_buf, storage->getSampleBlock(), context, max_block_size); |
| 212 | } |
| 213 | |
| 214 | String getName() const override |
| 215 | { |
| 216 | return storage->getName(); |
| 217 | } |
| 218 | |
| 219 | Block readImpl() override |
| 220 | { |
| 221 | return reader->read(); |
| 222 | } |
| 223 | |
| 224 | Block () const override { return reader->getHeader(); } |
| 225 | |
| 226 | void readPrefixImpl() override |
| 227 | { |
| 228 | reader->readPrefix(); |
| 229 | } |
| 230 | |
| 231 | void readSuffixImpl() override |
| 232 | { |
| 233 | reader->readSuffix(); |
| 234 | } |
| 235 | |
| 236 | private: |
| 237 | std::shared_ptr<StorageFile> storage; |
| 238 | Block sample_block; |
| 239 | std::unique_ptr<ReadBuffer> read_buf; |
| 240 | BlockInputStreamPtr reader; |
| 241 | |
| 242 | std::shared_lock<std::shared_mutex> shared_lock; |
| 243 | std::unique_lock<std::shared_mutex> unique_lock; |
| 244 | }; |
| 245 | |
| 246 | |
| 247 | BlockInputStreams StorageFile::read( |
| 248 | const Names & /*column_names*/, |
| 249 | const SelectQueryInfo & /*query_info*/, |
| 250 | const Context & context, |
| 251 | QueryProcessingStage::Enum /*processed_stage*/, |
| 252 | size_t max_block_size, |
| 253 | unsigned num_streams) |
| 254 | { |
| 255 | const ColumnsDescription & columns_ = getColumns(); |
| 256 | auto column_defaults = columns_.getDefaults(); |
| 257 | BlockInputStreams blocks_input; |
| 258 | if (use_table_fd) /// need to call ctr BlockInputStream |
| 259 | paths = {"" }; /// when use fd, paths are empty |
| 260 | else |
| 261 | { |
| 262 | if (paths.size() == 1 && !Poco::File(paths[0]).exists()) |
| 263 | throw Exception("File " + paths[0] + " doesn't exist" , ErrorCodes::FILE_DOESNT_EXIST); |
| 264 | } |
| 265 | blocks_input.reserve(paths.size()); |
| 266 | for (const auto & file_path : paths) |
| 267 | { |
| 268 | BlockInputStreamPtr cur_block = std::make_shared<StorageFileBlockInputStream>( |
| 269 | std::static_pointer_cast<StorageFile>(shared_from_this()), context, max_block_size, file_path, IStorage::chooseCompressionMethod(file_path, compression_method)); |
| 270 | blocks_input.push_back(column_defaults.empty() ? cur_block : std::make_shared<AddingDefaultsBlockInputStream>(cur_block, column_defaults, context)); |
| 271 | } |
| 272 | return narrowBlockInputStreams(blocks_input, num_streams); |
| 273 | } |
| 274 | |
| 275 | |
| 276 | class StorageFileBlockOutputStream : public IBlockOutputStream |
| 277 | { |
| 278 | public: |
| 279 | explicit StorageFileBlockOutputStream(StorageFile & storage_, |
| 280 | const CompressionMethod compression_method, |
| 281 | const Context & context) |
| 282 | : storage(storage_), lock(storage.rwlock) |
| 283 | { |
| 284 | if (storage.use_table_fd) |
| 285 | { |
| 286 | /** NOTE: Using real file binded to FD may be misleading: |
| 287 | * SELECT *; INSERT insert_data; SELECT *; last SELECT returns initil_fd_data + insert_data |
| 288 | * INSERT data; SELECT *; last SELECT returns only insert_data |
| 289 | */ |
| 290 | storage.table_fd_was_used = true; |
| 291 | write_buf = getWriteBuffer<WriteBufferFromFileDescriptor>(compression_method, storage.table_fd); |
| 292 | } |
| 293 | else |
| 294 | { |
| 295 | if (storage.paths.size() != 1) |
| 296 | throw Exception("Table '" + storage.table_name + "' is in readonly mode because of globs in filepath" , ErrorCodes::DATABASE_ACCESS_DENIED); |
| 297 | write_buf = getWriteBuffer<WriteBufferFromFile>(compression_method, storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); |
| 298 | } |
| 299 | |
| 300 | writer = FormatFactory::instance().getOutput(storage.format_name, *write_buf, storage.getSampleBlock(), context); |
| 301 | } |
| 302 | |
| 303 | Block () const override { return storage.getSampleBlock(); } |
| 304 | |
| 305 | void write(const Block & block) override |
| 306 | { |
| 307 | writer->write(block); |
| 308 | } |
| 309 | |
| 310 | void writePrefix() override |
| 311 | { |
| 312 | writer->writePrefix(); |
| 313 | } |
| 314 | |
| 315 | void writeSuffix() override |
| 316 | { |
| 317 | writer->writeSuffix(); |
| 318 | } |
| 319 | |
| 320 | void flush() override |
| 321 | { |
| 322 | writer->flush(); |
| 323 | } |
| 324 | |
| 325 | private: |
| 326 | StorageFile & storage; |
| 327 | std::unique_lock<std::shared_mutex> lock; |
| 328 | std::unique_ptr<WriteBuffer> write_buf; |
| 329 | BlockOutputStreamPtr writer; |
| 330 | }; |
| 331 | |
| 332 | BlockOutputStreamPtr StorageFile::write( |
| 333 | const ASTPtr & /*query*/, |
| 334 | const Context & context) |
| 335 | { |
| 336 | return std::make_shared<StorageFileBlockOutputStream>(*this, |
| 337 | IStorage::chooseCompressionMethod(paths[0], compression_method), context); |
| 338 | } |
| 339 | |
| 340 | Strings StorageFile::getDataPaths() const |
| 341 | { |
| 342 | if (paths.empty()) |
| 343 | throw Exception("Table '" + table_name + "' is in readonly mode" , ErrorCodes::DATABASE_ACCESS_DENIED); |
| 344 | return paths; |
| 345 | } |
| 346 | |
| 347 | void StorageFile::rename(const String & new_path_to_table_data, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) |
| 348 | { |
| 349 | if (!is_db_table) |
| 350 | throw Exception("Can't rename table '" + table_name + "' binded to user-defined file (or FD)" , ErrorCodes::DATABASE_ACCESS_DENIED); |
| 351 | |
| 352 | if (paths.size() != 1) |
| 353 | throw Exception("Can't rename table '" + table_name + "' in readonly mode" , ErrorCodes::DATABASE_ACCESS_DENIED); |
| 354 | |
| 355 | std::unique_lock<std::shared_mutex> lock(rwlock); |
| 356 | |
| 357 | std::string path_new = getTablePath(base_path + new_path_to_table_data, format_name); |
| 358 | Poco::File(Poco::Path(path_new).parent()).createDirectories(); |
| 359 | Poco::File(paths[0]).renameTo(path_new); |
| 360 | |
| 361 | paths[0] = std::move(path_new); |
| 362 | table_name = new_table_name; |
| 363 | database_name = new_database_name; |
| 364 | } |
| 365 | |
| 366 | |
| 367 | void registerStorageFile(StorageFactory & factory) |
| 368 | { |
| 369 | factory.registerStorage("File" , [](const StorageFactory::Arguments & args) |
| 370 | { |
| 371 | ASTs & engine_args = args.engine_args; |
| 372 | |
| 373 | if (!(engine_args.size() >= 1 && engine_args.size() <= 3)) |
| 374 | throw Exception( |
| 375 | "Storage File requires from 1 to 3 arguments: name of used format, source and compression_method." , |
| 376 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
| 377 | |
| 378 | engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); |
| 379 | String format_name = engine_args[0]->as<ASTLiteral &>().value.safeGet<String>(); |
| 380 | |
| 381 | String compression_method; |
| 382 | StorageFile::CommonArguments common_args{args.database_name, args.table_name, format_name, compression_method, |
| 383 | args.columns, args.constraints, args.context}; |
| 384 | |
| 385 | if (engine_args.size() == 1) /// Table in database |
| 386 | return StorageFile::create(args.relative_data_path, common_args); |
| 387 | |
| 388 | /// Will use FD if engine_args[1] is int literal or identifier with std* name |
| 389 | int source_fd = -1; |
| 390 | String source_path; |
| 391 | |
| 392 | if (auto opt_name = tryGetIdentifierName(engine_args[1])) |
| 393 | { |
| 394 | if (*opt_name == "stdin" ) |
| 395 | source_fd = STDIN_FILENO; |
| 396 | else if (*opt_name == "stdout" ) |
| 397 | source_fd = STDOUT_FILENO; |
| 398 | else if (*opt_name == "stderr" ) |
| 399 | source_fd = STDERR_FILENO; |
| 400 | else |
| 401 | throw Exception("Unknown identifier '" + *opt_name + "' in second arg of File storage constructor" , |
| 402 | ErrorCodes::UNKNOWN_IDENTIFIER); |
| 403 | } |
| 404 | else if (const auto * literal = engine_args[1]->as<ASTLiteral>()) |
| 405 | { |
| 406 | auto type = literal->value.getType(); |
| 407 | if (type == Field::Types::Int64) |
| 408 | source_fd = static_cast<int>(literal->value.get<Int64>()); |
| 409 | else if (type == Field::Types::UInt64) |
| 410 | source_fd = static_cast<int>(literal->value.get<UInt64>()); |
| 411 | else if (type == Field::Types::String) |
| 412 | source_path = literal->value.get<String>(); |
| 413 | else |
| 414 | throw Exception("Second argument must be path or file descriptor" , ErrorCodes::BAD_ARGUMENTS); |
| 415 | } |
| 416 | |
| 417 | if (engine_args.size() == 3) |
| 418 | { |
| 419 | engine_args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[2], args.local_context); |
| 420 | compression_method = engine_args[2]->as<ASTLiteral &>().value.safeGet<String>(); |
| 421 | } |
| 422 | else |
| 423 | compression_method = "auto" ; |
| 424 | |
| 425 | if (0 <= source_fd) /// File descriptor |
| 426 | return StorageFile::create(source_fd, common_args); |
| 427 | else /// User's file |
| 428 | return StorageFile::create(source_path, args.context.getUserFilesPath(), common_args); |
| 429 | }); |
| 430 | } |
| 431 | |
| 432 | } |
| 433 | |