| 1 | #include <Interpreters/DDLWorker.h> |
| 2 | #include <Parsers/ASTAlterQuery.h> |
| 3 | #include <Parsers/ASTDropQuery.h> |
| 4 | #include <Parsers/ASTOptimizeQuery.h> |
| 5 | #include <Parsers/ASTQueryWithOnCluster.h> |
| 6 | #include <Parsers/ASTQueryWithTableAndOutput.h> |
| 7 | #include <Parsers/ParserQuery.h> |
| 8 | #include <Parsers/parseQuery.h> |
| 9 | #include <Parsers/queryToString.h> |
| 10 | #include <IO/WriteHelpers.h> |
| 11 | #include <IO/ReadHelpers.h> |
| 12 | #include <IO/Operators.h> |
| 13 | #include <IO/ReadBufferFromString.h> |
| 14 | #include <Storages/IStorage.h> |
| 15 | #include <DataStreams/IBlockInputStream.h> |
| 16 | #include <Interpreters/executeQuery.h> |
| 17 | #include <Interpreters/Cluster.h> |
| 18 | #include <Interpreters/AddDefaultDatabaseVisitor.h> |
| 19 | #include <Common/DNSResolver.h> |
| 20 | #include <Common/Macros.h> |
| 21 | #include <Common/getFQDNOrHostName.h> |
| 22 | #include <Common/setThreadName.h> |
| 23 | #include <Common/Stopwatch.h> |
| 24 | #include <Common/randomSeed.h> |
| 25 | #include <common/sleep.h> |
| 26 | #include <DataTypes/DataTypesNumber.h> |
| 27 | #include <DataTypes/DataTypeString.h> |
| 28 | #include <DataTypes/DataTypeArray.h> |
| 29 | #include <Columns/ColumnsNumber.h> |
| 30 | #include <Columns/ColumnString.h> |
| 31 | #include <Columns/ColumnArray.h> |
| 32 | #include <Common/ZooKeeper/ZooKeeper.h> |
| 33 | #include <Common/ZooKeeper/KeeperException.h> |
| 34 | #include <Common/ZooKeeper/Lock.h> |
| 35 | #include <Common/isLocalAddress.h> |
| 36 | #include <Storages/StorageReplicatedMergeTree.h> |
| 37 | #include <Poco/Timestamp.h> |
| 38 | #include <random> |
| 39 | #include <pcg_random.hpp> |
| 40 | #include <Poco/Net/NetException.h> |
| 41 | |
| 42 | |
| 43 | namespace DB |
| 44 | { |
| 45 | |
| 46 | namespace ErrorCodes |
| 47 | { |
| 48 | extern const int LOGICAL_ERROR; |
| 49 | extern const int UNKNOWN_ELEMENT_IN_CONFIG; |
| 50 | extern const int INVALID_CONFIG_PARAMETER; |
| 51 | extern const int UNKNOWN_FORMAT_VERSION; |
| 52 | extern const int INCONSISTENT_TABLE_ACCROSS_SHARDS; |
| 53 | extern const int INCONSISTENT_CLUSTER_DEFINITION; |
| 54 | extern const int TIMEOUT_EXCEEDED; |
| 55 | extern const int UNKNOWN_TYPE_OF_QUERY; |
| 56 | extern const int UNFINISHED; |
| 57 | extern const int UNKNOWN_STATUS_OF_DISTRIBUTED_DDL_TASK; |
| 58 | extern const int QUERY_IS_PROHIBITED; |
| 59 | } |
| 60 | |
| 61 | |
| 62 | namespace |
| 63 | { |
| 64 | |
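|  | /// Identifies a host from a DDL task's host list as a host_name:port pair. |
|  | /// Provides (de)serialization via Cluster::Address and a check whether the address refers to the local server. |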
| 65 | struct HostID |
| 66 | { |
| 67 | String host_name; |
| 68 | UInt16 port; |
| 69 | |
| 70 | HostID() = default; |
| 71 | |
| 72 | explicit HostID(const Cluster::Address & address) |
| 73 | : host_name(address.host_name), port(address.port) {} |
| 74 | |
| 75 | static HostID fromString(const String & host_port_str) |
| 76 | { |
| 77 | HostID res; |
| 78 | std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str); |
| 79 | return res; |
| 80 | } |
| 81 | |
| 82 | String toString() const |
| 83 | { |
| 84 | return Cluster::Address::toString(host_name, port); |
| 85 | } |
| 86 | |
| 87 | String readableString() const |
| 88 | { |
| 89 | return host_name + ":" + DB::toString(port); |
| 90 | } |
| 91 | |
| 92 | bool isLocalAddress(UInt16 clickhouse_port) const |
| 93 | { |
| 94 | try |
| 95 | { |
| 96 | return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port); |
| 97 | } |
| 98 | catch (const Poco::Net::NetException &) |
| 99 | { |
| 100 | /// Avoid "Host not found" exceptions |
| 101 | return false; |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | static String applyToString(const HostID & host_id) |
| 106 | { |
| 107 | return host_id.toString(); |
| 108 | } |
| 109 | }; |
| 110 | |
| 111 | } |
| 112 | |
| 113 | |
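|  | /// Payload of a single node in the distributed DDL queue: the query text, |
|  | /// the list of hosts that must execute it and the optional initiator host. |
|  | /// Serialized in a simple versioned "key: value" text format (see toString() and parse()). |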
| 114 | struct DDLLogEntry |
| 115 | { |
| 116 | String query; |
| 117 | std::vector<HostID> hosts; |
| 118 | String initiator; // optional |
| 119 | |
| 120 | static constexpr int CURRENT_VERSION = 1; |
| 121 | |
| 122 | String toString() |
| 123 | { |
| 124 | WriteBufferFromOwnString wb; |
| 125 | |
| 126 | Strings host_id_strings(hosts.size()); |
| 127 | std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); |
| 128 | |
| 129 | auto version = CURRENT_VERSION; |
| 130 | wb << "version: " << version << "\n" ; |
| 131 | wb << "query: " << escape << query << "\n" ; |
| 132 | wb << "hosts: " << host_id_strings << "\n" ; |
| 133 | wb << "initiator: " << initiator << "\n" ; |
| 134 | |
| 135 | return wb.str(); |
| 136 | } |
| 137 | |
| 138 | void parse(const String & data) |
| 139 | { |
| 140 | ReadBufferFromString rb(data); |
| 141 | |
| 142 | int version; |
| 143 | rb >> "version: " >> version >> "\n" ; |
| 144 | |
| 145 | if (version != CURRENT_VERSION) |
| 146 | throw Exception("Unknown DDLLogEntry format version: " + DB::toString(version), ErrorCodes::UNKNOWN_FORMAT_VERSION); |
| 147 | |
| 148 | Strings host_id_strings; |
| 149 | rb >> "query: " >> escape >> query >> "\n" ; |
| 150 | rb >> "hosts: " >> host_id_strings >> "\n" ; |
| 151 | |
| 152 | if (!rb.eof()) |
| 153 | rb >> "initiator: " >> initiator >> "\n" ; |
| 154 | else |
| 155 | initiator.clear(); |
| 156 | |
| 157 | assertEOF(rb); |
| 158 | |
| 159 | hosts.resize(host_id_strings.size()); |
| 160 | std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); |
| 161 | } |
| 162 | }; |
| 163 | |
| 164 | |
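|  | /// In-memory state of a single queue entry while it is being processed by this host. |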
| 165 | struct DDLTask |
| 166 | { |
| 167 | /// Stages of the task lifetime correspond to the ordering of these data fields: |
| 168 | |
| 169 | /// Stage 1: parse entry |
| 170 | String entry_name; |
| 171 | String entry_path; |
| 172 | DDLLogEntry entry; |
| 173 | |
| 174 | /// Stage 2: resolve host_id and check that the current host is present in the task's host list |
| 175 | HostID host_id; |
| 176 | String host_id_str; |
| 177 | |
| 178 | /// Stage 3.1: parse query |
| 179 | ASTPtr query; |
| 180 | ASTQueryWithOnCluster * query_on_cluster = nullptr; |
| 181 | |
| 182 | /// Stage 3.2: check cluster and find the host in cluster |
| 183 | String cluster_name; |
| 184 | ClusterPtr cluster; |
| 185 | Cluster::Address address_in_cluster; |
| 186 | size_t host_shard_num; |
| 187 | size_t host_replica_num; |
| 188 | |
| 189 | /// Stage 3.3: execute query |
| 190 | ExecutionStatus execution_status; |
| 191 | bool was_executed = false; |
| 192 | |
| 193 | /// Stage 4: commit results to ZooKeeper |
| 194 | }; |
| 195 | |
| 196 | |
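|  | /// Wraps an existing ZooKeeper session into a holder and creates a simple zkutil::Lock |
|  | /// on the node <lock_prefix>/<lock_name> with lock_message attached to it. |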
| 197 | static std::unique_ptr<zkutil::Lock> createSimpleZooKeeperLock( |
| 198 | const std::shared_ptr<zkutil::ZooKeeper> & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) |
| 199 | { |
| 200 | auto zookeeper_holder = std::make_shared<zkutil::ZooKeeperHolder>(); |
| 201 | zookeeper_holder->initFromInstance(zookeeper); |
| 202 | return std::make_unique<zkutil::Lock>(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); |
| 203 | } |
| 204 | |
| 205 | |
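|  | /// Returns false for ALTER command types that are not allowed in ON CLUSTER queries |
|  | /// (partition manipulation and FREEZE commands); such queries are rejected by executeDDLQueryOnCluster(). |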
| 206 | static bool isSupportedAlterType(int type) |
| 207 | { |
| 208 | static const std::unordered_set<int> unsupported_alter_types{ |
| 209 | ASTAlterCommand::ATTACH_PARTITION, |
| 210 | ASTAlterCommand::REPLACE_PARTITION, |
| 211 | ASTAlterCommand::FETCH_PARTITION, |
| 212 | ASTAlterCommand::FREEZE_PARTITION, |
| 213 | ASTAlterCommand::FREEZE_ALL, |
| 214 | ASTAlterCommand::NO_TYPE, |
| 215 | }; |
| 216 | |
| 217 | return unsupported_alter_types.count(type) == 0; |
| 218 | } |
| 219 | |
| 220 | |
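|  | /// Reads optional settings (task_max_lifetime, cleanup_delay_period, max_tasks_in_queue, profile) |
|  | /// from the given config section and starts the processing and cleanup threads. |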
| 221 | DDLWorker::DDLWorker(const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix) |
| 222 | : context(context_), log(&Logger::get("DDLWorker" )) |
| 223 | { |
| 224 | queue_dir = zk_root_dir; |
| 225 | if (queue_dir.back() == '/') |
| 226 | queue_dir.resize(queue_dir.size() - 1); |
| 227 | |
| 228 | if (config) |
| 229 | { |
| 230 | task_max_lifetime = config->getUInt64(prefix + ".task_max_lifetime" , static_cast<UInt64>(task_max_lifetime)); |
| 231 | cleanup_delay_period = config->getUInt64(prefix + ".cleanup_delay_period" , static_cast<UInt64>(cleanup_delay_period)); |
| 232 | max_tasks_in_queue = std::max<UInt64>(1, config->getUInt64(prefix + ".max_tasks_in_queue" , max_tasks_in_queue)); |
| 233 | |
| 234 | if (config->has(prefix + ".profile" )) |
| 235 | context.setSetting("profile" , config->getString(prefix + ".profile" )); |
| 236 | } |
| 237 | |
| 238 | if (context.getSettingsRef().readonly) |
| 239 | { |
| 240 | LOG_WARNING(log, "Distributed DDL worker is run with readonly settings, it will not be able to execute DDL queries." |
| 241 | << " Set an appropriate system_profile or distributed_ddl.profile to fix this."); |
| 242 | } |
| 243 | |
| 244 | host_fqdn = getFQDNOrHostName(); |
| 245 | host_fqdn_id = Cluster::Address::toString(host_fqdn, context.getTCPPort()); |
| 246 | |
| 247 | main_thread = ThreadFromGlobalPool(&DDLWorker::runMainThread, this); |
| 248 | cleanup_thread = ThreadFromGlobalPool(&DDLWorker::runCleanupThread, this); |
| 249 | } |
| 250 | |
| 251 | |
| 252 | DDLWorker::~DDLWorker() |
| 253 | { |
| 254 | stop_flag = true; |
| 255 | queue_updated_event->set(); |
| 256 | cleanup_event->set(); |
| 257 | main_thread.join(); |
| 258 | cleanup_thread.join(); |
| 259 | } |
| 260 | |
| 261 | |
| 262 | DDLWorker::ZooKeeperPtr DDLWorker::tryGetZooKeeper() const |
| 263 | { |
| 264 | std::lock_guard lock(zookeeper_mutex); |
| 265 | return current_zookeeper; |
| 266 | } |
| 267 | |
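|  | /// Returns a usable ZooKeeper session, re-acquiring it from the Context if the cached one is missing or expired. |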
| 268 | DDLWorker::ZooKeeperPtr DDLWorker::getAndSetZooKeeper() |
| 269 | { |
| 270 | std::lock_guard lock(zookeeper_mutex); |
| 271 | |
| 272 | if (!current_zookeeper || current_zookeeper->expired()) |
| 273 | current_zookeeper = context.getZooKeeper(); |
| 274 | |
| 275 | return current_zookeeper; |
| 276 | } |
| 277 | |
| 278 | |
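|  | /// Reads the task entry from ZooKeeper, parses it and checks whether the current host is in its host list. |
|  | /// On success the task becomes current_task; otherwise out_reason explains why the task is skipped. |
|  | /// If the entry cannot be parsed at all, an error status is written to its "finished" dir on a best-effort basis. |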
| 279 | bool DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) |
| 280 | { |
| 281 | String node_data; |
| 282 | String entry_path = queue_dir + "/" + entry_name; |
| 283 | |
| 284 | if (!zookeeper->tryGet(entry_path, node_data)) |
| 285 | { |
| 286 | /// It is Ok if the node was deleted just now. It means that there is no current host in the node's host list. |
| 287 | out_reason = "The task was deleted" ; |
| 288 | return false; |
| 289 | } |
| 290 | |
| 291 | auto task = std::make_unique<DDLTask>(); |
| 292 | task->entry_name = entry_name; |
| 293 | task->entry_path = entry_path; |
| 294 | |
| 295 | try |
| 296 | { |
| 297 | task->entry.parse(node_data); |
| 298 | } |
| 299 | catch (...) |
| 300 | { |
| 301 | /// What should we do if we cannot even parse the host name and therefore cannot properly submit the execution status? |
| 302 | /// We can try to create a failure node using the FQDN; if it equals the host name in the cluster config, the attempt will succeed. |
| 303 | /// Otherwise, that node will be ignored by DDLQueryStatusInputStream. |
| 304 | |
| 305 | tryLogCurrentException(log, "Cannot parse DDL task " + entry_name + ", will try to send error status" ); |
| 306 | |
| 307 | String status = ExecutionStatus::fromCurrentException().serializeText(); |
| 308 | try |
| 309 | { |
| 310 | createStatusDirs(entry_path, zookeeper); |
| 311 | zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, status, zkutil::CreateMode::Persistent); |
| 312 | } |
| 313 | catch (...) |
| 314 | { |
| 315 | tryLogCurrentException(log, "Can't report that the task has an invalid format"); |
| 316 | } |
| 317 | |
| 318 | out_reason = "Incorrect task format" ; |
| 319 | return false; |
| 320 | } |
| 321 | |
| 322 | bool host_in_hostlist = false; |
| 323 | for (const HostID & host : task->entry.hosts) |
| 324 | { |
| 325 | auto maybe_secure_port = context.getTCPPortSecure(); |
| 326 | |
| 327 | /// The port is considered local if it matches the TCP or secure TCP port that the server is listening on. |
| 328 | bool is_local_port = (maybe_secure_port && host.isLocalAddress(*maybe_secure_port)) |
| 329 | || host.isLocalAddress(context.getTCPPort()); |
| 330 | |
| 331 | if (!is_local_port) |
| 332 | continue; |
| 333 | |
| 334 | if (host_in_hostlist) |
| 335 | { |
| 336 | /// This check could be a little bit slow |
| 337 | LOG_WARNING(log, "There are two identical ClickHouse instances in task " << entry_name |
| 338 | << ": " << task->host_id.readableString() << " and " << host.readableString() << ". Will use the first one only."); |
| 339 | } |
| 340 | else |
| 341 | { |
| 342 | host_in_hostlist = true; |
| 343 | task->host_id = host; |
| 344 | task->host_id_str = host.toString(); |
| 345 | } |
| 346 | } |
| 347 | |
| 348 | if (host_in_hostlist) |
| 349 | current_task = std::move(task); |
| 350 | else |
| 351 | out_reason = "There is no local address in the host list"; |
| 352 | |
| 353 | return host_in_hostlist; |
| 354 | } |
| 355 | |
| 356 | |
| 357 | static void filterAndSortQueueNodes(Strings & all_nodes) |
| 358 | { |
| 359 | all_nodes.erase(std::remove_if(all_nodes.begin(), all_nodes.end(), [] (const String & s) { return !startsWith(s, "query-" ); }), all_nodes.end()); |
| 360 | std::sort(all_nodes.begin(), all_nodes.end()); |
| 361 | } |
| 362 | |
| 363 | |
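|  | /// Processes queue entries in order, starting after the last processed one (or from the beginning at server startup), |
|  | /// and sets a watch on the queue directory so that the main thread wakes up when new entries appear. |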
| 364 | void DDLWorker::processTasks() |
| 365 | { |
| 366 | LOG_DEBUG(log, "Processing tasks" ); |
| 367 | auto zookeeper = tryGetZooKeeper(); |
| 368 | |
| 369 | Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event); |
| 370 | filterAndSortQueueNodes(queue_nodes); |
| 371 | if (queue_nodes.empty()) |
| 372 | return; |
| 373 | |
| 374 | bool server_startup = last_processed_task_name.empty(); |
| 375 | |
| 376 | auto begin_node = server_startup |
| 377 | ? queue_nodes.begin() |
| 378 | : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_processed_task_name); |
| 379 | |
| 380 | for (auto it = begin_node; it != queue_nodes.end(); ++it) |
| 381 | { |
| 382 | String entry_name = *it; |
| 383 | |
| 384 | if (current_task) |
| 385 | { |
| 386 | if (current_task->entry_name == entry_name) |
| 387 | { |
| 388 | LOG_INFO(log, "Trying to process task " << entry_name << " again" ); |
| 389 | } |
| 390 | else |
| 391 | { |
| 392 | LOG_INFO(log, "Task " << current_task->entry_name << " was deleted from ZooKeeper before current host committed it" ); |
| 393 | current_task = nullptr; |
| 394 | } |
| 395 | } |
| 396 | |
| 397 | if (!current_task) |
| 398 | { |
| 399 | String reason; |
| 400 | if (!initAndCheckTask(entry_name, reason, zookeeper)) |
| 401 | { |
| 402 | LOG_DEBUG(log, "Will not execute task " << entry_name << ": " << reason); |
| 403 | last_processed_task_name = entry_name; |
| 404 | continue; |
| 405 | } |
| 406 | } |
| 407 | |
| 408 | DDLTask & task = *current_task; |
| 409 | |
| 410 | bool already_processed = zookeeper->exists(task.entry_path + "/finished/" + task.host_id_str); |
| 411 | if (!server_startup && !task.was_executed && already_processed) |
| 412 | { |
| 413 | throw Exception( |
| 414 | "Server expects that DDL task " + task.entry_name + " should be processed, but it was already processed according to ZK" , |
| 415 | ErrorCodes::LOGICAL_ERROR); |
| 416 | } |
| 417 | |
| 418 | if (!already_processed) |
| 419 | { |
| 420 | try |
| 421 | { |
| 422 | processTask(task, zookeeper); |
| 423 | } |
| 424 | catch (...) |
| 425 | { |
| 426 | LOG_WARNING(log, "An error occurred while processing task " << task.entry_name << " (" << task.entry.query << ") : " |
| 427 | << getCurrentExceptionMessage(true)); |
| 428 | throw; |
| 429 | } |
| 430 | } |
| 431 | else |
| 432 | { |
| 433 | LOG_DEBUG(log, "Task " << task.entry_name << " (" << task.entry.query << ") has been already processed" ); |
| 434 | } |
| 435 | |
| 436 | last_processed_task_name = task.entry_name; |
| 437 | current_task.reset(); |
| 438 | |
| 439 | if (stop_flag) |
| 440 | break; |
| 441 | } |
| 442 | } |
| 443 | |
| 444 | |
| 445 | /// Parses query and resolves cluster and host in cluster |
| 446 | void DDLWorker::parseQueryAndResolveHost(DDLTask & task) |
| 447 | { |
| 448 | { |
| 449 | const char * begin = task.entry.query.data(); |
| 450 | const char * end = begin + task.entry.query.size(); |
| 451 | |
| 452 | ParserQuery parser_query(end); |
| 453 | String description; |
| 454 | task.query = parseQuery(parser_query, begin, end, description, 0); |
| 455 | } |
| 456 | |
| 457 | // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! |
| 458 | if (!task.query || !(task.query_on_cluster = dynamic_cast<ASTQueryWithOnCluster *>(task.query.get()))) |
| 459 | throw Exception("Received unknown DDL query" , ErrorCodes::UNKNOWN_TYPE_OF_QUERY); |
| 460 | |
| 461 | task.cluster_name = task.query_on_cluster->cluster; |
| 462 | task.cluster = context.tryGetCluster(task.cluster_name); |
| 463 | if (!task.cluster) |
| 464 | { |
| 465 | throw Exception("DDL task " + task.entry_name + " contains current host " + task.host_id.readableString() |
| 466 | + " in cluster " + task.cluster_name + ", but there is no such cluster here.", ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION); |
| 467 | } |
| 468 | |
| 469 | /// Try to find the host from the task host list in the cluster definition. |
| 470 | /// At first, try to find an exact match (host name and port should be literally equal). |
| 471 | /// If that attempt fails, try to find it by resolving the host name of each instance. |
| 472 | const auto & shards = task.cluster->getShardsAddresses(); |
| 473 | |
| 474 | bool found_exact_match = false; |
| 475 | for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) |
| 476 | { |
| 477 | for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) |
| 478 | { |
| 479 | const Cluster::Address & address = shards[shard_num][replica_num]; |
| 480 | |
| 481 | if (address.host_name == task.host_id.host_name && address.port == task.host_id.port) |
| 482 | { |
| 483 | if (found_exact_match) |
| 484 | { |
| 485 | throw Exception("There are two exactly the same ClickHouse instances " + address.readableString() |
| 486 | + " in cluster " + task.cluster_name, ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION); |
| 487 | } |
| 488 | |
| 489 | found_exact_match = true; |
| 490 | task.host_shard_num = shard_num; |
| 491 | task.host_replica_num = replica_num; |
| 492 | task.address_in_cluster = address; |
| 493 | } |
| 494 | } |
| 495 | } |
| 496 | |
| 497 | if (found_exact_match) |
| 498 | return; |
| 499 | |
| 500 | LOG_WARNING(log, "Could not find an exact match for host " << task.host_id.readableString() << " from task " << task.entry_name |
| 501 | << " in the definition of cluster " << task.cluster_name << ". Will try to find it using host name resolution."); |
| 502 | |
| 503 | bool found_via_resolving = false; |
| 504 | for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) |
| 505 | { |
| 506 | for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) |
| 507 | { |
| 508 | const Cluster::Address & address = shards[shard_num][replica_num]; |
| 509 | |
| 510 | if (auto resolved = address.getResolvedAddress(); |
| 511 | resolved && (isLocalAddress(*resolved, context.getTCPPort()) |
| 512 | || (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure())))) |
| 513 | { |
| 514 | if (found_via_resolving) |
| 515 | { |
| 516 | throw Exception("There are two identical ClickHouse instances in cluster " + task.cluster_name + ": " |
| 517 | + task.address_in_cluster.readableString() + " and " + address.readableString(), ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION); |
| 518 | } |
| 519 | else |
| 520 | { |
| 521 | found_via_resolving = true; |
| 522 | task.host_shard_num = shard_num; |
| 523 | task.host_replica_num = replica_num; |
| 524 | task.address_in_cluster = address; |
| 525 | } |
| 526 | } |
| 527 | } |
| 528 | } |
| 529 | |
| 530 | if (!found_via_resolving) |
| 531 | { |
| 532 | throw Exception("Could not find host " + task.host_id.readableString() + " in the definition of cluster " + task.cluster_name, |
| 533 | ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION); |
| 534 | } |
| 535 | else |
| 536 | { |
| 537 | LOG_INFO(log, "Resolved host " << task.host_id.readableString() << " from task " << task.entry_name |
| 538 | << " as host " << task.address_in_cluster.readableString() << " in definition of cluster " << task.cluster_name); |
| 539 | } |
| 540 | } |
| 541 | |
| 542 | |
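|  | /// Executes the already rewritten query locally in a separate query context. |
|  | /// On failure the exception is stored in status and false is returned instead of rethrowing. |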
| 543 | bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status) |
| 544 | { |
| 545 | /// Add special comment at the start of query to easily identify DDL-produced queries in query_log |
| 546 | String query_prefix = "/* ddl_entry=" + task.entry_name + " */ " ; |
| 547 | String query_to_execute = query_prefix + query; |
| 548 | |
| 549 | ReadBufferFromString istr(query_to_execute); |
| 550 | String dummy_string; |
| 551 | WriteBufferFromString ostr(dummy_string); |
| 552 | |
| 553 | try |
| 554 | { |
| 555 | current_context = std::make_unique<Context>(context); |
| 556 | current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; |
| 557 | current_context->setCurrentQueryId("" ); // generate random query_id |
| 558 | executeQuery(istr, ostr, false, *current_context, {}, {}); |
| 559 | } |
| 560 | catch (...) |
| 561 | { |
| 562 | status = ExecutionStatus::fromCurrentException(); |
| 563 | tryLogCurrentException(log, "Query " + query + " wasn't finished successfully" ); |
| 564 | |
| 565 | return false; |
| 566 | } |
| 567 | |
| 568 | status = ExecutionStatus(0); |
| 569 | LOG_DEBUG(log, "Executed query: " << query); |
| 570 | |
| 571 | return true; |
| 572 | } |
| 573 | |
| 574 | void DDLWorker::attachToThreadGroup() |
| 575 | { |
| 576 | if (thread_group) |
| 577 | { |
| 578 | /// Put all threads to one thread pool |
| 579 | CurrentThread::attachToIfDetached(thread_group); |
| 580 | } |
| 581 | else |
| 582 | { |
| 583 | CurrentThread::initializeQuery(); |
| 584 | thread_group = CurrentThread::getGroup(); |
| 585 | } |
| 586 | } |
| 587 | |
| 588 | |
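|  | /// Executes a single task. ZooKeeper layout used for each queue entry: |
|  | ///   <queue_dir>/query-XXXXXXXXXX            - serialized DDLLogEntry |
|  | ///   <queue_dir>/query-XXXXXXXXXX/active     - ephemeral nodes of hosts currently executing the query |
|  | ///   <queue_dir>/query-XXXXXXXXXX/finished   - persistent nodes with the per-host ExecutionStatus |
|  | ///   <queue_dir>/query-XXXXXXXXXX/shards/... - per-shard nodes used by tryExecuteQueryOnLeaderReplica() |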
| 589 | void DDLWorker::processTask(DDLTask & task, const ZooKeeperPtr & zookeeper) |
| 590 | { |
| 591 | LOG_DEBUG(log, "Processing task " << task.entry_name << " (" << task.entry.query << ")" ); |
| 592 | |
| 593 | String dummy; |
| 594 | String active_node_path = task.entry_path + "/active/" + task.host_id_str; |
| 595 | String finished_node_path = task.entry_path + "/finished/" + task.host_id_str; |
| 596 | |
| 597 | auto code = zookeeper->tryCreate(active_node_path, "" , zkutil::CreateMode::Ephemeral, dummy); |
| 598 | |
| 599 | if (code == Coordination::ZOK || code == Coordination::ZNODEEXISTS) |
| 600 | { |
| 601 | // Ok |
| 602 | } |
| 603 | else if (code == Coordination::ZNONODE) |
| 604 | { |
| 605 | /// There is no parent |
| 606 | createStatusDirs(task.entry_path, zookeeper); |
| 607 | if (Coordination::ZOK != (code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy))) |
| 608 | throw Coordination::Exception(code, active_node_path); |
| 609 | } |
| 610 | else |
| 611 | throw Coordination::Exception(code, active_node_path); |
| 612 | |
| 613 | if (!task.was_executed) |
| 614 | { |
| 615 | try |
| 616 | { |
| 617 | parseQueryAndResolveHost(task); |
| 618 | |
| 619 | ASTPtr rewritten_ast = task.query_on_cluster->getRewrittenASTWithoutOnCluster(task.address_in_cluster.default_database); |
| 620 | String rewritten_query = queryToString(rewritten_ast); |
| 621 | LOG_DEBUG(log, "Executing query: " << rewritten_query); |
| 622 | |
| 623 | if (auto query_with_table = dynamic_cast<ASTQueryWithTableAndOutput *>(rewritten_ast.get()); query_with_table) |
| 624 | { |
| 625 | String database = query_with_table->database.empty() ? context.getCurrentDatabase() : query_with_table->database; |
| 626 | StoragePtr storage = context.tryGetTable(database, query_with_table->table); |
| 627 | |
| 628 | /// For some reason we check consistency of the cluster definition only |
| 629 | /// in case of ALTER queries, but not in case of CREATE/DROP etc. |
| 630 | /// It's strange, but this behaviour has existed for a long time and we cannot change it. |
| 631 | if (storage && query_with_table->as<ASTAlterQuery>()) |
| 632 | checkShardConfig(query_with_table->table, task, storage); |
| 633 | |
| 634 | if (storage && taskShouldBeExecutedOnLeader(rewritten_ast, storage)) |
| 635 | tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); |
| 636 | else |
| 637 | tryExecuteQuery(rewritten_query, task, task.execution_status); |
| 638 | } |
| 639 | else |
| 640 | tryExecuteQuery(rewritten_query, task, task.execution_status); |
| 641 | } |
| 642 | catch (const Coordination::Exception &) |
| 643 | { |
| 644 | throw; |
| 645 | } |
| 646 | catch (...) |
| 647 | { |
| 648 | tryLogCurrentException(log, "An error occurred before execution of DDL task: " ); |
| 649 | task.execution_status = ExecutionStatus::fromCurrentException("An error occurred before execution" ); |
| 650 | } |
| 651 | |
| 652 | /// We need to distinguish ZK errors that occurred before and after query execution |
| 653 | task.was_executed = true; |
| 654 | } |
| 655 | |
| 656 | /// FIXME: if the server fails right here, the task will be executed twice. We need a WAL here. |
| 657 | |
| 658 | /// Delete active flag and create finish flag |
| 659 | Coordination::Requests ops; |
| 660 | ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); |
| 661 | ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); |
| 662 | zookeeper->multi(ops); |
| 663 | } |
| 664 | |
| 665 | |
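|  | /// A query is executed once per shard, on a leader replica, only if it is ALTER, OPTIMIZE or TRUNCATE |
|  | /// on a storage that supports replication; everything else is executed on every host separately. |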
| 666 | bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const StoragePtr storage) const |
| 667 | { |
| 668 | /// Pure DROP queries have to be executed on each node separately |
| 669 | if (auto query = ast_ddl->as<ASTDropQuery>(); query && query->kind != ASTDropQuery::Kind::Truncate) |
| 670 | return false; |
| 671 | |
| 672 | if (!ast_ddl->as<ASTAlterQuery>() && !ast_ddl->as<ASTOptimizeQuery>() && !ast_ddl->as<ASTDropQuery>()) |
| 673 | return false; |
| 674 | |
| 675 | return storage->supportsReplication(); |
| 676 | } |
| 677 | |
| 678 | |
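|  | /// Checks that the replication capability of the table agrees with the <internal_replication> |
|  | /// setting of its shard in the cluster definition. |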
| 679 | void DDLWorker::checkShardConfig(const String & table, const DDLTask & task, StoragePtr storage) const |
| 680 | { |
| 681 | const auto & shard_info = task.cluster->getShardsInfo().at(task.host_shard_num); |
| 682 | bool config_is_replicated_shard = shard_info.hasInternalReplication(); |
| 683 | |
| 684 | if (storage->supportsReplication() && !config_is_replicated_shard) |
| 685 | { |
| 686 | throw Exception("Table '" + table + "' is replicated, but shard #" + toString(task.host_shard_num + 1) + |
| 687 | " isn't replicated according to its cluster definition." |
| 688 | " Possibly <internal_replication>true</internal_replication> is forgotten in the cluster config." , |
| 689 | ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION); |
| 690 | } |
| 691 | |
| 692 | if (!storage->supportsReplication() && config_is_replicated_shard) |
| 693 | { |
| 694 | throw Exception("Table '" + table + "' isn't replicated, but shard #" + toString(task.host_shard_num + 1) + |
| 695 | " is replicated according to its cluster definition" , ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION); |
| 696 | } |
| 697 | } |
| 698 | |
| 699 | |
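|  | /// Ensures the query is executed only once per shard: replicas compete for a ZooKeeper lock under |
|  | /// <entry>/shards/<shard_node_name>, the current leader executes the query and records itself in .../executed, |
|  | /// and the other replicas observe that node. Returns false if no leader managed to execute the query within max_tries. |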
| 700 | bool DDLWorker::tryExecuteQueryOnLeaderReplica( |
| 701 | DDLTask & task, |
| 702 | StoragePtr storage, |
| 703 | const String & rewritten_query, |
| 704 | const String & node_path, |
| 705 | const ZooKeeperPtr & zookeeper) |
| 706 | { |
| 707 | StorageReplicatedMergeTree * replicated_storage = dynamic_cast<StorageReplicatedMergeTree *>(storage.get()); |
| 708 | |
| 709 | /// In case a new replicated storage type (not based on StorageReplicatedMergeTree) is developed in the future |
| 710 | if (!replicated_storage) |
| 711 | throw Exception("Storage type '" + storage->getName() + "' is not supported by distributed DDL" , ErrorCodes::NOT_IMPLEMENTED); |
| 712 | |
| 713 | /// Generate a unique name for the shard node; it will be used so that the query is executed by only a single host in the shard |
| 714 | /// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN' |
| 715 | /// Where replica_name is 'replica_config_host_name:replica_port' |
| 716 | auto get_shard_name = [] (const Cluster::Addresses & shard_addresses) |
| 717 | { |
| 718 | Strings replica_names; |
| 719 | for (const Cluster::Address & address : shard_addresses) |
| 720 | replica_names.emplace_back(address.readableString()); |
| 721 | std::sort(replica_names.begin(), replica_names.end()); |
| 722 | |
| 723 | String res; |
| 724 | for (auto it = replica_names.begin(); it != replica_names.end(); ++it) |
| 725 | res += *it + (std::next(it) != replica_names.end() ? "," : "" ); |
| 726 | |
| 727 | return res; |
| 728 | }; |
| 729 | |
| 730 | String shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); |
| 731 | String shard_path = node_path + "/shards/" + shard_node_name; |
| 732 | String is_executed_path = shard_path + "/executed" ; |
| 733 | zookeeper->createAncestors(shard_path + "/" ); |
| 734 | |
| 735 | auto is_already_executed = [&]() -> bool |
| 736 | { |
| 737 | String executed_by; |
| 738 | if (zookeeper->tryGet(is_executed_path, executed_by)) |
| 739 | { |
| 740 | LOG_DEBUG(log, "Task " << task.entry_name << " has already been executed by leader replica (" |
| 741 | << executed_by << ") of the same shard." ); |
| 742 | return true; |
| 743 | } |
| 744 | |
| 745 | return false; |
| 746 | }; |
| 747 | |
| 748 | pcg64 rng(randomSeed()); |
| 749 | |
| 750 | auto lock = createSimpleZooKeeperLock(zookeeper, shard_path, "lock" , task.host_id_str); |
| 751 | static const size_t max_tries = 20; |
| 752 | bool executed_by_leader = false; |
| 753 | for (size_t num_tries = 0; num_tries < max_tries; ++num_tries) |
| 754 | { |
| 755 | if (is_already_executed()) |
| 756 | { |
| 757 | executed_by_leader = true; |
| 758 | break; |
| 759 | } |
| 760 | |
| 761 | StorageReplicatedMergeTree::Status status; |
| 762 | replicated_storage->getStatus(status); |
| 763 | |
| 764 | /// The leader replica takes the lock |
| 765 | if (status.is_leader && lock->tryLock()) |
| 766 | { |
| 767 | if (is_already_executed()) |
| 768 | { |
| 769 | executed_by_leader = true; |
| 770 | break; |
| 771 | } |
| 772 | |
| 773 | /// If the leader has unexpectedly changed, this method will return false |
| 774 | /// and on the next iteration the new leader will take the lock |
| 775 | if (tryExecuteQuery(rewritten_query, task, task.execution_status)) |
| 776 | { |
| 777 | zookeeper->create(is_executed_path, task.host_id_str, zkutil::CreateMode::Persistent); |
| 778 | executed_by_leader = true; |
| 779 | break; |
| 780 | } |
| 781 | |
| 782 | } |
| 783 | |
| 784 | /// Does nothing if the lock wasn't previously acquired |
| 785 | lock->unlock(); |
| 786 | std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution<long>(0, 1000)(rng))); |
| 787 | } |
| 788 | |
| 789 | /// Not executed by the leader, so it was not executed at all |
| 790 | if (!executed_by_leader) |
| 791 | { |
| 792 | task.execution_status = ExecutionStatus(ErrorCodes::NOT_IMPLEMENTED, |
| 793 | "Cannot execute replicated DDL query on leader" ); |
| 794 | return false; |
| 795 | } |
| 796 | return true; |
| 797 | } |
| 798 | |
| 799 | |
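|  | /// Removes outdated queue entries: those whose lifetime exceeded task_max_lifetime and those beyond max_tasks_in_queue. |
|  | /// Entries that still have active workers or a held lock are skipped and retried on the next cleanup iteration. |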
| 800 | void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper) |
| 801 | { |
| 802 | LOG_DEBUG(log, "Cleaning queue" ); |
| 803 | |
| 804 | Strings queue_nodes = zookeeper->getChildren(queue_dir); |
| 805 | filterAndSortQueueNodes(queue_nodes); |
| 806 | |
| 807 | size_t num_outdated_nodes = (queue_nodes.size() > max_tasks_in_queue) ? queue_nodes.size() - max_tasks_in_queue : 0; |
| 808 | auto first_non_outdated_node = queue_nodes.begin() + num_outdated_nodes; |
| 809 | |
| 810 | for (auto it = queue_nodes.cbegin(); it < queue_nodes.cend(); ++it) |
| 811 | { |
| 812 | if (stop_flag) |
| 813 | return; |
| 814 | |
| 815 | String node_name = *it; |
| 816 | String node_path = queue_dir + "/" + node_name; |
| 817 | String lock_path = node_path + "/lock" ; |
| 818 | |
| 819 | Coordination::Stat stat; |
| 820 | String dummy; |
| 821 | |
| 822 | try |
| 823 | { |
| 824 | /// Already deleted |
| 825 | if (!zookeeper->exists(node_path, &stat)) |
| 826 | continue; |
| 827 | |
| 828 | /// Delete the node if its lifetime has expired (according to the task_max_lifetime parameter) |
| 829 | constexpr UInt64 zookeeper_time_resolution = 1000; |
| 830 | Int64 zookeeper_time_seconds = stat.ctime / zookeeper_time_resolution; |
| 831 | bool node_lifetime_is_expired = zookeeper_time_seconds + task_max_lifetime < current_time_seconds; |
| 832 | |
| 833 | /// If there are too many nodes in the task queue (> max_tasks_in_queue), delete the oldest ones |
| 834 | bool node_is_outside_max_window = it < first_non_outdated_node; |
| 835 | |
| 836 | if (!node_lifetime_is_expired && !node_is_outside_max_window) |
| 837 | continue; |
| 838 | |
| 839 | /// Skip if there are active nodes (it is a weak guard) |
| 840 | if (zookeeper->exists(node_path + "/active" , &stat) && stat.numChildren > 0) |
| 841 | { |
| 842 | LOG_INFO(log, "Task " << node_name << " should be deleted, but there are active workers. Skipping it." ); |
| 843 | continue; |
| 844 | } |
| 845 | |
| 846 | /// Usage of the lock is not strictly necessary now (tryRemoveRecursive correctly removes the node in the presence of concurrent cleaners) |
| 847 | /// But the lock will be required to implement system.distributed_ddl_queue table |
| 848 | auto lock = createSimpleZooKeeperLock(zookeeper, node_path, "lock" , host_fqdn_id); |
| 849 | if (!lock->tryLock()) |
| 850 | { |
| 851 | LOG_INFO(log, "Task " << node_name << " should be deleted, but it is locked. Skipping it." ); |
| 852 | continue; |
| 853 | } |
| 854 | |
| 855 | if (node_lifetime_is_expired) |
| 856 | LOG_INFO(log, "Lifetime of task " << node_name << " is expired, deleting it" ); |
| 857 | else if (node_is_outside_max_window) |
| 858 | LOG_INFO(log, "Task " << node_name << " is outdated, deleting it" ); |
| 859 | |
| 860 | /// Deleting |
| 861 | { |
| 862 | Strings children = zookeeper->getChildren(node_path); |
| 863 | for (const String & child : children) |
| 864 | { |
| 865 | if (child != "lock" ) |
| 866 | zookeeper->tryRemoveRecursive(node_path + "/" + child); |
| 867 | } |
| 868 | |
| 869 | /// Remove the lock node and its parent atomically |
| 870 | Coordination::Requests ops; |
| 871 | ops.emplace_back(zkutil::makeRemoveRequest(lock_path, -1)); |
| 872 | ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); |
| 873 | zookeeper->multi(ops); |
| 874 | |
| 875 | lock->unlockAssumeLockNodeRemovedManually(); |
| 876 | } |
| 877 | } |
| 878 | catch (...) |
| 879 | { |
| 880 | LOG_INFO(log, "An error occurred while checking and cleaning task " + node_name + " from queue: " + getCurrentExceptionMessage(false)); |
| 881 | } |
| 882 | } |
| 883 | } |
| 884 | |
| 885 | |
| 886 | /// Try to create the "status" dirs for a node if they do not exist |
| 887 | void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper) |
| 888 | { |
| 889 | Coordination::Requests ops; |
| 890 | { |
| 891 | Coordination::CreateRequest request; |
| 892 | request.path = node_path + "/active" ; |
| 893 | ops.emplace_back(std::make_shared<Coordination::CreateRequest>(std::move(request))); |
| 894 | } |
| 895 | { |
| 896 | Coordination::CreateRequest request; |
| 897 | request.path = node_path + "/finished" ; |
| 898 | ops.emplace_back(std::make_shared<Coordination::CreateRequest>(std::move(request))); |
| 899 | } |
| 900 | Coordination::Responses responses; |
| 901 | int code = zookeeper->tryMulti(ops, responses); |
| 902 | if (code && code != Coordination::ZNODEEXISTS) |
| 903 | throw Coordination::Exception(code); |
| 904 | } |
| 905 | |
| 906 | |
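|  | /// Pushes a new entry into the queue as a persistent sequential "query-" node and tries to pre-create its status dirs. |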
| 907 | String DDLWorker::enqueueQuery(DDLLogEntry & entry) |
| 908 | { |
| 909 | if (entry.hosts.empty()) |
| 910 | throw Exception("Empty host list in a distributed DDL task" , ErrorCodes::LOGICAL_ERROR); |
| 911 | |
| 912 | auto zookeeper = getAndSetZooKeeper(); |
| 913 | |
| 914 | String query_path_prefix = queue_dir + "/query-" ; |
| 915 | zookeeper->createAncestors(query_path_prefix); |
| 916 | |
| 917 | String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); |
| 918 | |
| 919 | /// Optional step |
| 920 | try |
| 921 | { |
| 922 | createStatusDirs(node_path, zookeeper); |
| 923 | } |
| 924 | catch (...) |
| 925 | { |
| 926 | LOG_INFO(log, "An error occurred while creating auxiliary ZooKeeper directories in " << node_path << ". They will be created later" |
| 927 | << ". Error: " << getCurrentExceptionMessage(true)); |
| 928 | } |
| 929 | |
| 930 | return node_path; |
| 931 | } |
| 932 | |
| 933 | |
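|  | /// Main loop: initializes the queue root in ZooKeeper, then processes tasks whenever the queue watch fires, |
|  | /// transparently recovering from expired ZooKeeper sessions. |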
| 934 | void DDLWorker::runMainThread() |
| 935 | { |
| 936 | setThreadName("DDLWorker" ); |
| 937 | LOG_DEBUG(log, "Started DDLWorker thread" ); |
| 938 | |
| 939 | bool initialized = false; |
| 940 | do |
| 941 | { |
| 942 | try |
| 943 | { |
| 944 | auto zookeeper = getAndSetZooKeeper(); |
| 945 | zookeeper->createAncestors(queue_dir + "/" ); |
| 946 | initialized = true; |
| 947 | } |
| 948 | catch (const Coordination::Exception & e) |
| 949 | { |
| 950 | if (!Coordination::isHardwareError(e.code)) |
| 951 | throw; /// A logical error. |
| 952 | |
| 953 | tryLogCurrentException(__PRETTY_FUNCTION__); |
| 954 | |
| 955 | /// Avoid busy loop when ZooKeeper is not available. |
| 956 | sleepForSeconds(1); |
| 957 | } |
| 958 | catch (...) |
| 959 | { |
| 960 | tryLogCurrentException(log, "Terminating. Cannot initialize DDL queue." ); |
| 961 | return; |
| 962 | } |
| 963 | } |
| 964 | while (!initialized && !stop_flag); |
| 965 | |
| 966 | while (!stop_flag) |
| 967 | { |
| 968 | try |
| 969 | { |
| 970 | attachToThreadGroup(); |
| 971 | |
| 972 | cleanup_event->set(); |
| 973 | processTasks(); |
| 974 | |
| 975 | LOG_DEBUG(log, "Waiting for a watch"); |
| 976 | queue_updated_event->wait(); |
| 977 | } |
| 978 | catch (const Coordination::Exception & e) |
| 979 | { |
| 980 | if (Coordination::isHardwareError(e.code)) |
| 981 | { |
| 982 | LOG_DEBUG(log, "Recovering ZooKeeper session after: " << getCurrentExceptionMessage(false)); |
| 983 | |
| 984 | while (!stop_flag) |
| 985 | { |
| 986 | try |
| 987 | { |
| 988 | getAndSetZooKeeper(); |
| 989 | break; |
| 990 | } |
| 991 | catch (...) |
| 992 | { |
| 993 | tryLogCurrentException(__PRETTY_FUNCTION__); |
| 994 | |
| 995 | using namespace std::chrono_literals; |
| 996 | std::this_thread::sleep_for(5s); |
| 997 | } |
| 998 | } |
| 999 | } |
| 1000 | else if (e.code == Coordination::ZNONODE) |
| 1001 | { |
| 1002 | LOG_ERROR(log, "ZooKeeper error: " << getCurrentExceptionMessage(true)); |
| 1003 | } |
| 1004 | else |
| 1005 | { |
| 1006 | LOG_ERROR(log, "Unexpected ZooKeeper error: " << getCurrentExceptionMessage(true) << ". Terminating." ); |
| 1007 | return; |
| 1008 | } |
| 1009 | } |
| 1010 | catch (...) |
| 1011 | { |
| 1012 | tryLogCurrentException(log, "Unexpected error, will terminate:" ); |
| 1013 | return; |
| 1014 | } |
| 1015 | } |
| 1016 | } |
| 1017 | |
| 1018 | |
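|  | /// Cleanup loop: woken by cleanup_event after each processing pass, but performs the actual cleanup |
|  | /// at most once per cleanup_delay_period seconds. |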
| 1019 | void DDLWorker::runCleanupThread() |
| 1020 | { |
| 1021 | setThreadName("DDLWorkerClnr" ); |
| 1022 | LOG_DEBUG(log, "Started DDLWorker cleanup thread" ); |
| 1023 | |
| 1024 | Int64 last_cleanup_time_seconds = 0; |
| 1025 | while (!stop_flag) |
| 1026 | { |
| 1027 | try |
| 1028 | { |
| 1029 | cleanup_event->wait(); |
| 1030 | if (stop_flag) |
| 1031 | break; |
| 1032 | |
| 1033 | Int64 current_time_seconds = Poco::Timestamp().epochTime(); |
| 1034 | if (last_cleanup_time_seconds && current_time_seconds < last_cleanup_time_seconds + cleanup_delay_period) |
| 1035 | { |
| 1036 | LOG_TRACE(log, "Too early to clean queue, will do it later." ); |
| 1037 | continue; |
| 1038 | } |
| 1039 | |
| 1040 | auto zookeeper = tryGetZooKeeper(); |
| 1041 | if (zookeeper->expired()) |
| 1042 | continue; |
| 1043 | |
| 1044 | cleanupQueue(current_time_seconds, zookeeper); |
| 1045 | last_cleanup_time_seconds = current_time_seconds; |
| 1046 | } |
| 1047 | catch (...) |
| 1048 | { |
| 1049 | tryLogCurrentException(log, __PRETTY_FUNCTION__); |
| 1050 | } |
| 1051 | } |
| 1052 | } |
| 1053 | |
| 1054 | |
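|  | /// Streams the status of a distributed DDL query back to the client: emits one row per host as soon as the host |
|  | /// reports its result under <entry>/finished, until all hosts have answered or distributed_ddl_task_timeout expires. |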
| 1055 | class DDLQueryStatusInputStream : public IBlockInputStream |
| 1056 | { |
| 1057 | public: |
| 1058 | |
| 1059 | DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) |
| 1060 | : node_path(zk_node_path), context(context_), watch(CLOCK_MONOTONIC_COARSE), log(&Logger::get("DDLQueryStatusInputStream" )) |
| 1061 | { |
| 1062 | sample = Block{ |
| 1063 | {std::make_shared<DataTypeString>(), "host" }, |
| 1064 | {std::make_shared<DataTypeUInt16>(), "port" }, |
| 1065 | {std::make_shared<DataTypeInt64>(), "status" }, |
| 1066 | {std::make_shared<DataTypeString>(), "error" }, |
| 1067 | {std::make_shared<DataTypeUInt64>(), "num_hosts_remaining" }, |
| 1068 | {std::make_shared<DataTypeUInt64>(), "num_hosts_active" }, |
| 1069 | }; |
| 1070 | |
| 1071 | for (const HostID & host: entry.hosts) |
| 1072 | waiting_hosts.emplace(host.toString()); |
| 1073 | |
| 1074 | addTotalRowsApprox(entry.hosts.size()); |
| 1075 | |
| 1076 | timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; |
| 1077 | } |
| 1078 | |
| 1079 | String getName() const override |
| 1080 | { |
| 1081 | return "DDLQueryStatusInputStream" ; |
| 1082 | } |
| 1083 | |
| 1084 | Block getHeader() const override { return sample; } |
| 1085 | |
| 1086 | Block readImpl() override |
| 1087 | { |
| 1088 | Block res; |
| 1089 | if (num_hosts_finished >= waiting_hosts.size()) |
| 1090 | { |
| 1091 | if (first_exception) |
| 1092 | throw Exception(*first_exception); |
| 1093 | |
| 1094 | return res; |
| 1095 | } |
| 1096 | |
| 1097 | auto zookeeper = context.getZooKeeper(); |
| 1098 | size_t try_number = 0; |
| 1099 | |
| 1100 | while (res.rows() == 0) |
| 1101 | { |
| 1102 | if (isCancelled()) |
| 1103 | { |
| 1104 | if (first_exception) |
| 1105 | throw Exception(*first_exception); |
| 1106 | |
| 1107 | return res; |
| 1108 | } |
| 1109 | |
| 1110 | if (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds) |
| 1111 | { |
| 1112 | size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; |
| 1113 | size_t num_active_hosts = current_active_hosts.size(); |
| 1114 | |
| 1115 | std::stringstream msg; |
| 1116 | msg << "Watching task " << node_path << " is executing longer than distributed_ddl_task_timeout" |
| 1117 | << " (=" << timeout_seconds << ") seconds." |
| 1118 | << " There are " << num_unfinished_hosts << " unfinished hosts" |
| 1119 | << " (" << num_active_hosts << " of them are currently active)" |
| 1120 | << ", they are going to execute the query in the background"; |
| 1121 | |
| 1122 | throw Exception(msg.str(), ErrorCodes::TIMEOUT_EXCEEDED); |
| 1123 | } |
| 1124 | |
| 1125 | if (num_hosts_finished != 0 || try_number != 0) |
| 1126 | { |
| 1127 | auto current_sleep_for = std::chrono::milliseconds(std::min(static_cast<size_t>(1000), 50 * (try_number + 1))); |
| 1128 | std::this_thread::sleep_for(current_sleep_for); |
| 1129 | } |
| 1130 | |
| 1131 | /// TODO: add shared lock |
| 1132 | if (!zookeeper->exists(node_path)) |
| 1133 | { |
| 1134 | throw Exception("Cannot provide query execution status. The query's node " + node_path |
| 1135 | + " has been deleted by the cleaner since it was finished (or its lifetime is expired)" , |
| 1136 | ErrorCodes::UNFINISHED); |
| 1137 | } |
| 1138 | |
| 1139 | Strings new_hosts = getNewAndUpdate(getChildrenAllowNoNode(zookeeper, node_path + "/finished" )); |
| 1140 | ++try_number; |
| 1141 | if (new_hosts.empty()) |
| 1142 | continue; |
| 1143 | |
| 1144 | current_active_hosts = getChildrenAllowNoNode(zookeeper, node_path + "/active" ); |
| 1145 | |
| 1146 | MutableColumns columns = sample.cloneEmptyColumns(); |
| 1147 | for (const String & host_id : new_hosts) |
| 1148 | { |
| 1149 | ExecutionStatus status(-1, "Cannot obtain error message" ); |
| 1150 | { |
| 1151 | String status_data; |
| 1152 | if (zookeeper->tryGet(node_path + "/finished/" + host_id, status_data)) |
| 1153 | status.tryDeserializeText(status_data); |
| 1154 | } |
| 1155 | |
| 1156 | auto [host, port] = Cluster::Address::fromString(host_id); |
| 1157 | |
| 1158 | if (status.code != 0 && first_exception == nullptr) |
| 1159 | first_exception = std::make_unique<Exception>("There was an error on [" + host + ":" + toString(port) + "]: " + status.message, status.code); |
| 1160 | |
| 1161 | ++num_hosts_finished; |
| 1162 | |
| 1163 | columns[0]->insert(host); |
| 1164 | columns[1]->insert(port); |
| 1165 | columns[2]->insert(status.code); |
| 1166 | columns[3]->insert(status.message); |
| 1167 | columns[4]->insert(waiting_hosts.size() - num_hosts_finished); |
| 1168 | columns[5]->insert(current_active_hosts.size()); |
| 1169 | } |
| 1170 | res = sample.cloneWithColumns(std::move(columns)); |
| 1171 | } |
| 1172 | |
| 1173 | return res; |
| 1174 | } |
| 1175 | |
| 1176 | Block getSampleBlock() const |
| 1177 | { |
| 1178 | return sample.cloneEmpty(); |
| 1179 | } |
| 1180 | |
| 1181 | ~DDLQueryStatusInputStream() override = default; |
| 1182 | |
| 1183 | private: |
| 1184 | |
| 1185 | static Strings getChildrenAllowNoNode(const std::shared_ptr<zkutil::ZooKeeper> & zookeeper, const String & node_path) |
| 1186 | { |
| 1187 | Strings res; |
| 1188 | int code = zookeeper->tryGetChildren(node_path, res); |
| 1189 | if (code && code != Coordination::ZNONODE) |
| 1190 | throw Coordination::Exception(code, node_path); |
| 1191 | return res; |
| 1192 | } |
| 1193 | |
| 1194 | Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts) |
| 1195 | { |
| 1196 | Strings diff; |
| 1197 | for (const String & host : current_list_of_finished_hosts) |
| 1198 | { |
| 1199 | if (!waiting_hosts.count(host)) |
| 1200 | { |
| 1201 | if (!ignoring_hosts.count(host)) |
| 1202 | { |
| 1203 | ignoring_hosts.emplace(host); |
| 1204 | LOG_INFO(log, "Unexpected host " << host << " appeared in task " << node_path); |
| 1205 | } |
| 1206 | continue; |
| 1207 | } |
| 1208 | |
| 1209 | if (!finished_hosts.count(host)) |
| 1210 | { |
| 1211 | diff.emplace_back(host); |
| 1212 | finished_hosts.emplace(host); |
| 1213 | } |
| 1214 | } |
| 1215 | |
| 1216 | return diff; |
| 1217 | } |
| 1218 | |
| 1219 | private: |
| 1220 | String node_path; |
| 1221 | const Context & context; |
| 1222 | Stopwatch watch; |
| 1223 | Logger * log; |
| 1224 | |
| 1225 | Block sample; |
| 1226 | |
| 1227 | NameSet waiting_hosts; /// hosts from task host list |
| 1228 | NameSet finished_hosts; /// finished hosts from host list |
| 1229 | NameSet ignoring_hosts; /// appeared hosts that are not in hosts list |
| 1230 | Strings current_active_hosts; /// Hosts that were in active state at the last check |
| 1231 | size_t num_hosts_finished = 0; |
| 1232 | |
| 1233 | /// Save the first detected error and throw it at the end of execution |
| 1234 | std::unique_ptr<Exception> first_exception; |
| 1235 | |
| 1236 | Int64 timeout_seconds = 120; |
| 1237 | }; |
| 1238 | |
| 1239 | |
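|  | /// Entry point for ON CLUSTER queries: validates the query and database access rights, rewrites default databases |
|  | /// if needed, enqueues a DDLLogEntry for all hosts of the cluster and returns a stream of per-host execution statuses |
|  | /// (or an empty result if distributed_ddl_task_timeout is 0). |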
| 1240 | BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context, NameSet && query_databases) |
| 1241 | { |
| 1242 | /// Remove FORMAT <fmt> and INTO OUTFILE <file> if they exist |
| 1243 | ASTPtr query_ptr = query_ptr_->clone(); |
| 1244 | ASTQueryWithOutput::resetOutputASTIfExist(*query_ptr); |
| 1245 | |
| 1246 | // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! |
| 1247 | auto * query = dynamic_cast<ASTQueryWithOnCluster *>(query_ptr.get()); |
| 1248 | if (!query) |
| 1249 | { |
| 1250 | throw Exception("Distributed execution is not supported for such DDL queries" , ErrorCodes::NOT_IMPLEMENTED); |
| 1251 | } |
| 1252 | |
| 1253 | if (!context.getSettingsRef().allow_distributed_ddl) |
| 1254 | throw Exception("Distributed DDL queries are prohibited for the user" , ErrorCodes::QUERY_IS_PROHIBITED); |
| 1255 | |
| 1256 | if (const auto * query_alter = query_ptr->as<ASTAlterQuery>()) |
| 1257 | { |
| 1258 | for (const auto & command : query_alter->command_list->commands) |
| 1259 | { |
| 1260 | if (!isSupportedAlterType(command->type)) |
| 1261 | throw Exception("Unsupported type of ALTER query" , ErrorCodes::NOT_IMPLEMENTED); |
| 1262 | } |
| 1263 | } |
| 1264 | |
| 1265 | query->cluster = context.getMacros()->expand(query->cluster); |
| 1266 | ClusterPtr cluster = context.getCluster(query->cluster); |
| 1267 | DDLWorker & ddl_worker = context.getDDLWorker(); |
| 1268 | |
| 1269 | /// Check database access rights, assume that all servers have the same users config |
| 1270 | NameSet databases_to_access; |
| 1271 | const String & current_database = context.getCurrentDatabase(); |
| 1272 | |
| 1273 | Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); |
| 1274 | |
| 1275 | std::vector<HostID> hosts; |
| 1276 | bool use_shard_default_db = false; |
| 1277 | bool use_local_default_db = false; |
| 1278 | for (const auto & shard : shards) |
| 1279 | { |
| 1280 | for (const auto & addr : shard) |
| 1281 | { |
| 1282 | hosts.emplace_back(addr); |
| 1283 | |
| 1284 | /// Expand an empty database name to the shard's default (or current) database name |
| 1285 | for (const String & database : query_databases) |
| 1286 | { |
| 1287 | if (database.empty()) |
| 1288 | { |
| 1289 | bool has_shard_default_db = !addr.default_database.empty(); |
| 1290 | use_shard_default_db |= has_shard_default_db; |
| 1291 | use_local_default_db |= !has_shard_default_db; |
| 1292 | databases_to_access.emplace(has_shard_default_db ? addr.default_database : current_database); |
| 1293 | } |
| 1294 | else |
| 1295 | databases_to_access.emplace(database); |
| 1296 | } |
| 1297 | } |
| 1298 | } |
| 1299 | |
| 1300 | if (use_shard_default_db && use_local_default_db) |
| 1301 | throw Exception("Mixed local default DB and shard default DB in DDL query" , ErrorCodes::NOT_IMPLEMENTED); |
| 1302 | |
| 1303 | if (databases_to_access.empty()) |
| 1304 | throw Exception("No databases to access in distributed DDL query" , ErrorCodes::LOGICAL_ERROR); |
| 1305 | |
| 1306 | for (const String & database : databases_to_access) |
| 1307 | context.checkDatabaseAccessRights(database); |
| 1308 | |
| 1309 | if (use_local_default_db) |
| 1310 | { |
| 1311 | AddDefaultDatabaseVisitor visitor(current_database); |
| 1312 | visitor.visitDDL(query_ptr); |
| 1313 | } |
| 1314 | |
| 1315 | DDLLogEntry entry; |
| 1316 | entry.hosts = std::move(hosts); |
| 1317 | entry.query = queryToString(query_ptr); |
| 1318 | entry.initiator = ddl_worker.getCommonHostID(); |
| 1319 | String node_path = ddl_worker.enqueueQuery(entry); |
| 1320 | |
| 1321 | BlockIO io; |
| 1322 | if (context.getSettingsRef().distributed_ddl_task_timeout == 0) |
| 1323 | return io; |
| 1324 | |
| 1325 | auto stream = std::make_shared<DDLQueryStatusInputStream>(node_path, entry, context); |
| 1326 | io.in = std::move(stream); |
| 1327 | return io; |
| 1328 | } |
| 1329 | |
| 1330 | |
| 1331 | } |
| 1332 | |