| 1 | /* |
| 2 | * hb.c |
| 3 | * |
| 4 | * Copyright (C) 2012-2017 Aerospike, Inc. |
| 5 | * |
| 6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
| 7 | * license agreements. |
| 8 | * |
| 9 | * This program is free software: you can redistribute it and/or modify it under |
| 10 | * the terms of the GNU Affero General Public License as published by the Free |
| 11 | * Software Foundation, either version 3 of the License, or (at your option) any |
| 12 | * later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
| 15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
| 17 | * details. |
| 18 | * |
| 19 | * You should have received a copy of the GNU Affero General Public License |
| 20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
| 21 | */ |
| 22 | |
| 23 | #include "fabric/hb.h" |
| 24 | |
| 25 | #include <errno.h> |
| 26 | #include <limits.h> |
| 27 | #include <math.h> |
| 28 | #include <pthread.h> |
| 29 | #include <stdio.h> |
| 30 | #include <sys/param.h> |
| 31 | #include <sys/types.h> |
| 32 | #include <zlib.h> |
| 33 | |
| 34 | #include "citrusleaf/alloc.h" |
| 35 | #include "citrusleaf/cf_atomic.h" |
| 36 | #include "citrusleaf/cf_clock.h" |
| 37 | #include "citrusleaf/cf_hash_math.h" |
| 38 | #include "citrusleaf/cf_queue.h" |
| 39 | |
| 40 | #include "cf_thread.h" |
| 41 | #include "dns.h" |
| 42 | #include "fault.h" |
| 43 | #include "node.h" |
| 44 | #include "shash.h" |
| 45 | #include "socket.h" |
| 46 | |
| 47 | #include "base/cfg.h" |
| 48 | #include "base/health.h" |
| 49 | #include "base/stats.h" |
| 50 | #include "base/thr_info.h" |
| 51 | #include "fabric/endpoint.h" |
| 52 | #include "fabric/fabric.h" |
| 53 | #include "fabric/partition_balance.h" |
| 54 | |
| 55 | /* |
| 56 | * Overview |
| 57 | * ======== |
| 58 | * The heartbeat subsystem is a core clustering module that discovers nodes in |
| 59 | * the cluster and monitors connectivity to them. This subsystem maintains an |
| 60 | * "adjacency list", which is the list of nodes deemed to be alive and connected |
| 61 | * at any instance in time. |
| 62 | * |
 * The heartbeat subsystem is divided into four sub modules
| 64 | * 1. Config |
| 65 | * 2. Channel |
| 66 | * 3. Mesh |
| 67 | * 4. Main |
| 68 | * |
| 69 | * Config |
| 70 | * ------ |
| 71 | * This sub module deals with overall heartbeat subsystem configuration and |
| 72 | * dynamic updates to configuration. |
| 73 | * |
| 74 | * Channel |
| 75 | * ------- |
| 76 | * This sub module is responsible for maintaining a channel between this node |
| 77 | * and all known nodes. The channel sub module provides the ability to broadcast |
| 78 | * or uni cast messages to known nodes. |
| 79 | * |
| 80 | * Other modules interact with the channel sub module primarily through events |
| 81 | * raised by the channel sub module. The events help other sub modules infer |
| 82 | * connectivity status to known nodes and react to incoming heartbeat message |
| 83 | * from other nodes. |
| 84 | * |
 * Depending on the configured mode (mesh, multicast) the channels between this
| 86 | * node and other nodes could be |
| 87 | * 1. TCP and hence unicast. One per pair of nodes. |
| 88 | * 2. Multicast with UDP. One per cluster. |
| 89 | * |
| 90 | * Mesh |
| 91 | * ---- |
 * This sub module is responsible for discovering cluster members. New nodes are
 * discovered via the adjacency lists published in the heartbeats of known
 * nodes.
| 94 | * The mesh module boots up using configured seed nodes. |
| 95 | * |
| 96 | * Main |
| 97 | * ---- |
| 98 | * This sub module orchestrates other modules and hence main. Its primary |
| 99 | * responsibility is to maintain the adjacency list. |
| 100 | * |
| 101 | * Heartbeat messages |
| 102 | * ================== |
| 103 | * |
| 104 | * Every heartbeat message contains |
| 105 | * 1. the source node's nodeid |
| 106 | * 2. the source node's published ip address |
| 107 | * 3. the source node's published port. |
| 108 | * |
| 109 | * There are the following types of heartbeat messages |
| 110 | * 1. Pulse - messages sent at periodic intervals. Will contain current |
| 111 | * adjacency lists |
| 112 | * 2. Info request - message sent in the mesh mode, to a known mesh node, |
| 113 | * in order to get ip address and port of a newly discovered node. |
| 114 | * 3. Info reply - message sent in response to an info request. Returns |
| 115 | * the node's ip address and port. |
| 116 | * |
| 117 | * Message conventions |
| 118 | * ------------------- |
| 119 | * 1. Published adjacency will always contain the source node. |
| 120 | * |
| 121 | * Design philosophy |
| 122 | * ================= |
| 123 | * |
| 124 | * Locking vs single threaded event loop. |
| 125 | * -------------------------------------- |
| 126 | * This first cut leans toward using locks instead of single threaded event |
| 127 | * loops to protect critical data. The choice is driven by the fact that |
| 128 | * synchronous external and inter-sub module interaction looked like more work |
| 129 | * with single threaded event loops. The design chooses simplicity over |
| 130 | * performance given the lower volumes of events that need to be processed here |
| 131 | * as compared to the transaction processing code. The locks are coarse, one per |
 * sub module and re-entrant. They are used generously and no function makes an
 * assumption about prior locks being held.
| 134 | * |
| 135 | * Inter-module interactions in some cases are via synchronous function calls, |
| 136 | * which run the risk of deadlocks. For now, deadlocks should not happen. |
| 137 | * However, if this ideology complicates code, inter-module interaction will be |
| 138 | * rewritten to use asynchronous event queues. |
| 139 | * |
| 140 | * Locking policy |
| 141 | * ============== |
| 142 | * |
| 143 | * 1. Lock as much as you can. The locks are re-entrant. This is not a critical |
| 144 | * high volume code path, and hence correctness with simplicity is preferred. |
| 145 | * Any read / write access to module state should be under a lock. |
| 146 | * 2. Preventing deadlocks |
| 147 | * a. The enforced lock order is |
| 148 | * 1. Protocol lock (SET_PROTOCOL_LOCK) Uses to ensure protocol set is |
| 149 | * atomic. |
| 150 | * 2. Main module (HB_LOCK) |
| 151 | * 3. Mesh and multicast modules (MESH_LOCK) |
| 152 | * 4. Channel (CHANNEL_LOCK) |
| 153 | * 5. Config (HB_CONFIG_LOCK) |
 *    Always make sure every thread acquires locks in this order ONLY. In terms
 *    of function calls, only lower numbered modules can call functions from the
 *    higher numbered modules while holding onto their locks.
| 157 | * 3. Events raised / messages passed to listeners should be outside the |
| 158 | * module's lock. |
| 159 | * |
| 160 | * Guidelines for message plugins |
| 161 | * ============================== |
| 162 | * The parse data functions should NOT hold any locks and thus avert deadlocks. |
| 163 | * |
| 164 | * TODO |
| 165 | * ==== |
| 166 | * 1. Extend to allow hostnames in mesh mode across the board. |
| 167 | */ |
| 168 | |
| 169 | /* |
| 170 | * ---------------------------------------------------------------------------- |
| 171 | * Macros |
| 172 | * ---------------------------------------------------------------------------- |
| 173 | */ |
| 174 | |
| 175 | /* |
| 176 | * ---------------------------------------------------------------------------- |
| 177 | * Channel |
| 178 | * ---------------------------------------------------------------------------- |
| 179 | */ |
| 180 | |
/**
 * Size of the poll events set processed per epoll iteration.
 */
#define POLL_SZ 1024

/**
 * The number of bytes used to encode the message length on the wire.
 */
#define MSG_WIRE_LENGTH_SIZE 4

/**
 * Channel idle interval after which a check for inactive channels is
 * triggered. Half the read-idle timeout so a stale channel is detected
 * before it expires.
 */
#define CHANNEL_IDLE_CHECK_PERIOD (CHANNEL_NODE_READ_IDLE_TIMEOUT() / 2)

/**
 * A channel times out if there is no msg received from a node in this interval.
 * Set to a fraction of node timeout so that a new channel could be set up to
 * recover from a potentially bad connection before the node times out.
 */
#define CHANNEL_NODE_READ_IDLE_TIMEOUT() \
	(PULSE_TRANSMIT_INTERVAL() \
		* MAX(2, config_max_intervals_missed_get() / 3))

/**
 * Acquire a lock on the entire channel sub module.
 */
#define CHANNEL_LOCK() (pthread_mutex_lock(&g_channel_lock))

/**
 * Relinquish the lock on the entire channel sub module.
 */
#define CHANNEL_UNLOCK() (pthread_mutex_unlock(&g_channel_lock))
| 214 | |
| 215 | /* |
| 216 | * ---------------------------------------------------------------------------- |
| 217 | * Mesh and Multicast |
| 218 | * ---------------------------------------------------------------------------- |
| 219 | */ |
| 220 | |
/**
 * Read write timeout (in ms) for mesh connections.
 */
#define MESH_RW_TIMEOUT 5

/**
 * Size of the network header.
 *
 * Maximum size of IPv4 header - 20 bytes (assuming no variable length fields)
 * Fixed size of IPv6 header - 40 bytes (assuming no extension headers)
 * Maximum size of TCP header - 60 Bytes
 * Size of UDP header (fixed) - 8 bytes
 * So maximum size of empty TCP datagram - 60 + 20 = 80 bytes
 * So maximum size of empty IPv4 UDP datagram - 20 + 8 = 28 bytes
 * So maximum size of empty IPv6 UDP datagram - 40 + 8 = 48 bytes
 *
 * Being conservative and assuming 30 bytes for IPv4 UDP header and 50 bytes for
 * IPv6 UDP header.
 *
 * NOTE: the macro name was missing in the original line ("#define 50"), which
 * is a preprocessor error. Restored as UDP_HEADER_SIZE_MAX, the conservative
 * IPv6 UDP header size described above.
 */
#define UDP_HEADER_SIZE_MAX 50

/**
 * Expected ratio - (input size) / (compressed size). Assuming 40% decrease in
 * size after compression.
 */
#define MSG_COMPRESSION_RATIO (1.0 / 0.60)
| 247 | |
/**
 * Mesh timeout for pending nodes - how long to wait for a connect to complete.
 */
#define MESH_PENDING_TIMEOUT (CONNECT_TIMEOUT())

/**
 * Mesh inactive timeout after which a mesh node will be forgotten.
 */
#define MESH_INACTIVE_TIMEOUT (10 * HB_NODE_TIMEOUT())

/**
 * Mesh timeout for getting the endpoint for a node after which this node will
 * be forgotten.
 */
#define MESH_ENDPOINT_UNKNOWN_TIMEOUT (HB_NODE_TIMEOUT())

/**
 * Interval at which the mesh tender runs.
 */
#define MESH_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL())

/**
 * Interval at which attempts to resolve unresolved seed hostnames will be
 * made.
 */
#define MESH_SEED_RESOLVE_ATTEMPT_INTERVAL() (HB_NODE_TIMEOUT())

/**
 * Interval at which conflict checks are enabled.
 */
#define MESH_CONFLICT_CHECK_INTERVAL() (5 * HB_NODE_TIMEOUT())

/**
 * Duration for which conflicts are checked.
 */
#define MESH_CONFLICT_CHECK_DURATION() (MESH_CONFLICT_CHECK_INTERVAL() / 5)

/**
 * Acquire a lock on the entire mesh sub module.
 */
#define MESH_LOCK() (pthread_mutex_lock(&g_mesh_lock))

/**
 * Relinquish the lock on the entire mesh sub module.
 */
#define MESH_UNLOCK() (pthread_mutex_unlock(&g_mesh_lock))

/**
 * Acquire a lock on the entire multicast sub module.
 */
#define MULTICAST_LOCK() (pthread_mutex_lock(&g_multicast_lock))

/**
 * Relinquish the lock on the entire multicast sub module.
 */
#define MULTICAST_UNLOCK() (pthread_mutex_unlock(&g_multicast_lock))
| 303 | |
| 304 | /* |
| 305 | * ---------------------------------------------------------------------------- |
| 306 | * Main |
| 307 | * ---------------------------------------------------------------------------- |
| 308 | */ |
| 309 | |
/**
 * The identifier for heartbeat protocol version 3.
 */
#define HB_PROTOCOL_V3_IDENTIFIER 0x6864

/**
 * Maximum length of hb protocol string.
 */
#define HB_PROTOCOL_STR_MAX_LEN 16

/**
 * Default allocation size for plugin data.
 */
#define HB_PLUGIN_DATA_DEFAULT_SIZE 128

/**
 * Block size for allocating node plugin data. Ensure the allocation is in
 * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating.
 */
#define HB_PLUGIN_DATA_BLOCK_SIZE 128

/**
 * Message scratch size for v3 HB messages. To accommodate 64 node cluster.
 */
#define AS_HB_MSG_SCRATCH_SIZE 1024

/**
 * A soft limit for the maximum cluster size. Meant to optimize hash and list
 * data structures and not as a limit on the number of nodes.
 */
#define AS_HB_CLUSTER_MAX_SIZE_SOFT 200

/**
 * Maximum event listeners.
 */
#define AS_HB_EVENT_LISTENER_MAX 7

/**
 * Maximum permissible cluster-name mismatches per node.
 */
#define CLUSTER_NAME_MISMATCH_MAX 2

/**
 * Timeout for deeming a node dead based on received heartbeats.
 */
#define HB_NODE_TIMEOUT() \
((config_max_intervals_missed_get() * config_tx_interval_get()))

/**
 * Interval at which heartbeats are sent.
 */
#define PULSE_TRANSMIT_INTERVAL() \
(MAX(config_tx_interval_get(), AS_HB_TX_INTERVAL_MS_MIN))

/**
 * Interval at which the adjacency tender runs.
 */
#define ADJACENCY_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL())

/**
 * Interval at which the adjacency tender runs in anticipation of additional
 * node depart events.
 */
#define ADJACENCY_FAST_TEND_INTERVAL (MIN(ADJACENCY_TEND_INTERVAL, 10))

/**
 * Acquire a lock on the external event publisher.
 */
#define EXTERNAL_EVENT_PUBLISH_LOCK() \
(pthread_mutex_lock(&g_external_event_publish_lock))

/**
 * Relinquish the lock on the external event publisher.
 */
#define EXTERNAL_EVENT_PUBLISH_UNLOCK() \
(pthread_mutex_unlock(&g_external_event_publish_lock))

/**
 * Acquire a lock on the heartbeat main module.
 */
#define HB_LOCK() (pthread_mutex_lock(&g_hb_lock))

/**
 * Relinquish the lock on the heartbeat main module.
 */
#define HB_UNLOCK() (pthread_mutex_unlock(&g_hb_lock))

/**
 * Weight of the current latency sample over the current moving average. For
 * now weigh recent values heavily over older values.
 */
#define ALPHA (0.65)
| 402 | |
| 403 | /* |
| 404 | * ---------------------------------------------------------------------------- |
| 405 | * Common |
| 406 | * ---------------------------------------------------------------------------- |
| 407 | */ |
| 408 | |
/**
 * The default MTU for multicast in case device discovery fails.
 */
#define DEFAULT_MIN_MTU 1500

/**
 * Maximum memory size allocated on the call stack. Larger buffers go to the
 * heap (see MSG_BUFF_ALLOC).
 */
#define STACK_ALLOC_LIMIT (16 * 1024)

/**
 * Max string length for an endpoint list converted to a string.
 */
#define ENDPOINT_LIST_STR_SIZE 1024

/**
 * A hard limit on the buffer size for parsing incoming messages.
 */
#define MSG_BUFFER_MAX_SIZE (10 * 1024 * 1024)

#ifndef ASC
#define ASC (2 << 2)
#endif

/**
 * Connection initiation timeout, capped at 100 ms.
 */
#define CONNECT_TIMEOUT() (MIN(100, config_tx_interval_get()))
| 437 | |
/**
 * Allocate a buffer for heart beat messages. Larger buffers are heap allocated
 * to prevent stack overflows. Returns NULL if size exceeds
 * MSG_BUFFER_MAX_SIZE.
 *
 * NOTE(review): the alloca() branch allocates in the stack frame of the
 * function invoking this macro - the result must not be used after that
 * function returns. Pair every allocation with MSG_BUFF_FREE.
 */
#define MSG_BUFF_ALLOC(size) ( \
		(size) <= MSG_BUFFER_MAX_SIZE ? \
				(((size) > STACK_ALLOC_LIMIT) ? \
						cf_malloc(size) : alloca(size)) : NULL)

/**
 * Allocate a buffer for heart beat messages. Larger buffers are heap allocated
 * to prevent stack overflows. Crashes the process on failure to allocate the
 * buffer.
 */
#define MSG_BUFF_ALLOC_OR_DIE(size, crash_msg, ...) \
({ \
	uint8_t* retval = MSG_BUFF_ALLOC((size)); \
	if (!retval) { \
		CRASH(crash_msg, ##__VA_ARGS__); \
	} \
	retval; \
})

/**
 * Free the buffer allocated by MSG_BUFF_ALLOC. Only heap allocations (size >
 * STACK_ALLOC_LIMIT) need freeing; stack allocations are reclaimed on return.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement -
 * the original bare "if" silently captured a following "else" (dangling-else
 * hazard) when used in an unbraced if/else body.
 */
#define MSG_BUFF_FREE(buffer, size) \
do { \
	if (((size) > STACK_ALLOC_LIMIT) && (buffer)) { \
		cf_free(buffer); \
	} \
} while (0)
| 466 | |
/**
 * Acquire a lock on the entire config sub module.
 */
#define HB_CONFIG_LOCK() (pthread_mutex_lock(&g_hb_config_lock))

/**
 * Relinquish the lock on the entire config sub module.
 */
#define HB_CONFIG_UNLOCK() (pthread_mutex_unlock(&g_hb_config_lock))

/**
 * Acquire a lock while setting heartbeat protocol dynamically.
 */
#define SET_PROTOCOL_LOCK() (pthread_mutex_lock(&g_set_protocol_lock))

/**
 * Relinquish the lock after setting heartbeat protocol dynamically.
 */
#define SET_PROTOCOL_UNLOCK() (pthread_mutex_unlock(&g_set_protocol_lock))

/**
 * Logging macros.
 */
#define CRASH(format, ...) cf_crash(AS_HB, format, ##__VA_ARGS__)
#define CRASH_NOSTACK(format, ...) cf_crash_nostack(AS_HB, format, ##__VA_ARGS__)
#define WARNING(format, ...) cf_warning(AS_HB, format, ##__VA_ARGS__)
#define TICKER_WARNING(format, ...) \
cf_ticker_warning(AS_HB, format, ##__VA_ARGS__)
#define INFO(format, ...) cf_info(AS_HB, format, ##__VA_ARGS__)
#define DEBUG(format, ...) cf_debug(AS_HB, format, ##__VA_ARGS__)
#define DETAIL(format, ...) cf_detail(AS_HB, format, ##__VA_ARGS__)

/**
 * Warn (do not abort) when an expected condition does not hold.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement -
 * the original bare "if" silently captured a following "else" (dangling-else
 * hazard) when used in an unbraced if/else body.
 */
#define ASSERT(expression, message, ...) \
do { \
	if (!(expression)) { \
		WARNING(message, ##__VA_ARGS__); \
	} \
} while (0)
| 500 | |
| 501 | /* |
| 502 | * ---------------------------------------------------------------------------- |
| 503 | * Private internal data structures |
| 504 | * ---------------------------------------------------------------------------- |
| 505 | */ |
| 506 | |
| 507 | /* |
| 508 | * ---------------------------------------------------------------------------- |
| 509 | * Common |
| 510 | * ---------------------------------------------------------------------------- |
| 511 | */ |
| 512 | |
/**
 * Heartbeat subsystem state. Used for the channel and mesh sub modules'
 * status fields.
 */
typedef enum
{
	AS_HB_STATUS_UNINITIALIZED,
	AS_HB_STATUS_RUNNING,
	AS_HB_STATUS_SHUTTING_DOWN,
	AS_HB_STATUS_STOPPED
} as_hb_status;
| 523 | |
| 524 | /* |
| 525 | * ---------------------------------------------------------------------------- |
| 526 | * Mesh related |
| 527 | * ---------------------------------------------------------------------------- |
| 528 | */ |
| 529 | |
/**
 * Mesh node channel status enum.
 */
typedef enum
{
	/**
	 * The mesh node has an active channel.
	 */
	AS_HB_MESH_NODE_CHANNEL_ACTIVE,

	/**
	 * The mesh node is waiting for an active channel (connect in flight).
	 */
	AS_HB_MESH_NODE_CHANNEL_PENDING,

	/**
	 * The mesh node does not have an active channel.
	 */
	AS_HB_MESH_NODE_CHANNEL_INACTIVE,

	/**
	 * The ip address and port for this node are not yet known.
	 */
	AS_HB_MESH_NODE_ENDPOINT_UNKNOWN,

	/**
	 * The sentinel value. Should be the last in the enum.
	 */
	AS_HB_MESH_NODE_STATUS_SENTINEL
} as_hb_mesh_node_status;
| 560 | |
/**
 * The info payload for a single node. Packed because it is sent on the wire.
 */
typedef struct as_hb_mesh_info_reply_s
{
	/**
	 * The nodeid of the node for which info reply is sent.
	 */
	cf_node nodeid;

	/**
	 * The advertised endpoint list for this node. Flexible array member to
	 * allow a variable size endpoint list. Always access as
	 * reply.endpoint_list[0].
	 */
	as_endpoint_list endpoint_list[];
}__attribute__((__packed__)) as_hb_mesh_info_reply;
| 577 | |
/**
 * Mesh tend reduce function udata.
 */
typedef struct as_hb_mesh_tend_reduce_udata_s
{
	/**
	 * The new endpoint lists to connect to. Each list has endpoints for a
	 * single remote peer.
	 */
	as_endpoint_list** to_connect;

	/**
	 * The capacity of the to_connect array.
	 */
	size_t to_connect_capacity;

	/**
	 * The count of endpoint lists filled into to_connect.
	 */
	size_t to_connect_count;

	/**
	 * Pointers to seeds that need matching.
	 */
	cf_vector* inactive_seeds_p;
} as_hb_mesh_tend_reduce_udata;
| 604 | |
/**
 * Mesh endpoint search udata - find a socket address in an endpoint list.
 */
typedef struct
{
	/**
	 * The endpoint address to search for.
	 */
	cf_sock_addr* to_search;

	/**
	 * Indicates if a match was found.
	 */
	bool found;
} as_hb_endpoint_list_addr_find_udata;
| 620 | |
/**
 * Mesh endpoint list search udata.
 */
typedef struct as_hb_mesh_endpoint_list_reduce_udata_s
{
	/**
	 * The endpoint list to search for.
	 */
	as_endpoint_list* to_search;

	/**
	 * Indicates if a match was found.
	 */
	bool found;

	/**
	 * The matched key (nodeid) if found.
	 */
	cf_node* matched_nodeid;
} as_hb_mesh_endpoint_list_reduce_udata;
| 641 | |
/**
 * Information maintained for configured mesh seed nodes.
 */
typedef struct as_hb_mesh_seed_s
{
	/**
	 * The name / ip address of this seed mesh host.
	 */
	char seed_host_name[DNS_NAME_MAX_SIZE];

	/**
	 * The port of this seed mesh host.
	 */
	cf_ip_port seed_port;

	/**
	 * Identifies TLS mesh seed hosts.
	 */
	bool seed_tls;

	/**
	 * The heap allocated end point list for this seed host resolved using the
	 * seed's hostname.
	 * Will be null if the endpoint list cannot be resolved.
	 */
	as_endpoint_list* resolved_endpoint_list;

	/**
	 * Timestamp when the seed hostname was resolved into the endpoint list.
	 * Used to perform periodic refresh of the endpoint list.
	 */
	cf_clock resolved_endpoint_list_ts;

	/**
	 * The state of this seed in terms of established channel.
	 */
	as_hb_mesh_node_status status;

	/**
	 * The last time the state of this node was updated.
	 */
	cf_clock last_status_updated;

	/**
	 * The node id for a matching mesh node entry. A zero will indicate that
	 * there exists no matching mesh node entry.
	 */
	cf_node mesh_nodeid;

	/**
	 * Timestamp indicating when the matching mesh node's endpoint was updated.
	 * Used to detect endpoint changes to the matching mesh node entry if it
	 * exists.
	 */
	as_hlc_timestamp mesh_node_endpoint_change_ts;
} as_hb_mesh_seed;
| 698 | |
/**
 * Information maintained for discovered mesh end points.
 */
typedef struct as_hb_mesh_node_s
{
	/**
	 * The heap allocated end point list for this mesh host. Should be freed
	 * once the last mesh entry is removed from the mesh state.
	 */
	as_endpoint_list* endpoint_list;

	/**
	 * Timestamp when the mesh node's endpoint list was last updated.
	 */
	as_hlc_timestamp endpoint_change_ts;

	/**
	 * The state of this node in terms of established channel.
	 */
	as_hb_mesh_node_status status;

	/**
	 * The last time the state of this node was updated.
	 */
	cf_clock last_status_updated;

	/**
	 * The time this node's channel became inactive.
	 */
	cf_clock inactive_since;
} as_hb_mesh_node;
| 730 | |
/**
 * State maintained for the mesh mode.
 */
typedef struct as_hb_mesh_state_s
{
	/**
	 * The sockets on which this instance accepts heartbeat tcp connections.
	 */
	cf_sockets listening_sockets;

	/**
	 * Indicates if the published endpoint list is ipv4 only.
	 */
	bool published_endpoint_list_ipv4_only;

	/**
	 * The published endpoint list.
	 */
	as_endpoint_list* published_endpoint_list;

	/**
	 * Mesh seed data (vector of as_hb_mesh_seed).
	 */
	cf_vector seeds;

	/**
	 * A map from a cf_node key to a mesh node.
	 */
	cf_shash* nodeid_to_mesh_node;

	/**
	 * Thread id for the mesh tender thread.
	 */
	pthread_t mesh_tender_tid;

	/**
	 * The status of the mesh module.
	 */
	as_hb_status status;

	/**
	 * The mtu on the listening device. This is extrapolated to all nodes and
	 * paths in the cluster. This limits the cluster size possible.
	 */
	int min_mtu;

	/**
	 * Indicates if new nodes are discovered. Optimization to start mesh tend
	 * earlier than normal tend interval on discovering new nodes.
	 */
	bool nodes_discovered;
} as_hb_mesh_state;
| 783 | |
| 784 | /* |
| 785 | * ---------------------------------------------------------------------------- |
| 786 | * Multicast data structures |
| 787 | * ---------------------------------------------------------------------------- |
| 788 | */ |
| 789 | |
/**
 * State maintained for the multicast mode.
 */
typedef struct as_hb_multicast_state_s
{
	/**
	 * The socket configuration associated with multicast mode.
	 */
	cf_mserv_cfg cfg;

	/**
	 * Multicast listening sockets.
	 */
	cf_sockets listening_sockets;

	/**
	 * The mtu on the listening device. This is extrapolated to all nodes and
	 * paths in the cluster. This limits the cluster size possible.
	 */
	int min_mtu;
} as_hb_multicast_state;
| 811 | |
| 812 | /* |
| 813 | * ---------------------------------------------------------------------------- |
| 814 | * Channel state |
| 815 | * ---------------------------------------------------------------------------- |
| 816 | */ |
| 817 | |
/**
 * The type of a channel event.
 */
typedef enum
{
	/**
	 * The endpoint has a tx/rx channel associated with it.
	 */
	AS_HB_CHANNEL_NODE_CONNECTED,

	/**
	 * The endpoint had a tx/rx channel that went down.
	 */
	AS_HB_CHANNEL_NODE_DISCONNECTED,

	/**
	 * A message was received on a connected channel. The message in the event,
	 * is guaranteed to have passed basic sanity checks like having a protocol
	 * id, type and source nodeid.
	 */
	AS_HB_CHANNEL_MSG_RECEIVED,

	/**
	 * Channel found a node whose cluster name does not match.
	 */
	AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH
} as_hb_channel_event_type;
| 845 | |
/**
 * Status for reads from a channel.
 */
typedef enum
{
	/**
	 * The message was read and parsed successfully.
	 */
	AS_HB_CHANNEL_MSG_READ_SUCCESS,

	/**
	 * The message was read successfully but parsing failed.
	 */
	AS_HB_CHANNEL_MSG_PARSE_FAIL,

	/**
	 * The message read failed on network io.
	 */
	AS_HB_CHANNEL_MSG_CHANNEL_FAIL,

	/**
	 * Sentinel default value.
	 */
	AS_HB_CHANNEL_MSG_READ_UNDEF
} as_hb_channel_msg_read_status;
| 871 | |
/**
 * Udata for finding a channel (socket) by endpoint list.
 */
typedef struct
{
	/**
	 * The endpoint address to search channel by.
	 */
	as_endpoint_list* endpoint_list;

	/**
	 * Indicates if the endpoint was found.
	 */
	bool found;

	/**
	 * The matching socket, if found.
	 */
	cf_socket* socket;
} as_hb_channel_endpoint_reduce_udata;
| 889 | |
/**
 * Udata for iterating endpoints to match a socket address.
 */
typedef struct
{
	/**
	 * The socket address to search the channel by.
	 */
	cf_sock_addr* addr_to_search;

	/**
	 * Indicates if the endpoint was found.
	 */
	bool found;
} as_hb_channel_endpoint_iterate_udata;
| 902 | |
/**
 * Udata carrying a serialized message buffer to send on a channel.
 */
typedef struct
{
	/**
	 * The message buffer to send.
	 */
	uint8_t* buffer;

	/**
	 * The buffer length in bytes.
	 */
	size_t buffer_len;
} as_hb_channel_buffer_udata;
| 915 | |
/**
 * A channel represents a medium to send and receive messages.
 */
typedef struct as_hb_channel_s
{
	/**
	 * Indicates if this channel is a multicast channel.
	 */
	bool is_multicast;

	/**
	 * Indicates if this channel is inbound (accepted, not initiated, by this
	 * node). Not relevant for multicast channels.
	 */
	bool is_inbound;

	/**
	 * The id of the associated node. In mesh / unicast case this will initially
	 * be zero and filled in when the nodeid for the node at the other end is
	 * learnt. In multicast case this will be zero.
	 */
	cf_node nodeid;

	/**
	 * The address of the peer. Will always be specified for outbound channels.
	 */
	cf_sock_addr endpoint_addr;

	/**
	 * The last time a message was received from this node.
	 */
	cf_clock last_received;

	/**
	 * Time when this channel won a socket resolution. Zero if this channel
	 * never won resolution. In compatibility mode with older code its possible
	 * we will keep allowing the same socket to win and enter an infinite loop
	 * of closing the sockets.
	 */
	cf_clock resolution_win_ts;
} as_hb_channel;
| 957 | |
| 958 | /** |
| 959 | * State maintained per heartbeat channel. |
| 960 | */ |
| 961 | typedef struct as_hb_channel_state_s |
| 962 | { |
| 963 | /** |
| 964 | * The poll handle. All IO wait across all heartbeat connections happens on |
| 965 | * this handle. |
| 966 | */ |
| 967 | cf_poll poll; |
| 968 | |
| 969 | /** |
| 970 | * Channel status. |
| 971 | */ |
| 972 | as_hb_status status; |
| 973 | |
| 974 | /** |
| 975 | * Maps a socket to an as_hb_channel. |
| 976 | */ |
| 977 | cf_shash* socket_to_channel; |
| 978 | |
| 979 | /** |
| 980 | * Maps a nodeid to a channel specific node data structure. This association |
| 981 | * will be made only on receiving the first heartbeat message from the node |
| 982 | * on a channel. |
| 983 | */ |
| 984 | cf_shash* nodeid_to_socket; |
| 985 | |
| 986 | /** |
| 987 | * Sockets accumulated by the channel tender to close at the end of every |
| 988 | * epoll loop. |
| 989 | */ |
| 990 | cf_queue socket_close_queue; |
| 991 | |
| 992 | /** |
| 993 | * The sockets on which heartbeat subsystem listens. |
| 994 | */ |
| 995 | cf_sockets* listening_sockets; |
| 996 | |
| 997 | /** |
| 998 | * Clock to keep track of last time idle connections were checked. |
| 999 | */ |
| 1000 | cf_clock last_channel_idle_check; |
| 1001 | |
| 1002 | /** |
| 1003 | * Enables / disables publishing channel events. Events should be disabled |
| 1004 | * only when the state changes are temporary / transient and hence would not |
| 1005 | * change the overall channel state from an external perspective. |
| 1006 | */ |
| 1007 | bool events_enabled; |
| 1008 | |
| 1009 | /** |
| 1010 | * Events are batched and published to reduce cluster transitions. Queue of |
| 1011 | * unpublished heartbeat events. |
| 1012 | */ |
| 1013 | cf_queue events_queue; |
| 1014 | |
| 1015 | /** |
| 1016 | * Thread id for the socket tender thread. |
| 1017 | */ |
| 1018 | pthread_t channel_tender_tid; |
| 1019 | } as_hb_channel_state; |
| 1020 | |
| 1021 | /** |
| 1022 | * Entry queued up for socket close. |
| 1023 | */ |
| 1024 | typedef struct as_hb_channel_socket_close_entry_s |
| 1025 | { |
| 1026 | /** |
| 1027 | * The node for which this event was generated. |
| 1028 | */ |
| 1029 | cf_socket* socket; |
| 1030 | /** |
| 1031 | * Indicates if this close is a remote close. |
| 1032 | */ |
| 1033 | bool is_remote; |
| 1034 | /** |
| 1035 | * True if close of this entry should generate a disconnect event. |
| 1036 | */ |
| 1037 | bool raise_close_event; |
| 1038 | } as_hb_channel_socket_close_entry; |
| 1039 | |
| 1040 | /** |
| 1041 | * An event generated by the channel sub module. |
| 1042 | */ |
| 1043 | typedef struct as_hb_channel_event_s |
| 1044 | { |
| 1045 | /** |
| 1046 | * The channel event type. |
| 1047 | */ |
| 1048 | as_hb_channel_event_type type; |
| 1049 | |
| 1050 | /** |
| 1051 | * The node for which this event was generated. |
| 1052 | */ |
| 1053 | cf_node nodeid; |
| 1054 | |
| 1055 | /** |
| 1056 | * The received message if any over this endpoint. Valid for incoming |
| 1057 | * message type event. The message if not NULL never be edited or copied |
| 1058 | * over. |
| 1059 | */ |
| 1060 | msg* msg; |
| 1061 | |
| 1062 | /** |
| 1063 | * The hlc timestamp for message receipt. |
| 1064 | */ |
| 1065 | as_hlc_msg_timestamp msg_hlc_ts; |
| 1066 | } as_hb_channel_event; |
| 1067 | |
| 1068 | /* |
| 1069 | * ---------------------------------------------------------------------------- |
| 1070 | * Main sub module state |
| 1071 | * ---------------------------------------------------------------------------- |
| 1072 | */ |
| 1073 | |
| 1074 | /** |
| 1075 | * Heartbeat message types. |
| 1076 | */ |
| 1077 | typedef enum |
| 1078 | { |
| 1079 | AS_HB_MSG_TYPE_PULSE, |
| 1080 | AS_HB_MSG_TYPE_INFO_REQUEST, |
| 1081 | AS_HB_MSG_TYPE_INFO_REPLY, |
| 1082 | AS_HB_MSG_TYPE_COMPRESSED |
| 1083 | } as_hb_msg_type; |
| 1084 | |
| 1085 | /** |
| 1086 | * Events published by the heartbeat subsystem. |
| 1087 | */ |
| 1088 | typedef enum |
| 1089 | { |
| 1090 | AS_HB_INTERNAL_NODE_ARRIVE, |
| 1091 | AS_HB_INTERNAL_NODE_DEPART, |
| 1092 | AS_HB_INTERNAL_NODE_EVICT, |
| 1093 | AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED |
| 1094 | } as_hb_internal_event_type; |
| 1095 | |
| 1096 | /** |
| 1097 | * State maintained by the heartbeat subsystem for the selected mode. |
| 1098 | */ |
| 1099 | typedef struct as_hb_mode_state_s |
| 1100 | { |
| 1101 | /** |
| 1102 | * The mesh / multicast state. |
| 1103 | */ |
| 1104 | union |
| 1105 | { |
| 1106 | as_hb_mesh_state mesh_state; |
| 1107 | as_hb_multicast_state multicast_state; |
| 1108 | }; |
| 1109 | } as_hb_mode_state; |
| 1110 | |
| 1111 | /** |
| 1112 | * Plugin data iterate reduce udata. |
| 1113 | */ |
| 1114 | typedef struct |
| 1115 | { |
| 1116 | /** |
| 1117 | * The plugin id. |
| 1118 | */ |
| 1119 | as_hb_plugin_id pluginid; |
| 1120 | |
| 1121 | /** |
| 1122 | * The iterate function. |
| 1123 | */ |
| 1124 | as_hb_plugin_data_iterate_fn iterate_fn; |
| 1125 | |
| 1126 | /** |
| 1127 | * The udata for the iterate function. |
| 1128 | */ |
| 1129 | void* udata; |
| 1130 | } as_hb_adjacecny_iterate_reduce_udata; |
| 1131 | |
| 1132 | /** |
| 1133 | * Information tracked for an adjacent nodes. |
| 1134 | */ |
| 1135 | typedef struct as_hb_adjacent_node_s |
| 1136 | { |
| 1137 | /** |
| 1138 | * The heart beat protocol version. |
| 1139 | */ |
| 1140 | uint32_t protocol_version; |
| 1141 | |
| 1142 | /** |
| 1143 | * The remote node's |
| 1144 | */ |
| 1145 | as_endpoint_list* endpoint_list; |
| 1146 | |
| 1147 | /** |
| 1148 | * Used to cycle between the two copies of plugin data. |
| 1149 | */ |
| 1150 | int plugin_data_cycler; |
| 1151 | |
| 1152 | /** |
| 1153 | * Plugin specific data accumulated by the heartbeat subsystem. The data is |
| 1154 | * heap allocated and should be destroyed the moment this element entry is |
| 1155 | * unused. There are two copies of the plugin data, one the current copy and |
| 1156 | * one the previous copy. Previous copy is used to generate data change |
| 1157 | * notifications. |
| 1158 | */ |
| 1159 | as_hb_plugin_node_data plugin_data[AS_HB_PLUGIN_SENTINEL][2]; |
| 1160 | |
| 1161 | /** |
| 1162 | * The monotonic local time node information was last updated. |
| 1163 | */ |
| 1164 | cf_clock last_updated_monotonic_ts; |
| 1165 | |
| 1166 | /** |
| 1167 | * HLC timestamp for the last pulse message. |
| 1168 | */ |
| 1169 | as_hlc_msg_timestamp last_msg_hlc_ts; |
| 1170 | |
| 1171 | /** |
| 1172 | * Track number of consecutive cluster-name mismatches. |
| 1173 | */ |
| 1174 | uint32_t cluster_name_mismatch_count; |
| 1175 | |
| 1176 | /** |
| 1177 | * Moving average of the latency in ms. |
| 1178 | */ |
| 1179 | uint64_t avg_latency; |
| 1180 | |
| 1181 | /** |
| 1182 | * A shift register tracking change of endpoints. On receipt of a heartbeat, |
| 1183 | * if source node's endpoints change 1 is inserted at the LSB, else 0 is |
| 1184 | * inserted at the LSB. |
| 1185 | */ |
| 1186 | uint64_t endpoint_change_tracker; |
| 1187 | } as_hb_adjacent_node; |
| 1188 | |
| 1189 | /** |
| 1190 | * Internal storage for external event listeners. |
| 1191 | */ |
| 1192 | typedef struct as_hb_event_listener_s |
| 1193 | { |
| 1194 | /** |
| 1195 | * Registered callback function. |
| 1196 | */ |
| 1197 | as_hb_event_fn event_callback; |
| 1198 | |
| 1199 | /** |
| 1200 | * Arguments for the listeners. |
| 1201 | */ |
| 1202 | void* udata; |
| 1203 | } as_hb_event_listener; |
| 1204 | |
| 1205 | /** |
| 1206 | * Heartbeat subsystem internal state. |
| 1207 | */ |
| 1208 | typedef struct as_hb_s |
| 1209 | { |
| 1210 | /** |
| 1211 | * The status of the subsystem. |
| 1212 | */ |
| 1213 | as_hb_status status; |
| 1214 | |
| 1215 | /** |
| 1216 | * The adjacency dictionary. The key is the nodeid. The value is an instance |
| 1217 | * of as_hb_adjacent_node. |
| 1218 | */ |
| 1219 | cf_shash* adjacency; |
| 1220 | |
| 1221 | /** |
| 1222 | * The probation dictionary having nodes that display unexpected behavior. |
| 1223 | * Nodeids under probation and adjacency hash are always exclusive. The key |
| 1224 | * is the nodeid. The value is an instance of as_hb_adjacent_node. |
| 1225 | */ |
| 1226 | cf_shash* on_probation; |
| 1227 | |
| 1228 | /** |
| 1229 | * Temporary nodeid to index hash used to compute nodes to evict from a |
| 1230 | * clique. |
| 1231 | */ |
| 1232 | cf_shash* nodeid_to_index; |
| 1233 | |
| 1234 | /** |
| 1235 | * The mode specific state. |
| 1236 | */ |
| 1237 | as_hb_mode_state mode_state; |
| 1238 | |
| 1239 | /** |
| 1240 | * The channel state. |
| 1241 | */ |
| 1242 | as_hb_channel_state channel_state; |
| 1243 | |
| 1244 | /** |
| 1245 | * Self node accumulated stats used primarily to detect duplicate node-ids. |
| 1246 | */ |
| 1247 | as_hb_adjacent_node self_node; |
| 1248 | |
| 1249 | /** |
| 1250 | * Indicates self node-id has duplicates. |
| 1251 | */ |
| 1252 | bool self_is_duplicate; |
| 1253 | |
| 1254 | /** |
| 1255 | * Monotonic timestamp of when a self duplicate was detected. |
| 1256 | */ |
| 1257 | cf_clock self_duplicate_detected_ts; |
| 1258 | |
| 1259 | /** |
| 1260 | * The plugin dictionary. The key is the as_hb_plugin entry and the value an |
| 1261 | * instance of as_hb_plugin. |
| 1262 | */ |
| 1263 | as_hb_plugin plugins[AS_HB_PLUGIN_SENTINEL]; |
| 1264 | |
| 1265 | /** |
| 1266 | * Thread id for the transmitter thread. |
| 1267 | */ |
| 1268 | pthread_t transmitter_tid; |
| 1269 | |
| 1270 | /** |
| 1271 | * Thread id for the thread expiring nodes from the adjacency list. |
| 1272 | */ |
| 1273 | pthread_t adjacency_tender_tid; |
| 1274 | } as_hb; |
| 1275 | |
| 1276 | /** |
| 1277 | * Registered heartbeat listeners. |
| 1278 | */ |
| 1279 | typedef struct as_hb_external_events_s |
| 1280 | { |
| 1281 | /** |
| 1282 | * Events are batched and published. Queue of unpublished heartbeat events. |
| 1283 | */ |
| 1284 | cf_queue external_events_queue; |
| 1285 | |
| 1286 | /** |
| 1287 | * Count of event listeners. |
| 1288 | */ |
| 1289 | int event_listener_count; |
| 1290 | |
| 1291 | /** |
| 1292 | * External event listeners. |
| 1293 | */ |
| 1294 | as_hb_event_listener event_listeners[AS_HB_EVENT_LISTENER_MAX]; |
| 1295 | } as_hb_external_events; |
| 1296 | |
| 1297 | /** |
| 1298 | * Shash reduce function to read current adjacency list. |
| 1299 | */ |
| 1300 | typedef struct as_hb_adjacency_reduce_udata_s |
| 1301 | { |
| 1302 | /** |
| 1303 | * The target adjacency list. |
| 1304 | */ |
| 1305 | cf_node* adj_list; |
| 1306 | |
| 1307 | /** |
| 1308 | * Count of elements in the adjacency list. |
| 1309 | */ |
| 1310 | int adj_count; |
| 1311 | } as_hb_adjacency_reduce_udata; |
| 1312 | |
| 1313 | /** |
| 1314 | * Udata for finding nodes in the adjacency list not in the input succession |
| 1315 | * list. |
| 1316 | */ |
| 1317 | typedef struct |
| 1318 | { |
| 1319 | /** |
| 1320 | * Number of events generated. |
| 1321 | */ |
| 1322 | int event_count; |
| 1323 | |
| 1324 | /** |
| 1325 | * List of generated events. |
| 1326 | */ |
| 1327 | as_hb_event_node* events; |
| 1328 | |
| 1329 | /** |
| 1330 | * Limit on number of generated events. |
| 1331 | */ |
| 1332 | int max_events; |
| 1333 | |
| 1334 | /** |
| 1335 | * Current succession list. |
| 1336 | */ |
| 1337 | cf_node* succession; |
| 1338 | |
| 1339 | /** |
| 1340 | * Number of nodes in succession list. |
| 1341 | */ |
| 1342 | int succession_size; |
| 1343 | } as_hb_find_new_nodes_reduce_udata; |
| 1344 | |
| 1345 | /** |
| 1346 | * Shash reduce function to read current adjacency list. |
| 1347 | */ |
| 1348 | typedef struct as_hb_adjacency_tender_udata_s |
| 1349 | { |
| 1350 | /** |
| 1351 | * The list of expired nodes. |
| 1352 | */ |
| 1353 | cf_node* dead_nodes; |
| 1354 | |
| 1355 | /** |
| 1356 | * Count of elements in the dead node list. |
| 1357 | */ |
| 1358 | int dead_node_count; |
| 1359 | |
| 1360 | /** |
| 1361 | * The list of evicted nodes , e.g. due to cluster name mismatch. |
| 1362 | */ |
| 1363 | cf_node* evicted_nodes; |
| 1364 | |
| 1365 | /** |
| 1366 | * Count of elements in the evicted node list. |
| 1367 | */ |
| 1368 | int evicted_node_count; |
| 1369 | } as_hb_adjacency_tender_udata; |
| 1370 | |
| 1371 | /** |
| 1372 | * Udata for tip clear. |
| 1373 | */ |
| 1374 | typedef struct as_hb_mesh_tip_clear_udata_s |
| 1375 | { |
| 1376 | /** |
| 1377 | * Host IP or DNS name to be cleared from seed list. |
| 1378 | */ |
| 1379 | char host[DNS_NAME_MAX_SIZE]; |
| 1380 | |
| 1381 | /** |
| 1382 | * Listening port of the host. |
| 1383 | */ |
| 1384 | int port; |
| 1385 | |
| 1386 | /** |
| 1387 | * Number of IP addresses to match. |
| 1388 | */ |
| 1389 | uint32_t n_addrs; |
| 1390 | |
| 1391 | /** |
| 1392 | * IP addresses to match. |
| 1393 | */ |
| 1394 | cf_ip_addr* addrs; |
| 1395 | |
| 1396 | /** |
| 1397 | * Node id if a specific node-id needs to be removed as well. |
| 1398 | */ |
| 1399 | cf_node nodeid; |
| 1400 | |
| 1401 | /** |
| 1402 | * Tip-clear status |
| 1403 | */ |
| 1404 | bool entry_deleted; |
| 1405 | } as_hb_mesh_tip_clear_udata; |
| 1406 | |
| 1407 | /** |
| 1408 | * Convert endpoint list to string in a process function. |
| 1409 | */ |
| 1410 | typedef struct endpoint_list_to_string_udata_s |
| 1411 | { |
| 1412 | /** |
| 1413 | * The endpoint list in string format. |
| 1414 | */ |
| 1415 | char* endpoint_list_str; |
| 1416 | |
| 1417 | /** |
| 1418 | * The size of enpoint list. |
| 1419 | */ |
| 1420 | size_t endpoint_list_str_capacity; |
| 1421 | } endpoint_list_to_string_udata; |
| 1422 | |
| 1423 | /** |
| 1424 | * Udata to fill an endpoint list into a message. |
| 1425 | */ |
| 1426 | typedef struct endpoint_list_to_msg_udata_s |
| 1427 | { |
| 1428 | /** |
| 1429 | * The target message. |
| 1430 | */ |
| 1431 | msg* msg; |
| 1432 | |
| 1433 | /** |
| 1434 | * Indicates if we are running in mesh mode. |
| 1435 | */ |
| 1436 | bool is_mesh; |
| 1437 | } endpoint_list_to_msg_udata; |
| 1438 | |
| 1439 | /** |
| 1440 | * Udata to test if this endpoint list overlaps with other endpoint list. |
| 1441 | */ |
| 1442 | typedef struct endpoint_list_equal_check_udata_s |
| 1443 | { |
| 1444 | /** |
| 1445 | * The endpoint list of the new node. |
| 1446 | */ |
| 1447 | as_endpoint_list* other; |
| 1448 | |
| 1449 | /** |
| 1450 | * Output. Indicates if the lists are equal. |
| 1451 | */ |
| 1452 | bool are_equal; |
| 1453 | } endpoint_list_equal_check_udata; |
| 1454 | |
| 1455 | /** |
| 1456 | * Endpoint list process function. |
| 1457 | * @param endpoint current endpoint in the iteration. |
| 1458 | * @param udata udata passed through from the invoker of the iterate function. |
| 1459 | */ |
| 1460 | typedef void |
| 1461 | (*endpoint_list_process_fn)(const as_endpoint_list* endpoint_list, void* udata); |
| 1462 | |
| 1463 | /** |
| 1464 | * Seed host list reduce udata. |
| 1465 | */ |
| 1466 | typedef struct as_hb_seed_host_list_udata_s |
| 1467 | { |
| 1468 | /** |
| 1469 | * The buffer to receive the list. |
| 1470 | */ |
| 1471 | cf_dyn_buf* db; |
| 1472 | |
| 1473 | /** |
| 1474 | * Selects TLS seed nodes. |
| 1475 | */ |
| 1476 | bool tls; |
| 1477 | } as_hb_seed_host_list_udata; |
| 1478 | |
| 1479 | /* |
| 1480 | * ---------------------------------------------------------------------------- |
| 1481 | * Globals |
| 1482 | * ---------------------------------------------------------------------------- |
| 1483 | */ |
| 1484 | |
| 1485 | /** |
| 1486 | * Global heartbeat instance. |
| 1487 | */ |
| 1488 | static as_hb g_hb; |
| 1489 | |
| 1490 | /** |
| 1491 | * Global heartbeat events listener instance. |
| 1492 | */ |
| 1493 | static as_hb_external_events g_hb_event_listeners; |
| 1494 | |
| 1495 | /** |
| 1496 | * The big fat lock for all external event publishing. This ensures that a batch |
| 1497 | * of external events are published atomically to preserve the order of external |
| 1498 | * events. |
| 1499 | */ |
| 1500 | static pthread_mutex_t g_external_event_publish_lock = |
| 1501 | PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1502 | |
| 1503 | /** |
| 1504 | * Global lock to serialize all read and writes to the heartbeat subsystem. |
| 1505 | */ |
| 1506 | static pthread_mutex_t g_hb_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1507 | |
| 1508 | /** |
| 1509 | * The big fat lock for all channel state. |
| 1510 | */ |
| 1511 | static pthread_mutex_t g_channel_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1512 | |
| 1513 | /** |
| 1514 | * The big fat lock for all mesh state. |
| 1515 | */ |
| 1516 | static pthread_mutex_t g_mesh_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1517 | |
| 1518 | /** |
| 1519 | * The big fat lock for all multicast state. |
| 1520 | */ |
| 1521 | static pthread_mutex_t g_multicast_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1522 | |
| 1523 | /** |
| 1524 | * The global lock for all heartbeat configuration. |
| 1525 | */ |
| 1526 | static pthread_mutex_t g_hb_config_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1527 | |
| 1528 | /** |
| 1529 | * The lock used while setting heartbeat protocol. |
| 1530 | */ |
| 1531 | static pthread_mutex_t g_set_protocol_lock = |
| 1532 | PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
| 1533 | |
| 1534 | /** |
| 1535 | * Message templates for heartbeat messages. |
| 1536 | */ |
| 1537 | static msg_template g_hb_msg_template[] = { |
| 1538 | |
| 1539 | { AS_HB_MSG_ID, M_FT_UINT32 }, |
| 1540 | |
| 1541 | { AS_HB_MSG_TYPE, M_FT_UINT32 }, |
| 1542 | |
| 1543 | { AS_HB_MSG_NODE, M_FT_UINT64 }, |
| 1544 | |
| 1545 | { AS_HB_MSG_CLUSTER_NAME, M_FT_STR }, |
| 1546 | |
| 1547 | { AS_HB_MSG_HLC_TIMESTAMP, M_FT_UINT64 }, |
| 1548 | |
| 1549 | { AS_HB_MSG_ENDPOINTS, M_FT_BUF }, |
| 1550 | |
| 1551 | { AS_HB_MSG_COMPRESSED_PAYLOAD, M_FT_BUF }, |
| 1552 | |
| 1553 | { AS_HB_MSG_INFO_REQUEST, M_FT_BUF }, |
| 1554 | |
| 1555 | { AS_HB_MSG_INFO_REPLY, M_FT_BUF }, |
| 1556 | |
| 1557 | { AS_HB_MSG_FABRIC_DATA, M_FT_BUF }, |
| 1558 | |
| 1559 | { AS_HB_MSG_HB_DATA, M_FT_BUF }, |
| 1560 | |
| 1561 | { AS_HB_MSG_PAXOS_DATA, M_FT_BUF }, |
| 1562 | |
| 1563 | { AS_HB_MSG_SKEW_MONITOR_DATA, M_FT_UINT64 } }; |
| 1564 | |
| 1565 | /* |
| 1566 | * ---------------------------------------------------------------------------- |
| 1567 | * Private internal function forward declarations. |
| 1568 | * ---------------------------------------------------------------------------- |
| 1569 | */ |
| 1570 | |
// Miscellaneous utility helpers.
static void info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list);
static uint32_t round_up_pow2(uint32_t v);
static int vector_find(cf_vector* vector, const void* element);
| 1574 | |
// Endpoint list helpers.
static void endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src);
static void endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, void* udata);
static void endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata);
| 1578 | |
// Heartbeat message field access / fill helpers.
static int msg_compression_threshold(int mtu);
static int msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list);
static int msg_id_get(msg* msg, uint32_t* id);
static int msg_nodeid_get(msg* msg, cf_node* nodeid);
static int msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts);
static int msg_type_get(msg* msg, as_hb_msg_type* type);
static int msg_cluster_name_get(msg* msg, char** cluster_name);
static int msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, size_t* adj_length);
static int msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length);
static void msg_node_list_set(msg* msg, int field_id, cf_node* node_list, size_t node_length);
static void msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length);
static int msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count);
static void msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, void* udata);
static void msg_src_fields_fill(msg* msg);
static void msg_type_set(msg* msg, as_hb_msg_type msg_type);
| 1594 | |
// Heartbeat configuration accessors (guarded by g_hb_config_lock).
static int config_mcsize();
static const cf_serv_cfg* config_bind_cfg_get();
static const cf_mserv_cfg* config_multicast_group_cfg_get();
static uint32_t config_tx_interval_get();
static void config_tx_interval_set(uint32_t new_interval);
static uint32_t config_override_mtu_get();
static void config_override_mtu_set(uint32_t mtu);
static uint32_t config_max_intervals_missed_get();
static void config_max_intervals_missed_set(uint32_t new_max);
static unsigned char config_multicast_ttl_get();
static as_hb_protocol config_protocol_get();
static void config_protocol_set(as_hb_protocol new_protocol);
static cf_node config_self_nodeid_get();
static as_hb_mode config_mode_get();
static void config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, cf_serv_cfg* published_cfg, bool ipv4_only);
static bool config_binding_is_valid(char** error, as_hb_protocol protocol);
| 1611 | |
// Channel sub module.
static void channel_init_channel(as_hb_channel* channel);
static void channel_event_init(as_hb_channel_event* event);
static bool channel_is_running();
static bool channel_is_stopped();
static uint32_t channel_win_grace_ms();
static void channel_events_enabled_set(bool enabled);
static bool channel_are_events_enabled();
static void channel_event_queue(as_hb_channel_event* event);
static void channel_event_publish_pending();
static int channel_get_channel(cf_socket* socket, as_hb_channel* result);
static void channel_socket_shutdown(cf_socket* socket);
static int channel_socket_get(cf_node nodeid, cf_socket** socket);
static bool channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find);
static void channel_socket_destroy(cf_socket* sock);
static void channel_socket_close(cf_socket* socket, bool remote_close, bool raise_close_event);
static void channel_sockets_close(cf_vector* sockets);
static void channel_socket_close_queue(cf_socket* socket, bool is_remote_close, bool raise_close_event);
static void channel_socket_close_pending();
static void channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, cf_sock_addr* endpoint_addr);
static void channel_accept_connection(cf_socket* lsock);
static as_hb_channel_msg_read_status channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len);
static void channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata);
static int channel_endpoint_search_reduce(const void* key, void* data, void* udata);
static bool channel_endpoint_is_connected(as_endpoint_list* endpoint_list);
static as_hb_channel_msg_read_status channel_multicast_msg_read(cf_socket* socket, msg* msg);
static as_hb_channel_msg_read_status channel_mesh_msg_read(cf_socket* socket, msg* msg);
static void channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid);
static bool channel_socket_should_live(cf_socket* socket, as_hb_channel* channel);
static cf_socket* channel_socket_resolve(cf_socket* socket1, cf_socket* socket2);
static int channel_msg_sanity_check(as_hb_channel_event* msg_event);
static int channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event);
static void channel_msg_read(cf_socket* socket);
static void channel_channels_idle_check();
// Non-static: thread function run by the channel tender thread.
void* channel_tender(void* arg);
static bool channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata);
static void channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, int endpoint_list_count);
static int channel_node_disconnect(cf_node nodeid);
static void channel_mesh_listening_socks_register(cf_sockets* listening_sockets);
static void channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets);
static void channel_multicast_listening_socks_register(cf_sockets* listening_sockets);
static void channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets);
static void channel_init();
static void channel_start();
static int channel_sockets_get_reduce(const void* key, void* data, void* udata);
static void channel_stop();
static int channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length);
static int channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length);
static bool channel_msg_is_compression_required(msg* msg, int wire_size, int mtu);
static int channel_msg_buffer_size_get(int wire_size, int mtu);
static size_t channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, uint8_t* buffer, size_t buffer_len);
static int channel_msg_unicast(cf_node dest, msg* msg);
static int channel_msg_broadcast_reduce(const void* key, void* data, void* udata);
static int channel_msg_broadcast(msg* msg);
static void channel_clear();
static int channel_dump_reduce(const void* key, void* data, void* udata);
static void channel_dump(bool verbose);
| 1668 | |
// Mesh sub module.
static bool mesh_is_running();
static bool mesh_is_stopped();
static void mesh_published_endpoints_process(endpoint_list_process_fn process_fn, void* udata);
static const char* mesh_node_status_string(as_hb_mesh_node_status status);
static int mesh_seed_delete_unsafe(int seed_index);
static int mesh_seed_find_unsafe(char* host, int port);
static void mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, int mesh_node_count);
static void mesh_node_status_change(as_hb_mesh_node* mesh_node, as_hb_mesh_node_status new_status);
static void mesh_listening_sockets_close();
static void mesh_seed_host_list_get(cf_dyn_buf* db, bool tls);
static void mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p);
static void mesh_stop();
static int mesh_tend_reduce(const void* key, void* data, void* udata);
// Non-static: thread function run by the mesh tender thread.
void* mesh_tender(void* arg);
static void mesh_node_destroy(as_hb_mesh_node* mesh_node);
static void mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata);
static bool mesh_node_is_discovered(cf_node nodeid);
static bool mesh_node_endpoint_list_is_valid(cf_node nodeid);
static int mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node);
static void mesh_channel_on_node_disconnect(as_hb_channel_event* event);
static bool mesh_node_check_fix_self_msg(as_hb_channel_event* event);
static void mesh_node_data_update(as_hb_channel_event* event);
static int mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, size_t* reply_size);
static void mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, size_t reply_count);
static msg* mesh_info_msg_init(as_hb_msg_type msg_type);
static void mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, size_t to_discover_count);
static void mesh_channel_on_pulse(msg* msg);
static void mesh_channel_on_info_request(msg* msg);
static void mesh_channel_on_info_reply(msg* msg);
static int mesh_tip(char* host, int port, bool tls);
static void mesh_channel_event_process(as_hb_channel_event* event);
static void mesh_init();
static int mesh_free_node_data_reduce(const void* key, void* data, void* udata);
static int mesh_tip_clear_reduce(const void* key, void* data, void* udata);
static int mesh_peer_endpoint_reduce(const void* key, void* data, void* udata);
static void mesh_clear();
static void mesh_listening_sockets_open();
static void mesh_start();
static int mesh_dump_reduce(const void* key, void* data, void* udata);
static void mesh_dump(bool verbose);
| 1709 | |
// Multicast sub module.
static void multicast_init();
static void multicast_clear();
static void multicast_listening_sockets_open();
static void multicast_start();
static void multicast_listening_sockets_close();
static void multicast_stop();
static void multicast_dump(bool verbose);
static int multicast_supported_cluster_size_get();
| 1718 | |
// Heartbeat main sub module.
static bool hb_is_initialized();
static bool hb_is_running();
static bool hb_is_stopped();
static void hb_mode_init();
static void hb_mode_start();
static int hb_mtu();
static void hb_msg_init();
static uint32_t hb_protocol_identifier_get();
static cf_clock hb_node_depart_time(cf_clock detect_time);
static bool hb_is_mesh();
static void hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, int node_count);
static void hb_event_publish_pending();
static int hb_adjacency_free_data_reduce(const void* key, void* data, void* udata);
static void hb_clear();
static int hb_adjacency_iterate_reduce(const void* key, void* data, void* udata);
static void hb_plugin_set_fn(msg* msg);
static void hb_plugin_parse_data_fn(msg* msg, cf_node source, as_hb_plugin_node_data* prev_plugin_data, as_hb_plugin_node_data* plugin_data);
static msg* hb_msg_get();
static void hb_msg_return(msg* msg);
static void hb_plugin_msg_fill(msg* msg);
static void hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, as_hb_plugin* plugins, bool plugin_data_changed[]);
static void hb_plugin_init();
// Non-static: thread function run by the transmitter thread.
void* hb_transmitter(void* arg);
static int hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node);
static void hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size);
static void hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, cf_node** adjacency_list, size_t* adjacency_length);
static bool hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node);
static bool hb_self_is_duplicate();
static void hb_self_duplicate_update();
static void hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node);
static int hb_adjacency_tend_reduce(const void* key, void* data, void* udata);
// Non-static: thread function run by the adjacency tender thread.
void* hb_adjacency_tender(void* arg);
static void hb_tx_start();
static void hb_tx_stop();
static void hb_adjacency_tender_start();
static void hb_adjacency_tender_stop();
static void hb_init();
static void hb_start();
static void hb_stop();
static void hb_plugin_register(as_hb_plugin* plugin);
static bool hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp send_ts);
static void hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed);
static bool hb_endpoint_change_tracker_is_normal(uint64_t tracker);
static bool hb_endpoint_change_tracker_has_changed(uint64_t tracker);
static int hb_adjacent_node_update(as_hb_channel_event* msg_event, as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]);
static bool hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node);
static void hb_channel_on_self_pulse(as_hb_channel_event* msg_event);
static void hb_channel_on_pulse(as_hb_channel_event* msg_event);
static void hb_channel_on_msg_rcvd(as_hb_channel_event* event);
static void hb_handle_cluster_name_mismatch(as_hb_channel_event* event);
static void hb_channel_event_process(as_hb_channel_event* event);
static void hb_mode_dump(bool verbose);
static int hb_dump_reduce(const void* key, void* data, void* udata);
static void hb_dump(bool verbose);
static void hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph);
static void hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict);
static int hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata);
static void hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
		as_hb_plugin_data_iterate_fn iterate_fn, void* udata);
| 1778 | |
| 1779 | /* |
| 1780 | * ---------------------------------------------------------------------------- |
| 1781 | * Public functions. |
| 1782 | * ---------------------------------------------------------------------------- |
| 1783 | */ |
| 1784 | /** |
| 1785 | * Initialize the heartbeat subsystem. |
| 1786 | */ |
| 1787 | void |
| 1788 | as_hb_init() |
| 1789 | { |
| 1790 | // Initialize hb subsystem. |
| 1791 | hb_init(); |
| 1792 | |
| 1793 | // Add the mesh seed nodes. |
| 1794 | // Using one time seed config outside the config module. |
| 1795 | if (hb_is_mesh()) { |
| 1796 | for (int i = 0; i < AS_CLUSTER_SZ; i++) { |
| 1797 | if (g_config.hb_config.mesh_seed_addrs[i]) { |
| 1798 | mesh_tip(g_config.hb_config.mesh_seed_addrs[i], |
| 1799 | g_config.hb_config.mesh_seed_ports[i], |
| 1800 | g_config.hb_config.mesh_seed_tls[i]); |
| 1801 | } |
| 1802 | else { |
| 1803 | break; |
| 1804 | } |
| 1805 | } |
| 1806 | } |
| 1807 | } |
| 1808 | |
| 1809 | /** |
| 1810 | * Start the heartbeat subsystem. |
| 1811 | */ |
| 1812 | void |
| 1813 | as_hb_start() |
| 1814 | { |
| 1815 | hb_start(); |
| 1816 | } |
| 1817 | |
| 1818 | /** |
| 1819 | * Shut down the heartbeat subsystem. |
| 1820 | */ |
| 1821 | void |
| 1822 | as_hb_shutdown() |
| 1823 | { |
| 1824 | hb_stop(); |
| 1825 | } |
| 1826 | |
| 1827 | /** |
| 1828 | * Indicates if self node is a duplicate |
| 1829 | */ |
| 1830 | bool |
| 1831 | as_hb_self_is_duplicate() |
| 1832 | { |
| 1833 | return hb_self_is_duplicate(); |
| 1834 | } |
| 1835 | |
| 1836 | /** |
| 1837 | * Free the data structures of heart beat. |
| 1838 | */ |
| 1839 | void |
| 1840 | as_hb_destroy() |
| 1841 | { |
| 1842 | // Destroy the main module. |
| 1843 | hb_clear(); |
| 1844 | } |
| 1845 | |
| 1846 | /** |
| 1847 | * Return a string representation of a heartbeat protocol type. |
| 1848 | * |
| 1849 | * @param protocol for which the string is computed |
| 1850 | * @param protocol_s string representation of protocol |
| 1851 | */ |
| 1852 | void |
| 1853 | as_hb_protocol_get_s(as_hb_protocol protocol, char* protocol_s) |
| 1854 | { |
| 1855 | char *str; |
| 1856 | switch (protocol) { |
| 1857 | case AS_HB_PROTOCOL_V3: |
| 1858 | str = "v3" ; |
| 1859 | break; |
| 1860 | case AS_HB_PROTOCOL_NONE: |
| 1861 | str = "none" ; |
| 1862 | break; |
| 1863 | case AS_HB_PROTOCOL_RESET: |
| 1864 | str = "reset" ; |
| 1865 | break; |
| 1866 | default: |
| 1867 | str = "undefined" ; |
| 1868 | } |
| 1869 | |
| 1870 | sprintf(protocol_s, "%s" , str); |
| 1871 | } |
| 1872 | |
| 1873 | /** |
| 1874 | * Set heartbeat protocol version. |
| 1875 | */ |
| 1876 | as_hb_protocol |
| 1877 | as_hb_protocol_get() |
| 1878 | { |
| 1879 | return config_protocol_get(); |
| 1880 | } |
| 1881 | |
| 1882 | /** |
| 1883 | * Set heartbeat protocol version. |
| 1884 | */ |
| 1885 | int |
| 1886 | as_hb_protocol_set(as_hb_protocol new_protocol) |
| 1887 | { |
| 1888 | SET_PROTOCOL_LOCK(); |
| 1889 | int rv = 0; |
| 1890 | if (config_protocol_get() == new_protocol) { |
| 1891 | INFO("no heartbeat protocol change needed" ); |
| 1892 | rv = 0; |
| 1893 | goto Exit; |
| 1894 | } |
| 1895 | char old_protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
| 1896 | char new_protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
| 1897 | as_hb_protocol_get_s(config_protocol_get(), old_protocol_s); |
| 1898 | as_hb_protocol_get_s(new_protocol, new_protocol_s); |
| 1899 | switch (new_protocol) { |
| 1900 | case AS_HB_PROTOCOL_V3: |
| 1901 | if (hb_is_running()) { |
| 1902 | INFO("disabling current heartbeat protocol %s" , old_protocol_s); |
| 1903 | hb_stop(); |
| 1904 | } |
| 1905 | INFO("setting heartbeat protocol version number to %s" , new_protocol_s); |
| 1906 | config_protocol_set(new_protocol); |
| 1907 | hb_start(); |
| 1908 | INFO("heartbeat protocol version set to %s" , new_protocol_s); |
| 1909 | break; |
| 1910 | |
| 1911 | case AS_HB_PROTOCOL_NONE: |
| 1912 | INFO("setting heartbeat protocol version to none" ); |
| 1913 | hb_stop(); |
| 1914 | config_protocol_set(new_protocol); |
| 1915 | INFO("heartbeat protocol set to none" ); |
| 1916 | break; |
| 1917 | |
| 1918 | case AS_HB_PROTOCOL_RESET: |
| 1919 | if (config_protocol_get() == AS_HB_PROTOCOL_NONE) { |
| 1920 | INFO("heartbeat messaging disabled ~~ not resetting" ); |
| 1921 | rv = -1; |
| 1922 | goto Exit; |
| 1923 | } |
| 1924 | |
| 1925 | // NB: "protocol" is never actually set to "RESET" ~~ |
| 1926 | // it is simply a trigger for the reset action. |
| 1927 | INFO("resetting heartbeat messaging" ); |
| 1928 | |
| 1929 | hb_stop(); |
| 1930 | |
| 1931 | hb_clear(); |
| 1932 | |
| 1933 | hb_start(); |
| 1934 | |
| 1935 | break; |
| 1936 | |
| 1937 | default: |
| 1938 | WARNING("unknown heartbeat protocol version number: %d" , new_protocol); |
| 1939 | rv = -1; |
| 1940 | goto Exit; |
| 1941 | } |
| 1942 | |
| 1943 | Exit: |
| 1944 | SET_PROTOCOL_UNLOCK(); |
| 1945 | return rv; |
| 1946 | } |
| 1947 | |
| 1948 | /** |
| 1949 | * Register a heartbeat plugin. |
| 1950 | */ |
| 1951 | void |
| 1952 | as_hb_plugin_register(as_hb_plugin* plugin) |
| 1953 | { |
| 1954 | if (!hb_is_initialized()) { |
| 1955 | WARNING( |
| 1956 | "main heartbeat module uninitialized - not registering the plugin" ); |
| 1957 | return; |
| 1958 | } |
| 1959 | hb_plugin_register(plugin); |
| 1960 | } |
| 1961 | |
| 1962 | /** |
| 1963 | * Register a heartbeat node event listener. |
| 1964 | */ |
| 1965 | void |
| 1966 | as_hb_register_listener(as_hb_event_fn event_callback, void* udata) |
| 1967 | { |
| 1968 | if (!hb_is_initialized()) { |
| 1969 | WARNING( |
| 1970 | "main heartbeat module uninitialized - not registering the listener" ); |
| 1971 | return; |
| 1972 | } |
| 1973 | |
| 1974 | HB_LOCK(); |
| 1975 | |
| 1976 | if (g_hb_event_listeners.event_listener_count >= |
| 1977 | AS_HB_EVENT_LISTENER_MAX) { |
| 1978 | CRASH("cannot register more than %d event listeners" , |
| 1979 | AS_HB_EVENT_LISTENER_MAX); |
| 1980 | } |
| 1981 | |
| 1982 | g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].event_callback = |
| 1983 | event_callback; |
| 1984 | g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].udata = |
| 1985 | udata; |
| 1986 | g_hb_event_listeners.event_listener_count++; |
| 1987 | |
| 1988 | HB_UNLOCK(); |
| 1989 | } |
| 1990 | |
| 1991 | /** |
| 1992 | * Validate heartbeat config. |
| 1993 | */ |
| 1994 | void |
| 1995 | as_hb_config_validate() |
| 1996 | { |
| 1997 | char *error; |
| 1998 | // Validate clustering and heartbeat version compatibility. |
| 1999 | as_hb_protocol hb_protocol = config_protocol_get(); |
| 2000 | |
| 2001 | if (hb_protocol != AS_HB_PROTOCOL_V3 |
| 2002 | && hb_protocol != AS_HB_PROTOCOL_NONE) { |
| 2003 | CRASH_NOSTACK("clustering protocol v5 requires hearbeat version v3" ); |
| 2004 | } |
| 2005 | |
| 2006 | if (!config_binding_is_valid(&error, hb_protocol)) { |
| 2007 | CRASH_NOSTACK("%s" , error); |
| 2008 | } |
| 2009 | } |
| 2010 | |
| 2011 | /** |
| 2012 | * Override the computed MTU for the network interface used by heartbeat. |
| 2013 | */ |
| 2014 | void |
| 2015 | as_hb_override_mtu_set(int mtu) |
| 2016 | { |
| 2017 | config_override_mtu_set(mtu); |
| 2018 | } |
| 2019 | |
| 2020 | /** |
| 2021 | * Get the heartbeat pulse transmit interval. |
| 2022 | */ |
| 2023 | uint32_t |
| 2024 | as_hb_tx_interval_get() |
| 2025 | { |
| 2026 | return config_tx_interval_get(); |
| 2027 | } |
| 2028 | |
| 2029 | /** |
| 2030 | * Set the heartbeat pulse transmit interval. |
| 2031 | */ |
| 2032 | int |
| 2033 | as_hb_tx_interval_set(uint32_t new_interval) |
| 2034 | { |
| 2035 | if (new_interval < AS_HB_TX_INTERVAL_MS_MIN |
| 2036 | || new_interval > AS_HB_TX_INTERVAL_MS_MAX) { |
| 2037 | WARNING("heartbeat interval must be >= %u and <= %u - ignoring %u" , |
| 2038 | AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX, |
| 2039 | new_interval); |
| 2040 | return (-1); |
| 2041 | } |
| 2042 | config_tx_interval_set(new_interval); |
| 2043 | return (0); |
| 2044 | } |
| 2045 | |
| 2046 | /** |
| 2047 | * Get the maximum number of missed heartbeat intervals after which a node is |
| 2048 | * considered expired. |
| 2049 | */ |
| 2050 | uint32_t |
| 2051 | as_hb_max_intervals_missed_get() |
| 2052 | { |
| 2053 | return config_max_intervals_missed_get(); |
| 2054 | } |
| 2055 | |
| 2056 | /** |
| 2057 | * Set the maximum number of missed heartbeat intervals after which a node is |
| 2058 | * considered expired. |
| 2059 | */ |
| 2060 | int |
| 2061 | as_hb_max_intervals_missed_set(uint32_t new_max) |
| 2062 | { |
| 2063 | if (new_max < AS_HB_MAX_INTERVALS_MISSED_MIN) { |
| 2064 | WARNING("heartbeat timeout must be >= %u - ignoring %u" , |
| 2065 | AS_HB_MAX_INTERVALS_MISSED_MIN, new_max); |
| 2066 | return (-1); |
| 2067 | } |
| 2068 | config_max_intervals_missed_set(new_max); |
| 2069 | return (0); |
| 2070 | } |
| 2071 | |
| 2072 | /** |
| 2073 | * Get the timeout interval to consider a node dead / expired in milliseconds if |
| 2074 | * no heartbeat pulse messages are received. |
| 2075 | */ |
| 2076 | uint32_t |
| 2077 | as_hb_node_timeout_get() |
| 2078 | { |
| 2079 | return HB_NODE_TIMEOUT(); |
| 2080 | } |
| 2081 | |
| 2082 | /** |
| 2083 | * Populate the buffer with heartbeat configuration. |
| 2084 | */ |
| 2085 | void |
| 2086 | as_hb_info_config_get(cf_dyn_buf* db) |
| 2087 | { |
| 2088 | if (hb_is_mesh()) { |
| 2089 | info_append_string(db, "heartbeat.mode" , "mesh" ); |
| 2090 | info_append_addrs(db, "heartbeat.address" , &g_config.hb_serv_spec.bind); |
| 2091 | info_append_uint32(db, "heartbeat.port" , |
| 2092 | (uint32_t)g_config.hb_serv_spec.bind_port); |
| 2093 | info_append_addrs(db, "heartbeat.tls-address" , |
| 2094 | &g_config.hb_tls_serv_spec.bind); |
| 2095 | info_append_uint32(db, "heartbeat.tls-port" , |
| 2096 | g_config.hb_tls_serv_spec.bind_port); |
| 2097 | info_append_string_safe(db, "heartbeat.tls-name" , |
| 2098 | g_config.hb_tls_serv_spec.tls_our_name); |
| 2099 | mesh_seed_host_list_get(db, true); |
| 2100 | } |
| 2101 | else { |
| 2102 | info_append_string(db, "heartbeat.mode" , "multicast" ); |
| 2103 | info_append_addrs(db, "heartbeat.address" , &g_config.hb_serv_spec.bind); |
| 2104 | info_append_addrs(db, "heartbeat.multicast-group" , |
| 2105 | &g_config.hb_multicast_groups); |
| 2106 | info_append_uint32(db, "heartbeat.port" , |
| 2107 | (uint32_t)g_config.hb_serv_spec.bind_port); |
| 2108 | } |
| 2109 | |
| 2110 | info_append_uint32(db, "heartbeat.interval" , config_tx_interval_get()); |
| 2111 | info_append_uint32(db, "heartbeat.timeout" , |
| 2112 | config_max_intervals_missed_get()); |
| 2113 | |
| 2114 | info_append_int(db, "heartbeat.mtu" , hb_mtu()); |
| 2115 | |
| 2116 | char protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
| 2117 | as_hb_protocol_get_s(config_protocol_get(), protocol_s); |
| 2118 | |
| 2119 | info_append_string(db, "heartbeat.protocol" , protocol_s); |
| 2120 | } |
| 2121 | |
| 2122 | /** |
| 2123 | * Populate heartbeat endpoints. |
| 2124 | */ |
| 2125 | void |
| 2126 | as_hb_info_endpoints_get(cf_dyn_buf* db) |
| 2127 | { |
| 2128 | const cf_serv_cfg *cfg = config_bind_cfg_get(); |
| 2129 | |
| 2130 | if (cfg->n_cfgs == 0) { |
| 2131 | // Will never happen in practice. |
| 2132 | return; |
| 2133 | } |
| 2134 | |
| 2135 | info_append_int(db, "heartbeat.port" , g_config.hb_serv_spec.bind_port); |
| 2136 | |
| 2137 | char *string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT); |
| 2138 | info_append_string(db, "heartbeat.addresses" , string); |
| 2139 | cf_free(string); |
| 2140 | |
| 2141 | info_append_int(db, "heartbeat.tls-port" , |
| 2142 | g_config.hb_tls_serv_spec.bind_port); |
| 2143 | |
| 2144 | string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT_TLS); |
| 2145 | info_append_string(db, "heartbeat.tls-addresses" , string); |
| 2146 | cf_free(string); |
| 2147 | |
| 2148 | if (hb_is_mesh()) { |
| 2149 | MESH_LOCK(); |
| 2150 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
| 2151 | mesh_peer_endpoint_reduce, db); |
| 2152 | MESH_UNLOCK(); |
| 2153 | } |
| 2154 | else { |
| 2155 | // Output multicast groups. |
| 2156 | const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get(); |
| 2157 | if (multicast_cfg->n_cfgs == 0) { |
| 2158 | return; |
| 2159 | } |
| 2160 | |
| 2161 | cf_dyn_buf_append_string(db, "heartbeat.multicast-groups=" ); |
| 2162 | uint32_t count = 0; |
| 2163 | for (uint32_t i = 0; i < multicast_cfg->n_cfgs; ++i) { |
| 2164 | if (count > 0) { |
| 2165 | cf_dyn_buf_append_char(db, ','); |
| 2166 | } |
| 2167 | |
| 2168 | cf_dyn_buf_append_string(db, |
| 2169 | cf_ip_addr_print(&multicast_cfg->cfgs[i].addr)); |
| 2170 | ++count; |
| 2171 | } |
| 2172 | cf_dyn_buf_append_char(db, ';'); |
| 2173 | } |
| 2174 | } |
| 2175 | |
| 2176 | /** |
| 2177 | * Generate a string for listening address and port in format ip_address:port |
| 2178 | * and return the heartbeat mode. |
| 2179 | * |
| 2180 | * @param mode (output) current heartbeat subsystem mode. |
| 2181 | * @param addr_port (output) listening ip address and port formatted as |
| 2182 | * ip_address:port |
| 2183 | * @param addr_port_capacity the capacity of the addr_port input. |
| 2184 | */ |
| 2185 | void |
| 2186 | as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port, |
| 2187 | size_t addr_port_capacity) |
| 2188 | { |
| 2189 | *mode = hb_is_mesh() ? AS_HB_MODE_MESH : AS_HB_MODE_MULTICAST; |
| 2190 | if (hb_is_mesh()) { |
| 2191 | endpoint_list_to_string_udata udata; |
| 2192 | udata.endpoint_list_str = addr_port; |
| 2193 | udata.endpoint_list_str_capacity = addr_port_capacity; |
| 2194 | mesh_published_endpoints_process(endpoint_list_to_string_process, |
| 2195 | &udata); |
| 2196 | } |
| 2197 | else { |
| 2198 | const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get(); |
| 2199 | |
| 2200 | char* write_ptr = addr_port; |
| 2201 | int remaining = addr_port_capacity; |
| 2202 | |
| 2203 | // Ensure we leave space for the terminating NULL delimiter. |
| 2204 | for (int i = 0; i < multicast_cfg->n_cfgs && remaining > 1; i++) { |
| 2205 | cf_sock_addr temp; |
| 2206 | cf_ip_addr_copy(&multicast_cfg->cfgs[i].addr, &temp.addr); |
| 2207 | temp.port = multicast_cfg->cfgs[i].port; |
| 2208 | int rv = cf_sock_addr_to_string(&temp, write_ptr, remaining); |
| 2209 | if (rv <= 0) { |
| 2210 | // We exhausted the write buffer. |
| 2211 | // Ensure NULL termination. |
| 2212 | addr_port[addr_port_capacity - 1] = 0; |
| 2213 | return; |
| 2214 | } |
| 2215 | |
| 2216 | write_ptr += rv; |
| 2217 | remaining -= rv; |
| 2218 | |
| 2219 | if (i != multicast_cfg->n_cfgs - 1 && remaining > 1) { |
| 2220 | *write_ptr = ','; |
| 2221 | write_ptr++; |
| 2222 | remaining--; |
| 2223 | } |
| 2224 | } |
| 2225 | |
| 2226 | // Ensure NULL termination. |
| 2227 | *write_ptr = 0; |
| 2228 | } |
| 2229 | } |
| 2230 | |
| 2231 | /** |
| 2232 | * Populate the buffer with duplicate nodeids. |
| 2233 | */ |
| 2234 | void |
| 2235 | as_hb_info_duplicates_get(cf_dyn_buf* db) |
| 2236 | { |
| 2237 | cf_dyn_buf_append_string(db, "cluster_duplicate_nodes=" ); |
| 2238 | |
| 2239 | HB_LOCK(); |
| 2240 | bool self_is_duplicate = hb_self_is_duplicate(); |
| 2241 | int num_probation = cf_shash_get_size(g_hb.on_probation); |
| 2242 | cf_node duplicate_list[num_probation + 1]; |
| 2243 | |
| 2244 | if (!self_is_duplicate && num_probation == 0) { |
| 2245 | cf_dyn_buf_append_string(db, "null" ); |
| 2246 | goto Exit; |
| 2247 | } |
| 2248 | |
| 2249 | as_hb_adjacency_reduce_udata probation_reduce_udata = { duplicate_list, 0 }; |
| 2250 | |
| 2251 | cf_shash_reduce(g_hb.on_probation, hb_adjacency_iterate_reduce, |
| 2252 | &probation_reduce_udata); |
| 2253 | |
| 2254 | if (hb_self_is_duplicate()) { |
| 2255 | duplicate_list[probation_reduce_udata.adj_count++] = |
| 2256 | config_self_nodeid_get(); |
| 2257 | } |
| 2258 | |
| 2259 | int num_duplicates = probation_reduce_udata.adj_count; |
| 2260 | qsort(duplicate_list, num_duplicates, sizeof(cf_node), |
| 2261 | cf_node_compare_desc); |
| 2262 | |
| 2263 | for (int i = 0; i < num_duplicates; i++) { |
| 2264 | cf_dyn_buf_append_uint64_x(db, duplicate_list[i]); |
| 2265 | cf_dyn_buf_append_char(db, ','); |
| 2266 | } |
| 2267 | cf_dyn_buf_chomp(db); |
| 2268 | |
| 2269 | Exit: |
| 2270 | HB_UNLOCK(); |
| 2271 | cf_dyn_buf_append_char(db, ';'); |
| 2272 | } |
| 2273 | |
| 2274 | /* |
| 2275 | * ----------------------------------------------------------------- |
| 2276 | * Mesh mode public API |
| 2277 | * ----------------------------------------------------------------- |
| 2278 | */ |
| 2279 | |
| 2280 | /** |
| 2281 | * Add an aerospike instance from the mesh seed list. |
| 2282 | */ |
| 2283 | int |
| 2284 | as_hb_mesh_tip(char* host, int port, bool tls) |
| 2285 | { |
| 2286 | if (!hb_is_mesh()) { |
| 2287 | WARNING("tip not applicable for multicast" ); |
| 2288 | return (-1); |
| 2289 | } |
| 2290 | |
| 2291 | return mesh_tip(host, port, tls); |
| 2292 | } |
| 2293 | |
| 2294 | /** |
| 2295 | * Remove a mesh node instance from the mesh list. |
| 2296 | */ |
| 2297 | int |
| 2298 | as_hb_mesh_tip_clear(char* host, int port) |
| 2299 | { |
| 2300 | if (!hb_is_mesh()) { |
| 2301 | WARNING("tip clear not applicable for multicast" ); |
| 2302 | return (-1); |
| 2303 | } |
| 2304 | |
| 2305 | if (host == NULL || host[0] == 0 |
| 2306 | || strnlen(host, DNS_NAME_MAX_SIZE) == DNS_NAME_MAX_SIZE) { |
| 2307 | WARNING("invalid tip clear host:%s or port:%d" , host, port); |
| 2308 | return (-1); |
| 2309 | } |
| 2310 | |
| 2311 | MESH_LOCK(); |
| 2312 | DETAIL("executing tip clear for %s:%d" , host, port); |
| 2313 | |
| 2314 | // FIXME: Remove the mesh host entry and close channel was done to meet |
| 2315 | // AER-5241 ??? |
| 2316 | // tip-clear is not a mechanism to throw a connected node out of the |
| 2317 | // cluster. |
| 2318 | // We should not be required to use this mechanism now. |
| 2319 | // tip-clear should only be used to cleanup seed list after decommisioning |
| 2320 | // an ip. |
| 2321 | cf_ip_addr addrs[CF_SOCK_CFG_MAX]; |
| 2322 | uint32_t n_addrs = CF_SOCK_CFG_MAX; |
| 2323 | |
| 2324 | as_hb_mesh_tip_clear_udata mesh_tip_clear_reduce_udata; |
| 2325 | strcpy(mesh_tip_clear_reduce_udata.host, host); |
| 2326 | mesh_tip_clear_reduce_udata.port = port; |
| 2327 | mesh_tip_clear_reduce_udata.entry_deleted = false; |
| 2328 | mesh_tip_clear_reduce_udata.nodeid = 0; |
| 2329 | |
| 2330 | if (cf_ip_addr_from_string_multi(host, addrs, &n_addrs) != 0) { |
| 2331 | n_addrs = 0; |
| 2332 | } |
| 2333 | |
| 2334 | mesh_tip_clear_reduce_udata.addrs = addrs; |
| 2335 | mesh_tip_clear_reduce_udata.n_addrs = n_addrs; |
| 2336 | |
| 2337 | int seed_index = mesh_seed_find_unsafe(host, port); |
| 2338 | if (seed_index >= 0) { |
| 2339 | as_hb_mesh_seed* seed = cf_vector_getp( |
| 2340 | &g_hb.mode_state.mesh_state.seeds, seed_index); |
| 2341 | mesh_tip_clear_reduce_udata.nodeid = seed->mesh_nodeid; |
| 2342 | } |
| 2343 | |
| 2344 | // Refresh the mapping between the seeds and the mesh hosts. |
| 2345 | mesh_seed_inactive_refresh_get_unsafe (NULL); |
| 2346 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
| 2347 | mesh_tip_clear_reduce, &mesh_tip_clear_reduce_udata); |
| 2348 | |
| 2349 | // Remove the seed entry in case we do not find a matching mesh entry. |
| 2350 | // Will happen trivially if this seed could not be connected. |
| 2351 | mesh_tip_clear_reduce_udata.entry_deleted = |
| 2352 | mesh_tip_clear_reduce_udata.entry_deleted |
| 2353 | || mesh_seed_delete_unsafe( |
| 2354 | mesh_seed_find_unsafe(host, port)) == 0; |
| 2355 | |
| 2356 | MESH_UNLOCK(); |
| 2357 | return mesh_tip_clear_reduce_udata.entry_deleted ? 0 : -1; |
| 2358 | } |
| 2359 | |
| 2360 | /** |
| 2361 | * Clear the entire mesh list. |
| 2362 | */ |
| 2363 | int |
| 2364 | as_hb_mesh_tip_clear_all(uint32_t* cleared) |
| 2365 | { |
| 2366 | if (!hb_is_mesh()) { |
| 2367 | WARNING("tip clear not applicable for multicast" ); |
| 2368 | return (-1); |
| 2369 | } |
| 2370 | |
| 2371 | MESH_LOCK(); |
| 2372 | *cleared = cf_shash_get_size( |
| 2373 | g_hb.mode_state.mesh_state.nodeid_to_mesh_node); |
| 2374 | |
| 2375 | // Refresh the mapping between the seeds and the mesh hosts. |
| 2376 | mesh_seed_inactive_refresh_get_unsafe(NULL); |
| 2377 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
| 2378 | mesh_tip_clear_reduce, NULL); |
| 2379 | |
| 2380 | // Remove all entries that did not have a matching mesh endpoint. |
| 2381 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 2382 | int element_count = cf_vector_size(seeds); |
| 2383 | for (int i = 0; i < element_count; i++) { |
| 2384 | if (mesh_seed_delete_unsafe(i) == 0) { |
| 2385 | i--; |
| 2386 | element_count--; |
| 2387 | } |
| 2388 | else { |
| 2389 | // Should not happen in practice. |
| 2390 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 2391 | CRASH("error deleting mesh seed entry %s:%d" , seed->seed_host_name, |
| 2392 | seed->seed_port); |
| 2393 | } |
| 2394 | } |
| 2395 | |
| 2396 | MESH_UNLOCK(); |
| 2397 | return (0); |
| 2398 | } |
| 2399 | |
| 2400 | /** |
| 2401 | * Read the plugin data for a node in the adjacency list. The plugin_data->data |
| 2402 | * input param should be pre allocated and plugin_data->data_capacity should |
| 2403 | * indicate its capacity. |
| 2404 | * |
| 2405 | * @param nodeid the node id |
| 2406 | * @param pluginid the plugin identifier. |
| 2407 | * @param plugin_data (input/output) on success plugin_data->data will be the |
| 2408 | * plugin's data for the node and plugin_data->data_size will be the data size. |
| 2409 | * node. NULL if there is no plugin data. |
| 2410 | * @praram msg_hlc_ts (output) if not NULL will be filled with the timestamp of |
| 2411 | * when the hb message for this data was received. |
| 2412 | * @param recv_monotonic_ts (output) if not NULL will be filled with monotonic |
| 2413 | * wall clock receive timestamp for this plugin data. |
| 2414 | * @return 0 on success and -1 on error, where errno will be set to ENOENT if |
| 2415 | * there is no entry for this node and ENOMEM if the input plugin data's |
| 2416 | * capacity is less than plugin's data. In ENOMEM case plugin_data->data_size |
| 2417 | * will be set to the required capacity. |
| 2418 | */ |
| 2419 | int |
| 2420 | as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin, |
| 2421 | as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts, |
| 2422 | cf_clock* recv_monotonic_ts) |
| 2423 | { |
| 2424 | int rv = 0; |
| 2425 | |
| 2426 | HB_LOCK(); |
| 2427 | |
| 2428 | as_hb_adjacent_node adjacent_node; |
| 2429 | if (hb_adjacent_node_get(nodeid, &adjacent_node) != 0) { |
| 2430 | rv = -1; |
| 2431 | plugin_data->data_size = 0; |
| 2432 | errno = ENOENT; |
| 2433 | goto Exit; |
| 2434 | } |
| 2435 | |
| 2436 | as_hb_plugin_node_data* plugin_data_internal = |
| 2437 | &adjacent_node.plugin_data[plugin][adjacent_node.plugin_data_cycler |
| 2438 | % 2]; |
| 2439 | |
| 2440 | if (plugin_data_internal->data && plugin_data_internal->data_size) { |
| 2441 | // Set the plugin data size |
| 2442 | plugin_data->data_size = plugin_data_internal->data_size; |
| 2443 | |
| 2444 | if (plugin_data_internal->data_size > plugin_data->data_capacity) { |
| 2445 | rv = -1; |
| 2446 | errno = ENOMEM; |
| 2447 | goto Exit; |
| 2448 | } |
| 2449 | |
| 2450 | // Copy over the stored copy of the plugin data. |
| 2451 | memcpy(plugin_data->data, plugin_data_internal->data, |
| 2452 | plugin_data_internal->data_size); |
| 2453 | |
| 2454 | // Copy the message timestamp. |
| 2455 | if (msg_hlc_ts) { |
| 2456 | memcpy(msg_hlc_ts, &adjacent_node.last_msg_hlc_ts, |
| 2457 | sizeof(as_hlc_msg_timestamp)); |
| 2458 | } |
| 2459 | |
| 2460 | if (recv_monotonic_ts) { |
| 2461 | *recv_monotonic_ts = adjacent_node.last_updated_monotonic_ts; |
| 2462 | } |
| 2463 | |
| 2464 | rv = 0; |
| 2465 | } |
| 2466 | else { |
| 2467 | // No plugin data set. |
| 2468 | plugin_data->data_size = 0; |
| 2469 | if (recv_monotonic_ts) { |
| 2470 | *recv_monotonic_ts = 0; |
| 2471 | } |
| 2472 | if (msg_hlc_ts) { |
| 2473 | memset(msg_hlc_ts, 0, sizeof(as_hlc_msg_timestamp)); |
| 2474 | } |
| 2475 | rv = 0; |
| 2476 | } |
| 2477 | |
| 2478 | Exit: |
| 2479 | HB_UNLOCK(); |
| 2480 | return rv; |
| 2481 | } |
| 2482 | |
| 2483 | /** |
| 2484 | * Call the iterate method on plugin data for all nodes in the input vector. The |
| 2485 | * iterate function will be invoked for all nodes in the input vector even if |
| 2486 | * they are not in the adjacency list or they have no plugin data. Plugin data |
| 2487 | * will be NULL with size zero in such cases. |
| 2488 | * |
| 2489 | * @param nodes the iterate on. |
| 2490 | * @param plugin the plugin identifier. |
| 2491 | * @param iterate_fn the iterate function invoked for plugin data for every |
| 2492 | * node. |
| 2493 | * @param udata passed as is to the iterate function. Useful for getting results |
| 2494 | * out of the iteration. |
| 2495 | * NULL if there is no plugin data. |
| 2496 | * @return the size of the plugin data. 0 if there is no plugin data. |
| 2497 | */ |
| 2498 | void |
| 2499 | as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin, |
| 2500 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata) |
| 2501 | |
| 2502 | { |
| 2503 | HB_LOCK(); |
| 2504 | |
| 2505 | int size = cf_vector_size(nodes); |
| 2506 | |
| 2507 | for (int i = 0; i < size; i++) { |
| 2508 | cf_node* nodeid = cf_vector_getp(nodes, i); |
| 2509 | |
| 2510 | if (nodeid == NULL || *nodeid == 0) { |
| 2511 | continue; |
| 2512 | } |
| 2513 | |
| 2514 | as_hb_adjacent_node nodeinfo; |
| 2515 | |
| 2516 | if (hb_adjacent_node_get(*nodeid, &nodeinfo) == 0) { |
| 2517 | size_t data_size = 0; |
| 2518 | void* data = NULL; |
| 2519 | |
| 2520 | hb_adjacent_node_plugin_data_get(&nodeinfo, plugin, &data, |
| 2521 | &data_size); |
| 2522 | |
| 2523 | iterate_fn(*nodeid, data, data_size, |
| 2524 | nodeinfo.last_updated_monotonic_ts, |
| 2525 | &nodeinfo.last_msg_hlc_ts, udata); |
| 2526 | } |
| 2527 | else { |
| 2528 | // This node is not known to the heartbeat subsystem. |
| 2529 | iterate_fn(*nodeid, NULL, 0, 0, NULL, udata); |
| 2530 | } |
| 2531 | } |
| 2532 | |
| 2533 | HB_UNLOCK(); |
| 2534 | } |
| 2535 | |
| 2536 | /** |
| 2537 | * Call the iterate method on all nodes in current adjacency list. Note plugin |
| 2538 | * data can still be NULL if the plugin data failed to parse the plugin data. |
| 2539 | * |
| 2540 | * @param pluginid the plugin identifier. |
| 2541 | * @param iterate_fn the iterate function invoked for plugin data for every |
| 2542 | * node. |
| 2543 | * @param udata passed as is to the iterate function. Useful for getting results |
| 2544 | * out of the iteration. |
| 2545 | * NULL if there is no plugin data. |
| 2546 | * @return the size of the plugin data. 0 if there is no plugin data. |
| 2547 | */ |
| 2548 | void |
| 2549 | as_hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, |
| 2550 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata) |
| 2551 | { |
| 2552 | hb_plugin_data_iterate_all(pluginid, iterate_fn, udata); |
| 2553 | } |
| 2554 | |
| 2555 | /** |
| 2556 | * Log the state of the heartbeat module. |
| 2557 | */ |
| 2558 | void |
| 2559 | as_hb_dump(bool verbose) |
| 2560 | { |
| 2561 | INFO("Heartbeat Dump:" ); |
| 2562 | |
| 2563 | as_hb_mode mode; |
| 2564 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 2565 | as_hb_info_listen_addr_get(&mode, endpoint_list_str, |
| 2566 | sizeof(endpoint_list_str)); |
| 2567 | |
| 2568 | // Dump the config. |
| 2569 | INFO("HB Mode: %s (%d)" , |
| 2570 | (mode == AS_HB_MODE_MULTICAST ? |
| 2571 | "multicast" : |
| 2572 | (mode == AS_HB_MODE_MESH ? "mesh" : "undefined" )), mode); |
| 2573 | |
| 2574 | INFO("HB Addresses: {%s}" , endpoint_list_str); |
| 2575 | INFO("HB MTU: %d" , hb_mtu()); |
| 2576 | |
| 2577 | INFO("HB Interval: %d" , config_tx_interval_get()); |
| 2578 | INFO("HB Timeout: %d" , config_max_intervals_missed_get()); |
| 2579 | char protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
| 2580 | as_hb_protocol_get_s(config_protocol_get(), protocol_s); |
| 2581 | INFO("HB Protocol: %s (%d)" , protocol_s, config_protocol_get()); |
| 2582 | |
| 2583 | // dump mode specific state. |
| 2584 | hb_mode_dump(verbose); |
| 2585 | |
| 2586 | // Dump the channel state. |
| 2587 | channel_dump(verbose); |
| 2588 | |
| 2589 | // Dump the adjacency list. |
| 2590 | hb_dump(verbose); |
| 2591 | } |
| 2592 | |
| 2593 | /** |
| 2594 | * Indicates if a node is alive. |
| 2595 | */ |
| 2596 | bool |
| 2597 | as_hb_is_alive(cf_node nodeid) |
| 2598 | { |
| 2599 | bool is_alive; |
| 2600 | HB_LOCK(); |
| 2601 | |
| 2602 | as_hb_adjacent_node adjacent_node; |
| 2603 | is_alive = (nodeid == config_self_nodeid_get()) |
| 2604 | || (hb_adjacent_node_get(nodeid, &adjacent_node) == 0); |
| 2605 | |
| 2606 | HB_UNLOCK(); |
| 2607 | return is_alive; |
| 2608 | } |
| 2609 | |
| 2610 | /** |
| 2611 | * Compute the nodes to evict from the input nodes so that remaining nodes form |
| 2612 | * a clique, based on adjacency lists. Self nodeid is never considered for |
| 2613 | * eviction. |
| 2614 | * |
| 2615 | * @param nodes input cf_node vector. |
| 2616 | * @param nodes_to_evict output cf_node clique array, that is initialized. |
| 2617 | */ |
| 2618 | void |
| 2619 | as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict) |
| 2620 | { |
| 2621 | hb_maximal_clique_evict(nodes, nodes_to_evict); |
| 2622 | } |
| 2623 | |
| 2624 | /** |
| 2625 | * Read the hlc timestamp for the message. |
| 2626 | * Note: A protected API for the sole benefit of skew monitor. |
| 2627 | * |
| 2628 | * @param msg the incoming message. |
| 2629 | * @param send_ts the output hlc timestamp. |
| 2630 | * @return 0 if the time stamp could be parsed -1 on failure. |
| 2631 | */ |
| 2632 | int |
| 2633 | as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts) |
| 2634 | { |
| 2635 | return msg_send_hlc_ts_get(msg, send_ts); |
| 2636 | } |
| 2637 | |
| 2638 | /* |
| 2639 | * ---------------------------------------------------------------------------- |
| 2640 | * Common sub module. |
| 2641 | * ---------------------------------------------------------------------------- |
| 2642 | */ |
| 2643 | |
| 2644 | /* |
| 2645 | * ---------------------------------------------------------------------------- |
| 2646 | * Utility |
| 2647 | * ---------------------------------------------------------------------------- |
| 2648 | */ |
| 2649 | |
| 2650 | /** |
| 2651 | * Round up input int to the nearest power of two. |
| 2652 | */ |
| 2653 | static uint32_t |
| 2654 | round_up_pow2(uint32_t v) |
| 2655 | { |
| 2656 | v--; |
| 2657 | v |= v >> 1; |
| 2658 | v |= v >> 2; |
| 2659 | v |= v >> 4; |
| 2660 | v |= v >> 8; |
| 2661 | v |= v >> 16; |
| 2662 | v++; |
| 2663 | return v; |
| 2664 | } |
| 2665 | |
| 2666 | /** |
| 2667 | * Generate a hash code for a cf_socket. |
| 2668 | */ |
| 2669 | static uint32_t |
| 2670 | hb_socket_hash_fn(const void* key) |
| 2671 | { |
| 2672 | const cf_socket** socket = (const cf_socket**)key; |
| 2673 | return cf_hash_jen32((const uint8_t*)socket, sizeof(cf_socket*)); |
| 2674 | } |
| 2675 | |
| 2676 | /** |
| 2677 | * Reduce function to delete all entries in a map |
| 2678 | */ |
| 2679 | static int |
| 2680 | hb_delete_all_reduce(const void* key, void* data, void* udata) |
| 2681 | { |
| 2682 | return CF_SHASH_REDUCE_DELETE; |
| 2683 | } |
| 2684 | |
| 2685 | /* |
| 2686 | * ---------------------------------------------------------------------------- |
| 2687 | * Info call related |
| 2688 | * ---------------------------------------------------------------------------- |
| 2689 | */ |
| 2690 | |
| 2691 | /** |
| 2692 | * Append a address spec to a cf_dyn_buf. |
| 2693 | */ |
| 2694 | static void |
| 2695 | info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list) |
| 2696 | { |
| 2697 | for (uint32_t i = 0; i < list->n_addrs; ++i) { |
| 2698 | info_append_string(db, name, list->addrs[i]); |
| 2699 | } |
| 2700 | } |
| 2701 | |
| 2702 | /* |
| 2703 | * ---------------------------------------------------------------------------- |
| 2704 | * Vector operations |
| 2705 | * ---------------------------------------------------------------------------- |
| 2706 | */ |
| 2707 | |
| 2708 | /** |
| 2709 | * TODO: Move this to cf_vector. |
| 2710 | * Find the index of an element in the vector. Equality is based on mem compare. |
| 2711 | * |
| 2712 | * @param vector the source vector. |
| 2713 | * @param element the element to find. |
| 2714 | * @return the index if the element is found, -1 otherwise. |
| 2715 | */ |
| 2716 | static int |
| 2717 | vector_find(cf_vector* vector, const void* element) |
| 2718 | { |
| 2719 | int element_count = cf_vector_size(vector); |
| 2720 | size_t value_len = cf_vector_element_size(vector); |
| 2721 | for (int i = 0; i < element_count; i++) { |
| 2722 | // No null check required since we are iterating under a lock and within |
| 2723 | // vector bounds. |
| 2724 | void* src_element = cf_vector_getp(vector, i); |
| 2725 | if (src_element) { |
| 2726 | if (memcmp(element, src_element, value_len) == 0) { |
| 2727 | return i; |
| 2728 | } |
| 2729 | } |
| 2730 | } |
| 2731 | return -1; |
| 2732 | } |
| 2733 | |
| 2734 | /* |
| 2735 | * ---------------------------------------------------------------------------- |
| 2736 | * Endpoint list related |
| 2737 | * ---------------------------------------------------------------------------- |
| 2738 | */ |
| 2739 | |
| 2740 | /** |
| 2741 | * Copy an endpoint list to the destination, while possible reallocating the |
| 2742 | * destination space. |
| 2743 | * @param dest the double pointer to the destination list, because it might need |
| 2744 | * reallocation to accommodate a larger source list. |
| 2745 | * @param src the source endpoint list. |
| 2746 | */ |
| 2747 | static void |
| 2748 | endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src) |
| 2749 | { |
| 2750 | size_t src_size; |
| 2751 | |
| 2752 | if (as_endpoint_list_sizeof(src, &src_size) != 0) { |
| 2753 | // Bad endpoint list passed. |
| 2754 | CRASH("invalid adjacency list passed for copying" ); |
| 2755 | } |
| 2756 | |
| 2757 | *dest = cf_realloc(*dest, src_size); |
| 2758 | |
| 2759 | memcpy(*dest, src, src_size); |
| 2760 | } |
| 2761 | |
| 2762 | /** |
| 2763 | * Process function to convert endpoint list to a string. |
| 2764 | */ |
| 2765 | static void |
| 2766 | endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, |
| 2767 | void* udata) |
| 2768 | { |
| 2769 | endpoint_list_to_string_udata* to_string_udata = |
| 2770 | (endpoint_list_to_string_udata*)udata; |
| 2771 | as_endpoint_list_to_string(endpoint_list, |
| 2772 | to_string_udata->endpoint_list_str, |
| 2773 | to_string_udata->endpoint_list_str_capacity); |
| 2774 | } |
| 2775 | |
| 2776 | /** |
| 2777 | * Process function to check if endpoint lists overlap. |
| 2778 | */ |
| 2779 | static void |
| 2780 | endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata) |
| 2781 | { |
| 2782 | endpoint_list_equal_check_udata* equal_udata = |
| 2783 | (endpoint_list_equal_check_udata*)udata; |
| 2784 | |
| 2785 | equal_udata->are_equal = equal_udata->are_equal |
| 2786 | || as_endpoint_lists_are_equal(endpoint_list, equal_udata->other); |
| 2787 | } |
| 2788 | |
| 2789 | /* |
| 2790 | * ---------------------------------------------------------------------------- |
| 2791 | * Messge related |
| 2792 | * ---------------------------------------------------------------------------- |
| 2793 | */ |
| 2794 | |
| 2795 | /** |
| 2796 | * The size of a buffer beyond which compression should be applied. For now set |
| 2797 | * to 60% of the interface mtu. |
| 2798 | */ |
| 2799 | static int |
| 2800 | msg_compression_threshold(int mtu) |
| 2801 | { |
| 2802 | return (int)(mtu * 0.6); |
| 2803 | } |
| 2804 | |
| 2805 | /** |
| 2806 | * Read advertised endpoint list from an incoming message. |
| 2807 | * @param msg the incoming message. |
| 2808 | * @param endpoint_list the output endpoint. The endpoint_list will point to |
| 2809 | * input message. |
| 2810 | * internal location and should not be freed. |
| 2811 | * @return 0 on success -1 on failure. |
| 2812 | */ |
| 2813 | static int |
| 2814 | msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list) |
| 2815 | { |
| 2816 | size_t endpoint_list_size; |
| 2817 | if (msg_get_buf(msg, AS_HB_MSG_ENDPOINTS, (uint8_t**)endpoint_list, |
| 2818 | &endpoint_list_size, MSG_GET_DIRECT) != 0) { |
| 2819 | return -1; |
| 2820 | } |
| 2821 | |
| 2822 | size_t parsed_size; |
| 2823 | if (as_endpoint_list_nsizeof(*endpoint_list, &parsed_size, |
| 2824 | endpoint_list_size) || parsed_size != endpoint_list_size) { |
| 2825 | return -1; |
| 2826 | } |
| 2827 | return 0; |
| 2828 | } |
| 2829 | |
| 2830 | /** |
| 2831 | * Read the protocol identifier for this heartbeat message. These functions can |
| 2832 | * get called multiple times for a single message. Hence they do not increment |
| 2833 | * error counters. |
| 2834 | * |
| 2835 | * @param msg the incoming message. |
| 2836 | * @param id the output id. |
| 2837 | * @return 0 if the id could be parsed -1 on failure. |
| 2838 | */ |
| 2839 | static int |
| 2840 | msg_id_get(msg* msg, uint32_t* id) |
| 2841 | { |
| 2842 | if (msg_get_uint32(msg, AS_HB_MSG_ID, id) != 0) { |
| 2843 | return -1; |
| 2844 | } |
| 2845 | |
| 2846 | return 0; |
| 2847 | } |
| 2848 | |
| 2849 | /** |
| 2850 | * Read the source nodeid for a node. These functions can get called multiple |
| 2851 | * times for a single message. Hence they do not increment error counters. |
| 2852 | * @param msg the incoming message. |
| 2853 | * @param nodeid the output nodeid. |
| 2854 | * @return 0 if the nodeid could be parsed -1 on failure. |
| 2855 | */ |
| 2856 | static int |
| 2857 | msg_nodeid_get(msg* msg, cf_node* nodeid) |
| 2858 | { |
| 2859 | if (msg_get_uint64(msg, AS_HB_MSG_NODE, nodeid) != 0) { |
| 2860 | return -1; |
| 2861 | } |
| 2862 | |
| 2863 | return 0; |
| 2864 | } |
| 2865 | |
| 2866 | /** |
| 2867 | * Read the HLC send timestamp for the message. These functions can get called |
| 2868 | * multiple times for a single message. Hence they do not increment error |
| 2869 | * counters. |
| 2870 | * @param msg the incoming message. |
| 2871 | * @param send_ts the output hlc timestamp. |
| 2872 | * @return 0 if the time stamp could be parsed -1 on failure. |
| 2873 | */ |
| 2874 | static int |
| 2875 | msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts) |
| 2876 | { |
| 2877 | if (msg_get_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, send_ts) != 0) { |
| 2878 | return -1; |
| 2879 | } |
| 2880 | |
| 2881 | return 0; |
| 2882 | } |
| 2883 | |
| 2884 | /** |
| 2885 | * Read the message type. These functions can get called multiple times for a |
| 2886 | * single message. Hence they do not increment error counters. |
| 2887 | * @param msg the incoming message. |
| 2888 | * @param type the output message type. |
| 2889 | * @return 0 if the type could be parsed -1 on failure. |
| 2890 | */ |
| 2891 | static int |
| 2892 | msg_type_get(msg* msg, as_hb_msg_type* type) |
| 2893 | { |
| 2894 | if (msg_get_uint32(msg, AS_HB_MSG_TYPE, type) != 0) { |
| 2895 | return -1; |
| 2896 | } |
| 2897 | |
| 2898 | return 0; |
| 2899 | } |
| 2900 | |
| 2901 | /** |
| 2902 | * Read the cluster name. |
| 2903 | * @param msg the incoming message. |
| 2904 | * @param cluster name of the output message type. |
| 2905 | * @return 0 if the cluster name could be parsed -1 on failure. |
| 2906 | */ |
| 2907 | static int |
| 2908 | msg_cluster_name_get(msg* msg, char** cluster_name) |
| 2909 | { |
| 2910 | if (msg_get_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, |
| 2911 | MSG_GET_DIRECT) != 0) { |
| 2912 | return -1; |
| 2913 | } |
| 2914 | |
| 2915 | return 0; |
| 2916 | } |
| 2917 | |
| 2918 | /** |
| 2919 | * Get a pointer to a node list in the message. |
| 2920 | * |
| 2921 | * @param msg the incoming message. |
| 2922 | * @param field_id the field id. |
| 2923 | * @param adj_list output. on success will point to the adjacency list in the |
| 2924 | * message. |
| 2925 | * @para adj_length output. on success will contain the length of the adjacency |
| 2926 | * list. |
| 2927 | * @return 0 on success. -1 if the adjacency list is absent. |
| 2928 | */ |
| 2929 | static int |
| 2930 | msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, |
| 2931 | size_t* adj_length) |
| 2932 | { |
| 2933 | if (msg_get_buf(msg, field_id, (uint8_t**)adj_list, adj_length, |
| 2934 | MSG_GET_DIRECT) != 0) { |
| 2935 | return -1; |
| 2936 | } |
| 2937 | |
| 2938 | // correct adjacency list length. |
| 2939 | *adj_length /= sizeof(cf_node); |
| 2940 | |
| 2941 | return 0; |
| 2942 | } |
| 2943 | |
| 2944 | /** |
| 2945 | * Get a pointer to the adjacency list in the message. |
| 2946 | * |
| 2947 | * @param msg the incoming message. |
| 2948 | * @param adj_list output. on success will point to the adjacency list in the |
| 2949 | * message. |
| 2950 | * @para adj_length output. on success will contain the length of the adjacency |
| 2951 | * list. |
| 2952 | * @return 0 on success. -1 if the adjacency list is absent. |
| 2953 | */ |
| 2954 | static int |
| 2955 | msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length) |
| 2956 | { |
| 2957 | return msg_node_list_get(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length); |
| 2958 | } |
| 2959 | |
| 2960 | /** |
| 2961 | * Set a node list on an outgoing messages for a field. |
| 2962 | * |
| 2963 | * @param msg the outgoing message. |
| 2964 | * @param field_id the id of the list field. |
| 2965 | * @param node_list the adjacency list to set. |
| 2966 | * @para node_length the length of the adjacency list. |
| 2967 | */ |
| 2968 | static void |
| 2969 | msg_node_list_set(msg* msg, int field_id, cf_node* node_list, |
| 2970 | size_t node_length) |
| 2971 | { |
| 2972 | msg_set_buf(msg, field_id, (uint8_t*)node_list, |
| 2973 | sizeof(cf_node) * node_length, MSG_SET_COPY); |
| 2974 | } |
| 2975 | |
| 2976 | /** |
| 2977 | * Set the adjacency list on an outgoing messages. |
| 2978 | * |
| 2979 | * @param msg the outgoing message. |
| 2980 | * @param adj_list the adjacency list to set. |
| 2981 | * @para adj_length the length of the adjacency list. |
| 2982 | */ |
| 2983 | static void |
| 2984 | msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length) |
| 2985 | { |
| 2986 | msg_node_list_set(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length); |
| 2987 | } |
| 2988 | |
| 2989 | /** |
| 2990 | * Set the info reply on an outgoing messages. |
| 2991 | * |
| 2992 | * @param msg the outgoing message. |
| 2993 | * @param response the response list to set. |
| 2994 | * @para response_count the length of the response list. |
| 2995 | */ |
| 2996 | static void |
| 2997 | msg_info_reply_set(msg* msg, as_hb_mesh_info_reply* response, |
| 2998 | size_t response_count) |
| 2999 | { |
| 3000 | size_t response_size = 0; |
| 3001 | if (mesh_info_reply_sizeof(response, response_count, &response_size)) { |
| 3002 | CRASH("error setting info reply on msg" ); |
| 3003 | } |
| 3004 | |
| 3005 | msg_set_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t*)response, response_size, |
| 3006 | MSG_SET_COPY); |
| 3007 | |
| 3008 | return; |
| 3009 | } |
| 3010 | |
| 3011 | /** |
| 3012 | * Get a pointer to the info reply list in the message. |
| 3013 | * |
| 3014 | * @param msg the incoming message. |
| 3015 | * @param reply output. on success will point to the reply list in the message. |
| 3016 | * @param reply_count output. on success will contain the length of the reply |
| 3017 | * list. |
| 3018 | * @return 0 on success. -1 if the reply list is absent. |
| 3019 | */ |
| 3020 | static int |
| 3021 | msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count) |
| 3022 | { |
| 3023 | size_t reply_size; |
| 3024 | if (msg_get_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t**)reply, &reply_size, |
| 3025 | MSG_GET_DIRECT) != 0) { |
| 3026 | return -1; |
| 3027 | } |
| 3028 | |
| 3029 | *reply_count = 0; |
| 3030 | |
| 3031 | // Go over reply and compute the count of replies and also validate the |
| 3032 | // endpoint lists. |
| 3033 | uint8_t* start_ptr = (uint8_t*)*reply; |
| 3034 | int64_t remaining_size = reply_size; |
| 3035 | |
| 3036 | while (remaining_size > 0) { |
| 3037 | as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; |
| 3038 | remaining_size -= sizeof(as_hb_mesh_info_reply); |
| 3039 | start_ptr += sizeof(as_hb_mesh_info_reply); |
| 3040 | if (remaining_size <= 0) { |
| 3041 | // Incomplete / garbled info reply message. |
| 3042 | *reply_count = 0; |
| 3043 | return -1; |
| 3044 | } |
| 3045 | |
| 3046 | size_t endpoint_list_size = 0; |
| 3047 | if (as_endpoint_list_nsizeof(reply_ptr->endpoint_list, |
| 3048 | &endpoint_list_size, remaining_size) != 0) { |
| 3049 | // Incomplete / garbled info reply message. |
| 3050 | *reply_count = 0; |
| 3051 | return -1; |
| 3052 | } |
| 3053 | |
| 3054 | remaining_size -= endpoint_list_size; |
| 3055 | start_ptr += endpoint_list_size; |
| 3056 | (*reply_count)++; |
| 3057 | } |
| 3058 | |
| 3059 | return 0; |
| 3060 | } |
| 3061 | |
| 3062 | /** |
| 3063 | * Fill a message with an endpoint list. |
| 3064 | */ |
| 3065 | static void |
| 3066 | msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, |
| 3067 | void* udata) |
| 3068 | { |
| 3069 | endpoint_list_to_msg_udata* to_msg_udata = |
| 3070 | (endpoint_list_to_msg_udata*)udata; |
| 3071 | msg* msg = to_msg_udata->msg; |
| 3072 | bool is_mesh = to_msg_udata->is_mesh; |
| 3073 | |
| 3074 | if (!published_endpoint_list) { |
| 3075 | if (is_mesh) { |
| 3076 | // Something is messed up. Except for v3 multicast, |
| 3077 | // published list should not be empty. |
| 3078 | WARNING("published endpoint list is empty" ); |
| 3079 | } |
| 3080 | return; |
| 3081 | } |
| 3082 | |
| 3083 | // Makes sense only for mesh. |
| 3084 | if (is_mesh && published_endpoint_list) { |
| 3085 | // Set the source address |
| 3086 | size_t endpoint_list_size = 0; |
| 3087 | as_endpoint_list_sizeof(published_endpoint_list, &endpoint_list_size); |
| 3088 | msg_set_buf(msg, AS_HB_MSG_ENDPOINTS, |
| 3089 | (uint8_t*)published_endpoint_list, endpoint_list_size, |
| 3090 | MSG_SET_COPY); |
| 3091 | } |
| 3092 | } |
| 3093 | |
| 3094 | /** |
| 3095 | * Fill source fields for the message. |
| 3096 | * @param msg the message to fill the source fields into. |
| 3097 | */ |
| 3098 | static void |
| 3099 | msg_src_fields_fill(msg* msg) |
| 3100 | { |
| 3101 | bool is_mesh = hb_is_mesh(); |
| 3102 | |
| 3103 | // Set the hb protocol id / version. |
| 3104 | msg_set_uint32(msg, AS_HB_MSG_ID, hb_protocol_identifier_get()); |
| 3105 | |
| 3106 | // Set the source node. |
| 3107 | msg_set_uint64(msg, AS_HB_MSG_NODE, config_self_nodeid_get()); |
| 3108 | |
| 3109 | endpoint_list_to_msg_udata udata; |
| 3110 | udata.msg = msg; |
| 3111 | udata.is_mesh = is_mesh; |
| 3112 | |
| 3113 | if (is_mesh) { |
| 3114 | // Endpoint list only valid for mesh mode. |
| 3115 | mesh_published_endpoints_process(msg_published_endpoints_fill, &udata); |
| 3116 | } |
| 3117 | |
| 3118 | // Set the send hlc timestamp |
| 3119 | msg_set_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, as_hlc_timestamp_now()); |
| 3120 | } |
| 3121 | |
| 3122 | /** |
| 3123 | * Set the type for an outgoing message. |
| 3124 | * @param msg the outgoing message. |
| 3125 | * @param msg_type the type to set. |
| 3126 | */ |
| 3127 | static void |
| 3128 | msg_type_set(msg* msg, as_hb_msg_type msg_type) |
| 3129 | { |
| 3130 | // Set the message type. |
| 3131 | msg_set_uint32(msg, AS_HB_MSG_TYPE, msg_type); |
| 3132 | } |
| 3133 | |
| 3134 | /* |
| 3135 | * ---------------------------------------------------------------------------- |
| 3136 | * Config sub module. |
| 3137 | * ---------------------------------------------------------------------------- |
| 3138 | */ |
| 3139 | |
| 3140 | /** |
| 3141 | * Get mcsize. |
| 3142 | */ |
| 3143 | static int |
| 3144 | config_mcsize() |
| 3145 | { |
| 3146 | int mode_cluster_size = 0; |
| 3147 | if (hb_is_mesh()) { |
| 3148 | // Only bounded by available memory. But let's say its infinite. |
| 3149 | mode_cluster_size = INT_MAX; |
| 3150 | } |
| 3151 | else { |
| 3152 | mode_cluster_size = multicast_supported_cluster_size_get(); |
| 3153 | } |
| 3154 | |
| 3155 | // Ensure we are always upper bounded by the absolute max cluster size. |
| 3156 | int supported_cluster_size = MIN(ASC, mode_cluster_size); |
| 3157 | |
| 3158 | DETAIL("supported cluster size %d" , supported_cluster_size); |
| 3159 | return supported_cluster_size; |
| 3160 | } |
| 3161 | |
| 3162 | /** |
| 3163 | * Get the binding addresses for the heartbeat subsystem. |
| 3164 | */ |
| 3165 | static const cf_serv_cfg* |
| 3166 | config_bind_cfg_get() |
| 3167 | { |
| 3168 | // Not protected by config_lock because it is not changed. |
| 3169 | return &g_config.hb_config.bind_cfg; |
| 3170 | } |
| 3171 | |
| 3172 | /** |
| 3173 | * Get the multicast groups for the multicast mode. |
| 3174 | */ |
| 3175 | static const cf_mserv_cfg* |
| 3176 | config_multicast_group_cfg_get() |
| 3177 | { |
| 3178 | // Not protected by config_lock. Never updated after config parsing.. |
| 3179 | return &g_config.hb_config.multicast_group_cfg; |
| 3180 | } |
| 3181 | |
| 3182 | /** |
| 3183 | * Get the heartbeat pulse transmit interval. |
| 3184 | */ |
| 3185 | static uint32_t |
| 3186 | config_tx_interval_get() |
| 3187 | { |
| 3188 | HB_CONFIG_LOCK(); |
| 3189 | uint32_t interval = g_config.hb_config.tx_interval; |
| 3190 | HB_CONFIG_UNLOCK(); |
| 3191 | return interval; |
| 3192 | } |
| 3193 | |
| 3194 | /** |
| 3195 | * Set the heartbeat pulse transmit interval. |
| 3196 | */ |
| 3197 | static void |
| 3198 | config_tx_interval_set(uint32_t new_interval) |
| 3199 | { |
| 3200 | HB_CONFIG_LOCK(); |
| 3201 | INFO("changing value of interval from %d to %d " , |
| 3202 | g_config.hb_config.tx_interval, new_interval); |
| 3203 | g_config.hb_config.tx_interval = new_interval; |
| 3204 | HB_CONFIG_UNLOCK(); |
| 3205 | } |
| 3206 | |
| 3207 | /** |
| 3208 | * Get the heartbeat pulse transmit interval. |
| 3209 | */ |
| 3210 | static uint32_t |
| 3211 | config_override_mtu_get() |
| 3212 | { |
| 3213 | HB_CONFIG_LOCK(); |
| 3214 | uint32_t override_mtu = g_config.hb_config.override_mtu; |
| 3215 | HB_CONFIG_UNLOCK(); |
| 3216 | return override_mtu; |
| 3217 | } |
| 3218 | |
| 3219 | /** |
| 3220 | * Set the heartbeat pulse transmit interval. |
| 3221 | */ |
| 3222 | static void |
| 3223 | config_override_mtu_set(uint32_t mtu) |
| 3224 | { |
| 3225 | HB_CONFIG_LOCK(); |
| 3226 | INFO("changing value of override mtu from %d to %d " , |
| 3227 | g_config.hb_config.override_mtu, mtu); |
| 3228 | g_config.hb_config.override_mtu = mtu; |
| 3229 | HB_CONFIG_UNLOCK(); |
| 3230 | INFO("max supported cluster size is %d" , config_mcsize()); |
| 3231 | } |
| 3232 | |
| 3233 | /** |
| 3234 | * Get the maximum number of missed heartbeat intervals after which a node is |
| 3235 | * considered expired. |
| 3236 | */ |
| 3237 | static uint32_t |
| 3238 | config_max_intervals_missed_get() |
| 3239 | { |
| 3240 | uint32_t rv = 0; |
| 3241 | HB_CONFIG_LOCK(); |
| 3242 | rv = g_config.hb_config.max_intervals_missed; |
| 3243 | HB_CONFIG_UNLOCK(); |
| 3244 | return rv; |
| 3245 | } |
| 3246 | |
| 3247 | /** |
| 3248 | * Get the number intervals endpoints should be tracked for. |
| 3249 | */ |
| 3250 | static uint32_t |
| 3251 | config_endpoint_track_intervals_get() |
| 3252 | { |
| 3253 | // Allow a grace period of half heartbeat timeout, but lower bounded to at |
| 3254 | // least 3. |
| 3255 | return MAX(3, config_max_intervals_missed_get() / 2); |
| 3256 | } |
| 3257 | |
| 3258 | /** |
| 3259 | * Get the maximum number of allowed changes, per endpoint track intervals. |
| 3260 | */ |
| 3261 | static uint32_t |
| 3262 | config_endpoint_changes_allowed_get() |
| 3263 | { |
| 3264 | // Allow no change to the endpoint list for now. |
| 3265 | return 0; |
| 3266 | } |
| 3267 | |
| 3268 | /** |
| 3269 | * Set the maximum number of missed heartbeat intervals after which a node is |
| 3270 | * considered expired. |
| 3271 | */ |
| 3272 | static void |
| 3273 | config_max_intervals_missed_set(uint32_t new_max) |
| 3274 | { |
| 3275 | HB_CONFIG_LOCK(); |
| 3276 | INFO("changing value of timeout from %d to %d " , |
| 3277 | g_config.hb_config.max_intervals_missed, new_max); |
| 3278 | g_config.hb_config.max_intervals_missed = new_max; |
| 3279 | HB_CONFIG_UNLOCK(); |
| 3280 | } |
| 3281 | |
| 3282 | /** |
| 3283 | * Return ttl for multicast packets. Set to zero for default TTL. |
| 3284 | */ |
| 3285 | static unsigned char |
| 3286 | config_multicast_ttl_get() |
| 3287 | { |
| 3288 | return g_config.hb_config.multicast_ttl; |
| 3289 | } |
| 3290 | |
| 3291 | /** |
| 3292 | * Return the current heartbeat protocol. |
| 3293 | */ |
| 3294 | static as_hb_protocol |
| 3295 | config_protocol_get() |
| 3296 | { |
| 3297 | as_hb_protocol rv = 0; |
| 3298 | HB_CONFIG_LOCK(); |
| 3299 | rv = g_config.hb_config.protocol; |
| 3300 | HB_CONFIG_UNLOCK(); |
| 3301 | return rv; |
| 3302 | } |
| 3303 | |
| 3304 | /** |
| 3305 | * Return the current heartbeat protocol. |
| 3306 | */ |
| 3307 | static void |
| 3308 | config_protocol_set(as_hb_protocol new_protocol) |
| 3309 | { |
| 3310 | HB_CONFIG_LOCK(); |
| 3311 | g_config.hb_config.protocol = new_protocol; |
| 3312 | HB_CONFIG_UNLOCK(); |
| 3313 | } |
| 3314 | |
| 3315 | /** |
| 3316 | * The nodeid for this node. |
| 3317 | */ |
| 3318 | static cf_node |
| 3319 | config_self_nodeid_get() |
| 3320 | { |
| 3321 | // Not protected by config_lock. Never updated after config parsing.. |
| 3322 | return g_config.self_node; |
| 3323 | } |
| 3324 | |
| 3325 | /** |
| 3326 | * Return the heartbeat subsystem mode. |
| 3327 | */ |
| 3328 | static as_hb_mode |
| 3329 | config_mode_get() |
| 3330 | { |
| 3331 | // Not protected by config_lock. Never updated after config parsing.. |
| 3332 | return g_config.hb_config.mode; |
| 3333 | } |
| 3334 | |
| 3335 | /** |
| 3336 | * Expand "any" binding addresses to actual interface addresses. |
| 3337 | * @param bind_cfg the binding configuration. |
| 3338 | * @param published_cfg (output) the server configuration to expand. |
| 3339 | * @param ipv4_only indicates if only legacy addresses should be allowed. |
| 3340 | */ |
| 3341 | static void |
| 3342 | config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, |
| 3343 | cf_serv_cfg* published_cfg, bool ipv4_only) |
| 3344 | { |
| 3345 | cf_serv_cfg_init(published_cfg); |
| 3346 | cf_sock_cfg sock_cfg; |
| 3347 | |
| 3348 | for (int i = 0; i < bind_cfg->n_cfgs; i++) { |
| 3349 | cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg); |
| 3350 | |
| 3351 | // Expand "any" address to all interfaces. |
| 3352 | if (cf_ip_addr_is_any(&sock_cfg.addr)) { |
| 3353 | cf_ip_addr all_addrs[CF_SOCK_CFG_MAX]; |
| 3354 | uint32_t n_all_addrs = CF_SOCK_CFG_MAX; |
| 3355 | if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) { |
| 3356 | WARNING("error getting all interface addresses" ); |
| 3357 | n_all_addrs = 0; |
| 3358 | } |
| 3359 | |
| 3360 | for (int j = 0; j < n_all_addrs; j++) { |
| 3361 | // Skip local address if any is specified. |
| 3362 | if (cf_ip_addr_is_local(&all_addrs[j]) |
| 3363 | || (ipv4_only && !cf_ip_addr_is_legacy(&all_addrs[j]))) { |
| 3364 | continue; |
| 3365 | } |
| 3366 | |
| 3367 | cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr); |
| 3368 | if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { |
| 3369 | CRASH("error initializing published address list" ); |
| 3370 | } |
| 3371 | } |
| 3372 | |
| 3373 | // TODO: Does not look like the right warning or the right message. |
| 3374 | if (published_cfg->n_cfgs == 0) { |
| 3375 | WARNING( |
| 3376 | "no network interface addresses detected for heartbeat access" ); |
| 3377 | } |
| 3378 | } |
| 3379 | else { |
| 3380 | if (ipv4_only && !cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) { |
| 3381 | continue; |
| 3382 | } |
| 3383 | |
| 3384 | if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { |
| 3385 | CRASH("error initializing published address list" ); |
| 3386 | } |
| 3387 | } |
| 3388 | } |
| 3389 | } |
| 3390 | |
| 3391 | /** |
| 3392 | * Checks if the heartbeat binding configuration is valid. |
| 3393 | * @param error pointer to a static error message if validation fails, else will |
| 3394 | * be set to NULL. |
| 3395 | */ |
| 3396 | static bool |
| 3397 | config_binding_is_valid(char** error, as_hb_protocol protocol) |
| 3398 | { |
| 3399 | const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); |
| 3400 | const cf_mserv_cfg* multicast_group_cfg = config_multicast_group_cfg_get(); |
| 3401 | |
| 3402 | if (hb_is_mesh()) { |
| 3403 | if (bind_cfg->n_cfgs == 0) { |
| 3404 | // Should not happen in practice. |
| 3405 | *error = "no bind addresses found for heartbeat" ; |
| 3406 | return false; |
| 3407 | } |
| 3408 | |
| 3409 | // Ensure we have a valid port for all bind endpoints. |
| 3410 | for (int i = 0; i < bind_cfg->n_cfgs; i++) { |
| 3411 | if (bind_cfg->cfgs[i].port == 0) { |
| 3412 | *error = "invalid mesh listening port" ; |
| 3413 | return false; |
| 3414 | } |
| 3415 | } |
| 3416 | |
| 3417 | cf_serv_cfg publish_serv_cfg; |
| 3418 | cf_serv_cfg_init(&publish_serv_cfg); |
| 3419 | |
| 3420 | if (multicast_group_cfg->n_cfgs != 0) { |
| 3421 | *error = |
| 3422 | "invalid config option: multicast-group not supported in mesh mode" ; |
| 3423 | return false; |
| 3424 | } |
| 3425 | } |
| 3426 | else { |
| 3427 | const cf_mserv_cfg* multicast_group_cfg = |
| 3428 | config_multicast_group_cfg_get(); |
| 3429 | |
| 3430 | if (multicast_group_cfg->n_cfgs == 0) { |
| 3431 | *error = "no multicast groups specified" ; |
| 3432 | return false; |
| 3433 | } |
| 3434 | |
| 3435 | // Ensure multicast groups have valid ports. |
| 3436 | // TODO: We could check if the address is valid multicast. |
| 3437 | for (int i = 0; i < multicast_group_cfg->n_cfgs; i++) { |
| 3438 | if (multicast_group_cfg->cfgs[i].port == 0) { |
| 3439 | *error = "invalid multicast port" ; |
| 3440 | return false; |
| 3441 | } |
| 3442 | } |
| 3443 | |
| 3444 | if (g_config.hb_config.mesh_seed_addrs[0]) { |
| 3445 | *error = |
| 3446 | "invalid config option: mesh-seed-address-port not supported for multicast mode" ; |
| 3447 | return false; |
| 3448 | } |
| 3449 | |
| 3450 | cf_serv_cfg publish_serv_cfg; |
| 3451 | cf_serv_cfg_init(&publish_serv_cfg); |
| 3452 | } |
| 3453 | |
| 3454 | *error = NULL; |
| 3455 | return true; |
| 3456 | } |
| 3457 | |
| 3458 | /* |
| 3459 | * ---------------------------------------------------------------------------- |
| 3460 | * Channel sub module. |
| 3461 | * ---------------------------------------------------------------------------- |
| 3462 | */ |
| 3463 | |
| 3464 | /** |
| 3465 | * Initialize the channel structure. |
| 3466 | */ |
| 3467 | static void |
| 3468 | channel_init_channel(as_hb_channel* channel) |
| 3469 | { |
| 3470 | memset(channel, 0, sizeof(as_hb_channel)); |
| 3471 | cf_ip_addr_set_any(&channel->endpoint_addr.addr); |
| 3472 | } |
| 3473 | |
| 3474 | /** |
| 3475 | * Initialize the channel event structure. |
| 3476 | */ |
| 3477 | static void |
| 3478 | channel_event_init(as_hb_channel_event* event) |
| 3479 | { |
| 3480 | memset(event, 0, sizeof(as_hb_channel_event)); |
| 3481 | } |
| 3482 | |
| 3483 | /** |
| 3484 | * Is channel running. |
| 3485 | */ |
| 3486 | static bool |
| 3487 | channel_is_running() |
| 3488 | { |
| 3489 | CHANNEL_LOCK(); |
| 3490 | bool retval = |
| 3491 | (g_hb.channel_state.status == AS_HB_STATUS_RUNNING) ? true : false; |
| 3492 | CHANNEL_UNLOCK(); |
| 3493 | return retval; |
| 3494 | } |
| 3495 | |
| 3496 | /** |
| 3497 | * Is channel stopped. |
| 3498 | */ |
| 3499 | static bool |
| 3500 | channel_is_stopped() |
| 3501 | { |
| 3502 | CHANNEL_LOCK(); |
| 3503 | bool retval = |
| 3504 | (g_hb.channel_state.status == AS_HB_STATUS_STOPPED) ? true : false; |
| 3505 | CHANNEL_UNLOCK(); |
| 3506 | return retval; |
| 3507 | } |
| 3508 | |
| 3509 | /** |
| 3510 | * Keep a winning socket as a winner for at least this amount of time to prevent |
| 3511 | * constant flip flopping and give the winning socket a chance to send |
| 3512 | * heartbeats. |
| 3513 | */ |
| 3514 | static uint32_t |
| 3515 | channel_win_grace_ms() |
| 3516 | { |
| 3517 | return 3 * config_tx_interval_get(); |
| 3518 | } |
| 3519 | |
| 3520 | /** |
| 3521 | * Enable / disable events. |
| 3522 | */ |
| 3523 | static void |
| 3524 | channel_events_enabled_set(bool enabled) |
| 3525 | { |
| 3526 | CHANNEL_LOCK(); |
| 3527 | g_hb.channel_state.events_enabled = enabled; |
| 3528 | CHANNEL_UNLOCK(); |
| 3529 | } |
| 3530 | |
| 3531 | /** |
| 3532 | * Know if events are enabled. |
| 3533 | */ |
| 3534 | static bool |
| 3535 | channel_are_events_enabled() |
| 3536 | { |
| 3537 | bool result; |
| 3538 | CHANNEL_LOCK(); |
| 3539 | result = g_hb.channel_state.events_enabled; |
| 3540 | CHANNEL_UNLOCK(); |
| 3541 | return result; |
| 3542 | } |
| 3543 | |
| 3544 | /** |
| 3545 | * Discard an event that has been processed. |
| 3546 | */ |
| 3547 | static void |
| 3548 | channel_event_discard(as_hb_channel_event* event) |
| 3549 | { |
| 3550 | // Free the message structure for message received events. |
| 3551 | if (event->type == AS_HB_CHANNEL_MSG_RECEIVED) { |
| 3552 | hb_msg_return(event->msg); |
| 3553 | } |
| 3554 | } |
| 3555 | |
| 3556 | /** |
| 3557 | * Queues a channel event for publishing by the channel tender. |
| 3558 | */ |
| 3559 | static void |
| 3560 | channel_event_queue(as_hb_channel_event* event) |
| 3561 | { |
| 3562 | if (!channel_are_events_enabled()) { |
| 3563 | channel_event_discard(event); |
| 3564 | DETAIL( |
| 3565 | "events disabled. Ignoring event of type %d with nodeid %" PRIx64, |
| 3566 | event->type, event->nodeid); |
| 3567 | return; |
| 3568 | } |
| 3569 | |
| 3570 | DETAIL("queuing channel event of type %d for node %" PRIx64, event->type, |
| 3571 | event->nodeid); |
| 3572 | cf_queue_push(&g_hb.channel_state.events_queue, event); |
| 3573 | } |
| 3574 | |
| 3575 | /** |
| 3576 | * Publish queued up channel events. Should be called outside a channel lock to |
| 3577 | * prevent deadlocks. |
| 3578 | */ |
| 3579 | static void |
| 3580 | channel_event_publish_pending() |
| 3581 | { |
| 3582 | // No channel lock here to prevent deadlocks. |
| 3583 | as_hb_channel_event event; |
| 3584 | while (cf_queue_pop(&g_hb.channel_state.events_queue, &event, 0) |
| 3585 | == CF_QUEUE_OK) { |
| 3586 | // Nothing elaborate, using hardcoded list of event recipients. |
| 3587 | mesh_channel_event_process(&event); |
| 3588 | hb_channel_event_process(&event); |
| 3589 | |
| 3590 | channel_event_discard(&event); |
| 3591 | } |
| 3592 | } |
| 3593 | |
| 3594 | /** |
| 3595 | * Return the endpoint associated with this socket if it exists. |
| 3596 | * |
| 3597 | * @param socket the socket to query for. |
| 3598 | * @param result the output result. |
| 3599 | * @return 0 if the socket was found and the result value is filled. -1 if a |
| 3600 | * mapping for the socket could not be found. |
| 3601 | */ |
| 3602 | static int |
| 3603 | channel_get_channel(cf_socket* socket, as_hb_channel* result) |
| 3604 | { |
| 3605 | int status; |
| 3606 | CHANNEL_LOCK(); |
| 3607 | |
| 3608 | if (cf_shash_get(g_hb.channel_state.socket_to_channel, &socket, result) |
| 3609 | == CF_SHASH_OK) { |
| 3610 | status = 0; |
| 3611 | } |
| 3612 | else { |
| 3613 | status = -1; |
| 3614 | } |
| 3615 | |
| 3616 | CHANNEL_UNLOCK(); |
| 3617 | return status; |
| 3618 | } |
| 3619 | |
| 3620 | /** |
| 3621 | * Shutdown a channel socket without closing, forcing the channel tender to |
| 3622 | * cleanup associated data structures. |
| 3623 | */ |
| 3624 | static void |
| 3625 | channel_socket_shutdown(cf_socket* socket) |
| 3626 | { |
| 3627 | cf_socket_shutdown(socket); |
| 3628 | } |
| 3629 | |
| 3630 | /** |
| 3631 | * Return the socket associated with this node. |
| 3632 | * Returns 0 on success and -1 if there is no socket attached to this node. |
| 3633 | */ |
| 3634 | static int |
| 3635 | channel_socket_get(cf_node nodeid, cf_socket** socket) |
| 3636 | { |
| 3637 | int rv = -1; |
| 3638 | CHANNEL_LOCK(); |
| 3639 | if (cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, socket) |
| 3640 | == CF_SHASH_ERR_NOT_FOUND) { |
| 3641 | rv = -1; |
| 3642 | } |
| 3643 | else { |
| 3644 | rv = 0; |
| 3645 | } |
| 3646 | |
| 3647 | CHANNEL_UNLOCK(); |
| 3648 | return rv; |
| 3649 | } |
| 3650 | |
| 3651 | /** |
| 3652 | * Indicate if a socket is present in a sockets list. |
| 3653 | */ |
| 3654 | static bool |
| 3655 | channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find) |
| 3656 | { |
| 3657 | for (int i = 0; i < sockets->n_socks; i++) { |
| 3658 | if (&sockets->socks[i] == to_find) { |
| 3659 | return true; |
| 3660 | } |
| 3661 | } |
| 3662 | |
| 3663 | return false; |
| 3664 | } |
| 3665 | |
| 3666 | /** |
| 3667 | * Destroy an allocated socket. |
| 3668 | */ |
| 3669 | static void |
| 3670 | channel_socket_destroy(cf_socket* sock) |
| 3671 | { |
| 3672 | cf_socket_close(sock); |
| 3673 | cf_socket_term(sock); |
| 3674 | cf_free(sock); |
| 3675 | } |
| 3676 | |
| 3677 | /** |
| 3678 | * Close a channel socket. Precondition is that the socket is registered with |
| 3679 | * the channel module using channel_socket_register. |
| 3680 | */ |
| 3681 | static void |
| 3682 | channel_socket_close(cf_socket* socket, bool remote_close, |
| 3683 | bool raise_close_event) |
| 3684 | { |
| 3685 | if (remote_close) { |
| 3686 | DEBUG("remote close: fd %d event" , CSFD(socket)); |
| 3687 | } |
| 3688 | |
| 3689 | CHANNEL_LOCK(); |
| 3690 | |
| 3691 | if (channel_cf_sockets_contains(g_hb.channel_state.listening_sockets, |
| 3692 | socket)) { |
| 3693 | // Listening sockets will be closed by the mode (mesh/multicast |
| 3694 | // ) modules. |
| 3695 | goto Exit; |
| 3696 | } |
| 3697 | |
| 3698 | // Clean up data structures. |
| 3699 | as_hb_channel channel; |
| 3700 | int status = channel_get_channel(socket, &channel); |
| 3701 | |
| 3702 | if (status == 0) { |
| 3703 | if (channel.nodeid != 0) { |
| 3704 | cf_socket* node_socket; |
| 3705 | if (channel_socket_get(channel.nodeid, &node_socket) == 0 |
| 3706 | && node_socket == socket) { |
| 3707 | // Remove associated node for this socket. |
| 3708 | cf_shash_delete(g_hb.channel_state.nodeid_to_socket, |
| 3709 | &channel.nodeid); |
| 3710 | |
| 3711 | if (!channel.is_multicast && raise_close_event) { |
| 3712 | as_hb_channel_event event; |
| 3713 | channel_event_init(&event); |
| 3714 | |
| 3715 | // Notify others that this node is no longer connected. |
| 3716 | event.type = AS_HB_CHANNEL_NODE_DISCONNECTED; |
| 3717 | event.nodeid = channel.nodeid; |
| 3718 | event.msg = NULL; |
| 3719 | |
| 3720 | channel_event_queue(&event); |
| 3721 | } |
| 3722 | } |
| 3723 | } |
| 3724 | |
| 3725 | DETAIL("removed channel associated with fd %d polarity %s Type: %s" , |
| 3726 | CSFD(socket), channel.is_inbound ? "inbound" : "outbound" , |
| 3727 | channel.is_multicast ? "multicast" : "mesh" ); |
| 3728 | // Remove associated channel. |
| 3729 | cf_shash_delete(g_hb.channel_state.socket_to_channel, &socket); |
| 3730 | } |
| 3731 | else { |
| 3732 | // Will only happen if we are closing this socket twice. Cannot |
| 3733 | // deference the underlying fd because the socket has been freed. |
| 3734 | WARNING("found a socket %p without an associated channel" , socket); |
| 3735 | goto Exit; |
| 3736 | } |
| 3737 | |
| 3738 | static int32_t err_ok[] = { ENOENT, EBADF, EPERM }; |
| 3739 | int32_t err = cf_poll_delete_socket_forgiving(g_hb.channel_state.poll, |
| 3740 | socket, sizeof(err_ok) / sizeof(int32_t), err_ok); |
| 3741 | |
| 3742 | if (err == ENOENT) { |
| 3743 | // There is no valid code path where epoll ctl should fail. |
| 3744 | CRASH("unable to remove fd %d from epoll fd list: %s" , CSFD(socket), |
| 3745 | cf_strerror(errno)); |
| 3746 | goto Exit; |
| 3747 | } |
| 3748 | |
| 3749 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
| 3750 | DEBUG("closing channel with fd %d" , CSFD(socket)); |
| 3751 | |
| 3752 | channel_socket_destroy(socket); |
| 3753 | |
| 3754 | Exit: |
| 3755 | CHANNEL_UNLOCK(); |
| 3756 | } |
| 3757 | |
| 3758 | /** |
| 3759 | * Close multiple sockets. Should be invoked only by channel stop. |
| 3760 | * @param sockets the vector consisting of sockets to be closed. |
| 3761 | */ |
| 3762 | static void |
| 3763 | channel_sockets_close(cf_vector* sockets) |
| 3764 | { |
| 3765 | uint32_t socket_count = cf_vector_size(sockets); |
| 3766 | for (int index = 0; index < socket_count; index++) { |
| 3767 | cf_socket* socket; |
| 3768 | if (cf_vector_get(sockets, index, &socket) != 0) { |
| 3769 | WARNING("error finding the fd %d to be deleted" , CSFD(socket)); |
| 3770 | continue; |
| 3771 | } |
| 3772 | channel_socket_close(socket, false, true); |
| 3773 | } |
| 3774 | } |
| 3775 | |
| 3776 | /** |
| 3777 | * Queues a socket for closing by the channel tender. Should be used by all code |
| 3778 | * paths other than the channel stop code path. |
| 3779 | */ |
| 3780 | static void |
| 3781 | channel_socket_close_queue(cf_socket* socket, bool is_remote_close, |
| 3782 | bool raise_close_event) |
| 3783 | { |
| 3784 | as_hb_channel_socket_close_entry close_entry = { |
| 3785 | socket, |
| 3786 | is_remote_close, |
| 3787 | raise_close_event }; |
| 3788 | DETAIL("queuing close of fd %d" , CSFD(socket)); |
| 3789 | cf_queue_push(&g_hb.channel_state.socket_close_queue, &close_entry); |
| 3790 | } |
| 3791 | |
| 3792 | /** |
| 3793 | * Close queued up sockets. |
| 3794 | */ |
| 3795 | static void |
| 3796 | channel_socket_close_pending() |
| 3797 | { |
| 3798 | // No channel lock required here. |
| 3799 | as_hb_channel_socket_close_entry close_entry; |
| 3800 | while (cf_queue_pop(&g_hb.channel_state.socket_close_queue, &close_entry, 0) |
| 3801 | == CF_QUEUE_OK) { |
| 3802 | channel_socket_close(close_entry.socket, close_entry.is_remote, |
| 3803 | close_entry.raise_close_event); |
| 3804 | } |
| 3805 | } |
| 3806 | |
| 3807 | /** |
| 3808 | * Register a new socket. |
| 3809 | * |
| 3810 | * @param socket the socket. |
| 3811 | * @param is_multicast indicates if this socket is a multicast socket. |
| 3812 | * @param is_inbound indicates if this socket is an inbound / outbound. |
| 3813 | * @param endpoint peer endpoint this socket connects to. Will be NULL for |
| 3814 | * inbound sockets. |
| 3815 | */ |
| 3816 | static void |
| 3817 | channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, |
| 3818 | cf_sock_addr* endpoint_addr) |
| 3819 | { |
| 3820 | CHANNEL_LOCK(); |
| 3821 | |
| 3822 | as_hb_channel channel; |
| 3823 | channel_init_channel(&channel); |
| 3824 | |
| 3825 | // This socket should not be part of the socket to channel map. |
| 3826 | ASSERT(channel_get_channel(socket, &channel) == -1, |
| 3827 | "error the channel already exists for fd %d" , CSFD(socket)); |
| 3828 | |
| 3829 | channel.is_multicast = is_multicast; |
| 3830 | channel.is_inbound = is_inbound; |
| 3831 | channel.last_received = cf_getms(); |
| 3832 | |
| 3833 | if (endpoint_addr) { |
| 3834 | memcpy(&channel.endpoint_addr, endpoint_addr, sizeof(*endpoint_addr)); |
| 3835 | } |
| 3836 | |
| 3837 | // Add socket to poll list |
| 3838 | cf_poll_add_socket(g_hb.channel_state.poll, socket, |
| 3839 | EPOLLIN | EPOLLERR | EPOLLRDHUP, socket); |
| 3840 | |
| 3841 | cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); |
| 3842 | |
| 3843 | DEBUG("channel created for fd %d - polarity %s type: %s" , CSFD(socket), |
| 3844 | channel.is_inbound ? "inbound" : "outbound" , |
| 3845 | channel.is_multicast ? "multicast" : "mesh" ); |
| 3846 | |
| 3847 | CHANNEL_UNLOCK(); |
| 3848 | } |
| 3849 | |
| 3850 | /** |
| 3851 | * Accept an incoming tcp connection. For now this is relevant only to the mesh |
| 3852 | * mode. |
| 3853 | * @param lsock the listening socket that received the connection. |
| 3854 | */ |
| 3855 | static void |
| 3856 | channel_accept_connection(cf_socket* lsock) |
| 3857 | { |
| 3858 | if (!hb_is_mesh()) { |
| 3859 | // We do not accept connections in non mesh modes. |
| 3860 | return; |
| 3861 | } |
| 3862 | |
| 3863 | cf_socket csock; |
| 3864 | cf_sock_addr caddr; |
| 3865 | |
| 3866 | if (cf_socket_accept(lsock, &csock, &caddr) < 0) { |
| 3867 | if ((errno == EMFILE) || (errno == ENFILE) || (errno == ENOMEM) |
| 3868 | || (errno == ENOBUFS)) { |
| 3869 | TICKER_WARNING( |
| 3870 | "failed to accept heartbeat connection due to error : %s" , |
| 3871 | cf_strerror(errno)); |
| 3872 | // We are in an extreme situation where we ran out of system |
| 3873 | // resources (file/mem). We should rather lie low and not do too |
| 3874 | // much activity. So, sleep. We should not sleep too long as this |
| 3875 | // same function is supposed to send heartbeat also. |
| 3876 | usleep(MAX(AS_HB_TX_INTERVAL_MS_MIN, 1) * 1000); |
| 3877 | return; |
| 3878 | } |
| 3879 | else { |
| 3880 | // TODO: Find what there errors are. |
| 3881 | WARNING("accept failed: %s" , cf_strerror(errno)); |
| 3882 | return; |
| 3883 | } |
| 3884 | } |
| 3885 | |
| 3886 | // Update the stats to reflect to a new connection opened. |
| 3887 | cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); |
| 3888 | |
| 3889 | char caddr_str[DNS_NAME_MAX_SIZE]; |
| 3890 | cf_sock_addr_to_string_safe(&caddr, caddr_str, sizeof(caddr_str)); |
| 3891 | DEBUG("new connection from %s" , caddr_str); |
| 3892 | |
| 3893 | cf_sock_cfg *cfg = lsock->cfg; |
| 3894 | |
| 3895 | if (cfg->owner == CF_SOCK_OWNER_HEARTBEAT_TLS) { |
| 3896 | tls_socket_prepare_server(g_config.hb_config.tls, &csock); |
| 3897 | |
| 3898 | if (tls_socket_accept_block(&csock) != 1) { |
| 3899 | WARNING("heartbeat TLS server handshake with %s failed" , caddr_str); |
| 3900 | cf_socket_close(&csock); |
| 3901 | cf_socket_term(&csock); |
| 3902 | |
| 3903 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
| 3904 | return; |
| 3905 | } |
| 3906 | } |
| 3907 | |
| 3908 | // Allocate a new socket. |
| 3909 | cf_socket* sock = cf_malloc(sizeof(cf_socket)); |
| 3910 | cf_socket_init(sock); |
| 3911 | cf_socket_copy(&csock, sock); |
| 3912 | |
| 3913 | // Register this socket with the channel subsystem. |
| 3914 | channel_socket_register(sock, false, true, NULL); |
| 3915 | } |
| 3916 | |
| 3917 | /** |
| 3918 | * Parse compressed buffer into a message. |
| 3919 | * |
| 3920 | * @param msg the input parsed compressed message and also the output heartbeat |
| 3921 | * message. |
| 3922 | * @param buffer the input buffer. |
| 3923 | * @param buffer_content_len the length of the content in the buffer. |
| 3924 | * @return the status of parsing the message. |
| 3925 | */ |
| 3926 | static as_hb_channel_msg_read_status |
| 3927 | channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len) |
| 3928 | { |
| 3929 | // This is a direct pointer inside the buffer parameter. No allocation |
| 3930 | // required. |
| 3931 | uint8_t* compressed_buffer = NULL; |
| 3932 | size_t compressed_buffer_length = 0; |
| 3933 | int parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; |
| 3934 | void* uncompressed_buffer = NULL; |
| 3935 | size_t uncompressed_buffer_length = 0; |
| 3936 | |
| 3937 | if (msg_get_buf(msg, AS_HB_MSG_COMPRESSED_PAYLOAD, &compressed_buffer, |
| 3938 | &compressed_buffer_length, MSG_GET_DIRECT) != 0) { |
| 3939 | parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; |
| 3940 | goto Exit; |
| 3941 | } |
| 3942 | |
| 3943 | // Assume compression ratio of 3. We will expand the buffer if needed. |
| 3944 | uncompressed_buffer_length = round_up_pow2(3 * compressed_buffer_length); |
| 3945 | |
| 3946 | // Keep trying till we allocate enough memory for the uncompressed buffer. |
| 3947 | while (true) { |
| 3948 | uncompressed_buffer = MSG_BUFF_ALLOC_OR_DIE(uncompressed_buffer_length, |
| 3949 | "error allocating memory size %zu for decompressing message" , |
| 3950 | uncompressed_buffer_length); |
| 3951 | |
| 3952 | int uncompress_rv = uncompress(uncompressed_buffer, |
| 3953 | &uncompressed_buffer_length, compressed_buffer, |
| 3954 | compressed_buffer_length); |
| 3955 | |
| 3956 | if (uncompress_rv == Z_OK) { |
| 3957 | // Decompression was successful. |
| 3958 | break; |
| 3959 | } |
| 3960 | |
| 3961 | if (uncompress_rv == Z_BUF_ERROR) { |
| 3962 | // The uncompressed buffer is not large enough. Free current buffer |
| 3963 | // and allocate a new buffer. |
| 3964 | MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length); |
| 3965 | |
| 3966 | // Give uncompressed buffer more space. |
| 3967 | uncompressed_buffer_length *= 2; |
| 3968 | continue; |
| 3969 | } |
| 3970 | |
| 3971 | // Decompression failed. Clean up and exit. |
| 3972 | parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; |
| 3973 | goto Exit; |
| 3974 | } |
| 3975 | |
| 3976 | // Reset the message to prepare for parsing the uncompressed buffer. We have |
| 3977 | // no issues losing the compressed buffer because we have an uncompressed |
| 3978 | // copy. |
| 3979 | msg_reset(msg); |
| 3980 | |
| 3981 | // Parse the uncompressed buffer. |
| 3982 | parsed = |
| 3983 | msg_parse(msg, uncompressed_buffer, uncompressed_buffer_length) ? |
| 3984 | AS_HB_CHANNEL_MSG_READ_SUCCESS : |
| 3985 | AS_HB_CHANNEL_MSG_PARSE_FAIL; |
| 3986 | |
| 3987 | if (parsed == AS_HB_CHANNEL_MSG_READ_SUCCESS) { |
| 3988 | // Copying the buffer content to ensure that the message and the buffer |
| 3989 | // can have separate life cycles and we never get into races. The |
| 3990 | // frequency of heartbeat messages is low enough to make this not matter |
| 3991 | // much unless we have massive clusters. |
| 3992 | msg_preserve_all_fields(msg); |
| 3993 | } |
| 3994 | |
| 3995 | Exit: |
| 3996 | MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length); |
| 3997 | return parsed; |
| 3998 | } |
| 3999 | |
| 4000 | /** |
| 4001 | * Parse the buffer into a message. |
| 4002 | * |
| 4003 | * @param msg the output heartbeat message. |
| 4004 | * @param buffer the input buffer. |
| 4005 | * @param buffer_content_len the length of the content in the buffer. |
| 4006 | * @return the status of parsing the message. |
| 4007 | */ |
| 4008 | static as_hb_channel_msg_read_status |
| 4009 | channel_message_parse(msg* msg, void* buffer, int buffer_content_len) |
| 4010 | { |
| 4011 | // Peek into the buffer to get hold of the message type. |
| 4012 | msg_type type = 0; |
| 4013 | uint32_t msg_size = 0; |
| 4014 | if (! msg_parse_hdr(&msg_size, &type, (uint8_t*)buffer, buffer_content_len) |
| 4015 | || type != msg->type) { |
| 4016 | // Pre check because msg_parse considers this a warning but this would |
| 4017 | // be common when protocol version between nodes do not match. |
| 4018 | DEBUG("message type mismatch - expected:%d received:%d" , msg->type, |
| 4019 | type); |
| 4020 | return AS_HB_CHANNEL_MSG_PARSE_FAIL; |
| 4021 | } |
| 4022 | |
| 4023 | bool parsed = msg_parse(msg, buffer, buffer_content_len); |
| 4024 | |
| 4025 | if (parsed) { |
| 4026 | if (msg_is_set(msg, AS_HB_MSG_COMPRESSED_PAYLOAD)) { |
| 4027 | // This is a compressed message. |
| 4028 | return channel_compressed_message_parse(msg, buffer, |
| 4029 | buffer_content_len); |
| 4030 | } |
| 4031 | |
| 4032 | // This is an uncompressed message. Copying the buffer content to ensure |
| 4033 | // that the message and the buffer can have separate life cycles and we |
| 4034 | // never get into races. The frequency of heartbeat messages is low |
| 4035 | // enough to make this not matter much unless we have massive clusters. |
| 4036 | msg_preserve_all_fields(msg); |
| 4037 | } |
| 4038 | |
| 4039 | return parsed ? |
| 4040 | AS_HB_CHANNEL_MSG_READ_SUCCESS : AS_HB_CHANNEL_MSG_PARSE_FAIL; |
| 4041 | } |
| 4042 | |
| 4043 | /** |
| 4044 | * Iterate over a endpoint list and see if there is a matching socket address. |
| 4045 | */ |
| 4046 | static void |
| 4047 | channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata) |
| 4048 | { |
| 4049 | cf_sock_addr sock_addr; |
| 4050 | as_hb_channel_endpoint_iterate_udata* iterate_data = |
| 4051 | (as_hb_channel_endpoint_iterate_udata*)udata; |
| 4052 | if (as_endpoint_to_sock_addr(endpoint, &sock_addr) != 0) { |
| 4053 | return; |
| 4054 | } |
| 4055 | |
| 4056 | if (cf_sock_addr_is_any(&sock_addr)) { |
| 4057 | return; |
| 4058 | } |
| 4059 | |
| 4060 | iterate_data->found = iterate_data->found |
| 4061 | || (cf_sock_addr_compare(&sock_addr, iterate_data->addr_to_search) |
| 4062 | == 0); |
| 4063 | } |
| 4064 | |
| 4065 | /** |
| 4066 | * Reduce function to find a matching endpoint. |
| 4067 | */ |
| 4068 | static int |
| 4069 | channel_endpoint_search_reduce(const void* key, void* data, void* udata) |
| 4070 | { |
| 4071 | cf_socket** socket = (cf_socket**)key; |
| 4072 | as_hb_channel* channel = (as_hb_channel*)data; |
| 4073 | as_hb_channel_endpoint_reduce_udata* endpoint_reduce_udata = |
| 4074 | (as_hb_channel_endpoint_reduce_udata*)udata; |
| 4075 | |
| 4076 | as_hb_channel_endpoint_iterate_udata iterate_udata; |
| 4077 | iterate_udata.addr_to_search = &channel->endpoint_addr; |
| 4078 | iterate_udata.found = false; |
| 4079 | |
| 4080 | as_endpoint_list_iterate(endpoint_reduce_udata->endpoint_list, |
| 4081 | channel_endpoint_find_iterate_fn, &iterate_udata); |
| 4082 | |
| 4083 | if (iterate_udata.found) { |
| 4084 | endpoint_reduce_udata->found = true; |
| 4085 | endpoint_reduce_udata->socket = *socket; |
| 4086 | // Stop the reduce, we have found a match. |
| 4087 | return CF_SHASH_ERR_FOUND; |
| 4088 | } |
| 4089 | |
| 4090 | return CF_SHASH_OK; |
| 4091 | } |
| 4092 | |
| 4093 | /** |
| 4094 | * Indicates if any endpoint from the input endpoint list is already connected. |
| 4095 | * @param endpoint_list the endpoint list to check. |
| 4096 | * @return true if at least one endpoint is already connected to, false |
| 4097 | * otherwise. |
| 4098 | */ |
| 4099 | static bool |
| 4100 | channel_endpoint_is_connected(as_endpoint_list* endpoint_list) |
| 4101 | { |
| 4102 | CHANNEL_LOCK(); |
| 4103 | // Linear search. This will in practice not be a very frequent operation. |
| 4104 | as_hb_channel_endpoint_reduce_udata udata; |
| 4105 | memset(&udata, 0, sizeof(udata)); |
| 4106 | udata.endpoint_list = endpoint_list; |
| 4107 | |
| 4108 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
| 4109 | channel_endpoint_search_reduce, &udata); |
| 4110 | |
| 4111 | CHANNEL_UNLOCK(); |
| 4112 | return udata.found; |
| 4113 | } |
| 4114 | |
| 4115 | /** |
| 4116 | * Read a message from the multicast socket. |
| 4117 | * |
| 4118 | * @param socket the multicast socket to read from. |
| 4119 | * @param msg the message to read into. |
| 4120 | * |
| 4121 | * @return the status the read operation. |
| 4122 | */ |
| 4123 | static as_hb_channel_msg_read_status |
| 4124 | channel_multicast_msg_read(cf_socket* socket, msg* msg) |
| 4125 | { |
| 4126 | CHANNEL_LOCK(); |
| 4127 | |
| 4128 | as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF; |
| 4129 | |
| 4130 | int buffer_len = MAX(hb_mtu(), STACK_ALLOC_LIMIT); |
| 4131 | uint8_t* buffer = MSG_BUFF_ALLOC(buffer_len); |
| 4132 | |
| 4133 | if (!buffer) { |
| 4134 | WARNING( |
| 4135 | "error allocating space for multicast recv buffer of size %d on fd %d" , |
| 4136 | buffer_len, CSFD(socket)); |
| 4137 | goto Exit; |
| 4138 | } |
| 4139 | |
| 4140 | cf_sock_addr from; |
| 4141 | |
| 4142 | int num_rcvd = cf_socket_recv_from(socket, buffer, buffer_len, 0, &from); |
| 4143 | |
| 4144 | if (num_rcvd <= 0) { |
| 4145 | DEBUG("multicast packed read failed on fd %d" , CSFD(socket)); |
| 4146 | rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; |
| 4147 | goto Exit; |
| 4148 | } |
| 4149 | |
| 4150 | rv = channel_message_parse(msg, buffer, num_rcvd); |
| 4151 | if (rv != AS_HB_CHANNEL_MSG_READ_SUCCESS) { |
| 4152 | goto Exit; |
| 4153 | } |
| 4154 | |
| 4155 | rv = AS_HB_CHANNEL_MSG_READ_SUCCESS; |
| 4156 | |
| 4157 | Exit: |
| 4158 | MSG_BUFF_FREE(buffer, buffer_len); |
| 4159 | |
| 4160 | CHANNEL_UNLOCK(); |
| 4161 | return rv; |
| 4162 | } |
| 4163 | |
| 4164 | /** |
| 4165 | * Read a message from the a tcp mesh socket. |
| 4166 | * |
| 4167 | * @param socket the tcp socket to read from. |
| 4168 | * @param msg the message to read into. |
| 4169 | * |
| 4170 | * @return status of the read operation. |
| 4171 | */ |
| 4172 | static as_hb_channel_msg_read_status |
| 4173 | channel_mesh_msg_read(cf_socket* socket, msg* msg) |
| 4174 | { |
| 4175 | CHANNEL_LOCK(); |
| 4176 | |
| 4177 | uint32_t buffer_len = 0; |
| 4178 | uint8_t* buffer = NULL; |
| 4179 | |
| 4180 | as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF; |
| 4181 | uint8_t len_buff[MSG_WIRE_LENGTH_SIZE]; |
| 4182 | |
| 4183 | if (cf_socket_recv_all(socket, len_buff, MSG_WIRE_LENGTH_SIZE, 0, |
| 4184 | MESH_RW_TIMEOUT) < 0) { |
| 4185 | WARNING("mesh size recv failed fd %d : %s" , CSFD(socket), |
| 4186 | cf_strerror(errno)); |
| 4187 | rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; |
| 4188 | goto Exit; |
| 4189 | } |
| 4190 | |
| 4191 | buffer_len = ntohl(*((uint32_t*)len_buff)) + 6; |
| 4192 | |
| 4193 | buffer = MSG_BUFF_ALLOC(buffer_len); |
| 4194 | |
| 4195 | if (!buffer) { |
| 4196 | WARNING( |
| 4197 | "error allocating space for mesh recv buffer of size %d on fd %d" , |
| 4198 | buffer_len, CSFD(socket)); |
| 4199 | goto Exit; |
| 4200 | } |
| 4201 | |
| 4202 | memcpy(buffer, len_buff, MSG_WIRE_LENGTH_SIZE); |
| 4203 | |
| 4204 | if (cf_socket_recv_all(socket, buffer + MSG_WIRE_LENGTH_SIZE, |
| 4205 | buffer_len - MSG_WIRE_LENGTH_SIZE, 0, MESH_RW_TIMEOUT) < 0) { |
| 4206 | DETAIL("mesh recv failed fd %d : %s" , CSFD(socket), cf_strerror(errno)); |
| 4207 | rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; |
| 4208 | goto Exit; |
| 4209 | } |
| 4210 | |
| 4211 | DETAIL("mesh recv success fd %d message size %d" , CSFD(socket), buffer_len); |
| 4212 | |
| 4213 | rv = channel_message_parse(msg, buffer, buffer_len); |
| 4214 | |
| 4215 | Exit: |
| 4216 | MSG_BUFF_FREE(buffer, buffer_len); |
| 4217 | |
| 4218 | CHANNEL_UNLOCK(); |
| 4219 | return rv; |
| 4220 | } |
| 4221 | |
| 4222 | /** |
| 4223 | * Associate a socket with a nodeid and notify listeners about a node being |
| 4224 | * connected, effective only for mesh channels. |
| 4225 | * |
| 4226 | * For multicast channels this function is a no-op. The reason being additional |
| 4227 | * machinery would be required to clean up the node to channel mapping on node |
| 4228 | * expiry. |
| 4229 | * |
| 4230 | * @param socket the socket. |
| 4231 | * @param channel the channel to associate. |
| 4232 | * @param nodeid the nodeid associated with this socket. |
| 4233 | */ |
| 4234 | static void |
| 4235 | channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid) |
| 4236 | { |
| 4237 | // For now node to socket mapping is not maintained for multicast channels. |
| 4238 | if (channel->is_multicast) { |
| 4239 | return; |
| 4240 | } |
| 4241 | |
| 4242 | CHANNEL_LOCK(); |
| 4243 | |
| 4244 | // Update the node information for the channel. |
| 4245 | // This is the first time this node has a connection. Record the mapping. |
| 4246 | cf_shash_put(g_hb.channel_state.nodeid_to_socket, &nodeid, &socket); |
| 4247 | |
| 4248 | channel->nodeid = nodeid; |
| 4249 | cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, channel); |
| 4250 | |
| 4251 | DEBUG("attached fd %d to node %" PRIx64, CSFD(socket), nodeid); |
| 4252 | |
| 4253 | CHANNEL_UNLOCK(); |
| 4254 | |
| 4255 | // Publish an event to let know that a new node has a channel now. |
| 4256 | as_hb_channel_event node_connected_event; |
| 4257 | channel_event_init(&node_connected_event); |
| 4258 | node_connected_event.nodeid = nodeid; |
| 4259 | node_connected_event.type = AS_HB_CHANNEL_NODE_CONNECTED; |
| 4260 | channel_event_queue(&node_connected_event); |
| 4261 | } |
| 4262 | |
| 4263 | /** |
| 4264 | * Indicates if a channel should be allowed to continue to win and live because |
| 4265 | * of a winning grace period. |
| 4266 | */ |
| 4267 | static bool |
| 4268 | channel_socket_should_live(cf_socket* socket, as_hb_channel* channel) |
| 4269 | { |
| 4270 | if (channel->resolution_win_ts > 0 |
| 4271 | && channel->resolution_win_ts + channel_win_grace_ms() |
| 4272 | > cf_getms()) { |
| 4273 | // Losing socket was a previous winner. Allow it time to do some work |
| 4274 | // before knocking it off. |
| 4275 | INFO("giving %d unresolved fd some grace time" , CSFD(socket)); |
| 4276 | return true; |
| 4277 | } |
| 4278 | return false; |
| 4279 | } |
| 4280 | |
| 4281 | /** |
| 4282 | * Selects one out give two sockets connected to same remote node. The algorithm |
| 4283 | * is deterministic and ensures the remote node also chooses a socket that drops |
| 4284 | * the same connection. |
| 4285 | * |
| 4286 | * @param socket1 one of the sockets |
| 4287 | * @param socket2 one of the sockets |
| 4288 | * @return resolved socket on success, NULL if resolution fails. |
| 4289 | */ |
| 4290 | static cf_socket* |
| 4291 | channel_socket_resolve(cf_socket* socket1, cf_socket* socket2) |
| 4292 | { |
| 4293 | cf_socket* rv = NULL; |
| 4294 | CHANNEL_LOCK(); |
| 4295 | |
| 4296 | DEBUG("resolving between fd %d and %d" , CSFD(socket1), CSFD(socket2)); |
| 4297 | |
| 4298 | as_hb_channel channel1; |
| 4299 | if (channel_get_channel(socket1, &channel1) < 0) { |
| 4300 | // Should not happen in practice. |
| 4301 | WARNING("resolving fd %d without channel" , CSFD(socket1)); |
| 4302 | rv = socket2; |
| 4303 | goto Exit; |
| 4304 | } |
| 4305 | |
| 4306 | as_hb_channel channel2; |
| 4307 | if (channel_get_channel(socket2, &channel2) < 0) { |
| 4308 | // Should not happen in practice. |
| 4309 | WARNING("resolving fd %d without channel" , CSFD(socket2)); |
| 4310 | rv = socket1; |
| 4311 | goto Exit; |
| 4312 | } |
| 4313 | |
| 4314 | if (channel_socket_should_live(socket1, &channel1)) { |
| 4315 | rv = socket1; |
| 4316 | goto Exit; |
| 4317 | } |
| 4318 | |
| 4319 | if (channel_socket_should_live(socket2, &channel2)) { |
| 4320 | rv = socket2; |
| 4321 | goto Exit; |
| 4322 | } |
| 4323 | |
| 4324 | cf_node remote_nodeid = |
| 4325 | channel1.nodeid != 0 ? channel1.nodeid : channel2.nodeid; |
| 4326 | |
| 4327 | if (remote_nodeid == 0) { |
| 4328 | // Should not happen in practice. |
| 4329 | WARNING("remote node id unknown for fds %d and %d" , CSFD(socket1), |
| 4330 | CSFD(socket2)); |
| 4331 | rv = NULL; |
| 4332 | goto Exit; |
| 4333 | } |
| 4334 | |
| 4335 | // Choose the socket with the highest acceptor nodeid. |
| 4336 | cf_node acceptor_nodeid1 = |
| 4337 | channel1.is_inbound ? config_self_nodeid_get() : remote_nodeid; |
| 4338 | cf_node acceptor_nodeid2 = |
| 4339 | channel2.is_inbound ? config_self_nodeid_get() : remote_nodeid; |
| 4340 | |
| 4341 | as_hb_channel* winner_channel = NULL; |
| 4342 | cf_socket* winner_socket = NULL; |
| 4343 | if (acceptor_nodeid1 > acceptor_nodeid2) { |
| 4344 | winner_channel = &channel1; |
| 4345 | winner_socket = socket1; |
| 4346 | } |
| 4347 | else if (acceptor_nodeid1 < acceptor_nodeid2) { |
| 4348 | winner_channel = &channel2; |
| 4349 | winner_socket = socket2; |
| 4350 | } |
| 4351 | else { |
| 4352 | // Both connections have the same acceptor. Should not happen in |
| 4353 | // practice. Despair and report resolution failure. |
| 4354 | INFO( |
| 4355 | "found redundant connections to same node, fds %d %d - choosing at random" , |
| 4356 | CSFD(socket1), CSFD(socket2)); |
| 4357 | |
| 4358 | if (cf_getms() % 2 == 0) { |
| 4359 | winner_channel = &channel1; |
| 4360 | winner_socket = socket1; |
| 4361 | } |
| 4362 | else { |
| 4363 | winner_channel = &channel2; |
| 4364 | winner_socket = socket2; |
| 4365 | } |
| 4366 | } |
| 4367 | |
| 4368 | cf_clock now = cf_getms(); |
| 4369 | if (winner_channel->resolution_win_ts == 0) { |
| 4370 | winner_channel->resolution_win_ts = now; |
| 4371 | // Update the winning count of the winning channel in the channel data |
| 4372 | // structures. |
| 4373 | cf_shash_put(g_hb.channel_state.socket_to_channel, &winner_socket, |
| 4374 | winner_channel); |
| 4375 | } |
| 4376 | |
| 4377 | if (winner_channel->resolution_win_ts > now + channel_win_grace_ms()) { |
| 4378 | // The winner has been winning a lot, most likely the other side has us |
| 4379 | // with a seed address different from our published address. |
| 4380 | // |
| 4381 | // Break the cycle here and choose the loosing channel as the winner. |
| 4382 | INFO("breaking socket resolve loop dropping winning fd %d" , |
| 4383 | CSFD(winner_socket)); |
| 4384 | winner_channel = (winner_channel == &channel1) ? &channel2 : &channel1; |
| 4385 | winner_socket = (socket1 == winner_socket) ? socket2 : socket1; |
| 4386 | } |
| 4387 | |
| 4388 | rv = winner_socket; |
| 4389 | |
| 4390 | Exit: |
| 4391 | CHANNEL_UNLOCK(); |
| 4392 | return rv; |
| 4393 | } |
| 4394 | |
| 4395 | /** |
| 4396 | * Basic sanity check for a message. |
| 4397 | * @param msg_event the message event. |
| 4398 | * @return 0 if the message passes basic sanity tests. -1 on failure. |
| 4399 | */ |
| 4400 | static int |
| 4401 | channel_msg_sanity_check(as_hb_channel_event* msg_event) |
| 4402 | { |
| 4403 | msg* msg = msg_event->msg; |
| 4404 | uint32_t id = 0; |
| 4405 | |
| 4406 | as_hb_msg_type type = 0; |
| 4407 | cf_node src_nodeid = 0; |
| 4408 | |
| 4409 | int rv = 0; |
| 4410 | |
| 4411 | if (msg_nodeid_get(msg, &src_nodeid) != 0) { |
| 4412 | TICKER_WARNING("received message without a source node" ); |
| 4413 | rv = -1; |
| 4414 | } |
| 4415 | |
| 4416 | // Validate the fact that we have a valid source nodeid. |
| 4417 | if (src_nodeid == 0) { |
| 4418 | // Event nodeid is zero. Not a valid source nodeid. This will happen in |
| 4419 | // compatibility mode if the info request from a new node arrives before |
| 4420 | // the pulse message. Can be ignored. |
| 4421 | TICKER_WARNING("received a message from node with unknown nodeid" ); |
| 4422 | rv = -1; |
| 4423 | } |
| 4424 | |
| 4425 | if (msg_id_get(msg, &id) != 0) { |
| 4426 | TICKER_WARNING( |
| 4427 | "received message without heartbeat protocol identifier from node %" PRIx64, |
| 4428 | src_nodeid); |
| 4429 | rv = -1; |
| 4430 | } |
| 4431 | else { |
| 4432 | DETAIL( |
| 4433 | "received message with heartbeat protocol identifier %d from node %" PRIx64, |
| 4434 | id, src_nodeid); |
| 4435 | |
| 4436 | // Ignore the message if the protocol of the incoming message does not |
| 4437 | // match. |
| 4438 | if (id != hb_protocol_identifier_get()) { |
| 4439 | TICKER_WARNING( |
| 4440 | "received message with different heartbeat protocol identifier from node %" PRIx64, |
| 4441 | src_nodeid); |
| 4442 | rv = -1; |
| 4443 | } |
| 4444 | } |
| 4445 | |
| 4446 | if (msg_type_get(msg, &type) != 0) { |
| 4447 | TICKER_WARNING( |
| 4448 | "received message without message type from node %" PRIx64, |
| 4449 | src_nodeid); |
| 4450 | rv = -1; |
| 4451 | } |
| 4452 | |
| 4453 | as_endpoint_list* endpoint_list; |
| 4454 | if (hb_is_mesh()) { |
| 4455 | // Check only applies to v3 mesh. |
| 4456 | // v3 multicast protocol does not advertise endpoint list. |
| 4457 | if (msg_endpoint_list_get(msg, &endpoint_list) != 0 |
| 4458 | || endpoint_list->n_endpoints <= 0) { |
| 4459 | TICKER_WARNING( |
| 4460 | "received message without address/port from node %" PRIx64, |
| 4461 | src_nodeid); |
| 4462 | rv = -1; |
| 4463 | } |
| 4464 | } |
| 4465 | |
| 4466 | as_hlc_timestamp send_ts; |
| 4467 | if (msg_send_hlc_ts_get(msg, &send_ts) != 0) { |
| 4468 | TICKER_WARNING("received message without HLC time from node %" PRIx64, |
| 4469 | src_nodeid); |
| 4470 | rv = -1; |
| 4471 | } |
| 4472 | |
| 4473 | if (type == AS_HB_MSG_TYPE_PULSE) { |
| 4474 | char* remote_cluster_name = NULL; |
| 4475 | if (msg_cluster_name_get(msg, &remote_cluster_name) != 0) { |
| 4476 | remote_cluster_name = "" ; |
| 4477 | } |
| 4478 | |
| 4479 | if (!as_config_cluster_name_matches(remote_cluster_name)) { |
| 4480 | // Generate cluster-name mismatch event. |
| 4481 | as_hb_channel_event mismatch_event; |
| 4482 | channel_event_init(&mismatch_event); |
| 4483 | |
| 4484 | // Notify hb about cluster-name mismatch. |
| 4485 | mismatch_event.type = AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH; |
| 4486 | mismatch_event.nodeid = src_nodeid; |
| 4487 | mismatch_event.msg = NULL; |
| 4488 | memcpy(&mismatch_event.msg_hlc_ts, &msg_event->msg_hlc_ts, |
| 4489 | sizeof(msg_event->msg_hlc_ts)); |
| 4490 | |
| 4491 | channel_event_queue(&mismatch_event); |
| 4492 | |
| 4493 | TICKER_WARNING("ignoring message from %" PRIX64" with different cluster name(%s)" , |
| 4494 | src_nodeid, remote_cluster_name[0] == '\0' ? "null" : remote_cluster_name ); |
| 4495 | rv = -1; |
| 4496 | } |
| 4497 | } |
| 4498 | |
| 4499 | DETAIL("received message of type %d from node %" PRIx64, type, src_nodeid); |
| 4500 | |
| 4501 | return rv; |
| 4502 | } |
| 4503 | |
| 4504 | /** |
| 4505 | * Process incoming message to possibly update channel state. |
| 4506 | * |
| 4507 | * @param socket the socket on which the message is received. |
| 4508 | * @param event the message wrapped around in a channel event. |
| 4509 | * @return 0 if the message can be further processed, -1 if the message should |
| 4510 | * be discarded. |
| 4511 | */ |
| 4512 | static int |
| 4513 | channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event) |
| 4514 | { |
| 4515 | // Basic sanity check for the inbound message. |
| 4516 | if (channel_msg_sanity_check(event) != 0) { |
| 4517 | DETAIL("sanity check failed for message on fd %d" , CSFD(socket)); |
| 4518 | return -1; |
| 4519 | } |
| 4520 | |
| 4521 | int rv = -1; |
| 4522 | CHANNEL_LOCK(); |
| 4523 | |
| 4524 | as_hb_channel channel; |
| 4525 | if (channel_get_channel(socket, &channel) < 0) { |
| 4526 | // This is a bug and should not happen. Be paranoid and try fixing it ? |
| 4527 | WARNING("received a message on an unregistered fd %d - closing the fd" , |
| 4528 | CSFD(socket)); |
| 4529 | channel_socket_close_queue(socket, false, true); |
| 4530 | rv = -1; |
| 4531 | goto Exit; |
| 4532 | } |
| 4533 | |
| 4534 | if (channel.is_multicast) { |
| 4535 | rv = 0; |
| 4536 | goto Exit; |
| 4537 | } |
| 4538 | |
| 4539 | cf_node nodeid = event->nodeid; |
| 4540 | |
| 4541 | if (channel.nodeid != 0 && channel.nodeid != nodeid) { |
| 4542 | // The event nodeid does not match previously know event id. Something |
| 4543 | // seriously wrong here. |
| 4544 | WARNING("received a message from node with incorrect nodeid - expected %" PRIx64 " received %" PRIx64 "on fd %d" , |
| 4545 | channel.nodeid, nodeid, CSFD(socket)); |
| 4546 | rv = -1; |
| 4547 | goto Exit; |
| 4548 | } |
| 4549 | |
| 4550 | // Update the last received time for this node |
| 4551 | channel.last_received = cf_getms(); |
| 4552 | |
| 4553 | cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); |
| 4554 | |
| 4555 | cf_socket* existing_socket; |
| 4556 | int get_result = cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, |
| 4557 | &existing_socket); |
| 4558 | |
| 4559 | if (get_result == CF_SHASH_ERR_NOT_FOUND) { |
| 4560 | // Associate this socket with the node. |
| 4561 | channel_node_attach(socket, &channel, nodeid); |
| 4562 | } |
| 4563 | else if (existing_socket != socket) { |
| 4564 | // Somehow the other node and this node discovered each other together |
| 4565 | // both connected via two tcp connections. Choose one and close the |
| 4566 | // other. |
| 4567 | cf_socket* resolved = channel_socket_resolve(socket, existing_socket); |
| 4568 | |
| 4569 | if (!resolved) { |
| 4570 | DEBUG( |
| 4571 | "resolving between fd %d and %d failed - closing both connections" , |
| 4572 | CSFD(socket), CSFD(existing_socket)); |
| 4573 | |
| 4574 | // Resolution failed. Should not happen but there is a window where |
| 4575 | // the same node initiated two connections. |
| 4576 | // Close both connections and try again. |
| 4577 | channel_socket_close_queue(socket, false, true); |
| 4578 | channel_socket_close_queue(existing_socket, false, true); |
| 4579 | |
| 4580 | // Nothing wrong with the message. Let it through. |
| 4581 | rv = 0; |
| 4582 | goto Exit; |
| 4583 | } |
| 4584 | |
| 4585 | DEBUG("resolved fd %d between redundant fd %d and %d for node %" PRIx64, |
| 4586 | CSFD(resolved), CSFD(socket), CSFD(existing_socket), nodeid); |
| 4587 | |
| 4588 | if (resolved == existing_socket) { |
| 4589 | // The node to socket mapping is correct, just close this socket and |
| 4590 | // this node will still be connected to the remote node. Do not |
| 4591 | // raise any event for this closure. |
| 4592 | channel_socket_close_queue(socket, false, false); |
| 4593 | } |
| 4594 | else { |
| 4595 | // We need to close the existing socket. Disable channel events |
| 4596 | // because we make the node appear to be not connected. Do not raise |
| 4597 | // any event for this closure. |
| 4598 | channel_socket_close_queue(existing_socket, false, false); |
| 4599 | // Associate this socket with the node. |
| 4600 | channel_node_attach(socket, &channel, nodeid); |
| 4601 | } |
| 4602 | } |
| 4603 | |
| 4604 | rv = 0; |
| 4605 | |
| 4606 | Exit: |
| 4607 | CHANNEL_UNLOCK(); |
| 4608 | return rv; |
| 4609 | } |
| 4610 | |
| 4611 | /** |
| 4612 | * Read a message from a socket that has data. |
| 4613 | * @param socket the socket having data to be read. |
| 4614 | */ |
| 4615 | static void |
| 4616 | channel_msg_read(cf_socket* socket) |
| 4617 | { |
| 4618 | CHANNEL_LOCK(); |
| 4619 | |
| 4620 | as_hb_channel_msg_read_status status; |
| 4621 | as_hb_channel channel; |
| 4622 | |
| 4623 | bool free_msg = true; |
| 4624 | |
| 4625 | msg* msg = hb_msg_get(); |
| 4626 | |
| 4627 | if (channel_get_channel(socket, &channel) != 0) { |
| 4628 | // Would happen if the channel was closed in the same epoll loop. |
| 4629 | DEBUG("error the channel does not exist for fd %d" , CSFD(socket)); |
| 4630 | goto Exit; |
| 4631 | } |
| 4632 | |
| 4633 | if (channel.is_multicast) { |
| 4634 | status = channel_multicast_msg_read(socket, msg); |
| 4635 | } |
| 4636 | else { |
| 4637 | status = channel_mesh_msg_read(socket, msg); |
| 4638 | } |
| 4639 | |
| 4640 | switch (status) { |
| 4641 | case AS_HB_CHANNEL_MSG_READ_SUCCESS: { |
| 4642 | break; |
| 4643 | } |
| 4644 | |
| 4645 | case AS_HB_CHANNEL_MSG_PARSE_FAIL: { |
| 4646 | TICKER_WARNING("unable to parse heartbeat message on fd %d" , |
| 4647 | CSFD(socket)); |
| 4648 | goto Exit; |
| 4649 | } |
| 4650 | |
| 4651 | case AS_HB_CHANNEL_MSG_CHANNEL_FAIL: // Falling through |
| 4652 | default: { |
| 4653 | DEBUG("could not read message from fd %d" , CSFD(socket)); |
| 4654 | if (!channel.is_multicast) { |
| 4655 | // Shut down only mesh socket. |
| 4656 | channel_socket_shutdown(socket); |
| 4657 | } |
| 4658 | goto Exit; |
| 4659 | } |
| 4660 | } |
| 4661 | |
| 4662 | as_hb_channel_event event; |
| 4663 | channel_event_init(&event); |
| 4664 | |
| 4665 | if (msg_get_uint64(msg, AS_HB_MSG_NODE, &event.nodeid) < 0) { |
| 4666 | // Node id missing from the message. Assume this message to be corrupt. |
| 4667 | TICKER_WARNING("message with invalid nodeid received on fd %d" , |
| 4668 | CSFD(socket)); |
| 4669 | goto Exit; |
| 4670 | } |
| 4671 | |
| 4672 | event.msg = msg; |
| 4673 | event.type = AS_HB_CHANNEL_MSG_RECEIVED; |
| 4674 | |
| 4675 | // Update hlc and store update message timestamp for the event. |
| 4676 | as_hlc_timestamp send_ts = 0; |
| 4677 | msg_send_hlc_ts_get(msg, &send_ts); |
| 4678 | as_hlc_timestamp_update(event.nodeid, send_ts, &event.msg_hlc_ts); |
| 4679 | |
| 4680 | // Process received message to update channel state. |
| 4681 | if (channel_msg_event_process(socket, &event) == 0) { |
| 4682 | // The message needs to be delivered to the listeners. Prevent a free. |
| 4683 | free_msg = false; |
| 4684 | channel_event_queue(&event); |
| 4685 | } |
| 4686 | |
| 4687 | Exit: |
| 4688 | CHANNEL_UNLOCK(); |
| 4689 | |
| 4690 | // release the message. |
| 4691 | if (free_msg) { |
| 4692 | hb_msg_return(msg); |
| 4693 | } |
| 4694 | } |
| 4695 | |
| 4696 | /** |
| 4697 | * Reduce function to remove faulty channels / nodes. Shutdown associated socket |
| 4698 | * to have channel tender cleanup. |
| 4699 | */ |
| 4700 | static int |
| 4701 | channel_channels_tend_reduce(const void* key, void* data, void* udata) |
| 4702 | { |
| 4703 | cf_socket** socket = (cf_socket**)key; |
| 4704 | as_hb_channel* channel = (as_hb_channel*)data; |
| 4705 | |
| 4706 | DETAIL("tending channel fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s" , |
| 4707 | CSFD(*socket), channel->nodeid, channel->last_received, |
| 4708 | cf_sock_addr_print(&channel->endpoint_addr)); |
| 4709 | |
| 4710 | if (channel->last_received + CHANNEL_NODE_READ_IDLE_TIMEOUT() |
| 4711 | < cf_getms()) { |
| 4712 | // Shutdown associated socket if it is not a multicast socket. |
| 4713 | if (!channel->is_multicast) { |
| 4714 | DEBUG("channel shutting down idle fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s" , |
| 4715 | CSFD(*socket), channel->nodeid, channel->last_received, |
| 4716 | cf_sock_addr_print(&channel->endpoint_addr)); |
| 4717 | channel_socket_shutdown(*socket); |
| 4718 | } |
| 4719 | } |
| 4720 | |
| 4721 | return CF_SHASH_OK; |
| 4722 | } |
| 4723 | |
| 4724 | /** |
| 4725 | * Tend channel specific node information to remove channels that are faulty (or |
| 4726 | * TODO: attached to misbehaving nodes). |
| 4727 | */ |
| 4728 | static void |
| 4729 | channel_channels_idle_check() |
| 4730 | { |
| 4731 | CHANNEL_LOCK(); |
| 4732 | |
| 4733 | cf_clock now = cf_getms(); |
| 4734 | if (g_hb.channel_state.last_channel_idle_check + CHANNEL_IDLE_CHECK_PERIOD |
| 4735 | <= now) { |
| 4736 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
| 4737 | channel_channels_tend_reduce, NULL); |
| 4738 | g_hb.channel_state.last_channel_idle_check = now; |
| 4739 | } |
| 4740 | |
| 4741 | CHANNEL_UNLOCK(); |
| 4742 | } |
| 4743 | |
| 4744 | /** |
| 4745 | * Socket tending thread. Manages heartbeat receive as well. |
| 4746 | */ |
| 4747 | void* |
| 4748 | channel_tender(void* arg) |
| 4749 | { |
| 4750 | DETAIL("channel tender started" ); |
| 4751 | |
| 4752 | while (channel_is_running()) { |
| 4753 | cf_poll_event events[POLL_SZ]; |
| 4754 | int32_t nevents = cf_poll_wait(g_hb.channel_state.poll, events, POLL_SZ, |
| 4755 | AS_HB_TX_INTERVAL_MS_MIN); |
| 4756 | |
| 4757 | DETAIL("tending channel" ); |
| 4758 | |
| 4759 | for (int32_t i = 0; i < nevents; i++) { |
| 4760 | cf_socket* socket = events[i].data; |
| 4761 | if (channel_cf_sockets_contains( |
| 4762 | g_hb.channel_state.listening_sockets, socket) |
| 4763 | && hb_is_mesh()) { |
| 4764 | // Accept a new connection. |
| 4765 | channel_accept_connection(socket); |
| 4766 | } |
| 4767 | else if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { |
| 4768 | channel_socket_close_queue(socket, true, true); |
| 4769 | } |
| 4770 | else if (events[i].events & EPOLLIN) { |
| 4771 | // Read a message for the socket that is ready. |
| 4772 | channel_msg_read(socket); |
| 4773 | } |
| 4774 | } |
| 4775 | |
| 4776 | // Tend channels to discard stale channels. |
| 4777 | channel_channels_idle_check(); |
| 4778 | |
| 4779 | // Close queued up socket. |
| 4780 | channel_socket_close_pending(); |
| 4781 | |
| 4782 | // Publish pending events. Should be outside channel lock. |
| 4783 | channel_event_publish_pending(); |
| 4784 | |
| 4785 | DETAIL("done tending channel" ); |
| 4786 | } |
| 4787 | |
| 4788 | DETAIL("channel tender shut down" ); |
| 4789 | return NULL; |
| 4790 | } |
| 4791 | |
| 4792 | /* |
| 4793 | * ---------------------------------------------------------------------------- |
| 4794 | * Channel public API |
| 4795 | * ---------------------------------------------------------------------------- |
| 4796 | */ |
| 4797 | |
| 4798 | /** |
| 4799 | * Filter out endpoints not matching this node's capabilities. |
| 4800 | */ |
| 4801 | static bool |
| 4802 | channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata) |
| 4803 | { |
| 4804 | if ((cf_ip_addr_legacy_only()) |
| 4805 | && endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) { |
| 4806 | return false; |
| 4807 | } |
| 4808 | |
| 4809 | // If we don't offer TLS, then we won't connect via TLS, either. |
| 4810 | if (g_config.hb_tls_serv_spec.bind_port == 0 |
| 4811 | && as_endpoint_capability_is_supported(endpoint, |
| 4812 | AS_ENDPOINT_TLS_MASK)) { |
| 4813 | return false; |
| 4814 | } |
| 4815 | |
| 4816 | return true; |
| 4817 | } |
| 4818 | |
| 4819 | /** |
| 4820 | * Try and connect to a set of endpoint_lists. |
| 4821 | */ |
| 4822 | static void |
| 4823 | channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, |
| 4824 | int endpoint_list_count) |
| 4825 | { |
| 4826 | for (int i = 0; i < endpoint_list_count; i++) { |
| 4827 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 4828 | as_endpoint_list_to_string(endpoint_lists[i], endpoint_list_str, |
| 4829 | sizeof(endpoint_list_str)); |
| 4830 | |
| 4831 | if (channel_endpoint_is_connected(endpoint_lists[i])) { |
| 4832 | DEBUG( |
| 4833 | "duplicate endpoint connect request - ignoring endpoint list {%s}" , |
| 4834 | endpoint_list_str); |
| 4835 | continue; |
| 4836 | } |
| 4837 | |
| 4838 | DEBUG("attempting to connect mesh host at {%s}" , endpoint_list_str); |
| 4839 | |
| 4840 | cf_socket* sock = (cf_socket*)cf_malloc(sizeof(cf_socket)); |
| 4841 | |
| 4842 | const as_endpoint* connected_endpoint = as_endpoint_connect_any( |
| 4843 | endpoint_lists[i], channel_mesh_endpoint_filter, NULL, |
| 4844 | CONNECT_TIMEOUT(), sock); |
| 4845 | |
| 4846 | if (connected_endpoint) { |
| 4847 | cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); |
| 4848 | |
| 4849 | cf_sock_addr endpoint_addr; |
| 4850 | memset(&endpoint_addr, 0, sizeof(endpoint_addr)); |
| 4851 | cf_ip_addr_set_any(&endpoint_addr.addr); |
| 4852 | if (as_endpoint_to_sock_addr(connected_endpoint, &endpoint_addr) |
| 4853 | != 0) { |
| 4854 | // Should never happen in practice. |
| 4855 | WARNING("error converting endpoint to socket address" ); |
| 4856 | channel_socket_destroy(sock); |
| 4857 | sock = NULL; |
| 4858 | |
| 4859 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
| 4860 | continue; |
| 4861 | } |
| 4862 | |
| 4863 | if (as_endpoint_capability_is_supported(connected_endpoint, |
| 4864 | AS_ENDPOINT_TLS_MASK)) { |
| 4865 | tls_socket_prepare_client(g_config.hb_config.tls, sock); |
| 4866 | |
| 4867 | if (tls_socket_connect_block(sock) != 1) { |
| 4868 | WARNING("heartbeat TLS client handshake with {%s} failed" , |
| 4869 | endpoint_list_str); |
| 4870 | channel_socket_destroy(sock); |
| 4871 | sock = NULL; |
| 4872 | |
| 4873 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
| 4874 | return; |
| 4875 | } |
| 4876 | } |
| 4877 | |
| 4878 | channel_socket_register(sock, false, false, &endpoint_addr); |
| 4879 | } |
| 4880 | else { |
| 4881 | TICKER_WARNING("could not create heartbeat connection to node {%s}" , |
| 4882 | endpoint_list_str); |
| 4883 | if (sock) { |
| 4884 | cf_free(sock); |
| 4885 | sock = NULL; |
| 4886 | } |
| 4887 | } |
| 4888 | } |
| 4889 | } |
| 4890 | |
| 4891 | /** |
| 4892 | * Disconnect a node from the channel list. |
| 4893 | * @param nodeid the nodeid of the node whose channel should be disconnected. |
| 4894 | * @return 0 if the node had a channel and was disconnected. -1 otherwise. |
| 4895 | */ |
| 4896 | static int |
| 4897 | channel_node_disconnect(cf_node nodeid) |
| 4898 | { |
| 4899 | int rv = -1; |
| 4900 | |
| 4901 | CHANNEL_LOCK(); |
| 4902 | |
| 4903 | cf_socket* socket; |
| 4904 | if (channel_socket_get(nodeid, &socket) != 0) { |
| 4905 | // not found |
| 4906 | rv = -1; |
| 4907 | goto Exit; |
| 4908 | } |
| 4909 | |
| 4910 | DEBUG("disconnecting the channel attached to node %" PRIx64, nodeid); |
| 4911 | |
| 4912 | channel_socket_close_queue(socket, false, true); |
| 4913 | |
| 4914 | rv = 0; |
| 4915 | |
| 4916 | Exit: |
| 4917 | CHANNEL_UNLOCK(); |
| 4918 | |
| 4919 | return rv; |
| 4920 | } |
| 4921 | |
| 4922 | /** |
| 4923 | * Register mesh listening sockets. |
| 4924 | */ |
| 4925 | static void |
| 4926 | channel_mesh_listening_socks_register(cf_sockets* listening_sockets) |
| 4927 | { |
| 4928 | CHANNEL_LOCK(); |
| 4929 | g_hb.channel_state.listening_sockets = listening_sockets; |
| 4930 | |
| 4931 | cf_poll_add_sockets(g_hb.channel_state.poll, |
| 4932 | g_hb.channel_state.listening_sockets, |
| 4933 | EPOLLIN | EPOLLERR | EPOLLHUP); |
| 4934 | cf_socket_show_server(AS_HB, "mesh heartbeat" , |
| 4935 | g_hb.channel_state.listening_sockets); |
| 4936 | |
| 4937 | // We do not need a separate channel to cover this socket because IO will |
| 4938 | // not happen on these sockets. |
| 4939 | CHANNEL_UNLOCK(); |
| 4940 | } |
| 4941 | |
| 4942 | /** |
| 4943 | * Deregister mesh listening socket from epoll event. |
| 4944 | * @param socket the listening socket socket. |
| 4945 | */ |
| 4946 | static void |
| 4947 | channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets) |
| 4948 | { |
| 4949 | CHANNEL_LOCK(); |
| 4950 | cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); |
| 4951 | CHANNEL_UNLOCK(); |
| 4952 | } |
| 4953 | |
| 4954 | /** |
| 4955 | * Register the multicast listening socket. |
| 4956 | * @param socket the listening socket. |
| 4957 | * @param endpoint the endpoint on which multicast io happens. |
| 4958 | */ |
| 4959 | static void |
| 4960 | channel_multicast_listening_socks_register(cf_sockets* listening_sockets) |
| 4961 | { |
| 4962 | CHANNEL_LOCK(); |
| 4963 | g_hb.channel_state.listening_sockets = listening_sockets; |
| 4964 | |
| 4965 | // Create a new multicast channel for each multicast socket. |
| 4966 | for (uint32_t i = 0; |
| 4967 | i < g_hb.mode_state.multicast_state.listening_sockets.n_socks; |
| 4968 | ++i) { |
| 4969 | channel_socket_register(&g_hb.channel_state.listening_sockets->socks[i], |
| 4970 | true, false, NULL); |
| 4971 | } |
| 4972 | |
| 4973 | cf_socket_mcast_show(AS_HB, "multicast heartbeat" , |
| 4974 | g_hb.channel_state.listening_sockets); |
| 4975 | CHANNEL_UNLOCK(); |
| 4976 | } |
| 4977 | |
| 4978 | /** |
| 4979 | * Deregister multicast listening socket from epoll event. |
| 4980 | * @param socket the listening socket socket. |
| 4981 | */ |
| 4982 | static void |
| 4983 | channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets) |
| 4984 | { |
| 4985 | CHANNEL_LOCK(); |
| 4986 | cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); |
| 4987 | CHANNEL_UNLOCK(); |
| 4988 | } |
| 4989 | |
| 4990 | /** |
| 4991 | * Initialize the channel sub module. |
| 4992 | */ |
| 4993 | static void |
| 4994 | channel_init() |
| 4995 | { |
| 4996 | CHANNEL_LOCK(); |
| 4997 | |
| 4998 | // Disable events till initialization is complete. |
| 4999 | channel_events_enabled_set(false); |
| 5000 | |
| 5001 | // Initialize unpublished event queue. |
| 5002 | cf_queue_init(&g_hb.channel_state.events_queue, sizeof(as_hb_channel_event), |
| 5003 | AS_HB_CLUSTER_MAX_SIZE_SOFT, true); |
| 5004 | |
| 5005 | // Initialize sockets to close queue. |
| 5006 | cf_queue_init(&g_hb.channel_state.socket_close_queue, |
| 5007 | sizeof(as_hb_channel_socket_close_entry), |
| 5008 | AS_HB_CLUSTER_MAX_SIZE_SOFT, true); |
| 5009 | |
| 5010 | // Initialize the nodeid to socket hash. |
| 5011 | g_hb.channel_state.nodeid_to_socket = cf_shash_create(cf_nodeid_shash_fn, |
| 5012 | sizeof(cf_node), sizeof(cf_socket*), AS_HB_CLUSTER_MAX_SIZE_SOFT, |
| 5013 | 0); |
| 5014 | |
| 5015 | // Initialize the socket to channel state hash. |
| 5016 | g_hb.channel_state.socket_to_channel = cf_shash_create(hb_socket_hash_fn, |
| 5017 | sizeof(cf_socket*), sizeof(as_hb_channel), |
| 5018 | AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
| 5019 | |
| 5020 | g_hb.channel_state.status = AS_HB_STATUS_STOPPED; |
| 5021 | |
| 5022 | CHANNEL_UNLOCK(); |
| 5023 | } |
| 5024 | |
| 5025 | /** |
| 5026 | * Start channel sub module. Kicks off the channel tending thread. |
| 5027 | */ |
| 5028 | static void |
| 5029 | channel_start() |
| 5030 | { |
| 5031 | CHANNEL_LOCK(); |
| 5032 | |
| 5033 | if (channel_is_running()) { |
| 5034 | WARNING("heartbeat channel already started" ); |
| 5035 | goto Exit; |
| 5036 | } |
| 5037 | |
| 5038 | // create the epoll socket. |
| 5039 | cf_poll_create(&g_hb.channel_state.poll); |
| 5040 | |
| 5041 | DEBUG("created epoll fd %d" , CEFD(g_hb.channel_state.poll)); |
| 5042 | |
| 5043 | // Disable events till initialization is complete. |
| 5044 | channel_events_enabled_set(false); |
| 5045 | |
| 5046 | // Data structures have been initialized. |
| 5047 | g_hb.channel_state.status = AS_HB_STATUS_RUNNING; |
| 5048 | |
| 5049 | // Initialization complete enable events. |
| 5050 | channel_events_enabled_set(true); |
| 5051 | |
| 5052 | // Start the channel tender. |
| 5053 | g_hb.channel_state.channel_tender_tid = |
| 5054 | cf_thread_create_joinable(channel_tender, (void*)&g_hb); |
| 5055 | |
| 5056 | Exit: |
| 5057 | CHANNEL_UNLOCK(); |
| 5058 | } |
| 5059 | |
| 5060 | /** |
| 5061 | * Get all sockets. |
| 5062 | */ |
| 5063 | static int |
| 5064 | channel_sockets_get_reduce(const void* key, void* data, void* udata) |
| 5065 | { |
| 5066 | cf_vector* sockets = (cf_vector*)udata; |
| 5067 | cf_vector_append(sockets, key); |
| 5068 | return CF_SHASH_OK; |
| 5069 | } |
| 5070 | |
| 5071 | /** |
| 5072 | * Stop the channel sub module called on hb_stop. |
| 5073 | */ |
| 5074 | static void |
| 5075 | channel_stop() |
| 5076 | { |
| 5077 | if (!channel_is_running()) { |
| 5078 | WARNING("heartbeat channel already stopped" ); |
| 5079 | return; |
| 5080 | } |
| 5081 | |
| 5082 | DEBUG("stopping the channel" ); |
| 5083 | |
| 5084 | // Unguarded state change but this should be OK. |
| 5085 | g_hb.channel_state.status = AS_HB_STATUS_SHUTTING_DOWN; |
| 5086 | |
| 5087 | // Wait for the channel tender thread to finish. |
| 5088 | cf_thread_join(g_hb.channel_state.channel_tender_tid); |
| 5089 | |
| 5090 | CHANNEL_LOCK(); |
| 5091 | |
| 5092 | cf_vector sockets; |
| 5093 | cf_socket buff[cf_shash_get_size(g_hb.channel_state.socket_to_channel)]; |
| 5094 | cf_vector_init_smalloc(&sockets, sizeof(cf_socket*), (uint8_t*)buff, |
| 5095 | sizeof(buff), VECTOR_FLAG_INITZERO); |
| 5096 | |
| 5097 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
| 5098 | channel_sockets_get_reduce, &sockets); |
| 5099 | |
| 5100 | channel_sockets_close(&sockets); |
| 5101 | |
| 5102 | // Disable events. |
| 5103 | channel_events_enabled_set(false); |
| 5104 | |
| 5105 | cf_vector_destroy(&sockets); |
| 5106 | |
| 5107 | // Close epoll socket. |
| 5108 | cf_poll_destroy(g_hb.channel_state.poll); |
| 5109 | EFD(g_hb.channel_state.poll) = -1; |
| 5110 | |
| 5111 | // Disable the channel thread. |
| 5112 | g_hb.channel_state.status = AS_HB_STATUS_STOPPED; |
| 5113 | |
| 5114 | DEBUG("channel Stopped" ); |
| 5115 | |
| 5116 | CHANNEL_UNLOCK(); |
| 5117 | } |
| 5118 | |
| 5119 | /** |
| 5120 | * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK |
| 5121 | * @param socket the socket to send the buffer over. |
| 5122 | * @param buff the data buffer. |
| 5123 | * @param buffer_length the number of bytes in the buffer to send. |
| 5124 | * @return 0 on successful send -1 on failure |
| 5125 | */ |
| 5126 | static int |
| 5127 | channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length) |
| 5128 | { |
| 5129 | CHANNEL_LOCK(); |
| 5130 | int rv; |
| 5131 | |
| 5132 | if (cf_socket_send_all(socket, buff, buffer_length, 0, |
| 5133 | MESH_RW_TIMEOUT) < 0) { |
| 5134 | as_hb_channel channel; |
| 5135 | if (channel_get_channel(socket, &channel) == 0) { |
| 5136 | // Would happen if the channel was closed in the same epoll loop. |
| 5137 | TICKER_WARNING("sending mesh message to %" PRIx64" on fd %d failed : %s" , |
| 5138 | channel.nodeid, CSFD(socket), cf_strerror(errno)); |
| 5139 | } |
| 5140 | else { |
| 5141 | TICKER_WARNING("sending mesh message on fd %d failed : %s" , |
| 5142 | CSFD(socket), cf_strerror(errno)); |
| 5143 | } |
| 5144 | |
| 5145 | channel_socket_shutdown(socket); |
| 5146 | rv = -1; |
| 5147 | } |
| 5148 | else { |
| 5149 | rv = 0; |
| 5150 | } |
| 5151 | |
| 5152 | CHANNEL_UNLOCK(); |
| 5153 | return rv; |
| 5154 | } |
| 5155 | |
| 5156 | /** |
| 5157 | * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK |
| 5158 | * @param socket the socket to send the buffer over. |
| 5159 | * @param buff the data buffer. |
| 5160 | * @param buffer_length the number of bytes in the buffer to send. |
| 5161 | * @return 0 on successful send -1 on failure |
| 5162 | */ |
| 5163 | static int |
| 5164 | channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, |
| 5165 | size_t buffer_length) |
| 5166 | { |
| 5167 | CHANNEL_LOCK(); |
| 5168 | int rv = 0; |
| 5169 | DETAIL("sending udp heartbeat to fd %d: msg size %zu" , CSFD(socket), |
| 5170 | buffer_length); |
| 5171 | |
| 5172 | int mtu = hb_mtu(); |
| 5173 | if (buffer_length > mtu) { |
| 5174 | TICKER_WARNING("mtu breach, sending udp heartbeat to fd %d: mtu %d" , |
| 5175 | CSFD(socket), mtu); |
| 5176 | } |
| 5177 | |
| 5178 | cf_msock_cfg* socket_cfg = (cf_msock_cfg*)(socket->cfg); |
| 5179 | cf_sock_addr dest; |
| 5180 | dest.port = socket_cfg->port; |
| 5181 | cf_ip_addr_copy(&socket_cfg->addr, &dest.addr); |
| 5182 | |
| 5183 | if (cf_socket_send_to(socket, buff, buffer_length, 0, &dest) < 0) { |
| 5184 | TICKER_WARNING("multicast message send failed on fd %d %s" , |
| 5185 | CSFD(socket), cf_strerror(errno)); |
| 5186 | rv = -1; |
| 5187 | } |
| 5188 | CHANNEL_UNLOCK(); |
| 5189 | return rv; |
| 5190 | } |
| 5191 | |
| 5192 | /** |
| 5193 | * Indicates if this msg requires compression. |
| 5194 | */ |
| 5195 | static bool |
| 5196 | channel_msg_is_compression_required(msg* msg, int wire_size, int mtu) |
| 5197 | { |
| 5198 | return wire_size > msg_compression_threshold(mtu); |
| 5199 | } |
| 5200 | |
| 5201 | /** |
| 5202 | * Estimate the size of the buffer required to fill out the serialized message. |
| 5203 | * @param msg the input message. |
| 5204 | * @param mtu the underlying network mtu. |
| 5205 | * @return the size of the buffer required. |
| 5206 | */ |
| 5207 | static int |
| 5208 | channel_msg_buffer_size_get(int wire_size, int mtu) |
| 5209 | { |
| 5210 | return round_up_pow2(MAX(wire_size, compressBound(wire_size))); |
| 5211 | } |
| 5212 | |
| 5213 | /** |
| 5214 | * Fills the buffer with the serialized message. |
| 5215 | * @param original_msg the original message to serialize. |
| 5216 | * @param wire_size the message wire size. |
| 5217 | * @param mtu the underlying network mtu. |
| 5218 | * @param buffer the destination buffer. |
| 5219 | * @param buffer_len the buffer length. |
| 5220 | * |
| 5221 | * @return length of the serialized message. |
| 5222 | */ |
| 5223 | static size_t |
| 5224 | channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, |
| 5225 | uint8_t* buffer, size_t buffer_len) |
| 5226 | { |
| 5227 | // This is output by msg_to_wire. Using a separate variable so that we do |
| 5228 | // not lose the actual buffer length needed for compression later on. |
| 5229 | size_t msg_size = msg_to_wire(original_msg, buffer); |
| 5230 | |
| 5231 | if (channel_msg_is_compression_required(original_msg, msg_size, mtu)) { |
| 5232 | // Compression is required. |
| 5233 | const size_t compressed_buffer_len = buffer_len; |
| 5234 | uint8_t* compressed_buffer = MSG_BUFF_ALLOC_OR_DIE( |
| 5235 | compressed_buffer_len, |
| 5236 | "error allocating memory size %zu for compressing message" , |
| 5237 | compressed_buffer_len); |
| 5238 | |
| 5239 | size_t compressed_msg_size = compressed_buffer_len; |
| 5240 | int compress_rv = compress2(compressed_buffer, &compressed_msg_size, |
| 5241 | buffer, wire_size, Z_BEST_COMPRESSION); |
| 5242 | |
| 5243 | if (compress_rv == Z_BUF_ERROR) { |
| 5244 | // Compression result going to be larger than original input buffer. |
| 5245 | // Skip compression and try to send the message as is. |
| 5246 | DETAIL( |
| 5247 | "skipping compression - compressed size larger than input size %zu" , |
| 5248 | msg_size); |
| 5249 | } |
| 5250 | else { |
| 5251 | msg* temp_msg = hb_msg_get(); |
| 5252 | |
| 5253 | msg_set_buf(temp_msg, AS_HB_MSG_COMPRESSED_PAYLOAD, |
| 5254 | compressed_buffer, compressed_msg_size, MSG_SET_COPY); |
| 5255 | msg_size = msg_to_wire(temp_msg, buffer); |
| 5256 | |
| 5257 | hb_msg_return(temp_msg); |
| 5258 | } |
| 5259 | |
| 5260 | MSG_BUFF_FREE(compressed_buffer, compressed_buffer_len); |
| 5261 | |
| 5262 | } |
| 5263 | |
| 5264 | return msg_size; |
| 5265 | } |
| 5266 | |
| 5267 | /** |
| 5268 | * Send a message to a destination node. |
| 5269 | */ |
| 5270 | static int |
| 5271 | channel_msg_unicast(cf_node dest, msg* msg) |
| 5272 | { |
| 5273 | size_t buffer_len = 0; |
| 5274 | uint8_t* buffer = NULL; |
| 5275 | if (!hb_is_mesh()) { |
| 5276 | // Can't send a unicast message in the multicast mode. |
| 5277 | WARNING("ignoring sending unicast message in multicast mode" ); |
| 5278 | return -1; |
| 5279 | } |
| 5280 | |
| 5281 | CHANNEL_LOCK(); |
| 5282 | |
| 5283 | int rv = -1; |
| 5284 | cf_socket* connected_socket; |
| 5285 | |
| 5286 | if (channel_socket_get(dest, &connected_socket) != 0) { |
| 5287 | DEBUG("failing message send to disconnected node %" PRIx64, dest); |
| 5288 | rv = -1; |
| 5289 | goto Exit; |
| 5290 | } |
| 5291 | |
| 5292 | // Read the message to a buffer. |
| 5293 | int mtu = hb_mtu(); |
| 5294 | int wire_size = msg_get_wire_size(msg); |
| 5295 | buffer_len = channel_msg_buffer_size_get(wire_size, mtu); |
| 5296 | buffer = |
| 5297 | MSG_BUFF_ALLOC_OR_DIE(buffer_len, |
| 5298 | "error allocating memory size %zu for sending message to node %" PRIx64, |
| 5299 | buffer_len, dest); |
| 5300 | |
| 5301 | size_t msg_size = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, |
| 5302 | buffer_len); |
| 5303 | |
| 5304 | // Send over the buffer. |
| 5305 | rv = channel_mesh_msg_send(connected_socket, buffer, msg_size); |
| 5306 | |
| 5307 | Exit: |
| 5308 | MSG_BUFF_FREE(buffer, buffer_len); |
| 5309 | CHANNEL_UNLOCK(); |
| 5310 | return rv; |
| 5311 | } |
| 5312 | |
| 5313 | /** |
| 5314 | * Shash reduce function to walk over the socket to channel hash and broadcast |
| 5315 | * the message in udata. |
| 5316 | */ |
| 5317 | static int |
| 5318 | channel_msg_broadcast_reduce(const void* key, void* data, void* udata) |
| 5319 | { |
| 5320 | CHANNEL_LOCK(); |
| 5321 | cf_socket** socket = (cf_socket**)key; |
| 5322 | as_hb_channel* channel = (as_hb_channel*)data; |
| 5323 | as_hb_channel_buffer_udata* buffer_udata = |
| 5324 | (as_hb_channel_buffer_udata*)udata; |
| 5325 | |
| 5326 | if (!channel->is_multicast) { |
| 5327 | DETAIL( |
| 5328 | "broadcasting message of length %zu on channel %d assigned to node %" PRIx64, |
| 5329 | buffer_udata->buffer_len, CSFD(*socket), channel->nodeid); |
| 5330 | |
| 5331 | channel_mesh_msg_send(*socket, buffer_udata->buffer, |
| 5332 | buffer_udata->buffer_len); |
| 5333 | } |
| 5334 | else { |
| 5335 | channel_multicast_msg_send(*socket, buffer_udata->buffer, |
| 5336 | buffer_udata->buffer_len); |
| 5337 | } |
| 5338 | |
| 5339 | CHANNEL_UNLOCK(); |
| 5340 | |
| 5341 | return CF_SHASH_OK; |
| 5342 | } |
| 5343 | |
| 5344 | /** |
| 5345 | * Broadcast a message over all channels. |
| 5346 | */ |
| 5347 | static int |
| 5348 | channel_msg_broadcast(msg* msg) |
| 5349 | { |
| 5350 | CHANNEL_LOCK(); |
| 5351 | |
| 5352 | int rv = -1; |
| 5353 | |
| 5354 | // Read the message to a buffer. |
| 5355 | int mtu = hb_mtu(); |
| 5356 | int wire_size = msg_get_wire_size(msg); |
| 5357 | size_t buffer_len = channel_msg_buffer_size_get(wire_size, mtu); |
| 5358 | uint8_t* buffer = MSG_BUFF_ALLOC_OR_DIE(buffer_len, |
| 5359 | "error allocating memory size %zu for sending broadcast message" , |
| 5360 | buffer_len); |
| 5361 | |
| 5362 | as_hb_channel_buffer_udata udata; |
| 5363 | udata.buffer = buffer; |
| 5364 | |
| 5365 | // Note this is the length of buffer to send. |
| 5366 | udata.buffer_len = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, |
| 5367 | buffer_len); |
| 5368 | |
| 5369 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
| 5370 | channel_msg_broadcast_reduce, &udata); |
| 5371 | |
| 5372 | MSG_BUFF_FREE(buffer, buffer_len); |
| 5373 | CHANNEL_UNLOCK(); |
| 5374 | return rv; |
| 5375 | } |
| 5376 | |
| 5377 | /** |
| 5378 | * Clear all channel state. |
| 5379 | */ |
| 5380 | static void |
| 5381 | channel_clear() |
| 5382 | { |
| 5383 | if (!channel_is_stopped()) { |
| 5384 | WARNING("attempted channel clear without stopping the channel" ); |
| 5385 | return; |
| 5386 | } |
| 5387 | |
| 5388 | CHANNEL_LOCK(); |
| 5389 | |
| 5390 | // Free the unpublished event queue. |
| 5391 | cf_queue_delete_all(&g_hb.channel_state.events_queue); |
| 5392 | |
| 5393 | // Delete nodeid to socket hash. |
| 5394 | cf_shash_reduce(g_hb.channel_state.nodeid_to_socket, hb_delete_all_reduce, |
| 5395 | NULL); |
| 5396 | |
| 5397 | // Delete the socket_to_channel hash. |
| 5398 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, hb_delete_all_reduce, |
| 5399 | NULL); |
| 5400 | |
| 5401 | DETAIL("cleared channel information" ); |
| 5402 | CHANNEL_UNLOCK(); |
| 5403 | } |
| 5404 | |
| 5405 | /** |
| 5406 | * Reduce function to dump channel node info to log file. |
| 5407 | */ |
| 5408 | static int |
| 5409 | channel_dump_reduce(const void* key, void* data, void* udata) |
| 5410 | { |
| 5411 | cf_socket** socket = (cf_socket**)key; |
| 5412 | as_hb_channel* channel = (as_hb_channel*)data; |
| 5413 | |
| 5414 | INFO("\tHB Channel (%s): node-id %" PRIx64 " fd %d endpoint %s polarity %s last-received %" PRIu64, |
| 5415 | channel->is_multicast ? "multicast" : "mesh" , channel->nodeid, |
| 5416 | CSFD(*socket), (cf_sock_addr_is_any(&channel->endpoint_addr)) |
| 5417 | ? "unknown" |
| 5418 | : cf_sock_addr_print(&channel->endpoint_addr), |
| 5419 | channel->is_inbound ? "inbound" : "outbound" , |
| 5420 | channel->last_received); |
| 5421 | |
| 5422 | return CF_SHASH_OK; |
| 5423 | } |
| 5424 | |
| 5425 | /** |
| 5426 | * Dump channel state to logs. |
| 5427 | * @param verbose enables / disables verbose logging. |
| 5428 | */ |
| 5429 | static void |
| 5430 | channel_dump(bool verbose) |
| 5431 | { |
| 5432 | CHANNEL_LOCK(); |
| 5433 | |
| 5434 | INFO("HB Channel Count %d" , |
| 5435 | cf_shash_get_size(g_hb.channel_state.socket_to_channel)); |
| 5436 | |
| 5437 | if (verbose) { |
| 5438 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
| 5439 | channel_dump_reduce, NULL); |
| 5440 | } |
| 5441 | |
| 5442 | CHANNEL_UNLOCK(); |
| 5443 | } |
| 5444 | |
| 5445 | /* |
| 5446 | * ---------------------------------------------------------------------------- |
| 5447 | * Mesh sub module. |
| 5448 | * ---------------------------------------------------------------------------- |
| 5449 | */ |
| 5450 | |
| 5451 | /** |
| 5452 | * Is mesh running. |
| 5453 | */ |
| 5454 | static bool |
| 5455 | mesh_is_running() |
| 5456 | { |
| 5457 | MESH_LOCK(); |
| 5458 | bool retval = |
| 5459 | (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_RUNNING) ? |
| 5460 | true : false; |
| 5461 | MESH_UNLOCK(); |
| 5462 | return retval; |
| 5463 | } |
| 5464 | |
| 5465 | /** |
| 5466 | * Is mesh stopped. |
| 5467 | */ |
| 5468 | static bool |
| 5469 | mesh_is_stopped() |
| 5470 | { |
| 5471 | MESH_LOCK(); |
| 5472 | bool retval = |
| 5473 | (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_STOPPED) ? |
| 5474 | true : false; |
| 5475 | MESH_UNLOCK(); |
| 5476 | return retval; |
| 5477 | } |
| 5478 | |
| 5479 | /** |
| 5480 | * Refresh the mesh published endpoint list. |
| 5481 | * @return 0 on successful list creation, -1 otherwise. |
| 5482 | */ |
| 5483 | static int |
| 5484 | mesh_published_endpoint_list_refresh() |
| 5485 | { |
| 5486 | int rv = -1; |
| 5487 | MESH_LOCK(); |
| 5488 | |
| 5489 | // TODO: Add interface addresses change detection logic here as well. |
| 5490 | if (g_hb.mode_state.mesh_state.published_endpoint_list != NULL |
| 5491 | && g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only |
| 5492 | == cf_ip_addr_legacy_only()) { |
| 5493 | rv = 0; |
| 5494 | goto Exit; |
| 5495 | } |
| 5496 | |
| 5497 | // The global flag has changed, refresh the published address list. |
| 5498 | if (g_hb.mode_state.mesh_state.published_endpoint_list) { |
| 5499 | // Free the obsolete list. |
| 5500 | cf_free(g_hb.mode_state.mesh_state.published_endpoint_list); |
| 5501 | } |
| 5502 | |
| 5503 | const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); |
| 5504 | cf_serv_cfg published_cfg; |
| 5505 | |
| 5506 | config_bind_serv_cfg_expand(bind_cfg, &published_cfg, |
| 5507 | g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only); |
| 5508 | |
| 5509 | g_hb.mode_state.mesh_state.published_endpoint_list = |
| 5510 | as_endpoint_list_from_serv_cfg(&published_cfg); |
| 5511 | |
| 5512 | if (!g_hb.mode_state.mesh_state.published_endpoint_list) { |
| 5513 | CRASH("error initializing mesh published address list" ); |
| 5514 | } |
| 5515 | |
| 5516 | g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only = |
| 5517 | cf_ip_addr_legacy_only(); |
| 5518 | |
| 5519 | rv = 0; |
| 5520 | |
| 5521 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 5522 | as_endpoint_list_to_string( |
| 5523 | g_hb.mode_state.mesh_state.published_endpoint_list, |
| 5524 | endpoint_list_str, sizeof(endpoint_list_str)); |
| 5525 | INFO("updated heartbeat published address list to {%s}" , endpoint_list_str); |
| 5526 | |
| 5527 | Exit: |
| 5528 | MESH_UNLOCK(); |
| 5529 | return rv; |
| 5530 | } |
| 5531 | |
| 5532 | /** |
| 5533 | * Read the published endpoint list via a callback. The call back pattern is to |
| 5534 | * prevent access to the published list outside the mesh lock. |
| 5535 | * @param process_fn the list process function. The list passed to the process |
| 5536 | * function can be NULL. |
| 5537 | * @param udata passed as is to the process function. |
| 5538 | */ |
| 5539 | static void |
| 5540 | mesh_published_endpoints_process(endpoint_list_process_fn process_fn, |
| 5541 | void* udata) |
| 5542 | { |
| 5543 | MESH_LOCK(); |
| 5544 | |
| 5545 | as_endpoint_list* rv = NULL; |
| 5546 | if (mesh_published_endpoint_list_refresh()) { |
| 5547 | WARNING("error creating mesh published endpoint list" ); |
| 5548 | rv = NULL; |
| 5549 | } |
| 5550 | else { |
| 5551 | rv = g_hb.mode_state.mesh_state.published_endpoint_list; |
| 5552 | } |
| 5553 | |
| 5554 | (process_fn)(rv, udata); |
| 5555 | |
| 5556 | MESH_UNLOCK(); |
| 5557 | } |
| 5558 | |
| 5559 | /** |
| 5560 | * Convert mesh status to a string. |
| 5561 | */ |
| 5562 | static const char* |
| 5563 | mesh_node_status_string(as_hb_mesh_node_status status) |
| 5564 | { |
| 5565 | static char* status_str[] = { |
| 5566 | "active" , |
| 5567 | "pending" , |
| 5568 | "inactive" , |
| 5569 | "endpoint-unknown" }; |
| 5570 | |
| 5571 | if (status >= AS_HB_MESH_NODE_STATUS_SENTINEL) { |
| 5572 | return "corrupted" ; |
| 5573 | } |
| 5574 | return status_str[status]; |
| 5575 | } |
| 5576 | |
| 5577 | /** |
| 5578 | * Change the state of a mesh node. Note: memset the mesh_nodes to zero before |
| 5579 | * calling state change for the first time. |
| 5580 | */ |
| 5581 | static void |
| 5582 | mesh_seed_status_change(as_hb_mesh_seed* seed, |
| 5583 | as_hb_mesh_node_status new_status) |
| 5584 | { |
| 5585 | seed->status = new_status; |
| 5586 | seed->last_status_updated = cf_getms(); |
| 5587 | } |
| 5588 | |
| 5589 | /** |
| 5590 | * Destroy a mesh seed node. |
| 5591 | */ |
| 5592 | static void |
| 5593 | mesh_seed_destroy(as_hb_mesh_seed* seed) |
| 5594 | { |
| 5595 | MESH_LOCK(); |
| 5596 | if (seed->resolved_endpoint_list) { |
| 5597 | cf_free(seed->resolved_endpoint_list); |
| 5598 | seed->resolved_endpoint_list = NULL; |
| 5599 | } |
| 5600 | MESH_UNLOCK(); |
| 5601 | } |
| 5602 | |
| 5603 | static void |
| 5604 | mesh_seed_dns_resolve_cb(bool is_resolved, const char* hostname, |
| 5605 | const cf_ip_addr *addrs, uint32_t n_addrs, void *udata) |
| 5606 | { |
| 5607 | MESH_LOCK(); |
| 5608 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 5609 | int element_count = cf_vector_size(seeds); |
| 5610 | for (int i = 0; i < element_count; i++) { |
| 5611 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 5612 | |
| 5613 | if ((strncmp(seed->seed_host_name, hostname, |
| 5614 | sizeof(seed->seed_host_name)) != 0) |
| 5615 | || seed->resolved_endpoint_list != NULL) { |
| 5616 | continue; |
| 5617 | } |
| 5618 | |
| 5619 | cf_serv_cfg temp_serv_cfg; |
| 5620 | cf_serv_cfg_init(&temp_serv_cfg); |
| 5621 | |
| 5622 | cf_sock_cfg sock_cfg; |
| 5623 | cf_sock_cfg_init(&sock_cfg, |
| 5624 | seed->seed_tls ? |
| 5625 | CF_SOCK_OWNER_HEARTBEAT_TLS : CF_SOCK_OWNER_HEARTBEAT); |
| 5626 | sock_cfg.port = seed->seed_port; |
| 5627 | |
| 5628 | for (int i = 0; i < n_addrs; i++) { |
| 5629 | cf_ip_addr_copy(&addrs[i], &sock_cfg.addr); |
| 5630 | if (cf_serv_cfg_add_sock_cfg(&temp_serv_cfg, &sock_cfg)) { |
| 5631 | CRASH("error initializing resolved address list" ); |
| 5632 | } |
| 5633 | |
| 5634 | DETAIL("resolved mesh node hostname %s to %s" , seed->seed_host_name, |
| 5635 | cf_ip_addr_print(&addrs[i])); |
| 5636 | } |
| 5637 | |
| 5638 | seed->resolved_endpoint_list = as_endpoint_list_from_serv_cfg( |
| 5639 | &temp_serv_cfg); |
| 5640 | } |
| 5641 | |
| 5642 | MESH_UNLOCK(); |
| 5643 | } |
| 5644 | |
| 5645 | /** |
| 5646 | * Fill the endpoint list for a mesh seed using the mesh seed hostname and port. |
| 5647 | * returns the |
| 5648 | * @param mesh_node the mesh node |
| 5649 | * @return 0 on success. -1 if a valid endpoint list does not exist and it could |
| 5650 | * not be generated. |
| 5651 | */ |
| 5652 | static int |
| 5653 | mesh_seed_endpoint_list_fill(as_hb_mesh_seed* seed) |
| 5654 | { |
| 5655 | if (seed->resolved_endpoint_list != NULL |
| 5656 | && seed->resolved_endpoint_list->n_endpoints > 0) { |
| 5657 | // A valid endpoint list already exists. For now we resolve only once. |
| 5658 | return 0; |
| 5659 | } |
| 5660 | |
| 5661 | cf_clock now = cf_getms(); |
| 5662 | if (now |
| 5663 | < seed->resolved_endpoint_list_ts |
| 5664 | + MESH_SEED_RESOLVE_ATTEMPT_INTERVAL()) { |
| 5665 | // We have just resolved this seed entry unsuccessfully. Don't try again |
| 5666 | // for sometime. |
| 5667 | return -1; |
| 5668 | } |
| 5669 | |
| 5670 | // Resolve and get all IPv4/IPv6 ip addresses asynchronously. |
| 5671 | seed->resolved_endpoint_list_ts = now; |
| 5672 | cf_ip_addr_from_string_multi_a(seed->seed_host_name, |
| 5673 | mesh_seed_dns_resolve_cb, NULL); |
| 5674 | return -1; |
| 5675 | } |
| 5676 | |
| 5677 | /** |
| 5678 | * Find a mesh seed in the seed list that has an overlapping endpoint and return |
| 5679 | * an internal pointer. Assumes this function is called within mesh lock to |
| 5680 | * prevent invalidating the returned index after function return. |
| 5681 | * |
| 5682 | * @param endpoint_list the endpoint list to find the endpoint by. |
| 5683 | * @return index to matching seed entry if found, else -1 |
| 5684 | */ |
| 5685 | static int |
| 5686 | mesh_seed_endpoint_list_overlapping_find_unsafe(as_endpoint_list* endpoint_list) |
| 5687 | { |
| 5688 | MESH_LOCK(); |
| 5689 | |
| 5690 | int match_index = -1; |
| 5691 | if (!endpoint_list) { |
| 5692 | // Null / empty endpoint list. |
| 5693 | goto Exit; |
| 5694 | } |
| 5695 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 5696 | int element_count = cf_vector_size(seeds); |
| 5697 | for (int i = 0; i < element_count; i++) { |
| 5698 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 5699 | |
| 5700 | // Ensure the seed hostname is resolved. |
| 5701 | mesh_seed_endpoint_list_fill(seed); |
| 5702 | |
| 5703 | if (as_endpoint_lists_are_overlapping(endpoint_list, |
| 5704 | seed->resolved_endpoint_list, true)) { |
| 5705 | match_index = i; |
| 5706 | break; |
| 5707 | } |
| 5708 | } |
| 5709 | |
| 5710 | Exit: |
| 5711 | MESH_UNLOCK(); |
| 5712 | return match_index; |
| 5713 | } |
| 5714 | |
| 5715 | /** |
| 5716 | * Remove a seed entry from the seed list. |
| 5717 | * Assumes this function is called within mesh lock to prevent invalidating the |
| 5718 | * used index during a function call. |
| 5719 | * @param seed_index the index of the seed element. |
| 5720 | * @return 0 on success -1 on failure. |
| 5721 | */ |
| 5722 | static int |
| 5723 | mesh_seed_delete_unsafe(int seed_index) |
| 5724 | { |
| 5725 | int rv = -1; |
| 5726 | MESH_LOCK(); |
| 5727 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 5728 | if (seed_index >= 0) { |
| 5729 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, seed_index); |
| 5730 | mesh_seed_destroy(seed); |
| 5731 | rv = cf_vector_delete(seeds, seed_index); |
| 5732 | if (rv == 0) { |
| 5733 | INFO("removed mesh seed host:%s port %d" , seed->seed_host_name, |
| 5734 | seed->seed_port); |
| 5735 | } |
| 5736 | } |
| 5737 | MESH_UNLOCK(); |
| 5738 | return rv; |
| 5739 | } |
| 5740 | |
| 5741 | /** |
| 5742 | * Find a mesh seed in the seed list with exactly matching hostname and port. |
| 5743 | * Assumes this function is called within mesh lock to prevent invalidating the |
| 5744 | * returned index after function return. |
| 5745 | * |
| 5746 | * @param host the seed hostname |
| 5747 | * @param port the seed port |
| 5748 | * @return index to matching seed entry if found, else -1 |
| 5749 | */ |
| 5750 | static int |
| 5751 | mesh_seed_find_unsafe(char* host, int port) |
| 5752 | { |
| 5753 | MESH_LOCK(); |
| 5754 | |
| 5755 | int match_index = -1; |
| 5756 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 5757 | int element_count = cf_vector_size(seeds); |
| 5758 | for (int i = 0; i < element_count; i++) { |
| 5759 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 5760 | if (strncmp(seed->seed_host_name, host, sizeof(seed->seed_host_name)) |
| 5761 | == 0 && seed->seed_port == port) { |
| 5762 | match_index = i; |
| 5763 | break; |
| 5764 | } |
| 5765 | } |
| 5766 | |
| 5767 | MESH_UNLOCK(); |
| 5768 | return match_index; |
| 5769 | } |
| 5770 | |
| 5771 | /** |
| 5772 | * Endure mesh tend udata has enough space for current mesh nodes. |
| 5773 | */ |
| 5774 | static void |
| 5775 | mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, |
| 5776 | int mesh_node_count) |
| 5777 | { |
| 5778 | // Ensure capacity for nodes to connect. |
| 5779 | if (tend_reduce_udata->to_connect_capacity < mesh_node_count) { |
| 5780 | uint32_t alloc_size = round_up_pow2( |
| 5781 | mesh_node_count * sizeof(as_endpoint_list*)); |
| 5782 | int old_capacity = tend_reduce_udata->to_connect_capacity; |
| 5783 | tend_reduce_udata->to_connect_capacity = alloc_size |
| 5784 | / sizeof(as_endpoint_list*); |
| 5785 | tend_reduce_udata->to_connect = cf_realloc( |
| 5786 | tend_reduce_udata->to_connect, alloc_size); |
| 5787 | |
| 5788 | // NULL out newly allocated elements. |
| 5789 | for (int i = old_capacity; i < tend_reduce_udata->to_connect_capacity; |
| 5790 | i++) { |
| 5791 | tend_reduce_udata->to_connect[i] = NULL; |
| 5792 | } |
| 5793 | } |
| 5794 | } |
| 5795 | |
| 5796 | /** |
| 5797 | * Change the state of a mesh node. Note: memset the mesh_nodes to zero before |
| 5798 | * calling state change for the first time. |
| 5799 | */ |
| 5800 | static void |
| 5801 | mesh_node_status_change(as_hb_mesh_node* mesh_node, |
| 5802 | as_hb_mesh_node_status new_status) |
| 5803 | { |
| 5804 | as_hb_mesh_node_status old_status = mesh_node->status; |
| 5805 | mesh_node->status = new_status; |
| 5806 | |
| 5807 | if ((new_status != AS_HB_MESH_NODE_CHANNEL_ACTIVE |
| 5808 | && old_status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) |
| 5809 | || mesh_node->last_status_updated == 0) { |
| 5810 | mesh_node->inactive_since = cf_getms(); |
| 5811 | } |
| 5812 | mesh_node->last_status_updated = cf_getms(); |
| 5813 | return; |
| 5814 | } |
| 5815 | |
| 5816 | /** |
| 5817 | * Close mesh listening sockets. |
| 5818 | */ |
| 5819 | static void |
| 5820 | mesh_listening_sockets_close() |
| 5821 | { |
| 5822 | MESH_LOCK(); |
| 5823 | INFO("closing mesh heartbeat sockets" ); |
| 5824 | cf_sockets_close(&g_hb.mode_state.mesh_state.listening_sockets); |
| 5825 | DEBUG("closed mesh heartbeat sockets" ); |
| 5826 | MESH_UNLOCK(); |
| 5827 | } |
| 5828 | |
| 5829 | /** |
| 5830 | * Populate the buffer with mesh seed list. |
| 5831 | */ |
| 5832 | static void |
| 5833 | mesh_seed_host_list_get(cf_dyn_buf* db, bool tls) |
| 5834 | { |
| 5835 | if (!hb_is_mesh()) { |
| 5836 | return; |
| 5837 | } |
| 5838 | |
| 5839 | MESH_LOCK(); |
| 5840 | |
| 5841 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 5842 | int element_count = cf_vector_size(seeds); |
| 5843 | for (int i = 0; i < element_count; i++) { |
| 5844 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 5845 | const char* info_key = |
| 5846 | seed->seed_tls ? |
| 5847 | "heartbeat.tls-mesh-seed-address-port=" : |
| 5848 | "heartbeat.mesh-seed-address-port=" ; |
| 5849 | |
| 5850 | cf_dyn_buf_append_string(db, info_key); |
| 5851 | cf_dyn_buf_append_string(db, seed->seed_host_name); |
| 5852 | cf_dyn_buf_append_char(db, ':'); |
| 5853 | cf_dyn_buf_append_uint32(db, seed->seed_port); |
| 5854 | cf_dyn_buf_append_char(db, ';'); |
| 5855 | } |
| 5856 | |
| 5857 | MESH_UNLOCK(); |
| 5858 | } |
| 5859 | |
| 5860 | /** |
| 5861 | * Checks if the match between a mesh seed and a mesh node is valid. |
| 5862 | * The matching would be invalid if the mesh node's endpoint has been updated |
| 5863 | * after the match was made or there has been no match. |
| 5864 | */ |
| 5865 | static bool |
| 5866 | mesh_seed_mesh_node_check(as_hb_mesh_seed* seed) |
| 5867 | { |
| 5868 | if (seed->status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { |
| 5869 | return false; |
| 5870 | } |
| 5871 | |
| 5872 | as_hb_mesh_node node; |
| 5873 | if (mesh_node_get(seed->mesh_nodeid, &node) != 0) { |
| 5874 | // The matched node has vanished. |
| 5875 | return false; |
| 5876 | } |
| 5877 | |
| 5878 | return seed->mesh_node_endpoint_change_ts == node.endpoint_change_ts; |
| 5879 | } |
| 5880 | |
| 5881 | /** |
| 5882 | * Refresh the matching between seeds and mesh nodes and get inactive seeds. |
| 5883 | * Should be invoked under a mesh lock to ensure the validity of returned |
| 5884 | * pointers. |
| 5885 | * @param inactive_seeds_p output vector of inactive seed pointers. Can be NULL |
| 5886 | * if inactive nodes need not be returned. |
| 5887 | */ |
| 5888 | static void |
| 5889 | mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p) |
| 5890 | { |
| 5891 | MESH_LOCK(); |
| 5892 | |
| 5893 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 5894 | int element_count = cf_vector_size(seeds); |
| 5895 | if (inactive_seeds_p) { |
| 5896 | cf_vector_clear(inactive_seeds_p); |
| 5897 | } |
| 5898 | |
| 5899 | // Mark seeds that do not have a matching mesh node and transitively do not |
| 5900 | // have a matching channel. |
| 5901 | cf_clock now = cf_getms(); |
| 5902 | for (int i = 0; i < element_count; i++) { |
| 5903 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 5904 | if (mesh_seed_mesh_node_check(seed)) { |
| 5905 | continue; |
| 5906 | } |
| 5907 | |
| 5908 | seed->mesh_nodeid = 0; |
| 5909 | seed->mesh_node_endpoint_change_ts = 0; |
| 5910 | |
| 5911 | // The mesh node is being connected. Skip. |
| 5912 | if (seed->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { |
| 5913 | if (seed->last_status_updated + MESH_PENDING_TIMEOUT > now) { |
| 5914 | // Spare the pending seeds, since we are attempting to connect |
| 5915 | // to the seed host. |
| 5916 | continue; |
| 5917 | } |
| 5918 | |
| 5919 | // Flip to inactive if we have been in pending state for a long |
| 5920 | // time. |
| 5921 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 5922 | } |
| 5923 | |
| 5924 | if (seed->status != AS_HB_MESH_NODE_CHANNEL_PENDING) { |
| 5925 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 5926 | if (inactive_seeds_p) { |
| 5927 | cf_vector_append(inactive_seeds_p, &seed); |
| 5928 | } |
| 5929 | } |
| 5930 | } |
| 5931 | |
| 5932 | MESH_UNLOCK(); |
| 5933 | } |
| 5934 | |
| 5935 | /** |
| 5936 | * Match input seeds to a mesh node using its endpoint address and |
| 5937 | */ |
| 5938 | static void |
| 5939 | mesh_seeds_mesh_node_match_update(cf_vector* inactive_seeds_p, |
| 5940 | as_hb_mesh_node* mesh_node, cf_node mesh_nodeid) |
| 5941 | { |
| 5942 | if (mesh_node->status |
| 5943 | == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN|| mesh_node->endpoint_list == NULL) { |
| 5944 | return; |
| 5945 | } |
| 5946 | |
| 5947 | int element_count = cf_vector_size(inactive_seeds_p); |
| 5948 | for (int i = 0; i < element_count; i++) { |
| 5949 | // No null check required since we are iterating under a lock and within |
| 5950 | // vector bounds. |
| 5951 | as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp( |
| 5952 | inactive_seeds_p, i); |
| 5953 | if (as_endpoint_lists_are_overlapping(seed->resolved_endpoint_list, |
| 5954 | mesh_node->endpoint_list, true)) { |
| 5955 | // We found a matching mesh node for the seed, flip its status to |
| 5956 | // active. |
| 5957 | seed->mesh_nodeid = mesh_nodeid; |
| 5958 | seed->mesh_node_endpoint_change_ts = mesh_node->endpoint_change_ts; |
| 5959 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_ACTIVE); |
| 5960 | DEBUG("seed entry %s:%d connected" , seed->seed_host_name, |
| 5961 | seed->seed_port); |
| 5962 | } |
| 5963 | } |
| 5964 | } |
| 5965 | |
| 5966 | /** |
| 5967 | * Determines if a mesh entry should be connected to or expired and deleted. |
| 5968 | */ |
| 5969 | static int |
| 5970 | mesh_tend_reduce(const void* key, void* data, void* udata) |
| 5971 | { |
| 5972 | MESH_LOCK(); |
| 5973 | |
| 5974 | int rv = CF_SHASH_OK; |
| 5975 | cf_node nodeid = *(cf_node*)key; |
| 5976 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
| 5977 | as_hb_mesh_tend_reduce_udata* tend_reduce_udata = |
| 5978 | (as_hb_mesh_tend_reduce_udata*)udata; |
| 5979 | |
| 5980 | DETAIL("tending mesh node %" PRIx64" with status %s" , nodeid, |
| 5981 | mesh_node_status_string(mesh_node->status)); |
| 5982 | |
| 5983 | mesh_seeds_mesh_node_match_update(tend_reduce_udata->inactive_seeds_p, |
| 5984 | mesh_node, nodeid); |
| 5985 | |
| 5986 | if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) { |
| 5987 | // The mesh node is connected. Skip. |
| 5988 | goto Exit; |
| 5989 | } |
| 5990 | |
| 5991 | cf_clock now = cf_getms(); |
| 5992 | |
| 5993 | if (!mesh_node->endpoint_list) { |
| 5994 | // Will happen if node discover and disconnect happen close together. |
| 5995 | mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); |
| 5996 | } |
| 5997 | |
| 5998 | if (mesh_node->inactive_since + MESH_INACTIVE_TIMEOUT <= now) { |
| 5999 | DEBUG("mesh forgetting node %" PRIx64" because it could not be connected since %" PRIx64, |
| 6000 | nodeid, mesh_node->inactive_since); |
| 6001 | rv = CF_SHASH_REDUCE_DELETE; |
| 6002 | goto Exit; |
| 6003 | } |
| 6004 | |
| 6005 | if (mesh_node->status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { |
| 6006 | if (mesh_node->last_status_updated + MESH_ENDPOINT_UNKNOWN_TIMEOUT |
| 6007 | > now) { |
| 6008 | DEBUG("mesh forgetting node %" PRIx64" ip address/port undiscovered since %" PRIu64, |
| 6009 | nodeid, mesh_node->last_status_updated); |
| 6010 | |
| 6011 | rv = CF_SHASH_REDUCE_DELETE; |
| 6012 | } |
| 6013 | // Skip connecting with a node with unknown endpoint. |
| 6014 | goto Exit; |
| 6015 | } |
| 6016 | |
| 6017 | if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { |
| 6018 | // The mesh node is being connected. Skip. |
| 6019 | if (mesh_node->last_status_updated + MESH_PENDING_TIMEOUT > now) { |
| 6020 | goto Exit; |
| 6021 | } |
| 6022 | |
| 6023 | // Flip to inactive if we have been in pending state for a long time. |
| 6024 | mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 6025 | } |
| 6026 | |
| 6027 | // Channel for this node is inactive. Prompt the channel sub module to |
| 6028 | // connect to this node. |
| 6029 | if (tend_reduce_udata->to_connect_count |
| 6030 | >= tend_reduce_udata->to_connect_capacity) { |
| 6031 | // New nodes found but we are out of capacity. Ultra defensive coding. |
| 6032 | // This will never happen under the locks. |
| 6033 | WARNING("skipping connecting to node %" PRIx64" - not enough memory allocated" , |
| 6034 | nodeid); |
| 6035 | goto Exit; |
| 6036 | } |
| 6037 | |
| 6038 | endpoint_list_copy( |
| 6039 | &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], |
| 6040 | mesh_node->endpoint_list); |
| 6041 | tend_reduce_udata->to_connect_count++; |
| 6042 | |
| 6043 | // Flip status to pending. |
| 6044 | mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_PENDING); |
| 6045 | |
| 6046 | Exit: |
| 6047 | if (rv == CF_SHASH_REDUCE_DELETE) { |
| 6048 | // Clear all internal allocated memory. |
| 6049 | mesh_node_destroy(mesh_node); |
| 6050 | } |
| 6051 | |
| 6052 | MESH_UNLOCK(); |
| 6053 | |
| 6054 | return rv; |
| 6055 | } |
| 6056 | |
| 6057 | /** |
| 6058 | * Add inactive seeds to to_connect array. |
| 6059 | * Should be invoked under mesh lock to prevent invalidating the array of seed |
| 6060 | * node pointers. |
| 6061 | * @param seed_p vector of seed pointers. |
| 6062 | * @param tend reduce udata having the to connect endpoint list. |
| 6063 | */ |
| 6064 | void |
| 6065 | mesh_seeds_inactive_add_to_connect(cf_vector* seeds_p, |
| 6066 | as_hb_mesh_tend_reduce_udata* tend_reduce_udata) |
| 6067 | { |
| 6068 | MESH_LOCK(); |
| 6069 | int element_count = cf_vector_size(seeds_p); |
| 6070 | for (int i = 0; i < element_count; i++) { |
| 6071 | as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(seeds_p, i); |
| 6072 | if (seed->status != AS_HB_MESH_NODE_CHANNEL_INACTIVE) { |
| 6073 | continue; |
| 6074 | } |
| 6075 | |
| 6076 | // Channel for this node is inactive. Prompt the channel sub module to |
| 6077 | // connect to this node. |
| 6078 | if (tend_reduce_udata->to_connect_count |
| 6079 | >= tend_reduce_udata->to_connect_capacity) { |
| 6080 | // New nodes found but we are out of capacity. Ultra defensive |
| 6081 | // coding. |
| 6082 | // This will never happen under the locks. |
| 6083 | WARNING( |
| 6084 | "skipping connecting to %s:%d - not enough memory allocated" , |
| 6085 | seed->seed_host_name, seed->seed_port); |
| 6086 | return; |
| 6087 | } |
| 6088 | |
| 6089 | // Ensure the seed hostname is resolved. |
| 6090 | if (mesh_seed_endpoint_list_fill(seed) != 0) { |
| 6091 | continue; |
| 6092 | } |
| 6093 | |
| 6094 | endpoint_list_copy( |
| 6095 | &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], |
| 6096 | seed->resolved_endpoint_list); |
| 6097 | tend_reduce_udata->to_connect_count++; |
| 6098 | |
| 6099 | // Flip status to pending. |
| 6100 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_PENDING); |
| 6101 | } |
| 6102 | MESH_UNLOCK(); |
| 6103 | } |
| 6104 | |
| 6105 | /** |
| 6106 | * Tends the mesh host list, to discover and remove nodes. Should never invoke a |
| 6107 | * channel call while holding a mesh lock. |
| 6108 | */ |
| 6109 | void* |
| 6110 | mesh_tender(void* arg) |
| 6111 | { |
| 6112 | DETAIL("mesh tender started" ); |
| 6113 | // Figure out which nodes need to be connected to. |
| 6114 | // collect nodes to connect to and remove dead nodes. |
| 6115 | as_hb_mesh_tend_reduce_udata tend_reduce_udata = { NULL, 0, 0 }; |
| 6116 | |
| 6117 | // Vector of pointer to inactive seeds. |
| 6118 | cf_vector inactive_seeds_p; |
| 6119 | cf_vector_init(&inactive_seeds_p, sizeof(as_hb_mesh_seed*), |
| 6120 | AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); |
| 6121 | |
| 6122 | cf_clock last_time = 0; |
| 6123 | |
| 6124 | while (hb_is_mesh() && mesh_is_running()) { |
| 6125 | cf_clock curr_time = cf_getms(); |
| 6126 | |
| 6127 | // Unlocked access but this should be alright Set the discovered flag. |
| 6128 | bool nodes_discovered = g_hb.mode_state.mesh_state.nodes_discovered; |
| 6129 | if ((curr_time - last_time) < MESH_TEND_INTERVAL && !nodes_discovered) { |
| 6130 | // Interval has not been reached for sending heartbeats |
| 6131 | usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time + |
| 6132 | MESH_TEND_INTERVAL) - curr_time) * 1000); |
| 6133 | continue; |
| 6134 | } |
| 6135 | last_time = curr_time; |
| 6136 | |
| 6137 | DETAIL("tending mesh list" ); |
| 6138 | |
| 6139 | MESH_LOCK(); |
| 6140 | // Unset the discovered flag. |
| 6141 | g_hb.mode_state.mesh_state.nodes_discovered = false; |
| 6142 | |
| 6143 | // Update the list of inactive seeds. |
| 6144 | mesh_seed_inactive_refresh_get_unsafe(&inactive_seeds_p); |
| 6145 | |
| 6146 | // Make sure the udata has enough capacity. |
| 6147 | int connect_count_max = cf_shash_get_size( |
| 6148 | g_hb.mode_state.mesh_state.nodeid_to_mesh_node) |
| 6149 | + cf_vector_size(&inactive_seeds_p); |
| 6150 | mesh_tend_udata_capacity_ensure(&tend_reduce_udata, connect_count_max); |
| 6151 | |
| 6152 | tend_reduce_udata.to_connect_count = 0; |
| 6153 | tend_reduce_udata.inactive_seeds_p = &inactive_seeds_p; |
| 6154 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
| 6155 | mesh_tend_reduce, &tend_reduce_udata); |
| 6156 | |
| 6157 | // Add inactive seeds for connection. |
| 6158 | mesh_seeds_inactive_add_to_connect(&inactive_seeds_p, |
| 6159 | &tend_reduce_udata); |
| 6160 | |
| 6161 | MESH_UNLOCK(); |
| 6162 | |
| 6163 | // Connect can be time consuming, especially in failure cases. |
| 6164 | // Connect outside of the mesh lock and prevent hogging the lock. |
| 6165 | if (tend_reduce_udata.to_connect_count > 0) { |
| 6166 | // Try connecting the newer nodes. |
| 6167 | channel_mesh_channel_establish(tend_reduce_udata.to_connect, |
| 6168 | tend_reduce_udata.to_connect_count); |
| 6169 | } |
| 6170 | |
| 6171 | DETAIL("done tending mesh list" ); |
| 6172 | } |
| 6173 | |
| 6174 | if (tend_reduce_udata.to_connect) { |
| 6175 | // Free space allocated for endpoint lists. |
| 6176 | for (int i = 0; i < tend_reduce_udata.to_connect_capacity; i++) { |
| 6177 | if (tend_reduce_udata.to_connect[i]) { |
| 6178 | cf_free(tend_reduce_udata.to_connect[i]); |
| 6179 | } |
| 6180 | } |
| 6181 | cf_free(tend_reduce_udata.to_connect); |
| 6182 | } |
| 6183 | |
| 6184 | cf_vector_destroy(&inactive_seeds_p); |
| 6185 | |
| 6186 | DETAIL("mesh tender shut down" ); |
| 6187 | return NULL; |
| 6188 | } |
| 6189 | |
| 6190 | /** |
| 6191 | * Add or update a mesh node to mesh node list. |
| 6192 | */ |
| 6193 | static void |
| 6194 | mesh_node_add_update(cf_node nodeid, as_hb_mesh_node* mesh_node) |
| 6195 | { |
| 6196 | MESH_LOCK(); |
| 6197 | cf_shash_put(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, |
| 6198 | mesh_node); |
| 6199 | MESH_UNLOCK(); |
| 6200 | } |
| 6201 | |
| 6202 | /** |
| 6203 | * Destroy a mesh node. |
| 6204 | */ |
| 6205 | static void |
| 6206 | mesh_node_destroy(as_hb_mesh_node* mesh_node) |
| 6207 | { |
| 6208 | MESH_LOCK(); |
| 6209 | if (mesh_node->endpoint_list) { |
| 6210 | cf_free(mesh_node->endpoint_list); |
| 6211 | mesh_node->endpoint_list = NULL; |
| 6212 | } |
| 6213 | MESH_UNLOCK(); |
| 6214 | } |
| 6215 | |
| 6216 | /** |
| 6217 | * Endpoint list iterate function find endpoint matching sock addr. |
| 6218 | */ |
| 6219 | static void |
| 6220 | mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata) |
| 6221 | { |
| 6222 | cf_sock_addr endpoint_addr; |
| 6223 | if (as_endpoint_to_sock_addr(endpoint, &endpoint_addr) != 0) { |
| 6224 | return; |
| 6225 | } |
| 6226 | |
| 6227 | as_hb_endpoint_list_addr_find_udata* endpoint_reduce_udata = |
| 6228 | (as_hb_endpoint_list_addr_find_udata*)udata; |
| 6229 | |
| 6230 | if (cf_sock_addr_compare(&endpoint_addr, endpoint_reduce_udata->to_search) |
| 6231 | == 0) { |
| 6232 | endpoint_reduce_udata->found = true; |
| 6233 | } |
| 6234 | } |
| 6235 | |
| 6236 | /** |
| 6237 | * Indicates if a give node is discovered. |
| 6238 | * @param nodeid the input nodeid. |
| 6239 | * @return true if discovered, false otherwise. |
| 6240 | */ |
| 6241 | static bool |
| 6242 | mesh_node_is_discovered(cf_node nodeid) |
| 6243 | { |
| 6244 | if (nodeid == config_self_nodeid_get()) { |
| 6245 | // Assume this node knows itself. |
| 6246 | return true; |
| 6247 | } |
| 6248 | |
| 6249 | as_hb_mesh_node mesh_node; |
| 6250 | return mesh_node_get(nodeid, &mesh_node) == 0; |
| 6251 | } |
| 6252 | |
| 6253 | /** |
| 6254 | * Indicates if a give node has a valid endpoint list. |
| 6255 | * @param nodeid the input nodeid. |
| 6256 | * @return true if node has valid endpoint list, false otherwise. |
| 6257 | */ |
| 6258 | static bool |
| 6259 | mesh_node_endpoint_list_is_valid(cf_node nodeid) |
| 6260 | { |
| 6261 | if (nodeid == config_self_nodeid_get()) { |
| 6262 | // Assume this node knows itself. |
| 6263 | return true; |
| 6264 | } |
| 6265 | |
| 6266 | as_hb_mesh_node mesh_node; |
| 6267 | return mesh_node_get(nodeid, &mesh_node) == 0 |
| 6268 | && mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN |
| 6269 | && mesh_node.endpoint_list; |
| 6270 | } |
| 6271 | |
| 6272 | /** |
| 6273 | * Get the mesh node associated with this node. |
| 6274 | * @param nodeid the nodeid to search for. |
| 6275 | * @param is_real_nodeid indicates if the query is for a real or fake nodeid. |
| 6276 | * @param mesh_node the output mesh node. |
| 6277 | * @return 0 on success -1 if there is mesh node attached. |
| 6278 | */ |
| 6279 | static int |
| 6280 | mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node) |
| 6281 | { |
| 6282 | int rv = -1; |
| 6283 | |
| 6284 | MESH_LOCK(); |
| 6285 | if (cf_shash_get(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, |
| 6286 | mesh_node) == CF_SHASH_OK) { |
| 6287 | rv = 0; |
| 6288 | } |
| 6289 | else { |
| 6290 | // The node not found. |
| 6291 | rv = -1; |
| 6292 | } |
| 6293 | MESH_UNLOCK(); |
| 6294 | return rv; |
| 6295 | } |
| 6296 | |
| 6297 | /** |
| 6298 | * Handle the event when the channel reports a node as disconnected. |
| 6299 | */ |
| 6300 | static void |
| 6301 | mesh_channel_on_node_disconnect(as_hb_channel_event* event) |
| 6302 | { |
| 6303 | MESH_LOCK(); |
| 6304 | |
| 6305 | as_hb_mesh_node mesh_node; |
| 6306 | if (mesh_node_get(event->nodeid, &mesh_node) != 0) { |
| 6307 | // Again should not happen in practice. But not really bad. |
| 6308 | DEBUG("unknown mesh node disconnected %" PRIx64, event->nodeid); |
| 6309 | goto Exit; |
| 6310 | } |
| 6311 | |
| 6312 | DEBUG("mesh setting node %" PRIx64" status as inactive on loss of channel" , |
| 6313 | event->nodeid); |
| 6314 | |
| 6315 | // Mark this node inactive and move on. Mesh tender should remove this node |
| 6316 | // after it has been inactive for a while. |
| 6317 | mesh_node_status_change(&mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 6318 | |
| 6319 | // Update the mesh entry. |
| 6320 | mesh_node_add_update(event->nodeid, &mesh_node); |
| 6321 | |
| 6322 | Exit: |
| 6323 | MESH_UNLOCK(); |
| 6324 | } |
| 6325 | |
| 6326 | /** |
| 6327 | * Check and fix the case where we received a self incoming message probably |
| 6328 | * because one of our non loop back interfaces was used as a seed address. |
| 6329 | * |
| 6330 | * @return true if this message is a self message, false otherwise. |
| 6331 | */ |
| 6332 | static bool |
| 6333 | mesh_node_check_fix_self_msg(as_hb_channel_event* event) |
| 6334 | { |
| 6335 | if (event->nodeid == config_self_nodeid_get()) { |
| 6336 | // Handle self message. Will happen if the seed node address on this |
| 6337 | // node does not match the listen / publish address. |
| 6338 | as_endpoint_list* msg_endpoint_list; |
| 6339 | msg_endpoint_list_get(event->msg, &msg_endpoint_list); |
| 6340 | |
| 6341 | MESH_LOCK(); |
| 6342 | |
| 6343 | // Check if this node has published an endpoint list matching self node. |
| 6344 | endpoint_list_equal_check_udata udata = { 0 }; |
| 6345 | udata.are_equal = false; |
| 6346 | udata.other = msg_endpoint_list; |
| 6347 | mesh_published_endpoints_process(endpoint_list_equal_process, &udata); |
| 6348 | |
| 6349 | if (udata.are_equal) { |
| 6350 | // Definitely pulse message from self node. |
| 6351 | int self_seed_index = |
| 6352 | mesh_seed_endpoint_list_overlapping_find_unsafe( |
| 6353 | msg_endpoint_list); |
| 6354 | if (self_seed_index >= 0) { |
| 6355 | as_hb_mesh_seed* self_seed = cf_vector_getp( |
| 6356 | &g_hb.mode_state.mesh_state.seeds, self_seed_index); |
| 6357 | INFO("removing self seed entry host:%s port:%d" , |
| 6358 | self_seed->seed_host_name, self_seed->seed_port); |
| 6359 | as_hb_mesh_tip_clear(self_seed->seed_host_name, |
| 6360 | self_seed->seed_port); |
| 6361 | } |
| 6362 | } |
| 6363 | MESH_UNLOCK(); |
| 6364 | return true; |
| 6365 | } |
| 6366 | return false; |
| 6367 | } |
| 6368 | |
| 6369 | /** |
| 6370 | * Update mesh node status based on an incoming message. |
| 6371 | */ |
| 6372 | static void |
| 6373 | mesh_node_data_update(as_hb_channel_event* event) |
| 6374 | { |
| 6375 | if (mesh_node_check_fix_self_msg(event)) { |
| 6376 | // Message from self, can be ignored. |
| 6377 | return; |
| 6378 | } |
| 6379 | |
| 6380 | MESH_LOCK(); |
| 6381 | as_hb_mesh_node existing_mesh_node = { 0 }; |
| 6382 | as_endpoint_list* msg_endpoint_list = NULL; |
| 6383 | msg_endpoint_list_get(event->msg, &msg_endpoint_list); |
| 6384 | |
| 6385 | // Search for existing entry. |
| 6386 | bool needs_update = mesh_node_get(event->nodeid, &existing_mesh_node) != 0; |
| 6387 | |
| 6388 | // Update the endpoint list to be the message endpoint list if the seed ip |
| 6389 | // list and the published ip list differ |
| 6390 | if (!as_endpoint_lists_are_equal(existing_mesh_node.endpoint_list, |
| 6391 | msg_endpoint_list)) { |
| 6392 | char endpoint_list_str1[ENDPOINT_LIST_STR_SIZE]; |
| 6393 | endpoint_list_str1[0] = 0; |
| 6394 | |
| 6395 | as_endpoint_list_to_string(existing_mesh_node.endpoint_list, |
| 6396 | endpoint_list_str1, sizeof(endpoint_list_str1)); |
| 6397 | |
| 6398 | char endpoint_list_str2[ENDPOINT_LIST_STR_SIZE]; |
| 6399 | as_endpoint_list_to_string(msg_endpoint_list, endpoint_list_str2, |
| 6400 | sizeof(endpoint_list_str2)); |
| 6401 | |
| 6402 | if (existing_mesh_node.endpoint_list) { |
| 6403 | INFO("for node %" PRIx64" updating mesh endpoint address from {%s} to {%s}" ,event->nodeid, |
| 6404 | endpoint_list_str1, endpoint_list_str2); |
| 6405 | } |
| 6406 | |
| 6407 | // Update the endpoints. |
| 6408 | endpoint_list_copy(&existing_mesh_node.endpoint_list, |
| 6409 | msg_endpoint_list); |
| 6410 | existing_mesh_node.endpoint_change_ts = as_hlc_timestamp_now(); |
| 6411 | |
| 6412 | needs_update = true; |
| 6413 | } |
| 6414 | |
| 6415 | if (existing_mesh_node.status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { |
| 6416 | // Update status to active. |
| 6417 | mesh_node_status_change(&existing_mesh_node, |
| 6418 | AS_HB_MESH_NODE_CHANNEL_ACTIVE); |
| 6419 | needs_update = true; |
| 6420 | } |
| 6421 | |
| 6422 | if (needs_update) { |
| 6423 | // Apply the update. |
| 6424 | mesh_node_add_update(event->nodeid, &existing_mesh_node); |
| 6425 | } |
| 6426 | |
| 6427 | MESH_UNLOCK(); |
| 6428 | } |
| 6429 | |
| 6430 | /** |
| 6431 | * Return the in memory and on wire size of an info reply array. |
| 6432 | * @param reply the info reply. |
| 6433 | * @param reply_count the number of replies. |
| 6434 | * @param reply_size the wire size of the message. |
| 6435 | * @return 0 on successful reply count computation, -1 otherwise, |
| 6436 | */ |
| 6437 | static int |
| 6438 | mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, |
| 6439 | size_t* reply_size) |
| 6440 | { |
| 6441 | // Go over reply and compute the count of replies and also validate the |
| 6442 | // endpoint lists. |
| 6443 | uint8_t* start_ptr = (uint8_t*)reply; |
| 6444 | *reply_size = 0; |
| 6445 | |
| 6446 | for (int i = 0; i < reply_count; i++) { |
| 6447 | as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; |
| 6448 | *reply_size += sizeof(as_hb_mesh_info_reply); |
| 6449 | start_ptr += sizeof(as_hb_mesh_info_reply); |
| 6450 | |
| 6451 | size_t endpoint_list_size = 0; |
| 6452 | if (as_endpoint_list_sizeof(&reply_ptr->endpoint_list[0], |
| 6453 | &endpoint_list_size)) { |
| 6454 | // Incomplete / garbled info reply message. |
| 6455 | *reply_size = 0; |
| 6456 | return -1; |
| 6457 | } |
| 6458 | |
| 6459 | *reply_size += endpoint_list_size; |
| 6460 | start_ptr += endpoint_list_size; |
| 6461 | } |
| 6462 | |
| 6463 | return 0; |
| 6464 | } |
| 6465 | |
| 6466 | /** |
| 6467 | * Send a info reply in reply to an info request. |
| 6468 | * @param dest the destination node to send the info reply to. |
| 6469 | * @param reply array of node ids and endpoints |
| 6470 | * @param reply_count the count of replies. |
| 6471 | */ |
| 6472 | static void |
| 6473 | mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, |
| 6474 | size_t reply_count) |
| 6475 | { |
| 6476 | // Create the discover message. |
| 6477 | msg* msg = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REPLY); |
| 6478 | |
| 6479 | // Set the reply. |
| 6480 | msg_info_reply_set(msg, reply, reply_count); |
| 6481 | |
| 6482 | DEBUG("sending info reply to node %" PRIx64, dest); |
| 6483 | |
| 6484 | // Send the info reply. |
| 6485 | if (channel_msg_unicast(dest, msg) != 0) { |
| 6486 | TICKER_WARNING("error sending info reply message to node %" PRIx64, |
| 6487 | dest); |
| 6488 | } |
| 6489 | |
| 6490 | hb_msg_return(msg); |
| 6491 | } |
| 6492 | |
| 6493 | /** |
| 6494 | * Initialize the info request msg buffer |
| 6495 | */ |
| 6496 | static msg* |
| 6497 | mesh_info_msg_init(as_hb_msg_type msg_type) |
| 6498 | { |
| 6499 | msg* msg = hb_msg_get(); |
| 6500 | msg_src_fields_fill(msg); |
| 6501 | msg_type_set(msg, msg_type); |
| 6502 | return msg; |
| 6503 | } |
| 6504 | |
| 6505 | /** |
| 6506 | * Send a info request for all undiscovered nodes. |
| 6507 | * @param dest the destination node to send the discover message to. |
| 6508 | * @param to_discover array of node ids to discover. |
| 6509 | * @param to_discover_count the count of nodes in the array. |
| 6510 | */ |
| 6511 | static void |
| 6512 | mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, |
| 6513 | size_t to_discover_count) |
| 6514 | { |
| 6515 | // Create the discover message. |
| 6516 | msg* info_req = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REQUEST); |
| 6517 | |
| 6518 | // Set the list of nodes to discover. |
| 6519 | msg_node_list_set(info_req, AS_HB_MSG_INFO_REQUEST, to_discover, |
| 6520 | to_discover_count); |
| 6521 | |
| 6522 | DEBUG("sending info request to node %" PRIx64, dest); |
| 6523 | |
| 6524 | // Send the info request. |
| 6525 | if (channel_msg_unicast(dest, info_req) != 0) { |
| 6526 | TICKER_WARNING("error sending info request message to node %" PRIx64, |
| 6527 | dest); |
| 6528 | } |
| 6529 | hb_msg_return(info_req); |
| 6530 | } |
| 6531 | |
| 6532 | /** |
| 6533 | * Handle an incoming pulse message to discover new neighbours. |
| 6534 | */ |
| 6535 | static void |
| 6536 | mesh_channel_on_pulse(msg* msg) |
| 6537 | { |
| 6538 | cf_node* adj_list; |
| 6539 | size_t adj_length; |
| 6540 | |
| 6541 | cf_node source; |
| 6542 | |
| 6543 | // Channel has validated the source. Don't bother checking here. |
| 6544 | msg_nodeid_get(msg, &source); |
| 6545 | if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) { |
| 6546 | // Adjacency list absent. |
| 6547 | WARNING("received message from %" PRIx64" without adjacency list" , |
| 6548 | source); |
| 6549 | return; |
| 6550 | } |
| 6551 | |
| 6552 | cf_node to_discover[adj_length]; |
| 6553 | size_t num_to_discover = 0; |
| 6554 | |
| 6555 | // TODO: Track already queried nodes so that we do not retry immediately. |
| 6556 | // Will need a separate state, pending query. |
| 6557 | MESH_LOCK(); |
| 6558 | |
| 6559 | // Try and discover new nodes from this message's adjacency list. |
| 6560 | for (int i = 0; i < adj_length; i++) { |
| 6561 | if (!mesh_node_is_discovered(adj_list[i])) { |
| 6562 | DEBUG("discovered new mesh node %" PRIx64, adj_list[i]); |
| 6563 | |
| 6564 | as_hb_mesh_node new_node; |
| 6565 | memset(&new_node, 0, sizeof(new_node)); |
| 6566 | mesh_node_status_change(&new_node, |
| 6567 | AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); |
| 6568 | |
| 6569 | // Add as a new node |
| 6570 | mesh_node_add_update(adj_list[i], &new_node); |
| 6571 | } |
| 6572 | |
| 6573 | if (!mesh_node_endpoint_list_is_valid(adj_list[i])) { |
| 6574 | to_discover[num_to_discover++] = adj_list[i]; |
| 6575 | } |
| 6576 | } |
| 6577 | |
| 6578 | MESH_UNLOCK(); |
| 6579 | |
| 6580 | // Discover these nodes outside a lock. |
| 6581 | if (num_to_discover) { |
| 6582 | mesh_nodes_send_info_request(msg, source, to_discover, num_to_discover); |
| 6583 | } |
| 6584 | } |
| 6585 | |
| 6586 | /** |
| 6587 | * Handle an incoming info message. |
| 6588 | */ |
| 6589 | static void |
| 6590 | mesh_channel_on_info_request(msg* msg) |
| 6591 | { |
| 6592 | cf_node* query_nodeids; |
| 6593 | size_t query_count; |
| 6594 | |
| 6595 | cf_node source; |
| 6596 | msg_nodeid_get(msg, &source); |
| 6597 | |
| 6598 | if (msg_node_list_get(msg, AS_HB_MSG_INFO_REQUEST, &query_nodeids, |
| 6599 | &query_count) != 0) { |
| 6600 | TICKER_WARNING("got an info request without query nodes from %" PRIx64, |
| 6601 | source); |
| 6602 | return; |
| 6603 | } |
| 6604 | |
| 6605 | MESH_LOCK(); |
| 6606 | |
| 6607 | // Compute the entire response size. |
| 6608 | size_t reply_size = 0; |
| 6609 | |
| 6610 | for (int i = 0; i < query_count; i++) { |
| 6611 | as_hb_mesh_node mesh_node; |
| 6612 | |
| 6613 | if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) { |
| 6614 | if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN |
| 6615 | && mesh_node.endpoint_list) { |
| 6616 | size_t endpoint_list_size = 0; |
| 6617 | as_endpoint_list_sizeof(mesh_node.endpoint_list, |
| 6618 | &endpoint_list_size); |
| 6619 | reply_size += sizeof(as_hb_mesh_info_reply) |
| 6620 | + endpoint_list_size; |
| 6621 | } |
| 6622 | } |
| 6623 | } |
| 6624 | |
| 6625 | as_hb_mesh_info_reply* replies = alloca(reply_size); |
| 6626 | uint8_t* reply_ptr = (uint8_t*)replies; |
| 6627 | size_t reply_count = 0; |
| 6628 | |
| 6629 | DEBUG("received info request from node : %" PRIx64, source); |
| 6630 | DEBUG("preparing a reply for %zu requests" , query_count); |
| 6631 | |
| 6632 | for (int i = 0; i < query_count; i++) { |
| 6633 | as_hb_mesh_node mesh_node; |
| 6634 | |
| 6635 | DEBUG("mesh received info request for node %" PRIx64, query_nodeids[i]); |
| 6636 | |
| 6637 | if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) { |
| 6638 | if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN |
| 6639 | && mesh_node.endpoint_list) { |
| 6640 | as_hb_mesh_info_reply* reply = (as_hb_mesh_info_reply*)reply_ptr; |
| 6641 | |
| 6642 | reply->nodeid = query_nodeids[i]; |
| 6643 | |
| 6644 | size_t endpoint_list_size = 0; |
| 6645 | as_endpoint_list_sizeof(mesh_node.endpoint_list, |
| 6646 | &endpoint_list_size); |
| 6647 | |
| 6648 | memcpy(&reply->endpoint_list[0], mesh_node.endpoint_list, |
| 6649 | endpoint_list_size); |
| 6650 | |
| 6651 | reply_ptr += sizeof(as_hb_mesh_info_reply) + endpoint_list_size; |
| 6652 | |
| 6653 | reply_count++; |
| 6654 | } |
| 6655 | } |
| 6656 | } |
| 6657 | |
| 6658 | MESH_UNLOCK(); |
| 6659 | |
| 6660 | // Send the reply |
| 6661 | if (reply_count > 0) { |
| 6662 | mesh_nodes_send_info_reply(source, replies, reply_count); |
| 6663 | } |
| 6664 | } |
| 6665 | |
| 6666 | /** |
| 6667 | * Handle an incoming info reply. |
| 6668 | */ |
| 6669 | static void |
| 6670 | mesh_channel_on_info_reply(msg* msg) |
| 6671 | { |
| 6672 | as_hb_mesh_info_reply* reply = NULL; |
| 6673 | size_t reply_count = 0; |
| 6674 | cf_node source = 0; |
| 6675 | msg_nodeid_get(msg, &source); |
| 6676 | if (msg_info_reply_get(msg, &reply, &reply_count) != 0 |
| 6677 | || reply_count == 0) { |
| 6678 | TICKER_WARNING( |
| 6679 | "got an info reply from without query nodes from %" PRIx64, |
| 6680 | source); |
| 6681 | return; |
| 6682 | } |
| 6683 | |
| 6684 | DEBUG("received info reply from node %" PRIx64, source); |
| 6685 | |
| 6686 | MESH_LOCK(); |
| 6687 | |
| 6688 | uint8_t *start_ptr = (uint8_t*)reply; |
| 6689 | for (int i = 0; i < reply_count; i++) { |
| 6690 | as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; |
| 6691 | as_hb_mesh_node existing_node; |
| 6692 | if (mesh_node_get(reply_ptr->nodeid, &existing_node) != 0) { |
| 6693 | // Somehow the node was removed from the mesh hash. Maybe a timeout. |
| 6694 | goto NextReply; |
| 6695 | } |
| 6696 | |
| 6697 | // Update the state of this node. |
| 6698 | if (existing_node.status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { |
| 6699 | // Update the endpoint. |
| 6700 | endpoint_list_copy(&existing_node.endpoint_list, |
| 6701 | reply_ptr->endpoint_list); |
| 6702 | |
| 6703 | mesh_node_status_change(&existing_node, |
| 6704 | AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 6705 | // Set the discovered flag. |
| 6706 | g_hb.mode_state.mesh_state.nodes_discovered = true; |
| 6707 | |
| 6708 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 6709 | as_endpoint_list_to_string(existing_node.endpoint_list, |
| 6710 | endpoint_list_str, sizeof(endpoint_list_str)); |
| 6711 | |
| 6712 | DEBUG("for node %" PRIx64" discovered endpoints {%s}" , |
| 6713 | reply_ptr->nodeid, endpoint_list_str); |
| 6714 | |
| 6715 | // Update the hash. |
| 6716 | mesh_node_add_update(reply_ptr->nodeid, &existing_node); |
| 6717 | } |
| 6718 | |
| 6719 | NextReply: |
| 6720 | start_ptr += sizeof(as_hb_mesh_info_reply); |
| 6721 | size_t endpoint_list_size = 0; |
| 6722 | as_endpoint_list_sizeof(reply_ptr->endpoint_list, &endpoint_list_size); |
| 6723 | start_ptr += endpoint_list_size; |
| 6724 | } |
| 6725 | |
| 6726 | MESH_UNLOCK(); |
| 6727 | } |
| 6728 | |
| 6729 | /** |
| 6730 | * Handle the case when a message is received on a channel. |
| 6731 | */ |
| 6732 | static void |
| 6733 | mesh_channel_on_msg_rcvd(as_hb_channel_event* event) |
| 6734 | { |
| 6735 | // Update the mesh node status. |
| 6736 | mesh_node_data_update(event); |
| 6737 | |
| 6738 | as_hb_msg_type msg_type; |
| 6739 | msg_type_get(event->msg, &msg_type); |
| 6740 | |
| 6741 | switch (msg_type) { |
| 6742 | case AS_HB_MSG_TYPE_PULSE: // A pulse message. Try and discover new nodes. |
| 6743 | mesh_channel_on_pulse(event->msg); |
| 6744 | break; |
| 6745 | case AS_HB_MSG_TYPE_INFO_REQUEST: // Send back an info reply. |
| 6746 | mesh_channel_on_info_request(event->msg); |
| 6747 | break; |
| 6748 | case AS_HB_MSG_TYPE_INFO_REPLY: // Update the list of mesh nodes, if this is an undiscovered node. |
| 6749 | mesh_channel_on_info_reply(event->msg); |
| 6750 | break; |
| 6751 | default: |
| 6752 | WARNING("received a message of unknown type from" ); |
| 6753 | // Ignore other messages. |
| 6754 | break; |
| 6755 | } |
| 6756 | } |
| 6757 | |
| 6758 | /* |
| 6759 | * ---------------------------------------------------------------------------- |
| 6760 | * Mesh public API |
| 6761 | * ---------------------------------------------------------------------------- |
| 6762 | */ |
| 6763 | |
| 6764 | /** |
| 6765 | * Add a host / port to the mesh seed list. |
| 6766 | * @param host the seed node hostname / ip address |
| 6767 | * @param port the seed node port. |
| 6768 | * @param tls indicates TLS support. |
| 6769 | * @return CF_SHASH_OK, CF_SHASH_ERR, CF_SHASH_ERR_FOUND. |
| 6770 | */ |
| 6771 | static int |
| 6772 | mesh_tip(char* host, int port, bool tls) |
| 6773 | { |
| 6774 | MESH_LOCK(); |
| 6775 | |
| 6776 | int rv = -1; |
| 6777 | as_hb_mesh_seed new_seed = { { 0 } }; |
| 6778 | |
| 6779 | // Check validity of hostname and port. |
| 6780 | int hostname_len = strnlen(host, DNS_NAME_MAX_SIZE); |
| 6781 | if (hostname_len <= 0 || hostname_len == DNS_NAME_MAX_SIZE) { |
| 6782 | // Invalid hostname. |
| 6783 | WARNING("mesh seed host %s exceeds allowed %d characters" , host, |
| 6784 | DNS_NAME_MAX_LEN); |
| 6785 | goto Exit; |
| 6786 | } |
| 6787 | if (port <= 0 || port > USHRT_MAX) { |
| 6788 | WARNING("mesh seed port %s:%d exceeds should be between 0 to %d" , host, |
| 6789 | port, USHRT_MAX); |
| 6790 | goto Exit; |
| 6791 | } |
| 6792 | |
| 6793 | // Check if we already have a match for this seed. |
| 6794 | if (mesh_seed_find_unsafe(host, port) >= 0) { |
| 6795 | WARNING("mesh seed host %s:%d already in seed list" , host, port); |
| 6796 | goto Exit; |
| 6797 | } |
| 6798 | |
| 6799 | mesh_seed_status_change(&new_seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 6800 | strcpy(new_seed.seed_host_name, host); |
| 6801 | new_seed.seed_port = port; |
| 6802 | new_seed.seed_tls = tls; |
| 6803 | |
| 6804 | cf_vector_append(&g_hb.mode_state.mesh_state.seeds, &new_seed); |
| 6805 | |
| 6806 | INFO("added new mesh seed %s:%d" , host, port); |
| 6807 | rv = 0; |
| 6808 | |
| 6809 | Exit: |
| 6810 | if (rv != 0) { |
| 6811 | // Ensure endpoint allocated space is freed. |
| 6812 | mesh_seed_destroy(&new_seed); |
| 6813 | } |
| 6814 | |
| 6815 | MESH_UNLOCK(); |
| 6816 | return rv; |
| 6817 | } |
| 6818 | |
| 6819 | /** |
| 6820 | * Handle a channel event on an endpoint. |
| 6821 | */ |
| 6822 | static void |
| 6823 | mesh_channel_event_process(as_hb_channel_event* event) |
| 6824 | { |
| 6825 | // Skip if we are not in mesh mode. |
| 6826 | if (!hb_is_mesh()) { |
| 6827 | return; |
| 6828 | } |
| 6829 | |
| 6830 | MESH_LOCK(); |
| 6831 | switch (event->type) { |
| 6832 | case AS_HB_CHANNEL_NODE_CONNECTED: |
| 6833 | // Ignore this event. The subsequent message event will be use for |
| 6834 | // determining mesh node active status. |
| 6835 | break; |
| 6836 | case AS_HB_CHANNEL_NODE_DISCONNECTED: |
| 6837 | mesh_channel_on_node_disconnect(event); |
| 6838 | break; |
| 6839 | case AS_HB_CHANNEL_MSG_RECEIVED: |
| 6840 | mesh_channel_on_msg_rcvd(event); |
| 6841 | break; |
| 6842 | case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: // Ignore this event. HB module will handle it. |
| 6843 | break; |
| 6844 | } |
| 6845 | |
| 6846 | MESH_UNLOCK(); |
| 6847 | } |
| 6848 | |
| 6849 | /** |
| 6850 | * Initialize mesh mode data structures. |
| 6851 | */ |
| 6852 | static void |
| 6853 | mesh_init() |
| 6854 | { |
| 6855 | if (!hb_is_mesh()) { |
| 6856 | return; |
| 6857 | } |
| 6858 | |
| 6859 | MESH_LOCK(); |
| 6860 | |
| 6861 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED; |
| 6862 | |
| 6863 | // Initialize the mesh node hash. |
| 6864 | g_hb.mode_state.mesh_state.nodeid_to_mesh_node = cf_shash_create( |
| 6865 | cf_nodeid_shash_fn, sizeof(cf_node), sizeof(as_hb_mesh_node), |
| 6866 | AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
| 6867 | |
| 6868 | // Initialize the seed list. |
| 6869 | cf_vector_init(&g_hb.mode_state.mesh_state.seeds, sizeof(as_hb_mesh_seed), |
| 6870 | AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); |
| 6871 | |
| 6872 | MESH_UNLOCK(); |
| 6873 | } |
| 6874 | |
| 6875 | /** |
| 6876 | * Delete the shash entries only if they are not seed entries. |
| 6877 | */ |
| 6878 | static int |
| 6879 | mesh_free_node_data_reduce(const void* key, void* data, void* udata) |
| 6880 | { |
| 6881 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
| 6882 | mesh_node_destroy(mesh_node); |
| 6883 | return CF_SHASH_REDUCE_DELETE; |
| 6884 | } |
| 6885 | |
| 6886 | /** |
| 6887 | * Remove a host / port from the mesh list. |
| 6888 | */ |
| 6889 | static int |
| 6890 | mesh_tip_clear_reduce(const void* key, void* data, void* udata) |
| 6891 | { |
| 6892 | int rv = CF_SHASH_OK; |
| 6893 | |
| 6894 | MESH_LOCK(); |
| 6895 | |
| 6896 | cf_node nodeid = *(cf_node*)key; |
| 6897 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
| 6898 | as_hb_mesh_tip_clear_udata* tip_clear_udata = |
| 6899 | (as_hb_mesh_tip_clear_udata*)udata; |
| 6900 | |
| 6901 | if (tip_clear_udata == NULL || nodeid == tip_clear_udata->nodeid) { |
| 6902 | // Handling tip clear all or clear of a specific node. |
| 6903 | rv = CF_SHASH_REDUCE_DELETE; |
| 6904 | goto Exit; |
| 6905 | } |
| 6906 | |
| 6907 | // See if the address matches any one of the endpoints in the node's |
| 6908 | // endpoint list. |
| 6909 | for (int i = 0; i < tip_clear_udata->n_addrs; i++) { |
| 6910 | cf_sock_addr sock_addr; |
| 6911 | cf_ip_addr_copy(&tip_clear_udata->addrs[i], &sock_addr.addr); |
| 6912 | sock_addr.port = tip_clear_udata->port; |
| 6913 | as_hb_endpoint_list_addr_find_udata udata; |
| 6914 | udata.found = false; |
| 6915 | udata.to_search = &sock_addr; |
| 6916 | |
| 6917 | as_endpoint_list_iterate(mesh_node->endpoint_list, |
| 6918 | mesh_endpoint_addr_find_iterate, &udata); |
| 6919 | |
| 6920 | if (udata.found) { |
| 6921 | rv = CF_SHASH_REDUCE_DELETE; |
| 6922 | goto Exit; |
| 6923 | } |
| 6924 | } |
| 6925 | |
| 6926 | // Not found by endpoint. |
| 6927 | rv = CF_SHASH_OK; |
| 6928 | |
| 6929 | Exit: |
| 6930 | if (rv == CF_SHASH_REDUCE_DELETE) { |
| 6931 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 6932 | as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str, |
| 6933 | sizeof(endpoint_list_str)); |
| 6934 | |
| 6935 | // Find all seed entries matching this mesh entry and delete them. |
| 6936 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 6937 | int element_count = cf_vector_size(seeds); |
| 6938 | for (int i = 0; i < element_count; i++) { |
| 6939 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 6940 | if (seed->mesh_nodeid != nodeid) { |
| 6941 | // Does not match this mesh entry. |
| 6942 | continue; |
| 6943 | } |
| 6944 | if (mesh_seed_delete_unsafe(i) == 0) { |
| 6945 | i--; |
| 6946 | element_count--; |
| 6947 | } |
| 6948 | else { |
| 6949 | // Should not happen in practice. |
| 6950 | CRASH("error deleting mesh seed entry %s:%d" , |
| 6951 | seed->seed_host_name, seed->seed_port); |
| 6952 | } |
| 6953 | } |
| 6954 | |
| 6955 | if (channel_node_disconnect(nodeid) != 0) { |
| 6956 | WARNING("unable to disconnect the channel to node %" PRIx64, |
| 6957 | nodeid); |
| 6958 | } |
| 6959 | |
| 6960 | mesh_node_destroy(mesh_node); |
| 6961 | if (tip_clear_udata != NULL) { |
| 6962 | tip_clear_udata->entry_deleted = true; |
| 6963 | } |
| 6964 | } |
| 6965 | |
| 6966 | MESH_UNLOCK(); |
| 6967 | return rv; |
| 6968 | } |
| 6969 | |
| 6970 | /** |
| 6971 | * Output Heartbeat endpoints of peers. |
| 6972 | */ |
| 6973 | static int |
| 6974 | mesh_peer_endpoint_reduce(const void* key, void* data, void* udata) |
| 6975 | { |
| 6976 | int rv = CF_SHASH_OK; |
| 6977 | MESH_LOCK(); |
| 6978 | cf_node nodeid = *(cf_node*)key; |
| 6979 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
| 6980 | cf_dyn_buf* db = (cf_dyn_buf*)udata; |
| 6981 | |
| 6982 | cf_dyn_buf_append_string(db, "heartbeat.peer=" ); |
| 6983 | cf_dyn_buf_append_string(db, "node-id=" ); |
| 6984 | cf_dyn_buf_append_uint64_x(db, nodeid); |
| 6985 | cf_dyn_buf_append_string(db, ":" ); |
| 6986 | as_endpoint_list_info(mesh_node->endpoint_list, db); |
| 6987 | cf_dyn_buf_append_string(db, ";" ); |
| 6988 | |
| 6989 | MESH_UNLOCK(); |
| 6990 | return rv; |
| 6991 | } |
| 6992 | |
| 6993 | /** |
| 6994 | * Free the mesh mode data structures. |
| 6995 | */ |
| 6996 | static void |
| 6997 | mesh_clear() |
| 6998 | { |
| 6999 | if (!mesh_is_stopped()) { |
| 7000 | WARNING( |
| 7001 | "attempted clearing mesh module without stopping it - skip mesh clear!" ); |
| 7002 | return; |
| 7003 | } |
| 7004 | |
| 7005 | MESH_LOCK(); |
| 7006 | // Delete the elements from the map. |
| 7007 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
| 7008 | mesh_free_node_data_reduce, NULL); |
| 7009 | |
| 7010 | // Reset the seeds to inactive state |
| 7011 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 7012 | int element_count = cf_vector_size(seeds); |
| 7013 | for (int i = 0; i < element_count; i++) { |
| 7014 | // Should not happen in practice. |
| 7015 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 7016 | seed->mesh_nodeid = 0; |
| 7017 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
| 7018 | } |
| 7019 | |
| 7020 | MESH_UNLOCK(); |
| 7021 | } |
| 7022 | |
| 7023 | /** |
| 7024 | * Open mesh listening socket. Crashes if open failed. |
| 7025 | */ |
| 7026 | static void |
| 7027 | mesh_listening_sockets_open() |
| 7028 | { |
| 7029 | MESH_LOCK(); |
| 7030 | |
| 7031 | const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); |
| 7032 | |
| 7033 | // Compute min MTU across all binding interfaces. |
| 7034 | int min_mtu = -1; |
| 7035 | char addr_string[DNS_NAME_MAX_SIZE]; |
| 7036 | for (uint32_t i = 0; i < bind_cfg->n_cfgs; ++i) { |
| 7037 | const cf_sock_cfg* sock_cfg = &bind_cfg->cfgs[i]; |
| 7038 | cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string, |
| 7039 | sizeof(addr_string)); |
| 7040 | |
| 7041 | INFO("initializing mesh heartbeat socket: %s:%d" , addr_string, |
| 7042 | sock_cfg->port); |
| 7043 | |
| 7044 | int bind_interface_mtu = |
| 7045 | !cf_ip_addr_is_any(&sock_cfg->addr) ? |
| 7046 | cf_inter_mtu(&sock_cfg->addr) : cf_inter_min_mtu(); |
| 7047 | |
| 7048 | if (min_mtu == -1 || min_mtu > bind_interface_mtu) { |
| 7049 | min_mtu = bind_interface_mtu; |
| 7050 | } |
| 7051 | } |
| 7052 | |
| 7053 | if (cf_socket_init_server((cf_serv_cfg*)bind_cfg, |
| 7054 | &g_hb.mode_state.mesh_state.listening_sockets) != 0) { |
| 7055 | CRASH("couldn't initialize unicast heartbeat sockets" ); |
| 7056 | } |
| 7057 | |
| 7058 | for (uint32_t i = 0; |
| 7059 | i < g_hb.mode_state.mesh_state.listening_sockets.n_socks; ++i) { |
| 7060 | DEBUG("opened mesh heartbeat socket: %d" , |
| 7061 | CSFD(&g_hb.mode_state.mesh_state.listening_sockets.socks[i])); |
| 7062 | } |
| 7063 | |
| 7064 | if (min_mtu == -1) { |
| 7065 | WARNING("error getting the min MTU - using the default %d" , |
| 7066 | DEFAULT_MIN_MTU); |
| 7067 | min_mtu = DEFAULT_MIN_MTU; |
| 7068 | } |
| 7069 | |
| 7070 | g_hb.mode_state.mesh_state.min_mtu = min_mtu; |
| 7071 | INFO("mtu of the network is %d" , min_mtu); |
| 7072 | |
| 7073 | MESH_UNLOCK(); |
| 7074 | } |
| 7075 | |
| 7076 | /** |
| 7077 | * Start mesh threads. |
| 7078 | */ |
| 7079 | static void |
| 7080 | mesh_start() |
| 7081 | { |
| 7082 | if (!hb_is_mesh()) { |
| 7083 | return; |
| 7084 | } |
| 7085 | |
| 7086 | MESH_LOCK(); |
| 7087 | |
| 7088 | mesh_listening_sockets_open(); |
| 7089 | channel_mesh_listening_socks_register( |
| 7090 | &g_hb.mode_state.mesh_state.listening_sockets); |
| 7091 | |
| 7092 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_RUNNING; |
| 7093 | |
| 7094 | // Start the mesh tender thread. |
| 7095 | g_hb.mode_state.mesh_state.mesh_tender_tid = |
| 7096 | cf_thread_create_joinable(mesh_tender, (void*)&g_hb); |
| 7097 | |
| 7098 | MESH_UNLOCK(); |
| 7099 | } |
| 7100 | |
| 7101 | /** |
| 7102 | * Stop the mesh module. |
| 7103 | */ |
| 7104 | static void |
| 7105 | mesh_stop() |
| 7106 | { |
| 7107 | if (!mesh_is_running()) { |
| 7108 | WARNING("mesh is already stopped" ); |
| 7109 | return; |
| 7110 | } |
| 7111 | |
| 7112 | // Unguarded state, but this should be OK. |
| 7113 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_SHUTTING_DOWN; |
| 7114 | |
| 7115 | // Wait for the channel tender thread to finish. |
| 7116 | cf_thread_join(g_hb.mode_state.mesh_state.mesh_tender_tid); |
| 7117 | |
| 7118 | MESH_LOCK(); |
| 7119 | |
| 7120 | channel_mesh_listening_socks_deregister( |
| 7121 | &g_hb.mode_state.mesh_state.listening_sockets); |
| 7122 | |
| 7123 | mesh_listening_sockets_close(); |
| 7124 | |
| 7125 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED; |
| 7126 | |
| 7127 | // Clear allocated state if any. |
| 7128 | if (g_hb.mode_state.mesh_state.published_endpoint_list) { |
| 7129 | cf_free(g_hb.mode_state.mesh_state.published_endpoint_list); |
| 7130 | g_hb.mode_state.mesh_state.published_endpoint_list = NULL; |
| 7131 | } |
| 7132 | |
| 7133 | MESH_UNLOCK(); |
| 7134 | } |
| 7135 | |
| 7136 | /** |
| 7137 | * Reduce function to dump mesh node info to log file. |
| 7138 | */ |
| 7139 | static int |
| 7140 | mesh_dump_reduce(const void* key, void* data, void* udata) |
| 7141 | { |
| 7142 | cf_node nodeid = *(cf_node*)key; |
| 7143 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
| 7144 | |
| 7145 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 7146 | as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str, |
| 7147 | sizeof(endpoint_list_str)); |
| 7148 | |
| 7149 | INFO("\tHB Mesh Node: node-id %" PRIx64" status %s last-updated %" PRIu64 " endpoints {%s}" , |
| 7150 | nodeid, mesh_node_status_string(mesh_node->status), |
| 7151 | mesh_node->last_status_updated, endpoint_list_str); |
| 7152 | |
| 7153 | return CF_SHASH_OK; |
| 7154 | } |
| 7155 | |
| 7156 | /** |
| 7157 | * Dump mesh state to logs. |
| 7158 | * @param verbose enables / disables verbose logging. |
| 7159 | */ |
| 7160 | static void |
| 7161 | mesh_dump(bool verbose) |
| 7162 | { |
| 7163 | if (!hb_is_mesh() || !verbose) { |
| 7164 | return; |
| 7165 | } |
| 7166 | |
| 7167 | MESH_LOCK(); |
| 7168 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
| 7169 | int element_count = cf_vector_size(seeds); |
| 7170 | INFO("HB Seed Count %d" , element_count); |
| 7171 | for (int i = 0; i < element_count; i++) { |
| 7172 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
| 7173 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 7174 | as_endpoint_list_to_string(seed->resolved_endpoint_list, |
| 7175 | endpoint_list_str, sizeof(endpoint_list_str)); |
| 7176 | INFO("\tHB Mesh Seed: host %s port %d node-id %" PRIx64" status %s endpoints {%s}" , |
| 7177 | seed->seed_host_name, seed->seed_port, seed->mesh_nodeid, mesh_node_status_string(seed->status), |
| 7178 | endpoint_list_str); |
| 7179 | } |
| 7180 | |
| 7181 | INFO("HB Mesh Nodes Count %d" , cf_shash_get_size(g_hb.mode_state.mesh_state.nodeid_to_mesh_node)); |
| 7182 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
| 7183 | mesh_dump_reduce, NULL); |
| 7184 | MESH_UNLOCK(); |
| 7185 | } |
| 7186 | |
| 7187 | /* |
| 7188 | * ---------------------------------------------------------------------------- |
| 7189 | * Multicast sub module. |
| 7190 | * ---------------------------------------------------------------------------- |
| 7191 | */ |
| 7192 | |
| 7193 | /** |
| 7194 | * Initialize multicast data structures. |
| 7195 | */ |
| 7196 | static void |
| 7197 | multicast_init() |
| 7198 | { |
| 7199 | } |
| 7200 | |
| 7201 | /** |
| 7202 | * Clear multicast data structures. |
| 7203 | */ |
| 7204 | static void |
| 7205 | multicast_clear() |
| 7206 | { |
| 7207 | // Free multicast data structures. Nothing to do. |
| 7208 | } |
| 7209 | |
| 7210 | /** |
| 7211 | * Open multicast sockets. Crashes if open failed. |
| 7212 | */ |
| 7213 | static void |
| 7214 | multicast_listening_sockets_open() |
| 7215 | { |
| 7216 | MULTICAST_LOCK(); |
| 7217 | |
| 7218 | const cf_mserv_cfg* mserv_cfg = config_multicast_group_cfg_get(); |
| 7219 | |
| 7220 | // Compute min MTU across all binding interfaces. |
| 7221 | int min_mtu = -1; |
| 7222 | char addr_string[DNS_NAME_MAX_SIZE]; |
| 7223 | for (uint32_t i = 0; i < mserv_cfg->n_cfgs; ++i) { |
| 7224 | const cf_msock_cfg* sock_cfg = &mserv_cfg->cfgs[i]; |
| 7225 | cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string, |
| 7226 | sizeof(addr_string)); |
| 7227 | |
| 7228 | INFO("initializing multicast heartbeat socket: %s:%d" , addr_string, |
| 7229 | sock_cfg->port); |
| 7230 | |
| 7231 | int bind_interface_mtu = |
| 7232 | !cf_ip_addr_is_any(&sock_cfg->if_addr) ? |
| 7233 | cf_inter_mtu(&sock_cfg->if_addr) : cf_inter_min_mtu(); |
| 7234 | |
| 7235 | if (min_mtu == -1 || min_mtu > bind_interface_mtu) { |
| 7236 | min_mtu = bind_interface_mtu; |
| 7237 | } |
| 7238 | } |
| 7239 | |
| 7240 | if (cf_socket_mcast_init((cf_mserv_cfg*)mserv_cfg, |
| 7241 | &g_hb.mode_state.multicast_state.listening_sockets) != 0) { |
| 7242 | CRASH("couldn't initialize multicast heartbeat socket: %s" , |
| 7243 | cf_strerror(errno)); |
| 7244 | } |
| 7245 | |
| 7246 | for (uint32_t i = 0; |
| 7247 | i < g_hb.mode_state.multicast_state.listening_sockets.n_socks; |
| 7248 | ++i) { |
| 7249 | DEBUG("opened multicast socket %d" , |
| 7250 | CSFD( |
| 7251 | &g_hb.mode_state.multicast_state.listening_sockets.socks[i])); |
| 7252 | } |
| 7253 | |
| 7254 | if (min_mtu == -1) { |
| 7255 | WARNING("error getting the min mtu - using the default %d" , |
| 7256 | DEFAULT_MIN_MTU); |
| 7257 | min_mtu = DEFAULT_MIN_MTU; |
| 7258 | } |
| 7259 | |
| 7260 | g_hb.mode_state.multicast_state.min_mtu = min_mtu; |
| 7261 | |
| 7262 | INFO("mtu of the network is %d" , min_mtu); |
| 7263 | MULTICAST_UNLOCK(); |
| 7264 | } |
| 7265 | |
| 7266 | /** |
| 7267 | * Start multicast module. |
| 7268 | */ |
| 7269 | static void |
| 7270 | multicast_start() |
| 7271 | { |
| 7272 | MULTICAST_LOCK(); |
| 7273 | multicast_listening_sockets_open(); |
| 7274 | channel_multicast_listening_socks_register( |
| 7275 | &g_hb.mode_state.multicast_state.listening_sockets); |
| 7276 | MULTICAST_UNLOCK(); |
| 7277 | } |
| 7278 | |
| 7279 | /** |
| 7280 | * Close multicast listening socket. |
| 7281 | */ |
| 7282 | static void |
| 7283 | multicast_listening_sockets_close() |
| 7284 | { |
| 7285 | MULTICAST_LOCK(); |
| 7286 | INFO("closing multicast heartbeat sockets" ); |
| 7287 | cf_sockets_close(&g_hb.mode_state.multicast_state.listening_sockets); |
| 7288 | DEBUG("closed multicast heartbeat socket" ); |
| 7289 | MULTICAST_UNLOCK(); |
| 7290 | } |
| 7291 | |
| 7292 | /** |
| 7293 | * Stop Multicast. |
| 7294 | */ |
| 7295 | static void |
| 7296 | multicast_stop() |
| 7297 | { |
| 7298 | MULTICAST_LOCK(); |
| 7299 | channel_multicast_listening_socks_deregister( |
| 7300 | &g_hb.mode_state.multicast_state.listening_sockets); |
| 7301 | multicast_listening_sockets_close(); |
| 7302 | |
| 7303 | MULTICAST_UNLOCK(); |
| 7304 | } |
| 7305 | |
| 7306 | /** |
| 7307 | * Dump multicast state to logs. |
| 7308 | * @param verbose enables / disables verbose logging. |
| 7309 | */ |
| 7310 | static void |
| 7311 | multicast_dump(bool verbose) |
| 7312 | { |
| 7313 | if (hb_is_mesh()) { |
| 7314 | return; |
| 7315 | } |
| 7316 | |
| 7317 | // Mode is multicast. |
| 7318 | INFO("HB Multicast TTL: %d" , config_multicast_ttl_get()); |
| 7319 | } |
| 7320 | |
| 7321 | /** |
| 7322 | * Find the maximum cluster size based on MTU of the network. |
| 7323 | * |
| 7324 | * num_nodes is computed so that |
| 7325 | * |
| 7326 | * MTU = compression_factor(fixed_size + num_nodesper_node_size) |
| 7327 | * where, |
| 7328 | * fixed_size = udp_header_size + msg_header_size + |
| 7329 | * sigma(per_plugin_fixed_size) |
| 7330 | * per_node_size = sigma(per_plugin_per_node_size). |
| 7331 | */ |
| 7332 | static int |
| 7333 | multicast_supported_cluster_size_get() |
| 7334 | { |
| 7335 | // Calculate the fixed size for a UDP packet and the message header. |
| 7336 | size_t msg_fixed_size = msg_get_template_fixed_sz(g_hb_msg_template, |
| 7337 | sizeof(g_hb_msg_template) / sizeof(msg_template)); |
| 7338 | |
| 7339 | size_t msg_plugin_per_node_size = 0; |
| 7340 | |
| 7341 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
| 7342 | // Adding plugin specific fixed size |
| 7343 | msg_fixed_size += g_hb.plugins[i].wire_size_fixed; |
| 7344 | // Adding plugin specific per node size. |
| 7345 | msg_plugin_per_node_size += g_hb.plugins[i].wire_size_per_node; |
| 7346 | } |
| 7347 | |
| 7348 | // TODO: Compute the max cluster size using max storage per node in cluster |
| 7349 | // and the min mtu. |
| 7350 | int supported_cluster_size = MAX(1, |
| 7351 | (((hb_mtu() - UDP_HEADER_SIZE_MAX) * MSG_COMPRESSION_RATIO) |
| 7352 | - msg_fixed_size) / msg_plugin_per_node_size); |
| 7353 | |
| 7354 | return supported_cluster_size; |
| 7355 | } |
| 7356 | |
| 7357 | /* |
| 7358 | * ---------------------------------------------------------------------------- |
| 7359 | * Heartbeat main sub module. |
| 7360 | * ---------------------------------------------------------------------------- |
| 7361 | */ |
| 7362 | |
| 7363 | /** |
| 7364 | * Is Main module initialized. |
| 7365 | */ |
| 7366 | static bool |
| 7367 | hb_is_initialized() |
| 7368 | { |
| 7369 | HB_LOCK(); |
| 7370 | bool retval = (g_hb.status != AS_HB_STATUS_UNINITIALIZED) ? true : false; |
| 7371 | HB_UNLOCK(); |
| 7372 | return retval; |
| 7373 | } |
| 7374 | |
| 7375 | /** |
| 7376 | * Is Main module running. |
| 7377 | */ |
| 7378 | static bool |
| 7379 | hb_is_running() |
| 7380 | { |
| 7381 | HB_LOCK(); |
| 7382 | bool retval = (g_hb.status == AS_HB_STATUS_RUNNING) ? true : false; |
| 7383 | HB_UNLOCK(); |
| 7384 | return retval; |
| 7385 | } |
| 7386 | |
| 7387 | /** |
| 7388 | * Is Main module stopped. |
| 7389 | */ |
| 7390 | static bool |
| 7391 | hb_is_stopped() |
| 7392 | { |
| 7393 | HB_LOCK(); |
| 7394 | bool retval = (g_hb.status == AS_HB_STATUS_STOPPED) ? true : false; |
| 7395 | HB_UNLOCK(); |
| 7396 | return retval; |
| 7397 | } |
| 7398 | |
| 7399 | /** |
| 7400 | * Initialize the mode specific data structures. |
| 7401 | */ |
| 7402 | static void |
| 7403 | hb_mode_init() |
| 7404 | { |
| 7405 | if (hb_is_mesh()) { |
| 7406 | mesh_init(); |
| 7407 | } |
| 7408 | else { |
| 7409 | multicast_init(); |
| 7410 | } |
| 7411 | } |
| 7412 | |
| 7413 | /** |
| 7414 | * Start mode specific threads.. |
| 7415 | */ |
| 7416 | static void |
| 7417 | hb_mode_start() |
| 7418 | { |
| 7419 | if (hb_is_mesh()) { |
| 7420 | mesh_start(); |
| 7421 | } |
| 7422 | else { |
| 7423 | multicast_start(); |
| 7424 | } |
| 7425 | } |
| 7426 | |
| 7427 | /** |
| 7428 | * The MTU for underlying network. |
| 7429 | */ |
| 7430 | static int |
| 7431 | hb_mtu() |
| 7432 | { |
| 7433 | int __mtu = config_override_mtu_get(); |
| 7434 | if (!__mtu) { |
| 7435 | __mtu = hb_is_mesh() ? |
| 7436 | g_hb.mode_state.mesh_state.min_mtu : |
| 7437 | g_hb.mode_state.multicast_state.min_mtu; |
| 7438 | __mtu = __mtu > 0 ? __mtu : DEFAULT_MIN_MTU; |
| 7439 | } |
| 7440 | return __mtu; |
| 7441 | } |
| 7442 | |
| 7443 | /** |
| 7444 | * Initialize the template to be used for heartbeat messages. |
| 7445 | */ |
| 7446 | static void |
| 7447 | hb_msg_init() |
| 7448 | { |
| 7449 | // Register fabric heartbeat msg type with no processing function: |
| 7450 | // This permits getting / putting heartbeat msgs to be moderated via an idle |
| 7451 | // msg queue. |
| 7452 | as_fabric_register_msg_fn(M_TYPE_HEARTBEAT, g_hb_msg_template, |
| 7453 | sizeof(g_hb_msg_template), |
| 7454 | AS_HB_MSG_SCRATCH_SIZE, 0, 0); |
| 7455 | } |
| 7456 | |
| 7457 | /** |
| 7458 | * Get hold of current heartbeat protocol version |
| 7459 | */ |
| 7460 | static uint32_t |
| 7461 | hb_protocol_identifier_get() |
| 7462 | { |
| 7463 | return HB_PROTOCOL_V3_IDENTIFIER; |
| 7464 | } |
| 7465 | |
| 7466 | /** |
| 7467 | * Node depart event time estimate. Assumes node departed timeout milliseconds |
| 7468 | * before the detection. |
| 7469 | */ |
| 7470 | static cf_clock |
| 7471 | hb_node_depart_time(cf_clock detect_time) |
| 7472 | { |
| 7473 | return (detect_time - HB_NODE_TIMEOUT()); |
| 7474 | } |
| 7475 | |
| 7476 | /** |
| 7477 | * Indicates if mode is mesh. |
| 7478 | */ |
| 7479 | static bool |
| 7480 | hb_is_mesh() |
| 7481 | { |
| 7482 | return (config_mode_get() == AS_HB_MODE_MESH); |
| 7483 | } |
| 7484 | |
| 7485 | /** |
| 7486 | * Publish an event to subsystems listening to heart beat events. |
| 7487 | */ |
| 7488 | static void |
| 7489 | hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, |
| 7490 | int node_count) |
| 7491 | { |
| 7492 | // Lock-less because the queue is thread safe and we do not use heartbeat |
| 7493 | // state here. |
| 7494 | for (int i = 0; i < node_count; i++) { |
| 7495 | as_hb_event_node event; |
| 7496 | event.nodeid = nodes[i]; |
| 7497 | event.event_detected_time = cf_getms(); |
| 7498 | |
| 7499 | switch (event_type) { |
| 7500 | case AS_HB_INTERNAL_NODE_ARRIVE: |
| 7501 | event.evt = AS_HB_NODE_ARRIVE; |
| 7502 | event.event_time = event.event_detected_time; |
| 7503 | as_health_add_node_counter(event.nodeid, AS_HEALTH_NODE_ARRIVALS); |
| 7504 | break; |
| 7505 | case AS_HB_INTERNAL_NODE_DEPART: |
| 7506 | event.evt = AS_HB_NODE_DEPART; |
| 7507 | event.event_time = hb_node_depart_time(event.event_detected_time); |
| 7508 | break; |
| 7509 | case AS_HB_INTERNAL_NODE_EVICT: |
| 7510 | event.evt = AS_HB_NODE_DEPART; |
| 7511 | event.event_time = event.event_detected_time; |
| 7512 | break; |
| 7513 | case AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED: |
| 7514 | event.evt = AS_HB_NODE_ADJACENCY_CHANGED; |
| 7515 | event.event_time = event.event_detected_time; |
| 7516 | break; |
| 7517 | } |
| 7518 | |
| 7519 | DEBUG("queuing event of type %d for node %" PRIx64, event.evt, |
| 7520 | event.nodeid); |
| 7521 | cf_queue_push(&g_hb_event_listeners.external_events_queue, &event); |
| 7522 | } |
| 7523 | } |
| 7524 | |
| 7525 | /** |
| 7526 | * Publish all pending events. Should be invoked outside hb locks. |
| 7527 | */ |
| 7528 | static void |
| 7529 | hb_event_publish_pending() |
| 7530 | { |
| 7531 | EXTERNAL_EVENT_PUBLISH_LOCK(); |
| 7532 | int num_events = cf_queue_sz(&g_hb_event_listeners.external_events_queue); |
| 7533 | if (num_events <= 0) { |
| 7534 | // Events need not be published. |
| 7535 | goto Exit; |
| 7536 | } |
| 7537 | |
| 7538 | as_hb_event_node events[AS_HB_CLUSTER_MAX_SIZE_SOFT]; |
| 7539 | int published_count = 0; |
| 7540 | while (published_count < AS_HB_CLUSTER_MAX_SIZE_SOFT |
| 7541 | && cf_queue_pop(&g_hb_event_listeners.external_events_queue, |
| 7542 | &events[published_count], 0) == CF_QUEUE_OK) { |
| 7543 | published_count++; |
| 7544 | } |
| 7545 | |
| 7546 | if (published_count) { |
| 7547 | // Assuming that event listeners are not registered after system init, |
| 7548 | // no locks here. |
| 7549 | DEBUG("publishing %d heartbeat events" , published_count); |
| 7550 | for (int i = 0; i < g_hb_event_listeners.event_listener_count; i++) { |
| 7551 | (g_hb_event_listeners.event_listeners[i].event_callback)( |
| 7552 | published_count, events, |
| 7553 | g_hb_event_listeners.event_listeners[i].udata); |
| 7554 | } |
| 7555 | } |
| 7556 | |
| 7557 | Exit: |
| 7558 | EXTERNAL_EVENT_PUBLISH_UNLOCK(); |
| 7559 | } |
| 7560 | |
| 7561 | /** |
| 7562 | * Delete the heap allocated data while iterating through the hash and deleting |
| 7563 | * entries. |
| 7564 | */ |
| 7565 | static int |
| 7566 | hb_adjacency_free_data_reduce(const void* key, void* data, void* udata) |
| 7567 | { |
| 7568 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
| 7569 | |
| 7570 | const cf_node* nodeid = (const cf_node*)key; |
| 7571 | |
| 7572 | hb_adjacent_node_destroy(adjacent_node); |
| 7573 | |
| 7574 | // Send event depart to for this node |
| 7575 | hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, nodeid, 1); |
| 7576 | |
| 7577 | return CF_SHASH_REDUCE_DELETE; |
| 7578 | } |
| 7579 | |
| 7580 | /** |
| 7581 | * Clear the heartbeat data structures. |
| 7582 | */ |
| 7583 | static void |
| 7584 | hb_clear() |
| 7585 | { |
| 7586 | if (!hb_is_stopped()) { |
| 7587 | WARNING("attempted to clear heartbeat module without stopping it" ); |
| 7588 | return; |
| 7589 | } |
| 7590 | |
| 7591 | HB_LOCK(); |
| 7592 | |
| 7593 | // Free the plugin data and delete adjacent nodes. |
| 7594 | cf_shash_reduce(g_hb.adjacency, hb_adjacency_free_data_reduce, NULL); |
| 7595 | cf_shash_reduce(g_hb.on_probation, hb_adjacency_free_data_reduce, NULL); |
| 7596 | hb_adjacent_node_destroy(&g_hb.self_node); |
| 7597 | memset(&g_hb.self_node, 0, sizeof(g_hb.self_node)); |
| 7598 | |
| 7599 | HB_UNLOCK(); |
| 7600 | |
| 7601 | // Publish node departed events for the removed nodes. |
| 7602 | hb_event_publish_pending(); |
| 7603 | |
| 7604 | // Clear the mode module. |
| 7605 | if (hb_is_mesh()) { |
| 7606 | mesh_clear(); |
| 7607 | } |
| 7608 | else { |
| 7609 | multicast_clear(); |
| 7610 | } |
| 7611 | |
| 7612 | channel_clear(); |
| 7613 | } |
| 7614 | |
| 7615 | /** |
| 7616 | * Reduce function to get hold of current adjacency list. |
| 7617 | */ |
| 7618 | static int |
| 7619 | hb_adjacency_iterate_reduce(const void* key, void* data, void* udata) |
| 7620 | { |
| 7621 | const cf_node* nodeid = (const cf_node*)key; |
| 7622 | as_hb_adjacency_reduce_udata* adjacency_reduce_udata = |
| 7623 | (as_hb_adjacency_reduce_udata*)udata; |
| 7624 | |
| 7625 | adjacency_reduce_udata->adj_list[adjacency_reduce_udata->adj_count] = |
| 7626 | *nodeid; |
| 7627 | adjacency_reduce_udata->adj_count++; |
| 7628 | |
| 7629 | return CF_SHASH_OK; |
| 7630 | } |
| 7631 | |
| 7632 | /** |
| 7633 | * Plugin function to set heartbeat adjacency list into a pulse message. |
| 7634 | */ |
| 7635 | static void |
| 7636 | hb_plugin_set_fn(msg* msg) |
| 7637 | { |
| 7638 | HB_LOCK(); |
| 7639 | |
| 7640 | cf_node adj_list[cf_shash_get_size(g_hb.adjacency)]; |
| 7641 | as_hb_adjacency_reduce_udata adjacency_reduce_udata = { adj_list, 0 }; |
| 7642 | |
| 7643 | cf_shash_reduce(g_hb.adjacency, hb_adjacency_iterate_reduce, |
| 7644 | &adjacency_reduce_udata); |
| 7645 | |
| 7646 | HB_UNLOCK(); |
| 7647 | |
| 7648 | // Populate adjacency list. |
| 7649 | msg_adjacency_set(msg, adj_list, adjacency_reduce_udata.adj_count); |
| 7650 | |
| 7651 | // Set cluster name. |
| 7652 | char cluster_name[AS_CLUSTER_NAME_SZ]; |
| 7653 | as_config_cluster_name_get(cluster_name); |
| 7654 | |
| 7655 | if (cluster_name[0] != '\0') { |
| 7656 | msg_set_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, MSG_SET_COPY); |
| 7657 | } |
| 7658 | } |
| 7659 | |
| 7660 | /** |
| 7661 | * Plugin function that parses adjacency list out of a heartbeat pulse message. |
| 7662 | */ |
| 7663 | static void |
| 7664 | hb_plugin_parse_data_fn(msg* msg, cf_node source, |
| 7665 | as_hb_plugin_node_data* prev_plugin_data, |
| 7666 | as_hb_plugin_node_data* plugin_data) |
| 7667 | { |
| 7668 | size_t adj_length = 0; |
| 7669 | cf_node* adj_list = NULL; |
| 7670 | |
| 7671 | if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) { |
| 7672 | // Store a zero length adjacency list. Should not have happened. |
| 7673 | WARNING("received heartbeat without adjacency list %" PRIx64, source); |
| 7674 | adj_length = 0; |
| 7675 | } |
| 7676 | |
| 7677 | // The guess can be larger for older protocols which also include self node |
| 7678 | // in the adjacency list. |
| 7679 | int guessed_data_size = (adj_length * sizeof(cf_node)); |
| 7680 | |
| 7681 | if (guessed_data_size > plugin_data->data_capacity) { |
| 7682 | // Round up to nearest multiple of block size to prevent very frequent |
| 7683 | // reallocation. |
| 7684 | size_t data_capacity = ((guessed_data_size + HB_PLUGIN_DATA_BLOCK_SIZE |
| 7685 | - 1) / |
| 7686 | HB_PLUGIN_DATA_BLOCK_SIZE) * |
| 7687 | HB_PLUGIN_DATA_BLOCK_SIZE; |
| 7688 | |
| 7689 | // Reallocate since we have outgrown existing capacity. |
| 7690 | plugin_data->data = cf_realloc(plugin_data->data, data_capacity); |
| 7691 | plugin_data->data_capacity = data_capacity; |
| 7692 | } |
| 7693 | |
| 7694 | cf_node* dest_list = (cf_node*)(plugin_data->data); |
| 7695 | |
| 7696 | size_t final_list_length = 0; |
| 7697 | for (size_t i = 0; i < adj_length; i++) { |
| 7698 | if (adj_list[i] == source) { |
| 7699 | // Skip the source node. |
| 7700 | continue; |
| 7701 | } |
| 7702 | dest_list[final_list_length++] = adj_list[i]; |
| 7703 | } |
| 7704 | |
| 7705 | plugin_data->data_size = (final_list_length * sizeof(cf_node)); |
| 7706 | } |
| 7707 | |
| 7708 | /** |
| 7709 | * Get the msg buffer from a pool based on the protocol under use. |
| 7710 | * @return the msg buff |
| 7711 | */ |
| 7712 | static msg* |
| 7713 | hb_msg_get() |
| 7714 | { |
| 7715 | return as_fabric_msg_get(M_TYPE_HEARTBEAT); |
| 7716 | } |
| 7717 | |
| 7718 | /** |
| 7719 | * Return the message buffer back to the pool. |
| 7720 | */ |
| 7721 | static void |
| 7722 | hb_msg_return(msg* msg) |
| 7723 | { |
| 7724 | as_fabric_msg_put(msg); |
| 7725 | } |
| 7726 | |
| 7727 | /** |
| 7728 | * Fill the outgoing pulse message with plugin specific data. |
| 7729 | * |
| 7730 | * Note: The set functions would be acquiring their locks. This function should |
| 7731 | * never directly use nor have a call stack under HB_LOCK. |
| 7732 | * |
| 7733 | * @param msg the outgoing pulse message. |
| 7734 | */ |
| 7735 | static void |
| 7736 | hb_plugin_msg_fill(msg* msg) |
| 7737 | { |
| 7738 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
| 7739 | if (g_hb.plugins[i].set_fn) { |
| 7740 | (g_hb.plugins[i].set_fn)(msg); |
| 7741 | } |
| 7742 | } |
| 7743 | } |
| 7744 | |
| 7745 | /** |
| 7746 | * Parse fields from the message into plugin specific data. |
| 7747 | * @param msg the outgoing pulse message. |
| 7748 | * @param adjacent_node the node from which this message was received. |
| 7749 | * @param plugin_data_changed (output) array whose ith entry is set to true if |
| 7750 | * ith plugin's data changed, false otherwise. Should be large enough to hold |
| 7751 | * flags for all plugins. |
| 7752 | */ |
| 7753 | static void |
| 7754 | hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, |
| 7755 | as_hb_plugin* plugins, bool plugin_data_changed[]) |
| 7756 | { |
| 7757 | cf_node source; |
| 7758 | adjacent_node->plugin_data_cycler++; |
| 7759 | |
| 7760 | msg_nodeid_get(msg, &source); |
| 7761 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
| 7762 | plugin_data_changed[i] = false; |
| 7763 | if (plugins[i].parse_fn) { |
| 7764 | as_hb_plugin_node_data* curr_data = |
| 7765 | &adjacent_node->plugin_data[i][adjacent_node->plugin_data_cycler |
| 7766 | % 2]; |
| 7767 | |
| 7768 | as_hb_plugin_node_data* prev_data = |
| 7769 | &adjacent_node->plugin_data[i][(adjacent_node->plugin_data_cycler |
| 7770 | + 1) % 2]; |
| 7771 | |
| 7772 | // Ensure there is a preallocated data pointer. |
| 7773 | if (curr_data->data == NULL) { |
| 7774 | curr_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE); |
| 7775 | curr_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE; |
| 7776 | curr_data->data_size = 0; |
| 7777 | } |
| 7778 | |
| 7779 | if (prev_data->data == NULL) { |
| 7780 | prev_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE); |
| 7781 | prev_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE; |
| 7782 | prev_data->data_size = 0; |
| 7783 | } |
| 7784 | |
| 7785 | // Parse message data into current data. |
| 7786 | (plugins[i]).parse_fn(msg, source, prev_data, curr_data); |
| 7787 | |
| 7788 | if (!plugins[i].change_listener) { |
| 7789 | // No change listener configured. Skip detecting change. |
| 7790 | continue; |
| 7791 | } |
| 7792 | |
| 7793 | size_t curr_data_size = curr_data->data_size; |
| 7794 | void* curr_data_blob = curr_data_size ? curr_data->data : NULL; |
| 7795 | |
| 7796 | size_t prev_data_size = prev_data->data_size; |
| 7797 | void* prev_data_blob = prev_data_size ? prev_data->data : NULL; |
| 7798 | |
| 7799 | if (prev_data_blob == curr_data_blob) { |
| 7800 | // Old and new data both NULL or both point to the same memory |
| 7801 | // location. |
| 7802 | plugin_data_changed[i] = false; |
| 7803 | continue; |
| 7804 | } |
| 7805 | |
| 7806 | if (prev_data_size != curr_data_size || prev_data_blob == NULL |
| 7807 | || curr_data_blob == NULL) { |
| 7808 | // Plugin data definitely changed, as the data sizes differ or |
| 7809 | // exactly one of old or new data pointers is NULL. |
| 7810 | plugin_data_changed[i] = true; |
| 7811 | continue; |
| 7812 | } |
| 7813 | |
| 7814 | // The data sizes match at this point and neither values are NULL. |
| 7815 | plugin_data_changed[i] = memcmp(prev_data_blob, curr_data_blob, |
| 7816 | curr_data_size) != 0; |
| 7817 | } |
| 7818 | } |
| 7819 | } |
| 7820 | |
| 7821 | /** |
| 7822 | * Adjacency list for an adjacent node changed. |
| 7823 | */ |
| 7824 | static void |
| 7825 | hb_plugin_data_change_listener(cf_node changed_node_id) |
| 7826 | { |
| 7827 | hb_event_queue(AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED, &changed_node_id, 1); |
| 7828 | } |
| 7829 | |
| 7830 | /** |
| 7831 | * Initialize the plugin specific data structures. |
| 7832 | */ |
| 7833 | static void |
| 7834 | hb_plugin_init() |
| 7835 | { |
| 7836 | memset(&g_hb.plugins, 0, sizeof(g_hb.plugins)); |
| 7837 | |
| 7838 | // Be cute. Register self as a plugin. |
| 7839 | as_hb_plugin self_plugin; |
| 7840 | memset(&self_plugin, 0, sizeof(self_plugin)); |
| 7841 | self_plugin.id = AS_HB_PLUGIN_HB; |
| 7842 | self_plugin.wire_size_fixed = 0; |
| 7843 | self_plugin.wire_size_per_node = sizeof(cf_node); |
| 7844 | self_plugin.set_fn = hb_plugin_set_fn; |
| 7845 | self_plugin.parse_fn = hb_plugin_parse_data_fn; |
| 7846 | self_plugin.change_listener = hb_plugin_data_change_listener; |
| 7847 | hb_plugin_register(&self_plugin); |
| 7848 | } |
| 7849 | |
| 7850 | /** |
| 7851 | * Transmits heartbeats at fixed intervals. |
| 7852 | */ |
| 7853 | void* |
| 7854 | hb_transmitter(void* arg) |
| 7855 | { |
| 7856 | DETAIL("heartbeat transmitter started" ); |
| 7857 | |
| 7858 | cf_clock last_time = 0; |
| 7859 | |
| 7860 | while (hb_is_running()) { |
| 7861 | cf_clock curr_time = cf_getms(); |
| 7862 | |
| 7863 | if ((curr_time - last_time) < PULSE_TRANSMIT_INTERVAL()) { |
| 7864 | // Interval has not been reached for sending heartbeats |
| 7865 | usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time + |
| 7866 | PULSE_TRANSMIT_INTERVAL()) - curr_time) * 1000); |
| 7867 | continue; |
| 7868 | } |
| 7869 | |
| 7870 | last_time = curr_time; |
| 7871 | |
| 7872 | // Construct the pulse message. |
| 7873 | msg* msg = hb_msg_get(); |
| 7874 | |
| 7875 | msg_src_fields_fill(msg); |
| 7876 | msg_type_set(msg, AS_HB_MSG_TYPE_PULSE); |
| 7877 | |
| 7878 | // Have plugins fill their data into the heartbeat pulse message. |
| 7879 | hb_plugin_msg_fill(msg); |
| 7880 | |
| 7881 | // Broadcast the heartbeat to all known recipients. |
| 7882 | channel_msg_broadcast(msg); |
| 7883 | |
| 7884 | // Return the msg back to the fabric. |
| 7885 | hb_msg_return(msg); |
| 7886 | |
| 7887 | DETAIL("done sending pulse message" ); |
| 7888 | } |
| 7889 | |
| 7890 | DETAIL("heartbeat transmitter stopped" ); |
| 7891 | return NULL; |
| 7892 | } |
| 7893 | |
| 7894 | /** |
| 7895 | * Get hold of adjacent node information given its nodeid. |
| 7896 | * @param nodeid the nodeid. |
| 7897 | * @param adjacent_node the output node information. |
| 7898 | * @return 0 on success, -1 on failure. |
| 7899 | */ |
| 7900 | static int |
| 7901 | hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node) |
| 7902 | { |
| 7903 | int rv = -1; |
| 7904 | HB_LOCK(); |
| 7905 | |
| 7906 | if (cf_shash_get(g_hb.adjacency, &nodeid, adjacent_node) == CF_SHASH_OK) { |
| 7907 | rv = 0; |
| 7908 | } |
| 7909 | |
| 7910 | HB_UNLOCK(); |
| 7911 | return rv; |
| 7912 | } |
| 7913 | |
| 7914 | /** |
| 7915 | * Get hold of an on-probation node information given its nodeid. |
| 7916 | * @param nodeid the nodeid. |
| 7917 | * @param adjacent_node the output node information. |
| 7918 | * @return 0 on success, -1 on failure. |
| 7919 | */ |
| 7920 | static int |
| 7921 | hb_on_probation_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node) |
| 7922 | { |
| 7923 | int rv = -1; |
| 7924 | HB_LOCK(); |
| 7925 | |
| 7926 | if (cf_shash_get(g_hb.on_probation, &nodeid, adjacent_node) |
| 7927 | == CF_SHASH_OK) { |
| 7928 | rv = 0; |
| 7929 | } |
| 7930 | |
| 7931 | HB_UNLOCK(); |
| 7932 | return rv; |
| 7933 | } |
| 7934 | |
| 7935 | /** |
| 7936 | * Read the plugin data from an adjacent node. |
| 7937 | * @param adjacent_node the adjacent node. |
| 7938 | * @param plugin_data (output) will be null if this node has no plugin data. |
| 7939 | * Else will point to the plugin data. |
| 7940 | * @param plugin_data_size (output) the size of the plugin data. |
| 7941 | */ |
| 7942 | static void |
| 7943 | hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, |
| 7944 | as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size) |
| 7945 | { |
| 7946 | *plugin_data_size = |
| 7947 | adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler |
| 7948 | % 2].data_size; |
| 7949 | |
| 7950 | *plugin_data = |
| 7951 | *plugin_data_size ? |
| 7952 | (cf_node*)(adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler |
| 7953 | % 2].data) : NULL; |
| 7954 | } |
| 7955 | |
| 7956 | /** |
| 7957 | * Get adjacency list for an adjacent node. |
| 7958 | */ |
| 7959 | static void |
| 7960 | hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, |
| 7961 | cf_node** adjacency_list, size_t* adjacency_length) |
| 7962 | { |
| 7963 | hb_adjacent_node_plugin_data_get(adjacent_node, AS_HB_PLUGIN_HB, |
| 7964 | (void**)adjacency_list, adjacency_length); |
| 7965 | (*adjacency_length) /= sizeof(cf_node); |
| 7966 | } |
| 7967 | |
| 7968 | /** |
| 7969 | * Indicates if a give node has expired and should be removed from the adjacency |
| 7970 | * list. |
| 7971 | */ |
| 7972 | static bool |
| 7973 | hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node) |
| 7974 | { |
| 7975 | if (nodeid == config_self_nodeid_get()) { |
| 7976 | return false; |
| 7977 | } |
| 7978 | |
| 7979 | HB_LOCK(); |
| 7980 | |
| 7981 | cf_clock now = cf_getms(); |
| 7982 | |
| 7983 | bool expired = adjacent_node->last_updated_monotonic_ts + HB_NODE_TIMEOUT() |
| 7984 | < now; |
| 7985 | |
| 7986 | HB_UNLOCK(); |
| 7987 | return expired; |
| 7988 | } |
| 7989 | |
| 7990 | /** |
| 7991 | * Indicates if self node has duplicate ids. |
| 7992 | */ |
| 7993 | static bool |
| 7994 | hb_self_is_duplicate(){ |
| 7995 | HB_LOCK(); |
| 7996 | bool self_is_duplicate = g_hb.self_is_duplicate; |
| 7997 | HB_UNLOCK(); |
| 7998 | return self_is_duplicate; |
| 7999 | } |
| 8000 | |
| 8001 | /** |
| 8002 | * Updates the self is duplicate flag. |
| 8003 | */ |
| 8004 | static void |
| 8005 | hb_self_duplicate_update() |
| 8006 | { |
| 8007 | cf_clock now = cf_getms(); |
| 8008 | HB_LOCK(); |
| 8009 | if (g_hb.self_is_duplicate) { |
| 8010 | uint32_t duplicate_block_interval = |
| 8011 | config_endpoint_track_intervals_get() |
| 8012 | * config_tx_interval_get(); |
| 8013 | if (g_hb.self_duplicate_detected_ts + duplicate_block_interval <= now) { |
| 8014 | // We have not seen duplicates for the endpoint change tracking |
| 8015 | // interval. Mark ourself as non-duplicate. |
| 8016 | g_hb.self_is_duplicate = false; |
| 8017 | } |
| 8018 | } |
| 8019 | HB_UNLOCK(); |
| 8020 | } |
| 8021 | |
| 8022 | /** |
| 8023 | * Free up space occupied by plugin data from adjacent node. |
| 8024 | */ |
| 8025 | static void |
| 8026 | hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node) |
| 8027 | { |
| 8028 | HB_LOCK(); |
| 8029 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
| 8030 | as_hb_plugin_node_data* curr_plugin_data = adjacent_node->plugin_data[i]; |
| 8031 | for (int j = 0; j < 2; j++) { |
| 8032 | if (curr_plugin_data[j].data) { |
| 8033 | cf_free(curr_plugin_data[j].data); |
| 8034 | curr_plugin_data[j].data = NULL; |
| 8035 | } |
| 8036 | |
| 8037 | curr_plugin_data[j].data_capacity = 0; |
| 8038 | curr_plugin_data[j].data_size = 0; |
| 8039 | } |
| 8040 | } |
| 8041 | |
| 8042 | if (adjacent_node->endpoint_list) { |
| 8043 | // Free the endpoint list. |
| 8044 | cf_free(adjacent_node->endpoint_list); |
| 8045 | adjacent_node->endpoint_list = NULL; |
| 8046 | } |
| 8047 | |
| 8048 | HB_UNLOCK(); |
| 8049 | } |
| 8050 | |
| 8051 | /** |
| 8052 | * Tend reduce function that removes expired nodes from adjacency list. |
| 8053 | */ |
| 8054 | static int |
| 8055 | hb_adjacency_tend_reduce(const void* key, void* data, void* udata) |
| 8056 | { |
| 8057 | cf_node nodeid = *(const cf_node*)key; |
| 8058 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
| 8059 | as_hb_adjacency_tender_udata* adjacency_tender_udata = |
| 8060 | (as_hb_adjacency_tender_udata*)udata; |
| 8061 | |
| 8062 | int rv = CF_SHASH_OK; |
| 8063 | bool cluster_name_mismatch = adjacent_node->cluster_name_mismatch_count |
| 8064 | > CLUSTER_NAME_MISMATCH_MAX; |
| 8065 | if (hb_node_has_expired(nodeid, adjacent_node) || cluster_name_mismatch) { |
| 8066 | INFO("node expired %" PRIx64" %s" , nodeid, cluster_name_mismatch ? "(cluster name mismatch)" : "" ); |
| 8067 | if (cluster_name_mismatch) { |
| 8068 | adjacency_tender_udata->evicted_nodes[adjacency_tender_udata->evicted_node_count++] = |
| 8069 | nodeid; |
| 8070 | } |
| 8071 | else { |
| 8072 | adjacency_tender_udata->dead_nodes[adjacency_tender_udata->dead_node_count++] = |
| 8073 | nodeid; |
| 8074 | } |
| 8075 | |
| 8076 | // Free plugin data as well. |
| 8077 | hb_adjacent_node_destroy(adjacent_node); |
| 8078 | |
| 8079 | rv = CF_SHASH_REDUCE_DELETE; |
| 8080 | } |
| 8081 | |
| 8082 | return rv; |
| 8083 | } |
| 8084 | |
| 8085 | /** |
| 8086 | * Tend reduce function that removes expired nodes from the probationary list. |
| 8087 | */ |
| 8088 | static int |
| 8089 | hb_on_probation_tend_reduce(const void* key, void* data, void* udata) |
| 8090 | { |
| 8091 | cf_node nodeid = *(const cf_node*)key; |
| 8092 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
| 8093 | |
| 8094 | int rv = CF_SHASH_OK; |
| 8095 | if (hb_node_has_expired(nodeid, adjacent_node)) { |
| 8096 | DEBUG("on-probation node %" PRIx64 " expired" , nodeid); |
| 8097 | // Free plugin data as well. |
| 8098 | hb_adjacent_node_destroy(adjacent_node); |
| 8099 | rv = CF_SHASH_REDUCE_DELETE; |
| 8100 | } |
| 8101 | return rv; |
| 8102 | } |
| 8103 | |
| 8104 | /** |
| 8105 | * Tends the adjacency list. Removes nodes that expire. |
| 8106 | */ |
| 8107 | void* |
| 8108 | hb_adjacency_tender(void* arg) |
| 8109 | { |
| 8110 | DETAIL("adjacency tender started" ); |
| 8111 | |
| 8112 | cf_clock last_time = 0; |
| 8113 | cf_clock last_depart_time = 0; |
| 8114 | |
| 8115 | while (hb_is_running()) { |
| 8116 | cf_clock curr_time = cf_getms(); |
| 8117 | uint32_t adjacency_tend_interval = ADJACENCY_TEND_INTERVAL; |
| 8118 | // Interval after node depart where we tend faster to detect additional |
| 8119 | // node departures. |
| 8120 | uint32_t fast_check_interval = 2 * config_tx_interval_get(); |
| 8121 | if (last_depart_time + fast_check_interval > curr_time) { |
| 8122 | adjacency_tend_interval = ADJACENCY_FAST_TEND_INTERVAL; |
| 8123 | } |
| 8124 | |
| 8125 | hb_self_duplicate_update(); |
| 8126 | |
| 8127 | if ((curr_time - last_time) < adjacency_tend_interval) { |
| 8128 | // Publish any pendng events. |
| 8129 | hb_event_publish_pending(); |
| 8130 | |
| 8131 | // Interval has not been reached for sending heartbeats |
| 8132 | usleep( |
| 8133 | MIN(AS_HB_TX_INTERVAL_MS_MIN, |
| 8134 | (last_time + adjacency_tend_interval) - curr_time) |
| 8135 | * 1000); |
| 8136 | continue; |
| 8137 | } |
| 8138 | |
| 8139 | last_time = curr_time; |
| 8140 | |
| 8141 | DETAIL("tending adjacency list" ); |
| 8142 | |
| 8143 | HB_LOCK(); |
| 8144 | cf_node dead_nodes[cf_shash_get_size(g_hb.adjacency)]; |
| 8145 | cf_node evicted_nodes[cf_shash_get_size(g_hb.adjacency)]; |
| 8146 | as_hb_adjacency_tender_udata adjacency_tender_udata; |
| 8147 | adjacency_tender_udata.dead_nodes = dead_nodes; |
| 8148 | adjacency_tender_udata.dead_node_count = 0; |
| 8149 | adjacency_tender_udata.evicted_nodes = evicted_nodes; |
| 8150 | adjacency_tender_udata.evicted_node_count = 0; |
| 8151 | |
| 8152 | cf_shash_reduce(g_hb.adjacency, hb_adjacency_tend_reduce, |
| 8153 | &adjacency_tender_udata); |
| 8154 | |
| 8155 | if (adjacency_tender_udata.dead_node_count > 0) { |
| 8156 | last_depart_time = curr_time; |
| 8157 | // Queue events for dead nodes. |
| 8158 | hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, dead_nodes, |
| 8159 | adjacency_tender_udata.dead_node_count); |
| 8160 | } |
| 8161 | |
| 8162 | if (adjacency_tender_udata.evicted_node_count > 0) { |
| 8163 | last_depart_time = curr_time; |
| 8164 | // Queue events for evicted nodes. |
| 8165 | hb_event_queue(AS_HB_INTERNAL_NODE_EVICT, evicted_nodes, |
| 8166 | adjacency_tender_udata.evicted_node_count); |
| 8167 | } |
| 8168 | |
| 8169 | // Expire nodes from the on-probation list. |
| 8170 | cf_shash_reduce(g_hb.on_probation, hb_on_probation_tend_reduce, NULL); |
| 8171 | HB_UNLOCK(); |
| 8172 | |
| 8173 | // See if we have pending events to publish. |
| 8174 | hb_event_publish_pending(); |
| 8175 | |
| 8176 | DETAIL("done tending adjacency list" ); |
| 8177 | } |
| 8178 | |
| 8179 | DETAIL("adjacency tender shut down" ); |
| 8180 | return NULL; |
| 8181 | } |
| 8182 | |
| 8183 | /** |
| 8184 | * Start the transmitter thread. |
| 8185 | */ |
| 8186 | static void |
| 8187 | hb_tx_start() |
| 8188 | { |
| 8189 | // Start the transmitter thread. |
| 8190 | g_hb.transmitter_tid = cf_thread_create_joinable(hb_transmitter, |
| 8191 | (void*)&g_hb); |
| 8192 | } |
| 8193 | |
| 8194 | /** |
| 8195 | * Stop the transmitter thread. |
| 8196 | */ |
| 8197 | static void |
| 8198 | hb_tx_stop() |
| 8199 | { |
| 8200 | DETAIL("waiting for the transmitter thread to stop" ); |
| 8201 | // Wait for the adjacency tender thread to stop. |
| 8202 | cf_thread_join(g_hb.transmitter_tid); |
| 8203 | } |
| 8204 | |
| 8205 | /** |
| 8206 | * Start the transmitter thread. |
| 8207 | */ |
| 8208 | static void |
| 8209 | hb_adjacency_tender_start() |
| 8210 | { |
| 8211 | // Start the transmitter thread. |
| 8212 | g_hb.adjacency_tender_tid = cf_thread_create_joinable(hb_adjacency_tender, |
| 8213 | (void*)&g_hb); |
| 8214 | } |
| 8215 | |
| 8216 | /** |
| 8217 | * Stop the adjacency tender thread. |
| 8218 | */ |
| 8219 | static void |
| 8220 | hb_adjacency_tender_stop() |
| 8221 | { |
| 8222 | // Wait for the adjacency tender thread to stop. |
| 8223 | cf_thread_join(g_hb.adjacency_tender_tid); |
| 8224 | } |
| 8225 | |
| 8226 | /** |
| 8227 | * Initialize the heartbeat subsystem. |
| 8228 | */ |
| 8229 | static void |
| 8230 | hb_init() |
| 8231 | { |
| 8232 | if (hb_is_initialized()) { |
| 8233 | WARNING("heartbeat main module is already initialized" ); |
| 8234 | return; |
| 8235 | } |
| 8236 | |
| 8237 | // Operate under a lock. Let's be paranoid everywhere. |
| 8238 | HB_LOCK(); |
| 8239 | |
| 8240 | // Initialize the heartbeat data structure. |
| 8241 | memset(&g_hb, 0, sizeof(g_hb)); |
| 8242 | |
| 8243 | // Initialize the adjacency hash. |
| 8244 | g_hb.adjacency = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), |
| 8245 | sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
| 8246 | |
| 8247 | // Initialize the on_probation hash. |
| 8248 | g_hb.on_probation = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), |
| 8249 | sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
| 8250 | |
| 8251 | // Initialize the temporary hash to map nodeid to index. |
| 8252 | g_hb.nodeid_to_index = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), |
| 8253 | sizeof(int), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
| 8254 | |
| 8255 | // Initialize unpublished event queue. |
| 8256 | cf_queue_init(&g_hb_event_listeners.external_events_queue, |
| 8257 | sizeof(as_hb_event_node), |
| 8258 | AS_HB_CLUSTER_MAX_SIZE_SOFT, true); |
| 8259 | |
| 8260 | // Initialize the mode specific state. |
| 8261 | hb_mode_init(); |
| 8262 | |
| 8263 | // Initialize the plugin functions. |
| 8264 | hb_plugin_init(); |
| 8265 | |
| 8266 | // Initialize IO channel subsystem. |
| 8267 | channel_init(); |
| 8268 | |
| 8269 | g_hb.status = AS_HB_STATUS_STOPPED; |
| 8270 | |
| 8271 | HB_UNLOCK(); |
| 8272 | } |
| 8273 | |
| 8274 | /** |
| 8275 | * Start the heartbeat subsystem. |
| 8276 | */ |
| 8277 | static void |
| 8278 | hb_start() |
| 8279 | { |
| 8280 | // Operate under a lock. Let's be paranoid everywhere. |
| 8281 | HB_LOCK(); |
| 8282 | |
| 8283 | if (hb_is_running()) { |
| 8284 | // Shutdown the heartbeat subsystem. |
| 8285 | hb_stop(); |
| 8286 | } |
| 8287 | |
| 8288 | g_hb.status = AS_HB_STATUS_RUNNING; |
| 8289 | |
| 8290 | // Initialize the heartbeat message templates. Called from here because |
| 8291 | // fabric needs to be initialized for this call to succeed. Fabric init |
| 8292 | // happens after heartbeat init. |
| 8293 | hb_msg_init(); |
| 8294 | |
| 8295 | // Initialize channel sub module. |
| 8296 | channel_start(); |
| 8297 | |
| 8298 | // Start the mode sub module |
| 8299 | hb_mode_start(); |
| 8300 | |
| 8301 | // Start heart beat transmitter. |
| 8302 | hb_tx_start(); |
| 8303 | |
| 8304 | // Start heart beat adjacency tender. |
| 8305 | hb_adjacency_tender_start(); |
| 8306 | |
| 8307 | HB_UNLOCK(); |
| 8308 | } |
| 8309 | |
| 8310 | /** |
| 8311 | * Shut down the heartbeat subsystem. |
| 8312 | */ |
| 8313 | static void |
| 8314 | hb_stop() |
| 8315 | { |
| 8316 | if (!hb_is_running()) { |
| 8317 | WARNING("heartbeat is already stopped" ); |
| 8318 | return; |
| 8319 | } |
| 8320 | |
| 8321 | HB_LOCK(); |
| 8322 | g_hb.status = AS_HB_STATUS_SHUTTING_DOWN; |
| 8323 | HB_UNLOCK(); |
| 8324 | |
| 8325 | // Publish pending events. Should not delay any events. |
| 8326 | hb_event_publish_pending(); |
| 8327 | |
| 8328 | // Shutdown mode. |
| 8329 | if (hb_is_mesh()) { |
| 8330 | mesh_stop(); |
| 8331 | } |
| 8332 | else { |
| 8333 | multicast_stop(); |
| 8334 | } |
| 8335 | |
| 8336 | // Wait for the threads to shut down. |
| 8337 | hb_tx_stop(); |
| 8338 | |
| 8339 | hb_adjacency_tender_stop(); |
| 8340 | |
| 8341 | // Stop channels. |
| 8342 | channel_stop(); |
| 8343 | |
| 8344 | g_hb.status = AS_HB_STATUS_STOPPED; |
| 8345 | } |
| 8346 | |
| 8347 | /** |
| 8348 | * Register a plugin with the heart beat system. |
| 8349 | */ |
| 8350 | static void |
| 8351 | hb_plugin_register(as_hb_plugin* plugin) |
| 8352 | { |
| 8353 | HB_LOCK(); |
| 8354 | memcpy(&g_hb.plugins[plugin->id], plugin, sizeof(as_hb_plugin)); |
| 8355 | HB_UNLOCK(); |
| 8356 | } |
| 8357 | |
| 8358 | /** |
| 8359 | * Check if the heartbeat recieved is duplicate or stale. |
| 8360 | */ |
| 8361 | static bool |
| 8362 | hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp last_send_ts) |
| 8363 | { |
| 8364 | if (as_hlc_timestamp_order_get(event->msg_hlc_ts.send_ts, last_send_ts) |
| 8365 | == AS_HLC_HAPPENS_BEFORE) { |
| 8366 | // Received a delayed heartbeat send before the current heartbeat. |
| 8367 | return true; |
| 8368 | } |
| 8369 | return false; |
| 8370 | } |
| 8371 | |
| 8372 | /** |
| 8373 | * Update the tracker with endpoint change status. |
| 8374 | */ |
| 8375 | static void |
| 8376 | hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed) |
| 8377 | { |
| 8378 | *tracker = *tracker << 1; |
| 8379 | if (endpoint_changed) { |
| 8380 | (*tracker)++; |
| 8381 | } |
| 8382 | } |
| 8383 | |
| 8384 | /** |
| 8385 | * Indicates if endpoint changes for this node are normal. |
| 8386 | */ |
| 8387 | static bool |
| 8388 | hb_endpoint_change_tracker_is_normal(uint64_t tracker) |
| 8389 | { |
| 8390 | if (tracker == 0) { |
| 8391 | // Normal and healthy case. |
| 8392 | return true; |
| 8393 | } |
| 8394 | |
| 8395 | uint32_t num_intervals_to_track = MIN(64, |
| 8396 | config_endpoint_track_intervals_get()); |
| 8397 | uint64_t mask = ~(~(uint64_t)0 << num_intervals_to_track); |
| 8398 | |
| 8399 | // Ignore older history. |
| 8400 | tracker &= mask; |
| 8401 | |
| 8402 | int flip_count = 0; |
| 8403 | static int nibblebits[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; |
| 8404 | for (; tracker != 0; tracker >>= 4) { |
| 8405 | flip_count += nibblebits[tracker & 0x0f]; |
| 8406 | } |
| 8407 | |
| 8408 | return flip_count <= config_endpoint_changes_allowed_get(); |
| 8409 | } |
| 8410 | |
| 8411 | |
| 8412 | /** |
| 8413 | * Indicates if the change tracker just changed. |
| 8414 | */ |
| 8415 | static bool |
| 8416 | hb_endpoint_change_tracker_has_changed(uint64_t tracker) |
| 8417 | { |
| 8418 | return tracker % 2; |
| 8419 | } |
| 8420 | |
| 8421 | /** |
| 8422 | * Update adjacent node data on receiving a valid pulse message. |
| 8423 | * |
| 8424 | * @return 0 if the update was successfully applied, -1 if the update should be |
| 8425 | * rejected. |
| 8426 | */ |
| 8427 | static int |
| 8428 | hb_adjacent_node_update(as_hb_channel_event* msg_event, |
| 8429 | as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]) |
| 8430 | { |
| 8431 | msg* msg = msg_event->msg; |
| 8432 | |
| 8433 | cf_node source = 0; |
| 8434 | // Channel has validated the source. Don't bother checking here. |
| 8435 | msg_nodeid_get(msg, &source); |
| 8436 | |
| 8437 | msg_id_get(msg, &adjacent_node->protocol_version); |
| 8438 | |
| 8439 | as_hlc_timestamp send_ts = adjacent_node->last_msg_hlc_ts.send_ts; |
| 8440 | |
| 8441 | if (hb_endpoint_change_tracker_has_changed( |
| 8442 | adjacent_node->endpoint_change_tracker)) { |
| 8443 | // Allow a little more slack for obsolete checking because the two nodes |
| 8444 | // might not have matching send timestamps. |
| 8445 | send_ts = as_hlc_timestamp_subtract_ms(send_ts, |
| 8446 | config_tx_interval_get()); |
| 8447 | } |
| 8448 | |
| 8449 | if (hb_msg_is_obsolete(msg_event, send_ts)) { |
| 8450 | WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64, |
| 8451 | send_ts, |
| 8452 | msg_event->msg_hlc_ts.send_ts, source); |
| 8453 | return -1; |
| 8454 | } |
| 8455 | |
| 8456 | // Populate plugin data. |
| 8457 | hb_plugin_msg_parse(msg, adjacent_node, g_hb.plugins, plugin_data_changed); |
| 8458 | |
| 8459 | // Get the ip address. |
| 8460 | as_endpoint_list* msg_endpoint_list; |
| 8461 | if (msg_endpoint_list_get(msg, &msg_endpoint_list) == 0 |
| 8462 | && !as_endpoint_lists_are_equal(adjacent_node->endpoint_list, |
| 8463 | msg_endpoint_list)) { |
| 8464 | // Update the endpoints. |
| 8465 | endpoint_list_copy(&adjacent_node->endpoint_list, msg_endpoint_list); |
| 8466 | } |
| 8467 | |
| 8468 | // Update the last updated time. |
| 8469 | adjacent_node->last_updated_monotonic_ts = cf_getms(); |
| 8470 | memcpy(&adjacent_node->last_msg_hlc_ts, &msg_event->msg_hlc_ts, |
| 8471 | sizeof(adjacent_node->last_msg_hlc_ts)); |
| 8472 | |
| 8473 | // Update the latency. |
| 8474 | int64_t latency = as_hlc_timestamp_diff_ms(msg_event->msg_hlc_ts.send_ts, |
| 8475 | msg_event->msg_hlc_ts.recv_ts); |
| 8476 | latency = latency < 0 ? -latency : latency; |
| 8477 | adjacent_node->avg_latency = ALPHA * latency |
| 8478 | + (1 - ALPHA) * adjacent_node->avg_latency; |
| 8479 | |
| 8480 | // Reset the cluster-name mismatch counter to zero. |
| 8481 | adjacent_node->cluster_name_mismatch_count = 0; |
| 8482 | |
| 8483 | // Check if fabric endpoints have changed. |
| 8484 | as_hb_plugin_node_data* curr_data = |
| 8485 | &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][adjacent_node->plugin_data_cycler |
| 8486 | % 2]; |
| 8487 | |
| 8488 | as_hb_plugin_node_data* prev_data = |
| 8489 | &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][(adjacent_node->plugin_data_cycler |
| 8490 | + 1) % 2]; |
| 8491 | |
| 8492 | as_endpoint_list* curr_fabric_endpoints = |
| 8493 | as_fabric_hb_plugin_get_endpoint_list(curr_data); |
| 8494 | as_endpoint_list* prev_fabric_endpoints = |
| 8495 | as_fabric_hb_plugin_get_endpoint_list(prev_data); |
| 8496 | |
| 8497 | // Endpoints changed if this is not the first update and if the endpoint |
| 8498 | // lists do not match. |
| 8499 | bool endpoints_changed = prev_fabric_endpoints != NULL |
| 8500 | && !as_endpoint_lists_are_equal(curr_fabric_endpoints, |
| 8501 | prev_fabric_endpoints); |
| 8502 | |
| 8503 | if (endpoints_changed) { |
| 8504 | char curr_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE]; |
| 8505 | char prev_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE]; |
| 8506 | |
| 8507 | as_endpoint_list_to_string(curr_fabric_endpoints, |
| 8508 | curr_fabric_endpoints_str, sizeof(curr_fabric_endpoints_str)); |
| 8509 | as_endpoint_list_to_string(prev_fabric_endpoints, |
| 8510 | prev_fabric_endpoints_str, sizeof(prev_fabric_endpoints_str)); |
| 8511 | |
| 8512 | TICKER_WARNING("node: %" PRIx64" fabric endpoints changed from {%s} to {%s}" , source, prev_fabric_endpoints_str, curr_fabric_endpoints_str); |
| 8513 | } |
| 8514 | |
| 8515 | hb_endpoint_change_tracker_update(&adjacent_node->endpoint_change_tracker, |
| 8516 | endpoints_changed); |
| 8517 | |
| 8518 | return 0; |
| 8519 | } |
| 8520 | |
| 8521 | /** |
| 8522 | * Indicates if a node can be considered adjacent, based on accumulated |
| 8523 | * statistics. |
| 8524 | */ |
| 8525 | static bool |
| 8526 | hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node) |
| 8527 | { |
| 8528 | return hb_endpoint_change_tracker_is_normal( |
| 8529 | adjacent_node->endpoint_change_tracker); |
| 8530 | } |
| 8531 | |
| 8532 | /** |
| 8533 | * Process a pulse from source having our node-id. |
| 8534 | */ |
| 8535 | static void |
| 8536 | hb_channel_on_self_pulse(as_hb_channel_event* msg_event) |
| 8537 | { |
| 8538 | bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 }; |
| 8539 | |
| 8540 | HB_LOCK(); |
| 8541 | if (hb_adjacent_node_update(msg_event, &g_hb.self_node, plugin_data_changed) |
| 8542 | != 0) { |
| 8543 | goto Exit; |
| 8544 | } |
| 8545 | |
| 8546 | as_hb_plugin_node_data* curr_data = |
| 8547 | &g_hb.self_node.plugin_data[AS_HB_PLUGIN_FABRIC][g_hb.self_node.plugin_data_cycler |
| 8548 | % 2]; |
| 8549 | as_endpoint_list* curr_fabric_endpoints = |
| 8550 | as_fabric_hb_plugin_get_endpoint_list(curr_data); |
| 8551 | |
| 8552 | if (!as_fabric_is_published_endpoint_list(curr_fabric_endpoints)) { |
| 8553 | // Mark self as having duplicate node-id. |
| 8554 | g_hb.self_is_duplicate = true; |
| 8555 | g_hb.self_duplicate_detected_ts = cf_getms(); |
| 8556 | |
| 8557 | // Found another node with duplicate node-id. |
| 8558 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 8559 | as_endpoint_list_to_string(curr_fabric_endpoints, endpoint_list_str, |
| 8560 | sizeof(endpoint_list_str)); |
| 8561 | TICKER_WARNING("duplicate node-id: %" PRIx64 " with fabric endpoints {%s}" , config_self_nodeid_get(), endpoint_list_str); |
| 8562 | } |
| 8563 | else { |
| 8564 | cf_atomic_int_incr(&g_stats.heartbeat_received_self); |
| 8565 | } |
| 8566 | |
| 8567 | Exit: |
| 8568 | HB_UNLOCK(); |
| 8569 | } |
| 8570 | |
| 8571 | /** |
| 8572 | * Process an incoming pulse message. |
| 8573 | */ |
| 8574 | static void |
| 8575 | hb_channel_on_pulse(as_hb_channel_event* msg_event) |
| 8576 | { |
| 8577 | msg* msg = msg_event->msg; |
| 8578 | cf_node source; |
| 8579 | |
| 8580 | // Print cluster breach only once per second. |
| 8581 | static cf_clock last_cluster_breach_print = 0; |
| 8582 | |
| 8583 | // Channel has validated the source. Don't bother checking here. |
| 8584 | msg_nodeid_get(msg, &source); |
| 8585 | |
| 8586 | if (source == config_self_nodeid_get()) { |
| 8587 | hb_channel_on_self_pulse(msg_event); |
| 8588 | // Ignore self heartbeats. |
| 8589 | return; |
| 8590 | } |
| 8591 | |
| 8592 | HB_LOCK(); |
| 8593 | |
| 8594 | as_hb_adjacent_node adjacent_node = { 0 }; |
| 8595 | |
| 8596 | bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 }; |
| 8597 | bool is_in_adjacency = (hb_adjacent_node_get(source, &adjacent_node) == 0); |
| 8598 | bool should_be_on_probation = false; |
| 8599 | |
| 8600 | if (!is_in_adjacency) { |
| 8601 | hb_on_probation_node_get(source, &adjacent_node); |
| 8602 | } |
| 8603 | |
| 8604 | // Update the adjacent node with contents of the message. |
| 8605 | if (hb_adjacent_node_update(msg_event, &adjacent_node, plugin_data_changed) |
| 8606 | != 0) { |
| 8607 | // Update rejected. |
| 8608 | goto Exit; |
| 8609 | } |
| 8610 | |
| 8611 | // Check if this node needs to be on probation. |
| 8612 | should_be_on_probation = !hb_node_can_consider_adjacent(&adjacent_node); |
| 8613 | |
| 8614 | cf_atomic_int_incr(&g_stats.heartbeat_received_foreign); |
| 8615 | |
| 8616 | bool is_new = !should_be_on_probation && !is_in_adjacency; |
| 8617 | |
| 8618 | if (is_new) { |
| 8619 | int mcsize = config_mcsize(); |
| 8620 | // Note: adjacency list does not contain self node hence |
| 8621 | // (mcsize - 1) in the check. |
| 8622 | if (cf_shash_get_size(g_hb.adjacency) >= (mcsize - 1)) { |
| 8623 | if (last_cluster_breach_print != (cf_getms() / 1000L)) { |
| 8624 | WARNING("ignoring node: %" PRIx64" - exceeding maximum supported cluster size %d" , |
| 8625 | source, mcsize); |
| 8626 | last_cluster_breach_print = cf_getms() / 1000L; |
| 8627 | } |
| 8628 | goto Exit; |
| 8629 | } |
| 8630 | } |
| 8631 | |
| 8632 | // Move the node to appropriate hash. |
| 8633 | cf_shash_put(should_be_on_probation ? g_hb.on_probation : g_hb.adjacency, |
| 8634 | &source, &adjacent_node); |
| 8635 | |
| 8636 | // Maintain mutual exclusion between adjacency and on_probation hashes. |
| 8637 | cf_shash_delete(should_be_on_probation ? g_hb.adjacency : g_hb.on_probation, |
| 8638 | &source); |
| 8639 | |
| 8640 | if (is_new) { |
| 8641 | // Publish event if this is a new node. |
| 8642 | INFO("node arrived %" PRIx64, source); |
| 8643 | hb_event_queue(AS_HB_INTERNAL_NODE_ARRIVE, &source, 1); |
| 8644 | } |
| 8645 | else if (should_be_on_probation && is_in_adjacency) { |
| 8646 | // This node needs to be on probation, most likely due to duplicate |
| 8647 | // node-ids. |
| 8648 | WARNING("node expired %" PRIx64" - potentially duplicate node-id" , source); |
| 8649 | hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, &source, 1); |
| 8650 | } |
| 8651 | |
| 8652 | Exit: |
| 8653 | HB_UNLOCK(); |
| 8654 | |
| 8655 | // Publish any pending node arrival events. |
| 8656 | hb_event_publish_pending(); |
| 8657 | |
| 8658 | if (!should_be_on_probation) { |
| 8659 | // Call plugin change listeners outside of a lock to prevent deadlocks. |
| 8660 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
| 8661 | if (plugin_data_changed[i] && g_hb.plugins[i].change_listener) { |
| 8662 | // Notify that data for this plugin for the source node has |
| 8663 | // changed. |
| 8664 | DETAIL("plugin data for node %" PRIx64" changed for plugin %d" , |
| 8665 | source, i); |
| 8666 | (g_hb.plugins[i]).change_listener(source); |
| 8667 | } |
| 8668 | } |
| 8669 | } |
| 8670 | } |
| 8671 | |
| 8672 | /** |
| 8673 | * Process an incoming heartbeat message. |
| 8674 | */ |
| 8675 | static void |
| 8676 | hb_channel_on_msg_rcvd(as_hb_channel_event* event) |
| 8677 | { |
| 8678 | msg* msg = event->msg; |
| 8679 | as_hb_msg_type type; |
| 8680 | msg_type_get(msg, &type); |
| 8681 | |
| 8682 | switch (type) { |
| 8683 | case AS_HB_MSG_TYPE_PULSE: // A pulse message. Update the adjacent node data. |
| 8684 | hb_channel_on_pulse(event); |
| 8685 | break; |
| 8686 | default: // Ignore other messages. |
| 8687 | break; |
| 8688 | } |
| 8689 | } |
| 8690 | |
| 8691 | /** |
| 8692 | * Increase the cluster-name mismatch counter the node. |
| 8693 | */ |
| 8694 | static void |
| 8695 | hb_handle_cluster_name_mismatch(as_hb_channel_event* event) |
| 8696 | { |
| 8697 | HB_LOCK(); |
| 8698 | |
| 8699 | as_hb_adjacent_node adjacent_node; |
| 8700 | memset(&adjacent_node, 0, sizeof(adjacent_node)); |
| 8701 | |
| 8702 | if (hb_adjacent_node_get(event->nodeid, &adjacent_node) != 0) { |
| 8703 | // Node does not exist in the adjacency list |
| 8704 | goto Exit; |
| 8705 | } |
| 8706 | |
| 8707 | if (hb_msg_is_obsolete(event, adjacent_node.last_msg_hlc_ts.send_ts)) { |
| 8708 | WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64, |
| 8709 | adjacent_node.last_msg_hlc_ts.send_ts, |
| 8710 | event->msg_hlc_ts.send_ts, event->nodeid); |
| 8711 | goto Exit; |
| 8712 | } |
| 8713 | |
| 8714 | // Update the cluster_name_mismatch counter. |
| 8715 | adjacent_node.cluster_name_mismatch_count++; |
| 8716 | cf_shash_put(g_hb.adjacency, &event->nodeid, &adjacent_node); |
| 8717 | Exit: |
| 8718 | HB_UNLOCK(); |
| 8719 | } |
| 8720 | |
| 8721 | /** |
| 8722 | * Process channel events. |
| 8723 | */ |
| 8724 | static void |
| 8725 | hb_channel_event_process(as_hb_channel_event* event) |
| 8726 | { |
| 8727 | // Deal with pulse messages here. |
| 8728 | switch (event->type) { |
| 8729 | case AS_HB_CHANNEL_MSG_RECEIVED: |
| 8730 | hb_channel_on_msg_rcvd(event); |
| 8731 | break; |
| 8732 | case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: |
| 8733 | hb_handle_cluster_name_mismatch(event); |
| 8734 | break; |
| 8735 | default: // Ignore channel active and inactive events. Rather rely on the adjacency |
| 8736 | // tender to expire nodes. |
| 8737 | break; |
| 8738 | } |
| 8739 | } |
| 8740 | |
| 8741 | /** |
| 8742 | * Dump hb mode state to logs. |
| 8743 | * @param verbose enables / disables verbose logging. |
| 8744 | */ |
| 8745 | static void |
| 8746 | hb_mode_dump(bool verbose) |
| 8747 | { |
| 8748 | if (hb_is_mesh()) { |
| 8749 | mesh_dump(verbose); |
| 8750 | } |
| 8751 | else { |
| 8752 | multicast_dump(verbose); |
| 8753 | } |
| 8754 | } |
| 8755 | |
| 8756 | /** |
| 8757 | * Reduce function to dump hb node info to log file. |
| 8758 | */ |
| 8759 | static int |
| 8760 | hb_dump_reduce(const void* key, void* data, void* udata) |
| 8761 | { |
| 8762 | const cf_node* nodeid = (const cf_node*)key; |
| 8763 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
| 8764 | |
| 8765 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
| 8766 | as_endpoint_list_to_string(adjacent_node->endpoint_list, endpoint_list_str, |
| 8767 | sizeof(endpoint_list_str)); |
| 8768 | |
| 8769 | INFO("\tHB %s Node: node-id %" PRIx64" protocol %" PRIu32" endpoints {%s} last-updated %" PRIu64 " latency-ms %" PRIu64 , |
| 8770 | (char*)udata, |
| 8771 | *nodeid, adjacent_node->protocol_version, endpoint_list_str, |
| 8772 | adjacent_node->last_updated_monotonic_ts, adjacent_node->avg_latency); |
| 8773 | |
| 8774 | return CF_SHASH_OK; |
| 8775 | } |
| 8776 | |
| 8777 | /** |
| 8778 | * Dump hb state to logs. |
| 8779 | * @param verbose enables / disables verbose logging. |
| 8780 | */ |
| 8781 | static void |
| 8782 | hb_dump(bool verbose) |
| 8783 | { |
| 8784 | HB_LOCK(); |
| 8785 | |
| 8786 | INFO("HB Adjacency Size: %d" , cf_shash_get_size(g_hb.adjacency)); |
| 8787 | |
| 8788 | if (verbose) { |
| 8789 | cf_shash_reduce(g_hb.adjacency, hb_dump_reduce, "Adjacent" ); |
| 8790 | } |
| 8791 | |
| 8792 | if (cf_shash_get_size(g_hb.on_probation)) { |
| 8793 | INFO("HB On-probation Size: %d" , cf_shash_get_size(g_hb.on_probation)); |
| 8794 | |
| 8795 | if (verbose) { |
| 8796 | cf_shash_reduce(g_hb.on_probation, hb_dump_reduce, "On-probation" ); |
| 8797 | } |
| 8798 | } |
| 8799 | |
| 8800 | HB_UNLOCK(); |
| 8801 | } |
| 8802 | |
| 8803 | /** |
| 8804 | * Compute a complement / inverted adjacency graph for input nodes such that |
| 8805 | * entry |
| 8806 | * |
| 8807 | * inverted_graph[i][j] = 0 iff node[i] and node[j] are in each others adjacency |
| 8808 | * lists. That is they have a bidirectional network link active between them. |
| 8809 | * |
| 8810 | * else |
| 8811 | * |
| 8812 | * inverted_graph[i][j] > 0 iff there is no link or a unidirectional link |
| 8813 | * between them. |
| 8814 | * |
| 8815 | * |
| 8816 | * @param nodes the input vector of nodes. |
| 8817 | * @param inverted_graph (output) a (num_nodes x num_nodes ) 2D byte array. |
| 8818 | */ |
| 8819 | static void |
| 8820 | hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph) |
| 8821 | { |
| 8822 | HB_LOCK(); |
| 8823 | int num_nodes = cf_vector_size(nodes); |
| 8824 | |
| 8825 | for (int i = 0; i < num_nodes; i++) { |
| 8826 | for (int j = 0; j < num_nodes; j++) { |
| 8827 | inverted_graph[i][j] = 2; |
| 8828 | } |
| 8829 | cf_node nodeid = 0; |
| 8830 | cf_vector_get(nodes, i, &nodeid); |
| 8831 | cf_shash_put(g_hb.nodeid_to_index, &nodeid, &i); |
| 8832 | } |
| 8833 | |
| 8834 | cf_node self_nodeid = config_self_nodeid_get(); |
| 8835 | int self_node_index = -1; |
| 8836 | cf_shash_get(g_hb.nodeid_to_index, &self_nodeid, &self_node_index); |
| 8837 | |
| 8838 | for (int i = 0; i < num_nodes; i++) { |
| 8839 | // Mark the node connected from itself, i.e, disconnected in the |
| 8840 | // inverted graph. |
| 8841 | inverted_graph[i][i] = 0; |
| 8842 | |
| 8843 | cf_node node = *(cf_node*)cf_vector_getp(nodes, i); |
| 8844 | as_hb_adjacent_node node_info; |
| 8845 | |
| 8846 | if (hb_adjacent_node_get(node, &node_info) == 0) { |
| 8847 | if (self_node_index >= 0) { |
| 8848 | // Self node will not have plugin data. But the fact that this |
| 8849 | // node has an adjacent node indicates that is is in our |
| 8850 | // adjacency list. Adjust the graph. |
| 8851 | inverted_graph[i][self_node_index]--; |
| 8852 | inverted_graph[self_node_index][i]--; |
| 8853 | } |
| 8854 | |
| 8855 | cf_node* adjacency_list = NULL; |
| 8856 | size_t adjacency_length = 0; |
| 8857 | hb_adjacent_node_adjacency_get(&node_info, &adjacency_list, &adjacency_length); |
| 8858 | |
| 8859 | for (int j = 0; j < adjacency_length; j++) { |
| 8860 | int other_node_index = -1; |
| 8861 | cf_shash_get(g_hb.nodeid_to_index, &adjacency_list[j], |
| 8862 | &other_node_index); |
| 8863 | if (other_node_index < 0) { |
| 8864 | // This node is not in the input set of nodes. |
| 8865 | continue; |
| 8866 | } |
| 8867 | |
| 8868 | if (i != other_node_index) { |
| 8869 | inverted_graph[i][other_node_index]--; |
| 8870 | inverted_graph[other_node_index][i]--; |
| 8871 | } |
| 8872 | } |
| 8873 | } |
| 8874 | } |
| 8875 | |
| 8876 | // Cleanup the temporary hash. |
| 8877 | cf_shash_delete_all(g_hb.nodeid_to_index); |
| 8878 | |
| 8879 | HB_UNLOCK(); |
| 8880 | } |
| 8881 | |
| 8882 | /** |
| 8883 | * Compute the nodes to evict from the input nodes so that remaining nodes form |
| 8884 | * a clique, based on adjacency lists using minimal vertex cover. |
| 8885 | * |
| 8886 | * The minimal vertex cover on this graph is the set of nodes that should be |
| 8887 | * removed to result in a clique on the remaining nodes. This implementation is |
| 8888 | * an approximation of the minimal vertex cover. The notion is to keep removing |
| 8889 | * vertices having the highest degree until there are no more edges remaining. |
| 8890 | * The heuristic gets rid of the more problematic nodes first. |
| 8891 | * |
| 8892 | * @param nodes input cf_node vector. |
| 8893 | * @param nodes_to_evict output cf_node clique array, that is initialized. |
| 8894 | */ |
| 8895 | static void |
| 8896 | hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict) |
| 8897 | { |
| 8898 | int num_nodes = cf_vector_size(nodes); |
| 8899 | |
| 8900 | if (num_nodes == 0) { |
| 8901 | // Nothing to do. |
| 8902 | return; |
| 8903 | } |
| 8904 | |
| 8905 | int graph_alloc_size = sizeof(uint8_t) * num_nodes * num_nodes; |
| 8906 | void* graph_data = MSG_BUFF_ALLOC(graph_alloc_size); |
| 8907 | |
| 8908 | if (!graph_data) { |
| 8909 | CRASH("error allocating space for clique finding data structure" ); |
| 8910 | } |
| 8911 | |
| 8912 | uint8_t* inverted_graph[num_nodes]; |
| 8913 | inverted_graph[0] = graph_data; |
| 8914 | for (int i = 1; i < num_nodes; i++) { |
| 8915 | inverted_graph[i] = *inverted_graph + num_nodes * i; |
| 8916 | } |
| 8917 | |
| 8918 | hb_adjacency_graph_invert(nodes, inverted_graph); |
| 8919 | |
| 8920 | // Count the number of edges in the inverted graph. These edges are the ones |
| 8921 | // that need to be removed so that the remaining nodes form a clique in the |
| 8922 | // adjacency graph. Also for performance get hold of the self node index in |
| 8923 | // the nodes vector. |
| 8924 | int edge_count = 0; |
| 8925 | int self_node_index = -1; |
| 8926 | for (int i = 0; i < num_nodes; i++) { |
| 8927 | cf_node node = 0; |
| 8928 | cf_vector_get(nodes, i, &node); |
| 8929 | if (node == config_self_nodeid_get()) { |
| 8930 | self_node_index = i; |
| 8931 | } |
| 8932 | |
| 8933 | for (int j = 0; j < num_nodes; j++) { |
| 8934 | if (inverted_graph[i][j]) { |
| 8935 | edge_count++; |
| 8936 | } |
| 8937 | } |
| 8938 | } |
| 8939 | |
| 8940 | cf_vector_delete_range(nodes_to_evict, 0, |
| 8941 | cf_vector_size(nodes_to_evict) - 1); |
| 8942 | |
| 8943 | // Since we always decide to retain self node, first get rid of all nodes |
| 8944 | // having missing links to self node. |
| 8945 | if (self_node_index >= 0) { |
| 8946 | for (int i = 0; i < num_nodes; i++) { |
| 8947 | if (inverted_graph[self_node_index][i] |
| 8948 | || inverted_graph[i][self_node_index]) { |
| 8949 | cf_node to_evict = 0; |
| 8950 | cf_vector_get(nodes, i, &to_evict); |
| 8951 | DEBUG("marking node %" PRIx64" for clique based eviction" , |
| 8952 | to_evict); |
| 8953 | |
| 8954 | cf_vector_append(nodes_to_evict, &to_evict); |
| 8955 | |
| 8956 | // Remove all edges attached to the removed node. |
| 8957 | for (int j = 0; j < num_nodes; j++) { |
| 8958 | if (inverted_graph[i][j]) { |
| 8959 | inverted_graph[i][j] = 0; |
| 8960 | edge_count--; |
| 8961 | } |
| 8962 | if (inverted_graph[j][i]) { |
| 8963 | inverted_graph[j][i] = 0; |
| 8964 | edge_count--; |
| 8965 | } |
| 8966 | } |
| 8967 | } |
| 8968 | } |
| 8969 | } |
| 8970 | |
| 8971 | while (edge_count > 0) { |
| 8972 | // Find vertex with highest degree. |
| 8973 | cf_node max_degree_node = 0; |
| 8974 | int max_degree_node_idx = -1; |
| 8975 | int max_degree = 0; |
| 8976 | |
| 8977 | for (int i = 0; i < num_nodes; i++) { |
| 8978 | cf_node to_evict = 0; |
| 8979 | cf_vector_get(nodes, i, &to_evict); |
| 8980 | |
| 8981 | if (vector_find(nodes_to_evict, &to_evict) >= 0) { |
| 8982 | // We have already decided to evict this node. |
| 8983 | continue; |
| 8984 | } |
| 8985 | |
| 8986 | if (to_evict == config_self_nodeid_get()) { |
| 8987 | // Do not evict self. |
| 8988 | continue; |
| 8989 | } |
| 8990 | |
| 8991 | // Get the degree of this node. |
| 8992 | int degree = 0; |
| 8993 | for (int j = 0; j < num_nodes; j++) { |
| 8994 | if (inverted_graph[i][j]) { |
| 8995 | degree++; |
| 8996 | } |
| 8997 | } |
| 8998 | |
| 8999 | DETAIL("inverted degree for node %" PRIx64" is %d" , |
| 9000 | to_evict, degree); |
| 9001 | |
| 9002 | // See if this node has a higher degree. On ties choose the node |
| 9003 | // with a smaller nodeid |
| 9004 | if (degree > max_degree |
| 9005 | || (degree == max_degree && max_degree_node > to_evict)) { |
| 9006 | max_degree = degree; |
| 9007 | max_degree_node = to_evict; |
| 9008 | max_degree_node_idx = i; |
| 9009 | } |
| 9010 | } |
| 9011 | |
| 9012 | if (max_degree_node_idx < 0) { |
| 9013 | // We are done no node to evict. |
| 9014 | break; |
| 9015 | } |
| 9016 | |
| 9017 | DEBUG("marking node %" PRIx64" with degree %d for clique based eviction" , |
| 9018 | max_degree_node, max_degree); |
| 9019 | |
| 9020 | cf_vector_append(nodes_to_evict, &max_degree_node); |
| 9021 | |
| 9022 | // Remove all edges attached to the removed node. |
| 9023 | for (int i = 0; i < num_nodes; i++) { |
| 9024 | if (inverted_graph[max_degree_node_idx][i]) { |
| 9025 | inverted_graph[max_degree_node_idx][i] = 0; |
| 9026 | edge_count--; |
| 9027 | } |
| 9028 | if (inverted_graph[i][max_degree_node_idx]) { |
| 9029 | inverted_graph[i][max_degree_node_idx] = 0; |
| 9030 | edge_count--; |
| 9031 | } |
| 9032 | } |
| 9033 | } |
| 9034 | |
| 9035 | MSG_BUFF_FREE(graph_data, graph_alloc_size); |
| 9036 | } |
| 9037 | |
| 9038 | /** |
| 9039 | * Reduce function to iterate over plugin data for all adjacent nodes. |
| 9040 | */ |
| 9041 | static int |
| 9042 | hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata) |
| 9043 | { |
| 9044 | const cf_node* nodeid = (const cf_node*)key; |
| 9045 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
| 9046 | as_hb_adjacecny_iterate_reduce_udata* reduce_udata = |
| 9047 | (as_hb_adjacecny_iterate_reduce_udata*)udata; |
| 9048 | |
| 9049 | size_t plugin_data_size = |
| 9050 | adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler |
| 9051 | % 2].data_size; |
| 9052 | void* plugin_data = |
| 9053 | plugin_data_size ? |
| 9054 | adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler |
| 9055 | % 2].data : NULL; |
| 9056 | |
| 9057 | reduce_udata->iterate_fn(*nodeid, plugin_data, plugin_data_size, |
| 9058 | adjacent_node->last_updated_monotonic_ts, |
| 9059 | &adjacent_node->last_msg_hlc_ts, reduce_udata->udata); |
| 9060 | |
| 9061 | return CF_SHASH_OK; |
| 9062 | } |
| 9063 | |
| 9064 | /** |
| 9065 | * Call the iterate method on all nodes in current adjacency list. Note plugin |
| 9066 | * data can still be NULL if the plugin data failed to parse the plugin data. |
| 9067 | * |
| 9068 | * @param pluginid the plugin identifier. |
| 9069 | * @param iterate_fn the iterate function invoked for plugin data forevery node. |
| 9070 | * @param udata passed as is to the iterate function. Useful for getting results |
| 9071 | * out of the iteration. NULL if there is no plugin data. |
| 9072 | * @return the size of the plugin data. 0 if there is no plugin data. |
| 9073 | */ |
| 9074 | static void |
| 9075 | hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, |
| 9076 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata) |
| 9077 | { |
| 9078 | HB_LOCK(); |
| 9079 | |
| 9080 | as_hb_adjacecny_iterate_reduce_udata reduce_udata; |
| 9081 | reduce_udata.pluginid = pluginid; |
| 9082 | reduce_udata.iterate_fn = iterate_fn; |
| 9083 | reduce_udata.udata = udata; |
| 9084 | cf_shash_reduce(g_hb.adjacency, hb_plugin_data_iterate_reduce, |
| 9085 | &reduce_udata); |
| 9086 | |
| 9087 | HB_UNLOCK(); |
| 9088 | } |
| 9089 | |