1 | /* |
2 | * hb.c |
3 | * |
4 | * Copyright (C) 2012-2017 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #include "fabric/hb.h" |
24 | |
25 | #include <errno.h> |
26 | #include <limits.h> |
27 | #include <math.h> |
28 | #include <pthread.h> |
29 | #include <stdio.h> |
30 | #include <sys/param.h> |
31 | #include <sys/types.h> |
32 | #include <zlib.h> |
33 | |
34 | #include "citrusleaf/alloc.h" |
35 | #include "citrusleaf/cf_atomic.h" |
36 | #include "citrusleaf/cf_clock.h" |
37 | #include "citrusleaf/cf_hash_math.h" |
38 | #include "citrusleaf/cf_queue.h" |
39 | |
40 | #include "cf_thread.h" |
41 | #include "dns.h" |
42 | #include "fault.h" |
43 | #include "node.h" |
44 | #include "shash.h" |
45 | #include "socket.h" |
46 | |
47 | #include "base/cfg.h" |
48 | #include "base/health.h" |
49 | #include "base/stats.h" |
50 | #include "base/thr_info.h" |
51 | #include "fabric/endpoint.h" |
52 | #include "fabric/fabric.h" |
53 | #include "fabric/partition_balance.h" |
54 | |
55 | /* |
56 | * Overview |
57 | * ======== |
58 | * The heartbeat subsystem is a core clustering module that discovers nodes in |
59 | * the cluster and monitors connectivity to them. This subsystem maintains an |
60 | * "adjacency list", which is the list of nodes deemed to be alive and connected |
61 | * at any instance in time. |
62 | * |
 * The heartbeat subsystem is divided into four sub modules
64 | * 1. Config |
65 | * 2. Channel |
66 | * 3. Mesh |
67 | * 4. Main |
68 | * |
69 | * Config |
70 | * ------ |
71 | * This sub module deals with overall heartbeat subsystem configuration and |
72 | * dynamic updates to configuration. |
73 | * |
74 | * Channel |
75 | * ------- |
76 | * This sub module is responsible for maintaining a channel between this node |
77 | * and all known nodes. The channel sub module provides the ability to broadcast |
78 | * or uni cast messages to known nodes. |
79 | * |
80 | * Other modules interact with the channel sub module primarily through events |
81 | * raised by the channel sub module. The events help other sub modules infer |
82 | * connectivity status to known nodes and react to incoming heartbeat message |
83 | * from other nodes. |
84 | * |
 * Depending on the configured mode (mesh, multicast) the channels between this
 * node and other nodes could be
87 | * 1. TCP and hence unicast. One per pair of nodes. |
88 | * 2. Multicast with UDP. One per cluster. |
89 | * |
90 | * Mesh |
91 | * ---- |
 * This sub module is responsible for discovering cluster members. New nodes are
 * discovered via adjacency lists published in the heartbeats of known nodes.
 * The mesh module boots up using configured seed nodes.
95 | * |
96 | * Main |
97 | * ---- |
98 | * This sub module orchestrates other modules and hence main. Its primary |
99 | * responsibility is to maintain the adjacency list. |
100 | * |
101 | * Heartbeat messages |
102 | * ================== |
103 | * |
104 | * Every heartbeat message contains |
105 | * 1. the source node's nodeid |
106 | * 2. the source node's published ip address |
107 | * 3. the source node's published port. |
108 | * |
109 | * There are the following types of heartbeat messages |
110 | * 1. Pulse - messages sent at periodic intervals. Will contain current |
111 | * adjacency lists |
112 | * 2. Info request - message sent in the mesh mode, to a known mesh node, |
113 | * in order to get ip address and port of a newly discovered node. |
114 | * 3. Info reply - message sent in response to an info request. Returns |
115 | * the node's ip address and port. |
116 | * |
117 | * Message conventions |
118 | * ------------------- |
119 | * 1. Published adjacency will always contain the source node. |
120 | * |
121 | * Design philosophy |
122 | * ================= |
123 | * |
124 | * Locking vs single threaded event loop. |
125 | * -------------------------------------- |
126 | * This first cut leans toward using locks instead of single threaded event |
127 | * loops to protect critical data. The choice is driven by the fact that |
128 | * synchronous external and inter-sub module interaction looked like more work |
129 | * with single threaded event loops. The design chooses simplicity over |
130 | * performance given the lower volumes of events that need to be processed here |
 * as compared to the transaction processing code. The locks are coarse, one per
 * sub module and re-entrant. They are used generously and no function assumes
 * that prior locks are already held.
134 | * |
135 | * Inter-module interactions in some cases are via synchronous function calls, |
136 | * which run the risk of deadlocks. For now, deadlocks should not happen. |
137 | * However, if this ideology complicates code, inter-module interaction will be |
138 | * rewritten to use asynchronous event queues. |
139 | * |
140 | * Locking policy |
141 | * ============== |
142 | * |
143 | * 1. Lock as much as you can. The locks are re-entrant. This is not a critical |
144 | * high volume code path, and hence correctness with simplicity is preferred. |
145 | * Any read / write access to module state should be under a lock. |
146 | * 2. Preventing deadlocks |
147 | * a. The enforced lock order is |
148 | * 1. Protocol lock (SET_PROTOCOL_LOCK) Uses to ensure protocol set is |
149 | * atomic. |
150 | * 2. Main module (HB_LOCK) |
151 | * 3. Mesh and multicast modules (MESH_LOCK) |
152 | * 4. Channel (CHANNEL_LOCK) |
153 | * 5. Config (HB_CONFIG_LOCK) |
 *    Always make sure every thread acquires locks in this order ONLY. In terms
 *    of function calls only lower numbered modules can call functions from the
 *    higher numbered modules while holding onto their locks.
157 | * 3. Events raised / messages passed to listeners should be outside the |
158 | * module's lock. |
159 | * |
160 | * Guidelines for message plugins |
161 | * ============================== |
162 | * The parse data functions should NOT hold any locks and thus avert deadlocks. |
163 | * |
164 | * TODO |
165 | * ==== |
166 | * 1. Extend to allow hostnames in mesh mode across the board. |
167 | */ |
168 | |
169 | /* |
170 | * ---------------------------------------------------------------------------- |
171 | * Macros |
172 | * ---------------------------------------------------------------------------- |
173 | */ |
174 | |
175 | /* |
176 | * ---------------------------------------------------------------------------- |
177 | * Channel |
178 | * ---------------------------------------------------------------------------- |
179 | */ |
180 | |
181 | /** |
182 | * Size of the poll events set. |
183 | */ |
184 | #define POLL_SZ 1024 |
185 | |
186 | /** |
187 | * The number of bytes for the message length on the wire. |
188 | */ |
189 | #define MSG_WIRE_LENGTH_SIZE 4 |
190 | |
191 | /** |
192 | * Channel idle interval after which check for inactive channel is triggered. |
193 | */ |
194 | #define CHANNEL_IDLE_CHECK_PERIOD (CHANNEL_NODE_READ_IDLE_TIMEOUT() / 2) |
195 | |
196 | /** |
197 | * A channel times out if there is no msg received from a node in this interval. |
198 | * Set to a fraction of node timeout so that a new channel could be set up to |
199 | * recover from a potentially bad connection before the node times out. |
200 | */ |
201 | #define CHANNEL_NODE_READ_IDLE_TIMEOUT() \ |
202 | (PULSE_TRANSMIT_INTERVAL() \ |
203 | * MAX(2, config_max_intervals_missed_get() / 3)) |
204 | |
205 | /** |
206 | * Acquire a lock on the entire channel sub module. |
207 | */ |
208 | #define CHANNEL_LOCK() (pthread_mutex_lock(&g_channel_lock)) |
209 | |
210 | /** |
211 | * Relinquish the lock on the entire channel sub module. |
212 | */ |
213 | #define CHANNEL_UNLOCK() (pthread_mutex_unlock(&g_channel_lock)) |
214 | |
215 | /* |
216 | * ---------------------------------------------------------------------------- |
217 | * Mesh and Multicast |
218 | * ---------------------------------------------------------------------------- |
219 | */ |
220 | |
/**
 * Read write timeout (in ms).
 */
#define MESH_RW_TIMEOUT 5

/**
 * Size of the network header.
 *
 * Maximum size of IPv4 header - 20 bytes (assuming no variable length fields)
 * Fixed size of IPv6 header - 40 bytes (assuming no extension headers)
 * Maximum size of TCP header - 60 Bytes
 * Size of UDP header (fixed) - 8 bytes
 * So maximum size of empty TCP datagram - 60 + 20 = 80 bytes
 * So maximum size of empty IPv4 UDP datagram - 20 + 8 = 28 bytes
 * So maximum size of empty IPv6 UDP datagram - 40 + 8 = 48 bytes
 *
 * Being conservative and assuming 30 bytes for IPv4 UDP header and 50 bytes for
 * IPv6 UDP header.
 */
// FIX: the macro name was missing ("#define 50"), which is an invalid
// preprocessing directive and breaks the build. Restored the identifier
// implied by the comment above (conservative IPv6 UDP header size).
#define UDP_HEADER_SIZE_MAX 50
241 | |
242 | /** |
243 | * Expected ratio - (input size) / (compressed size). Assuming 40% decrease in |
244 | * size after compression. |
245 | */ |
246 | #define MSG_COMPRESSION_RATIO (1.0 / 0.60) |
247 | |
248 | /** |
249 | * Mesh timeout for pending nodes. |
250 | */ |
251 | #define MESH_PENDING_TIMEOUT (CONNECT_TIMEOUT()) |
252 | |
253 | /** |
254 | * Mesh inactive timeout after which a mesh node will be forgotten. |
255 | */ |
256 | #define MESH_INACTIVE_TIMEOUT (10 * HB_NODE_TIMEOUT()) |
257 | |
258 | /** |
259 | * Mesh timeout for getting the endpoint for a node after which this node will |
260 | * be forgotten. |
261 | */ |
262 | #define MESH_ENDPOINT_UNKNOWN_TIMEOUT (HB_NODE_TIMEOUT()) |
263 | |
264 | /** |
265 | * Intervals at which mesh tender runs. |
266 | */ |
267 | #define MESH_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL()) |
268 | |
269 | /** |
270 | * Intervals at which attempts to resolve unresolved seed hostname will be made. |
271 | */ |
272 | #define MESH_SEED_RESOLVE_ATTEMPT_INTERVAL() (HB_NODE_TIMEOUT()) |
273 | |
274 | /** |
275 | * Intervals at which conflict checks is enabled. |
276 | */ |
277 | #define MESH_CONFLICT_CHECK_INTERVAL() (5 * HB_NODE_TIMEOUT()) |
278 | |
279 | /** |
280 | * Duration for which conflicts are checked. |
281 | */ |
282 | #define MESH_CONFLICT_CHECK_DURATION() (MESH_CONFLICT_CHECK_INTERVAL() / 5) |
283 | |
284 | /** |
285 | * Acquire a lock on the entire mesh sub module. |
286 | */ |
287 | #define MESH_LOCK() (pthread_mutex_lock(&g_mesh_lock)) |
288 | |
289 | /** |
290 | * Relinquish the lock on the entire mesh sub module. |
291 | */ |
292 | #define MESH_UNLOCK() (pthread_mutex_unlock(&g_mesh_lock)) |
293 | |
294 | /** |
295 | * Acquire a lock on the entire multicast sub module. |
296 | */ |
297 | #define MULTICAST_LOCK() (pthread_mutex_lock(&g_multicast_lock)) |
298 | |
299 | /** |
300 | * Relinquish the lock on the entire multicast sub module. |
301 | */ |
302 | #define MULTICAST_UNLOCK() (pthread_mutex_unlock(&g_multicast_lock)) |
303 | |
304 | /* |
305 | * ---------------------------------------------------------------------------- |
306 | * Main |
307 | * ---------------------------------------------------------------------------- |
308 | */ |
309 | |
310 | /** |
311 | * The identifier for heartbeat protocol version 3. |
312 | */ |
313 | #define HB_PROTOCOL_V3_IDENTIFIER 0x6864 |
314 | |
315 | /** |
316 | * Maximum length of hb protocol string. |
317 | */ |
318 | #define HB_PROTOCOL_STR_MAX_LEN 16 |
319 | |
320 | /** |
321 | * Default allocation size for plugin data. |
322 | */ |
323 | #define HB_PLUGIN_DATA_DEFAULT_SIZE 128 |
324 | |
325 | /** |
326 | * Block size for allocating node plugin data. Ensure the allocation is in |
327 | * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating. |
328 | */ |
329 | #define HB_PLUGIN_DATA_BLOCK_SIZE 128 |
330 | |
331 | /** |
332 | * Message scratch size for v3 HB messages. To accommodate 64 node cluster. |
333 | */ |
334 | #define AS_HB_MSG_SCRATCH_SIZE 1024 |
335 | |
336 | /** |
337 | * A soft limit for the maximum cluster size. Meant to be optimize hash and list |
338 | * data structures and not as a limit on the number of nodes. |
339 | */ |
340 | #define AS_HB_CLUSTER_MAX_SIZE_SOFT 200 |
341 | |
342 | /** |
343 | * Maximum event listeners. |
344 | */ |
345 | #define AS_HB_EVENT_LISTENER_MAX 7 |
346 | |
347 | /** |
348 | * Maximum permissible cluster-name mismatch per node. |
349 | */ |
350 | #define CLUSTER_NAME_MISMATCH_MAX 2 |
351 | |
352 | /** |
353 | * Timeout for deeming a node dead based on received heartbeats. |
354 | */ |
355 | #define HB_NODE_TIMEOUT() \ |
356 | ((config_max_intervals_missed_get() * config_tx_interval_get())) |
357 | |
358 | /** |
359 | * Intervals at which heartbeats are send. |
360 | */ |
361 | #define PULSE_TRANSMIT_INTERVAL() \ |
362 | (MAX(config_tx_interval_get(), AS_HB_TX_INTERVAL_MS_MIN)) |
363 | |
364 | /** |
365 | * Intervals at which adjacency tender runs. |
366 | */ |
367 | #define ADJACENCY_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL()) |
368 | |
369 | /** |
370 | * Intervals at which adjacency tender runs in anticipation of addtional node |
371 | * depart events. |
372 | */ |
373 | #define ADJACENCY_FAST_TEND_INTERVAL (MIN(ADJACENCY_TEND_INTERVAL, 10)) |
374 | |
375 | /** |
376 | * Acquire a lock on the external event publisher. |
377 | */ |
378 | #define EXTERNAL_EVENT_PUBLISH_LOCK() \ |
379 | (pthread_mutex_lock(&g_external_event_publish_lock)) |
380 | |
381 | /** |
382 | * Relinquish the lock on the external event publisher. |
383 | */ |
384 | #define EXTERNAL_EVENT_PUBLISH_UNLOCK() \ |
385 | (pthread_mutex_unlock(&g_external_event_publish_lock)) |
386 | |
387 | /** |
388 | * Acquire a lock on the heartbeat main module. |
389 | */ |
390 | #define HB_LOCK() (pthread_mutex_lock(&g_hb_lock)) |
391 | |
392 | /** |
393 | * Relinquish the lock on the heartbeat main module. |
394 | */ |
395 | #define HB_UNLOCK() (pthread_mutex_unlock(&g_hb_lock)) |
396 | |
397 | /** |
398 | * Weightage of current latency over current moving average. For now weigh |
399 | * recent values heavily over older values. |
400 | */ |
401 | #define ALPHA (0.65) |
402 | |
403 | /* |
404 | * ---------------------------------------------------------------------------- |
405 | * Common |
406 | * ---------------------------------------------------------------------------- |
407 | */ |
408 | |
409 | /** |
410 | * The default MTU for multicast in case device discovery fails. |
411 | */ |
412 | #define DEFAULT_MIN_MTU 1500 |
413 | |
414 | /** |
415 | * Maximum memory size allocated on the call stack. |
416 | */ |
417 | #define STACK_ALLOC_LIMIT (16 * 1024) |
418 | |
419 | /** |
420 | * Max string length for an endpoint list converted to a string. |
421 | */ |
422 | #define ENDPOINT_LIST_STR_SIZE 1024 |
423 | |
424 | /** |
425 | * A hard limit on the buffer size for parsing incoming messages. |
426 | */ |
427 | #define MSG_BUFFER_MAX_SIZE (10 * 1024 * 1024) |
428 | |
429 | #ifndef ASC |
430 | #define ASC (2 << 2) |
431 | #endif |
432 | |
433 | /** |
434 | * Connection initiation timeout, Capped at 100 ms. |
435 | */ |
436 | #define CONNECT_TIMEOUT() (MIN(100, config_tx_interval_get())) |
437 | |
438 | /** |
439 | * Allocate a buffer for heart beat messages. Larger buffers are heap allocated |
440 | * to prevent stack overflows. |
441 | */ |
442 | #define MSG_BUFF_ALLOC(size) ( \ |
443 | (size) <= MSG_BUFFER_MAX_SIZE ? \ |
444 | (((size) > STACK_ALLOC_LIMIT) ? \ |
445 | cf_malloc(size) : alloca(size)) : NULL) |
446 | |
447 | /** |
448 | * Allocate a buffer for heart beat messages. Larger buffers are heap allocated |
449 | * to prevent stack overflows. Crashes the process on failure to allocate the |
450 | * buffer. |
451 | */ |
452 | #define MSG_BUFF_ALLOC_OR_DIE(size, crash_msg, ...) \ |
453 | ({ \ |
454 | uint8_t* retval = MSG_BUFF_ALLOC((size)); \ |
455 | if (!retval) { \ |
456 | CRASH(crash_msg, ##__VA_ARGS__); \ |
457 | } \ |
458 | retval; \ |
459 | }) |
460 | |
461 | /** |
462 | * Free the buffer allocated by MSG_BUFF_ALLOC |
463 | */ |
464 | #define MSG_BUFF_FREE(buffer, size) \ |
465 | if (((size) > STACK_ALLOC_LIMIT) && buffer) {cf_free(buffer);} |
466 | |
467 | /** |
468 | * Acquire a lock on the entire config sub module. |
469 | */ |
470 | #define HB_CONFIG_LOCK() (pthread_mutex_lock(&g_hb_config_lock)) |
471 | |
472 | /** |
473 | * Relinquish the lock on the entire config sub module. |
474 | */ |
475 | #define HB_CONFIG_UNLOCK() (pthread_mutex_unlock(&g_hb_config_lock)) |
476 | |
477 | /** |
478 | * Acquire a lock while setting heartbeat protocol dynamically. |
479 | */ |
480 | #define SET_PROTOCOL_LOCK() (pthread_mutex_lock(&g_set_protocol_lock)) |
481 | |
482 | /** |
483 | * Relinquish the lock after setting heartbeat protocol dynamically. |
484 | */ |
485 | #define SET_PROTOCOL_UNLOCK() (pthread_mutex_unlock(&g_set_protocol_lock)) |
486 | |
487 | /** |
488 | * Logging macros. |
489 | */ |
490 | #define CRASH(format, ...) cf_crash(AS_HB, format, ##__VA_ARGS__) |
491 | #define CRASH_NOSTACK(format, ...) cf_crash_nostack(AS_HB, format, ##__VA_ARGS__) |
492 | #define WARNING(format, ...) cf_warning(AS_HB, format, ##__VA_ARGS__) |
493 | #define TICKER_WARNING(format, ...) \ |
494 | cf_ticker_warning(AS_HB, format, ##__VA_ARGS__) |
495 | #define INFO(format, ...) cf_info(AS_HB, format, ##__VA_ARGS__) |
496 | #define DEBUG(format, ...) cf_debug(AS_HB, format, ##__VA_ARGS__) |
497 | #define DETAIL(format, ...) cf_detail(AS_HB, format, ##__VA_ARGS__) |
498 | #define ASSERT(expression, message, ...) \ |
499 | if (!(expression)) {WARNING(message, ##__VA_ARGS__);} |
500 | |
501 | /* |
502 | * ---------------------------------------------------------------------------- |
503 | * Private internal data structures |
504 | * ---------------------------------------------------------------------------- |
505 | */ |
506 | |
507 | /* |
508 | * ---------------------------------------------------------------------------- |
509 | * Common |
510 | * ---------------------------------------------------------------------------- |
511 | */ |
512 | |
513 | /** |
514 | * Heartbeat subsystem state. |
515 | */ |
516 | typedef enum |
517 | { |
518 | AS_HB_STATUS_UNINITIALIZED, |
519 | AS_HB_STATUS_RUNNING, |
520 | AS_HB_STATUS_SHUTTING_DOWN, |
521 | AS_HB_STATUS_STOPPED |
522 | } as_hb_status; |
523 | |
524 | /* |
525 | * ---------------------------------------------------------------------------- |
526 | * Mesh related |
527 | * ---------------------------------------------------------------------------- |
528 | */ |
529 | |
530 | /** |
531 | * Mesh node status enum. |
532 | */ |
533 | typedef enum |
534 | { |
535 | /** |
536 | * The mesh node has an active channel. |
537 | */ |
538 | AS_HB_MESH_NODE_CHANNEL_ACTIVE, |
539 | |
540 | /** |
541 | * The mesh node is waiting for an active channel. |
542 | */ |
543 | AS_HB_MESH_NODE_CHANNEL_PENDING, |
544 | |
545 | /** |
546 | * The mesh node does not have an active channel. |
547 | */ |
548 | AS_HB_MESH_NODE_CHANNEL_INACTIVE, |
549 | |
550 | /** |
551 | * The ip address and port for this node are not yet known. |
552 | */ |
553 | AS_HB_MESH_NODE_ENDPOINT_UNKNOWN, |
554 | |
555 | /** |
556 | * The sentinel value. Should be the last in the enum. |
557 | */ |
558 | AS_HB_MESH_NODE_STATUS_SENTINEL |
559 | } as_hb_mesh_node_status; |
560 | |
561 | /** |
562 | * The info payload for a single node. |
563 | */ |
564 | typedef struct as_hb_mesh_info_reply_s |
565 | { |
566 | /** |
567 | * The nodeid of the node for which info reply is sent. |
568 | */ |
569 | cf_node nodeid; |
570 | |
571 | /** |
572 | * The advertised endpoint list for this node. List to allow variable size |
573 | * endpoint list. Always access as reply.endpoints[0]. |
574 | */ |
575 | as_endpoint_list endpoint_list[]; |
576 | }__attribute__((__packed__)) as_hb_mesh_info_reply; |
577 | |
578 | /** |
579 | * Mesh tend reduce function udata. |
580 | */ |
581 | typedef struct as_hb_mesh_tend_reduce_udata_s |
582 | { |
583 | /** |
584 | * The new endpoint lists to connect to. Each list has endpoints for s |
585 | * single remote peer. |
586 | */ |
587 | as_endpoint_list** to_connect; |
588 | |
589 | /** |
590 | * The capacity of the to connect array. |
591 | */ |
592 | size_t to_connect_capacity; |
593 | |
594 | /** |
595 | * The count of endpoints to connect. |
596 | */ |
597 | size_t to_connect_count; |
598 | |
599 | /** |
600 | * Pointers to seeds that need matching. |
601 | */ |
602 | cf_vector* inactive_seeds_p; |
603 | } as_hb_mesh_tend_reduce_udata; |
604 | |
605 | /** |
606 | * Mesh endpoint search udata. |
607 | */ |
608 | typedef struct |
609 | { |
610 | /** |
611 | * The endpoint to search. |
612 | */ |
613 | cf_sock_addr* to_search; |
614 | |
615 | /** |
616 | * Indicates is a match is found. |
617 | */ |
618 | bool found; |
619 | } as_hb_endpoint_list_addr_find_udata; |
620 | |
621 | /** |
622 | * Mesh endpoint list search udata. |
623 | */ |
624 | typedef struct as_hb_mesh_endpoint_list_reduce_udata_s |
625 | { |
626 | /** |
627 | * The endpoint to search. |
628 | */ |
629 | as_endpoint_list* to_search; |
630 | |
631 | /** |
632 | * Indicates is a match is found. |
633 | */ |
634 | bool found; |
635 | |
636 | /** |
637 | * The matched key if found. |
638 | */ |
639 | cf_node* matched_nodeid; |
640 | } as_hb_mesh_endpoint_list_reduce_udata; |
641 | |
642 | /** |
643 | * Information maintained for configured mesh seed nodes. |
644 | */ |
645 | typedef struct as_hb_mesh_seed_s |
646 | { |
647 | /** |
648 | * The name / ip address of this seed mesh host. |
649 | */ |
650 | char seed_host_name[DNS_NAME_MAX_SIZE]; |
651 | |
652 | /** |
653 | * The port of this seed mesh host. |
654 | */ |
655 | cf_ip_port seed_port; |
656 | |
657 | /** |
658 | * Identifies TLS mesh seed hosts. |
659 | */ |
660 | bool seed_tls; |
661 | |
662 | /** |
663 | * The heap allocated end point list for this seed host resolved usiung the |
664 | * seeds hostname. |
665 | * Will be null if the endpoint list cannot be resolved. |
666 | */ |
667 | as_endpoint_list* resolved_endpoint_list; |
668 | |
669 | /** |
670 | * Timestamp when the seed hostname was resolved into the endpoint list. |
671 | * Used to perform periodic refresh of the endpoint list. |
672 | */ |
673 | cf_clock resolved_endpoint_list_ts; |
674 | |
675 | /** |
676 | * The state of this seed in terms of established channel. |
677 | */ |
678 | as_hb_mesh_node_status status; |
679 | |
680 | /** |
681 | * The last time the state of this node was updated. |
682 | */ |
683 | cf_clock last_status_updated; |
684 | |
685 | /** |
686 | * The node id for a matching mesh node entry. A zero will indicate that |
687 | * there exists no matching mesh node entry. |
688 | */ |
689 | cf_node mesh_nodeid; |
690 | |
691 | /** |
692 | * Timestamp indicating when the matching mesh node's endpoint was updated. |
693 | * Used to detect endpoint changes to the matching mesh node entry if it |
694 | * exists. |
695 | */ |
696 | as_hlc_timestamp mesh_node_endpoint_change_ts; |
697 | } as_hb_mesh_seed; |
698 | |
699 | /** |
700 | * Information maintained for discovered mesh end points. |
701 | */ |
702 | typedef struct as_hb_mesh_node_s |
703 | { |
704 | /** |
705 | * The heap allocated end point list for this mesh host. Should be freed |
706 | * once the last mesh entry is removed from the mesh state. |
707 | */ |
708 | as_endpoint_list* endpoint_list; |
709 | |
710 | /** |
711 | * Timestamp when the mesh node was last updated. |
712 | */ |
713 | as_hlc_timestamp endpoint_change_ts; |
714 | |
715 | /** |
716 | * The state of this node in terms of established channel. |
717 | */ |
718 | as_hb_mesh_node_status status; |
719 | |
720 | /** |
721 | * The last time the state of this node was updated. |
722 | */ |
723 | cf_clock last_status_updated; |
724 | |
725 | /** |
726 | * The time this node's channel become inactive. |
727 | */ |
728 | cf_clock inactive_since; |
729 | } as_hb_mesh_node; |
730 | |
731 | /** |
732 | * State maintained for the mesh mode. |
733 | */ |
734 | typedef struct as_hb_mesh_state_s |
735 | { |
736 | /** |
737 | * The sockets on which this instance accepts heartbeat tcp connections. |
738 | */ |
739 | cf_sockets listening_sockets; |
740 | |
741 | /** |
742 | * Indicates if the published endpoint list is ipv4 only. |
743 | */ |
744 | bool published_endpoint_list_ipv4_only; |
745 | |
746 | /** |
747 | * The published endpoint list. |
748 | */ |
749 | as_endpoint_list* published_endpoint_list; |
750 | |
751 | /** |
752 | * Mesh seed data. |
753 | */ |
754 | cf_vector seeds; |
755 | |
756 | /** |
757 | * A map from an cf_node _key to a mesh node. |
758 | */ |
759 | cf_shash* nodeid_to_mesh_node; |
760 | |
761 | /** |
762 | * Thread id for the mesh tender thread. |
763 | */ |
764 | pthread_t mesh_tender_tid; |
765 | |
766 | /** |
767 | * The status of the mesh module. |
768 | */ |
769 | as_hb_status status; |
770 | |
771 | /** |
772 | * The mtu on the listening device. This is extrapolated to all nodes and |
773 | * paths in the cluster. This limits the cluster size possible. |
774 | */ |
775 | int min_mtu; |
776 | |
777 | /** |
778 | * Indicates if new nodes are discovered. Optimization to start mesh tend |
779 | * earlier than normal tend interval on discovering new nodes. |
780 | */ |
781 | bool nodes_discovered; |
782 | } as_hb_mesh_state; |
783 | |
784 | /* |
785 | * ---------------------------------------------------------------------------- |
786 | * Multicast data structures |
787 | * ---------------------------------------------------------------------------- |
788 | */ |
789 | |
790 | /** |
791 | * State maintained for the multicast mode. |
792 | */ |
793 | typedef struct as_hb_multicast_state_s |
794 | { |
795 | /** |
796 | * The sockets associated with multicast mode. |
797 | */ |
798 | cf_mserv_cfg cfg; |
799 | |
800 | /** |
801 | * Multicast listening sockets. |
802 | */ |
803 | cf_sockets listening_sockets; |
804 | |
805 | /** |
806 | * The mtu on the listening device. This is extrapolated to all nodes and |
807 | * paths in the cluster. This limits the cluster size possible. |
808 | */ |
809 | int min_mtu; |
810 | } as_hb_multicast_state; |
811 | |
812 | /* |
813 | * ---------------------------------------------------------------------------- |
814 | * Channel state |
815 | * ---------------------------------------------------------------------------- |
816 | */ |
817 | |
818 | /** |
819 | * The type of a channel event. |
820 | */ |
821 | typedef enum |
822 | { |
823 | /** |
824 | * The endpoint has a channel tx/rx channel associated with it. |
825 | */ |
826 | AS_HB_CHANNEL_NODE_CONNECTED, |
827 | |
828 | /** |
829 | * The endpoint had a tx/rx channel that went down. |
830 | */ |
831 | AS_HB_CHANNEL_NODE_DISCONNECTED, |
832 | |
833 | /** |
834 | * A message was received on a connected channel. The message in the event, |
835 | * is guaranteed to have passed basic sanity check like have protocol id, |
836 | * type and source nodeid. |
837 | */ |
838 | AS_HB_CHANNEL_MSG_RECEIVED, |
839 | |
840 | /** |
841 | * Channel found node whose cluster name does not match. |
842 | */ |
843 | AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH |
844 | } as_hb_channel_event_type; |
845 | |
846 | /** |
847 | * Status for reads from a channel. |
848 | */ |
849 | typedef enum |
850 | { |
851 | /** |
852 | * The message was read successfully and parser. |
853 | */ |
854 | AS_HB_CHANNEL_MSG_READ_SUCCESS, |
855 | |
856 | /** |
857 | * The message read successfully but parsing failed. |
858 | */ |
859 | AS_HB_CHANNEL_MSG_PARSE_FAIL, |
860 | |
861 | /** |
862 | * The message read failed network io. |
863 | */ |
864 | AS_HB_CHANNEL_MSG_CHANNEL_FAIL, |
865 | |
866 | /** |
867 | * Sentinel default value. |
868 | */ |
869 | AS_HB_CHANNEL_MSG_READ_UNDEF |
870 | } as_hb_channel_msg_read_status; |
871 | |
872 | typedef struct |
873 | { |
874 | /** |
875 | * The endpoint address to search channel by. |
876 | */ |
877 | as_endpoint_list* endpoint_list; |
878 | |
879 | /** |
880 | * Indicates if the endpoint was found. |
881 | */ |
882 | bool found; |
883 | |
884 | /** |
885 | * The matching socket, if found. |
886 | */ |
887 | cf_socket* socket; |
888 | } as_hb_channel_endpoint_reduce_udata; |
889 | |
890 | typedef struct |
891 | { |
892 | /** |
893 | * The endpoint address to search channel by. |
894 | */ |
895 | cf_sock_addr* addr_to_search; |
896 | |
897 | /** |
898 | * Indicates if the endpoint was found. |
899 | */ |
900 | bool found; |
901 | } as_hb_channel_endpoint_iterate_udata; |
902 | |
903 | typedef struct |
904 | { |
905 | /** |
906 | * The message buffer to send. |
907 | */ |
908 | uint8_t* buffer; |
909 | |
910 | /** |
911 | * The buffer length. |
912 | */ |
913 | size_t buffer_len; |
914 | } as_hb_channel_buffer_udata; |
915 | |
916 | /** |
917 | * A channel represents a medium to send and receive messages. |
918 | */ |
919 | typedef struct as_hb_channel_s |
920 | { |
921 | /** |
922 | * Indicates if this channel is a multicast channel. |
923 | */ |
924 | bool is_multicast; |
925 | |
926 | /** |
927 | * Indicates if this channel is inbound. Not relevant for multicast |
928 | * channels. |
929 | */ |
930 | bool is_inbound; |
931 | |
932 | /** |
933 | * The id of the associated node. In mesh / unicast case this will initially |
934 | * be zero and filled in when the nodeid for the node at the other end is |
935 | * learnt. In multicast case this will be zero. |
936 | */ |
937 | cf_node nodeid; |
938 | |
939 | /** |
940 | * The address of the peer. Will always be specified for outbound channels. |
941 | */ |
942 | cf_sock_addr endpoint_addr; |
943 | |
944 | /** |
945 | * The last time a message was received from this node. |
946 | */ |
947 | cf_clock last_received; |
948 | |
949 | /** |
950 | * Time when this channel won a socket resolution. Zero if this channel |
951 | * never won resolution. In compatibility mode with older code its possible |
952 | * we will keep allowing the same socket to win and enter an infinite loop |
953 | * of closing the sockets. |
954 | */ |
955 | cf_clock resolution_win_ts; |
956 | } as_hb_channel; |
957 | |
958 | /** |
959 | * State maintained per heartbeat channel. |
960 | */ |
961 | typedef struct as_hb_channel_state_s |
962 | { |
963 | /** |
964 | * The poll handle. All IO wait across all heartbeat connections happens on |
965 | * this handle. |
966 | */ |
967 | cf_poll poll; |
968 | |
969 | /** |
970 | * Channel status. |
971 | */ |
972 | as_hb_status status; |
973 | |
974 | /** |
975 | * Maps a socket to an as_hb_channel. |
976 | */ |
977 | cf_shash* socket_to_channel; |
978 | |
979 | /** |
980 | * Maps a nodeid to a channel specific node data structure. This association |
981 | * will be made only on receiving the first heartbeat message from the node |
982 | * on a channel. |
983 | */ |
984 | cf_shash* nodeid_to_socket; |
985 | |
986 | /** |
987 | * Sockets accumulated by the channel tender to close at the end of every |
988 | * epoll loop. |
989 | */ |
990 | cf_queue socket_close_queue; |
991 | |
992 | /** |
993 | * The sockets on which heartbeat subsystem listens. |
994 | */ |
995 | cf_sockets* listening_sockets; |
996 | |
997 | /** |
998 | * Clock to keep track of last time idle connections were checked. |
999 | */ |
1000 | cf_clock last_channel_idle_check; |
1001 | |
1002 | /** |
1003 | * Enables / disables publishing channel events. Events should be disabled |
1004 | * only when the state changes are temporary / transient and hence would not |
1005 | * change the overall channel state from an external perspective. |
1006 | */ |
1007 | bool events_enabled; |
1008 | |
1009 | /** |
1010 | * Events are batched and published to reduce cluster transitions. Queue of |
1011 | * unpublished heartbeat events. |
1012 | */ |
1013 | cf_queue events_queue; |
1014 | |
1015 | /** |
1016 | * Thread id for the socket tender thread. |
1017 | */ |
1018 | pthread_t channel_tender_tid; |
1019 | } as_hb_channel_state; |
1020 | |
1021 | /** |
1022 | * Entry queued up for socket close. |
1023 | */ |
1024 | typedef struct as_hb_channel_socket_close_entry_s |
1025 | { |
1026 | /** |
1027 | * The node for which this event was generated. |
1028 | */ |
1029 | cf_socket* socket; |
1030 | /** |
1031 | * Indicates if this close is a remote close. |
1032 | */ |
1033 | bool is_remote; |
1034 | /** |
1035 | * True if close of this entry should generate a disconnect event. |
1036 | */ |
1037 | bool raise_close_event; |
1038 | } as_hb_channel_socket_close_entry; |
1039 | |
1040 | /** |
1041 | * An event generated by the channel sub module. |
1042 | */ |
1043 | typedef struct as_hb_channel_event_s |
1044 | { |
1045 | /** |
1046 | * The channel event type. |
1047 | */ |
1048 | as_hb_channel_event_type type; |
1049 | |
1050 | /** |
1051 | * The node for which this event was generated. |
1052 | */ |
1053 | cf_node nodeid; |
1054 | |
1055 | /** |
1056 | * The received message if any over this endpoint. Valid for incoming |
1057 | * message type event. The message if not NULL never be edited or copied |
1058 | * over. |
1059 | */ |
1060 | msg* msg; |
1061 | |
1062 | /** |
1063 | * The hlc timestamp for message receipt. |
1064 | */ |
1065 | as_hlc_msg_timestamp msg_hlc_ts; |
1066 | } as_hb_channel_event; |
1067 | |
1068 | /* |
1069 | * ---------------------------------------------------------------------------- |
1070 | * Main sub module state |
1071 | * ---------------------------------------------------------------------------- |
1072 | */ |
1073 | |
1074 | /** |
1075 | * Heartbeat message types. |
1076 | */ |
1077 | typedef enum |
1078 | { |
1079 | AS_HB_MSG_TYPE_PULSE, |
1080 | AS_HB_MSG_TYPE_INFO_REQUEST, |
1081 | AS_HB_MSG_TYPE_INFO_REPLY, |
1082 | AS_HB_MSG_TYPE_COMPRESSED |
1083 | } as_hb_msg_type; |
1084 | |
1085 | /** |
1086 | * Events published by the heartbeat subsystem. |
1087 | */ |
1088 | typedef enum |
1089 | { |
1090 | AS_HB_INTERNAL_NODE_ARRIVE, |
1091 | AS_HB_INTERNAL_NODE_DEPART, |
1092 | AS_HB_INTERNAL_NODE_EVICT, |
1093 | AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED |
1094 | } as_hb_internal_event_type; |
1095 | |
1096 | /** |
1097 | * State maintained by the heartbeat subsystem for the selected mode. |
1098 | */ |
1099 | typedef struct as_hb_mode_state_s |
1100 | { |
1101 | /** |
1102 | * The mesh / multicast state. |
1103 | */ |
1104 | union |
1105 | { |
1106 | as_hb_mesh_state mesh_state; |
1107 | as_hb_multicast_state multicast_state; |
1108 | }; |
1109 | } as_hb_mode_state; |
1110 | |
1111 | /** |
1112 | * Plugin data iterate reduce udata. |
1113 | */ |
1114 | typedef struct |
1115 | { |
1116 | /** |
1117 | * The plugin id. |
1118 | */ |
1119 | as_hb_plugin_id pluginid; |
1120 | |
1121 | /** |
1122 | * The iterate function. |
1123 | */ |
1124 | as_hb_plugin_data_iterate_fn iterate_fn; |
1125 | |
1126 | /** |
1127 | * The udata for the iterate function. |
1128 | */ |
1129 | void* udata; |
1130 | } as_hb_adjacecny_iterate_reduce_udata; |
1131 | |
1132 | /** |
1133 | * Information tracked for an adjacent nodes. |
1134 | */ |
1135 | typedef struct as_hb_adjacent_node_s |
1136 | { |
1137 | /** |
1138 | * The heart beat protocol version. |
1139 | */ |
1140 | uint32_t protocol_version; |
1141 | |
1142 | /** |
1143 | * The remote node's |
1144 | */ |
1145 | as_endpoint_list* endpoint_list; |
1146 | |
1147 | /** |
1148 | * Used to cycle between the two copies of plugin data. |
1149 | */ |
1150 | int plugin_data_cycler; |
1151 | |
1152 | /** |
1153 | * Plugin specific data accumulated by the heartbeat subsystem. The data is |
1154 | * heap allocated and should be destroyed the moment this element entry is |
1155 | * unused. There are two copies of the plugin data, one the current copy and |
1156 | * one the previous copy. Previous copy is used to generate data change |
1157 | * notifications. |
1158 | */ |
1159 | as_hb_plugin_node_data plugin_data[AS_HB_PLUGIN_SENTINEL][2]; |
1160 | |
1161 | /** |
1162 | * The monotonic local time node information was last updated. |
1163 | */ |
1164 | cf_clock last_updated_monotonic_ts; |
1165 | |
1166 | /** |
1167 | * HLC timestamp for the last pulse message. |
1168 | */ |
1169 | as_hlc_msg_timestamp last_msg_hlc_ts; |
1170 | |
1171 | /** |
1172 | * Track number of consecutive cluster-name mismatches. |
1173 | */ |
1174 | uint32_t cluster_name_mismatch_count; |
1175 | |
1176 | /** |
1177 | * Moving average of the latency in ms. |
1178 | */ |
1179 | uint64_t avg_latency; |
1180 | |
1181 | /** |
1182 | * A shift register tracking change of endpoints. On receipt of a heartbeat, |
1183 | * if source node's endpoints change 1 is inserted at the LSB, else 0 is |
1184 | * inserted at the LSB. |
1185 | */ |
1186 | uint64_t endpoint_change_tracker; |
1187 | } as_hb_adjacent_node; |
1188 | |
1189 | /** |
1190 | * Internal storage for external event listeners. |
1191 | */ |
1192 | typedef struct as_hb_event_listener_s |
1193 | { |
1194 | /** |
1195 | * Registered callback function. |
1196 | */ |
1197 | as_hb_event_fn event_callback; |
1198 | |
1199 | /** |
1200 | * Arguments for the listeners. |
1201 | */ |
1202 | void* udata; |
1203 | } as_hb_event_listener; |
1204 | |
1205 | /** |
1206 | * Heartbeat subsystem internal state. |
1207 | */ |
1208 | typedef struct as_hb_s |
1209 | { |
1210 | /** |
1211 | * The status of the subsystem. |
1212 | */ |
1213 | as_hb_status status; |
1214 | |
1215 | /** |
1216 | * The adjacency dictionary. The key is the nodeid. The value is an instance |
1217 | * of as_hb_adjacent_node. |
1218 | */ |
1219 | cf_shash* adjacency; |
1220 | |
1221 | /** |
1222 | * The probation dictionary having nodes that display unexpected behavior. |
1223 | * Nodeids under probation and adjacency hash are always exclusive. The key |
1224 | * is the nodeid. The value is an instance of as_hb_adjacent_node. |
1225 | */ |
1226 | cf_shash* on_probation; |
1227 | |
1228 | /** |
1229 | * Temporary nodeid to index hash used to compute nodes to evict from a |
1230 | * clique. |
1231 | */ |
1232 | cf_shash* nodeid_to_index; |
1233 | |
1234 | /** |
1235 | * The mode specific state. |
1236 | */ |
1237 | as_hb_mode_state mode_state; |
1238 | |
1239 | /** |
1240 | * The channel state. |
1241 | */ |
1242 | as_hb_channel_state channel_state; |
1243 | |
1244 | /** |
1245 | * Self node accumulated stats used primarily to detect duplicate node-ids. |
1246 | */ |
1247 | as_hb_adjacent_node self_node; |
1248 | |
1249 | /** |
1250 | * Indicates self node-id has duplicates. |
1251 | */ |
1252 | bool self_is_duplicate; |
1253 | |
1254 | /** |
1255 | * Monotonic timestamp of when a self duplicate was detected. |
1256 | */ |
1257 | cf_clock self_duplicate_detected_ts; |
1258 | |
1259 | /** |
1260 | * The plugin dictionary. The key is the as_hb_plugin entry and the value an |
1261 | * instance of as_hb_plugin. |
1262 | */ |
1263 | as_hb_plugin plugins[AS_HB_PLUGIN_SENTINEL]; |
1264 | |
1265 | /** |
1266 | * Thread id for the transmitter thread. |
1267 | */ |
1268 | pthread_t transmitter_tid; |
1269 | |
1270 | /** |
1271 | * Thread id for the thread expiring nodes from the adjacency list. |
1272 | */ |
1273 | pthread_t adjacency_tender_tid; |
1274 | } as_hb; |
1275 | |
1276 | /** |
1277 | * Registered heartbeat listeners. |
1278 | */ |
1279 | typedef struct as_hb_external_events_s |
1280 | { |
1281 | /** |
1282 | * Events are batched and published. Queue of unpublished heartbeat events. |
1283 | */ |
1284 | cf_queue external_events_queue; |
1285 | |
1286 | /** |
1287 | * Count of event listeners. |
1288 | */ |
1289 | int event_listener_count; |
1290 | |
1291 | /** |
1292 | * External event listeners. |
1293 | */ |
1294 | as_hb_event_listener event_listeners[AS_HB_EVENT_LISTENER_MAX]; |
1295 | } as_hb_external_events; |
1296 | |
1297 | /** |
1298 | * Shash reduce function to read current adjacency list. |
1299 | */ |
1300 | typedef struct as_hb_adjacency_reduce_udata_s |
1301 | { |
1302 | /** |
1303 | * The target adjacency list. |
1304 | */ |
1305 | cf_node* adj_list; |
1306 | |
1307 | /** |
1308 | * Count of elements in the adjacency list. |
1309 | */ |
1310 | int adj_count; |
1311 | } as_hb_adjacency_reduce_udata; |
1312 | |
1313 | /** |
1314 | * Udata for finding nodes in the adjacency list not in the input succession |
1315 | * list. |
1316 | */ |
1317 | typedef struct |
1318 | { |
1319 | /** |
1320 | * Number of events generated. |
1321 | */ |
1322 | int event_count; |
1323 | |
1324 | /** |
1325 | * List of generated events. |
1326 | */ |
1327 | as_hb_event_node* events; |
1328 | |
1329 | /** |
1330 | * Limit on number of generated events. |
1331 | */ |
1332 | int max_events; |
1333 | |
1334 | /** |
1335 | * Current succession list. |
1336 | */ |
1337 | cf_node* succession; |
1338 | |
1339 | /** |
1340 | * Number of nodes in succession list. |
1341 | */ |
1342 | int succession_size; |
1343 | } as_hb_find_new_nodes_reduce_udata; |
1344 | |
1345 | /** |
1346 | * Shash reduce function to read current adjacency list. |
1347 | */ |
1348 | typedef struct as_hb_adjacency_tender_udata_s |
1349 | { |
1350 | /** |
1351 | * The list of expired nodes. |
1352 | */ |
1353 | cf_node* dead_nodes; |
1354 | |
1355 | /** |
1356 | * Count of elements in the dead node list. |
1357 | */ |
1358 | int dead_node_count; |
1359 | |
1360 | /** |
1361 | * The list of evicted nodes , e.g. due to cluster name mismatch. |
1362 | */ |
1363 | cf_node* evicted_nodes; |
1364 | |
1365 | /** |
1366 | * Count of elements in the evicted node list. |
1367 | */ |
1368 | int evicted_node_count; |
1369 | } as_hb_adjacency_tender_udata; |
1370 | |
1371 | /** |
1372 | * Udata for tip clear. |
1373 | */ |
1374 | typedef struct as_hb_mesh_tip_clear_udata_s |
1375 | { |
1376 | /** |
1377 | * Host IP or DNS name to be cleared from seed list. |
1378 | */ |
1379 | char host[DNS_NAME_MAX_SIZE]; |
1380 | |
1381 | /** |
1382 | * Listening port of the host. |
1383 | */ |
1384 | int port; |
1385 | |
1386 | /** |
1387 | * Number of IP addresses to match. |
1388 | */ |
1389 | uint32_t n_addrs; |
1390 | |
1391 | /** |
1392 | * IP addresses to match. |
1393 | */ |
1394 | cf_ip_addr* addrs; |
1395 | |
1396 | /** |
1397 | * Node id if a specific node-id needs to be removed as well. |
1398 | */ |
1399 | cf_node nodeid; |
1400 | |
1401 | /** |
1402 | * Tip-clear status |
1403 | */ |
1404 | bool entry_deleted; |
1405 | } as_hb_mesh_tip_clear_udata; |
1406 | |
1407 | /** |
1408 | * Convert endpoint list to string in a process function. |
1409 | */ |
1410 | typedef struct endpoint_list_to_string_udata_s |
1411 | { |
1412 | /** |
1413 | * The endpoint list in string format. |
1414 | */ |
1415 | char* endpoint_list_str; |
1416 | |
1417 | /** |
1418 | * The size of enpoint list. |
1419 | */ |
1420 | size_t endpoint_list_str_capacity; |
1421 | } endpoint_list_to_string_udata; |
1422 | |
1423 | /** |
1424 | * Udata to fill an endpoint list into a message. |
1425 | */ |
1426 | typedef struct endpoint_list_to_msg_udata_s |
1427 | { |
1428 | /** |
1429 | * The target message. |
1430 | */ |
1431 | msg* msg; |
1432 | |
1433 | /** |
1434 | * Indicates if we are running in mesh mode. |
1435 | */ |
1436 | bool is_mesh; |
1437 | } endpoint_list_to_msg_udata; |
1438 | |
1439 | /** |
1440 | * Udata to test if this endpoint list overlaps with other endpoint list. |
1441 | */ |
1442 | typedef struct endpoint_list_equal_check_udata_s |
1443 | { |
1444 | /** |
1445 | * The endpoint list of the new node. |
1446 | */ |
1447 | as_endpoint_list* other; |
1448 | |
1449 | /** |
1450 | * Output. Indicates if the lists are equal. |
1451 | */ |
1452 | bool are_equal; |
1453 | } endpoint_list_equal_check_udata; |
1454 | |
1455 | /** |
1456 | * Endpoint list process function. |
1457 | * @param endpoint current endpoint in the iteration. |
1458 | * @param udata udata passed through from the invoker of the iterate function. |
1459 | */ |
1460 | typedef void |
1461 | (*endpoint_list_process_fn)(const as_endpoint_list* endpoint_list, void* udata); |
1462 | |
1463 | /** |
1464 | * Seed host list reduce udata. |
1465 | */ |
1466 | typedef struct as_hb_seed_host_list_udata_s |
1467 | { |
1468 | /** |
1469 | * The buffer to receive the list. |
1470 | */ |
1471 | cf_dyn_buf* db; |
1472 | |
1473 | /** |
1474 | * Selects TLS seed nodes. |
1475 | */ |
1476 | bool tls; |
1477 | } as_hb_seed_host_list_udata; |
1478 | |
1479 | /* |
1480 | * ---------------------------------------------------------------------------- |
1481 | * Globals |
1482 | * ---------------------------------------------------------------------------- |
1483 | */ |
1484 | |
1485 | /** |
1486 | * Global heartbeat instance. |
1487 | */ |
1488 | static as_hb g_hb; |
1489 | |
1490 | /** |
1491 | * Global heartbeat events listener instance. |
1492 | */ |
1493 | static as_hb_external_events g_hb_event_listeners; |
1494 | |
1495 | /** |
1496 | * The big fat lock for all external event publishing. This ensures that a batch |
1497 | * of external events are published atomically to preserve the order of external |
1498 | * events. |
1499 | */ |
1500 | static pthread_mutex_t g_external_event_publish_lock = |
1501 | PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1502 | |
1503 | /** |
1504 | * Global lock to serialize all read and writes to the heartbeat subsystem. |
1505 | */ |
1506 | static pthread_mutex_t g_hb_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1507 | |
1508 | /** |
1509 | * The big fat lock for all channel state. |
1510 | */ |
1511 | static pthread_mutex_t g_channel_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1512 | |
1513 | /** |
1514 | * The big fat lock for all mesh state. |
1515 | */ |
1516 | static pthread_mutex_t g_mesh_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1517 | |
1518 | /** |
1519 | * The big fat lock for all multicast state. |
1520 | */ |
1521 | static pthread_mutex_t g_multicast_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1522 | |
1523 | /** |
1524 | * The global lock for all heartbeat configuration. |
1525 | */ |
1526 | static pthread_mutex_t g_hb_config_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1527 | |
1528 | /** |
1529 | * The lock used while setting heartbeat protocol. |
1530 | */ |
1531 | static pthread_mutex_t g_set_protocol_lock = |
1532 | PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1533 | |
1534 | /** |
1535 | * Message templates for heartbeat messages. |
1536 | */ |
1537 | static msg_template g_hb_msg_template[] = { |
1538 | |
1539 | { AS_HB_MSG_ID, M_FT_UINT32 }, |
1540 | |
1541 | { AS_HB_MSG_TYPE, M_FT_UINT32 }, |
1542 | |
1543 | { AS_HB_MSG_NODE, M_FT_UINT64 }, |
1544 | |
1545 | { AS_HB_MSG_CLUSTER_NAME, M_FT_STR }, |
1546 | |
1547 | { AS_HB_MSG_HLC_TIMESTAMP, M_FT_UINT64 }, |
1548 | |
1549 | { AS_HB_MSG_ENDPOINTS, M_FT_BUF }, |
1550 | |
1551 | { AS_HB_MSG_COMPRESSED_PAYLOAD, M_FT_BUF }, |
1552 | |
1553 | { AS_HB_MSG_INFO_REQUEST, M_FT_BUF }, |
1554 | |
1555 | { AS_HB_MSG_INFO_REPLY, M_FT_BUF }, |
1556 | |
1557 | { AS_HB_MSG_FABRIC_DATA, M_FT_BUF }, |
1558 | |
1559 | { AS_HB_MSG_HB_DATA, M_FT_BUF }, |
1560 | |
1561 | { AS_HB_MSG_PAXOS_DATA, M_FT_BUF }, |
1562 | |
1563 | { AS_HB_MSG_SKEW_MONITOR_DATA, M_FT_UINT64 } }; |
1564 | |
1565 | /* |
1566 | * ---------------------------------------------------------------------------- |
1567 | * Private internal function forward declarations. |
1568 | * ---------------------------------------------------------------------------- |
1569 | */ |
1570 | |
1571 | static void info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list); |
1572 | static uint32_t round_up_pow2(uint32_t v); |
1573 | static int vector_find(cf_vector* vector, const void* element); |
1574 | |
1575 | static void endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src); |
1576 | static void endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, void* udata); |
1577 | static void endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata); |
1578 | |
1579 | static int msg_compression_threshold(int mtu); |
1580 | static int msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list); |
1581 | static int msg_id_get(msg* msg, uint32_t* id); |
1582 | static int msg_nodeid_get(msg* msg, cf_node* nodeid); |
1583 | static int msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts); |
1584 | static int msg_type_get(msg* msg, as_hb_msg_type* type); |
1585 | static int msg_cluster_name_get(msg* msg, char** cluster_name); |
1586 | static int msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, size_t* adj_length); |
1587 | static int msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length); |
1588 | static void msg_node_list_set(msg* msg, int field_id, cf_node* node_list, size_t node_length); |
1589 | static void msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length); |
1590 | static int msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count); |
1591 | static void msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, void* udata); |
1592 | static void msg_src_fields_fill(msg* msg); |
1593 | static void msg_type_set(msg* msg, as_hb_msg_type msg_type); |
1594 | |
1595 | static int config_mcsize(); |
1596 | static const cf_serv_cfg* config_bind_cfg_get(); |
1597 | static const cf_mserv_cfg* config_multicast_group_cfg_get(); |
1598 | static uint32_t config_tx_interval_get(); |
1599 | static void config_tx_interval_set(uint32_t new_interval); |
1600 | static uint32_t config_override_mtu_get(); |
1601 | static void config_override_mtu_set(uint32_t mtu); |
1602 | static uint32_t config_max_intervals_missed_get(); |
1603 | static void config_max_intervals_missed_set(uint32_t new_max); |
1604 | static unsigned char config_multicast_ttl_get(); |
1605 | static as_hb_protocol config_protocol_get(); |
1606 | static void config_protocol_set(as_hb_protocol new_protocol); |
1607 | static cf_node config_self_nodeid_get(); |
1608 | static as_hb_mode config_mode_get(); |
1609 | static void config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, cf_serv_cfg* published_cfg, bool ipv4_only); |
1610 | static bool config_binding_is_valid(char** error, as_hb_protocol protocol); |
1611 | |
1612 | static void channel_init_channel(as_hb_channel* channel); |
1613 | static void channel_event_init(as_hb_channel_event* event); |
1614 | static bool channel_is_running(); |
1615 | static bool channel_is_stopped(); |
1616 | static uint32_t channel_win_grace_ms(); |
1617 | static void channel_events_enabled_set(bool enabled); |
1618 | static bool channel_are_events_enabled(); |
1619 | static void channel_event_queue(as_hb_channel_event* event); |
1620 | static void channel_event_publish_pending(); |
1621 | static int channel_get_channel(cf_socket* socket, as_hb_channel* result); |
1622 | static void channel_socket_shutdown(cf_socket* socket); |
1623 | static int channel_socket_get(cf_node nodeid, cf_socket** socket); |
1624 | static bool channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find); |
1625 | static void channel_socket_destroy(cf_socket* sock); |
1626 | static void channel_socket_close(cf_socket* socket, bool remote_close, bool raise_close_event); |
1627 | static void channel_sockets_close(cf_vector* sockets); |
1628 | static void channel_socket_close_queue(cf_socket* socket, bool is_remote_close, bool raise_close_event); |
1629 | static void channel_socket_close_pending(); |
1630 | static void channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, cf_sock_addr* endpoint_addr); |
1631 | static void channel_accept_connection(cf_socket* lsock); |
1632 | static as_hb_channel_msg_read_status channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len); |
1633 | static void channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata); |
1634 | static int channel_endpoint_search_reduce(const void* key, void* data, void* udata); |
1635 | static bool channel_endpoint_is_connected(as_endpoint_list* endpoint_list); |
1636 | static as_hb_channel_msg_read_status channel_multicast_msg_read(cf_socket* socket, msg* msg); |
1637 | static as_hb_channel_msg_read_status channel_mesh_msg_read(cf_socket* socket, msg* msg); |
1638 | static void channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid); |
1639 | static bool channel_socket_should_live(cf_socket* socket, as_hb_channel* channel); |
1640 | static cf_socket* channel_socket_resolve(cf_socket* socket1, cf_socket* socket2); |
1641 | static int channel_msg_sanity_check(as_hb_channel_event* msg_event); |
1642 | static int channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event); |
1643 | static void channel_msg_read(cf_socket* socket); |
1644 | static void channel_channels_idle_check(); |
1645 | void* channel_tender(void* arg); |
1646 | static bool channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata); |
1647 | static void channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, int endpoint_list_count); |
1648 | static int channel_node_disconnect(cf_node nodeid); |
1649 | static void channel_mesh_listening_socks_register(cf_sockets* listening_sockets); |
1650 | static void channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets); |
1651 | static void channel_multicast_listening_socks_register(cf_sockets* listening_sockets); |
1652 | static void channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets); |
1653 | static void channel_init(); |
1654 | static void channel_start(); |
1655 | static int channel_sockets_get_reduce(const void* key, void* data, void* udata); |
1656 | static void channel_stop(); |
1657 | static int channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length); |
1658 | static int channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length); |
1659 | static bool channel_msg_is_compression_required(msg* msg, int wire_size, int mtu); |
1660 | static int channel_msg_buffer_size_get(int wire_size, int mtu); |
1661 | static size_t channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, uint8_t* buffer, size_t buffer_len); |
1662 | static int channel_msg_unicast(cf_node dest, msg* msg); |
1663 | static int channel_msg_broadcast_reduce(const void* key, void* data, void* udata); |
1664 | static int channel_msg_broadcast(msg* msg); |
1665 | static void channel_clear(); |
1666 | static int channel_dump_reduce(const void* key, void* data, void* udata); |
1667 | static void channel_dump(bool verbose); |
1668 | |
1669 | static bool mesh_is_running(); |
1670 | static bool mesh_is_stopped(); |
1671 | static void mesh_published_endpoints_process(endpoint_list_process_fn process_fn, void* udata); |
1672 | static const char* mesh_node_status_string(as_hb_mesh_node_status status); |
1673 | static int mesh_seed_delete_unsafe(int seed_index); |
1674 | static int mesh_seed_find_unsafe(char* host, int port); |
1675 | static void mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, int mesh_node_count); |
1676 | static void mesh_node_status_change(as_hb_mesh_node* mesh_node, as_hb_mesh_node_status new_status); |
1677 | static void mesh_listening_sockets_close(); |
1678 | static void mesh_seed_host_list_get(cf_dyn_buf* db, bool tls); |
1679 | static void mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p); |
1680 | static void mesh_stop(); |
1681 | static int mesh_tend_reduce(const void* key, void* data, void* udata); |
1682 | void* mesh_tender(void* arg); |
1683 | static void mesh_node_destroy(as_hb_mesh_node* mesh_node); |
1684 | static void mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata); |
1685 | static bool mesh_node_is_discovered(cf_node nodeid); |
1686 | static bool mesh_node_endpoint_list_is_valid(cf_node nodeid); |
1687 | static int mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node); |
1688 | static void mesh_channel_on_node_disconnect(as_hb_channel_event* event); |
1689 | static bool mesh_node_check_fix_self_msg(as_hb_channel_event* event); |
1690 | static void mesh_node_data_update(as_hb_channel_event* event); |
1691 | static int mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, size_t* reply_size); |
1692 | static void mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, size_t reply_count); |
1693 | static msg* mesh_info_msg_init(as_hb_msg_type msg_type); |
1694 | static void mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, size_t to_discover_count); |
1695 | static void mesh_channel_on_pulse(msg* msg); |
1696 | static void mesh_channel_on_info_request(msg* msg); |
1697 | static void mesh_channel_on_info_reply(msg* msg); |
1698 | static int mesh_tip(char* host, int port, bool tls); |
1699 | static void mesh_channel_event_process(as_hb_channel_event* event); |
1700 | static void mesh_init(); |
1701 | static int mesh_free_node_data_reduce(const void* key, void* data, void* udata); |
1702 | static int mesh_tip_clear_reduce(const void* key, void* data, void* udata); |
1703 | static int mesh_peer_endpoint_reduce(const void* key, void* data, void* udata); |
1704 | static void mesh_clear(); |
1705 | static void mesh_listening_sockets_open(); |
1706 | static void mesh_start(); |
1707 | static int mesh_dump_reduce(const void* key, void* data, void* udata); |
1708 | static void mesh_dump(bool verbose); |
1709 | |
1710 | static void multicast_init(); |
1711 | static void multicast_clear(); |
1712 | static void multicast_listening_sockets_open(); |
1713 | static void multicast_start(); |
1714 | static void multicast_listening_sockets_close(); |
1715 | static void multicast_stop(); |
1716 | static void multicast_dump(bool verbose); |
1717 | static int multicast_supported_cluster_size_get(); |
1718 | |
1719 | static bool hb_is_initialized(); |
1720 | static bool hb_is_running(); |
1721 | static bool hb_is_stopped(); |
1722 | static void hb_mode_init(); |
1723 | static void hb_mode_start(); |
1724 | static int hb_mtu(); |
1725 | static void hb_msg_init(); |
1726 | static uint32_t hb_protocol_identifier_get(); |
1727 | static cf_clock hb_node_depart_time(cf_clock detect_time); |
1728 | static bool hb_is_mesh(); |
1729 | static void hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, int node_count); |
1730 | static void hb_event_publish_pending(); |
1731 | static int hb_adjacency_free_data_reduce(const void* key, void* data, void* udata); |
1732 | static void hb_clear(); |
1733 | static int hb_adjacency_iterate_reduce(const void* key, void* data, void* udata); |
1734 | static void hb_plugin_set_fn(msg* msg); |
1735 | static void hb_plugin_parse_data_fn(msg* msg, cf_node source, as_hb_plugin_node_data* prev_plugin_data, as_hb_plugin_node_data* plugin_data); |
1736 | static msg* hb_msg_get(); |
1737 | static void hb_msg_return(msg* msg); |
1738 | static void hb_plugin_msg_fill(msg* msg); |
1739 | static void hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, as_hb_plugin* plugins, bool plugin_data_changed[]); |
1740 | static void hb_plugin_init(); |
1741 | void* hb_transmitter(void* arg); |
1742 | static int hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node); |
1743 | static void hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size); |
1744 | static void hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, cf_node** adjacency_list, size_t* adjacency_length); |
1745 | static bool hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node); |
1746 | static bool hb_self_is_duplicate(); |
1747 | static void hb_self_duplicate_update(); |
1748 | static void hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node); |
1749 | static int hb_adjacency_tend_reduce(const void* key, void* data, void* udata); |
1750 | void* hb_adjacency_tender(void* arg); |
1751 | static void hb_tx_start(); |
1752 | static void hb_tx_stop(); |
1753 | static void hb_adjacency_tender_start(); |
1754 | static void hb_adjacency_tender_stop(); |
1755 | static void hb_init(); |
1756 | static void hb_start(); |
1757 | static void hb_stop(); |
1758 | static void hb_plugin_register(as_hb_plugin* plugin); |
1759 | static bool hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp send_ts); |
1760 | static void hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed); |
1761 | static bool hb_endpoint_change_tracker_is_normal(uint64_t tracker); |
1762 | static bool hb_endpoint_change_tracker_has_changed(uint64_t tracker); |
1763 | static int hb_adjacent_node_update(as_hb_channel_event* msg_event, as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]); |
1764 | static bool hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node); |
1765 | static void hb_channel_on_self_pulse(as_hb_channel_event* msg_event); |
1766 | static void hb_channel_on_pulse(as_hb_channel_event* msg_event); |
1767 | static void hb_channel_on_msg_rcvd(as_hb_channel_event* event); |
1768 | static void hb_handle_cluster_name_mismatch(as_hb_channel_event* event); |
1769 | static void hb_channel_event_process(as_hb_channel_event* event); |
1770 | static void hb_mode_dump(bool verbose); |
1771 | static int hb_dump_reduce(const void* key, void* data, void* udata); |
1772 | static void hb_dump(bool verbose); |
1773 | static void hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph); |
1774 | static void hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict); |
1775 | static int hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata); |
1776 | static void hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, |
1777 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata); |
1778 | |
1779 | /* |
1780 | * ---------------------------------------------------------------------------- |
1781 | * Public functions. |
1782 | * ---------------------------------------------------------------------------- |
1783 | */ |
1784 | /** |
1785 | * Initialize the heartbeat subsystem. |
1786 | */ |
1787 | void |
1788 | as_hb_init() |
1789 | { |
1790 | // Initialize hb subsystem. |
1791 | hb_init(); |
1792 | |
1793 | // Add the mesh seed nodes. |
1794 | // Using one time seed config outside the config module. |
1795 | if (hb_is_mesh()) { |
1796 | for (int i = 0; i < AS_CLUSTER_SZ; i++) { |
1797 | if (g_config.hb_config.mesh_seed_addrs[i]) { |
1798 | mesh_tip(g_config.hb_config.mesh_seed_addrs[i], |
1799 | g_config.hb_config.mesh_seed_ports[i], |
1800 | g_config.hb_config.mesh_seed_tls[i]); |
1801 | } |
1802 | else { |
1803 | break; |
1804 | } |
1805 | } |
1806 | } |
1807 | } |
1808 | |
1809 | /** |
1810 | * Start the heartbeat subsystem. |
1811 | */ |
1812 | void |
1813 | as_hb_start() |
1814 | { |
1815 | hb_start(); |
1816 | } |
1817 | |
1818 | /** |
1819 | * Shut down the heartbeat subsystem. |
1820 | */ |
1821 | void |
1822 | as_hb_shutdown() |
1823 | { |
1824 | hb_stop(); |
1825 | } |
1826 | |
1827 | /** |
1828 | * Indicates if self node is a duplicate |
1829 | */ |
1830 | bool |
1831 | as_hb_self_is_duplicate() |
1832 | { |
1833 | return hb_self_is_duplicate(); |
1834 | } |
1835 | |
1836 | /** |
1837 | * Free the data structures of heart beat. |
1838 | */ |
1839 | void |
1840 | as_hb_destroy() |
1841 | { |
1842 | // Destroy the main module. |
1843 | hb_clear(); |
1844 | } |
1845 | |
1846 | /** |
1847 | * Return a string representation of a heartbeat protocol type. |
1848 | * |
1849 | * @param protocol for which the string is computed |
1850 | * @param protocol_s string representation of protocol |
1851 | */ |
1852 | void |
1853 | as_hb_protocol_get_s(as_hb_protocol protocol, char* protocol_s) |
1854 | { |
1855 | char *str; |
1856 | switch (protocol) { |
1857 | case AS_HB_PROTOCOL_V3: |
1858 | str = "v3" ; |
1859 | break; |
1860 | case AS_HB_PROTOCOL_NONE: |
1861 | str = "none" ; |
1862 | break; |
1863 | case AS_HB_PROTOCOL_RESET: |
1864 | str = "reset" ; |
1865 | break; |
1866 | default: |
1867 | str = "undefined" ; |
1868 | } |
1869 | |
1870 | sprintf(protocol_s, "%s" , str); |
1871 | } |
1872 | |
1873 | /** |
1874 | * Set heartbeat protocol version. |
1875 | */ |
1876 | as_hb_protocol |
1877 | as_hb_protocol_get() |
1878 | { |
1879 | return config_protocol_get(); |
1880 | } |
1881 | |
1882 | /** |
1883 | * Set heartbeat protocol version. |
1884 | */ |
1885 | int |
1886 | as_hb_protocol_set(as_hb_protocol new_protocol) |
1887 | { |
1888 | SET_PROTOCOL_LOCK(); |
1889 | int rv = 0; |
1890 | if (config_protocol_get() == new_protocol) { |
1891 | INFO("no heartbeat protocol change needed" ); |
1892 | rv = 0; |
1893 | goto Exit; |
1894 | } |
1895 | char old_protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
1896 | char new_protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
1897 | as_hb_protocol_get_s(config_protocol_get(), old_protocol_s); |
1898 | as_hb_protocol_get_s(new_protocol, new_protocol_s); |
1899 | switch (new_protocol) { |
1900 | case AS_HB_PROTOCOL_V3: |
1901 | if (hb_is_running()) { |
1902 | INFO("disabling current heartbeat protocol %s" , old_protocol_s); |
1903 | hb_stop(); |
1904 | } |
1905 | INFO("setting heartbeat protocol version number to %s" , new_protocol_s); |
1906 | config_protocol_set(new_protocol); |
1907 | hb_start(); |
1908 | INFO("heartbeat protocol version set to %s" , new_protocol_s); |
1909 | break; |
1910 | |
1911 | case AS_HB_PROTOCOL_NONE: |
1912 | INFO("setting heartbeat protocol version to none" ); |
1913 | hb_stop(); |
1914 | config_protocol_set(new_protocol); |
1915 | INFO("heartbeat protocol set to none" ); |
1916 | break; |
1917 | |
1918 | case AS_HB_PROTOCOL_RESET: |
1919 | if (config_protocol_get() == AS_HB_PROTOCOL_NONE) { |
1920 | INFO("heartbeat messaging disabled ~~ not resetting" ); |
1921 | rv = -1; |
1922 | goto Exit; |
1923 | } |
1924 | |
1925 | // NB: "protocol" is never actually set to "RESET" ~~ |
1926 | // it is simply a trigger for the reset action. |
1927 | INFO("resetting heartbeat messaging" ); |
1928 | |
1929 | hb_stop(); |
1930 | |
1931 | hb_clear(); |
1932 | |
1933 | hb_start(); |
1934 | |
1935 | break; |
1936 | |
1937 | default: |
1938 | WARNING("unknown heartbeat protocol version number: %d" , new_protocol); |
1939 | rv = -1; |
1940 | goto Exit; |
1941 | } |
1942 | |
1943 | Exit: |
1944 | SET_PROTOCOL_UNLOCK(); |
1945 | return rv; |
1946 | } |
1947 | |
1948 | /** |
1949 | * Register a heartbeat plugin. |
1950 | */ |
1951 | void |
1952 | as_hb_plugin_register(as_hb_plugin* plugin) |
1953 | { |
1954 | if (!hb_is_initialized()) { |
1955 | WARNING( |
1956 | "main heartbeat module uninitialized - not registering the plugin" ); |
1957 | return; |
1958 | } |
1959 | hb_plugin_register(plugin); |
1960 | } |
1961 | |
1962 | /** |
1963 | * Register a heartbeat node event listener. |
1964 | */ |
1965 | void |
1966 | as_hb_register_listener(as_hb_event_fn event_callback, void* udata) |
1967 | { |
1968 | if (!hb_is_initialized()) { |
1969 | WARNING( |
1970 | "main heartbeat module uninitialized - not registering the listener" ); |
1971 | return; |
1972 | } |
1973 | |
1974 | HB_LOCK(); |
1975 | |
1976 | if (g_hb_event_listeners.event_listener_count >= |
1977 | AS_HB_EVENT_LISTENER_MAX) { |
1978 | CRASH("cannot register more than %d event listeners" , |
1979 | AS_HB_EVENT_LISTENER_MAX); |
1980 | } |
1981 | |
1982 | g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].event_callback = |
1983 | event_callback; |
1984 | g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].udata = |
1985 | udata; |
1986 | g_hb_event_listeners.event_listener_count++; |
1987 | |
1988 | HB_UNLOCK(); |
1989 | } |
1990 | |
1991 | /** |
1992 | * Validate heartbeat config. |
1993 | */ |
1994 | void |
1995 | as_hb_config_validate() |
1996 | { |
1997 | char *error; |
1998 | // Validate clustering and heartbeat version compatibility. |
1999 | as_hb_protocol hb_protocol = config_protocol_get(); |
2000 | |
2001 | if (hb_protocol != AS_HB_PROTOCOL_V3 |
2002 | && hb_protocol != AS_HB_PROTOCOL_NONE) { |
2003 | CRASH_NOSTACK("clustering protocol v5 requires hearbeat version v3" ); |
2004 | } |
2005 | |
2006 | if (!config_binding_is_valid(&error, hb_protocol)) { |
2007 | CRASH_NOSTACK("%s" , error); |
2008 | } |
2009 | } |
2010 | |
2011 | /** |
2012 | * Override the computed MTU for the network interface used by heartbeat. |
2013 | */ |
2014 | void |
2015 | as_hb_override_mtu_set(int mtu) |
2016 | { |
2017 | config_override_mtu_set(mtu); |
2018 | } |
2019 | |
2020 | /** |
2021 | * Get the heartbeat pulse transmit interval. |
2022 | */ |
2023 | uint32_t |
2024 | as_hb_tx_interval_get() |
2025 | { |
2026 | return config_tx_interval_get(); |
2027 | } |
2028 | |
2029 | /** |
2030 | * Set the heartbeat pulse transmit interval. |
2031 | */ |
2032 | int |
2033 | as_hb_tx_interval_set(uint32_t new_interval) |
2034 | { |
2035 | if (new_interval < AS_HB_TX_INTERVAL_MS_MIN |
2036 | || new_interval > AS_HB_TX_INTERVAL_MS_MAX) { |
2037 | WARNING("heartbeat interval must be >= %u and <= %u - ignoring %u" , |
2038 | AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX, |
2039 | new_interval); |
2040 | return (-1); |
2041 | } |
2042 | config_tx_interval_set(new_interval); |
2043 | return (0); |
2044 | } |
2045 | |
2046 | /** |
2047 | * Get the maximum number of missed heartbeat intervals after which a node is |
2048 | * considered expired. |
2049 | */ |
2050 | uint32_t |
2051 | as_hb_max_intervals_missed_get() |
2052 | { |
2053 | return config_max_intervals_missed_get(); |
2054 | } |
2055 | |
2056 | /** |
2057 | * Set the maximum number of missed heartbeat intervals after which a node is |
2058 | * considered expired. |
2059 | */ |
2060 | int |
2061 | as_hb_max_intervals_missed_set(uint32_t new_max) |
2062 | { |
2063 | if (new_max < AS_HB_MAX_INTERVALS_MISSED_MIN) { |
2064 | WARNING("heartbeat timeout must be >= %u - ignoring %u" , |
2065 | AS_HB_MAX_INTERVALS_MISSED_MIN, new_max); |
2066 | return (-1); |
2067 | } |
2068 | config_max_intervals_missed_set(new_max); |
2069 | return (0); |
2070 | } |
2071 | |
2072 | /** |
2073 | * Get the timeout interval to consider a node dead / expired in milliseconds if |
2074 | * no heartbeat pulse messages are received. |
2075 | */ |
2076 | uint32_t |
2077 | as_hb_node_timeout_get() |
2078 | { |
2079 | return HB_NODE_TIMEOUT(); |
2080 | } |
2081 | |
2082 | /** |
2083 | * Populate the buffer with heartbeat configuration. |
2084 | */ |
2085 | void |
2086 | as_hb_info_config_get(cf_dyn_buf* db) |
2087 | { |
2088 | if (hb_is_mesh()) { |
2089 | info_append_string(db, "heartbeat.mode" , "mesh" ); |
2090 | info_append_addrs(db, "heartbeat.address" , &g_config.hb_serv_spec.bind); |
2091 | info_append_uint32(db, "heartbeat.port" , |
2092 | (uint32_t)g_config.hb_serv_spec.bind_port); |
2093 | info_append_addrs(db, "heartbeat.tls-address" , |
2094 | &g_config.hb_tls_serv_spec.bind); |
2095 | info_append_uint32(db, "heartbeat.tls-port" , |
2096 | g_config.hb_tls_serv_spec.bind_port); |
2097 | info_append_string_safe(db, "heartbeat.tls-name" , |
2098 | g_config.hb_tls_serv_spec.tls_our_name); |
2099 | mesh_seed_host_list_get(db, true); |
2100 | } |
2101 | else { |
2102 | info_append_string(db, "heartbeat.mode" , "multicast" ); |
2103 | info_append_addrs(db, "heartbeat.address" , &g_config.hb_serv_spec.bind); |
2104 | info_append_addrs(db, "heartbeat.multicast-group" , |
2105 | &g_config.hb_multicast_groups); |
2106 | info_append_uint32(db, "heartbeat.port" , |
2107 | (uint32_t)g_config.hb_serv_spec.bind_port); |
2108 | } |
2109 | |
2110 | info_append_uint32(db, "heartbeat.interval" , config_tx_interval_get()); |
2111 | info_append_uint32(db, "heartbeat.timeout" , |
2112 | config_max_intervals_missed_get()); |
2113 | |
2114 | info_append_int(db, "heartbeat.mtu" , hb_mtu()); |
2115 | |
2116 | char protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
2117 | as_hb_protocol_get_s(config_protocol_get(), protocol_s); |
2118 | |
2119 | info_append_string(db, "heartbeat.protocol" , protocol_s); |
2120 | } |
2121 | |
2122 | /** |
2123 | * Populate heartbeat endpoints. |
2124 | */ |
2125 | void |
2126 | as_hb_info_endpoints_get(cf_dyn_buf* db) |
2127 | { |
2128 | const cf_serv_cfg *cfg = config_bind_cfg_get(); |
2129 | |
2130 | if (cfg->n_cfgs == 0) { |
2131 | // Will never happen in practice. |
2132 | return; |
2133 | } |
2134 | |
2135 | info_append_int(db, "heartbeat.port" , g_config.hb_serv_spec.bind_port); |
2136 | |
2137 | char *string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT); |
2138 | info_append_string(db, "heartbeat.addresses" , string); |
2139 | cf_free(string); |
2140 | |
2141 | info_append_int(db, "heartbeat.tls-port" , |
2142 | g_config.hb_tls_serv_spec.bind_port); |
2143 | |
2144 | string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT_TLS); |
2145 | info_append_string(db, "heartbeat.tls-addresses" , string); |
2146 | cf_free(string); |
2147 | |
2148 | if (hb_is_mesh()) { |
2149 | MESH_LOCK(); |
2150 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
2151 | mesh_peer_endpoint_reduce, db); |
2152 | MESH_UNLOCK(); |
2153 | } |
2154 | else { |
2155 | // Output multicast groups. |
2156 | const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get(); |
2157 | if (multicast_cfg->n_cfgs == 0) { |
2158 | return; |
2159 | } |
2160 | |
2161 | cf_dyn_buf_append_string(db, "heartbeat.multicast-groups=" ); |
2162 | uint32_t count = 0; |
2163 | for (uint32_t i = 0; i < multicast_cfg->n_cfgs; ++i) { |
2164 | if (count > 0) { |
2165 | cf_dyn_buf_append_char(db, ','); |
2166 | } |
2167 | |
2168 | cf_dyn_buf_append_string(db, |
2169 | cf_ip_addr_print(&multicast_cfg->cfgs[i].addr)); |
2170 | ++count; |
2171 | } |
2172 | cf_dyn_buf_append_char(db, ';'); |
2173 | } |
2174 | } |
2175 | |
2176 | /** |
2177 | * Generate a string for listening address and port in format ip_address:port |
2178 | * and return the heartbeat mode. |
2179 | * |
2180 | * @param mode (output) current heartbeat subsystem mode. |
2181 | * @param addr_port (output) listening ip address and port formatted as |
2182 | * ip_address:port |
2183 | * @param addr_port_capacity the capacity of the addr_port input. |
2184 | */ |
2185 | void |
2186 | as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port, |
2187 | size_t addr_port_capacity) |
2188 | { |
2189 | *mode = hb_is_mesh() ? AS_HB_MODE_MESH : AS_HB_MODE_MULTICAST; |
2190 | if (hb_is_mesh()) { |
2191 | endpoint_list_to_string_udata udata; |
2192 | udata.endpoint_list_str = addr_port; |
2193 | udata.endpoint_list_str_capacity = addr_port_capacity; |
2194 | mesh_published_endpoints_process(endpoint_list_to_string_process, |
2195 | &udata); |
2196 | } |
2197 | else { |
2198 | const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get(); |
2199 | |
2200 | char* write_ptr = addr_port; |
2201 | int remaining = addr_port_capacity; |
2202 | |
2203 | // Ensure we leave space for the terminating NULL delimiter. |
2204 | for (int i = 0; i < multicast_cfg->n_cfgs && remaining > 1; i++) { |
2205 | cf_sock_addr temp; |
2206 | cf_ip_addr_copy(&multicast_cfg->cfgs[i].addr, &temp.addr); |
2207 | temp.port = multicast_cfg->cfgs[i].port; |
2208 | int rv = cf_sock_addr_to_string(&temp, write_ptr, remaining); |
2209 | if (rv <= 0) { |
2210 | // We exhausted the write buffer. |
2211 | // Ensure NULL termination. |
2212 | addr_port[addr_port_capacity - 1] = 0; |
2213 | return; |
2214 | } |
2215 | |
2216 | write_ptr += rv; |
2217 | remaining -= rv; |
2218 | |
2219 | if (i != multicast_cfg->n_cfgs - 1 && remaining > 1) { |
2220 | *write_ptr = ','; |
2221 | write_ptr++; |
2222 | remaining--; |
2223 | } |
2224 | } |
2225 | |
2226 | // Ensure NULL termination. |
2227 | *write_ptr = 0; |
2228 | } |
2229 | } |
2230 | |
2231 | /** |
2232 | * Populate the buffer with duplicate nodeids. |
2233 | */ |
2234 | void |
2235 | as_hb_info_duplicates_get(cf_dyn_buf* db) |
2236 | { |
2237 | cf_dyn_buf_append_string(db, "cluster_duplicate_nodes=" ); |
2238 | |
2239 | HB_LOCK(); |
2240 | bool self_is_duplicate = hb_self_is_duplicate(); |
2241 | int num_probation = cf_shash_get_size(g_hb.on_probation); |
2242 | cf_node duplicate_list[num_probation + 1]; |
2243 | |
2244 | if (!self_is_duplicate && num_probation == 0) { |
2245 | cf_dyn_buf_append_string(db, "null" ); |
2246 | goto Exit; |
2247 | } |
2248 | |
2249 | as_hb_adjacency_reduce_udata probation_reduce_udata = { duplicate_list, 0 }; |
2250 | |
2251 | cf_shash_reduce(g_hb.on_probation, hb_adjacency_iterate_reduce, |
2252 | &probation_reduce_udata); |
2253 | |
2254 | if (hb_self_is_duplicate()) { |
2255 | duplicate_list[probation_reduce_udata.adj_count++] = |
2256 | config_self_nodeid_get(); |
2257 | } |
2258 | |
2259 | int num_duplicates = probation_reduce_udata.adj_count; |
2260 | qsort(duplicate_list, num_duplicates, sizeof(cf_node), |
2261 | cf_node_compare_desc); |
2262 | |
2263 | for (int i = 0; i < num_duplicates; i++) { |
2264 | cf_dyn_buf_append_uint64_x(db, duplicate_list[i]); |
2265 | cf_dyn_buf_append_char(db, ','); |
2266 | } |
2267 | cf_dyn_buf_chomp(db); |
2268 | |
2269 | Exit: |
2270 | HB_UNLOCK(); |
2271 | cf_dyn_buf_append_char(db, ';'); |
2272 | } |
2273 | |
2274 | /* |
2275 | * ----------------------------------------------------------------- |
2276 | * Mesh mode public API |
2277 | * ----------------------------------------------------------------- |
2278 | */ |
2279 | |
2280 | /** |
2281 | * Add an aerospike instance from the mesh seed list. |
2282 | */ |
2283 | int |
2284 | as_hb_mesh_tip(char* host, int port, bool tls) |
2285 | { |
2286 | if (!hb_is_mesh()) { |
2287 | WARNING("tip not applicable for multicast" ); |
2288 | return (-1); |
2289 | } |
2290 | |
2291 | return mesh_tip(host, port, tls); |
2292 | } |
2293 | |
2294 | /** |
2295 | * Remove a mesh node instance from the mesh list. |
2296 | */ |
2297 | int |
2298 | as_hb_mesh_tip_clear(char* host, int port) |
2299 | { |
2300 | if (!hb_is_mesh()) { |
2301 | WARNING("tip clear not applicable for multicast" ); |
2302 | return (-1); |
2303 | } |
2304 | |
2305 | if (host == NULL || host[0] == 0 |
2306 | || strnlen(host, DNS_NAME_MAX_SIZE) == DNS_NAME_MAX_SIZE) { |
2307 | WARNING("invalid tip clear host:%s or port:%d" , host, port); |
2308 | return (-1); |
2309 | } |
2310 | |
2311 | MESH_LOCK(); |
2312 | DETAIL("executing tip clear for %s:%d" , host, port); |
2313 | |
2314 | // FIXME: Remove the mesh host entry and close channel was done to meet |
2315 | // AER-5241 ??? |
2316 | // tip-clear is not a mechanism to throw a connected node out of the |
2317 | // cluster. |
2318 | // We should not be required to use this mechanism now. |
2319 | // tip-clear should only be used to cleanup seed list after decommisioning |
2320 | // an ip. |
2321 | cf_ip_addr addrs[CF_SOCK_CFG_MAX]; |
2322 | uint32_t n_addrs = CF_SOCK_CFG_MAX; |
2323 | |
2324 | as_hb_mesh_tip_clear_udata mesh_tip_clear_reduce_udata; |
2325 | strcpy(mesh_tip_clear_reduce_udata.host, host); |
2326 | mesh_tip_clear_reduce_udata.port = port; |
2327 | mesh_tip_clear_reduce_udata.entry_deleted = false; |
2328 | mesh_tip_clear_reduce_udata.nodeid = 0; |
2329 | |
2330 | if (cf_ip_addr_from_string_multi(host, addrs, &n_addrs) != 0) { |
2331 | n_addrs = 0; |
2332 | } |
2333 | |
2334 | mesh_tip_clear_reduce_udata.addrs = addrs; |
2335 | mesh_tip_clear_reduce_udata.n_addrs = n_addrs; |
2336 | |
2337 | int seed_index = mesh_seed_find_unsafe(host, port); |
2338 | if (seed_index >= 0) { |
2339 | as_hb_mesh_seed* seed = cf_vector_getp( |
2340 | &g_hb.mode_state.mesh_state.seeds, seed_index); |
2341 | mesh_tip_clear_reduce_udata.nodeid = seed->mesh_nodeid; |
2342 | } |
2343 | |
2344 | // Refresh the mapping between the seeds and the mesh hosts. |
2345 | mesh_seed_inactive_refresh_get_unsafe (NULL); |
2346 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
2347 | mesh_tip_clear_reduce, &mesh_tip_clear_reduce_udata); |
2348 | |
2349 | // Remove the seed entry in case we do not find a matching mesh entry. |
2350 | // Will happen trivially if this seed could not be connected. |
2351 | mesh_tip_clear_reduce_udata.entry_deleted = |
2352 | mesh_tip_clear_reduce_udata.entry_deleted |
2353 | || mesh_seed_delete_unsafe( |
2354 | mesh_seed_find_unsafe(host, port)) == 0; |
2355 | |
2356 | MESH_UNLOCK(); |
2357 | return mesh_tip_clear_reduce_udata.entry_deleted ? 0 : -1; |
2358 | } |
2359 | |
2360 | /** |
2361 | * Clear the entire mesh list. |
2362 | */ |
2363 | int |
2364 | as_hb_mesh_tip_clear_all(uint32_t* cleared) |
2365 | { |
2366 | if (!hb_is_mesh()) { |
2367 | WARNING("tip clear not applicable for multicast" ); |
2368 | return (-1); |
2369 | } |
2370 | |
2371 | MESH_LOCK(); |
2372 | *cleared = cf_shash_get_size( |
2373 | g_hb.mode_state.mesh_state.nodeid_to_mesh_node); |
2374 | |
2375 | // Refresh the mapping between the seeds and the mesh hosts. |
2376 | mesh_seed_inactive_refresh_get_unsafe(NULL); |
2377 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
2378 | mesh_tip_clear_reduce, NULL); |
2379 | |
2380 | // Remove all entries that did not have a matching mesh endpoint. |
2381 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
2382 | int element_count = cf_vector_size(seeds); |
2383 | for (int i = 0; i < element_count; i++) { |
2384 | if (mesh_seed_delete_unsafe(i) == 0) { |
2385 | i--; |
2386 | element_count--; |
2387 | } |
2388 | else { |
2389 | // Should not happen in practice. |
2390 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
2391 | CRASH("error deleting mesh seed entry %s:%d" , seed->seed_host_name, |
2392 | seed->seed_port); |
2393 | } |
2394 | } |
2395 | |
2396 | MESH_UNLOCK(); |
2397 | return (0); |
2398 | } |
2399 | |
2400 | /** |
2401 | * Read the plugin data for a node in the adjacency list. The plugin_data->data |
2402 | * input param should be pre allocated and plugin_data->data_capacity should |
2403 | * indicate its capacity. |
2404 | * |
2405 | * @param nodeid the node id |
2406 | * @param pluginid the plugin identifier. |
2407 | * @param plugin_data (input/output) on success plugin_data->data will be the |
2408 | * plugin's data for the node and plugin_data->data_size will be the data size. |
2409 | * node. NULL if there is no plugin data. |
2410 | * @praram msg_hlc_ts (output) if not NULL will be filled with the timestamp of |
2411 | * when the hb message for this data was received. |
2412 | * @param recv_monotonic_ts (output) if not NULL will be filled with monotonic |
2413 | * wall clock receive timestamp for this plugin data. |
2414 | * @return 0 on success and -1 on error, where errno will be set to ENOENT if |
2415 | * there is no entry for this node and ENOMEM if the input plugin data's |
2416 | * capacity is less than plugin's data. In ENOMEM case plugin_data->data_size |
2417 | * will be set to the required capacity. |
2418 | */ |
2419 | int |
2420 | as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin, |
2421 | as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts, |
2422 | cf_clock* recv_monotonic_ts) |
2423 | { |
2424 | int rv = 0; |
2425 | |
2426 | HB_LOCK(); |
2427 | |
2428 | as_hb_adjacent_node adjacent_node; |
2429 | if (hb_adjacent_node_get(nodeid, &adjacent_node) != 0) { |
2430 | rv = -1; |
2431 | plugin_data->data_size = 0; |
2432 | errno = ENOENT; |
2433 | goto Exit; |
2434 | } |
2435 | |
2436 | as_hb_plugin_node_data* plugin_data_internal = |
2437 | &adjacent_node.plugin_data[plugin][adjacent_node.plugin_data_cycler |
2438 | % 2]; |
2439 | |
2440 | if (plugin_data_internal->data && plugin_data_internal->data_size) { |
2441 | // Set the plugin data size |
2442 | plugin_data->data_size = plugin_data_internal->data_size; |
2443 | |
2444 | if (plugin_data_internal->data_size > plugin_data->data_capacity) { |
2445 | rv = -1; |
2446 | errno = ENOMEM; |
2447 | goto Exit; |
2448 | } |
2449 | |
2450 | // Copy over the stored copy of the plugin data. |
2451 | memcpy(plugin_data->data, plugin_data_internal->data, |
2452 | plugin_data_internal->data_size); |
2453 | |
2454 | // Copy the message timestamp. |
2455 | if (msg_hlc_ts) { |
2456 | memcpy(msg_hlc_ts, &adjacent_node.last_msg_hlc_ts, |
2457 | sizeof(as_hlc_msg_timestamp)); |
2458 | } |
2459 | |
2460 | if (recv_monotonic_ts) { |
2461 | *recv_monotonic_ts = adjacent_node.last_updated_monotonic_ts; |
2462 | } |
2463 | |
2464 | rv = 0; |
2465 | } |
2466 | else { |
2467 | // No plugin data set. |
2468 | plugin_data->data_size = 0; |
2469 | if (recv_monotonic_ts) { |
2470 | *recv_monotonic_ts = 0; |
2471 | } |
2472 | if (msg_hlc_ts) { |
2473 | memset(msg_hlc_ts, 0, sizeof(as_hlc_msg_timestamp)); |
2474 | } |
2475 | rv = 0; |
2476 | } |
2477 | |
2478 | Exit: |
2479 | HB_UNLOCK(); |
2480 | return rv; |
2481 | } |
2482 | |
2483 | /** |
2484 | * Call the iterate method on plugin data for all nodes in the input vector. The |
2485 | * iterate function will be invoked for all nodes in the input vector even if |
2486 | * they are not in the adjacency list or they have no plugin data. Plugin data |
2487 | * will be NULL with size zero in such cases. |
2488 | * |
2489 | * @param nodes the iterate on. |
2490 | * @param plugin the plugin identifier. |
2491 | * @param iterate_fn the iterate function invoked for plugin data for every |
2492 | * node. |
2493 | * @param udata passed as is to the iterate function. Useful for getting results |
2494 | * out of the iteration. |
2495 | * NULL if there is no plugin data. |
2496 | * @return the size of the plugin data. 0 if there is no plugin data. |
2497 | */ |
2498 | void |
2499 | as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin, |
2500 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata) |
2501 | |
2502 | { |
2503 | HB_LOCK(); |
2504 | |
2505 | int size = cf_vector_size(nodes); |
2506 | |
2507 | for (int i = 0; i < size; i++) { |
2508 | cf_node* nodeid = cf_vector_getp(nodes, i); |
2509 | |
2510 | if (nodeid == NULL || *nodeid == 0) { |
2511 | continue; |
2512 | } |
2513 | |
2514 | as_hb_adjacent_node nodeinfo; |
2515 | |
2516 | if (hb_adjacent_node_get(*nodeid, &nodeinfo) == 0) { |
2517 | size_t data_size = 0; |
2518 | void* data = NULL; |
2519 | |
2520 | hb_adjacent_node_plugin_data_get(&nodeinfo, plugin, &data, |
2521 | &data_size); |
2522 | |
2523 | iterate_fn(*nodeid, data, data_size, |
2524 | nodeinfo.last_updated_monotonic_ts, |
2525 | &nodeinfo.last_msg_hlc_ts, udata); |
2526 | } |
2527 | else { |
2528 | // This node is not known to the heartbeat subsystem. |
2529 | iterate_fn(*nodeid, NULL, 0, 0, NULL, udata); |
2530 | } |
2531 | } |
2532 | |
2533 | HB_UNLOCK(); |
2534 | } |
2535 | |
2536 | /** |
2537 | * Call the iterate method on all nodes in current adjacency list. Note plugin |
2538 | * data can still be NULL if the plugin data failed to parse the plugin data. |
2539 | * |
2540 | * @param pluginid the plugin identifier. |
2541 | * @param iterate_fn the iterate function invoked for plugin data for every |
2542 | * node. |
2543 | * @param udata passed as is to the iterate function. Useful for getting results |
2544 | * out of the iteration. |
2545 | * NULL if there is no plugin data. |
2546 | * @return the size of the plugin data. 0 if there is no plugin data. |
2547 | */ |
2548 | void |
2549 | as_hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, |
2550 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata) |
2551 | { |
2552 | hb_plugin_data_iterate_all(pluginid, iterate_fn, udata); |
2553 | } |
2554 | |
2555 | /** |
2556 | * Log the state of the heartbeat module. |
2557 | */ |
2558 | void |
2559 | as_hb_dump(bool verbose) |
2560 | { |
2561 | INFO("Heartbeat Dump:" ); |
2562 | |
2563 | as_hb_mode mode; |
2564 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
2565 | as_hb_info_listen_addr_get(&mode, endpoint_list_str, |
2566 | sizeof(endpoint_list_str)); |
2567 | |
2568 | // Dump the config. |
2569 | INFO("HB Mode: %s (%d)" , |
2570 | (mode == AS_HB_MODE_MULTICAST ? |
2571 | "multicast" : |
2572 | (mode == AS_HB_MODE_MESH ? "mesh" : "undefined" )), mode); |
2573 | |
2574 | INFO("HB Addresses: {%s}" , endpoint_list_str); |
2575 | INFO("HB MTU: %d" , hb_mtu()); |
2576 | |
2577 | INFO("HB Interval: %d" , config_tx_interval_get()); |
2578 | INFO("HB Timeout: %d" , config_max_intervals_missed_get()); |
2579 | char protocol_s[HB_PROTOCOL_STR_MAX_LEN]; |
2580 | as_hb_protocol_get_s(config_protocol_get(), protocol_s); |
2581 | INFO("HB Protocol: %s (%d)" , protocol_s, config_protocol_get()); |
2582 | |
2583 | // dump mode specific state. |
2584 | hb_mode_dump(verbose); |
2585 | |
2586 | // Dump the channel state. |
2587 | channel_dump(verbose); |
2588 | |
2589 | // Dump the adjacency list. |
2590 | hb_dump(verbose); |
2591 | } |
2592 | |
2593 | /** |
2594 | * Indicates if a node is alive. |
2595 | */ |
2596 | bool |
2597 | as_hb_is_alive(cf_node nodeid) |
2598 | { |
2599 | bool is_alive; |
2600 | HB_LOCK(); |
2601 | |
2602 | as_hb_adjacent_node adjacent_node; |
2603 | is_alive = (nodeid == config_self_nodeid_get()) |
2604 | || (hb_adjacent_node_get(nodeid, &adjacent_node) == 0); |
2605 | |
2606 | HB_UNLOCK(); |
2607 | return is_alive; |
2608 | } |
2609 | |
2610 | /** |
2611 | * Compute the nodes to evict from the input nodes so that remaining nodes form |
2612 | * a clique, based on adjacency lists. Self nodeid is never considered for |
2613 | * eviction. |
2614 | * |
2615 | * @param nodes input cf_node vector. |
2616 | * @param nodes_to_evict output cf_node clique array, that is initialized. |
2617 | */ |
2618 | void |
2619 | as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict) |
2620 | { |
2621 | hb_maximal_clique_evict(nodes, nodes_to_evict); |
2622 | } |
2623 | |
2624 | /** |
2625 | * Read the hlc timestamp for the message. |
2626 | * Note: A protected API for the sole benefit of skew monitor. |
2627 | * |
2628 | * @param msg the incoming message. |
2629 | * @param send_ts the output hlc timestamp. |
2630 | * @return 0 if the time stamp could be parsed -1 on failure. |
2631 | */ |
2632 | int |
2633 | as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts) |
2634 | { |
2635 | return msg_send_hlc_ts_get(msg, send_ts); |
2636 | } |
2637 | |
2638 | /* |
2639 | * ---------------------------------------------------------------------------- |
2640 | * Common sub module. |
2641 | * ---------------------------------------------------------------------------- |
2642 | */ |
2643 | |
2644 | /* |
2645 | * ---------------------------------------------------------------------------- |
2646 | * Utility |
2647 | * ---------------------------------------------------------------------------- |
2648 | */ |
2649 | |
2650 | /** |
2651 | * Round up input int to the nearest power of two. |
2652 | */ |
2653 | static uint32_t |
2654 | round_up_pow2(uint32_t v) |
2655 | { |
2656 | v--; |
2657 | v |= v >> 1; |
2658 | v |= v >> 2; |
2659 | v |= v >> 4; |
2660 | v |= v >> 8; |
2661 | v |= v >> 16; |
2662 | v++; |
2663 | return v; |
2664 | } |
2665 | |
2666 | /** |
2667 | * Generate a hash code for a cf_socket. |
2668 | */ |
2669 | static uint32_t |
2670 | hb_socket_hash_fn(const void* key) |
2671 | { |
2672 | const cf_socket** socket = (const cf_socket**)key; |
2673 | return cf_hash_jen32((const uint8_t*)socket, sizeof(cf_socket*)); |
2674 | } |
2675 | |
2676 | /** |
2677 | * Reduce function to delete all entries in a map |
2678 | */ |
2679 | static int |
2680 | hb_delete_all_reduce(const void* key, void* data, void* udata) |
2681 | { |
2682 | return CF_SHASH_REDUCE_DELETE; |
2683 | } |
2684 | |
2685 | /* |
2686 | * ---------------------------------------------------------------------------- |
2687 | * Info call related |
2688 | * ---------------------------------------------------------------------------- |
2689 | */ |
2690 | |
2691 | /** |
2692 | * Append a address spec to a cf_dyn_buf. |
2693 | */ |
2694 | static void |
2695 | info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list) |
2696 | { |
2697 | for (uint32_t i = 0; i < list->n_addrs; ++i) { |
2698 | info_append_string(db, name, list->addrs[i]); |
2699 | } |
2700 | } |
2701 | |
2702 | /* |
2703 | * ---------------------------------------------------------------------------- |
2704 | * Vector operations |
2705 | * ---------------------------------------------------------------------------- |
2706 | */ |
2707 | |
2708 | /** |
2709 | * TODO: Move this to cf_vector. |
2710 | * Find the index of an element in the vector. Equality is based on mem compare. |
2711 | * |
2712 | * @param vector the source vector. |
2713 | * @param element the element to find. |
2714 | * @return the index if the element is found, -1 otherwise. |
2715 | */ |
2716 | static int |
2717 | vector_find(cf_vector* vector, const void* element) |
2718 | { |
2719 | int element_count = cf_vector_size(vector); |
2720 | size_t value_len = cf_vector_element_size(vector); |
2721 | for (int i = 0; i < element_count; i++) { |
2722 | // No null check required since we are iterating under a lock and within |
2723 | // vector bounds. |
2724 | void* src_element = cf_vector_getp(vector, i); |
2725 | if (src_element) { |
2726 | if (memcmp(element, src_element, value_len) == 0) { |
2727 | return i; |
2728 | } |
2729 | } |
2730 | } |
2731 | return -1; |
2732 | } |
2733 | |
2734 | /* |
2735 | * ---------------------------------------------------------------------------- |
2736 | * Endpoint list related |
2737 | * ---------------------------------------------------------------------------- |
2738 | */ |
2739 | |
2740 | /** |
2741 | * Copy an endpoint list to the destination, while possible reallocating the |
2742 | * destination space. |
2743 | * @param dest the double pointer to the destination list, because it might need |
2744 | * reallocation to accommodate a larger source list. |
2745 | * @param src the source endpoint list. |
2746 | */ |
2747 | static void |
2748 | endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src) |
2749 | { |
2750 | size_t src_size; |
2751 | |
2752 | if (as_endpoint_list_sizeof(src, &src_size) != 0) { |
2753 | // Bad endpoint list passed. |
2754 | CRASH("invalid adjacency list passed for copying" ); |
2755 | } |
2756 | |
2757 | *dest = cf_realloc(*dest, src_size); |
2758 | |
2759 | memcpy(*dest, src, src_size); |
2760 | } |
2761 | |
2762 | /** |
2763 | * Process function to convert endpoint list to a string. |
2764 | */ |
2765 | static void |
2766 | endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, |
2767 | void* udata) |
2768 | { |
2769 | endpoint_list_to_string_udata* to_string_udata = |
2770 | (endpoint_list_to_string_udata*)udata; |
2771 | as_endpoint_list_to_string(endpoint_list, |
2772 | to_string_udata->endpoint_list_str, |
2773 | to_string_udata->endpoint_list_str_capacity); |
2774 | } |
2775 | |
2776 | /** |
2777 | * Process function to check if endpoint lists overlap. |
2778 | */ |
2779 | static void |
2780 | endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata) |
2781 | { |
2782 | endpoint_list_equal_check_udata* equal_udata = |
2783 | (endpoint_list_equal_check_udata*)udata; |
2784 | |
2785 | equal_udata->are_equal = equal_udata->are_equal |
2786 | || as_endpoint_lists_are_equal(endpoint_list, equal_udata->other); |
2787 | } |
2788 | |
2789 | /* |
2790 | * ---------------------------------------------------------------------------- |
2791 | * Messge related |
2792 | * ---------------------------------------------------------------------------- |
2793 | */ |
2794 | |
2795 | /** |
2796 | * The size of a buffer beyond which compression should be applied. For now set |
2797 | * to 60% of the interface mtu. |
2798 | */ |
2799 | static int |
2800 | msg_compression_threshold(int mtu) |
2801 | { |
2802 | return (int)(mtu * 0.6); |
2803 | } |
2804 | |
2805 | /** |
2806 | * Read advertised endpoint list from an incoming message. |
2807 | * @param msg the incoming message. |
2808 | * @param endpoint_list the output endpoint. The endpoint_list will point to |
2809 | * input message. |
2810 | * internal location and should not be freed. |
2811 | * @return 0 on success -1 on failure. |
2812 | */ |
2813 | static int |
2814 | msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list) |
2815 | { |
2816 | size_t endpoint_list_size; |
2817 | if (msg_get_buf(msg, AS_HB_MSG_ENDPOINTS, (uint8_t**)endpoint_list, |
2818 | &endpoint_list_size, MSG_GET_DIRECT) != 0) { |
2819 | return -1; |
2820 | } |
2821 | |
2822 | size_t parsed_size; |
2823 | if (as_endpoint_list_nsizeof(*endpoint_list, &parsed_size, |
2824 | endpoint_list_size) || parsed_size != endpoint_list_size) { |
2825 | return -1; |
2826 | } |
2827 | return 0; |
2828 | } |
2829 | |
2830 | /** |
2831 | * Read the protocol identifier for this heartbeat message. These functions can |
2832 | * get called multiple times for a single message. Hence they do not increment |
2833 | * error counters. |
2834 | * |
2835 | * @param msg the incoming message. |
2836 | * @param id the output id. |
2837 | * @return 0 if the id could be parsed -1 on failure. |
2838 | */ |
2839 | static int |
2840 | msg_id_get(msg* msg, uint32_t* id) |
2841 | { |
2842 | if (msg_get_uint32(msg, AS_HB_MSG_ID, id) != 0) { |
2843 | return -1; |
2844 | } |
2845 | |
2846 | return 0; |
2847 | } |
2848 | |
2849 | /** |
2850 | * Read the source nodeid for a node. These functions can get called multiple |
2851 | * times for a single message. Hence they do not increment error counters. |
2852 | * @param msg the incoming message. |
2853 | * @param nodeid the output nodeid. |
2854 | * @return 0 if the nodeid could be parsed -1 on failure. |
2855 | */ |
2856 | static int |
2857 | msg_nodeid_get(msg* msg, cf_node* nodeid) |
2858 | { |
2859 | if (msg_get_uint64(msg, AS_HB_MSG_NODE, nodeid) != 0) { |
2860 | return -1; |
2861 | } |
2862 | |
2863 | return 0; |
2864 | } |
2865 | |
2866 | /** |
2867 | * Read the HLC send timestamp for the message. These functions can get called |
2868 | * multiple times for a single message. Hence they do not increment error |
2869 | * counters. |
2870 | * @param msg the incoming message. |
2871 | * @param send_ts the output hlc timestamp. |
2872 | * @return 0 if the time stamp could be parsed -1 on failure. |
2873 | */ |
2874 | static int |
2875 | msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts) |
2876 | { |
2877 | if (msg_get_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, send_ts) != 0) { |
2878 | return -1; |
2879 | } |
2880 | |
2881 | return 0; |
2882 | } |
2883 | |
2884 | /** |
2885 | * Read the message type. These functions can get called multiple times for a |
2886 | * single message. Hence they do not increment error counters. |
2887 | * @param msg the incoming message. |
2888 | * @param type the output message type. |
2889 | * @return 0 if the type could be parsed -1 on failure. |
2890 | */ |
2891 | static int |
2892 | msg_type_get(msg* msg, as_hb_msg_type* type) |
2893 | { |
2894 | if (msg_get_uint32(msg, AS_HB_MSG_TYPE, type) != 0) { |
2895 | return -1; |
2896 | } |
2897 | |
2898 | return 0; |
2899 | } |
2900 | |
2901 | /** |
2902 | * Read the cluster name. |
2903 | * @param msg the incoming message. |
2904 | * @param cluster name of the output message type. |
2905 | * @return 0 if the cluster name could be parsed -1 on failure. |
2906 | */ |
2907 | static int |
2908 | msg_cluster_name_get(msg* msg, char** cluster_name) |
2909 | { |
2910 | if (msg_get_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, |
2911 | MSG_GET_DIRECT) != 0) { |
2912 | return -1; |
2913 | } |
2914 | |
2915 | return 0; |
2916 | } |
2917 | |
2918 | /** |
2919 | * Get a pointer to a node list in the message. |
2920 | * |
2921 | * @param msg the incoming message. |
2922 | * @param field_id the field id. |
2923 | * @param adj_list output. on success will point to the adjacency list in the |
2924 | * message. |
2925 | * @para adj_length output. on success will contain the length of the adjacency |
2926 | * list. |
2927 | * @return 0 on success. -1 if the adjacency list is absent. |
2928 | */ |
2929 | static int |
2930 | msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, |
2931 | size_t* adj_length) |
2932 | { |
2933 | if (msg_get_buf(msg, field_id, (uint8_t**)adj_list, adj_length, |
2934 | MSG_GET_DIRECT) != 0) { |
2935 | return -1; |
2936 | } |
2937 | |
2938 | // correct adjacency list length. |
2939 | *adj_length /= sizeof(cf_node); |
2940 | |
2941 | return 0; |
2942 | } |
2943 | |
2944 | /** |
2945 | * Get a pointer to the adjacency list in the message. |
2946 | * |
2947 | * @param msg the incoming message. |
2948 | * @param adj_list output. on success will point to the adjacency list in the |
2949 | * message. |
2950 | * @para adj_length output. on success will contain the length of the adjacency |
2951 | * list. |
2952 | * @return 0 on success. -1 if the adjacency list is absent. |
2953 | */ |
2954 | static int |
2955 | msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length) |
2956 | { |
2957 | return msg_node_list_get(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length); |
2958 | } |
2959 | |
2960 | /** |
2961 | * Set a node list on an outgoing messages for a field. |
2962 | * |
2963 | * @param msg the outgoing message. |
2964 | * @param field_id the id of the list field. |
2965 | * @param node_list the adjacency list to set. |
2966 | * @para node_length the length of the adjacency list. |
2967 | */ |
2968 | static void |
2969 | msg_node_list_set(msg* msg, int field_id, cf_node* node_list, |
2970 | size_t node_length) |
2971 | { |
2972 | msg_set_buf(msg, field_id, (uint8_t*)node_list, |
2973 | sizeof(cf_node) * node_length, MSG_SET_COPY); |
2974 | } |
2975 | |
2976 | /** |
2977 | * Set the adjacency list on an outgoing messages. |
2978 | * |
2979 | * @param msg the outgoing message. |
2980 | * @param adj_list the adjacency list to set. |
2981 | * @para adj_length the length of the adjacency list. |
2982 | */ |
2983 | static void |
2984 | msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length) |
2985 | { |
2986 | msg_node_list_set(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length); |
2987 | } |
2988 | |
2989 | /** |
2990 | * Set the info reply on an outgoing messages. |
2991 | * |
2992 | * @param msg the outgoing message. |
2993 | * @param response the response list to set. |
2994 | * @para response_count the length of the response list. |
2995 | */ |
2996 | static void |
2997 | msg_info_reply_set(msg* msg, as_hb_mesh_info_reply* response, |
2998 | size_t response_count) |
2999 | { |
3000 | size_t response_size = 0; |
3001 | if (mesh_info_reply_sizeof(response, response_count, &response_size)) { |
3002 | CRASH("error setting info reply on msg" ); |
3003 | } |
3004 | |
3005 | msg_set_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t*)response, response_size, |
3006 | MSG_SET_COPY); |
3007 | |
3008 | return; |
3009 | } |
3010 | |
3011 | /** |
3012 | * Get a pointer to the info reply list in the message. |
3013 | * |
3014 | * @param msg the incoming message. |
3015 | * @param reply output. on success will point to the reply list in the message. |
3016 | * @param reply_count output. on success will contain the length of the reply |
3017 | * list. |
3018 | * @return 0 on success. -1 if the reply list is absent. |
3019 | */ |
3020 | static int |
3021 | msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count) |
3022 | { |
3023 | size_t reply_size; |
3024 | if (msg_get_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t**)reply, &reply_size, |
3025 | MSG_GET_DIRECT) != 0) { |
3026 | return -1; |
3027 | } |
3028 | |
3029 | *reply_count = 0; |
3030 | |
3031 | // Go over reply and compute the count of replies and also validate the |
3032 | // endpoint lists. |
3033 | uint8_t* start_ptr = (uint8_t*)*reply; |
3034 | int64_t remaining_size = reply_size; |
3035 | |
3036 | while (remaining_size > 0) { |
3037 | as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; |
3038 | remaining_size -= sizeof(as_hb_mesh_info_reply); |
3039 | start_ptr += sizeof(as_hb_mesh_info_reply); |
3040 | if (remaining_size <= 0) { |
3041 | // Incomplete / garbled info reply message. |
3042 | *reply_count = 0; |
3043 | return -1; |
3044 | } |
3045 | |
3046 | size_t endpoint_list_size = 0; |
3047 | if (as_endpoint_list_nsizeof(reply_ptr->endpoint_list, |
3048 | &endpoint_list_size, remaining_size) != 0) { |
3049 | // Incomplete / garbled info reply message. |
3050 | *reply_count = 0; |
3051 | return -1; |
3052 | } |
3053 | |
3054 | remaining_size -= endpoint_list_size; |
3055 | start_ptr += endpoint_list_size; |
3056 | (*reply_count)++; |
3057 | } |
3058 | |
3059 | return 0; |
3060 | } |
3061 | |
3062 | /** |
3063 | * Fill a message with an endpoint list. |
3064 | */ |
3065 | static void |
3066 | msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, |
3067 | void* udata) |
3068 | { |
3069 | endpoint_list_to_msg_udata* to_msg_udata = |
3070 | (endpoint_list_to_msg_udata*)udata; |
3071 | msg* msg = to_msg_udata->msg; |
3072 | bool is_mesh = to_msg_udata->is_mesh; |
3073 | |
3074 | if (!published_endpoint_list) { |
3075 | if (is_mesh) { |
3076 | // Something is messed up. Except for v3 multicast, |
3077 | // published list should not be empty. |
3078 | WARNING("published endpoint list is empty" ); |
3079 | } |
3080 | return; |
3081 | } |
3082 | |
3083 | // Makes sense only for mesh. |
3084 | if (is_mesh && published_endpoint_list) { |
3085 | // Set the source address |
3086 | size_t endpoint_list_size = 0; |
3087 | as_endpoint_list_sizeof(published_endpoint_list, &endpoint_list_size); |
3088 | msg_set_buf(msg, AS_HB_MSG_ENDPOINTS, |
3089 | (uint8_t*)published_endpoint_list, endpoint_list_size, |
3090 | MSG_SET_COPY); |
3091 | } |
3092 | } |
3093 | |
3094 | /** |
3095 | * Fill source fields for the message. |
3096 | * @param msg the message to fill the source fields into. |
3097 | */ |
3098 | static void |
3099 | msg_src_fields_fill(msg* msg) |
3100 | { |
3101 | bool is_mesh = hb_is_mesh(); |
3102 | |
3103 | // Set the hb protocol id / version. |
3104 | msg_set_uint32(msg, AS_HB_MSG_ID, hb_protocol_identifier_get()); |
3105 | |
3106 | // Set the source node. |
3107 | msg_set_uint64(msg, AS_HB_MSG_NODE, config_self_nodeid_get()); |
3108 | |
3109 | endpoint_list_to_msg_udata udata; |
3110 | udata.msg = msg; |
3111 | udata.is_mesh = is_mesh; |
3112 | |
3113 | if (is_mesh) { |
3114 | // Endpoint list only valid for mesh mode. |
3115 | mesh_published_endpoints_process(msg_published_endpoints_fill, &udata); |
3116 | } |
3117 | |
3118 | // Set the send hlc timestamp |
3119 | msg_set_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, as_hlc_timestamp_now()); |
3120 | } |
3121 | |
3122 | /** |
3123 | * Set the type for an outgoing message. |
3124 | * @param msg the outgoing message. |
3125 | * @param msg_type the type to set. |
3126 | */ |
3127 | static void |
3128 | msg_type_set(msg* msg, as_hb_msg_type msg_type) |
3129 | { |
3130 | // Set the message type. |
3131 | msg_set_uint32(msg, AS_HB_MSG_TYPE, msg_type); |
3132 | } |
3133 | |
3134 | /* |
3135 | * ---------------------------------------------------------------------------- |
3136 | * Config sub module. |
3137 | * ---------------------------------------------------------------------------- |
3138 | */ |
3139 | |
3140 | /** |
3141 | * Get mcsize. |
3142 | */ |
3143 | static int |
3144 | config_mcsize() |
3145 | { |
3146 | int mode_cluster_size = 0; |
3147 | if (hb_is_mesh()) { |
3148 | // Only bounded by available memory. But let's say its infinite. |
3149 | mode_cluster_size = INT_MAX; |
3150 | } |
3151 | else { |
3152 | mode_cluster_size = multicast_supported_cluster_size_get(); |
3153 | } |
3154 | |
3155 | // Ensure we are always upper bounded by the absolute max cluster size. |
3156 | int supported_cluster_size = MIN(ASC, mode_cluster_size); |
3157 | |
3158 | DETAIL("supported cluster size %d" , supported_cluster_size); |
3159 | return supported_cluster_size; |
3160 | } |
3161 | |
3162 | /** |
3163 | * Get the binding addresses for the heartbeat subsystem. |
3164 | */ |
3165 | static const cf_serv_cfg* |
3166 | config_bind_cfg_get() |
3167 | { |
3168 | // Not protected by config_lock because it is not changed. |
3169 | return &g_config.hb_config.bind_cfg; |
3170 | } |
3171 | |
3172 | /** |
3173 | * Get the multicast groups for the multicast mode. |
3174 | */ |
3175 | static const cf_mserv_cfg* |
3176 | config_multicast_group_cfg_get() |
3177 | { |
3178 | // Not protected by config_lock. Never updated after config parsing.. |
3179 | return &g_config.hb_config.multicast_group_cfg; |
3180 | } |
3181 | |
3182 | /** |
3183 | * Get the heartbeat pulse transmit interval. |
3184 | */ |
3185 | static uint32_t |
3186 | config_tx_interval_get() |
3187 | { |
3188 | HB_CONFIG_LOCK(); |
3189 | uint32_t interval = g_config.hb_config.tx_interval; |
3190 | HB_CONFIG_UNLOCK(); |
3191 | return interval; |
3192 | } |
3193 | |
3194 | /** |
3195 | * Set the heartbeat pulse transmit interval. |
3196 | */ |
3197 | static void |
3198 | config_tx_interval_set(uint32_t new_interval) |
3199 | { |
3200 | HB_CONFIG_LOCK(); |
3201 | INFO("changing value of interval from %d to %d " , |
3202 | g_config.hb_config.tx_interval, new_interval); |
3203 | g_config.hb_config.tx_interval = new_interval; |
3204 | HB_CONFIG_UNLOCK(); |
3205 | } |
3206 | |
3207 | /** |
3208 | * Get the heartbeat pulse transmit interval. |
3209 | */ |
3210 | static uint32_t |
3211 | config_override_mtu_get() |
3212 | { |
3213 | HB_CONFIG_LOCK(); |
3214 | uint32_t override_mtu = g_config.hb_config.override_mtu; |
3215 | HB_CONFIG_UNLOCK(); |
3216 | return override_mtu; |
3217 | } |
3218 | |
3219 | /** |
3220 | * Set the heartbeat pulse transmit interval. |
3221 | */ |
3222 | static void |
3223 | config_override_mtu_set(uint32_t mtu) |
3224 | { |
3225 | HB_CONFIG_LOCK(); |
3226 | INFO("changing value of override mtu from %d to %d " , |
3227 | g_config.hb_config.override_mtu, mtu); |
3228 | g_config.hb_config.override_mtu = mtu; |
3229 | HB_CONFIG_UNLOCK(); |
3230 | INFO("max supported cluster size is %d" , config_mcsize()); |
3231 | } |
3232 | |
3233 | /** |
3234 | * Get the maximum number of missed heartbeat intervals after which a node is |
3235 | * considered expired. |
3236 | */ |
3237 | static uint32_t |
3238 | config_max_intervals_missed_get() |
3239 | { |
3240 | uint32_t rv = 0; |
3241 | HB_CONFIG_LOCK(); |
3242 | rv = g_config.hb_config.max_intervals_missed; |
3243 | HB_CONFIG_UNLOCK(); |
3244 | return rv; |
3245 | } |
3246 | |
3247 | /** |
3248 | * Get the number intervals endpoints should be tracked for. |
3249 | */ |
3250 | static uint32_t |
3251 | config_endpoint_track_intervals_get() |
3252 | { |
3253 | // Allow a grace period of half heartbeat timeout, but lower bounded to at |
3254 | // least 3. |
3255 | return MAX(3, config_max_intervals_missed_get() / 2); |
3256 | } |
3257 | |
3258 | /** |
3259 | * Get the maximum number of allowed changes, per endpoint track intervals. |
3260 | */ |
3261 | static uint32_t |
3262 | config_endpoint_changes_allowed_get() |
3263 | { |
3264 | // Allow no change to the endpoint list for now. |
3265 | return 0; |
3266 | } |
3267 | |
3268 | /** |
3269 | * Set the maximum number of missed heartbeat intervals after which a node is |
3270 | * considered expired. |
3271 | */ |
3272 | static void |
3273 | config_max_intervals_missed_set(uint32_t new_max) |
3274 | { |
3275 | HB_CONFIG_LOCK(); |
3276 | INFO("changing value of timeout from %d to %d " , |
3277 | g_config.hb_config.max_intervals_missed, new_max); |
3278 | g_config.hb_config.max_intervals_missed = new_max; |
3279 | HB_CONFIG_UNLOCK(); |
3280 | } |
3281 | |
3282 | /** |
3283 | * Return ttl for multicast packets. Set to zero for default TTL. |
3284 | */ |
3285 | static unsigned char |
3286 | config_multicast_ttl_get() |
3287 | { |
3288 | return g_config.hb_config.multicast_ttl; |
3289 | } |
3290 | |
3291 | /** |
3292 | * Return the current heartbeat protocol. |
3293 | */ |
3294 | static as_hb_protocol |
3295 | config_protocol_get() |
3296 | { |
3297 | as_hb_protocol rv = 0; |
3298 | HB_CONFIG_LOCK(); |
3299 | rv = g_config.hb_config.protocol; |
3300 | HB_CONFIG_UNLOCK(); |
3301 | return rv; |
3302 | } |
3303 | |
3304 | /** |
3305 | * Return the current heartbeat protocol. |
3306 | */ |
3307 | static void |
3308 | config_protocol_set(as_hb_protocol new_protocol) |
3309 | { |
3310 | HB_CONFIG_LOCK(); |
3311 | g_config.hb_config.protocol = new_protocol; |
3312 | HB_CONFIG_UNLOCK(); |
3313 | } |
3314 | |
3315 | /** |
3316 | * The nodeid for this node. |
3317 | */ |
3318 | static cf_node |
3319 | config_self_nodeid_get() |
3320 | { |
3321 | // Not protected by config_lock. Never updated after config parsing.. |
3322 | return g_config.self_node; |
3323 | } |
3324 | |
3325 | /** |
3326 | * Return the heartbeat subsystem mode. |
3327 | */ |
3328 | static as_hb_mode |
3329 | config_mode_get() |
3330 | { |
3331 | // Not protected by config_lock. Never updated after config parsing.. |
3332 | return g_config.hb_config.mode; |
3333 | } |
3334 | |
3335 | /** |
3336 | * Expand "any" binding addresses to actual interface addresses. |
3337 | * @param bind_cfg the binding configuration. |
3338 | * @param published_cfg (output) the server configuration to expand. |
3339 | * @param ipv4_only indicates if only legacy addresses should be allowed. |
3340 | */ |
3341 | static void |
3342 | config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, |
3343 | cf_serv_cfg* published_cfg, bool ipv4_only) |
3344 | { |
3345 | cf_serv_cfg_init(published_cfg); |
3346 | cf_sock_cfg sock_cfg; |
3347 | |
3348 | for (int i = 0; i < bind_cfg->n_cfgs; i++) { |
3349 | cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg); |
3350 | |
3351 | // Expand "any" address to all interfaces. |
3352 | if (cf_ip_addr_is_any(&sock_cfg.addr)) { |
3353 | cf_ip_addr all_addrs[CF_SOCK_CFG_MAX]; |
3354 | uint32_t n_all_addrs = CF_SOCK_CFG_MAX; |
3355 | if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) { |
3356 | WARNING("error getting all interface addresses" ); |
3357 | n_all_addrs = 0; |
3358 | } |
3359 | |
3360 | for (int j = 0; j < n_all_addrs; j++) { |
3361 | // Skip local address if any is specified. |
3362 | if (cf_ip_addr_is_local(&all_addrs[j]) |
3363 | || (ipv4_only && !cf_ip_addr_is_legacy(&all_addrs[j]))) { |
3364 | continue; |
3365 | } |
3366 | |
3367 | cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr); |
3368 | if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { |
3369 | CRASH("error initializing published address list" ); |
3370 | } |
3371 | } |
3372 | |
3373 | // TODO: Does not look like the right warning or the right message. |
3374 | if (published_cfg->n_cfgs == 0) { |
3375 | WARNING( |
3376 | "no network interface addresses detected for heartbeat access" ); |
3377 | } |
3378 | } |
3379 | else { |
3380 | if (ipv4_only && !cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) { |
3381 | continue; |
3382 | } |
3383 | |
3384 | if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { |
3385 | CRASH("error initializing published address list" ); |
3386 | } |
3387 | } |
3388 | } |
3389 | } |
3390 | |
3391 | /** |
3392 | * Checks if the heartbeat binding configuration is valid. |
3393 | * @param error pointer to a static error message if validation fails, else will |
3394 | * be set to NULL. |
3395 | */ |
3396 | static bool |
3397 | config_binding_is_valid(char** error, as_hb_protocol protocol) |
3398 | { |
3399 | const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); |
3400 | const cf_mserv_cfg* multicast_group_cfg = config_multicast_group_cfg_get(); |
3401 | |
3402 | if (hb_is_mesh()) { |
3403 | if (bind_cfg->n_cfgs == 0) { |
3404 | // Should not happen in practice. |
3405 | *error = "no bind addresses found for heartbeat" ; |
3406 | return false; |
3407 | } |
3408 | |
3409 | // Ensure we have a valid port for all bind endpoints. |
3410 | for (int i = 0; i < bind_cfg->n_cfgs; i++) { |
3411 | if (bind_cfg->cfgs[i].port == 0) { |
3412 | *error = "invalid mesh listening port" ; |
3413 | return false; |
3414 | } |
3415 | } |
3416 | |
3417 | cf_serv_cfg publish_serv_cfg; |
3418 | cf_serv_cfg_init(&publish_serv_cfg); |
3419 | |
3420 | if (multicast_group_cfg->n_cfgs != 0) { |
3421 | *error = |
3422 | "invalid config option: multicast-group not supported in mesh mode" ; |
3423 | return false; |
3424 | } |
3425 | } |
3426 | else { |
3427 | const cf_mserv_cfg* multicast_group_cfg = |
3428 | config_multicast_group_cfg_get(); |
3429 | |
3430 | if (multicast_group_cfg->n_cfgs == 0) { |
3431 | *error = "no multicast groups specified" ; |
3432 | return false; |
3433 | } |
3434 | |
3435 | // Ensure multicast groups have valid ports. |
3436 | // TODO: We could check if the address is valid multicast. |
3437 | for (int i = 0; i < multicast_group_cfg->n_cfgs; i++) { |
3438 | if (multicast_group_cfg->cfgs[i].port == 0) { |
3439 | *error = "invalid multicast port" ; |
3440 | return false; |
3441 | } |
3442 | } |
3443 | |
3444 | if (g_config.hb_config.mesh_seed_addrs[0]) { |
3445 | *error = |
3446 | "invalid config option: mesh-seed-address-port not supported for multicast mode" ; |
3447 | return false; |
3448 | } |
3449 | |
3450 | cf_serv_cfg publish_serv_cfg; |
3451 | cf_serv_cfg_init(&publish_serv_cfg); |
3452 | } |
3453 | |
3454 | *error = NULL; |
3455 | return true; |
3456 | } |
3457 | |
3458 | /* |
3459 | * ---------------------------------------------------------------------------- |
3460 | * Channel sub module. |
3461 | * ---------------------------------------------------------------------------- |
3462 | */ |
3463 | |
3464 | /** |
3465 | * Initialize the channel structure. |
3466 | */ |
3467 | static void |
3468 | channel_init_channel(as_hb_channel* channel) |
3469 | { |
3470 | memset(channel, 0, sizeof(as_hb_channel)); |
3471 | cf_ip_addr_set_any(&channel->endpoint_addr.addr); |
3472 | } |
3473 | |
3474 | /** |
3475 | * Initialize the channel event structure. |
3476 | */ |
3477 | static void |
3478 | channel_event_init(as_hb_channel_event* event) |
3479 | { |
3480 | memset(event, 0, sizeof(as_hb_channel_event)); |
3481 | } |
3482 | |
3483 | /** |
3484 | * Is channel running. |
3485 | */ |
3486 | static bool |
3487 | channel_is_running() |
3488 | { |
3489 | CHANNEL_LOCK(); |
3490 | bool retval = |
3491 | (g_hb.channel_state.status == AS_HB_STATUS_RUNNING) ? true : false; |
3492 | CHANNEL_UNLOCK(); |
3493 | return retval; |
3494 | } |
3495 | |
3496 | /** |
3497 | * Is channel stopped. |
3498 | */ |
3499 | static bool |
3500 | channel_is_stopped() |
3501 | { |
3502 | CHANNEL_LOCK(); |
3503 | bool retval = |
3504 | (g_hb.channel_state.status == AS_HB_STATUS_STOPPED) ? true : false; |
3505 | CHANNEL_UNLOCK(); |
3506 | return retval; |
3507 | } |
3508 | |
3509 | /** |
3510 | * Keep a winning socket as a winner for at least this amount of time to prevent |
3511 | * constant flip flopping and give the winning socket a chance to send |
3512 | * heartbeats. |
3513 | */ |
3514 | static uint32_t |
3515 | channel_win_grace_ms() |
3516 | { |
3517 | return 3 * config_tx_interval_get(); |
3518 | } |
3519 | |
3520 | /** |
3521 | * Enable / disable events. |
3522 | */ |
3523 | static void |
3524 | channel_events_enabled_set(bool enabled) |
3525 | { |
3526 | CHANNEL_LOCK(); |
3527 | g_hb.channel_state.events_enabled = enabled; |
3528 | CHANNEL_UNLOCK(); |
3529 | } |
3530 | |
3531 | /** |
3532 | * Know if events are enabled. |
3533 | */ |
3534 | static bool |
3535 | channel_are_events_enabled() |
3536 | { |
3537 | bool result; |
3538 | CHANNEL_LOCK(); |
3539 | result = g_hb.channel_state.events_enabled; |
3540 | CHANNEL_UNLOCK(); |
3541 | return result; |
3542 | } |
3543 | |
3544 | /** |
3545 | * Discard an event that has been processed. |
3546 | */ |
3547 | static void |
3548 | channel_event_discard(as_hb_channel_event* event) |
3549 | { |
3550 | // Free the message structure for message received events. |
3551 | if (event->type == AS_HB_CHANNEL_MSG_RECEIVED) { |
3552 | hb_msg_return(event->msg); |
3553 | } |
3554 | } |
3555 | |
3556 | /** |
3557 | * Queues a channel event for publishing by the channel tender. |
3558 | */ |
3559 | static void |
3560 | channel_event_queue(as_hb_channel_event* event) |
3561 | { |
3562 | if (!channel_are_events_enabled()) { |
3563 | channel_event_discard(event); |
3564 | DETAIL( |
3565 | "events disabled. Ignoring event of type %d with nodeid %" PRIx64, |
3566 | event->type, event->nodeid); |
3567 | return; |
3568 | } |
3569 | |
3570 | DETAIL("queuing channel event of type %d for node %" PRIx64, event->type, |
3571 | event->nodeid); |
3572 | cf_queue_push(&g_hb.channel_state.events_queue, event); |
3573 | } |
3574 | |
3575 | /** |
3576 | * Publish queued up channel events. Should be called outside a channel lock to |
3577 | * prevent deadlocks. |
3578 | */ |
3579 | static void |
3580 | channel_event_publish_pending() |
3581 | { |
3582 | // No channel lock here to prevent deadlocks. |
3583 | as_hb_channel_event event; |
3584 | while (cf_queue_pop(&g_hb.channel_state.events_queue, &event, 0) |
3585 | == CF_QUEUE_OK) { |
3586 | // Nothing elaborate, using hardcoded list of event recipients. |
3587 | mesh_channel_event_process(&event); |
3588 | hb_channel_event_process(&event); |
3589 | |
3590 | channel_event_discard(&event); |
3591 | } |
3592 | } |
3593 | |
3594 | /** |
3595 | * Return the endpoint associated with this socket if it exists. |
3596 | * |
3597 | * @param socket the socket to query for. |
3598 | * @param result the output result. |
3599 | * @return 0 if the socket was found and the result value is filled. -1 if a |
3600 | * mapping for the socket could not be found. |
3601 | */ |
3602 | static int |
3603 | channel_get_channel(cf_socket* socket, as_hb_channel* result) |
3604 | { |
3605 | int status; |
3606 | CHANNEL_LOCK(); |
3607 | |
3608 | if (cf_shash_get(g_hb.channel_state.socket_to_channel, &socket, result) |
3609 | == CF_SHASH_OK) { |
3610 | status = 0; |
3611 | } |
3612 | else { |
3613 | status = -1; |
3614 | } |
3615 | |
3616 | CHANNEL_UNLOCK(); |
3617 | return status; |
3618 | } |
3619 | |
3620 | /** |
3621 | * Shutdown a channel socket without closing, forcing the channel tender to |
3622 | * cleanup associated data structures. |
3623 | */ |
3624 | static void |
3625 | channel_socket_shutdown(cf_socket* socket) |
3626 | { |
3627 | cf_socket_shutdown(socket); |
3628 | } |
3629 | |
3630 | /** |
3631 | * Return the socket associated with this node. |
3632 | * Returns 0 on success and -1 if there is no socket attached to this node. |
3633 | */ |
3634 | static int |
3635 | channel_socket_get(cf_node nodeid, cf_socket** socket) |
3636 | { |
3637 | int rv = -1; |
3638 | CHANNEL_LOCK(); |
3639 | if (cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, socket) |
3640 | == CF_SHASH_ERR_NOT_FOUND) { |
3641 | rv = -1; |
3642 | } |
3643 | else { |
3644 | rv = 0; |
3645 | } |
3646 | |
3647 | CHANNEL_UNLOCK(); |
3648 | return rv; |
3649 | } |
3650 | |
3651 | /** |
3652 | * Indicate if a socket is present in a sockets list. |
3653 | */ |
3654 | static bool |
3655 | channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find) |
3656 | { |
3657 | for (int i = 0; i < sockets->n_socks; i++) { |
3658 | if (&sockets->socks[i] == to_find) { |
3659 | return true; |
3660 | } |
3661 | } |
3662 | |
3663 | return false; |
3664 | } |
3665 | |
3666 | /** |
3667 | * Destroy an allocated socket. |
3668 | */ |
3669 | static void |
3670 | channel_socket_destroy(cf_socket* sock) |
3671 | { |
3672 | cf_socket_close(sock); |
3673 | cf_socket_term(sock); |
3674 | cf_free(sock); |
3675 | } |
3676 | |
3677 | /** |
3678 | * Close a channel socket. Precondition is that the socket is registered with |
3679 | * the channel module using channel_socket_register. |
3680 | */ |
3681 | static void |
3682 | channel_socket_close(cf_socket* socket, bool remote_close, |
3683 | bool raise_close_event) |
3684 | { |
3685 | if (remote_close) { |
3686 | DEBUG("remote close: fd %d event" , CSFD(socket)); |
3687 | } |
3688 | |
3689 | CHANNEL_LOCK(); |
3690 | |
3691 | if (channel_cf_sockets_contains(g_hb.channel_state.listening_sockets, |
3692 | socket)) { |
3693 | // Listening sockets will be closed by the mode (mesh/multicast |
3694 | // ) modules. |
3695 | goto Exit; |
3696 | } |
3697 | |
3698 | // Clean up data structures. |
3699 | as_hb_channel channel; |
3700 | int status = channel_get_channel(socket, &channel); |
3701 | |
3702 | if (status == 0) { |
3703 | if (channel.nodeid != 0) { |
3704 | cf_socket* node_socket; |
3705 | if (channel_socket_get(channel.nodeid, &node_socket) == 0 |
3706 | && node_socket == socket) { |
3707 | // Remove associated node for this socket. |
3708 | cf_shash_delete(g_hb.channel_state.nodeid_to_socket, |
3709 | &channel.nodeid); |
3710 | |
3711 | if (!channel.is_multicast && raise_close_event) { |
3712 | as_hb_channel_event event; |
3713 | channel_event_init(&event); |
3714 | |
3715 | // Notify others that this node is no longer connected. |
3716 | event.type = AS_HB_CHANNEL_NODE_DISCONNECTED; |
3717 | event.nodeid = channel.nodeid; |
3718 | event.msg = NULL; |
3719 | |
3720 | channel_event_queue(&event); |
3721 | } |
3722 | } |
3723 | } |
3724 | |
3725 | DETAIL("removed channel associated with fd %d polarity %s Type: %s" , |
3726 | CSFD(socket), channel.is_inbound ? "inbound" : "outbound" , |
3727 | channel.is_multicast ? "multicast" : "mesh" ); |
3728 | // Remove associated channel. |
3729 | cf_shash_delete(g_hb.channel_state.socket_to_channel, &socket); |
3730 | } |
3731 | else { |
3732 | // Will only happen if we are closing this socket twice. Cannot |
3733 | // deference the underlying fd because the socket has been freed. |
3734 | WARNING("found a socket %p without an associated channel" , socket); |
3735 | goto Exit; |
3736 | } |
3737 | |
3738 | static int32_t err_ok[] = { ENOENT, EBADF, EPERM }; |
3739 | int32_t err = cf_poll_delete_socket_forgiving(g_hb.channel_state.poll, |
3740 | socket, sizeof(err_ok) / sizeof(int32_t), err_ok); |
3741 | |
3742 | if (err == ENOENT) { |
3743 | // There is no valid code path where epoll ctl should fail. |
3744 | CRASH("unable to remove fd %d from epoll fd list: %s" , CSFD(socket), |
3745 | cf_strerror(errno)); |
3746 | goto Exit; |
3747 | } |
3748 | |
3749 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
3750 | DEBUG("closing channel with fd %d" , CSFD(socket)); |
3751 | |
3752 | channel_socket_destroy(socket); |
3753 | |
3754 | Exit: |
3755 | CHANNEL_UNLOCK(); |
3756 | } |
3757 | |
3758 | /** |
3759 | * Close multiple sockets. Should be invoked only by channel stop. |
3760 | * @param sockets the vector consisting of sockets to be closed. |
3761 | */ |
3762 | static void |
3763 | channel_sockets_close(cf_vector* sockets) |
3764 | { |
3765 | uint32_t socket_count = cf_vector_size(sockets); |
3766 | for (int index = 0; index < socket_count; index++) { |
3767 | cf_socket* socket; |
3768 | if (cf_vector_get(sockets, index, &socket) != 0) { |
3769 | WARNING("error finding the fd %d to be deleted" , CSFD(socket)); |
3770 | continue; |
3771 | } |
3772 | channel_socket_close(socket, false, true); |
3773 | } |
3774 | } |
3775 | |
3776 | /** |
3777 | * Queues a socket for closing by the channel tender. Should be used by all code |
3778 | * paths other than the channel stop code path. |
3779 | */ |
3780 | static void |
3781 | channel_socket_close_queue(cf_socket* socket, bool is_remote_close, |
3782 | bool raise_close_event) |
3783 | { |
3784 | as_hb_channel_socket_close_entry close_entry = { |
3785 | socket, |
3786 | is_remote_close, |
3787 | raise_close_event }; |
3788 | DETAIL("queuing close of fd %d" , CSFD(socket)); |
3789 | cf_queue_push(&g_hb.channel_state.socket_close_queue, &close_entry); |
3790 | } |
3791 | |
3792 | /** |
3793 | * Close queued up sockets. |
3794 | */ |
3795 | static void |
3796 | channel_socket_close_pending() |
3797 | { |
3798 | // No channel lock required here. |
3799 | as_hb_channel_socket_close_entry close_entry; |
3800 | while (cf_queue_pop(&g_hb.channel_state.socket_close_queue, &close_entry, 0) |
3801 | == CF_QUEUE_OK) { |
3802 | channel_socket_close(close_entry.socket, close_entry.is_remote, |
3803 | close_entry.raise_close_event); |
3804 | } |
3805 | } |
3806 | |
3807 | /** |
3808 | * Register a new socket. |
3809 | * |
3810 | * @param socket the socket. |
3811 | * @param is_multicast indicates if this socket is a multicast socket. |
3812 | * @param is_inbound indicates if this socket is an inbound / outbound. |
3813 | * @param endpoint peer endpoint this socket connects to. Will be NULL for |
3814 | * inbound sockets. |
3815 | */ |
3816 | static void |
3817 | channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, |
3818 | cf_sock_addr* endpoint_addr) |
3819 | { |
3820 | CHANNEL_LOCK(); |
3821 | |
3822 | as_hb_channel channel; |
3823 | channel_init_channel(&channel); |
3824 | |
3825 | // This socket should not be part of the socket to channel map. |
3826 | ASSERT(channel_get_channel(socket, &channel) == -1, |
3827 | "error the channel already exists for fd %d" , CSFD(socket)); |
3828 | |
3829 | channel.is_multicast = is_multicast; |
3830 | channel.is_inbound = is_inbound; |
3831 | channel.last_received = cf_getms(); |
3832 | |
3833 | if (endpoint_addr) { |
3834 | memcpy(&channel.endpoint_addr, endpoint_addr, sizeof(*endpoint_addr)); |
3835 | } |
3836 | |
3837 | // Add socket to poll list |
3838 | cf_poll_add_socket(g_hb.channel_state.poll, socket, |
3839 | EPOLLIN | EPOLLERR | EPOLLRDHUP, socket); |
3840 | |
3841 | cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); |
3842 | |
3843 | DEBUG("channel created for fd %d - polarity %s type: %s" , CSFD(socket), |
3844 | channel.is_inbound ? "inbound" : "outbound" , |
3845 | channel.is_multicast ? "multicast" : "mesh" ); |
3846 | |
3847 | CHANNEL_UNLOCK(); |
3848 | } |
3849 | |
3850 | /** |
3851 | * Accept an incoming tcp connection. For now this is relevant only to the mesh |
3852 | * mode. |
3853 | * @param lsock the listening socket that received the connection. |
3854 | */ |
3855 | static void |
3856 | channel_accept_connection(cf_socket* lsock) |
3857 | { |
3858 | if (!hb_is_mesh()) { |
3859 | // We do not accept connections in non mesh modes. |
3860 | return; |
3861 | } |
3862 | |
3863 | cf_socket csock; |
3864 | cf_sock_addr caddr; |
3865 | |
3866 | if (cf_socket_accept(lsock, &csock, &caddr) < 0) { |
3867 | if ((errno == EMFILE) || (errno == ENFILE) || (errno == ENOMEM) |
3868 | || (errno == ENOBUFS)) { |
3869 | TICKER_WARNING( |
3870 | "failed to accept heartbeat connection due to error : %s" , |
3871 | cf_strerror(errno)); |
3872 | // We are in an extreme situation where we ran out of system |
3873 | // resources (file/mem). We should rather lie low and not do too |
3874 | // much activity. So, sleep. We should not sleep too long as this |
3875 | // same function is supposed to send heartbeat also. |
3876 | usleep(MAX(AS_HB_TX_INTERVAL_MS_MIN, 1) * 1000); |
3877 | return; |
3878 | } |
3879 | else { |
3880 | // TODO: Find what there errors are. |
3881 | WARNING("accept failed: %s" , cf_strerror(errno)); |
3882 | return; |
3883 | } |
3884 | } |
3885 | |
3886 | // Update the stats to reflect to a new connection opened. |
3887 | cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); |
3888 | |
3889 | char caddr_str[DNS_NAME_MAX_SIZE]; |
3890 | cf_sock_addr_to_string_safe(&caddr, caddr_str, sizeof(caddr_str)); |
3891 | DEBUG("new connection from %s" , caddr_str); |
3892 | |
3893 | cf_sock_cfg *cfg = lsock->cfg; |
3894 | |
3895 | if (cfg->owner == CF_SOCK_OWNER_HEARTBEAT_TLS) { |
3896 | tls_socket_prepare_server(g_config.hb_config.tls, &csock); |
3897 | |
3898 | if (tls_socket_accept_block(&csock) != 1) { |
3899 | WARNING("heartbeat TLS server handshake with %s failed" , caddr_str); |
3900 | cf_socket_close(&csock); |
3901 | cf_socket_term(&csock); |
3902 | |
3903 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
3904 | return; |
3905 | } |
3906 | } |
3907 | |
3908 | // Allocate a new socket. |
3909 | cf_socket* sock = cf_malloc(sizeof(cf_socket)); |
3910 | cf_socket_init(sock); |
3911 | cf_socket_copy(&csock, sock); |
3912 | |
3913 | // Register this socket with the channel subsystem. |
3914 | channel_socket_register(sock, false, true, NULL); |
3915 | } |
3916 | |
3917 | /** |
3918 | * Parse compressed buffer into a message. |
3919 | * |
3920 | * @param msg the input parsed compressed message and also the output heartbeat |
3921 | * message. |
3922 | * @param buffer the input buffer. |
3923 | * @param buffer_content_len the length of the content in the buffer. |
3924 | * @return the status of parsing the message. |
3925 | */ |
3926 | static as_hb_channel_msg_read_status |
3927 | channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len) |
3928 | { |
3929 | // This is a direct pointer inside the buffer parameter. No allocation |
3930 | // required. |
3931 | uint8_t* compressed_buffer = NULL; |
3932 | size_t compressed_buffer_length = 0; |
3933 | int parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; |
3934 | void* uncompressed_buffer = NULL; |
3935 | size_t uncompressed_buffer_length = 0; |
3936 | |
3937 | if (msg_get_buf(msg, AS_HB_MSG_COMPRESSED_PAYLOAD, &compressed_buffer, |
3938 | &compressed_buffer_length, MSG_GET_DIRECT) != 0) { |
3939 | parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; |
3940 | goto Exit; |
3941 | } |
3942 | |
3943 | // Assume compression ratio of 3. We will expand the buffer if needed. |
3944 | uncompressed_buffer_length = round_up_pow2(3 * compressed_buffer_length); |
3945 | |
3946 | // Keep trying till we allocate enough memory for the uncompressed buffer. |
3947 | while (true) { |
3948 | uncompressed_buffer = MSG_BUFF_ALLOC_OR_DIE(uncompressed_buffer_length, |
3949 | "error allocating memory size %zu for decompressing message" , |
3950 | uncompressed_buffer_length); |
3951 | |
3952 | int uncompress_rv = uncompress(uncompressed_buffer, |
3953 | &uncompressed_buffer_length, compressed_buffer, |
3954 | compressed_buffer_length); |
3955 | |
3956 | if (uncompress_rv == Z_OK) { |
3957 | // Decompression was successful. |
3958 | break; |
3959 | } |
3960 | |
3961 | if (uncompress_rv == Z_BUF_ERROR) { |
3962 | // The uncompressed buffer is not large enough. Free current buffer |
3963 | // and allocate a new buffer. |
3964 | MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length); |
3965 | |
3966 | // Give uncompressed buffer more space. |
3967 | uncompressed_buffer_length *= 2; |
3968 | continue; |
3969 | } |
3970 | |
3971 | // Decompression failed. Clean up and exit. |
3972 | parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; |
3973 | goto Exit; |
3974 | } |
3975 | |
3976 | // Reset the message to prepare for parsing the uncompressed buffer. We have |
3977 | // no issues losing the compressed buffer because we have an uncompressed |
3978 | // copy. |
3979 | msg_reset(msg); |
3980 | |
3981 | // Parse the uncompressed buffer. |
3982 | parsed = |
3983 | msg_parse(msg, uncompressed_buffer, uncompressed_buffer_length) ? |
3984 | AS_HB_CHANNEL_MSG_READ_SUCCESS : |
3985 | AS_HB_CHANNEL_MSG_PARSE_FAIL; |
3986 | |
3987 | if (parsed == AS_HB_CHANNEL_MSG_READ_SUCCESS) { |
3988 | // Copying the buffer content to ensure that the message and the buffer |
3989 | // can have separate life cycles and we never get into races. The |
3990 | // frequency of heartbeat messages is low enough to make this not matter |
3991 | // much unless we have massive clusters. |
3992 | msg_preserve_all_fields(msg); |
3993 | } |
3994 | |
3995 | Exit: |
3996 | MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length); |
3997 | return parsed; |
3998 | } |
3999 | |
4000 | /** |
4001 | * Parse the buffer into a message. |
4002 | * |
4003 | * @param msg the output heartbeat message. |
4004 | * @param buffer the input buffer. |
4005 | * @param buffer_content_len the length of the content in the buffer. |
4006 | * @return the status of parsing the message. |
4007 | */ |
4008 | static as_hb_channel_msg_read_status |
4009 | channel_message_parse(msg* msg, void* buffer, int buffer_content_len) |
4010 | { |
4011 | // Peek into the buffer to get hold of the message type. |
4012 | msg_type type = 0; |
4013 | uint32_t msg_size = 0; |
4014 | if (! msg_parse_hdr(&msg_size, &type, (uint8_t*)buffer, buffer_content_len) |
4015 | || type != msg->type) { |
4016 | // Pre check because msg_parse considers this a warning but this would |
4017 | // be common when protocol version between nodes do not match. |
4018 | DEBUG("message type mismatch - expected:%d received:%d" , msg->type, |
4019 | type); |
4020 | return AS_HB_CHANNEL_MSG_PARSE_FAIL; |
4021 | } |
4022 | |
4023 | bool parsed = msg_parse(msg, buffer, buffer_content_len); |
4024 | |
4025 | if (parsed) { |
4026 | if (msg_is_set(msg, AS_HB_MSG_COMPRESSED_PAYLOAD)) { |
4027 | // This is a compressed message. |
4028 | return channel_compressed_message_parse(msg, buffer, |
4029 | buffer_content_len); |
4030 | } |
4031 | |
4032 | // This is an uncompressed message. Copying the buffer content to ensure |
4033 | // that the message and the buffer can have separate life cycles and we |
4034 | // never get into races. The frequency of heartbeat messages is low |
4035 | // enough to make this not matter much unless we have massive clusters. |
4036 | msg_preserve_all_fields(msg); |
4037 | } |
4038 | |
4039 | return parsed ? |
4040 | AS_HB_CHANNEL_MSG_READ_SUCCESS : AS_HB_CHANNEL_MSG_PARSE_FAIL; |
4041 | } |
4042 | |
4043 | /** |
4044 | * Iterate over a endpoint list and see if there is a matching socket address. |
4045 | */ |
4046 | static void |
4047 | channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata) |
4048 | { |
4049 | cf_sock_addr sock_addr; |
4050 | as_hb_channel_endpoint_iterate_udata* iterate_data = |
4051 | (as_hb_channel_endpoint_iterate_udata*)udata; |
4052 | if (as_endpoint_to_sock_addr(endpoint, &sock_addr) != 0) { |
4053 | return; |
4054 | } |
4055 | |
4056 | if (cf_sock_addr_is_any(&sock_addr)) { |
4057 | return; |
4058 | } |
4059 | |
4060 | iterate_data->found = iterate_data->found |
4061 | || (cf_sock_addr_compare(&sock_addr, iterate_data->addr_to_search) |
4062 | == 0); |
4063 | } |
4064 | |
4065 | /** |
4066 | * Reduce function to find a matching endpoint. |
4067 | */ |
4068 | static int |
4069 | channel_endpoint_search_reduce(const void* key, void* data, void* udata) |
4070 | { |
4071 | cf_socket** socket = (cf_socket**)key; |
4072 | as_hb_channel* channel = (as_hb_channel*)data; |
4073 | as_hb_channel_endpoint_reduce_udata* endpoint_reduce_udata = |
4074 | (as_hb_channel_endpoint_reduce_udata*)udata; |
4075 | |
4076 | as_hb_channel_endpoint_iterate_udata iterate_udata; |
4077 | iterate_udata.addr_to_search = &channel->endpoint_addr; |
4078 | iterate_udata.found = false; |
4079 | |
4080 | as_endpoint_list_iterate(endpoint_reduce_udata->endpoint_list, |
4081 | channel_endpoint_find_iterate_fn, &iterate_udata); |
4082 | |
4083 | if (iterate_udata.found) { |
4084 | endpoint_reduce_udata->found = true; |
4085 | endpoint_reduce_udata->socket = *socket; |
4086 | // Stop the reduce, we have found a match. |
4087 | return CF_SHASH_ERR_FOUND; |
4088 | } |
4089 | |
4090 | return CF_SHASH_OK; |
4091 | } |
4092 | |
4093 | /** |
4094 | * Indicates if any endpoint from the input endpoint list is already connected. |
4095 | * @param endpoint_list the endpoint list to check. |
4096 | * @return true if at least one endpoint is already connected to, false |
4097 | * otherwise. |
4098 | */ |
4099 | static bool |
4100 | channel_endpoint_is_connected(as_endpoint_list* endpoint_list) |
4101 | { |
4102 | CHANNEL_LOCK(); |
4103 | // Linear search. This will in practice not be a very frequent operation. |
4104 | as_hb_channel_endpoint_reduce_udata udata; |
4105 | memset(&udata, 0, sizeof(udata)); |
4106 | udata.endpoint_list = endpoint_list; |
4107 | |
4108 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
4109 | channel_endpoint_search_reduce, &udata); |
4110 | |
4111 | CHANNEL_UNLOCK(); |
4112 | return udata.found; |
4113 | } |
4114 | |
4115 | /** |
4116 | * Read a message from the multicast socket. |
4117 | * |
4118 | * @param socket the multicast socket to read from. |
4119 | * @param msg the message to read into. |
4120 | * |
4121 | * @return the status the read operation. |
4122 | */ |
4123 | static as_hb_channel_msg_read_status |
4124 | channel_multicast_msg_read(cf_socket* socket, msg* msg) |
4125 | { |
4126 | CHANNEL_LOCK(); |
4127 | |
4128 | as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF; |
4129 | |
4130 | int buffer_len = MAX(hb_mtu(), STACK_ALLOC_LIMIT); |
4131 | uint8_t* buffer = MSG_BUFF_ALLOC(buffer_len); |
4132 | |
4133 | if (!buffer) { |
4134 | WARNING( |
4135 | "error allocating space for multicast recv buffer of size %d on fd %d" , |
4136 | buffer_len, CSFD(socket)); |
4137 | goto Exit; |
4138 | } |
4139 | |
4140 | cf_sock_addr from; |
4141 | |
4142 | int num_rcvd = cf_socket_recv_from(socket, buffer, buffer_len, 0, &from); |
4143 | |
4144 | if (num_rcvd <= 0) { |
4145 | DEBUG("multicast packed read failed on fd %d" , CSFD(socket)); |
4146 | rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; |
4147 | goto Exit; |
4148 | } |
4149 | |
4150 | rv = channel_message_parse(msg, buffer, num_rcvd); |
4151 | if (rv != AS_HB_CHANNEL_MSG_READ_SUCCESS) { |
4152 | goto Exit; |
4153 | } |
4154 | |
4155 | rv = AS_HB_CHANNEL_MSG_READ_SUCCESS; |
4156 | |
4157 | Exit: |
4158 | MSG_BUFF_FREE(buffer, buffer_len); |
4159 | |
4160 | CHANNEL_UNLOCK(); |
4161 | return rv; |
4162 | } |
4163 | |
4164 | /** |
4165 | * Read a message from the a tcp mesh socket. |
4166 | * |
4167 | * @param socket the tcp socket to read from. |
4168 | * @param msg the message to read into. |
4169 | * |
4170 | * @return status of the read operation. |
4171 | */ |
4172 | static as_hb_channel_msg_read_status |
4173 | channel_mesh_msg_read(cf_socket* socket, msg* msg) |
4174 | { |
4175 | CHANNEL_LOCK(); |
4176 | |
4177 | uint32_t buffer_len = 0; |
4178 | uint8_t* buffer = NULL; |
4179 | |
4180 | as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF; |
4181 | uint8_t len_buff[MSG_WIRE_LENGTH_SIZE]; |
4182 | |
4183 | if (cf_socket_recv_all(socket, len_buff, MSG_WIRE_LENGTH_SIZE, 0, |
4184 | MESH_RW_TIMEOUT) < 0) { |
4185 | WARNING("mesh size recv failed fd %d : %s" , CSFD(socket), |
4186 | cf_strerror(errno)); |
4187 | rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; |
4188 | goto Exit; |
4189 | } |
4190 | |
4191 | buffer_len = ntohl(*((uint32_t*)len_buff)) + 6; |
4192 | |
4193 | buffer = MSG_BUFF_ALLOC(buffer_len); |
4194 | |
4195 | if (!buffer) { |
4196 | WARNING( |
4197 | "error allocating space for mesh recv buffer of size %d on fd %d" , |
4198 | buffer_len, CSFD(socket)); |
4199 | goto Exit; |
4200 | } |
4201 | |
4202 | memcpy(buffer, len_buff, MSG_WIRE_LENGTH_SIZE); |
4203 | |
4204 | if (cf_socket_recv_all(socket, buffer + MSG_WIRE_LENGTH_SIZE, |
4205 | buffer_len - MSG_WIRE_LENGTH_SIZE, 0, MESH_RW_TIMEOUT) < 0) { |
4206 | DETAIL("mesh recv failed fd %d : %s" , CSFD(socket), cf_strerror(errno)); |
4207 | rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; |
4208 | goto Exit; |
4209 | } |
4210 | |
4211 | DETAIL("mesh recv success fd %d message size %d" , CSFD(socket), buffer_len); |
4212 | |
4213 | rv = channel_message_parse(msg, buffer, buffer_len); |
4214 | |
4215 | Exit: |
4216 | MSG_BUFF_FREE(buffer, buffer_len); |
4217 | |
4218 | CHANNEL_UNLOCK(); |
4219 | return rv; |
4220 | } |
4221 | |
4222 | /** |
4223 | * Associate a socket with a nodeid and notify listeners about a node being |
4224 | * connected, effective only for mesh channels. |
4225 | * |
4226 | * For multicast channels this function is a no-op. The reason being additional |
4227 | * machinery would be required to clean up the node to channel mapping on node |
4228 | * expiry. |
4229 | * |
4230 | * @param socket the socket. |
4231 | * @param channel the channel to associate. |
4232 | * @param nodeid the nodeid associated with this socket. |
4233 | */ |
4234 | static void |
4235 | channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid) |
4236 | { |
4237 | // For now node to socket mapping is not maintained for multicast channels. |
4238 | if (channel->is_multicast) { |
4239 | return; |
4240 | } |
4241 | |
4242 | CHANNEL_LOCK(); |
4243 | |
4244 | // Update the node information for the channel. |
4245 | // This is the first time this node has a connection. Record the mapping. |
4246 | cf_shash_put(g_hb.channel_state.nodeid_to_socket, &nodeid, &socket); |
4247 | |
4248 | channel->nodeid = nodeid; |
4249 | cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, channel); |
4250 | |
4251 | DEBUG("attached fd %d to node %" PRIx64, CSFD(socket), nodeid); |
4252 | |
4253 | CHANNEL_UNLOCK(); |
4254 | |
4255 | // Publish an event to let know that a new node has a channel now. |
4256 | as_hb_channel_event node_connected_event; |
4257 | channel_event_init(&node_connected_event); |
4258 | node_connected_event.nodeid = nodeid; |
4259 | node_connected_event.type = AS_HB_CHANNEL_NODE_CONNECTED; |
4260 | channel_event_queue(&node_connected_event); |
4261 | } |
4262 | |
4263 | /** |
4264 | * Indicates if a channel should be allowed to continue to win and live because |
4265 | * of a winning grace period. |
4266 | */ |
4267 | static bool |
4268 | channel_socket_should_live(cf_socket* socket, as_hb_channel* channel) |
4269 | { |
4270 | if (channel->resolution_win_ts > 0 |
4271 | && channel->resolution_win_ts + channel_win_grace_ms() |
4272 | > cf_getms()) { |
4273 | // Losing socket was a previous winner. Allow it time to do some work |
4274 | // before knocking it off. |
4275 | INFO("giving %d unresolved fd some grace time" , CSFD(socket)); |
4276 | return true; |
4277 | } |
4278 | return false; |
4279 | } |
4280 | |
4281 | /** |
4282 | * Selects one out give two sockets connected to same remote node. The algorithm |
4283 | * is deterministic and ensures the remote node also chooses a socket that drops |
4284 | * the same connection. |
4285 | * |
4286 | * @param socket1 one of the sockets |
4287 | * @param socket2 one of the sockets |
4288 | * @return resolved socket on success, NULL if resolution fails. |
4289 | */ |
4290 | static cf_socket* |
4291 | channel_socket_resolve(cf_socket* socket1, cf_socket* socket2) |
4292 | { |
4293 | cf_socket* rv = NULL; |
4294 | CHANNEL_LOCK(); |
4295 | |
4296 | DEBUG("resolving between fd %d and %d" , CSFD(socket1), CSFD(socket2)); |
4297 | |
4298 | as_hb_channel channel1; |
4299 | if (channel_get_channel(socket1, &channel1) < 0) { |
4300 | // Should not happen in practice. |
4301 | WARNING("resolving fd %d without channel" , CSFD(socket1)); |
4302 | rv = socket2; |
4303 | goto Exit; |
4304 | } |
4305 | |
4306 | as_hb_channel channel2; |
4307 | if (channel_get_channel(socket2, &channel2) < 0) { |
4308 | // Should not happen in practice. |
4309 | WARNING("resolving fd %d without channel" , CSFD(socket2)); |
4310 | rv = socket1; |
4311 | goto Exit; |
4312 | } |
4313 | |
4314 | if (channel_socket_should_live(socket1, &channel1)) { |
4315 | rv = socket1; |
4316 | goto Exit; |
4317 | } |
4318 | |
4319 | if (channel_socket_should_live(socket2, &channel2)) { |
4320 | rv = socket2; |
4321 | goto Exit; |
4322 | } |
4323 | |
4324 | cf_node remote_nodeid = |
4325 | channel1.nodeid != 0 ? channel1.nodeid : channel2.nodeid; |
4326 | |
4327 | if (remote_nodeid == 0) { |
4328 | // Should not happen in practice. |
4329 | WARNING("remote node id unknown for fds %d and %d" , CSFD(socket1), |
4330 | CSFD(socket2)); |
4331 | rv = NULL; |
4332 | goto Exit; |
4333 | } |
4334 | |
4335 | // Choose the socket with the highest acceptor nodeid. |
4336 | cf_node acceptor_nodeid1 = |
4337 | channel1.is_inbound ? config_self_nodeid_get() : remote_nodeid; |
4338 | cf_node acceptor_nodeid2 = |
4339 | channel2.is_inbound ? config_self_nodeid_get() : remote_nodeid; |
4340 | |
4341 | as_hb_channel* winner_channel = NULL; |
4342 | cf_socket* winner_socket = NULL; |
4343 | if (acceptor_nodeid1 > acceptor_nodeid2) { |
4344 | winner_channel = &channel1; |
4345 | winner_socket = socket1; |
4346 | } |
4347 | else if (acceptor_nodeid1 < acceptor_nodeid2) { |
4348 | winner_channel = &channel2; |
4349 | winner_socket = socket2; |
4350 | } |
4351 | else { |
4352 | // Both connections have the same acceptor. Should not happen in |
4353 | // practice. Despair and report resolution failure. |
4354 | INFO( |
4355 | "found redundant connections to same node, fds %d %d - choosing at random" , |
4356 | CSFD(socket1), CSFD(socket2)); |
4357 | |
4358 | if (cf_getms() % 2 == 0) { |
4359 | winner_channel = &channel1; |
4360 | winner_socket = socket1; |
4361 | } |
4362 | else { |
4363 | winner_channel = &channel2; |
4364 | winner_socket = socket2; |
4365 | } |
4366 | } |
4367 | |
4368 | cf_clock now = cf_getms(); |
4369 | if (winner_channel->resolution_win_ts == 0) { |
4370 | winner_channel->resolution_win_ts = now; |
4371 | // Update the winning count of the winning channel in the channel data |
4372 | // structures. |
4373 | cf_shash_put(g_hb.channel_state.socket_to_channel, &winner_socket, |
4374 | winner_channel); |
4375 | } |
4376 | |
4377 | if (winner_channel->resolution_win_ts > now + channel_win_grace_ms()) { |
4378 | // The winner has been winning a lot, most likely the other side has us |
4379 | // with a seed address different from our published address. |
4380 | // |
4381 | // Break the cycle here and choose the loosing channel as the winner. |
4382 | INFO("breaking socket resolve loop dropping winning fd %d" , |
4383 | CSFD(winner_socket)); |
4384 | winner_channel = (winner_channel == &channel1) ? &channel2 : &channel1; |
4385 | winner_socket = (socket1 == winner_socket) ? socket2 : socket1; |
4386 | } |
4387 | |
4388 | rv = winner_socket; |
4389 | |
4390 | Exit: |
4391 | CHANNEL_UNLOCK(); |
4392 | return rv; |
4393 | } |
4394 | |
4395 | /** |
4396 | * Basic sanity check for a message. |
4397 | * @param msg_event the message event. |
4398 | * @return 0 if the message passes basic sanity tests. -1 on failure. |
4399 | */ |
4400 | static int |
4401 | channel_msg_sanity_check(as_hb_channel_event* msg_event) |
4402 | { |
4403 | msg* msg = msg_event->msg; |
4404 | uint32_t id = 0; |
4405 | |
4406 | as_hb_msg_type type = 0; |
4407 | cf_node src_nodeid = 0; |
4408 | |
4409 | int rv = 0; |
4410 | |
4411 | if (msg_nodeid_get(msg, &src_nodeid) != 0) { |
4412 | TICKER_WARNING("received message without a source node" ); |
4413 | rv = -1; |
4414 | } |
4415 | |
4416 | // Validate the fact that we have a valid source nodeid. |
4417 | if (src_nodeid == 0) { |
4418 | // Event nodeid is zero. Not a valid source nodeid. This will happen in |
4419 | // compatibility mode if the info request from a new node arrives before |
4420 | // the pulse message. Can be ignored. |
4421 | TICKER_WARNING("received a message from node with unknown nodeid" ); |
4422 | rv = -1; |
4423 | } |
4424 | |
4425 | if (msg_id_get(msg, &id) != 0) { |
4426 | TICKER_WARNING( |
4427 | "received message without heartbeat protocol identifier from node %" PRIx64, |
4428 | src_nodeid); |
4429 | rv = -1; |
4430 | } |
4431 | else { |
4432 | DETAIL( |
4433 | "received message with heartbeat protocol identifier %d from node %" PRIx64, |
4434 | id, src_nodeid); |
4435 | |
4436 | // Ignore the message if the protocol of the incoming message does not |
4437 | // match. |
4438 | if (id != hb_protocol_identifier_get()) { |
4439 | TICKER_WARNING( |
4440 | "received message with different heartbeat protocol identifier from node %" PRIx64, |
4441 | src_nodeid); |
4442 | rv = -1; |
4443 | } |
4444 | } |
4445 | |
4446 | if (msg_type_get(msg, &type) != 0) { |
4447 | TICKER_WARNING( |
4448 | "received message without message type from node %" PRIx64, |
4449 | src_nodeid); |
4450 | rv = -1; |
4451 | } |
4452 | |
4453 | as_endpoint_list* endpoint_list; |
4454 | if (hb_is_mesh()) { |
4455 | // Check only applies to v3 mesh. |
4456 | // v3 multicast protocol does not advertise endpoint list. |
4457 | if (msg_endpoint_list_get(msg, &endpoint_list) != 0 |
4458 | || endpoint_list->n_endpoints <= 0) { |
4459 | TICKER_WARNING( |
4460 | "received message without address/port from node %" PRIx64, |
4461 | src_nodeid); |
4462 | rv = -1; |
4463 | } |
4464 | } |
4465 | |
4466 | as_hlc_timestamp send_ts; |
4467 | if (msg_send_hlc_ts_get(msg, &send_ts) != 0) { |
4468 | TICKER_WARNING("received message without HLC time from node %" PRIx64, |
4469 | src_nodeid); |
4470 | rv = -1; |
4471 | } |
4472 | |
4473 | if (type == AS_HB_MSG_TYPE_PULSE) { |
4474 | char* remote_cluster_name = NULL; |
4475 | if (msg_cluster_name_get(msg, &remote_cluster_name) != 0) { |
4476 | remote_cluster_name = "" ; |
4477 | } |
4478 | |
4479 | if (!as_config_cluster_name_matches(remote_cluster_name)) { |
4480 | // Generate cluster-name mismatch event. |
4481 | as_hb_channel_event mismatch_event; |
4482 | channel_event_init(&mismatch_event); |
4483 | |
4484 | // Notify hb about cluster-name mismatch. |
4485 | mismatch_event.type = AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH; |
4486 | mismatch_event.nodeid = src_nodeid; |
4487 | mismatch_event.msg = NULL; |
4488 | memcpy(&mismatch_event.msg_hlc_ts, &msg_event->msg_hlc_ts, |
4489 | sizeof(msg_event->msg_hlc_ts)); |
4490 | |
4491 | channel_event_queue(&mismatch_event); |
4492 | |
4493 | TICKER_WARNING("ignoring message from %" PRIX64" with different cluster name(%s)" , |
4494 | src_nodeid, remote_cluster_name[0] == '\0' ? "null" : remote_cluster_name ); |
4495 | rv = -1; |
4496 | } |
4497 | } |
4498 | |
4499 | DETAIL("received message of type %d from node %" PRIx64, type, src_nodeid); |
4500 | |
4501 | return rv; |
4502 | } |
4503 | |
4504 | /** |
4505 | * Process incoming message to possibly update channel state. |
4506 | * |
4507 | * @param socket the socket on which the message is received. |
4508 | * @param event the message wrapped around in a channel event. |
4509 | * @return 0 if the message can be further processed, -1 if the message should |
4510 | * be discarded. |
4511 | */ |
4512 | static int |
4513 | channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event) |
4514 | { |
4515 | // Basic sanity check for the inbound message. |
4516 | if (channel_msg_sanity_check(event) != 0) { |
4517 | DETAIL("sanity check failed for message on fd %d" , CSFD(socket)); |
4518 | return -1; |
4519 | } |
4520 | |
4521 | int rv = -1; |
4522 | CHANNEL_LOCK(); |
4523 | |
4524 | as_hb_channel channel; |
4525 | if (channel_get_channel(socket, &channel) < 0) { |
4526 | // This is a bug and should not happen. Be paranoid and try fixing it ? |
4527 | WARNING("received a message on an unregistered fd %d - closing the fd" , |
4528 | CSFD(socket)); |
4529 | channel_socket_close_queue(socket, false, true); |
4530 | rv = -1; |
4531 | goto Exit; |
4532 | } |
4533 | |
4534 | if (channel.is_multicast) { |
4535 | rv = 0; |
4536 | goto Exit; |
4537 | } |
4538 | |
4539 | cf_node nodeid = event->nodeid; |
4540 | |
4541 | if (channel.nodeid != 0 && channel.nodeid != nodeid) { |
4542 | // The event nodeid does not match previously know event id. Something |
4543 | // seriously wrong here. |
4544 | WARNING("received a message from node with incorrect nodeid - expected %" PRIx64 " received %" PRIx64 "on fd %d" , |
4545 | channel.nodeid, nodeid, CSFD(socket)); |
4546 | rv = -1; |
4547 | goto Exit; |
4548 | } |
4549 | |
4550 | // Update the last received time for this node |
4551 | channel.last_received = cf_getms(); |
4552 | |
4553 | cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); |
4554 | |
4555 | cf_socket* existing_socket; |
4556 | int get_result = cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, |
4557 | &existing_socket); |
4558 | |
4559 | if (get_result == CF_SHASH_ERR_NOT_FOUND) { |
4560 | // Associate this socket with the node. |
4561 | channel_node_attach(socket, &channel, nodeid); |
4562 | } |
4563 | else if (existing_socket != socket) { |
4564 | // Somehow the other node and this node discovered each other together |
4565 | // both connected via two tcp connections. Choose one and close the |
4566 | // other. |
4567 | cf_socket* resolved = channel_socket_resolve(socket, existing_socket); |
4568 | |
4569 | if (!resolved) { |
4570 | DEBUG( |
4571 | "resolving between fd %d and %d failed - closing both connections" , |
4572 | CSFD(socket), CSFD(existing_socket)); |
4573 | |
4574 | // Resolution failed. Should not happen but there is a window where |
4575 | // the same node initiated two connections. |
4576 | // Close both connections and try again. |
4577 | channel_socket_close_queue(socket, false, true); |
4578 | channel_socket_close_queue(existing_socket, false, true); |
4579 | |
4580 | // Nothing wrong with the message. Let it through. |
4581 | rv = 0; |
4582 | goto Exit; |
4583 | } |
4584 | |
4585 | DEBUG("resolved fd %d between redundant fd %d and %d for node %" PRIx64, |
4586 | CSFD(resolved), CSFD(socket), CSFD(existing_socket), nodeid); |
4587 | |
4588 | if (resolved == existing_socket) { |
4589 | // The node to socket mapping is correct, just close this socket and |
4590 | // this node will still be connected to the remote node. Do not |
4591 | // raise any event for this closure. |
4592 | channel_socket_close_queue(socket, false, false); |
4593 | } |
4594 | else { |
4595 | // We need to close the existing socket. Disable channel events |
4596 | // because we make the node appear to be not connected. Do not raise |
4597 | // any event for this closure. |
4598 | channel_socket_close_queue(existing_socket, false, false); |
4599 | // Associate this socket with the node. |
4600 | channel_node_attach(socket, &channel, nodeid); |
4601 | } |
4602 | } |
4603 | |
4604 | rv = 0; |
4605 | |
4606 | Exit: |
4607 | CHANNEL_UNLOCK(); |
4608 | return rv; |
4609 | } |
4610 | |
4611 | /** |
4612 | * Read a message from a socket that has data. |
4613 | * @param socket the socket having data to be read. |
4614 | */ |
4615 | static void |
4616 | channel_msg_read(cf_socket* socket) |
4617 | { |
4618 | CHANNEL_LOCK(); |
4619 | |
4620 | as_hb_channel_msg_read_status status; |
4621 | as_hb_channel channel; |
4622 | |
4623 | bool free_msg = true; |
4624 | |
4625 | msg* msg = hb_msg_get(); |
4626 | |
4627 | if (channel_get_channel(socket, &channel) != 0) { |
4628 | // Would happen if the channel was closed in the same epoll loop. |
4629 | DEBUG("error the channel does not exist for fd %d" , CSFD(socket)); |
4630 | goto Exit; |
4631 | } |
4632 | |
4633 | if (channel.is_multicast) { |
4634 | status = channel_multicast_msg_read(socket, msg); |
4635 | } |
4636 | else { |
4637 | status = channel_mesh_msg_read(socket, msg); |
4638 | } |
4639 | |
4640 | switch (status) { |
4641 | case AS_HB_CHANNEL_MSG_READ_SUCCESS: { |
4642 | break; |
4643 | } |
4644 | |
4645 | case AS_HB_CHANNEL_MSG_PARSE_FAIL: { |
4646 | TICKER_WARNING("unable to parse heartbeat message on fd %d" , |
4647 | CSFD(socket)); |
4648 | goto Exit; |
4649 | } |
4650 | |
4651 | case AS_HB_CHANNEL_MSG_CHANNEL_FAIL: // Falling through |
4652 | default: { |
4653 | DEBUG("could not read message from fd %d" , CSFD(socket)); |
4654 | if (!channel.is_multicast) { |
4655 | // Shut down only mesh socket. |
4656 | channel_socket_shutdown(socket); |
4657 | } |
4658 | goto Exit; |
4659 | } |
4660 | } |
4661 | |
4662 | as_hb_channel_event event; |
4663 | channel_event_init(&event); |
4664 | |
4665 | if (msg_get_uint64(msg, AS_HB_MSG_NODE, &event.nodeid) < 0) { |
4666 | // Node id missing from the message. Assume this message to be corrupt. |
4667 | TICKER_WARNING("message with invalid nodeid received on fd %d" , |
4668 | CSFD(socket)); |
4669 | goto Exit; |
4670 | } |
4671 | |
4672 | event.msg = msg; |
4673 | event.type = AS_HB_CHANNEL_MSG_RECEIVED; |
4674 | |
4675 | // Update hlc and store update message timestamp for the event. |
4676 | as_hlc_timestamp send_ts = 0; |
4677 | msg_send_hlc_ts_get(msg, &send_ts); |
4678 | as_hlc_timestamp_update(event.nodeid, send_ts, &event.msg_hlc_ts); |
4679 | |
4680 | // Process received message to update channel state. |
4681 | if (channel_msg_event_process(socket, &event) == 0) { |
4682 | // The message needs to be delivered to the listeners. Prevent a free. |
4683 | free_msg = false; |
4684 | channel_event_queue(&event); |
4685 | } |
4686 | |
4687 | Exit: |
4688 | CHANNEL_UNLOCK(); |
4689 | |
4690 | // release the message. |
4691 | if (free_msg) { |
4692 | hb_msg_return(msg); |
4693 | } |
4694 | } |
4695 | |
4696 | /** |
4697 | * Reduce function to remove faulty channels / nodes. Shutdown associated socket |
4698 | * to have channel tender cleanup. |
4699 | */ |
4700 | static int |
4701 | channel_channels_tend_reduce(const void* key, void* data, void* udata) |
4702 | { |
4703 | cf_socket** socket = (cf_socket**)key; |
4704 | as_hb_channel* channel = (as_hb_channel*)data; |
4705 | |
4706 | DETAIL("tending channel fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s" , |
4707 | CSFD(*socket), channel->nodeid, channel->last_received, |
4708 | cf_sock_addr_print(&channel->endpoint_addr)); |
4709 | |
4710 | if (channel->last_received + CHANNEL_NODE_READ_IDLE_TIMEOUT() |
4711 | < cf_getms()) { |
4712 | // Shutdown associated socket if it is not a multicast socket. |
4713 | if (!channel->is_multicast) { |
4714 | DEBUG("channel shutting down idle fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s" , |
4715 | CSFD(*socket), channel->nodeid, channel->last_received, |
4716 | cf_sock_addr_print(&channel->endpoint_addr)); |
4717 | channel_socket_shutdown(*socket); |
4718 | } |
4719 | } |
4720 | |
4721 | return CF_SHASH_OK; |
4722 | } |
4723 | |
4724 | /** |
4725 | * Tend channel specific node information to remove channels that are faulty (or |
4726 | * TODO: attached to misbehaving nodes). |
4727 | */ |
4728 | static void |
4729 | channel_channels_idle_check() |
4730 | { |
4731 | CHANNEL_LOCK(); |
4732 | |
4733 | cf_clock now = cf_getms(); |
4734 | if (g_hb.channel_state.last_channel_idle_check + CHANNEL_IDLE_CHECK_PERIOD |
4735 | <= now) { |
4736 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
4737 | channel_channels_tend_reduce, NULL); |
4738 | g_hb.channel_state.last_channel_idle_check = now; |
4739 | } |
4740 | |
4741 | CHANNEL_UNLOCK(); |
4742 | } |
4743 | |
4744 | /** |
4745 | * Socket tending thread. Manages heartbeat receive as well. |
4746 | */ |
4747 | void* |
4748 | channel_tender(void* arg) |
4749 | { |
4750 | DETAIL("channel tender started" ); |
4751 | |
4752 | while (channel_is_running()) { |
4753 | cf_poll_event events[POLL_SZ]; |
4754 | int32_t nevents = cf_poll_wait(g_hb.channel_state.poll, events, POLL_SZ, |
4755 | AS_HB_TX_INTERVAL_MS_MIN); |
4756 | |
4757 | DETAIL("tending channel" ); |
4758 | |
4759 | for (int32_t i = 0; i < nevents; i++) { |
4760 | cf_socket* socket = events[i].data; |
4761 | if (channel_cf_sockets_contains( |
4762 | g_hb.channel_state.listening_sockets, socket) |
4763 | && hb_is_mesh()) { |
4764 | // Accept a new connection. |
4765 | channel_accept_connection(socket); |
4766 | } |
4767 | else if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { |
4768 | channel_socket_close_queue(socket, true, true); |
4769 | } |
4770 | else if (events[i].events & EPOLLIN) { |
4771 | // Read a message for the socket that is ready. |
4772 | channel_msg_read(socket); |
4773 | } |
4774 | } |
4775 | |
4776 | // Tend channels to discard stale channels. |
4777 | channel_channels_idle_check(); |
4778 | |
4779 | // Close queued up socket. |
4780 | channel_socket_close_pending(); |
4781 | |
4782 | // Publish pending events. Should be outside channel lock. |
4783 | channel_event_publish_pending(); |
4784 | |
4785 | DETAIL("done tending channel" ); |
4786 | } |
4787 | |
4788 | DETAIL("channel tender shut down" ); |
4789 | return NULL; |
4790 | } |
4791 | |
4792 | /* |
4793 | * ---------------------------------------------------------------------------- |
4794 | * Channel public API |
4795 | * ---------------------------------------------------------------------------- |
4796 | */ |
4797 | |
4798 | /** |
4799 | * Filter out endpoints not matching this node's capabilities. |
4800 | */ |
4801 | static bool |
4802 | channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata) |
4803 | { |
4804 | if ((cf_ip_addr_legacy_only()) |
4805 | && endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) { |
4806 | return false; |
4807 | } |
4808 | |
4809 | // If we don't offer TLS, then we won't connect via TLS, either. |
4810 | if (g_config.hb_tls_serv_spec.bind_port == 0 |
4811 | && as_endpoint_capability_is_supported(endpoint, |
4812 | AS_ENDPOINT_TLS_MASK)) { |
4813 | return false; |
4814 | } |
4815 | |
4816 | return true; |
4817 | } |
4818 | |
4819 | /** |
4820 | * Try and connect to a set of endpoint_lists. |
4821 | */ |
4822 | static void |
4823 | channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, |
4824 | int endpoint_list_count) |
4825 | { |
4826 | for (int i = 0; i < endpoint_list_count; i++) { |
4827 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
4828 | as_endpoint_list_to_string(endpoint_lists[i], endpoint_list_str, |
4829 | sizeof(endpoint_list_str)); |
4830 | |
4831 | if (channel_endpoint_is_connected(endpoint_lists[i])) { |
4832 | DEBUG( |
4833 | "duplicate endpoint connect request - ignoring endpoint list {%s}" , |
4834 | endpoint_list_str); |
4835 | continue; |
4836 | } |
4837 | |
4838 | DEBUG("attempting to connect mesh host at {%s}" , endpoint_list_str); |
4839 | |
4840 | cf_socket* sock = (cf_socket*)cf_malloc(sizeof(cf_socket)); |
4841 | |
4842 | const as_endpoint* connected_endpoint = as_endpoint_connect_any( |
4843 | endpoint_lists[i], channel_mesh_endpoint_filter, NULL, |
4844 | CONNECT_TIMEOUT(), sock); |
4845 | |
4846 | if (connected_endpoint) { |
4847 | cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); |
4848 | |
4849 | cf_sock_addr endpoint_addr; |
4850 | memset(&endpoint_addr, 0, sizeof(endpoint_addr)); |
4851 | cf_ip_addr_set_any(&endpoint_addr.addr); |
4852 | if (as_endpoint_to_sock_addr(connected_endpoint, &endpoint_addr) |
4853 | != 0) { |
4854 | // Should never happen in practice. |
4855 | WARNING("error converting endpoint to socket address" ); |
4856 | channel_socket_destroy(sock); |
4857 | sock = NULL; |
4858 | |
4859 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
4860 | continue; |
4861 | } |
4862 | |
4863 | if (as_endpoint_capability_is_supported(connected_endpoint, |
4864 | AS_ENDPOINT_TLS_MASK)) { |
4865 | tls_socket_prepare_client(g_config.hb_config.tls, sock); |
4866 | |
4867 | if (tls_socket_connect_block(sock) != 1) { |
4868 | WARNING("heartbeat TLS client handshake with {%s} failed" , |
4869 | endpoint_list_str); |
4870 | channel_socket_destroy(sock); |
4871 | sock = NULL; |
4872 | |
4873 | cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); |
4874 | return; |
4875 | } |
4876 | } |
4877 | |
4878 | channel_socket_register(sock, false, false, &endpoint_addr); |
4879 | } |
4880 | else { |
4881 | TICKER_WARNING("could not create heartbeat connection to node {%s}" , |
4882 | endpoint_list_str); |
4883 | if (sock) { |
4884 | cf_free(sock); |
4885 | sock = NULL; |
4886 | } |
4887 | } |
4888 | } |
4889 | } |
4890 | |
4891 | /** |
4892 | * Disconnect a node from the channel list. |
4893 | * @param nodeid the nodeid of the node whose channel should be disconnected. |
4894 | * @return 0 if the node had a channel and was disconnected. -1 otherwise. |
4895 | */ |
4896 | static int |
4897 | channel_node_disconnect(cf_node nodeid) |
4898 | { |
4899 | int rv = -1; |
4900 | |
4901 | CHANNEL_LOCK(); |
4902 | |
4903 | cf_socket* socket; |
4904 | if (channel_socket_get(nodeid, &socket) != 0) { |
4905 | // not found |
4906 | rv = -1; |
4907 | goto Exit; |
4908 | } |
4909 | |
4910 | DEBUG("disconnecting the channel attached to node %" PRIx64, nodeid); |
4911 | |
4912 | channel_socket_close_queue(socket, false, true); |
4913 | |
4914 | rv = 0; |
4915 | |
4916 | Exit: |
4917 | CHANNEL_UNLOCK(); |
4918 | |
4919 | return rv; |
4920 | } |
4921 | |
4922 | /** |
4923 | * Register mesh listening sockets. |
4924 | */ |
4925 | static void |
4926 | channel_mesh_listening_socks_register(cf_sockets* listening_sockets) |
4927 | { |
4928 | CHANNEL_LOCK(); |
4929 | g_hb.channel_state.listening_sockets = listening_sockets; |
4930 | |
4931 | cf_poll_add_sockets(g_hb.channel_state.poll, |
4932 | g_hb.channel_state.listening_sockets, |
4933 | EPOLLIN | EPOLLERR | EPOLLHUP); |
4934 | cf_socket_show_server(AS_HB, "mesh heartbeat" , |
4935 | g_hb.channel_state.listening_sockets); |
4936 | |
4937 | // We do not need a separate channel to cover this socket because IO will |
4938 | // not happen on these sockets. |
4939 | CHANNEL_UNLOCK(); |
4940 | } |
4941 | |
4942 | /** |
4943 | * Deregister mesh listening socket from epoll event. |
4944 | * @param socket the listening socket socket. |
4945 | */ |
4946 | static void |
4947 | channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets) |
4948 | { |
4949 | CHANNEL_LOCK(); |
4950 | cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); |
4951 | CHANNEL_UNLOCK(); |
4952 | } |
4953 | |
4954 | /** |
4955 | * Register the multicast listening socket. |
4956 | * @param socket the listening socket. |
4957 | * @param endpoint the endpoint on which multicast io happens. |
4958 | */ |
4959 | static void |
4960 | channel_multicast_listening_socks_register(cf_sockets* listening_sockets) |
4961 | { |
4962 | CHANNEL_LOCK(); |
4963 | g_hb.channel_state.listening_sockets = listening_sockets; |
4964 | |
4965 | // Create a new multicast channel for each multicast socket. |
4966 | for (uint32_t i = 0; |
4967 | i < g_hb.mode_state.multicast_state.listening_sockets.n_socks; |
4968 | ++i) { |
4969 | channel_socket_register(&g_hb.channel_state.listening_sockets->socks[i], |
4970 | true, false, NULL); |
4971 | } |
4972 | |
4973 | cf_socket_mcast_show(AS_HB, "multicast heartbeat" , |
4974 | g_hb.channel_state.listening_sockets); |
4975 | CHANNEL_UNLOCK(); |
4976 | } |
4977 | |
4978 | /** |
4979 | * Deregister multicast listening socket from epoll event. |
4980 | * @param socket the listening socket socket. |
4981 | */ |
4982 | static void |
4983 | channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets) |
4984 | { |
4985 | CHANNEL_LOCK(); |
4986 | cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); |
4987 | CHANNEL_UNLOCK(); |
4988 | } |
4989 | |
4990 | /** |
4991 | * Initialize the channel sub module. |
4992 | */ |
4993 | static void |
4994 | channel_init() |
4995 | { |
4996 | CHANNEL_LOCK(); |
4997 | |
4998 | // Disable events till initialization is complete. |
4999 | channel_events_enabled_set(false); |
5000 | |
5001 | // Initialize unpublished event queue. |
5002 | cf_queue_init(&g_hb.channel_state.events_queue, sizeof(as_hb_channel_event), |
5003 | AS_HB_CLUSTER_MAX_SIZE_SOFT, true); |
5004 | |
5005 | // Initialize sockets to close queue. |
5006 | cf_queue_init(&g_hb.channel_state.socket_close_queue, |
5007 | sizeof(as_hb_channel_socket_close_entry), |
5008 | AS_HB_CLUSTER_MAX_SIZE_SOFT, true); |
5009 | |
5010 | // Initialize the nodeid to socket hash. |
5011 | g_hb.channel_state.nodeid_to_socket = cf_shash_create(cf_nodeid_shash_fn, |
5012 | sizeof(cf_node), sizeof(cf_socket*), AS_HB_CLUSTER_MAX_SIZE_SOFT, |
5013 | 0); |
5014 | |
5015 | // Initialize the socket to channel state hash. |
5016 | g_hb.channel_state.socket_to_channel = cf_shash_create(hb_socket_hash_fn, |
5017 | sizeof(cf_socket*), sizeof(as_hb_channel), |
5018 | AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
5019 | |
5020 | g_hb.channel_state.status = AS_HB_STATUS_STOPPED; |
5021 | |
5022 | CHANNEL_UNLOCK(); |
5023 | } |
5024 | |
5025 | /** |
5026 | * Start channel sub module. Kicks off the channel tending thread. |
5027 | */ |
5028 | static void |
5029 | channel_start() |
5030 | { |
5031 | CHANNEL_LOCK(); |
5032 | |
5033 | if (channel_is_running()) { |
5034 | WARNING("heartbeat channel already started" ); |
5035 | goto Exit; |
5036 | } |
5037 | |
5038 | // create the epoll socket. |
5039 | cf_poll_create(&g_hb.channel_state.poll); |
5040 | |
5041 | DEBUG("created epoll fd %d" , CEFD(g_hb.channel_state.poll)); |
5042 | |
5043 | // Disable events till initialization is complete. |
5044 | channel_events_enabled_set(false); |
5045 | |
5046 | // Data structures have been initialized. |
5047 | g_hb.channel_state.status = AS_HB_STATUS_RUNNING; |
5048 | |
5049 | // Initialization complete enable events. |
5050 | channel_events_enabled_set(true); |
5051 | |
5052 | // Start the channel tender. |
5053 | g_hb.channel_state.channel_tender_tid = |
5054 | cf_thread_create_joinable(channel_tender, (void*)&g_hb); |
5055 | |
5056 | Exit: |
5057 | CHANNEL_UNLOCK(); |
5058 | } |
5059 | |
5060 | /** |
5061 | * Get all sockets. |
5062 | */ |
5063 | static int |
5064 | channel_sockets_get_reduce(const void* key, void* data, void* udata) |
5065 | { |
5066 | cf_vector* sockets = (cf_vector*)udata; |
5067 | cf_vector_append(sockets, key); |
5068 | return CF_SHASH_OK; |
5069 | } |
5070 | |
5071 | /** |
5072 | * Stop the channel sub module called on hb_stop. |
5073 | */ |
5074 | static void |
5075 | channel_stop() |
5076 | { |
5077 | if (!channel_is_running()) { |
5078 | WARNING("heartbeat channel already stopped" ); |
5079 | return; |
5080 | } |
5081 | |
5082 | DEBUG("stopping the channel" ); |
5083 | |
5084 | // Unguarded state change but this should be OK. |
5085 | g_hb.channel_state.status = AS_HB_STATUS_SHUTTING_DOWN; |
5086 | |
5087 | // Wait for the channel tender thread to finish. |
5088 | cf_thread_join(g_hb.channel_state.channel_tender_tid); |
5089 | |
5090 | CHANNEL_LOCK(); |
5091 | |
5092 | cf_vector sockets; |
5093 | cf_socket buff[cf_shash_get_size(g_hb.channel_state.socket_to_channel)]; |
5094 | cf_vector_init_smalloc(&sockets, sizeof(cf_socket*), (uint8_t*)buff, |
5095 | sizeof(buff), VECTOR_FLAG_INITZERO); |
5096 | |
5097 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
5098 | channel_sockets_get_reduce, &sockets); |
5099 | |
5100 | channel_sockets_close(&sockets); |
5101 | |
5102 | // Disable events. |
5103 | channel_events_enabled_set(false); |
5104 | |
5105 | cf_vector_destroy(&sockets); |
5106 | |
5107 | // Close epoll socket. |
5108 | cf_poll_destroy(g_hb.channel_state.poll); |
5109 | EFD(g_hb.channel_state.poll) = -1; |
5110 | |
5111 | // Disable the channel thread. |
5112 | g_hb.channel_state.status = AS_HB_STATUS_STOPPED; |
5113 | |
5114 | DEBUG("channel Stopped" ); |
5115 | |
5116 | CHANNEL_UNLOCK(); |
5117 | } |
5118 | |
5119 | /** |
5120 | * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK |
5121 | * @param socket the socket to send the buffer over. |
5122 | * @param buff the data buffer. |
5123 | * @param buffer_length the number of bytes in the buffer to send. |
5124 | * @return 0 on successful send -1 on failure |
5125 | */ |
5126 | static int |
5127 | channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length) |
5128 | { |
5129 | CHANNEL_LOCK(); |
5130 | int rv; |
5131 | |
5132 | if (cf_socket_send_all(socket, buff, buffer_length, 0, |
5133 | MESH_RW_TIMEOUT) < 0) { |
5134 | as_hb_channel channel; |
5135 | if (channel_get_channel(socket, &channel) == 0) { |
5136 | // Would happen if the channel was closed in the same epoll loop. |
5137 | TICKER_WARNING("sending mesh message to %" PRIx64" on fd %d failed : %s" , |
5138 | channel.nodeid, CSFD(socket), cf_strerror(errno)); |
5139 | } |
5140 | else { |
5141 | TICKER_WARNING("sending mesh message on fd %d failed : %s" , |
5142 | CSFD(socket), cf_strerror(errno)); |
5143 | } |
5144 | |
5145 | channel_socket_shutdown(socket); |
5146 | rv = -1; |
5147 | } |
5148 | else { |
5149 | rv = 0; |
5150 | } |
5151 | |
5152 | CHANNEL_UNLOCK(); |
5153 | return rv; |
5154 | } |
5155 | |
5156 | /** |
5157 | * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK |
5158 | * @param socket the socket to send the buffer over. |
5159 | * @param buff the data buffer. |
5160 | * @param buffer_length the number of bytes in the buffer to send. |
5161 | * @return 0 on successful send -1 on failure |
5162 | */ |
5163 | static int |
5164 | channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, |
5165 | size_t buffer_length) |
5166 | { |
5167 | CHANNEL_LOCK(); |
5168 | int rv = 0; |
5169 | DETAIL("sending udp heartbeat to fd %d: msg size %zu" , CSFD(socket), |
5170 | buffer_length); |
5171 | |
5172 | int mtu = hb_mtu(); |
5173 | if (buffer_length > mtu) { |
5174 | TICKER_WARNING("mtu breach, sending udp heartbeat to fd %d: mtu %d" , |
5175 | CSFD(socket), mtu); |
5176 | } |
5177 | |
5178 | cf_msock_cfg* socket_cfg = (cf_msock_cfg*)(socket->cfg); |
5179 | cf_sock_addr dest; |
5180 | dest.port = socket_cfg->port; |
5181 | cf_ip_addr_copy(&socket_cfg->addr, &dest.addr); |
5182 | |
5183 | if (cf_socket_send_to(socket, buff, buffer_length, 0, &dest) < 0) { |
5184 | TICKER_WARNING("multicast message send failed on fd %d %s" , |
5185 | CSFD(socket), cf_strerror(errno)); |
5186 | rv = -1; |
5187 | } |
5188 | CHANNEL_UNLOCK(); |
5189 | return rv; |
5190 | } |
5191 | |
5192 | /** |
5193 | * Indicates if this msg requires compression. |
5194 | */ |
5195 | static bool |
5196 | channel_msg_is_compression_required(msg* msg, int wire_size, int mtu) |
5197 | { |
5198 | return wire_size > msg_compression_threshold(mtu); |
5199 | } |
5200 | |
5201 | /** |
5202 | * Estimate the size of the buffer required to fill out the serialized message. |
5203 | * @param msg the input message. |
5204 | * @param mtu the underlying network mtu. |
5205 | * @return the size of the buffer required. |
5206 | */ |
5207 | static int |
5208 | channel_msg_buffer_size_get(int wire_size, int mtu) |
5209 | { |
5210 | return round_up_pow2(MAX(wire_size, compressBound(wire_size))); |
5211 | } |
5212 | |
5213 | /** |
5214 | * Fills the buffer with the serialized message. |
5215 | * @param original_msg the original message to serialize. |
5216 | * @param wire_size the message wire size. |
5217 | * @param mtu the underlying network mtu. |
5218 | * @param buffer the destination buffer. |
5219 | * @param buffer_len the buffer length. |
5220 | * |
5221 | * @return length of the serialized message. |
5222 | */ |
5223 | static size_t |
5224 | channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, |
5225 | uint8_t* buffer, size_t buffer_len) |
5226 | { |
5227 | // This is output by msg_to_wire. Using a separate variable so that we do |
5228 | // not lose the actual buffer length needed for compression later on. |
5229 | size_t msg_size = msg_to_wire(original_msg, buffer); |
5230 | |
5231 | if (channel_msg_is_compression_required(original_msg, msg_size, mtu)) { |
5232 | // Compression is required. |
5233 | const size_t compressed_buffer_len = buffer_len; |
5234 | uint8_t* compressed_buffer = MSG_BUFF_ALLOC_OR_DIE( |
5235 | compressed_buffer_len, |
5236 | "error allocating memory size %zu for compressing message" , |
5237 | compressed_buffer_len); |
5238 | |
5239 | size_t compressed_msg_size = compressed_buffer_len; |
5240 | int compress_rv = compress2(compressed_buffer, &compressed_msg_size, |
5241 | buffer, wire_size, Z_BEST_COMPRESSION); |
5242 | |
5243 | if (compress_rv == Z_BUF_ERROR) { |
5244 | // Compression result going to be larger than original input buffer. |
5245 | // Skip compression and try to send the message as is. |
5246 | DETAIL( |
5247 | "skipping compression - compressed size larger than input size %zu" , |
5248 | msg_size); |
5249 | } |
5250 | else { |
5251 | msg* temp_msg = hb_msg_get(); |
5252 | |
5253 | msg_set_buf(temp_msg, AS_HB_MSG_COMPRESSED_PAYLOAD, |
5254 | compressed_buffer, compressed_msg_size, MSG_SET_COPY); |
5255 | msg_size = msg_to_wire(temp_msg, buffer); |
5256 | |
5257 | hb_msg_return(temp_msg); |
5258 | } |
5259 | |
5260 | MSG_BUFF_FREE(compressed_buffer, compressed_buffer_len); |
5261 | |
5262 | } |
5263 | |
5264 | return msg_size; |
5265 | } |
5266 | |
5267 | /** |
5268 | * Send a message to a destination node. |
5269 | */ |
5270 | static int |
5271 | channel_msg_unicast(cf_node dest, msg* msg) |
5272 | { |
5273 | size_t buffer_len = 0; |
5274 | uint8_t* buffer = NULL; |
5275 | if (!hb_is_mesh()) { |
5276 | // Can't send a unicast message in the multicast mode. |
5277 | WARNING("ignoring sending unicast message in multicast mode" ); |
5278 | return -1; |
5279 | } |
5280 | |
5281 | CHANNEL_LOCK(); |
5282 | |
5283 | int rv = -1; |
5284 | cf_socket* connected_socket; |
5285 | |
5286 | if (channel_socket_get(dest, &connected_socket) != 0) { |
5287 | DEBUG("failing message send to disconnected node %" PRIx64, dest); |
5288 | rv = -1; |
5289 | goto Exit; |
5290 | } |
5291 | |
5292 | // Read the message to a buffer. |
5293 | int mtu = hb_mtu(); |
5294 | int wire_size = msg_get_wire_size(msg); |
5295 | buffer_len = channel_msg_buffer_size_get(wire_size, mtu); |
5296 | buffer = |
5297 | MSG_BUFF_ALLOC_OR_DIE(buffer_len, |
5298 | "error allocating memory size %zu for sending message to node %" PRIx64, |
5299 | buffer_len, dest); |
5300 | |
5301 | size_t msg_size = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, |
5302 | buffer_len); |
5303 | |
5304 | // Send over the buffer. |
5305 | rv = channel_mesh_msg_send(connected_socket, buffer, msg_size); |
5306 | |
5307 | Exit: |
5308 | MSG_BUFF_FREE(buffer, buffer_len); |
5309 | CHANNEL_UNLOCK(); |
5310 | return rv; |
5311 | } |
5312 | |
5313 | /** |
5314 | * Shash reduce function to walk over the socket to channel hash and broadcast |
5315 | * the message in udata. |
5316 | */ |
5317 | static int |
5318 | channel_msg_broadcast_reduce(const void* key, void* data, void* udata) |
5319 | { |
5320 | CHANNEL_LOCK(); |
5321 | cf_socket** socket = (cf_socket**)key; |
5322 | as_hb_channel* channel = (as_hb_channel*)data; |
5323 | as_hb_channel_buffer_udata* buffer_udata = |
5324 | (as_hb_channel_buffer_udata*)udata; |
5325 | |
5326 | if (!channel->is_multicast) { |
5327 | DETAIL( |
5328 | "broadcasting message of length %zu on channel %d assigned to node %" PRIx64, |
5329 | buffer_udata->buffer_len, CSFD(*socket), channel->nodeid); |
5330 | |
5331 | channel_mesh_msg_send(*socket, buffer_udata->buffer, |
5332 | buffer_udata->buffer_len); |
5333 | } |
5334 | else { |
5335 | channel_multicast_msg_send(*socket, buffer_udata->buffer, |
5336 | buffer_udata->buffer_len); |
5337 | } |
5338 | |
5339 | CHANNEL_UNLOCK(); |
5340 | |
5341 | return CF_SHASH_OK; |
5342 | } |
5343 | |
5344 | /** |
5345 | * Broadcast a message over all channels. |
5346 | */ |
5347 | static int |
5348 | channel_msg_broadcast(msg* msg) |
5349 | { |
5350 | CHANNEL_LOCK(); |
5351 | |
5352 | int rv = -1; |
5353 | |
5354 | // Read the message to a buffer. |
5355 | int mtu = hb_mtu(); |
5356 | int wire_size = msg_get_wire_size(msg); |
5357 | size_t buffer_len = channel_msg_buffer_size_get(wire_size, mtu); |
5358 | uint8_t* buffer = MSG_BUFF_ALLOC_OR_DIE(buffer_len, |
5359 | "error allocating memory size %zu for sending broadcast message" , |
5360 | buffer_len); |
5361 | |
5362 | as_hb_channel_buffer_udata udata; |
5363 | udata.buffer = buffer; |
5364 | |
5365 | // Note this is the length of buffer to send. |
5366 | udata.buffer_len = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, |
5367 | buffer_len); |
5368 | |
5369 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
5370 | channel_msg_broadcast_reduce, &udata); |
5371 | |
5372 | MSG_BUFF_FREE(buffer, buffer_len); |
5373 | CHANNEL_UNLOCK(); |
5374 | return rv; |
5375 | } |
5376 | |
5377 | /** |
5378 | * Clear all channel state. |
5379 | */ |
5380 | static void |
5381 | channel_clear() |
5382 | { |
5383 | if (!channel_is_stopped()) { |
5384 | WARNING("attempted channel clear without stopping the channel" ); |
5385 | return; |
5386 | } |
5387 | |
5388 | CHANNEL_LOCK(); |
5389 | |
5390 | // Free the unpublished event queue. |
5391 | cf_queue_delete_all(&g_hb.channel_state.events_queue); |
5392 | |
5393 | // Delete nodeid to socket hash. |
5394 | cf_shash_reduce(g_hb.channel_state.nodeid_to_socket, hb_delete_all_reduce, |
5395 | NULL); |
5396 | |
5397 | // Delete the socket_to_channel hash. |
5398 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, hb_delete_all_reduce, |
5399 | NULL); |
5400 | |
5401 | DETAIL("cleared channel information" ); |
5402 | CHANNEL_UNLOCK(); |
5403 | } |
5404 | |
5405 | /** |
5406 | * Reduce function to dump channel node info to log file. |
5407 | */ |
5408 | static int |
5409 | channel_dump_reduce(const void* key, void* data, void* udata) |
5410 | { |
5411 | cf_socket** socket = (cf_socket**)key; |
5412 | as_hb_channel* channel = (as_hb_channel*)data; |
5413 | |
5414 | INFO("\tHB Channel (%s): node-id %" PRIx64 " fd %d endpoint %s polarity %s last-received %" PRIu64, |
5415 | channel->is_multicast ? "multicast" : "mesh" , channel->nodeid, |
5416 | CSFD(*socket), (cf_sock_addr_is_any(&channel->endpoint_addr)) |
5417 | ? "unknown" |
5418 | : cf_sock_addr_print(&channel->endpoint_addr), |
5419 | channel->is_inbound ? "inbound" : "outbound" , |
5420 | channel->last_received); |
5421 | |
5422 | return CF_SHASH_OK; |
5423 | } |
5424 | |
5425 | /** |
5426 | * Dump channel state to logs. |
5427 | * @param verbose enables / disables verbose logging. |
5428 | */ |
5429 | static void |
5430 | channel_dump(bool verbose) |
5431 | { |
5432 | CHANNEL_LOCK(); |
5433 | |
5434 | INFO("HB Channel Count %d" , |
5435 | cf_shash_get_size(g_hb.channel_state.socket_to_channel)); |
5436 | |
5437 | if (verbose) { |
5438 | cf_shash_reduce(g_hb.channel_state.socket_to_channel, |
5439 | channel_dump_reduce, NULL); |
5440 | } |
5441 | |
5442 | CHANNEL_UNLOCK(); |
5443 | } |
5444 | |
5445 | /* |
5446 | * ---------------------------------------------------------------------------- |
5447 | * Mesh sub module. |
5448 | * ---------------------------------------------------------------------------- |
5449 | */ |
5450 | |
5451 | /** |
5452 | * Is mesh running. |
5453 | */ |
5454 | static bool |
5455 | mesh_is_running() |
5456 | { |
5457 | MESH_LOCK(); |
5458 | bool retval = |
5459 | (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_RUNNING) ? |
5460 | true : false; |
5461 | MESH_UNLOCK(); |
5462 | return retval; |
5463 | } |
5464 | |
5465 | /** |
5466 | * Is mesh stopped. |
5467 | */ |
5468 | static bool |
5469 | mesh_is_stopped() |
5470 | { |
5471 | MESH_LOCK(); |
5472 | bool retval = |
5473 | (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_STOPPED) ? |
5474 | true : false; |
5475 | MESH_UNLOCK(); |
5476 | return retval; |
5477 | } |
5478 | |
5479 | /** |
5480 | * Refresh the mesh published endpoint list. |
5481 | * @return 0 on successful list creation, -1 otherwise. |
5482 | */ |
5483 | static int |
5484 | mesh_published_endpoint_list_refresh() |
5485 | { |
5486 | int rv = -1; |
5487 | MESH_LOCK(); |
5488 | |
5489 | // TODO: Add interface addresses change detection logic here as well. |
5490 | if (g_hb.mode_state.mesh_state.published_endpoint_list != NULL |
5491 | && g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only |
5492 | == cf_ip_addr_legacy_only()) { |
5493 | rv = 0; |
5494 | goto Exit; |
5495 | } |
5496 | |
5497 | // The global flag has changed, refresh the published address list. |
5498 | if (g_hb.mode_state.mesh_state.published_endpoint_list) { |
5499 | // Free the obsolete list. |
5500 | cf_free(g_hb.mode_state.mesh_state.published_endpoint_list); |
5501 | } |
5502 | |
5503 | const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); |
5504 | cf_serv_cfg published_cfg; |
5505 | |
5506 | config_bind_serv_cfg_expand(bind_cfg, &published_cfg, |
5507 | g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only); |
5508 | |
5509 | g_hb.mode_state.mesh_state.published_endpoint_list = |
5510 | as_endpoint_list_from_serv_cfg(&published_cfg); |
5511 | |
5512 | if (!g_hb.mode_state.mesh_state.published_endpoint_list) { |
5513 | CRASH("error initializing mesh published address list" ); |
5514 | } |
5515 | |
5516 | g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only = |
5517 | cf_ip_addr_legacy_only(); |
5518 | |
5519 | rv = 0; |
5520 | |
5521 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
5522 | as_endpoint_list_to_string( |
5523 | g_hb.mode_state.mesh_state.published_endpoint_list, |
5524 | endpoint_list_str, sizeof(endpoint_list_str)); |
5525 | INFO("updated heartbeat published address list to {%s}" , endpoint_list_str); |
5526 | |
5527 | Exit: |
5528 | MESH_UNLOCK(); |
5529 | return rv; |
5530 | } |
5531 | |
5532 | /** |
5533 | * Read the published endpoint list via a callback. The call back pattern is to |
5534 | * prevent access to the published list outside the mesh lock. |
5535 | * @param process_fn the list process function. The list passed to the process |
5536 | * function can be NULL. |
5537 | * @param udata passed as is to the process function. |
5538 | */ |
5539 | static void |
5540 | mesh_published_endpoints_process(endpoint_list_process_fn process_fn, |
5541 | void* udata) |
5542 | { |
5543 | MESH_LOCK(); |
5544 | |
5545 | as_endpoint_list* rv = NULL; |
5546 | if (mesh_published_endpoint_list_refresh()) { |
5547 | WARNING("error creating mesh published endpoint list" ); |
5548 | rv = NULL; |
5549 | } |
5550 | else { |
5551 | rv = g_hb.mode_state.mesh_state.published_endpoint_list; |
5552 | } |
5553 | |
5554 | (process_fn)(rv, udata); |
5555 | |
5556 | MESH_UNLOCK(); |
5557 | } |
5558 | |
5559 | /** |
5560 | * Convert mesh status to a string. |
5561 | */ |
5562 | static const char* |
5563 | mesh_node_status_string(as_hb_mesh_node_status status) |
5564 | { |
5565 | static char* status_str[] = { |
5566 | "active" , |
5567 | "pending" , |
5568 | "inactive" , |
5569 | "endpoint-unknown" }; |
5570 | |
5571 | if (status >= AS_HB_MESH_NODE_STATUS_SENTINEL) { |
5572 | return "corrupted" ; |
5573 | } |
5574 | return status_str[status]; |
5575 | } |
5576 | |
5577 | /** |
5578 | * Change the state of a mesh node. Note: memset the mesh_nodes to zero before |
5579 | * calling state change for the first time. |
5580 | */ |
5581 | static void |
5582 | mesh_seed_status_change(as_hb_mesh_seed* seed, |
5583 | as_hb_mesh_node_status new_status) |
5584 | { |
5585 | seed->status = new_status; |
5586 | seed->last_status_updated = cf_getms(); |
5587 | } |
5588 | |
5589 | /** |
5590 | * Destroy a mesh seed node. |
5591 | */ |
5592 | static void |
5593 | mesh_seed_destroy(as_hb_mesh_seed* seed) |
5594 | { |
5595 | MESH_LOCK(); |
5596 | if (seed->resolved_endpoint_list) { |
5597 | cf_free(seed->resolved_endpoint_list); |
5598 | seed->resolved_endpoint_list = NULL; |
5599 | } |
5600 | MESH_UNLOCK(); |
5601 | } |
5602 | |
5603 | static void |
5604 | mesh_seed_dns_resolve_cb(bool is_resolved, const char* hostname, |
5605 | const cf_ip_addr *addrs, uint32_t n_addrs, void *udata) |
5606 | { |
5607 | MESH_LOCK(); |
5608 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
5609 | int element_count = cf_vector_size(seeds); |
5610 | for (int i = 0; i < element_count; i++) { |
5611 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
5612 | |
5613 | if ((strncmp(seed->seed_host_name, hostname, |
5614 | sizeof(seed->seed_host_name)) != 0) |
5615 | || seed->resolved_endpoint_list != NULL) { |
5616 | continue; |
5617 | } |
5618 | |
5619 | cf_serv_cfg temp_serv_cfg; |
5620 | cf_serv_cfg_init(&temp_serv_cfg); |
5621 | |
5622 | cf_sock_cfg sock_cfg; |
5623 | cf_sock_cfg_init(&sock_cfg, |
5624 | seed->seed_tls ? |
5625 | CF_SOCK_OWNER_HEARTBEAT_TLS : CF_SOCK_OWNER_HEARTBEAT); |
5626 | sock_cfg.port = seed->seed_port; |
5627 | |
5628 | for (int i = 0; i < n_addrs; i++) { |
5629 | cf_ip_addr_copy(&addrs[i], &sock_cfg.addr); |
5630 | if (cf_serv_cfg_add_sock_cfg(&temp_serv_cfg, &sock_cfg)) { |
5631 | CRASH("error initializing resolved address list" ); |
5632 | } |
5633 | |
5634 | DETAIL("resolved mesh node hostname %s to %s" , seed->seed_host_name, |
5635 | cf_ip_addr_print(&addrs[i])); |
5636 | } |
5637 | |
5638 | seed->resolved_endpoint_list = as_endpoint_list_from_serv_cfg( |
5639 | &temp_serv_cfg); |
5640 | } |
5641 | |
5642 | MESH_UNLOCK(); |
5643 | } |
5644 | |
5645 | /** |
5646 | * Fill the endpoint list for a mesh seed using the mesh seed hostname and port. |
5647 | * returns the |
5648 | * @param mesh_node the mesh node |
5649 | * @return 0 on success. -1 if a valid endpoint list does not exist and it could |
5650 | * not be generated. |
5651 | */ |
5652 | static int |
5653 | mesh_seed_endpoint_list_fill(as_hb_mesh_seed* seed) |
5654 | { |
5655 | if (seed->resolved_endpoint_list != NULL |
5656 | && seed->resolved_endpoint_list->n_endpoints > 0) { |
5657 | // A valid endpoint list already exists. For now we resolve only once. |
5658 | return 0; |
5659 | } |
5660 | |
5661 | cf_clock now = cf_getms(); |
5662 | if (now |
5663 | < seed->resolved_endpoint_list_ts |
5664 | + MESH_SEED_RESOLVE_ATTEMPT_INTERVAL()) { |
5665 | // We have just resolved this seed entry unsuccessfully. Don't try again |
5666 | // for sometime. |
5667 | return -1; |
5668 | } |
5669 | |
5670 | // Resolve and get all IPv4/IPv6 ip addresses asynchronously. |
5671 | seed->resolved_endpoint_list_ts = now; |
5672 | cf_ip_addr_from_string_multi_a(seed->seed_host_name, |
5673 | mesh_seed_dns_resolve_cb, NULL); |
5674 | return -1; |
5675 | } |
5676 | |
5677 | /** |
5678 | * Find a mesh seed in the seed list that has an overlapping endpoint and return |
5679 | * an internal pointer. Assumes this function is called within mesh lock to |
5680 | * prevent invalidating the returned index after function return. |
5681 | * |
5682 | * @param endpoint_list the endpoint list to find the endpoint by. |
5683 | * @return index to matching seed entry if found, else -1 |
5684 | */ |
5685 | static int |
5686 | mesh_seed_endpoint_list_overlapping_find_unsafe(as_endpoint_list* endpoint_list) |
5687 | { |
5688 | MESH_LOCK(); |
5689 | |
5690 | int match_index = -1; |
5691 | if (!endpoint_list) { |
5692 | // Null / empty endpoint list. |
5693 | goto Exit; |
5694 | } |
5695 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
5696 | int element_count = cf_vector_size(seeds); |
5697 | for (int i = 0; i < element_count; i++) { |
5698 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
5699 | |
5700 | // Ensure the seed hostname is resolved. |
5701 | mesh_seed_endpoint_list_fill(seed); |
5702 | |
5703 | if (as_endpoint_lists_are_overlapping(endpoint_list, |
5704 | seed->resolved_endpoint_list, true)) { |
5705 | match_index = i; |
5706 | break; |
5707 | } |
5708 | } |
5709 | |
5710 | Exit: |
5711 | MESH_UNLOCK(); |
5712 | return match_index; |
5713 | } |
5714 | |
5715 | /** |
5716 | * Remove a seed entry from the seed list. |
5717 | * Assumes this function is called within mesh lock to prevent invalidating the |
5718 | * used index during a function call. |
5719 | * @param seed_index the index of the seed element. |
5720 | * @return 0 on success -1 on failure. |
5721 | */ |
5722 | static int |
5723 | mesh_seed_delete_unsafe(int seed_index) |
5724 | { |
5725 | int rv = -1; |
5726 | MESH_LOCK(); |
5727 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
5728 | if (seed_index >= 0) { |
5729 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, seed_index); |
5730 | mesh_seed_destroy(seed); |
5731 | rv = cf_vector_delete(seeds, seed_index); |
5732 | if (rv == 0) { |
5733 | INFO("removed mesh seed host:%s port %d" , seed->seed_host_name, |
5734 | seed->seed_port); |
5735 | } |
5736 | } |
5737 | MESH_UNLOCK(); |
5738 | return rv; |
5739 | } |
5740 | |
5741 | /** |
5742 | * Find a mesh seed in the seed list with exactly matching hostname and port. |
5743 | * Assumes this function is called within mesh lock to prevent invalidating the |
5744 | * returned index after function return. |
5745 | * |
5746 | * @param host the seed hostname |
5747 | * @param port the seed port |
5748 | * @return index to matching seed entry if found, else -1 |
5749 | */ |
5750 | static int |
5751 | mesh_seed_find_unsafe(char* host, int port) |
5752 | { |
5753 | MESH_LOCK(); |
5754 | |
5755 | int match_index = -1; |
5756 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
5757 | int element_count = cf_vector_size(seeds); |
5758 | for (int i = 0; i < element_count; i++) { |
5759 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
5760 | if (strncmp(seed->seed_host_name, host, sizeof(seed->seed_host_name)) |
5761 | == 0 && seed->seed_port == port) { |
5762 | match_index = i; |
5763 | break; |
5764 | } |
5765 | } |
5766 | |
5767 | MESH_UNLOCK(); |
5768 | return match_index; |
5769 | } |
5770 | |
5771 | /** |
5772 | * Endure mesh tend udata has enough space for current mesh nodes. |
5773 | */ |
5774 | static void |
5775 | mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, |
5776 | int mesh_node_count) |
5777 | { |
5778 | // Ensure capacity for nodes to connect. |
5779 | if (tend_reduce_udata->to_connect_capacity < mesh_node_count) { |
5780 | uint32_t alloc_size = round_up_pow2( |
5781 | mesh_node_count * sizeof(as_endpoint_list*)); |
5782 | int old_capacity = tend_reduce_udata->to_connect_capacity; |
5783 | tend_reduce_udata->to_connect_capacity = alloc_size |
5784 | / sizeof(as_endpoint_list*); |
5785 | tend_reduce_udata->to_connect = cf_realloc( |
5786 | tend_reduce_udata->to_connect, alloc_size); |
5787 | |
5788 | // NULL out newly allocated elements. |
5789 | for (int i = old_capacity; i < tend_reduce_udata->to_connect_capacity; |
5790 | i++) { |
5791 | tend_reduce_udata->to_connect[i] = NULL; |
5792 | } |
5793 | } |
5794 | } |
5795 | |
5796 | /** |
5797 | * Change the state of a mesh node. Note: memset the mesh_nodes to zero before |
5798 | * calling state change for the first time. |
5799 | */ |
5800 | static void |
5801 | mesh_node_status_change(as_hb_mesh_node* mesh_node, |
5802 | as_hb_mesh_node_status new_status) |
5803 | { |
5804 | as_hb_mesh_node_status old_status = mesh_node->status; |
5805 | mesh_node->status = new_status; |
5806 | |
5807 | if ((new_status != AS_HB_MESH_NODE_CHANNEL_ACTIVE |
5808 | && old_status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) |
5809 | || mesh_node->last_status_updated == 0) { |
5810 | mesh_node->inactive_since = cf_getms(); |
5811 | } |
5812 | mesh_node->last_status_updated = cf_getms(); |
5813 | return; |
5814 | } |
5815 | |
5816 | /** |
5817 | * Close mesh listening sockets. |
5818 | */ |
5819 | static void |
5820 | mesh_listening_sockets_close() |
5821 | { |
5822 | MESH_LOCK(); |
5823 | INFO("closing mesh heartbeat sockets" ); |
5824 | cf_sockets_close(&g_hb.mode_state.mesh_state.listening_sockets); |
5825 | DEBUG("closed mesh heartbeat sockets" ); |
5826 | MESH_UNLOCK(); |
5827 | } |
5828 | |
5829 | /** |
5830 | * Populate the buffer with mesh seed list. |
5831 | */ |
5832 | static void |
5833 | mesh_seed_host_list_get(cf_dyn_buf* db, bool tls) |
5834 | { |
5835 | if (!hb_is_mesh()) { |
5836 | return; |
5837 | } |
5838 | |
5839 | MESH_LOCK(); |
5840 | |
5841 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
5842 | int element_count = cf_vector_size(seeds); |
5843 | for (int i = 0; i < element_count; i++) { |
5844 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
5845 | const char* info_key = |
5846 | seed->seed_tls ? |
5847 | "heartbeat.tls-mesh-seed-address-port=" : |
5848 | "heartbeat.mesh-seed-address-port=" ; |
5849 | |
5850 | cf_dyn_buf_append_string(db, info_key); |
5851 | cf_dyn_buf_append_string(db, seed->seed_host_name); |
5852 | cf_dyn_buf_append_char(db, ':'); |
5853 | cf_dyn_buf_append_uint32(db, seed->seed_port); |
5854 | cf_dyn_buf_append_char(db, ';'); |
5855 | } |
5856 | |
5857 | MESH_UNLOCK(); |
5858 | } |
5859 | |
5860 | /** |
5861 | * Checks if the match between a mesh seed and a mesh node is valid. |
5862 | * The matching would be invalid if the mesh node's endpoint has been updated |
5863 | * after the match was made or there has been no match. |
5864 | */ |
5865 | static bool |
5866 | mesh_seed_mesh_node_check(as_hb_mesh_seed* seed) |
5867 | { |
5868 | if (seed->status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { |
5869 | return false; |
5870 | } |
5871 | |
5872 | as_hb_mesh_node node; |
5873 | if (mesh_node_get(seed->mesh_nodeid, &node) != 0) { |
5874 | // The matched node has vanished. |
5875 | return false; |
5876 | } |
5877 | |
5878 | return seed->mesh_node_endpoint_change_ts == node.endpoint_change_ts; |
5879 | } |
5880 | |
5881 | /** |
5882 | * Refresh the matching between seeds and mesh nodes and get inactive seeds. |
5883 | * Should be invoked under a mesh lock to ensure the validity of returned |
5884 | * pointers. |
5885 | * @param inactive_seeds_p output vector of inactive seed pointers. Can be NULL |
5886 | * if inactive nodes need not be returned. |
5887 | */ |
5888 | static void |
5889 | mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p) |
5890 | { |
5891 | MESH_LOCK(); |
5892 | |
5893 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
5894 | int element_count = cf_vector_size(seeds); |
5895 | if (inactive_seeds_p) { |
5896 | cf_vector_clear(inactive_seeds_p); |
5897 | } |
5898 | |
5899 | // Mark seeds that do not have a matching mesh node and transitively do not |
5900 | // have a matching channel. |
5901 | cf_clock now = cf_getms(); |
5902 | for (int i = 0; i < element_count; i++) { |
5903 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
5904 | if (mesh_seed_mesh_node_check(seed)) { |
5905 | continue; |
5906 | } |
5907 | |
5908 | seed->mesh_nodeid = 0; |
5909 | seed->mesh_node_endpoint_change_ts = 0; |
5910 | |
5911 | // The mesh node is being connected. Skip. |
5912 | if (seed->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { |
5913 | if (seed->last_status_updated + MESH_PENDING_TIMEOUT > now) { |
5914 | // Spare the pending seeds, since we are attempting to connect |
5915 | // to the seed host. |
5916 | continue; |
5917 | } |
5918 | |
5919 | // Flip to inactive if we have been in pending state for a long |
5920 | // time. |
5921 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
5922 | } |
5923 | |
5924 | if (seed->status != AS_HB_MESH_NODE_CHANNEL_PENDING) { |
5925 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
5926 | if (inactive_seeds_p) { |
5927 | cf_vector_append(inactive_seeds_p, &seed); |
5928 | } |
5929 | } |
5930 | } |
5931 | |
5932 | MESH_UNLOCK(); |
5933 | } |
5934 | |
5935 | /** |
5936 | * Match input seeds to a mesh node using its endpoint address and |
5937 | */ |
5938 | static void |
5939 | mesh_seeds_mesh_node_match_update(cf_vector* inactive_seeds_p, |
5940 | as_hb_mesh_node* mesh_node, cf_node mesh_nodeid) |
5941 | { |
5942 | if (mesh_node->status |
5943 | == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN|| mesh_node->endpoint_list == NULL) { |
5944 | return; |
5945 | } |
5946 | |
5947 | int element_count = cf_vector_size(inactive_seeds_p); |
5948 | for (int i = 0; i < element_count; i++) { |
5949 | // No null check required since we are iterating under a lock and within |
5950 | // vector bounds. |
5951 | as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp( |
5952 | inactive_seeds_p, i); |
5953 | if (as_endpoint_lists_are_overlapping(seed->resolved_endpoint_list, |
5954 | mesh_node->endpoint_list, true)) { |
5955 | // We found a matching mesh node for the seed, flip its status to |
5956 | // active. |
5957 | seed->mesh_nodeid = mesh_nodeid; |
5958 | seed->mesh_node_endpoint_change_ts = mesh_node->endpoint_change_ts; |
5959 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_ACTIVE); |
5960 | DEBUG("seed entry %s:%d connected" , seed->seed_host_name, |
5961 | seed->seed_port); |
5962 | } |
5963 | } |
5964 | } |
5965 | |
5966 | /** |
5967 | * Determines if a mesh entry should be connected to or expired and deleted. |
5968 | */ |
5969 | static int |
5970 | mesh_tend_reduce(const void* key, void* data, void* udata) |
5971 | { |
5972 | MESH_LOCK(); |
5973 | |
5974 | int rv = CF_SHASH_OK; |
5975 | cf_node nodeid = *(cf_node*)key; |
5976 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
5977 | as_hb_mesh_tend_reduce_udata* tend_reduce_udata = |
5978 | (as_hb_mesh_tend_reduce_udata*)udata; |
5979 | |
5980 | DETAIL("tending mesh node %" PRIx64" with status %s" , nodeid, |
5981 | mesh_node_status_string(mesh_node->status)); |
5982 | |
5983 | mesh_seeds_mesh_node_match_update(tend_reduce_udata->inactive_seeds_p, |
5984 | mesh_node, nodeid); |
5985 | |
5986 | if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) { |
5987 | // The mesh node is connected. Skip. |
5988 | goto Exit; |
5989 | } |
5990 | |
5991 | cf_clock now = cf_getms(); |
5992 | |
5993 | if (!mesh_node->endpoint_list) { |
5994 | // Will happen if node discover and disconnect happen close together. |
5995 | mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); |
5996 | } |
5997 | |
5998 | if (mesh_node->inactive_since + MESH_INACTIVE_TIMEOUT <= now) { |
5999 | DEBUG("mesh forgetting node %" PRIx64" because it could not be connected since %" PRIx64, |
6000 | nodeid, mesh_node->inactive_since); |
6001 | rv = CF_SHASH_REDUCE_DELETE; |
6002 | goto Exit; |
6003 | } |
6004 | |
6005 | if (mesh_node->status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { |
6006 | if (mesh_node->last_status_updated + MESH_ENDPOINT_UNKNOWN_TIMEOUT |
6007 | > now) { |
6008 | DEBUG("mesh forgetting node %" PRIx64" ip address/port undiscovered since %" PRIu64, |
6009 | nodeid, mesh_node->last_status_updated); |
6010 | |
6011 | rv = CF_SHASH_REDUCE_DELETE; |
6012 | } |
6013 | // Skip connecting with a node with unknown endpoint. |
6014 | goto Exit; |
6015 | } |
6016 | |
6017 | if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { |
6018 | // The mesh node is being connected. Skip. |
6019 | if (mesh_node->last_status_updated + MESH_PENDING_TIMEOUT > now) { |
6020 | goto Exit; |
6021 | } |
6022 | |
6023 | // Flip to inactive if we have been in pending state for a long time. |
6024 | mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
6025 | } |
6026 | |
6027 | // Channel for this node is inactive. Prompt the channel sub module to |
6028 | // connect to this node. |
6029 | if (tend_reduce_udata->to_connect_count |
6030 | >= tend_reduce_udata->to_connect_capacity) { |
6031 | // New nodes found but we are out of capacity. Ultra defensive coding. |
6032 | // This will never happen under the locks. |
6033 | WARNING("skipping connecting to node %" PRIx64" - not enough memory allocated" , |
6034 | nodeid); |
6035 | goto Exit; |
6036 | } |
6037 | |
6038 | endpoint_list_copy( |
6039 | &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], |
6040 | mesh_node->endpoint_list); |
6041 | tend_reduce_udata->to_connect_count++; |
6042 | |
6043 | // Flip status to pending. |
6044 | mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_PENDING); |
6045 | |
6046 | Exit: |
6047 | if (rv == CF_SHASH_REDUCE_DELETE) { |
6048 | // Clear all internal allocated memory. |
6049 | mesh_node_destroy(mesh_node); |
6050 | } |
6051 | |
6052 | MESH_UNLOCK(); |
6053 | |
6054 | return rv; |
6055 | } |
6056 | |
6057 | /** |
6058 | * Add inactive seeds to to_connect array. |
6059 | * Should be invoked under mesh lock to prevent invalidating the array of seed |
6060 | * node pointers. |
6061 | * @param seed_p vector of seed pointers. |
6062 | * @param tend reduce udata having the to connect endpoint list. |
6063 | */ |
6064 | void |
6065 | mesh_seeds_inactive_add_to_connect(cf_vector* seeds_p, |
6066 | as_hb_mesh_tend_reduce_udata* tend_reduce_udata) |
6067 | { |
6068 | MESH_LOCK(); |
6069 | int element_count = cf_vector_size(seeds_p); |
6070 | for (int i = 0; i < element_count; i++) { |
6071 | as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(seeds_p, i); |
6072 | if (seed->status != AS_HB_MESH_NODE_CHANNEL_INACTIVE) { |
6073 | continue; |
6074 | } |
6075 | |
6076 | // Channel for this node is inactive. Prompt the channel sub module to |
6077 | // connect to this node. |
6078 | if (tend_reduce_udata->to_connect_count |
6079 | >= tend_reduce_udata->to_connect_capacity) { |
6080 | // New nodes found but we are out of capacity. Ultra defensive |
6081 | // coding. |
6082 | // This will never happen under the locks. |
6083 | WARNING( |
6084 | "skipping connecting to %s:%d - not enough memory allocated" , |
6085 | seed->seed_host_name, seed->seed_port); |
6086 | return; |
6087 | } |
6088 | |
6089 | // Ensure the seed hostname is resolved. |
6090 | if (mesh_seed_endpoint_list_fill(seed) != 0) { |
6091 | continue; |
6092 | } |
6093 | |
6094 | endpoint_list_copy( |
6095 | &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], |
6096 | seed->resolved_endpoint_list); |
6097 | tend_reduce_udata->to_connect_count++; |
6098 | |
6099 | // Flip status to pending. |
6100 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_PENDING); |
6101 | } |
6102 | MESH_UNLOCK(); |
6103 | } |
6104 | |
6105 | /** |
6106 | * Tends the mesh host list, to discover and remove nodes. Should never invoke a |
6107 | * channel call while holding a mesh lock. |
6108 | */ |
6109 | void* |
6110 | mesh_tender(void* arg) |
6111 | { |
6112 | DETAIL("mesh tender started" ); |
6113 | // Figure out which nodes need to be connected to. |
6114 | // collect nodes to connect to and remove dead nodes. |
6115 | as_hb_mesh_tend_reduce_udata tend_reduce_udata = { NULL, 0, 0 }; |
6116 | |
6117 | // Vector of pointer to inactive seeds. |
6118 | cf_vector inactive_seeds_p; |
6119 | cf_vector_init(&inactive_seeds_p, sizeof(as_hb_mesh_seed*), |
6120 | AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); |
6121 | |
6122 | cf_clock last_time = 0; |
6123 | |
6124 | while (hb_is_mesh() && mesh_is_running()) { |
6125 | cf_clock curr_time = cf_getms(); |
6126 | |
6127 | // Unlocked access but this should be alright Set the discovered flag. |
6128 | bool nodes_discovered = g_hb.mode_state.mesh_state.nodes_discovered; |
6129 | if ((curr_time - last_time) < MESH_TEND_INTERVAL && !nodes_discovered) { |
6130 | // Interval has not been reached for sending heartbeats |
6131 | usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time + |
6132 | MESH_TEND_INTERVAL) - curr_time) * 1000); |
6133 | continue; |
6134 | } |
6135 | last_time = curr_time; |
6136 | |
6137 | DETAIL("tending mesh list" ); |
6138 | |
6139 | MESH_LOCK(); |
6140 | // Unset the discovered flag. |
6141 | g_hb.mode_state.mesh_state.nodes_discovered = false; |
6142 | |
6143 | // Update the list of inactive seeds. |
6144 | mesh_seed_inactive_refresh_get_unsafe(&inactive_seeds_p); |
6145 | |
6146 | // Make sure the udata has enough capacity. |
6147 | int connect_count_max = cf_shash_get_size( |
6148 | g_hb.mode_state.mesh_state.nodeid_to_mesh_node) |
6149 | + cf_vector_size(&inactive_seeds_p); |
6150 | mesh_tend_udata_capacity_ensure(&tend_reduce_udata, connect_count_max); |
6151 | |
6152 | tend_reduce_udata.to_connect_count = 0; |
6153 | tend_reduce_udata.inactive_seeds_p = &inactive_seeds_p; |
6154 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
6155 | mesh_tend_reduce, &tend_reduce_udata); |
6156 | |
6157 | // Add inactive seeds for connection. |
6158 | mesh_seeds_inactive_add_to_connect(&inactive_seeds_p, |
6159 | &tend_reduce_udata); |
6160 | |
6161 | MESH_UNLOCK(); |
6162 | |
6163 | // Connect can be time consuming, especially in failure cases. |
6164 | // Connect outside of the mesh lock and prevent hogging the lock. |
6165 | if (tend_reduce_udata.to_connect_count > 0) { |
6166 | // Try connecting the newer nodes. |
6167 | channel_mesh_channel_establish(tend_reduce_udata.to_connect, |
6168 | tend_reduce_udata.to_connect_count); |
6169 | } |
6170 | |
6171 | DETAIL("done tending mesh list" ); |
6172 | } |
6173 | |
6174 | if (tend_reduce_udata.to_connect) { |
6175 | // Free space allocated for endpoint lists. |
6176 | for (int i = 0; i < tend_reduce_udata.to_connect_capacity; i++) { |
6177 | if (tend_reduce_udata.to_connect[i]) { |
6178 | cf_free(tend_reduce_udata.to_connect[i]); |
6179 | } |
6180 | } |
6181 | cf_free(tend_reduce_udata.to_connect); |
6182 | } |
6183 | |
6184 | cf_vector_destroy(&inactive_seeds_p); |
6185 | |
6186 | DETAIL("mesh tender shut down" ); |
6187 | return NULL; |
6188 | } |
6189 | |
6190 | /** |
6191 | * Add or update a mesh node to mesh node list. |
6192 | */ |
6193 | static void |
6194 | mesh_node_add_update(cf_node nodeid, as_hb_mesh_node* mesh_node) |
6195 | { |
6196 | MESH_LOCK(); |
6197 | cf_shash_put(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, |
6198 | mesh_node); |
6199 | MESH_UNLOCK(); |
6200 | } |
6201 | |
6202 | /** |
6203 | * Destroy a mesh node. |
6204 | */ |
6205 | static void |
6206 | mesh_node_destroy(as_hb_mesh_node* mesh_node) |
6207 | { |
6208 | MESH_LOCK(); |
6209 | if (mesh_node->endpoint_list) { |
6210 | cf_free(mesh_node->endpoint_list); |
6211 | mesh_node->endpoint_list = NULL; |
6212 | } |
6213 | MESH_UNLOCK(); |
6214 | } |
6215 | |
6216 | /** |
6217 | * Endpoint list iterate function find endpoint matching sock addr. |
6218 | */ |
6219 | static void |
6220 | mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata) |
6221 | { |
6222 | cf_sock_addr endpoint_addr; |
6223 | if (as_endpoint_to_sock_addr(endpoint, &endpoint_addr) != 0) { |
6224 | return; |
6225 | } |
6226 | |
6227 | as_hb_endpoint_list_addr_find_udata* endpoint_reduce_udata = |
6228 | (as_hb_endpoint_list_addr_find_udata*)udata; |
6229 | |
6230 | if (cf_sock_addr_compare(&endpoint_addr, endpoint_reduce_udata->to_search) |
6231 | == 0) { |
6232 | endpoint_reduce_udata->found = true; |
6233 | } |
6234 | } |
6235 | |
6236 | /** |
6237 | * Indicates if a give node is discovered. |
6238 | * @param nodeid the input nodeid. |
6239 | * @return true if discovered, false otherwise. |
6240 | */ |
6241 | static bool |
6242 | mesh_node_is_discovered(cf_node nodeid) |
6243 | { |
6244 | if (nodeid == config_self_nodeid_get()) { |
6245 | // Assume this node knows itself. |
6246 | return true; |
6247 | } |
6248 | |
6249 | as_hb_mesh_node mesh_node; |
6250 | return mesh_node_get(nodeid, &mesh_node) == 0; |
6251 | } |
6252 | |
6253 | /** |
6254 | * Indicates if a give node has a valid endpoint list. |
6255 | * @param nodeid the input nodeid. |
6256 | * @return true if node has valid endpoint list, false otherwise. |
6257 | */ |
6258 | static bool |
6259 | mesh_node_endpoint_list_is_valid(cf_node nodeid) |
6260 | { |
6261 | if (nodeid == config_self_nodeid_get()) { |
6262 | // Assume this node knows itself. |
6263 | return true; |
6264 | } |
6265 | |
6266 | as_hb_mesh_node mesh_node; |
6267 | return mesh_node_get(nodeid, &mesh_node) == 0 |
6268 | && mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN |
6269 | && mesh_node.endpoint_list; |
6270 | } |
6271 | |
6272 | /** |
6273 | * Get the mesh node associated with this node. |
6274 | * @param nodeid the nodeid to search for. |
6275 | * @param is_real_nodeid indicates if the query is for a real or fake nodeid. |
6276 | * @param mesh_node the output mesh node. |
6277 | * @return 0 on success -1 if there is mesh node attached. |
6278 | */ |
6279 | static int |
6280 | mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node) |
6281 | { |
6282 | int rv = -1; |
6283 | |
6284 | MESH_LOCK(); |
6285 | if (cf_shash_get(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, |
6286 | mesh_node) == CF_SHASH_OK) { |
6287 | rv = 0; |
6288 | } |
6289 | else { |
6290 | // The node not found. |
6291 | rv = -1; |
6292 | } |
6293 | MESH_UNLOCK(); |
6294 | return rv; |
6295 | } |
6296 | |
6297 | /** |
6298 | * Handle the event when the channel reports a node as disconnected. |
6299 | */ |
6300 | static void |
6301 | mesh_channel_on_node_disconnect(as_hb_channel_event* event) |
6302 | { |
6303 | MESH_LOCK(); |
6304 | |
6305 | as_hb_mesh_node mesh_node; |
6306 | if (mesh_node_get(event->nodeid, &mesh_node) != 0) { |
6307 | // Again should not happen in practice. But not really bad. |
6308 | DEBUG("unknown mesh node disconnected %" PRIx64, event->nodeid); |
6309 | goto Exit; |
6310 | } |
6311 | |
6312 | DEBUG("mesh setting node %" PRIx64" status as inactive on loss of channel" , |
6313 | event->nodeid); |
6314 | |
6315 | // Mark this node inactive and move on. Mesh tender should remove this node |
6316 | // after it has been inactive for a while. |
6317 | mesh_node_status_change(&mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
6318 | |
6319 | // Update the mesh entry. |
6320 | mesh_node_add_update(event->nodeid, &mesh_node); |
6321 | |
6322 | Exit: |
6323 | MESH_UNLOCK(); |
6324 | } |
6325 | |
6326 | /** |
6327 | * Check and fix the case where we received a self incoming message probably |
6328 | * because one of our non loop back interfaces was used as a seed address. |
6329 | * |
6330 | * @return true if this message is a self message, false otherwise. |
6331 | */ |
6332 | static bool |
6333 | mesh_node_check_fix_self_msg(as_hb_channel_event* event) |
6334 | { |
6335 | if (event->nodeid == config_self_nodeid_get()) { |
6336 | // Handle self message. Will happen if the seed node address on this |
6337 | // node does not match the listen / publish address. |
6338 | as_endpoint_list* msg_endpoint_list; |
6339 | msg_endpoint_list_get(event->msg, &msg_endpoint_list); |
6340 | |
6341 | MESH_LOCK(); |
6342 | |
6343 | // Check if this node has published an endpoint list matching self node. |
6344 | endpoint_list_equal_check_udata udata = { 0 }; |
6345 | udata.are_equal = false; |
6346 | udata.other = msg_endpoint_list; |
6347 | mesh_published_endpoints_process(endpoint_list_equal_process, &udata); |
6348 | |
6349 | if (udata.are_equal) { |
6350 | // Definitely pulse message from self node. |
6351 | int self_seed_index = |
6352 | mesh_seed_endpoint_list_overlapping_find_unsafe( |
6353 | msg_endpoint_list); |
6354 | if (self_seed_index >= 0) { |
6355 | as_hb_mesh_seed* self_seed = cf_vector_getp( |
6356 | &g_hb.mode_state.mesh_state.seeds, self_seed_index); |
6357 | INFO("removing self seed entry host:%s port:%d" , |
6358 | self_seed->seed_host_name, self_seed->seed_port); |
6359 | as_hb_mesh_tip_clear(self_seed->seed_host_name, |
6360 | self_seed->seed_port); |
6361 | } |
6362 | } |
6363 | MESH_UNLOCK(); |
6364 | return true; |
6365 | } |
6366 | return false; |
6367 | } |
6368 | |
6369 | /** |
6370 | * Update mesh node status based on an incoming message. |
6371 | */ |
6372 | static void |
6373 | mesh_node_data_update(as_hb_channel_event* event) |
6374 | { |
6375 | if (mesh_node_check_fix_self_msg(event)) { |
6376 | // Message from self, can be ignored. |
6377 | return; |
6378 | } |
6379 | |
6380 | MESH_LOCK(); |
6381 | as_hb_mesh_node existing_mesh_node = { 0 }; |
6382 | as_endpoint_list* msg_endpoint_list = NULL; |
6383 | msg_endpoint_list_get(event->msg, &msg_endpoint_list); |
6384 | |
6385 | // Search for existing entry. |
6386 | bool needs_update = mesh_node_get(event->nodeid, &existing_mesh_node) != 0; |
6387 | |
6388 | // Update the endpoint list to be the message endpoint list if the seed ip |
6389 | // list and the published ip list differ |
6390 | if (!as_endpoint_lists_are_equal(existing_mesh_node.endpoint_list, |
6391 | msg_endpoint_list)) { |
6392 | char endpoint_list_str1[ENDPOINT_LIST_STR_SIZE]; |
6393 | endpoint_list_str1[0] = 0; |
6394 | |
6395 | as_endpoint_list_to_string(existing_mesh_node.endpoint_list, |
6396 | endpoint_list_str1, sizeof(endpoint_list_str1)); |
6397 | |
6398 | char endpoint_list_str2[ENDPOINT_LIST_STR_SIZE]; |
6399 | as_endpoint_list_to_string(msg_endpoint_list, endpoint_list_str2, |
6400 | sizeof(endpoint_list_str2)); |
6401 | |
6402 | if (existing_mesh_node.endpoint_list) { |
6403 | INFO("for node %" PRIx64" updating mesh endpoint address from {%s} to {%s}" ,event->nodeid, |
6404 | endpoint_list_str1, endpoint_list_str2); |
6405 | } |
6406 | |
6407 | // Update the endpoints. |
6408 | endpoint_list_copy(&existing_mesh_node.endpoint_list, |
6409 | msg_endpoint_list); |
6410 | existing_mesh_node.endpoint_change_ts = as_hlc_timestamp_now(); |
6411 | |
6412 | needs_update = true; |
6413 | } |
6414 | |
6415 | if (existing_mesh_node.status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { |
6416 | // Update status to active. |
6417 | mesh_node_status_change(&existing_mesh_node, |
6418 | AS_HB_MESH_NODE_CHANNEL_ACTIVE); |
6419 | needs_update = true; |
6420 | } |
6421 | |
6422 | if (needs_update) { |
6423 | // Apply the update. |
6424 | mesh_node_add_update(event->nodeid, &existing_mesh_node); |
6425 | } |
6426 | |
6427 | MESH_UNLOCK(); |
6428 | } |
6429 | |
6430 | /** |
6431 | * Return the in memory and on wire size of an info reply array. |
6432 | * @param reply the info reply. |
6433 | * @param reply_count the number of replies. |
6434 | * @param reply_size the wire size of the message. |
6435 | * @return 0 on successful reply count computation, -1 otherwise, |
6436 | */ |
6437 | static int |
6438 | mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, |
6439 | size_t* reply_size) |
6440 | { |
6441 | // Go over reply and compute the count of replies and also validate the |
6442 | // endpoint lists. |
6443 | uint8_t* start_ptr = (uint8_t*)reply; |
6444 | *reply_size = 0; |
6445 | |
6446 | for (int i = 0; i < reply_count; i++) { |
6447 | as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; |
6448 | *reply_size += sizeof(as_hb_mesh_info_reply); |
6449 | start_ptr += sizeof(as_hb_mesh_info_reply); |
6450 | |
6451 | size_t endpoint_list_size = 0; |
6452 | if (as_endpoint_list_sizeof(&reply_ptr->endpoint_list[0], |
6453 | &endpoint_list_size)) { |
6454 | // Incomplete / garbled info reply message. |
6455 | *reply_size = 0; |
6456 | return -1; |
6457 | } |
6458 | |
6459 | *reply_size += endpoint_list_size; |
6460 | start_ptr += endpoint_list_size; |
6461 | } |
6462 | |
6463 | return 0; |
6464 | } |
6465 | |
6466 | /** |
6467 | * Send a info reply in reply to an info request. |
6468 | * @param dest the destination node to send the info reply to. |
6469 | * @param reply array of node ids and endpoints |
6470 | * @param reply_count the count of replies. |
6471 | */ |
6472 | static void |
6473 | mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, |
6474 | size_t reply_count) |
6475 | { |
6476 | // Create the discover message. |
6477 | msg* msg = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REPLY); |
6478 | |
6479 | // Set the reply. |
6480 | msg_info_reply_set(msg, reply, reply_count); |
6481 | |
6482 | DEBUG("sending info reply to node %" PRIx64, dest); |
6483 | |
6484 | // Send the info reply. |
6485 | if (channel_msg_unicast(dest, msg) != 0) { |
6486 | TICKER_WARNING("error sending info reply message to node %" PRIx64, |
6487 | dest); |
6488 | } |
6489 | |
6490 | hb_msg_return(msg); |
6491 | } |
6492 | |
6493 | /** |
6494 | * Initialize the info request msg buffer |
6495 | */ |
6496 | static msg* |
6497 | mesh_info_msg_init(as_hb_msg_type msg_type) |
6498 | { |
6499 | msg* msg = hb_msg_get(); |
6500 | msg_src_fields_fill(msg); |
6501 | msg_type_set(msg, msg_type); |
6502 | return msg; |
6503 | } |
6504 | |
6505 | /** |
6506 | * Send a info request for all undiscovered nodes. |
6507 | * @param dest the destination node to send the discover message to. |
6508 | * @param to_discover array of node ids to discover. |
6509 | * @param to_discover_count the count of nodes in the array. |
6510 | */ |
6511 | static void |
6512 | mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, |
6513 | size_t to_discover_count) |
6514 | { |
6515 | // Create the discover message. |
6516 | msg* info_req = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REQUEST); |
6517 | |
6518 | // Set the list of nodes to discover. |
6519 | msg_node_list_set(info_req, AS_HB_MSG_INFO_REQUEST, to_discover, |
6520 | to_discover_count); |
6521 | |
6522 | DEBUG("sending info request to node %" PRIx64, dest); |
6523 | |
6524 | // Send the info request. |
6525 | if (channel_msg_unicast(dest, info_req) != 0) { |
6526 | TICKER_WARNING("error sending info request message to node %" PRIx64, |
6527 | dest); |
6528 | } |
6529 | hb_msg_return(info_req); |
6530 | } |
6531 | |
6532 | /** |
6533 | * Handle an incoming pulse message to discover new neighbours. |
6534 | */ |
6535 | static void |
6536 | mesh_channel_on_pulse(msg* msg) |
6537 | { |
6538 | cf_node* adj_list; |
6539 | size_t adj_length; |
6540 | |
6541 | cf_node source; |
6542 | |
6543 | // Channel has validated the source. Don't bother checking here. |
6544 | msg_nodeid_get(msg, &source); |
6545 | if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) { |
6546 | // Adjacency list absent. |
6547 | WARNING("received message from %" PRIx64" without adjacency list" , |
6548 | source); |
6549 | return; |
6550 | } |
6551 | |
6552 | cf_node to_discover[adj_length]; |
6553 | size_t num_to_discover = 0; |
6554 | |
6555 | // TODO: Track already queried nodes so that we do not retry immediately. |
6556 | // Will need a separate state, pending query. |
6557 | MESH_LOCK(); |
6558 | |
6559 | // Try and discover new nodes from this message's adjacency list. |
6560 | for (int i = 0; i < adj_length; i++) { |
6561 | if (!mesh_node_is_discovered(adj_list[i])) { |
6562 | DEBUG("discovered new mesh node %" PRIx64, adj_list[i]); |
6563 | |
6564 | as_hb_mesh_node new_node; |
6565 | memset(&new_node, 0, sizeof(new_node)); |
6566 | mesh_node_status_change(&new_node, |
6567 | AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); |
6568 | |
6569 | // Add as a new node |
6570 | mesh_node_add_update(adj_list[i], &new_node); |
6571 | } |
6572 | |
6573 | if (!mesh_node_endpoint_list_is_valid(adj_list[i])) { |
6574 | to_discover[num_to_discover++] = adj_list[i]; |
6575 | } |
6576 | } |
6577 | |
6578 | MESH_UNLOCK(); |
6579 | |
6580 | // Discover these nodes outside a lock. |
6581 | if (num_to_discover) { |
6582 | mesh_nodes_send_info_request(msg, source, to_discover, num_to_discover); |
6583 | } |
6584 | } |
6585 | |
6586 | /** |
6587 | * Handle an incoming info message. |
6588 | */ |
6589 | static void |
6590 | mesh_channel_on_info_request(msg* msg) |
6591 | { |
6592 | cf_node* query_nodeids; |
6593 | size_t query_count; |
6594 | |
6595 | cf_node source; |
6596 | msg_nodeid_get(msg, &source); |
6597 | |
6598 | if (msg_node_list_get(msg, AS_HB_MSG_INFO_REQUEST, &query_nodeids, |
6599 | &query_count) != 0) { |
6600 | TICKER_WARNING("got an info request without query nodes from %" PRIx64, |
6601 | source); |
6602 | return; |
6603 | } |
6604 | |
6605 | MESH_LOCK(); |
6606 | |
6607 | // Compute the entire response size. |
6608 | size_t reply_size = 0; |
6609 | |
6610 | for (int i = 0; i < query_count; i++) { |
6611 | as_hb_mesh_node mesh_node; |
6612 | |
6613 | if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) { |
6614 | if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN |
6615 | && mesh_node.endpoint_list) { |
6616 | size_t endpoint_list_size = 0; |
6617 | as_endpoint_list_sizeof(mesh_node.endpoint_list, |
6618 | &endpoint_list_size); |
6619 | reply_size += sizeof(as_hb_mesh_info_reply) |
6620 | + endpoint_list_size; |
6621 | } |
6622 | } |
6623 | } |
6624 | |
6625 | as_hb_mesh_info_reply* replies = alloca(reply_size); |
6626 | uint8_t* reply_ptr = (uint8_t*)replies; |
6627 | size_t reply_count = 0; |
6628 | |
6629 | DEBUG("received info request from node : %" PRIx64, source); |
6630 | DEBUG("preparing a reply for %zu requests" , query_count); |
6631 | |
6632 | for (int i = 0; i < query_count; i++) { |
6633 | as_hb_mesh_node mesh_node; |
6634 | |
6635 | DEBUG("mesh received info request for node %" PRIx64, query_nodeids[i]); |
6636 | |
6637 | if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) { |
6638 | if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN |
6639 | && mesh_node.endpoint_list) { |
6640 | as_hb_mesh_info_reply* reply = (as_hb_mesh_info_reply*)reply_ptr; |
6641 | |
6642 | reply->nodeid = query_nodeids[i]; |
6643 | |
6644 | size_t endpoint_list_size = 0; |
6645 | as_endpoint_list_sizeof(mesh_node.endpoint_list, |
6646 | &endpoint_list_size); |
6647 | |
6648 | memcpy(&reply->endpoint_list[0], mesh_node.endpoint_list, |
6649 | endpoint_list_size); |
6650 | |
6651 | reply_ptr += sizeof(as_hb_mesh_info_reply) + endpoint_list_size; |
6652 | |
6653 | reply_count++; |
6654 | } |
6655 | } |
6656 | } |
6657 | |
6658 | MESH_UNLOCK(); |
6659 | |
6660 | // Send the reply |
6661 | if (reply_count > 0) { |
6662 | mesh_nodes_send_info_reply(source, replies, reply_count); |
6663 | } |
6664 | } |
6665 | |
6666 | /** |
6667 | * Handle an incoming info reply. |
6668 | */ |
6669 | static void |
6670 | mesh_channel_on_info_reply(msg* msg) |
6671 | { |
6672 | as_hb_mesh_info_reply* reply = NULL; |
6673 | size_t reply_count = 0; |
6674 | cf_node source = 0; |
6675 | msg_nodeid_get(msg, &source); |
6676 | if (msg_info_reply_get(msg, &reply, &reply_count) != 0 |
6677 | || reply_count == 0) { |
6678 | TICKER_WARNING( |
6679 | "got an info reply from without query nodes from %" PRIx64, |
6680 | source); |
6681 | return; |
6682 | } |
6683 | |
6684 | DEBUG("received info reply from node %" PRIx64, source); |
6685 | |
6686 | MESH_LOCK(); |
6687 | |
6688 | uint8_t *start_ptr = (uint8_t*)reply; |
6689 | for (int i = 0; i < reply_count; i++) { |
6690 | as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; |
6691 | as_hb_mesh_node existing_node; |
6692 | if (mesh_node_get(reply_ptr->nodeid, &existing_node) != 0) { |
6693 | // Somehow the node was removed from the mesh hash. Maybe a timeout. |
6694 | goto NextReply; |
6695 | } |
6696 | |
6697 | // Update the state of this node. |
6698 | if (existing_node.status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { |
6699 | // Update the endpoint. |
6700 | endpoint_list_copy(&existing_node.endpoint_list, |
6701 | reply_ptr->endpoint_list); |
6702 | |
6703 | mesh_node_status_change(&existing_node, |
6704 | AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
6705 | // Set the discovered flag. |
6706 | g_hb.mode_state.mesh_state.nodes_discovered = true; |
6707 | |
6708 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
6709 | as_endpoint_list_to_string(existing_node.endpoint_list, |
6710 | endpoint_list_str, sizeof(endpoint_list_str)); |
6711 | |
6712 | DEBUG("for node %" PRIx64" discovered endpoints {%s}" , |
6713 | reply_ptr->nodeid, endpoint_list_str); |
6714 | |
6715 | // Update the hash. |
6716 | mesh_node_add_update(reply_ptr->nodeid, &existing_node); |
6717 | } |
6718 | |
6719 | NextReply: |
6720 | start_ptr += sizeof(as_hb_mesh_info_reply); |
6721 | size_t endpoint_list_size = 0; |
6722 | as_endpoint_list_sizeof(reply_ptr->endpoint_list, &endpoint_list_size); |
6723 | start_ptr += endpoint_list_size; |
6724 | } |
6725 | |
6726 | MESH_UNLOCK(); |
6727 | } |
6728 | |
6729 | /** |
6730 | * Handle the case when a message is received on a channel. |
6731 | */ |
6732 | static void |
6733 | mesh_channel_on_msg_rcvd(as_hb_channel_event* event) |
6734 | { |
6735 | // Update the mesh node status. |
6736 | mesh_node_data_update(event); |
6737 | |
6738 | as_hb_msg_type msg_type; |
6739 | msg_type_get(event->msg, &msg_type); |
6740 | |
6741 | switch (msg_type) { |
6742 | case AS_HB_MSG_TYPE_PULSE: // A pulse message. Try and discover new nodes. |
6743 | mesh_channel_on_pulse(event->msg); |
6744 | break; |
6745 | case AS_HB_MSG_TYPE_INFO_REQUEST: // Send back an info reply. |
6746 | mesh_channel_on_info_request(event->msg); |
6747 | break; |
6748 | case AS_HB_MSG_TYPE_INFO_REPLY: // Update the list of mesh nodes, if this is an undiscovered node. |
6749 | mesh_channel_on_info_reply(event->msg); |
6750 | break; |
6751 | default: |
6752 | WARNING("received a message of unknown type from" ); |
6753 | // Ignore other messages. |
6754 | break; |
6755 | } |
6756 | } |
6757 | |
6758 | /* |
6759 | * ---------------------------------------------------------------------------- |
6760 | * Mesh public API |
6761 | * ---------------------------------------------------------------------------- |
6762 | */ |
6763 | |
6764 | /** |
6765 | * Add a host / port to the mesh seed list. |
6766 | * @param host the seed node hostname / ip address |
6767 | * @param port the seed node port. |
6768 | * @param tls indicates TLS support. |
6769 | * @return CF_SHASH_OK, CF_SHASH_ERR, CF_SHASH_ERR_FOUND. |
6770 | */ |
6771 | static int |
6772 | mesh_tip(char* host, int port, bool tls) |
6773 | { |
6774 | MESH_LOCK(); |
6775 | |
6776 | int rv = -1; |
6777 | as_hb_mesh_seed new_seed = { { 0 } }; |
6778 | |
6779 | // Check validity of hostname and port. |
6780 | int hostname_len = strnlen(host, DNS_NAME_MAX_SIZE); |
6781 | if (hostname_len <= 0 || hostname_len == DNS_NAME_MAX_SIZE) { |
6782 | // Invalid hostname. |
6783 | WARNING("mesh seed host %s exceeds allowed %d characters" , host, |
6784 | DNS_NAME_MAX_LEN); |
6785 | goto Exit; |
6786 | } |
6787 | if (port <= 0 || port > USHRT_MAX) { |
6788 | WARNING("mesh seed port %s:%d exceeds should be between 0 to %d" , host, |
6789 | port, USHRT_MAX); |
6790 | goto Exit; |
6791 | } |
6792 | |
6793 | // Check if we already have a match for this seed. |
6794 | if (mesh_seed_find_unsafe(host, port) >= 0) { |
6795 | WARNING("mesh seed host %s:%d already in seed list" , host, port); |
6796 | goto Exit; |
6797 | } |
6798 | |
6799 | mesh_seed_status_change(&new_seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
6800 | strcpy(new_seed.seed_host_name, host); |
6801 | new_seed.seed_port = port; |
6802 | new_seed.seed_tls = tls; |
6803 | |
6804 | cf_vector_append(&g_hb.mode_state.mesh_state.seeds, &new_seed); |
6805 | |
6806 | INFO("added new mesh seed %s:%d" , host, port); |
6807 | rv = 0; |
6808 | |
6809 | Exit: |
6810 | if (rv != 0) { |
6811 | // Ensure endpoint allocated space is freed. |
6812 | mesh_seed_destroy(&new_seed); |
6813 | } |
6814 | |
6815 | MESH_UNLOCK(); |
6816 | return rv; |
6817 | } |
6818 | |
6819 | /** |
6820 | * Handle a channel event on an endpoint. |
6821 | */ |
6822 | static void |
6823 | mesh_channel_event_process(as_hb_channel_event* event) |
6824 | { |
6825 | // Skip if we are not in mesh mode. |
6826 | if (!hb_is_mesh()) { |
6827 | return; |
6828 | } |
6829 | |
6830 | MESH_LOCK(); |
6831 | switch (event->type) { |
6832 | case AS_HB_CHANNEL_NODE_CONNECTED: |
6833 | // Ignore this event. The subsequent message event will be use for |
6834 | // determining mesh node active status. |
6835 | break; |
6836 | case AS_HB_CHANNEL_NODE_DISCONNECTED: |
6837 | mesh_channel_on_node_disconnect(event); |
6838 | break; |
6839 | case AS_HB_CHANNEL_MSG_RECEIVED: |
6840 | mesh_channel_on_msg_rcvd(event); |
6841 | break; |
6842 | case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: // Ignore this event. HB module will handle it. |
6843 | break; |
6844 | } |
6845 | |
6846 | MESH_UNLOCK(); |
6847 | } |
6848 | |
6849 | /** |
6850 | * Initialize mesh mode data structures. |
6851 | */ |
6852 | static void |
6853 | mesh_init() |
6854 | { |
6855 | if (!hb_is_mesh()) { |
6856 | return; |
6857 | } |
6858 | |
6859 | MESH_LOCK(); |
6860 | |
6861 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED; |
6862 | |
6863 | // Initialize the mesh node hash. |
6864 | g_hb.mode_state.mesh_state.nodeid_to_mesh_node = cf_shash_create( |
6865 | cf_nodeid_shash_fn, sizeof(cf_node), sizeof(as_hb_mesh_node), |
6866 | AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
6867 | |
6868 | // Initialize the seed list. |
6869 | cf_vector_init(&g_hb.mode_state.mesh_state.seeds, sizeof(as_hb_mesh_seed), |
6870 | AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); |
6871 | |
6872 | MESH_UNLOCK(); |
6873 | } |
6874 | |
6875 | /** |
6876 | * Delete the shash entries only if they are not seed entries. |
6877 | */ |
6878 | static int |
6879 | mesh_free_node_data_reduce(const void* key, void* data, void* udata) |
6880 | { |
6881 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
6882 | mesh_node_destroy(mesh_node); |
6883 | return CF_SHASH_REDUCE_DELETE; |
6884 | } |
6885 | |
6886 | /** |
6887 | * Remove a host / port from the mesh list. |
6888 | */ |
6889 | static int |
6890 | mesh_tip_clear_reduce(const void* key, void* data, void* udata) |
6891 | { |
6892 | int rv = CF_SHASH_OK; |
6893 | |
6894 | MESH_LOCK(); |
6895 | |
6896 | cf_node nodeid = *(cf_node*)key; |
6897 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
6898 | as_hb_mesh_tip_clear_udata* tip_clear_udata = |
6899 | (as_hb_mesh_tip_clear_udata*)udata; |
6900 | |
6901 | if (tip_clear_udata == NULL || nodeid == tip_clear_udata->nodeid) { |
6902 | // Handling tip clear all or clear of a specific node. |
6903 | rv = CF_SHASH_REDUCE_DELETE; |
6904 | goto Exit; |
6905 | } |
6906 | |
6907 | // See if the address matches any one of the endpoints in the node's |
6908 | // endpoint list. |
6909 | for (int i = 0; i < tip_clear_udata->n_addrs; i++) { |
6910 | cf_sock_addr sock_addr; |
6911 | cf_ip_addr_copy(&tip_clear_udata->addrs[i], &sock_addr.addr); |
6912 | sock_addr.port = tip_clear_udata->port; |
6913 | as_hb_endpoint_list_addr_find_udata udata; |
6914 | udata.found = false; |
6915 | udata.to_search = &sock_addr; |
6916 | |
6917 | as_endpoint_list_iterate(mesh_node->endpoint_list, |
6918 | mesh_endpoint_addr_find_iterate, &udata); |
6919 | |
6920 | if (udata.found) { |
6921 | rv = CF_SHASH_REDUCE_DELETE; |
6922 | goto Exit; |
6923 | } |
6924 | } |
6925 | |
6926 | // Not found by endpoint. |
6927 | rv = CF_SHASH_OK; |
6928 | |
6929 | Exit: |
6930 | if (rv == CF_SHASH_REDUCE_DELETE) { |
6931 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
6932 | as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str, |
6933 | sizeof(endpoint_list_str)); |
6934 | |
6935 | // Find all seed entries matching this mesh entry and delete them. |
6936 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
6937 | int element_count = cf_vector_size(seeds); |
6938 | for (int i = 0; i < element_count; i++) { |
6939 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
6940 | if (seed->mesh_nodeid != nodeid) { |
6941 | // Does not match this mesh entry. |
6942 | continue; |
6943 | } |
6944 | if (mesh_seed_delete_unsafe(i) == 0) { |
6945 | i--; |
6946 | element_count--; |
6947 | } |
6948 | else { |
6949 | // Should not happen in practice. |
6950 | CRASH("error deleting mesh seed entry %s:%d" , |
6951 | seed->seed_host_name, seed->seed_port); |
6952 | } |
6953 | } |
6954 | |
6955 | if (channel_node_disconnect(nodeid) != 0) { |
6956 | WARNING("unable to disconnect the channel to node %" PRIx64, |
6957 | nodeid); |
6958 | } |
6959 | |
6960 | mesh_node_destroy(mesh_node); |
6961 | if (tip_clear_udata != NULL) { |
6962 | tip_clear_udata->entry_deleted = true; |
6963 | } |
6964 | } |
6965 | |
6966 | MESH_UNLOCK(); |
6967 | return rv; |
6968 | } |
6969 | |
6970 | /** |
6971 | * Output Heartbeat endpoints of peers. |
6972 | */ |
6973 | static int |
6974 | mesh_peer_endpoint_reduce(const void* key, void* data, void* udata) |
6975 | { |
6976 | int rv = CF_SHASH_OK; |
6977 | MESH_LOCK(); |
6978 | cf_node nodeid = *(cf_node*)key; |
6979 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
6980 | cf_dyn_buf* db = (cf_dyn_buf*)udata; |
6981 | |
6982 | cf_dyn_buf_append_string(db, "heartbeat.peer=" ); |
6983 | cf_dyn_buf_append_string(db, "node-id=" ); |
6984 | cf_dyn_buf_append_uint64_x(db, nodeid); |
6985 | cf_dyn_buf_append_string(db, ":" ); |
6986 | as_endpoint_list_info(mesh_node->endpoint_list, db); |
6987 | cf_dyn_buf_append_string(db, ";" ); |
6988 | |
6989 | MESH_UNLOCK(); |
6990 | return rv; |
6991 | } |
6992 | |
6993 | /** |
6994 | * Free the mesh mode data structures. |
6995 | */ |
6996 | static void |
6997 | mesh_clear() |
6998 | { |
6999 | if (!mesh_is_stopped()) { |
7000 | WARNING( |
7001 | "attempted clearing mesh module without stopping it - skip mesh clear!" ); |
7002 | return; |
7003 | } |
7004 | |
7005 | MESH_LOCK(); |
7006 | // Delete the elements from the map. |
7007 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
7008 | mesh_free_node_data_reduce, NULL); |
7009 | |
7010 | // Reset the seeds to inactive state |
7011 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
7012 | int element_count = cf_vector_size(seeds); |
7013 | for (int i = 0; i < element_count; i++) { |
7014 | // Should not happen in practice. |
7015 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
7016 | seed->mesh_nodeid = 0; |
7017 | mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); |
7018 | } |
7019 | |
7020 | MESH_UNLOCK(); |
7021 | } |
7022 | |
7023 | /** |
7024 | * Open mesh listening socket. Crashes if open failed. |
7025 | */ |
7026 | static void |
7027 | mesh_listening_sockets_open() |
7028 | { |
7029 | MESH_LOCK(); |
7030 | |
7031 | const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); |
7032 | |
7033 | // Compute min MTU across all binding interfaces. |
7034 | int min_mtu = -1; |
7035 | char addr_string[DNS_NAME_MAX_SIZE]; |
7036 | for (uint32_t i = 0; i < bind_cfg->n_cfgs; ++i) { |
7037 | const cf_sock_cfg* sock_cfg = &bind_cfg->cfgs[i]; |
7038 | cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string, |
7039 | sizeof(addr_string)); |
7040 | |
7041 | INFO("initializing mesh heartbeat socket: %s:%d" , addr_string, |
7042 | sock_cfg->port); |
7043 | |
7044 | int bind_interface_mtu = |
7045 | !cf_ip_addr_is_any(&sock_cfg->addr) ? |
7046 | cf_inter_mtu(&sock_cfg->addr) : cf_inter_min_mtu(); |
7047 | |
7048 | if (min_mtu == -1 || min_mtu > bind_interface_mtu) { |
7049 | min_mtu = bind_interface_mtu; |
7050 | } |
7051 | } |
7052 | |
7053 | if (cf_socket_init_server((cf_serv_cfg*)bind_cfg, |
7054 | &g_hb.mode_state.mesh_state.listening_sockets) != 0) { |
7055 | CRASH("couldn't initialize unicast heartbeat sockets" ); |
7056 | } |
7057 | |
7058 | for (uint32_t i = 0; |
7059 | i < g_hb.mode_state.mesh_state.listening_sockets.n_socks; ++i) { |
7060 | DEBUG("opened mesh heartbeat socket: %d" , |
7061 | CSFD(&g_hb.mode_state.mesh_state.listening_sockets.socks[i])); |
7062 | } |
7063 | |
7064 | if (min_mtu == -1) { |
7065 | WARNING("error getting the min MTU - using the default %d" , |
7066 | DEFAULT_MIN_MTU); |
7067 | min_mtu = DEFAULT_MIN_MTU; |
7068 | } |
7069 | |
7070 | g_hb.mode_state.mesh_state.min_mtu = min_mtu; |
7071 | INFO("mtu of the network is %d" , min_mtu); |
7072 | |
7073 | MESH_UNLOCK(); |
7074 | } |
7075 | |
7076 | /** |
7077 | * Start mesh threads. |
7078 | */ |
7079 | static void |
7080 | mesh_start() |
7081 | { |
7082 | if (!hb_is_mesh()) { |
7083 | return; |
7084 | } |
7085 | |
7086 | MESH_LOCK(); |
7087 | |
7088 | mesh_listening_sockets_open(); |
7089 | channel_mesh_listening_socks_register( |
7090 | &g_hb.mode_state.mesh_state.listening_sockets); |
7091 | |
7092 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_RUNNING; |
7093 | |
7094 | // Start the mesh tender thread. |
7095 | g_hb.mode_state.mesh_state.mesh_tender_tid = |
7096 | cf_thread_create_joinable(mesh_tender, (void*)&g_hb); |
7097 | |
7098 | MESH_UNLOCK(); |
7099 | } |
7100 | |
7101 | /** |
7102 | * Stop the mesh module. |
7103 | */ |
7104 | static void |
7105 | mesh_stop() |
7106 | { |
7107 | if (!mesh_is_running()) { |
7108 | WARNING("mesh is already stopped" ); |
7109 | return; |
7110 | } |
7111 | |
7112 | // Unguarded state, but this should be OK. |
7113 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_SHUTTING_DOWN; |
7114 | |
7115 | // Wait for the channel tender thread to finish. |
7116 | cf_thread_join(g_hb.mode_state.mesh_state.mesh_tender_tid); |
7117 | |
7118 | MESH_LOCK(); |
7119 | |
7120 | channel_mesh_listening_socks_deregister( |
7121 | &g_hb.mode_state.mesh_state.listening_sockets); |
7122 | |
7123 | mesh_listening_sockets_close(); |
7124 | |
7125 | g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED; |
7126 | |
7127 | // Clear allocated state if any. |
7128 | if (g_hb.mode_state.mesh_state.published_endpoint_list) { |
7129 | cf_free(g_hb.mode_state.mesh_state.published_endpoint_list); |
7130 | g_hb.mode_state.mesh_state.published_endpoint_list = NULL; |
7131 | } |
7132 | |
7133 | MESH_UNLOCK(); |
7134 | } |
7135 | |
7136 | /** |
7137 | * Reduce function to dump mesh node info to log file. |
7138 | */ |
7139 | static int |
7140 | mesh_dump_reduce(const void* key, void* data, void* udata) |
7141 | { |
7142 | cf_node nodeid = *(cf_node*)key; |
7143 | as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; |
7144 | |
7145 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
7146 | as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str, |
7147 | sizeof(endpoint_list_str)); |
7148 | |
7149 | INFO("\tHB Mesh Node: node-id %" PRIx64" status %s last-updated %" PRIu64 " endpoints {%s}" , |
7150 | nodeid, mesh_node_status_string(mesh_node->status), |
7151 | mesh_node->last_status_updated, endpoint_list_str); |
7152 | |
7153 | return CF_SHASH_OK; |
7154 | } |
7155 | |
7156 | /** |
7157 | * Dump mesh state to logs. |
7158 | * @param verbose enables / disables verbose logging. |
7159 | */ |
7160 | static void |
7161 | mesh_dump(bool verbose) |
7162 | { |
7163 | if (!hb_is_mesh() || !verbose) { |
7164 | return; |
7165 | } |
7166 | |
7167 | MESH_LOCK(); |
7168 | cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; |
7169 | int element_count = cf_vector_size(seeds); |
7170 | INFO("HB Seed Count %d" , element_count); |
7171 | for (int i = 0; i < element_count; i++) { |
7172 | as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); |
7173 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
7174 | as_endpoint_list_to_string(seed->resolved_endpoint_list, |
7175 | endpoint_list_str, sizeof(endpoint_list_str)); |
7176 | INFO("\tHB Mesh Seed: host %s port %d node-id %" PRIx64" status %s endpoints {%s}" , |
7177 | seed->seed_host_name, seed->seed_port, seed->mesh_nodeid, mesh_node_status_string(seed->status), |
7178 | endpoint_list_str); |
7179 | } |
7180 | |
7181 | INFO("HB Mesh Nodes Count %d" , cf_shash_get_size(g_hb.mode_state.mesh_state.nodeid_to_mesh_node)); |
7182 | cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, |
7183 | mesh_dump_reduce, NULL); |
7184 | MESH_UNLOCK(); |
7185 | } |
7186 | |
7187 | /* |
7188 | * ---------------------------------------------------------------------------- |
7189 | * Multicast sub module. |
7190 | * ---------------------------------------------------------------------------- |
7191 | */ |
7192 | |
7193 | /** |
7194 | * Initialize multicast data structures. |
7195 | */ |
7196 | static void |
7197 | multicast_init() |
7198 | { |
7199 | } |
7200 | |
7201 | /** |
7202 | * Clear multicast data structures. |
7203 | */ |
7204 | static void |
7205 | multicast_clear() |
7206 | { |
7207 | // Free multicast data structures. Nothing to do. |
7208 | } |
7209 | |
7210 | /** |
7211 | * Open multicast sockets. Crashes if open failed. |
7212 | */ |
7213 | static void |
7214 | multicast_listening_sockets_open() |
7215 | { |
7216 | MULTICAST_LOCK(); |
7217 | |
7218 | const cf_mserv_cfg* mserv_cfg = config_multicast_group_cfg_get(); |
7219 | |
7220 | // Compute min MTU across all binding interfaces. |
7221 | int min_mtu = -1; |
7222 | char addr_string[DNS_NAME_MAX_SIZE]; |
7223 | for (uint32_t i = 0; i < mserv_cfg->n_cfgs; ++i) { |
7224 | const cf_msock_cfg* sock_cfg = &mserv_cfg->cfgs[i]; |
7225 | cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string, |
7226 | sizeof(addr_string)); |
7227 | |
7228 | INFO("initializing multicast heartbeat socket: %s:%d" , addr_string, |
7229 | sock_cfg->port); |
7230 | |
7231 | int bind_interface_mtu = |
7232 | !cf_ip_addr_is_any(&sock_cfg->if_addr) ? |
7233 | cf_inter_mtu(&sock_cfg->if_addr) : cf_inter_min_mtu(); |
7234 | |
7235 | if (min_mtu == -1 || min_mtu > bind_interface_mtu) { |
7236 | min_mtu = bind_interface_mtu; |
7237 | } |
7238 | } |
7239 | |
7240 | if (cf_socket_mcast_init((cf_mserv_cfg*)mserv_cfg, |
7241 | &g_hb.mode_state.multicast_state.listening_sockets) != 0) { |
7242 | CRASH("couldn't initialize multicast heartbeat socket: %s" , |
7243 | cf_strerror(errno)); |
7244 | } |
7245 | |
7246 | for (uint32_t i = 0; |
7247 | i < g_hb.mode_state.multicast_state.listening_sockets.n_socks; |
7248 | ++i) { |
7249 | DEBUG("opened multicast socket %d" , |
7250 | CSFD( |
7251 | &g_hb.mode_state.multicast_state.listening_sockets.socks[i])); |
7252 | } |
7253 | |
7254 | if (min_mtu == -1) { |
7255 | WARNING("error getting the min mtu - using the default %d" , |
7256 | DEFAULT_MIN_MTU); |
7257 | min_mtu = DEFAULT_MIN_MTU; |
7258 | } |
7259 | |
7260 | g_hb.mode_state.multicast_state.min_mtu = min_mtu; |
7261 | |
7262 | INFO("mtu of the network is %d" , min_mtu); |
7263 | MULTICAST_UNLOCK(); |
7264 | } |
7265 | |
7266 | /** |
7267 | * Start multicast module. |
7268 | */ |
7269 | static void |
7270 | multicast_start() |
7271 | { |
7272 | MULTICAST_LOCK(); |
7273 | multicast_listening_sockets_open(); |
7274 | channel_multicast_listening_socks_register( |
7275 | &g_hb.mode_state.multicast_state.listening_sockets); |
7276 | MULTICAST_UNLOCK(); |
7277 | } |
7278 | |
7279 | /** |
7280 | * Close multicast listening socket. |
7281 | */ |
7282 | static void |
7283 | multicast_listening_sockets_close() |
7284 | { |
7285 | MULTICAST_LOCK(); |
7286 | INFO("closing multicast heartbeat sockets" ); |
7287 | cf_sockets_close(&g_hb.mode_state.multicast_state.listening_sockets); |
7288 | DEBUG("closed multicast heartbeat socket" ); |
7289 | MULTICAST_UNLOCK(); |
7290 | } |
7291 | |
7292 | /** |
7293 | * Stop Multicast. |
7294 | */ |
7295 | static void |
7296 | multicast_stop() |
7297 | { |
7298 | MULTICAST_LOCK(); |
7299 | channel_multicast_listening_socks_deregister( |
7300 | &g_hb.mode_state.multicast_state.listening_sockets); |
7301 | multicast_listening_sockets_close(); |
7302 | |
7303 | MULTICAST_UNLOCK(); |
7304 | } |
7305 | |
7306 | /** |
7307 | * Dump multicast state to logs. |
7308 | * @param verbose enables / disables verbose logging. |
7309 | */ |
7310 | static void |
7311 | multicast_dump(bool verbose) |
7312 | { |
7313 | if (hb_is_mesh()) { |
7314 | return; |
7315 | } |
7316 | |
7317 | // Mode is multicast. |
7318 | INFO("HB Multicast TTL: %d" , config_multicast_ttl_get()); |
7319 | } |
7320 | |
7321 | /** |
7322 | * Find the maximum cluster size based on MTU of the network. |
7323 | * |
7324 | * num_nodes is computed so that |
7325 | * |
7326 | * MTU = compression_factor(fixed_size + num_nodesper_node_size) |
7327 | * where, |
7328 | * fixed_size = udp_header_size + msg_header_size + |
7329 | * sigma(per_plugin_fixed_size) |
7330 | * per_node_size = sigma(per_plugin_per_node_size). |
7331 | */ |
7332 | static int |
7333 | multicast_supported_cluster_size_get() |
7334 | { |
7335 | // Calculate the fixed size for a UDP packet and the message header. |
7336 | size_t msg_fixed_size = msg_get_template_fixed_sz(g_hb_msg_template, |
7337 | sizeof(g_hb_msg_template) / sizeof(msg_template)); |
7338 | |
7339 | size_t msg_plugin_per_node_size = 0; |
7340 | |
7341 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
7342 | // Adding plugin specific fixed size |
7343 | msg_fixed_size += g_hb.plugins[i].wire_size_fixed; |
7344 | // Adding plugin specific per node size. |
7345 | msg_plugin_per_node_size += g_hb.plugins[i].wire_size_per_node; |
7346 | } |
7347 | |
7348 | // TODO: Compute the max cluster size using max storage per node in cluster |
7349 | // and the min mtu. |
7350 | int supported_cluster_size = MAX(1, |
7351 | (((hb_mtu() - UDP_HEADER_SIZE_MAX) * MSG_COMPRESSION_RATIO) |
7352 | - msg_fixed_size) / msg_plugin_per_node_size); |
7353 | |
7354 | return supported_cluster_size; |
7355 | } |
7356 | |
7357 | /* |
7358 | * ---------------------------------------------------------------------------- |
7359 | * Heartbeat main sub module. |
7360 | * ---------------------------------------------------------------------------- |
7361 | */ |
7362 | |
7363 | /** |
7364 | * Is Main module initialized. |
7365 | */ |
7366 | static bool |
7367 | hb_is_initialized() |
7368 | { |
7369 | HB_LOCK(); |
7370 | bool retval = (g_hb.status != AS_HB_STATUS_UNINITIALIZED) ? true : false; |
7371 | HB_UNLOCK(); |
7372 | return retval; |
7373 | } |
7374 | |
7375 | /** |
7376 | * Is Main module running. |
7377 | */ |
7378 | static bool |
7379 | hb_is_running() |
7380 | { |
7381 | HB_LOCK(); |
7382 | bool retval = (g_hb.status == AS_HB_STATUS_RUNNING) ? true : false; |
7383 | HB_UNLOCK(); |
7384 | return retval; |
7385 | } |
7386 | |
7387 | /** |
7388 | * Is Main module stopped. |
7389 | */ |
7390 | static bool |
7391 | hb_is_stopped() |
7392 | { |
7393 | HB_LOCK(); |
7394 | bool retval = (g_hb.status == AS_HB_STATUS_STOPPED) ? true : false; |
7395 | HB_UNLOCK(); |
7396 | return retval; |
7397 | } |
7398 | |
7399 | /** |
7400 | * Initialize the mode specific data structures. |
7401 | */ |
7402 | static void |
7403 | hb_mode_init() |
7404 | { |
7405 | if (hb_is_mesh()) { |
7406 | mesh_init(); |
7407 | } |
7408 | else { |
7409 | multicast_init(); |
7410 | } |
7411 | } |
7412 | |
7413 | /** |
7414 | * Start mode specific threads.. |
7415 | */ |
7416 | static void |
7417 | hb_mode_start() |
7418 | { |
7419 | if (hb_is_mesh()) { |
7420 | mesh_start(); |
7421 | } |
7422 | else { |
7423 | multicast_start(); |
7424 | } |
7425 | } |
7426 | |
7427 | /** |
7428 | * The MTU for underlying network. |
7429 | */ |
7430 | static int |
7431 | hb_mtu() |
7432 | { |
7433 | int __mtu = config_override_mtu_get(); |
7434 | if (!__mtu) { |
7435 | __mtu = hb_is_mesh() ? |
7436 | g_hb.mode_state.mesh_state.min_mtu : |
7437 | g_hb.mode_state.multicast_state.min_mtu; |
7438 | __mtu = __mtu > 0 ? __mtu : DEFAULT_MIN_MTU; |
7439 | } |
7440 | return __mtu; |
7441 | } |
7442 | |
7443 | /** |
7444 | * Initialize the template to be used for heartbeat messages. |
7445 | */ |
7446 | static void |
7447 | hb_msg_init() |
7448 | { |
7449 | // Register fabric heartbeat msg type with no processing function: |
7450 | // This permits getting / putting heartbeat msgs to be moderated via an idle |
7451 | // msg queue. |
7452 | as_fabric_register_msg_fn(M_TYPE_HEARTBEAT, g_hb_msg_template, |
7453 | sizeof(g_hb_msg_template), |
7454 | AS_HB_MSG_SCRATCH_SIZE, 0, 0); |
7455 | } |
7456 | |
7457 | /** |
7458 | * Get hold of current heartbeat protocol version |
7459 | */ |
7460 | static uint32_t |
7461 | hb_protocol_identifier_get() |
7462 | { |
7463 | return HB_PROTOCOL_V3_IDENTIFIER; |
7464 | } |
7465 | |
7466 | /** |
7467 | * Node depart event time estimate. Assumes node departed timeout milliseconds |
7468 | * before the detection. |
7469 | */ |
7470 | static cf_clock |
7471 | hb_node_depart_time(cf_clock detect_time) |
7472 | { |
7473 | return (detect_time - HB_NODE_TIMEOUT()); |
7474 | } |
7475 | |
7476 | /** |
7477 | * Indicates if mode is mesh. |
7478 | */ |
7479 | static bool |
7480 | hb_is_mesh() |
7481 | { |
7482 | return (config_mode_get() == AS_HB_MODE_MESH); |
7483 | } |
7484 | |
7485 | /** |
7486 | * Publish an event to subsystems listening to heart beat events. |
7487 | */ |
7488 | static void |
7489 | hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, |
7490 | int node_count) |
7491 | { |
7492 | // Lock-less because the queue is thread safe and we do not use heartbeat |
7493 | // state here. |
7494 | for (int i = 0; i < node_count; i++) { |
7495 | as_hb_event_node event; |
7496 | event.nodeid = nodes[i]; |
7497 | event.event_detected_time = cf_getms(); |
7498 | |
7499 | switch (event_type) { |
7500 | case AS_HB_INTERNAL_NODE_ARRIVE: |
7501 | event.evt = AS_HB_NODE_ARRIVE; |
7502 | event.event_time = event.event_detected_time; |
7503 | as_health_add_node_counter(event.nodeid, AS_HEALTH_NODE_ARRIVALS); |
7504 | break; |
7505 | case AS_HB_INTERNAL_NODE_DEPART: |
7506 | event.evt = AS_HB_NODE_DEPART; |
7507 | event.event_time = hb_node_depart_time(event.event_detected_time); |
7508 | break; |
7509 | case AS_HB_INTERNAL_NODE_EVICT: |
7510 | event.evt = AS_HB_NODE_DEPART; |
7511 | event.event_time = event.event_detected_time; |
7512 | break; |
7513 | case AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED: |
7514 | event.evt = AS_HB_NODE_ADJACENCY_CHANGED; |
7515 | event.event_time = event.event_detected_time; |
7516 | break; |
7517 | } |
7518 | |
7519 | DEBUG("queuing event of type %d for node %" PRIx64, event.evt, |
7520 | event.nodeid); |
7521 | cf_queue_push(&g_hb_event_listeners.external_events_queue, &event); |
7522 | } |
7523 | } |
7524 | |
7525 | /** |
7526 | * Publish all pending events. Should be invoked outside hb locks. |
7527 | */ |
7528 | static void |
7529 | hb_event_publish_pending() |
7530 | { |
7531 | EXTERNAL_EVENT_PUBLISH_LOCK(); |
7532 | int num_events = cf_queue_sz(&g_hb_event_listeners.external_events_queue); |
7533 | if (num_events <= 0) { |
7534 | // Events need not be published. |
7535 | goto Exit; |
7536 | } |
7537 | |
7538 | as_hb_event_node events[AS_HB_CLUSTER_MAX_SIZE_SOFT]; |
7539 | int published_count = 0; |
7540 | while (published_count < AS_HB_CLUSTER_MAX_SIZE_SOFT |
7541 | && cf_queue_pop(&g_hb_event_listeners.external_events_queue, |
7542 | &events[published_count], 0) == CF_QUEUE_OK) { |
7543 | published_count++; |
7544 | } |
7545 | |
7546 | if (published_count) { |
7547 | // Assuming that event listeners are not registered after system init, |
7548 | // no locks here. |
7549 | DEBUG("publishing %d heartbeat events" , published_count); |
7550 | for (int i = 0; i < g_hb_event_listeners.event_listener_count; i++) { |
7551 | (g_hb_event_listeners.event_listeners[i].event_callback)( |
7552 | published_count, events, |
7553 | g_hb_event_listeners.event_listeners[i].udata); |
7554 | } |
7555 | } |
7556 | |
7557 | Exit: |
7558 | EXTERNAL_EVENT_PUBLISH_UNLOCK(); |
7559 | } |
7560 | |
7561 | /** |
7562 | * Delete the heap allocated data while iterating through the hash and deleting |
7563 | * entries. |
7564 | */ |
7565 | static int |
7566 | hb_adjacency_free_data_reduce(const void* key, void* data, void* udata) |
7567 | { |
7568 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
7569 | |
7570 | const cf_node* nodeid = (const cf_node*)key; |
7571 | |
7572 | hb_adjacent_node_destroy(adjacent_node); |
7573 | |
7574 | // Send event depart to for this node |
7575 | hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, nodeid, 1); |
7576 | |
7577 | return CF_SHASH_REDUCE_DELETE; |
7578 | } |
7579 | |
7580 | /** |
7581 | * Clear the heartbeat data structures. |
7582 | */ |
7583 | static void |
7584 | hb_clear() |
7585 | { |
7586 | if (!hb_is_stopped()) { |
7587 | WARNING("attempted to clear heartbeat module without stopping it" ); |
7588 | return; |
7589 | } |
7590 | |
7591 | HB_LOCK(); |
7592 | |
7593 | // Free the plugin data and delete adjacent nodes. |
7594 | cf_shash_reduce(g_hb.adjacency, hb_adjacency_free_data_reduce, NULL); |
7595 | cf_shash_reduce(g_hb.on_probation, hb_adjacency_free_data_reduce, NULL); |
7596 | hb_adjacent_node_destroy(&g_hb.self_node); |
7597 | memset(&g_hb.self_node, 0, sizeof(g_hb.self_node)); |
7598 | |
7599 | HB_UNLOCK(); |
7600 | |
7601 | // Publish node departed events for the removed nodes. |
7602 | hb_event_publish_pending(); |
7603 | |
7604 | // Clear the mode module. |
7605 | if (hb_is_mesh()) { |
7606 | mesh_clear(); |
7607 | } |
7608 | else { |
7609 | multicast_clear(); |
7610 | } |
7611 | |
7612 | channel_clear(); |
7613 | } |
7614 | |
7615 | /** |
7616 | * Reduce function to get hold of current adjacency list. |
7617 | */ |
7618 | static int |
7619 | hb_adjacency_iterate_reduce(const void* key, void* data, void* udata) |
7620 | { |
7621 | const cf_node* nodeid = (const cf_node*)key; |
7622 | as_hb_adjacency_reduce_udata* adjacency_reduce_udata = |
7623 | (as_hb_adjacency_reduce_udata*)udata; |
7624 | |
7625 | adjacency_reduce_udata->adj_list[adjacency_reduce_udata->adj_count] = |
7626 | *nodeid; |
7627 | adjacency_reduce_udata->adj_count++; |
7628 | |
7629 | return CF_SHASH_OK; |
7630 | } |
7631 | |
7632 | /** |
7633 | * Plugin function to set heartbeat adjacency list into a pulse message. |
7634 | */ |
7635 | static void |
7636 | hb_plugin_set_fn(msg* msg) |
7637 | { |
7638 | HB_LOCK(); |
7639 | |
7640 | cf_node adj_list[cf_shash_get_size(g_hb.adjacency)]; |
7641 | as_hb_adjacency_reduce_udata adjacency_reduce_udata = { adj_list, 0 }; |
7642 | |
7643 | cf_shash_reduce(g_hb.adjacency, hb_adjacency_iterate_reduce, |
7644 | &adjacency_reduce_udata); |
7645 | |
7646 | HB_UNLOCK(); |
7647 | |
7648 | // Populate adjacency list. |
7649 | msg_adjacency_set(msg, adj_list, adjacency_reduce_udata.adj_count); |
7650 | |
7651 | // Set cluster name. |
7652 | char cluster_name[AS_CLUSTER_NAME_SZ]; |
7653 | as_config_cluster_name_get(cluster_name); |
7654 | |
7655 | if (cluster_name[0] != '\0') { |
7656 | msg_set_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, MSG_SET_COPY); |
7657 | } |
7658 | } |
7659 | |
7660 | /** |
7661 | * Plugin function that parses adjacency list out of a heartbeat pulse message. |
7662 | */ |
7663 | static void |
7664 | hb_plugin_parse_data_fn(msg* msg, cf_node source, |
7665 | as_hb_plugin_node_data* prev_plugin_data, |
7666 | as_hb_plugin_node_data* plugin_data) |
7667 | { |
7668 | size_t adj_length = 0; |
7669 | cf_node* adj_list = NULL; |
7670 | |
7671 | if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) { |
7672 | // Store a zero length adjacency list. Should not have happened. |
7673 | WARNING("received heartbeat without adjacency list %" PRIx64, source); |
7674 | adj_length = 0; |
7675 | } |
7676 | |
7677 | // The guess can be larger for older protocols which also include self node |
7678 | // in the adjacency list. |
7679 | int guessed_data_size = (adj_length * sizeof(cf_node)); |
7680 | |
7681 | if (guessed_data_size > plugin_data->data_capacity) { |
7682 | // Round up to nearest multiple of block size to prevent very frequent |
7683 | // reallocation. |
7684 | size_t data_capacity = ((guessed_data_size + HB_PLUGIN_DATA_BLOCK_SIZE |
7685 | - 1) / |
7686 | HB_PLUGIN_DATA_BLOCK_SIZE) * |
7687 | HB_PLUGIN_DATA_BLOCK_SIZE; |
7688 | |
7689 | // Reallocate since we have outgrown existing capacity. |
7690 | plugin_data->data = cf_realloc(plugin_data->data, data_capacity); |
7691 | plugin_data->data_capacity = data_capacity; |
7692 | } |
7693 | |
7694 | cf_node* dest_list = (cf_node*)(plugin_data->data); |
7695 | |
7696 | size_t final_list_length = 0; |
7697 | for (size_t i = 0; i < adj_length; i++) { |
7698 | if (adj_list[i] == source) { |
7699 | // Skip the source node. |
7700 | continue; |
7701 | } |
7702 | dest_list[final_list_length++] = adj_list[i]; |
7703 | } |
7704 | |
7705 | plugin_data->data_size = (final_list_length * sizeof(cf_node)); |
7706 | } |
7707 | |
7708 | /** |
7709 | * Get the msg buffer from a pool based on the protocol under use. |
7710 | * @return the msg buff |
7711 | */ |
7712 | static msg* |
7713 | hb_msg_get() |
7714 | { |
7715 | return as_fabric_msg_get(M_TYPE_HEARTBEAT); |
7716 | } |
7717 | |
7718 | /** |
7719 | * Return the message buffer back to the pool. |
7720 | */ |
7721 | static void |
7722 | hb_msg_return(msg* msg) |
7723 | { |
7724 | as_fabric_msg_put(msg); |
7725 | } |
7726 | |
7727 | /** |
7728 | * Fill the outgoing pulse message with plugin specific data. |
7729 | * |
7730 | * Note: The set functions would be acquiring their locks. This function should |
7731 | * never directly use nor have a call stack under HB_LOCK. |
7732 | * |
7733 | * @param msg the outgoing pulse message. |
7734 | */ |
7735 | static void |
7736 | hb_plugin_msg_fill(msg* msg) |
7737 | { |
7738 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
7739 | if (g_hb.plugins[i].set_fn) { |
7740 | (g_hb.plugins[i].set_fn)(msg); |
7741 | } |
7742 | } |
7743 | } |
7744 | |
7745 | /** |
7746 | * Parse fields from the message into plugin specific data. |
7747 | * @param msg the outgoing pulse message. |
7748 | * @param adjacent_node the node from which this message was received. |
7749 | * @param plugin_data_changed (output) array whose ith entry is set to true if |
7750 | * ith plugin's data changed, false otherwise. Should be large enough to hold |
7751 | * flags for all plugins. |
7752 | */ |
7753 | static void |
7754 | hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, |
7755 | as_hb_plugin* plugins, bool plugin_data_changed[]) |
7756 | { |
7757 | cf_node source; |
7758 | adjacent_node->plugin_data_cycler++; |
7759 | |
7760 | msg_nodeid_get(msg, &source); |
7761 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
7762 | plugin_data_changed[i] = false; |
7763 | if (plugins[i].parse_fn) { |
7764 | as_hb_plugin_node_data* curr_data = |
7765 | &adjacent_node->plugin_data[i][adjacent_node->plugin_data_cycler |
7766 | % 2]; |
7767 | |
7768 | as_hb_plugin_node_data* prev_data = |
7769 | &adjacent_node->plugin_data[i][(adjacent_node->plugin_data_cycler |
7770 | + 1) % 2]; |
7771 | |
7772 | // Ensure there is a preallocated data pointer. |
7773 | if (curr_data->data == NULL) { |
7774 | curr_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE); |
7775 | curr_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE; |
7776 | curr_data->data_size = 0; |
7777 | } |
7778 | |
7779 | if (prev_data->data == NULL) { |
7780 | prev_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE); |
7781 | prev_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE; |
7782 | prev_data->data_size = 0; |
7783 | } |
7784 | |
7785 | // Parse message data into current data. |
7786 | (plugins[i]).parse_fn(msg, source, prev_data, curr_data); |
7787 | |
7788 | if (!plugins[i].change_listener) { |
7789 | // No change listener configured. Skip detecting change. |
7790 | continue; |
7791 | } |
7792 | |
7793 | size_t curr_data_size = curr_data->data_size; |
7794 | void* curr_data_blob = curr_data_size ? curr_data->data : NULL; |
7795 | |
7796 | size_t prev_data_size = prev_data->data_size; |
7797 | void* prev_data_blob = prev_data_size ? prev_data->data : NULL; |
7798 | |
7799 | if (prev_data_blob == curr_data_blob) { |
7800 | // Old and new data both NULL or both point to the same memory |
7801 | // location. |
7802 | plugin_data_changed[i] = false; |
7803 | continue; |
7804 | } |
7805 | |
7806 | if (prev_data_size != curr_data_size || prev_data_blob == NULL |
7807 | || curr_data_blob == NULL) { |
7808 | // Plugin data definitely changed, as the data sizes differ or |
7809 | // exactly one of old or new data pointers is NULL. |
7810 | plugin_data_changed[i] = true; |
7811 | continue; |
7812 | } |
7813 | |
7814 | // The data sizes match at this point and neither values are NULL. |
7815 | plugin_data_changed[i] = memcmp(prev_data_blob, curr_data_blob, |
7816 | curr_data_size) != 0; |
7817 | } |
7818 | } |
7819 | } |
7820 | |
7821 | /** |
7822 | * Adjacency list for an adjacent node changed. |
7823 | */ |
7824 | static void |
7825 | hb_plugin_data_change_listener(cf_node changed_node_id) |
7826 | { |
7827 | hb_event_queue(AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED, &changed_node_id, 1); |
7828 | } |
7829 | |
7830 | /** |
7831 | * Initialize the plugin specific data structures. |
7832 | */ |
7833 | static void |
7834 | hb_plugin_init() |
7835 | { |
7836 | memset(&g_hb.plugins, 0, sizeof(g_hb.plugins)); |
7837 | |
7838 | // Be cute. Register self as a plugin. |
7839 | as_hb_plugin self_plugin; |
7840 | memset(&self_plugin, 0, sizeof(self_plugin)); |
7841 | self_plugin.id = AS_HB_PLUGIN_HB; |
7842 | self_plugin.wire_size_fixed = 0; |
7843 | self_plugin.wire_size_per_node = sizeof(cf_node); |
7844 | self_plugin.set_fn = hb_plugin_set_fn; |
7845 | self_plugin.parse_fn = hb_plugin_parse_data_fn; |
7846 | self_plugin.change_listener = hb_plugin_data_change_listener; |
7847 | hb_plugin_register(&self_plugin); |
7848 | } |
7849 | |
7850 | /** |
7851 | * Transmits heartbeats at fixed intervals. |
7852 | */ |
7853 | void* |
7854 | hb_transmitter(void* arg) |
7855 | { |
7856 | DETAIL("heartbeat transmitter started" ); |
7857 | |
7858 | cf_clock last_time = 0; |
7859 | |
7860 | while (hb_is_running()) { |
7861 | cf_clock curr_time = cf_getms(); |
7862 | |
7863 | if ((curr_time - last_time) < PULSE_TRANSMIT_INTERVAL()) { |
7864 | // Interval has not been reached for sending heartbeats |
7865 | usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time + |
7866 | PULSE_TRANSMIT_INTERVAL()) - curr_time) * 1000); |
7867 | continue; |
7868 | } |
7869 | |
7870 | last_time = curr_time; |
7871 | |
7872 | // Construct the pulse message. |
7873 | msg* msg = hb_msg_get(); |
7874 | |
7875 | msg_src_fields_fill(msg); |
7876 | msg_type_set(msg, AS_HB_MSG_TYPE_PULSE); |
7877 | |
7878 | // Have plugins fill their data into the heartbeat pulse message. |
7879 | hb_plugin_msg_fill(msg); |
7880 | |
7881 | // Broadcast the heartbeat to all known recipients. |
7882 | channel_msg_broadcast(msg); |
7883 | |
7884 | // Return the msg back to the fabric. |
7885 | hb_msg_return(msg); |
7886 | |
7887 | DETAIL("done sending pulse message" ); |
7888 | } |
7889 | |
7890 | DETAIL("heartbeat transmitter stopped" ); |
7891 | return NULL; |
7892 | } |
7893 | |
7894 | /** |
7895 | * Get hold of adjacent node information given its nodeid. |
7896 | * @param nodeid the nodeid. |
7897 | * @param adjacent_node the output node information. |
7898 | * @return 0 on success, -1 on failure. |
7899 | */ |
7900 | static int |
7901 | hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node) |
7902 | { |
7903 | int rv = -1; |
7904 | HB_LOCK(); |
7905 | |
7906 | if (cf_shash_get(g_hb.adjacency, &nodeid, adjacent_node) == CF_SHASH_OK) { |
7907 | rv = 0; |
7908 | } |
7909 | |
7910 | HB_UNLOCK(); |
7911 | return rv; |
7912 | } |
7913 | |
7914 | /** |
7915 | * Get hold of an on-probation node information given its nodeid. |
7916 | * @param nodeid the nodeid. |
7917 | * @param adjacent_node the output node information. |
7918 | * @return 0 on success, -1 on failure. |
7919 | */ |
7920 | static int |
7921 | hb_on_probation_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node) |
7922 | { |
7923 | int rv = -1; |
7924 | HB_LOCK(); |
7925 | |
7926 | if (cf_shash_get(g_hb.on_probation, &nodeid, adjacent_node) |
7927 | == CF_SHASH_OK) { |
7928 | rv = 0; |
7929 | } |
7930 | |
7931 | HB_UNLOCK(); |
7932 | return rv; |
7933 | } |
7934 | |
7935 | /** |
7936 | * Read the plugin data from an adjacent node. |
7937 | * @param adjacent_node the adjacent node. |
7938 | * @param plugin_data (output) will be null if this node has no plugin data. |
7939 | * Else will point to the plugin data. |
7940 | * @param plugin_data_size (output) the size of the plugin data. |
7941 | */ |
7942 | static void |
7943 | hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, |
7944 | as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size) |
7945 | { |
7946 | *plugin_data_size = |
7947 | adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler |
7948 | % 2].data_size; |
7949 | |
7950 | *plugin_data = |
7951 | *plugin_data_size ? |
7952 | (cf_node*)(adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler |
7953 | % 2].data) : NULL; |
7954 | } |
7955 | |
7956 | /** |
7957 | * Get adjacency list for an adjacent node. |
7958 | */ |
7959 | static void |
7960 | hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, |
7961 | cf_node** adjacency_list, size_t* adjacency_length) |
7962 | { |
7963 | hb_adjacent_node_plugin_data_get(adjacent_node, AS_HB_PLUGIN_HB, |
7964 | (void**)adjacency_list, adjacency_length); |
7965 | (*adjacency_length) /= sizeof(cf_node); |
7966 | } |
7967 | |
7968 | /** |
7969 | * Indicates if a give node has expired and should be removed from the adjacency |
7970 | * list. |
7971 | */ |
7972 | static bool |
7973 | hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node) |
7974 | { |
7975 | if (nodeid == config_self_nodeid_get()) { |
7976 | return false; |
7977 | } |
7978 | |
7979 | HB_LOCK(); |
7980 | |
7981 | cf_clock now = cf_getms(); |
7982 | |
7983 | bool expired = adjacent_node->last_updated_monotonic_ts + HB_NODE_TIMEOUT() |
7984 | < now; |
7985 | |
7986 | HB_UNLOCK(); |
7987 | return expired; |
7988 | } |
7989 | |
7990 | /** |
7991 | * Indicates if self node has duplicate ids. |
7992 | */ |
7993 | static bool |
7994 | hb_self_is_duplicate(){ |
7995 | HB_LOCK(); |
7996 | bool self_is_duplicate = g_hb.self_is_duplicate; |
7997 | HB_UNLOCK(); |
7998 | return self_is_duplicate; |
7999 | } |
8000 | |
8001 | /** |
8002 | * Updates the self is duplicate flag. |
8003 | */ |
8004 | static void |
8005 | hb_self_duplicate_update() |
8006 | { |
8007 | cf_clock now = cf_getms(); |
8008 | HB_LOCK(); |
8009 | if (g_hb.self_is_duplicate) { |
8010 | uint32_t duplicate_block_interval = |
8011 | config_endpoint_track_intervals_get() |
8012 | * config_tx_interval_get(); |
8013 | if (g_hb.self_duplicate_detected_ts + duplicate_block_interval <= now) { |
8014 | // We have not seen duplicates for the endpoint change tracking |
8015 | // interval. Mark ourself as non-duplicate. |
8016 | g_hb.self_is_duplicate = false; |
8017 | } |
8018 | } |
8019 | HB_UNLOCK(); |
8020 | } |
8021 | |
8022 | /** |
8023 | * Free up space occupied by plugin data from adjacent node. |
8024 | */ |
8025 | static void |
8026 | hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node) |
8027 | { |
8028 | HB_LOCK(); |
8029 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
8030 | as_hb_plugin_node_data* curr_plugin_data = adjacent_node->plugin_data[i]; |
8031 | for (int j = 0; j < 2; j++) { |
8032 | if (curr_plugin_data[j].data) { |
8033 | cf_free(curr_plugin_data[j].data); |
8034 | curr_plugin_data[j].data = NULL; |
8035 | } |
8036 | |
8037 | curr_plugin_data[j].data_capacity = 0; |
8038 | curr_plugin_data[j].data_size = 0; |
8039 | } |
8040 | } |
8041 | |
8042 | if (adjacent_node->endpoint_list) { |
8043 | // Free the endpoint list. |
8044 | cf_free(adjacent_node->endpoint_list); |
8045 | adjacent_node->endpoint_list = NULL; |
8046 | } |
8047 | |
8048 | HB_UNLOCK(); |
8049 | } |
8050 | |
8051 | /** |
8052 | * Tend reduce function that removes expired nodes from adjacency list. |
8053 | */ |
8054 | static int |
8055 | hb_adjacency_tend_reduce(const void* key, void* data, void* udata) |
8056 | { |
8057 | cf_node nodeid = *(const cf_node*)key; |
8058 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
8059 | as_hb_adjacency_tender_udata* adjacency_tender_udata = |
8060 | (as_hb_adjacency_tender_udata*)udata; |
8061 | |
8062 | int rv = CF_SHASH_OK; |
8063 | bool cluster_name_mismatch = adjacent_node->cluster_name_mismatch_count |
8064 | > CLUSTER_NAME_MISMATCH_MAX; |
8065 | if (hb_node_has_expired(nodeid, adjacent_node) || cluster_name_mismatch) { |
8066 | INFO("node expired %" PRIx64" %s" , nodeid, cluster_name_mismatch ? "(cluster name mismatch)" : "" ); |
8067 | if (cluster_name_mismatch) { |
8068 | adjacency_tender_udata->evicted_nodes[adjacency_tender_udata->evicted_node_count++] = |
8069 | nodeid; |
8070 | } |
8071 | else { |
8072 | adjacency_tender_udata->dead_nodes[adjacency_tender_udata->dead_node_count++] = |
8073 | nodeid; |
8074 | } |
8075 | |
8076 | // Free plugin data as well. |
8077 | hb_adjacent_node_destroy(adjacent_node); |
8078 | |
8079 | rv = CF_SHASH_REDUCE_DELETE; |
8080 | } |
8081 | |
8082 | return rv; |
8083 | } |
8084 | |
8085 | /** |
8086 | * Tend reduce function that removes expired nodes from the probationary list. |
8087 | */ |
8088 | static int |
8089 | hb_on_probation_tend_reduce(const void* key, void* data, void* udata) |
8090 | { |
8091 | cf_node nodeid = *(const cf_node*)key; |
8092 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
8093 | |
8094 | int rv = CF_SHASH_OK; |
8095 | if (hb_node_has_expired(nodeid, adjacent_node)) { |
8096 | DEBUG("on-probation node %" PRIx64 " expired" , nodeid); |
8097 | // Free plugin data as well. |
8098 | hb_adjacent_node_destroy(adjacent_node); |
8099 | rv = CF_SHASH_REDUCE_DELETE; |
8100 | } |
8101 | return rv; |
8102 | } |
8103 | |
8104 | /** |
8105 | * Tends the adjacency list. Removes nodes that expire. |
8106 | */ |
8107 | void* |
8108 | hb_adjacency_tender(void* arg) |
8109 | { |
8110 | DETAIL("adjacency tender started" ); |
8111 | |
8112 | cf_clock last_time = 0; |
8113 | cf_clock last_depart_time = 0; |
8114 | |
8115 | while (hb_is_running()) { |
8116 | cf_clock curr_time = cf_getms(); |
8117 | uint32_t adjacency_tend_interval = ADJACENCY_TEND_INTERVAL; |
8118 | // Interval after node depart where we tend faster to detect additional |
8119 | // node departures. |
8120 | uint32_t fast_check_interval = 2 * config_tx_interval_get(); |
8121 | if (last_depart_time + fast_check_interval > curr_time) { |
8122 | adjacency_tend_interval = ADJACENCY_FAST_TEND_INTERVAL; |
8123 | } |
8124 | |
8125 | hb_self_duplicate_update(); |
8126 | |
8127 | if ((curr_time - last_time) < adjacency_tend_interval) { |
8128 | // Publish any pendng events. |
8129 | hb_event_publish_pending(); |
8130 | |
8131 | // Interval has not been reached for sending heartbeats |
8132 | usleep( |
8133 | MIN(AS_HB_TX_INTERVAL_MS_MIN, |
8134 | (last_time + adjacency_tend_interval) - curr_time) |
8135 | * 1000); |
8136 | continue; |
8137 | } |
8138 | |
8139 | last_time = curr_time; |
8140 | |
8141 | DETAIL("tending adjacency list" ); |
8142 | |
8143 | HB_LOCK(); |
8144 | cf_node dead_nodes[cf_shash_get_size(g_hb.adjacency)]; |
8145 | cf_node evicted_nodes[cf_shash_get_size(g_hb.adjacency)]; |
8146 | as_hb_adjacency_tender_udata adjacency_tender_udata; |
8147 | adjacency_tender_udata.dead_nodes = dead_nodes; |
8148 | adjacency_tender_udata.dead_node_count = 0; |
8149 | adjacency_tender_udata.evicted_nodes = evicted_nodes; |
8150 | adjacency_tender_udata.evicted_node_count = 0; |
8151 | |
8152 | cf_shash_reduce(g_hb.adjacency, hb_adjacency_tend_reduce, |
8153 | &adjacency_tender_udata); |
8154 | |
8155 | if (adjacency_tender_udata.dead_node_count > 0) { |
8156 | last_depart_time = curr_time; |
8157 | // Queue events for dead nodes. |
8158 | hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, dead_nodes, |
8159 | adjacency_tender_udata.dead_node_count); |
8160 | } |
8161 | |
8162 | if (adjacency_tender_udata.evicted_node_count > 0) { |
8163 | last_depart_time = curr_time; |
8164 | // Queue events for evicted nodes. |
8165 | hb_event_queue(AS_HB_INTERNAL_NODE_EVICT, evicted_nodes, |
8166 | adjacency_tender_udata.evicted_node_count); |
8167 | } |
8168 | |
8169 | // Expire nodes from the on-probation list. |
8170 | cf_shash_reduce(g_hb.on_probation, hb_on_probation_tend_reduce, NULL); |
8171 | HB_UNLOCK(); |
8172 | |
8173 | // See if we have pending events to publish. |
8174 | hb_event_publish_pending(); |
8175 | |
8176 | DETAIL("done tending adjacency list" ); |
8177 | } |
8178 | |
8179 | DETAIL("adjacency tender shut down" ); |
8180 | return NULL; |
8181 | } |
8182 | |
8183 | /** |
8184 | * Start the transmitter thread. |
8185 | */ |
8186 | static void |
8187 | hb_tx_start() |
8188 | { |
8189 | // Start the transmitter thread. |
8190 | g_hb.transmitter_tid = cf_thread_create_joinable(hb_transmitter, |
8191 | (void*)&g_hb); |
8192 | } |
8193 | |
8194 | /** |
8195 | * Stop the transmitter thread. |
8196 | */ |
8197 | static void |
8198 | hb_tx_stop() |
8199 | { |
8200 | DETAIL("waiting for the transmitter thread to stop" ); |
8201 | // Wait for the adjacency tender thread to stop. |
8202 | cf_thread_join(g_hb.transmitter_tid); |
8203 | } |
8204 | |
8205 | /** |
8206 | * Start the transmitter thread. |
8207 | */ |
8208 | static void |
8209 | hb_adjacency_tender_start() |
8210 | { |
8211 | // Start the transmitter thread. |
8212 | g_hb.adjacency_tender_tid = cf_thread_create_joinable(hb_adjacency_tender, |
8213 | (void*)&g_hb); |
8214 | } |
8215 | |
8216 | /** |
8217 | * Stop the adjacency tender thread. |
8218 | */ |
8219 | static void |
8220 | hb_adjacency_tender_stop() |
8221 | { |
8222 | // Wait for the adjacency tender thread to stop. |
8223 | cf_thread_join(g_hb.adjacency_tender_tid); |
8224 | } |
8225 | |
8226 | /** |
8227 | * Initialize the heartbeat subsystem. |
8228 | */ |
8229 | static void |
8230 | hb_init() |
8231 | { |
8232 | if (hb_is_initialized()) { |
8233 | WARNING("heartbeat main module is already initialized" ); |
8234 | return; |
8235 | } |
8236 | |
8237 | // Operate under a lock. Let's be paranoid everywhere. |
8238 | HB_LOCK(); |
8239 | |
8240 | // Initialize the heartbeat data structure. |
8241 | memset(&g_hb, 0, sizeof(g_hb)); |
8242 | |
8243 | // Initialize the adjacency hash. |
8244 | g_hb.adjacency = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), |
8245 | sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
8246 | |
8247 | // Initialize the on_probation hash. |
8248 | g_hb.on_probation = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), |
8249 | sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
8250 | |
8251 | // Initialize the temporary hash to map nodeid to index. |
8252 | g_hb.nodeid_to_index = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), |
8253 | sizeof(int), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); |
8254 | |
8255 | // Initialize unpublished event queue. |
8256 | cf_queue_init(&g_hb_event_listeners.external_events_queue, |
8257 | sizeof(as_hb_event_node), |
8258 | AS_HB_CLUSTER_MAX_SIZE_SOFT, true); |
8259 | |
8260 | // Initialize the mode specific state. |
8261 | hb_mode_init(); |
8262 | |
8263 | // Initialize the plugin functions. |
8264 | hb_plugin_init(); |
8265 | |
8266 | // Initialize IO channel subsystem. |
8267 | channel_init(); |
8268 | |
8269 | g_hb.status = AS_HB_STATUS_STOPPED; |
8270 | |
8271 | HB_UNLOCK(); |
8272 | } |
8273 | |
8274 | /** |
8275 | * Start the heartbeat subsystem. |
8276 | */ |
8277 | static void |
8278 | hb_start() |
8279 | { |
8280 | // Operate under a lock. Let's be paranoid everywhere. |
8281 | HB_LOCK(); |
8282 | |
8283 | if (hb_is_running()) { |
8284 | // Shutdown the heartbeat subsystem. |
8285 | hb_stop(); |
8286 | } |
8287 | |
8288 | g_hb.status = AS_HB_STATUS_RUNNING; |
8289 | |
8290 | // Initialize the heartbeat message templates. Called from here because |
8291 | // fabric needs to be initialized for this call to succeed. Fabric init |
8292 | // happens after heartbeat init. |
8293 | hb_msg_init(); |
8294 | |
8295 | // Initialize channel sub module. |
8296 | channel_start(); |
8297 | |
8298 | // Start the mode sub module |
8299 | hb_mode_start(); |
8300 | |
8301 | // Start heart beat transmitter. |
8302 | hb_tx_start(); |
8303 | |
8304 | // Start heart beat adjacency tender. |
8305 | hb_adjacency_tender_start(); |
8306 | |
8307 | HB_UNLOCK(); |
8308 | } |
8309 | |
8310 | /** |
8311 | * Shut down the heartbeat subsystem. |
8312 | */ |
8313 | static void |
8314 | hb_stop() |
8315 | { |
8316 | if (!hb_is_running()) { |
8317 | WARNING("heartbeat is already stopped" ); |
8318 | return; |
8319 | } |
8320 | |
8321 | HB_LOCK(); |
8322 | g_hb.status = AS_HB_STATUS_SHUTTING_DOWN; |
8323 | HB_UNLOCK(); |
8324 | |
8325 | // Publish pending events. Should not delay any events. |
8326 | hb_event_publish_pending(); |
8327 | |
8328 | // Shutdown mode. |
8329 | if (hb_is_mesh()) { |
8330 | mesh_stop(); |
8331 | } |
8332 | else { |
8333 | multicast_stop(); |
8334 | } |
8335 | |
8336 | // Wait for the threads to shut down. |
8337 | hb_tx_stop(); |
8338 | |
8339 | hb_adjacency_tender_stop(); |
8340 | |
8341 | // Stop channels. |
8342 | channel_stop(); |
8343 | |
8344 | g_hb.status = AS_HB_STATUS_STOPPED; |
8345 | } |
8346 | |
8347 | /** |
8348 | * Register a plugin with the heart beat system. |
8349 | */ |
8350 | static void |
8351 | hb_plugin_register(as_hb_plugin* plugin) |
8352 | { |
8353 | HB_LOCK(); |
8354 | memcpy(&g_hb.plugins[plugin->id], plugin, sizeof(as_hb_plugin)); |
8355 | HB_UNLOCK(); |
8356 | } |
8357 | |
8358 | /** |
8359 | * Check if the heartbeat recieved is duplicate or stale. |
8360 | */ |
8361 | static bool |
8362 | hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp last_send_ts) |
8363 | { |
8364 | if (as_hlc_timestamp_order_get(event->msg_hlc_ts.send_ts, last_send_ts) |
8365 | == AS_HLC_HAPPENS_BEFORE) { |
8366 | // Received a delayed heartbeat send before the current heartbeat. |
8367 | return true; |
8368 | } |
8369 | return false; |
8370 | } |
8371 | |
8372 | /** |
8373 | * Update the tracker with endpoint change status. |
8374 | */ |
8375 | static void |
8376 | hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed) |
8377 | { |
8378 | *tracker = *tracker << 1; |
8379 | if (endpoint_changed) { |
8380 | (*tracker)++; |
8381 | } |
8382 | } |
8383 | |
8384 | /** |
8385 | * Indicates if endpoint changes for this node are normal. |
8386 | */ |
8387 | static bool |
8388 | hb_endpoint_change_tracker_is_normal(uint64_t tracker) |
8389 | { |
8390 | if (tracker == 0) { |
8391 | // Normal and healthy case. |
8392 | return true; |
8393 | } |
8394 | |
8395 | uint32_t num_intervals_to_track = MIN(64, |
8396 | config_endpoint_track_intervals_get()); |
8397 | uint64_t mask = ~(~(uint64_t)0 << num_intervals_to_track); |
8398 | |
8399 | // Ignore older history. |
8400 | tracker &= mask; |
8401 | |
8402 | int flip_count = 0; |
8403 | static int nibblebits[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; |
8404 | for (; tracker != 0; tracker >>= 4) { |
8405 | flip_count += nibblebits[tracker & 0x0f]; |
8406 | } |
8407 | |
8408 | return flip_count <= config_endpoint_changes_allowed_get(); |
8409 | } |
8410 | |
8411 | |
8412 | /** |
8413 | * Indicates if the change tracker just changed. |
8414 | */ |
8415 | static bool |
8416 | hb_endpoint_change_tracker_has_changed(uint64_t tracker) |
8417 | { |
8418 | return tracker % 2; |
8419 | } |
8420 | |
8421 | /** |
8422 | * Update adjacent node data on receiving a valid pulse message. |
8423 | * |
8424 | * @return 0 if the update was successfully applied, -1 if the update should be |
8425 | * rejected. |
8426 | */ |
8427 | static int |
8428 | hb_adjacent_node_update(as_hb_channel_event* msg_event, |
8429 | as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]) |
8430 | { |
8431 | msg* msg = msg_event->msg; |
8432 | |
8433 | cf_node source = 0; |
8434 | // Channel has validated the source. Don't bother checking here. |
8435 | msg_nodeid_get(msg, &source); |
8436 | |
8437 | msg_id_get(msg, &adjacent_node->protocol_version); |
8438 | |
8439 | as_hlc_timestamp send_ts = adjacent_node->last_msg_hlc_ts.send_ts; |
8440 | |
8441 | if (hb_endpoint_change_tracker_has_changed( |
8442 | adjacent_node->endpoint_change_tracker)) { |
8443 | // Allow a little more slack for obsolete checking because the two nodes |
8444 | // might not have matching send timestamps. |
8445 | send_ts = as_hlc_timestamp_subtract_ms(send_ts, |
8446 | config_tx_interval_get()); |
8447 | } |
8448 | |
8449 | if (hb_msg_is_obsolete(msg_event, send_ts)) { |
8450 | WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64, |
8451 | send_ts, |
8452 | msg_event->msg_hlc_ts.send_ts, source); |
8453 | return -1; |
8454 | } |
8455 | |
8456 | // Populate plugin data. |
8457 | hb_plugin_msg_parse(msg, adjacent_node, g_hb.plugins, plugin_data_changed); |
8458 | |
8459 | // Get the ip address. |
8460 | as_endpoint_list* msg_endpoint_list; |
8461 | if (msg_endpoint_list_get(msg, &msg_endpoint_list) == 0 |
8462 | && !as_endpoint_lists_are_equal(adjacent_node->endpoint_list, |
8463 | msg_endpoint_list)) { |
8464 | // Update the endpoints. |
8465 | endpoint_list_copy(&adjacent_node->endpoint_list, msg_endpoint_list); |
8466 | } |
8467 | |
8468 | // Update the last updated time. |
8469 | adjacent_node->last_updated_monotonic_ts = cf_getms(); |
8470 | memcpy(&adjacent_node->last_msg_hlc_ts, &msg_event->msg_hlc_ts, |
8471 | sizeof(adjacent_node->last_msg_hlc_ts)); |
8472 | |
8473 | // Update the latency. |
8474 | int64_t latency = as_hlc_timestamp_diff_ms(msg_event->msg_hlc_ts.send_ts, |
8475 | msg_event->msg_hlc_ts.recv_ts); |
8476 | latency = latency < 0 ? -latency : latency; |
8477 | adjacent_node->avg_latency = ALPHA * latency |
8478 | + (1 - ALPHA) * adjacent_node->avg_latency; |
8479 | |
8480 | // Reset the cluster-name mismatch counter to zero. |
8481 | adjacent_node->cluster_name_mismatch_count = 0; |
8482 | |
8483 | // Check if fabric endpoints have changed. |
8484 | as_hb_plugin_node_data* curr_data = |
8485 | &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][adjacent_node->plugin_data_cycler |
8486 | % 2]; |
8487 | |
8488 | as_hb_plugin_node_data* prev_data = |
8489 | &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][(adjacent_node->plugin_data_cycler |
8490 | + 1) % 2]; |
8491 | |
8492 | as_endpoint_list* curr_fabric_endpoints = |
8493 | as_fabric_hb_plugin_get_endpoint_list(curr_data); |
8494 | as_endpoint_list* prev_fabric_endpoints = |
8495 | as_fabric_hb_plugin_get_endpoint_list(prev_data); |
8496 | |
8497 | // Endpoints changed if this is not the first update and if the endpoint |
8498 | // lists do not match. |
8499 | bool endpoints_changed = prev_fabric_endpoints != NULL |
8500 | && !as_endpoint_lists_are_equal(curr_fabric_endpoints, |
8501 | prev_fabric_endpoints); |
8502 | |
8503 | if (endpoints_changed) { |
8504 | char curr_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE]; |
8505 | char prev_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE]; |
8506 | |
8507 | as_endpoint_list_to_string(curr_fabric_endpoints, |
8508 | curr_fabric_endpoints_str, sizeof(curr_fabric_endpoints_str)); |
8509 | as_endpoint_list_to_string(prev_fabric_endpoints, |
8510 | prev_fabric_endpoints_str, sizeof(prev_fabric_endpoints_str)); |
8511 | |
8512 | TICKER_WARNING("node: %" PRIx64" fabric endpoints changed from {%s} to {%s}" , source, prev_fabric_endpoints_str, curr_fabric_endpoints_str); |
8513 | } |
8514 | |
8515 | hb_endpoint_change_tracker_update(&adjacent_node->endpoint_change_tracker, |
8516 | endpoints_changed); |
8517 | |
8518 | return 0; |
8519 | } |
8520 | |
8521 | /** |
8522 | * Indicates if a node can be considered adjacent, based on accumulated |
8523 | * statistics. |
8524 | */ |
8525 | static bool |
8526 | hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node) |
8527 | { |
8528 | return hb_endpoint_change_tracker_is_normal( |
8529 | adjacent_node->endpoint_change_tracker); |
8530 | } |
8531 | |
8532 | /** |
8533 | * Process a pulse from source having our node-id. |
8534 | */ |
8535 | static void |
8536 | hb_channel_on_self_pulse(as_hb_channel_event* msg_event) |
8537 | { |
8538 | bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 }; |
8539 | |
8540 | HB_LOCK(); |
8541 | if (hb_adjacent_node_update(msg_event, &g_hb.self_node, plugin_data_changed) |
8542 | != 0) { |
8543 | goto Exit; |
8544 | } |
8545 | |
8546 | as_hb_plugin_node_data* curr_data = |
8547 | &g_hb.self_node.plugin_data[AS_HB_PLUGIN_FABRIC][g_hb.self_node.plugin_data_cycler |
8548 | % 2]; |
8549 | as_endpoint_list* curr_fabric_endpoints = |
8550 | as_fabric_hb_plugin_get_endpoint_list(curr_data); |
8551 | |
8552 | if (!as_fabric_is_published_endpoint_list(curr_fabric_endpoints)) { |
8553 | // Mark self as having duplicate node-id. |
8554 | g_hb.self_is_duplicate = true; |
8555 | g_hb.self_duplicate_detected_ts = cf_getms(); |
8556 | |
8557 | // Found another node with duplicate node-id. |
8558 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
8559 | as_endpoint_list_to_string(curr_fabric_endpoints, endpoint_list_str, |
8560 | sizeof(endpoint_list_str)); |
8561 | TICKER_WARNING("duplicate node-id: %" PRIx64 " with fabric endpoints {%s}" , config_self_nodeid_get(), endpoint_list_str); |
8562 | } |
8563 | else { |
8564 | cf_atomic_int_incr(&g_stats.heartbeat_received_self); |
8565 | } |
8566 | |
8567 | Exit: |
8568 | HB_UNLOCK(); |
8569 | } |
8570 | |
8571 | /** |
8572 | * Process an incoming pulse message. |
8573 | */ |
8574 | static void |
8575 | hb_channel_on_pulse(as_hb_channel_event* msg_event) |
8576 | { |
8577 | msg* msg = msg_event->msg; |
8578 | cf_node source; |
8579 | |
8580 | // Print cluster breach only once per second. |
8581 | static cf_clock last_cluster_breach_print = 0; |
8582 | |
8583 | // Channel has validated the source. Don't bother checking here. |
8584 | msg_nodeid_get(msg, &source); |
8585 | |
8586 | if (source == config_self_nodeid_get()) { |
8587 | hb_channel_on_self_pulse(msg_event); |
8588 | // Ignore self heartbeats. |
8589 | return; |
8590 | } |
8591 | |
8592 | HB_LOCK(); |
8593 | |
8594 | as_hb_adjacent_node adjacent_node = { 0 }; |
8595 | |
8596 | bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 }; |
8597 | bool is_in_adjacency = (hb_adjacent_node_get(source, &adjacent_node) == 0); |
8598 | bool should_be_on_probation = false; |
8599 | |
8600 | if (!is_in_adjacency) { |
8601 | hb_on_probation_node_get(source, &adjacent_node); |
8602 | } |
8603 | |
8604 | // Update the adjacent node with contents of the message. |
8605 | if (hb_adjacent_node_update(msg_event, &adjacent_node, plugin_data_changed) |
8606 | != 0) { |
8607 | // Update rejected. |
8608 | goto Exit; |
8609 | } |
8610 | |
8611 | // Check if this node needs to be on probation. |
8612 | should_be_on_probation = !hb_node_can_consider_adjacent(&adjacent_node); |
8613 | |
8614 | cf_atomic_int_incr(&g_stats.heartbeat_received_foreign); |
8615 | |
8616 | bool is_new = !should_be_on_probation && !is_in_adjacency; |
8617 | |
8618 | if (is_new) { |
8619 | int mcsize = config_mcsize(); |
8620 | // Note: adjacency list does not contain self node hence |
8621 | // (mcsize - 1) in the check. |
8622 | if (cf_shash_get_size(g_hb.adjacency) >= (mcsize - 1)) { |
8623 | if (last_cluster_breach_print != (cf_getms() / 1000L)) { |
8624 | WARNING("ignoring node: %" PRIx64" - exceeding maximum supported cluster size %d" , |
8625 | source, mcsize); |
8626 | last_cluster_breach_print = cf_getms() / 1000L; |
8627 | } |
8628 | goto Exit; |
8629 | } |
8630 | } |
8631 | |
8632 | // Move the node to appropriate hash. |
8633 | cf_shash_put(should_be_on_probation ? g_hb.on_probation : g_hb.adjacency, |
8634 | &source, &adjacent_node); |
8635 | |
8636 | // Maintain mutual exclusion between adjacency and on_probation hashes. |
8637 | cf_shash_delete(should_be_on_probation ? g_hb.adjacency : g_hb.on_probation, |
8638 | &source); |
8639 | |
8640 | if (is_new) { |
8641 | // Publish event if this is a new node. |
8642 | INFO("node arrived %" PRIx64, source); |
8643 | hb_event_queue(AS_HB_INTERNAL_NODE_ARRIVE, &source, 1); |
8644 | } |
8645 | else if (should_be_on_probation && is_in_adjacency) { |
8646 | // This node needs to be on probation, most likely due to duplicate |
8647 | // node-ids. |
8648 | WARNING("node expired %" PRIx64" - potentially duplicate node-id" , source); |
8649 | hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, &source, 1); |
8650 | } |
8651 | |
8652 | Exit: |
8653 | HB_UNLOCK(); |
8654 | |
8655 | // Publish any pending node arrival events. |
8656 | hb_event_publish_pending(); |
8657 | |
8658 | if (!should_be_on_probation) { |
8659 | // Call plugin change listeners outside of a lock to prevent deadlocks. |
8660 | for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) { |
8661 | if (plugin_data_changed[i] && g_hb.plugins[i].change_listener) { |
8662 | // Notify that data for this plugin for the source node has |
8663 | // changed. |
8664 | DETAIL("plugin data for node %" PRIx64" changed for plugin %d" , |
8665 | source, i); |
8666 | (g_hb.plugins[i]).change_listener(source); |
8667 | } |
8668 | } |
8669 | } |
8670 | } |
8671 | |
8672 | /** |
8673 | * Process an incoming heartbeat message. |
8674 | */ |
8675 | static void |
8676 | hb_channel_on_msg_rcvd(as_hb_channel_event* event) |
8677 | { |
8678 | msg* msg = event->msg; |
8679 | as_hb_msg_type type; |
8680 | msg_type_get(msg, &type); |
8681 | |
8682 | switch (type) { |
8683 | case AS_HB_MSG_TYPE_PULSE: // A pulse message. Update the adjacent node data. |
8684 | hb_channel_on_pulse(event); |
8685 | break; |
8686 | default: // Ignore other messages. |
8687 | break; |
8688 | } |
8689 | } |
8690 | |
8691 | /** |
8692 | * Increase the cluster-name mismatch counter the node. |
8693 | */ |
8694 | static void |
8695 | hb_handle_cluster_name_mismatch(as_hb_channel_event* event) |
8696 | { |
8697 | HB_LOCK(); |
8698 | |
8699 | as_hb_adjacent_node adjacent_node; |
8700 | memset(&adjacent_node, 0, sizeof(adjacent_node)); |
8701 | |
8702 | if (hb_adjacent_node_get(event->nodeid, &adjacent_node) != 0) { |
8703 | // Node does not exist in the adjacency list |
8704 | goto Exit; |
8705 | } |
8706 | |
8707 | if (hb_msg_is_obsolete(event, adjacent_node.last_msg_hlc_ts.send_ts)) { |
8708 | WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64, |
8709 | adjacent_node.last_msg_hlc_ts.send_ts, |
8710 | event->msg_hlc_ts.send_ts, event->nodeid); |
8711 | goto Exit; |
8712 | } |
8713 | |
8714 | // Update the cluster_name_mismatch counter. |
8715 | adjacent_node.cluster_name_mismatch_count++; |
8716 | cf_shash_put(g_hb.adjacency, &event->nodeid, &adjacent_node); |
8717 | Exit: |
8718 | HB_UNLOCK(); |
8719 | } |
8720 | |
8721 | /** |
8722 | * Process channel events. |
8723 | */ |
8724 | static void |
8725 | hb_channel_event_process(as_hb_channel_event* event) |
8726 | { |
8727 | // Deal with pulse messages here. |
8728 | switch (event->type) { |
8729 | case AS_HB_CHANNEL_MSG_RECEIVED: |
8730 | hb_channel_on_msg_rcvd(event); |
8731 | break; |
8732 | case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: |
8733 | hb_handle_cluster_name_mismatch(event); |
8734 | break; |
8735 | default: // Ignore channel active and inactive events. Rather rely on the adjacency |
8736 | // tender to expire nodes. |
8737 | break; |
8738 | } |
8739 | } |
8740 | |
8741 | /** |
8742 | * Dump hb mode state to logs. |
8743 | * @param verbose enables / disables verbose logging. |
8744 | */ |
8745 | static void |
8746 | hb_mode_dump(bool verbose) |
8747 | { |
8748 | if (hb_is_mesh()) { |
8749 | mesh_dump(verbose); |
8750 | } |
8751 | else { |
8752 | multicast_dump(verbose); |
8753 | } |
8754 | } |
8755 | |
8756 | /** |
8757 | * Reduce function to dump hb node info to log file. |
8758 | */ |
8759 | static int |
8760 | hb_dump_reduce(const void* key, void* data, void* udata) |
8761 | { |
8762 | const cf_node* nodeid = (const cf_node*)key; |
8763 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
8764 | |
8765 | char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; |
8766 | as_endpoint_list_to_string(adjacent_node->endpoint_list, endpoint_list_str, |
8767 | sizeof(endpoint_list_str)); |
8768 | |
8769 | INFO("\tHB %s Node: node-id %" PRIx64" protocol %" PRIu32" endpoints {%s} last-updated %" PRIu64 " latency-ms %" PRIu64 , |
8770 | (char*)udata, |
8771 | *nodeid, adjacent_node->protocol_version, endpoint_list_str, |
8772 | adjacent_node->last_updated_monotonic_ts, adjacent_node->avg_latency); |
8773 | |
8774 | return CF_SHASH_OK; |
8775 | } |
8776 | |
8777 | /** |
8778 | * Dump hb state to logs. |
8779 | * @param verbose enables / disables verbose logging. |
8780 | */ |
8781 | static void |
8782 | hb_dump(bool verbose) |
8783 | { |
8784 | HB_LOCK(); |
8785 | |
8786 | INFO("HB Adjacency Size: %d" , cf_shash_get_size(g_hb.adjacency)); |
8787 | |
8788 | if (verbose) { |
8789 | cf_shash_reduce(g_hb.adjacency, hb_dump_reduce, "Adjacent" ); |
8790 | } |
8791 | |
8792 | if (cf_shash_get_size(g_hb.on_probation)) { |
8793 | INFO("HB On-probation Size: %d" , cf_shash_get_size(g_hb.on_probation)); |
8794 | |
8795 | if (verbose) { |
8796 | cf_shash_reduce(g_hb.on_probation, hb_dump_reduce, "On-probation" ); |
8797 | } |
8798 | } |
8799 | |
8800 | HB_UNLOCK(); |
8801 | } |
8802 | |
8803 | /** |
8804 | * Compute a complement / inverted adjacency graph for input nodes such that |
8805 | * entry |
8806 | * |
8807 | * inverted_graph[i][j] = 0 iff node[i] and node[j] are in each others adjacency |
8808 | * lists. That is they have a bidirectional network link active between them. |
8809 | * |
8810 | * else |
8811 | * |
8812 | * inverted_graph[i][j] > 0 iff there is no link or a unidirectional link |
8813 | * between them. |
8814 | * |
8815 | * |
8816 | * @param nodes the input vector of nodes. |
8817 | * @param inverted_graph (output) a (num_nodes x num_nodes ) 2D byte array. |
8818 | */ |
8819 | static void |
8820 | hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph) |
8821 | { |
8822 | HB_LOCK(); |
8823 | int num_nodes = cf_vector_size(nodes); |
8824 | |
8825 | for (int i = 0; i < num_nodes; i++) { |
8826 | for (int j = 0; j < num_nodes; j++) { |
8827 | inverted_graph[i][j] = 2; |
8828 | } |
8829 | cf_node nodeid = 0; |
8830 | cf_vector_get(nodes, i, &nodeid); |
8831 | cf_shash_put(g_hb.nodeid_to_index, &nodeid, &i); |
8832 | } |
8833 | |
8834 | cf_node self_nodeid = config_self_nodeid_get(); |
8835 | int self_node_index = -1; |
8836 | cf_shash_get(g_hb.nodeid_to_index, &self_nodeid, &self_node_index); |
8837 | |
8838 | for (int i = 0; i < num_nodes; i++) { |
8839 | // Mark the node connected from itself, i.e, disconnected in the |
8840 | // inverted graph. |
8841 | inverted_graph[i][i] = 0; |
8842 | |
8843 | cf_node node = *(cf_node*)cf_vector_getp(nodes, i); |
8844 | as_hb_adjacent_node node_info; |
8845 | |
8846 | if (hb_adjacent_node_get(node, &node_info) == 0) { |
8847 | if (self_node_index >= 0) { |
8848 | // Self node will not have plugin data. But the fact that this |
8849 | // node has an adjacent node indicates that is is in our |
8850 | // adjacency list. Adjust the graph. |
8851 | inverted_graph[i][self_node_index]--; |
8852 | inverted_graph[self_node_index][i]--; |
8853 | } |
8854 | |
8855 | cf_node* adjacency_list = NULL; |
8856 | size_t adjacency_length = 0; |
8857 | hb_adjacent_node_adjacency_get(&node_info, &adjacency_list, &adjacency_length); |
8858 | |
8859 | for (int j = 0; j < adjacency_length; j++) { |
8860 | int other_node_index = -1; |
8861 | cf_shash_get(g_hb.nodeid_to_index, &adjacency_list[j], |
8862 | &other_node_index); |
8863 | if (other_node_index < 0) { |
8864 | // This node is not in the input set of nodes. |
8865 | continue; |
8866 | } |
8867 | |
8868 | if (i != other_node_index) { |
8869 | inverted_graph[i][other_node_index]--; |
8870 | inverted_graph[other_node_index][i]--; |
8871 | } |
8872 | } |
8873 | } |
8874 | } |
8875 | |
8876 | // Cleanup the temporary hash. |
8877 | cf_shash_delete_all(g_hb.nodeid_to_index); |
8878 | |
8879 | HB_UNLOCK(); |
8880 | } |
8881 | |
8882 | /** |
8883 | * Compute the nodes to evict from the input nodes so that remaining nodes form |
8884 | * a clique, based on adjacency lists using minimal vertex cover. |
8885 | * |
8886 | * The minimal vertex cover on this graph is the set of nodes that should be |
8887 | * removed to result in a clique on the remaining nodes. This implementation is |
8888 | * an approximation of the minimal vertex cover. The notion is to keep removing |
8889 | * vertices having the highest degree until there are no more edges remaining. |
8890 | * The heuristic gets rid of the more problematic nodes first. |
8891 | * |
8892 | * @param nodes input cf_node vector. |
8893 | * @param nodes_to_evict output cf_node clique array, that is initialized. |
8894 | */ |
static void
hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict)
{
	int num_nodes = cf_vector_size(nodes);

	if (num_nodes == 0) {
		// Nothing to do.
		return;
	}

	// Flat num_nodes x num_nodes byte matrix backing the inverted graph.
	int graph_alloc_size = sizeof(uint8_t) * num_nodes * num_nodes;
	void* graph_data = MSG_BUFF_ALLOC(graph_alloc_size);

	if (!graph_data) {
		CRASH("error allocating space for clique finding data structure" );
	}

	// Row pointers into the flat allocation - inverted_graph[i] is row i.
	// NOTE(review): VLA of pointers; num_nodes is presumably bounded by the
	// maximum cluster size - confirm.
	uint8_t* inverted_graph[num_nodes];
	inverted_graph[0] = graph_data;
	for (int i = 1; i < num_nodes; i++) {
		inverted_graph[i] = *inverted_graph + num_nodes * i;
	}

	hb_adjacency_graph_invert(nodes, inverted_graph);

	// Count the number of edges in the inverted graph. These edges are the ones
	// that need to be removed so that the remaining nodes form a clique in the
	// adjacency graph. Also for performance get hold of the self node index in
	// the nodes vector.
	int edge_count = 0;
	int self_node_index = -1;
	for (int i = 0; i < num_nodes; i++) {
		cf_node node = 0;
		cf_vector_get(nodes, i, &node);
		if (node == config_self_nodeid_get()) {
			self_node_index = i;
		}

		for (int j = 0; j < num_nodes; j++) {
			if (inverted_graph[i][j]) {
				edge_count++;
			}
		}
	}

	// Start with an empty eviction list.
	// NOTE(review): when nodes_to_evict is empty this passes end index -1 -
	// presumably cf_vector_delete_range tolerates that; confirm.
	cf_vector_delete_range(nodes_to_evict, 0,
			cf_vector_size(nodes_to_evict) - 1);

	// Since we always decide to retain self node, first get rid of all nodes
	// having missing links to self node.
	if (self_node_index >= 0) {
		for (int i = 0; i < num_nodes; i++) {
			if (inverted_graph[self_node_index][i]
					|| inverted_graph[i][self_node_index]) {
				cf_node to_evict = 0;
				cf_vector_get(nodes, i, &to_evict);
				DEBUG("marking node %" PRIx64" for clique based eviction" ,
						to_evict);

				cf_vector_append(nodes_to_evict, &to_evict);

				// Remove all edges attached to the removed node.
				for (int j = 0; j < num_nodes; j++) {
					if (inverted_graph[i][j]) {
						inverted_graph[i][j] = 0;
						edge_count--;
					}
					if (inverted_graph[j][i]) {
						inverted_graph[j][i] = 0;
						edge_count--;
					}
				}
			}
		}
	}

	// Greedy approximate minimal vertex cover: repeatedly evict the
	// highest-degree vertex of the inverted graph until no edges remain.
	while (edge_count > 0) {
		// Find vertex with highest degree.
		cf_node max_degree_node = 0;
		int max_degree_node_idx = -1;
		int max_degree = 0;

		for (int i = 0; i < num_nodes; i++) {
			cf_node to_evict = 0;
			cf_vector_get(nodes, i, &to_evict);

			if (vector_find(nodes_to_evict, &to_evict) >= 0) {
				// We have already decided to evict this node.
				continue;
			}

			if (to_evict == config_self_nodeid_get()) {
				// Do not evict self.
				continue;
			}

			// Get the degree of this node.
			int degree = 0;
			for (int j = 0; j < num_nodes; j++) {
				if (inverted_graph[i][j]) {
					degree++;
				}
			}

			DETAIL("inverted degree for node %" PRIx64" is %d" ,
					to_evict, degree);

			// See if this node has a higher degree. On ties choose the node
			// with a smaller nodeid
			if (degree > max_degree
					|| (degree == max_degree && max_degree_node > to_evict)) {
				max_degree = degree;
				max_degree_node = to_evict;
				max_degree_node_idx = i;
			}
		}

		if (max_degree_node_idx < 0) {
			// We are done no node to evict.
			break;
		}

		DEBUG("marking node %" PRIx64" with degree %d for clique based eviction" ,
				max_degree_node, max_degree);

		cf_vector_append(nodes_to_evict, &max_degree_node);

		// Remove all edges attached to the removed node.
		for (int i = 0; i < num_nodes; i++) {
			if (inverted_graph[max_degree_node_idx][i]) {
				inverted_graph[max_degree_node_idx][i] = 0;
				edge_count--;
			}
			if (inverted_graph[i][max_degree_node_idx]) {
				inverted_graph[i][max_degree_node_idx] = 0;
				edge_count--;
			}
		}
	}

	MSG_BUFF_FREE(graph_data, graph_alloc_size);
}
9037 | |
9038 | /** |
9039 | * Reduce function to iterate over plugin data for all adjacent nodes. |
9040 | */ |
9041 | static int |
9042 | hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata) |
9043 | { |
9044 | const cf_node* nodeid = (const cf_node*)key; |
9045 | as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; |
9046 | as_hb_adjacecny_iterate_reduce_udata* reduce_udata = |
9047 | (as_hb_adjacecny_iterate_reduce_udata*)udata; |
9048 | |
9049 | size_t plugin_data_size = |
9050 | adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler |
9051 | % 2].data_size; |
9052 | void* plugin_data = |
9053 | plugin_data_size ? |
9054 | adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler |
9055 | % 2].data : NULL; |
9056 | |
9057 | reduce_udata->iterate_fn(*nodeid, plugin_data, plugin_data_size, |
9058 | adjacent_node->last_updated_monotonic_ts, |
9059 | &adjacent_node->last_msg_hlc_ts, reduce_udata->udata); |
9060 | |
9061 | return CF_SHASH_OK; |
9062 | } |
9063 | |
9064 | /** |
9065 | * Call the iterate method on all nodes in current adjacency list. Note plugin |
9066 | * data can still be NULL if the plugin data failed to parse the plugin data. |
9067 | * |
9068 | * @param pluginid the plugin identifier. |
9069 | * @param iterate_fn the iterate function invoked for plugin data forevery node. |
9070 | * @param udata passed as is to the iterate function. Useful for getting results |
9071 | * out of the iteration. NULL if there is no plugin data. |
9072 | * @return the size of the plugin data. 0 if there is no plugin data. |
9073 | */ |
9074 | static void |
9075 | hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, |
9076 | as_hb_plugin_data_iterate_fn iterate_fn, void* udata) |
9077 | { |
9078 | HB_LOCK(); |
9079 | |
9080 | as_hb_adjacecny_iterate_reduce_udata reduce_udata; |
9081 | reduce_udata.pluginid = pluginid; |
9082 | reduce_udata.iterate_fn = iterate_fn; |
9083 | reduce_udata.udata = udata; |
9084 | cf_shash_reduce(g_hb.adjacency, hb_plugin_data_iterate_reduce, |
9085 | &reduce_udata); |
9086 | |
9087 | HB_UNLOCK(); |
9088 | } |
9089 | |