1/*
2 * hb.c
3 *
4 * Copyright (C) 2012-2017 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23#include "fabric/hb.h"
24
25#include <errno.h>
26#include <limits.h>
27#include <math.h>
28#include <pthread.h>
29#include <stdio.h>
30#include <sys/param.h>
31#include <sys/types.h>
32#include <zlib.h>
33
34#include "citrusleaf/alloc.h"
35#include "citrusleaf/cf_atomic.h"
36#include "citrusleaf/cf_clock.h"
37#include "citrusleaf/cf_hash_math.h"
38#include "citrusleaf/cf_queue.h"
39
40#include "cf_thread.h"
41#include "dns.h"
42#include "fault.h"
43#include "node.h"
44#include "shash.h"
45#include "socket.h"
46
47#include "base/cfg.h"
48#include "base/health.h"
49#include "base/stats.h"
50#include "base/thr_info.h"
51#include "fabric/endpoint.h"
52#include "fabric/fabric.h"
53#include "fabric/partition_balance.h"
54
55/*
56 * Overview
57 * ========
58 * The heartbeat subsystem is a core clustering module that discovers nodes in
59 * the cluster and monitors connectivity to them. This subsystem maintains an
60 * "adjacency list", which is the list of nodes deemed to be alive and connected
61 * at any instance in time.
62 *
 * The heartbeat subsystem is divided into four sub modules
64 * 1. Config
65 * 2. Channel
66 * 3. Mesh
67 * 4. Main
68 *
69 * Config
70 * ------
71 * This sub module deals with overall heartbeat subsystem configuration and
72 * dynamic updates to configuration.
73 *
74 * Channel
75 * -------
76 * This sub module is responsible for maintaining a channel between this node
77 * and all known nodes. The channel sub module provides the ability to broadcast
 * or unicast messages to known nodes.
79 *
80 * Other modules interact with the channel sub module primarily through events
81 * raised by the channel sub module. The events help other sub modules infer
82 * connectivity status to known nodes and react to incoming heartbeat message
83 * from other nodes.
84 *
 * Depending on the configured mode (mesh, multicast) the channels between this
86 * node and other nodes could be
87 * 1. TCP and hence unicast. One per pair of nodes.
88 * 2. Multicast with UDP. One per cluster.
89 *
90 * Mesh
91 * ----
92 * This sub module is responsible for discovering cluster members. New nodes are
 * discovered via adjacency lists published in the heartbeats of known nodes.
94 * The mesh module boots up using configured seed nodes.
95 *
96 * Main
97 * ----
98 * This sub module orchestrates other modules and hence main. Its primary
99 * responsibility is to maintain the adjacency list.
100 *
101 * Heartbeat messages
102 * ==================
103 *
104 * Every heartbeat message contains
105 * 1. the source node's nodeid
106 * 2. the source node's published ip address
107 * 3. the source node's published port.
108 *
109 * There are the following types of heartbeat messages
110 * 1. Pulse - messages sent at periodic intervals. Will contain current
111 * adjacency lists
112 * 2. Info request - message sent in the mesh mode, to a known mesh node,
113 * in order to get ip address and port of a newly discovered node.
114 * 3. Info reply - message sent in response to an info request. Returns
115 * the node's ip address and port.
116 *
117 * Message conventions
118 * -------------------
119 * 1. Published adjacency will always contain the source node.
120 *
121 * Design philosophy
122 * =================
123 *
124 * Locking vs single threaded event loop.
125 * --------------------------------------
126 * This first cut leans toward using locks instead of single threaded event
127 * loops to protect critical data. The choice is driven by the fact that
128 * synchronous external and inter-sub module interaction looked like more work
129 * with single threaded event loops. The design chooses simplicity over
130 * performance given the lower volumes of events that need to be processed here
131 * as compared to the transaction processing code. The locks are coarse, one per
132 * sub module and re-entrant. They are used generously and no function makes an
 * assumption of prior locks being held.
134 *
135 * Inter-module interactions in some cases are via synchronous function calls,
136 * which run the risk of deadlocks. For now, deadlocks should not happen.
137 * However, if this ideology complicates code, inter-module interaction will be
138 * rewritten to use asynchronous event queues.
139 *
140 * Locking policy
141 * ==============
142 *
143 * 1. Lock as much as you can. The locks are re-entrant. This is not a critical
144 * high volume code path, and hence correctness with simplicity is preferred.
145 * Any read / write access to module state should be under a lock.
146 * 2. Preventing deadlocks
147 * a. The enforced lock order is
 * 1. Protocol lock (SET_PROTOCOL_LOCK) Used to ensure the protocol set is
149 * atomic.
150 * 2. Main module (HB_LOCK)
151 * 3. Mesh and multicast modules (MESH_LOCK)
152 * 4. Channel (CHANNEL_LOCK)
153 * 5. Config (HB_CONFIG_LOCK)
154 * Always make sure every thread acquires locks in this order ONLY. In terms
155 * of functions calls only lower numbered modules can call functions from the
 * higher numbered modules while holding onto their locks.
157 * 3. Events raised / messages passed to listeners should be outside the
158 * module's lock.
159 *
160 * Guidelines for message plugins
161 * ==============================
162 * The parse data functions should NOT hold any locks and thus avert deadlocks.
163 *
164 * TODO
165 * ====
166 * 1. Extend to allow hostnames in mesh mode across the board.
167 */
168
169/*
170 * ----------------------------------------------------------------------------
171 * Macros
172 * ----------------------------------------------------------------------------
173 */
174
175/*
176 * ----------------------------------------------------------------------------
177 * Channel
178 * ----------------------------------------------------------------------------
179 */
180
181/**
182 * Size of the poll events set.
183 */
184#define POLL_SZ 1024
185
186/**
187 * The number of bytes for the message length on the wire.
188 */
189#define MSG_WIRE_LENGTH_SIZE 4
190
191/**
192 * Channel idle interval after which check for inactive channel is triggered.
193 */
194#define CHANNEL_IDLE_CHECK_PERIOD (CHANNEL_NODE_READ_IDLE_TIMEOUT() / 2)
195
196/**
197 * A channel times out if there is no msg received from a node in this interval.
198 * Set to a fraction of node timeout so that a new channel could be set up to
199 * recover from a potentially bad connection before the node times out.
200 */
201#define CHANNEL_NODE_READ_IDLE_TIMEOUT() \
202(PULSE_TRANSMIT_INTERVAL() \
203 * MAX(2, config_max_intervals_missed_get() / 3))
204
205/**
206 * Acquire a lock on the entire channel sub module.
207 */
208#define CHANNEL_LOCK() (pthread_mutex_lock(&g_channel_lock))
209
210/**
211 * Relinquish the lock on the entire channel sub module.
212 */
213#define CHANNEL_UNLOCK() (pthread_mutex_unlock(&g_channel_lock))
214
215/*
216 * ----------------------------------------------------------------------------
217 * Mesh and Multicast
218 * ----------------------------------------------------------------------------
219 */
220
221/**
222 * Read write timeout (in ms).
223 */
224#define MESH_RW_TIMEOUT 5
225
226/**
227 * Size of the network header.
228 *
229 * Maximum size of IPv4 header - 20 bytes (assuming no variable length fields)
230 * Fixed size of IPv6 header - 40 bytes (assuming no extension headers)
231 * Maximum size of TCP header - 60 Bytes
232 * Size of UDP header (fixed) - 8 bytes
233 * So maximum size of empty TCP datagram - 60 + 20 = 80 bytes
234 * So maximum size of empty IPv4 UDP datagram - 20 + 8 = 28 bytes
235 * So maximum size of empty IPv6 UDP datagram - 40 + 8 = 48 bytes
236 *
237 * Being conservative and assuming 30 bytes for IPv4 UDP header and 50 bytes for
238 * IPv6 UDP header.
239 */
240#define UDP_HEADER_SIZE_MAX 50
241
242/**
243 * Expected ratio - (input size) / (compressed size). Assuming 40% decrease in
244 * size after compression.
245 */
246#define MSG_COMPRESSION_RATIO (1.0 / 0.60)
247
248/**
249 * Mesh timeout for pending nodes.
250 */
251#define MESH_PENDING_TIMEOUT (CONNECT_TIMEOUT())
252
253/**
254 * Mesh inactive timeout after which a mesh node will be forgotten.
255 */
256#define MESH_INACTIVE_TIMEOUT (10 * HB_NODE_TIMEOUT())
257
258/**
259 * Mesh timeout for getting the endpoint for a node after which this node will
260 * be forgotten.
261 */
262#define MESH_ENDPOINT_UNKNOWN_TIMEOUT (HB_NODE_TIMEOUT())
263
264/**
265 * Intervals at which mesh tender runs.
266 */
267#define MESH_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL())
268
269/**
270 * Intervals at which attempts to resolve unresolved seed hostname will be made.
271 */
272#define MESH_SEED_RESOLVE_ATTEMPT_INTERVAL() (HB_NODE_TIMEOUT())
273
274/**
275 * Intervals at which conflict checks is enabled.
276 */
277#define MESH_CONFLICT_CHECK_INTERVAL() (5 * HB_NODE_TIMEOUT())
278
279/**
280 * Duration for which conflicts are checked.
281 */
282#define MESH_CONFLICT_CHECK_DURATION() (MESH_CONFLICT_CHECK_INTERVAL() / 5)
283
284/**
285 * Acquire a lock on the entire mesh sub module.
286 */
287#define MESH_LOCK() (pthread_mutex_lock(&g_mesh_lock))
288
289/**
290 * Relinquish the lock on the entire mesh sub module.
291 */
292#define MESH_UNLOCK() (pthread_mutex_unlock(&g_mesh_lock))
293
294/**
295 * Acquire a lock on the entire multicast sub module.
296 */
297#define MULTICAST_LOCK() (pthread_mutex_lock(&g_multicast_lock))
298
299/**
300 * Relinquish the lock on the entire multicast sub module.
301 */
302#define MULTICAST_UNLOCK() (pthread_mutex_unlock(&g_multicast_lock))
303
304/*
305 * ----------------------------------------------------------------------------
306 * Main
307 * ----------------------------------------------------------------------------
308 */
309
310/**
311 * The identifier for heartbeat protocol version 3.
312 */
313#define HB_PROTOCOL_V3_IDENTIFIER 0x6864
314
315/**
316 * Maximum length of hb protocol string.
317 */
318#define HB_PROTOCOL_STR_MAX_LEN 16
319
320/**
321 * Default allocation size for plugin data.
322 */
323#define HB_PLUGIN_DATA_DEFAULT_SIZE 128
324
325/**
326 * Block size for allocating node plugin data. Ensure the allocation is in
327 * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating.
328 */
329#define HB_PLUGIN_DATA_BLOCK_SIZE 128
330
331/**
332 * Message scratch size for v3 HB messages. To accommodate 64 node cluster.
333 */
334#define AS_HB_MSG_SCRATCH_SIZE 1024
335
336/**
 * A soft limit for the maximum cluster size. Meant to optimize hash and list
338 * data structures and not as a limit on the number of nodes.
339 */
340#define AS_HB_CLUSTER_MAX_SIZE_SOFT 200
341
342/**
343 * Maximum event listeners.
344 */
345#define AS_HB_EVENT_LISTENER_MAX 7
346
347/**
348 * Maximum permissible cluster-name mismatch per node.
349 */
350#define CLUSTER_NAME_MISMATCH_MAX 2
351
352/**
353 * Timeout for deeming a node dead based on received heartbeats.
354 */
355#define HB_NODE_TIMEOUT() \
356((config_max_intervals_missed_get() * config_tx_interval_get()))
357
358/**
 * Intervals at which heartbeats are sent.
360 */
361#define PULSE_TRANSMIT_INTERVAL() \
362(MAX(config_tx_interval_get(), AS_HB_TX_INTERVAL_MS_MIN))
363
364/**
365 * Intervals at which adjacency tender runs.
366 */
367#define ADJACENCY_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL())
368
369/**
 * Intervals at which adjacency tender runs in anticipation of additional node
371 * depart events.
372 */
373#define ADJACENCY_FAST_TEND_INTERVAL (MIN(ADJACENCY_TEND_INTERVAL, 10))
374
375/**
376 * Acquire a lock on the external event publisher.
377 */
378#define EXTERNAL_EVENT_PUBLISH_LOCK() \
379(pthread_mutex_lock(&g_external_event_publish_lock))
380
381/**
382 * Relinquish the lock on the external event publisher.
383 */
384#define EXTERNAL_EVENT_PUBLISH_UNLOCK() \
385(pthread_mutex_unlock(&g_external_event_publish_lock))
386
387/**
388 * Acquire a lock on the heartbeat main module.
389 */
390#define HB_LOCK() (pthread_mutex_lock(&g_hb_lock))
391
392/**
393 * Relinquish the lock on the heartbeat main module.
394 */
395#define HB_UNLOCK() (pthread_mutex_unlock(&g_hb_lock))
396
397/**
398 * Weightage of current latency over current moving average. For now weigh
399 * recent values heavily over older values.
400 */
401#define ALPHA (0.65)
402
403/*
404 * ----------------------------------------------------------------------------
405 * Common
406 * ----------------------------------------------------------------------------
407 */
408
409/**
410 * The default MTU for multicast in case device discovery fails.
411 */
412#define DEFAULT_MIN_MTU 1500
413
414/**
415 * Maximum memory size allocated on the call stack.
416 */
417#define STACK_ALLOC_LIMIT (16 * 1024)
418
419/**
420 * Max string length for an endpoint list converted to a string.
421 */
422#define ENDPOINT_LIST_STR_SIZE 1024
423
424/**
425 * A hard limit on the buffer size for parsing incoming messages.
426 */
427#define MSG_BUFFER_MAX_SIZE (10 * 1024 * 1024)
428
429#ifndef ASC
430#define ASC (2 << 2)
431#endif
432
433/**
 * Connection initiation timeout, capped at 100 ms.
435 */
436#define CONNECT_TIMEOUT() (MIN(100, config_tx_interval_get()))
437
438/**
439 * Allocate a buffer for heart beat messages. Larger buffers are heap allocated
440 * to prevent stack overflows.
441 */
442#define MSG_BUFF_ALLOC(size) ( \
443 (size) <= MSG_BUFFER_MAX_SIZE ? \
444 (((size) > STACK_ALLOC_LIMIT) ? \
445 cf_malloc(size) : alloca(size)) : NULL)
446
447/**
448 * Allocate a buffer for heart beat messages. Larger buffers are heap allocated
449 * to prevent stack overflows. Crashes the process on failure to allocate the
450 * buffer.
451 */
452#define MSG_BUFF_ALLOC_OR_DIE(size, crash_msg, ...) \
453({ \
454 uint8_t* retval = MSG_BUFF_ALLOC((size)); \
455 if (!retval) { \
456 CRASH(crash_msg, ##__VA_ARGS__); \
457 } \
458 retval; \
459})
460
461/**
462 * Free the buffer allocated by MSG_BUFF_ALLOC
463 */
464#define MSG_BUFF_FREE(buffer, size) \
465if (((size) > STACK_ALLOC_LIMIT) && buffer) {cf_free(buffer);}
466
467/**
468 * Acquire a lock on the entire config sub module.
469 */
470#define HB_CONFIG_LOCK() (pthread_mutex_lock(&g_hb_config_lock))
471
472/**
473 * Relinquish the lock on the entire config sub module.
474 */
475#define HB_CONFIG_UNLOCK() (pthread_mutex_unlock(&g_hb_config_lock))
476
477/**
478 * Acquire a lock while setting heartbeat protocol dynamically.
479 */
480#define SET_PROTOCOL_LOCK() (pthread_mutex_lock(&g_set_protocol_lock))
481
482/**
483 * Relinquish the lock after setting heartbeat protocol dynamically.
484 */
485#define SET_PROTOCOL_UNLOCK() (pthread_mutex_unlock(&g_set_protocol_lock))
486
487/**
488 * Logging macros.
489 */
490#define CRASH(format, ...) cf_crash(AS_HB, format, ##__VA_ARGS__)
491#define CRASH_NOSTACK(format, ...) cf_crash_nostack(AS_HB, format, ##__VA_ARGS__)
492#define WARNING(format, ...) cf_warning(AS_HB, format, ##__VA_ARGS__)
493#define TICKER_WARNING(format, ...) \
494cf_ticker_warning(AS_HB, format, ##__VA_ARGS__)
495#define INFO(format, ...) cf_info(AS_HB, format, ##__VA_ARGS__)
496#define DEBUG(format, ...) cf_debug(AS_HB, format, ##__VA_ARGS__)
497#define DETAIL(format, ...) cf_detail(AS_HB, format, ##__VA_ARGS__)
/**
 * Soft assert - logs a warning (does not abort) when the expression is false.
 * Wrapped in do/while(0) so the macro behaves as one statement and does not
 * alter if/else binding at the call site (dangling-else hazard in the
 * previous bare-"if" definition).
 */
#define ASSERT(expression, message, ...) \
do { \
	if (!(expression)) {WARNING(message, ##__VA_ARGS__);} \
} while (0)
500
501/*
502 * ----------------------------------------------------------------------------
503 * Private internal data structures
504 * ----------------------------------------------------------------------------
505 */
506
507/*
508 * ----------------------------------------------------------------------------
509 * Common
510 * ----------------------------------------------------------------------------
511 */
512
513/**
514 * Heartbeat subsystem state.
515 */
516typedef enum
517{
518 AS_HB_STATUS_UNINITIALIZED,
519 AS_HB_STATUS_RUNNING,
520 AS_HB_STATUS_SHUTTING_DOWN,
521 AS_HB_STATUS_STOPPED
522} as_hb_status;
523
524/*
525 * ----------------------------------------------------------------------------
526 * Mesh related
527 * ----------------------------------------------------------------------------
528 */
529
530/**
531 * Mesh node status enum.
532 */
533typedef enum
534{
535 /**
536 * The mesh node has an active channel.
537 */
538 AS_HB_MESH_NODE_CHANNEL_ACTIVE,
539
540 /**
541 * The mesh node is waiting for an active channel.
542 */
543 AS_HB_MESH_NODE_CHANNEL_PENDING,
544
545 /**
546 * The mesh node does not have an active channel.
547 */
548 AS_HB_MESH_NODE_CHANNEL_INACTIVE,
549
550 /**
551 * The ip address and port for this node are not yet known.
552 */
553 AS_HB_MESH_NODE_ENDPOINT_UNKNOWN,
554
555 /**
556 * The sentinel value. Should be the last in the enum.
557 */
558 AS_HB_MESH_NODE_STATUS_SENTINEL
559} as_hb_mesh_node_status;
560
561/**
562 * The info payload for a single node.
563 */
564typedef struct as_hb_mesh_info_reply_s
565{
566 /**
567 * The nodeid of the node for which info reply is sent.
568 */
569 cf_node nodeid;
570
571 /**
572 * The advertised endpoint list for this node. List to allow variable size
573 * endpoint list. Always access as reply.endpoints[0].
574 */
575 as_endpoint_list endpoint_list[];
576}__attribute__((__packed__)) as_hb_mesh_info_reply;
577
578/**
579 * Mesh tend reduce function udata.
580 */
581typedef struct as_hb_mesh_tend_reduce_udata_s
582{
583 /**
584 * The new endpoint lists to connect to. Each list has endpoints for s
585 * single remote peer.
586 */
587 as_endpoint_list** to_connect;
588
589 /**
590 * The capacity of the to connect array.
591 */
592 size_t to_connect_capacity;
593
594 /**
595 * The count of endpoints to connect.
596 */
597 size_t to_connect_count;
598
599 /**
600 * Pointers to seeds that need matching.
601 */
602 cf_vector* inactive_seeds_p;
603} as_hb_mesh_tend_reduce_udata;
604
605/**
606 * Mesh endpoint search udata.
607 */
608typedef struct
609{
610 /**
611 * The endpoint to search.
612 */
613 cf_sock_addr* to_search;
614
615 /**
616 * Indicates is a match is found.
617 */
618 bool found;
619} as_hb_endpoint_list_addr_find_udata;
620
621/**
622 * Mesh endpoint list search udata.
623 */
624typedef struct as_hb_mesh_endpoint_list_reduce_udata_s
625{
626 /**
627 * The endpoint to search.
628 */
629 as_endpoint_list* to_search;
630
631 /**
632 * Indicates is a match is found.
633 */
634 bool found;
635
636 /**
637 * The matched key if found.
638 */
639 cf_node* matched_nodeid;
640} as_hb_mesh_endpoint_list_reduce_udata;
641
642/**
643 * Information maintained for configured mesh seed nodes.
644 */
645typedef struct as_hb_mesh_seed_s
646{
647 /**
648 * The name / ip address of this seed mesh host.
649 */
650 char seed_host_name[DNS_NAME_MAX_SIZE];
651
652 /**
653 * The port of this seed mesh host.
654 */
655 cf_ip_port seed_port;
656
657 /**
658 * Identifies TLS mesh seed hosts.
659 */
660 bool seed_tls;
661
662 /**
663 * The heap allocated end point list for this seed host resolved usiung the
664 * seeds hostname.
665 * Will be null if the endpoint list cannot be resolved.
666 */
667 as_endpoint_list* resolved_endpoint_list;
668
669 /**
670 * Timestamp when the seed hostname was resolved into the endpoint list.
671 * Used to perform periodic refresh of the endpoint list.
672 */
673 cf_clock resolved_endpoint_list_ts;
674
675 /**
676 * The state of this seed in terms of established channel.
677 */
678 as_hb_mesh_node_status status;
679
680 /**
681 * The last time the state of this node was updated.
682 */
683 cf_clock last_status_updated;
684
685 /**
686 * The node id for a matching mesh node entry. A zero will indicate that
687 * there exists no matching mesh node entry.
688 */
689 cf_node mesh_nodeid;
690
691 /**
692 * Timestamp indicating when the matching mesh node's endpoint was updated.
693 * Used to detect endpoint changes to the matching mesh node entry if it
694 * exists.
695 */
696 as_hlc_timestamp mesh_node_endpoint_change_ts;
697} as_hb_mesh_seed;
698
699/**
700 * Information maintained for discovered mesh end points.
701 */
702typedef struct as_hb_mesh_node_s
703{
704 /**
705 * The heap allocated end point list for this mesh host. Should be freed
706 * once the last mesh entry is removed from the mesh state.
707 */
708 as_endpoint_list* endpoint_list;
709
710 /**
711 * Timestamp when the mesh node was last updated.
712 */
713 as_hlc_timestamp endpoint_change_ts;
714
715 /**
716 * The state of this node in terms of established channel.
717 */
718 as_hb_mesh_node_status status;
719
720 /**
721 * The last time the state of this node was updated.
722 */
723 cf_clock last_status_updated;
724
725 /**
726 * The time this node's channel become inactive.
727 */
728 cf_clock inactive_since;
729} as_hb_mesh_node;
730
731/**
732 * State maintained for the mesh mode.
733 */
734typedef struct as_hb_mesh_state_s
735{
736 /**
737 * The sockets on which this instance accepts heartbeat tcp connections.
738 */
739 cf_sockets listening_sockets;
740
741 /**
742 * Indicates if the published endpoint list is ipv4 only.
743 */
744 bool published_endpoint_list_ipv4_only;
745
746 /**
747 * The published endpoint list.
748 */
749 as_endpoint_list* published_endpoint_list;
750
751 /**
752 * Mesh seed data.
753 */
754 cf_vector seeds;
755
756 /**
757 * A map from an cf_node _key to a mesh node.
758 */
759 cf_shash* nodeid_to_mesh_node;
760
761 /**
762 * Thread id for the mesh tender thread.
763 */
764 pthread_t mesh_tender_tid;
765
766 /**
767 * The status of the mesh module.
768 */
769 as_hb_status status;
770
771 /**
772 * The mtu on the listening device. This is extrapolated to all nodes and
773 * paths in the cluster. This limits the cluster size possible.
774 */
775 int min_mtu;
776
777 /**
778 * Indicates if new nodes are discovered. Optimization to start mesh tend
779 * earlier than normal tend interval on discovering new nodes.
780 */
781 bool nodes_discovered;
782} as_hb_mesh_state;
783
784/*
785 * ----------------------------------------------------------------------------
786 * Multicast data structures
787 * ----------------------------------------------------------------------------
788 */
789
790/**
791 * State maintained for the multicast mode.
792 */
793typedef struct as_hb_multicast_state_s
794{
795 /**
796 * The sockets associated with multicast mode.
797 */
798 cf_mserv_cfg cfg;
799
800 /**
801 * Multicast listening sockets.
802 */
803 cf_sockets listening_sockets;
804
805 /**
806 * The mtu on the listening device. This is extrapolated to all nodes and
807 * paths in the cluster. This limits the cluster size possible.
808 */
809 int min_mtu;
810} as_hb_multicast_state;
811
812/*
813 * ----------------------------------------------------------------------------
814 * Channel state
815 * ----------------------------------------------------------------------------
816 */
817
818/**
819 * The type of a channel event.
820 */
821typedef enum
822{
823 /**
824 * The endpoint has a channel tx/rx channel associated with it.
825 */
826 AS_HB_CHANNEL_NODE_CONNECTED,
827
828 /**
829 * The endpoint had a tx/rx channel that went down.
830 */
831 AS_HB_CHANNEL_NODE_DISCONNECTED,
832
833 /**
834 * A message was received on a connected channel. The message in the event,
835 * is guaranteed to have passed basic sanity check like have protocol id,
836 * type and source nodeid.
837 */
838 AS_HB_CHANNEL_MSG_RECEIVED,
839
840 /**
841 * Channel found node whose cluster name does not match.
842 */
843 AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH
844} as_hb_channel_event_type;
845
846/**
847 * Status for reads from a channel.
848 */
849typedef enum
850{
851 /**
852 * The message was read successfully and parser.
853 */
854 AS_HB_CHANNEL_MSG_READ_SUCCESS,
855
856 /**
857 * The message read successfully but parsing failed.
858 */
859 AS_HB_CHANNEL_MSG_PARSE_FAIL,
860
861 /**
862 * The message read failed network io.
863 */
864 AS_HB_CHANNEL_MSG_CHANNEL_FAIL,
865
866 /**
867 * Sentinel default value.
868 */
869 AS_HB_CHANNEL_MSG_READ_UNDEF
870} as_hb_channel_msg_read_status;
871
/**
 * Udata for reducing over channels to find one matching an endpoint list.
 */
typedef struct
{
	/**
	 * The endpoint address to search channel by.
	 */
	as_endpoint_list* endpoint_list;

	/**
	 * Indicates if the endpoint was found.
	 */
	bool found;

	/**
	 * The matching socket, if found.
	 */
	cf_socket* socket;
} as_hb_channel_endpoint_reduce_udata;
889
/**
 * Udata for iterating over an endpoint list looking for a socket address.
 */
typedef struct
{
	/**
	 * The endpoint address to search channel by.
	 */
	cf_sock_addr* addr_to_search;

	/**
	 * Indicates if the endpoint was found.
	 */
	bool found;
} as_hb_channel_endpoint_iterate_udata;
902
/**
 * Udata wrapping a raw message buffer to send over a channel.
 */
typedef struct
{
	/**
	 * The message buffer to send.
	 */
	uint8_t* buffer;

	/**
	 * The buffer length.
	 */
	size_t buffer_len;
} as_hb_channel_buffer_udata;
915
916/**
917 * A channel represents a medium to send and receive messages.
918 */
919typedef struct as_hb_channel_s
920{
921 /**
922 * Indicates if this channel is a multicast channel.
923 */
924 bool is_multicast;
925
926 /**
927 * Indicates if this channel is inbound. Not relevant for multicast
928 * channels.
929 */
930 bool is_inbound;
931
932 /**
933 * The id of the associated node. In mesh / unicast case this will initially
934 * be zero and filled in when the nodeid for the node at the other end is
935 * learnt. In multicast case this will be zero.
936 */
937 cf_node nodeid;
938
939 /**
940 * The address of the peer. Will always be specified for outbound channels.
941 */
942 cf_sock_addr endpoint_addr;
943
944 /**
945 * The last time a message was received from this node.
946 */
947 cf_clock last_received;
948
949 /**
950 * Time when this channel won a socket resolution. Zero if this channel
951 * never won resolution. In compatibility mode with older code its possible
952 * we will keep allowing the same socket to win and enter an infinite loop
953 * of closing the sockets.
954 */
955 cf_clock resolution_win_ts;
956} as_hb_channel;
957
958/**
959 * State maintained per heartbeat channel.
960 */
961typedef struct as_hb_channel_state_s
962{
963 /**
964 * The poll handle. All IO wait across all heartbeat connections happens on
965 * this handle.
966 */
967 cf_poll poll;
968
969 /**
970 * Channel status.
971 */
972 as_hb_status status;
973
974 /**
975 * Maps a socket to an as_hb_channel.
976 */
977 cf_shash* socket_to_channel;
978
979 /**
980 * Maps a nodeid to a channel specific node data structure. This association
981 * will be made only on receiving the first heartbeat message from the node
982 * on a channel.
983 */
984 cf_shash* nodeid_to_socket;
985
986 /**
987 * Sockets accumulated by the channel tender to close at the end of every
988 * epoll loop.
989 */
990 cf_queue socket_close_queue;
991
992 /**
993 * The sockets on which heartbeat subsystem listens.
994 */
995 cf_sockets* listening_sockets;
996
997 /**
998 * Clock to keep track of last time idle connections were checked.
999 */
1000 cf_clock last_channel_idle_check;
1001
1002 /**
1003 * Enables / disables publishing channel events. Events should be disabled
1004 * only when the state changes are temporary / transient and hence would not
1005 * change the overall channel state from an external perspective.
1006 */
1007 bool events_enabled;
1008
1009 /**
1010 * Events are batched and published to reduce cluster transitions. Queue of
1011 * unpublished heartbeat events.
1012 */
1013 cf_queue events_queue;
1014
1015 /**
1016 * Thread id for the socket tender thread.
1017 */
1018 pthread_t channel_tender_tid;
1019} as_hb_channel_state;
1020
1021/**
1022 * Entry queued up for socket close.
1023 */
1024typedef struct as_hb_channel_socket_close_entry_s
1025{
1026 /**
1027 * The node for which this event was generated.
1028 */
1029 cf_socket* socket;
1030 /**
1031 * Indicates if this close is a remote close.
1032 */
1033 bool is_remote;
1034 /**
1035 * True if close of this entry should generate a disconnect event.
1036 */
1037 bool raise_close_event;
1038} as_hb_channel_socket_close_entry;
1039
1040/**
1041 * An event generated by the channel sub module.
1042 */
1043typedef struct as_hb_channel_event_s
1044{
1045 /**
1046 * The channel event type.
1047 */
1048 as_hb_channel_event_type type;
1049
1050 /**
1051 * The node for which this event was generated.
1052 */
1053 cf_node nodeid;
1054
1055 /**
1056 * The received message if any over this endpoint. Valid for incoming
1057 * message type event. The message if not NULL never be edited or copied
1058 * over.
1059 */
1060 msg* msg;
1061
1062 /**
1063 * The hlc timestamp for message receipt.
1064 */
1065 as_hlc_msg_timestamp msg_hlc_ts;
1066} as_hb_channel_event;
1067
1068/*
1069 * ----------------------------------------------------------------------------
1070 * Main sub module state
1071 * ----------------------------------------------------------------------------
1072 */
1073
1074/**
1075 * Heartbeat message types.
1076 */
1077typedef enum
1078{
1079 AS_HB_MSG_TYPE_PULSE,
1080 AS_HB_MSG_TYPE_INFO_REQUEST,
1081 AS_HB_MSG_TYPE_INFO_REPLY,
1082 AS_HB_MSG_TYPE_COMPRESSED
1083} as_hb_msg_type;
1084
1085/**
1086 * Events published by the heartbeat subsystem.
1087 */
1088typedef enum
1089{
1090 AS_HB_INTERNAL_NODE_ARRIVE,
1091 AS_HB_INTERNAL_NODE_DEPART,
1092 AS_HB_INTERNAL_NODE_EVICT,
1093 AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED
1094} as_hb_internal_event_type;
1095
1096/**
1097 * State maintained by the heartbeat subsystem for the selected mode.
1098 */
1099typedef struct as_hb_mode_state_s
1100{
1101 /**
1102 * The mesh / multicast state.
1103 */
1104 union
1105 {
1106 as_hb_mesh_state mesh_state;
1107 as_hb_multicast_state multicast_state;
1108 };
1109} as_hb_mode_state;
1110
1111/**
1112 * Plugin data iterate reduce udata.
1113 */
1114typedef struct
1115{
1116 /**
1117 * The plugin id.
1118 */
1119 as_hb_plugin_id pluginid;
1120
1121 /**
1122 * The iterate function.
1123 */
1124 as_hb_plugin_data_iterate_fn iterate_fn;
1125
1126 /**
1127 * The udata for the iterate function.
1128 */
1129 void* udata;
1130} as_hb_adjacecny_iterate_reduce_udata;
1131
1132/**
1133 * Information tracked for an adjacent nodes.
1134 */
1135typedef struct as_hb_adjacent_node_s
1136{
1137 /**
1138 * The heart beat protocol version.
1139 */
1140 uint32_t protocol_version;
1141
1142 /**
1143 * The remote node's
1144 */
1145 as_endpoint_list* endpoint_list;
1146
1147 /**
1148 * Used to cycle between the two copies of plugin data.
1149 */
1150 int plugin_data_cycler;
1151
1152 /**
1153 * Plugin specific data accumulated by the heartbeat subsystem. The data is
1154 * heap allocated and should be destroyed the moment this element entry is
1155 * unused. There are two copies of the plugin data, one the current copy and
1156 * one the previous copy. Previous copy is used to generate data change
1157 * notifications.
1158 */
1159 as_hb_plugin_node_data plugin_data[AS_HB_PLUGIN_SENTINEL][2];
1160
1161 /**
1162 * The monotonic local time node information was last updated.
1163 */
1164 cf_clock last_updated_monotonic_ts;
1165
1166 /**
1167 * HLC timestamp for the last pulse message.
1168 */
1169 as_hlc_msg_timestamp last_msg_hlc_ts;
1170
1171 /**
1172 * Track number of consecutive cluster-name mismatches.
1173 */
1174 uint32_t cluster_name_mismatch_count;
1175
1176 /**
1177 * Moving average of the latency in ms.
1178 */
1179 uint64_t avg_latency;
1180
1181 /**
1182 * A shift register tracking change of endpoints. On receipt of a heartbeat,
1183 * if source node's endpoints change 1 is inserted at the LSB, else 0 is
1184 * inserted at the LSB.
1185 */
1186 uint64_t endpoint_change_tracker;
1187} as_hb_adjacent_node;
1188
1189/**
1190 * Internal storage for external event listeners.
1191 */
1192typedef struct as_hb_event_listener_s
1193{
1194 /**
1195 * Registered callback function.
1196 */
1197 as_hb_event_fn event_callback;
1198
1199 /**
1200 * Arguments for the listeners.
1201 */
1202 void* udata;
1203} as_hb_event_listener;
1204
1205/**
1206 * Heartbeat subsystem internal state.
1207 */
1208typedef struct as_hb_s
1209{
1210 /**
1211 * The status of the subsystem.
1212 */
1213 as_hb_status status;
1214
1215 /**
1216 * The adjacency dictionary. The key is the nodeid. The value is an instance
1217 * of as_hb_adjacent_node.
1218 */
1219 cf_shash* adjacency;
1220
1221 /**
1222 * The probation dictionary having nodes that display unexpected behavior.
1223 * Nodeids under probation and adjacency hash are always exclusive. The key
1224 * is the nodeid. The value is an instance of as_hb_adjacent_node.
1225 */
1226 cf_shash* on_probation;
1227
1228 /**
1229 * Temporary nodeid to index hash used to compute nodes to evict from a
1230 * clique.
1231 */
1232 cf_shash* nodeid_to_index;
1233
1234 /**
1235 * The mode specific state.
1236 */
1237 as_hb_mode_state mode_state;
1238
1239 /**
1240 * The channel state.
1241 */
1242 as_hb_channel_state channel_state;
1243
1244 /**
1245 * Self node accumulated stats used primarily to detect duplicate node-ids.
1246 */
1247 as_hb_adjacent_node self_node;
1248
1249 /**
1250 * Indicates self node-id has duplicates.
1251 */
1252 bool self_is_duplicate;
1253
1254 /**
1255 * Monotonic timestamp of when a self duplicate was detected.
1256 */
1257 cf_clock self_duplicate_detected_ts;
1258
1259 /**
1260 * The plugin dictionary. The key is the as_hb_plugin entry and the value an
1261 * instance of as_hb_plugin.
1262 */
1263 as_hb_plugin plugins[AS_HB_PLUGIN_SENTINEL];
1264
1265 /**
1266 * Thread id for the transmitter thread.
1267 */
1268 pthread_t transmitter_tid;
1269
1270 /**
1271 * Thread id for the thread expiring nodes from the adjacency list.
1272 */
1273 pthread_t adjacency_tender_tid;
1274} as_hb;
1275
1276/**
1277 * Registered heartbeat listeners.
1278 */
1279typedef struct as_hb_external_events_s
1280{
1281 /**
1282 * Events are batched and published. Queue of unpublished heartbeat events.
1283 */
1284 cf_queue external_events_queue;
1285
1286 /**
1287 * Count of event listeners.
1288 */
1289 int event_listener_count;
1290
1291 /**
1292 * External event listeners.
1293 */
1294 as_hb_event_listener event_listeners[AS_HB_EVENT_LISTENER_MAX];
1295} as_hb_external_events;
1296
1297/**
1298 * Shash reduce function to read current adjacency list.
1299 */
1300typedef struct as_hb_adjacency_reduce_udata_s
1301{
1302 /**
1303 * The target adjacency list.
1304 */
1305 cf_node* adj_list;
1306
1307 /**
1308 * Count of elements in the adjacency list.
1309 */
1310 int adj_count;
1311} as_hb_adjacency_reduce_udata;
1312
1313/**
1314 * Udata for finding nodes in the adjacency list not in the input succession
1315 * list.
1316 */
1317typedef struct
1318{
1319 /**
1320 * Number of events generated.
1321 */
1322 int event_count;
1323
1324 /**
1325 * List of generated events.
1326 */
1327 as_hb_event_node* events;
1328
1329 /**
1330 * Limit on number of generated events.
1331 */
1332 int max_events;
1333
1334 /**
1335 * Current succession list.
1336 */
1337 cf_node* succession;
1338
1339 /**
1340 * Number of nodes in succession list.
1341 */
1342 int succession_size;
1343} as_hb_find_new_nodes_reduce_udata;
1344
1345/**
1346 * Shash reduce function to read current adjacency list.
1347 */
1348typedef struct as_hb_adjacency_tender_udata_s
1349{
1350 /**
1351 * The list of expired nodes.
1352 */
1353 cf_node* dead_nodes;
1354
1355 /**
1356 * Count of elements in the dead node list.
1357 */
1358 int dead_node_count;
1359
1360 /**
1361 * The list of evicted nodes , e.g. due to cluster name mismatch.
1362 */
1363 cf_node* evicted_nodes;
1364
1365 /**
1366 * Count of elements in the evicted node list.
1367 */
1368 int evicted_node_count;
1369} as_hb_adjacency_tender_udata;
1370
1371/**
1372 * Udata for tip clear.
1373 */
1374typedef struct as_hb_mesh_tip_clear_udata_s
1375{
1376 /**
1377 * Host IP or DNS name to be cleared from seed list.
1378 */
1379 char host[DNS_NAME_MAX_SIZE];
1380
1381 /**
1382 * Listening port of the host.
1383 */
1384 int port;
1385
1386 /**
1387 * Number of IP addresses to match.
1388 */
1389 uint32_t n_addrs;
1390
1391 /**
1392 * IP addresses to match.
1393 */
1394 cf_ip_addr* addrs;
1395
1396 /**
1397 * Node id if a specific node-id needs to be removed as well.
1398 */
1399 cf_node nodeid;
1400
1401 /**
1402 * Tip-clear status
1403 */
1404 bool entry_deleted;
1405} as_hb_mesh_tip_clear_udata;
1406
1407/**
1408 * Convert endpoint list to string in a process function.
1409 */
1410typedef struct endpoint_list_to_string_udata_s
1411{
1412 /**
1413 * The endpoint list in string format.
1414 */
1415 char* endpoint_list_str;
1416
1417 /**
1418 * The size of enpoint list.
1419 */
1420 size_t endpoint_list_str_capacity;
1421} endpoint_list_to_string_udata;
1422
1423/**
1424 * Udata to fill an endpoint list into a message.
1425 */
1426typedef struct endpoint_list_to_msg_udata_s
1427{
1428 /**
1429 * The target message.
1430 */
1431 msg* msg;
1432
1433 /**
1434 * Indicates if we are running in mesh mode.
1435 */
1436 bool is_mesh;
1437} endpoint_list_to_msg_udata;
1438
1439/**
1440 * Udata to test if this endpoint list overlaps with other endpoint list.
1441 */
1442typedef struct endpoint_list_equal_check_udata_s
1443{
1444 /**
1445 * The endpoint list of the new node.
1446 */
1447 as_endpoint_list* other;
1448
1449 /**
1450 * Output. Indicates if the lists are equal.
1451 */
1452 bool are_equal;
1453} endpoint_list_equal_check_udata;
1454
1455/**
1456 * Endpoint list process function.
1457 * @param endpoint current endpoint in the iteration.
1458 * @param udata udata passed through from the invoker of the iterate function.
1459 */
1460typedef void
1461(*endpoint_list_process_fn)(const as_endpoint_list* endpoint_list, void* udata);
1462
1463/**
1464 * Seed host list reduce udata.
1465 */
1466typedef struct as_hb_seed_host_list_udata_s
1467{
1468 /**
1469 * The buffer to receive the list.
1470 */
1471 cf_dyn_buf* db;
1472
1473 /**
1474 * Selects TLS seed nodes.
1475 */
1476 bool tls;
1477} as_hb_seed_host_list_udata;
1478
1479/*
1480 * ----------------------------------------------------------------------------
1481 * Globals
1482 * ----------------------------------------------------------------------------
1483 */
1484
1485/**
1486 * Global heartbeat instance.
1487 */
1488static as_hb g_hb;
1489
1490/**
1491 * Global heartbeat events listener instance.
1492 */
1493static as_hb_external_events g_hb_event_listeners;
1494
1495/**
1496 * The big fat lock for all external event publishing. This ensures that a batch
1497 * of external events are published atomically to preserve the order of external
1498 * events.
1499 */
1500static pthread_mutex_t g_external_event_publish_lock =
1501 PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1502
1503/**
1504 * Global lock to serialize all read and writes to the heartbeat subsystem.
1505 */
1506static pthread_mutex_t g_hb_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1507
1508/**
1509 * The big fat lock for all channel state.
1510 */
1511static pthread_mutex_t g_channel_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1512
1513/**
1514 * The big fat lock for all mesh state.
1515 */
1516static pthread_mutex_t g_mesh_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1517
1518/**
1519 * The big fat lock for all multicast state.
1520 */
1521static pthread_mutex_t g_multicast_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1522
1523/**
1524 * The global lock for all heartbeat configuration.
1525 */
1526static pthread_mutex_t g_hb_config_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1527
1528/**
1529 * The lock used while setting heartbeat protocol.
1530 */
1531static pthread_mutex_t g_set_protocol_lock =
1532 PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
1533
1534/**
1535 * Message templates for heartbeat messages.
1536 */
1537static msg_template g_hb_msg_template[] = {
1538
1539{ AS_HB_MSG_ID, M_FT_UINT32 },
1540
1541{ AS_HB_MSG_TYPE, M_FT_UINT32 },
1542
1543{ AS_HB_MSG_NODE, M_FT_UINT64 },
1544
1545{ AS_HB_MSG_CLUSTER_NAME, M_FT_STR },
1546
1547{ AS_HB_MSG_HLC_TIMESTAMP, M_FT_UINT64 },
1548
1549{ AS_HB_MSG_ENDPOINTS, M_FT_BUF },
1550
1551{ AS_HB_MSG_COMPRESSED_PAYLOAD, M_FT_BUF },
1552
1553{ AS_HB_MSG_INFO_REQUEST, M_FT_BUF },
1554
1555{ AS_HB_MSG_INFO_REPLY, M_FT_BUF },
1556
1557{ AS_HB_MSG_FABRIC_DATA, M_FT_BUF },
1558
1559{ AS_HB_MSG_HB_DATA, M_FT_BUF },
1560
1561{ AS_HB_MSG_PAXOS_DATA, M_FT_BUF },
1562
1563{ AS_HB_MSG_SKEW_MONITOR_DATA, M_FT_UINT64 } };
1564
1565/*
1566 * ----------------------------------------------------------------------------
1567 * Private internal function forward declarations.
1568 * ----------------------------------------------------------------------------
1569 */
1570
1571static void info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list);
1572static uint32_t round_up_pow2(uint32_t v);
1573static int vector_find(cf_vector* vector, const void* element);
1574
1575static void endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src);
1576static void endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, void* udata);
1577static void endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata);
1578
1579static int msg_compression_threshold(int mtu);
1580static int msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list);
1581static int msg_id_get(msg* msg, uint32_t* id);
1582static int msg_nodeid_get(msg* msg, cf_node* nodeid);
1583static int msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts);
1584static int msg_type_get(msg* msg, as_hb_msg_type* type);
1585static int msg_cluster_name_get(msg* msg, char** cluster_name);
1586static int msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, size_t* adj_length);
1587static int msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length);
1588static void msg_node_list_set(msg* msg, int field_id, cf_node* node_list, size_t node_length);
1589static void msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length);
1590static int msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count);
1591static void msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, void* udata);
1592static void msg_src_fields_fill(msg* msg);
1593static void msg_type_set(msg* msg, as_hb_msg_type msg_type);
1594
1595static int config_mcsize();
1596static const cf_serv_cfg* config_bind_cfg_get();
1597static const cf_mserv_cfg* config_multicast_group_cfg_get();
1598static uint32_t config_tx_interval_get();
1599static void config_tx_interval_set(uint32_t new_interval);
1600static uint32_t config_override_mtu_get();
1601static void config_override_mtu_set(uint32_t mtu);
1602static uint32_t config_max_intervals_missed_get();
1603static void config_max_intervals_missed_set(uint32_t new_max);
1604static unsigned char config_multicast_ttl_get();
1605static as_hb_protocol config_protocol_get();
1606static void config_protocol_set(as_hb_protocol new_protocol);
1607static cf_node config_self_nodeid_get();
1608static as_hb_mode config_mode_get();
1609static void config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, cf_serv_cfg* published_cfg, bool ipv4_only);
1610static bool config_binding_is_valid(char** error, as_hb_protocol protocol);
1611
1612static void channel_init_channel(as_hb_channel* channel);
1613static void channel_event_init(as_hb_channel_event* event);
1614static bool channel_is_running();
1615static bool channel_is_stopped();
1616static uint32_t channel_win_grace_ms();
1617static void channel_events_enabled_set(bool enabled);
1618static bool channel_are_events_enabled();
1619static void channel_event_queue(as_hb_channel_event* event);
1620static void channel_event_publish_pending();
1621static int channel_get_channel(cf_socket* socket, as_hb_channel* result);
1622static void channel_socket_shutdown(cf_socket* socket);
1623static int channel_socket_get(cf_node nodeid, cf_socket** socket);
1624static bool channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find);
1625static void channel_socket_destroy(cf_socket* sock);
1626static void channel_socket_close(cf_socket* socket, bool remote_close, bool raise_close_event);
1627static void channel_sockets_close(cf_vector* sockets);
1628static void channel_socket_close_queue(cf_socket* socket, bool is_remote_close, bool raise_close_event);
1629static void channel_socket_close_pending();
1630static void channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, cf_sock_addr* endpoint_addr);
1631static void channel_accept_connection(cf_socket* lsock);
1632static as_hb_channel_msg_read_status channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len);
1633static void channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata);
1634static int channel_endpoint_search_reduce(const void* key, void* data, void* udata);
1635static bool channel_endpoint_is_connected(as_endpoint_list* endpoint_list);
1636static as_hb_channel_msg_read_status channel_multicast_msg_read(cf_socket* socket, msg* msg);
1637static as_hb_channel_msg_read_status channel_mesh_msg_read(cf_socket* socket, msg* msg);
1638static void channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid);
1639static bool channel_socket_should_live(cf_socket* socket, as_hb_channel* channel);
1640static cf_socket* channel_socket_resolve(cf_socket* socket1, cf_socket* socket2);
1641static int channel_msg_sanity_check(as_hb_channel_event* msg_event);
1642static int channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event);
1643static void channel_msg_read(cf_socket* socket);
1644static void channel_channels_idle_check();
1645void* channel_tender(void* arg);
1646static bool channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata);
1647static void channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, int endpoint_list_count);
1648static int channel_node_disconnect(cf_node nodeid);
1649static void channel_mesh_listening_socks_register(cf_sockets* listening_sockets);
1650static void channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets);
1651static void channel_multicast_listening_socks_register(cf_sockets* listening_sockets);
1652static void channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets);
1653static void channel_init();
1654static void channel_start();
1655static int channel_sockets_get_reduce(const void* key, void* data, void* udata);
1656static void channel_stop();
1657static int channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length);
1658static int channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length);
1659static bool channel_msg_is_compression_required(msg* msg, int wire_size, int mtu);
1660static int channel_msg_buffer_size_get(int wire_size, int mtu);
1661static size_t channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, uint8_t* buffer, size_t buffer_len);
1662static int channel_msg_unicast(cf_node dest, msg* msg);
1663static int channel_msg_broadcast_reduce(const void* key, void* data, void* udata);
1664static int channel_msg_broadcast(msg* msg);
1665static void channel_clear();
1666static int channel_dump_reduce(const void* key, void* data, void* udata);
1667static void channel_dump(bool verbose);
1668
1669static bool mesh_is_running();
1670static bool mesh_is_stopped();
1671static void mesh_published_endpoints_process(endpoint_list_process_fn process_fn, void* udata);
1672static const char* mesh_node_status_string(as_hb_mesh_node_status status);
1673static int mesh_seed_delete_unsafe(int seed_index);
1674static int mesh_seed_find_unsafe(char* host, int port);
1675static void mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, int mesh_node_count);
1676static void mesh_node_status_change(as_hb_mesh_node* mesh_node, as_hb_mesh_node_status new_status);
1677static void mesh_listening_sockets_close();
1678static void mesh_seed_host_list_get(cf_dyn_buf* db, bool tls);
1679static void mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p);
1680static void mesh_stop();
1681static int mesh_tend_reduce(const void* key, void* data, void* udata);
1682void* mesh_tender(void* arg);
1683static void mesh_node_destroy(as_hb_mesh_node* mesh_node);
1684static void mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata);
1685static bool mesh_node_is_discovered(cf_node nodeid);
1686static bool mesh_node_endpoint_list_is_valid(cf_node nodeid);
1687static int mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node);
1688static void mesh_channel_on_node_disconnect(as_hb_channel_event* event);
1689static bool mesh_node_check_fix_self_msg(as_hb_channel_event* event);
1690static void mesh_node_data_update(as_hb_channel_event* event);
1691static int mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, size_t* reply_size);
1692static void mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, size_t reply_count);
1693static msg* mesh_info_msg_init(as_hb_msg_type msg_type);
1694static void mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, size_t to_discover_count);
1695static void mesh_channel_on_pulse(msg* msg);
1696static void mesh_channel_on_info_request(msg* msg);
1697static void mesh_channel_on_info_reply(msg* msg);
1698static int mesh_tip(char* host, int port, bool tls);
1699static void mesh_channel_event_process(as_hb_channel_event* event);
1700static void mesh_init();
1701static int mesh_free_node_data_reduce(const void* key, void* data, void* udata);
1702static int mesh_tip_clear_reduce(const void* key, void* data, void* udata);
1703static int mesh_peer_endpoint_reduce(const void* key, void* data, void* udata);
1704static void mesh_clear();
1705static void mesh_listening_sockets_open();
1706static void mesh_start();
1707static int mesh_dump_reduce(const void* key, void* data, void* udata);
1708static void mesh_dump(bool verbose);
1709
1710static void multicast_init();
1711static void multicast_clear();
1712static void multicast_listening_sockets_open();
1713static void multicast_start();
1714static void multicast_listening_sockets_close();
1715static void multicast_stop();
1716static void multicast_dump(bool verbose);
1717static int multicast_supported_cluster_size_get();
1718
1719static bool hb_is_initialized();
1720static bool hb_is_running();
1721static bool hb_is_stopped();
1722static void hb_mode_init();
1723static void hb_mode_start();
1724static int hb_mtu();
1725static void hb_msg_init();
1726static uint32_t hb_protocol_identifier_get();
1727static cf_clock hb_node_depart_time(cf_clock detect_time);
1728static bool hb_is_mesh();
1729static void hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, int node_count);
1730static void hb_event_publish_pending();
1731static int hb_adjacency_free_data_reduce(const void* key, void* data, void* udata);
1732static void hb_clear();
1733static int hb_adjacency_iterate_reduce(const void* key, void* data, void* udata);
1734static void hb_plugin_set_fn(msg* msg);
1735static void hb_plugin_parse_data_fn(msg* msg, cf_node source, as_hb_plugin_node_data* prev_plugin_data, as_hb_plugin_node_data* plugin_data);
1736static msg* hb_msg_get();
1737static void hb_msg_return(msg* msg);
1738static void hb_plugin_msg_fill(msg* msg);
1739static void hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, as_hb_plugin* plugins, bool plugin_data_changed[]);
1740static void hb_plugin_init();
1741void* hb_transmitter(void* arg);
1742static int hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node);
1743static void hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size);
1744static void hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, cf_node** adjacency_list, size_t* adjacency_length);
1745static bool hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node);
1746static bool hb_self_is_duplicate();
1747static void hb_self_duplicate_update();
1748static void hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node);
1749static int hb_adjacency_tend_reduce(const void* key, void* data, void* udata);
1750void* hb_adjacency_tender(void* arg);
1751static void hb_tx_start();
1752static void hb_tx_stop();
1753static void hb_adjacency_tender_start();
1754static void hb_adjacency_tender_stop();
1755static void hb_init();
1756static void hb_start();
1757static void hb_stop();
1758static void hb_plugin_register(as_hb_plugin* plugin);
1759static bool hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp send_ts);
1760static void hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed);
1761static bool hb_endpoint_change_tracker_is_normal(uint64_t tracker);
1762static bool hb_endpoint_change_tracker_has_changed(uint64_t tracker);
1763static int hb_adjacent_node_update(as_hb_channel_event* msg_event, as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]);
1764static bool hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node);
1765static void hb_channel_on_self_pulse(as_hb_channel_event* msg_event);
1766static void hb_channel_on_pulse(as_hb_channel_event* msg_event);
1767static void hb_channel_on_msg_rcvd(as_hb_channel_event* event);
1768static void hb_handle_cluster_name_mismatch(as_hb_channel_event* event);
1769static void hb_channel_event_process(as_hb_channel_event* event);
1770static void hb_mode_dump(bool verbose);
1771static int hb_dump_reduce(const void* key, void* data, void* udata);
1772static void hb_dump(bool verbose);
1773static void hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph);
1774static void hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict);
1775static int hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata);
1776static void hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
1777 as_hb_plugin_data_iterate_fn iterate_fn, void* udata);
1778
1779/*
1780 * ----------------------------------------------------------------------------
1781 * Public functions.
1782 * ----------------------------------------------------------------------------
1783 */
1784/**
1785 * Initialize the heartbeat subsystem.
1786 */
1787void
1788as_hb_init()
1789{
1790 // Initialize hb subsystem.
1791 hb_init();
1792
1793 // Add the mesh seed nodes.
1794 // Using one time seed config outside the config module.
1795 if (hb_is_mesh()) {
1796 for (int i = 0; i < AS_CLUSTER_SZ; i++) {
1797 if (g_config.hb_config.mesh_seed_addrs[i]) {
1798 mesh_tip(g_config.hb_config.mesh_seed_addrs[i],
1799 g_config.hb_config.mesh_seed_ports[i],
1800 g_config.hb_config.mesh_seed_tls[i]);
1801 }
1802 else {
1803 break;
1804 }
1805 }
1806 }
1807}
1808
1809/**
1810 * Start the heartbeat subsystem.
1811 */
1812void
1813as_hb_start()
1814{
1815 hb_start();
1816}
1817
1818/**
1819 * Shut down the heartbeat subsystem.
1820 */
1821void
1822as_hb_shutdown()
1823{
1824 hb_stop();
1825}
1826
1827/**
1828 * Indicates if self node is a duplicate
1829 */
1830bool
1831as_hb_self_is_duplicate()
1832{
1833 return hb_self_is_duplicate();
1834}
1835
1836/**
1837 * Free the data structures of heart beat.
1838 */
1839void
1840as_hb_destroy()
1841{
1842 // Destroy the main module.
1843 hb_clear();
1844}
1845
1846/**
1847 * Return a string representation of a heartbeat protocol type.
1848 *
1849 * @param protocol for which the string is computed
1850 * @param protocol_s string representation of protocol
1851 */
1852void
1853as_hb_protocol_get_s(as_hb_protocol protocol, char* protocol_s)
1854{
1855 char *str;
1856 switch (protocol) {
1857 case AS_HB_PROTOCOL_V3:
1858 str = "v3";
1859 break;
1860 case AS_HB_PROTOCOL_NONE:
1861 str = "none";
1862 break;
1863 case AS_HB_PROTOCOL_RESET:
1864 str = "reset";
1865 break;
1866 default:
1867 str = "undefined";
1868 }
1869
1870 sprintf(protocol_s, "%s", str);
1871}
1872
1873/**
1874 * Set heartbeat protocol version.
1875 */
1876as_hb_protocol
1877as_hb_protocol_get()
1878{
1879 return config_protocol_get();
1880}
1881
1882/**
1883 * Set heartbeat protocol version.
1884 */
1885int
1886as_hb_protocol_set(as_hb_protocol new_protocol)
1887{
1888 SET_PROTOCOL_LOCK();
1889 int rv = 0;
1890 if (config_protocol_get() == new_protocol) {
1891 INFO("no heartbeat protocol change needed");
1892 rv = 0;
1893 goto Exit;
1894 }
1895 char old_protocol_s[HB_PROTOCOL_STR_MAX_LEN];
1896 char new_protocol_s[HB_PROTOCOL_STR_MAX_LEN];
1897 as_hb_protocol_get_s(config_protocol_get(), old_protocol_s);
1898 as_hb_protocol_get_s(new_protocol, new_protocol_s);
1899 switch (new_protocol) {
1900 case AS_HB_PROTOCOL_V3:
1901 if (hb_is_running()) {
1902 INFO("disabling current heartbeat protocol %s", old_protocol_s);
1903 hb_stop();
1904 }
1905 INFO("setting heartbeat protocol version number to %s", new_protocol_s);
1906 config_protocol_set(new_protocol);
1907 hb_start();
1908 INFO("heartbeat protocol version set to %s", new_protocol_s);
1909 break;
1910
1911 case AS_HB_PROTOCOL_NONE:
1912 INFO("setting heartbeat protocol version to none");
1913 hb_stop();
1914 config_protocol_set(new_protocol);
1915 INFO("heartbeat protocol set to none");
1916 break;
1917
1918 case AS_HB_PROTOCOL_RESET:
1919 if (config_protocol_get() == AS_HB_PROTOCOL_NONE) {
1920 INFO("heartbeat messaging disabled ~~ not resetting");
1921 rv = -1;
1922 goto Exit;
1923 }
1924
1925 // NB: "protocol" is never actually set to "RESET" ~~
1926 // it is simply a trigger for the reset action.
1927 INFO("resetting heartbeat messaging");
1928
1929 hb_stop();
1930
1931 hb_clear();
1932
1933 hb_start();
1934
1935 break;
1936
1937 default:
1938 WARNING("unknown heartbeat protocol version number: %d", new_protocol);
1939 rv = -1;
1940 goto Exit;
1941 }
1942
1943Exit:
1944 SET_PROTOCOL_UNLOCK();
1945 return rv;
1946}
1947
1948/**
1949 * Register a heartbeat plugin.
1950 */
1951void
1952as_hb_plugin_register(as_hb_plugin* plugin)
1953{
1954 if (!hb_is_initialized()) {
1955 WARNING(
1956 "main heartbeat module uninitialized - not registering the plugin");
1957 return;
1958 }
1959 hb_plugin_register(plugin);
1960}
1961
1962/**
1963 * Register a heartbeat node event listener.
1964 */
1965void
1966as_hb_register_listener(as_hb_event_fn event_callback, void* udata)
1967{
1968 if (!hb_is_initialized()) {
1969 WARNING(
1970 "main heartbeat module uninitialized - not registering the listener");
1971 return;
1972 }
1973
1974 HB_LOCK();
1975
1976 if (g_hb_event_listeners.event_listener_count >=
1977 AS_HB_EVENT_LISTENER_MAX) {
1978 CRASH("cannot register more than %d event listeners",
1979 AS_HB_EVENT_LISTENER_MAX);
1980 }
1981
1982 g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].event_callback =
1983 event_callback;
1984 g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].udata =
1985 udata;
1986 g_hb_event_listeners.event_listener_count++;
1987
1988 HB_UNLOCK();
1989}
1990
1991/**
1992 * Validate heartbeat config.
1993 */
1994void
1995as_hb_config_validate()
1996{
1997 char *error;
1998 // Validate clustering and heartbeat version compatibility.
1999 as_hb_protocol hb_protocol = config_protocol_get();
2000
2001 if (hb_protocol != AS_HB_PROTOCOL_V3
2002 && hb_protocol != AS_HB_PROTOCOL_NONE) {
2003 CRASH_NOSTACK("clustering protocol v5 requires hearbeat version v3");
2004 }
2005
2006 if (!config_binding_is_valid(&error, hb_protocol)) {
2007 CRASH_NOSTACK("%s", error);
2008 }
2009}
2010
2011/**
2012 * Override the computed MTU for the network interface used by heartbeat.
2013 */
2014void
2015as_hb_override_mtu_set(int mtu)
2016{
2017 config_override_mtu_set(mtu);
2018}
2019
2020/**
2021 * Get the heartbeat pulse transmit interval.
2022 */
2023uint32_t
2024as_hb_tx_interval_get()
2025{
2026 return config_tx_interval_get();
2027}
2028
2029/**
2030 * Set the heartbeat pulse transmit interval.
2031 */
2032int
2033as_hb_tx_interval_set(uint32_t new_interval)
2034{
2035 if (new_interval < AS_HB_TX_INTERVAL_MS_MIN
2036 || new_interval > AS_HB_TX_INTERVAL_MS_MAX) {
2037 WARNING("heartbeat interval must be >= %u and <= %u - ignoring %u",
2038 AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX,
2039 new_interval);
2040 return (-1);
2041 }
2042 config_tx_interval_set(new_interval);
2043 return (0);
2044}
2045
2046/**
2047 * Get the maximum number of missed heartbeat intervals after which a node is
2048 * considered expired.
2049 */
2050uint32_t
2051as_hb_max_intervals_missed_get()
2052{
2053 return config_max_intervals_missed_get();
2054}
2055
2056/**
2057 * Set the maximum number of missed heartbeat intervals after which a node is
2058 * considered expired.
2059 */
2060int
2061as_hb_max_intervals_missed_set(uint32_t new_max)
2062{
2063 if (new_max < AS_HB_MAX_INTERVALS_MISSED_MIN) {
2064 WARNING("heartbeat timeout must be >= %u - ignoring %u",
2065 AS_HB_MAX_INTERVALS_MISSED_MIN, new_max);
2066 return (-1);
2067 }
2068 config_max_intervals_missed_set(new_max);
2069 return (0);
2070}
2071
2072/**
2073 * Get the timeout interval to consider a node dead / expired in milliseconds if
2074 * no heartbeat pulse messages are received.
2075 */
2076uint32_t
2077as_hb_node_timeout_get()
2078{
2079 return HB_NODE_TIMEOUT();
2080}
2081
2082/**
2083 * Populate the buffer with heartbeat configuration.
2084 */
2085void
2086as_hb_info_config_get(cf_dyn_buf* db)
2087{
2088 if (hb_is_mesh()) {
2089 info_append_string(db, "heartbeat.mode", "mesh");
2090 info_append_addrs(db, "heartbeat.address", &g_config.hb_serv_spec.bind);
2091 info_append_uint32(db, "heartbeat.port",
2092 (uint32_t)g_config.hb_serv_spec.bind_port);
2093 info_append_addrs(db, "heartbeat.tls-address",
2094 &g_config.hb_tls_serv_spec.bind);
2095 info_append_uint32(db, "heartbeat.tls-port",
2096 g_config.hb_tls_serv_spec.bind_port);
2097 info_append_string_safe(db, "heartbeat.tls-name",
2098 g_config.hb_tls_serv_spec.tls_our_name);
2099 mesh_seed_host_list_get(db, true);
2100 }
2101 else {
2102 info_append_string(db, "heartbeat.mode", "multicast");
2103 info_append_addrs(db, "heartbeat.address", &g_config.hb_serv_spec.bind);
2104 info_append_addrs(db, "heartbeat.multicast-group",
2105 &g_config.hb_multicast_groups);
2106 info_append_uint32(db, "heartbeat.port",
2107 (uint32_t)g_config.hb_serv_spec.bind_port);
2108 }
2109
2110 info_append_uint32(db, "heartbeat.interval", config_tx_interval_get());
2111 info_append_uint32(db, "heartbeat.timeout",
2112 config_max_intervals_missed_get());
2113
2114 info_append_int(db, "heartbeat.mtu", hb_mtu());
2115
2116 char protocol_s[HB_PROTOCOL_STR_MAX_LEN];
2117 as_hb_protocol_get_s(config_protocol_get(), protocol_s);
2118
2119 info_append_string(db, "heartbeat.protocol", protocol_s);
2120}
2121
2122/**
2123 * Populate heartbeat endpoints.
2124 */
2125void
2126as_hb_info_endpoints_get(cf_dyn_buf* db)
2127{
2128 const cf_serv_cfg *cfg = config_bind_cfg_get();
2129
2130 if (cfg->n_cfgs == 0) {
2131 // Will never happen in practice.
2132 return;
2133 }
2134
2135 info_append_int(db, "heartbeat.port", g_config.hb_serv_spec.bind_port);
2136
2137 char *string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT);
2138 info_append_string(db, "heartbeat.addresses", string);
2139 cf_free(string);
2140
2141 info_append_int(db, "heartbeat.tls-port",
2142 g_config.hb_tls_serv_spec.bind_port);
2143
2144 string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT_TLS);
2145 info_append_string(db, "heartbeat.tls-addresses", string);
2146 cf_free(string);
2147
2148 if (hb_is_mesh()) {
2149 MESH_LOCK();
2150 cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
2151 mesh_peer_endpoint_reduce, db);
2152 MESH_UNLOCK();
2153 }
2154 else {
2155 // Output multicast groups.
2156 const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get();
2157 if (multicast_cfg->n_cfgs == 0) {
2158 return;
2159 }
2160
2161 cf_dyn_buf_append_string(db, "heartbeat.multicast-groups=");
2162 uint32_t count = 0;
2163 for (uint32_t i = 0; i < multicast_cfg->n_cfgs; ++i) {
2164 if (count > 0) {
2165 cf_dyn_buf_append_char(db, ',');
2166 }
2167
2168 cf_dyn_buf_append_string(db,
2169 cf_ip_addr_print(&multicast_cfg->cfgs[i].addr));
2170 ++count;
2171 }
2172 cf_dyn_buf_append_char(db, ';');
2173 }
2174}
2175
2176/**
2177 * Generate a string for listening address and port in format ip_address:port
2178 * and return the heartbeat mode.
2179 *
2180 * @param mode (output) current heartbeat subsystem mode.
2181 * @param addr_port (output) listening ip address and port formatted as
2182 * ip_address:port
2183 * @param addr_port_capacity the capacity of the addr_port input.
2184 */
void
as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port,
		size_t addr_port_capacity)
{
	// Report the current heartbeat mode alongside the formatted endpoints.
	*mode = hb_is_mesh() ? AS_HB_MODE_MESH : AS_HB_MODE_MULTICAST;
	if (hb_is_mesh()) {
		// Mesh: format the published endpoint list straight into the caller's
		// buffer via the to-string process function.
		endpoint_list_to_string_udata udata;
		udata.endpoint_list_str = addr_port;
		udata.endpoint_list_str_capacity = addr_port_capacity;
		mesh_published_endpoints_process(endpoint_list_to_string_process,
				&udata);
	}
	else {
		// Multicast: emit a comma-separated list of group ip:port pairs.
		const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get();

		char* write_ptr = addr_port;
		int remaining = addr_port_capacity;

		// Ensure we leave space for the terminating NULL delimiter.
		for (int i = 0; i < multicast_cfg->n_cfgs && remaining > 1; i++) {
			cf_sock_addr temp;
			cf_ip_addr_copy(&multicast_cfg->cfgs[i].addr, &temp.addr);
			temp.port = multicast_cfg->cfgs[i].port;
			// NOTE(review): assumes cf_sock_addr_to_string returns the count
			// of characters written (> 0) on success and <= 0 when the buffer
			// is too small - confirm its contract.
			int rv = cf_sock_addr_to_string(&temp, write_ptr, remaining);
			if (rv <= 0) {
				// We exhausted the write buffer.
				// Ensure NULL termination.
				addr_port[addr_port_capacity - 1] = 0;
				return;
			}

			write_ptr += rv;
			remaining -= rv;

			// Separate entries with a comma when room remains and this is not
			// the last group.
			if (i != multicast_cfg->n_cfgs - 1 && remaining > 1) {
				*write_ptr = ',';
				write_ptr++;
				remaining--;
			}
		}

		// Ensure NULL termination.
		*write_ptr = 0;
	}
}
2230
2231/**
2232 * Populate the buffer with duplicate nodeids.
2233 */
2234void
2235as_hb_info_duplicates_get(cf_dyn_buf* db)
2236{
2237 cf_dyn_buf_append_string(db, "cluster_duplicate_nodes=");
2238
2239 HB_LOCK();
2240 bool self_is_duplicate = hb_self_is_duplicate();
2241 int num_probation = cf_shash_get_size(g_hb.on_probation);
2242 cf_node duplicate_list[num_probation + 1];
2243
2244 if (!self_is_duplicate && num_probation == 0) {
2245 cf_dyn_buf_append_string(db, "null");
2246 goto Exit;
2247 }
2248
2249 as_hb_adjacency_reduce_udata probation_reduce_udata = { duplicate_list, 0 };
2250
2251 cf_shash_reduce(g_hb.on_probation, hb_adjacency_iterate_reduce,
2252 &probation_reduce_udata);
2253
2254 if (hb_self_is_duplicate()) {
2255 duplicate_list[probation_reduce_udata.adj_count++] =
2256 config_self_nodeid_get();
2257 }
2258
2259 int num_duplicates = probation_reduce_udata.adj_count;
2260 qsort(duplicate_list, num_duplicates, sizeof(cf_node),
2261 cf_node_compare_desc);
2262
2263 for (int i = 0; i < num_duplicates; i++) {
2264 cf_dyn_buf_append_uint64_x(db, duplicate_list[i]);
2265 cf_dyn_buf_append_char(db, ',');
2266 }
2267 cf_dyn_buf_chomp(db);
2268
2269Exit:
2270 HB_UNLOCK();
2271 cf_dyn_buf_append_char(db, ';');
2272}
2273
2274/*
2275 * -----------------------------------------------------------------
2276 * Mesh mode public API
2277 * -----------------------------------------------------------------
2278 */
2279
2280/**
2281 * Add an aerospike instance from the mesh seed list.
2282 */
2283int
2284as_hb_mesh_tip(char* host, int port, bool tls)
2285{
2286 if (!hb_is_mesh()) {
2287 WARNING("tip not applicable for multicast");
2288 return (-1);
2289 }
2290
2291 return mesh_tip(host, port, tls);
2292}
2293
2294/**
2295 * Remove a mesh node instance from the mesh list.
2296 */
int
as_hb_mesh_tip_clear(char* host, int port)
{
	// Seed removal only makes sense in mesh mode.
	if (!hb_is_mesh()) {
		WARNING("tip clear not applicable for multicast");
		return (-1);
	}

	// Reject NULL, empty, or over-long host names (no terminator found within
	// DNS_NAME_MAX_SIZE bytes).
	if (host == NULL || host[0] == 0
			|| strnlen(host, DNS_NAME_MAX_SIZE) == DNS_NAME_MAX_SIZE) {
		WARNING("invalid tip clear host:%s or port:%d", host, port);
		return (-1);
	}

	MESH_LOCK();
	DETAIL("executing tip clear for %s:%d", host, port);

	// FIXME: Remove the mesh host entry and close channel was done to meet
	// AER-5241 ???
	// tip-clear is not a mechanism to throw a connected node out of the
	// cluster.
	// We should not be required to use this mechanism now.
	// tip-clear should only be used to clean up the seed list after
	// decommissioning an ip.
	cf_ip_addr addrs[CF_SOCK_CFG_MAX];
	uint32_t n_addrs = CF_SOCK_CFG_MAX;

	// strcpy is bounded here because host length was validated above;
	// presumably the udata host buffer is at least DNS_NAME_MAX_SIZE bytes -
	// TODO confirm against the struct definition.
	as_hb_mesh_tip_clear_udata mesh_tip_clear_reduce_udata;
	strcpy(mesh_tip_clear_reduce_udata.host, host);
	mesh_tip_clear_reduce_udata.port = port;
	mesh_tip_clear_reduce_udata.entry_deleted = false;
	mesh_tip_clear_reduce_udata.nodeid = 0;

	// Resolve the host name to addresses; on failure fall back to matching
	// without resolved addresses.
	if (cf_ip_addr_from_string_multi(host, addrs, &n_addrs) != 0) {
		n_addrs = 0;
	}

	mesh_tip_clear_reduce_udata.addrs = addrs;
	mesh_tip_clear_reduce_udata.n_addrs = n_addrs;

	// If this host is a configured seed, record its mapped nodeid so the
	// reduce can match the corresponding mesh entry.
	int seed_index = mesh_seed_find_unsafe(host, port);
	if (seed_index >= 0) {
		as_hb_mesh_seed* seed = cf_vector_getp(
				&g_hb.mode_state.mesh_state.seeds, seed_index);
		mesh_tip_clear_reduce_udata.nodeid = seed->mesh_nodeid;
	}

	// Refresh the mapping between the seeds and the mesh hosts.
	mesh_seed_inactive_refresh_get_unsafe (NULL);
	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
			mesh_tip_clear_reduce, &mesh_tip_clear_reduce_udata);

	// Remove the seed entry in case we do not find a matching mesh entry.
	// Will happen trivially if this seed could not be connected.
	mesh_tip_clear_reduce_udata.entry_deleted =
			mesh_tip_clear_reduce_udata.entry_deleted
					|| mesh_seed_delete_unsafe(
							mesh_seed_find_unsafe(host, port)) == 0;

	MESH_UNLOCK();
	return mesh_tip_clear_reduce_udata.entry_deleted ? 0 : -1;
}
2359
2360/**
2361 * Clear the entire mesh list.
2362 */
int
as_hb_mesh_tip_clear_all(uint32_t* cleared)
{
	// Clearing seeds only makes sense in mesh mode.
	if (!hb_is_mesh()) {
		WARNING("tip clear not applicable for multicast");
		return (-1);
	}

	MESH_LOCK();
	// Report to the caller how many known mesh nodes are being cleared.
	*cleared = cf_shash_get_size(
			g_hb.mode_state.mesh_state.nodeid_to_mesh_node);

	// Refresh the mapping between the seeds and the mesh hosts.
	mesh_seed_inactive_refresh_get_unsafe(NULL);
	// NOTE(review): NULL udata appears to make the reduce clear every mesh
	// entry (and its matching seed) - confirm against mesh_tip_clear_reduce.
	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
			mesh_tip_clear_reduce, NULL);

	// Remove all entries that did not have a matching mesh endpoint.
	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
	int element_count = cf_vector_size(seeds);
	for (int i = 0; i < element_count; i++) {
		// A successful delete shifts later elements down, so revisit the same
		// index and shrink the count.
		if (mesh_seed_delete_unsafe(i) == 0) {
			i--;
			element_count--;
		}
		else {
			// Should not happen in practice.
			as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
			CRASH("error deleting mesh seed entry %s:%d", seed->seed_host_name,
					seed->seed_port);
		}
	}

	MESH_UNLOCK();
	return (0);
}
2399
2400/**
2401 * Read the plugin data for a node in the adjacency list. The plugin_data->data
2402 * input param should be pre allocated and plugin_data->data_capacity should
2403 * indicate its capacity.
2404 *
2405 * @param nodeid the node id
2406 * @param pluginid the plugin identifier.
2407 * @param plugin_data (input/output) on success plugin_data->data will be the
2408 * plugin's data for the node and plugin_data->data_size will be the data size.
2409 * node. NULL if there is no plugin data.
 * @param msg_hlc_ts (output) if not NULL will be filled with the timestamp of
2411 * when the hb message for this data was received.
2412 * @param recv_monotonic_ts (output) if not NULL will be filled with monotonic
2413 * wall clock receive timestamp for this plugin data.
2414 * @return 0 on success and -1 on error, where errno will be set to ENOENT if
2415 * there is no entry for this node and ENOMEM if the input plugin data's
2416 * capacity is less than plugin's data. In ENOMEM case plugin_data->data_size
2417 * will be set to the required capacity.
2418 */
int
as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin,
		as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts,
		cf_clock* recv_monotonic_ts)
{
	int rv = 0;

	HB_LOCK();

	// Work on a stack copy of the adjacency entry obtained under the lock.
	as_hb_adjacent_node adjacent_node;
	if (hb_adjacent_node_get(nodeid, &adjacent_node) != 0) {
		// Node not in the adjacency list.
		rv = -1;
		plugin_data->data_size = 0;
		errno = ENOENT;
		goto Exit;
	}

	// Select the currently-active slot of the two plugin-data copies;
	// plugin_data_cycler % 2 alternates between them (double buffering).
	as_hb_plugin_node_data* plugin_data_internal =
			&adjacent_node.plugin_data[plugin][adjacent_node.plugin_data_cycler
					% 2];

	if (plugin_data_internal->data && plugin_data_internal->data_size) {
		// Set the plugin data size
		plugin_data->data_size = plugin_data_internal->data_size;

		if (plugin_data_internal->data_size > plugin_data->data_capacity) {
			// Caller's buffer is too small; data_size (set above) tells the
			// caller the required capacity.
			rv = -1;
			errno = ENOMEM;
			goto Exit;
		}

		// Copy over the stored copy of the plugin data.
		memcpy(plugin_data->data, plugin_data_internal->data,
				plugin_data_internal->data_size);

		// Copy the message timestamp.
		if (msg_hlc_ts) {
			memcpy(msg_hlc_ts, &adjacent_node.last_msg_hlc_ts,
					sizeof(as_hlc_msg_timestamp));
		}

		if (recv_monotonic_ts) {
			*recv_monotonic_ts = adjacent_node.last_updated_monotonic_ts;
		}

		rv = 0;
	}
	else {
		// No plugin data set - still a success, with zeroed outputs.
		plugin_data->data_size = 0;
		if (recv_monotonic_ts) {
			*recv_monotonic_ts = 0;
		}
		if (msg_hlc_ts) {
			memset(msg_hlc_ts, 0, sizeof(as_hlc_msg_timestamp));
		}
		rv = 0;
	}

Exit:
	HB_UNLOCK();
	return rv;
}
2482
2483/**
2484 * Call the iterate method on plugin data for all nodes in the input vector. The
2485 * iterate function will be invoked for all nodes in the input vector even if
2486 * they are not in the adjacency list or they have no plugin data. Plugin data
2487 * will be NULL with size zero in such cases.
2488 *
2489 * @param nodes the iterate on.
2490 * @param plugin the plugin identifier.
2491 * @param iterate_fn the iterate function invoked for plugin data for every
2492 * node.
2493 * @param udata passed as is to the iterate function. Useful for getting results
2494 * out of the iteration.
2495 * NULL if there is no plugin data.
2496 * @return the size of the plugin data. 0 if there is no plugin data.
2497 */
2498void
2499as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin,
2500 as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
2501
2502{
2503 HB_LOCK();
2504
2505 int size = cf_vector_size(nodes);
2506
2507 for (int i = 0; i < size; i++) {
2508 cf_node* nodeid = cf_vector_getp(nodes, i);
2509
2510 if (nodeid == NULL || *nodeid == 0) {
2511 continue;
2512 }
2513
2514 as_hb_adjacent_node nodeinfo;
2515
2516 if (hb_adjacent_node_get(*nodeid, &nodeinfo) == 0) {
2517 size_t data_size = 0;
2518 void* data = NULL;
2519
2520 hb_adjacent_node_plugin_data_get(&nodeinfo, plugin, &data,
2521 &data_size);
2522
2523 iterate_fn(*nodeid, data, data_size,
2524 nodeinfo.last_updated_monotonic_ts,
2525 &nodeinfo.last_msg_hlc_ts, udata);
2526 }
2527 else {
2528 // This node is not known to the heartbeat subsystem.
2529 iterate_fn(*nodeid, NULL, 0, 0, NULL, udata);
2530 }
2531 }
2532
2533 HB_UNLOCK();
2534}
2535
2536/**
2537 * Call the iterate method on all nodes in current adjacency list. Note plugin
2538 * data can still be NULL if the plugin data failed to parse the plugin data.
2539 *
2540 * @param pluginid the plugin identifier.
2541 * @param iterate_fn the iterate function invoked for plugin data for every
2542 * node.
2543 * @param udata passed as is to the iterate function. Useful for getting results
2544 * out of the iteration.
2545 * NULL if there is no plugin data.
2546 * @return the size of the plugin data. 0 if there is no plugin data.
2547 */
2548void
2549as_hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
2550 as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
2551{
2552 hb_plugin_data_iterate_all(pluginid, iterate_fn, udata);
2553}
2554
2555/**
2556 * Log the state of the heartbeat module.
2557 */
2558void
2559as_hb_dump(bool verbose)
2560{
2561 INFO("Heartbeat Dump:");
2562
2563 as_hb_mode mode;
2564 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
2565 as_hb_info_listen_addr_get(&mode, endpoint_list_str,
2566 sizeof(endpoint_list_str));
2567
2568 // Dump the config.
2569 INFO("HB Mode: %s (%d)",
2570 (mode == AS_HB_MODE_MULTICAST ?
2571 "multicast" :
2572 (mode == AS_HB_MODE_MESH ? "mesh" : "undefined")), mode);
2573
2574 INFO("HB Addresses: {%s}", endpoint_list_str);
2575 INFO("HB MTU: %d", hb_mtu());
2576
2577 INFO("HB Interval: %d", config_tx_interval_get());
2578 INFO("HB Timeout: %d", config_max_intervals_missed_get());
2579 char protocol_s[HB_PROTOCOL_STR_MAX_LEN];
2580 as_hb_protocol_get_s(config_protocol_get(), protocol_s);
2581 INFO("HB Protocol: %s (%d)", protocol_s, config_protocol_get());
2582
2583 // dump mode specific state.
2584 hb_mode_dump(verbose);
2585
2586 // Dump the channel state.
2587 channel_dump(verbose);
2588
2589 // Dump the adjacency list.
2590 hb_dump(verbose);
2591}
2592
2593/**
2594 * Indicates if a node is alive.
2595 */
2596bool
2597as_hb_is_alive(cf_node nodeid)
2598{
2599 bool is_alive;
2600 HB_LOCK();
2601
2602 as_hb_adjacent_node adjacent_node;
2603 is_alive = (nodeid == config_self_nodeid_get())
2604 || (hb_adjacent_node_get(nodeid, &adjacent_node) == 0);
2605
2606 HB_UNLOCK();
2607 return is_alive;
2608}
2609
2610/**
2611 * Compute the nodes to evict from the input nodes so that remaining nodes form
2612 * a clique, based on adjacency lists. Self nodeid is never considered for
2613 * eviction.
2614 *
2615 * @param nodes input cf_node vector.
2616 * @param nodes_to_evict output cf_node clique array, that is initialized.
2617 */
2618void
2619as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict)
2620{
2621 hb_maximal_clique_evict(nodes, nodes_to_evict);
2622}
2623
2624/**
2625 * Read the hlc timestamp for the message.
2626 * Note: A protected API for the sole benefit of skew monitor.
2627 *
2628 * @param msg the incoming message.
2629 * @param send_ts the output hlc timestamp.
2630 * @return 0 if the time stamp could be parsed -1 on failure.
2631 */
2632int
2633as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts)
2634{
2635 return msg_send_hlc_ts_get(msg, send_ts);
2636}
2637
2638/*
2639 * ----------------------------------------------------------------------------
2640 * Common sub module.
2641 * ----------------------------------------------------------------------------
2642 */
2643
2644/*
2645 * ----------------------------------------------------------------------------
2646 * Utility
2647 * ----------------------------------------------------------------------------
2648 */
2649
2650/**
2651 * Round up input int to the nearest power of two.
2652 */
/**
 * Round up input int to the nearest power of two.
 * Note: an input of 0 yields 0 (wraps via unsigned arithmetic), as does any
 * input greater than 2^31.
 */
static uint32_t
round_up_pow2(uint32_t v)
{
	// Classic bit smear: propagate the highest set bit of (v - 1) into every
	// lower position, then add one.
	v -= 1;

	for (uint32_t shift = 1; shift < 32; shift <<= 1) {
		v |= v >> shift;
	}

	return v + 1;
}
2665
2666/**
2667 * Generate a hash code for a cf_socket.
2668 */
2669static uint32_t
2670hb_socket_hash_fn(const void* key)
2671{
2672 const cf_socket** socket = (const cf_socket**)key;
2673 return cf_hash_jen32((const uint8_t*)socket, sizeof(cf_socket*));
2674}
2675
2676/**
2677 * Reduce function to delete all entries in a map
2678 */
2679static int
2680hb_delete_all_reduce(const void* key, void* data, void* udata)
2681{
2682 return CF_SHASH_REDUCE_DELETE;
2683}
2684
2685/*
2686 * ----------------------------------------------------------------------------
2687 * Info call related
2688 * ----------------------------------------------------------------------------
2689 */
2690
2691/**
2692 * Append a address spec to a cf_dyn_buf.
2693 */
2694static void
2695info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list)
2696{
2697 for (uint32_t i = 0; i < list->n_addrs; ++i) {
2698 info_append_string(db, name, list->addrs[i]);
2699 }
2700}
2701
2702/*
2703 * ----------------------------------------------------------------------------
2704 * Vector operations
2705 * ----------------------------------------------------------------------------
2706 */
2707
2708/**
2709 * TODO: Move this to cf_vector.
2710 * Find the index of an element in the vector. Equality is based on mem compare.
2711 *
2712 * @param vector the source vector.
2713 * @param element the element to find.
2714 * @return the index if the element is found, -1 otherwise.
2715 */
2716static int
2717vector_find(cf_vector* vector, const void* element)
2718{
2719 int element_count = cf_vector_size(vector);
2720 size_t value_len = cf_vector_element_size(vector);
2721 for (int i = 0; i < element_count; i++) {
2722 // No null check required since we are iterating under a lock and within
2723 // vector bounds.
2724 void* src_element = cf_vector_getp(vector, i);
2725 if (src_element) {
2726 if (memcmp(element, src_element, value_len) == 0) {
2727 return i;
2728 }
2729 }
2730 }
2731 return -1;
2732}
2733
2734/*
2735 * ----------------------------------------------------------------------------
2736 * Endpoint list related
2737 * ----------------------------------------------------------------------------
2738 */
2739
2740/**
2741 * Copy an endpoint list to the destination, while possible reallocating the
2742 * destination space.
2743 * @param dest the double pointer to the destination list, because it might need
2744 * reallocation to accommodate a larger source list.
2745 * @param src the source endpoint list.
2746 */
2747static void
2748endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src)
2749{
2750 size_t src_size;
2751
2752 if (as_endpoint_list_sizeof(src, &src_size) != 0) {
2753 // Bad endpoint list passed.
2754 CRASH("invalid adjacency list passed for copying");
2755 }
2756
2757 *dest = cf_realloc(*dest, src_size);
2758
2759 memcpy(*dest, src, src_size);
2760}
2761
2762/**
2763 * Process function to convert endpoint list to a string.
2764 */
2765static void
2766endpoint_list_to_string_process(const as_endpoint_list* endpoint_list,
2767 void* udata)
2768{
2769 endpoint_list_to_string_udata* to_string_udata =
2770 (endpoint_list_to_string_udata*)udata;
2771 as_endpoint_list_to_string(endpoint_list,
2772 to_string_udata->endpoint_list_str,
2773 to_string_udata->endpoint_list_str_capacity);
2774}
2775
2776/**
2777 * Process function to check if endpoint lists overlap.
2778 */
2779static void
2780endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata)
2781{
2782 endpoint_list_equal_check_udata* equal_udata =
2783 (endpoint_list_equal_check_udata*)udata;
2784
2785 equal_udata->are_equal = equal_udata->are_equal
2786 || as_endpoint_lists_are_equal(endpoint_list, equal_udata->other);
2787}
2788
2789/*
2790 * ----------------------------------------------------------------------------
 * Message related
2792 * ----------------------------------------------------------------------------
2793 */
2794
2795/**
2796 * The size of a buffer beyond which compression should be applied. For now set
2797 * to 60% of the interface mtu.
2798 */
static int
msg_compression_threshold(int mtu)
{
	// Compress payloads larger than 60% of the interface MTU.
	const double compression_fraction = 0.6;
	return (int)(compression_fraction * mtu);
}
2804
2805/**
2806 * Read advertised endpoint list from an incoming message.
2807 * @param msg the incoming message.
2808 * @param endpoint_list the output endpoint. The endpoint_list will point to
2809 * input message.
2810 * internal location and should not be freed.
2811 * @return 0 on success -1 on failure.
2812 */
2813static int
2814msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list)
2815{
2816 size_t endpoint_list_size;
2817 if (msg_get_buf(msg, AS_HB_MSG_ENDPOINTS, (uint8_t**)endpoint_list,
2818 &endpoint_list_size, MSG_GET_DIRECT) != 0) {
2819 return -1;
2820 }
2821
2822 size_t parsed_size;
2823 if (as_endpoint_list_nsizeof(*endpoint_list, &parsed_size,
2824 endpoint_list_size) || parsed_size != endpoint_list_size) {
2825 return -1;
2826 }
2827 return 0;
2828}
2829
2830/**
2831 * Read the protocol identifier for this heartbeat message. These functions can
2832 * get called multiple times for a single message. Hence they do not increment
2833 * error counters.
2834 *
2835 * @param msg the incoming message.
2836 * @param id the output id.
2837 * @return 0 if the id could be parsed -1 on failure.
2838 */
2839static int
2840msg_id_get(msg* msg, uint32_t* id)
2841{
2842 if (msg_get_uint32(msg, AS_HB_MSG_ID, id) != 0) {
2843 return -1;
2844 }
2845
2846 return 0;
2847}
2848
2849/**
2850 * Read the source nodeid for a node. These functions can get called multiple
2851 * times for a single message. Hence they do not increment error counters.
2852 * @param msg the incoming message.
2853 * @param nodeid the output nodeid.
2854 * @return 0 if the nodeid could be parsed -1 on failure.
2855 */
2856static int
2857msg_nodeid_get(msg* msg, cf_node* nodeid)
2858{
2859 if (msg_get_uint64(msg, AS_HB_MSG_NODE, nodeid) != 0) {
2860 return -1;
2861 }
2862
2863 return 0;
2864}
2865
2866/**
2867 * Read the HLC send timestamp for the message. These functions can get called
2868 * multiple times for a single message. Hence they do not increment error
2869 * counters.
2870 * @param msg the incoming message.
2871 * @param send_ts the output hlc timestamp.
2872 * @return 0 if the time stamp could be parsed -1 on failure.
2873 */
2874static int
2875msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts)
2876{
2877 if (msg_get_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, send_ts) != 0) {
2878 return -1;
2879 }
2880
2881 return 0;
2882}
2883
2884/**
2885 * Read the message type. These functions can get called multiple times for a
2886 * single message. Hence they do not increment error counters.
2887 * @param msg the incoming message.
2888 * @param type the output message type.
2889 * @return 0 if the type could be parsed -1 on failure.
2890 */
2891static int
2892msg_type_get(msg* msg, as_hb_msg_type* type)
2893{
2894 if (msg_get_uint32(msg, AS_HB_MSG_TYPE, type) != 0) {
2895 return -1;
2896 }
2897
2898 return 0;
2899}
2900
2901/**
2902 * Read the cluster name.
2903 * @param msg the incoming message.
2904 * @param cluster name of the output message type.
2905 * @return 0 if the cluster name could be parsed -1 on failure.
2906 */
2907static int
2908msg_cluster_name_get(msg* msg, char** cluster_name)
2909{
2910 if (msg_get_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name,
2911 MSG_GET_DIRECT) != 0) {
2912 return -1;
2913 }
2914
2915 return 0;
2916}
2917
2918/**
2919 * Get a pointer to a node list in the message.
2920 *
2921 * @param msg the incoming message.
2922 * @param field_id the field id.
2923 * @param adj_list output. on success will point to the adjacency list in the
2924 * message.
 * @param adj_length output. on success will contain the length of the
 * adjacency list.
2927 * @return 0 on success. -1 if the adjacency list is absent.
2928 */
2929static int
2930msg_node_list_get(msg* msg, int field_id, cf_node** adj_list,
2931 size_t* adj_length)
2932{
2933 if (msg_get_buf(msg, field_id, (uint8_t**)adj_list, adj_length,
2934 MSG_GET_DIRECT) != 0) {
2935 return -1;
2936 }
2937
2938 // correct adjacency list length.
2939 *adj_length /= sizeof(cf_node);
2940
2941 return 0;
2942}
2943
2944/**
2945 * Get a pointer to the adjacency list in the message.
2946 *
2947 * @param msg the incoming message.
2948 * @param adj_list output. on success will point to the adjacency list in the
2949 * message.
2950 * @para adj_length output. on success will contain the length of the adjacency
2951 * list.
2952 * @return 0 on success. -1 if the adjacency list is absent.
2953 */
2954static int
2955msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length)
2956{
2957 return msg_node_list_get(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length);
2958}
2959
2960/**
2961 * Set a node list on an outgoing messages for a field.
2962 *
2963 * @param msg the outgoing message.
2964 * @param field_id the id of the list field.
2965 * @param node_list the adjacency list to set.
2966 * @para node_length the length of the adjacency list.
2967 */
2968static void
2969msg_node_list_set(msg* msg, int field_id, cf_node* node_list,
2970 size_t node_length)
2971{
2972 msg_set_buf(msg, field_id, (uint8_t*)node_list,
2973 sizeof(cf_node) * node_length, MSG_SET_COPY);
2974}
2975
2976/**
2977 * Set the adjacency list on an outgoing messages.
2978 *
2979 * @param msg the outgoing message.
2980 * @param adj_list the adjacency list to set.
2981 * @para adj_length the length of the adjacency list.
2982 */
2983static void
2984msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length)
2985{
2986 msg_node_list_set(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length);
2987}
2988
2989/**
2990 * Set the info reply on an outgoing messages.
2991 *
2992 * @param msg the outgoing message.
2993 * @param response the response list to set.
2994 * @para response_count the length of the response list.
2995 */
2996static void
2997msg_info_reply_set(msg* msg, as_hb_mesh_info_reply* response,
2998 size_t response_count)
2999{
3000 size_t response_size = 0;
3001 if (mesh_info_reply_sizeof(response, response_count, &response_size)) {
3002 CRASH("error setting info reply on msg");
3003 }
3004
3005 msg_set_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t*)response, response_size,
3006 MSG_SET_COPY);
3007
3008 return;
3009}
3010
3011/**
3012 * Get a pointer to the info reply list in the message.
3013 *
3014 * @param msg the incoming message.
3015 * @param reply output. on success will point to the reply list in the message.
3016 * @param reply_count output. on success will contain the length of the reply
3017 * list.
3018 * @return 0 on success. -1 if the reply list is absent.
3019 */
static int
msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count)
{
	size_t reply_size;
	if (msg_get_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t**)reply, &reply_size,
			MSG_GET_DIRECT) != 0) {
		return -1;
	}

	*reply_count = 0;

	// Go over reply and compute the count of replies and also validate the
	// endpoint lists.
	uint8_t* start_ptr = (uint8_t*)*reply;
	// Signed so the underflow checks below work after subtraction.
	int64_t remaining_size = reply_size;

	while (remaining_size > 0) {
		as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr;
		// Consume the fixed-size header of this reply entry.
		remaining_size -= sizeof(as_hb_mesh_info_reply);
		start_ptr += sizeof(as_hb_mesh_info_reply);
		if (remaining_size <= 0) {
			// Incomplete / garbled info reply message.
			*reply_count = 0;
			return -1;
		}

		// Validate and size the variable-length endpoint list trailing the
		// fixed header, bounded by the bytes left in the buffer.
		size_t endpoint_list_size = 0;
		if (as_endpoint_list_nsizeof(reply_ptr->endpoint_list,
				&endpoint_list_size, remaining_size) != 0) {
			// Incomplete / garbled info reply message.
			*reply_count = 0;
			return -1;
		}

		remaining_size -= endpoint_list_size;
		start_ptr += endpoint_list_size;
		(*reply_count)++;
	}

	return 0;
}
3061
3062/**
3063 * Fill a message with an endpoint list.
3064 */
3065static void
3066msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list,
3067 void* udata)
3068{
3069 endpoint_list_to_msg_udata* to_msg_udata =
3070 (endpoint_list_to_msg_udata*)udata;
3071 msg* msg = to_msg_udata->msg;
3072 bool is_mesh = to_msg_udata->is_mesh;
3073
3074 if (!published_endpoint_list) {
3075 if (is_mesh) {
3076 // Something is messed up. Except for v3 multicast,
3077 // published list should not be empty.
3078 WARNING("published endpoint list is empty");
3079 }
3080 return;
3081 }
3082
3083 // Makes sense only for mesh.
3084 if (is_mesh && published_endpoint_list) {
3085 // Set the source address
3086 size_t endpoint_list_size = 0;
3087 as_endpoint_list_sizeof(published_endpoint_list, &endpoint_list_size);
3088 msg_set_buf(msg, AS_HB_MSG_ENDPOINTS,
3089 (uint8_t*)published_endpoint_list, endpoint_list_size,
3090 MSG_SET_COPY);
3091 }
3092}
3093
3094/**
3095 * Fill source fields for the message.
3096 * @param msg the message to fill the source fields into.
3097 */
3098static void
3099msg_src_fields_fill(msg* msg)
3100{
3101 bool is_mesh = hb_is_mesh();
3102
3103 // Set the hb protocol id / version.
3104 msg_set_uint32(msg, AS_HB_MSG_ID, hb_protocol_identifier_get());
3105
3106 // Set the source node.
3107 msg_set_uint64(msg, AS_HB_MSG_NODE, config_self_nodeid_get());
3108
3109 endpoint_list_to_msg_udata udata;
3110 udata.msg = msg;
3111 udata.is_mesh = is_mesh;
3112
3113 if (is_mesh) {
3114 // Endpoint list only valid for mesh mode.
3115 mesh_published_endpoints_process(msg_published_endpoints_fill, &udata);
3116 }
3117
3118 // Set the send hlc timestamp
3119 msg_set_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, as_hlc_timestamp_now());
3120}
3121
3122/**
3123 * Set the type for an outgoing message.
3124 * @param msg the outgoing message.
3125 * @param msg_type the type to set.
3126 */
3127static void
3128msg_type_set(msg* msg, as_hb_msg_type msg_type)
3129{
3130 // Set the message type.
3131 msg_set_uint32(msg, AS_HB_MSG_TYPE, msg_type);
3132}
3133
3134/*
3135 * ----------------------------------------------------------------------------
3136 * Config sub module.
3137 * ----------------------------------------------------------------------------
3138 */
3139
3140/**
3141 * Get mcsize.
3142 */
3143static int
3144config_mcsize()
3145{
3146 int mode_cluster_size = 0;
3147 if (hb_is_mesh()) {
3148 // Only bounded by available memory. But let's say its infinite.
3149 mode_cluster_size = INT_MAX;
3150 }
3151 else {
3152 mode_cluster_size = multicast_supported_cluster_size_get();
3153 }
3154
3155 // Ensure we are always upper bounded by the absolute max cluster size.
3156 int supported_cluster_size = MIN(ASC, mode_cluster_size);
3157
3158 DETAIL("supported cluster size %d", supported_cluster_size);
3159 return supported_cluster_size;
3160}
3161
3162/**
3163 * Get the binding addresses for the heartbeat subsystem.
3164 */
3165static const cf_serv_cfg*
3166config_bind_cfg_get()
3167{
3168 // Not protected by config_lock because it is not changed.
3169 return &g_config.hb_config.bind_cfg;
3170}
3171
3172/**
3173 * Get the multicast groups for the multicast mode.
3174 */
3175static const cf_mserv_cfg*
3176config_multicast_group_cfg_get()
3177{
3178 // Not protected by config_lock. Never updated after config parsing..
3179 return &g_config.hb_config.multicast_group_cfg;
3180}
3181
3182/**
3183 * Get the heartbeat pulse transmit interval.
3184 */
3185static uint32_t
3186config_tx_interval_get()
3187{
3188 HB_CONFIG_LOCK();
3189 uint32_t interval = g_config.hb_config.tx_interval;
3190 HB_CONFIG_UNLOCK();
3191 return interval;
3192}
3193
3194/**
3195 * Set the heartbeat pulse transmit interval.
3196 */
3197static void
3198config_tx_interval_set(uint32_t new_interval)
3199{
3200 HB_CONFIG_LOCK();
3201 INFO("changing value of interval from %d to %d ",
3202 g_config.hb_config.tx_interval, new_interval);
3203 g_config.hb_config.tx_interval = new_interval;
3204 HB_CONFIG_UNLOCK();
3205}
3206
/**
 * Get the configured MTU override. Zero means no override - the interface MTU
 * is used.
 */
static uint32_t
config_override_mtu_get()
{
	HB_CONFIG_LOCK();
	uint32_t override_mtu = g_config.hb_config.override_mtu;
	HB_CONFIG_UNLOCK();
	return override_mtu;
}
3218
3219/**
3220 * Set the heartbeat pulse transmit interval.
3221 */
3222static void
3223config_override_mtu_set(uint32_t mtu)
3224{
3225 HB_CONFIG_LOCK();
3226 INFO("changing value of override mtu from %d to %d ",
3227 g_config.hb_config.override_mtu, mtu);
3228 g_config.hb_config.override_mtu = mtu;
3229 HB_CONFIG_UNLOCK();
3230 INFO("max supported cluster size is %d", config_mcsize());
3231}
3232
3233/**
3234 * Get the maximum number of missed heartbeat intervals after which a node is
3235 * considered expired.
3236 */
3237static uint32_t
3238config_max_intervals_missed_get()
3239{
3240 uint32_t rv = 0;
3241 HB_CONFIG_LOCK();
3242 rv = g_config.hb_config.max_intervals_missed;
3243 HB_CONFIG_UNLOCK();
3244 return rv;
3245}
3246
/**
 * Get the number of intervals endpoints should be tracked for.
 */
static uint32_t
config_endpoint_track_intervals_get()
{
	// Grace period of half the heartbeat timeout, but never fewer than 3
	// intervals.
	uint32_t half_timeout_intervals = config_max_intervals_missed_get() / 2;
	return MAX(3, half_timeout_intervals);
}
3257
/**
 * Get the maximum number of allowed changes, per endpoint track intervals.
 */
static uint32_t
config_endpoint_changes_allowed_get()
{
	// Endpoint lists are currently required to stay stable - allow no changes.
	return 0;
}
3267
3268/**
3269 * Set the maximum number of missed heartbeat intervals after which a node is
3270 * considered expired.
3271 */
3272static void
3273config_max_intervals_missed_set(uint32_t new_max)
3274{
3275 HB_CONFIG_LOCK();
3276 INFO("changing value of timeout from %d to %d ",
3277 g_config.hb_config.max_intervals_missed, new_max);
3278 g_config.hb_config.max_intervals_missed = new_max;
3279 HB_CONFIG_UNLOCK();
3280}
3281
/**
 * Return ttl for multicast packets. Set to zero for default TTL.
 * @return the configured multicast TTL value.
 */
static unsigned char
config_multicast_ttl_get()
{
	return g_config.hb_config.multicast_ttl;
}
3290
3291/**
3292 * Return the current heartbeat protocol.
3293 */
3294static as_hb_protocol
3295config_protocol_get()
3296{
3297 as_hb_protocol rv = 0;
3298 HB_CONFIG_LOCK();
3299 rv = g_config.hb_config.protocol;
3300 HB_CONFIG_UNLOCK();
3301 return rv;
3302}
3303
/**
 * Set the current heartbeat protocol. (Header previously said "Return" -
 * this is the setter.)
 * @param new_protocol the heartbeat protocol to switch to.
 */
static void
config_protocol_set(as_hb_protocol new_protocol)
{
	HB_CONFIG_LOCK();
	g_config.hb_config.protocol = new_protocol;
	HB_CONFIG_UNLOCK();
}
3314
/**
 * The nodeid for this node.
 * @return this node's cluster-wide unique node id.
 */
static cf_node
config_self_nodeid_get()
{
	// Not protected by config_lock. Never updated after config parsing..
	return g_config.self_node;
}
3324
/**
 * Return the heartbeat subsystem mode (mesh or multicast).
 */
static as_hb_mode
config_mode_get()
{
	// Not protected by config_lock. Never updated after config parsing..
	return g_config.hb_config.mode;
}
3334
3335/**
3336 * Expand "any" binding addresses to actual interface addresses.
3337 * @param bind_cfg the binding configuration.
3338 * @param published_cfg (output) the server configuration to expand.
3339 * @param ipv4_only indicates if only legacy addresses should be allowed.
3340 */
3341static void
3342config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg,
3343 cf_serv_cfg* published_cfg, bool ipv4_only)
3344{
3345 cf_serv_cfg_init(published_cfg);
3346 cf_sock_cfg sock_cfg;
3347
3348 for (int i = 0; i < bind_cfg->n_cfgs; i++) {
3349 cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg);
3350
3351 // Expand "any" address to all interfaces.
3352 if (cf_ip_addr_is_any(&sock_cfg.addr)) {
3353 cf_ip_addr all_addrs[CF_SOCK_CFG_MAX];
3354 uint32_t n_all_addrs = CF_SOCK_CFG_MAX;
3355 if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) {
3356 WARNING("error getting all interface addresses");
3357 n_all_addrs = 0;
3358 }
3359
3360 for (int j = 0; j < n_all_addrs; j++) {
3361 // Skip local address if any is specified.
3362 if (cf_ip_addr_is_local(&all_addrs[j])
3363 || (ipv4_only && !cf_ip_addr_is_legacy(&all_addrs[j]))) {
3364 continue;
3365 }
3366
3367 cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr);
3368 if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) {
3369 CRASH("error initializing published address list");
3370 }
3371 }
3372
3373 // TODO: Does not look like the right warning or the right message.
3374 if (published_cfg->n_cfgs == 0) {
3375 WARNING(
3376 "no network interface addresses detected for heartbeat access");
3377 }
3378 }
3379 else {
3380 if (ipv4_only && !cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) {
3381 continue;
3382 }
3383
3384 if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) {
3385 CRASH("error initializing published address list");
3386 }
3387 }
3388 }
3389}
3390
3391/**
3392 * Checks if the heartbeat binding configuration is valid.
3393 * @param error pointer to a static error message if validation fails, else will
3394 * be set to NULL.
3395 */
3396static bool
3397config_binding_is_valid(char** error, as_hb_protocol protocol)
3398{
3399 const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
3400 const cf_mserv_cfg* multicast_group_cfg = config_multicast_group_cfg_get();
3401
3402 if (hb_is_mesh()) {
3403 if (bind_cfg->n_cfgs == 0) {
3404 // Should not happen in practice.
3405 *error = "no bind addresses found for heartbeat";
3406 return false;
3407 }
3408
3409 // Ensure we have a valid port for all bind endpoints.
3410 for (int i = 0; i < bind_cfg->n_cfgs; i++) {
3411 if (bind_cfg->cfgs[i].port == 0) {
3412 *error = "invalid mesh listening port";
3413 return false;
3414 }
3415 }
3416
3417 cf_serv_cfg publish_serv_cfg;
3418 cf_serv_cfg_init(&publish_serv_cfg);
3419
3420 if (multicast_group_cfg->n_cfgs != 0) {
3421 *error =
3422 "invalid config option: multicast-group not supported in mesh mode";
3423 return false;
3424 }
3425 }
3426 else {
3427 const cf_mserv_cfg* multicast_group_cfg =
3428 config_multicast_group_cfg_get();
3429
3430 if (multicast_group_cfg->n_cfgs == 0) {
3431 *error = "no multicast groups specified";
3432 return false;
3433 }
3434
3435 // Ensure multicast groups have valid ports.
3436 // TODO: We could check if the address is valid multicast.
3437 for (int i = 0; i < multicast_group_cfg->n_cfgs; i++) {
3438 if (multicast_group_cfg->cfgs[i].port == 0) {
3439 *error = "invalid multicast port";
3440 return false;
3441 }
3442 }
3443
3444 if (g_config.hb_config.mesh_seed_addrs[0]) {
3445 *error =
3446 "invalid config option: mesh-seed-address-port not supported for multicast mode";
3447 return false;
3448 }
3449
3450 cf_serv_cfg publish_serv_cfg;
3451 cf_serv_cfg_init(&publish_serv_cfg);
3452 }
3453
3454 *error = NULL;
3455 return true;
3456}
3457
3458/*
3459 * ----------------------------------------------------------------------------
3460 * Channel sub module.
3461 * ----------------------------------------------------------------------------
3462 */
3463
3464/**
3465 * Initialize the channel structure.
3466 */
3467static void
3468channel_init_channel(as_hb_channel* channel)
3469{
3470 memset(channel, 0, sizeof(as_hb_channel));
3471 cf_ip_addr_set_any(&channel->endpoint_addr.addr);
3472}
3473
3474/**
3475 * Initialize the channel event structure.
3476 */
3477static void
3478channel_event_init(as_hb_channel_event* event)
3479{
3480 memset(event, 0, sizeof(as_hb_channel_event));
3481}
3482
3483/**
3484 * Is channel running.
3485 */
3486static bool
3487channel_is_running()
3488{
3489 CHANNEL_LOCK();
3490 bool retval =
3491 (g_hb.channel_state.status == AS_HB_STATUS_RUNNING) ? true : false;
3492 CHANNEL_UNLOCK();
3493 return retval;
3494}
3495
3496/**
3497 * Is channel stopped.
3498 */
3499static bool
3500channel_is_stopped()
3501{
3502 CHANNEL_LOCK();
3503 bool retval =
3504 (g_hb.channel_state.status == AS_HB_STATUS_STOPPED) ? true : false;
3505 CHANNEL_UNLOCK();
3506 return retval;
3507}
3508
/**
 * Grace time for which a winning socket is kept alive, to prevent constant
 * flip flopping and give the winning socket a chance to send heartbeats.
 */
static uint32_t
channel_win_grace_ms()
{
	// Three transmit intervals worth of grace.
	return config_tx_interval_get() * 3;
}
3519
/**
 * Enable / disable publishing of channel events.
 * @param enabled true to enable event publishing, false to disable.
 */
static void
channel_events_enabled_set(bool enabled)
{
	CHANNEL_LOCK();
	g_hb.channel_state.events_enabled = enabled;
	CHANNEL_UNLOCK();
}
3530
3531/**
3532 * Know if events are enabled.
3533 */
3534static bool
3535channel_are_events_enabled()
3536{
3537 bool result;
3538 CHANNEL_LOCK();
3539 result = g_hb.channel_state.events_enabled;
3540 CHANNEL_UNLOCK();
3541 return result;
3542}
3543
3544/**
3545 * Discard an event that has been processed.
3546 */
3547static void
3548channel_event_discard(as_hb_channel_event* event)
3549{
3550 // Free the message structure for message received events.
3551 if (event->type == AS_HB_CHANNEL_MSG_RECEIVED) {
3552 hb_msg_return(event->msg);
3553 }
3554}
3555
3556/**
3557 * Queues a channel event for publishing by the channel tender.
3558 */
3559static void
3560channel_event_queue(as_hb_channel_event* event)
3561{
3562 if (!channel_are_events_enabled()) {
3563 channel_event_discard(event);
3564 DETAIL(
3565 "events disabled. Ignoring event of type %d with nodeid %" PRIx64,
3566 event->type, event->nodeid);
3567 return;
3568 }
3569
3570 DETAIL("queuing channel event of type %d for node %" PRIx64, event->type,
3571 event->nodeid);
3572 cf_queue_push(&g_hb.channel_state.events_queue, event);
3573}
3574
3575/**
3576 * Publish queued up channel events. Should be called outside a channel lock to
3577 * prevent deadlocks.
3578 */
3579static void
3580channel_event_publish_pending()
3581{
3582 // No channel lock here to prevent deadlocks.
3583 as_hb_channel_event event;
3584 while (cf_queue_pop(&g_hb.channel_state.events_queue, &event, 0)
3585 == CF_QUEUE_OK) {
3586 // Nothing elaborate, using hardcoded list of event recipients.
3587 mesh_channel_event_process(&event);
3588 hb_channel_event_process(&event);
3589
3590 channel_event_discard(&event);
3591 }
3592}
3593
3594/**
3595 * Return the endpoint associated with this socket if it exists.
3596 *
3597 * @param socket the socket to query for.
3598 * @param result the output result.
3599 * @return 0 if the socket was found and the result value is filled. -1 if a
3600 * mapping for the socket could not be found.
3601 */
3602static int
3603channel_get_channel(cf_socket* socket, as_hb_channel* result)
3604{
3605 int status;
3606 CHANNEL_LOCK();
3607
3608 if (cf_shash_get(g_hb.channel_state.socket_to_channel, &socket, result)
3609 == CF_SHASH_OK) {
3610 status = 0;
3611 }
3612 else {
3613 status = -1;
3614 }
3615
3616 CHANNEL_UNLOCK();
3617 return status;
3618}
3619
/**
 * Shutdown a channel socket without closing, forcing the channel tender to
 * cleanup associated data structures.
 * @param socket the socket to shut down.
 */
static void
channel_socket_shutdown(cf_socket* socket)
{
	cf_socket_shutdown(socket);
}
3629
3630/**
3631 * Return the socket associated with this node.
3632 * Returns 0 on success and -1 if there is no socket attached to this node.
3633 */
3634static int
3635channel_socket_get(cf_node nodeid, cf_socket** socket)
3636{
3637 int rv = -1;
3638 CHANNEL_LOCK();
3639 if (cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, socket)
3640 == CF_SHASH_ERR_NOT_FOUND) {
3641 rv = -1;
3642 }
3643 else {
3644 rv = 0;
3645 }
3646
3647 CHANNEL_UNLOCK();
3648 return rv;
3649}
3650
3651/**
3652 * Indicate if a socket is present in a sockets list.
3653 */
3654static bool
3655channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find)
3656{
3657 for (int i = 0; i < sockets->n_socks; i++) {
3658 if (&sockets->socks[i] == to_find) {
3659 return true;
3660 }
3661 }
3662
3663 return false;
3664}
3665
/**
 * Destroy an allocated socket: close, release socket resources, then free the
 * heap-allocated wrapper.
 * @param sock the socket to destroy; must have been heap allocated.
 */
static void
channel_socket_destroy(cf_socket* sock)
{
	cf_socket_close(sock);
	cf_socket_term(sock);
	cf_free(sock);
}
3676
/**
 * Close a channel socket. Precondition is that the socket is registered with
 * the channel module using channel_socket_register.
 *
 * Cleans up the socket-to-channel and node-to-socket mappings, optionally
 * queues a node-disconnected event, removes the socket from the poll set and
 * destroys it. Listening sockets are skipped - their owning mode module
 * closes them.
 *
 * @param socket the socket to close.
 * @param remote_close true if the remote end initiated the close (log only).
 * @param raise_close_event true to queue AS_HB_CHANNEL_NODE_DISCONNECTED when
 * this socket was the one attached to its node.
 */
static void
channel_socket_close(cf_socket* socket, bool remote_close,
	bool raise_close_event)
{
	if (remote_close) {
		DEBUG("remote close: fd %d event", CSFD(socket));
	}

	CHANNEL_LOCK();

	if (channel_cf_sockets_contains(g_hb.channel_state.listening_sockets,
			socket)) {
		// Listening sockets will be closed by the mode (mesh/multicast
		// ) modules.
		goto Exit;
	}

	// Clean up data structures.
	as_hb_channel channel;
	int status = channel_get_channel(socket, &channel);

	if (status == 0) {
		if (channel.nodeid != 0) {
			cf_socket* node_socket;
			// Only drop the node mapping if THIS socket is the one currently
			// attached to the node - another socket may have won resolution.
			if (channel_socket_get(channel.nodeid, &node_socket) == 0
					&& node_socket == socket) {
				// Remove associated node for this socket.
				cf_shash_delete(g_hb.channel_state.nodeid_to_socket,
						&channel.nodeid);

				if (!channel.is_multicast && raise_close_event) {
					as_hb_channel_event event;
					channel_event_init(&event);

					// Notify others that this node is no longer connected.
					event.type = AS_HB_CHANNEL_NODE_DISCONNECTED;
					event.nodeid = channel.nodeid;
					event.msg = NULL;

					channel_event_queue(&event);
				}
			}
		}

		DETAIL("removed channel associated with fd %d polarity %s Type: %s",
				CSFD(socket), channel.is_inbound ? "inbound" : "outbound",
				channel.is_multicast ? "multicast" : "mesh");
		// Remove associated channel.
		cf_shash_delete(g_hb.channel_state.socket_to_channel, &socket);
	}
	else {
		// Will only happen if we are closing this socket twice. Cannot
		// deference the underlying fd because the socket has been freed.
		WARNING("found a socket %p without an associated channel", socket);
		goto Exit;
	}

	// These errno values are tolerated when removing the socket from epoll.
	static int32_t err_ok[] = { ENOENT, EBADF, EPERM };
	int32_t err = cf_poll_delete_socket_forgiving(g_hb.channel_state.poll,
			socket, sizeof(err_ok) / sizeof(int32_t), err_ok);

	if (err == ENOENT) {
		// There is no valid code path where epoll ctl should fail.
		CRASH("unable to remove fd %d from epoll fd list: %s", CSFD(socket),
				cf_strerror(errno));
		goto Exit;
	}

	cf_atomic_int_incr(&g_stats.heartbeat_connections_closed);
	DEBUG("closing channel with fd %d", CSFD(socket));

	channel_socket_destroy(socket);

Exit:
	CHANNEL_UNLOCK();
}
3757
3758/**
3759 * Close multiple sockets. Should be invoked only by channel stop.
3760 * @param sockets the vector consisting of sockets to be closed.
3761 */
3762static void
3763channel_sockets_close(cf_vector* sockets)
3764{
3765 uint32_t socket_count = cf_vector_size(sockets);
3766 for (int index = 0; index < socket_count; index++) {
3767 cf_socket* socket;
3768 if (cf_vector_get(sockets, index, &socket) != 0) {
3769 WARNING("error finding the fd %d to be deleted", CSFD(socket));
3770 continue;
3771 }
3772 channel_socket_close(socket, false, true);
3773 }
3774}
3775
3776/**
3777 * Queues a socket for closing by the channel tender. Should be used by all code
3778 * paths other than the channel stop code path.
3779 */
3780static void
3781channel_socket_close_queue(cf_socket* socket, bool is_remote_close,
3782 bool raise_close_event)
3783{
3784 as_hb_channel_socket_close_entry close_entry = {
3785 socket,
3786 is_remote_close,
3787 raise_close_event };
3788 DETAIL("queuing close of fd %d", CSFD(socket));
3789 cf_queue_push(&g_hb.channel_state.socket_close_queue, &close_entry);
3790}
3791
3792/**
3793 * Close queued up sockets.
3794 */
3795static void
3796channel_socket_close_pending()
3797{
3798 // No channel lock required here.
3799 as_hb_channel_socket_close_entry close_entry;
3800 while (cf_queue_pop(&g_hb.channel_state.socket_close_queue, &close_entry, 0)
3801 == CF_QUEUE_OK) {
3802 channel_socket_close(close_entry.socket, close_entry.is_remote,
3803 close_entry.raise_close_event);
3804 }
3805}
3806
/**
 * Register a new socket with the channel sub module: creates the channel
 * record, adds the socket to the poll set and records the socket-to-channel
 * mapping.
 *
 * @param socket the socket.
 * @param is_multicast indicates if this socket is a multicast socket.
 * @param is_inbound indicates if this socket is an inbound / outbound.
 * @param endpoint_addr peer endpoint this socket connects to. Will be NULL
 * for inbound sockets.
 */
static void
channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound,
	cf_sock_addr* endpoint_addr)
{
	CHANNEL_LOCK();

	as_hb_channel channel;
	channel_init_channel(&channel);

	// This socket should not be part of the socket to channel map.
	ASSERT(channel_get_channel(socket, &channel) == -1,
			"error the channel already exists for fd %d", CSFD(socket));

	channel.is_multicast = is_multicast;
	channel.is_inbound = is_inbound;
	// Registration time doubles as the initial last-received timestamp.
	channel.last_received = cf_getms();

	if (endpoint_addr) {
		memcpy(&channel.endpoint_addr, endpoint_addr, sizeof(*endpoint_addr));
	}

	// Add socket to poll list
	cf_poll_add_socket(g_hb.channel_state.poll, socket,
			EPOLLIN | EPOLLERR | EPOLLRDHUP, socket);

	cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel);

	DEBUG("channel created for fd %d - polarity %s type: %s", CSFD(socket),
			channel.is_inbound ? "inbound" : "outbound",
			channel.is_multicast ? "multicast" : "mesh");

	CHANNEL_UNLOCK();
}
3849
/**
 * Accept an incoming tcp connection. For now this is relevant only to the
 * mesh mode.
 *
 * On resource exhaustion (fd/memory) the function sleeps briefly instead of
 * spinning. For TLS listeners the server-side handshake is completed before
 * the socket is registered with the channel sub module.
 *
 * @param lsock the listening socket that received the connection.
 */
static void
channel_accept_connection(cf_socket* lsock)
{
	if (!hb_is_mesh()) {
		// We do not accept connections in non mesh modes.
		return;
	}

	cf_socket csock;
	cf_sock_addr caddr;

	if (cf_socket_accept(lsock, &csock, &caddr) < 0) {
		if ((errno == EMFILE) || (errno == ENFILE) || (errno == ENOMEM)
				|| (errno == ENOBUFS)) {
			TICKER_WARNING(
					"failed to accept heartbeat connection due to error : %s",
					cf_strerror(errno));
			// We are in an extreme situation where we ran out of system
			// resources (file/mem). We should rather lie low and not do too
			// much activity. So, sleep. We should not sleep too long as this
			// same function is supposed to send heartbeat also.
			usleep(MAX(AS_HB_TX_INTERVAL_MS_MIN, 1) * 1000);
			return;
		}
		else {
			// TODO: Find what there errors are.
			WARNING("accept failed: %s", cf_strerror(errno));
			return;
		}
	}

	// Update the stats to reflect to a new connection opened.
	cf_atomic_int_incr(&g_stats.heartbeat_connections_opened);

	char caddr_str[DNS_NAME_MAX_SIZE];
	cf_sock_addr_to_string_safe(&caddr, caddr_str, sizeof(caddr_str));
	DEBUG("new connection from %s", caddr_str);

	cf_sock_cfg *cfg = lsock->cfg;

	if (cfg->owner == CF_SOCK_OWNER_HEARTBEAT_TLS) {
		// Complete the blocking TLS server handshake before use.
		tls_socket_prepare_server(g_config.hb_config.tls, &csock);

		if (tls_socket_accept_block(&csock) != 1) {
			WARNING("heartbeat TLS server handshake with %s failed", caddr_str);
			cf_socket_close(&csock);
			cf_socket_term(&csock);

			cf_atomic_int_incr(&g_stats.heartbeat_connections_closed);
			return;
		}
	}

	// Allocate a new socket.
	cf_socket* sock = cf_malloc(sizeof(cf_socket));
	cf_socket_init(sock);
	cf_socket_copy(&csock, sock);

	// Register this socket with the channel subsystem.
	channel_socket_register(sock, false, true, NULL);
}
3916
/**
 * Parse compressed buffer into a message.
 *
 * Starts with an output buffer sized for an assumed 3x compression ratio and
 * doubles it on Z_BUF_ERROR until zlib's uncompress succeeds.
 *
 * @param msg the input parsed compressed message and also the output
 * heartbeat message.
 * @param buffer the input buffer.
 * @param buffer_content_len the length of the content in the buffer.
 * @return the status of parsing the message.
 */
static as_hb_channel_msg_read_status
channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len)
{
	// This is a direct pointer inside the buffer parameter. No allocation
	// required.
	uint8_t* compressed_buffer = NULL;
	size_t compressed_buffer_length = 0;
	int parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL;
	void* uncompressed_buffer = NULL;
	size_t uncompressed_buffer_length = 0;

	if (msg_get_buf(msg, AS_HB_MSG_COMPRESSED_PAYLOAD, &compressed_buffer,
			&compressed_buffer_length, MSG_GET_DIRECT) != 0) {
		parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL;
		goto Exit;
	}

	// Assume compression ratio of 3. We will expand the buffer if needed.
	uncompressed_buffer_length = round_up_pow2(3 * compressed_buffer_length);

	// Keep trying till we allocate enough memory for the uncompressed buffer.
	while (true) {
		uncompressed_buffer = MSG_BUFF_ALLOC_OR_DIE(uncompressed_buffer_length,
				"error allocating memory size %zu for decompressing message",
				uncompressed_buffer_length);

		// NOTE: uncompress treats uncompressed_buffer_length as in/out - on
		// success it holds the actual uncompressed size.
		int uncompress_rv = uncompress(uncompressed_buffer,
				&uncompressed_buffer_length, compressed_buffer,
				compressed_buffer_length);

		if (uncompress_rv == Z_OK) {
			// Decompression was successful.
			break;
		}

		if (uncompress_rv == Z_BUF_ERROR) {
			// The uncompressed buffer is not large enough. Free current
			// buffer and allocate a new buffer.
			MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length);

			// Give uncompressed buffer more space.
			uncompressed_buffer_length *= 2;
			continue;
		}

		// Decompression failed. Clean up and exit.
		parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL;
		goto Exit;
	}

	// Reset the message to prepare for parsing the uncompressed buffer. We
	// have no issues losing the compressed buffer because we have an
	// uncompressed copy.
	msg_reset(msg);

	// Parse the uncompressed buffer.
	parsed =
			msg_parse(msg, uncompressed_buffer, uncompressed_buffer_length) ?
					AS_HB_CHANNEL_MSG_READ_SUCCESS :
					AS_HB_CHANNEL_MSG_PARSE_FAIL;

	if (parsed == AS_HB_CHANNEL_MSG_READ_SUCCESS) {
		// Copying the buffer content to ensure that the message and the buffer
		// can have separate life cycles and we never get into races. The
		// frequency of heartbeat messages is low enough to make this not
		// matter much unless we have massive clusters.
		msg_preserve_all_fields(msg);
	}

Exit:
	MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length);
	return parsed;
}
3999
/**
 * Parse the buffer into a message.
 *
 * The header is peeked first so that a protocol-version mismatch is handled
 * quietly instead of as a parse warning. Compressed payloads are delegated to
 * channel_compressed_message_parse.
 *
 * @param msg the output heartbeat message.
 * @param buffer the input buffer.
 * @param buffer_content_len the length of the content in the buffer.
 * @return the status of parsing the message.
 */
static as_hb_channel_msg_read_status
channel_message_parse(msg* msg, void* buffer, int buffer_content_len)
{
	// Peek into the buffer to get hold of the message type.
	msg_type type = 0;
	uint32_t msg_size = 0;
	if (! msg_parse_hdr(&msg_size, &type, (uint8_t*)buffer, buffer_content_len)
			|| type != msg->type) {
		// Pre check because msg_parse considers this a warning but this would
		// be common when protocol version between nodes do not match.
		DEBUG("message type mismatch - expected:%d received:%d", msg->type,
				type);
		return AS_HB_CHANNEL_MSG_PARSE_FAIL;
	}

	bool parsed = msg_parse(msg, buffer, buffer_content_len);

	if (parsed) {
		if (msg_is_set(msg, AS_HB_MSG_COMPRESSED_PAYLOAD)) {
			// This is a compressed message.
			return channel_compressed_message_parse(msg, buffer,
					buffer_content_len);
		}

		// This is an uncompressed message. Copying the buffer content to
		// ensure that the message and the buffer can have separate life
		// cycles and we never get into races. The frequency of heartbeat
		// messages is low enough to make this not matter much unless we have
		// massive clusters.
		msg_preserve_all_fields(msg);
	}

	return parsed ?
			AS_HB_CHANNEL_MSG_READ_SUCCESS : AS_HB_CHANNEL_MSG_PARSE_FAIL;
}
4042
4043/**
4044 * Iterate over a endpoint list and see if there is a matching socket address.
4045 */
4046static void
4047channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata)
4048{
4049 cf_sock_addr sock_addr;
4050 as_hb_channel_endpoint_iterate_udata* iterate_data =
4051 (as_hb_channel_endpoint_iterate_udata*)udata;
4052 if (as_endpoint_to_sock_addr(endpoint, &sock_addr) != 0) {
4053 return;
4054 }
4055
4056 if (cf_sock_addr_is_any(&sock_addr)) {
4057 return;
4058 }
4059
4060 iterate_data->found = iterate_data->found
4061 || (cf_sock_addr_compare(&sock_addr, iterate_data->addr_to_search)
4062 == 0);
4063}
4064
4065/**
4066 * Reduce function to find a matching endpoint.
4067 */
4068static int
4069channel_endpoint_search_reduce(const void* key, void* data, void* udata)
4070{
4071 cf_socket** socket = (cf_socket**)key;
4072 as_hb_channel* channel = (as_hb_channel*)data;
4073 as_hb_channel_endpoint_reduce_udata* endpoint_reduce_udata =
4074 (as_hb_channel_endpoint_reduce_udata*)udata;
4075
4076 as_hb_channel_endpoint_iterate_udata iterate_udata;
4077 iterate_udata.addr_to_search = &channel->endpoint_addr;
4078 iterate_udata.found = false;
4079
4080 as_endpoint_list_iterate(endpoint_reduce_udata->endpoint_list,
4081 channel_endpoint_find_iterate_fn, &iterate_udata);
4082
4083 if (iterate_udata.found) {
4084 endpoint_reduce_udata->found = true;
4085 endpoint_reduce_udata->socket = *socket;
4086 // Stop the reduce, we have found a match.
4087 return CF_SHASH_ERR_FOUND;
4088 }
4089
4090 return CF_SHASH_OK;
4091}
4092
4093/**
4094 * Indicates if any endpoint from the input endpoint list is already connected.
4095 * @param endpoint_list the endpoint list to check.
4096 * @return true if at least one endpoint is already connected to, false
4097 * otherwise.
4098 */
4099static bool
4100channel_endpoint_is_connected(as_endpoint_list* endpoint_list)
4101{
4102 CHANNEL_LOCK();
4103 // Linear search. This will in practice not be a very frequent operation.
4104 as_hb_channel_endpoint_reduce_udata udata;
4105 memset(&udata, 0, sizeof(udata));
4106 udata.endpoint_list = endpoint_list;
4107
4108 cf_shash_reduce(g_hb.channel_state.socket_to_channel,
4109 channel_endpoint_search_reduce, &udata);
4110
4111 CHANNEL_UNLOCK();
4112 return udata.found;
4113}
4114
4115/**
4116 * Read a message from the multicast socket.
4117 *
4118 * @param socket the multicast socket to read from.
4119 * @param msg the message to read into.
4120 *
4121 * @return the status the read operation.
4122 */
4123static as_hb_channel_msg_read_status
4124channel_multicast_msg_read(cf_socket* socket, msg* msg)
4125{
4126 CHANNEL_LOCK();
4127
4128 as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF;
4129
4130 int buffer_len = MAX(hb_mtu(), STACK_ALLOC_LIMIT);
4131 uint8_t* buffer = MSG_BUFF_ALLOC(buffer_len);
4132
4133 if (!buffer) {
4134 WARNING(
4135 "error allocating space for multicast recv buffer of size %d on fd %d",
4136 buffer_len, CSFD(socket));
4137 goto Exit;
4138 }
4139
4140 cf_sock_addr from;
4141
4142 int num_rcvd = cf_socket_recv_from(socket, buffer, buffer_len, 0, &from);
4143
4144 if (num_rcvd <= 0) {
4145 DEBUG("multicast packed read failed on fd %d", CSFD(socket));
4146 rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL;
4147 goto Exit;
4148 }
4149
4150 rv = channel_message_parse(msg, buffer, num_rcvd);
4151 if (rv != AS_HB_CHANNEL_MSG_READ_SUCCESS) {
4152 goto Exit;
4153 }
4154
4155 rv = AS_HB_CHANNEL_MSG_READ_SUCCESS;
4156
4157Exit:
4158 MSG_BUFF_FREE(buffer, buffer_len);
4159
4160 CHANNEL_UNLOCK();
4161 return rv;
4162}
4163
4164/**
4165 * Read a message from the a tcp mesh socket.
4166 *
4167 * @param socket the tcp socket to read from.
4168 * @param msg the message to read into.
4169 *
4170 * @return status of the read operation.
4171 */
4172static as_hb_channel_msg_read_status
4173channel_mesh_msg_read(cf_socket* socket, msg* msg)
4174{
4175 CHANNEL_LOCK();
4176
4177 uint32_t buffer_len = 0;
4178 uint8_t* buffer = NULL;
4179
4180 as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF;
4181 uint8_t len_buff[MSG_WIRE_LENGTH_SIZE];
4182
4183 if (cf_socket_recv_all(socket, len_buff, MSG_WIRE_LENGTH_SIZE, 0,
4184 MESH_RW_TIMEOUT) < 0) {
4185 WARNING("mesh size recv failed fd %d : %s", CSFD(socket),
4186 cf_strerror(errno));
4187 rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL;
4188 goto Exit;
4189 }
4190
4191 buffer_len = ntohl(*((uint32_t*)len_buff)) + 6;
4192
4193 buffer = MSG_BUFF_ALLOC(buffer_len);
4194
4195 if (!buffer) {
4196 WARNING(
4197 "error allocating space for mesh recv buffer of size %d on fd %d",
4198 buffer_len, CSFD(socket));
4199 goto Exit;
4200 }
4201
4202 memcpy(buffer, len_buff, MSG_WIRE_LENGTH_SIZE);
4203
4204 if (cf_socket_recv_all(socket, buffer + MSG_WIRE_LENGTH_SIZE,
4205 buffer_len - MSG_WIRE_LENGTH_SIZE, 0, MESH_RW_TIMEOUT) < 0) {
4206 DETAIL("mesh recv failed fd %d : %s", CSFD(socket), cf_strerror(errno));
4207 rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL;
4208 goto Exit;
4209 }
4210
4211 DETAIL("mesh recv success fd %d message size %d", CSFD(socket), buffer_len);
4212
4213 rv = channel_message_parse(msg, buffer, buffer_len);
4214
4215Exit:
4216 MSG_BUFF_FREE(buffer, buffer_len);
4217
4218 CHANNEL_UNLOCK();
4219 return rv;
4220}
4221
4222/**
4223 * Associate a socket with a nodeid and notify listeners about a node being
4224 * connected, effective only for mesh channels.
4225 *
4226 * For multicast channels this function is a no-op. The reason being additional
4227 * machinery would be required to clean up the node to channel mapping on node
4228 * expiry.
4229 *
4230 * @param socket the socket.
4231 * @param channel the channel to associate.
4232 * @param nodeid the nodeid associated with this socket.
4233 */
4234static void
4235channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid)
4236{
4237 // For now node to socket mapping is not maintained for multicast channels.
4238 if (channel->is_multicast) {
4239 return;
4240 }
4241
4242 CHANNEL_LOCK();
4243
4244 // Update the node information for the channel.
4245 // This is the first time this node has a connection. Record the mapping.
4246 cf_shash_put(g_hb.channel_state.nodeid_to_socket, &nodeid, &socket);
4247
4248 channel->nodeid = nodeid;
4249 cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, channel);
4250
4251 DEBUG("attached fd %d to node %" PRIx64, CSFD(socket), nodeid);
4252
4253 CHANNEL_UNLOCK();
4254
4255 // Publish an event to let know that a new node has a channel now.
4256 as_hb_channel_event node_connected_event;
4257 channel_event_init(&node_connected_event);
4258 node_connected_event.nodeid = nodeid;
4259 node_connected_event.type = AS_HB_CHANNEL_NODE_CONNECTED;
4260 channel_event_queue(&node_connected_event);
4261}
4262
4263/**
4264 * Indicates if a channel should be allowed to continue to win and live because
4265 * of a winning grace period.
4266 */
4267static bool
4268channel_socket_should_live(cf_socket* socket, as_hb_channel* channel)
4269{
4270 if (channel->resolution_win_ts > 0
4271 && channel->resolution_win_ts + channel_win_grace_ms()
4272 > cf_getms()) {
4273 // Losing socket was a previous winner. Allow it time to do some work
4274 // before knocking it off.
4275 INFO("giving %d unresolved fd some grace time", CSFD(socket));
4276 return true;
4277 }
4278 return false;
4279}
4280
4281/**
 * Selects one of the two given sockets connected to the same remote node. The
4283 * is deterministic and ensures the remote node also chooses a socket that drops
4284 * the same connection.
4285 *
4286 * @param socket1 one of the sockets
4287 * @param socket2 one of the sockets
4288 * @return resolved socket on success, NULL if resolution fails.
4289 */
static cf_socket*
channel_socket_resolve(cf_socket* socket1, cf_socket* socket2)
{
	cf_socket* rv = NULL;
	CHANNEL_LOCK();

	DEBUG("resolving between fd %d and %d", CSFD(socket1), CSFD(socket2));

	as_hb_channel channel1;
	if (channel_get_channel(socket1, &channel1) < 0) {
		// Should not happen in practice. Keep the socket that still has a
		// channel.
		WARNING("resolving fd %d without channel", CSFD(socket1));
		rv = socket2;
		goto Exit;
	}

	as_hb_channel channel2;
	if (channel_get_channel(socket2, &channel2) < 0) {
		// Should not happen in practice.
		WARNING("resolving fd %d without channel", CSFD(socket2));
		rv = socket1;
		goto Exit;
	}

	// A recent resolution winner is allowed to keep winning for a grace
	// period, so it is not starved of the chance to do any work.
	if (channel_socket_should_live(socket1, &channel1)) {
		rv = socket1;
		goto Exit;
	}

	if (channel_socket_should_live(socket2, &channel2)) {
		rv = socket2;
		goto Exit;
	}

	cf_node remote_nodeid =
			channel1.nodeid != 0 ? channel1.nodeid : channel2.nodeid;

	if (remote_nodeid == 0) {
		// Should not happen in practice.
		WARNING("remote node id unknown for fds %d and %d", CSFD(socket1),
				CSFD(socket2));
		rv = NULL;
		goto Exit;
	}

	// Choose the socket with the highest acceptor nodeid. Both sides compute
	// the same acceptor nodeids, so both drop the same connection.
	cf_node acceptor_nodeid1 =
			channel1.is_inbound ? config_self_nodeid_get() : remote_nodeid;
	cf_node acceptor_nodeid2 =
			channel2.is_inbound ? config_self_nodeid_get() : remote_nodeid;

	as_hb_channel* winner_channel = NULL;
	cf_socket* winner_socket = NULL;
	if (acceptor_nodeid1 > acceptor_nodeid2) {
		winner_channel = &channel1;
		winner_socket = socket1;
	}
	else if (acceptor_nodeid1 < acceptor_nodeid2) {
		winner_channel = &channel2;
		winner_socket = socket2;
	}
	else {
		// Both connections have the same acceptor. Should not happen in
		// practice. Despair and report resolution failure.
		INFO(
				"found redundant connections to same node, fds %d %d - choosing at random",
				CSFD(socket1), CSFD(socket2));

		if (cf_getms() % 2 == 0) {
			winner_channel = &channel1;
			winner_socket = socket1;
		}
		else {
			winner_channel = &channel2;
			winner_socket = socket2;
		}
	}

	cf_clock now = cf_getms();
	if (winner_channel->resolution_win_ts == 0) {
		// First win for this channel - record the win timestamp so it is
		// granted grace time in subsequent resolutions.
		winner_channel->resolution_win_ts = now;
		// Update the winning count of the winning channel in the channel data
		// structures.
		cf_shash_put(g_hb.channel_state.socket_to_channel, &winner_socket,
				winner_channel);
	}

	if (winner_channel->resolution_win_ts > now + channel_win_grace_ms()) {
		// The winner has been winning a lot, most likely the other side has us
		// with a seed address different from our published address.
		//
		// Break the cycle here and choose the losing channel as the winner.
		INFO("breaking socket resolve loop dropping winning fd %d",
				CSFD(winner_socket));
		winner_channel = (winner_channel == &channel1) ? &channel2 : &channel1;
		winner_socket = (socket1 == winner_socket) ? socket2 : socket1;
	}

	rv = winner_socket;

Exit:
	CHANNEL_UNLOCK();
	return rv;
}
4394
4395/**
4396 * Basic sanity check for a message.
4397 * @param msg_event the message event.
4398 * @return 0 if the message passes basic sanity tests. -1 on failure.
4399 */
static int
channel_msg_sanity_check(as_hb_channel_event* msg_event)
{
	msg* msg = msg_event->msg;
	uint32_t id = 0;

	as_hb_msg_type type = 0;
	cf_node src_nodeid = 0;

	// rv accumulates failures - all checks run so every problem gets logged.
	int rv = 0;

	if (msg_nodeid_get(msg, &src_nodeid) != 0) {
		TICKER_WARNING("received message without a source node");
		rv = -1;
	}

	// Validate the fact that we have a valid source nodeid.
	if (src_nodeid == 0) {
		// Event nodeid is zero. Not a valid source nodeid. This will happen in
		// compatibility mode if the info request from a new node arrives before
		// the pulse message. Can be ignored.
		TICKER_WARNING("received a message from node with unknown nodeid");
		rv = -1;
	}

	if (msg_id_get(msg, &id) != 0) {
		TICKER_WARNING(
				"received message without heartbeat protocol identifier from node %" PRIx64,
				src_nodeid);
		rv = -1;
	}
	else {
		DETAIL(
				"received message with heartbeat protocol identifier %d from node %" PRIx64,
				id, src_nodeid);

		// Ignore the message if the protocol of the incoming message does not
		// match.
		if (id != hb_protocol_identifier_get()) {
			TICKER_WARNING(
					"received message with different heartbeat protocol identifier from node %" PRIx64,
					src_nodeid);
			rv = -1;
		}
	}

	if (msg_type_get(msg, &type) != 0) {
		TICKER_WARNING(
				"received message without message type from node %" PRIx64,
				src_nodeid);
		rv = -1;
	}

	as_endpoint_list* endpoint_list;
	if (hb_is_mesh()) {
		// Check only applies to v3 mesh.
		// v3 multicast protocol does not advertise endpoint list.
		if (msg_endpoint_list_get(msg, &endpoint_list) != 0
				|| endpoint_list->n_endpoints <= 0) {
			TICKER_WARNING(
					"received message without address/port from node %" PRIx64,
					src_nodeid);
			rv = -1;
		}
	}

	as_hlc_timestamp send_ts;
	if (msg_send_hlc_ts_get(msg, &send_ts) != 0) {
		TICKER_WARNING("received message without HLC time from node %" PRIx64,
				src_nodeid);
		rv = -1;
	}

	// Cluster name is only carried (and checked) on pulse messages.
	if (type == AS_HB_MSG_TYPE_PULSE) {
		char* remote_cluster_name = NULL;
		if (msg_cluster_name_get(msg, &remote_cluster_name) != 0) {
			// Treat a missing cluster name as the empty name.
			remote_cluster_name = "";
		}

		if (!as_config_cluster_name_matches(remote_cluster_name)) {
			// Generate cluster-name mismatch event.
			as_hb_channel_event mismatch_event;
			channel_event_init(&mismatch_event);

			// Notify hb about cluster-name mismatch.
			mismatch_event.type = AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH;
			mismatch_event.nodeid = src_nodeid;
			mismatch_event.msg = NULL;
			memcpy(&mismatch_event.msg_hlc_ts, &msg_event->msg_hlc_ts,
					sizeof(msg_event->msg_hlc_ts));

			channel_event_queue(&mismatch_event);

			TICKER_WARNING("ignoring message from %"PRIX64" with different cluster name(%s)",
					src_nodeid, remote_cluster_name[0] == '\0' ? "null" : remote_cluster_name );
			rv = -1;
		}
	}

	DETAIL("received message of type %d from node %" PRIx64, type, src_nodeid);

	return rv;
}
4503
4504/**
4505 * Process incoming message to possibly update channel state.
4506 *
4507 * @param socket the socket on which the message is received.
4508 * @param event the message wrapped around in a channel event.
4509 * @return 0 if the message can be further processed, -1 if the message should
4510 * be discarded.
4511 */
static int
channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event)
{
	// Basic sanity check for the inbound message.
	if (channel_msg_sanity_check(event) != 0) {
		DETAIL("sanity check failed for message on fd %d", CSFD(socket));
		return -1;
	}

	int rv = -1;
	CHANNEL_LOCK();

	as_hb_channel channel;
	if (channel_get_channel(socket, &channel) < 0) {
		// This is a bug and should not happen. Be paranoid and try fixing it ?
		WARNING("received a message on an unregistered fd %d - closing the fd",
				CSFD(socket));
		channel_socket_close_queue(socket, false, true);
		rv = -1;
		goto Exit;
	}

	if (channel.is_multicast) {
		// Multicast channels carry messages from many nodes - no per-node
		// socket bookkeeping to update.
		rv = 0;
		goto Exit;
	}

	cf_node nodeid = event->nodeid;

	if (channel.nodeid != 0 && channel.nodeid != nodeid) {
		// The event nodeid does not match the previously known node id.
		// Something seriously wrong here.
		WARNING("received a message from node with incorrect nodeid - expected %" PRIx64 " received %" PRIx64 "on fd %d",
				channel.nodeid, nodeid, CSFD(socket));
		rv = -1;
		goto Exit;
	}

	// Update the last received time for this node
	channel.last_received = cf_getms();

	cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel);

	cf_socket* existing_socket;
	int get_result = cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid,
			&existing_socket);

	if (get_result == CF_SHASH_ERR_NOT_FOUND) {
		// Associate this socket with the node.
		channel_node_attach(socket, &channel, nodeid);
	}
	else if (existing_socket != socket) {
		// Somehow the other node and this node discovered each other together
		// both connected via two tcp connections. Choose one and close the
		// other.
		cf_socket* resolved = channel_socket_resolve(socket, existing_socket);

		if (!resolved) {
			DEBUG(
					"resolving between fd %d and %d failed - closing both connections",
					CSFD(socket), CSFD(existing_socket));

			// Resolution failed. Should not happen but there is a window where
			// the same node initiated two connections.
			// Close both connections and try again.
			channel_socket_close_queue(socket, false, true);
			channel_socket_close_queue(existing_socket, false, true);

			// Nothing wrong with the message. Let it through.
			rv = 0;
			goto Exit;
		}

		DEBUG("resolved fd %d between redundant fd %d and %d for node %" PRIx64,
				CSFD(resolved), CSFD(socket), CSFD(existing_socket), nodeid);

		if (resolved == existing_socket) {
			// The node to socket mapping is correct, just close this socket and
			// this node will still be connected to the remote node. Do not
			// raise any event for this closure.
			channel_socket_close_queue(socket, false, false);
		}
		else {
			// We need to close the existing socket. Disable channel events
			// because we make the node appear to be not connected. Do not raise
			// any event for this closure.
			channel_socket_close_queue(existing_socket, false, false);
			// Associate this socket with the node.
			channel_node_attach(socket, &channel, nodeid);
		}
	}

	rv = 0;

Exit:
	CHANNEL_UNLOCK();
	return rv;
}
4610
4611/**
4612 * Read a message from a socket that has data.
4613 * @param socket the socket having data to be read.
4614 */
static void
channel_msg_read(cf_socket* socket)
{
	CHANNEL_LOCK();

	as_hb_channel_msg_read_status status;
	as_hb_channel channel;

	// The message is returned to the pool here unless ownership is handed off
	// to the event queue below.
	bool free_msg = true;

	msg* msg = hb_msg_get();

	if (channel_get_channel(socket, &channel) != 0) {
		// Would happen if the channel was closed in the same epoll loop.
		DEBUG("error the channel does not exist for fd %d", CSFD(socket));
		goto Exit;
	}

	if (channel.is_multicast) {
		status = channel_multicast_msg_read(socket, msg);
	}
	else {
		status = channel_mesh_msg_read(socket, msg);
	}

	switch (status) {
	case AS_HB_CHANNEL_MSG_READ_SUCCESS: {
		break;
	}

	case AS_HB_CHANNEL_MSG_PARSE_FAIL: {
		TICKER_WARNING("unable to parse heartbeat message on fd %d",
				CSFD(socket));
		goto Exit;
	}

	case AS_HB_CHANNEL_MSG_CHANNEL_FAIL: // Falling through
	default: {
		DEBUG("could not read message from fd %d", CSFD(socket));
		if (!channel.is_multicast) {
			// Shut down only mesh socket.
			channel_socket_shutdown(socket);
		}
		goto Exit;
	}
	}

	as_hb_channel_event event;
	channel_event_init(&event);

	if (msg_get_uint64(msg, AS_HB_MSG_NODE, &event.nodeid) < 0) {
		// Node id missing from the message. Assume this message to be corrupt.
		TICKER_WARNING("message with invalid nodeid received on fd %d",
				CSFD(socket));
		goto Exit;
	}

	event.msg = msg;
	event.type = AS_HB_CHANNEL_MSG_RECEIVED;

	// Update hlc and store update message timestamp for the event.
	as_hlc_timestamp send_ts = 0;
	msg_send_hlc_ts_get(msg, &send_ts);
	as_hlc_timestamp_update(event.nodeid, send_ts, &event.msg_hlc_ts);

	// Process received message to update channel state.
	if (channel_msg_event_process(socket, &event) == 0) {
		// The message needs to be delivered to the listeners. Prevent a free.
		free_msg = false;
		channel_event_queue(&event);
	}

Exit:
	CHANNEL_UNLOCK();

	// release the message.
	if (free_msg) {
		hb_msg_return(msg);
	}
}
4695
4696/**
4697 * Reduce function to remove faulty channels / nodes. Shutdown associated socket
4698 * to have channel tender cleanup.
4699 */
4700static int
4701channel_channels_tend_reduce(const void* key, void* data, void* udata)
4702{
4703 cf_socket** socket = (cf_socket**)key;
4704 as_hb_channel* channel = (as_hb_channel*)data;
4705
4706 DETAIL("tending channel fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s",
4707 CSFD(*socket), channel->nodeid, channel->last_received,
4708 cf_sock_addr_print(&channel->endpoint_addr));
4709
4710 if (channel->last_received + CHANNEL_NODE_READ_IDLE_TIMEOUT()
4711 < cf_getms()) {
4712 // Shutdown associated socket if it is not a multicast socket.
4713 if (!channel->is_multicast) {
4714 DEBUG("channel shutting down idle fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s",
4715 CSFD(*socket), channel->nodeid, channel->last_received,
4716 cf_sock_addr_print(&channel->endpoint_addr));
4717 channel_socket_shutdown(*socket);
4718 }
4719 }
4720
4721 return CF_SHASH_OK;
4722}
4723
4724/**
4725 * Tend channel specific node information to remove channels that are faulty (or
4726 * TODO: attached to misbehaving nodes).
4727 */
4728static void
4729channel_channels_idle_check()
4730{
4731 CHANNEL_LOCK();
4732
4733 cf_clock now = cf_getms();
4734 if (g_hb.channel_state.last_channel_idle_check + CHANNEL_IDLE_CHECK_PERIOD
4735 <= now) {
4736 cf_shash_reduce(g_hb.channel_state.socket_to_channel,
4737 channel_channels_tend_reduce, NULL);
4738 g_hb.channel_state.last_channel_idle_check = now;
4739 }
4740
4741 CHANNEL_UNLOCK();
4742}
4743
4744/**
4745 * Socket tending thread. Manages heartbeat receive as well.
4746 */
4747void*
4748channel_tender(void* arg)
4749{
4750 DETAIL("channel tender started");
4751
4752 while (channel_is_running()) {
4753 cf_poll_event events[POLL_SZ];
4754 int32_t nevents = cf_poll_wait(g_hb.channel_state.poll, events, POLL_SZ,
4755 AS_HB_TX_INTERVAL_MS_MIN);
4756
4757 DETAIL("tending channel");
4758
4759 for (int32_t i = 0; i < nevents; i++) {
4760 cf_socket* socket = events[i].data;
4761 if (channel_cf_sockets_contains(
4762 g_hb.channel_state.listening_sockets, socket)
4763 && hb_is_mesh()) {
4764 // Accept a new connection.
4765 channel_accept_connection(socket);
4766 }
4767 else if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
4768 channel_socket_close_queue(socket, true, true);
4769 }
4770 else if (events[i].events & EPOLLIN) {
4771 // Read a message for the socket that is ready.
4772 channel_msg_read(socket);
4773 }
4774 }
4775
4776 // Tend channels to discard stale channels.
4777 channel_channels_idle_check();
4778
4779 // Close queued up socket.
4780 channel_socket_close_pending();
4781
4782 // Publish pending events. Should be outside channel lock.
4783 channel_event_publish_pending();
4784
4785 DETAIL("done tending channel");
4786 }
4787
4788 DETAIL("channel tender shut down");
4789 return NULL;
4790}
4791
4792/*
4793 * ----------------------------------------------------------------------------
4794 * Channel public API
4795 * ----------------------------------------------------------------------------
4796 */
4797
4798/**
4799 * Filter out endpoints not matching this node's capabilities.
4800 */
4801static bool
4802channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata)
4803{
4804 if ((cf_ip_addr_legacy_only())
4805 && endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) {
4806 return false;
4807 }
4808
4809 // If we don't offer TLS, then we won't connect via TLS, either.
4810 if (g_config.hb_tls_serv_spec.bind_port == 0
4811 && as_endpoint_capability_is_supported(endpoint,
4812 AS_ENDPOINT_TLS_MASK)) {
4813 return false;
4814 }
4815
4816 return true;
4817}
4818
4819/**
4820 * Try and connect to a set of endpoint_lists.
4821 */
4822static void
4823channel_mesh_channel_establish(as_endpoint_list** endpoint_lists,
4824 int endpoint_list_count)
4825{
4826 for (int i = 0; i < endpoint_list_count; i++) {
4827 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
4828 as_endpoint_list_to_string(endpoint_lists[i], endpoint_list_str,
4829 sizeof(endpoint_list_str));
4830
4831 if (channel_endpoint_is_connected(endpoint_lists[i])) {
4832 DEBUG(
4833 "duplicate endpoint connect request - ignoring endpoint list {%s}",
4834 endpoint_list_str);
4835 continue;
4836 }
4837
4838 DEBUG("attempting to connect mesh host at {%s}", endpoint_list_str);
4839
4840 cf_socket* sock = (cf_socket*)cf_malloc(sizeof(cf_socket));
4841
4842 const as_endpoint* connected_endpoint = as_endpoint_connect_any(
4843 endpoint_lists[i], channel_mesh_endpoint_filter, NULL,
4844 CONNECT_TIMEOUT(), sock);
4845
4846 if (connected_endpoint) {
4847 cf_atomic_int_incr(&g_stats.heartbeat_connections_opened);
4848
4849 cf_sock_addr endpoint_addr;
4850 memset(&endpoint_addr, 0, sizeof(endpoint_addr));
4851 cf_ip_addr_set_any(&endpoint_addr.addr);
4852 if (as_endpoint_to_sock_addr(connected_endpoint, &endpoint_addr)
4853 != 0) {
4854 // Should never happen in practice.
4855 WARNING("error converting endpoint to socket address");
4856 channel_socket_destroy(sock);
4857 sock = NULL;
4858
4859 cf_atomic_int_incr(&g_stats.heartbeat_connections_closed);
4860 continue;
4861 }
4862
4863 if (as_endpoint_capability_is_supported(connected_endpoint,
4864 AS_ENDPOINT_TLS_MASK)) {
4865 tls_socket_prepare_client(g_config.hb_config.tls, sock);
4866
4867 if (tls_socket_connect_block(sock) != 1) {
4868 WARNING("heartbeat TLS client handshake with {%s} failed",
4869 endpoint_list_str);
4870 channel_socket_destroy(sock);
4871 sock = NULL;
4872
4873 cf_atomic_int_incr(&g_stats.heartbeat_connections_closed);
4874 return;
4875 }
4876 }
4877
4878 channel_socket_register(sock, false, false, &endpoint_addr);
4879 }
4880 else {
4881 TICKER_WARNING("could not create heartbeat connection to node {%s}",
4882 endpoint_list_str);
4883 if (sock) {
4884 cf_free(sock);
4885 sock = NULL;
4886 }
4887 }
4888 }
4889}
4890
4891/**
4892 * Disconnect a node from the channel list.
4893 * @param nodeid the nodeid of the node whose channel should be disconnected.
4894 * @return 0 if the node had a channel and was disconnected. -1 otherwise.
4895 */
4896static int
4897channel_node_disconnect(cf_node nodeid)
4898{
4899 int rv = -1;
4900
4901 CHANNEL_LOCK();
4902
4903 cf_socket* socket;
4904 if (channel_socket_get(nodeid, &socket) != 0) {
4905 // not found
4906 rv = -1;
4907 goto Exit;
4908 }
4909
4910 DEBUG("disconnecting the channel attached to node %" PRIx64, nodeid);
4911
4912 channel_socket_close_queue(socket, false, true);
4913
4914 rv = 0;
4915
4916Exit:
4917 CHANNEL_UNLOCK();
4918
4919 return rv;
4920}
4921
4922/**
4923 * Register mesh listening sockets.
4924 */
static void
channel_mesh_listening_socks_register(cf_sockets* listening_sockets)
{
	CHANNEL_LOCK();
	g_hb.channel_state.listening_sockets = listening_sockets;

	// Start polling the listening sockets for incoming mesh connections.
	cf_poll_add_sockets(g_hb.channel_state.poll,
			g_hb.channel_state.listening_sockets,
			EPOLLIN | EPOLLERR | EPOLLHUP);
	cf_socket_show_server(AS_HB, "mesh heartbeat",
			g_hb.channel_state.listening_sockets);

	// We do not need a separate channel to cover this socket because IO will
	// not happen on these sockets.
	CHANNEL_UNLOCK();
}
4941
4942/**
 * Deregister mesh listening sockets from the epoll event loop.
 * @param listening_sockets the listening sockets to deregister.
4945 */
static void
channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets)
{
	CHANNEL_LOCK();
	// Stop polling the listening sockets - no new mesh connections accepted.
	cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets);
	CHANNEL_UNLOCK();
}
4953
4954/**
 * Register the multicast listening sockets.
 * @param listening_sockets the multicast listening sockets on which multicast
 * io happens.
4958 */
static void
channel_multicast_listening_socks_register(cf_sockets* listening_sockets)
{
	CHANNEL_LOCK();
	g_hb.channel_state.listening_sockets = listening_sockets;

	// Create a new multicast channel for each multicast socket.
	// NOTE(review): the loop bound comes from
	// g_hb.mode_state.multicast_state.listening_sockets while the indexed
	// array is channel_state.listening_sockets - presumably the caller passes
	// the same socket set; verify against the caller.
	for (uint32_t i = 0;
			i < g_hb.mode_state.multicast_state.listening_sockets.n_socks;
			++i) {
		channel_socket_register(&g_hb.channel_state.listening_sockets->socks[i],
				true, false, NULL);
	}

	cf_socket_mcast_show(AS_HB, "multicast heartbeat",
			g_hb.channel_state.listening_sockets);
	CHANNEL_UNLOCK();
}
4977
4978/**
 * Deregister multicast listening sockets from the epoll event loop.
 * @param listening_sockets the listening sockets to deregister.
4981 */
static void
channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets)
{
	CHANNEL_LOCK();
	// Stop polling the multicast sockets - no more multicast reads.
	cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets);
	CHANNEL_UNLOCK();
}
4989
4990/**
4991 * Initialize the channel sub module.
4992 */
static void
channel_init()
{
	CHANNEL_LOCK();

	// Disable events till initialization is complete.
	channel_events_enabled_set(false);

	// Initialize unpublished event queue.
	cf_queue_init(&g_hb.channel_state.events_queue, sizeof(as_hb_channel_event),
			AS_HB_CLUSTER_MAX_SIZE_SOFT, true);

	// Initialize sockets to close queue.
	cf_queue_init(&g_hb.channel_state.socket_close_queue,
			sizeof(as_hb_channel_socket_close_entry),
			AS_HB_CLUSTER_MAX_SIZE_SOFT, true);

	// Initialize the nodeid to socket hash.
	g_hb.channel_state.nodeid_to_socket = cf_shash_create(cf_nodeid_shash_fn,
			sizeof(cf_node), sizeof(cf_socket*), AS_HB_CLUSTER_MAX_SIZE_SOFT,
			0);

	// Initialize the socket to channel state hash.
	g_hb.channel_state.socket_to_channel = cf_shash_create(hb_socket_hash_fn,
			sizeof(cf_socket*), sizeof(as_hb_channel),
			AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);

	// The channel stays stopped until channel_start() kicks off the tender.
	g_hb.channel_state.status = AS_HB_STATUS_STOPPED;

	CHANNEL_UNLOCK();
}
5024
5025/**
5026 * Start channel sub module. Kicks off the channel tending thread.
5027 */
static void
channel_start()
{
	CHANNEL_LOCK();

	if (channel_is_running()) {
		WARNING("heartbeat channel already started");
		goto Exit;
	}

	// create the epoll socket.
	cf_poll_create(&g_hb.channel_state.poll);

	DEBUG("created epoll fd %d", CEFD(g_hb.channel_state.poll));

	// Disable events till initialization is complete.
	channel_events_enabled_set(false);

	// Data structures have been initialized.
	g_hb.channel_state.status = AS_HB_STATUS_RUNNING;

	// Initialization complete enable events.
	channel_events_enabled_set(true);

	// Start the channel tender. Joinable so channel_stop() can wait for it.
	g_hb.channel_state.channel_tender_tid =
			cf_thread_create_joinable(channel_tender, (void*)&g_hb);

Exit:
	CHANNEL_UNLOCK();
}
5059
5060/**
5061 * Get all sockets.
5062 */
5063static int
5064channel_sockets_get_reduce(const void* key, void* data, void* udata)
5065{
5066 cf_vector* sockets = (cf_vector*)udata;
5067 cf_vector_append(sockets, key);
5068 return CF_SHASH_OK;
5069}
5070
5071/**
5072 * Stop the channel sub module called on hb_stop.
5073 */
static void
channel_stop()
{
	if (!channel_is_running()) {
		WARNING("heartbeat channel already stopped");
		return;
	}

	DEBUG("stopping the channel");

	// Unguarded state change but this should be OK.
	g_hb.channel_state.status = AS_HB_STATUS_SHUTTING_DOWN;

	// Wait for the channel tender thread to finish.
	cf_thread_join(g_hb.channel_state.channel_tender_tid);

	CHANNEL_LOCK();

	// Collect all registered sockets and close them.
	// NOTE(review): buff is sized in cf_socket units while the vector stores
	// cf_socket* elements - oversized but harmless; confirm intentional.
	cf_vector sockets;
	cf_socket buff[cf_shash_get_size(g_hb.channel_state.socket_to_channel)];
	cf_vector_init_smalloc(&sockets, sizeof(cf_socket*), (uint8_t*)buff,
			sizeof(buff), VECTOR_FLAG_INITZERO);

	cf_shash_reduce(g_hb.channel_state.socket_to_channel,
			channel_sockets_get_reduce, &sockets);

	channel_sockets_close(&sockets);

	// Disable events.
	channel_events_enabled_set(false);

	cf_vector_destroy(&sockets);

	// Close epoll socket.
	cf_poll_destroy(g_hb.channel_state.poll);
	EFD(g_hb.channel_state.poll) = -1;

	// Disable the channel thread.
	g_hb.channel_state.status = AS_HB_STATUS_STOPPED;

	DEBUG("channel Stopped");

	CHANNEL_UNLOCK();
}
5118
5119/**
 * Send a heartbeat protocol message; retries in case of EAGAIN and EWOULDBLOCK.
5121 * @param socket the socket to send the buffer over.
5122 * @param buff the data buffer.
5123 * @param buffer_length the number of bytes in the buffer to send.
5124 * @return 0 on successful send -1 on failure
5125 */
5126static int
5127channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length)
5128{
5129 CHANNEL_LOCK();
5130 int rv;
5131
5132 if (cf_socket_send_all(socket, buff, buffer_length, 0,
5133 MESH_RW_TIMEOUT) < 0) {
5134 as_hb_channel channel;
5135 if (channel_get_channel(socket, &channel) == 0) {
5136 // Would happen if the channel was closed in the same epoll loop.
5137 TICKER_WARNING("sending mesh message to %"PRIx64" on fd %d failed : %s",
5138 channel.nodeid, CSFD(socket), cf_strerror(errno));
5139 }
5140 else {
5141 TICKER_WARNING("sending mesh message on fd %d failed : %s",
5142 CSFD(socket), cf_strerror(errno));
5143 }
5144
5145 channel_socket_shutdown(socket);
5146 rv = -1;
5147 }
5148 else {
5149 rv = 0;
5150 }
5151
5152 CHANNEL_UNLOCK();
5153 return rv;
5154}
5155
5156/**
 * Send a heartbeat protocol message; retries in case of EAGAIN and EWOULDBLOCK.
5158 * @param socket the socket to send the buffer over.
5159 * @param buff the data buffer.
5160 * @param buffer_length the number of bytes in the buffer to send.
5161 * @return 0 on successful send -1 on failure
5162 */
5163static int
5164channel_multicast_msg_send(cf_socket* socket, uint8_t* buff,
5165 size_t buffer_length)
5166{
5167 CHANNEL_LOCK();
5168 int rv = 0;
5169 DETAIL("sending udp heartbeat to fd %d: msg size %zu", CSFD(socket),
5170 buffer_length);
5171
5172 int mtu = hb_mtu();
5173 if (buffer_length > mtu) {
5174 TICKER_WARNING("mtu breach, sending udp heartbeat to fd %d: mtu %d",
5175 CSFD(socket), mtu);
5176 }
5177
5178 cf_msock_cfg* socket_cfg = (cf_msock_cfg*)(socket->cfg);
5179 cf_sock_addr dest;
5180 dest.port = socket_cfg->port;
5181 cf_ip_addr_copy(&socket_cfg->addr, &dest.addr);
5182
5183 if (cf_socket_send_to(socket, buff, buffer_length, 0, &dest) < 0) {
5184 TICKER_WARNING("multicast message send failed on fd %d %s",
5185 CSFD(socket), cf_strerror(errno));
5186 rv = -1;
5187 }
5188 CHANNEL_UNLOCK();
5189 return rv;
5190}
5191
5192/**
5193 * Indicates if this msg requires compression.
5194 */
5195static bool
5196channel_msg_is_compression_required(msg* msg, int wire_size, int mtu)
5197{
5198 return wire_size > msg_compression_threshold(mtu);
5199}
5200
5201/**
5202 * Estimate the size of the buffer required to fill out the serialized message.
 * @param wire_size the wire size of the input message.
5204 * @param mtu the underlying network mtu.
5205 * @return the size of the buffer required.
5206 */
static int
channel_msg_buffer_size_get(int wire_size, int mtu)
{
	// Worst-case compressed size as reported by zlib.
	uLong compress_bound = compressBound(wire_size);

	// Allow room for either the raw or the compressed form, rounded up to a
	// power of two. (mtu is currently unused here.)
	return round_up_pow2(MAX((uLong)wire_size, compress_bound));
}
5212
5213/**
5214 * Fills the buffer with the serialized message.
5215 * @param original_msg the original message to serialize.
5216 * @param wire_size the message wire size.
5217 * @param mtu the underlying network mtu.
5218 * @param buffer the destination buffer.
5219 * @param buffer_len the buffer length.
5220 *
5221 * @return length of the serialized message.
5222 */
static size_t
channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu,
	uint8_t* buffer, size_t buffer_len)
{
	// This is output by msg_to_wire. Using a separate variable so that we do
	// not lose the actual buffer length needed for compression later on.
	size_t msg_size = msg_to_wire(original_msg, buffer);

	if (channel_msg_is_compression_required(original_msg, msg_size, mtu)) {
		// Compression is required.
		const size_t compressed_buffer_len = buffer_len;
		uint8_t* compressed_buffer = MSG_BUFF_ALLOC_OR_DIE(
				compressed_buffer_len,
				"error allocating memory size %zu for compressing message",
				compressed_buffer_len);

		// In/out parameter for compress2 - in: destination capacity,
		// out: actual compressed size.
		// NOTE(review): compress2 takes a uLongf* for the destination size;
		// size_t and uLong match on LP64 - verify for other targets.
		size_t compressed_msg_size = compressed_buffer_len;
		int compress_rv = compress2(compressed_buffer, &compressed_msg_size,
				buffer, wire_size, Z_BEST_COMPRESSION);

		if (compress_rv == Z_BUF_ERROR) {
			// Compression result going to be larger than original input buffer.
			// Skip compression and try to send the message as is.
			DETAIL(
					"skipping compression - compressed size larger than input size %zu",
					msg_size);
		}
		else {
			// Wrap the compressed payload in a fresh message and serialize
			// that instead - the compressed form overwrites buffer.
			msg* temp_msg = hb_msg_get();

			msg_set_buf(temp_msg, AS_HB_MSG_COMPRESSED_PAYLOAD,
					compressed_buffer, compressed_msg_size, MSG_SET_COPY);
			msg_size = msg_to_wire(temp_msg, buffer);

			hb_msg_return(temp_msg);
		}

		MSG_BUFF_FREE(compressed_buffer, compressed_buffer_len);

	}

	return msg_size;
}
5266
5267/**
5268 * Send a message to a destination node.
5269 */
static int
channel_msg_unicast(cf_node dest, msg* msg)
{
	size_t buffer_len = 0;
	uint8_t* buffer = NULL;
	if (!hb_is_mesh()) {
		// Can't send a unicast message in the multicast mode.
		WARNING("ignoring sending unicast message in multicast mode");
		return -1;
	}

	CHANNEL_LOCK();

	int rv = -1;
	cf_socket* connected_socket;

	if (channel_socket_get(dest, &connected_socket) != 0) {
		DEBUG("failing message send to disconnected node %" PRIx64, dest);
		rv = -1;
		goto Exit;
	}

	// Read the message to a buffer.
	int mtu = hb_mtu();
	int wire_size = msg_get_wire_size(msg);
	buffer_len = channel_msg_buffer_size_get(wire_size, mtu);
	buffer =
			MSG_BUFF_ALLOC_OR_DIE(buffer_len,
					"error allocating memory size %zu for sending message to node %" PRIx64,
					buffer_len, dest);

	// Serialize (and possibly compress) the message into the buffer.
	size_t msg_size = channel_msg_buffer_fill(msg, wire_size, mtu, buffer,
			buffer_len);

	// Send over the buffer.
	rv = channel_mesh_msg_send(connected_socket, buffer, msg_size);

Exit:
	// MSG_BUFF_FREE is a no-op when buffer is still NULL.
	MSG_BUFF_FREE(buffer, buffer_len);
	CHANNEL_UNLOCK();
	return rv;
}
5312
5313/**
5314 * Shash reduce function to walk over the socket to channel hash and broadcast
5315 * the message in udata.
5316 */
5317static int
5318channel_msg_broadcast_reduce(const void* key, void* data, void* udata)
5319{
5320 CHANNEL_LOCK();
5321 cf_socket** socket = (cf_socket**)key;
5322 as_hb_channel* channel = (as_hb_channel*)data;
5323 as_hb_channel_buffer_udata* buffer_udata =
5324 (as_hb_channel_buffer_udata*)udata;
5325
5326 if (!channel->is_multicast) {
5327 DETAIL(
5328 "broadcasting message of length %zu on channel %d assigned to node %" PRIx64,
5329 buffer_udata->buffer_len, CSFD(*socket), channel->nodeid);
5330
5331 channel_mesh_msg_send(*socket, buffer_udata->buffer,
5332 buffer_udata->buffer_len);
5333 }
5334 else {
5335 channel_multicast_msg_send(*socket, buffer_udata->buffer,
5336 buffer_udata->buffer_len);
5337 }
5338
5339 CHANNEL_UNLOCK();
5340
5341 return CF_SHASH_OK;
5342}
5343
5344/**
5345 * Broadcast a message over all channels.
5346 */
5347static int
5348channel_msg_broadcast(msg* msg)
5349{
5350 CHANNEL_LOCK();
5351
5352 int rv = -1;
5353
5354 // Read the message to a buffer.
5355 int mtu = hb_mtu();
5356 int wire_size = msg_get_wire_size(msg);
5357 size_t buffer_len = channel_msg_buffer_size_get(wire_size, mtu);
5358 uint8_t* buffer = MSG_BUFF_ALLOC_OR_DIE(buffer_len,
5359 "error allocating memory size %zu for sending broadcast message",
5360 buffer_len);
5361
5362 as_hb_channel_buffer_udata udata;
5363 udata.buffer = buffer;
5364
5365 // Note this is the length of buffer to send.
5366 udata.buffer_len = channel_msg_buffer_fill(msg, wire_size, mtu, buffer,
5367 buffer_len);
5368
5369 cf_shash_reduce(g_hb.channel_state.socket_to_channel,
5370 channel_msg_broadcast_reduce, &udata);
5371
5372 MSG_BUFF_FREE(buffer, buffer_len);
5373 CHANNEL_UNLOCK();
5374 return rv;
5375}
5376
5377/**
5378 * Clear all channel state.
5379 */
5380static void
5381channel_clear()
5382{
5383 if (!channel_is_stopped()) {
5384 WARNING("attempted channel clear without stopping the channel");
5385 return;
5386 }
5387
5388 CHANNEL_LOCK();
5389
5390 // Free the unpublished event queue.
5391 cf_queue_delete_all(&g_hb.channel_state.events_queue);
5392
5393 // Delete nodeid to socket hash.
5394 cf_shash_reduce(g_hb.channel_state.nodeid_to_socket, hb_delete_all_reduce,
5395 NULL);
5396
5397 // Delete the socket_to_channel hash.
5398 cf_shash_reduce(g_hb.channel_state.socket_to_channel, hb_delete_all_reduce,
5399 NULL);
5400
5401 DETAIL("cleared channel information");
5402 CHANNEL_UNLOCK();
5403}
5404
5405/**
5406 * Reduce function to dump channel node info to log file.
5407 */
5408static int
5409channel_dump_reduce(const void* key, void* data, void* udata)
5410{
5411 cf_socket** socket = (cf_socket**)key;
5412 as_hb_channel* channel = (as_hb_channel*)data;
5413
5414 INFO("\tHB Channel (%s): node-id %" PRIx64 " fd %d endpoint %s polarity %s last-received %" PRIu64,
5415 channel->is_multicast ? "multicast" : "mesh", channel->nodeid,
5416 CSFD(*socket), (cf_sock_addr_is_any(&channel->endpoint_addr))
5417 ? "unknown"
5418 : cf_sock_addr_print(&channel->endpoint_addr),
5419 channel->is_inbound ? "inbound" : "outbound",
5420 channel->last_received);
5421
5422 return CF_SHASH_OK;
5423}
5424
5425/**
5426 * Dump channel state to logs.
5427 * @param verbose enables / disables verbose logging.
5428 */
5429static void
5430channel_dump(bool verbose)
5431{
5432 CHANNEL_LOCK();
5433
5434 INFO("HB Channel Count %d",
5435 cf_shash_get_size(g_hb.channel_state.socket_to_channel));
5436
5437 if (verbose) {
5438 cf_shash_reduce(g_hb.channel_state.socket_to_channel,
5439 channel_dump_reduce, NULL);
5440 }
5441
5442 CHANNEL_UNLOCK();
5443}
5444
5445/*
5446 * ----------------------------------------------------------------------------
5447 * Mesh sub module.
5448 * ----------------------------------------------------------------------------
5449 */
5450
5451/**
5452 * Is mesh running.
5453 */
5454static bool
5455mesh_is_running()
5456{
5457 MESH_LOCK();
5458 bool retval =
5459 (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_RUNNING) ?
5460 true : false;
5461 MESH_UNLOCK();
5462 return retval;
5463}
5464
5465/**
5466 * Is mesh stopped.
5467 */
5468static bool
5469mesh_is_stopped()
5470{
5471 MESH_LOCK();
5472 bool retval =
5473 (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_STOPPED) ?
5474 true : false;
5475 MESH_UNLOCK();
5476 return retval;
5477}
5478
5479/**
5480 * Refresh the mesh published endpoint list.
5481 * @return 0 on successful list creation, -1 otherwise.
5482 */
5483static int
5484mesh_published_endpoint_list_refresh()
5485{
5486 int rv = -1;
5487 MESH_LOCK();
5488
5489 // TODO: Add interface addresses change detection logic here as well.
5490 if (g_hb.mode_state.mesh_state.published_endpoint_list != NULL
5491 && g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only
5492 == cf_ip_addr_legacy_only()) {
5493 rv = 0;
5494 goto Exit;
5495 }
5496
5497 // The global flag has changed, refresh the published address list.
5498 if (g_hb.mode_state.mesh_state.published_endpoint_list) {
5499 // Free the obsolete list.
5500 cf_free(g_hb.mode_state.mesh_state.published_endpoint_list);
5501 }
5502
5503 const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
5504 cf_serv_cfg published_cfg;
5505
5506 config_bind_serv_cfg_expand(bind_cfg, &published_cfg,
5507 g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only);
5508
5509 g_hb.mode_state.mesh_state.published_endpoint_list =
5510 as_endpoint_list_from_serv_cfg(&published_cfg);
5511
5512 if (!g_hb.mode_state.mesh_state.published_endpoint_list) {
5513 CRASH("error initializing mesh published address list");
5514 }
5515
5516 g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only =
5517 cf_ip_addr_legacy_only();
5518
5519 rv = 0;
5520
5521 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
5522 as_endpoint_list_to_string(
5523 g_hb.mode_state.mesh_state.published_endpoint_list,
5524 endpoint_list_str, sizeof(endpoint_list_str));
5525 INFO("updated heartbeat published address list to {%s}", endpoint_list_str);
5526
5527Exit:
5528 MESH_UNLOCK();
5529 return rv;
5530}
5531
5532/**
5533 * Read the published endpoint list via a callback. The call back pattern is to
5534 * prevent access to the published list outside the mesh lock.
5535 * @param process_fn the list process function. The list passed to the process
5536 * function can be NULL.
5537 * @param udata passed as is to the process function.
5538 */
5539static void
5540mesh_published_endpoints_process(endpoint_list_process_fn process_fn,
5541 void* udata)
5542{
5543 MESH_LOCK();
5544
5545 as_endpoint_list* rv = NULL;
5546 if (mesh_published_endpoint_list_refresh()) {
5547 WARNING("error creating mesh published endpoint list");
5548 rv = NULL;
5549 }
5550 else {
5551 rv = g_hb.mode_state.mesh_state.published_endpoint_list;
5552 }
5553
5554 (process_fn)(rv, udata);
5555
5556 MESH_UNLOCK();
5557}
5558
5559/**
5560 * Convert mesh status to a string.
5561 */
5562static const char*
5563mesh_node_status_string(as_hb_mesh_node_status status)
5564{
5565 static char* status_str[] = {
5566 "active",
5567 "pending",
5568 "inactive",
5569 "endpoint-unknown" };
5570
5571 if (status >= AS_HB_MESH_NODE_STATUS_SENTINEL) {
5572 return "corrupted";
5573 }
5574 return status_str[status];
5575}
5576
5577/**
5578 * Change the state of a mesh node. Note: memset the mesh_nodes to zero before
5579 * calling state change for the first time.
5580 */
5581static void
5582mesh_seed_status_change(as_hb_mesh_seed* seed,
5583 as_hb_mesh_node_status new_status)
5584{
5585 seed->status = new_status;
5586 seed->last_status_updated = cf_getms();
5587}
5588
5589/**
5590 * Destroy a mesh seed node.
5591 */
5592static void
5593mesh_seed_destroy(as_hb_mesh_seed* seed)
5594{
5595 MESH_LOCK();
5596 if (seed->resolved_endpoint_list) {
5597 cf_free(seed->resolved_endpoint_list);
5598 seed->resolved_endpoint_list = NULL;
5599 }
5600 MESH_UNLOCK();
5601}
5602
5603static void
5604mesh_seed_dns_resolve_cb(bool is_resolved, const char* hostname,
5605 const cf_ip_addr *addrs, uint32_t n_addrs, void *udata)
5606{
5607 MESH_LOCK();
5608 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
5609 int element_count = cf_vector_size(seeds);
5610 for (int i = 0; i < element_count; i++) {
5611 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
5612
5613 if ((strncmp(seed->seed_host_name, hostname,
5614 sizeof(seed->seed_host_name)) != 0)
5615 || seed->resolved_endpoint_list != NULL) {
5616 continue;
5617 }
5618
5619 cf_serv_cfg temp_serv_cfg;
5620 cf_serv_cfg_init(&temp_serv_cfg);
5621
5622 cf_sock_cfg sock_cfg;
5623 cf_sock_cfg_init(&sock_cfg,
5624 seed->seed_tls ?
5625 CF_SOCK_OWNER_HEARTBEAT_TLS : CF_SOCK_OWNER_HEARTBEAT);
5626 sock_cfg.port = seed->seed_port;
5627
5628 for (int i = 0; i < n_addrs; i++) {
5629 cf_ip_addr_copy(&addrs[i], &sock_cfg.addr);
5630 if (cf_serv_cfg_add_sock_cfg(&temp_serv_cfg, &sock_cfg)) {
5631 CRASH("error initializing resolved address list");
5632 }
5633
5634 DETAIL("resolved mesh node hostname %s to %s", seed->seed_host_name,
5635 cf_ip_addr_print(&addrs[i]));
5636 }
5637
5638 seed->resolved_endpoint_list = as_endpoint_list_from_serv_cfg(
5639 &temp_serv_cfg);
5640 }
5641
5642 MESH_UNLOCK();
5643}
5644
5645/**
5646 * Fill the endpoint list for a mesh seed using the mesh seed hostname and port.
5647 * returns the
5648 * @param mesh_node the mesh node
5649 * @return 0 on success. -1 if a valid endpoint list does not exist and it could
5650 * not be generated.
5651 */
5652static int
5653mesh_seed_endpoint_list_fill(as_hb_mesh_seed* seed)
5654{
5655 if (seed->resolved_endpoint_list != NULL
5656 && seed->resolved_endpoint_list->n_endpoints > 0) {
5657 // A valid endpoint list already exists. For now we resolve only once.
5658 return 0;
5659 }
5660
5661 cf_clock now = cf_getms();
5662 if (now
5663 < seed->resolved_endpoint_list_ts
5664 + MESH_SEED_RESOLVE_ATTEMPT_INTERVAL()) {
5665 // We have just resolved this seed entry unsuccessfully. Don't try again
5666 // for sometime.
5667 return -1;
5668 }
5669
5670 // Resolve and get all IPv4/IPv6 ip addresses asynchronously.
5671 seed->resolved_endpoint_list_ts = now;
5672 cf_ip_addr_from_string_multi_a(seed->seed_host_name,
5673 mesh_seed_dns_resolve_cb, NULL);
5674 return -1;
5675}
5676
5677/**
5678 * Find a mesh seed in the seed list that has an overlapping endpoint and return
5679 * an internal pointer. Assumes this function is called within mesh lock to
5680 * prevent invalidating the returned index after function return.
5681 *
5682 * @param endpoint_list the endpoint list to find the endpoint by.
5683 * @return index to matching seed entry if found, else -1
5684 */
5685static int
5686mesh_seed_endpoint_list_overlapping_find_unsafe(as_endpoint_list* endpoint_list)
5687{
5688 MESH_LOCK();
5689
5690 int match_index = -1;
5691 if (!endpoint_list) {
5692 // Null / empty endpoint list.
5693 goto Exit;
5694 }
5695 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
5696 int element_count = cf_vector_size(seeds);
5697 for (int i = 0; i < element_count; i++) {
5698 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
5699
5700 // Ensure the seed hostname is resolved.
5701 mesh_seed_endpoint_list_fill(seed);
5702
5703 if (as_endpoint_lists_are_overlapping(endpoint_list,
5704 seed->resolved_endpoint_list, true)) {
5705 match_index = i;
5706 break;
5707 }
5708 }
5709
5710Exit:
5711 MESH_UNLOCK();
5712 return match_index;
5713}
5714
5715/**
5716 * Remove a seed entry from the seed list.
5717 * Assumes this function is called within mesh lock to prevent invalidating the
5718 * used index during a function call.
5719 * @param seed_index the index of the seed element.
5720 * @return 0 on success -1 on failure.
5721 */
5722static int
5723mesh_seed_delete_unsafe(int seed_index)
5724{
5725 int rv = -1;
5726 MESH_LOCK();
5727 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
5728 if (seed_index >= 0) {
5729 as_hb_mesh_seed* seed = cf_vector_getp(seeds, seed_index);
5730 mesh_seed_destroy(seed);
5731 rv = cf_vector_delete(seeds, seed_index);
5732 if (rv == 0) {
5733 INFO("removed mesh seed host:%s port %d", seed->seed_host_name,
5734 seed->seed_port);
5735 }
5736 }
5737 MESH_UNLOCK();
5738 return rv;
5739}
5740
5741/**
5742 * Find a mesh seed in the seed list with exactly matching hostname and port.
5743 * Assumes this function is called within mesh lock to prevent invalidating the
5744 * returned index after function return.
5745 *
5746 * @param host the seed hostname
5747 * @param port the seed port
5748 * @return index to matching seed entry if found, else -1
5749 */
5750static int
5751mesh_seed_find_unsafe(char* host, int port)
5752{
5753 MESH_LOCK();
5754
5755 int match_index = -1;
5756 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
5757 int element_count = cf_vector_size(seeds);
5758 for (int i = 0; i < element_count; i++) {
5759 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
5760 if (strncmp(seed->seed_host_name, host, sizeof(seed->seed_host_name))
5761 == 0 && seed->seed_port == port) {
5762 match_index = i;
5763 break;
5764 }
5765 }
5766
5767 MESH_UNLOCK();
5768 return match_index;
5769}
5770
5771/**
5772 * Endure mesh tend udata has enough space for current mesh nodes.
5773 */
5774static void
5775mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata,
5776 int mesh_node_count)
5777{
5778 // Ensure capacity for nodes to connect.
5779 if (tend_reduce_udata->to_connect_capacity < mesh_node_count) {
5780 uint32_t alloc_size = round_up_pow2(
5781 mesh_node_count * sizeof(as_endpoint_list*));
5782 int old_capacity = tend_reduce_udata->to_connect_capacity;
5783 tend_reduce_udata->to_connect_capacity = alloc_size
5784 / sizeof(as_endpoint_list*);
5785 tend_reduce_udata->to_connect = cf_realloc(
5786 tend_reduce_udata->to_connect, alloc_size);
5787
5788 // NULL out newly allocated elements.
5789 for (int i = old_capacity; i < tend_reduce_udata->to_connect_capacity;
5790 i++) {
5791 tend_reduce_udata->to_connect[i] = NULL;
5792 }
5793 }
5794}
5795
5796/**
5797 * Change the state of a mesh node. Note: memset the mesh_nodes to zero before
5798 * calling state change for the first time.
5799 */
5800static void
5801mesh_node_status_change(as_hb_mesh_node* mesh_node,
5802 as_hb_mesh_node_status new_status)
5803{
5804 as_hb_mesh_node_status old_status = mesh_node->status;
5805 mesh_node->status = new_status;
5806
5807 if ((new_status != AS_HB_MESH_NODE_CHANNEL_ACTIVE
5808 && old_status == AS_HB_MESH_NODE_CHANNEL_ACTIVE)
5809 || mesh_node->last_status_updated == 0) {
5810 mesh_node->inactive_since = cf_getms();
5811 }
5812 mesh_node->last_status_updated = cf_getms();
5813 return;
5814}
5815
5816/**
5817 * Close mesh listening sockets.
5818 */
5819static void
5820mesh_listening_sockets_close()
5821{
5822 MESH_LOCK();
5823 INFO("closing mesh heartbeat sockets");
5824 cf_sockets_close(&g_hb.mode_state.mesh_state.listening_sockets);
5825 DEBUG("closed mesh heartbeat sockets");
5826 MESH_UNLOCK();
5827}
5828
5829/**
5830 * Populate the buffer with mesh seed list.
5831 */
5832static void
5833mesh_seed_host_list_get(cf_dyn_buf* db, bool tls)
5834{
5835 if (!hb_is_mesh()) {
5836 return;
5837 }
5838
5839 MESH_LOCK();
5840
5841 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
5842 int element_count = cf_vector_size(seeds);
5843 for (int i = 0; i < element_count; i++) {
5844 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
5845 const char* info_key =
5846 seed->seed_tls ?
5847 "heartbeat.tls-mesh-seed-address-port=" :
5848 "heartbeat.mesh-seed-address-port=";
5849
5850 cf_dyn_buf_append_string(db, info_key);
5851 cf_dyn_buf_append_string(db, seed->seed_host_name);
5852 cf_dyn_buf_append_char(db, ':');
5853 cf_dyn_buf_append_uint32(db, seed->seed_port);
5854 cf_dyn_buf_append_char(db, ';');
5855 }
5856
5857 MESH_UNLOCK();
5858}
5859
5860/**
5861 * Checks if the match between a mesh seed and a mesh node is valid.
5862 * The matching would be invalid if the mesh node's endpoint has been updated
5863 * after the match was made or there has been no match.
5864 */
5865static bool
5866mesh_seed_mesh_node_check(as_hb_mesh_seed* seed)
5867{
5868 if (seed->status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) {
5869 return false;
5870 }
5871
5872 as_hb_mesh_node node;
5873 if (mesh_node_get(seed->mesh_nodeid, &node) != 0) {
5874 // The matched node has vanished.
5875 return false;
5876 }
5877
5878 return seed->mesh_node_endpoint_change_ts == node.endpoint_change_ts;
5879}
5880
5881/**
5882 * Refresh the matching between seeds and mesh nodes and get inactive seeds.
5883 * Should be invoked under a mesh lock to ensure the validity of returned
5884 * pointers.
5885 * @param inactive_seeds_p output vector of inactive seed pointers. Can be NULL
5886 * if inactive nodes need not be returned.
5887 */
5888static void
5889mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p)
5890{
5891 MESH_LOCK();
5892
5893 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
5894 int element_count = cf_vector_size(seeds);
5895 if (inactive_seeds_p) {
5896 cf_vector_clear(inactive_seeds_p);
5897 }
5898
5899 // Mark seeds that do not have a matching mesh node and transitively do not
5900 // have a matching channel.
5901 cf_clock now = cf_getms();
5902 for (int i = 0; i < element_count; i++) {
5903 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
5904 if (mesh_seed_mesh_node_check(seed)) {
5905 continue;
5906 }
5907
5908 seed->mesh_nodeid = 0;
5909 seed->mesh_node_endpoint_change_ts = 0;
5910
5911 // The mesh node is being connected. Skip.
5912 if (seed->status == AS_HB_MESH_NODE_CHANNEL_PENDING) {
5913 if (seed->last_status_updated + MESH_PENDING_TIMEOUT > now) {
5914 // Spare the pending seeds, since we are attempting to connect
5915 // to the seed host.
5916 continue;
5917 }
5918
5919 // Flip to inactive if we have been in pending state for a long
5920 // time.
5921 mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
5922 }
5923
5924 if (seed->status != AS_HB_MESH_NODE_CHANNEL_PENDING) {
5925 mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
5926 if (inactive_seeds_p) {
5927 cf_vector_append(inactive_seeds_p, &seed);
5928 }
5929 }
5930 }
5931
5932 MESH_UNLOCK();
5933}
5934
5935/**
5936 * Match input seeds to a mesh node using its endpoint address and
5937 */
5938static void
5939mesh_seeds_mesh_node_match_update(cf_vector* inactive_seeds_p,
5940 as_hb_mesh_node* mesh_node, cf_node mesh_nodeid)
5941{
5942 if (mesh_node->status
5943 == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN|| mesh_node->endpoint_list == NULL) {
5944 return;
5945 }
5946
5947 int element_count = cf_vector_size(inactive_seeds_p);
5948 for (int i = 0; i < element_count; i++) {
5949 // No null check required since we are iterating under a lock and within
5950 // vector bounds.
5951 as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(
5952 inactive_seeds_p, i);
5953 if (as_endpoint_lists_are_overlapping(seed->resolved_endpoint_list,
5954 mesh_node->endpoint_list, true)) {
5955 // We found a matching mesh node for the seed, flip its status to
5956 // active.
5957 seed->mesh_nodeid = mesh_nodeid;
5958 seed->mesh_node_endpoint_change_ts = mesh_node->endpoint_change_ts;
5959 mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_ACTIVE);
5960 DEBUG("seed entry %s:%d connected", seed->seed_host_name,
5961 seed->seed_port);
5962 }
5963 }
5964}
5965
5966/**
5967 * Determines if a mesh entry should be connected to or expired and deleted.
5968 */
5969static int
5970mesh_tend_reduce(const void* key, void* data, void* udata)
5971{
5972 MESH_LOCK();
5973
5974 int rv = CF_SHASH_OK;
5975 cf_node nodeid = *(cf_node*)key;
5976 as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
5977 as_hb_mesh_tend_reduce_udata* tend_reduce_udata =
5978 (as_hb_mesh_tend_reduce_udata*)udata;
5979
5980 DETAIL("tending mesh node %"PRIx64" with status %s", nodeid,
5981 mesh_node_status_string(mesh_node->status));
5982
5983 mesh_seeds_mesh_node_match_update(tend_reduce_udata->inactive_seeds_p,
5984 mesh_node, nodeid);
5985
5986 if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) {
5987 // The mesh node is connected. Skip.
5988 goto Exit;
5989 }
5990
5991 cf_clock now = cf_getms();
5992
5993 if (!mesh_node->endpoint_list) {
5994 // Will happen if node discover and disconnect happen close together.
5995 mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_ENDPOINT_UNKNOWN);
5996 }
5997
5998 if (mesh_node->inactive_since + MESH_INACTIVE_TIMEOUT <= now) {
5999 DEBUG("mesh forgetting node %" PRIx64" because it could not be connected since %" PRIx64,
6000 nodeid, mesh_node->inactive_since);
6001 rv = CF_SHASH_REDUCE_DELETE;
6002 goto Exit;
6003 }
6004
6005 if (mesh_node->status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) {
6006 if (mesh_node->last_status_updated + MESH_ENDPOINT_UNKNOWN_TIMEOUT
6007 > now) {
6008 DEBUG("mesh forgetting node %"PRIx64" ip address/port undiscovered since %"PRIu64,
6009 nodeid, mesh_node->last_status_updated);
6010
6011 rv = CF_SHASH_REDUCE_DELETE;
6012 }
6013 // Skip connecting with a node with unknown endpoint.
6014 goto Exit;
6015 }
6016
6017 if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_PENDING) {
6018 // The mesh node is being connected. Skip.
6019 if (mesh_node->last_status_updated + MESH_PENDING_TIMEOUT > now) {
6020 goto Exit;
6021 }
6022
6023 // Flip to inactive if we have been in pending state for a long time.
6024 mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
6025 }
6026
6027 // Channel for this node is inactive. Prompt the channel sub module to
6028 // connect to this node.
6029 if (tend_reduce_udata->to_connect_count
6030 >= tend_reduce_udata->to_connect_capacity) {
6031 // New nodes found but we are out of capacity. Ultra defensive coding.
6032 // This will never happen under the locks.
6033 WARNING("skipping connecting to node %" PRIx64" - not enough memory allocated",
6034 nodeid);
6035 goto Exit;
6036 }
6037
6038 endpoint_list_copy(
6039 &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count],
6040 mesh_node->endpoint_list);
6041 tend_reduce_udata->to_connect_count++;
6042
6043 // Flip status to pending.
6044 mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_PENDING);
6045
6046Exit:
6047 if (rv == CF_SHASH_REDUCE_DELETE) {
6048 // Clear all internal allocated memory.
6049 mesh_node_destroy(mesh_node);
6050 }
6051
6052 MESH_UNLOCK();
6053
6054 return rv;
6055}
6056
6057/**
6058 * Add inactive seeds to to_connect array.
6059 * Should be invoked under mesh lock to prevent invalidating the array of seed
6060 * node pointers.
6061 * @param seed_p vector of seed pointers.
6062 * @param tend reduce udata having the to connect endpoint list.
6063 */
6064void
6065mesh_seeds_inactive_add_to_connect(cf_vector* seeds_p,
6066 as_hb_mesh_tend_reduce_udata* tend_reduce_udata)
6067{
6068 MESH_LOCK();
6069 int element_count = cf_vector_size(seeds_p);
6070 for (int i = 0; i < element_count; i++) {
6071 as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(seeds_p, i);
6072 if (seed->status != AS_HB_MESH_NODE_CHANNEL_INACTIVE) {
6073 continue;
6074 }
6075
6076 // Channel for this node is inactive. Prompt the channel sub module to
6077 // connect to this node.
6078 if (tend_reduce_udata->to_connect_count
6079 >= tend_reduce_udata->to_connect_capacity) {
6080 // New nodes found but we are out of capacity. Ultra defensive
6081 // coding.
6082 // This will never happen under the locks.
6083 WARNING(
6084 "skipping connecting to %s:%d - not enough memory allocated",
6085 seed->seed_host_name, seed->seed_port);
6086 return;
6087 }
6088
6089 // Ensure the seed hostname is resolved.
6090 if (mesh_seed_endpoint_list_fill(seed) != 0) {
6091 continue;
6092 }
6093
6094 endpoint_list_copy(
6095 &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count],
6096 seed->resolved_endpoint_list);
6097 tend_reduce_udata->to_connect_count++;
6098
6099 // Flip status to pending.
6100 mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_PENDING);
6101 }
6102 MESH_UNLOCK();
6103}
6104
6105/**
6106 * Tends the mesh host list, to discover and remove nodes. Should never invoke a
6107 * channel call while holding a mesh lock.
6108 */
6109void*
6110mesh_tender(void* arg)
6111{
6112 DETAIL("mesh tender started");
6113 // Figure out which nodes need to be connected to.
6114 // collect nodes to connect to and remove dead nodes.
6115 as_hb_mesh_tend_reduce_udata tend_reduce_udata = { NULL, 0, 0 };
6116
6117 // Vector of pointer to inactive seeds.
6118 cf_vector inactive_seeds_p;
6119 cf_vector_init(&inactive_seeds_p, sizeof(as_hb_mesh_seed*),
6120 AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
6121
6122 cf_clock last_time = 0;
6123
6124 while (hb_is_mesh() && mesh_is_running()) {
6125 cf_clock curr_time = cf_getms();
6126
6127 // Unlocked access but this should be alright Set the discovered flag.
6128 bool nodes_discovered = g_hb.mode_state.mesh_state.nodes_discovered;
6129 if ((curr_time - last_time) < MESH_TEND_INTERVAL && !nodes_discovered) {
6130 // Interval has not been reached for sending heartbeats
6131 usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time +
6132 MESH_TEND_INTERVAL) - curr_time) * 1000);
6133 continue;
6134 }
6135 last_time = curr_time;
6136
6137 DETAIL("tending mesh list");
6138
6139 MESH_LOCK();
6140 // Unset the discovered flag.
6141 g_hb.mode_state.mesh_state.nodes_discovered = false;
6142
6143 // Update the list of inactive seeds.
6144 mesh_seed_inactive_refresh_get_unsafe(&inactive_seeds_p);
6145
6146 // Make sure the udata has enough capacity.
6147 int connect_count_max = cf_shash_get_size(
6148 g_hb.mode_state.mesh_state.nodeid_to_mesh_node)
6149 + cf_vector_size(&inactive_seeds_p);
6150 mesh_tend_udata_capacity_ensure(&tend_reduce_udata, connect_count_max);
6151
6152 tend_reduce_udata.to_connect_count = 0;
6153 tend_reduce_udata.inactive_seeds_p = &inactive_seeds_p;
6154 cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
6155 mesh_tend_reduce, &tend_reduce_udata);
6156
6157 // Add inactive seeds for connection.
6158 mesh_seeds_inactive_add_to_connect(&inactive_seeds_p,
6159 &tend_reduce_udata);
6160
6161 MESH_UNLOCK();
6162
6163 // Connect can be time consuming, especially in failure cases.
6164 // Connect outside of the mesh lock and prevent hogging the lock.
6165 if (tend_reduce_udata.to_connect_count > 0) {
6166 // Try connecting the newer nodes.
6167 channel_mesh_channel_establish(tend_reduce_udata.to_connect,
6168 tend_reduce_udata.to_connect_count);
6169 }
6170
6171 DETAIL("done tending mesh list");
6172 }
6173
6174 if (tend_reduce_udata.to_connect) {
6175 // Free space allocated for endpoint lists.
6176 for (int i = 0; i < tend_reduce_udata.to_connect_capacity; i++) {
6177 if (tend_reduce_udata.to_connect[i]) {
6178 cf_free(tend_reduce_udata.to_connect[i]);
6179 }
6180 }
6181 cf_free(tend_reduce_udata.to_connect);
6182 }
6183
6184 cf_vector_destroy(&inactive_seeds_p);
6185
6186 DETAIL("mesh tender shut down");
6187 return NULL;
6188}
6189
6190/**
6191 * Add or update a mesh node to mesh node list.
6192 */
6193static void
6194mesh_node_add_update(cf_node nodeid, as_hb_mesh_node* mesh_node)
6195{
6196 MESH_LOCK();
6197 cf_shash_put(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid,
6198 mesh_node);
6199 MESH_UNLOCK();
6200}
6201
6202/**
6203 * Destroy a mesh node.
6204 */
6205static void
6206mesh_node_destroy(as_hb_mesh_node* mesh_node)
6207{
6208 MESH_LOCK();
6209 if (mesh_node->endpoint_list) {
6210 cf_free(mesh_node->endpoint_list);
6211 mesh_node->endpoint_list = NULL;
6212 }
6213 MESH_UNLOCK();
6214}
6215
6216/**
6217 * Endpoint list iterate function find endpoint matching sock addr.
6218 */
6219static void
6220mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata)
6221{
6222 cf_sock_addr endpoint_addr;
6223 if (as_endpoint_to_sock_addr(endpoint, &endpoint_addr) != 0) {
6224 return;
6225 }
6226
6227 as_hb_endpoint_list_addr_find_udata* endpoint_reduce_udata =
6228 (as_hb_endpoint_list_addr_find_udata*)udata;
6229
6230 if (cf_sock_addr_compare(&endpoint_addr, endpoint_reduce_udata->to_search)
6231 == 0) {
6232 endpoint_reduce_udata->found = true;
6233 }
6234}
6235
6236/**
6237 * Indicates if a give node is discovered.
6238 * @param nodeid the input nodeid.
6239 * @return true if discovered, false otherwise.
6240 */
6241static bool
6242mesh_node_is_discovered(cf_node nodeid)
6243{
6244 if (nodeid == config_self_nodeid_get()) {
6245 // Assume this node knows itself.
6246 return true;
6247 }
6248
6249 as_hb_mesh_node mesh_node;
6250 return mesh_node_get(nodeid, &mesh_node) == 0;
6251}
6252
6253/**
6254 * Indicates if a give node has a valid endpoint list.
6255 * @param nodeid the input nodeid.
6256 * @return true if node has valid endpoint list, false otherwise.
6257 */
6258static bool
6259mesh_node_endpoint_list_is_valid(cf_node nodeid)
6260{
6261 if (nodeid == config_self_nodeid_get()) {
6262 // Assume this node knows itself.
6263 return true;
6264 }
6265
6266 as_hb_mesh_node mesh_node;
6267 return mesh_node_get(nodeid, &mesh_node) == 0
6268 && mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN
6269 && mesh_node.endpoint_list;
6270}
6271
6272/**
6273 * Get the mesh node associated with this node.
6274 * @param nodeid the nodeid to search for.
6275 * @param is_real_nodeid indicates if the query is for a real or fake nodeid.
6276 * @param mesh_node the output mesh node.
6277 * @return 0 on success -1 if there is mesh node attached.
6278 */
6279static int
6280mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node)
6281{
6282 int rv = -1;
6283
6284 MESH_LOCK();
6285 if (cf_shash_get(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid,
6286 mesh_node) == CF_SHASH_OK) {
6287 rv = 0;
6288 }
6289 else {
6290 // The node not found.
6291 rv = -1;
6292 }
6293 MESH_UNLOCK();
6294 return rv;
6295}
6296
6297/**
6298 * Handle the event when the channel reports a node as disconnected.
6299 */
6300static void
6301mesh_channel_on_node_disconnect(as_hb_channel_event* event)
6302{
6303 MESH_LOCK();
6304
6305 as_hb_mesh_node mesh_node;
6306 if (mesh_node_get(event->nodeid, &mesh_node) != 0) {
6307 // Again should not happen in practice. But not really bad.
6308 DEBUG("unknown mesh node disconnected %" PRIx64, event->nodeid);
6309 goto Exit;
6310 }
6311
6312 DEBUG("mesh setting node %" PRIx64" status as inactive on loss of channel",
6313 event->nodeid);
6314
6315 // Mark this node inactive and move on. Mesh tender should remove this node
6316 // after it has been inactive for a while.
6317 mesh_node_status_change(&mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
6318
6319 // Update the mesh entry.
6320 mesh_node_add_update(event->nodeid, &mesh_node);
6321
6322Exit:
6323 MESH_UNLOCK();
6324}
6325
6326/**
6327 * Check and fix the case where we received a self incoming message probably
6328 * because one of our non loop back interfaces was used as a seed address.
6329 *
6330 * @return true if this message is a self message, false otherwise.
6331 */
6332static bool
6333mesh_node_check_fix_self_msg(as_hb_channel_event* event)
6334{
6335 if (event->nodeid == config_self_nodeid_get()) {
6336 // Handle self message. Will happen if the seed node address on this
6337 // node does not match the listen / publish address.
6338 as_endpoint_list* msg_endpoint_list;
6339 msg_endpoint_list_get(event->msg, &msg_endpoint_list);
6340
6341 MESH_LOCK();
6342
6343 // Check if this node has published an endpoint list matching self node.
6344 endpoint_list_equal_check_udata udata = { 0 };
6345 udata.are_equal = false;
6346 udata.other = msg_endpoint_list;
6347 mesh_published_endpoints_process(endpoint_list_equal_process, &udata);
6348
6349 if (udata.are_equal) {
6350 // Definitely pulse message from self node.
6351 int self_seed_index =
6352 mesh_seed_endpoint_list_overlapping_find_unsafe(
6353 msg_endpoint_list);
6354 if (self_seed_index >= 0) {
6355 as_hb_mesh_seed* self_seed = cf_vector_getp(
6356 &g_hb.mode_state.mesh_state.seeds, self_seed_index);
6357 INFO("removing self seed entry host:%s port:%d",
6358 self_seed->seed_host_name, self_seed->seed_port);
6359 as_hb_mesh_tip_clear(self_seed->seed_host_name,
6360 self_seed->seed_port);
6361 }
6362 }
6363 MESH_UNLOCK();
6364 return true;
6365 }
6366 return false;
6367}
6368
6369/**
6370 * Update mesh node status based on an incoming message.
6371 */
6372static void
6373mesh_node_data_update(as_hb_channel_event* event)
6374{
6375 if (mesh_node_check_fix_self_msg(event)) {
6376 // Message from self, can be ignored.
6377 return;
6378 }
6379
6380 MESH_LOCK();
6381 as_hb_mesh_node existing_mesh_node = { 0 };
6382 as_endpoint_list* msg_endpoint_list = NULL;
6383 msg_endpoint_list_get(event->msg, &msg_endpoint_list);
6384
6385 // Search for existing entry.
6386 bool needs_update = mesh_node_get(event->nodeid, &existing_mesh_node) != 0;
6387
6388 // Update the endpoint list to be the message endpoint list if the seed ip
6389 // list and the published ip list differ
6390 if (!as_endpoint_lists_are_equal(existing_mesh_node.endpoint_list,
6391 msg_endpoint_list)) {
6392 char endpoint_list_str1[ENDPOINT_LIST_STR_SIZE];
6393 endpoint_list_str1[0] = 0;
6394
6395 as_endpoint_list_to_string(existing_mesh_node.endpoint_list,
6396 endpoint_list_str1, sizeof(endpoint_list_str1));
6397
6398 char endpoint_list_str2[ENDPOINT_LIST_STR_SIZE];
6399 as_endpoint_list_to_string(msg_endpoint_list, endpoint_list_str2,
6400 sizeof(endpoint_list_str2));
6401
6402 if (existing_mesh_node.endpoint_list) {
6403 INFO("for node %"PRIx64" updating mesh endpoint address from {%s} to {%s}",event->nodeid,
6404 endpoint_list_str1, endpoint_list_str2);
6405 }
6406
6407 // Update the endpoints.
6408 endpoint_list_copy(&existing_mesh_node.endpoint_list,
6409 msg_endpoint_list);
6410 existing_mesh_node.endpoint_change_ts = as_hlc_timestamp_now();
6411
6412 needs_update = true;
6413 }
6414
6415 if (existing_mesh_node.status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) {
6416 // Update status to active.
6417 mesh_node_status_change(&existing_mesh_node,
6418 AS_HB_MESH_NODE_CHANNEL_ACTIVE);
6419 needs_update = true;
6420 }
6421
6422 if (needs_update) {
6423 // Apply the update.
6424 mesh_node_add_update(event->nodeid, &existing_mesh_node);
6425 }
6426
6427 MESH_UNLOCK();
6428}
6429
6430/**
6431 * Return the in memory and on wire size of an info reply array.
6432 * @param reply the info reply.
6433 * @param reply_count the number of replies.
6434 * @param reply_size the wire size of the message.
 * @return 0 on successful reply count computation, -1 otherwise.
6436 */
6437static int
6438mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count,
6439 size_t* reply_size)
6440{
6441 // Go over reply and compute the count of replies and also validate the
6442 // endpoint lists.
6443 uint8_t* start_ptr = (uint8_t*)reply;
6444 *reply_size = 0;
6445
6446 for (int i = 0; i < reply_count; i++) {
6447 as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr;
6448 *reply_size += sizeof(as_hb_mesh_info_reply);
6449 start_ptr += sizeof(as_hb_mesh_info_reply);
6450
6451 size_t endpoint_list_size = 0;
6452 if (as_endpoint_list_sizeof(&reply_ptr->endpoint_list[0],
6453 &endpoint_list_size)) {
6454 // Incomplete / garbled info reply message.
6455 *reply_size = 0;
6456 return -1;
6457 }
6458
6459 *reply_size += endpoint_list_size;
6460 start_ptr += endpoint_list_size;
6461 }
6462
6463 return 0;
6464}
6465
6466/**
6467 * Send a info reply in reply to an info request.
6468 * @param dest the destination node to send the info reply to.
6469 * @param reply array of node ids and endpoints
6470 * @param reply_count the count of replies.
6471 */
6472static void
6473mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply,
6474 size_t reply_count)
6475{
6476 // Create the discover message.
6477 msg* msg = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REPLY);
6478
6479 // Set the reply.
6480 msg_info_reply_set(msg, reply, reply_count);
6481
6482 DEBUG("sending info reply to node %" PRIx64, dest);
6483
6484 // Send the info reply.
6485 if (channel_msg_unicast(dest, msg) != 0) {
6486 TICKER_WARNING("error sending info reply message to node %" PRIx64,
6487 dest);
6488 }
6489
6490 hb_msg_return(msg);
6491}
6492
6493/**
6494 * Initialize the info request msg buffer
6495 */
6496static msg*
6497mesh_info_msg_init(as_hb_msg_type msg_type)
6498{
6499 msg* msg = hb_msg_get();
6500 msg_src_fields_fill(msg);
6501 msg_type_set(msg, msg_type);
6502 return msg;
6503}
6504
6505/**
6506 * Send a info request for all undiscovered nodes.
6507 * @param dest the destination node to send the discover message to.
6508 * @param to_discover array of node ids to discover.
6509 * @param to_discover_count the count of nodes in the array.
6510 */
static void
mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover,
		size_t to_discover_count)
{
	// NOTE(review): in_msg is unused in this function - confirm whether it
	// can be dropped from the signature (single caller passes the pulse msg).
	// Create the discover message.
	msg* info_req = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REQUEST);

	// Set the list of nodes to discover.
	msg_node_list_set(info_req, AS_HB_MSG_INFO_REQUEST, to_discover,
			to_discover_count);

	DEBUG("sending info request to node %" PRIx64, dest);

	// Send the info request. Failure is tolerated - discovery retries on
	// subsequent pulses.
	if (channel_msg_unicast(dest, info_req) != 0) {
		TICKER_WARNING("error sending info request message to node %" PRIx64,
				dest);
	}
	hb_msg_return(info_req);
}
6531
6532/**
6533 * Handle an incoming pulse message to discover new neighbours.
6534 */
6535static void
6536mesh_channel_on_pulse(msg* msg)
6537{
6538 cf_node* adj_list;
6539 size_t adj_length;
6540
6541 cf_node source;
6542
6543 // Channel has validated the source. Don't bother checking here.
6544 msg_nodeid_get(msg, &source);
6545 if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) {
6546 // Adjacency list absent.
6547 WARNING("received message from %" PRIx64" without adjacency list",
6548 source);
6549 return;
6550 }
6551
6552 cf_node to_discover[adj_length];
6553 size_t num_to_discover = 0;
6554
6555 // TODO: Track already queried nodes so that we do not retry immediately.
6556 // Will need a separate state, pending query.
6557 MESH_LOCK();
6558
6559 // Try and discover new nodes from this message's adjacency list.
6560 for (int i = 0; i < adj_length; i++) {
6561 if (!mesh_node_is_discovered(adj_list[i])) {
6562 DEBUG("discovered new mesh node %" PRIx64, adj_list[i]);
6563
6564 as_hb_mesh_node new_node;
6565 memset(&new_node, 0, sizeof(new_node));
6566 mesh_node_status_change(&new_node,
6567 AS_HB_MESH_NODE_ENDPOINT_UNKNOWN);
6568
6569 // Add as a new node
6570 mesh_node_add_update(adj_list[i], &new_node);
6571 }
6572
6573 if (!mesh_node_endpoint_list_is_valid(adj_list[i])) {
6574 to_discover[num_to_discover++] = adj_list[i];
6575 }
6576 }
6577
6578 MESH_UNLOCK();
6579
6580 // Discover these nodes outside a lock.
6581 if (num_to_discover) {
6582 mesh_nodes_send_info_request(msg, source, to_discover, num_to_discover);
6583 }
6584}
6585
6586/**
6587 * Handle an incoming info message.
6588 */
6589static void
6590mesh_channel_on_info_request(msg* msg)
6591{
6592 cf_node* query_nodeids;
6593 size_t query_count;
6594
6595 cf_node source;
6596 msg_nodeid_get(msg, &source);
6597
6598 if (msg_node_list_get(msg, AS_HB_MSG_INFO_REQUEST, &query_nodeids,
6599 &query_count) != 0) {
6600 TICKER_WARNING("got an info request without query nodes from %" PRIx64,
6601 source);
6602 return;
6603 }
6604
6605 MESH_LOCK();
6606
6607 // Compute the entire response size.
6608 size_t reply_size = 0;
6609
6610 for (int i = 0; i < query_count; i++) {
6611 as_hb_mesh_node mesh_node;
6612
6613 if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) {
6614 if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN
6615 && mesh_node.endpoint_list) {
6616 size_t endpoint_list_size = 0;
6617 as_endpoint_list_sizeof(mesh_node.endpoint_list,
6618 &endpoint_list_size);
6619 reply_size += sizeof(as_hb_mesh_info_reply)
6620 + endpoint_list_size;
6621 }
6622 }
6623 }
6624
6625 as_hb_mesh_info_reply* replies = alloca(reply_size);
6626 uint8_t* reply_ptr = (uint8_t*)replies;
6627 size_t reply_count = 0;
6628
6629 DEBUG("received info request from node : %" PRIx64, source);
6630 DEBUG("preparing a reply for %zu requests", query_count);
6631
6632 for (int i = 0; i < query_count; i++) {
6633 as_hb_mesh_node mesh_node;
6634
6635 DEBUG("mesh received info request for node %" PRIx64, query_nodeids[i]);
6636
6637 if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) {
6638 if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN
6639 && mesh_node.endpoint_list) {
6640 as_hb_mesh_info_reply* reply = (as_hb_mesh_info_reply*)reply_ptr;
6641
6642 reply->nodeid = query_nodeids[i];
6643
6644 size_t endpoint_list_size = 0;
6645 as_endpoint_list_sizeof(mesh_node.endpoint_list,
6646 &endpoint_list_size);
6647
6648 memcpy(&reply->endpoint_list[0], mesh_node.endpoint_list,
6649 endpoint_list_size);
6650
6651 reply_ptr += sizeof(as_hb_mesh_info_reply) + endpoint_list_size;
6652
6653 reply_count++;
6654 }
6655 }
6656 }
6657
6658 MESH_UNLOCK();
6659
6660 // Send the reply
6661 if (reply_count > 0) {
6662 mesh_nodes_send_info_reply(source, replies, reply_count);
6663 }
6664}
6665
6666/**
6667 * Handle an incoming info reply.
6668 */
6669static void
6670mesh_channel_on_info_reply(msg* msg)
6671{
6672 as_hb_mesh_info_reply* reply = NULL;
6673 size_t reply_count = 0;
6674 cf_node source = 0;
6675 msg_nodeid_get(msg, &source);
6676 if (msg_info_reply_get(msg, &reply, &reply_count) != 0
6677 || reply_count == 0) {
6678 TICKER_WARNING(
6679 "got an info reply from without query nodes from %" PRIx64,
6680 source);
6681 return;
6682 }
6683
6684 DEBUG("received info reply from node %" PRIx64, source);
6685
6686 MESH_LOCK();
6687
6688 uint8_t *start_ptr = (uint8_t*)reply;
6689 for (int i = 0; i < reply_count; i++) {
6690 as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr;
6691 as_hb_mesh_node existing_node;
6692 if (mesh_node_get(reply_ptr->nodeid, &existing_node) != 0) {
6693 // Somehow the node was removed from the mesh hash. Maybe a timeout.
6694 goto NextReply;
6695 }
6696
6697 // Update the state of this node.
6698 if (existing_node.status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) {
6699 // Update the endpoint.
6700 endpoint_list_copy(&existing_node.endpoint_list,
6701 reply_ptr->endpoint_list);
6702
6703 mesh_node_status_change(&existing_node,
6704 AS_HB_MESH_NODE_CHANNEL_INACTIVE);
6705 // Set the discovered flag.
6706 g_hb.mode_state.mesh_state.nodes_discovered = true;
6707
6708 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
6709 as_endpoint_list_to_string(existing_node.endpoint_list,
6710 endpoint_list_str, sizeof(endpoint_list_str));
6711
6712 DEBUG("for node %" PRIx64" discovered endpoints {%s}",
6713 reply_ptr->nodeid, endpoint_list_str);
6714
6715 // Update the hash.
6716 mesh_node_add_update(reply_ptr->nodeid, &existing_node);
6717 }
6718
6719 NextReply:
6720 start_ptr += sizeof(as_hb_mesh_info_reply);
6721 size_t endpoint_list_size = 0;
6722 as_endpoint_list_sizeof(reply_ptr->endpoint_list, &endpoint_list_size);
6723 start_ptr += endpoint_list_size;
6724 }
6725
6726 MESH_UNLOCK();
6727}
6728
6729/**
6730 * Handle the case when a message is received on a channel.
6731 */
6732static void
6733mesh_channel_on_msg_rcvd(as_hb_channel_event* event)
6734{
6735 // Update the mesh node status.
6736 mesh_node_data_update(event);
6737
6738 as_hb_msg_type msg_type;
6739 msg_type_get(event->msg, &msg_type);
6740
6741 switch (msg_type) {
6742 case AS_HB_MSG_TYPE_PULSE: // A pulse message. Try and discover new nodes.
6743 mesh_channel_on_pulse(event->msg);
6744 break;
6745 case AS_HB_MSG_TYPE_INFO_REQUEST: // Send back an info reply.
6746 mesh_channel_on_info_request(event->msg);
6747 break;
6748 case AS_HB_MSG_TYPE_INFO_REPLY: // Update the list of mesh nodes, if this is an undiscovered node.
6749 mesh_channel_on_info_reply(event->msg);
6750 break;
6751 default:
6752 WARNING("received a message of unknown type from");
6753 // Ignore other messages.
6754 break;
6755 }
6756}
6757
6758/*
6759 * ----------------------------------------------------------------------------
6760 * Mesh public API
6761 * ----------------------------------------------------------------------------
6762 */
6763
6764/**
6765 * Add a host / port to the mesh seed list.
6766 * @param host the seed node hostname / ip address
6767 * @param port the seed node port.
6768 * @param tls indicates TLS support.
 * @return 0 if the seed was added, -1 on failure (invalid host or port, or
 * duplicate seed entry).
6770 */
6771static int
6772mesh_tip(char* host, int port, bool tls)
6773{
6774 MESH_LOCK();
6775
6776 int rv = -1;
6777 as_hb_mesh_seed new_seed = { { 0 } };
6778
6779 // Check validity of hostname and port.
6780 int hostname_len = strnlen(host, DNS_NAME_MAX_SIZE);
6781 if (hostname_len <= 0 || hostname_len == DNS_NAME_MAX_SIZE) {
6782 // Invalid hostname.
6783 WARNING("mesh seed host %s exceeds allowed %d characters", host,
6784 DNS_NAME_MAX_LEN);
6785 goto Exit;
6786 }
6787 if (port <= 0 || port > USHRT_MAX) {
6788 WARNING("mesh seed port %s:%d exceeds should be between 0 to %d", host,
6789 port, USHRT_MAX);
6790 goto Exit;
6791 }
6792
6793 // Check if we already have a match for this seed.
6794 if (mesh_seed_find_unsafe(host, port) >= 0) {
6795 WARNING("mesh seed host %s:%d already in seed list", host, port);
6796 goto Exit;
6797 }
6798
6799 mesh_seed_status_change(&new_seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
6800 strcpy(new_seed.seed_host_name, host);
6801 new_seed.seed_port = port;
6802 new_seed.seed_tls = tls;
6803
6804 cf_vector_append(&g_hb.mode_state.mesh_state.seeds, &new_seed);
6805
6806 INFO("added new mesh seed %s:%d", host, port);
6807 rv = 0;
6808
6809Exit:
6810 if (rv != 0) {
6811 // Ensure endpoint allocated space is freed.
6812 mesh_seed_destroy(&new_seed);
6813 }
6814
6815 MESH_UNLOCK();
6816 return rv;
6817}
6818
6819/**
6820 * Handle a channel event on an endpoint.
6821 */
6822static void
6823mesh_channel_event_process(as_hb_channel_event* event)
6824{
6825 // Skip if we are not in mesh mode.
6826 if (!hb_is_mesh()) {
6827 return;
6828 }
6829
6830 MESH_LOCK();
6831 switch (event->type) {
6832 case AS_HB_CHANNEL_NODE_CONNECTED:
6833 // Ignore this event. The subsequent message event will be use for
6834 // determining mesh node active status.
6835 break;
6836 case AS_HB_CHANNEL_NODE_DISCONNECTED:
6837 mesh_channel_on_node_disconnect(event);
6838 break;
6839 case AS_HB_CHANNEL_MSG_RECEIVED:
6840 mesh_channel_on_msg_rcvd(event);
6841 break;
6842 case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: // Ignore this event. HB module will handle it.
6843 break;
6844 }
6845
6846 MESH_UNLOCK();
6847}
6848
6849/**
6850 * Initialize mesh mode data structures.
6851 */
6852static void
6853mesh_init()
6854{
6855 if (!hb_is_mesh()) {
6856 return;
6857 }
6858
6859 MESH_LOCK();
6860
6861 g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED;
6862
6863 // Initialize the mesh node hash.
6864 g_hb.mode_state.mesh_state.nodeid_to_mesh_node = cf_shash_create(
6865 cf_nodeid_shash_fn, sizeof(cf_node), sizeof(as_hb_mesh_node),
6866 AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
6867
6868 // Initialize the seed list.
6869 cf_vector_init(&g_hb.mode_state.mesh_state.seeds, sizeof(as_hb_mesh_seed),
6870 AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO);
6871
6872 MESH_UNLOCK();
6873}
6874
6875/**
 * Destroy each mesh node's heap data and delete every entry while reducing
 * the mesh node hash.
6877 */
6878static int
6879mesh_free_node_data_reduce(const void* key, void* data, void* udata)
6880{
6881 as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
6882 mesh_node_destroy(mesh_node);
6883 return CF_SHASH_REDUCE_DELETE;
6884}
6885
6886/**
6887 * Remove a host / port from the mesh list.
6888 */
static int
mesh_tip_clear_reduce(const void* key, void* data, void* udata)
{
	int rv = CF_SHASH_OK;

	MESH_LOCK();

	cf_node nodeid = *(cf_node*)key;
	as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
	as_hb_mesh_tip_clear_udata* tip_clear_udata =
			(as_hb_mesh_tip_clear_udata*)udata;

	// NULL udata means "clear everything".
	if (tip_clear_udata == NULL || nodeid == tip_clear_udata->nodeid) {
		// Handling tip clear all or clear of a specific node.
		rv = CF_SHASH_REDUCE_DELETE;
		goto Exit;
	}

	// See if the address matches any one of the endpoints in the node's
	// endpoint list.
	for (int i = 0; i < tip_clear_udata->n_addrs; i++) {
		cf_sock_addr sock_addr;
		cf_ip_addr_copy(&tip_clear_udata->addrs[i], &sock_addr.addr);
		sock_addr.port = tip_clear_udata->port;
		as_hb_endpoint_list_addr_find_udata udata;
		udata.found = false;
		udata.to_search = &sock_addr;

		as_endpoint_list_iterate(mesh_node->endpoint_list,
				mesh_endpoint_addr_find_iterate, &udata);

		if (udata.found) {
			rv = CF_SHASH_REDUCE_DELETE;
			goto Exit;
		}
	}

	// Not found by endpoint.
	rv = CF_SHASH_OK;

Exit:
	if (rv == CF_SHASH_REDUCE_DELETE) {
		char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
		as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str,
				sizeof(endpoint_list_str));

		// Find all seed entries matching this mesh entry and delete them.
		cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
		int element_count = cf_vector_size(seeds);
		for (int i = 0; i < element_count; i++) {
			as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
			if (seed->mesh_nodeid != nodeid) {
				// Does not match this mesh entry.
				continue;
			}
			if (mesh_seed_delete_unsafe(i) == 0) {
				// Deletion shifts later elements down - re-examine index i.
				i--;
				element_count--;
			}
			else {
				// Should not happen in practice.
				CRASH("error deleting mesh seed entry %s:%d",
						seed->seed_host_name, seed->seed_port);
			}
		}

		// Drop the channel so the peer does not linger half-connected.
		if (channel_node_disconnect(nodeid) != 0) {
			WARNING("unable to disconnect the channel to node %" PRIx64,
					nodeid);
		}

		mesh_node_destroy(mesh_node);
		if (tip_clear_udata != NULL) {
			tip_clear_udata->entry_deleted = true;
		}
	}

	MESH_UNLOCK();
	return rv;
}
6969
6970/**
6971 * Output Heartbeat endpoints of peers.
6972 */
6973static int
6974mesh_peer_endpoint_reduce(const void* key, void* data, void* udata)
6975{
6976 int rv = CF_SHASH_OK;
6977 MESH_LOCK();
6978 cf_node nodeid = *(cf_node*)key;
6979 as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
6980 cf_dyn_buf* db = (cf_dyn_buf*)udata;
6981
6982 cf_dyn_buf_append_string(db, "heartbeat.peer=");
6983 cf_dyn_buf_append_string(db, "node-id=");
6984 cf_dyn_buf_append_uint64_x(db, nodeid);
6985 cf_dyn_buf_append_string(db, ":");
6986 as_endpoint_list_info(mesh_node->endpoint_list, db);
6987 cf_dyn_buf_append_string(db, ";");
6988
6989 MESH_UNLOCK();
6990 return rv;
6991}
6992
6993/**
6994 * Free the mesh mode data structures.
6995 */
6996static void
6997mesh_clear()
6998{
6999 if (!mesh_is_stopped()) {
7000 WARNING(
7001 "attempted clearing mesh module without stopping it - skip mesh clear!");
7002 return;
7003 }
7004
7005 MESH_LOCK();
7006 // Delete the elements from the map.
7007 cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
7008 mesh_free_node_data_reduce, NULL);
7009
7010 // Reset the seeds to inactive state
7011 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
7012 int element_count = cf_vector_size(seeds);
7013 for (int i = 0; i < element_count; i++) {
7014 // Should not happen in practice.
7015 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
7016 seed->mesh_nodeid = 0;
7017 mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
7018 }
7019
7020 MESH_UNLOCK();
7021}
7022
7023/**
7024 * Open mesh listening socket. Crashes if open failed.
7025 */
static void
mesh_listening_sockets_open()
{
	MESH_LOCK();

	const cf_serv_cfg* bind_cfg = config_bind_cfg_get();

	// Compute min MTU across all binding interfaces.
	int min_mtu = -1;
	char addr_string[DNS_NAME_MAX_SIZE];
	for (uint32_t i = 0; i < bind_cfg->n_cfgs; ++i) {
		const cf_sock_cfg* sock_cfg = &bind_cfg->cfgs[i];
		cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string,
				sizeof(addr_string));

		INFO("initializing mesh heartbeat socket: %s:%d", addr_string,
				sock_cfg->port);

		// A wildcard bind address means "all interfaces" - use the minimum
		// MTU across interfaces in that case.
		int bind_interface_mtu =
				!cf_ip_addr_is_any(&sock_cfg->addr) ?
						cf_inter_mtu(&sock_cfg->addr) : cf_inter_min_mtu();

		if (min_mtu == -1 || min_mtu > bind_interface_mtu) {
			min_mtu = bind_interface_mtu;
		}
	}

	// Crash on failure - the server cannot cluster without these sockets.
	if (cf_socket_init_server((cf_serv_cfg*)bind_cfg,
			&g_hb.mode_state.mesh_state.listening_sockets) != 0) {
		CRASH("couldn't initialize unicast heartbeat sockets");
	}

	for (uint32_t i = 0;
			i < g_hb.mode_state.mesh_state.listening_sockets.n_socks; ++i) {
		DEBUG("opened mesh heartbeat socket: %d",
				CSFD(&g_hb.mode_state.mesh_state.listening_sockets.socks[i]));
	}

	// Fall back to a conservative default if MTU discovery failed.
	if (min_mtu == -1) {
		WARNING("error getting the min MTU - using the default %d",
				DEFAULT_MIN_MTU);
		min_mtu = DEFAULT_MIN_MTU;
	}

	g_hb.mode_state.mesh_state.min_mtu = min_mtu;
	INFO("mtu of the network is %d", min_mtu);

	MESH_UNLOCK();
}
7075
7076/**
7077 * Start mesh threads.
7078 */
7079static void
7080mesh_start()
7081{
7082 if (!hb_is_mesh()) {
7083 return;
7084 }
7085
7086 MESH_LOCK();
7087
7088 mesh_listening_sockets_open();
7089 channel_mesh_listening_socks_register(
7090 &g_hb.mode_state.mesh_state.listening_sockets);
7091
7092 g_hb.mode_state.mesh_state.status = AS_HB_STATUS_RUNNING;
7093
7094 // Start the mesh tender thread.
7095 g_hb.mode_state.mesh_state.mesh_tender_tid =
7096 cf_thread_create_joinable(mesh_tender, (void*)&g_hb);
7097
7098 MESH_UNLOCK();
7099}
7100
7101/**
7102 * Stop the mesh module.
7103 */
static void
mesh_stop()
{
	if (!mesh_is_running()) {
		WARNING("mesh is already stopped");
		return;
	}

	// Unguarded state, but this should be OK.
	g_hb.mode_state.mesh_state.status = AS_HB_STATUS_SHUTTING_DOWN;

	// Wait for the channel tender thread to finish. Must be done outside the
	// lock, since the tender may need it to wind down.
	cf_thread_join(g_hb.mode_state.mesh_state.mesh_tender_tid);

	MESH_LOCK();

	// Stop accepting mesh traffic before closing the sockets.
	channel_mesh_listening_socks_deregister(
			&g_hb.mode_state.mesh_state.listening_sockets);

	mesh_listening_sockets_close();

	g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED;

	// Clear allocated state if any.
	if (g_hb.mode_state.mesh_state.published_endpoint_list) {
		cf_free(g_hb.mode_state.mesh_state.published_endpoint_list);
		g_hb.mode_state.mesh_state.published_endpoint_list = NULL;
	}

	MESH_UNLOCK();
}
7135
7136/**
7137 * Reduce function to dump mesh node info to log file.
7138 */
7139static int
7140mesh_dump_reduce(const void* key, void* data, void* udata)
7141{
7142 cf_node nodeid = *(cf_node*)key;
7143 as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
7144
7145 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
7146 as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str,
7147 sizeof(endpoint_list_str));
7148
7149 INFO("\tHB Mesh Node: node-id %" PRIx64" status %s last-updated %" PRIu64 " endpoints {%s}",
7150 nodeid, mesh_node_status_string(mesh_node->status),
7151 mesh_node->last_status_updated, endpoint_list_str);
7152
7153 return CF_SHASH_OK;
7154}
7155
7156/**
7157 * Dump mesh state to logs.
7158 * @param verbose enables / disables verbose logging.
7159 */
7160static void
7161mesh_dump(bool verbose)
7162{
7163 if (!hb_is_mesh() || !verbose) {
7164 return;
7165 }
7166
7167 MESH_LOCK();
7168 cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
7169 int element_count = cf_vector_size(seeds);
7170 INFO("HB Seed Count %d", element_count);
7171 for (int i = 0; i < element_count; i++) {
7172 as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
7173 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
7174 as_endpoint_list_to_string(seed->resolved_endpoint_list,
7175 endpoint_list_str, sizeof(endpoint_list_str));
7176 INFO("\tHB Mesh Seed: host %s port %d node-id %" PRIx64" status %s endpoints {%s}",
7177 seed->seed_host_name, seed->seed_port, seed->mesh_nodeid, mesh_node_status_string(seed->status),
7178 endpoint_list_str);
7179 }
7180
7181 INFO("HB Mesh Nodes Count %d", cf_shash_get_size(g_hb.mode_state.mesh_state.nodeid_to_mesh_node));
7182 cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
7183 mesh_dump_reduce, NULL);
7184 MESH_UNLOCK();
7185}
7186
7187/*
7188 * ----------------------------------------------------------------------------
7189 * Multicast sub module.
7190 * ----------------------------------------------------------------------------
7191 */
7192
7193/**
7194 * Initialize multicast data structures.
7195 */
static void
multicast_init()
{
	// Nothing to initialize for multicast mode.
}
7200
7201/**
7202 * Clear multicast data structures.
7203 */
static void
multicast_clear()
{
	// Free multicast data structures. Nothing to do.
}
7209
7210/**
7211 * Open multicast sockets. Crashes if open failed.
7212 */
static void
multicast_listening_sockets_open()
{
	MULTICAST_LOCK();

	const cf_mserv_cfg* mserv_cfg = config_multicast_group_cfg_get();

	// Compute min MTU across all binding interfaces.
	int min_mtu = -1;
	char addr_string[DNS_NAME_MAX_SIZE];
	for (uint32_t i = 0; i < mserv_cfg->n_cfgs; ++i) {
		const cf_msock_cfg* sock_cfg = &mserv_cfg->cfgs[i];
		cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string,
				sizeof(addr_string));

		INFO("initializing multicast heartbeat socket: %s:%d", addr_string,
				sock_cfg->port);

		// A wildcard interface address means "all interfaces" - use the
		// minimum MTU across interfaces in that case.
		int bind_interface_mtu =
				!cf_ip_addr_is_any(&sock_cfg->if_addr) ?
						cf_inter_mtu(&sock_cfg->if_addr) : cf_inter_min_mtu();

		if (min_mtu == -1 || min_mtu > bind_interface_mtu) {
			min_mtu = bind_interface_mtu;
		}
	}

	// Crash on failure - the server cannot cluster without these sockets.
	if (cf_socket_mcast_init((cf_mserv_cfg*)mserv_cfg,
			&g_hb.mode_state.multicast_state.listening_sockets) != 0) {
		CRASH("couldn't initialize multicast heartbeat socket: %s",
				cf_strerror(errno));
	}

	for (uint32_t i = 0;
			i < g_hb.mode_state.multicast_state.listening_sockets.n_socks;
			++i) {
		DEBUG("opened multicast socket %d",
				CSFD(
						&g_hb.mode_state.multicast_state.listening_sockets.socks[i]));
	}

	// Fall back to a conservative default if MTU discovery failed.
	if (min_mtu == -1) {
		WARNING("error getting the min mtu - using the default %d",
				DEFAULT_MIN_MTU);
		min_mtu = DEFAULT_MIN_MTU;
	}

	g_hb.mode_state.multicast_state.min_mtu = min_mtu;

	INFO("mtu of the network is %d", min_mtu);
	MULTICAST_UNLOCK();
}
7265
7266/**
7267 * Start multicast module.
7268 */
7269static void
7270multicast_start()
7271{
7272 MULTICAST_LOCK();
7273 multicast_listening_sockets_open();
7274 channel_multicast_listening_socks_register(
7275 &g_hb.mode_state.multicast_state.listening_sockets);
7276 MULTICAST_UNLOCK();
7277}
7278
7279/**
7280 * Close multicast listening socket.
7281 */
7282static void
7283multicast_listening_sockets_close()
7284{
7285 MULTICAST_LOCK();
7286 INFO("closing multicast heartbeat sockets");
7287 cf_sockets_close(&g_hb.mode_state.multicast_state.listening_sockets);
7288 DEBUG("closed multicast heartbeat socket");
7289 MULTICAST_UNLOCK();
7290}
7291
7292/**
7293 * Stop Multicast.
7294 */
7295static void
7296multicast_stop()
7297{
7298 MULTICAST_LOCK();
7299 channel_multicast_listening_socks_deregister(
7300 &g_hb.mode_state.multicast_state.listening_sockets);
7301 multicast_listening_sockets_close();
7302
7303 MULTICAST_UNLOCK();
7304}
7305
7306/**
7307 * Dump multicast state to logs.
7308 * @param verbose enables / disables verbose logging.
7309 */
7310static void
7311multicast_dump(bool verbose)
7312{
7313 if (hb_is_mesh()) {
7314 return;
7315 }
7316
7317 // Mode is multicast.
7318 INFO("HB Multicast TTL: %d", config_multicast_ttl_get());
7319}
7320
7321/**
7322 * Find the maximum cluster size based on MTU of the network.
7323 *
7324 * num_nodes is computed so that
7325 *
 * MTU = compression_factor * (fixed_size + num_nodes * per_node_size)
7327 * where,
7328 * fixed_size = udp_header_size + msg_header_size +
7329 * sigma(per_plugin_fixed_size)
7330 * per_node_size = sigma(per_plugin_per_node_size).
7331 */
7332static int
7333multicast_supported_cluster_size_get()
7334{
7335 // Calculate the fixed size for a UDP packet and the message header.
7336 size_t msg_fixed_size = msg_get_template_fixed_sz(g_hb_msg_template,
7337 sizeof(g_hb_msg_template) / sizeof(msg_template));
7338
7339 size_t msg_plugin_per_node_size = 0;
7340
7341 for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
7342 // Adding plugin specific fixed size
7343 msg_fixed_size += g_hb.plugins[i].wire_size_fixed;
7344 // Adding plugin specific per node size.
7345 msg_plugin_per_node_size += g_hb.plugins[i].wire_size_per_node;
7346 }
7347
7348 // TODO: Compute the max cluster size using max storage per node in cluster
7349 // and the min mtu.
7350 int supported_cluster_size = MAX(1,
7351 (((hb_mtu() - UDP_HEADER_SIZE_MAX) * MSG_COMPRESSION_RATIO)
7352 - msg_fixed_size) / msg_plugin_per_node_size);
7353
7354 return supported_cluster_size;
7355}
7356
7357/*
7358 * ----------------------------------------------------------------------------
7359 * Heartbeat main sub module.
7360 * ----------------------------------------------------------------------------
7361 */
7362
7363/**
7364 * Is Main module initialized.
7365 */
7366static bool
7367hb_is_initialized()
7368{
7369 HB_LOCK();
7370 bool retval = (g_hb.status != AS_HB_STATUS_UNINITIALIZED) ? true : false;
7371 HB_UNLOCK();
7372 return retval;
7373}
7374
7375/**
7376 * Is Main module running.
7377 */
7378static bool
7379hb_is_running()
7380{
7381 HB_LOCK();
7382 bool retval = (g_hb.status == AS_HB_STATUS_RUNNING) ? true : false;
7383 HB_UNLOCK();
7384 return retval;
7385}
7386
7387/**
7388 * Is Main module stopped.
7389 */
7390static bool
7391hb_is_stopped()
7392{
7393 HB_LOCK();
7394 bool retval = (g_hb.status == AS_HB_STATUS_STOPPED) ? true : false;
7395 HB_UNLOCK();
7396 return retval;
7397}
7398
7399/**
7400 * Initialize the mode specific data structures.
7401 */
static void
hb_mode_init()
{
	// Dispatch to the configured mode's initializer.
	if (hb_is_mesh()) {
		mesh_init();
		return;
	}

	multicast_init();
}
7412
7413/**
7414 * Start mode specific threads..
7415 */
static void
hb_mode_start()
{
	// Dispatch to the configured mode's start routine.
	if (hb_is_mesh()) {
		mesh_start();
		return;
	}

	multicast_start();
}
7426
7427/**
7428 * The MTU for underlying network.
7429 */
7430static int
7431hb_mtu()
7432{
7433 int __mtu = config_override_mtu_get();
7434 if (!__mtu) {
7435 __mtu = hb_is_mesh() ?
7436 g_hb.mode_state.mesh_state.min_mtu :
7437 g_hb.mode_state.multicast_state.min_mtu;
7438 __mtu = __mtu > 0 ? __mtu : DEFAULT_MIN_MTU;
7439 }
7440 return __mtu;
7441}
7442
7443/**
7444 * Initialize the template to be used for heartbeat messages.
7445 */
static void
hb_msg_init()
{
	// Register fabric heartbeat msg type with no processing function:
	// This permits getting / putting heartbeat msgs to be moderated via an idle
	// msg queue.
	// NOTE(review): the two trailing zeros are presumably the receive
	// callback and its udata - confirm against as_fabric_register_msg_fn.
	as_fabric_register_msg_fn(M_TYPE_HEARTBEAT, g_hb_msg_template,
			sizeof(g_hb_msg_template),
			AS_HB_MSG_SCRATCH_SIZE, 0, 0);
}
7456
7457/**
7458 * Get hold of current heartbeat protocol version
7459 */
static uint32_t
hb_protocol_identifier_get()
{
	// Only the v3 heartbeat protocol is supported here.
	return HB_PROTOCOL_V3_IDENTIFIER;
}
7465
7466/**
7467 * Node depart event time estimate. Assumes node departed timeout milliseconds
7468 * before the detection.
7469 */
7470static cf_clock
7471hb_node_depart_time(cf_clock detect_time)
7472{
7473 return (detect_time - HB_NODE_TIMEOUT());
7474}
7475
7476/**
7477 * Indicates if mode is mesh.
7478 */
7479static bool
7480hb_is_mesh()
7481{
7482 return (config_mode_get() == AS_HB_MODE_MESH);
7483}
7484
7485/**
7486 * Publish an event to subsystems listening to heart beat events.
7487 */
static void
hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes,
		int node_count)
{
	// Lock-less because the queue is thread safe and we do not use heartbeat
	// state here.
	for (int i = 0; i < node_count; i++) {
		as_hb_event_node event;
		event.nodeid = nodes[i];
		event.event_detected_time = cf_getms();

		// Map the internal event type to the published type and estimate the
		// actual event time.
		switch (event_type) {
		case AS_HB_INTERNAL_NODE_ARRIVE:
			event.evt = AS_HB_NODE_ARRIVE;
			event.event_time = event.event_detected_time;
			as_health_add_node_counter(event.nodeid, AS_HEALTH_NODE_ARRIVALS);
			break;
		case AS_HB_INTERNAL_NODE_DEPART:
			// Departure was detected one timeout after it happened - back
			// date the event time accordingly.
			event.evt = AS_HB_NODE_DEPART;
			event.event_time = hb_node_depart_time(event.event_detected_time);
			break;
		case AS_HB_INTERNAL_NODE_EVICT:
			// Evictions are deliberate, so detection time is the event time.
			event.evt = AS_HB_NODE_DEPART;
			event.event_time = event.event_detected_time;
			break;
		case AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED:
			event.evt = AS_HB_NODE_ADJACENCY_CHANGED;
			event.event_time = event.event_detected_time;
			break;
		}

		DEBUG("queuing event of type %d for node %" PRIx64, event.evt,
				event.nodeid);
		cf_queue_push(&g_hb_event_listeners.external_events_queue, &event);
	}
}
7524
/**
 * Publish all pending events. Should be invoked outside hb locks.
 */
static void
hb_event_publish_pending()
{
	EXTERNAL_EVENT_PUBLISH_LOCK();
	int num_events = cf_queue_sz(&g_hb_event_listeners.external_events_queue);
	if (num_events <= 0) {
		// Events need not be published.
		goto Exit;
	}

	// Drain at most AS_HB_CLUSTER_MAX_SIZE_SOFT events per invocation; any
	// remainder is picked up the next time this function runs.
	as_hb_event_node events[AS_HB_CLUSTER_MAX_SIZE_SOFT];
	int published_count = 0;
	while (published_count < AS_HB_CLUSTER_MAX_SIZE_SOFT
			&& cf_queue_pop(&g_hb_event_listeners.external_events_queue,
					&events[published_count], 0) == CF_QUEUE_OK) {
		published_count++;
	}

	if (published_count) {
		// Assuming that event listeners are not registered after system init,
		// no locks here.
		DEBUG("publishing %d heartbeat events", published_count);
		for (int i = 0; i < g_hb_event_listeners.event_listener_count; i++) {
			(g_hb_event_listeners.event_listeners[i].event_callback)(
					published_count, events,
					g_hb_event_listeners.event_listeners[i].udata);
		}
	}

Exit:
	EXTERNAL_EVENT_PUBLISH_UNLOCK();
}
7560
7561/**
7562 * Delete the heap allocated data while iterating through the hash and deleting
7563 * entries.
7564 */
7565static int
7566hb_adjacency_free_data_reduce(const void* key, void* data, void* udata)
7567{
7568 as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
7569
7570 const cf_node* nodeid = (const cf_node*)key;
7571
7572 hb_adjacent_node_destroy(adjacent_node);
7573
7574 // Send event depart to for this node
7575 hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, nodeid, 1);
7576
7577 return CF_SHASH_REDUCE_DELETE;
7578}
7579
/**
 * Clear the heartbeat data structures. Only valid while the heartbeat
 * subsystem is stopped.
 */
static void
hb_clear()
{
	if (!hb_is_stopped()) {
		WARNING("attempted to clear heartbeat module without stopping it");
		return;
	}

	HB_LOCK();

	// Free the plugin data and delete adjacent nodes.
	cf_shash_reduce(g_hb.adjacency, hb_adjacency_free_data_reduce, NULL);
	cf_shash_reduce(g_hb.on_probation, hb_adjacency_free_data_reduce, NULL);
	hb_adjacent_node_destroy(&g_hb.self_node);
	memset(&g_hb.self_node, 0, sizeof(g_hb.self_node));

	HB_UNLOCK();

	// Publish node departed events for the removed nodes. Must happen outside
	// the hb lock.
	hb_event_publish_pending();

	// Clear the mode module.
	if (hb_is_mesh()) {
		mesh_clear();
	}
	else {
		multicast_clear();
	}

	channel_clear();
}
7614
7615/**
7616 * Reduce function to get hold of current adjacency list.
7617 */
7618static int
7619hb_adjacency_iterate_reduce(const void* key, void* data, void* udata)
7620{
7621 const cf_node* nodeid = (const cf_node*)key;
7622 as_hb_adjacency_reduce_udata* adjacency_reduce_udata =
7623 (as_hb_adjacency_reduce_udata*)udata;
7624
7625 adjacency_reduce_udata->adj_list[adjacency_reduce_udata->adj_count] =
7626 *nodeid;
7627 adjacency_reduce_udata->adj_count++;
7628
7629 return CF_SHASH_OK;
7630}
7631
/**
 * Plugin function to set heartbeat adjacency list into a pulse message.
 *
 * @param msg the outgoing pulse message.
 */
static void
hb_plugin_set_fn(msg* msg)
{
	HB_LOCK();

	// VLA sized to the current adjacency count; filled while holding the lock
	// so the count cannot change under us.
	cf_node adj_list[cf_shash_get_size(g_hb.adjacency)];
	as_hb_adjacency_reduce_udata adjacency_reduce_udata = { adj_list, 0 };

	cf_shash_reduce(g_hb.adjacency, hb_adjacency_iterate_reduce,
			&adjacency_reduce_udata);

	HB_UNLOCK();

	// Populate adjacency list.
	msg_adjacency_set(msg, adj_list, adjacency_reduce_udata.adj_count);

	// Set cluster name.
	char cluster_name[AS_CLUSTER_NAME_SZ];
	as_config_cluster_name_get(cluster_name);

	// Only send the cluster name if one is configured.
	if (cluster_name[0] != '\0') {
		msg_set_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, MSG_SET_COPY);
	}
}
7659
7660/**
7661 * Plugin function that parses adjacency list out of a heartbeat pulse message.
7662 */
7663static void
7664hb_plugin_parse_data_fn(msg* msg, cf_node source,
7665 as_hb_plugin_node_data* prev_plugin_data,
7666 as_hb_plugin_node_data* plugin_data)
7667{
7668 size_t adj_length = 0;
7669 cf_node* adj_list = NULL;
7670
7671 if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) {
7672 // Store a zero length adjacency list. Should not have happened.
7673 WARNING("received heartbeat without adjacency list %" PRIx64, source);
7674 adj_length = 0;
7675 }
7676
7677 // The guess can be larger for older protocols which also include self node
7678 // in the adjacency list.
7679 int guessed_data_size = (adj_length * sizeof(cf_node));
7680
7681 if (guessed_data_size > plugin_data->data_capacity) {
7682 // Round up to nearest multiple of block size to prevent very frequent
7683 // reallocation.
7684 size_t data_capacity = ((guessed_data_size + HB_PLUGIN_DATA_BLOCK_SIZE
7685 - 1) /
7686 HB_PLUGIN_DATA_BLOCK_SIZE) *
7687 HB_PLUGIN_DATA_BLOCK_SIZE;
7688
7689 // Reallocate since we have outgrown existing capacity.
7690 plugin_data->data = cf_realloc(plugin_data->data, data_capacity);
7691 plugin_data->data_capacity = data_capacity;
7692 }
7693
7694 cf_node* dest_list = (cf_node*)(plugin_data->data);
7695
7696 size_t final_list_length = 0;
7697 for (size_t i = 0; i < adj_length; i++) {
7698 if (adj_list[i] == source) {
7699 // Skip the source node.
7700 continue;
7701 }
7702 dest_list[final_list_length++] = adj_list[i];
7703 }
7704
7705 plugin_data->data_size = (final_list_length * sizeof(cf_node));
7706}
7707
7708/**
7709 * Get the msg buffer from a pool based on the protocol under use.
7710 * @return the msg buff
7711 */
7712static msg*
7713hb_msg_get()
7714{
7715 return as_fabric_msg_get(M_TYPE_HEARTBEAT);
7716}
7717
7718/**
7719 * Return the message buffer back to the pool.
7720 */
7721static void
7722hb_msg_return(msg* msg)
7723{
7724 as_fabric_msg_put(msg);
7725}
7726
7727/**
7728 * Fill the outgoing pulse message with plugin specific data.
7729 *
7730 * Note: The set functions would be acquiring their locks. This function should
7731 * never directly use nor have a call stack under HB_LOCK.
7732 *
7733 * @param msg the outgoing pulse message.
7734 */
7735static void
7736hb_plugin_msg_fill(msg* msg)
7737{
7738 for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
7739 if (g_hb.plugins[i].set_fn) {
7740 (g_hb.plugins[i].set_fn)(msg);
7741 }
7742 }
7743}
7744
/**
 * Parse fields from the message into plugin specific data.
 * @param msg the incoming pulse message.
 * @param adjacent_node the node from which this message was received.
 * @param plugins the plugin array (one slot per plugin id).
 * @param plugin_data_changed (output) array whose ith entry is set to true if
 * ith plugin's data changed, false otherwise. Should be large enough to hold
 * flags for all plugins.
 */
static void
hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node,
		as_hb_plugin* plugins, bool plugin_data_changed[])
{
	cf_node source;
	// Flip the double buffer: after the increment, cycler % 2 selects the
	// slot this parse writes into, (cycler + 1) % 2 the previous cycle's data.
	adjacent_node->plugin_data_cycler++;

	msg_nodeid_get(msg, &source);
	for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
		plugin_data_changed[i] = false;
		if (plugins[i].parse_fn) {
			as_hb_plugin_node_data* curr_data =
					&adjacent_node->plugin_data[i][adjacent_node->plugin_data_cycler
							% 2];

			as_hb_plugin_node_data* prev_data =
					&adjacent_node->plugin_data[i][(adjacent_node->plugin_data_cycler
							+ 1) % 2];

			// Ensure there is a preallocated data pointer.
			if (curr_data->data == NULL) {
				curr_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE);
				curr_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE;
				curr_data->data_size = 0;
			}

			if (prev_data->data == NULL) {
				prev_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE);
				prev_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE;
				prev_data->data_size = 0;
			}

			// Parse message data into current data.
			(plugins[i]).parse_fn(msg, source, prev_data, curr_data);

			if (!plugins[i].change_listener) {
				// No change listener configured. Skip detecting change.
				continue;
			}

			// Treat zero-length data as NULL so empty vs. absent compare
			// equal below.
			size_t curr_data_size = curr_data->data_size;
			void* curr_data_blob = curr_data_size ? curr_data->data : NULL;

			size_t prev_data_size = prev_data->data_size;
			void* prev_data_blob = prev_data_size ? prev_data->data : NULL;

			if (prev_data_blob == curr_data_blob) {
				// Old and new data both NULL or both point to the same memory
				// location.
				plugin_data_changed[i] = false;
				continue;
			}

			if (prev_data_size != curr_data_size || prev_data_blob == NULL
					|| curr_data_blob == NULL) {
				// Plugin data definitely changed, as the data sizes differ or
				// exactly one of old or new data pointers is NULL.
				plugin_data_changed[i] = true;
				continue;
			}

			// The data sizes match at this point and neither values are NULL.
			plugin_data_changed[i] = memcmp(prev_data_blob, curr_data_blob,
					curr_data_size) != 0;
		}
	}
}
7820
7821/**
7822 * Adjacency list for an adjacent node changed.
7823 */
7824static void
7825hb_plugin_data_change_listener(cf_node changed_node_id)
7826{
7827 hb_event_queue(AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED, &changed_node_id, 1);
7828}
7829
7830/**
7831 * Initialize the plugin specific data structures.
7832 */
7833static void
7834hb_plugin_init()
7835{
7836 memset(&g_hb.plugins, 0, sizeof(g_hb.plugins));
7837
7838 // Be cute. Register self as a plugin.
7839 as_hb_plugin self_plugin;
7840 memset(&self_plugin, 0, sizeof(self_plugin));
7841 self_plugin.id = AS_HB_PLUGIN_HB;
7842 self_plugin.wire_size_fixed = 0;
7843 self_plugin.wire_size_per_node = sizeof(cf_node);
7844 self_plugin.set_fn = hb_plugin_set_fn;
7845 self_plugin.parse_fn = hb_plugin_parse_data_fn;
7846 self_plugin.change_listener = hb_plugin_data_change_listener;
7847 hb_plugin_register(&self_plugin);
7848}
7849
/**
 * Transmits heartbeats at fixed intervals.
 *
 * @param arg unused.
 * @return NULL on thread exit.
 */
void*
hb_transmitter(void* arg)
{
	DETAIL("heartbeat transmitter started");

	cf_clock last_time = 0;

	while (hb_is_running()) {
		cf_clock curr_time = cf_getms();

		if ((curr_time - last_time) < PULSE_TRANSMIT_INTERVAL()) {
			// Interval has not been reached for sending heartbeats. Sleep in
			// small steps so a shutdown is noticed promptly.
			usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time +
					PULSE_TRANSMIT_INTERVAL()) - curr_time) * 1000);
			continue;
		}

		last_time = curr_time;

		// Construct the pulse message.
		msg* msg = hb_msg_get();

		msg_src_fields_fill(msg);
		msg_type_set(msg, AS_HB_MSG_TYPE_PULSE);

		// Have plugins fill their data into the heartbeat pulse message.
		hb_plugin_msg_fill(msg);

		// Broadcast the heartbeat to all known recipients.
		channel_msg_broadcast(msg);

		// Return the msg back to the fabric.
		hb_msg_return(msg);

		DETAIL("done sending pulse message");
	}

	DETAIL("heartbeat transmitter stopped");
	return NULL;
}
7893
7894/**
7895 * Get hold of adjacent node information given its nodeid.
7896 * @param nodeid the nodeid.
7897 * @param adjacent_node the output node information.
7898 * @return 0 on success, -1 on failure.
7899 */
7900static int
7901hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node)
7902{
7903 int rv = -1;
7904 HB_LOCK();
7905
7906 if (cf_shash_get(g_hb.adjacency, &nodeid, adjacent_node) == CF_SHASH_OK) {
7907 rv = 0;
7908 }
7909
7910 HB_UNLOCK();
7911 return rv;
7912}
7913
7914/**
7915 * Get hold of an on-probation node information given its nodeid.
7916 * @param nodeid the nodeid.
7917 * @param adjacent_node the output node information.
7918 * @return 0 on success, -1 on failure.
7919 */
7920static int
7921hb_on_probation_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node)
7922{
7923 int rv = -1;
7924 HB_LOCK();
7925
7926 if (cf_shash_get(g_hb.on_probation, &nodeid, adjacent_node)
7927 == CF_SHASH_OK) {
7928 rv = 0;
7929 }
7930
7931 HB_UNLOCK();
7932 return rv;
7933}
7934
7935/**
7936 * Read the plugin data from an adjacent node.
7937 * @param adjacent_node the adjacent node.
7938 * @param plugin_data (output) will be null if this node has no plugin data.
7939 * Else will point to the plugin data.
7940 * @param plugin_data_size (output) the size of the plugin data.
7941 */
7942static void
7943hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node,
7944 as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size)
7945{
7946 *plugin_data_size =
7947 adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler
7948 % 2].data_size;
7949
7950 *plugin_data =
7951 *plugin_data_size ?
7952 (cf_node*)(adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler
7953 % 2].data) : NULL;
7954}
7955
7956/**
7957 * Get adjacency list for an adjacent node.
7958 */
7959static void
7960hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node,
7961 cf_node** adjacency_list, size_t* adjacency_length)
7962{
7963 hb_adjacent_node_plugin_data_get(adjacent_node, AS_HB_PLUGIN_HB,
7964 (void**)adjacency_list, adjacency_length);
7965 (*adjacency_length) /= sizeof(cf_node);
7966}
7967
7968/**
7969 * Indicates if a give node has expired and should be removed from the adjacency
7970 * list.
7971 */
7972static bool
7973hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node)
7974{
7975 if (nodeid == config_self_nodeid_get()) {
7976 return false;
7977 }
7978
7979 HB_LOCK();
7980
7981 cf_clock now = cf_getms();
7982
7983 bool expired = adjacent_node->last_updated_monotonic_ts + HB_NODE_TIMEOUT()
7984 < now;
7985
7986 HB_UNLOCK();
7987 return expired;
7988}
7989
7990/**
7991 * Indicates if self node has duplicate ids.
7992 */
7993static bool
7994hb_self_is_duplicate(){
7995 HB_LOCK();
7996 bool self_is_duplicate = g_hb.self_is_duplicate;
7997 HB_UNLOCK();
7998 return self_is_duplicate;
7999}
8000
8001/**
8002 * Updates the self is duplicate flag.
8003 */
8004static void
8005hb_self_duplicate_update()
8006{
8007 cf_clock now = cf_getms();
8008 HB_LOCK();
8009 if (g_hb.self_is_duplicate) {
8010 uint32_t duplicate_block_interval =
8011 config_endpoint_track_intervals_get()
8012 * config_tx_interval_get();
8013 if (g_hb.self_duplicate_detected_ts + duplicate_block_interval <= now) {
8014 // We have not seen duplicates for the endpoint change tracking
8015 // interval. Mark ourself as non-duplicate.
8016 g_hb.self_is_duplicate = false;
8017 }
8018 }
8019 HB_UNLOCK();
8020}
8021
8022/**
8023 * Free up space occupied by plugin data from adjacent node.
8024 */
8025static void
8026hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node)
8027{
8028 HB_LOCK();
8029 for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
8030 as_hb_plugin_node_data* curr_plugin_data = adjacent_node->plugin_data[i];
8031 for (int j = 0; j < 2; j++) {
8032 if (curr_plugin_data[j].data) {
8033 cf_free(curr_plugin_data[j].data);
8034 curr_plugin_data[j].data = NULL;
8035 }
8036
8037 curr_plugin_data[j].data_capacity = 0;
8038 curr_plugin_data[j].data_size = 0;
8039 }
8040 }
8041
8042 if (adjacent_node->endpoint_list) {
8043 // Free the endpoint list.
8044 cf_free(adjacent_node->endpoint_list);
8045 adjacent_node->endpoint_list = NULL;
8046 }
8047
8048 HB_UNLOCK();
8049}
8050
/**
 * Tend reduce function that removes expired nodes from adjacency list.
 *
 * @param key the nodeid.
 * @param data the adjacent node data.
 * @param udata an as_hb_adjacency_tender_udata collecting removed nodes into
 * its dead_nodes and evicted_nodes arrays.
 * @return CF_SHASH_REDUCE_DELETE to remove an expired / mismatched entry,
 * CF_SHASH_OK otherwise.
 */
static int
hb_adjacency_tend_reduce(const void* key, void* data, void* udata)
{
	cf_node nodeid = *(const cf_node*)key;
	as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
	as_hb_adjacency_tender_udata* adjacency_tender_udata =
			(as_hb_adjacency_tender_udata*)udata;

	int rv = CF_SHASH_OK;
	bool cluster_name_mismatch = adjacent_node->cluster_name_mismatch_count
			> CLUSTER_NAME_MISMATCH_MAX;
	if (hb_node_has_expired(nodeid, adjacent_node) || cluster_name_mismatch) {
		INFO("node expired %" PRIx64" %s", nodeid, cluster_name_mismatch ? "(cluster name mismatch)" : "");
		// Cluster-name mismatches are reported as evictions, timeouts as
		// dead nodes - the caller queues different events for each.
		if (cluster_name_mismatch) {
			adjacency_tender_udata->evicted_nodes[adjacency_tender_udata->evicted_node_count++] =
					nodeid;
		}
		else {
			adjacency_tender_udata->dead_nodes[adjacency_tender_udata->dead_node_count++] =
					nodeid;
		}

		// Free plugin data as well.
		hb_adjacent_node_destroy(adjacent_node);

		rv = CF_SHASH_REDUCE_DELETE;
	}

	return rv;
}
8084
8085/**
8086 * Tend reduce function that removes expired nodes from the probationary list.
8087 */
8088static int
8089hb_on_probation_tend_reduce(const void* key, void* data, void* udata)
8090{
8091 cf_node nodeid = *(const cf_node*)key;
8092 as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
8093
8094 int rv = CF_SHASH_OK;
8095 if (hb_node_has_expired(nodeid, adjacent_node)) {
8096 DEBUG("on-probation node %" PRIx64 " expired", nodeid);
8097 // Free plugin data as well.
8098 hb_adjacent_node_destroy(adjacent_node);
8099 rv = CF_SHASH_REDUCE_DELETE;
8100 }
8101 return rv;
8102}
8103
/**
 * Tends the adjacency list. Removes nodes that expire.
 *
 * @param arg unused.
 * @return NULL on thread exit.
 */
void*
hb_adjacency_tender(void* arg)
{
	DETAIL("adjacency tender started");

	cf_clock last_time = 0;
	cf_clock last_depart_time = 0;

	while (hb_is_running()) {
		cf_clock curr_time = cf_getms();
		uint32_t adjacency_tend_interval = ADJACENCY_TEND_INTERVAL;
		// Interval after node depart where we tend faster to detect additional
		// node departures.
		uint32_t fast_check_interval = 2 * config_tx_interval_get();
		if (last_depart_time + fast_check_interval > curr_time) {
			adjacency_tend_interval = ADJACENCY_FAST_TEND_INTERVAL;
		}

		hb_self_duplicate_update();

		if ((curr_time - last_time) < adjacency_tend_interval) {
			// Publish any pending events.
			hb_event_publish_pending();

			// Interval has not been reached for sending heartbeats
			usleep(
					MIN(AS_HB_TX_INTERVAL_MS_MIN,
							(last_time + adjacency_tend_interval) - curr_time)
							* 1000);
			continue;
		}

		last_time = curr_time;

		DETAIL("tending adjacency list");

		HB_LOCK();
		// VLAs sized to the current adjacency count - every expired node fits.
		cf_node dead_nodes[cf_shash_get_size(g_hb.adjacency)];
		cf_node evicted_nodes[cf_shash_get_size(g_hb.adjacency)];
		as_hb_adjacency_tender_udata adjacency_tender_udata;
		adjacency_tender_udata.dead_nodes = dead_nodes;
		adjacency_tender_udata.dead_node_count = 0;
		adjacency_tender_udata.evicted_nodes = evicted_nodes;
		adjacency_tender_udata.evicted_node_count = 0;

		cf_shash_reduce(g_hb.adjacency, hb_adjacency_tend_reduce,
				&adjacency_tender_udata);

		if (adjacency_tender_udata.dead_node_count > 0) {
			last_depart_time = curr_time;
			// Queue events for dead nodes.
			hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, dead_nodes,
					adjacency_tender_udata.dead_node_count);
		}

		if (adjacency_tender_udata.evicted_node_count > 0) {
			last_depart_time = curr_time;
			// Queue events for evicted nodes.
			hb_event_queue(AS_HB_INTERNAL_NODE_EVICT, evicted_nodes,
					adjacency_tender_udata.evicted_node_count);
		}

		// Expire nodes from the on-probation list.
		cf_shash_reduce(g_hb.on_probation, hb_on_probation_tend_reduce, NULL);
		HB_UNLOCK();

		// See if we have pending events to publish. Must happen outside the
		// hb lock.
		hb_event_publish_pending();

		DETAIL("done tending adjacency list");
	}

	DETAIL("adjacency tender shut down");
	return NULL;
}
8182
/**
 * Start the transmitter thread.
 */
static void
hb_tx_start()
{
	// Joinable so hb_tx_stop() can wait for it to exit.
	g_hb.transmitter_tid = cf_thread_create_joinable(hb_transmitter,
			(void*)&g_hb);
}
8193
/**
 * Stop the transmitter thread.
 */
static void
hb_tx_stop()
{
	DETAIL("waiting for the transmitter thread to stop");
	// Wait for the transmitter thread to stop.
	cf_thread_join(g_hb.transmitter_tid);
}
8204
/**
 * Start the adjacency tender thread.
 */
static void
hb_adjacency_tender_start()
{
	// Joinable so hb_adjacency_tender_stop() can wait for it to exit.
	g_hb.adjacency_tender_tid = cf_thread_create_joinable(hb_adjacency_tender,
			(void*)&g_hb);
}
8215
8216/**
8217 * Stop the adjacency tender thread.
8218 */
8219static void
8220hb_adjacency_tender_stop()
8221{
8222 // Wait for the adjacency tender thread to stop.
8223 cf_thread_join(g_hb.adjacency_tender_tid);
8224}
8225
/**
 * Initialize the heartbeat subsystem. Idempotent - a second call only logs a
 * warning.
 */
static void
hb_init()
{
	if (hb_is_initialized()) {
		WARNING("heartbeat main module is already initialized");
		return;
	}

	// Operate under a lock. Let's be paranoid everywhere.
	HB_LOCK();

	// Initialize the heartbeat data structure.
	memset(&g_hb, 0, sizeof(g_hb));

	// Initialize the adjacency hash.
	g_hb.adjacency = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
			sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);

	// Initialize the on_probation hash.
	g_hb.on_probation = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
			sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);

	// Initialize the temporary hash to map nodeid to index.
	g_hb.nodeid_to_index = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
			sizeof(int), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);

	// Initialize unpublished event queue.
	cf_queue_init(&g_hb_event_listeners.external_events_queue,
			sizeof(as_hb_event_node),
			AS_HB_CLUSTER_MAX_SIZE_SOFT, true);

	// Initialize the mode specific state.
	hb_mode_init();

	// Initialize the plugin functions.
	hb_plugin_init();

	// Initialize IO channel subsystem.
	channel_init();

	// Initialized but not yet running.
	g_hb.status = AS_HB_STATUS_STOPPED;

	HB_UNLOCK();
}
8273
/**
 * Start the heartbeat subsystem. If already running, the subsystem is
 * restarted.
 */
static void
hb_start()
{
	// Operate under a lock. Let's be paranoid everywhere.
	HB_LOCK();

	if (hb_is_running()) {
		// Shutdown the heartbeat subsystem.
		hb_stop();
	}

	g_hb.status = AS_HB_STATUS_RUNNING;

	// Initialize the heartbeat message templates. Called from here because
	// fabric needs to be initialized for this call to succeed. Fabric init
	// happens after heartbeat init.
	hb_msg_init();

	// Initialize channel sub module.
	channel_start();

	// Start the mode sub module
	hb_mode_start();

	// Start heart beat transmitter.
	hb_tx_start();

	// Start heart beat adjacency tender.
	hb_adjacency_tender_start();

	HB_UNLOCK();
}
8309
/**
 * Shut down the heartbeat subsystem. No-op (with a warning) if not running.
 */
static void
hb_stop()
{
	if (!hb_is_running()) {
		WARNING("heartbeat is already stopped");
		return;
	}

	HB_LOCK();
	// Signal the transmitter / tender threads to exit their run loops.
	g_hb.status = AS_HB_STATUS_SHUTTING_DOWN;
	HB_UNLOCK();

	// Publish pending events. Should not delay any events.
	hb_event_publish_pending();

	// Shutdown mode.
	if (hb_is_mesh()) {
		mesh_stop();
	}
	else {
		multicast_stop();
	}

	// Wait for the threads to shut down.
	hb_tx_stop();

	hb_adjacency_tender_stop();

	// Stop channels.
	channel_stop();

	g_hb.status = AS_HB_STATUS_STOPPED;
}
8346
8347/**
8348 * Register a plugin with the heart beat system.
8349 */
8350static void
8351hb_plugin_register(as_hb_plugin* plugin)
8352{
8353 HB_LOCK();
8354 memcpy(&g_hb.plugins[plugin->id], plugin, sizeof(as_hb_plugin));
8355 HB_UNLOCK();
8356}
8357
8358/**
8359 * Check if the heartbeat recieved is duplicate or stale.
8360 */
8361static bool
8362hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp last_send_ts)
8363{
8364 if (as_hlc_timestamp_order_get(event->msg_hlc_ts.send_ts, last_send_ts)
8365 == AS_HLC_HAPPENS_BEFORE) {
8366 // Received a delayed heartbeat send before the current heartbeat.
8367 return true;
8368 }
8369 return false;
8370}
8371
/**
 * Update the tracker with endpoint change status. The tracker is a bit
 * history - each update shifts in one bit, set when the endpoints changed.
 */
static void
hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed)
{
	uint64_t shifted = *tracker << 1;
	*tracker = endpoint_changed ? (shifted | 1) : shifted;
}
8383
/**
 * Indicates if endpoint changes for this node are normal, i.e. the number of
 * endpoint flips within the tracked window is within the configured limit.
 */
static bool
hb_endpoint_change_tracker_is_normal(uint64_t tracker)
{
	if (tracker == 0) {
		// Normal and healthy case.
		return true;
	}

	uint32_t num_intervals_to_track = MIN(64,
			config_endpoint_track_intervals_get());

	// Ignore history older than the tracked window.
	uint64_t mask = ~(~(uint64_t)0 << num_intervals_to_track);
	tracker &= mask;

	// Count set bits using Kernighan's method - each iteration clears the
	// lowest set bit.
	int flip_count = 0;
	while (tracker != 0) {
		tracker &= tracker - 1;
		flip_count++;
	}

	return flip_count <= config_endpoint_changes_allowed_get();
}
8410
/**
 * Indicates if the most recent tracker update recorded an endpoint change.
 */
static bool
hb_endpoint_change_tracker_has_changed(uint64_t tracker)
{
	// The low bit holds the most recent change flag.
	return (tracker & 1) != 0;
}
8420
/**
 * Update adjacent node data on receiving a valid pulse message.
 *
 * @param msg_event the channel event carrying the pulse message.
 * @param adjacent_node the adjacent node state to update in place.
 * @param plugin_data_changed (output) per-plugin change flags, filled by
 * hb_plugin_msg_parse.
 * @return 0 if the update was successfully applied, -1 if the update should be
 * rejected.
 */
static int
hb_adjacent_node_update(as_hb_channel_event* msg_event,
		as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[])
{
	msg* msg = msg_event->msg;

	cf_node source = 0;
	// Channel has validated the source. Don't bother checking here.
	msg_nodeid_get(msg, &source);

	msg_id_get(msg, &adjacent_node->protocol_version);

	as_hlc_timestamp send_ts = adjacent_node->last_msg_hlc_ts.send_ts;

	if (hb_endpoint_change_tracker_has_changed(
			adjacent_node->endpoint_change_tracker)) {
		// Allow a little more slack for obsolete checking because the two nodes
		// might not have matching send timestamps.
		send_ts = as_hlc_timestamp_subtract_ms(send_ts,
				config_tx_interval_get());
	}

	// Reject delayed / out-of-order heartbeats.
	if (hb_msg_is_obsolete(msg_event, send_ts)) {
		WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64,
				send_ts,
				msg_event->msg_hlc_ts.send_ts, source);
		return -1;
	}

	// Populate plugin data.
	hb_plugin_msg_parse(msg, adjacent_node, g_hb.plugins, plugin_data_changed);

	// Get the ip address.
	as_endpoint_list* msg_endpoint_list;
	if (msg_endpoint_list_get(msg, &msg_endpoint_list) == 0
			&& !as_endpoint_lists_are_equal(adjacent_node->endpoint_list,
					msg_endpoint_list)) {
		// Update the endpoints.
		endpoint_list_copy(&adjacent_node->endpoint_list, msg_endpoint_list);
	}

	// Update the last updated time.
	adjacent_node->last_updated_monotonic_ts = cf_getms();
	memcpy(&adjacent_node->last_msg_hlc_ts, &msg_event->msg_hlc_ts,
			sizeof(adjacent_node->last_msg_hlc_ts));

	// Update the latency as an exponential moving average of the absolute
	// send-to-receive HLC delta.
	int64_t latency = as_hlc_timestamp_diff_ms(msg_event->msg_hlc_ts.send_ts,
			msg_event->msg_hlc_ts.recv_ts);
	latency = latency < 0 ? -latency : latency;
	adjacent_node->avg_latency = ALPHA * latency
			+ (1 - ALPHA) * adjacent_node->avg_latency;

	// Reset the cluster-name mismatch counter to zero.
	adjacent_node->cluster_name_mismatch_count = 0;

	// Check if fabric endpoints have changed. Note: plugin_data_cycler was
	// already flipped by hb_plugin_msg_parse above, so cycler % 2 is the data
	// just parsed and (cycler + 1) % 2 the previous pulse's data.
	as_hb_plugin_node_data* curr_data =
			&adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][adjacent_node->plugin_data_cycler
					% 2];

	as_hb_plugin_node_data* prev_data =
			&adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][(adjacent_node->plugin_data_cycler
					+ 1) % 2];

	as_endpoint_list* curr_fabric_endpoints =
			as_fabric_hb_plugin_get_endpoint_list(curr_data);
	as_endpoint_list* prev_fabric_endpoints =
			as_fabric_hb_plugin_get_endpoint_list(prev_data);

	// Endpoints changed if this is not the first update and if the endpoint
	// lists do not match.
	bool endpoints_changed = prev_fabric_endpoints != NULL
			&& !as_endpoint_lists_are_equal(curr_fabric_endpoints,
					prev_fabric_endpoints);

	if (endpoints_changed) {
		char curr_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE];
		char prev_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE];

		as_endpoint_list_to_string(curr_fabric_endpoints,
				curr_fabric_endpoints_str, sizeof(curr_fabric_endpoints_str));
		as_endpoint_list_to_string(prev_fabric_endpoints,
				prev_fabric_endpoints_str, sizeof(prev_fabric_endpoints_str));

		TICKER_WARNING("node: %"PRIx64" fabric endpoints changed from {%s} to {%s}", source, prev_fabric_endpoints_str, curr_fabric_endpoints_str);
	}

	hb_endpoint_change_tracker_update(&adjacent_node->endpoint_change_tracker,
			endpoints_changed);

	return 0;
}
8520
8521/**
8522 * Indicates if a node can be considered adjacent, based on accumulated
8523 * statistics.
8524 */
8525static bool
8526hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node)
8527{
8528 return hb_endpoint_change_tracker_is_normal(
8529 adjacent_node->endpoint_change_tracker);
8530}
8531
/**
 * Process a pulse from source having our node-id. Detects duplicate node-ids
 * by comparing the pulse's fabric endpoints against our own published ones.
 *
 * @param msg_event the channel event carrying the self-sourced pulse.
 */
static void
hb_channel_on_self_pulse(as_hb_channel_event* msg_event)
{
	bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 };

	HB_LOCK();
	if (hb_adjacent_node_update(msg_event, &g_hb.self_node, plugin_data_changed)
			!= 0) {
		// Update rejected (obsolete message) - nothing more to do.
		goto Exit;
	}

	as_hb_plugin_node_data* curr_data =
			&g_hb.self_node.plugin_data[AS_HB_PLUGIN_FABRIC][g_hb.self_node.plugin_data_cycler
					% 2];
	as_endpoint_list* curr_fabric_endpoints =
			as_fabric_hb_plugin_get_endpoint_list(curr_data);

	if (!as_fabric_is_published_endpoint_list(curr_fabric_endpoints)) {
		// Mark self as having duplicate node-id.
		g_hb.self_is_duplicate = true;
		g_hb.self_duplicate_detected_ts = cf_getms();

		// Found another node with duplicate node-id.
		char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
		as_endpoint_list_to_string(curr_fabric_endpoints, endpoint_list_str,
				sizeof(endpoint_list_str));
		TICKER_WARNING("duplicate node-id: %" PRIx64 " with fabric endpoints {%s}", config_self_nodeid_get(), endpoint_list_str);
	}
	else {
		// Genuine self heartbeat - count it.
		cf_atomic_int_incr(&g_stats.heartbeat_received_self);
	}

Exit:
	HB_UNLOCK();
}
8570
8571/**
8572 * Process an incoming pulse message.
8573 */
8574static void
8575hb_channel_on_pulse(as_hb_channel_event* msg_event)
8576{
8577 msg* msg = msg_event->msg;
8578 cf_node source;
8579
8580 // Print cluster breach only once per second.
8581 static cf_clock last_cluster_breach_print = 0;
8582
8583 // Channel has validated the source. Don't bother checking here.
8584 msg_nodeid_get(msg, &source);
8585
8586 if (source == config_self_nodeid_get()) {
8587 hb_channel_on_self_pulse(msg_event);
8588 // Ignore self heartbeats.
8589 return;
8590 }
8591
8592 HB_LOCK();
8593
8594 as_hb_adjacent_node adjacent_node = { 0 };
8595
8596 bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 };
8597 bool is_in_adjacency = (hb_adjacent_node_get(source, &adjacent_node) == 0);
8598 bool should_be_on_probation = false;
8599
8600 if (!is_in_adjacency) {
8601 hb_on_probation_node_get(source, &adjacent_node);
8602 }
8603
8604 // Update the adjacent node with contents of the message.
8605 if (hb_adjacent_node_update(msg_event, &adjacent_node, plugin_data_changed)
8606 != 0) {
8607 // Update rejected.
8608 goto Exit;
8609 }
8610
8611 // Check if this node needs to be on probation.
8612 should_be_on_probation = !hb_node_can_consider_adjacent(&adjacent_node);
8613
8614 cf_atomic_int_incr(&g_stats.heartbeat_received_foreign);
8615
8616 bool is_new = !should_be_on_probation && !is_in_adjacency;
8617
8618 if (is_new) {
8619 int mcsize = config_mcsize();
8620 // Note: adjacency list does not contain self node hence
8621 // (mcsize - 1) in the check.
8622 if (cf_shash_get_size(g_hb.adjacency) >= (mcsize - 1)) {
8623 if (last_cluster_breach_print != (cf_getms() / 1000L)) {
8624 WARNING("ignoring node: %" PRIx64" - exceeding maximum supported cluster size %d",
8625 source, mcsize);
8626 last_cluster_breach_print = cf_getms() / 1000L;
8627 }
8628 goto Exit;
8629 }
8630 }
8631
8632 // Move the node to appropriate hash.
8633 cf_shash_put(should_be_on_probation ? g_hb.on_probation : g_hb.adjacency,
8634 &source, &adjacent_node);
8635
8636 // Maintain mutual exclusion between adjacency and on_probation hashes.
8637 cf_shash_delete(should_be_on_probation ? g_hb.adjacency : g_hb.on_probation,
8638 &source);
8639
8640 if (is_new) {
8641 // Publish event if this is a new node.
8642 INFO("node arrived %" PRIx64, source);
8643 hb_event_queue(AS_HB_INTERNAL_NODE_ARRIVE, &source, 1);
8644 }
8645 else if (should_be_on_probation && is_in_adjacency) {
8646 // This node needs to be on probation, most likely due to duplicate
8647 // node-ids.
8648 WARNING("node expired %" PRIx64" - potentially duplicate node-id", source);
8649 hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, &source, 1);
8650 }
8651
8652Exit:
8653 HB_UNLOCK();
8654
8655 // Publish any pending node arrival events.
8656 hb_event_publish_pending();
8657
8658 if (!should_be_on_probation) {
8659 // Call plugin change listeners outside of a lock to prevent deadlocks.
8660 for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
8661 if (plugin_data_changed[i] && g_hb.plugins[i].change_listener) {
8662 // Notify that data for this plugin for the source node has
8663 // changed.
8664 DETAIL("plugin data for node %" PRIx64" changed for plugin %d",
8665 source, i);
8666 (g_hb.plugins[i]).change_listener(source);
8667 }
8668 }
8669 }
8670}
8671
8672/**
8673 * Process an incoming heartbeat message.
8674 */
8675static void
8676hb_channel_on_msg_rcvd(as_hb_channel_event* event)
8677{
8678 msg* msg = event->msg;
8679 as_hb_msg_type type;
8680 msg_type_get(msg, &type);
8681
8682 switch (type) {
8683 case AS_HB_MSG_TYPE_PULSE: // A pulse message. Update the adjacent node data.
8684 hb_channel_on_pulse(event);
8685 break;
8686 default: // Ignore other messages.
8687 break;
8688 }
8689}
8690
8691/**
8692 * Increase the cluster-name mismatch counter the node.
8693 */
8694static void
8695hb_handle_cluster_name_mismatch(as_hb_channel_event* event)
8696{
8697 HB_LOCK();
8698
8699 as_hb_adjacent_node adjacent_node;
8700 memset(&adjacent_node, 0, sizeof(adjacent_node));
8701
8702 if (hb_adjacent_node_get(event->nodeid, &adjacent_node) != 0) {
8703 // Node does not exist in the adjacency list
8704 goto Exit;
8705 }
8706
8707 if (hb_msg_is_obsolete(event, adjacent_node.last_msg_hlc_ts.send_ts)) {
8708 WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64,
8709 adjacent_node.last_msg_hlc_ts.send_ts,
8710 event->msg_hlc_ts.send_ts, event->nodeid);
8711 goto Exit;
8712 }
8713
8714 // Update the cluster_name_mismatch counter.
8715 adjacent_node.cluster_name_mismatch_count++;
8716 cf_shash_put(g_hb.adjacency, &event->nodeid, &adjacent_node);
8717Exit:
8718 HB_UNLOCK();
8719}
8720
8721/**
8722 * Process channel events.
8723 */
8724static void
8725hb_channel_event_process(as_hb_channel_event* event)
8726{
8727 // Deal with pulse messages here.
8728 switch (event->type) {
8729 case AS_HB_CHANNEL_MSG_RECEIVED:
8730 hb_channel_on_msg_rcvd(event);
8731 break;
8732 case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH:
8733 hb_handle_cluster_name_mismatch(event);
8734 break;
8735 default: // Ignore channel active and inactive events. Rather rely on the adjacency
8736 // tender to expire nodes.
8737 break;
8738 }
8739}
8740
8741/**
8742 * Dump hb mode state to logs.
8743 * @param verbose enables / disables verbose logging.
8744 */
8745static void
8746hb_mode_dump(bool verbose)
8747{
8748 if (hb_is_mesh()) {
8749 mesh_dump(verbose);
8750 }
8751 else {
8752 multicast_dump(verbose);
8753 }
8754}
8755
8756/**
8757 * Reduce function to dump hb node info to log file.
8758 */
8759static int
8760hb_dump_reduce(const void* key, void* data, void* udata)
8761{
8762 const cf_node* nodeid = (const cf_node*)key;
8763 as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
8764
8765 char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
8766 as_endpoint_list_to_string(adjacent_node->endpoint_list, endpoint_list_str,
8767 sizeof(endpoint_list_str));
8768
8769 INFO("\tHB %s Node: node-id %" PRIx64" protocol %" PRIu32" endpoints {%s} last-updated %" PRIu64 " latency-ms %" PRIu64 ,
8770 (char*)udata,
8771 *nodeid, adjacent_node->protocol_version, endpoint_list_str,
8772 adjacent_node->last_updated_monotonic_ts, adjacent_node->avg_latency);
8773
8774 return CF_SHASH_OK;
8775}
8776
8777/**
8778 * Dump hb state to logs.
8779 * @param verbose enables / disables verbose logging.
8780 */
8781static void
8782hb_dump(bool verbose)
8783{
8784 HB_LOCK();
8785
8786 INFO("HB Adjacency Size: %d", cf_shash_get_size(g_hb.adjacency));
8787
8788 if (verbose) {
8789 cf_shash_reduce(g_hb.adjacency, hb_dump_reduce, "Adjacent");
8790 }
8791
8792 if (cf_shash_get_size(g_hb.on_probation)) {
8793 INFO("HB On-probation Size: %d", cf_shash_get_size(g_hb.on_probation));
8794
8795 if (verbose) {
8796 cf_shash_reduce(g_hb.on_probation, hb_dump_reduce, "On-probation");
8797 }
8798 }
8799
8800 HB_UNLOCK();
8801}
8802
8803/**
8804 * Compute a complement / inverted adjacency graph for input nodes such that
8805 * entry
8806 *
8807 * inverted_graph[i][j] = 0 iff node[i] and node[j] are in each others adjacency
8808 * lists. That is they have a bidirectional network link active between them.
8809 *
8810 * else
8811 *
8812 * inverted_graph[i][j] > 0 iff there is no link or a unidirectional link
8813 * between them.
8814 *
8815 *
8816 * @param nodes the input vector of nodes.
8817 * @param inverted_graph (output) a (num_nodes x num_nodes ) 2D byte array.
8818 */
8819static void
8820hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph)
8821{
8822 HB_LOCK();
8823 int num_nodes = cf_vector_size(nodes);
8824
8825 for (int i = 0; i < num_nodes; i++) {
8826 for (int j = 0; j < num_nodes; j++) {
8827 inverted_graph[i][j] = 2;
8828 }
8829 cf_node nodeid = 0;
8830 cf_vector_get(nodes, i, &nodeid);
8831 cf_shash_put(g_hb.nodeid_to_index, &nodeid, &i);
8832 }
8833
8834 cf_node self_nodeid = config_self_nodeid_get();
8835 int self_node_index = -1;
8836 cf_shash_get(g_hb.nodeid_to_index, &self_nodeid, &self_node_index);
8837
8838 for (int i = 0; i < num_nodes; i++) {
8839 // Mark the node connected from itself, i.e, disconnected in the
8840 // inverted graph.
8841 inverted_graph[i][i] = 0;
8842
8843 cf_node node = *(cf_node*)cf_vector_getp(nodes, i);
8844 as_hb_adjacent_node node_info;
8845
8846 if (hb_adjacent_node_get(node, &node_info) == 0) {
8847 if (self_node_index >= 0) {
8848 // Self node will not have plugin data. But the fact that this
8849 // node has an adjacent node indicates that is is in our
8850 // adjacency list. Adjust the graph.
8851 inverted_graph[i][self_node_index]--;
8852 inverted_graph[self_node_index][i]--;
8853 }
8854
8855 cf_node* adjacency_list = NULL;
8856 size_t adjacency_length = 0;
8857 hb_adjacent_node_adjacency_get(&node_info, &adjacency_list, &adjacency_length);
8858
8859 for (int j = 0; j < adjacency_length; j++) {
8860 int other_node_index = -1;
8861 cf_shash_get(g_hb.nodeid_to_index, &adjacency_list[j],
8862 &other_node_index);
8863 if (other_node_index < 0) {
8864 // This node is not in the input set of nodes.
8865 continue;
8866 }
8867
8868 if (i != other_node_index) {
8869 inverted_graph[i][other_node_index]--;
8870 inverted_graph[other_node_index][i]--;
8871 }
8872 }
8873 }
8874 }
8875
8876 // Cleanup the temporary hash.
8877 cf_shash_delete_all(g_hb.nodeid_to_index);
8878
8879 HB_UNLOCK();
8880}
8881
8882/**
8883 * Compute the nodes to evict from the input nodes so that remaining nodes form
8884 * a clique, based on adjacency lists using minimal vertex cover.
8885 *
8886 * The minimal vertex cover on this graph is the set of nodes that should be
8887 * removed to result in a clique on the remaining nodes. This implementation is
8888 * an approximation of the minimal vertex cover. The notion is to keep removing
8889 * vertices having the highest degree until there are no more edges remaining.
8890 * The heuristic gets rid of the more problematic nodes first.
8891 *
8892 * @param nodes input cf_node vector.
8893 * @param nodes_to_evict output cf_node clique array, that is initialized.
8894 */
8895static void
8896hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict)
8897{
8898 int num_nodes = cf_vector_size(nodes);
8899
8900 if (num_nodes == 0) {
8901 // Nothing to do.
8902 return;
8903 }
8904
8905 int graph_alloc_size = sizeof(uint8_t) * num_nodes * num_nodes;
8906 void* graph_data = MSG_BUFF_ALLOC(graph_alloc_size);
8907
8908 if (!graph_data) {
8909 CRASH("error allocating space for clique finding data structure");
8910 }
8911
8912 uint8_t* inverted_graph[num_nodes];
8913 inverted_graph[0] = graph_data;
8914 for (int i = 1; i < num_nodes; i++) {
8915 inverted_graph[i] = *inverted_graph + num_nodes * i;
8916 }
8917
8918 hb_adjacency_graph_invert(nodes, inverted_graph);
8919
8920 // Count the number of edges in the inverted graph. These edges are the ones
8921 // that need to be removed so that the remaining nodes form a clique in the
8922 // adjacency graph. Also for performance get hold of the self node index in
8923 // the nodes vector.
8924 int edge_count = 0;
8925 int self_node_index = -1;
8926 for (int i = 0; i < num_nodes; i++) {
8927 cf_node node = 0;
8928 cf_vector_get(nodes, i, &node);
8929 if (node == config_self_nodeid_get()) {
8930 self_node_index = i;
8931 }
8932
8933 for (int j = 0; j < num_nodes; j++) {
8934 if (inverted_graph[i][j]) {
8935 edge_count++;
8936 }
8937 }
8938 }
8939
8940 cf_vector_delete_range(nodes_to_evict, 0,
8941 cf_vector_size(nodes_to_evict) - 1);
8942
8943 // Since we always decide to retain self node, first get rid of all nodes
8944 // having missing links to self node.
8945 if (self_node_index >= 0) {
8946 for (int i = 0; i < num_nodes; i++) {
8947 if (inverted_graph[self_node_index][i]
8948 || inverted_graph[i][self_node_index]) {
8949 cf_node to_evict = 0;
8950 cf_vector_get(nodes, i, &to_evict);
8951 DEBUG("marking node %" PRIx64" for clique based eviction",
8952 to_evict);
8953
8954 cf_vector_append(nodes_to_evict, &to_evict);
8955
8956 // Remove all edges attached to the removed node.
8957 for (int j = 0; j < num_nodes; j++) {
8958 if (inverted_graph[i][j]) {
8959 inverted_graph[i][j] = 0;
8960 edge_count--;
8961 }
8962 if (inverted_graph[j][i]) {
8963 inverted_graph[j][i] = 0;
8964 edge_count--;
8965 }
8966 }
8967 }
8968 }
8969 }
8970
8971 while (edge_count > 0) {
8972 // Find vertex with highest degree.
8973 cf_node max_degree_node = 0;
8974 int max_degree_node_idx = -1;
8975 int max_degree = 0;
8976
8977 for (int i = 0; i < num_nodes; i++) {
8978 cf_node to_evict = 0;
8979 cf_vector_get(nodes, i, &to_evict);
8980
8981 if (vector_find(nodes_to_evict, &to_evict) >= 0) {
8982 // We have already decided to evict this node.
8983 continue;
8984 }
8985
8986 if (to_evict == config_self_nodeid_get()) {
8987 // Do not evict self.
8988 continue;
8989 }
8990
8991 // Get the degree of this node.
8992 int degree = 0;
8993 for (int j = 0; j < num_nodes; j++) {
8994 if (inverted_graph[i][j]) {
8995 degree++;
8996 }
8997 }
8998
8999 DETAIL("inverted degree for node %" PRIx64" is %d",
9000 to_evict, degree);
9001
9002 // See if this node has a higher degree. On ties choose the node
9003 // with a smaller nodeid
9004 if (degree > max_degree
9005 || (degree == max_degree && max_degree_node > to_evict)) {
9006 max_degree = degree;
9007 max_degree_node = to_evict;
9008 max_degree_node_idx = i;
9009 }
9010 }
9011
9012 if (max_degree_node_idx < 0) {
9013 // We are done no node to evict.
9014 break;
9015 }
9016
9017 DEBUG("marking node %" PRIx64" with degree %d for clique based eviction",
9018 max_degree_node, max_degree);
9019
9020 cf_vector_append(nodes_to_evict, &max_degree_node);
9021
9022 // Remove all edges attached to the removed node.
9023 for (int i = 0; i < num_nodes; i++) {
9024 if (inverted_graph[max_degree_node_idx][i]) {
9025 inverted_graph[max_degree_node_idx][i] = 0;
9026 edge_count--;
9027 }
9028 if (inverted_graph[i][max_degree_node_idx]) {
9029 inverted_graph[i][max_degree_node_idx] = 0;
9030 edge_count--;
9031 }
9032 }
9033 }
9034
9035 MSG_BUFF_FREE(graph_data, graph_alloc_size);
9036}
9037
9038/**
9039 * Reduce function to iterate over plugin data for all adjacent nodes.
9040 */
9041static int
9042hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata)
9043{
9044 const cf_node* nodeid = (const cf_node*)key;
9045 as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
9046 as_hb_adjacecny_iterate_reduce_udata* reduce_udata =
9047 (as_hb_adjacecny_iterate_reduce_udata*)udata;
9048
9049 size_t plugin_data_size =
9050 adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler
9051 % 2].data_size;
9052 void* plugin_data =
9053 plugin_data_size ?
9054 adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler
9055 % 2].data : NULL;
9056
9057 reduce_udata->iterate_fn(*nodeid, plugin_data, plugin_data_size,
9058 adjacent_node->last_updated_monotonic_ts,
9059 &adjacent_node->last_msg_hlc_ts, reduce_udata->udata);
9060
9061 return CF_SHASH_OK;
9062}
9063
9064/**
9065 * Call the iterate method on all nodes in current adjacency list. Note plugin
9066 * data can still be NULL if the plugin data failed to parse the plugin data.
9067 *
9068 * @param pluginid the plugin identifier.
9069 * @param iterate_fn the iterate function invoked for plugin data forevery node.
9070 * @param udata passed as is to the iterate function. Useful for getting results
9071 * out of the iteration. NULL if there is no plugin data.
9072 * @return the size of the plugin data. 0 if there is no plugin data.
9073 */
9074static void
9075hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
9076 as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
9077{
9078 HB_LOCK();
9079
9080 as_hb_adjacecny_iterate_reduce_udata reduce_udata;
9081 reduce_udata.pluginid = pluginid;
9082 reduce_udata.iterate_fn = iterate_fn;
9083 reduce_udata.udata = udata;
9084 cf_shash_reduce(g_hb.adjacency, hb_plugin_data_iterate_reduce,
9085 &reduce_udata);
9086
9087 HB_UNLOCK();
9088}
9089