1 | /* |
2 | * clustering.c |
3 | * |
4 | * Copyright (C) 2016 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #include "fabric/clustering.h" |
24 | |
#include <errno.h>
#include <limits.h> // For UINT_MAX.
26 | #include <math.h> |
27 | #include <stdio.h> |
28 | #include <unistd.h> |
29 | #include <sys/param.h> // For MAX() and MIN(). |
30 | |
31 | #include "citrusleaf/alloc.h" |
32 | #include "citrusleaf/cf_clock.h" |
33 | #include "citrusleaf/cf_random.h" |
34 | |
35 | #include "cf_thread.h" |
36 | #include "fault.h" |
37 | #include "msg.h" |
38 | #include "node.h" |
39 | #include "shash.h" |
40 | |
41 | #include "base/cfg.h" |
42 | #include "fabric/fabric.h" |
43 | #include "fabric/hlc.h" |
44 | |
45 | /* |
46 | * Overview |
47 | * ======== |
48 | * Clustering v5 implementation based on the design at |
49 | * https://aerospike.atlassian.net/wiki/pages/viewpage.action?spaceKey=DEV&title=Central+Wiki%3A++Clustering+V5 |
50 | * |
51 | * Public and private view of the cluster |
52 | * ======================================= |
 * This clustering algorithm introduces an orphan state, in which this node is
 * not part of a cluster, but is looking to form/join a cluster. During this
 * transitional phase, the public view of the cluster, the tuple <cluster_key,
 * succession_list>, does not change from the last view. However the internal
 * view, which is published along with the heartbeat messages, is set to
 * <0, []>.
 *
 * This ensures clients continue to function (maybe with errors) during the
 * transition from the orphan state to being part of a cluster. This is in line
 * with the clustering v4 and prior behaviour.
63 | * |
64 | * TODO: (revise) |
65 | * |
66 | * Deviations from paxos |
67 | * ===================== |
68 | * |
69 | * Accepted value |
70 | * --------------- |
71 | * |
 * The accepted value is not sent along with the accept and accepted messages.
 * The latest accepted value overwrites the previous value at a node. In paxos,
 * if a node has already accepted a value, it is sent back to the proposer, who
 * should use the value with the highest proposal id as the final value. Here
 * the proposer generates the final consensus value as the succession list
 * containing the nodes that have returned both promise and accepted replies.
78 | * |
 * This is not safe in terms of achieving a single paxos value, however it is
80 | * safe in that nodes courted by other principals will get filtered out during |
81 | * paxos and not require additional paxos rounds. |
82 | * |
 * It is still possible that the final consensus succession list might have a few
84 | * nodes moving out owing to a neighboring principal. However the faulty node |
85 | * check in the next quantum interval will fix this. |
86 | * |
87 | * Quorum |
88 | * ------ |
89 | * The prepare phase uses a majority quorum for the promise messages, to speed |
90 | * through the paxos round. However the accept phase uses a complete / full |
91 | * quorum for accepted messages. This helps with ensuring that when a node |
 * generates a cluster change event all cluster members have applied the current
93 | * cluster membership. |
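 *
 * As a rough sketch (the counters named here are illustrative, not actual
 * variables from this file), the two quorum checks amount to:
 *
 *   promise quorum met:   promise_count   >   acceptor_count / 2
 *   accepted quorum met:  accepted_count  ==  acceptor_count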
94 | * |
95 | * Design |
96 | * ====== |
 * The clustering sub-system interacts with the rest of Aerospike via input
 * event notifications (primarily heartbeat events) and output event
 * notifications (primarily cluster change notifications).
100 | * |
101 | * The subsystem is driven by internal events (that also encapsulate external |
 * input event notifications) like timer, quantum interval start, adjacency
103 | * changed, message received, etc. |
104 | * |
105 | * The clustering-v5 subsystem is further organized as the following sub-modules |
106 | * each of which reacts to the above mentioned events based on individual state |
107 | * transition diagrams. |
108 | * |
109 | * 1. Timer |
110 | * 2. Quantum interval generator |
111 | * 3. Paxos proposer |
112 | * 4. Paxos acceptor |
113 | * 5. Register |
114 | * 6. External event publisher |
115 | * 7. Internal event dispatcher |
116 | * 8. Clustering main |
117 | * |
118 | * The sub modules also interact with each other via inline internal event |
119 | * dispatch and handling. |
120 | * |
121 | * Timer |
122 | * ----- |
123 | * Generates timer events that serve as the internal tick/clock for the |
 * clustering-v5 sub-system. Other sub-modules use the timer events to drive
 * actions to be performed at fixed intervals, e.g. message retransmits.
126 | * |
127 | * Quantum interval generator |
128 | * -------------------------- |
 * Generates quantum interval start events, at which cluster change decisions
 * are taken.
131 | * |
132 | * Paxos proposer |
133 | * -------------- |
134 | * The paxos proposer proposes a cluster change. The node may or may not be the |
135 | * eventual principal for the cluster. |
136 | * |
137 | * Paxos acceptor |
138 | * -------------- |
139 | * Participates in voting for a proposal. A paxos proposer is also necessarily |
 * an acceptor in this design.
141 | * |
142 | * Register |
143 | * -------- |
144 | * Holds current cluster membership and cluster key. It is responsible for |
145 | * ensuring all cluster members have their registers in sync before publishing |
146 | * an external cluster change event. |
147 | * |
148 | * External event publisher |
149 | * ------------------------ |
 * Generates and publishes external events for cluster changes. Runs as a
 * separate thread to prevent interference and potential deadlocks with the
 * clustering subsystem.
153 | * |
154 | * Internal event dispatcher |
155 | * ------------------------- |
 * Dispatches internal events to the appropriate handler functions based on the
 * event type and current state.
158 | * |
159 | * Clustering main |
160 | * --------------- |
161 | * Monitors the cluster and triggers cluster changes. |
162 | * |
163 | * State transitions |
164 | * ================= |
165 | * TODO: diagrams for each sub-module |
166 | * |
167 | * Message send rules |
168 | * ================== |
 * Message sends should preferably happen outside the main clustering lock and
 * should not be followed by any state change in the same function. This is
 * because fabric relays messages to self inline in the send call itself, which
 * can lead to corruption if the message handler involves a state change as
 * well, or can result in the message handler seeing inconsistent, partially
 * updated state.
174 | */ |
175 | |
176 | /* |
177 | * ---------------------------------------------------------------------------- |
178 | * Constants |
179 | * ---------------------------------------------------------------------------- |
180 | */ |
181 | |
182 | /** |
 * A soft limit for the maximum cluster size. Meant to optimize hash and list
 * data structures and not as a limit on the number of nodes.
185 | */ |
186 | #define AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT 200 |
187 | |
188 | /** |
 * Timer event generation interval in milliseconds.
190 | */ |
191 | #define CLUSTERING_TIMER_TICK_INTERVAL 75 |
192 | |
193 | /** |
 * Maximum time a paxos round would take to complete: 3 RTTs for paxos message
 * exchanges and 1 RTT as a buffer.
196 | */ |
197 | #define PAXOS_COMPLETION_TIME_MAX (4 * network_rtt_max()) |
198 | |
199 | /** |
200 | * Maximum quantum interval duration, should be at least two heartbeat |
201 | * intervals, to ensure there is at least one exchange of clustering information |
202 | * over heartbeats. |
203 | */ |
204 | #define QUANTUM_INTERVAL_MAX MAX(5000, 2 * as_hb_tx_interval_get()) |
205 | |
206 | /** |
 * Block size for allocating node plugin data. Ensure the allocation is in
 * multiples of 128 bytes (16 8-byte nodeids), allowing expansion to 16 nodes
 * without reallocating.
209 | */ |
210 | #define HB_PLUGIN_DATA_BLOCK_SIZE 128 |
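
/*
 * For illustration only (not a helper used elsewhere in this file), a required
 * size would be rounded up to the block size above like so:
 *
 *	size_t rounded = ((required + HB_PLUGIN_DATA_BLOCK_SIZE - 1)
 *			/ HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE;
 */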
211 | |
212 | /** |
213 | * Scratch size for clustering messages. |
214 | * |
215 | * TODO: Compute this properly. |
216 | */ |
217 | #define AS_CLUSTERING_MSG_SCRATCH_SIZE 1024 |
218 | |
219 | /** |
 * Majority value for preferred principal to be selected for move. Use two
 * thirds as the majority value.
222 | */ |
#define AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY (2.0 / 3)
224 | |
225 | /* |
226 | * ---------------------------------------------------------------------------- |
227 | * Paxos data structures |
228 | * ---------------------------------------------------------------------------- |
229 | */ |
230 | |
231 | /** |
232 | * Paxos sequence number. We will use the hybrid logical clock timestamp as |
233 | * sequence numbers, to ensure node restarts do not reset the sequence number |
 * back to zero and sequence numbers are monotonically increasing. A sequence
235 | * number value of zero is invalid. |
236 | */ |
237 | typedef as_hlc_timestamp as_paxos_sequence_number; |
238 | |
239 | /** |
240 | * Paxos proposal identifier. |
241 | * Note: The nodeid can be skipped when sending the proposal id over the wire |
 * and can be inferred from the source during paxos message exchanges.
243 | */ |
244 | typedef struct as_paxos_proposal_id_s |
245 | { |
246 | /** |
247 | * The sequence number. |
248 | */ |
249 | as_paxos_sequence_number sequence_number; |
250 | |
251 | /** |
252 | * The proposing node's nodeid to break ties. |
253 | */ |
254 | cf_node src_nodeid; |
255 | } as_paxos_proposal_id; |
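
/*
 * Illustrative ordering for the proposal id above (a sketch of the intent
 * described in the overview, not necessarily the comparison helper this file
 * actually uses): the higher sequence number wins, and the nodeid breaks ties.
 *
 *	static int
 *	example_proposal_id_compare(const as_paxos_proposal_id* p1,
 *			const as_paxos_proposal_id* p2)
 *	{
 *		if (p1->sequence_number != p2->sequence_number) {
 *			return p1->sequence_number > p2->sequence_number ? 1 : -1;
 *		}
 *
 *		return p1->src_nodeid == p2->src_nodeid
 *				? 0 : (p1->src_nodeid > p2->src_nodeid ? 1 : -1);
 *	}
 */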
256 | |
257 | /** |
258 | * The proposed cluster membership. |
259 | */ |
260 | typedef struct as_paxos_proposed_value_s |
261 | { |
262 | /** |
263 | * The cluster key. |
264 | */ |
265 | as_cluster_key cluster_key; |
266 | |
267 | /** |
268 | * The succession list. |
269 | */ |
270 | cf_vector succession_list; |
271 | } as_paxos_proposed_value; |
272 | |
273 | /** |
274 | * Paxos acceptor state. |
275 | */ |
276 | typedef enum |
277 | { |
278 | /** |
	 * Acceptor is idle with no active paxos round.
280 | */ |
281 | AS_PAXOS_ACCEPTOR_STATE_IDLE, |
282 | |
283 | /** |
284 | * Acceptor has received and acked a promise message. |
285 | */ |
286 | AS_PAXOS_ACCEPTOR_STATE_PROMISED, |
287 | |
288 | /** |
289 | * Acceptor has received and accepted an accept message from a proposer. |
290 | */ |
291 | AS_PAXOS_ACCEPTOR_STATE_ACCEPTED |
292 | } as_paxos_acceptor_state; |
293 | |
294 | /** |
295 | * Data tracked by the node in the role of a paxos acceptor. |
296 | * All nodes are paxos acceptors. |
297 | */ |
298 | typedef struct as_paxos_acceptor_s |
299 | { |
300 | /** |
301 | * The paxos acceptor state. |
302 | */ |
303 | as_paxos_acceptor_state state; |
304 | |
305 | /** |
306 | * Monotonic timestamp when the first message for current proposal was |
307 | * received from the proposer. |
308 | */ |
309 | cf_clock acceptor_round_start; |
310 | |
311 | /** |
312 | * Monotonic timestamp when the promise message was sent. |
313 | */ |
314 | cf_clock promise_send_time; |
315 | |
316 | /** |
	 * Monotonic timestamp when the accepted message was sent.
318 | */ |
319 | cf_clock accepted_send_time; |
320 | |
321 | /** |
322 | * Id of the last proposal, promised or accepted by this node. |
323 | */ |
324 | as_paxos_proposal_id last_proposal_received_id; |
325 | } as_paxos_acceptor; |
326 | |
327 | /** |
328 | * State of a paxos proposer. |
329 | */ |
330 | typedef enum as_paxos_proposer_state_e |
331 | { |
332 | /** |
333 | * Paxos proposer is idle. No pending paxos rounds. |
334 | */ |
335 | AS_PAXOS_PROPOSER_STATE_IDLE, |
336 | |
337 | /** |
338 | * Paxos proposer sent out a prepare message. |
339 | */ |
340 | AS_PAXOS_PROPOSER_STATE_PREPARE_SENT, |
341 | |
342 | /** |
343 | * Paxos proposer has sent out an accept message. |
344 | */ |
345 | AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT |
346 | } as_paxos_proposer_state; |
347 | |
348 | /** |
349 | * Data tracked by the node in the role of a paxos proposer. The proposer node |
350 | * may or may not be the current or eventual principal. |
351 | */ |
352 | typedef struct as_paxos_proposer_s |
353 | { |
354 | /** |
355 | * The state of the proposer. |
356 | */ |
357 | as_paxos_proposer_state state; |
358 | |
359 | /** |
360 | * The sequence number / id for the last proposed paxos value. |
361 | */ |
362 | as_paxos_sequence_number sequence_number; |
363 | |
364 | /** |
365 | * The proposed cluster value. |
366 | */ |
367 | as_paxos_proposed_value proposed_value; |
368 | |
369 | /** |
370 | * The time current paxos round was started. |
371 | */ |
372 | cf_clock paxos_round_start_time; |
373 | |
374 | /** |
375 | * The time current proposal's prepare message was sent. |
376 | */ |
377 | cf_clock prepare_send_time; |
378 | |
379 | /** |
380 | * The time current proposal's accept message was sent. |
381 | */ |
382 | cf_clock accept_send_time; |
383 | |
384 | /** |
385 | * The time current proposal's learn message was sent. |
386 | */ |
387 | cf_clock learn_send_time; |
388 | |
389 | /** |
390 | * Indicates if learn message needs retransmit. |
391 | */ |
392 | bool learn_retransmit_needed; |
393 | |
394 | /** |
395 | * The set of acceptor nodes including self. |
396 | */ |
397 | cf_vector acceptors; |
398 | |
399 | /** |
	 * Set of nodeids that sent a promise response to the current prepare
401 | * message. |
402 | */ |
403 | cf_vector promises_received; |
404 | |
405 | /** |
	 * Set of nodeids that sent an accepted response to the current accept
407 | * message. |
408 | */ |
409 | cf_vector accepted_received; |
410 | } as_paxos_proposer; |
411 | |
412 | /** |
413 | * Result of paxos round start call. |
414 | */ |
415 | typedef enum as_paxos_start_result_e |
416 | { |
417 | /** |
418 | * Paxos round started successfully. |
419 | */ |
420 | AS_PAXOS_RESULT_STARTED, |
421 | |
422 | /** |
	 * Cluster size is less than the minimum required cluster size.
424 | */ |
425 | AS_PAXOS_RESULT_CLUSTER_TOO_SMALL, |
426 | |
427 | /** |
428 | * Paxos round already in progress. Paxos not started. |
429 | */ |
430 | AS_PAXOS_RESULT_ROUND_RUNNING |
431 | } as_paxos_start_result; |
432 | |
433 | /** |
434 | * Node clustering status. |
435 | */ |
436 | typedef enum |
437 | { |
438 | /** |
439 | * Peer node is orphaned. |
440 | */ |
441 | AS_NODE_ORPHAN, |
442 | |
443 | /** |
444 | * Peer node has a cluster assigned. |
445 | */ |
446 | AS_NODE_CLUSTER_ASSIGNED, |
447 | |
448 | /** |
449 | * Peer node status is unknown. |
450 | */ |
451 | AS_NODE_UNKNOWN |
452 | } as_clustering_peer_node_state; |
453 | |
454 | /* |
455 | * ---------------------------------------------------------------------------- |
456 | * Clustering data structures |
457 | * ---------------------------------------------------------------------------- |
458 | */ |
459 | |
460 | /** |
461 | * Clustering message types. |
462 | */ |
463 | typedef enum |
464 | { |
465 | /* |
466 | * ---- Clustering management messages ---- |
467 | */ |
468 | AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST, |
469 | AS_CLUSTERING_MSG_TYPE_JOIN_REJECT, |
470 | AS_CLUSTERING_MSG_TYPE_MERGE_MOVE, |
471 | AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED, |
472 | |
473 | /* |
474 | * ---- Paxos messages ---- |
475 | */ |
476 | AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE, |
477 | AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE, |
478 | AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK, |
479 | AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT, |
480 | AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED, |
481 | AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK, |
482 | AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN, |
483 | } as_clustering_msg_type; |
484 | |
485 | /** |
486 | * The fields in the clustering message. |
487 | */ |
488 | typedef enum |
489 | { |
490 | /** |
491 | * Clustering message identifier. |
492 | */ |
493 | AS_CLUSTERING_MSG_ID, |
494 | |
495 | /** |
496 | * Clustering message type. |
497 | */ |
498 | AS_CLUSTERING_MSG_TYPE, |
499 | |
500 | /** |
	 * The source node's send timestamp.
502 | */ |
503 | AS_CLUSTERING_MSG_HLC_TIMESTAMP, |
504 | |
505 | /** |
506 | * The paxos sequence number. Not all messages will have this. |
507 | */ |
508 | AS_CLUSTERING_MSG_SEQUENCE_NUMBER, |
509 | |
510 | /** |
511 | * The proposed cluster key. Only part of the paxos accept message. |
512 | */ |
513 | AS_CLUSTERING_MSG_CLUSTER_KEY, |
514 | |
515 | /** |
516 | * The proposed succession list. Only part of the paxos accept message. |
517 | */ |
518 | AS_CLUSTERING_MSG_SUCCESSION_LIST, |
519 | |
520 | /** |
	 * The proposed principal, relevant only to cluster move commands, which
	 * merge two well-formed paxos clusters.
523 | */ |
524 | AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, |
525 | |
526 | /** |
527 | * Sentinel value to keep track of the number of message fields. |
528 | */ |
529 | AS_CLUSTERING_MGS_SENTINEL |
530 | } as_clustering_msg_field; |
531 | |
532 | /** |
533 | * Internal clustering event type. |
534 | */ |
535 | typedef enum |
536 | { |
537 | /** |
538 | * Timer event. |
539 | */ |
540 | AS_CLUSTERING_INTERNAL_EVENT_TIMER, |
541 | |
542 | /** |
543 | * Incoming message event. |
544 | */ |
545 | AS_CLUSTERING_INTERNAL_EVENT_MSG, |
546 | |
547 | /** |
548 | * A join request was accepted. |
549 | */ |
550 | AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED, |
551 | |
552 | /** |
553 | * Indicates the start of a quantum interval. |
554 | */ |
555 | AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START, |
556 | |
557 | /** |
558 | * Indicates that self node's cluster membership changed. |
559 | */ |
560 | AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED, |
561 | |
562 | /** |
563 | * Indicates that self node's cluster membership has been synced across all |
564 | * cluster members. |
565 | */ |
566 | AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED, |
567 | |
568 | /** |
569 | * Indicates that self node has been marked as an orphan. |
570 | */ |
571 | AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED, |
572 | |
573 | /** |
574 | * Indicates an incoming heartbeat event. |
575 | */ |
576 | AS_CLUSTERING_INTERNAL_EVENT_HB, |
577 | |
578 | /** |
579 | * Indicates that plugin data for a node has changed. |
580 | */ |
581 | AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED, |
582 | |
583 | /** |
	 * The paxos round being accepted succeeded and the proposed value should
	 * be committed. This implies that all the proposed cluster members have
	 * agreed on the proposed cluster key and the proposed cluster membership.
588 | */ |
589 | AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS, |
590 | |
591 | /** |
592 | * The last paxos round being accepted failed. |
593 | */ |
594 | AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_FAIL, |
595 | |
596 | /** |
	 * The paxos round proposed by this node succeeded.
598 | */ |
599 | AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_SUCCESS, |
600 | |
601 | /** |
602 | * The last paxos round proposed failed. |
603 | */ |
604 | AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL, |
605 | } as_clustering_internal_event_type; |
606 | |
607 | /** |
608 | * An event used internally by the clustering subsystem. |
609 | */ |
610 | typedef struct as_clustering_internal_event_s |
611 | { |
612 | /** |
613 | * The event type. |
614 | */ |
615 | as_clustering_internal_event_type type; |
616 | |
617 | /** |
618 | * The event qualifier. |
619 | */ |
620 | as_clustering_event_qualifier qualifier; |
621 | |
622 | /* |
623 | * ----- Quantum interval start event related fields |
624 | */ |
625 | /** |
626 | * Indicates if this quantum interval start can be skipped by the event |
627 | * handler. |
628 | */ |
629 | bool quantum_interval_is_skippable; |
630 | |
631 | /* |
632 | * ----- Message event related fields. |
633 | */ |
634 | /** |
635 | * The source node id. |
636 | */ |
637 | cf_node msg_src_nodeid; |
638 | |
639 | /** |
640 | * Incoming message type. |
641 | */ |
642 | as_clustering_msg_type msg_type; |
643 | |
644 | /** |
645 | * The hlc timestamp for message receipt. |
646 | */ |
647 | as_hlc_msg_timestamp msg_hlc_ts; |
648 | |
649 | /** |
650 | * Local monotonic received timestamp. |
651 | */ |
652 | cf_clock msg_recvd_ts; |
653 | |
654 | /** |
655 | * The received message. |
656 | */ |
657 | msg* msg; |
658 | |
659 | /* |
660 | * ----- HB event related fields. |
661 | */ |
662 | /** |
663 | * Number of heartbeat events. |
664 | */ |
665 | int hb_n_events; |
666 | |
667 | /** |
668 | * Heartbeat events. |
669 | */ |
670 | as_hb_event_node* hb_events; |
671 | |
672 | /* |
673 | * ----- HB plugin data changed event related fields. |
674 | */ |
675 | /** |
676 | * Node id of the node whose plugin data has changed. |
677 | */ |
678 | cf_node plugin_data_changed_nodeid; |
679 | |
680 | /** |
681 | * Node's plugin data. |
682 | */ |
683 | as_hb_plugin_node_data* plugin_data; |
684 | |
685 | /** |
686 | * The hlc timestamp for message receipt. |
687 | */ |
688 | as_hlc_msg_timestamp plugin_data_changed_hlc_ts; |
689 | |
690 | /** |
691 | * Local monotonic received timestamp. |
692 | */ |
693 | cf_clock plugin_data_changed_ts; |
694 | |
695 | /* |
696 | * ----- Join request handled related fields. |
697 | */ |
698 | cf_node join_request_source_nodeid; |
699 | |
700 | /* |
701 | * ----- Paxos success related fields. |
702 | */ |
703 | /** |
704 | * New succession list. |
705 | */ |
706 | cf_vector *new_succession_list; |
707 | |
708 | /** |
709 | * New cluster key. |
710 | */ |
711 | as_cluster_key new_cluster_key; |
712 | |
713 | /** |
714 | * New paxos sequence number. |
715 | */ |
716 | as_paxos_sequence_number new_sequence_number; |
717 | } as_clustering_internal_event; |
718 | |
719 | /** |
720 | * The clustering timer state. |
721 | */ |
722 | typedef struct as_clustering_timer_s |
723 | { |
724 | /** |
725 | * The timer thread id. |
726 | */ |
727 | pthread_t timer_tid; |
728 | } as_clustering_timer; |
729 | |
730 | /** |
731 | * Clustering subsystem state. |
732 | */ |
733 | typedef enum |
734 | { |
735 | AS_CLUSTERING_SYS_STATE_UNINITIALIZED, |
736 | AS_CLUSTERING_SYS_STATE_RUNNING, |
737 | AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN, |
738 | AS_CLUSTERING_SYS_STATE_STOPPED |
739 | } as_clustering_sys_state; |
740 | |
741 | /** |
 * Type of quantum interval fault. Ensure the vtable in the quantum interval
 * generator is updated for each type.
744 | */ |
745 | typedef enum as_clustering_quantum_fault_type_e |
746 | { |
747 | /** |
748 | * A new node arrived. |
749 | */ |
750 | QUANTUM_FAULT_NODE_ARRIVED, |
751 | |
752 | /** |
753 | * A node not our principal departed from the cluster. |
754 | */ |
755 | QUANTUM_FAULT_NODE_DEPARTED, |
756 | |
757 | /** |
	 * We are in a cluster and our principal departed.
759 | */ |
760 | QUANTUM_FAULT_PRINCIPAL_DEPARTED, |
761 | |
762 | /** |
763 | * A member node's adjacency list has changed. |
764 | */ |
765 | QUANTUM_FAULT_PEER_ADJACENCY_CHANGED, |
766 | |
767 | /** |
768 | * Join request accepted. |
769 | */ |
770 | QUANTUM_FAULT_JOIN_ACCEPTED, |
771 | |
772 | /** |
773 | * We have seen a principal who might send us a merge request. |
774 | */ |
775 | QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, |
776 | |
777 | /** |
778 | * A node in our cluster has been orphaned. |
779 | */ |
780 | QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED, |
781 | |
782 | /** |
783 | * Sentinel value. Should be the last in the enum. |
784 | */ |
785 | QUANTUM_FAULT_TYPE_SENTINEL |
786 | } as_clustering_quantum_fault_type; |
787 | |
788 | /** |
 * Fault information for the first fault event detected in a quantum interval.
790 | */ |
791 | typedef struct as_clustering_quantum_fault_s |
792 | { |
793 | /** |
794 | * First time the fault event was detected in current quantum based on |
795 | * monotonic clock. Should be initialized to zero at quantum start / end. |
796 | */ |
797 | cf_clock event_ts; |
798 | |
799 | /** |
800 | * Last time the fault event was detected in current quantum based on |
801 | * monotonic clock. Should be initialized to zero at quantum start / end. |
802 | */ |
803 | cf_clock last_event_ts; |
804 | } as_clustering_quantum_fault; |
805 | |
806 | /** |
 * Function to determine the minimum wait time after a given fault happens.
808 | */ |
809 | typedef uint32_t |
810 | (as_clustering_quantum_fault_wait_fn)(as_clustering_quantum_fault* fault); |
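
/*
 * A minimal sketch of a wait function matching the typedef above (purely
 * illustrative; not one of the fault handlers actually registered by this
 * file):
 *
 *	static uint32_t
 *	example_fault_wait(as_clustering_quantum_fault* fault)
 *	{
 *		(void)fault;
 *		return quantum_interval() / 2;
 *	}
 */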
811 | |
812 | /** |
813 | * Vtable for different types of faults. |
814 | */ |
815 | typedef struct as_clustering_quantum_fault_vtable_s |
816 | { |
817 | /** |
818 | * String used to log this fault type. |
819 | */ |
820 | char *fault_log_str; |
821 | |
822 | /** |
823 | * Function providing the wait time for this fault type. |
824 | */ |
825 | as_clustering_quantum_fault_wait_fn* wait_fn; |
826 | } as_clustering_quantum_fault_vtable; |
827 | |
828 | /** |
829 | * Generates quantum intervals. |
830 | */ |
831 | typedef struct as_clustering_quantum_interval_generator_s |
832 | { |
833 | /** |
834 | * Quantum interval fault vtable. |
835 | */ |
836 | as_clustering_quantum_fault_vtable vtable[QUANTUM_FAULT_TYPE_SENTINEL]; |
837 | |
838 | /** |
839 | * Quantum interval faults. |
840 | */ |
841 | as_clustering_quantum_fault fault[QUANTUM_FAULT_TYPE_SENTINEL]; |
842 | |
843 | /** |
844 | * Time quantum interval last started. |
845 | */ |
846 | cf_clock last_quantum_start_time; |
847 | |
848 | /** |
	 * For deciding whether a quantum interval is skippable, use the last
	 * quantum interval length, since quantum_interval() will be affected by
	 * changes to hb config.
851 | */ |
852 | uint32_t last_quantum_interval; |
853 | |
854 | /** |
855 | * Indicates if current quantum interval should be postponed. |
856 | */ |
857 | bool is_interval_postponed; |
858 | } as_clustering_quantum_interval_generator; |
859 | |
860 | /** |
861 | * State of the clustering register. |
862 | */ |
863 | typedef enum |
864 | { |
865 | /** |
	 * The register contents are in sync with all cluster members.
867 | */ |
868 | AS_CLUSTERING_REGISTER_STATE_SYNCED, |
869 | |
870 | /** |
871 | * The register contents are being synced with other cluster members. |
872 | */ |
873 | AS_CLUSTERING_REGISTER_STATE_SYNCING |
874 | } as_clustering_register_state; |
875 | |
876 | /** |
877 | * Stores current cluster key and succession list and generates external events. |
878 | */ |
879 | typedef struct as_clustering_register_s |
880 | { |
881 | /** |
882 | * The register state. |
883 | */ |
884 | as_clustering_register_state state; |
885 | |
886 | /** |
887 | * Current cluster key. |
888 | */ |
889 | as_cluster_key cluster_key; |
890 | |
891 | /** |
892 | * Current succession list. |
893 | */ |
894 | cf_vector succession_list; |
895 | |
896 | /** |
897 | * Indicates if this node has transitioned to orphan state after being in a |
898 | * valid cluster. |
899 | */ |
900 | bool has_orphan_transitioned; |
901 | |
902 | /** |
903 | * The sequence number for the current cluster. |
904 | */ |
905 | as_paxos_sequence_number sequence_number; |
906 | |
907 | /** |
908 | * Nodes pending sync. |
909 | */ |
910 | cf_vector sync_pending; |
911 | |
912 | /** |
	 * Nodes that sent a change applied message for an unexpected cluster.
	 * Stored in case this is an imminent cluster change we will see in the
	 * future. All the nodes in this vector have sent the same cluster key and
	 * the same succession list.
917 | */ |
918 | cf_vector ooo_change_applied_received; |
919 | |
920 | /** |
921 | * Cluster key sent by nodes in ooo_change_applied_received vector. |
922 | */ |
923 | as_cluster_key ooo_cluster_key; |
924 | |
925 | /** |
	 * Succession list sent by nodes in the ooo_change_applied_received vector.
927 | */ |
928 | cf_vector ooo_succession_list; |
929 | |
930 | /** |
931 | * Timestamp of the first ooo change applied message. |
932 | */ |
933 | as_hlc_timestamp ooo_hlc_timestamp; |
934 | |
935 | /** |
936 | * The time cluster last changed. |
937 | */ |
938 | as_hlc_timestamp cluster_modified_hlc_ts; |
939 | |
940 | /** |
941 | * The monotonic clock time cluster last changed. |
942 | */ |
943 | cf_clock cluster_modified_time; |
944 | |
945 | /** |
946 | * The last time the register sync was checked in the syncing state. |
947 | */ |
948 | cf_clock last_sync_check_time; |
949 | } as_clustering_register; |
950 | |
951 | /** |
 * Clustering state.
953 | */ |
954 | typedef enum |
955 | { |
956 | /** |
957 | * Self node is not part of a cluster. |
958 | */ |
959 | AS_CLUSTERING_STATE_ORPHAN, |
960 | |
961 | /** |
	 * Self node is the principal of a cluster.
963 | */ |
964 | AS_CLUSTERING_STATE_PRINCIPAL, |
965 | |
966 | /** |
967 | * Self node is part of a cluster but not the principal. |
968 | */ |
969 | AS_CLUSTERING_STATE_NON_PRINCIPAL |
970 | } as_clustering_state; |
971 | |
972 | /** |
973 | * Clustering state maintained by this node. |
974 | */ |
975 | typedef struct as_clustering_s |
976 | { |
977 | |
978 | /** |
	 * Clustering submodule state, indicates if the clustering subsystem is
	 * running, stopped or uninitialized.
981 | */ |
982 | as_clustering_sys_state sys_state; |
983 | |
984 | /** |
985 | * Simple view of whether or not the cluster is well-formed. |
986 | */ |
987 | bool has_integrity; |
988 | |
989 | /** |
990 | * Clustering relevant state, e.g. orphan, principal, non-principal. |
991 | */ |
992 | as_clustering_state state; |
993 | |
994 | /** |
	 * The preferred principal is a node such that removing the current
	 * principal and making said node the new principal will lead to a larger
	 * cluster. This is updated in the non-principal state at each quantum
	 * interval and is sent out with each heartbeat pulse.
999 | */ |
1000 | cf_node preferred_principal; |
1001 | |
1002 | /** |
1003 | * Pending join requests. |
1004 | */ |
1005 | cf_vector pending_join_requests; |
1006 | |
1007 | /** |
1008 | * The monotonic clock time when this node entered orphan state. |
1009 | * Will be set to zero when the node is not an orphan. |
1010 | */ |
1011 | cf_clock orphan_state_start_time; |
1012 | |
1013 | /** |
1014 | * Time when the last move command was sent. |
1015 | */ |
1016 | cf_clock move_cmd_issue_time; |
1017 | |
1018 | /** |
	 * Hash from the nodes to whom a join request was sent to the time the join
	 * request was sent. Used to prevent sending join requests too quickly to
	 * the same principal again and again.
1022 | */ |
1023 | cf_shash* join_request_blackout; |
1024 | |
1025 | /** |
1026 | * The principal to which the last join request was sent. |
1027 | */ |
1028 | cf_node last_join_request_principal; |
1029 | |
1030 | /** |
1031 | * The time at which the last join request was sent, to track and timeout |
1032 | * join requests. |
1033 | */ |
1034 | cf_clock last_join_request_sent_time; |
1035 | |
1036 | /** |
1037 | * The time at which the last join request was retransmitted, to track and |
1038 | * retransmit join requests. |
1039 | */ |
1040 | cf_clock last_join_request_retransmit_time; |
1041 | } as_clustering; |
1042 | |
1043 | /** |
1044 | * Result of sending out a join request. |
1045 | */ |
1046 | typedef enum as_clustering_join_request_result_e |
1047 | { |
1048 | /** |
1049 | * |
1050 | * Join request was sent out. |
1051 | */ |
1052 | AS_CLUSTERING_JOIN_REQUEST_SENT, |
1053 | |
1054 | /** |
1055 | * |
1056 | * Join request was attempted, but sending failed. |
1057 | */ |
1058 | AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED, |
1059 | |
1060 | /** |
1061 | * Join request already pending. A new join request was not sent. |
1062 | */ |
1063 | AS_CLUSTERING_JOIN_REQUEST_PENDING, |
1064 | |
1065 | /** |
1066 | * No neighboring principals present to send the join request. |
1067 | */ |
1068 | AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS |
1069 | } as_clustering_join_request_result; |
1070 | |
1071 | /** |
1072 | * External event publisher state. |
1073 | */ |
1074 | typedef struct as_clustering_external_event_publisher_s |
1075 | { |
1076 | /** |
1077 | * State of the external event publisher. |
1078 | */ |
1079 | as_clustering_sys_state sys_state; |
1080 | |
1081 | /** |
	 * Indicates if there is an event to publish.
1083 | */ |
1084 | bool event_queued; |
1085 | |
1086 | /** |
1087 | * The pending event to publish. |
1088 | */ |
1089 | as_clustering_event to_publish; |
1090 | |
1091 | /** |
1092 | * The static succession list published with the message. |
1093 | */ |
1094 | cf_vector published_succession_list; |
1095 | |
1096 | /** |
1097 | * Conditional variable to signal pending event to publish. |
1098 | */ |
1099 | pthread_cond_t is_pending; |
1100 | |
1101 | /** |
1102 | * Thread id of the publisher thread. |
1103 | */ |
1104 | pthread_t event_publisher_tid; |
1105 | |
1106 | /** |
1107 | * Mutex to protect the conditional variable. |
1108 | */ |
1109 | pthread_mutex_t is_pending_mutex; |
1110 | } as_clustering_external_event_publisher; |
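
/*
 * A rough sketch (an assumed pattern, not code lifted from elsewhere in this
 * file) of how a pending event would be signaled to the publisher thread
 * using the mutex and condition variable above:
 *
 *	pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex);
 *	g_external_event_publisher.event_queued = true;
 *	pthread_cond_signal(&g_external_event_publisher.is_pending);
 *	pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex);
 */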
1111 | |
1112 | /* |
1113 | * ---------------------------------------------------------------------------- |
1114 | * Forward declarations |
1115 | * ---------------------------------------------------------------------------- |
1116 | */ |
1117 | static void |
1118 | internal_event_dispatch(as_clustering_internal_event* event); |
1119 | static bool |
1120 | clustering_is_our_principal(cf_node nodeid); |
1121 | static bool |
1122 | clustering_is_principal(); |
1123 | static bool |
1124 | clustering_is_cluster_member(cf_node nodeid); |
1125 | |
1126 | /* |
1127 | * ---------------------------------------------------------------------------- |
1128 | * Non-public hooks to exchange subsystem. |
1129 | * ---------------------------------------------------------------------------- |
1130 | */ |
1131 | extern void |
1132 | exchange_clustering_event_listener(as_clustering_event* event); |
1133 | |
1134 | /* |
1135 | * ---------------------------------------------------------------------------- |
1136 | * Timer, timeout values and intervals |
1137 | * |
1138 | * All values should be multiples of timer tick interval. |
1139 | * ---------------------------------------------------------------------------- |
1140 | */ |
1141 | |
1142 | /** |
1143 | * Timer tick interval, which should be a GCD of all clustering intervals. |
1144 | */ |
1145 | static uint32_t |
1146 | timer_tick_interval() |
1147 | { |
1148 | return CLUSTERING_TIMER_TICK_INTERVAL; |
1149 | } |
1150 | |
1151 | /** |
1152 | * Maximum network latency for the cluster. |
1153 | */ |
1154 | static uint32_t |
1155 | network_latency_max() |
1156 | { |
1157 | return g_config.fabric_latency_max_ms; |
1158 | } |
1159 | |
1160 | /** |
1161 | * Maximum network rtt for the cluster. |
1162 | */ |
1163 | static uint32_t |
1164 | network_rtt_max() |
1165 | { |
1166 | return 2 * network_latency_max(); |
1167 | } |
1168 | |
1169 | /** |
1170 | * Quantum interval in milliseconds. |
1171 | */ |
1172 | static uint32_t |
1173 | quantum_interval() |
1174 | { |
1175 | uint32_t std_quantum_interval = MIN(QUANTUM_INTERVAL_MAX, |
1176 | as_hb_node_timeout_get() |
1177 | + 2 * (as_hb_tx_interval_get() + network_latency_max())); |
1178 | |
1179 | // Ensure we give paxos enough time to complete. |
1180 | return MAX(PAXOS_COMPLETION_TIME_MAX, std_quantum_interval); |
1181 | } |
1182 | |
1183 | /** |
1184 | * Maximum number of times quantum interval start can be skipped. |
1185 | */ |
1186 | static uint32_t |
1187 | quantum_interval_skip_max() |
1188 | { |
1189 | return 2; |
1190 | } |
1191 | |
1192 | /** |
1193 | * Interval at which register sync is checked. |
1194 | */ |
1195 | static uint32_t |
1196 | register_sync_check_interval() |
1197 | { |
1198 | return MAX(network_rtt_max(), as_hb_tx_interval_get()); |
1199 | } |
1200 | |
1201 | /** |
1202 | * Timeout for a join request, should definitely be larger than a quantum |
1203 | * interval to prevent the requesting node from making new requests before the |
1204 | * current requested principal node can finish the paxos round. |
1205 | */ |
1206 | static uint32_t |
1207 | join_request_timeout() |
1208 | { |
1209 | // Allow for |
1210 | // - 1 quantum interval, where our request lands just after the potential |
1211 | // principal's quantum interval start. |
1212 | // - 0.5 quantum intervals to give time for a paxos round to finish |
1213 | // - (quantum_interval_skip_max -1) intervals if the principal had to skip |
1214 | // quantum intervals. |
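	// For example, with quantum_interval() = 5000 ms and
	// quantum_interval_skip_max() = 2 (illustrative values only), this works
	// out to (1 + 0.5 + 1) * 5000 = 12500 ms.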
1215 | return (uint32_t)( |
1216 | (1 + 0.5 + (quantum_interval_skip_max() - 1)) * quantum_interval()); |
1217 | } |
1218 | |
1219 | /** |
 * Timeout for retransmitting a join request.
1221 | */ |
1222 | static uint32_t |
1223 | join_request_retransmit_timeout() |
1224 | { |
1225 | return (uint32_t)(MIN(as_hb_tx_interval_get() / 2, quantum_interval() / 2)); |
1226 | } |
1227 | |
1228 | /** |
1229 | * The interval at which a node checks to see if it should join a cluster. |
1230 | */ |
1231 | static uint32_t |
1232 | join_cluster_check_interval() |
1233 | { |
1234 | return timer_tick_interval(); |
1235 | } |
1236 | |
1237 | /** |
1238 | * Blackout period for join requests to a particular principal to prevent |
1239 | * bombarding it with join requests. Should be less than join_request_timeout(). |
1240 | */ |
1241 | static uint32_t |
1242 | join_request_blackout_interval() |
1243 | { |
1244 | return MIN(join_request_timeout(), |
1245 | MIN(quantum_interval() / 2, 2 * as_hb_tx_interval_get())); |
1246 | } |
1247 | |
1248 | /** |
1249 | * Blackout period after sending a move command, during which join requests will |
1250 | * be rejected. |
1251 | */ |
1252 | static uint32_t |
1253 | join_request_move_reject_interval() |
1254 | { |
1255 | // Wait for one quantum interval before accepting join requests after |
1256 | // sending a move command. |
1257 | return quantum_interval(); |
1258 | } |
1259 | |
1260 | /** |
1261 | * Maximum tolerable join request transmission delay in milliseconds. Join |
1262 | * requests delayed by more than this amount will not be accepted. |
1263 | */ |
1264 | static uint32_t |
1265 | join_request_accept_delay_max() |
1266 | { |
1267 | // Join request is considered stale / delayed if the (received hlc timestamp |
1268 | // - send hlc timestamp) > this value; |
1269 | return (2 * as_hb_tx_interval_get() + network_latency_max()); |
1270 | } |
1271 | |
1272 | /** |
 * Timeout in milliseconds for a paxos proposal. Give a paxos round half of a
 * quantum interval to timeout.
1275 | * A paxos round should definitely timeout before the next quantum interval, so |
1276 | * that it does not delay cluster convergence. |
1277 | */ |
1278 | static uint32_t |
1279 | paxos_proposal_timeout() |
1280 | { |
1281 | return MAX(quantum_interval() / 2, network_rtt_max()); |
1282 | } |
1283 | |
1284 | /** |
1285 | * Timeout in milliseconds after which a paxos message is retransmitted. |
1286 | */ |
1287 | static uint32_t |
1288 | paxos_msg_timeout() |
1289 | { |
1290 | return MAX(MIN(quantum_interval() / 4, 100), network_rtt_max()); |
1291 | } |
1292 | |
1293 | /** |
1294 | * Maximum amount of time a node will be in orphan state. After this timeout the |
1295 | * node will try forming a new cluster even if there are other adjacent |
1296 | * clusters/nodes visible. |
1297 | */ |
1298 | static uint32_t |
1299 | clustering_orphan_timeout() |
1300 | { |
1301 | return UINT_MAX; |
1302 | } |
1303 | |
1304 | /* |
1305 | * ---------------------------------------------------------------------------- |
1306 | * Stack allocation |
1307 | * ---------------------------------------------------------------------------- |
1308 | */ |
1309 | |
1310 | /** |
1311 | * Maximum memory size allocated on the call stack. |
1312 | */ |
1313 | #define STACK_ALLOC_LIMIT() (16 * 1024) |
1314 | |
1315 | /** |
1316 | * Allocate a buffer on stack if possible. Larger buffers are heap allocated to |
1317 | * prevent stack overflows. |
1318 | */ |
1319 | #define BUFFER_ALLOC_OR_DIE(size) \ |
1320 | (((size) > STACK_ALLOC_LIMIT()) ? cf_malloc(size) : alloca(size)) |
1321 | |
1322 | /** |
 * Free the buffer allocated by BUFFER_ALLOC_OR_DIE.
1324 | */ |
1325 | #define BUFFER_FREE(buffer, size) \ |
1326 | if (((size) > STACK_ALLOC_LIMIT()) && buffer) {cf_free(buffer);} |
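
/*
 * Hypothetical usage of the buffer macros above (for illustration only): the
 * same size expression must be passed to both the alloc and the free so that
 * heap-allocated buffers are actually freed.
 *
 *	size_t buffer_size = element_count * element_size;
 *	void* buffer = BUFFER_ALLOC_OR_DIE(buffer_size);
 *	// ... use buffer ...
 *	BUFFER_FREE(buffer, buffer_size);
 */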
1327 | |
1328 | /* |
1329 | * ---------------------------------------------------------------------------- |
1330 | * Logging |
1331 | * ---------------------------------------------------------------------------- |
1332 | */ |
1333 | #define LOG_LENGTH_MAX() (800) |
1334 | #define CRASH(format, ...) cf_crash(AS_CLUSTERING, format, ##__VA_ARGS__) |
1335 | #define WARNING(format, ...) cf_warning(AS_CLUSTERING, format, ##__VA_ARGS__) |
1336 | #define INFO(format, ...) cf_info(AS_CLUSTERING, format, ##__VA_ARGS__) |
1337 | #define DEBUG(format, ...) cf_debug(AS_CLUSTERING, format, ##__VA_ARGS__) |
1338 | #define DETAIL(format, ...) cf_detail(AS_CLUSTERING, format, ##__VA_ARGS__) |
1339 | |
1340 | #define ASSERT(expression, message, ...) \ |
1341 | if (!(expression)) {WARNING(message, ##__VA_ARGS__);} |
1342 | |
1343 | #define log_cf_node_array(message, nodes, node_count, severity) \ |
1344 | as_clustering_log_cf_node_array(severity, AS_CLUSTERING, message, \ |
1345 | nodes, node_count) |
1346 | #define log_cf_node_vector(message, nodes, severity) \ |
1347 | as_clustering_log_cf_node_vector(severity, AS_CLUSTERING, message, \ |
1348 | nodes) |
1349 | |
1350 | /* |
1351 | * ---------------------------------------------------------------------------- |
1352 | * Vector functions |
1353 | * ---------------------------------------------------------------------------- |
1354 | */ |
1355 | |
1356 | /** |
1357 | * Clear / delete all entries in a vector. |
1358 | */ |
1359 | static void |
1360 | vector_clear(cf_vector* vector) |
1361 | { |
1362 | cf_vector_delete_range(vector, 0, cf_vector_size(vector)); |
1363 | } |
1364 | |
1365 | /** |
1366 | * Create temporary stack variables. |
1367 | */ |
1368 | #define TOKEN_PASTE(x, y) x##y |
1369 | #define STACK_VAR(x, y) TOKEN_PASTE(x, y) |
1370 | |
1371 | /** |
 * Initialize a lockless vector, initially sized to store the soft-maximum
 * number of cluster nodes.
1374 | */ |
1375 | #define vector_lockless_init(vectorp, value_type) \ |
1376 | ({ \ |
1377 | cf_vector_init(vectorp, sizeof(value_type), \ |
1378 | AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); \ |
1379 | }) |
1380 | |
1381 | /** |
 * Create and initialize a lockless stack-allocated vector, initially sized to
 * store the soft-maximum number of cluster nodes.
1384 | */ |
1385 | #define vector_stack_lockless_create(value_type) \ |
1386 | ({ \ |
1387 | cf_vector * STACK_VAR(vector, __LINE__) = (cf_vector*)alloca( \ |
1388 | sizeof(cf_vector)); \ |
1389 | size_t buffer_size = AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT \ |
1390 | * sizeof(value_type); \ |
1391 | void* STACK_VAR(buff, __LINE__) = alloca(buffer_size); cf_vector_init_smalloc( \ |
1392 | STACK_VAR(vector, __LINE__), sizeof(value_type), \ |
1393 | (uint8_t*)STACK_VAR(buff, __LINE__), buffer_size, \ |
1394 | VECTOR_FLAG_INITZERO); \ |
1395 | STACK_VAR(vector, __LINE__); \ |
1396 | }) |
1397 | |
1398 | /** |
 * Check two vectors for equality. Two vectors are equal if they have the same
 * number of elements and corresponding elements are equal. For now a simple
 * memory compare is used to compare elements. Assumes the vectors are not
 * accessed by other threads during this operation.
 *
 * @param v1 the first vector to compare.
 * @param v2 the second vector to compare.
 * @return true if the vectors are equal, false otherwise.
1407 | */ |
1408 | static bool |
1409 | vector_equals(cf_vector* v1, cf_vector* v2) |
1410 | { |
1411 | int v1_count = cf_vector_size(v1); |
1412 | int v2_count = cf_vector_size(v2); |
1413 | int v1_elem_sz = VECTOR_ELEM_SZ(v1); |
1414 | int v2_elem_sz = VECTOR_ELEM_SZ(v2); |
1415 | |
1416 | if (v1_count != v2_count || v1_elem_sz != v2_elem_sz) { |
1417 | return false; |
1418 | } |
1419 | |
1420 | for (int i = 0; i < v1_count; i++) { |
1421 | // No null check required since we are iterating under a lock and within |
1422 | // vector bounds. |
1423 | void* v1_element = cf_vector_getp(v1, i); |
1424 | void* v2_element = cf_vector_getp(v2, i); |
1425 | |
1426 | if (v1_element == v2_element) { |
1427 | // Same reference or both are NULL. |
1428 | continue; |
1429 | } |
1430 | |
1431 | if (v1_element == NULL || v2_element == NULL) { |
1432 | // Exactly one reference is NULL. |
1433 | return false; |
1434 | } |
1435 | |
1436 | if (memcmp(v1_element, v2_element, v1_elem_sz) != 0) { |
1437 | return false; |
1438 | } |
1439 | } |
1440 | |
1441 | return true; |
1442 | } |
1443 | |
1444 | /** |
1445 | * Find the index of an element in the vector. Equality is based on mem compare. |
1446 | * |
1447 | * @param vector the source vector. |
1448 | * @param element the element to find. |
1449 | * @return the index if the element is found, -1 otherwise. |
1450 | */ |
1451 | static int |
1452 | vector_find(cf_vector* vector, void* element) |
1453 | { |
1454 | int element_count = cf_vector_size(vector); |
1455 | size_t value_len = VECTOR_ELEM_SZ(vector); |
1456 | for (int i = 0; i < element_count; i++) { |
1457 | // No null check required since we are iterating under a lock and within |
1458 | // vector bounds. |
1459 | void* src_element = cf_vector_getp(vector, i); |
1460 | if (src_element) { |
1461 | if (memcmp(element, src_element, value_len) == 0) { |
1462 | return i; |
1463 | } |
1464 | } |
1465 | } |
1466 | return -1; |
1467 | } |
1468 | |
1469 | /** |
 * Copy all elements from the source vector to the destination vector. Assumes
 * the source and destination vectors are not being modified while the copy
 * operation is in progress.
1473 | * |
1474 | * @param dest the destination vector. |
1475 | * @param src the source vector. |
1476 | * @return the number of elements copied. |
1477 | */ |
1478 | static int |
1479 | vector_copy(cf_vector* dest, cf_vector* src) |
1480 | { |
1481 | int element_count = cf_vector_size(src); |
1482 | int copied_count = 0; |
1483 | for (int i = 0; i < element_count; i++) { |
1484 | // No null check required since we are iterating under a lock and within |
1485 | // vector bounds. |
1486 | void* src_element = cf_vector_getp(src, i); |
1487 | if (src_element) { |
1488 | cf_vector_append(dest, src_element); |
1489 | copied_count++; |
1490 | } |
1491 | } |
1492 | return copied_count; |
1493 | } |
1494 | |
1495 | /** |
 * Copy all elements from the source vector to the destination vector only if
1497 | * they do not exist in the destination vector. Assumes the source and |
1498 | * destination vector are not being modified while the copy operation is in |
1499 | * progress. |
1500 | * |
1501 | * @param dest the destination vector. |
1502 | * @param src the source vector. |
1503 | * @return the number of elements copied. |
1504 | */ |
1505 | static int |
1506 | vector_copy_unique(cf_vector* dest, cf_vector* src) |
1507 | { |
1508 | int element_count = cf_vector_size(src); |
1509 | int copied_count = 0; |
1510 | for (int i = 0; i < element_count; i++) { |
1511 | // No null check required since we are iterating under a lock and within |
1512 | // vector bounds. |
1513 | void* src_element = cf_vector_getp(src, i); |
1514 | if (src_element) { |
1515 | cf_vector_append_unique(dest, src_element); |
1516 | copied_count++; |
1517 | } |
1518 | } |
1519 | return copied_count; |
1520 | } |
1521 | |
1522 | /** |
 * Sorts in place the elements in the vector using the given comparator
 * function and retains only unique elements. Assumes the source vector is not
 * being modified while the sort operation is in progress.
 *
 * @param src the source vector.
 * @param comparator the comparator function, which must return an integer less
 * than, equal to, or greater than zero if the first argument is considered to
 * be respectively less than, equal to, or greater than the second.
1531 | */ |
1532 | static void |
1533 | vector_sort_unique(cf_vector* src, int |
1534 | (*comparator)(const void*, const void*)) |
1535 | { |
1536 | int element_count = cf_vector_size(src); |
1537 | size_t value_len = VECTOR_ELEM_SZ(src); |
1538 | size_t array_size = element_count * value_len; |
1539 | void* element_array = BUFFER_ALLOC_OR_DIE(array_size); |
1540 | |
1541 | // A lame approach to sorting. Copying the elements to an array and invoking |
1542 | // qsort. |
1543 | uint8_t* next_element_ptr = element_array; |
1544 | int array_element_count = 0; |
1545 | for (int i = 0; i < element_count; i++) { |
1546 | // No null check required since we are iterating under a lock and within |
1547 | // vector bounds. |
1548 | void* src_element = cf_vector_getp(src, i); |
1549 | if (src_element) { |
1550 | memcpy(next_element_ptr, src_element, value_len); |
1551 | next_element_ptr += value_len; |
1552 | array_element_count++; |
1553 | } |
1554 | } |
1555 | |
1556 | qsort(element_array, array_element_count, value_len, comparator); |
1557 | |
1558 | vector_clear(src); |
1559 | next_element_ptr = element_array; |
1560 | for (int i = 0; i < array_element_count; i++) { |
1561 | cf_vector_append_unique(src, next_element_ptr); |
1562 | next_element_ptr += value_len; |
1563 | } |
1564 | |
1565 | BUFFER_FREE(element_array, array_size); |
1566 | return; |
1567 | } |
1568 | |
1569 | /** |
1570 | * Remove all elements from the to_remove vector present in the target vector. |
1571 | * Equality is based on simple mem compare. |
1572 | * |
1573 | * @param target the target vector being modified. |
1574 | * @param to_remove the vector whose elements must be removed from the target. |
1575 | * @return the number of elements removed. |
1576 | */ |
1577 | static int |
1578 | vector_subtract(cf_vector* target, cf_vector* to_remove) |
1579 | { |
1580 | int element_count = cf_vector_size(to_remove); |
1581 | int removed_count = 0; |
1582 | for (int i = 0; i < element_count; i++) { |
1583 | // No null check required since we are iterating under a lock and within |
1584 | // vector bounds. |
1585 | void* to_remove_element = cf_vector_getp(to_remove, i); |
1586 | if (to_remove_element) { |
1587 | int found_at = 0; |
1588 | while ((found_at = vector_find(target, to_remove_element)) >= 0) { |
1589 | cf_vector_delete(target, found_at); |
1590 | removed_count++; |
1591 | } |
1592 | } |
1593 | } |
1594 | |
1595 | return removed_count; |
1596 | } |
1597 | |
1598 | /** |
1599 | * Convert a vector to an array. |
1600 | * FIXME: return pointer to the internal vector storage. |
1601 | */ |
1602 | static cf_node* |
1603 | vector_to_array(cf_vector* vector) |
1604 | { |
1605 | return (cf_node*)vector->vector; |
1606 | } |
1607 | |
1608 | /** |
1609 | * Copy elements in a vector to an array. |
 * @param array the destination array. Should be large enough to hold all
 * elements in the vector.
1612 | * @param src the source vector. |
1613 | * @param element_count the number of elements to copy from the source vector. |
1614 | */ |
1615 | static void |
1616 | vector_array_cpy(void* array, cf_vector* src, int element_count) |
1617 | { |
1618 | uint8_t* element_ptr = array; |
1619 | int element_size = VECTOR_ELEM_SZ(src); |
1620 | for (int i = 0; i < element_count; i++) { |
1621 | cf_vector_get(src, i, element_ptr); |
1622 | element_ptr += element_size; |
1623 | } |
1624 | } |
1625 | |
1626 | /* |
1627 | * ---------------------------------------------------------------------------- |
1628 | * Globals |
1629 | * ---------------------------------------------------------------------------- |
1630 | */ |
1631 | |
1632 | /** |
1633 | * The big fat lock for all clustering state. |
1634 | */ |
1635 | static pthread_mutex_t g_clustering_lock = |
1636 | PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1637 | |
1638 | /** |
1639 | * The fat lock for all clustering events listener changes. |
1640 | */ |
1641 | static pthread_mutex_t g_clustering_event_publisher_lock = |
1642 | PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; |
1643 | |
1644 | /** |
 * Debugging lock acquisition.
1646 | * #define LOCK_DEBUG_ENABLED 1 |
1647 | */ |
1648 | #ifdef LOCK_DEBUG_ENABLED |
1649 | #define LOCK_DEBUG(format, ...) DEBUG(format, ##__VA_ARGS__) |
1650 | #else |
1651 | #define LOCK_DEBUG(format, ...) |
1652 | #endif |
1653 | |
1654 | /** |
1655 | * Acquire a lock on the clustering module. |
1656 | */ |
1657 | #define CLUSTERING_LOCK() \ |
1658 | ({ \ |
1659 | pthread_mutex_lock (&g_clustering_lock); \ |
1660 | LOCK_DEBUG("locked in %s", __FUNCTION__); \ |
1661 | }) |
1662 | |
1663 | /** |
1664 | * Relinquish the lock on the clustering module. |
1665 | */ |
1666 | #define CLUSTERING_UNLOCK() \ |
1667 | ({ \ |
1668 | pthread_mutex_unlock (&g_clustering_lock); \ |
1669 | LOCK_DEBUG("unLocked in %s", __FUNCTION__); \ |
1670 | }) |
1671 | |
1672 | /** |
1673 | * Acquire a lock on the clustering publisher. |
1674 | */ |
1675 | #define CLUSTERING_EVENT_PUBLISHER_LOCK() \ |
1676 | ({ \ |
1677 | pthread_mutex_lock (&g_clustering_event_publisher_lock); \ |
1678 | LOCK_DEBUG("publisher locked in %s", __FUNCTION__); \ |
1679 | }) |
1680 | |
1681 | /** |
1682 | * Relinquish the lock on the clustering publisher. |
1683 | */ |
1684 | #define CLUSTERING_EVENT_PUBLISHER_UNLOCK() \ |
1685 | ({ \ |
1686 | pthread_mutex_unlock (&g_clustering_event_publisher_lock); \ |
1687 | LOCK_DEBUG("publisher unLocked in %s", __FUNCTION__); \ |
1688 | }) |
1689 | |
1690 | /** |
1691 | * Singleton timer. |
1692 | */ |
1693 | static as_clustering_timer g_timer; |
1694 | |
1695 | /** |
1696 | * Singleton external events publisher. |
1697 | */ |
1698 | static as_clustering_external_event_publisher g_external_event_publisher; |
1699 | |
1700 | /** |
1701 | * Singleton cluster register to store this node's cluster membership. |
1702 | */ |
1703 | static as_clustering_register g_register; |
1704 | |
1705 | /** |
 * Singleton clustering state, all initialized to zero.
1707 | */ |
1708 | static as_clustering g_clustering = { 0 }; |
1709 | |
1710 | /** |
1711 | * Singleton paxos proposer. |
1712 | */ |
1713 | static as_paxos_proposer g_proposer; |
1714 | |
1715 | /** |
1716 | * Singleton paxos acceptor. |
1717 | */ |
1718 | static as_paxos_acceptor g_acceptor; |
1719 | |
1720 | /** |
1721 | * Singleton quantum interval generator. |
1722 | */ |
1723 | static as_clustering_quantum_interval_generator g_quantum_interval_generator; |
1724 | |
1725 | /** |
 * Message template for clustering messages.
1727 | */ |
1728 | static msg_template g_clustering_msg_template[] = { |
1729 | |
1730 | { AS_CLUSTERING_MSG_ID, M_FT_UINT32 }, |
1731 | |
1732 | { AS_CLUSTERING_MSG_TYPE, M_FT_UINT32 }, |
1733 | |
1734 | { AS_CLUSTERING_MSG_HLC_TIMESTAMP, M_FT_UINT64 }, |
1735 | |
1736 | { AS_CLUSTERING_MSG_SEQUENCE_NUMBER, M_FT_UINT64 }, |
1737 | |
1738 | { AS_CLUSTERING_MSG_CLUSTER_KEY, M_FT_UINT64 }, |
1739 | |
1740 | { AS_CLUSTERING_MSG_SUCCESSION_LIST, M_FT_BUF }, |
1741 | |
1742 | { AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, M_FT_UINT64 } |
1743 | |
1744 | }; |
1745 | |
1746 | /* |
1747 | * ---------------------------------------------------------------------------- |
1748 | * Clustering life cycle |
1749 | * ---------------------------------------------------------------------------- |
1750 | */ |
1751 | |
1752 | /** |
1753 | * Check if clustering is initialized. |
1754 | */ |
1755 | static bool |
1756 | clustering_is_initialized() |
1757 | { |
1758 | CLUSTERING_LOCK(); |
1759 | bool initialized = (g_clustering.sys_state |
1760 | != AS_CLUSTERING_SYS_STATE_UNINITIALIZED); |
1761 | CLUSTERING_UNLOCK(); |
1762 | return initialized; |
1763 | } |
1764 | |
1765 | /** |
 * Check if clustering is running.
1767 | */ |
1768 | static bool |
1769 | clustering_is_running() |
1770 | { |
1771 | CLUSTERING_LOCK(); |
1772 | bool running = g_clustering.sys_state == AS_CLUSTERING_SYS_STATE_RUNNING; |
1773 | CLUSTERING_UNLOCK(); |
1774 | return running; |
1775 | } |
1776 | |
1777 | /* |
1778 | * ---------------------------------------------------------------------------- |
1779 | * Config related functions |
1780 | * ---------------------------------------------------------------------------- |
1781 | */ |
1782 | |
1783 | /** |
1784 | * The nodeid for this node. |
1785 | */ |
1786 | static cf_node |
1787 | config_self_nodeid_get() |
1788 | { |
1789 | return g_config.self_node; |
1790 | } |
1791 | |
1792 | /* |
1793 | * ---------------------------------------------------------------------------- |
1794 | * Compatibility mode functions |
1795 | * ---------------------------------------------------------------------------- |
1796 | */ |
1797 | |
1798 | /** |
1799 | * Return current protocol version identifier. |
1800 | */ |
1801 | as_cluster_proto_identifier |
1802 | clustering_protocol_identifier_get() |
1803 | { |
1804 | return 0x707C; |
1805 | } |
1806 | |
1807 | /** |
1808 | * Compare clustering protocol versions for compatibility. |
1809 | */ |
1810 | bool |
1811 | clustering_versions_are_compatible(as_cluster_proto_identifier v1, |
1812 | as_cluster_proto_identifier v2) |
1813 | { |
1814 | return v1 == v2; |
1815 | } |
1816 | |
1817 | /* |
1818 | * ---------------------------------------------------------------------------- |
1819 | * Timer event generator |
1820 | * |
 * TODO: Can be abstracted out as a single scheduler utility shared across
 * modules.
1823 | * ---------------------------------------------------------------------------- |
1824 | */ |
1825 | |
1826 | static void |
1827 | timer_init() |
1828 | { |
1829 | CLUSTERING_LOCK(); |
1830 | memset(&g_timer, 0, sizeof(g_timer)); |
1831 | CLUSTERING_UNLOCK(); |
1832 | } |
1833 | |
1834 | /** |
1835 | * Clustering timer event generator thread, to help with retries and retransmits |
1836 | * across all states. |
1837 | */ |
1838 | static void* |
1839 | timer_thr(void* arg) |
1840 | { |
1841 | as_clustering_internal_event timer_event; |
1842 | memset(&timer_event, 0, sizeof(timer_event)); |
1843 | timer_event.type = AS_CLUSTERING_INTERNAL_EVENT_TIMER; |
1844 | |
1845 | while (clustering_is_running()) { |
1846 | // Wait for a while and retry. |
1847 | internal_event_dispatch(&timer_event); |
1848 | usleep(timer_tick_interval() * 1000); |
1849 | } |
1850 | |
1851 | return NULL; |
1852 | } |
1853 | |
1854 | /** |
1855 | * Start the timer. |
1856 | */ |
1857 | static void |
1858 | timer_start() |
1859 | { |
1860 | CLUSTERING_LOCK(); |
1861 | g_timer.timer_tid = cf_thread_create_joinable(timer_thr, NULL); |
1862 | CLUSTERING_UNLOCK(); |
1863 | } |
1864 | |
1865 | /** |
1866 | * Stop the timer. |
1867 | */ |
1868 | static void |
1869 | timer_stop() |
1870 | { |
1871 | CLUSTERING_LOCK(); |
1872 | cf_thread_join(g_timer.timer_tid); |
1873 | CLUSTERING_UNLOCK(); |
1874 | } |
1875 | |
1876 | /* |
1877 | * ---------------------------------------------------------------------------- |
1878 | * Heartbeat subsystem interfacing |
1879 | * ---------------------------------------------------------------------------- |
1880 | */ |
1881 | |
1882 | /* |
 * The structure of the data the clustering subsystem pushes within hb pulse
 * messages and retains as plugin data is as follows.
1885 | * |
1886 | * Each row occupies 4 bytes. |
1887 | * |
1888 | * V5 heartbeat wire payload structure. |
1889 | * =============================== |
1890 | * |
1891 | * ------------|-------------|------------|------------| |
1892 | * | Clustering Protocol identifier | |
1893 | * |---------------------------------------------------| |
1894 | * | | |
1895 | * |-------- Cluster Key ------------------------------| |
1896 | * | | |
1897 | * |---------------------------------------------------| |
1898 | * | | |
1899 | * |-------- Paxos sequence number --------------------| |
1900 | * | | |
1901 | * |---------------------------------------------------| |
1902 | * | | |
1903 | * |-------- Preferred principal ----------------------| |
1904 | * | | |
1905 | * |---------------------------------------------------| |
1906 | * | Length of succession list | |
1907 | * |---------------------------------------------------| |
1908 | * | | |
1909 | * |-------- Succ. Node id 0 --------------------------| |
1910 | * | | |
1911 | * |---------------------------------------------------| |
1912 | * | | |
1913 | * |-------- Succ. Node id 1 --------------------------| |
1914 | * | | |
1915 | * |---------------------------------------------------| |
1916 | * | . | |
1917 | * | . | |
1918 | * |
1919 | * |
 * The cluster key and succession list help with detecting cluster integrity.
 * A plain cluster key should be good enough, but matching succession lists
 * adds another level of safety (may not be required, but being cautious).
 *
 * For an orphaned node the cluster key and the length of the succession list
 * are set to zero.
1925 | * |
 * The parsed hb plugin data has exactly the same layout as the wire payload
 * structure. The plugin code ensures invalid content will never be parsed into
 * memory as plugin data. The direct implication is that if plugin data is not
 * NULL, the required fields
 * - Clustering protocol identifier
 * - Cluster key
 * - Succession list length
 * will always be present when read back from the heartbeat subsystem, and the
 * succession list will be consistent with the succession list length.
1935 | */ |
1936 | |
1937 | /** |
1938 | * Read plugin data from hb layer for a node, using stack allocated space. |
 * Will make a maximum of 3 attempts before crashing.
1940 | * plugin_data_p->data_size will be zero and plugin_data_p->data will be NULL if |
1941 | * an entry for the node does not exist. |
1942 | */ |
1943 | #define clustering_hb_plugin_data_get(nodeid, plugin_data_p, \ |
1944 | hb_msg_hlc_ts_p, msg_recv_ts_p) \ |
1945 | ({ \ |
1946 | (plugin_data_p)->data_capacity = 1024; \ |
1947 | int tries_remaining = 3; \ |
1948 | bool enoent = false; \ |
	int rv = -1; \
1950 | while (tries_remaining--) { \ |
1951 | (plugin_data_p)->data = alloca((plugin_data_p)->data_capacity); \ |
1952 | if (as_hb_plugin_data_get(nodeid, AS_HB_PLUGIN_CLUSTERING, \ |
1953 | plugin_data_p, hb_msg_hlc_ts_p, msg_recv_ts_p) == 0) { \ |
1954 | rv = 0; \ |
1955 | break; \ |
1956 | } \ |
1957 | if (errno == ENOENT) { \ |
1958 | enoent = true; \ |
1959 | break; \ |
1960 | } \ |
1961 | if (errno == ENOMEM) { \ |
1962 | (plugin_data_p)->data_capacity = (plugin_data_p)->data_size; \ |
1963 | } \ |
1964 | } \ |
1965 | if (rv != 0 && !enoent && tries_remaining < 0) { \ |
1966 | CRASH("error allocating space for paxos hb plugin data"); \ |
1967 | } \ |
1968 | if (enoent) { \ |
1969 | (plugin_data_p)->data_size = 0; \ |
1970 | (plugin_data_p)->data = NULL; \ |
1971 | } \ |
1972 | rv; \ |
1973 | }) |
1974 | |
1975 | /** |
1976 | * Get a pointer to the protocol identifier inside plugin data. Will be NULL if |
1977 | * plugin data is null or there are not enough bytes in the data to hold the |
1978 | * identifier. |
1979 | * @param plugin_data can be NULL. |
1980 | * @param plugin_data_size the size of plugin data. |
1981 | * @return pointer to the protocol identifier on success, NULL on failure. |
1982 | */ |
1983 | static as_cluster_proto_identifier* |
1984 | clustering_hb_plugin_proto_get(void* plugin_data, size_t plugin_data_size) |
1985 | { |
1986 | if (plugin_data == NULL |
1987 | || plugin_data_size < sizeof(as_cluster_proto_identifier)) { |
		// The data does not hold enough bytes for the protocol identifier.
1990 | return NULL; |
1991 | } |
1992 | |
1993 | return (as_cluster_proto_identifier*)plugin_data; |
1994 | } |
1995 | |
1996 | /** |
1997 | * Retrieves the cluster key from clustering hb plugin data. |
1998 | * @param plugin_data can be NULL. |
1999 | * @param plugin_data_size the size of plugin data. |
2000 | * @return pointer to the cluster key on success, NULL on failure. |
2001 | */ |
2002 | static as_cluster_key* |
2003 | clustering_hb_plugin_cluster_key_get(void* plugin_data, size_t plugin_data_size) |
2004 | { |
2005 | uint8_t* proto = (uint8_t*)clustering_hb_plugin_proto_get(plugin_data, |
2006 | plugin_data_size); |
2007 | if (proto == NULL) { |
2008 | // The data does not hold valid data. |
2009 | return NULL; |
2010 | } |
2011 | |
2012 | if ((uint8_t*)plugin_data + plugin_data_size |
2013 | < proto + sizeof(as_cluster_proto_identifier) |
2014 | + sizeof(as_cluster_key)) { |
2015 | // Not enough bytes for cluster key. |
2016 | return NULL; |
2017 | } |
2018 | |
2019 | return (as_cluster_key*)(proto + sizeof(as_cluster_proto_identifier)); |
2020 | } |
2021 | |
2022 | /** |
2023 | * Retrieves the sequence number from clustering hb plugin data. |
2024 | * @param plugin_data can be NULL. |
2025 | * @param plugin_data_size the size of plugin data. |
2026 | * @return pointer to the sequence number on success, NULL on failure. |
2027 | */ |
2028 | static as_paxos_sequence_number* |
2029 | clustering_hb_plugin_sequence_number_get(void* plugin_data, |
2030 | size_t plugin_data_size) |
2031 | { |
2032 | uint8_t* cluster_key = (uint8_t*)clustering_hb_plugin_cluster_key_get( |
2033 | plugin_data, plugin_data_size); |
2034 | if (cluster_key == NULL) { |
2035 | // The data does not hold valid data or there is no cluster key. |
2036 | return NULL; |
2037 | } |
2038 | |
2039 | if ((uint8_t*)plugin_data + plugin_data_size |
2040 | < cluster_key + sizeof(as_cluster_key) |
2041 | + sizeof(as_paxos_sequence_number)) { |
		// Not enough bytes for the sequence number.
2043 | return NULL; |
2044 | } |
2045 | |
2046 | return (as_paxos_sequence_number*)(cluster_key + sizeof(as_cluster_key)); |
2047 | } |
2048 | |
2049 | /** |
2050 | * Retrieves the preferred principal from clustering hb plugin data. |
2051 | * @param plugin_data can be NULL. |
2052 | * @param plugin_data_size the size of plugin data. |
2053 | * @return pointer to the preferred principal on success, NULL on failure. |
2054 | */ |
2055 | static cf_node* |
2056 | clustering_hb_plugin_preferred_principal_get(void* plugin_data, |
2057 | size_t plugin_data_size) |
2058 | { |
2059 | uint8_t* sequence_number_p = |
2060 | (uint8_t*)clustering_hb_plugin_sequence_number_get(plugin_data, |
2061 | plugin_data_size); |
2062 | if (sequence_number_p == NULL) { |
2063 | // The data does not hold valid data or there is no sequence number. |
2064 | return NULL; |
2065 | } |
2066 | |
2067 | if ((uint8_t*)plugin_data + plugin_data_size |
2068 | < sequence_number_p + sizeof(as_paxos_sequence_number) |
2069 | + sizeof(cf_node)) { |
2070 | // Not enough bytes for preferred principal. |
2071 | return NULL; |
2072 | } |
2073 | |
	return (cf_node*)(sequence_number_p + sizeof(as_paxos_sequence_number));
2076 | } |
2077 | |
2078 | /** |
2079 | * Retrieves the succession list length pointer from clustering hb plugin data. |
2080 | * @param plugin_data can be NULL. |
2081 | * @param plugin_data_size the size of plugin data. |
2082 | * @return pointer to succession list length on success, NULL on failure. |
2083 | */ |
2084 | static uint32_t* |
2085 | clustering_hb_plugin_succession_length_get(void* plugin_data, |
2086 | size_t plugin_data_size) |
2087 | { |
2088 | uint8_t* preferred_principal_p = |
2089 | (uint8_t*)clustering_hb_plugin_preferred_principal_get(plugin_data, |
2090 | plugin_data_size); |
2091 | if (preferred_principal_p == NULL) { |
		// The data does not hold valid data or the preferred principal is
		// missing.
2094 | return NULL; |
2095 | } |
2096 | |
2097 | if ((uint8_t*)plugin_data + plugin_data_size |
2098 | < preferred_principal_p + sizeof(cf_node) + sizeof(uint32_t)) { |
2099 | // Not enough bytes for succession list length. |
2100 | return NULL; |
2101 | } |
2102 | |
2103 | return (uint32_t*)(preferred_principal_p + sizeof(cf_node)); |
2104 | } |
2105 | |
2106 | /** |
2107 | * Retrieves the pointer to the first node in the succession list. |
2108 | * @param plugin_data can be NULL. |
2109 | * @param plugin_data_size the size of plugin data. |
2110 | * @return pointer to first node in succession list on success, NULL on failure |
2111 | * or if the succession list is empty. |
2112 | */ |
2113 | static cf_node* |
2114 | clustering_hb_plugin_succession_get(void* plugin_data, size_t plugin_data_size) |
2115 | { |
2116 | uint8_t* succession_list_length_p = |
2117 | (uint8_t*)clustering_hb_plugin_succession_length_get(plugin_data, |
2118 | plugin_data_size); |
2119 | if (succession_list_length_p == NULL) { |
		// The data does not hold valid data or the succession list length is
		// missing.
2122 | return NULL; |
2123 | } |
2124 | |
2125 | if (*(uint32_t*)succession_list_length_p == 0) { |
2126 | // Empty succession list. |
2127 | return NULL; |
2128 | } |
2129 | |
2130 | if ((uint8_t*)plugin_data + plugin_data_size |
2131 | < succession_list_length_p + sizeof(uint32_t) |
2132 | + (sizeof(cf_node) * (*(uint32_t*)succession_list_length_p))) { |
		// Not enough bytes for the succession list.
2134 | return NULL; |
2135 | } |
2136 | |
2137 | return (cf_node*)(succession_list_length_p + sizeof(uint32_t)); |
2138 | } |
2139 | |
2140 | /** |
 * Validate the correctness of plugin data by ensuring all required fields are
 * present and the succession list matches the provided length.
 * @param plugin_data can be NULL.
 * @param plugin_data_size the size of plugin data.
 * @return true if the plugin data is valid, false otherwise.
2146 | */ |
2147 | static bool |
2148 | clustering_hb_plugin_data_is_valid(void* plugin_data, size_t plugin_data_size) |
2149 | { |
2150 | void* proto_identifier_p = clustering_hb_plugin_proto_get(plugin_data, |
2151 | plugin_data_size); |
2152 | if (proto_identifier_p == NULL) { |
2153 | DEBUG("plugin data missing protocol identifier" ); |
2154 | return false; |
2155 | } |
2156 | |
2157 | as_cluster_proto_identifier current_proto_identifier = |
2158 | clustering_protocol_identifier_get(); |
2159 | if (!clustering_versions_are_compatible(current_proto_identifier, |
2160 | *(as_cluster_proto_identifier*)proto_identifier_p)) { |
2161 | DEBUG("protocol versions incompatible - expected %" PRIx32" but was: %" PRIx32, |
2162 | current_proto_identifier, |
2163 | *(as_cluster_proto_identifier*)proto_identifier_p); |
2164 | return false; |
2165 | } |
2166 | |
2167 | void* cluster_key_p = clustering_hb_plugin_cluster_key_get(plugin_data, |
2168 | plugin_data_size); |
2169 | if (cluster_key_p == NULL) { |
2170 | DEBUG("plugin data missing cluster key" ); |
2171 | return false; |
2172 | } |
2173 | |
2174 | void* sequence_number_p = clustering_hb_plugin_sequence_number_get( |
2175 | plugin_data, plugin_data_size); |
2176 | if (sequence_number_p == NULL) { |
2177 | DEBUG("plugin data missing sequence number" ); |
2178 | return false; |
2179 | } |
2180 | |
2181 | void* preferred_principal_p = clustering_hb_plugin_preferred_principal_get( |
2182 | plugin_data, plugin_data_size); |
2183 | if (preferred_principal_p == NULL) { |
2184 | DEBUG("plugin data missing preferred principal" ); |
2185 | return false; |
2186 | } |
2187 | |
2188 | uint32_t* succession_list_length_p = |
2189 | (void*)clustering_hb_plugin_succession_length_get(plugin_data, |
2190 | plugin_data_size); |
2191 | if (succession_list_length_p == NULL) { |
2192 | DEBUG("plugin data missing succession list length" ); |
2193 | return false; |
2194 | } |
2195 | |
2196 | void* succession_list_p = clustering_hb_plugin_succession_get(plugin_data, |
2197 | plugin_data_size); |
2198 | |
2199 | if (*succession_list_length_p > 0 && succession_list_p == NULL) { |
2200 | DEBUG("succession list length %d, but succession list is empty" , |
2201 | *succession_list_length_p); |
2202 | return false; |
2203 | } |
2204 | |
2205 | return true; |
2206 | } |
2207 | |
2208 | /** |
 * Determines if the plugin data held by the hb subsystem is too old and should
 * be ignored. ALL access to plugin data should be vetted through this
 * function. The plugin data is obsolete if it was sent before the current
 * cluster state or has a version mismatch.
2213 | * |
 * This is determined by comparing the plugin data hb message hlc timestamp and
2215 | * monotonic timestamps with the cluster formation hlc and monotonic times. |
2216 | * |
2217 | * @param cluster_modified_hlc_ts the hlc timestamp when current cluster change |
2218 | * happened. Sent to avoid locking in this function. |
2219 | * @param cluster_modified_time the monotonic timestamp when current cluster |
 * change happened. Sent to avoid locking in this function.
2221 | * @param plugin_data the plugin data. |
2222 | * @param plugin_data_size the size of plugin data. |
2223 | * @param msg_recv_ts the monotonic timestamp for plugin data receive. |
2224 | * @param hb_msg_hlc_ts the hlc timestamp for plugin data receive. |
2225 | * @return true if plugin data is obsolete, false otherwise. |
2226 | */ |
2227 | static bool |
2228 | clustering_hb_plugin_data_is_obsolete(as_hlc_timestamp cluster_modified_hlc_ts, |
2229 | cf_clock cluster_modified_time, void* plugin_data, |
2230 | size_t plugin_data_size, cf_clock msg_recv_ts, |
2231 | as_hlc_msg_timestamp* hb_msg_hlc_ts) |
2232 | { |
2233 | if (!clustering_hb_plugin_data_is_valid(plugin_data, plugin_data_size)) { |
2234 | // Plugin data is invalid. Assume it to be obsolete. |
2235 | // Seems like a redundant check but required in case clustering protocol |
2236 | // was switched to an incompatible version. |
2237 | return true; |
2238 | } |
2239 | |
2240 | if (as_hlc_send_timestamp_order(cluster_modified_hlc_ts, hb_msg_hlc_ts) |
2241 | != AS_HLC_HAPPENS_BEFORE) { |
		// Cluster formation time is after the message send or the order is
		// unknown; assume cluster formation is after the message send. The
		// caller should ignore this message.
2245 | return true; |
2246 | } |
2247 | |
	// HB data should be at least after cluster formation time + one hb
	// interval to send out our cluster state + one network delay for our
	// information to reach the remote node + one hb interval for the other
	// node to send out its updated state + one network delay for the updated
	// state to reach us.
2253 | if (cluster_modified_time + 2 * as_hb_tx_interval_get() |
2254 | + 2 * g_config.fabric_latency_max_ms > msg_recv_ts) { |
2255 | return true; |
2256 | } |
2257 | |
2258 | return false; |
2259 | } |
2260 | |
2261 | /** |
 * Determine a node's cluster membership status (orphan, cluster assigned or
 * unknown) from its plugin data.
2263 | */ |
2264 | static as_clustering_peer_node_state |
2265 | clustering_hb_plugin_data_node_status(void* plugin_data, |
2266 | size_t plugin_data_size) |
2267 | { |
2268 | if (!clustering_hb_plugin_data_is_valid(plugin_data, plugin_data_size)) { |
		// Either we have no hb channel to this node or it has sent invalid
		// plugin data. Assume the cluster state is unknown.
2271 | return AS_NODE_UNKNOWN; |
2272 | } |
2273 | |
2274 | as_cluster_key* cluster_key = clustering_hb_plugin_cluster_key_get( |
2275 | plugin_data, plugin_data_size); |
2276 | |
2277 | if (*cluster_key == 0) { |
2278 | return AS_NODE_ORPHAN; |
2279 | } |
2280 | |
2281 | // Redundant paranoid check. |
2282 | uint32_t* succession_list_length_p = |
2283 | clustering_hb_plugin_succession_length_get(plugin_data, |
2284 | plugin_data_size); |
2285 | |
2286 | if (*succession_list_length_p == 0) { |
2287 | return AS_NODE_ORPHAN; |
2288 | } |
2289 | |
2290 | return AS_NODE_CLUSTER_ASSIGNED; |
2291 | } |
2292 | |
2293 | /** |
2294 | * Push clustering payload into a heartbeat pulse message. The payload format is |
2295 | * as described above. |
2296 | */ |
2297 | static void |
2298 | clustering_hb_plugin_set_fn(msg* msg) |
2299 | { |
2300 | if (!clustering_is_initialized()) { |
2301 | // Clustering not initialized. Send no data at all. |
2302 | return; |
2303 | } |
2304 | |
2305 | CLUSTERING_LOCK(); |
2306 | |
2307 | uint32_t cluster_size = cf_vector_size(&g_register.succession_list); |
2308 | |
2309 | size_t payload_size = |
2310 | // For the paxos version identifier |
2311 | sizeof(uint32_t) |
2312 | // For cluster key |
2313 | + sizeof(as_cluster_key) |
2314 | // For sequence number |
2315 | + sizeof(as_paxos_sequence_number) |
2316 | // For preferred principal |
2317 | + sizeof(cf_node) |
2318 | // For succession list length. |
2319 | + sizeof(uint32_t) |
2320 | // For succession list. |
2321 | + (sizeof(cf_node) * cluster_size); |
2322 | |
2323 | uint8_t* payload = alloca(payload_size); |
2324 | |
2325 | uint8_t* current_field_p = payload; |
2326 | |
2327 | // Set the paxos protocol identifier. |
2328 | uint32_t protocol = clustering_protocol_identifier_get(); |
2329 | memcpy(current_field_p, &protocol, sizeof(protocol)); |
2330 | current_field_p += sizeof(protocol); |
2331 | |
2332 | // Set cluster key. |
2333 | memcpy(current_field_p, &g_register.cluster_key, |
2334 | sizeof(g_register.cluster_key)); |
2335 | current_field_p += sizeof(g_register.cluster_key); |
2336 | |
2337 | // Set the sequence number. |
2338 | memcpy(current_field_p, &g_register.sequence_number, |
2339 | sizeof(g_register.sequence_number)); |
2340 | current_field_p += sizeof(g_register.sequence_number); |
2341 | |
2342 | // Set the preferred principal. |
2343 | memcpy(current_field_p, &g_clustering.preferred_principal, |
2344 | sizeof(g_clustering.preferred_principal)); |
2345 | current_field_p += sizeof(g_clustering.preferred_principal); |
2346 | |
2347 | // Set succession length |
2348 | memcpy(current_field_p, &cluster_size, sizeof(cluster_size)); |
2349 | current_field_p += sizeof(cluster_size); |
2350 | |
2351 | // Copy over the succession list. |
2352 | cf_node* succession = (cf_node*)(current_field_p); |
2353 | for (int i = 0; i < cluster_size; i++) { |
2354 | cf_vector_get(&g_register.succession_list, i, &succession[i]); |
2355 | } |
2356 | |
2357 | msg_set_buf(msg, AS_HB_MSG_PAXOS_DATA, payload, payload_size, MSG_SET_COPY); |
2358 | |
2359 | CLUSTERING_UNLOCK(); |
2360 | } |
2361 | |
2362 | /** |
2363 | * Plugin parse function that copies the msg payload verbatim to a plugin data. |
2364 | */ |
2365 | static void |
2366 | clustering_hb_plugin_parse_data_fn(msg* msg, cf_node source, |
2367 | as_hb_plugin_node_data* prev_plugin_data, |
2368 | as_hb_plugin_node_data* plugin_data) |
2369 | { |
2370 | // Lockless check to prevent deadlocks. |
2371 | if (g_clustering.sys_state == AS_CLUSTERING_SYS_STATE_UNINITIALIZED) { |
2372 | // Ignore this heartbeat. |
2373 | plugin_data->data_size = 0; |
2374 | return; |
2375 | } |
2376 | |
2377 | void* payload; |
2378 | size_t payload_size; |
2379 | |
2380 | if (msg_get_buf(msg, AS_HB_MSG_PAXOS_DATA, (uint8_t**)&payload, |
2381 | &payload_size, MSG_GET_DIRECT) != 0) { |
2382 | cf_ticker_warning(AS_CLUSTERING, |
2383 | "received empty clustering payload in heartbeat pulse from node %" PRIx64, |
2384 | source); |
2385 | plugin_data->data_size = 0; |
2386 | return; |
2387 | } |
2388 | |
2389 | // Validate and retain only valid plugin data. |
2390 | if (!clustering_hb_plugin_data_is_valid(payload, payload_size)) { |
2391 | cf_ticker_warning(AS_CLUSTERING, |
2392 | "received invalid clustering payload in heartbeat pulse from node %" PRIx64, |
2393 | source); |
2394 | plugin_data->data_size = 0; |
2395 | return; |
2396 | } |
2397 | |
2398 | if (payload_size > plugin_data->data_capacity) { |
2399 | // Round up to nearest multiple of block size to prevent very frequent |
2400 | // reallocation. |
2401 | size_t data_capacity = ((payload_size + HB_PLUGIN_DATA_BLOCK_SIZE - 1) |
2402 | / HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE; |
2403 | |
2404 | // Reallocate since we have outgrown existing capacity. |
2405 | plugin_data->data = cf_realloc(plugin_data->data, data_capacity); |
2406 | plugin_data->data_capacity = data_capacity; |
2407 | } |
2408 | |
2409 | plugin_data->data_size = payload_size; |
2410 | memcpy(plugin_data->data, payload, payload_size); |
2411 | } |
2412 | |
2413 | /** |
 * Check if the input succession list from hb plugin data matches a succession
 * list vector.
2416 | * @param succession_list the first succession list. |
2417 | * @param succession_list_length the length of the succession list. |
2418 | * @param succession_list_vector the second succession list as a vector. Should |
2419 | * be protected from multithreaded access while this function is running. |
 * @return true if the succession lists are equal, false otherwise.
2421 | */ |
2422 | bool |
2423 | clustering_hb_succession_list_matches(cf_node* succession_list, |
2424 | uint32_t succession_list_length, cf_vector* succession_list_vector) |
2425 | { |
2426 | if (succession_list_length != cf_vector_size(succession_list_vector)) { |
2427 | return false; |
2428 | } |
2429 | |
2430 | for (uint32_t i = 0; i < succession_list_length; i++) { |
2431 | cf_node* vector_element = cf_vector_getp(succession_list_vector, i); |
2432 | if (vector_element == NULL || *vector_element != succession_list[i]) { |
2433 | return false; |
2434 | } |
2435 | } |
2436 | return true; |
2437 | } |
2438 | |
2439 | /* |
2440 | * ---------------------------------------------------------------------------- |
2441 | * Quantum interval generator |
2442 | * ---------------------------------------------------------------------------- |
2443 | */ |
2444 | |
2445 | /** |
 * Time taken for the effect of a fault to get propagated via HB.
2447 | */ |
2448 | static uint32_t |
2449 | quantum_interval_hb_fault_comm_delay() |
2450 | { |
2451 | return as_hb_tx_interval_get() + network_latency_max(); |
2452 | } |
2453 | |
2454 | /** |
2455 | * Quantum wait time after node arrived event. |
2456 | */ |
2457 | static uint32_t |
2458 | quantum_interval_node_arrived_wait_time(as_clustering_quantum_fault* fault) |
2459 | { |
2460 | return MIN(quantum_interval(), |
2461 | (fault->last_event_ts - fault->event_ts) / 2 |
2462 | + 2 * quantum_interval_hb_fault_comm_delay() |
2463 | + quantum_interval() / 2); |
2464 | } |
2465 | |
2466 | /** |
2467 | * Quantum wait time after node departs. |
2468 | */ |
2469 | static uint32_t |
2470 | quantum_interval_node_departed_wait_time(as_clustering_quantum_fault* fault) |
2471 | { |
2472 | return MIN(quantum_interval(), |
2473 | as_hb_node_timeout_get() |
2474 | + 2 * quantum_interval_hb_fault_comm_delay() |
2475 | + quantum_interval() / 4); |
2476 | } |
2477 | |
2478 | /** |
 * Quantum wait time after a peer node's adjacency changed.
2480 | */ |
2481 | static uint32_t |
2482 | quantum_interval_peer_adjacency_changed_wait_time( |
2483 | as_clustering_quantum_fault* fault) |
2484 | { |
2485 | return MIN(quantum_interval(), quantum_interval_hb_fault_comm_delay()); |
2486 | } |
2487 | |
2488 | /** |
2489 | * Quantum wait time after accepting a join request. |
2490 | */ |
2491 | static uint32_t |
2492 | quantum_interval_join_accepted_wait_time(as_clustering_quantum_fault* fault) |
2493 | { |
	// Ensure we wait for at least one heartbeat interval to receive the latest
2495 | // heartbeat after the last join request and for other nodes to send their |
2496 | // join requests as well. |
2497 | return MIN(quantum_interval(), |
2498 | (fault->last_event_ts - fault->event_ts) |
2499 | + join_cluster_check_interval() + network_latency_max() |
2500 | + as_hb_tx_interval_get()); |
2501 | } |
2502 | |
2503 | /** |
2504 | * Quantum wait time after principal node departs. |
2505 | */ |
2506 | static uint32_t |
2507 | quantum_interval_principal_departed_wait_time( |
2508 | as_clustering_quantum_fault* fault) |
2509 | { |
2510 | // Anticipate an incoming join request from other orphaned cluster members. |
2511 | return MIN(quantum_interval(), |
2512 | as_hb_node_timeout_get() |
2513 | + 2 * quantum_interval_hb_fault_comm_delay() |
2514 | + MAX(quantum_interval() / 4, |
2515 | quantum_interval_join_accepted_wait_time(fault))); |
2516 | } |
2517 | |
2518 | /** |
2519 | * Quantum wait time after seeing a cluster that might send us a join request. |
2520 | */ |
2521 | static uint32_t |
2522 | quantum_interval_inbound_merge_candidate_wait_time( |
2523 | as_clustering_quantum_fault* fault) |
2524 | { |
2525 | return quantum_interval(); |
2526 | } |
2527 | |
2528 | /** |
2529 | * Quantum wait time after a cluster member has been orphaned. |
2530 | */ |
2531 | static uint32_t |
2532 | quantum_interval_member_orphaned_wait_time(as_clustering_quantum_fault* fault) |
2533 | { |
2534 | return quantum_interval(); |
2535 | } |
2536 | |
2537 | /** |
 * Marks the current quantum interval as postponed. A kludge that allows the
 * quantum interval generator to flag quantum intervals as postponed.
2540 | */ |
2541 | static void |
2542 | quantum_interval_mark_postponed() |
2543 | { |
2544 | CLUSTERING_LOCK(); |
2545 | g_quantum_interval_generator.is_interval_postponed = true; |
2546 | CLUSTERING_UNLOCK(); |
2547 | } |
2548 | |
2549 | /** |
2550 | * Update the vtable for a fault. |
2551 | */ |
2552 | static void |
2553 | quantum_interval_vtable_update(as_clustering_quantum_fault_type type, |
2554 | char *fault_log_str, as_clustering_quantum_fault_wait_fn wait_fn) |
2555 | { |
2556 | CLUSTERING_LOCK(); |
2557 | g_quantum_interval_generator.vtable[type].fault_log_str = fault_log_str; |
2558 | g_quantum_interval_generator.vtable[type].wait_fn = wait_fn; |
2559 | CLUSTERING_UNLOCK(); |
2560 | } |
2561 | |
2562 | /** |
2563 | * Initialize quantum interval generator. |
2564 | */ |
2565 | static void |
2566 | quantum_interval_generator_init() |
2567 | { |
2568 | CLUSTERING_LOCK(); |
2569 | memset(&g_quantum_interval_generator, 0, |
2570 | sizeof(g_quantum_interval_generator)); |
2571 | g_quantum_interval_generator.last_quantum_start_time = cf_getms(); |
2572 | g_quantum_interval_generator.last_quantum_interval = quantum_interval(); |
2573 | |
2574 | // Initialize the vtable. |
2575 | quantum_interval_vtable_update(QUANTUM_FAULT_NODE_ARRIVED, "node arrived" , |
2576 | quantum_interval_node_arrived_wait_time); |
2577 | quantum_interval_vtable_update(QUANTUM_FAULT_NODE_DEPARTED, "node departed" , |
2578 | quantum_interval_node_departed_wait_time); |
2579 | quantum_interval_vtable_update(QUANTUM_FAULT_PRINCIPAL_DEPARTED, |
2580 | "principal departed" , |
2581 | quantum_interval_principal_departed_wait_time); |
2582 | quantum_interval_vtable_update(QUANTUM_FAULT_PEER_ADJACENCY_CHANGED, |
2583 | "peer adjacency changed" , |
2584 | quantum_interval_peer_adjacency_changed_wait_time); |
2585 | quantum_interval_vtable_update(QUANTUM_FAULT_JOIN_ACCEPTED, |
2586 | "join request accepted" , quantum_interval_join_accepted_wait_time); |
2587 | quantum_interval_vtable_update(QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, |
2588 | "merge candidate seen" , |
2589 | quantum_interval_inbound_merge_candidate_wait_time); |
2590 | quantum_interval_vtable_update(QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED, |
2591 | "member orphaned" , quantum_interval_member_orphaned_wait_time); |
2592 | |
2593 | CLUSTERING_UNLOCK(); |
2594 | } |
2595 | |
2596 | /** |
2597 | * Get the earliest possible monotonic clock time the next quantum interval can |
2598 | * start. |
2599 | * |
 * Start the quantum interval after the last update to any one of adjacency,
 * pending_join_requests, neighboring_principals. The heuristic is that these
 * should be stable before initiating cluster merge / join or cluster formation
 * requests.
2604 | */ |
2605 | static cf_clock |
2606 | quantum_interval_earliest_start_time() |
2607 | { |
2608 | CLUSTERING_LOCK(); |
2609 | cf_clock fault_event_time = 0; |
2610 | for (int i = 0; i < QUANTUM_FAULT_TYPE_SENTINEL; i++) { |
2611 | if (g_quantum_interval_generator.fault[i].event_ts) { |
2612 | fault_event_time = MAX(fault_event_time, |
2613 | g_quantum_interval_generator.fault[i].event_ts |
2614 | + g_quantum_interval_generator.vtable[i].wait_fn( |
2615 | &g_quantum_interval_generator.fault[i])); |
2616 | } |
2617 | |
2618 | DETAIL("Fault:%s event_ts:%" PRIu64, |
2619 | g_quantum_interval_generator.vtable[i].fault_log_str, |
2620 | g_quantum_interval_generator.fault[i].event_ts); |
2621 | } |
2622 | |
2623 | DETAIL("Last Quantum interval:%" PRIu64, |
2624 | g_quantum_interval_generator.last_quantum_start_time); |
2625 | |
2626 | cf_clock start_time = g_quantum_interval_generator.last_quantum_start_time |
2627 | + quantum_interval(); |
2628 | if (fault_event_time) { |
		// Ensure we have at least 1/2 quantum interval of separation between
		// quantum intervals to give a chance to multiple fault events that are
		// reasonably close in time.
2632 | start_time = MAX( |
2633 | g_quantum_interval_generator.last_quantum_start_time |
2634 | + quantum_interval() / 2, fault_event_time); |
2635 | } |
2636 | CLUSTERING_UNLOCK(); |
2637 | |
2638 | return start_time; |
2639 | } |
2640 | |
2641 | /** |
2642 | * Reset quantum interval fault. |
2643 | * @param fault_type the fault type. |
2644 | */ |
2645 | static void |
2646 | quantum_interval_fault_reset(as_clustering_quantum_fault_type fault_type) |
2647 | { |
2648 | CLUSTERING_LOCK(); |
2649 | memset(&g_quantum_interval_generator.fault[fault_type], 0, |
2650 | sizeof(g_quantum_interval_generator.fault[fault_type])); |
2651 | CLUSTERING_UNLOCK(); |
2652 | } |
2653 | |
2654 | /** |
 * Update a fault event based on the current fault ts.
 * @param fault_type the type of the fault to update.
 * @param fault_ts the new fault timestamp.
 * @param src_nodeid the fault causing nodeid, 0 if the nodeid is not known.
2659 | */ |
2660 | static void |
2661 | quantum_interval_fault_update(as_clustering_quantum_fault_type fault_type, |
2662 | cf_clock fault_ts, cf_node src_nodeid) |
2663 | { |
2664 | CLUSTERING_LOCK(); |
2665 | as_clustering_quantum_fault* fault = |
2666 | &g_quantum_interval_generator.fault[fault_type]; |
2667 | if (fault->event_ts == 0 |
2668 | || fault_ts - fault->event_ts > quantum_interval() / 2) { |
		// Fault event detected for the first time in this quantum, or we are
		// seeing the effect of a different event more than half a quantum
		// apart.
2671 | fault->event_ts = fault_ts; |
2672 | DETAIL("updated '%s' fault with ts %" PRIu64" for node %" PRIx64, |
2673 | g_quantum_interval_generator.vtable[fault_type].fault_log_str, fault_ts, src_nodeid); |
2674 | } |
2675 | |
2676 | fault->last_event_ts = fault_ts; |
2677 | CLUSTERING_UNLOCK(); |
2678 | } |
2679 | |
2680 | /** |
2681 | * Reset the state for the next quantum interval. |
2682 | */ |
2683 | static void |
2684 | quantum_interval_generator_reset(cf_clock last_quantum_start_time) |
2685 | { |
2686 | CLUSTERING_LOCK(); |
2687 | if (!g_quantum_interval_generator.is_interval_postponed) { |
2688 | // Update last quantum interval. |
2689 | g_quantum_interval_generator.last_quantum_interval = MAX(0, |
2690 | last_quantum_start_time |
2691 | - g_quantum_interval_generator.last_quantum_start_time); |
2692 | |
2693 | g_quantum_interval_generator.last_quantum_start_time = |
2694 | last_quantum_start_time; |
2695 | for (int i = 0; i < QUANTUM_FAULT_TYPE_SENTINEL; i++) { |
2696 | quantum_interval_fault_reset(i); |
2697 | } |
2698 | } |
2699 | g_quantum_interval_generator.is_interval_postponed = false; |
2700 | |
2701 | CLUSTERING_UNLOCK(); |
2702 | } |
2703 | |
2704 | /** |
 * Handle a timer event and generate a quantum interval start event if
 * required.
2706 | */ |
2707 | static void |
2708 | quantum_interval_generator_timer_event_handle( |
2709 | as_clustering_internal_event* timer_event) |
2710 | { |
2711 | CLUSTERING_LOCK(); |
2712 | cf_clock now = cf_getms(); |
2713 | |
2714 | cf_clock earliest_quantum_start_time = |
2715 | quantum_interval_earliest_start_time(); |
2716 | |
2717 | cf_clock expected_quantum_start_time = |
2718 | g_quantum_interval_generator.last_quantum_start_time |
2719 | + g_quantum_interval_generator.last_quantum_interval; |
2720 | |
	// Provide a buffer for the current quantum interval to finish gracefully,
	// as long as the buffer is less than half a quantum interval.
2723 | cf_clock quantum_wait_buffer = MIN( |
2724 | earliest_quantum_start_time > expected_quantum_start_time ? |
2725 | earliest_quantum_start_time - expected_quantum_start_time : |
2726 | 0, g_quantum_interval_generator.last_quantum_interval / 2); |
2727 | |
	// Fire a quantum interval start event if it is time, or if we have skipped
	// quantum interval start for more than the max skip number of intervals.
2730 | // Add a buffer of wait time to ensure we wait a bit more if we can cover |
2731 | // the waiting time. |
2732 | bool is_skippable = g_quantum_interval_generator.last_quantum_start_time |
2733 | + (quantum_interval_skip_max() + 1) |
2734 | * g_quantum_interval_generator.last_quantum_interval |
2735 | + quantum_wait_buffer > now; |
2736 | bool fire_quantum_event = earliest_quantum_start_time <= now |
2737 | || !is_skippable; |
2738 | CLUSTERING_UNLOCK(); |
2739 | |
2740 | if (fire_quantum_event) { |
		// Use a separate local so we do not shadow the timer_event parameter.
		as_clustering_internal_event quantum_start_event;
		memset(&quantum_start_event, 0, sizeof(quantum_start_event));
		quantum_start_event.type =
				AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START;
		quantum_start_event.quantum_interval_is_skippable = is_skippable;
		internal_event_dispatch(&quantum_start_event);
2746 | |
2747 | // Reset for next interval generation. |
2748 | quantum_interval_generator_reset(now); |
2749 | } |
2750 | } |
2751 | |
2752 | /** |
2753 | * Check if the interval generator has seen an adjacency fault in the current |
2754 | * quantum interval. |
2755 | * @return true if the quantum interval generator has seen an adjacency fault, |
2756 | * false otherwise. |
2757 | */ |
2758 | static bool |
2759 | quantum_interval_is_adjacency_fault_seen() |
2760 | { |
2761 | CLUSTERING_LOCK(); |
2762 | bool is_fault_seen = |
2763 | g_quantum_interval_generator.fault[QUANTUM_FAULT_NODE_ARRIVED].event_ts |
2764 | || g_quantum_interval_generator.fault[QUANTUM_FAULT_NODE_DEPARTED].event_ts |
2765 | || g_quantum_interval_generator.fault[QUANTUM_FAULT_PRINCIPAL_DEPARTED].event_ts; |
2766 | CLUSTERING_UNLOCK(); |
2767 | return is_fault_seen; |
2768 | } |
2769 | |
2770 | /** |
 * Check if the interval generator has seen a peer node adjacency changed fault
 * in the current quantum interval.
 * @return true if the quantum interval generator has seen a peer node
 * adjacency changed fault, false otherwise.
2776 | */ |
2777 | static bool |
2778 | quantum_interval_is_peer_adjacency_fault_seen() |
2779 | { |
2780 | CLUSTERING_LOCK(); |
2781 | bool is_fault_seen = |
2782 | g_quantum_interval_generator.fault[QUANTUM_FAULT_PEER_ADJACENCY_CHANGED].event_ts; |
2783 | CLUSTERING_UNLOCK(); |
2784 | return is_fault_seen; |
2785 | } |
2786 | |
2787 | /** |
2788 | * Update the fault time for this quantum on self heartbeat adjacency list |
2789 | * change. |
2790 | */ |
2791 | static void |
2792 | quantum_interval_generator_hb_event_handle( |
2793 | as_clustering_internal_event* hb_event) |
2794 | { |
2795 | CLUSTERING_LOCK(); |
2796 | |
2797 | cf_clock min_event_time[AS_HB_NODE_EVENT_SENTINEL]; |
	cf_node min_event_node[AS_HB_NODE_EVENT_SENTINEL];
2799 | |
2800 | memset(min_event_time, 0, sizeof(min_event_time)); |
2801 | memset(min_event_node, 0, sizeof(min_event_node)); |
2802 | |
2803 | as_hb_event_node* events = hb_event->hb_events; |
2804 | for (int i = 0; i < hb_event->hb_n_events; i++) { |
2805 | if (min_event_time[events[i].evt] == 0 |
2806 | || min_event_time[events[i].evt] > events[i].event_time) { |
2807 | min_event_time[events[i].evt] = events[i].event_time; |
2808 | min_event_node[events[i].evt] = events[i].nodeid; |
2809 | } |
2810 | |
2811 | if (events[i].evt == AS_HB_NODE_DEPART |
2812 | && clustering_is_our_principal(events[i].nodeid)) { |
2813 | quantum_interval_fault_update(QUANTUM_FAULT_PRINCIPAL_DEPARTED, |
2814 | events[i].event_time, events[i].nodeid); |
2815 | } |
2816 | } |
2817 | |
2818 | for (int i = 0; i < AS_HB_NODE_EVENT_SENTINEL; i++) { |
2819 | if (min_event_time[i]) { |
2820 | switch (i) { |
2821 | case AS_HB_NODE_ARRIVE: |
2822 | quantum_interval_fault_update(QUANTUM_FAULT_NODE_ARRIVED, |
2823 | min_event_time[i], min_event_node[i]); |
2824 | break; |
2825 | case AS_HB_NODE_DEPART: |
2826 | quantum_interval_fault_update(QUANTUM_FAULT_NODE_DEPARTED, |
2827 | min_event_time[i], min_event_node[i]); |
2828 | break; |
2829 | case AS_HB_NODE_ADJACENCY_CHANGED: |
2830 | if (clustering_is_cluster_member(min_event_node[i])) { |
2831 | quantum_interval_fault_update( |
2832 | QUANTUM_FAULT_PEER_ADJACENCY_CHANGED, |
2833 | min_event_time[i], min_event_node[i]); |
2834 | } |
2835 | break; |
2836 | default: |
2837 | break; |
2838 | } |
2839 | |
2840 | } |
2841 | } |
2842 | CLUSTERING_UNLOCK(); |
2843 | } |
2844 | |
2845 | /** |
 * Update the fault time for this quantum when the clustering information for
 * an adjacent node changes. Assumes the node's plugin data is not obsolete.
2848 | */ |
2849 | static void |
2850 | quantum_interval_generator_hb_plugin_data_changed_handle( |
2851 | as_clustering_internal_event* change_event) |
2852 | { |
2853 | CLUSTERING_LOCK(); |
2854 | |
2855 | if (clustering_hb_plugin_data_is_obsolete( |
2856 | g_register.cluster_modified_hlc_ts, |
2857 | g_register.cluster_modified_time, change_event->plugin_data->data, |
2858 | change_event->plugin_data->data_size, |
2859 | change_event->plugin_data_changed_ts, |
2860 | &change_event->plugin_data_changed_hlc_ts)) { |
2861 | // The plugin data is obsolete. Can't take decisions based on it. |
2862 | goto Exit; |
2863 | } |
2864 | |
	// Get the changed node's succession list and cluster key. All the fields
	// should be present, since the obsolescence check also verified that the
	// fields are valid.
2868 | cf_node* succession_list_p = clustering_hb_plugin_succession_get( |
2869 | change_event->plugin_data->data, |
2870 | change_event->plugin_data->data_size); |
2871 | uint32_t* succession_list_length_p = |
2872 | clustering_hb_plugin_succession_length_get( |
2873 | change_event->plugin_data->data, |
2874 | change_event->plugin_data->data_size); |
2875 | |
2876 | if (*succession_list_length_p > 0 |
2877 | && !clustering_is_our_principal(succession_list_p[0]) |
2878 | && clustering_is_principal()) { |
2879 | if (succession_list_p[0] < config_self_nodeid_get()) { |
2880 | // We are seeing a new principal who could potentially merge with |
2881 | // this cluster. |
2882 | if (g_quantum_interval_generator.fault[QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN].event_ts |
2883 | != 1) { |
2884 | quantum_interval_fault_update( |
2885 | QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, cf_getms(), |
2886 | change_event->plugin_data_changed_nodeid); |
2887 | } |
2888 | } |
2889 | else { |
			// We see a cluster with a higher nodeid and most probably we will
			// not be the principal of the merged cluster. Reset the fault
			// timestamp, however set it to 1 to differentiate between no fault
			// and a fault to be ignored in this quantum interval. A value of 1
			// for practical purposes will never push the quantum interval
			// forward.
2896 | quantum_interval_fault_update( |
2897 | QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, 1, |
2898 | change_event->plugin_data_changed_nodeid); |
2899 | } |
2900 | } |
2901 | else { |
2902 | if (clustering_is_principal() && *succession_list_length_p == 0 |
2903 | && vector_find(&g_register.succession_list, |
2904 | &change_event->plugin_data_changed_nodeid) >= 0) { |
2905 | // One of our cluster members switched to orphan state. Most likely |
2906 | // a quick restart. |
2907 | quantum_interval_fault_update(QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED, |
2908 | cf_getms(), change_event->plugin_data_changed_nodeid); |
2909 | } |
2910 | else { |
2911 | // A node becoming an orphan node or seeing a succession with our |
2912 | // principal does not mean we have seen a new cluster. |
2913 | } |
2914 | } |
2915 | Exit: |
2916 | CLUSTERING_UNLOCK(); |
2917 | } |
2918 | |
2919 | /** |
 * Update the fault time for this quantum when a join request is accepted.
2922 | */ |
2923 | static void |
2924 | quantum_interval_generator_join_request_accepted_handle( |
2925 | as_clustering_internal_event* join_request_event) |
2926 | { |
2927 | quantum_interval_fault_update(QUANTUM_FAULT_JOIN_ACCEPTED, cf_getms(), |
2928 | join_request_event->join_request_source_nodeid); |
2929 | } |
2930 | |
2931 | /** |
2932 | * Dispatch internal clustering events for the quantum interval generator. |
2933 | */ |
2934 | static void |
2935 | quantum_interval_generator_event_dispatch(as_clustering_internal_event* event) |
2936 | { |
2937 | switch (event->type) { |
2938 | case AS_CLUSTERING_INTERNAL_EVENT_TIMER: |
2939 | quantum_interval_generator_timer_event_handle(event); |
2940 | break; |
2941 | case AS_CLUSTERING_INTERNAL_EVENT_HB: |
2942 | quantum_interval_generator_hb_event_handle(event); |
2943 | break; |
2944 | case AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED: |
2945 | quantum_interval_generator_hb_plugin_data_changed_handle(event); |
2946 | break; |
2947 | case AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED: |
2948 | quantum_interval_generator_join_request_accepted_handle(event); |
2949 | break; |
2950 | default: |
2951 | break; |
2952 | } |
2953 | } |
2954 | |
2955 | /** |
2956 | * Start quantum interval generator. |
2957 | */ |
2958 | static void |
2959 | quantum_interval_generator_start() |
2960 | { |
2961 | CLUSTERING_LOCK(); |
2962 | g_quantum_interval_generator.last_quantum_start_time = cf_getms(); |
2963 | CLUSTERING_UNLOCK(); |
2964 | } |
2965 | |
2966 | /* |
2967 | * ---------------------------------------------------------------------------- |
2968 | * Clustering common |
2969 | * ---------------------------------------------------------------------------- |
2970 | */ |
2971 | |
2972 | /** |
 * Generate a new random and most likely unique cluster key.
2974 | * @param current_cluster_key current cluster key to prevent collision. |
2975 | * @return randomly generated cluster key. |
2976 | */ |
2977 | static as_cluster_key |
2978 | clustering_cluster_key_generate(as_cluster_key current_cluster_key) |
2979 | { |
	// Generate one random value and use it as the cluster key.
2981 | as_cluster_key cluster_key = 0; |
2982 | |
2983 | // Generate a non-zero cluster key that fits in 6 bytes. |
2984 | while ((cluster_key = (cf_get_rand64() >> 16)) == 0 |
2985 | || cluster_key == current_cluster_key) { |
2986 | ; |
2987 | } |
2988 | |
2989 | return cluster_key; |
2990 | } |
2991 | |
2992 | /** |
 * Indicates if this node is an orphan. A node is deemed an orphan if it is not
 * a member of any cluster.
2995 | */ |
2996 | static bool |
2997 | clustering_is_orphan() |
2998 | { |
2999 | CLUSTERING_LOCK(); |
3000 | |
3001 | bool is_orphan = cf_vector_size(&g_register.succession_list) <= 0 |
3002 | || g_register.cluster_key == 0; |
3003 | |
3004 | CLUSTERING_UNLOCK(); |
3005 | |
3006 | return is_orphan; |
3007 | } |
3008 | |
3009 | /** |
3010 | * Return the principal node for current cluster. |
3011 | * @param principal (output) the current principal for the cluster. |
3012 | * @return 0 if there is a valid principal, -1 if the node is in orphan state |
3013 | * and there is no valid principal. |
3014 | */ |
3015 | static int |
3016 | clustering_principal_get(cf_node* principal) |
3017 | { |
3018 | CLUSTERING_LOCK(); |
3019 | int rv = -1; |
3020 | |
3021 | if (cf_vector_get(&g_register.succession_list, 0, principal) == 0) { |
3022 | rv = 0; |
3023 | } |
3024 | |
3025 | CLUSTERING_UNLOCK(); |
3026 | |
3027 | return rv; |
3028 | } |
3029 | |
3030 | /** |
3031 | * Indicates if this node is the principal for its cluster. |
3032 | */ |
3033 | static bool |
3034 | clustering_is_principal() |
3035 | { |
3036 | CLUSTERING_LOCK(); |
3037 | cf_node current_principal; |
3038 | |
3039 | bool is_principal = clustering_principal_get(¤t_principal) == 0 |
3040 | && current_principal == config_self_nodeid_get(); |
3041 | |
3042 | CLUSTERING_UNLOCK(); |
3043 | |
3044 | return is_principal; |
3045 | } |
3046 | |
3047 | /** |
3048 | * Indicates if input node is this node's principal. Input node can be self node |
3049 | * as well. |
3050 | */ |
3051 | static bool |
3052 | clustering_is_our_principal(cf_node nodeid) |
3053 | { |
3054 | CLUSTERING_LOCK(); |
3055 | cf_node current_principal; |
3056 | |
3057 | bool is_principal = clustering_principal_get(¤t_principal) == 0 |
3058 | && current_principal == nodeid; |
3059 | |
3060 | CLUSTERING_UNLOCK(); |
3061 | |
3062 | return is_principal; |
3063 | } |
3064 | |
3065 | /** |
3066 | * Indicates if a node is our cluster member. |
3067 | */ |
3068 | static bool |
3069 | clustering_is_cluster_member(cf_node nodeid) |
3070 | { |
3071 | CLUSTERING_LOCK(); |
3072 | bool is_member = vector_find(&g_register.succession_list, &nodeid) >= 0; |
3073 | CLUSTERING_UNLOCK(); |
3074 | return is_member; |
3075 | } |
3076 | |
3077 | /** |
3078 | * Indicates if the input node is present in a succession list. |
3079 | * @param nodeid the nodeid to search. |
3080 | * @param succession_list the succession list. |
3081 | * @param succession_list_length the length of the succession list. |
3082 | * @return true if the node is present in the succession list, false otherwise. |
3083 | */ |
3084 | static bool |
3085 | clustering_is_node_in_succession(cf_node nodeid, cf_node* succession_list, |
3086 | int succession_list_length) |
3087 | { |
3088 | for (int i = 0; i < succession_list_length; i++) { |
3089 | if (succession_list[i] == nodeid) { |
3090 | return true; |
3091 | } |
3092 | } |
3093 | |
3094 | return false; |
3095 | } |
3096 | |
3097 | /** |
 * Indicates if the input node can be accepted as a paxos proposer. We can
3099 | * accept the new node as our principal if we are in the orphan state or if the |
3100 | * input node is already our principal. |
3101 | * |
 * Note: In case we send a join request to a node with a lower nodeid, the
 * input node's nodeid can be less than our nodeid. This is still valid, as
 * that proposer will hand over principalship to us once the paxos round is
 * over.
3105 | * |
3106 | * @param nodeid the nodeid of the proposer to check. |
3107 | * @return true if this input node is an acceptable proposer. |
3108 | */ |
3109 | static bool |
3110 | clustering_can_accept_as_proposer(cf_node nodeid) |
3111 | { |
3112 | return clustering_is_orphan() || clustering_is_our_principal(nodeid); |
3113 | } |
3114 | |
3115 | /** |
3116 | * Plugin data iterate function that finds and collects neighboring principals, |
 * excluding the current principal, if any.
3118 | */ |
3119 | static void |
3120 | clustering_neighboring_principals_find(cf_node nodeid, void* plugin_data, |
3121 | size_t plugin_data_size, cf_clock recv_monotonic_ts, |
3122 | as_hlc_msg_timestamp* msg_hlc_ts, void* udata) |
3123 | { |
3124 | cf_vector* neighboring_principals = (cf_vector*)udata; |
3125 | |
3126 | CLUSTERING_LOCK(); |
3127 | |
	// For determining neighboring principals it is alright if this data is
	// within two heartbeat intervals, so the obsolescence check is passed zero
	// timestamps. This way we will not reject principals that have nothing to
	// do with our cluster changes.
3132 | if (recv_monotonic_ts + 2 * as_hb_tx_interval_get() >= cf_getms() |
3133 | && !clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data, |
3134 | plugin_data_size, recv_monotonic_ts, msg_hlc_ts)) { |
3135 | cf_node* succession_list = clustering_hb_plugin_succession_get( |
3136 | plugin_data, plugin_data_size); |
3137 | |
3138 | uint32_t* succession_list_length_p = |
3139 | clustering_hb_plugin_succession_length_get(plugin_data, |
3140 | plugin_data_size); |
3141 | |
3142 | if (succession_list != NULL && succession_list_length_p != NULL |
3143 | && *succession_list_length_p > 0 |
3144 | && succession_list[0] != config_self_nodeid_get()) { |
3145 | cf_vector_append_unique(neighboring_principals, |
3146 | &succession_list[0]); |
3147 | } |
3148 | } |
3149 | else { |
3150 | DETAIL( |
3151 | "neighboring principal check skipped - found obsolete plugin data for node %" PRIx64, |
3152 | nodeid); |
3153 | } |
3154 | |
3155 | CLUSTERING_UNLOCK(); |
3156 | } |
3157 | |
3158 | /** |
3159 | * Get a list of adjacent principal nodes ordered by descending nodeids. |
3160 | */ |
3161 | static void |
3162 | clustering_neighboring_principals_get(cf_vector* neighboring_principals) |
3163 | { |
3164 | CLUSTERING_LOCK(); |
3165 | |
	// Use a single iteration over the clustering data received via the
	// heartbeats, instead of individual calls, to get a consistent view and
	// avoid frequent lock acquire-and-release cycles.
3169 | as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, |
3170 | clustering_neighboring_principals_find, neighboring_principals); |
3171 | |
3172 | vector_sort_unique(neighboring_principals, cf_node_compare_desc); |
3173 | |
3174 | CLUSTERING_UNLOCK(); |
3175 | } |
3176 | |
3177 | /** |
3178 | * Find dead nodes in current succession list. |
3179 | */ |
3180 | static void |
3181 | clustering_dead_nodes_find(cf_vector* dead_nodes) |
3182 | { |
3183 | CLUSTERING_LOCK(); |
3184 | |
3185 | cf_vector* succession_list_p = &g_register.succession_list; |
3186 | int succession_list_count = cf_vector_size(succession_list_p); |
3187 | for (int i = 0; i < succession_list_count; i++) { |
3188 | // No null check required since we are iterating under a lock and within |
3189 | // vector bounds. |
3190 | cf_node cluster_member_nodeid = *((cf_node*)cf_vector_getp( |
3191 | succession_list_p, i)); |
3192 | |
3193 | if (!as_hb_is_alive(cluster_member_nodeid)) { |
3194 | cf_vector_append(dead_nodes, &cluster_member_nodeid); |
3195 | } |
3196 | } |
3197 | |
3198 | CLUSTERING_UNLOCK(); |
3199 | } |
3200 | |
3201 | /** |
 * Indicates if a node is faulty. A node in the succession list is deemed
 * faulty
 * - if the node is alive and reports to be an orphan or part of some other
 * cluster, or
 * - if the node is alive and its clustering protocol identifier does not match
 * this node's clustering protocol identifier.
3207 | */ |
3208 | static bool |
3209 | clustering_node_is_faulty(cf_node nodeid) |
3210 | { |
3211 | if (nodeid == config_self_nodeid_get()) { |
3212 | // Self node is never faulty wrt clustering. |
3213 | return false; |
3214 | } |
3215 | |
3216 | CLUSTERING_LOCK(); |
3217 | bool is_faulty = false; |
3218 | as_hlc_msg_timestamp hb_msg_hlc_ts; |
3219 | cf_clock msg_recv_ts = 0; |
3220 | as_hb_plugin_node_data plugin_data = { 0 }; |
3221 | |
3222 | if (clustering_hb_plugin_data_get(nodeid, &plugin_data, &hb_msg_hlc_ts, |
3223 | &msg_recv_ts) != 0 |
3224 | || clustering_hb_plugin_data_is_obsolete( |
3225 | g_register.cluster_modified_hlc_ts, |
3226 | g_register.cluster_modified_time, plugin_data.data, |
3227 | plugin_data.data_size, msg_recv_ts, &hb_msg_hlc_ts)) { |
3228 | INFO( |
3229 | "faulty check skipped - found obsolete plugin data for node %" PRIx64, |
3230 | nodeid); |
3231 | is_faulty = false; |
3232 | goto Exit; |
3233 | } |
3234 | |
3235 | // We have clustering data from the node after the current cluster change. |
3236 | // Compare protocol identifier, clusterkey, and succession. |
3237 | as_cluster_proto_identifier* proto_p = clustering_hb_plugin_proto_get( |
3238 | plugin_data.data, plugin_data.data_size); |
3239 | |
3240 | if (proto_p == NULL |
3241 | || !clustering_versions_are_compatible(*proto_p, |
3242 | clustering_protocol_identifier_get())) { |
3243 | DEBUG("for node %" PRIx64" protocol version mismatch - expected: %" PRIx32" but was : %" PRIx32, |
3244 | nodeid, clustering_protocol_identifier_get(), |
3245 | proto_p != NULL ? *proto_p : 0); |
3246 | is_faulty = true; |
3247 | goto Exit; |
3248 | } |
3249 | |
3250 | as_cluster_key* cluster_key_p = clustering_hb_plugin_cluster_key_get( |
3251 | plugin_data.data, plugin_data.data_size); |
3252 | if (cluster_key_p == NULL || *cluster_key_p != g_register.cluster_key) { |
3253 | DEBUG("for node %" PRIx64" cluster key mismatch - expected: %" PRIx64" but was : %" PRIx64, |
3254 | nodeid, g_register.cluster_key, cluster_key_p != NULL ? *cluster_key_p : 0); |
3255 | is_faulty = true; |
3256 | goto Exit; |
3257 | } |
3258 | |
3259 | // Check succession list just to be sure. |
3260 | // We have clustering data from the node after the current cluster change. |
3261 | cf_node* succession_list = clustering_hb_plugin_succession_get( |
3262 | plugin_data.data, plugin_data.data_size); |
3263 | |
3264 | uint32_t* succession_list_length_p = |
3265 | clustering_hb_plugin_succession_length_get(plugin_data.data, |
3266 | plugin_data.data_size); |
3267 | |
3268 | if (succession_list == NULL || succession_list_length_p == NULL |
3269 | || !clustering_hb_succession_list_matches(succession_list, |
3270 | *succession_list_length_p, &g_register.succession_list)) { |
3271 | INFO("for node %" PRIx64" succession list mismatch" , nodeid); |
3272 | |
3273 | log_cf_node_vector("self succession list:" , &g_register.succession_list, |
3274 | CF_INFO); |
3275 | |
3276 | if (succession_list) { |
3277 | log_cf_node_array("node succession list:" , succession_list, |
3278 | succession_list && succession_list_length_p ? |
3279 | *succession_list_length_p : 0, CF_INFO); |
3280 | } |
3281 | else { |
3282 | INFO("node succession list: (empty)" ); |
3283 | } |
3284 | |
3285 | is_faulty = true; |
3286 | goto Exit; |
3287 | } |
3288 | |
3289 | Exit: |
3290 | CLUSTERING_UNLOCK(); |
3291 | return is_faulty; |
3292 | } |
3293 | |
3294 | /** |
3295 | * Find "faulty" nodes in current succession list. |
3296 | */ |
3297 | static void |
3298 | clustering_faulty_nodes_find(cf_vector* faulty_nodes) |
3299 | { |
3300 | CLUSTERING_LOCK(); |
3301 | |
3302 | if (clustering_is_orphan()) { |
3303 | goto Exit; |
3304 | } |
3305 | |
3306 | cf_vector* succession_list_p = &g_register.succession_list; |
3307 | int succession_list_count = cf_vector_size(succession_list_p); |
3308 | for (int i = 0; i < succession_list_count; i++) { |
3309 | // No null check required since we are iterating under a lock and within |
3310 | // vector bounds. |
3311 | cf_node cluster_member_nodeid = *((cf_node*)cf_vector_getp( |
3312 | succession_list_p, i)); |
3313 | if (clustering_node_is_faulty(cluster_member_nodeid)) { |
3314 | cf_vector_append(faulty_nodes, &cluster_member_nodeid); |
3315 | } |
3316 | } |
3317 | |
3318 | Exit: |
3319 | CLUSTERING_UNLOCK(); |
3320 | } |
3321 | |
3322 | /** |
3323 | * Indicates if a node is in sync with this node's cluster. A node in the |
 * succession list is deemed in sync if the node is alive and it reports itself
 * to be in the same cluster via its heartbeats.
3326 | */ |
3327 | static bool |
3328 | clustering_node_is_sync(cf_node nodeid) |
3329 | { |
3330 | if (nodeid == config_self_nodeid_get()) { |
3331 | // Self node is always in sync wrt clustering. |
3332 | return true; |
3333 | } |
3334 | |
3335 | CLUSTERING_LOCK(); |
3336 | bool is_sync = false; |
3337 | as_hlc_msg_timestamp hb_msg_hlc_ts; |
3338 | cf_clock msg_recv_ts = 0; |
3339 | as_hb_plugin_node_data plugin_data = { 0 }; |
3340 | bool data_exists = |
3341 | clustering_hb_plugin_data_get(nodeid, &plugin_data, &hb_msg_hlc_ts, |
3342 | &msg_recv_ts) == 0; |
3343 | |
	// The latest valid plugin data is acceptable as long as the other checks
	// pass, hence zero timestamps are passed to the obsolescence check.
3346 | if (!data_exists || msg_recv_ts + 2 * as_hb_tx_interval_get() < cf_getms() |
3347 | || clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data.data, |
3348 | plugin_data.data_size, msg_recv_ts, &hb_msg_hlc_ts)) { |
3349 | is_sync = false; |
3350 | goto Exit; |
3351 | } |
3352 | |
3353 | // We have clustering data from the node after the current cluster change. |
3354 | // Compare protocol identifier, clusterkey, and succession. |
3355 | as_cluster_proto_identifier* proto_p = clustering_hb_plugin_proto_get( |
3356 | plugin_data.data, plugin_data.data_size); |
3357 | |
3358 | if (proto_p == NULL |
3359 | || !clustering_versions_are_compatible(*proto_p, |
3360 | clustering_protocol_identifier_get())) { |
3361 | DEBUG( |
3362 | "for node %" PRIx64" protocol version mismatch - expected: %" PRIx32" but was : %" PRIx32, |
3363 | nodeid, clustering_protocol_identifier_get(), |
3364 | proto_p != NULL ? *proto_p : 0); |
3365 | is_sync = false; |
3366 | goto Exit; |
3367 | } |
3368 | |
3369 | as_cluster_key* cluster_key_p = clustering_hb_plugin_cluster_key_get( |
3370 | plugin_data.data, plugin_data.data_size); |
3371 | if (cluster_key_p == NULL || *cluster_key_p != g_register.cluster_key) { |
3372 | DEBUG( |
3373 | "for node %" PRIx64" cluster key mismatch - expected: %" PRIx64" but was : %" PRIx64, |
3374 | nodeid, g_register.cluster_key, cluster_key_p != NULL ? *cluster_key_p : 0); |
3375 | is_sync = false; |
3376 | goto Exit; |
3377 | } |
3378 | |
3379 | // Check succession list just to be sure. |
3380 | // We have clustering data from the node after the current cluster change. |
3381 | cf_node* succession_list = clustering_hb_plugin_succession_get( |
3382 | plugin_data.data, plugin_data.data_size); |
3383 | |
3384 | uint32_t* succession_list_length_p = |
3385 | clustering_hb_plugin_succession_length_get(plugin_data.data, |
3386 | plugin_data.data_size); |
3387 | |
3388 | if (succession_list == NULL || succession_list_length_p == NULL |
3389 | || !clustering_hb_succession_list_matches(succession_list, |
3390 | *succession_list_length_p, &g_register.succession_list)) { |
3391 | DEBUG("for node %" PRIx64" succession list mismatch" , nodeid); |
3392 | |
3393 | log_cf_node_vector("self succession list:" , &g_register.succession_list, |
3394 | CF_DEBUG); |
3395 | |
3396 | if (succession_list) { |
3397 | log_cf_node_array("node succession list:" , succession_list, |
3398 | succession_list && succession_list_length_p ? |
3399 | *succession_list_length_p : 0, CF_DEBUG); |
3400 | } |
3401 | else { |
3402 | DEBUG("node succession list: (empty)" ); |
3403 | } |
3404 | |
3405 | is_sync = false; |
3406 | goto Exit; |
3407 | } |
3408 | |
3409 | is_sync = true; |
3410 | |
3411 | Exit: |
3412 | CLUSTERING_UNLOCK(); |
3413 | return is_sync; |
3414 | } |
3415 | |
3416 | /** |
3417 | * Find orphan nodes using clustering data for each node in the heartbeat's |
3418 | * adjacency list. |
3419 | */ |
3420 | static void |
3421 | clustering_orphan_nodes_find(cf_node nodeid, void* plugin_data, |
3422 | size_t plugin_data_size, cf_clock recv_monotonic_ts, |
3423 | as_hlc_msg_timestamp* msg_hlc_ts, void* udata) |
3424 | { |
3425 | cf_vector* orphans = udata; |
3426 | |
3427 | CLUSTERING_LOCK(); |
3428 | |
	// For determining orphan status it is alright if this data is within two
	// heartbeat intervals, so the obsolescence check is passed zero timestamps.
3431 | if (recv_monotonic_ts + 2 * as_hb_tx_interval_get() >= cf_getms() |
3432 | && !clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data, |
3433 | plugin_data_size, recv_monotonic_ts, msg_hlc_ts)) { |
3434 | if (clustering_hb_plugin_data_node_status(plugin_data, plugin_data_size) |
3435 | == AS_NODE_ORPHAN) { |
3436 | cf_vector_append(orphans, &nodeid); |
3437 | } |
3438 | |
3439 | } |
3440 | else { |
3441 | DETAIL( |
3442 | "orphan check skipped - found obsolete plugin data for node %" PRIx64, |
3443 | nodeid); |
3444 | } |
3445 | |
3446 | CLUSTERING_UNLOCK(); |
3447 | } |
3448 | |
3449 | /** |
3450 | * Get a list of neighboring nodes that are orphans. Does not include self node. |
3451 | */ |
3452 | static void |
3453 | clustering_neighboring_orphans_get(cf_vector* neighboring_orphans) |
3454 | { |
3455 | CLUSTERING_LOCK(); |
3456 | |
	// Use a single iteration over the clustering data received via the
	// heartbeats, instead of individual calls, to get a consistent view and
	// avoid repeated lock and release cycles.
3460 | as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, |
3461 | clustering_orphan_nodes_find, neighboring_orphans); |
3462 | |
3463 | CLUSTERING_UNLOCK(); |
3464 | } |
3465 | |
3466 | /** |
3467 | * Find neighboring nodes using clustering data for each node in the heartbeat's |
3468 | * adjacency list. |
3469 | */ |
3470 | static void |
3471 | clustering_neighboring_nodes_find(cf_node nodeid, void* plugin_data, |
3472 | size_t plugin_data_size, cf_clock recv_monotonic_ts, |
3473 | as_hlc_msg_timestamp* msg_hlc_ts, void* udata) |
3474 | { |
3475 | cf_vector* nodes = udata; |
3476 | cf_vector_append(nodes, &nodeid); |
3477 | } |
3478 | |
3479 | /** |
3480 | * Get a list of all neighboring nodes. Does not include self node. |
3481 | */ |
3482 | static void |
3483 | clustering_neighboring_nodes_get(cf_vector* neighboring_nodes) |
3484 | { |
3485 | CLUSTERING_LOCK(); |
3486 | |
	// Use a single iteration over the clustering data received via the
	// heartbeats, instead of individual calls, to get a consistent view and
	// avoid repeated lock and release cycles.
3490 | as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, |
3491 | clustering_neighboring_nodes_find, neighboring_nodes); |
3492 | |
3493 | CLUSTERING_UNLOCK(); |
3494 | } |
3495 | |
3496 | /** |
3497 | * Evict nodes not forming a clique from the succession list. |
3498 | */ |
3499 | static uint32_t |
3500 | clustering_succession_list_clique_evict(cf_vector* succession_list, |
3501 | char* evict_msg) |
3502 | { |
3503 | uint32_t num_evicted = 0; |
3504 | if (g_config.clustering_config.clique_based_eviction_enabled) { |
3505 | // Remove nodes that do not form a clique. |
3506 | cf_vector* evicted_nodes = vector_stack_lockless_create(cf_node); |
3507 | as_hb_maximal_clique_evict(succession_list, evicted_nodes); |
3508 | num_evicted = cf_vector_size(evicted_nodes); |
3509 | log_cf_node_vector(evict_msg, evicted_nodes, |
3510 | num_evicted > 0 ? CF_INFO : CF_DEBUG); |
3511 | |
3512 | vector_subtract(succession_list, evicted_nodes); |
3513 | cf_vector_destroy(evicted_nodes); |
3514 | } |
3515 | return num_evicted; |
3516 | } |
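
/*
 * Illustrative sketch (not part of the build): one way a caller might combine
 * the dead-node scan above with clique based eviction when forming a candidate
 * succession list. The overall flow is an assumption for illustration; the
 * helpers used (clustering_dead_nodes_find, vector_subtract,
 * clustering_succession_list_clique_evict) are the ones defined in this file.
 *
 *	cf_vector* candidates = vector_stack_lockless_create(cf_node);
 *	cf_vector* dead_nodes = vector_stack_lockless_create(cf_node);
 *
 *	// Start from the current succession list and drop unreachable members.
 *	vector_copy(candidates, &g_register.succession_list);
 *	clustering_dead_nodes_find(dead_nodes);
 *	vector_subtract(candidates, dead_nodes);
 *
 *	// Then drop nodes that do not form a clique with the rest.
 *	clustering_succession_list_clique_evict(candidates,
 *			"clique eviction example - evicted nodes:");
 *
 *	cf_vector_destroy(dead_nodes);
 *	cf_vector_destroy(candidates);
 */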
3517 | |
3518 | /* |
3519 | * ---------------------------------------------------------------------------- |
3520 | * Clustering network message functions |
3521 | * ---------------------------------------------------------------------------- |
3522 | */ |
3523 | |
3524 | /** |
3525 | * Fill common source node specific fields for the message. |
3526 | * @param msg the message to fill the source fields into. |
3527 | */ |
3528 | static void |
3529 | msg_src_fields_fill(msg* msg) |
3530 | { |
3531 | // Set the hb protocol id / version. |
3532 | msg_set_uint32(msg, AS_CLUSTERING_MSG_ID, |
3533 | clustering_protocol_identifier_get()); |
3534 | |
3535 | // Set the send timestamp |
3536 | msg_set_uint64(msg, AS_CLUSTERING_MSG_HLC_TIMESTAMP, |
3537 | as_hlc_timestamp_now()); |
3538 | } |
3539 | |
3540 | /** |
3541 | * Read the protocol identifier for this clustering message. These functions can |
3542 | * get called multiple times for a single message. Hence they do not increment |
3543 | * error counters. |
3544 | * @param msg the incoming message. |
3545 | * @param id the output id. |
 * @return 0 if the id could be parsed, -1 on failure.
3547 | */ |
3548 | static int |
3549 | msg_proto_id_get(msg* msg, uint32_t* id) |
3550 | { |
3551 | if (msg_get_uint32(msg, AS_CLUSTERING_MSG_ID, id) != 0) { |
3552 | return -1; |
3553 | } |
3554 | |
3555 | return 0; |
3556 | } |
3557 | |
3558 | /** |
3559 | * Read the message type. These functions can get called multiple times for a |
3560 | * single message. Hence they do not increment error counters. |
3561 | * @param msg the incoming message. |
3562 | * @param type the output message type. |
 * @return 0 if the type could be parsed, -1 on failure.
3564 | */ |
3565 | static int |
3566 | msg_type_get(msg* msg, as_clustering_msg_type* type) |
3567 | { |
	if (msg_get_uint32(msg, AS_CLUSTERING_MSG_TYPE, (uint32_t*)type) != 0) {
3569 | return -1; |
3570 | } |
3571 | |
3572 | return 0; |
3573 | } |
3574 | |
3575 | /** |
3576 | * Set the type for an outgoing message. |
3577 | * @param msg the outgoing message. |
3578 | * @param msg_type the type to set. |
3579 | */ |
3580 | static void |
3581 | msg_type_set(msg* msg, as_clustering_msg_type msg_type) |
3582 | { |
3583 | // Set the message type. |
3584 | msg_set_uint32(msg, AS_CLUSTERING_MSG_TYPE, msg_type); |
3585 | } |
3586 | |
3587 | /** |
3588 | * Read the proposed principal field from the message. |
3589 | * @param msg the incoming message. |
3590 | * @param nodeid the output nodeid. |
 * @return 0 if the nodeid could be parsed, -1 on failure.
3592 | */ |
3593 | static int |
3594 | msg_proposed_principal_get(msg* msg, cf_node* nodeid) |
3595 | { |
3596 | if (msg_get_uint64(msg, AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, nodeid) |
3597 | != 0) { |
3598 | return -1; |
3599 | } |
3600 | |
3601 | return 0; |
3602 | } |
3603 | |
3604 | /** |
3605 | * Set the proposed principal field in the message. |
3606 | * @param msg the outgoing message. |
3607 | * @param nodeid the proposed principal nodeid. |
3608 | */ |
3609 | static void |
3610 | msg_proposed_principal_set(msg* msg, cf_node nodeid) |
3611 | { |
3612 | msg_set_uint64(msg, AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, nodeid); |
3613 | } |
3614 | |
3615 | /** |
3616 | * Read the HLC send timestamp for the message. These functions can get called |
3617 | * multiple times for a single message. Hence they do not increment error |
3618 | * counters. |
3619 | * @param msg the incoming message. |
 * @param send_ts the output hlc timestamp.
 * @return 0 if the timestamp could be parsed, -1 on failure.
3622 | */ |
3623 | static int |
3624 | msg_send_ts_get(msg* msg, as_hlc_timestamp* send_ts) |
3625 | { |
3626 | if (msg_get_uint64(msg, AS_CLUSTERING_MSG_HLC_TIMESTAMP, send_ts) != 0) { |
3627 | return -1; |
3628 | } |
3629 | |
3630 | return 0; |
3631 | } |
3632 | |
3633 | /** |
3634 | * Set the sequence number for an outgoing message. |
3635 | * @param msg the outgoing message. |
3636 | * @param sequence_number the sequence number to set. |
3637 | */ |
3638 | static void |
3639 | msg_sequence_number_set(msg* msg, as_paxos_sequence_number sequence_number) |
3640 | { |
	// Set the sequence number.
3642 | msg_set_uint64(msg, AS_CLUSTERING_MSG_SEQUENCE_NUMBER, sequence_number); |
3643 | } |
3644 | |
3645 | /** |
3646 | * Read sequence number from the message. |
3647 | * @param msg the incoming message. |
3648 | * @param sequence_number the output sequence number. |
 * @return 0 if the sequence number could be parsed, -1 on failure.
3650 | */ |
3651 | static int |
3652 | msg_sequence_number_get(msg* msg, as_paxos_sequence_number* sequence_number) |
3653 | { |
3654 | if (msg_get_uint64(msg, AS_CLUSTERING_MSG_SEQUENCE_NUMBER, sequence_number) |
3655 | != 0) { |
3656 | return -1; |
3657 | } |
3658 | |
3659 | return 0; |
3660 | } |
3661 | |
3662 | /** |
 * Set the cluster key in a particular field of an outgoing message.
3664 | * @param msg the outgoing message. |
3665 | * @param cluster_key the cluster key to set. |
3666 | * @param field the field to set the cluster key to. |
3667 | */ |
3668 | static void |
3669 | msg_cluster_key_field_set(msg* msg, as_cluster_key cluster_key, |
3670 | as_clustering_msg_field field) |
3671 | { |
3672 | msg_set_uint64(msg, field, cluster_key); |
3673 | } |
3674 | |
3675 | /** |
3676 | * Set the cluster key for an outgoing message. |
3677 | * @param msg the outgoing message. |
3678 | * @param cluster_key the cluster key to set. |
3679 | */ |
3680 | static void |
3681 | msg_cluster_key_set(msg* msg, as_cluster_key cluster_key) |
3682 | { |
3683 | msg_cluster_key_field_set(msg, cluster_key, AS_CLUSTERING_MSG_CLUSTER_KEY); |
3684 | } |
3685 | |
3686 | /** |
3687 | * Read cluster key from a message field. |
3688 | * @param msg the incoming message. |
3689 | * @param cluster_key the output cluster key. |
 * @param field the field to read the cluster key from.
 * @return 0 if the cluster key could be parsed, -1 on failure.
3692 | */ |
3693 | static int |
3694 | msg_cluster_key_field_get(msg* msg, as_cluster_key* cluster_key, |
3695 | as_clustering_msg_field field) |
3696 | { |
3697 | if (msg_get_uint64(msg, field, cluster_key) != 0) { |
3698 | return -1; |
3699 | } |
3700 | |
3701 | return 0; |
3702 | } |
3703 | |
3704 | /** |
3705 | * Read cluster key from the message. |
3706 | * @param msg the incoming message. |
3707 | * @param cluster_key the output cluster key. |
 * @return 0 if the cluster key could be parsed, -1 on failure.
3709 | */ |
3710 | static int |
3711 | msg_cluster_key_get(msg* msg, as_cluster_key* cluster_key) |
3712 | { |
3713 | return msg_cluster_key_field_get(msg, cluster_key, |
3714 | AS_CLUSTERING_MSG_CLUSTER_KEY); |
3715 | } |
3716 | |
3717 | /** |
3718 | * Set the succession list for an outgoing message in a particular field. |
3719 | * @param msg the outgoing message. |
3720 | * @param succession_list the succession list to set. |
3721 | * @param field the field to set for the succession list. |
3722 | */ |
3723 | static void |
3724 | msg_succession_list_field_set(msg* msg, cf_vector* succession_list, |
3725 | as_clustering_msg_field field) |
3726 | |
3727 | { |
3728 | int num_elements = cf_vector_size(succession_list); |
3729 | size_t buffer_size = num_elements * sizeof(cf_node); |
3730 | cf_node* succession_buffer = (cf_node*)BUFFER_ALLOC_OR_DIE(buffer_size); |
3731 | |
3732 | for (int i = 0; i < num_elements; i++) { |
3733 | cf_vector_get(succession_list, i, &succession_buffer[i]); |
3734 | } |
3735 | |
3736 | msg_set_buf(msg, field, (uint8_t*)succession_buffer, buffer_size, |
3737 | MSG_SET_COPY); |
3738 | |
3739 | BUFFER_FREE(succession_buffer, buffer_size); |
3740 | } |
3741 | |
3742 | /** |
3743 | * Set the succession list for an outgoing message. |
3744 | * @param msg the outgoing message. |
3745 | * @param succession_list the succession list to set. |
3746 | */ |
3747 | static void |
3748 | msg_succession_list_set(msg* msg, cf_vector* succession_list) |
3749 | { |
3750 | int num_elements = cf_vector_size(succession_list); |
3751 | if (num_elements <= 0) { |
		// An empty succession list is being sent. This is definitely wrong and
		// something is amiss, but let the message through - the receiver will
		// reject it anyway.
3754 | WARNING("setting empty succession list" ); |
3755 | return; |
3756 | } |
3757 | |
3758 | msg_succession_list_field_set(msg, succession_list, |
3759 | AS_CLUSTERING_MSG_SUCCESSION_LIST); |
3760 | } |
3761 | |
3762 | /** |
3763 | * Read succession list from a message field. |
3764 | * @param msg the incoming message. |
3765 | * @param succession_list the output succession list. |
3766 | * @param field the field to read from. |
 * @return 0 if the succession list could be parsed, -1 on failure.
3768 | */ |
3769 | static int |
3770 | msg_succession_list_field_get(msg* msg, cf_vector* succession_list, |
3771 | as_clustering_msg_field field) |
3772 | { |
3773 | vector_clear(succession_list); |
3774 | cf_node* succession_buffer; |
3775 | size_t buffer_size; |
3776 | if (msg_get_buf(msg, field, (uint8_t**)&succession_buffer, &buffer_size, |
3777 | MSG_GET_DIRECT) != 0) { |
		// The succession list field is missing - an empty succession list is
		// not allowed.
3779 | return -1; |
3780 | } |
3781 | |
	// Compute the succession list length from the buffer size.
3783 | int num_elements = buffer_size / sizeof(cf_node); |
3784 | |
3785 | for (int i = 0; i < num_elements; i++) { |
3786 | cf_vector_append(succession_list, &succession_buffer[i]); |
3787 | } |
3788 | |
3789 | vector_sort_unique(succession_list, cf_node_compare_desc); |
3790 | |
3791 | return 0; |
3792 | } |
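
/*
 * Illustrative sketch (not part of the build): the set/get helpers above are
 * intended to round trip a succession list through a message field. Note that
 * the getter sorts and de-duplicates, so the received vector ends up in
 * descending nodeid order regardless of the order it was sent in. The
 * variables m and succession_list below are assumed for illustration.
 *
 *	msg_succession_list_field_set(m, &succession_list,
 *			AS_CLUSTERING_MSG_SUCCESSION_LIST);
 *	// ... message is sent over fabric and received on the other node ...
 *	cf_vector* received = vector_stack_lockless_create(cf_node);
 *	if (msg_succession_list_field_get(m, received,
 *			AS_CLUSTERING_MSG_SUCCESSION_LIST) != 0) {
 *		// Missing or malformed succession list - reject the message.
 *	}
 *	cf_vector_destroy(received);
 */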
3793 | |
3794 | /** |
3795 | * Read succession list from the message. |
3796 | * @param msg the incoming message. |
3797 | * @param succession_list the output succession list. |
 * @return 0 if the succession list could be parsed, -1 on failure.
3799 | */ |
3800 | static int |
3801 | msg_succession_list_get(msg* msg, cf_vector* succession_list) |
3802 | { |
3803 | return msg_succession_list_field_get(msg, succession_list, |
3804 | AS_CLUSTERING_MSG_SUCCESSION_LIST); |
3805 | } |
3806 | |
3807 | /** |
3808 | * Get the paxos proposal id for message event. |
3809 | * @param event the message event. |
3810 | * @param proposal_id the paxos proposal id. |
 * @return 0 if the proposal id could be parsed, -1 on failure.
3812 | */ |
3813 | static int |
3814 | msg_event_proposal_id_get(as_clustering_internal_event* event, |
3815 | as_paxos_proposal_id* proposal_id) |
3816 | { |
3817 | if (msg_sequence_number_get(event->msg, &proposal_id->sequence_number) |
3818 | != 0) { |
3819 | return -1; |
3820 | } |
3821 | proposal_id->src_nodeid = event->msg_src_nodeid; |
3822 | return 0; |
3823 | } |
3824 | |
3825 | /** |
 * Get a network message object from the message pool, with all common
 * clustering fields, like the protocol identifier and hlc timestamp, filled in.
3828 | * @param type the type of the message. |
3829 | */ |
3830 | static msg* |
3831 | msg_pool_get(as_clustering_msg_type type) |
3832 | { |
3833 | msg* msg = as_fabric_msg_get(M_TYPE_CLUSTERING); |
3834 | msg_src_fields_fill(msg); |
3835 | msg_type_set(msg, type); |
3836 | return msg; |
3837 | } |
3838 | |
3839 | /** |
3840 | * Return a message back to the message pool. |
3841 | */ |
3842 | static void |
3843 | msg_pool_return(msg* msg) |
3844 | { |
3845 | as_fabric_msg_put(msg); |
3846 | } |
3847 | |
3848 | /** |
 * Determines if the received message is old enough to be ignored.
 *
 * This is determined by comparing the message hlc and monotonic timestamps
 * with the cluster formation hlc and monotonic times.
 *
 * @param cluster_modified_hlc_ts the hlc timestamp when the current cluster
 * change happened. Passed in to avoid locking in this function.
 * @param cluster_modified_time the monotonic timestamp when the current
 * cluster change happened. Passed in to avoid locking in this function.
 * @param msg_recv_ts the monotonic timestamp at message receive.
 * @param msg_hlc_ts the hlc timestamp at message receive.
 * @return true if the message is obsolete, false otherwise.
3861 | */ |
3862 | bool |
3863 | msg_is_obsolete(as_hlc_timestamp cluster_modified_hlc_ts, |
3864 | cf_clock cluster_modified_time, cf_clock msg_recv_ts, |
3865 | as_hlc_msg_timestamp* msg_hlc_ts) |
3866 | { |
3867 | if (as_hlc_send_timestamp_order(cluster_modified_hlc_ts, msg_hlc_ts) |
3868 | != AS_HLC_HAPPENS_BEFORE) { |
3869 | // Cluster formation time after message send or the order is unknown, |
3870 | // assume cluster formation is after message received. |
3871 | // The caller should ignore this message. |
3872 | return true; |
3873 | } |
3874 | |
	// The message should arrive at least one hb interval (to send out our
	// cluster state) + one network delay (for our information to reach the
	// remote node) + one hb interval (for the other node to send out its
	// updated state) + one network delay (for the updated state to reach us)
	// after the cluster formation time.
3880 | if (cluster_modified_time + 2 * as_hb_tx_interval_get() |
3881 | + 2 * g_config.fabric_latency_max_ms > msg_recv_ts) { |
3882 | return true; |
3883 | } |
3884 | |
3885 | return false; |
3886 | } |
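
/*
 * Worked example of the freshness check above (the numbers are illustrative
 * assumptions, not configuration defaults): with a heartbeat send interval of
 * 150 ms and fabric_latency_max_ms of 5 ms, a message is treated as obsolete
 * unless it was received at least 2 * 150 + 2 * 5 = 310 ms after the current
 * cluster change, in addition to satisfying the hlc ordering requirement.
 *
 *	// Roughly equivalent freshness threshold:
 *	cf_clock threshold = cluster_modified_time + 2 * as_hb_tx_interval_get()
 *			+ 2 * g_config.fabric_latency_max_ms;
 *	bool fresh_enough = msg_recv_ts >= threshold;
 */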
3887 | |
3888 | /** |
 * Send a message to a single node. This is best effort; the send could fail.
 * On failure the message is returned back to the pool.
 * @param msg the message to send.
 * @param node the node to send the message to.
 * @return 0 on successful queueing of the message (does not imply guaranteed
 * delivery), non-zero if the message could not be queued.
3895 | */ |
3896 | static int |
3897 | msg_node_send(msg* msg, cf_node node) |
3898 | { |
3899 | int rv = as_fabric_send(node, msg, AS_FABRIC_CHANNEL_CTRL); |
3900 | if (rv) { |
3901 | // Fabric did not clean up the message, return it back to the message |
3902 | // pool. |
3903 | msg_pool_return(msg); |
3904 | } |
3905 | return rv; |
3906 | } |
3907 | |
3908 | /** |
 * Send a message to all input nodes. This is best effort; some sends could
 * fail. On failure the message is returned back to the pool.
3911 | * @param msg the message to send. |
3912 | * @param nodes the nodes to send the message to. |
3913 | * @return the number of nodes the message was sent to. Does not imply |
3914 | * guaranteed receipt by these nodes however. |
3915 | */ |
3916 | static int |
3917 | msg_nodes_send(msg* msg, cf_vector* nodes) |
3918 | { |
3919 | int node_count = cf_vector_size(nodes); |
3920 | int sent_count = 0; |
3921 | |
3922 | if (node_count <= 0) { |
3923 | return sent_count; |
3924 | } |
3925 | |
3926 | int alloc_size = node_count * sizeof(cf_node); |
3927 | cf_node* send_list = (cf_node*)BUFFER_ALLOC_OR_DIE(alloc_size); |
3928 | |
3929 | vector_array_cpy(send_list, nodes, node_count); |
3930 | |
3931 | if (as_fabric_send_list(send_list, node_count, msg, AS_FABRIC_CHANNEL_CTRL) |
3932 | != 0) { |
3933 | // Fabric did not clean up the message, return it back to the message |
3934 | // pool. |
3935 | msg_pool_return(msg); |
	}
	else {
		// All sends were queued successfully; fabric now owns the message.
		sent_count = node_count;
	}

	BUFFER_FREE(send_list, alloc_size);
	return sent_count;
3940 | } |
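
/*
 * Illustrative sketch (not part of the build) of the message ownership
 * convention assumed by the send helpers above: fabric consumes the message
 * when the send is queued successfully, otherwise the helper returns it to the
 * pool on the caller's behalf. A typical send therefore looks like:
 *
 *	msg* m = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE);
 *	msg_sequence_number_set(m, sequence_number);
 *	// msg_nodes_send() returns the message to the pool itself on failure, so
 *	// the caller neither frees nor returns it in either case.
 *	msg_nodes_send(m, acceptors);
 */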
3941 | |
3942 | /* |
3943 | * ---------------------------------------------------------------------------- |
3944 | * Paxos common |
3945 | * ---------------------------------------------------------------------------- |
3946 | */ |
3947 | |
3948 | /** |
3949 | * Compare paxos proposal ids. Compares the sequence numbers, ties in sequence |
3950 | * number are broken by nodeids. |
3951 | * |
3952 | * @param id1 the first identifier. |
3953 | * @param id2 the second identifier. |
3954 | * |
3955 | * @return 0 if id1 equals id2, 1 if id1 > id2 and -1 if id1 < id2. |
3956 | */ |
3957 | static int |
3958 | paxos_proposal_id_compare(as_paxos_proposal_id* id1, as_paxos_proposal_id* id2) |
3959 | { |
3960 | if (id1->sequence_number != id2->sequence_number) { |
3961 | return id1->sequence_number > id2->sequence_number ? 1 : -1; |
3962 | } |
3963 | |
3964 | // Sequence numbers match, compare nodeids. |
3965 | if (id1->src_nodeid != id2->src_nodeid) { |
3966 | return id1->src_nodeid > id2->src_nodeid ? 1 : -1; |
3967 | } |
3968 | |
3969 | // Node id and sequence numbers match. |
3970 | return 0; |
3971 | } |
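
/*
 * Example with illustrative values: proposal ids are ordered first by sequence
 * number and then by the proposing nodeid, so with
 *
 *	as_paxos_proposal_id id1 = { .sequence_number = 100, .src_nodeid = 0xA1 };
 *	as_paxos_proposal_id id2 = { .sequence_number = 100, .src_nodeid = 0xB2 };
 *
 * paxos_proposal_id_compare(&id1, &id2) returns -1 - the proposal from the
 * higher nodeid wins the tie.
 */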
3972 | |
3973 | /* |
3974 | * ---------------------------------------------------------------------------- |
3975 | * Paxos proposer |
3976 | * ---------------------------------------------------------------------------- |
3977 | */ |
3978 | |
3979 | /** |
3980 | * Dump paxos proposer state to logs. |
3981 | */ |
3982 | static void |
3983 | paxos_proposer_dump(bool verbose) |
3984 | { |
3985 | CLUSTERING_LOCK(); |
3986 | |
3987 | // Output paxos proposer state. |
3988 | switch (g_proposer.state) { |
3989 | case AS_PAXOS_PROPOSER_STATE_IDLE: |
3990 | INFO("CL: paxos proposer: idle" ); |
3991 | break; |
3992 | case AS_PAXOS_PROPOSER_STATE_PREPARE_SENT: |
3993 | INFO("CL: paxos proposer: prepare sent" ); |
3994 | break; |
3995 | case AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT: |
3996 | INFO("CL: paxos proposer: accept sent" ); |
3997 | break; |
3998 | } |
3999 | |
4000 | if (verbose) { |
4001 | if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE) { |
4002 | INFO("CL: paxos proposal start time: %" PRIu64" now: %" PRIu64, |
4003 | g_proposer.paxos_round_start_time, cf_getms()); |
4004 | INFO("CL: paxos proposed cluster key: %" PRIx64, |
4005 | g_proposer.proposed_value.cluster_key); |
4006 | INFO("CL: paxos proposed sequence: %" PRIu64, |
4007 | g_proposer.sequence_number); |
4008 | log_cf_node_vector("CL: paxos proposed succession:" , |
4009 | &g_proposer.proposed_value.succession_list, CF_INFO); |
4010 | log_cf_node_vector("CL: paxos promises received:" , |
4011 | &g_proposer.promises_received, CF_INFO); |
4012 | log_cf_node_vector("CL: paxos accepted received:" , |
4013 | &g_proposer.accepted_received, CF_INFO); |
4014 | } |
4015 | } |
4016 | |
4017 | CLUSTERING_UNLOCK(); |
4018 | } |
4019 | |
4020 | /** |
4021 | * Reset state on failure of a paxos round. |
4022 | */ |
4023 | static void |
4024 | paxos_proposer_reset() |
4025 | { |
4026 | CLUSTERING_LOCK(); |
4027 | |
	// Log the proposal id before it is cleared below.
	DETAIL("paxos round over for proposal id %" PRIx64":%" PRIu64,
			config_self_nodeid_get(), g_proposer.sequence_number);

	// Flipping state to idle to indicate paxos round is over.
	g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE;
	memset(&g_proposer.sequence_number, 0, sizeof(g_proposer.sequence_number));

	g_proposer.proposed_value.cluster_key = 0;
	vector_clear(&g_proposer.proposed_value.succession_list);

	vector_clear(&g_proposer.acceptors);
4039 | |
4040 | CLUSTERING_UNLOCK(); |
4041 | } |
4042 | |
4043 | /** |
4044 | * Invoked to fail an ongoing paxos proposal. |
4045 | */ |
4046 | static void |
4047 | paxos_proposer_fail() |
4048 | { |
4049 | // Cleanup state for the paxos round. |
4050 | paxos_proposer_reset(); |
4051 | |
4052 | as_clustering_internal_event paxos_fail_event; |
4053 | memset(&paxos_fail_event, 0, sizeof(paxos_fail_event)); |
4054 | paxos_fail_event.type = AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL; |
4055 | |
4056 | internal_event_dispatch(&paxos_fail_event); |
4057 | } |
4058 | |
4059 | /** |
4060 | * Indicates if a paxos proposal from self node is active. |
4061 | */ |
4062 | static bool |
4063 | paxos_proposer_proposal_is_active() |
4064 | { |
4065 | CLUSTERING_LOCK(); |
4066 | bool rv = g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE; |
4067 | CLUSTERING_UNLOCK(); |
4068 | return rv; |
4069 | } |
4070 | |
4071 | /** |
 * Send paxos prepare message to the current list of acceptor nodes.
4073 | */ |
4074 | static void |
4075 | paxos_proposer_prepare_send() |
4076 | { |
4077 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE); |
4078 | |
4079 | CLUSTERING_LOCK(); |
4080 | |
4081 | // Set the sequence number |
4082 | msg_sequence_number_set(msg, g_proposer.sequence_number); |
4083 | |
4084 | log_cf_node_vector("paxos prepare message sent to:" , &g_proposer.acceptors, |
4085 | CF_DEBUG); |
4086 | |
4087 | g_proposer.prepare_send_time = cf_getms(); |
4088 | |
4089 | cf_vector* acceptors = vector_stack_lockless_create(cf_node); |
4090 | vector_copy(acceptors, &g_proposer.acceptors); |
4091 | |
4092 | CLUSTERING_UNLOCK(); |
4093 | |
	// Send the message to the acceptors.
4095 | msg_nodes_send(msg, acceptors); |
4096 | cf_vector_destroy(acceptors); |
4097 | } |
4098 | |
4099 | /** |
 * Send paxos accept message to the current list of acceptor nodes.
4101 | */ |
4102 | static void |
4103 | paxos_proposer_accept_send() |
4104 | { |
4105 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT); |
4106 | |
4107 | CLUSTERING_LOCK(); |
4108 | |
4109 | // Set the sequence number |
4110 | msg_sequence_number_set(msg, g_proposer.sequence_number); |
4111 | |
	// Skip sending the proposed value with the accept, since we do not use it.
	// The learn message is the only way a consensus value is sent out.
4114 | log_cf_node_vector("paxos accept message sent to:" , &g_proposer.acceptors, |
4115 | CF_DEBUG); |
4116 | |
4117 | g_proposer.accept_send_time = cf_getms(); |
4118 | |
4119 | cf_vector* acceptors = vector_stack_lockless_create(cf_node); |
4120 | vector_copy(acceptors, &g_proposer.acceptors); |
4121 | |
4122 | CLUSTERING_UNLOCK(); |
4123 | |
	// Send the message to the acceptors.
4125 | msg_nodes_send(msg, acceptors); |
4126 | cf_vector_destroy(acceptors); |
4127 | } |
4128 | |
4129 | /** |
 * Send paxos learn message to the current list of acceptor nodes.
4131 | */ |
4132 | static void |
4133 | paxos_proposer_learn_send() |
4134 | { |
4135 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN); |
4136 | |
4137 | CLUSTERING_LOCK(); |
4138 | |
4139 | // Set the sequence number |
4140 | msg_sequence_number_set(msg, g_proposer.sequence_number); |
4141 | |
4142 | // Set the cluster key |
4143 | msg_cluster_key_set(msg, g_proposer.proposed_value.cluster_key); |
4144 | |
4145 | // Set the succession list |
4146 | msg_succession_list_set(msg, &g_proposer.proposed_value.succession_list); |
4147 | |
4148 | log_cf_node_vector("paxos learn message sent to:" , &g_proposer.acceptors, |
4149 | CF_DEBUG); |
4150 | |
4151 | g_proposer.learn_send_time = cf_getms(); |
4152 | |
4153 | cf_vector* acceptors = vector_stack_lockless_create(cf_node); |
4154 | vector_copy(acceptors, &g_proposer.acceptors); |
4155 | |
4156 | CLUSTERING_UNLOCK(); |
4157 | |
	// Send the message to the acceptors.
4159 | msg_nodes_send(msg, acceptors); |
4160 | cf_vector_destroy(acceptors); |
4161 | } |
4162 | |
4163 | /** |
4164 | * Handle an incoming paxos promise message. |
4165 | */ |
4166 | static void |
4167 | paxos_proposer_promise_handle(as_clustering_internal_event* event) |
4168 | { |
4169 | cf_node src_nodeid = event->msg_src_nodeid; |
4170 | msg* msg = event->msg; |
4171 | |
4172 | DEBUG("received paxos promise from node %" PRIx64, src_nodeid); |
4173 | |
4174 | CLUSTERING_LOCK(); |
4175 | if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_PREPARE_SENT) { |
4176 | // We are not in the prepare phase. Reject this message. |
4177 | DEBUG("ignoring paxos promise from node %" PRIx64" - we are not in prepare phase" , |
4178 | src_nodeid); |
4179 | goto Exit; |
4180 | } |
4181 | |
4182 | if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { |
4183 | WARNING("ignoring paxos promise from node %" PRIx64" - it is not in acceptor list" , |
4184 | src_nodeid); |
4185 | goto Exit; |
4186 | } |
4187 | |
4188 | as_paxos_sequence_number sequence_number = 0; |
4189 | if (msg_sequence_number_get(msg, &sequence_number) != 0) { |
4190 | WARNING("ignoring paxos promise from node %" PRIx64" with invalid proposal id" , |
4191 | src_nodeid); |
4192 | goto Exit; |
4193 | } |
4194 | |
4195 | if (sequence_number != g_proposer.sequence_number) { |
4196 | // Not a matching promise message. Ignore. |
4197 | INFO("ignoring paxos promise from node %" PRIx64" because its proposal id %" PRIu64" does not match expected id %" PRIu64, |
4198 | src_nodeid, sequence_number, |
4199 | g_proposer.sequence_number); |
4200 | goto Exit; |
4201 | } |
4202 | |
4203 | cf_vector_append_unique(&g_proposer.promises_received, &src_nodeid); |
4204 | |
4205 | int promised_count = cf_vector_size(&g_proposer.promises_received); |
4206 | int acceptor_count = cf_vector_size(&g_proposer.acceptors); |
4207 | |
4208 | // Use majority quorum to move on. |
4209 | if (promised_count >= 1 + (acceptor_count / 2)) { |
4210 | // We have quorum number of promises. go ahead to the accept phase. |
4211 | g_proposer.state = AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT; |
4212 | paxos_proposer_accept_send(); |
4213 | } |
4214 | |
4215 | Exit: |
4216 | CLUSTERING_UNLOCK(); |
4217 | } |
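
/*
 * Worked example of the majority check above (illustrative): with
 * acceptor_count = 5 the proposer moves to the accept phase once
 * promised_count >= 1 + (5 / 2) = 3. The accepted phase below is stricter and
 * requires a response from every acceptor.
 *
 *	int quorum = 1 + (acceptor_count / 2);	// 3 of 5, 3 of 4, 2 of 2
 *	bool have_quorum = promised_count >= quorum;
 */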
4218 | |
4219 | /** |
4220 | * Handle an incoming paxos prepare nack message. |
4221 | */ |
4222 | static void |
4223 | paxos_proposer_prepare_nack_handle(as_clustering_internal_event* event) |
4224 | { |
4225 | cf_node src_nodeid = event->msg_src_nodeid; |
4226 | msg* msg = event->msg; |
4227 | |
4228 | DEBUG("received paxos prepare nack from node %" PRIx64, src_nodeid); |
4229 | |
4230 | CLUSTERING_LOCK(); |
4231 | if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_PREPARE_SENT) { |
4232 | // We are not in the prepare phase. Reject this message. |
4233 | INFO("ignoring paxos prepare nack from node %" PRIx64" - we are not in prepare phase" , |
4234 | src_nodeid); |
4235 | goto Exit; |
4236 | } |
4237 | |
4238 | if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { |
4239 | WARNING("ignoring paxos prepare nack from node %" PRIx64" - it is not in acceptor list" , |
4240 | src_nodeid); |
4241 | goto Exit; |
4242 | } |
4243 | |
4244 | as_paxos_sequence_number sequence_number = 0; |
4245 | if (msg_sequence_number_get(msg, &sequence_number) != 0) { |
4246 | WARNING("ignoring paxos prepare nack from node %" PRIx64" with invalid proposal id" , |
4247 | src_nodeid); |
4248 | goto Exit; |
4249 | } |
4250 | |
4251 | if (sequence_number != g_proposer.sequence_number) { |
4252 | // Not a matching prepare nack message. Ignore. |
4253 | INFO("ignoring paxos prepare nack from node %" PRIx64" because its proposal id %" PRIu64" does not match expected id %" PRIu64, |
4254 | src_nodeid, sequence_number, |
4255 | g_proposer.sequence_number); |
4256 | goto Exit; |
4257 | } |
4258 | |
4259 | INFO( |
4260 | "aborting current paxos proposal because of a prepare nack from node %" PRIx64, |
4261 | src_nodeid); |
4262 | paxos_proposer_fail(); |
4263 | |
4264 | Exit: |
4265 | CLUSTERING_UNLOCK(); |
4266 | } |
4267 | |
4268 | /** |
4269 | * Invoked when all acceptors have accepted the proposal. |
4270 | */ |
4271 | static void |
4272 | paxos_proposer_success() |
4273 | { |
4274 | CLUSTERING_LOCK(); |
4275 | |
	// Set the proposer back to the idle state.
4277 | g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE; |
4278 | |
4279 | // Send out learn message and enable retransmits of learn message. |
4280 | g_proposer.learn_retransmit_needed = true; |
4281 | paxos_proposer_learn_send(); |
4282 | |
4283 | // Retain the sequence_number, cluster key and succession list for |
4284 | // retransmits of the learn message. |
4285 | as_clustering_internal_event paxos_success_event; |
4286 | memset(&paxos_success_event, 0, sizeof(paxos_success_event)); |
4287 | paxos_success_event.type = |
			AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_SUCCESS;
	internal_event_dispatch(&paxos_success_event);

	CLUSTERING_UNLOCK();
4291 | } |
4292 | |
4293 | /** |
 * Indicates if the proposer can process incoming accepted messages.
4295 | */ |
4296 | static bool |
4297 | paxos_proposer_can_accept_accepted(cf_node src_nodeid, msg* msg) |
4298 | { |
4299 | bool rv = false; |
4300 | |
4301 | CLUSTERING_LOCK(); |
4302 | // We also allow accepted messages in the idle state to deal with a loss of |
4303 | // the learn message. |
4304 | if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT |
4305 | && g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE) { |
4306 | // We are not in the accept phase. Reject this message. |
4307 | DEBUG("ignoring paxos accepted from node %" PRIx64" - we are not in accept phase. Actual phase %d" , |
4308 | src_nodeid, g_proposer.state); |
4309 | goto Exit; |
4310 | } |
4311 | |
4312 | if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { |
4313 | WARNING("ignoring paxos accepted from node %" PRIx64" - it is not in acceptor list" , |
4314 | src_nodeid); |
4315 | goto Exit; |
4316 | } |
4317 | |
4318 | as_paxos_sequence_number sequence_number = 0; |
4319 | if (msg_sequence_number_get(msg, &sequence_number) != 0) { |
4320 | WARNING("ignoring paxos accepted from node %" PRIx64" with invalid proposal id" , |
4321 | src_nodeid); |
4322 | goto Exit; |
4323 | } |
4324 | |
4325 | if (sequence_number != g_proposer.sequence_number) { |
4326 | // Not a matching accepted message. Ignore. |
4327 | INFO("ignoring paxos accepted from node %" PRIx64" because its proposal id %" PRIu64" does not match expected id %" PRIu64, |
4328 | src_nodeid, sequence_number, |
4329 | g_proposer.sequence_number); |
4330 | goto Exit; |
4331 | } |
4332 | |
4333 | if (g_proposer.proposed_value.cluster_key == g_register.cluster_key |
4334 | && vector_equals(&g_proposer.proposed_value.succession_list, |
4335 | &g_register.succession_list)) { |
4336 | // The register is already synced for this proposal. We can ignore this |
4337 | // accepted message. |
4338 | INFO("ignoring paxos accepted from node %" PRIx64" because its proposal id %" PRIu64" is a duplicate" , |
4339 | src_nodeid, sequence_number |
4340 | ); |
4341 | goto Exit; |
4342 | } |
4343 | |
4344 | rv = true; |
4345 | Exit: |
4346 | CLUSTERING_UNLOCK(); |
4347 | return rv; |
4348 | } |
4349 | |
4350 | /** |
4351 | * Handle an incoming paxos accepted message. |
4352 | */ |
4353 | static void |
4354 | paxos_proposer_accepted_handle(as_clustering_internal_event* event) |
4355 | { |
4356 | cf_node src_nodeid = event->msg_src_nodeid; |
4357 | msg* msg = event->msg; |
4358 | |
4359 | DEBUG("received paxos accepted from node %" PRIx64, src_nodeid); |
4360 | |
4361 | if (!paxos_proposer_can_accept_accepted(src_nodeid, msg)) { |
4362 | return; |
4363 | } |
4364 | |
4365 | CLUSTERING_LOCK(); |
4366 | |
4367 | cf_vector_append_unique(&g_proposer.accepted_received, &src_nodeid); |
4368 | |
4369 | int accepted_count = cf_vector_size(&g_proposer.accepted_received); |
4370 | int acceptor_count = cf_vector_size(&g_proposer.acceptors); |
4371 | |
	// Require all acceptors to accept for the proposal to succeed.
4373 | if (accepted_count == acceptor_count) { |
4374 | // This is the point after which the succession list will not change for |
4375 | // this paxos round. Ensure that we meet the minimum cluster size |
4376 | // criterion. |
4377 | int cluster_size = cf_vector_size( |
4378 | &g_proposer.proposed_value.succession_list); |
4379 | if (cluster_size < g_config.clustering_config.cluster_size_min) { |
4380 | WARNING( |
4381 | "failing paxos round - the remaining number of nodes %d is less than minimum cluster size %d" , |
4382 | cluster_size, g_config.clustering_config.cluster_size_min); |
4383 | // Fail paxos. |
4384 | paxos_proposer_fail(); |
4385 | goto Exit; |
4386 | } |
4387 | |
4388 | // We have quorum number of accepted nodes. The proposal succeeded. |
4389 | paxos_proposer_success(); |
4390 | } |
4391 | |
4392 | Exit: |
4393 | CLUSTERING_UNLOCK(); |
4394 | } |
4395 | |
4396 | /** |
4397 | * Handle an incoming paxos accept nack message. |
4398 | */ |
4399 | static void |
4400 | paxos_proposer_accept_nack_handle(as_clustering_internal_event* event) |
4401 | { |
4402 | cf_node src_nodeid = event->msg_src_nodeid; |
4403 | msg* msg = event->msg; |
4404 | |
4405 | DEBUG("received paxos accept nack from node %" PRIx64, src_nodeid); |
4406 | |
4407 | CLUSTERING_LOCK(); |
4408 | if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT) { |
4409 | // We are not in the accept phase. Reject this message. |
4410 | INFO("ignoring paxos accept nack from node %" PRIx64" - we are not in accept phase" , |
4411 | src_nodeid); |
4412 | goto Exit; |
4413 | } |
4414 | |
4415 | if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { |
4416 | WARNING("ignoring paxos accept nack from node %" PRIx64" - it is not in acceptor list" , |
4417 | src_nodeid); |
4418 | goto Exit; |
4419 | } |
4420 | |
4421 | as_paxos_sequence_number sequence_number = 0; |
4422 | if (msg_sequence_number_get(msg, &sequence_number) != 0) { |
4423 | WARNING("ignoring paxos accept nack from node %" PRIx64" with invalid proposal id" , |
4424 | src_nodeid); |
4425 | goto Exit; |
4426 | } |
4427 | |
4428 | if (sequence_number != g_proposer.sequence_number) { |
4429 | // Not a matching accept nack message. Ignore. |
		INFO("ignoring paxos accept nack from node %" PRIx64" because its proposal id %" PRIu64" does not match expected id %" PRIu64,
4431 | src_nodeid, sequence_number, |
4432 | g_proposer.sequence_number); |
4433 | goto Exit; |
4434 | } |
4435 | |
4436 | INFO( |
4437 | "aborting current paxos proposal because of an accept nack from node %" PRIx64, |
4438 | src_nodeid); |
4439 | paxos_proposer_fail(); |
4440 | |
4441 | Exit: |
4442 | CLUSTERING_UNLOCK(); |
4443 | } |
4444 | |
4445 | /** |
4446 | * Handle an incoming message. |
4447 | */ |
4448 | static void |
4449 | paxos_proposer_msg_event_handle(as_clustering_internal_event* msg_event) |
4450 | { |
4451 | switch (msg_event->msg_type) { |
4452 | case AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE: |
4453 | paxos_proposer_promise_handle(msg_event); |
4454 | break; |
4455 | case AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK: |
4456 | paxos_proposer_prepare_nack_handle(msg_event); |
4457 | break; |
4458 | case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED: |
4459 | paxos_proposer_accepted_handle(msg_event); |
4460 | break; |
4461 | case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK: |
4462 | paxos_proposer_accept_nack_handle(msg_event); |
4463 | break; |
4464 | default: // Other message types are not of interest. |
4465 | break; |
4466 | } |
4467 | } |
4468 | |
4469 | /** |
4470 | * Handle heartbeat event. |
4471 | */ |
4472 | static void |
4473 | paxos_proposer_hb_event_handle(as_clustering_internal_event* hb_event) |
4474 | { |
4475 | if (!paxos_proposer_proposal_is_active()) { |
4476 | return; |
4477 | } |
4478 | |
4479 | CLUSTERING_LOCK(); |
4480 | for (int i = 0; i < hb_event->hb_n_events; i++) { |
4481 | if (hb_event->hb_events[i].evt == AS_HB_NODE_DEPART) { |
4482 | cf_node departed_node = hb_event->hb_events[i].nodeid; |
			if (vector_find(&g_proposer.acceptors, &departed_node) >= 0) {
4484 | // One of the acceptors has departed. Abort the paxos proposal. |
4485 | INFO("paxos acceptor %" PRIx64" departed - aborting current paxos proposal" , departed_node); |
4486 | paxos_proposer_fail(); |
4487 | break; |
4488 | } |
4489 | } |
4490 | } |
4491 | CLUSTERING_UNLOCK(); |
4492 | } |
4493 | |
4494 | /** |
 * Check and retransmit the prepare message if paxos promise messages have not
 * yet been received.
4497 | */ |
4498 | static void |
4499 | paxos_proposer_prepare_check_retransmit() |
4500 | { |
4501 | CLUSTERING_LOCK(); |
4502 | cf_clock now = cf_getms(); |
4503 | if (g_proposer.state == AS_PAXOS_PROPOSER_STATE_PREPARE_SENT |
4504 | && g_proposer.prepare_send_time + paxos_msg_timeout() < now) { |
4505 | paxos_proposer_prepare_send(); |
4506 | } |
4507 | CLUSTERING_UNLOCK(); |
4508 | } |
4509 | |
4510 | /** |
 * Check and retransmit the accept message if paxos accepted messages have not
 * yet been received.
4512 | */ |
4513 | static void |
4514 | paxos_proposer_accept_check_retransmit() |
4515 | { |
4516 | CLUSTERING_LOCK(); |
4517 | cf_clock now = cf_getms(); |
4518 | if (g_proposer.state == AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT |
4519 | && g_proposer.accept_send_time + paxos_msg_timeout() < now) { |
4520 | paxos_proposer_accept_send(); |
4521 | } |
4522 | CLUSTERING_UNLOCK(); |
4523 | } |
4524 | |
4525 | /** |
4526 | * Check and retransmit learn message if all acceptors have not applied the |
4527 | * current cluster change. |
4528 | */ |
4529 | static void |
4530 | paxos_proposer_learn_check_retransmit() |
4531 | { |
4532 | CLUSTERING_LOCK(); |
4533 | cf_clock now = cf_getms(); |
4534 | bool learn_timedout = g_proposer.learn_retransmit_needed |
4535 | && (g_proposer.state == AS_PAXOS_PROPOSER_STATE_IDLE) |
4536 | && (g_proposer.proposed_value.cluster_key != 0) |
4537 | && (g_proposer.learn_send_time + paxos_msg_timeout() < now); |
4538 | |
4539 | if (learn_timedout) { |
		// If the register is not synced, most likely the learn message did not
		// make it through. Retransmit the learn message to move the paxos
		// acceptors forward and start the register sync.
4543 | INFO("retransmitting paxos learn message" ); |
4544 | paxos_proposer_learn_send(); |
4545 | } |
4546 | CLUSTERING_UNLOCK(); |
4547 | } |
4548 | |
4549 | /** |
4550 | * Handle a timer event and retransmit messages if required. |
4551 | */ |
4552 | static void |
4553 | paxos_proposer_timer_event_handle() |
4554 | { |
4555 | CLUSTERING_LOCK(); |
4556 | switch (g_proposer.state) { |
4557 | case AS_PAXOS_PROPOSER_STATE_IDLE: |
4558 | paxos_proposer_learn_check_retransmit(); |
4559 | break; |
4560 | case AS_PAXOS_PROPOSER_STATE_PREPARE_SENT: |
4561 | paxos_proposer_prepare_check_retransmit(); |
4562 | break; |
4563 | case AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT: |
4564 | paxos_proposer_accept_check_retransmit(); |
4565 | break; |
4566 | } |
4567 | CLUSTERING_UNLOCK(); |
4568 | } |
4569 | |
4570 | /** |
4571 | * Handle register getting synched. |
4572 | */ |
4573 | static void |
4574 | paxos_proposer_register_synched() |
4575 | { |
4576 | CLUSTERING_LOCK(); |
	// The register is synched - learn messages no longer need to be
	// retransmitted.
4578 | g_proposer.learn_retransmit_needed = false; |
4579 | CLUSTERING_UNLOCK(); |
4580 | } |
4581 | |
4582 | /** |
4583 | * Initialize paxos proposer state. |
4584 | */ |
4585 | static void |
4586 | paxos_proposer_init() |
4587 | { |
4588 | CLUSTERING_LOCK(); |
	// Memset to zero, which is the correct initial value for all proposer state
	// variables other than the contained vectors and the state, which are
	// initialized explicitly below.
4592 | memset(&g_proposer, 0, sizeof(g_proposer)); |
4593 | |
4594 | // Initialize the proposer state. |
4595 | // No paxos round running, so the state has to be idle. |
4596 | g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE; |
4597 | |
4598 | // Set the current acceptor list to be empty. |
4599 | vector_lockless_init(&g_proposer.acceptors, cf_node); |
4600 | |
4601 | // Set the current promises received node list to empty. |
4602 | vector_lockless_init(&g_proposer.promises_received, cf_node); |
4603 | |
4604 | // Set the current accepted received node list to empty. |
4605 | vector_lockless_init(&g_proposer.accepted_received, cf_node); |
4606 | |
4607 | // Initialize the proposed value. |
4608 | vector_lockless_init(&g_proposer.proposed_value.succession_list, cf_node); |
4609 | g_proposer.proposed_value.cluster_key = 0; |
4610 | |
4611 | CLUSTERING_UNLOCK(); |
4612 | } |
4613 | |
4614 | /** |
4615 | * Log paxos results. |
4616 | */ |
4617 | static void |
4618 | paxos_result_log(as_paxos_start_result result, cf_vector* new_succession_list) |
4619 | { |
4620 | CLUSTERING_LOCK(); |
4621 | switch (result) { |
4622 | case AS_PAXOS_RESULT_STARTED: { |
		// A running check is required because, for a single node cluster, the
		// paxos round may already have finished by this time.
4625 | if (paxos_proposer_proposal_is_active()) { |
4626 | INFO("paxos round started - cluster key: %" PRIx64, |
4627 | g_proposer.proposed_value.cluster_key); |
4628 | log_cf_node_vector("paxos round started - succession list:" , |
4629 | &g_proposer.proposed_value.succession_list, CF_INFO); |
4630 | } |
4631 | break; |
4632 | } |
4633 | |
4634 | case AS_PAXOS_RESULT_CLUSTER_TOO_SMALL: { |
4635 | WARNING( |
4636 | "paxos round aborted - new cluster size %d less than min cluster size %d" , |
4637 | cf_vector_size(new_succession_list), |
4638 | g_config.clustering_config.cluster_size_min); |
4639 | break; |
4640 | } |
4641 | |
4642 | case AS_PAXOS_RESULT_ROUND_RUNNING: { |
4643 | // Should never happen in practice. Let the old round finish or timeout. |
4644 | WARNING( |
				"older paxos round still running - should have finished by now" );
		break;
	}
4647 | } |
4648 | |
4649 | CLUSTERING_UNLOCK(); |
4650 | } |
4651 | |
4652 | /** |
4653 | * Start a new paxos round. |
4654 | * |
 * @param new_succession_list the new succession list.
 * @param acceptor_list the list of nodes to use as paxos acceptors.
 * @return the result of attempting to start the proposal.
4660 | */ |
4661 | static as_paxos_start_result |
4662 | paxos_proposer_proposal_start(cf_vector* new_succession_list, |
4663 | cf_vector* acceptor_list) |
4664 | { |
4665 | if (cf_vector_size(new_succession_list) |
4666 | < g_config.clustering_config.cluster_size_min) { |
4667 | // Fail paxos. |
4668 | return AS_PAXOS_RESULT_CLUSTER_TOO_SMALL; |
4669 | } |
4670 | |
4671 | CLUSTERING_LOCK(); |
4672 | |
4673 | as_paxos_start_result result; |
4674 | if (paxos_proposer_proposal_is_active()) { |
4675 | result = AS_PAXOS_RESULT_ROUND_RUNNING; |
4676 | goto Exit; |
4677 | } |
4678 | |
4679 | // Update state to prepare. |
4680 | g_proposer.state = AS_PAXOS_PROPOSER_STATE_PREPARE_SENT; |
4681 | |
4682 | g_proposer.sequence_number = as_hlc_timestamp_now(); |
4683 | |
4684 | g_proposer.paxos_round_start_time = cf_getms(); |
4685 | |
4686 | // Populate the proposed value struct with new succession list and a new |
4687 | // cluster key. |
4688 | vector_clear(&g_proposer.proposed_value.succession_list); |
4689 | vector_copy(&g_proposer.proposed_value.succession_list, |
4690 | new_succession_list); |
4691 | g_proposer.proposed_value.cluster_key = clustering_cluster_key_generate( |
4692 | g_register.cluster_key); |
4693 | |
4694 | // Remember the acceptors for this paxos round. |
4695 | vector_clear(&g_proposer.acceptors); |
4696 | vector_copy(&g_proposer.acceptors, acceptor_list); |
4697 | |
4698 | // Clear the promise received and accepted received vectors for this new |
4699 | // round. |
4700 | vector_clear(&g_proposer.promises_received); |
4701 | vector_clear(&g_proposer.accepted_received); |
4702 | |
4703 | paxos_proposer_prepare_send(); |
4704 | |
4705 | result = AS_PAXOS_RESULT_STARTED; |
4706 | |
4707 | Exit: |
4708 | CLUSTERING_UNLOCK(); |
4709 | |
4710 | return result; |
4711 | } |
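
/*
 * Illustrative sketch (not part of the build): how a paxos round is typically
 * kicked off by the principal. The surrounding orchestration - choosing the
 * candidate succession list and the acceptors - is an assumption made for
 * illustration; only paxos_proposer_proposal_start() and paxos_result_log()
 * are the functions defined here.
 *
 *	cf_vector* new_succession = vector_stack_lockless_create(cf_node);
 *	cf_vector* acceptors = vector_stack_lockless_create(cf_node);
 *
 *	// ... populate new_succession and acceptors, typically with the members
 *	// of the proposed succession list ...
 *
 *	as_paxos_start_result result =
 *			paxos_proposer_proposal_start(new_succession, acceptors);
 *	paxos_result_log(result, new_succession);
 *
 *	cf_vector_destroy(acceptors);
 *	cf_vector_destroy(new_succession);
 */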
4712 | |
4713 | /** |
 * Paxos proposer monitor to detect and clean up long-running and most likely
 * failed paxos rounds.
4716 | */ |
4717 | static void |
4718 | paxos_proposer_monitor() |
4719 | { |
4720 | CLUSTERING_LOCK(); |
4721 | if (paxos_proposer_proposal_is_active()) { |
4722 | if (g_proposer.paxos_round_start_time + paxos_proposal_timeout() |
4723 | <= cf_getms()) { |
4724 | // Paxos round is running and has timed out. |
4725 | // Consider paxos round failed. |
4726 | INFO("paxos round timed out for proposal id %" PRIx64":%" PRIu64, |
4727 | config_self_nodeid_get(), |
4728 | g_proposer.sequence_number); |
4729 | paxos_proposer_fail(); |
4730 | } |
4731 | } |
4732 | CLUSTERING_UNLOCK(); |
4733 | } |
4734 | |
4735 | /* |
4736 | * ---------------------------------------------------------------------------- |
4737 | * Paxos acceptor |
4738 | * ---------------------------------------------------------------------------- |
4739 | */ |
4740 | |
4741 | /** |
4742 | * Dump paxos acceptor state to logs. |
4743 | */ |
4744 | static void |
4745 | paxos_acceptor_dump(bool verbose) |
4746 | { |
4747 | CLUSTERING_LOCK(); |
4748 | |
4749 | // Output paxos acceptor state. |
4750 | switch (g_acceptor.state) { |
4751 | case AS_PAXOS_ACCEPTOR_STATE_IDLE: |
4752 | INFO("CL: paxos acceptor: idle" ); |
4753 | break; |
4754 | case AS_PAXOS_ACCEPTOR_STATE_PROMISED: |
4755 | INFO("CL: paxos acceptor: promised" ); |
4756 | break; |
4757 | case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: |
4758 | INFO("CL: paxos acceptor: accepted" ); |
4759 | break; |
4760 | } |
4761 | |
4762 | if (verbose) { |
4763 | if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_IDLE) { |
4764 | INFO("CL: paxos acceptor start time: %" PRIu64" now: %" PRIu64, |
4765 | g_acceptor.acceptor_round_start, cf_getms()); |
4766 | INFO("CL: paxos acceptor proposal id: (%" PRIx64":%" PRIu64")" , |
4767 | g_acceptor.last_proposal_received_id.src_nodeid, |
4768 | g_acceptor.last_proposal_received_id.sequence_number); |
4769 | INFO("CL: paxos acceptor promised time: %" PRIu64" now: %" PRIu64, |
4770 | g_acceptor.promise_send_time, cf_getms()); |
4771 | INFO("CL: paxos acceptor accepted time: %" PRIu64" now: %" PRIu64, |
4772 | g_acceptor.accepted_send_time, cf_getms()); |
4773 | } |
4774 | } |
4775 | |
4776 | CLUSTERING_UNLOCK(); |
4777 | } |
4778 | |
4779 | /** |
4780 | * Reset the acceptor for the next round. |
4781 | */ |
4782 | static void |
4783 | paxos_acceptor_reset() |
4784 | { |
4785 | CLUSTERING_LOCK(); |
4786 | g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_IDLE; |
4787 | g_acceptor.acceptor_round_start = 0; |
4788 | g_acceptor.promise_send_time = 0; |
4789 | g_acceptor.accepted_send_time = 0; |
4790 | CLUSTERING_UNLOCK(); |
4791 | } |
4792 | |
4793 | /** |
4794 | * Invoked to fail an ongoing paxos proposal. |
4795 | */ |
4796 | static void |
4797 | paxos_acceptor_fail() |
4798 | { |
4799 | // Cleanup state for the paxos round. |
4800 | paxos_acceptor_reset(); |
4801 | |
4802 | as_clustering_internal_event paxos_fail_event; |
4803 | memset(&paxos_fail_event, 0, sizeof(paxos_fail_event)); |
4804 | paxos_fail_event.type = AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_FAIL; |
4805 | |
4806 | internal_event_dispatch(&paxos_fail_event); |
4807 | } |
4808 | |
4809 | /** |
4810 | * Invoked on success of an ongoing paxos proposal. |
4811 | */ |
4812 | static void |
4813 | paxos_acceptor_success(as_cluster_key cluster_key, cf_vector* succession_list, |
4814 | as_paxos_sequence_number sequence_number) |
4815 | { |
4816 | // Cleanup state for the paxos round. |
4817 | paxos_acceptor_reset(); |
4818 | |
4819 | as_clustering_internal_event paxos_success_event; |
4820 | memset(&paxos_success_event, 0, sizeof(paxos_success_event)); |
4821 | paxos_success_event.type = |
4822 | AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS; |
4823 | paxos_success_event.new_succession_list = succession_list; |
4824 | paxos_success_event.new_cluster_key = cluster_key; |
4825 | paxos_success_event.new_sequence_number = sequence_number; |
4826 | |
4827 | internal_event_dispatch(&paxos_success_event); |
4828 | } |
4829 | |
4830 | /** |
4831 | * Send paxos promise message to the proposer node. |
4832 | * @param dest the destination node. |
4833 | * @param sequence_number the sequence number from the incoming message. |
4834 | */ |
4835 | static void |
4836 | paxos_acceptor_promise_send(cf_node dest, |
4837 | as_paxos_sequence_number sequence_number) |
4838 | { |
4839 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE); |
4840 | |
4841 | msg_sequence_number_set(msg, sequence_number); |
4842 | |
4843 | DEBUG("paxos promise message sent to node %" PRIx64" with proposal id (%" PRIx64":%" PRIu64")" , dest, dest, sequence_number); |
4844 | |
4845 | CLUSTERING_LOCK(); |
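	// Record the promise send time; the timer event uses it via
	// paxos_acceptor_promise_check_retransmit() to resend a potentially lost
	// promise.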
4846 | g_acceptor.promise_send_time = cf_getms(); |
4847 | CLUSTERING_UNLOCK(); |
4848 | |
4849 | // Send the message to the proposer. |
4850 | msg_node_send(msg, dest); |
4851 | } |
4852 | |
4853 | /** |
4854 | * Send paxos prepare nack message to the proposer. |
4855 | * @param dest the destination node. |
4856 | * @param sequence_number the sequence number from the incoming message. |
4857 | */ |
4858 | static void |
4859 | paxos_acceptor_prepare_nack_send(cf_node dest, |
4860 | as_paxos_sequence_number sequence_number) |
4861 | { |
4862 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK); |
4863 | |
4864 | msg_sequence_number_set(msg, sequence_number); |
4865 | |
4866 | DEBUG("paxos prepare nack message sent to node %" PRIx64" with proposal id (%" PRIx64":%" PRIu64")" , dest, dest, sequence_number); |
4867 | |
4868 | // Send the message to the proposer. |
4869 | msg_node_send(msg, dest); |
4870 | } |
4871 | |
4872 | /** |
4873 | * Send paxos accepted message to the proposer node. |
4874 | * @param dest the destination node. |
4875 | * @param sequence_number the sequence number from the incoming message. |
4876 | */ |
4877 | static void |
4878 | paxos_acceptor_accepted_send(cf_node dest, |
4879 | as_paxos_sequence_number sequence_number) |
4880 | { |
4881 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED); |
4882 | |
4883 | msg_sequence_number_set(msg, sequence_number); |
4884 | |
4885 | DEBUG("paxos accepted message sent to node %" PRIx64" with proposal id (%" PRIx64":%" PRIu64")" , dest, dest, sequence_number); |
4886 | |
4887 | CLUSTERING_LOCK(); |
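	// Record the accepted send time; the timer event uses it via
	// paxos_acceptor_accepted_check_retransmit() to resend if the learn
	// message does not arrive in time.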
4888 | g_acceptor.accepted_send_time = cf_getms(); |
4889 | CLUSTERING_UNLOCK(); |
4890 | |
4891 | // Send the message to the proposer. |
4892 | msg_node_send(msg, dest); |
4893 | } |
4894 | |
4895 | /** |
4896 | * Send paxos accept nack message to the proposer. |
4897 | * @param dest the destination node. |
4898 | * @param sequence_number the sequence number from the incoming message. |
4899 | */ |
4900 | static void |
4901 | paxos_acceptor_accept_nack_send(cf_node dest, |
4902 | as_paxos_sequence_number sequence_number) |
4903 | { |
4904 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK); |
4905 | |
4906 | msg_sequence_number_set(msg, sequence_number); |
4907 | |
4908 | DEBUG("paxos accept nack message sent to node %" PRIx64" with proposal id (%" PRIx64":%" PRIu64")" , dest, dest, sequence_number); |
4909 | |
4910 | // Send the message to the proposer. |
4911 | msg_node_send(msg, dest); |
4912 | } |
4913 | |
4914 | /** |
4915 | * Check if the incoming prepare can be promised. |
4916 | */ |
4917 | static bool |
4918 | paxos_acceptor_prepare_can_promise(cf_node src_nodeid, |
4919 | as_paxos_proposal_id* proposal_id) |
4920 | { |
4921 | if (!clustering_can_accept_as_proposer(src_nodeid)) { |
4922 | INFO("ignoring paxos prepare from node %" PRIx64" because it cannot be a principal" , |
4923 | src_nodeid); |
4924 | return false; |
4925 | } |
4926 | |
4927 | bool can_promise = false; |
4928 | CLUSTERING_LOCK(); |
4929 | int comparison = paxos_proposal_id_compare(proposal_id, |
4930 | &g_acceptor.last_proposal_received_id); |
4931 | |
4932 | switch (g_acceptor.state) { |
4933 | case AS_PAXOS_ACCEPTOR_STATE_IDLE: |
4934 | case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: { |
4935 | // Allow only higher valued proposal to prevent replays and also to |
4936 | // ensure convergence in the face of competing proposals. |
4937 | can_promise = comparison > 0; |
4938 | } |
4939 | break; |
4940 | case AS_PAXOS_ACCEPTOR_STATE_PROMISED: { |
4941 | // We allow for replays of the prepare message as well so that the |
4942 | // proposer can receive a promise for this node's lost promise message. |
4943 | can_promise = comparison >= 0; |
4944 | } |
4945 | break; |
4946 | } |
4947 | |
4948 | CLUSTERING_UNLOCK(); |
4949 | |
4950 | return can_promise; |
4951 | } |
4952 | |
4953 | /** |
4954 | * Handle an incoming paxos prepare message. |
4955 | */ |
4956 | static void |
4957 | paxos_acceptor_prepare_handle(as_clustering_internal_event* event) |
4958 | { |
4959 | cf_node src_nodeid = event->msg_src_nodeid; |
4960 | DEBUG("received paxos prepare from node %" PRIx64, src_nodeid); |
4961 | |
4962 | as_paxos_proposal_id proposal_id = { 0 }; |
4963 | if (msg_event_proposal_id_get(event, &proposal_id) != 0) { |
4964 | INFO("ignoring paxos prepare from node %" PRIx64" with invalid proposal id" , |
4965 | src_nodeid); |
4966 | return; |
4967 | } |
4968 | |
4969 | if (!paxos_acceptor_prepare_can_promise(src_nodeid, &proposal_id)) { |
4970 | INFO("ignoring paxos prepare from node %" PRIx64" with obsolete proposal id (%" PRIx64":%" PRIu64")" , proposal_id.src_nodeid, proposal_id.src_nodeid, proposal_id.sequence_number); |
4971 | paxos_acceptor_prepare_nack_send(src_nodeid, |
4972 | proposal_id.sequence_number); |
4973 | return; |
4974 | } |
4975 | |
4976 | CLUSTERING_LOCK(); |
4977 | |
4978 | bool is_new_proposal = paxos_proposal_id_compare(&proposal_id, |
4979 | &g_acceptor.last_proposal_received_id) != 0; |
4980 | |
4981 | if (is_new_proposal) { |
4982 | // Remember this to be the last proposal id we received. |
4983 | memcpy(&g_acceptor.last_proposal_received_id, &proposal_id, |
4984 | sizeof(proposal_id)); |
4985 | |
4986 | // Update the round start time. |
4987 | g_acceptor.acceptor_round_start = cf_getms(); |
4988 | |
4989 | // Switch to promised state. |
4990 | g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_PROMISED; |
4991 | } |
4992 | else { |
4993 | // This is a retransmit or delayed message in which case we do not |
4994 | // update the state. |
4995 | // If we have already accepted this proposal, we would want to remain in |
4996 | // accepted state. |
4997 | } |
4998 | |
4999 | // The proposal is promised. Send back a paxos promise. |
5000 | paxos_acceptor_promise_send(src_nodeid, proposal_id.sequence_number); |
5001 | |
5002 | CLUSTERING_UNLOCK(); |
5003 | } |
5004 | |
5005 | /** |
5006 | * Check if the incoming accept can be accepted. |
5007 | */ |
5008 | static bool |
5009 | paxos_acceptor_accept_can_accept(cf_node src_nodeid, |
5010 | as_paxos_proposal_id* proposal_id) |
5011 | { |
5012 | if (!clustering_can_accept_as_proposer(src_nodeid)) { |
5013 | INFO("ignoring paxos accept from node %" PRIx64" because it cannot be a principal" , |
5014 | src_nodeid); |
5015 | return false; |
5016 | } |
5017 | |
5018 | bool can_accept = false; |
5019 | CLUSTERING_LOCK(); |
5020 | int comparison = paxos_proposal_id_compare(proposal_id, |
5021 | &g_acceptor.last_proposal_received_id); |
5022 | |
5023 | switch (g_acceptor.state) { |
5024 | case AS_PAXOS_ACCEPTOR_STATE_IDLE: |
5025 | case AS_PAXOS_ACCEPTOR_STATE_PROMISED: |
5026 | case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: { |
5027 | // We allow for replays of the accept message as well, so that the |
5028 | // proposer can receive an accepted for this node's lost accepted |
5029 | // message. |
5030 | can_accept = comparison >= 0; |
5031 | } |
5032 | break; |
5033 | } |
5034 | |
5035 | CLUSTERING_UNLOCK(); |
5036 | |
5037 | return can_accept; |
5038 | } |
5039 | |
5040 | /** |
5041 | * Handle an incoming paxos accept message. |
5042 | */ |
5043 | static void |
5044 | paxos_acceptor_accept_handle(as_clustering_internal_event* event) |
5045 | { |
5046 | cf_node src_nodeid = event->msg_src_nodeid; |
5047 | |
5048 | DEBUG("received paxos accept from node %" PRIx64, src_nodeid); |
5049 | |
	// It's ok to proceed even if paxos is running, because this could be a
	// competing proposal and the winner will be decided by the paxos sequence
	// number.
5053 | as_paxos_proposal_id proposal_id = { 0 }; |
5054 | if (msg_event_proposal_id_get(event, &proposal_id) != 0) { |
5055 | INFO("ignoring paxos accept from node %" PRIx64" with invalid proposal id" , |
5056 | src_nodeid); |
5057 | return; |
5058 | } |
5059 | |
5060 | if (!paxos_acceptor_accept_can_accept(src_nodeid, &proposal_id)) { |
5061 | INFO("ignoring paxos accept from node %" PRIx64" with obsolete proposal id (%" PRIx64":%" PRIu64")" , proposal_id.src_nodeid, proposal_id.src_nodeid, proposal_id.sequence_number); |
5062 | paxos_acceptor_accept_nack_send(src_nodeid, |
5063 | proposal_id.sequence_number); |
5064 | return; |
5065 | } |
5066 | |
5067 | CLUSTERING_LOCK(); |
5068 | |
5069 | bool is_new_proposal = paxos_proposal_id_compare(&proposal_id, |
5070 | &g_acceptor.last_proposal_received_id) != 0; |
5071 | |
5072 | if (is_new_proposal) { |
5073 | // This node has missed the prepare message, but received the accept |
5074 | // message. This is alright. |
5075 | |
5076 | // Remember this to be the last proposal id we received. |
5077 | memcpy(&g_acceptor.last_proposal_received_id, &proposal_id, |
5078 | sizeof(proposal_id)); |
5079 | |
5080 | // Mark this as the start of the acceptor paxos round. |
5081 | g_acceptor.acceptor_round_start = cf_getms(); |
5082 | } |
5083 | |
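	// Move to the accepted state unconditionally - for a replayed accept this
	// is a no-op, and a prior promise gets promoted to an accept.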
5084 | g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_ACCEPTED; |
5085 | // The proposal is accepted. Send back a paxos accept. |
5086 | paxos_acceptor_accepted_send(src_nodeid, proposal_id.sequence_number); |
5087 | |
5088 | CLUSTERING_UNLOCK(); |
5089 | } |
5090 | |
5091 | /** |
5092 | * Handle an incoming paxos learn message. |
5093 | */ |
5094 | static void |
5095 | paxos_acceptor_learn_handle(as_clustering_internal_event* event) |
5096 | { |
5097 | cf_node src_nodeid = event->msg_src_nodeid; |
5098 | msg* msg = event->msg; |
5099 | |
5100 | DEBUG("received paxos learn from node %" PRIx64, src_nodeid); |
5101 | |
5102 | if (!clustering_can_accept_as_proposer(src_nodeid)) { |
5103 | INFO("ignoring learn message from a non-principal node %" PRIx64" because we are already in a cluster" , |
5104 | src_nodeid); |
5105 | return; |
5106 | } |
5107 | |
	// It's ok to proceed even if paxos is running, because this could be a
	// competing proposal and the winner was decided by the paxos sequence
	// number.
5110 | as_paxos_proposal_id proposal_id = { 0 }; |
5111 | if (msg_event_proposal_id_get(event, &proposal_id) != 0) { |
		INFO("ignoring paxos learn from node %" PRIx64" with invalid proposal id",
5113 | src_nodeid); |
5114 | return; |
5115 | } |
5116 | |
5117 | CLUSTERING_LOCK(); |
5118 | |
5119 | if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_ACCEPTED) { |
		INFO(
				"ignoring paxos learn from node %" PRIx64" - proposal id (%" PRIx64":%" PRIu64") as the acceptor has not accepted a proposal",
5122 | src_nodeid, proposal_id.src_nodeid, |
5123 | proposal_id.sequence_number); |
5124 | goto Exit; |
5125 | } |
5126 | |
5127 | if (paxos_proposal_id_compare(&proposal_id, |
5128 | &g_acceptor.last_proposal_received_id) != 0) { |
5129 | // We have not promised nor accepted this proposal, |
5130 | // ignore the learn message. |
5131 | INFO( |
5132 | "ignoring paxos learn from node %" PRIx64" - proposal id (%" PRIx64":%" PRIu64") mismatches current proposal id (%" PRIx64":%" PRIu64")" , |
5133 | src_nodeid, proposal_id.src_nodeid, |
5134 | proposal_id.sequence_number, |
5135 | g_acceptor.last_proposal_received_id.src_nodeid, |
5136 | g_acceptor.last_proposal_received_id.sequence_number); |
5137 | goto Exit; |
5138 | } |
5139 | |
5140 | as_cluster_key new_cluster_key = 0; |
5141 | cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); |
5142 | |
5143 | if (msg_cluster_key_get(msg, &new_cluster_key) != 0) { |
5144 | INFO("ignoring paxos learn from node %" PRIx64" without cluster key" , |
5145 | src_nodeid); |
		goto Exit_destroy_succession;
5147 | } |
5148 | |
5149 | if (msg_succession_list_get(msg, new_succession_list) != 0) { |
5150 | INFO("ignoring paxos learn from node %" PRIx64" without succession list" , |
5151 | src_nodeid); |
		goto Exit_destroy_succession;
5153 | } |
5154 | |
5155 | if (new_cluster_key == g_register.cluster_key) { |
5156 | if (!vector_equals(new_succession_list, &g_register.succession_list)) { |
5157 | // We have the same cluster key repeated for a new round. Should |
5158 | // never happen. |
5159 | CRASH("duplicate cluster key %" PRIx64" generated for different paxos rounds - disastrous" , new_cluster_key); |
5160 | } |
5161 | |
5162 | INFO("ignoring duplicate paxos learn from node %" PRIx64, src_nodeid); |
		goto Exit_destroy_succession;
5164 | } |
5165 | |
5166 | // Paxos round converged, apply the new cluster configuration. |
5167 | paxos_acceptor_success(new_cluster_key, new_succession_list, |
5168 | proposal_id.sequence_number); |
5169 | |
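	// Note: paxos_acceptor_success() dispatches synchronously and listeners
	// copy the succession list, so destroying the stack allocated vector here
	// is safe on all paths.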
Exit_destroy_succession:
5171 | cf_vector_destroy(new_succession_list); |
5172 | |
5173 | Exit: |
5174 | CLUSTERING_UNLOCK(); |
5175 | } |
5176 | |
5177 | /** |
5178 | * Handle an incoming message. |
5179 | */ |
5180 | static void |
5181 | paxos_acceptor_msg_event_handle(as_clustering_internal_event *msg_event) |
5182 | { |
5183 | switch (msg_event->msg_type) { |
5184 | case AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE: |
5185 | paxos_acceptor_prepare_handle(msg_event); |
5186 | break; |
5187 | case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT: |
5188 | paxos_acceptor_accept_handle(msg_event); |
5189 | break; |
5190 | case AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN: |
5191 | paxos_acceptor_learn_handle(msg_event); |
5192 | break; |
5193 | default: // Other message types are not of interest. |
5194 | break; |
5195 | } |
5196 | } |
5197 | |
5198 | /** |
 * Check and retransmit the promise message if the paxos proposer has not moved
 * ahead and sent back an accept message.
5201 | */ |
5202 | static void |
5203 | paxos_acceptor_promise_check_retransmit() |
5204 | { |
5205 | CLUSTERING_LOCK(); |
5206 | cf_clock now = cf_getms(); |
5207 | if (g_acceptor.state == AS_PAXOS_ACCEPTOR_STATE_PROMISED |
5208 | && g_acceptor.promise_send_time + paxos_msg_timeout() < now) { |
5209 | paxos_acceptor_promise_send( |
5210 | g_acceptor.last_proposal_received_id.src_nodeid, |
5211 | g_acceptor.last_proposal_received_id.sequence_number); |
5212 | } |
5213 | CLUSTERING_UNLOCK(); |
5214 | } |
5215 | |
5216 | /** |
 * Check and retransmit the accepted message if the paxos proposer has not sent
 * back a learn message.
5219 | */ |
5220 | static void |
5221 | paxos_acceptor_accepted_check_retransmit() |
5222 | { |
5223 | CLUSTERING_LOCK(); |
5224 | cf_clock now = cf_getms(); |
5225 | if (g_acceptor.state == AS_PAXOS_ACCEPTOR_STATE_ACCEPTED |
5226 | && g_acceptor.accepted_send_time + paxos_msg_timeout() < now) { |
5227 | paxos_acceptor_accepted_send( |
5228 | g_acceptor.last_proposal_received_id.src_nodeid, |
5229 | g_acceptor.last_proposal_received_id.sequence_number); |
5230 | } |
5231 | CLUSTERING_UNLOCK(); |
5232 | } |
5233 | |
5234 | /** |
5235 | * Handle a timer event and retransmit messages if required. |
5236 | */ |
5237 | static void |
5238 | paxos_acceptor_timer_event_handle() |
5239 | { |
5240 | CLUSTERING_LOCK(); |
5241 | switch (g_acceptor.state) { |
5242 | case AS_PAXOS_ACCEPTOR_STATE_IDLE: { |
		// No retransmits required.
5244 | break; |
5245 | } |
5246 | case AS_PAXOS_ACCEPTOR_STATE_PROMISED: |
5247 | paxos_acceptor_promise_check_retransmit(); |
5248 | break; |
5249 | case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: |
5250 | paxos_acceptor_accepted_check_retransmit(); |
5251 | break; |
5252 | } |
5253 | |
5254 | CLUSTERING_UNLOCK(); |
5255 | } |
5256 | |
5257 | /** |
5258 | * Initialize paxos acceptor state. |
5259 | */ |
5260 | static void |
5261 | paxos_acceptor_init() |
5262 | { |
5263 | CLUSTERING_LOCK(); |
	// Memset to zero, which is the correct initial value for all acceptor
	// state variables other than contained vectors and the state, which is
	// set explicitly below.
5267 | memset(&g_acceptor, 0, sizeof(g_acceptor)); |
5268 | g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_IDLE; |
5269 | CLUSTERING_UNLOCK(); |
5270 | } |
5271 | |
5272 | /** |
 * Paxos acceptor monitor to detect and clean up long-running and most likely
 * failed paxos rounds.
5275 | */ |
5276 | static void |
5277 | paxos_acceptor_monitor() |
5278 | { |
5279 | CLUSTERING_LOCK(); |
5280 | if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_IDLE |
5281 | && g_acceptor.acceptor_round_start + paxos_proposal_timeout() |
5282 | <= cf_getms()) { |
5283 | // Paxos round is running and has timed out. |
5284 | // Consider paxos round failed. |
		INFO("paxos round timed out for proposal id (%" PRIx64":%" PRIu64")",
				g_acceptor.last_proposal_received_id.src_nodeid,
				g_acceptor.last_proposal_received_id.sequence_number);
5288 | paxos_acceptor_fail(); |
5289 | } |
5290 | CLUSTERING_UNLOCK(); |
5291 | } |
5292 | |
5293 | /* |
5294 | * ---------------------------------------------------------------------------- |
5295 | * Paxos lifecycle and common event handling |
5296 | * ---------------------------------------------------------------------------- |
5297 | */ |
5298 | |
5299 | /** |
 * Paxos monitor to detect and clean up long-running and most likely failed
 * paxos rounds.
5302 | */ |
5303 | static void |
5304 | paxos_monitor() |
5305 | { |
5306 | paxos_proposer_monitor(); |
5307 | paxos_acceptor_monitor(); |
5308 | } |
5309 | |
5310 | /** |
5311 | * Handle an incoming timer event. |
5312 | */ |
5313 | static void |
5314 | paxos_timer_event_handle() |
5315 | { |
5316 | // Acceptor retransmits handled here. |
5317 | paxos_acceptor_timer_event_handle(); |
5318 | |
5319 | // Proposer retransmits handled here. |
5320 | paxos_proposer_timer_event_handle(); |
5321 | |
5322 | // Invoke Paxos monitor to timeout long running paxos rounds. |
5323 | paxos_monitor(); |
5324 | } |
5325 | |
5326 | /** |
5327 | * Handle incoming messages. |
5328 | */ |
5329 | static void |
5330 | paxos_msg_event_handle(as_clustering_internal_event* msg_event) |
5331 | { |
5332 | paxos_acceptor_msg_event_handle(msg_event); |
5333 | paxos_proposer_msg_event_handle(msg_event); |
5334 | } |
5335 | |
5336 | /** |
5337 | * Handle heartbeat event. |
5338 | */ |
5339 | static void |
5340 | paxos_hb_event_handle(as_clustering_internal_event* hb_event) |
5341 | { |
5342 | paxos_proposer_hb_event_handle(hb_event); |
5343 | } |
5344 | |
5345 | /** |
5346 | * Dispatch clustering events. |
5347 | */ |
5348 | static void |
5349 | paxos_event_dispatch(as_clustering_internal_event* event) |
5350 | { |
5351 | switch (event->type) { |
5352 | case AS_CLUSTERING_INTERNAL_EVENT_TIMER: |
5353 | paxos_timer_event_handle(); |
5354 | break; |
5355 | case AS_CLUSTERING_INTERNAL_EVENT_MSG: |
5356 | paxos_msg_event_handle(event); |
5357 | break; |
5358 | case AS_CLUSTERING_INTERNAL_EVENT_HB: |
5359 | paxos_hb_event_handle(event); |
5360 | break; |
	case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED:
		paxos_proposer_register_synched();
		break;
	default: // Not of interest for paxos.
5364 | break; |
5365 | } |
5366 | } |
5367 | |
5368 | /** |
5369 | * Initialize paxos proposer and acceptor data structures. |
5370 | */ |
5371 | static void |
5372 | paxos_init() |
5373 | { |
5374 | paxos_proposer_init(); |
5375 | paxos_acceptor_init(); |
5376 | } |
5377 | |
5378 | /* |
5379 | * ---------------------------------------------------------------------------- |
5380 | * Clustering external event publisher |
5381 | * ---------------------------------------------------------------------------- |
5382 | */ |
5383 | |
5384 | /** |
 * Check if the event publisher is running.
5386 | */ |
5387 | static bool |
5388 | external_event_publisher_is_running() |
5389 | { |
5390 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5391 | bool running = g_external_event_publisher.sys_state |
5392 | == AS_CLUSTERING_SYS_STATE_RUNNING; |
5393 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5394 | return running; |
5395 | } |
5396 | |
5397 | /** |
5398 | * Initialize the event publisher. |
5399 | */ |
5400 | static void |
5401 | external_event_publisher_init() |
5402 | { |
5403 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5404 | memset(&g_external_event_publisher, 0, sizeof(g_external_event_publisher)); |
5405 | vector_lockless_init(&g_external_event_publisher.published_succession_list, |
5406 | cf_node); |
5407 | |
5408 | pthread_mutex_init(&g_external_event_publisher.is_pending_mutex, NULL); |
5409 | pthread_cond_init(&g_external_event_publisher.is_pending, NULL); |
5410 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5411 | } |
5412 | |
5413 | /** |
5414 | * Wakeup the publisher thread. |
5415 | */ |
5416 | static void |
5417 | external_event_publisher_thr_wakeup() |
5418 | { |
5419 | pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex); |
5420 | pthread_cond_signal(&g_external_event_publisher.is_pending); |
5421 | pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex); |
5422 | } |
5423 | |
5424 | /** |
 * Queue up an external event to publish.
5426 | */ |
5427 | static void |
5428 | external_event_queue(as_clustering_event* event) |
5429 | { |
5430 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5431 | memcpy(&g_external_event_publisher.to_publish, event, |
5432 | sizeof(g_external_event_publisher.to_publish)); |
5433 | |
5434 | vector_clear(&g_external_event_publisher.published_succession_list); |
5435 | if (event->succession_list) { |
		// Use the static list for the published event, so that the input event
		// object can be destroyed irrespective of when it is published.
5438 | vector_copy(&g_external_event_publisher.published_succession_list, |
5439 | event->succession_list); |
5440 | g_external_event_publisher.to_publish.succession_list = |
5441 | &g_external_event_publisher.published_succession_list; |
5442 | |
5443 | } |
5444 | |
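	// Only one event is buffered - a newer event queued before the publisher
	// runs overwrites the previous one.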
5445 | g_external_event_publisher.event_queued = true; |
5446 | |
5447 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5448 | |
5449 | // Wake up the publisher thread. |
5450 | external_event_publisher_thr_wakeup(); |
5451 | } |
5452 | |
5453 | /** |
5454 | * Publish external events if any are pending. |
5455 | */ |
5456 | static void |
5457 | external_events_publish() |
5458 | { |
5459 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5460 | |
5461 | if (g_external_event_publisher.event_queued) { |
5462 | g_external_event_publisher.event_queued = false; |
5463 | exchange_clustering_event_listener( |
5464 | &g_external_event_publisher.to_publish); |
5465 | } |
5466 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5467 | } |
5468 | |
5469 | /** |
5470 | * External event publisher thread. |
5471 | */ |
5472 | static void* |
5473 | external_event_publisher_thr(void* arg) |
5474 | { |
5475 | pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex); |
5476 | |
5477 | while (true) { |
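		// Wait to be signalled. A spurious wakeup is harmless because
		// external_events_publish() publishes only when an event is actually
		// queued.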
5478 | pthread_cond_wait(&g_external_event_publisher.is_pending, |
5479 | &g_external_event_publisher.is_pending_mutex); |
5480 | if (external_event_publisher_is_running()) { |
5481 | external_events_publish(); |
5482 | } |
5483 | else { |
			// Publisher stopped, exit the thread.
5485 | break; |
5486 | } |
5487 | } |
5488 | |
5489 | pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex); |
5490 | return NULL; |
5491 | } |
5492 | |
5493 | /** |
5494 | * Start the event publisher. |
5495 | */ |
5496 | static void |
5497 | external_event_publisher_start() |
5498 | { |
5499 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5500 | g_external_event_publisher.sys_state = AS_CLUSTERING_SYS_STATE_RUNNING; |
5501 | g_external_event_publisher.event_publisher_tid = |
5502 | cf_thread_create_joinable(external_event_publisher_thr, NULL); |
5503 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5504 | } |
5505 | |
5506 | /** |
5507 | * Stop the event publisher. |
5508 | */ |
5509 | static void |
5510 | external_event_publisher_stop() |
5511 | { |
5512 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5513 | g_external_event_publisher.sys_state = |
5514 | AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN; |
5515 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5516 | |
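	// Wake the publisher thread so that it observes the state change and
	// exits, then wait for it to terminate.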
5517 | external_event_publisher_thr_wakeup(); |
5518 | cf_thread_join(g_external_event_publisher.event_publisher_tid); |
5519 | |
5520 | CLUSTERING_EVENT_PUBLISHER_LOCK(); |
5521 | g_external_event_publisher.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED; |
5522 | g_external_event_publisher.event_queued = false; |
5523 | CLUSTERING_EVENT_PUBLISHER_UNLOCK(); |
5524 | } |
5525 | |
5526 | /* |
5527 | * ---------------------------------------------------------------------------- |
5528 | * Clustering register |
5529 | * ---------------------------------------------------------------------------- |
5530 | */ |
5531 | |
5532 | /** |
5533 | * Dump register state to logs. |
5534 | */ |
5535 | static void |
5536 | register_dump(bool verbose) |
5537 | { |
5538 | CLUSTERING_LOCK(); |
5539 | |
5540 | // Output register state. |
5541 | switch (g_register.state) { |
5542 | case AS_CLUSTERING_REGISTER_STATE_SYNCED: |
5543 | INFO("CL: register: synced" ); |
5544 | break; |
5545 | case AS_CLUSTERING_REGISTER_STATE_SYNCING: |
5546 | INFO("CL: register: syncing" ); |
5547 | break; |
5548 | } |
5549 | |
5550 | // Cluster state details. |
5551 | INFO("CL: cluster changed at: %" PRIu64" now: %" PRIu64, |
5552 | g_register.cluster_modified_time, cf_getms()); |
5553 | |
5554 | INFO("CL: cluster key: %" PRIx64, g_register.cluster_key); |
5555 | INFO("CL: cluster sequence: %" PRIu64, g_register.sequence_number); |
5556 | INFO("CL: cluster size: %d" , cf_vector_size(&g_register.succession_list)); |
5557 | |
5558 | if (verbose) { |
5559 | log_cf_node_vector("CL: succession:" , &g_register.succession_list, |
5560 | CF_INFO); |
5561 | } |
5562 | |
5563 | CLUSTERING_UNLOCK(); |
5564 | } |
5565 | |
5566 | /** |
5567 | * Initialize the register. |
5568 | */ |
5569 | static void |
5570 | register_init() |
5571 | { |
5572 | CLUSTERING_LOCK(); |
5573 | memset(&g_register, 0, sizeof(g_register)); |
5574 | vector_lockless_init(&g_register.succession_list, cf_node); |
5575 | vector_lockless_init(&g_register.sync_pending, cf_node); |
5576 | vector_lockless_init(&g_register.ooo_change_applied_received, cf_node); |
5577 | vector_lockless_init(&g_register.ooo_succession_list, cf_node); |
5578 | |
	// We start in the orphan state, which is treated as the synced state.
5580 | g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED; |
5581 | CLUSTERING_UNLOCK(); |
5582 | } |
5583 | |
5584 | /** |
5585 | * Returns true if register sync is pending. |
5586 | */ |
5587 | static bool |
5588 | register_is_sycn_pending() |
5589 | { |
5590 | CLUSTERING_LOCK(); |
5591 | bool sync_pending = cf_vector_size(&g_register.sync_pending) > 0; |
5592 | log_cf_node_vector("pending register sync:" , &g_register.sync_pending, |
5593 | CF_DETAIL); |
5594 | CLUSTERING_UNLOCK(); |
5595 | return sync_pending; |
5596 | } |
5597 | |
5598 | /** |
5599 | * Check if the register is synced across the cluster and move to sync state if |
5600 | * it is synced. |
5601 | */ |
5602 | static void |
5603 | register_check_and_switch_synced() |
5604 | { |
5605 | CLUSTERING_LOCK(); |
5606 | if (!register_is_sycn_pending() |
5607 | && g_register.state != AS_CLUSTERING_REGISTER_STATE_SYNCED) { |
5608 | g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED; |
5609 | // Generate internal cluster changed synced. |
5610 | as_clustering_internal_event cluster_synced; |
5611 | memset(&cluster_synced, 0, sizeof(cluster_synced)); |
5612 | cluster_synced.type = |
5613 | AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED; |
5614 | internal_event_dispatch(&cluster_synced); |
5615 | } |
5616 | CLUSTERING_UNLOCK(); |
5617 | } |
5618 | |
5619 | /** |
5620 | * Update register to become an orphan node. |
5621 | */ |
5622 | static void |
5623 | register_become_orphan(as_clustering_event_qualifier qualifier) |
5624 | { |
5625 | CLUSTERING_LOCK(); |
5626 | g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED; |
5627 | g_register.cluster_key = 0; |
5628 | g_register.sequence_number = 0; |
5629 | g_register.has_orphan_transitioned = true; |
5630 | g_clustering.has_integrity = false; |
5631 | vector_clear(&g_register.succession_list); |
5632 | vector_clear(&g_register.sync_pending); |
5633 | |
5634 | g_register.cluster_modified_time = cf_getms(); |
5635 | g_register.cluster_modified_hlc_ts = as_hlc_timestamp_now(); |
5636 | |
5637 | // Queue internal orphaned event. |
5638 | as_clustering_internal_event orphaned_event; |
5639 | memset(&orphaned_event, 0, sizeof(orphaned_event)); |
5640 | orphaned_event.type = AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED; |
5641 | orphaned_event.qualifier = qualifier; |
5642 | internal_event_dispatch(&orphaned_event); |
5643 | |
5644 | CLUSTERING_UNLOCK(); |
5645 | |
5646 | INFO("moved self node to orphan state" ); |
5647 | } |
5648 | |
5649 | /** |
5650 | * Handle timer event in the syncing state. |
5651 | */ |
5652 | static void |
5653 | register_syncing_timer_event_handle() |
5654 | { |
5655 | CLUSTERING_LOCK(); |
5656 | cf_clock now = cf_getms(); |
5657 | if (g_register.last_sync_check_time + register_sync_check_interval() |
5658 | > now) { |
5659 | // Give more time before checking for sync. |
5660 | goto Exit; |
5661 | } |
5662 | |
5663 | if (register_is_sycn_pending()) { |
5664 | // Update pending nodes based on heartbeat status. |
5665 | int num_pending = cf_vector_size(&g_register.sync_pending); |
5666 | for (int i = 0; i < num_pending; i++) { |
5667 | cf_node pending; |
5668 | cf_vector_get(&g_register.sync_pending, i, &pending); |
5669 | if (clustering_node_is_sync(pending)) { |
5670 | cf_vector_delete(&g_register.sync_pending, i); |
5671 | |
5672 | // Compensate the index for the delete. |
5673 | i--; |
5674 | |
5675 | // Adjust vector size. |
5676 | num_pending--; |
5677 | } |
5678 | } |
5679 | } |
5680 | |
5681 | register_check_and_switch_synced(); |
5682 | |
5683 | Exit: |
5684 | CLUSTERING_UNLOCK(); |
5685 | } |
5686 | |
5687 | /** |
5688 | * Send cluster change applied message to all cluster members. |
5689 | */ |
5690 | static void |
5691 | register_cluster_change_applied_msg_send() |
5692 | { |
5693 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED); |
5694 | |
5695 | CLUSTERING_LOCK(); |
5696 | |
5697 | // Set the cluster key. |
5698 | msg_cluster_key_set(msg, g_register.cluster_key); |
5699 | |
5700 | // Set the succession list. |
5701 | msg_succession_list_set(msg, &g_register.succession_list); |
5702 | |
5703 | log_cf_node_vector("cluster change applied message sent to:" , |
5704 | &g_register.succession_list, CF_DEBUG); |
5705 | |
5706 | cf_vector* members = vector_stack_lockless_create(cf_node); |
5707 | vector_copy(members, &g_register.succession_list); |
5708 | |
5709 | CLUSTERING_UNLOCK(); |
5710 | |
	// Send the message to the cluster members.
5712 | msg_nodes_send(msg, members); |
5713 | cf_vector_destroy(members); |
5714 | } |
5715 | |
5716 | /** |
 * Validate cluster state. For now, ensure the cluster size is at least the
 * configured minimum cluster size.
5719 | */ |
5720 | static void |
5721 | register_validate_cluster() |
5722 | { |
5723 | CLUSTERING_LOCK(); |
5724 | int cluster_size = cf_vector_size(&g_register.succession_list); |
5725 | if (!clustering_is_orphan() |
5726 | && cluster_size < g_config.clustering_config.cluster_size_min) { |
5727 | WARNING( |
5728 | "cluster size %d less than required minimum size %d - switching to orphan state" , |
5729 | cluster_size, g_config.clustering_config.cluster_size_min); |
		register_become_orphan(AS_CLUSTERING_MEMBERSHIP_LOST);
5731 | } |
5732 | CLUSTERING_UNLOCK(); |
5733 | } |
5734 | |
5735 | /** |
5736 | * Handle a timer event for the register. |
5737 | */ |
5738 | static void |
5739 | register_timer_event_handle() |
5740 | { |
5741 | CLUSTERING_LOCK(); |
5742 | switch (g_register.state) { |
5743 | case AS_CLUSTERING_REGISTER_STATE_SYNCED: |
5744 | register_validate_cluster(); |
5745 | break; |
5746 | case AS_CLUSTERING_REGISTER_STATE_SYNCING: |
5747 | register_syncing_timer_event_handle(); |
5748 | break; |
5749 | } |
5750 | CLUSTERING_UNLOCK(); |
5751 | } |
5752 | |
5753 | /** |
5754 | * Handle paxos round succeeding. |
5755 | */ |
5756 | static void |
5757 | register_paxos_acceptor_success_handle( |
5758 | as_clustering_internal_event* paxos_success_event) |
5759 | { |
5760 | CLUSTERING_LOCK(); |
5761 | |
5762 | g_register.has_orphan_transitioned = false; |
5763 | |
5764 | g_register.cluster_key = paxos_success_event->new_cluster_key; |
5765 | g_register.sequence_number = paxos_success_event->new_sequence_number; |
5766 | |
5767 | vector_clear(&g_register.succession_list); |
5768 | vector_copy(&g_register.succession_list, |
5769 | paxos_success_event->new_succession_list); |
5770 | |
5771 | // Update the timestamps as the register has changed its contents. |
5772 | g_register.cluster_modified_time = cf_getms(); |
5773 | g_register.cluster_modified_hlc_ts = as_hlc_timestamp_now(); |
5774 | |
5775 | // Initialize pending list with all cluster members. |
5776 | g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCING; |
5777 | vector_clear(&g_register.sync_pending); |
5778 | vector_copy(&g_register.sync_pending, &g_register.succession_list); |
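	// Every cluster member must acknowledge the new cluster with a change
	// applied message; the register remains in the syncing state until
	// sync_pending is drained (see register_syncing_timer_event_handle()).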
5780 | |
5781 | if (g_register.cluster_key == g_register.ooo_cluster_key |
5782 | && vector_equals(&g_register.succession_list, |
5783 | &g_register.ooo_succession_list)) { |
		// We have already received change applied messages from these nodes;
		// account for them.
5786 | vector_subtract(&g_register.sync_pending, |
5787 | &g_register.ooo_change_applied_received); |
5788 | } |
5789 | vector_clear(&g_register.ooo_change_applied_received); |
5790 | vector_clear(&g_register.ooo_succession_list); |
5791 | g_register.ooo_cluster_key = 0; |
5792 | g_register.ooo_hlc_timestamp = 0; |
5793 | |
5794 | INFO("applied new cluster key %" PRIx64, |
5795 | paxos_success_event->new_cluster_key); |
5796 | log_cf_node_vector("applied new succession list" , |
5797 | &g_register.succession_list, CF_INFO); |
5798 | INFO("applied cluster size %d" , |
5799 | cf_vector_size(&g_register.succession_list)); |
5800 | |
5801 | as_clustering_internal_event cluster_changed; |
5802 | memset(&cluster_changed, 0, sizeof(cluster_changed)); |
5803 | cluster_changed.type = |
5804 | AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED; |
5805 | internal_event_dispatch(&cluster_changed); |
5806 | |
	// Send the change applied message. It's alright even if it arrives out of
	// order.
5808 | register_cluster_change_applied_msg_send(); |
5809 | |
5810 | CLUSTERING_UNLOCK(); |
5811 | } |
5812 | |
5813 | /** |
5814 | * Handle incoming cluster change applied message. |
5815 | */ |
5816 | static void |
5817 | register_cluster_change_applied_msg_handle( |
5818 | as_clustering_internal_event* msg_event) |
5819 | { |
5820 | CLUSTERING_LOCK(); |
5821 | as_cluster_key msg_cluster_key = 0; |
5822 | msg_cluster_key_get(msg_event->msg, &msg_cluster_key); |
5823 | cf_vector *msg_succession_list = vector_stack_lockless_create(cf_node); |
5824 | msg_succession_list_get(msg_event->msg, msg_succession_list); |
5825 | as_hlc_timestamp msg_hlc_timestamp = 0; |
5826 | msg_send_ts_get(msg_event->msg, &msg_hlc_timestamp); |
5827 | |
5828 | DEBUG("received cluster change applied message from node %" PRIx64, |
5829 | msg_event->msg_src_nodeid); |
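	// A change applied message can arrive before this node has applied the
	// corresponding cluster change (out of order). Such messages are tracked
	// in the ooo fields and accounted for in
	// register_paxos_acceptor_success_handle().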
5830 | if (g_register.cluster_key == msg_cluster_key |
5831 | && vector_equals(&g_register.succession_list, |
5832 | msg_succession_list)) { |
5833 | // This is a matching change applied message. |
5834 | int found_at = 0; |
5835 | if ((found_at = vector_find(&g_register.sync_pending, |
5836 | &msg_event->msg_src_nodeid)) >= 0) { |
5837 | // Remove from the pending list. |
5838 | cf_vector_delete(&g_register.sync_pending, found_at); |
5839 | } |
5840 | |
5841 | } |
5842 | else if (g_register.ooo_cluster_key == msg_cluster_key |
5843 | && vector_equals(&g_register.ooo_succession_list, |
5844 | msg_succession_list)) { |
5845 | DEBUG("received ooo cluster change applied message from node %" PRIx64" with cluster key %" PRIx64, msg_event->msg_src_nodeid, msg_cluster_key); |
5846 | cf_vector_append_unique(&g_register.ooo_change_applied_received, |
5847 | &msg_event->msg_src_nodeid); |
5848 | |
5849 | } |
5850 | else if (g_register.ooo_hlc_timestamp < msg_hlc_timestamp) { |
5851 | // Prefer a later version of OOO message. |
5852 | g_register.ooo_cluster_key = msg_cluster_key; |
5853 | g_register.ooo_hlc_timestamp = msg_hlc_timestamp; |
5854 | vector_clear(&g_register.ooo_succession_list); |
5855 | vector_copy(&g_register.ooo_succession_list, msg_succession_list); |
5856 | vector_clear(&g_register.ooo_change_applied_received); |
5857 | cf_vector_append_unique(&g_register.ooo_change_applied_received, |
5858 | &msg_event->msg_src_nodeid); |
5859 | DEBUG("received ooo cluster change applied message from node %" PRIx64" with cluster key %" PRIx64, msg_event->msg_src_nodeid, msg_cluster_key); |
5860 | } |
5861 | else { |
		INFO(
				"ignoring mismatched cluster change applied message from node %" PRIx64,
5864 | msg_event->msg_src_nodeid); |
5865 | } |
5866 | cf_vector_destroy(msg_succession_list); |
5867 | register_check_and_switch_synced(); |
5868 | CLUSTERING_UNLOCK(); |
5869 | } |
5870 | |
5871 | /** |
5872 | * Handle incoming message. |
5873 | */ |
5874 | static void |
5875 | register_msg_event_handle(as_clustering_internal_event* msg_event) |
5876 | { |
5877 | CLUSTERING_LOCK(); |
5878 | as_clustering_msg_type type; |
5879 | msg_type_get(msg_event->msg, &type); |
5880 | |
5881 | if (type == AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED) { |
5882 | register_cluster_change_applied_msg_handle(msg_event); |
5883 | } |
5884 | CLUSTERING_UNLOCK(); |
5885 | } |
5886 | |
5887 | /** |
5888 | * Dispatch internal events to the register. |
5889 | */ |
5890 | static void |
5891 | register_event_dispatch(as_clustering_internal_event* event) |
5892 | { |
5893 | switch (event->type) { |
5894 | case AS_CLUSTERING_INTERNAL_EVENT_TIMER: |
5895 | register_timer_event_handle(); |
5896 | break; |
5897 | case AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS: |
5898 | register_paxos_acceptor_success_handle(event); |
5899 | break; |
5900 | case AS_CLUSTERING_INTERNAL_EVENT_MSG: |
5901 | register_msg_event_handle(event); |
5902 | break; |
5903 | default: // Not of interest for the register. |
5904 | break; |
5905 | } |
5906 | } |
5907 | |
5908 | /* |
5909 | * ---------------------------------------------------------------------------- |
5910 | * Clustering core (triggers cluster changes) |
5911 | * ---------------------------------------------------------------------------- |
5912 | */ |
5913 | |
5914 | /** |
5915 | * Send a join reject message to destination node. |
5916 | */ |
5917 | static void |
5918 | clustering_join_reject_send(cf_node dest) |
5919 | { |
5920 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REJECT); |
5921 | |
5922 | DETAIL("sent join reject to node %" PRIx64, dest); |
5923 | |
	// Send the message to the destination node.
5925 | msg_node_send(msg, dest); |
5926 | } |
5927 | |
5928 | /** |
5929 | * Send cluster join reject message to all nodes in the vector. |
5930 | */ |
5931 | static void |
5932 | clustering_join_requests_reject(cf_vector* rejected_nodes) |
5933 | { |
5934 | int rejected_node_count = cf_vector_size(rejected_nodes); |
5935 | for (int i = 0; i < rejected_node_count; i++) { |
5936 | // No null check required since we are iterating under a lock and within |
5937 | // vector bounds. |
5938 | cf_node requesting_nodeid = *((cf_node*)cf_vector_getp(rejected_nodes, |
5939 | i)); |
5940 | |
5941 | // Send the reject message. |
5942 | clustering_join_reject_send(requesting_nodeid); |
5943 | } |
5944 | } |
5945 | |
5946 | /** |
5947 | * Send join reject message for all pending join requests. |
5948 | */ |
5949 | static void |
5950 | clustering_join_requests_reject_all() |
5951 | { |
5952 | CLUSTERING_LOCK(); |
5953 | |
5954 | cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node); |
5955 | vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests); |
5956 | |
5957 | vector_clear(&g_clustering.pending_join_requests); |
5958 | |
5959 | CLUSTERING_UNLOCK(); |
5960 | |
5961 | clustering_join_requests_reject(rejected_nodes); |
5962 | |
5963 | cf_vector_destroy(rejected_nodes); |
5964 | } |
5965 | |
5966 | /** |
5967 | * Send a join request to a principal. |
5968 | * @param new_principal the destination principal node. |
5969 | * @return 0 on successful message queue, -1 on failure. |
5970 | */ |
5971 | static int |
5972 | clustering_join_request_send(cf_node new_principal) |
5973 | { |
5974 | int rv = -1; |
5975 | CLUSTERING_LOCK(); |
5976 | |
5977 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST); |
5978 | |
5979 | DETAIL("sending cluster join request to node %" PRIx64, new_principal); |
5980 | |
5981 | if (msg_node_send(msg, new_principal) == 0) { |
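		// Remember this principal in the blackout hash so that it is not
		// retried until the blackout interval expires (see
		// clustering_join_request_filter_blocked() and
		// clustering_join_request_blackout_tend()).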
5982 | cf_clock now = cf_getms(); |
5983 | cf_shash_put(g_clustering.join_request_blackout, &new_principal, &now); |
5984 | |
5985 | g_clustering.last_join_request_principal = new_principal; |
5986 | g_clustering.last_join_request_sent_time = |
5987 | g_clustering.last_join_request_retransmit_time = cf_getms(); |
5988 | |
5989 | INFO("sent cluster join request to %" PRIx64, new_principal); |
5990 | rv = 0; |
5991 | } |
5992 | |
	// Send an early reject to all nodes that have sent us a join request in
	// the orphan state, because self node is not going to become a principal
	// node. This allows the requesting nodes to send requests to other
	// (potential) principals.
5997 | clustering_join_requests_reject_all(); |
5998 | |
5999 | CLUSTERING_UNLOCK(); |
6000 | return rv; |
6001 | } |
6002 | |
6003 | /** |
 * Retransmit a join request to a previously attempted principal.
6005 | * @param last_join_request_principal the principal to retransmit to. |
6006 | */ |
6007 | static void |
6008 | clustering_join_request_retransmit(cf_node last_join_request_principal) |
6009 | { |
6010 | CLUSTERING_LOCK(); |
6011 | cf_node new_principal = g_clustering.last_join_request_principal; |
6012 | g_clustering.last_join_request_retransmit_time = cf_getms(); |
6013 | CLUSTERING_UNLOCK(); |
6014 | |
6015 | if (new_principal != last_join_request_principal) { |
6016 | // The last attempted principal has changed. Don't retransmit. |
6017 | return; |
6018 | } |
6019 | |
6020 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST); |
6021 | DETAIL("re-sending cluster join request to node %" PRIx64, new_principal); |
6022 | if (msg_node_send(msg, new_principal) == 0) { |
6023 | DEBUG("re-sent cluster join request to %" PRIx64, new_principal); |
6024 | } |
6025 | } |
6026 | |
6027 | /** |
6028 | * Remove nodes for which join requests are blocked. |
6029 | * |
6030 | * @param requestees the nodes considered for join requests. |
6031 | * @param target the result with requestees that are not blocked. |
6032 | */ |
6033 | static void |
6034 | clustering_join_request_filter_blocked(cf_vector* requestees, cf_vector* target) |
6035 | { |
6036 | CLUSTERING_LOCK(); |
6037 | cf_clock last_sent; |
6038 | int requestee_count = cf_vector_size(requestees); |
6039 | for (int i = 0; i < requestee_count; i++) { |
6040 | cf_node requestee; |
6041 | cf_vector_get(requestees, i, &requestee); |
6042 | if (cf_shash_get(g_clustering.join_request_blackout, &requestee, |
6043 | &last_sent) != CF_SHASH_OK) { |
6044 | // The requestee is not marked for blackout |
6045 | cf_vector_append(target, &requestee); |
6046 | } |
6047 | } |
6048 | CLUSTERING_UNLOCK(); |
6049 | } |
6050 | |
6051 | /** |
6052 | * Send a cluster join request to a neighboring principal. If |
6053 | * preferred_principal is set and it is an eligible neighboring principal, a |
6054 | * request is sent to that principal, else this function cycles among eligible |
6055 | * neighboring principals at each call. |
6056 | * |
6057 | * A request will not be sent if there is no neighboring principal. |
6058 | * |
 * @param preferred_principal the preferred principal to join. Use zero if
 * there is no preference.
 * @return AS_CLUSTERING_JOIN_REQUEST_SENT if a join request was sent,
 * AS_CLUSTERING_JOIN_REQUEST_PENDING if a request is still pending,
 * AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED if the send failed, and
 * AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS if there are no neighboring
 * principals to try.
6063 | */ |
6064 | static as_clustering_join_request_result |
6065 | clustering_principal_join_request_attempt(cf_node preferred_principal) |
6066 | { |
6067 | CLUSTERING_LOCK(); |
6068 | |
6069 | as_clustering_join_request_result rv = AS_CLUSTERING_JOIN_REQUEST_SENT; |
6070 | cf_vector* neighboring_principals = vector_stack_lockless_create(cf_node); |
6071 | cf_vector* eligible_principals = vector_stack_lockless_create(cf_node); |
6072 | |
6073 | // Get list of neighboring principals. |
6074 | clustering_neighboring_principals_get(neighboring_principals); |
6075 | if (cf_vector_size(neighboring_principals) == 0) { |
6076 | DEBUG("no neighboring principal found - not sending join request" ); |
6077 | rv = AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS; |
6078 | goto Exit; |
6079 | } |
6080 | |
6081 | clustering_join_request_filter_blocked(neighboring_principals, |
6082 | eligible_principals); |
6083 | |
6084 | if (cf_vector_size(eligible_principals) == 0) { |
6085 | DETAIL("no eligible principals found to make a join request" ); |
6086 | // This principal is still in the blackout list. Do not send a request. |
6087 | rv = AS_CLUSTERING_JOIN_REQUEST_PENDING; |
6088 | goto Exit; |
6089 | } |
6090 | |
6091 | int next_join_request_principal_index = -1; |
6092 | |
6093 | // We have some well-formed neighboring clusters, try and join them |
6094 | if (preferred_principal != 0) { |
6095 | int preferred_principal_index = vector_find(eligible_principals, |
6096 | &preferred_principal); |
6097 | if (preferred_principal_index >= 0) { |
6098 | DETAIL("sending join request to preferred principal %" PRIx64, |
6099 | preferred_principal); |
6100 | |
6101 | // Update the index of the principal to try. |
6102 | next_join_request_principal_index = preferred_principal_index; |
6103 | } |
6104 | } |
6105 | |
6106 | if (next_join_request_principal_index == -1) { |
6107 | // Choose the first entry, since we have no valid preferred principal. |
6108 | next_join_request_principal_index = 0; |
6109 | if (g_clustering.last_join_request_principal != 0) { |
6110 | // Choose the node after the current principal. If the current |
6111 | // principal is not found we start at index 0 else the next index. |
6112 | next_join_request_principal_index = vector_find(eligible_principals, |
6113 | &g_clustering.last_join_request_principal) + 1; |
6114 | } |
6115 | } |
6116 | |
6117 | // Forget the fact that a join request is pending for a principal. |
6118 | g_clustering.last_join_request_principal = 0; |
6119 | |
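	// The modulo below wraps the index around, so successive calls cycle
	// through the eligible principals.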
6120 | cf_node* principal_to_try = cf_vector_getp(eligible_principals, |
6121 | next_join_request_principal_index |
6122 | % cf_vector_size(eligible_principals)); |
6123 | |
6124 | if (principal_to_try) { |
6125 | rv = clustering_join_request_send(*principal_to_try) == 0 ? |
6126 | AS_CLUSTERING_JOIN_REQUEST_SENT : |
6127 | AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED; |
6128 | |
6129 | } |
6130 | else { |
6131 | DEBUG("no neighboring principal found - not sending join request" ); |
6132 | rv = AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS; |
6133 | } |
6134 | |
6135 | Exit: |
6136 | if (rv != AS_CLUSTERING_JOIN_REQUEST_SENT) { |
6137 | // Forget the last principal we sent the join request to. |
6138 | g_clustering.last_join_request_principal = 0; |
6139 | g_clustering.last_join_request_sent_time = 0; |
6140 | } |
6141 | |
6142 | CLUSTERING_UNLOCK(); |
6143 | |
6144 | cf_vector_destroy(neighboring_principals); |
6145 | cf_vector_destroy(eligible_principals); |
6146 | |
6147 | return rv; |
6148 | } |
6149 | |
6150 | /** |
6151 | * Send a cluster join request to a neighboring orphan who this node thinks will |
6152 | * be best suited to form a new cluster. |
6153 | */ |
6154 | static as_clustering_join_request_result |
6155 | clustering_orphan_join_request_attempt() |
6156 | { |
6157 | CLUSTERING_LOCK(); |
6158 | |
6159 | // Get list of neighboring orphans. |
6160 | cf_vector* orphans = vector_stack_lockless_create(cf_node); |
6161 | clustering_neighboring_orphans_get(orphans); |
6162 | |
6163 | // Get filtered list of orphans. |
6164 | cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); |
6165 | clustering_join_request_filter_blocked(orphans, new_succession_list); |
6166 | |
6167 | log_cf_node_vector("neighboring orphans for join request:" , |
6168 | new_succession_list, CF_DEBUG); |
6169 | |
6170 | // Add self node. |
6171 | cf_node self_nodeid = config_self_nodeid_get(); |
6172 | cf_vector_append_unique(new_succession_list, &self_nodeid); |
6173 | |
6174 | clustering_succession_list_clique_evict(new_succession_list, |
6175 | "clique based evicted nodes for potential cluster:" ); |
6176 | |
6177 | // Sort the new succession list. |
6178 | vector_sort_unique(new_succession_list, cf_node_compare_desc); |
6179 | |
6180 | as_clustering_join_request_result rv = |
6181 | AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS; |
6182 | |
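	// After the descending sort the node at index zero is the highest nodeid
	// and hence the expected principal of the potential new cluster.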
6183 | if (cf_vector_size(new_succession_list) > 0) { |
6184 | cf_node new_principal = *((cf_node*)cf_vector_getp(new_succession_list, |
6185 | 0)); |
6186 | if (new_principal == config_self_nodeid_get()) { |
6187 | // No need to send self a join request. |
6188 | goto Exit; |
6189 | } |
6190 | else { |
6191 | rv = clustering_join_request_send(new_principal) == 0 ? |
6192 | AS_CLUSTERING_JOIN_REQUEST_SENT : |
6193 | AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED; |
6194 | } |
6195 | } |
6196 | |
6197 | Exit: |
6198 | cf_vector_destroy(new_succession_list); |
6199 | cf_vector_destroy(orphans); |
6200 | |
6201 | CLUSTERING_UNLOCK(); |
6202 | return rv; |
6203 | } |
6204 | |
6205 | /** |
6206 | * Remove nodes from the blackout hash once they have been in the list for |
6207 | * greater than the blackout period. |
6208 | */ |
6209 | int |
6210 | clustering_join_request_blackout_tend_reduce(const void* key, void* data, |
6211 | void* udata) |
6212 | { |
6213 | cf_clock* join_request_send_time = (cf_clock*)data; |
6214 | if (*join_request_send_time + join_request_blackout_interval() |
6215 | < cf_getms()) { |
6216 | return CF_SHASH_REDUCE_DELETE; |
6217 | } |
6218 | return CF_SHASH_OK; |
6219 | } |
6220 | |
6221 | /** |
6222 | * Tend the join request blackout data structure to remove blacked out |
6223 | * principals. |
6224 | */ |
6225 | static void |
6226 | clustering_join_request_blackout_tend() |
6227 | { |
6228 | CLUSTERING_LOCK(); |
6229 | cf_shash_reduce(g_clustering.join_request_blackout, |
6230 | clustering_join_request_blackout_tend_reduce, NULL); |
6231 | CLUSTERING_UNLOCK(); |
6232 | } |
6233 | |
6234 | /** |
6235 | * Send a cluster join request to a neighboring principal if one exists, else if |
6236 | * there are no neighboring principals, send a join request to a neighboring |
6237 | * orphan node if this node thinks it will win paxos and become the new |
6238 | * principal. |
6239 | */ |
6240 | static as_clustering_join_request_result |
6241 | clustering_join_request_attempt() |
6242 | { |
6243 | clustering_join_request_blackout_tend(); |
6244 | |
6245 | CLUSTERING_LOCK(); |
6246 | cf_node last_join_request_principal = |
6247 | g_clustering.last_join_request_principal; |
6248 | cf_clock last_join_request_sent_time = |
6249 | g_clustering.last_join_request_sent_time; |
6250 | cf_clock last_join_request_retransmit_time = |
6251 | g_clustering.last_join_request_retransmit_time; |
6252 | CLUSTERING_UNLOCK(); |
6253 | |
6254 | // Check if the outgoing join request has timed out. |
6255 | if (last_join_request_principal |
6256 | && as_hb_is_alive(last_join_request_principal)) { |
6257 | if (last_join_request_sent_time + join_request_timeout() > cf_getms()) { |
6258 | if (last_join_request_retransmit_time |
6259 | + join_request_retransmit_timeout() < cf_getms()) { |
6260 | // Re-transmit join request to the same principal, to cover the |
6261 | // case where the previous join request was lost. |
6262 | clustering_join_request_retransmit(last_join_request_principal); |
6263 | } |
			// Wait for the principal to respond. Do nothing.
6265 | DETAIL( |
6266 | "join request to principal %" PRIx64" pending - not attempting new join request" , |
6267 | last_join_request_principal); |
6268 | |
6269 | return AS_CLUSTERING_JOIN_REQUEST_PENDING; |
6270 | } |
6271 | // Timeout joining a principal. Choose a different principal. |
6272 | INFO("join request timed out for principal %" PRIx64, |
6273 | last_join_request_principal); |
6274 | |
6275 | } |
6276 | |
6277 | // Try sending a join request to a neighboring principal. |
6278 | as_clustering_join_request_result rv = |
6279 | clustering_principal_join_request_attempt(0); |
6280 | |
6281 | if (rv != AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS) { |
6282 | // There are valid principals around. Don't send a request to |
6283 | // neighboring orphan nodes. |
6284 | return rv; |
6285 | } |
6286 | |
6287 | // Send a join request to an orphan node, best suited to be the new |
6288 | // principal. |
6289 | return clustering_orphan_join_request_attempt(); |
6290 | } |
6291 | |
6292 | /** |
6293 | * Try to become a principal and start a new cluster. |
6294 | */ |
6295 | static void |
6296 | clustering_cluster_form() |
6297 | { |
6298 | ASSERT(clustering_is_orphan(), |
6299 | "should not attempt forming new cluster when not an orphan node" ); |
6300 | |
6301 | CLUSTERING_LOCK(); |
6302 | bool paxos_proposal_started = false; |
6303 | cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); |
6304 | cf_vector* expected_succession_list = vector_stack_lockless_create(cf_node); |
6305 | cf_vector* orphans = vector_stack_lockless_create(cf_node); |
6306 | |
6307 | clustering_neighboring_orphans_get(orphans); |
6308 | vector_copy(new_succession_list, orphans); |
6309 | |
6310 | log_cf_node_vector("neighboring orphans for cluster formation:" , |
6311 | new_succession_list, |
6312 | cf_vector_size(new_succession_list) > 0 ? CF_INFO : CF_DEBUG); |
6313 | log_cf_node_vector("pending join requests:" , |
6314 | &g_clustering.pending_join_requests, |
6315 | cf_vector_size(&g_clustering.pending_join_requests) > 0 ? |
6316 | CF_INFO : CF_DEBUG); |
6317 | |
6318 | // Add self node. |
6319 | cf_node self_nodeid = config_self_nodeid_get(); |
6320 | cf_vector_append_unique(new_succession_list, &self_nodeid); |
6321 | |
6322 | clustering_succession_list_clique_evict(new_succession_list, |
6323 | "clique based evicted nodes at cluster formation:" ); |
6324 | |
6325 | // Sort the new succession list. |
6326 | vector_sort_unique(new_succession_list, cf_node_compare_desc); |
6327 | |
6328 | cf_vector_append(expected_succession_list, &self_nodeid); |
6329 | vector_copy_unique(expected_succession_list, |
6330 | &g_clustering.pending_join_requests); |
6331 | // Sort the expected succession list. |
6332 | vector_sort_unique(expected_succession_list, cf_node_compare_desc); |
6333 | // The result should match the pending join requests exactly to consider the |
6334 | // new succession list. |
6335 | if (!vector_equals(expected_succession_list, new_succession_list)) { |
6336 | log_cf_node_vector( |
6337 | "skipping forming cluster - cannot form new cluster from pending join requests" , |
6338 | &g_clustering.pending_join_requests, CF_INFO); |
6339 | goto Exit; |
6340 | } |
6341 | |
6342 | if (cf_vector_size(orphans) > 0 |
6343 | && cf_vector_size(new_succession_list) == 1) { |
6344 | log_cf_node_vector( |
6345 | "skipping forming cluster - there are neighboring orphans that cannot be clustered with" , |
6346 | orphans, CF_INFO); |
6347 | goto Exit; |
6348 | } |
6349 | |
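	// The highest nodeid in the sorted list is the expected principal. Only
	// start a paxos proposal if that node is this node, otherwise defer to
	// the better candidate.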
6350 | if (cf_vector_size(new_succession_list) > 0) { |
6351 | cf_node new_principal = *((cf_node*)cf_vector_getp(new_succession_list, |
6352 | 0)); |
6353 | if (new_principal == config_self_nodeid_get()) { |
6354 | log_cf_node_vector( |
6355 | "principal node - forming new cluster with succession list:" , |
6356 | new_succession_list, CF_INFO); |
6357 | |
6358 | as_paxos_start_result result = paxos_proposer_proposal_start( |
6359 | new_succession_list, new_succession_list); |
6360 | |
6361 | // Log paxos result. |
6362 | paxos_result_log(result, new_succession_list); |
6363 | |
6364 | paxos_proposal_started = (result == AS_PAXOS_RESULT_STARTED); |
6365 | } |
6366 | else { |
6367 | INFO("skipping cluster formation - a new potential principal %" PRIx64" exists" , |
6368 | new_principal); |
6369 | } |
6370 | } |
6371 | |
6372 | Exit: |
6373 | // Compute list of rejected nodes. |
6374 | if (paxos_proposal_started) { |
		// Nodes in the set (pending_join - new succession list) could not be
		// accommodated and should receive a join reject.
6377 | vector_subtract(&g_clustering.pending_join_requests, |
6378 | new_succession_list); |
6379 | } |
6380 | else { |
6381 | // Reject all pending join requests. Will happen below. |
6382 | } |
6383 | |
6384 | cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node); |
6385 | vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests); |
6386 | |
6387 | // Clear the pending join requests |
6388 | vector_clear(&g_clustering.pending_join_requests); |
6389 | |
6390 | // Send reject messages to rejected nodes. |
6391 | clustering_join_requests_reject(rejected_nodes); |
6392 | |
6393 | cf_vector_destroy(rejected_nodes); |
6394 | |
6395 | cf_vector_destroy(orphans); |
6396 | cf_vector_destroy(expected_succession_list); |
6397 | cf_vector_destroy(new_succession_list); |
6398 | |
6399 | CLUSTERING_UNLOCK(); |
6400 | } |
6401 | |
6402 | /** |
6403 | * Try to join a cluster if there is a neighboring one, |
6404 | * else try to form one. |
6405 | */ |
6406 | static void |
6407 | clustering_join_or_form_cluster() |
6408 | { |
6409 | ASSERT(clustering_is_orphan(), |
6410 | "should not attempt forming new cluster when not an orphan node" ); |
6411 | |
6412 | if (paxos_proposer_proposal_is_active()) { |
6413 | // There is an active paxos round with this node as the proposed |
6414 | // principal. |
6415 | // Skip join cluster attempt and give current paxos round a chance to |
6416 | // form the cluster. |
6417 | return; |
6418 | } |
6419 | |
6420 | CLUSTERING_LOCK(); |
6421 | |
// TODO (Discuss this): after some timeout and after exhausting all
// neighboring principals, become a single node cluster / try our own
// cluster. This might not be required; nonetheless discuss and figure this
// out. Current behaviour is to form a new cluster after a timeout.
6426 | |
// A node has been an orphan for too long if it has attempted a join request
// which timed out and it has been in the orphan state for a while.
6429 | bool orphan_for_too_long = (clustering_orphan_timeout() |
6430 | + g_clustering.orphan_state_start_time) < cf_getms() |
6431 | && g_clustering.last_join_request_principal |
6432 | && g_clustering.last_join_request_sent_time + join_request_timeout() |
6433 | < cf_getms(); |
6434 | |
6435 | if (orphan_for_too_long |
6436 | || clustering_join_request_attempt() |
6437 | == AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS) { |
// No neighboring principal found, or we have been an orphan for too long -
// try and form a new cluster.
6440 | clustering_cluster_form(); |
6441 | } |
6442 | else { |
6443 | // A join request sent successfully or pending. Wait for the new |
6444 | // principal to respond. |
6445 | |
6446 | // We are not going to be a principal node in this quantum, reject all |
6447 | // pending join requests. |
6448 | clustering_join_requests_reject_all(); |
6449 | } |
6450 | |
6451 | CLUSTERING_UNLOCK(); |
6452 | } |
6453 | |
6454 | /** |
* Get a list of nodes that need to be added to the current succession list
* from pending join requests. Basically filters out nodes that are not
* orphans.
6457 | */ |
6458 | static void |
6459 | clustering_nodes_to_add_get(cf_vector* nodes_to_add) |
6460 | { |
6461 | CLUSTERING_LOCK(); |
6462 | |
6463 | // Use a single iteration over the clustering data received via the |
6464 | // heartbeats instead of individual calls to get a consistent view and avoid |
6465 | // small lock and release. |
6466 | as_hb_plugin_data_iterate(&g_clustering.pending_join_requests, |
6467 | AS_HB_PLUGIN_CLUSTERING, clustering_orphan_nodes_find, |
6468 | nodes_to_add); |
6469 | |
6470 | CLUSTERING_UNLOCK(); |
6471 | } |
6472 | |
6473 | /** |
6474 | * Handle quantum interval start in the orphan state. Try and join / form a |
6475 | * cluster. |
6476 | */ |
6477 | static void |
6478 | clustering_orphan_quantum_interval_start_handle() |
6479 | { |
6480 | if (!as_hb_self_is_duplicate()) { |
6481 | // Try to join a cluster or form a new one. |
6482 | clustering_join_or_form_cluster(); |
6483 | } |
6484 | } |
6485 | |
6486 | /** |
6487 | * Send a cluster move command to all nodes in the input list. |
6488 | * |
6489 | * @param candidate_principal the principal to which the other nodes should try |
6490 | * and join after receiving the move command. |
6491 | * @param cluster_key current cluster key for receiver validation. |
6492 | * @param nodeids the nodes to send move command to. |
6493 | */ |
6494 | static void |
6495 | clustering_cluster_move_send(cf_node candidate_principal, |
6496 | as_cluster_key cluster_key, cf_vector* nodeids) |
6497 | { |
6498 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_MERGE_MOVE); |
6499 | |
6500 | // Set the proposed principal. |
6501 | msg_proposed_principal_set(msg, candidate_principal); |
6502 | |
6503 | // Set cluster key for message validation. |
6504 | msg_cluster_key_set(msg, cluster_key); |
6505 | |
6506 | log_cf_node_vector("cluster merge move command sent to:" , nodeids, |
6507 | CF_DEBUG); |
6508 | |
// Send the message to the target nodes.
6510 | msg_nodes_send(msg, nodeids); |
6511 | } |
6512 | |
6513 | /** |
6514 | * Update preferred principal votes using hb plugin data. |
6515 | */ |
6516 | static void |
6517 | clustering_principal_preferred_principal_votes_count(cf_node nodeid, |
6518 | void* plugin_data, size_t plugin_data_size, cf_clock recv_monotonic_ts, |
6519 | as_hlc_msg_timestamp* msg_hlc_ts, void* udata) |
6520 | { |
// A hash from each unique preferred principal to the number of votes it has
// received.
6523 | cf_shash* preferred_principal_votes = (cf_shash*)udata; |
6524 | |
6525 | CLUSTERING_LOCK(); |
6526 | if (!clustering_hb_plugin_data_is_obsolete( |
6527 | g_register.cluster_modified_hlc_ts, |
6528 | g_register.cluster_modified_time, plugin_data, plugin_data_size, |
6529 | recv_monotonic_ts, msg_hlc_ts)) { |
6530 | cf_node* preferred_principal_p = |
6531 | clustering_hb_plugin_preferred_principal_get(plugin_data, |
6532 | plugin_data_size); |
6533 | |
6534 | int current_votes = 0; |
6535 | if (cf_shash_get(preferred_principal_votes, preferred_principal_p, |
&current_votes) == CF_SHASH_OK) {
6537 | current_votes++; |
6538 | } |
6539 | else { |
6540 | // We are seeing this preferred principal for the first time. |
6541 | current_votes = 0; |
6542 | } |
6543 | |
6544 | cf_shash_put(preferred_principal_votes, preferred_principal_p, |
&current_votes);
6546 | } |
6547 | else { |
6548 | DETAIL( |
6549 | "preferred principal voting skipped - found obsolete plugin data for node %" PRIx64, |
6550 | nodeid); |
6551 | } |
6552 | CLUSTERING_UNLOCK(); |
6553 | } |
6554 | |
6555 | /** |
6556 | * Get the preferred majority principal. |
6557 | */ |
6558 | static int |
6559 | clustering_principal_preferred_principal_majority_find(const void* key, |
6560 | void* data, void* udata) |
6561 | { |
6562 | |
6563 | const cf_node* current_preferred_principal = (const cf_node*)key; |
6564 | int current_preferred_principal_votes = *(int*)data; |
6565 | cf_node* majority_preferred_principal = (cf_node*)udata; |
6566 | |
6567 | CLUSTERING_LOCK(); |
6568 | int preferred_principal_majority = |
6569 | (int)ceil( |
6570 | cf_vector_size( |
6571 | &g_register.succession_list) * AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY); |
6572 | bool is_majority = current_preferred_principal_votes |
6573 | >= preferred_principal_majority; |
6574 | CLUSTERING_UNLOCK(); |
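// Worked example (illustrative only - the actual value of
// AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY is defined elsewhere): if the
// majority fraction were 0.5 and the succession list had 5 nodes, the
// threshold above would be ceil(5 * 0.5) = 3 votes.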
6575 | |
6576 | if (is_majority) { |
6577 | *majority_preferred_principal = *current_preferred_principal; |
6578 | // Majority found, halt reduce. |
6579 | return CF_SHASH_ERR_FOUND; |
6580 | } |
6581 | |
6582 | return CF_SHASH_OK; |
6583 | } |
6584 | |
6585 | /** |
6586 | * Get preferred principal based on a majority of non-principal's preferred |
6587 | * principals. |
6588 | * @return the preferred principal nodeid if there is a majority, else zero. |
6589 | */ |
6590 | static cf_node |
6591 | clustering_principal_majority_preferred_principal_get() |
6592 | { |
// A hash from each unique preferred principal to the number of votes it has
// received.
6595 | cf_shash* preferred_principal_votes = cf_shash_create(cf_nodeid_shash_fn, |
6596 | sizeof(cf_node), sizeof(int), AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, |
6597 | 0); |
6598 | |
6599 | CLUSTERING_LOCK(); |
6600 | |
6601 | // Use a single iteration over the clustering data received via the |
6602 | // heartbeats instead of individual calls to get a consistent view and avoid |
6603 | // small lock and release. |
6604 | as_hb_plugin_data_iterate(&g_register.succession_list, |
6605 | AS_HB_PLUGIN_CLUSTERING, |
6606 | clustering_principal_preferred_principal_votes_count, |
6607 | preferred_principal_votes); |
6608 | |
6609 | // Find the majority preferred principal. |
6610 | cf_node preferred_principal = 0; |
6611 | cf_shash_reduce(preferred_principal_votes, |
6612 | clustering_principal_preferred_principal_majority_find, |
6613 | &preferred_principal); |
6614 | |
6615 | CLUSTERING_UNLOCK(); |
6616 | |
6617 | cf_shash_destroy(preferred_principal_votes); |
6618 | |
6619 | DETAIL("preferred principal is %" PRIx64, preferred_principal); |
6620 | |
6621 | return preferred_principal; |
6622 | } |
6623 | |
6624 | /** |
* Indicates if the given node is a principal whose cluster can be merged with
* this node's cluster.
6627 | * |
6628 | * @param nodeid the candidate nodeid. |
6629 | * @param node_succession_list the candidate node's succession list. |
6630 | * @param node_succession_list_length the length of the node's succession list. |
* @return true if the given node's cluster can be merged with this node's cluster.
6632 | */ |
6633 | bool |
6634 | clustering_is_merge_candidate(cf_node nodeid, cf_node* node_succession_list, |
6635 | int node_succession_list_length) |
6636 | { |
6637 | if (node_succession_list_length <= 0 || node_succession_list[0] != nodeid) { |
6638 | // Not a principal node. Ignore. |
6639 | return false; |
6640 | } |
6641 | |
6642 | if (nodeid < config_self_nodeid_get()) { |
6643 | // Has a smaller nodeid. Ignore. This node will merge with our cluster. |
6644 | return false; |
6645 | } |
6646 | |
6647 | cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); |
6648 | |
6649 | CLUSTERING_LOCK(); |
6650 | vector_copy_unique(new_succession_list, &g_register.succession_list); |
6651 | CLUSTERING_UNLOCK(); |
6652 | |
6653 | bool is_candidate = false; |
6654 | |
6655 | // Node is the principal of its cluster. Create the new succession list. |
6656 | for (int i = 0; i < node_succession_list_length; i++) { |
6657 | cf_vector_append_unique(new_succession_list, &node_succession_list[i]); |
6658 | } |
6659 | |
6660 | int expected_cluster_size = cf_vector_size(new_succession_list); |
6661 | |
6662 | // Find and evict the nodes that are not well connected. |
6663 | clustering_succession_list_clique_evict(new_succession_list, |
6664 | "clique based evicted nodes at cluster merge:" ); |
6665 | int new_cluster_size = cf_vector_size(new_succession_list); |
6666 | |
6667 | // If no nodes need to be evicted then the merge is fine. |
6668 | is_candidate = (expected_cluster_size == new_cluster_size); |
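// Sketch: our cluster {self, X} is a merge candidate for principal P of
// cluster {P, Q} only if P's nodeid is larger than ours and the combined
// list {self, X, P, Q} loses no nodes to the clique based eviction, i.e. the
// merged cluster would already be fully connected.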
6669 | |
6670 | // Exit: |
6671 | cf_vector_destroy(new_succession_list); |
6672 | |
6673 | return is_candidate; |
6674 | } |
6675 | |
6676 | /** |
6677 | * HB plugin iterate function to find principals that this node's cluster can be |
6678 | * merged with. |
6679 | */ |
6680 | static void |
6681 | clustering_merge_candiate_find(cf_node nodeid, void* plugin_data, |
6682 | size_t plugin_data_size, cf_clock recv_monotonic_ts, |
6683 | as_hlc_msg_timestamp* msg_hlc_ts, void* udata) |
6684 | { |
6685 | cf_node* candidate_principal = (cf_node*)udata; |
6686 | |
6687 | CLUSTERING_LOCK(); |
6688 | |
6689 | if (!clustering_hb_plugin_data_is_obsolete( |
6690 | g_register.cluster_modified_hlc_ts, |
6691 | g_register.cluster_modified_time, plugin_data, plugin_data_size, |
6692 | recv_monotonic_ts, msg_hlc_ts)) { |
6693 | uint32_t* other_succession_list_length = |
6694 | clustering_hb_plugin_succession_length_get(plugin_data, |
6695 | plugin_data_size); |
6696 | |
6697 | cf_node* other_succession_list = clustering_hb_plugin_succession_get( |
6698 | plugin_data, plugin_data_size); |
6699 | |
6700 | if (other_succession_list != NULL |
6701 | && clustering_is_merge_candidate(nodeid, other_succession_list, |
6702 | *other_succession_list_length) |
6703 | && *candidate_principal < nodeid) { |
6704 | DETAIL("principal node %" PRIx64" potential candidate for cluster merge" , nodeid); |
6705 | *candidate_principal = nodeid; |
6706 | } |
6707 | |
6708 | } |
6709 | else { |
6710 | DETAIL( |
6711 | "merge check skipped - found obsolete plugin data for node %" PRIx64, |
6712 | nodeid); |
6713 | } |
6714 | |
6715 | CLUSTERING_UNLOCK(); |
6716 | } |
6717 | |
6718 | /** |
6719 | * Attempt to move to the majority preferred principal. |
6720 | * |
6721 | * @return 0 if the move to preferred principal was attempted, -1 otherwise. |
6722 | */ |
6723 | static int |
6724 | clustering_preferred_principal_move() |
6725 | { |
6726 | cf_node preferred_principal = |
6727 | clustering_principal_majority_preferred_principal_get(); |
6728 | |
6729 | if (preferred_principal == 0 |
6730 | || preferred_principal == config_self_nodeid_get()) { |
6731 | return -1; |
6732 | } |
6733 | |
6734 | cf_vector* succession_list = vector_stack_lockless_create(cf_node); |
6735 | as_cluster_key cluster_key = 0; |
6736 | CLUSTERING_LOCK(); |
6737 | vector_copy(succession_list, &g_register.succession_list); |
6738 | cluster_key = g_register.cluster_key; |
6739 | // Update the time move command was sent. |
6740 | g_clustering.move_cmd_issue_time = cf_getms(); |
6741 | CLUSTERING_UNLOCK(); |
6742 | |
6743 | INFO("majority nodes find %" PRIx64" to be a better principal - sending move command to all cluster members" , |
6744 | preferred_principal); |
6745 | clustering_cluster_move_send(preferred_principal, cluster_key, |
6746 | succession_list); |
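// Note: succession_list includes the self node, so (as also noted in
// clustering_merge_attempt() below) this node receives the move command as
// well and will switch to orphan and re-join via the merge move handler.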
6747 | cf_vector_destroy(succession_list); |
6748 | |
6749 | return 0; |
6750 | } |
6751 | |
6752 | /** |
* Attempt to merge with a larger adjacent cluster if the resulting cluster
* will form a clique.
6755 | * |
6756 | * @return 0 if a merge is attempted, -1 otherwise. |
6757 | */ |
6758 | static int |
6759 | clustering_merge_attempt() |
6760 | { |
6761 | int rv = -1; |
6762 | CLUSTERING_LOCK(); |
6763 | cf_vector* succession_list = vector_stack_lockless_create(cf_node); |
6764 | vector_copy(succession_list, &g_register.succession_list); |
6765 | as_cluster_key cluster_key = g_register.cluster_key; |
6766 | cf_node candidate_principal = 0; |
6767 | |
6768 | // Use a single iteration over the clustering data received via the |
6769 | // heartbeats instead of individual calls to get a consistent view and avoid |
6770 | // small lock and release. |
6771 | as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, |
6772 | clustering_merge_candiate_find, &candidate_principal); |
6773 | |
6774 | CLUSTERING_UNLOCK(); |
6775 | |
6776 | if (candidate_principal == 0) { |
6777 | DEBUG("no cluster merge candidates found" ); |
6778 | rv = -1; |
6779 | goto Exit; |
6780 | } |
6781 | |
6782 | // Send a move command to all nodes in the succession list. Need not switch |
6783 | // to orphan state immediately, this node will receive the move command too |
6784 | // and will handle the move accordingly. |
6785 | INFO("this cluster can merge with cluster with principal %" PRIx64" - sending move command to all cluster members" , |
6786 | candidate_principal); |
6787 | clustering_cluster_move_send(candidate_principal, cluster_key, |
6788 | succession_list); |
6789 | rv = 0; |
6790 | Exit: |
6791 | cf_vector_destroy(succession_list); |
6792 | return rv; |
6793 | } |
6794 | |
6795 | /** |
6796 | * Handle quantum interval start when self node is the principal of its cluster. |
6797 | */ |
6798 | static void |
6799 | clustering_principal_quantum_interval_start_handle( |
6800 | as_clustering_internal_event* event) |
6801 | { |
6802 | DETAIL("principal node quantum wakeup" ); |
6803 | |
6804 | if (as_hb_self_is_duplicate()) { |
6805 | // Cluster is in a bad shape and self node has a duplicate node-id. |
6806 | register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); |
6807 | return; |
6808 | } |
6809 | |
6810 | CLUSTERING_LOCK(); |
6811 | bool paxos_proposal_started = false; |
6812 | |
6813 | cf_vector* dead_nodes = vector_stack_lockless_create(cf_node); |
6814 | clustering_dead_nodes_find(dead_nodes); |
6815 | |
6816 | log_cf_node_vector("dead nodes at quantum start:" , dead_nodes, |
6817 | cf_vector_size(dead_nodes) > 0 ? CF_INFO : CF_DEBUG); |
6818 | |
6819 | cf_vector* faulty_nodes = vector_stack_lockless_create(cf_node); |
6820 | clustering_faulty_nodes_find(faulty_nodes); |
6821 | |
6822 | log_cf_node_vector("faulty nodes at quantum start:" , faulty_nodes, |
6823 | cf_vector_size(faulty_nodes) > 0 ? CF_INFO : CF_DEBUG); |
6824 | |
// Having dead or faulty nodes is a sign of a cluster integrity breach.
// New nodes should not count as an integrity breach.
6827 | g_clustering.has_integrity = cf_vector_size(faulty_nodes) == 0 |
6828 | && cf_vector_size(dead_nodes) == 0; |
6829 | |
6830 | cf_vector* new_nodes = vector_stack_lockless_create(cf_node); |
6831 | clustering_nodes_to_add_get(new_nodes); |
6832 | log_cf_node_vector("join requests at quantum start:" , new_nodes, |
6833 | cf_vector_size(new_nodes) > 0 ? CF_INFO : CF_DEBUG); |
6834 | |
6835 | cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); |
6836 | vector_copy_unique(new_succession_list, &g_register.succession_list); |
6837 | vector_subtract(new_succession_list, dead_nodes); |
6838 | vector_subtract(new_succession_list, faulty_nodes); |
6839 | vector_copy_unique(new_succession_list, new_nodes); |
6840 | |
6841 | // Add self node. We should not miss self in the succession list, but be |
6842 | // doubly sure. |
6843 | cf_node self_nodeid = config_self_nodeid_get(); |
6844 | cf_vector_append_unique(new_succession_list, &self_nodeid); |
6845 | |
6846 | vector_sort_unique(new_succession_list, cf_node_compare_desc); |
6847 | uint32_t num_evicted = clustering_succession_list_clique_evict( |
6848 | new_succession_list, |
6849 | "clique based evicted nodes at quantum start:" ); |
6850 | |
6851 | if (event->quantum_interval_is_skippable && cf_vector_size(dead_nodes) != 0 |
6852 | && !quantum_interval_is_adjacency_fault_seen()) { |
// There is an imminent adjacency fault that has not been seen by the
// quantum interval generator; let's not take any action.
6855 | DEBUG("adjacency fault imminent - skipping quantum interval handling" ); |
6856 | quantum_interval_mark_postponed(); |
6857 | goto Exit; |
6858 | } |
6859 | |
6860 | if (event->quantum_interval_is_skippable && num_evicted != 0 |
6861 | && !quantum_interval_is_peer_adjacency_fault_seen()) { |
// There is an imminent peer adjacency fault that has not been seen by the
// quantum interval generator; let's not take any action.
6864 | DEBUG( |
6865 | "peer adjacency fault imminent - skipping quantum interval handling" ); |
6866 | quantum_interval_mark_postponed(); |
6867 | goto Exit; |
6868 | } |
6869 | |
6870 | if (cf_vector_size(faulty_nodes) == 0 && cf_vector_size(dead_nodes) == 0) { |
6871 | // We might have only pending join requests. Attempt a move to a |
6872 | // preferred principal or a merge before trying to add new nodes. |
6873 | if (clustering_preferred_principal_move() == 0 |
6874 | || clustering_merge_attempt() == 0) { |
6875 | goto Exit; |
6876 | } |
6877 | } |
6878 | |
6879 | if (vector_equals(new_succession_list, &g_register.succession_list) |
6880 | && cf_vector_size(faulty_nodes) == 0) { |
6881 | // There is no change in the succession list and also there are no |
6882 | // faulty nodes. If there are faulty nodes they have probably restarted |
6883 | // quickly, in which case a new cluster transition with the same |
6884 | // succession list is required. |
6885 | goto Exit; |
6886 | } |
6887 | |
6888 | if (cf_vector_size(faulty_nodes) != 0 |
6889 | && cf_vector_size(new_succession_list) == 1) { |
// This node most likely lost time (slept/paused) and the rest of the
// cluster reformed. It's best to go to the orphan state and start from
// there instead of moving to a single node cluster and again eventually
// forming a larger cluster.
6894 | WARNING( |
6895 | "all cluster members are part of different cluster - changing state to orphan" ); |
6896 | register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); |
6897 | goto Exit; |
6898 | } |
6899 | |
6900 | // Start a new paxos round. |
6901 | log_cf_node_vector("current succession list" , &g_register.succession_list, |
6902 | CF_DEBUG); |
6903 | |
6904 | log_cf_node_vector("proposed succession list" , new_succession_list, |
6905 | CF_DEBUG); |
6906 | DEBUG("proposed cluster size %d" , cf_vector_size(new_succession_list)); |
6907 | |
6908 | as_paxos_start_result result = paxos_proposer_proposal_start( |
6909 | new_succession_list, new_succession_list); |
6910 | |
6911 | // Log paxos result. |
6912 | paxos_result_log(result, new_succession_list); |
6913 | |
// TODO: Should we move to the orphan state if there are not enough nodes in
// the cluster? Tentatively yes...
6917 | if (result == AS_PAXOS_RESULT_CLUSTER_TOO_SMALL) { |
6918 | register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); |
6919 | } |
6920 | |
6921 | paxos_proposal_started = (result == AS_PAXOS_RESULT_STARTED); |
6922 | Exit: |
6923 | // Although these are stack vectors the contents can be heap allocated on |
6924 | // resize. Destroy call is prudent. |
6925 | cf_vector_destroy(dead_nodes); |
6926 | cf_vector_destroy(faulty_nodes); |
6927 | cf_vector_destroy(new_nodes); |
6928 | cf_vector_destroy(new_succession_list); |
6929 | |
6930 | // Compute list of rejected nodes. |
6931 | if (paxos_proposal_started) { |
// Nodes in set (pending_join - new succession list) could not be
// accommodated and should receive a join reject.
6934 | vector_subtract(&g_clustering.pending_join_requests, |
6935 | new_succession_list); |
6936 | } |
6937 | else { |
// Nodes in set (pending_join - current succession list) could not be
// accommodated and should receive a join reject.
6940 | vector_subtract(&g_clustering.pending_join_requests, |
6941 | &g_register.succession_list); |
6942 | |
6943 | } |
6944 | |
6945 | cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node); |
6946 | vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests); |
6947 | |
6948 | // Clear the pending join requests |
6949 | vector_clear(&g_clustering.pending_join_requests); |
6950 | |
6951 | // Send reject messages to rejected nodes. |
6952 | clustering_join_requests_reject(rejected_nodes); |
6953 | |
6954 | cf_vector_destroy(rejected_nodes); |
6955 | |
6956 | CLUSTERING_UNLOCK(); |
6957 | } |
6958 | |
6959 | /** |
6960 | * Check for and handle eviction by self node's principal. |
6961 | * |
* @param principal_plugin_data the plugin data for the principal.
* @param plugin_data_hlc_ts the hlc timestamp when the plugin data was
* received.
* @param plugin_data_ts the monotonic clock timestamp when the plugin data was
* received.
6967 | */ |
6968 | static void |
6969 | clustering_non_principal_evicted_check(cf_node principal_nodeid, |
6970 | as_hb_plugin_node_data* principal_plugin_data, |
6971 | as_hlc_msg_timestamp* plugin_data_hlc_ts, cf_clock plugin_data_ts) |
6972 | { |
6973 | CLUSTERING_LOCK(); |
6974 | bool is_evicted = false; |
6975 | |
6976 | if (!as_hb_is_alive(principal_nodeid)) { |
6977 | is_evicted = true; |
6978 | goto Exit; |
6979 | } |
6980 | |
6981 | if (!clustering_is_our_principal(principal_nodeid) |
6982 | || clustering_hb_plugin_data_is_obsolete( |
6983 | g_register.cluster_modified_hlc_ts, |
6984 | g_register.cluster_modified_time, |
6985 | principal_plugin_data->data, |
6986 | principal_plugin_data->data_size, plugin_data_ts, |
6987 | plugin_data_hlc_ts)) { |
6988 | // The plugin data is obsolete. Can't take decisions based on it. |
6989 | goto Exit; |
6990 | } |
6991 | |
6992 | // Get the changed node's succession list, cluster key. All the fields |
6993 | // should be present since the obsolete check also checked for fields being |
6994 | // valid. |
6995 | cf_node* succession_list_p = clustering_hb_plugin_succession_get( |
6996 | principal_plugin_data->data, principal_plugin_data->data_size); |
6997 | uint32_t* succession_list_length_p = |
6998 | clustering_hb_plugin_succession_length_get( |
6999 | principal_plugin_data->data, |
7000 | principal_plugin_data->data_size); |
7001 | |
7002 | // Check if we have been evicted. |
7003 | if (!clustering_is_node_in_succession(config_self_nodeid_get(), |
7004 | succession_list_p, *succession_list_length_p)) { |
7005 | is_evicted = true; |
7006 | } |
7007 | |
7008 | Exit: |
7009 | if (is_evicted) { |
7010 | // This node has been evicted from the cluster. |
7011 | WARNING("evicted from cluster by principal node %" PRIx64"- changing state to orphan" , |
7012 | principal_nodeid); |
7013 | register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); |
7014 | } |
7015 | |
7016 | CLUSTERING_UNLOCK(); |
7017 | } |
7018 | |
7019 | /** |
7020 | * Monitor plugin data change events for evictions. |
7021 | */ |
7022 | static void |
7023 | clustering_non_principal_hb_plugin_data_changed_handle( |
7024 | as_clustering_internal_event* change_event) |
7025 | { |
7026 | clustering_non_principal_evicted_check( |
7027 | change_event->plugin_data_changed_nodeid, change_event->plugin_data, |
7028 | &change_event->plugin_data_changed_hlc_ts, |
7029 | change_event->plugin_data_changed_ts); |
7030 | } |
7031 | |
7032 | /** |
7033 | * Update the preferred principal in the non-principal mode. |
7034 | */ |
7035 | static void |
7036 | clustering_non_principal_preferred_principal_update() |
7037 | { |
7038 | cf_node current_principal = 0; |
if (clustering_principal_get(&current_principal) != 0
7040 | || current_principal == 0) { |
7041 | // We are an orphan. |
7042 | return; |
7043 | } |
7044 | |
7045 | cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); |
7046 | |
7047 | clustering_neighboring_nodes_get(new_succession_list); |
7048 | cf_node self_nodeid = config_self_nodeid_get(); |
7049 | cf_vector_append(new_succession_list, &self_nodeid); |
7050 | |
7051 | clustering_succession_list_clique_evict(new_succession_list, |
7052 | "clique based evicted nodes while updating preferred principal:" ); |
7053 | |
7054 | // Sort the new succession list. |
7055 | vector_sort_unique(new_succession_list, cf_node_compare_desc); |
7056 | |
7057 | cf_node preferred_principal = 0; |
7058 | int new_cluster_size = cf_vector_size(new_succession_list); |
7059 | if (new_cluster_size > 0) { |
if (vector_find(new_succession_list, &current_principal) < 0) {
7061 | cf_vector_get(new_succession_list, 0, &preferred_principal); |
7062 | } |
7063 | } |
7064 | |
7065 | CLUSTERING_LOCK(); |
7066 | if (preferred_principal != 0 |
7067 | && g_clustering.preferred_principal != preferred_principal) { |
7068 | INFO("preferred principal updated to %" PRIx64, |
7069 | g_clustering.preferred_principal); |
7070 | } |
7071 | g_clustering.preferred_principal = preferred_principal; |
7072 | |
7073 | cf_vector_destroy(new_succession_list); |
7074 | CLUSTERING_UNLOCK(); |
7075 | } |
7076 | |
7077 | /** |
7078 | * Handle quantum interval start in the non principal state. |
7079 | */ |
7080 | static void |
7081 | clustering_non_principal_quantum_interval_start_handle() |
7082 | { |
7083 | // Reject all accumulated join requests since we are no longer a principal. |
7084 | clustering_join_requests_reject_all(); |
7085 | |
7086 | if (as_hb_self_is_duplicate()) { |
7087 | // Cluster is in a bad shape and self node has a duplicate node-id. |
7088 | register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); |
7089 | return; |
7090 | } |
7091 | |
7092 | // Update the preferred principal. |
7093 | clustering_non_principal_preferred_principal_update(); |
7094 | |
7095 | // Check if we have been evicted. |
7096 | cf_node principal = 0; |
7097 | |
7098 | if (clustering_principal_get(&principal) != 0) { |
7099 | WARNING("could not get principal for self node" ); |
7100 | return; |
7101 | } |
7102 | |
7103 | as_hlc_msg_timestamp plugin_data_hlc_ts; |
7104 | cf_clock plugin_data_ts = 0; |
7105 | as_hb_plugin_node_data plugin_data = { 0 }; |
7106 | |
7107 | if (clustering_hb_plugin_data_get(principal, &plugin_data, |
7108 | &plugin_data_hlc_ts, &plugin_data_ts) != 0) { |
7109 | plugin_data_ts = 0; |
7110 | memset(&plugin_data, 0, sizeof(plugin_data)); |
7111 | } |
7112 | |
7113 | clustering_non_principal_evicted_check(principal, &plugin_data, |
7114 | &plugin_data_hlc_ts, plugin_data_ts); |
7115 | } |
7116 | |
7117 | /** |
7118 | * Handle quantum interval start. |
7119 | */ |
7120 | static void |
7121 | clustering_quantum_interval_start_handle(as_clustering_internal_event* event) |
7122 | { |
7123 | CLUSTERING_LOCK(); |
7124 | |
7125 | // Dispatch based on state. |
7126 | switch (g_clustering.state) { |
7127 | case AS_CLUSTERING_STATE_ORPHAN: |
7128 | clustering_orphan_quantum_interval_start_handle(); |
7129 | break; |
7130 | case AS_CLUSTERING_STATE_PRINCIPAL: |
7131 | clustering_principal_quantum_interval_start_handle(event); |
7132 | break; |
case AS_CLUSTERING_STATE_NON_PRINCIPAL:
clustering_non_principal_quantum_interval_start_handle();
break;
default:
break;
7137 | } |
7138 | |
7139 | CLUSTERING_UNLOCK(); |
7140 | } |
7141 | |
7142 | /** |
7143 | * Handle a timer event in the orphan state. |
7144 | */ |
7145 | static void |
7146 | clustering_orphan_timer_event_handle() |
7147 | { |
7148 | // Attempt a join request. |
7149 | DETAIL("attempting join request from orphan state" ); |
7150 | clustering_join_request_attempt(); |
7151 | } |
7152 | |
7153 | /** |
7154 | * Handle a timer event for the clustering module. |
7155 | */ |
7156 | static void |
7157 | clustering_timer_event_handle() |
7158 | { |
7159 | CLUSTERING_LOCK(); |
7160 | |
7161 | // Dispatch based on state. |
7162 | switch (g_clustering.state) { |
7163 | case AS_CLUSTERING_STATE_ORPHAN: |
7164 | clustering_orphan_timer_event_handle(); |
7165 | break; |
7166 | default: |
7167 | break; |
7168 | } |
7169 | |
7170 | CLUSTERING_UNLOCK(); |
7171 | } |
7172 | |
7173 | /** |
* Check if the incoming message is sane enough to be processed further.
7175 | */ |
7176 | static bool |
7177 | clustering_message_sanity_check(cf_node src_nodeid, msg* msg) |
7178 | { |
7179 | as_cluster_proto_identifier proto; |
7180 | if (msg_proto_id_get(msg, &proto) != 0) { |
7181 | WARNING( |
7182 | "received message with no clustering protocol identifier from node %" PRIx64, |
7183 | src_nodeid); |
7184 | return false; |
7185 | } |
7186 | |
7187 | return clustering_versions_are_compatible(proto, |
7188 | clustering_protocol_identifier_get()); |
7189 | } |
7190 | |
7191 | /** |
* Handle an incoming join request. We do not bother with older replays for
* join requests because pending requests are cleaned up during new cluster
* formation.
7195 | */ |
7196 | static void |
7197 | clustering_join_request_handle(as_clustering_internal_event* msg_event) |
7198 | { |
7199 | cf_node src_nodeid = msg_event->msg_src_nodeid; |
7200 | DEBUG("received cluster join request from node %" PRIx64, src_nodeid); |
7201 | bool fire_quantum_event = false; |
7202 | |
7203 | CLUSTERING_LOCK(); |
7204 | |
7205 | cf_clock now = cf_getms(); |
7206 | |
7207 | if (g_clustering.move_cmd_issue_time + join_request_move_reject_interval() |
7208 | > now) { |
// We have just sent out a move request. Reject this join request.
7210 | INFO("ignoring join request from node %" PRIx64" since we have just issued a move command" , |
7211 | src_nodeid); |
7212 | clustering_join_reject_send(src_nodeid); |
7213 | goto Exit; |
7214 | } |
7215 | |
7216 | if ((!clustering_is_principal() && !clustering_is_orphan()) |
7217 | || g_clustering.last_join_request_sent_time + join_request_timeout() |
7218 | >= cf_getms()) { |
// Can't handle a join request - this node is not the principal right now,
// or it is trying to join another cluster.
7221 | msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REJECT); |
7222 | |
7223 | DETAIL("sent join reject to node %" PRIx64, msg_event->msg_src_nodeid); |
7224 | |
// Send the reject to the requesting node.
7226 | msg_node_send(msg, msg_event->msg_src_nodeid); |
7227 | |
7228 | goto Exit; |
7229 | } |
7230 | |
7231 | if (vector_find(&g_clustering.pending_join_requests, &src_nodeid) >= 0) { |
7232 | DEBUG("ignoring join request from node %" PRIx64" since a request is already pending" , |
7233 | src_nodeid); |
7234 | goto Exit; |
7235 | } |
7236 | |
7237 | // Check if we are receiving a stale or very delayed join request. |
7238 | int64_t message_delay_estimate = as_hlc_timestamp_diff_ms( |
7239 | as_hlc_timestamp_now(), msg_event->msg_hlc_ts.send_ts); |
7240 | if (message_delay_estimate < 0 |
7241 | || message_delay_estimate > join_request_accept_delay_max()) { |
7242 | INFO("ignoring stale join request from node %" PRIx64" - delay estimate %lu(ms) " , |
7243 | src_nodeid, message_delay_estimate); |
7244 | goto Exit; |
7245 | } |
7246 | |
7247 | // Add this request to the pending queue. |
7248 | cf_vector_append_unique(&g_clustering.pending_join_requests, &src_nodeid); |
7249 | |
7250 | // Generate a join request accepted event for the quantum interval |
7251 | // generator. |
7252 | as_clustering_internal_event join_request_event; |
7253 | memset(&join_request_event, 0, sizeof(join_request_event)); |
7254 | join_request_event.type = |
7255 | AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED; |
7256 | join_request_event.join_request_source_nodeid = src_nodeid; |
// Dispatch the event outside the lock, below.
fire_quantum_event = true;
7259 | |
7260 | INFO("accepted join request from node %" PRIx64, src_nodeid); |
7261 | |
7262 | Exit: |
7263 | CLUSTERING_UNLOCK(); |
7264 | |
7265 | if (fire_quantum_event) { |
7266 | internal_event_dispatch(&join_request_event); |
7267 | } |
7268 | } |
7269 | |
7270 | /** |
7271 | * Handle an incoming join reject. |
7272 | */ |
7273 | static void |
7274 | clustering_join_reject_handle(as_clustering_internal_event* event) |
7275 | { |
7276 | cf_node src_nodeid = event->msg_src_nodeid; |
7277 | |
7278 | DEBUG("received cluster join reject from node %" PRIx64, src_nodeid); |
7279 | |
7280 | CLUSTERING_LOCK(); |
7281 | |
7282 | if (!clustering_is_orphan()) { |
7283 | // Already part of a cluster. Ignore the reject. |
7284 | INFO( |
7285 | "already part of a cluster - ignoring join reject from node %" PRIx64, |
7286 | src_nodeid); |
7287 | goto Exit; |
7288 | } |
7289 | |
7290 | if (paxos_proposer_proposal_is_active()) { |
7291 | // This node is attempting to form a new cluster. |
7292 | INFO( |
7293 | "already trying to form a cluster - ignoring join reject from node %" PRIx64, |
7294 | src_nodeid); |
7295 | goto Exit; |
7296 | } |
7297 | |
7298 | if (g_clustering.last_join_request_principal == src_nodeid) { |
7299 | // This node had requested the source principal for cluster membership |
7300 | // which was rejected. Try and join a different cluster. |
7301 | |
7302 | // This join request should not be considered as pending, so reset the |
7303 | // join request sent time. |
7304 | g_clustering.last_join_request_sent_time = 0; |
7305 | g_clustering.last_join_request_principal = 0; |
7306 | clustering_join_request_attempt(); |
7307 | } |
7308 | |
7309 | Exit: |
7310 | CLUSTERING_UNLOCK(); |
7311 | } |
7312 | |
7313 | /** |
* Handle an incoming merge move command. Basically this node switches to the
* orphan state and sends a join request to the principal listed in the merge
* move.
7316 | */ |
7317 | static void |
7318 | clustering_merge_move_handle(as_clustering_internal_event* event) |
7319 | { |
7320 | cf_node src_nodeid = event->msg_src_nodeid; |
7321 | |
7322 | DEBUG("received cluster merge move from node %" PRIx64, src_nodeid); |
7323 | |
7324 | CLUSTERING_LOCK(); |
7325 | |
7326 | as_cluster_key msg_cluster_key = 0; |
7327 | msg_cluster_key_get(event->msg, &msg_cluster_key); |
7328 | |
7329 | if (clustering_is_orphan()) { |
// Already an orphan node. Ignore the move command.
7331 | INFO( |
7332 | "already orphan node - ignoring merge move command from node %" PRIx64, |
7333 | src_nodeid); |
7334 | goto Exit; |
7335 | } |
7336 | |
7337 | if (msg_is_obsolete(g_register.cluster_modified_hlc_ts, |
7338 | g_register.cluster_modified_time, event->msg_recvd_ts, |
7339 | &event->msg_hlc_ts) || !clustering_is_our_principal(src_nodeid) |
7340 | || paxos_proposer_proposal_is_active() |
7341 | || msg_cluster_key != g_register.cluster_key) { |
7342 | INFO("ignoring cluster merge move from node %" PRIx64, src_nodeid); |
7343 | goto Exit; |
7344 | } |
7345 | |
// The Madril simulation blacklists the current principal so that we do not
// end up joining it again immediately. However, the check for obsolete data
// should make that check from Madril redundant.
7349 | cf_node new_principal = 0; |
7350 | |
7351 | if (msg_proposed_principal_get(event->msg, &new_principal) != 0) { |
7352 | // Move command does not have the proposed principal |
7353 | WARNING( |
7354 | "received merge move command without a proposed principal. Will join the first available principal" ); |
7355 | new_principal = 0; |
7356 | } |
7357 | |
7358 | // Switch to orphan cluster state so that we move to the new principal. |
7359 | register_become_orphan (AS_CLUSTERING_ATTEMPTING_MERGE); |
7360 | |
// Send a join request to the new principal.
7362 | clustering_principal_join_request_attempt(new_principal); |
7363 | Exit: |
7364 | CLUSTERING_UNLOCK(); |
7365 | } |
7366 | |
7367 | /** |
7368 | * Handle an incoming message. |
7369 | */ |
7370 | static void |
7371 | clustering_msg_event_handle(as_clustering_internal_event* msg_event) |
7372 | { |
7373 | // Delegate handling based on message type. |
7374 | switch (msg_event->msg_type) { |
7375 | case AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST: |
7376 | clustering_join_request_handle(msg_event); |
7377 | break; |
7378 | case AS_CLUSTERING_MSG_TYPE_JOIN_REJECT: |
7379 | clustering_join_reject_handle(msg_event); |
7380 | break; |
7381 | case AS_CLUSTERING_MSG_TYPE_MERGE_MOVE: |
7382 | clustering_merge_move_handle(msg_event); |
7383 | break; |
7384 | default: // Non cluster management messages. |
7385 | break; |
7386 | } |
7387 | } |
7388 | |
7389 | /** |
* Fabric msg listener that generates an internal message event and dispatches
* it to the subsystem.
7392 | */ |
7393 | static int |
7394 | clustering_fabric_msg_listener(cf_node msg_src_nodeid, msg* msg, void* udata) |
7395 | { |
7396 | if (!clustering_is_running()) { |
7397 | // Ignore fabric messages when clustering is not running. |
7398 | WARNING("clustering stopped - ignoring message from node %" PRIx64, |
7399 | msg_src_nodeid); |
7400 | goto Exit; |
7401 | } |
7402 | |
7403 | // Sanity check. |
7404 | if (!clustering_message_sanity_check(msg_src_nodeid, msg)) { |
7405 | WARNING("invalid mesage received from node %" PRIx64, msg_src_nodeid); |
7406 | goto Exit; |
7407 | } |
7408 | |
7409 | as_clustering_internal_event msg_event; |
7410 | memset(&msg_event, 0, sizeof(msg_event)); |
7411 | msg_event.type = AS_CLUSTERING_INTERNAL_EVENT_MSG; |
7412 | |
7413 | msg_event.msg_src_nodeid = msg_src_nodeid; |
7414 | |
// Update the HLC and store the message timestamp for the event.
7416 | as_hlc_timestamp send_ts = 0; |
7417 | msg_send_ts_get(msg, &send_ts); |
7418 | as_hlc_timestamp_update(msg_event.msg_src_nodeid, send_ts, |
7419 | &msg_event.msg_hlc_ts); |
7420 | |
7421 | msg_event.msg = msg; |
7422 | msg_event.msg_recvd_ts = cf_getms(); |
7423 | msg_type_get(msg, &msg_event.msg_type); |
7424 | |
7425 | internal_event_dispatch(&msg_event); |
7426 | |
7427 | Exit: |
7428 | as_fabric_msg_put(msg); |
7429 | return 0; |
7430 | } |
7431 | |
7432 | /** |
7433 | * Handle register cluster changed. |
7434 | */ |
7435 | static void |
7436 | clustering_register_cluster_changed_handle() |
7437 | { |
7438 | CLUSTERING_LOCK(); |
7439 | |
7440 | if (paxos_proposer_proposal_is_active()) { |
7441 | paxos_proposer_fail(); |
7442 | } |
7443 | |
7444 | if (clustering_is_principal()) { |
7445 | g_clustering.state = AS_CLUSTERING_STATE_PRINCIPAL; |
7446 | } |
7447 | else { |
7448 | g_clustering.state = AS_CLUSTERING_STATE_NON_PRINCIPAL; |
7449 | // We are a non-principal. Reject all pending join requests. |
7450 | clustering_join_requests_reject_all(); |
7451 | } |
7452 | |
7453 | g_clustering.preferred_principal = 0; |
7454 | g_clustering.last_join_request_principal = 0; |
7455 | g_clustering.move_cmd_issue_time = 0; |
7456 | |
7457 | CLUSTERING_UNLOCK(); |
7458 | } |
7459 | |
7460 | /** |
* Handle register synced events. Basically this means it is safe to publish
* the cluster changed event to external subsystems.
7463 | */ |
7464 | static void |
7465 | clustering_register_cluster_synced_handle(as_clustering_internal_event* event) |
7466 | { |
7467 | CLUSTERING_LOCK(); |
7468 | |
7469 | // Queue the cluster change event for publishing. |
7470 | as_clustering_event cluster_change_event; |
7471 | cluster_change_event.type = AS_CLUSTERING_CLUSTER_CHANGED; |
7472 | cluster_change_event.qualifier = event->qualifier; |
7473 | cluster_change_event.cluster_key = g_register.cluster_key; |
7474 | cluster_change_event.succession_list = &g_register.succession_list; |
7475 | external_event_queue(&cluster_change_event); |
7476 | |
7477 | g_clustering.has_integrity = true; |
7478 | |
7479 | CLUSTERING_UNLOCK(); |
7480 | } |
7481 | |
7482 | /** |
7483 | * Handle the register going to orphaned state. |
7484 | */ |
7485 | static void |
7486 | clustering_register_orphaned_handle(as_clustering_internal_event* event) |
7487 | { |
7488 | CLUSTERING_LOCK(); |
7489 | g_clustering.state = AS_CLUSTERING_STATE_ORPHAN; |
7490 | g_clustering.orphan_state_start_time = cf_getms(); |
7491 | g_clustering.preferred_principal = 0; |
7492 | |
7493 | // Queue the cluster change event for publishing. |
7494 | as_clustering_event orphaned_event; |
7495 | orphaned_event.type = AS_CLUSTERING_ORPHANED; |
7496 | orphaned_event.qualifier = event->qualifier; |
7497 | orphaned_event.cluster_key = 0; |
7498 | orphaned_event.succession_list = NULL; |
7499 | external_event_queue(&orphaned_event); |
7500 | CLUSTERING_UNLOCK(); |
7501 | } |
7502 | |
7503 | /** |
* Handle hb plugin data change by dispatching it based on the clustering state.
7505 | */ |
7506 | static void |
7507 | clustering_hb_plugin_data_changed_event_handle( |
7508 | as_clustering_internal_event* change_event) |
7509 | { |
7510 | CLUSTERING_LOCK(); |
7511 | switch (g_clustering.state) { |
7512 | case AS_CLUSTERING_STATE_NON_PRINCIPAL: |
7513 | clustering_non_principal_hb_plugin_data_changed_handle(change_event); |
7514 | break; |
7515 | default: |
7516 | break; |
7517 | } |
7518 | CLUSTERING_UNLOCK(); |
7519 | } |
7520 | |
7521 | /** |
7522 | * Handle heartbeat event. |
7523 | */ |
7524 | static void |
7525 | clustering_hb_event_handle(as_clustering_internal_event* hb_event) |
7526 | { |
7527 | for (int i = 0; i < hb_event->hb_n_events; i++) { |
7528 | if (hb_event->hb_events[i].evt == AS_HB_NODE_DEPART |
7529 | && clustering_is_our_principal(hb_event->hb_events[i].nodeid)) { |
7530 | // Our principal is no longer visible. |
7531 | INFO("principal node %" PRIx64" departed - switching to orphan state" , |
7532 | hb_event->hb_events[i].nodeid); |
7533 | register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); |
7534 | } |
7535 | } |
7536 | } |
7537 | |
7538 | /** |
7539 | * Handle the fail of a paxos proposal started by the self node. |
7540 | */ |
7541 | static void |
7542 | clustering_paxos_proposer_fail_handle() |
7543 | { |
7544 | // Send reject to all pending join requesters. |
7545 | clustering_join_requests_reject_all(); |
7546 | } |
7547 | |
7548 | /** |
7549 | * Clustering module event handler. |
7550 | */ |
7551 | static void |
7552 | clustering_event_handle(as_clustering_internal_event* event) |
7553 | { |
// Lock to ensure the entire event handling is atomic and that parallel
// events (hb/fabric) do not interfere.
7556 | CLUSTERING_LOCK(); |
7557 | |
7558 | switch (event->type) { |
7559 | case AS_CLUSTERING_INTERNAL_EVENT_TIMER: |
7560 | clustering_timer_event_handle(); |
7561 | break; |
7562 | case AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START: |
7563 | clustering_quantum_interval_start_handle(event); |
7564 | break; |
7565 | case AS_CLUSTERING_INTERNAL_EVENT_HB: |
7566 | clustering_hb_event_handle(event); |
7567 | break; |
7568 | case AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED: |
7569 | clustering_hb_plugin_data_changed_event_handle(event); |
7570 | break; |
7571 | case AS_CLUSTERING_INTERNAL_EVENT_MSG: |
7572 | clustering_msg_event_handle(event); |
7573 | break; |
7574 | case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED: |
7575 | clustering_register_orphaned_handle(event); |
7576 | break; |
7577 | case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED: |
7578 | clustering_register_cluster_changed_handle(); |
7579 | break; |
7580 | case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED: |
7581 | clustering_register_cluster_synced_handle(event); |
7582 | break; |
7583 | case AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL: // Send reject message to all |
7584 | clustering_paxos_proposer_fail_handle(); |
7585 | break; |
7586 | default: // Not of interest for main clustering module. |
7587 | break; |
7588 | } |
7589 | |
7590 | CLUSTERING_UNLOCK(); |
7591 | } |
7592 | |
7593 | /** |
7594 | * Initialize the template to be used for clustering messages. |
7595 | */ |
7596 | static void |
7597 | clustering_msg_init() |
7598 | { |
7599 | // Register fabric clustering msg type with no processing function: |
7600 | // This permits getting / putting clustering msgs to be moderated via an |
7601 | // idle msg queue. |
7602 | as_fabric_register_msg_fn(M_TYPE_CLUSTERING, g_clustering_msg_template, |
7603 | sizeof(g_clustering_msg_template), AS_CLUSTERING_MSG_SCRATCH_SIZE, |
7604 | clustering_fabric_msg_listener, NULL); |
7605 | } |
7606 | |
7607 | /** |
* Change listener that dispatches a plugin data changed event, used to update
* the first change time in the current quantum.
7609 | */ |
7610 | static void |
7611 | clustering_hb_plugin_data_change_listener(cf_node changed_node_id) |
7612 | { |
7613 | if (!clustering_is_running()) { |
7614 | return; |
7615 | } |
7616 | |
7617 | DETAIL("cluster information change detected for node %" PRIx64, |
7618 | changed_node_id); |
7619 | |
7620 | as_hb_plugin_node_data plugin_data; |
7621 | as_clustering_internal_event change_event; |
7622 | memset(&change_event, 0, sizeof(change_event)); |
7623 | change_event.type = AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED; |
7624 | change_event.plugin_data_changed_nodeid = changed_node_id; |
7625 | change_event.plugin_data = &plugin_data; |
7626 | |
7627 | if (clustering_hb_plugin_data_get(changed_node_id, &plugin_data, |
7628 | &change_event.plugin_data_changed_hlc_ts, |
7629 | &change_event.plugin_data_changed_ts) != 0) { |
7630 | // Not possible. We should be able to read the plugin data that changed. |
7631 | return; |
7632 | } |
7633 | internal_event_dispatch(&change_event); |
7634 | } |
7635 | |
7636 | /** |
7637 | * Listen to external heartbeat event and dispatch an internal heartbeat event. |
7638 | */ |
7639 | static void |
7640 | clustering_hb_event_listener(int n_events, as_hb_event_node* hb_node_events, |
7641 | void* udata) |
7642 | { |
7643 | if (!clustering_is_running()) { |
7644 | return; |
7645 | } |
7646 | |
7647 | // Wrap the events in an internal event and dispatch. |
7648 | as_clustering_internal_event hb_event; |
7649 | memset(&hb_event, 0, sizeof(hb_event)); |
7650 | hb_event.type = AS_CLUSTERING_INTERNAL_EVENT_HB; |
7651 | hb_event.hb_n_events = n_events; |
7652 | hb_event.hb_events = hb_node_events; |
7653 | |
7654 | internal_event_dispatch(&hb_event); |
7655 | } |
7656 | |
7657 | /** |
* Reform the cluster with the same succession list. This would trigger the
* generation of new partition info and the cluster would get a new cluster key.
7660 | * |
7661 | * @return 0 if new clustering round started, 1 if not principal, -1 otherwise. |
7662 | */ |
7663 | static int |
7664 | clustering_cluster_reform() |
7665 | { |
7666 | int rv = -1; |
7667 | CLUSTERING_LOCK(); |
7668 | |
7669 | cf_vector* dead_nodes = vector_stack_lockless_create(cf_node); |
7670 | clustering_dead_nodes_find(dead_nodes); |
7671 | |
7672 | log_cf_node_vector("recluster: dead nodes - " , dead_nodes, |
7673 | cf_vector_size(dead_nodes) > 0 ? CF_INFO : CF_DEBUG); |
7674 | |
7675 | cf_vector* faulty_nodes = vector_stack_lockless_create(cf_node); |
7676 | clustering_faulty_nodes_find(faulty_nodes); |
7677 | |
7678 | log_cf_node_vector("recluster: faulty nodes - " , faulty_nodes, |
7679 | cf_vector_size(faulty_nodes) > 0 ? CF_INFO : CF_DEBUG); |
7680 | |
7681 | cf_vector* new_nodes = vector_stack_lockless_create(cf_node); |
7682 | clustering_nodes_to_add_get(new_nodes); |
7683 | log_cf_node_vector("recluster: pending join requests - " , new_nodes, |
7684 | cf_vector_size(new_nodes) > 0 ? CF_INFO : CF_DEBUG); |
7685 | |
7686 | if (!clustering_is_running() || !clustering_is_principal() |
7687 | || cf_vector_size(dead_nodes) > 0 |
7688 | || cf_vector_size(faulty_nodes) > 0 |
7689 | || cf_vector_size(new_nodes) > 0) { |
7690 | INFO( |
7691 | "recluster: skipped - principal %s dead_nodes %d faulty_nodes %d new_nodes %d" , |
7692 | clustering_is_principal() ? "true" : "false" , |
7693 | cf_vector_size(dead_nodes), cf_vector_size(faulty_nodes), |
7694 | cf_vector_size(new_nodes)); |
7695 | |
7696 | if (!clustering_is_principal()) { |
7697 | // Common case - command will likely be sent to all nodes. |
7698 | rv = 1; |
7699 | } |
7700 | |
7701 | goto Exit; |
7702 | } |
7703 | |
7704 | cf_vector* succession_list = vector_stack_lockless_create(cf_node); |
7705 | vector_copy(succession_list, &g_register.succession_list); |
7706 | |
7707 | log_cf_node_vector( |
7708 | "recluster: principal node - reforming new cluster with succession list:" , |
7709 | succession_list, CF_INFO); |
7710 | |
7711 | as_paxos_start_result result = paxos_proposer_proposal_start( |
7712 | succession_list, succession_list); |
7713 | |
7714 | // Log paxos result. |
7715 | paxos_result_log(result, succession_list); |
7716 | |
7717 | rv = (result == AS_PAXOS_RESULT_STARTED) ? 0 : -1; |
7718 | |
7719 | if (rv == -1) { |
7720 | INFO("recluster: skipped" ); |
7721 | } |
7722 | else { |
7723 | INFO("recluster: triggered..." ); |
7724 | } |
7725 | |
7726 | cf_vector_destroy(succession_list); |
7727 | |
7728 | Exit: |
7729 | cf_vector_destroy(dead_nodes); |
7730 | cf_vector_destroy(faulty_nodes); |
7731 | cf_vector_destroy(new_nodes); |
7732 | CLUSTERING_UNLOCK(); |
7733 | return rv; |
7734 | } |
7735 | |
7736 | /** |
7737 | * Initialize clustering subsystem. |
7738 | */ |
7739 | static void |
7740 | clustering_init() |
7741 | { |
7742 | if (clustering_is_initialized()) { |
7743 | return; |
7744 | } |
7745 | |
7746 | CLUSTERING_LOCK(); |
7747 | memset(&g_clustering, 0, sizeof(g_clustering)); |
7748 | |
7749 | // Start out as an orphan cluster. |
7750 | g_clustering.state = AS_CLUSTERING_STATE_ORPHAN; |
7751 | g_clustering.orphan_state_start_time = cf_getms(); |
7752 | |
7753 | g_clustering.join_request_blackout = cf_shash_create(cf_nodeid_shash_fn, |
7754 | sizeof(cf_node), sizeof(cf_clock), |
7755 | AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, 0); |
7756 | |
7757 | vector_lockless_init(&g_clustering.pending_join_requests, cf_node); |
7758 | |
7759 | // Register as a plugin with the heartbeat subsystem. |
7760 | as_hb_plugin clustering_plugin; |
7761 | memset(&clustering_plugin, 0, sizeof(clustering_plugin)); |
7762 | |
7763 | clustering_plugin.id = AS_HB_PLUGIN_CLUSTERING; |
7764 | // Includes the size for the protocol version, the cluster key, the paxos |
7765 | // sequence number for current cluster and the preferred principal. |
7766 | clustering_plugin.wire_size_fixed = sizeof(uint32_t) |
7767 | + sizeof(as_cluster_key) + sizeof(as_paxos_sequence_number) |
7768 | + sizeof(cf_node); |
7769 | // Size of the node in succession list. |
7770 | clustering_plugin.wire_size_per_node = sizeof(cf_node); |
7771 | clustering_plugin.set_fn = clustering_hb_plugin_set_fn; |
7772 | clustering_plugin.parse_fn = clustering_hb_plugin_parse_data_fn; |
7773 | clustering_plugin.change_listener = |
7774 | clustering_hb_plugin_data_change_listener; |
7775 | |
7776 | as_hb_plugin_register(&clustering_plugin); |
7777 | |
7778 | // Register as hb event listener |
7779 | as_hb_register_listener(clustering_hb_event_listener, NULL); |
7780 | |
7781 | // Initialize fabric message pool. |
7782 | clustering_msg_init(); |
7783 | |
7784 | // Initialize external event publisher. |
7785 | external_event_publisher_init(); |
7786 | |
7787 | // Initialize the register. |
7788 | register_init(); |
7789 | |
7790 | // Initialize timer. |
7791 | timer_init(); |
7792 | |
7793 | // Initialize the quantum interval generator |
7794 | quantum_interval_generator_init(); |
7795 | |
7796 | // Initialize paxos. |
7797 | paxos_init(); |
7798 | |
7799 | g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED; |
7800 | |
7801 | DETAIL("clustering module initialized" ); |
7802 | |
7803 | CLUSTERING_UNLOCK(); |
7804 | } |
7805 | |
7806 | /** |
7807 | * Start the clustering sub-system. |
7808 | */ |
7809 | static void |
7810 | clustering_start() |
7811 | { |
7812 | if (clustering_is_running()) { |
7813 | return; |
7814 | } |
7815 | |
7816 | CLUSTERING_LOCK(); |
7817 | g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_RUNNING; |
7818 | CLUSTERING_UNLOCK(); |
7819 | |
7820 | // Start quantum interval generator. |
7821 | quantum_interval_generator_start(); |
7822 | |
7823 | // Start the timer. |
7824 | timer_start(); |
7825 | |
7826 | // Start the external event publisher. |
7827 | external_event_publisher_start(); |
7828 | } |
7829 | |
7830 | /** |
7831 | * Stop the clustering sub-system. |
7832 | */ |
7833 | static void |
7834 | clustering_stop() |
7835 | { |
7836 | if (!clustering_is_running()) { |
7837 | return; |
7838 | } |
7839 | |
7840 | CLUSTERING_LOCK(); |
7841 | g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN; |
7842 | CLUSTERING_UNLOCK(); |
7843 | |
7844 | // Stop the timer. |
7845 | timer_stop(); |
7846 | |
7847 | // Stop the external event publisher. |
7848 | external_event_publisher_stop(); |
7849 | |
7850 | CLUSTERING_LOCK(); |
7851 | g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED; |
7852 | CLUSTERING_UNLOCK(); |
7853 | } |
7854 | |
7855 | /** |
7856 | * Dump clustering state to logs. |
7857 | */ |
7858 | static void |
7859 | clustering_dump(bool verbose) |
7860 | { |
7861 | if (!clustering_is_running()) { |
7862 | INFO("CL: stopped" ); |
7863 | return; |
7864 | } |
7865 | |
7866 | paxos_proposer_dump(verbose); |
7867 | paxos_acceptor_dump(verbose); |
7868 | register_dump(verbose); |
7869 | |
7870 | CLUSTERING_LOCK(); |
7871 | |
7872 | switch (g_clustering.state) { |
7873 | case AS_CLUSTERING_STATE_ORPHAN: |
7874 | INFO("CL: state: orphan" ); |
7875 | break; |
7876 | case AS_CLUSTERING_STATE_PRINCIPAL: |
7877 | INFO("CL: state: principal" ); |
7878 | break; |
7879 | case AS_CLUSTERING_STATE_NON_PRINCIPAL: |
7880 | INFO("CL: state: non-principal" ); |
7881 | break; |
7882 | } |
7883 | |
7884 | INFO("CL: %s" , |
7885 | g_clustering.has_integrity ? "has integrity" : "integrity fault" ); |
cf_node current_principal = 0;
if (clustering_principal_get(&current_principal) != 0) {
7888 | if (g_clustering.preferred_principal != current_principal) { |
7889 | INFO("CL: preferred principal %" PRIx64, |
7890 | g_clustering.preferred_principal); |
7891 | } |
7892 | } |
7893 | |
7894 | if (g_clustering.state == AS_CLUSTERING_STATE_ORPHAN) { |
7895 | INFO("CL: join request sent to principal %" PRIx64, |
7896 | g_clustering.last_join_request_principal); |
7897 | INFO("CL: join request sent time: %" PRIu64" now: %" PRIu64 , |
7898 | g_clustering.last_join_request_sent_time, cf_getms()); |
7899 | } |
7900 | |
7901 | if (verbose) { |
7902 | log_cf_node_vector("CL: pending join requests:" , |
7903 | &g_clustering.pending_join_requests, CF_INFO); |
7904 | } |
7905 | |
7906 | CLUSTERING_UNLOCK(); |
7907 | } |
7908 | |
7909 | /* |
7910 | * ---------------------------------------------------------------------------- |
7911 | * Internal event dispatcher |
7912 | * ---------------------------------------------------------------------------- |
7913 | */ |
7914 | |
7915 | /** |
7916 | * Simple dispatcher for events. The order of dispatch is from lower (less |
7917 | * dependent) to higher (more dependent) sub-modules. |
7918 | */ |
7919 | static void |
7920 | internal_event_dispatch(as_clustering_internal_event* event) |
7921 | { |
7922 | // Sub-module dispatch. |
7923 | quantum_interval_generator_event_dispatch(event); |
7924 | paxos_event_dispatch(event); |
7925 | register_event_dispatch(event); |
7926 | |
7927 | // Dispatch to the main clustering module. |
7928 | clustering_event_handle(event); |
7929 | } |
7930 | |
7931 | /* |
7932 | * ---------------------------------------------------------------------------- |
7933 | * Public API. |
7934 | * ---------------------------------------------------------------------------- |
7935 | */ |
7936 | |
7937 | /** |
7938 | * |
7939 | * Initialize clustering subsystem. |
7940 | */ |
7941 | void |
7942 | as_clustering_init() |
7943 | { |
7944 | clustering_init(); |
7945 | } |
7946 | |
7947 | /** |
7948 | * Start clustering subsystem. |
7949 | */ |
7950 | void |
7951 | as_clustering_start() |
7952 | { |
7953 | clustering_start(); |
7954 | } |
7955 | |
7956 | /** |
7957 | * Stop clustering subsystem. |
7958 | */ |
7959 | void |
7960 | as_clustering_stop() |
7961 | { |
7962 | clustering_stop(); |
7963 | } |
7964 | |
7965 | /** |
* Reform the cluster with the same succession list. This triggers the
* generation of new partition info, and the cluster gets a new cluster key.
7968 | * |
7969 | * @return 0 if new clustering round started, -1 otherwise. |
7970 | */ |
7971 | int |
7972 | as_clustering_cluster_reform() |
7973 | { |
7974 | return clustering_cluster_reform(); |
7975 | } |
7976 | |
7977 | /** |
7978 | * Return the quantum interval, i.e., the interval at which cluster change |
7979 | * decisions are taken. The unit is milliseconds. |
7980 | */ |
7981 | uint64_t |
7982 | as_clustering_quantum_interval() |
7983 | { |
7984 | return quantum_interval(); |
7985 | } |
7986 | |
7987 | /** |
7988 | * TEMPORARY - used by paxos only. |
7989 | */ |
7990 | void |
7991 | as_clustering_set_integrity(bool has_integrity) |
7992 | { |
7993 | g_clustering.has_integrity = has_integrity; |
7994 | } |
7995 | |
7996 | /* |
7997 | * ---------------------------------------------------------------------------- |
7998 | * Clustering info command functions. |
7999 | * ---------------------------------------------------------------------------- |
8000 | */ |
8001 | |
8002 | /** |
* Returns false if this node is orphaned or is undergoing a cluster
* change.
8005 | */ |
8006 | bool |
8007 | as_clustering_has_integrity() |
8008 | { |
8009 | return g_clustering.has_integrity; |
8010 | } |
8011 | |
8012 | /** |
8013 | * Indicates if self node is orphaned. |
8014 | */ |
8015 | bool |
8016 | as_clustering_is_orphan() |
8017 | { |
8018 | return clustering_is_orphan(); |
8019 | } |
8020 | |
8021 | /** |
8022 | * Dump clustering state to the log. |
8023 | */ |
8024 | void |
8025 | as_clustering_dump(bool verbose) |
8026 | { |
8027 | clustering_dump(verbose); |
8028 | } |
8029 | |
8030 | /** |
8031 | * Set the min cluster size. |
8032 | */ |
8033 | int |
8034 | as_clustering_cluster_size_min_set(uint32_t new_cluster_size_min) |
8035 | { |
8036 | CLUSTERING_LOCK(); |
8037 | int rv = 0; |
8038 | uint32_t cluster_size = cf_vector_size(&g_register.succession_list); |
8039 | if (clustering_is_orphan() || cluster_size >= new_cluster_size_min) { |
8040 | INFO("changing value of min-cluster-size from %u to %u" , |
8041 | g_config.clustering_config.cluster_size_min, |
8042 | new_cluster_size_min); |
8043 | g_config.clustering_config.cluster_size_min = new_cluster_size_min; |
8044 | } |
8045 | else { |
WARNING(
"min-cluster-size %u should be <= current cluster size %u - ignoring",
new_cluster_size_min, cluster_size);
8049 | rv = -1; |
8050 | } |
8051 | CLUSTERING_UNLOCK(); |
8052 | return rv; |
8053 | } |
8054 | |
8055 | /** |
* Log a vector of node-ids at input severity, splitting long vectors over
* multiple lines. The call is not safe if the vector is not protected
* against multi-threaded access.
8059 | * |
* @param severity the log severity.
* @param context the logging context.
8062 | * @param file_name the source file name for the log line. |
8063 | * @param line the source file line number for the log line. |
8064 | * @param message the message prefix for each log line. Message and node list |
8065 | * will be separated with a space. Can be NULL for no prefix. |
8066 | * @param nodes the vector of nodes. |
8067 | */ |
8068 | void |
8069 | as_clustering_cf_node_vector_event(cf_fault_severity severity, |
8070 | cf_fault_context context, char* file_name, int line, char* message, |
8071 | cf_vector* nodes) |
8072 | { |
8073 | as_clustering_cf_node_array_event(severity, context, file_name, line, |
8074 | message, vector_to_array(nodes), cf_vector_size(nodes)); |
8075 | } |
8076 | |
8077 | /** |
* Log an array of node-ids at input severity, splitting long arrays over
* multiple lines. The call is not safe if the array is not protected
* against multi-threaded access.
8081 | * |
* @param severity the log severity.
* @param context the logging context.
8084 | * @param file_name the source file name for the log line. |
8085 | * @param line the source file line number for the log line. |
8086 | * @param message the message prefix for each log line. Message and node list |
8087 | * will be separated with a space. Can be NULL for no prefix. |
8088 | * @param nodes the array of nodes. |
8089 | * @param node_count the count of nodes in the array. |
8090 | */ |
8091 | void |
8092 | as_clustering_cf_node_array_event(cf_fault_severity severity, |
8093 | cf_fault_context context, char* file_name, int line, char* message, |
8094 | cf_node* nodes, int node_count) |
8095 | { |
8096 | if (!cf_context_at_severity(context, severity) && severity != CF_DETAIL) { |
8097 | return; |
8098 | } |
8099 | |
// Two hex characters per byte of the node id, plus the trailing space.
8101 | int node_str_len = 2 * (sizeof(cf_node)) + 1; |
8102 | |
8103 | int message_length = 0; |
8104 | char copied_message[LOG_LENGTH_MAX()]; |
8105 | |
8106 | if (message) { |
// Limit the message length so that at least one node fits in the log
// line, accounting for the separator between message and node list.
message_length = MIN(strnlen(message, LOG_LENGTH_MAX() - 1),
LOG_LENGTH_MAX() - 1 - node_str_len) + 1;

// Truncate the message, ensuring NULL termination even when strncpy
// hits the length limit.
strncpy(copied_message, message, message_length);
copied_message[message_length] = 0;
8114 | message = copied_message; |
8115 | } |
8116 | |
8117 | // Allow for the NULL terminator. |
8118 | int nodes_per_line = (LOG_LENGTH_MAX() - message_length - 1) / node_str_len; |
8119 | nodes_per_line = MAX(1, nodes_per_line); |
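// As a worked example (assuming, for illustration, LOG_LENGTH_MAX() is 1024
// and the message plus separator is 21 characters): node_str_len is 17
// (16 hex digits plus a space), so nodes_per_line = (1024 - 21 - 1) / 17 = 58.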
8120 | |
// Have a buffer large enough to accommodate the message and the nodes per line.
8122 | char log_buffer[message_length + (nodes_per_line * node_str_len) + 1]; // For the NULL terminator. |
8123 | int output_node_count = 0; |
8124 | |
8125 | // Marks the start of the nodeid list in the log line buffer. |
8126 | char* node_buffer_start = log_buffer; |
8127 | if (message) { |
node_buffer_start += sprintf(log_buffer, "%s ", message);
8129 | } |
8130 | |
8131 | for (int i = 0; i < node_count;) { |
8132 | char* buffer = node_buffer_start; |
8133 | |
8134 | for (int j = 0; j < nodes_per_line && i < node_count; j++) { |
buffer += sprintf(buffer, "%" PRIx64" ", nodes[i]);
8136 | output_node_count++; |
8137 | i++; |
8138 | } |
8139 | |
// Overwrite the space after the last node on the log line, but only if
// at least one node was output.
if (buffer != node_buffer_start) {
*(buffer - 1) = 0;
cf_fault_event(context, severity, file_name, line, "%s",
log_buffer);
}
8146 | } |
8147 | } |
8148 | |
8149 | // Handle the empty vector case. |
8150 | if (output_node_count == 0) { |
sprintf(node_buffer_start, "(empty)");
cf_fault_event(context, severity, file_name, line, "%s", log_buffer);
8153 | } |
8154 | } |
8155 | |