1/*
2 * hlc.c
3 *
4 * Copyright (C) 2008-2016 Aerospike, Inc.
5 *
6 * Portions may be licensed to Aerospike, Inc. under one or more contributor
7 * license agreements.
8 *
9 * This program is free software: you can redistribute it and/or modify it under
10 * the terms of the GNU Affero General Public License as published by the Free
11 * Software Foundation, either version 3 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU Affero General Public License
20 * along with this program. If not, see http://www.gnu.org/licenses/
21 */
22
23#include "fabric/hlc.h"
24
25#include <math.h>
26#include <sys/param.h> // For MAX() and MIN().
27
28#include "citrusleaf/cf_clock.h"
29#include "citrusleaf/cf_atomic.h"
30
31#include "fault.h"
32
33#include "base/cfg.h"
34
35/*
36 * Overview
37 * ========
38 * Hybrid logical clock as described in
39 * "Logical Physical Clocks and Consistent Snapshots in Globally Distributed
40 * Databases" available at http://www.cse.buffalo.edu/tech-reports/2014-04.pdf.
41 *
42 * Relies on a global 64 bit variable that has the logical time.
43 * The 48 MSBs include the physical component of the timestamp and the least
44 * significant 16 bits include the logical component. 48 bits for milliseconds
45 * since epoch gives us (8925 - years elapsed since epoch today) years before
46 * wrap around.
47 *
48 * The notion of HLC is to bound the skew between the logical clock and phsycial
49 * clock. This requires rejecting updates to the clock from nodes with large
50 * clock skews. We DO NOT do that yet and print a warning instead. The current
51 * envisioned usage is a global monotonically increasing timestamp. Should be
52 * fixed if we are to use it as a surrogate for wall clock.
53 *
54 * Guarantees
55 * ==========
56 * 1. Monotonically increasing. (Wraps around after ~8900 years). Service
57 * restarts might break the monotonicity, however the new clock will leapfrog
58 * the hlc value before the restart eventually.
59 * 2. as_hlc_timestamp_update call after every message receipt will ensure the
60 * message (send hlc ts) < (message receive hlc ts).
61 * 3. A fixed local timestamp will eventually be marked as happened before a
62 * remote message. This is an important requirement. For example, in paxos the
63 * local cluster change timestamp should have happened before some incoming
64 * heartbeat. The ordering system should not always return a
65 * AS_HLC_ORDER_INDETERMINATE for a fixed local timestamp and a new message
66 * received.
67 *
68 * Not guaranteed (requires hlc persistence across service restarts)
69 * ==============
70 * 1. On service restart the HLC clock will not start where it left off, however
71 * it will eventually leapfrog the older value. Fixing this requires persistence
72 * which is not implemented. eventually leapfrogging is alright for all current
73 * requirements.
74 * 2. If a as_hlc_msg_timestamp is persisted and compared with a current running
75 * value, the result may not be correct.
76 *
77 *
78 * Requirements
79 * ============
80 * Subsystems that reply on hlc should have their network messages timestamped
81 * with hlc timestamps and should invoke the as_hlc_timestamp_update on receipt
82 * of every message. This will ensure the hlc are in sync across the cluster and
83 * (send hlc ts) < (message receive hlc ts).
84 */
85
86/**
87 * Global timestamp with current hlc value.
88 */
89static as_hlc_timestamp g_now;
90
91/**
92 * Previous value of the physical component.
93 */
94static cf_atomic64 g_prev_physical_component;
95
96/**
97 * Previous value of the wall clock, when the physical component changed.
98 */
99static cf_atomic64 g_prev_wall_clock;
100
101/*
102 * ----------------------------------------------------------------------------
103 * Globals.
104 * ----------------------------------------------------------------------------
105 */
106/**
107 * Mask for the physical component of a hlc timestamp.
108 */
109#define PHYSICAL_TS_MASK 0xffffffffffff0000
110
111/**
112 * Mask for logical component of a hls timestamp.
113 */
114#define LOGICAL_TS_MASK 0x000000000000ffff
115
116/**
117 * Inform when HLC jumps by more than this amount.
118 */
119#define HLC_JUMP_WARN 5000
120
121/**
122 * Logging macros.
123 */
124#define CRASH(format, ...) cf_crash(AS_HLC, format, ##__VA_ARGS__)
125#define WARNING(format, ...) cf_warning(AS_HLC, format, ##__VA_ARGS__)
126#define INFO(format, ...) cf_info(AS_HLC, format, ##__VA_ARGS__)
127#define DEBUG(format, ...) cf_debug(AS_HLC, format, ##__VA_ARGS__)
128#define DETAIL(format, ...) cf_detail(AS_HLC, format, ##__VA_ARGS__)
129#define ASSERT(expression, message, ...) \
130if (!(expression)) {WARNING(message, __VA_ARGS__);}
131
132/*
133 * ----------------------------------------------------------------------------
134 * Forward declarations.
135 * ----------------------------------------------------------------------------
136 */
137static cf_clock
138hlc_wall_clock_get();
139static as_hlc_timestamp
140hlc_ts_get();
141static bool
142hlc_ts_set(as_hlc_timestamp old_value, as_hlc_timestamp new_value,
143 cf_node source);
144static cf_clock
145hlc_physical_ts_get(as_hlc_timestamp hlc_ts);
146static uint16_t
147hlc_logical_ts_get(as_hlc_timestamp hlc_ts);
148static void
149hlc_physical_ts_set(as_hlc_timestamp* hlc_ts, cf_clock physical_ts);
150static void
151hlc_physical_ts_on_set(cf_clock physical_ts, cf_clock wall_clock_now);
152static void
153hlc_logical_ts_set(as_hlc_timestamp* hlc_ts, uint16_t logical_ts);
154static void
155hlc_logical_ts_incr(uint16_t* logical_ts, cf_clock* physical_ts,
156 cf_clock wall_clock_now);
157
158/*
159 * ----------------------------------------------------------------------------
160 * Public API.
161 * ----------------------------------------------------------------------------
162 */
163/**
164 * Initialize hybrid logical clock.
165 */
166void
167as_hlc_init()
168{
169 g_now = 0;
170 g_prev_physical_component = 0;
171 g_prev_wall_clock = 0;
172}
173
174/**
175 * Return the physical component of a hlc timstamp
176 * @param hlc_ts the hybrid logical clock timestamp.
177 */
178cf_clock
179as_hlc_physical_ts_get(as_hlc_timestamp hlc_ts)
180{
181 return hlc_physical_ts_get(hlc_ts);
182}
183
184/**
185 * Return a hlc timestamp representing the hlc time "now". The notion is to make
186 * the minimum increment to the hlc timestamp necessary.
187 */
188as_hlc_timestamp
189as_hlc_timestamp_now()
190{
191 // Keep trying till an atomic operation succeeds. Looks like a tight loop
192 // but even with reasonable contention should not take more then a few
193 // iterations to succeed.
194 while (true) {
195 as_hlc_timestamp current_hlc_ts = hlc_ts_get();
196
197 // Initialize the new physical and logical values to current values.
198 cf_clock new_hlc_physical_ts = hlc_physical_ts_get(current_hlc_ts);
199 uint16_t new_hlc_logical_ts = hlc_logical_ts_get(current_hlc_ts);
200
201 cf_clock wall_clock_physical_ts = hlc_wall_clock_get();
202
203 if (new_hlc_physical_ts >= wall_clock_physical_ts) {
204 // The HLC physical component is greater than the physical wall
205 // time. Advance the logical timestamp.
206 hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
207 wall_clock_physical_ts);
208 }
209 else {
210 // The wall clock is greater, use this as the physical component and
211 // reset the logical timestamp.
212 new_hlc_physical_ts = wall_clock_physical_ts;
213 new_hlc_logical_ts = 0;
214 }
215
216 as_hlc_timestamp new_hlc_ts = 0;
217
218 hlc_physical_ts_set(&new_hlc_ts, new_hlc_physical_ts);
219 hlc_logical_ts_set(&new_hlc_ts, new_hlc_logical_ts);
220
221 if (hlc_ts_set(current_hlc_ts, new_hlc_ts, g_config.self_node)) {
222 hlc_physical_ts_on_set(new_hlc_physical_ts, wall_clock_physical_ts);
223 DETAIL("changed HLC value from %" PRIu64 " to %" PRIu64,
224 current_hlc_ts, new_hlc_ts);
225 return new_hlc_ts;
226 }
227 }
228}
229
230/**
231 * Update the HLC on receipt of a remote message. The notion is to adjust this
232 * node's hlc to ensure the receive hlc ts > the send hlc ts.
233 *
234 * @param source for debugging and tracking only.
235 * @param send_timestamp the hlc timestamp when this message was sent.
236 * @param recv_timestamp (output) the message receive timestamp which will be
237 * populated. Can be NULL in which case it will be ignored.
238 */
239void
240as_hlc_timestamp_update(cf_node source, as_hlc_timestamp send_ts,
241 as_hlc_msg_timestamp* msg_ts)
242{
243 cf_clock send_ts_physical_ts = hlc_physical_ts_get(send_ts);
244 uint16_t send_ts_logical_ts = hlc_logical_ts_get(send_ts);
245
246 // Keep trying till an atomic operation succeeds. Looks like a tight loop
247 // but even with reasonable contention should not take more then a few
248 // iterations to succeed.
249 while (true) {
250 as_hlc_timestamp current_hlc_ts = hlc_ts_get();
251
252 cf_clock current_hlc_physical_ts = hlc_physical_ts_get(current_hlc_ts);
253 uint16_t current_hlc_logical_ts = hlc_logical_ts_get(current_hlc_ts);
254
255 cf_clock wall_clock_physical_ts = hlc_wall_clock_get();
256
257 cf_clock new_hlc_physical_ts = MAX(
258 MAX(current_hlc_physical_ts, send_ts_physical_ts),
259 wall_clock_physical_ts);
260 uint16_t new_hlc_logical_ts = 0;
261
262 if (new_hlc_physical_ts == current_hlc_physical_ts
263 && new_hlc_physical_ts == send_ts_physical_ts) {
264 // There is no change in the physical components of peer and local
265 // hlc clocks. Set logical component to max of the two values and
266 // increment.
267 new_hlc_logical_ts = MAX(current_hlc_logical_ts,
268 send_ts_logical_ts);
269 hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
270 wall_clock_physical_ts);
271 }
272 else if (new_hlc_physical_ts == current_hlc_physical_ts) {
273 // The physical component of the send timestamp is smaller than our
274 // current physical component. We just need to increment the logical
275 // component.
276 new_hlc_logical_ts = current_hlc_ts;
277 hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
278 wall_clock_physical_ts);
279 }
280 else if (new_hlc_physical_ts == send_ts_physical_ts) {
281 // Current physical component is lesser than the incoming physical
282 // component. We need to ensure that the updated logical component
283 // is greater than the send logical component.
284 new_hlc_logical_ts = send_ts_logical_ts;
285 hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts,
286 wall_clock_physical_ts);
287 }
288 else {
289 // Our physical clock is greater than current physical component and
290 // the send physical component. We can reset the logical clock to
291 // zero and still maintain the send and receive ordering.
292 new_hlc_logical_ts = 0;
293 }
294
295 as_hlc_timestamp new_hlc_ts = 0;
296
297 hlc_physical_ts_set(&new_hlc_ts, new_hlc_physical_ts);
298 hlc_logical_ts_set(&new_hlc_ts, new_hlc_logical_ts);
299
300 if (hlc_ts_set(current_hlc_ts, new_hlc_ts, source)) {
301 hlc_physical_ts_on_set(new_hlc_physical_ts, wall_clock_physical_ts);
302 DETAIL("message received from node %" PRIx64 " with HLC %" PRIu64 " - changed HLC value from %" PRIu64 " to %" PRIu64,
303 source, send_ts, current_hlc_ts, new_hlc_ts);
304 if (msg_ts) {
305 msg_ts->send_ts = send_ts;
306 msg_ts->recv_ts = new_hlc_ts;
307 }
308 return;
309 }
310 }
311}
312
313/**
314 * Return the difference in milliseconds between two hlc timestamps. Note this
315 * difference may be greater than or equal to (but never less than)
316 * the physical wall call difference, because HLC can have non linear jumps,
317 * whenever the clock is adjusted. The difference should be used as an estimate
318 * rather than an absolute difference.
319 * For e.g. use the difference to check that the real time difference is most
320 * some number of milliseconds. However do not use this for interval statistics
321 * or to check if the difference in time is at least some number of
322 * milliseconds.
323 *
324 * @param ts1 the first timestamp.
325 * @param ts2 the seconds timestamp.
326 * @return ts1 - ts2 in milliseconds. if ts1 < ts2 the result is negative,
327 * else it is positive or zero.
328 */
329int64_t
330as_hlc_timestamp_diff_ms(as_hlc_timestamp ts1, as_hlc_timestamp ts2)
331{
332 int64_t diff = 0;
333 if (ts1 >= ts2) {
334 diff = hlc_physical_ts_get(ts1) - hlc_physical_ts_get(ts2);
335 }
336 else {
337 diff = -(hlc_physical_ts_get(ts2) - hlc_physical_ts_get(ts1));
338 }
339
340 return diff;
341}
342
343/**
344 * Orders a local timestamp and remote message send timestamp.
345 *
346 * @param local_ts the local timestamp.
347 * @param msg_ts message receive timestamp containing the remote send and the
348 * local receive timestamp.
349 * @return the order between the local and the message timestamp.
350 */
351as_hlc_timestamp_order
352as_hlc_send_timestamp_order(as_hlc_timestamp local_ts,
353 as_hlc_msg_timestamp* msg_ts)
354{
355 if (local_ts > msg_ts->recv_ts) {
356 // The local event happened after the local message received timestamp
357 // and therefore after the remote send as well.
358 return AS_HLC_HAPPENS_AFTER;
359 }
360
361 // Compute the unceratinty window around the local receive timestamp.
362 uint64_t offset = abs(msg_ts->send_ts - msg_ts->recv_ts);
363
364 if (local_ts > (msg_ts->recv_ts - offset)) {
365 // Local timestamp is in the uncertainty window. We cannot tell the
366 // order.
367 return AS_HLC_ORDER_INDETERMINATE;
368 }
369
370 cf_clock local_physical_ts = hlc_physical_ts_get(local_ts);
371 cf_clock recv_physical_ts = hlc_physical_ts_get(msg_ts->recv_ts);
372
373 if ((recv_physical_ts - local_physical_ts)
374 < g_config.fabric_latency_max_ms) {
375 // Consider the max network delay worth of time to also be part of the
376 // uncertainty window.
377 return AS_HLC_ORDER_INDETERMINATE;
378 }
379
380 return AS_HLC_HAPPENS_BEFORE;
381}
382
383/**
384 * Orders two timestamp generated by the same node / process.
385 *
386 * @param ts1 the first timestamp.
387 * @param ts2 the second timestamp.
388 * @return AS_HLC_HAPPENS_BEFORE if ts1 happens before ts2 else
389 * AS_HLC_HAPPENS_AFTER if ts1 happens after ts2 else
390 * AS_HLC_ORDER_INDETERMINATE.
391 */
392as_hlc_timestamp_order
393as_hlc_timestamp_order_get(as_hlc_timestamp ts1, as_hlc_timestamp ts2)
394{
395 if (ts1 < ts2) {
396 return AS_HLC_HAPPENS_BEFORE;
397 }
398 else if (ts1 > ts2) {
399 return AS_HLC_HAPPENS_AFTER;
400 }
401
402 return AS_HLC_ORDER_INDETERMINATE;
403}
404
405/**
406 * Subtract milliseconds worth of time from the timestamp.
407 * @param timestamp the input timestamp.
408 * @param ms the number of milliseconds to subtract.
409 */
410as_hlc_timestamp
411as_hlc_timestamp_subtract_ms(as_hlc_timestamp timestamp, int ms)
412{
413 cf_clock physical_ts = hlc_physical_ts_get(timestamp);
414 uint16_t logical_ts = hlc_logical_ts_get(timestamp);
415 physical_ts -= ms;
416 as_hlc_timestamp new_hlc_ts = 0;
417
418 hlc_physical_ts_set(&new_hlc_ts, physical_ts);
419 hlc_logical_ts_set(&new_hlc_ts, logical_ts);
420 return new_hlc_ts;
421}
422
423/**
424 * Dump some debugging information to the logs.
425 */
426void
427as_hlc_dump(bool verbose)
428{
429 as_hlc_timestamp now = as_hlc_timestamp_now();
430 cf_clock current_hlc_physical_ts = hlc_physical_ts_get(now);
431 uint16_t current_hlc_logical_ts = hlc_logical_ts_get(now);
432
433 INFO("HLC Ts:%" PRIu64 " HLC Physical Ts:%" PRIu64 " HLC Logical Ts:%d Wall Clock:%" PRIu64,
434 now, current_hlc_physical_ts, current_hlc_logical_ts,
435 hlc_wall_clock_get());
436}
437
438/*
439 * ----------------------------------------------------------------------------
440 * Private functions.
441 * ----------------------------------------------------------------------------
442 */
443
444/**
445 * Return this node's wall clock.
446 */
447static cf_clock
448hlc_wall_clock_get()
449{
450 // Unix timestamps will be 48 bits for a reasonable future. We will use only
451 // 48 bits.
452 return cf_clock_getabsolute();
453}
454
455/**
456 * Return the physical component of a hlc timstamp
457 * @param hlc_ts the hybrid logical clock timestamp.
458 */
459static cf_clock
460hlc_physical_ts_get(as_hlc_timestamp hlc_ts)
461{
462 return hlc_ts >> 16;
463}
464
465/**
466 * Return the logical component of a hlc timstamp
467 * @param hlc_ts the hybrid logical clock timestamp.
468 */
469static uint16_t
470hlc_logical_ts_get(as_hlc_timestamp hlc_ts)
471{
472 return (uint16_t)(hlc_ts & LOGICAL_TS_MASK);
473}
474
475/**
476 * Set the physical component of a hlc timestamp. 16 LSBs of the input physical
477 * timestamp will be ignored.
478 * @param hlc_ts the timestamp
479 * @param physical_ts the physical timestamp whose value should be set into the
480 * hls timestamp.
481 */
482static void
483hlc_physical_ts_set(as_hlc_timestamp* hlc_ts, cf_clock physical_ts)
484{
485 *hlc_ts = (*hlc_ts & LOGICAL_TS_MASK) | (physical_ts << 16);
486}
487
488/**
489 * Handle setting updating the physical component of the hlc timestamp.
490 */
491static void
492hlc_physical_ts_on_set(cf_clock physical_ts, cf_clock wall_clock_now)
493{
494 if (g_prev_physical_component != physical_ts) {
495 g_prev_physical_component = physical_ts;
496 g_prev_wall_clock = wall_clock_now;
497 }
498}
499
500/**
501 * Increment the logical timestamp and deal with a wrap around by incrementing
502 * the physical timestamp and ensure physical component moves at least at the
503 * rate of the wall clock to ensure hlc can be used as a crude measure of time
504 * intervals.
505 */
506static void
507hlc_logical_ts_incr(uint16_t* logical_ts, cf_clock* physical_ts,
508 cf_clock wall_clock_now)
509{
510 (*logical_ts)++;
511 if (*logical_ts == 0) {
512 (*physical_ts)++;
513 }
514 cf_clock physical_component_diff = *physical_ts - g_prev_physical_component;
515 cf_clock wall_clock_diff =
516 (wall_clock_now > g_prev_wall_clock) ?
517 wall_clock_now - g_prev_wall_clock : 0;
518 if (physical_component_diff < wall_clock_diff) {
519 *physical_ts += wall_clock_diff - physical_component_diff;
520 }
521}
522
523/**
524 * Set the logical component of a hlc timestamp.
525 * @param hlc_ts the timestamp
526 * @param logical_ts the logical timestamp whose value should be set into the
527 * hls timestamp.
528 */
529static void
530hlc_logical_ts_set(as_hlc_timestamp* hlc_ts, uint16_t logical_ts)
531{
532 *hlc_ts = (*hlc_ts & PHYSICAL_TS_MASK) | (((uint64_t)logical_ts));
533}
534
535/**
536 * Get current value for the global timestamp atomically.
537 *
538 * @param new_value the new value for the global timestamp.
539 * @return true on successful set, false on failure to do an atomic set.
540 */
541static as_hlc_timestamp
542hlc_ts_get()
543{
544 return ck_pr_load_64(&g_now);
545}
546
547/**
548 * Set a new value for the global timestamp atomically.
549 *
550 * @param old_value the old value.
551 * @param new_value the new value for the global timestamp.
552 * @param source the source node that caused this jump update in HLC, self if we
553 * are advancing the clock, peer node if advance is caused on message receipt.
554 * @return true on successful set, false on failure to do an atomic set.
555 */
556static bool
557hlc_ts_set(as_hlc_timestamp old_value, as_hlc_timestamp new_value,
558 cf_node source)
559{
560 // Default to ck atomic check and set.
561 cf_clock jump = hlc_physical_ts_get(new_value)
562 - hlc_physical_ts_get(old_value);
563 if (jump > HLC_JUMP_WARN && old_value > 0) {
564 INFO("HLC jumped by %"PRIu64" ms cause:%"PRIx64" old:%"PRIu64" new:%"PRIu64, jump, source, old_value, new_value);
565 }
566 return ck_pr_cas_64(&g_now, old_value, new_value);
567}
568