1 | /* |
2 | * as.c |
3 | * |
4 | * Copyright (C) 2008-2017 Aerospike, Inc. |
5 | * |
6 | * Portions may be licensed to Aerospike, Inc. under one or more contributor |
7 | * license agreements. |
8 | * |
9 | * This program is free software: you can redistribute it and/or modify it under |
10 | * the terms of the GNU Affero General Public License as published by the Free |
11 | * Software Foundation, either version 3 of the License, or (at your option) any |
12 | * later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, but WITHOUT |
15 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
16 | * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
17 | * details. |
18 | * |
19 | * You should have received a copy of the GNU Affero General Public License |
20 | * along with this program. If not, see http://www.gnu.org/licenses/ |
21 | */ |
22 | |
23 | #include <errno.h> |
24 | #include <fcntl.h> |
25 | #include <getopt.h> |
26 | #include <pthread.h> |
27 | #include <stdbool.h> |
28 | #include <stddef.h> |
29 | #include <stdint.h> |
30 | #include <stdio.h> |
31 | #include <stdlib.h> |
32 | #include <string.h> |
33 | #include <syscall.h> |
34 | #include <unistd.h> |
35 | #include <sys/stat.h> |
36 | |
37 | #include "citrusleaf/alloc.h" |
38 | |
39 | #include "cf_thread.h" |
40 | #include "daemon.h" |
41 | #include "dns.h" |
42 | #include "fault.h" |
43 | #include "hardware.h" |
44 | #include "tls.h" |
45 | |
46 | #include "base/batch.h" |
47 | #include "base/cfg.h" |
48 | #include "base/datamodel.h" |
49 | #include "base/health.h" |
50 | #include "base/index.h" |
51 | #include "base/json_init.h" |
52 | #include "base/monitor.h" |
53 | #include "base/nsup.h" |
54 | #include "base/scan.h" |
55 | #include "base/secondary_index.h" |
56 | #include "base/security.h" |
57 | #include "base/service.h" |
58 | #include "base/smd.h" |
59 | #include "base/stats.h" |
60 | #include "base/thr_info.h" |
61 | #include "base/thr_info_port.h" |
62 | #include "base/thr_sindex.h" |
63 | #include "base/ticker.h" |
64 | #include "base/xdr_serverside.h" |
65 | #include "fabric/clustering.h" |
66 | #include "fabric/exchange.h" |
67 | #include "fabric/fabric.h" |
68 | #include "fabric/hb.h" |
69 | #include "fabric/migrate.h" |
70 | #include "fabric/skew_monitor.h" |
71 | #include "storage/storage.h" |
72 | #include "transaction/proxy.h" |
73 | #include "transaction/rw_request_hash.h" |
74 | #include "transaction/udf.h" |
75 | |
76 | |
77 | //========================================================== |
78 | // Constants. |
79 | // |
80 | |
// Build identification strings. Defined in version.c, which is generated by
// make. Printed by the --version option and in the startup log banner.
extern const char aerospike_build_type[];
extern const char aerospike_build_id[];
84 | |
// Long-form command line options accepted by asd. The 'val' of each entry is
// the code that getopt_long() hands back to the option switch in main(). The
// table ends with the all-zero sentinel that getopt_long() requires.
static const struct option CMD_OPTS[] = {
	{ .name = "help",        .has_arg = no_argument,       .flag = NULL, .val = 'h' },
	{ .name = "version",     .has_arg = no_argument,       .flag = NULL, .val = 'v' },
	{ .name = "config-file", .has_arg = required_argument, .flag = NULL, .val = 'f' },
	{ .name = "foreground",  .has_arg = no_argument,       .flag = NULL, .val = 'd' },
	{ .name = "fgdaemon",    .has_arg = no_argument,       .flag = NULL, .val = 'F' },
	{ .name = "cold-start",  .has_arg = no_argument,       .flag = NULL, .val = 'c' },
	{ .name = "instance",    .has_arg = required_argument, .flag = NULL, .val = 'n' },
	{ .name = NULL,          .has_arg = 0,                 .flag = NULL, .val = 0 }
};
96 | |
// Full help text, printed to stdout by the --help option. Each literal below
// is one output line; a trailing "\n\n" ends a paragraph.
static const char HELP[] =
	// Preamble.
	"\n"
	"Aerospike server installation installs the script /etc/init.d/aerospike which\n"
	"is normally used to start and stop the server. The script is also found as\n"
	"as/etc/init-script in the source tree.\n\n"

	// Options that print something and exit.
	"asd informative command-line options:\n\n"
	"--help\n"
	"Print this message and exit.\n\n"
	"--version\n"
	"Print edition and build version information and exit.\n\n"

	// Options that affect how the server runs.
	"asd runtime command-line options:\n\n"
	"--config-file <file>\n"
	"Specify the location of the Aerospike server config file. If this option is not\n"
	"specified, the default location /etc/aerospike/aerospike.conf is used.\n\n"
	"--foreground\n"
	"Specify that Aerospike not be daemonized. This is useful for running Aerospike\n"
	"in gdb. Alternatively, add 'run-as-daemon false' in the service context of the\n"
	"Aerospike config file.\n\n"
	"--fgdaemon\n"
	"Specify that Aerospike is to be run as a \"new-style\" (foreground) daemon. This\n"
	"is useful for running Aerospike under systemd or Docker.\n\n"
	"--cold-start\n"
	"(Enterprise edition only.) At startup, force the Aerospike server to read all\n"
	"records from storage devices to rebuild the index.\n\n"
	"--instance <0-15>\n"
	"(Enterprise edition only.) If running multiple instances of Aerospike on one\n"
	"machine (not recommended), each instance must be uniquely designated via this\n"
	"option.\n";
142 | |
// Short usage summary, printed to stderr when the command line can't be parsed.
static const char USAGE[] =
	"\n"
	"asd informative command-line options:\n"
	"[--help]\n"
	"[--version]\n"
	"\n"
	"asd runtime command-line options:\n"
	// All runtime options appear on a single output line.
	"[--config-file <file>] [--foreground] [--fgdaemon] [--cold-start] [--instance <0-15>]\n";
156 | |
// Config file path used when --config-file is not given. main() compares its
// config file pointer against this array's address to decide whether a
// duplicated path needs freeing, so this must remain a single named object.
static const char DEFAULT_CONFIG_FILE[] = "/etc/aerospike/aerospike.conf";

// Suffix appended to the configured work directory to form the system metadata
// directory path that is validated at startup. Kept as an array so that
// sizeof() yields its length including the terminating NUL.
static const char SMD_DIR_NAME[] = "/smd";
160 | |
161 | |
162 | //========================================================== |
163 | // Globals. |
164 | // |
165 | |
// Mutex that main() locks twice in order to park itself once startup is done.
// Not cf_mutex, which won't tolerate unlock if already unlocked.
pthread_mutex_t g_main_deadlock = PTHREAD_MUTEX_INITIALIZER;

// Set by main() just before it parks on g_main_deadlock.
bool g_startup_complete = false;
// Set by main() as soon as it is released from g_main_deadlock to shut down.
bool g_shutdown_started = false;
171 | |
172 | |
173 | //========================================================== |
174 | // Forward declarations. |
175 | // |
176 | |
// signal.c doesn't have header file.
extern void as_signal_setup();

// Writes this process's PID to the given file - no-op if pidfile is NULL.
static void write_pidfile(char *pidfile);
// Fails startup unless path exists and is a directory - log_tag names it in
// the failure message.
static void validate_directory(const char *path, const char *log_tag);
// Applies validate_directory() to the "/smd" subdirectory of the work
// directory.
static void validate_smd_directory();
183 | |
184 | |
185 | //========================================================== |
186 | // Aerospike server entry point. |
187 | // |
188 | |
189 | int |
190 | main(int argc, char **argv) |
191 | { |
192 | g_start_sec = cf_get_seconds(); |
193 | |
194 | // Initialize cf_thread wrapper. |
195 | cf_thread_init(); |
196 | |
197 | // Initialize memory allocation. |
198 | cf_alloc_init(); |
199 | |
200 | // Initialize fault management framework. |
201 | cf_fault_init(); |
202 | |
203 | // Setup signal handlers. |
204 | as_signal_setup(); |
205 | |
206 | // Initialize TLS library. |
207 | tls_check_init(); |
208 | |
209 | int opt; |
210 | int opt_i; |
211 | const char *config_file = DEFAULT_CONFIG_FILE; |
212 | bool run_in_foreground = false; |
213 | bool new_style_daemon = false; |
214 | bool cold_start_cmd = false; |
215 | uint32_t instance = 0; |
216 | |
217 | // Parse command line options. |
218 | while ((opt = getopt_long(argc, argv, "" , CMD_OPTS, &opt_i)) != -1) { |
219 | switch (opt) { |
220 | case 'h': |
221 | // printf() since we want stdout and don't want cf_fault's prefix. |
222 | printf("%s\n" , HELP); |
223 | return 0; |
224 | case 'v': |
225 | // printf() since we want stdout and don't want cf_fault's prefix. |
226 | printf("%s build %s\n" , aerospike_build_type, aerospike_build_id); |
227 | return 0; |
228 | case 'f': |
229 | config_file = cf_strdup(optarg); |
230 | break; |
231 | case 'F': |
232 | // As a "new-style" daemon(*), asd runs in the foreground and |
233 | // ignores the following configuration items: |
234 | // - user ('user') |
235 | // - group ('group') |
236 | // - PID file ('pidfile') |
237 | // |
238 | // If ignoring configuration items, or if the 'console' sink is not |
239 | // specified, warnings will appear in stderr. |
240 | // |
241 | // (*) http://0pointer.de/public/systemd-man/daemon.html#New-Style%20Daemons |
242 | run_in_foreground = true; |
243 | new_style_daemon = true; |
244 | break; |
245 | case 'd': |
246 | run_in_foreground = true; |
247 | break; |
248 | case 'c': |
249 | cold_start_cmd = true; |
250 | break; |
251 | case 'n': |
252 | instance = (uint32_t)strtol(optarg, NULL, 0); |
253 | break; |
254 | default: |
255 | // fprintf() since we don't want cf_fault's prefix. |
256 | fprintf(stderr, "%s\n" , USAGE); |
257 | return 1; |
258 | } |
259 | } |
260 | |
261 | // Set all fields in the global runtime configuration instance. This parses |
262 | // the configuration file, and creates as_namespace objects. (Return value |
263 | // is a shortcut pointer to the global runtime configuration instance.) |
264 | as_config *c = as_config_init(config_file); |
265 | |
266 | // Detect NUMA topology and, if requested, prepare for CPU and NUMA pinning. |
267 | cf_topo_config(c->auto_pin, (cf_topo_numa_node_index)instance, |
268 | &c->service.bind); |
269 | |
270 | // Perform privilege separation as necessary. If configured user & group |
271 | // don't have root privileges, all resources created or reopened past this |
272 | // point must be set up so that they are accessible without root privileges. |
273 | // If not, the process will self-terminate with (hopefully!) a log message |
274 | // indicating which resource is not set up properly. |
275 | cf_process_privsep(c->uid, c->gid); |
276 | |
277 | // |
278 | // All resources such as files, devices, and shared memory must be created |
279 | // or reopened below this line! (The configuration file is the only thing |
280 | // that must be opened above, in order to parse the user & group.) |
281 | //========================================================================== |
282 | |
283 | // A "new-style" daemon expects console logging to be configured. (If not, |
284 | // log messages won't be seen via the standard path.) |
285 | if (new_style_daemon) { |
286 | if (! cf_fault_console_is_held()) { |
287 | cf_warning(AS_AS, "in new-style daemon mode, console logging is not configured" ); |
288 | } |
289 | } |
290 | |
291 | // Activate log sinks. Up to this point, 'cf_' log output goes to stderr, |
292 | // filtered according to NO_SINKS_LIMIT in fault.c. After this point, 'cf_' |
293 | // log output will appear in all log file sinks specified in configuration, |
294 | // with specified filtering. If console sink is specified in configuration, |
295 | // 'cf_' log output will continue going to stderr, but filtering will switch |
296 | // from NO_SINKS_LIMIT to that specified in console sink configuration. |
297 | if (0 != cf_fault_sink_activate_all_held()) { |
298 | // Specifics of failure are logged in cf_fault_sink_activate_all_held(). |
299 | cf_crash_nostack(AS_AS, "can't open log sink(s)" ); |
300 | } |
301 | |
302 | // Daemonize asd if specified. After daemonization, output to stderr will no |
303 | // longer appear in terminal. Instead, check /tmp/aerospike-console.<pid> |
304 | // for console output. |
305 | if (! run_in_foreground && c->run_as_daemon) { |
306 | // Don't close any open files when daemonizing. At this point only log |
307 | // sink files are open - instruct cf_process_daemonize() to ignore them. |
308 | int open_fds[CF_FAULT_SINKS_MAX]; |
309 | int num_open_fds = cf_fault_sink_get_fd_list(open_fds); |
310 | |
311 | cf_process_daemonize(open_fds, num_open_fds); |
312 | } |
313 | |
314 | // Log which build this is - should be the first line in the log file. |
315 | cf_info(AS_AS, "<><><><><><><><><><> %s build %s <><><><><><><><><><>" , |
316 | aerospike_build_type, aerospike_build_id); |
317 | |
318 | // Includes echoing the configuration file to log. |
319 | as_config_post_process(c, config_file); |
320 | |
321 | xdr_config_post_process(); |
322 | |
323 | // If we allocated a non-default config file name, free it. |
324 | if (config_file != DEFAULT_CONFIG_FILE) { |
325 | cf_free((void*)config_file); |
326 | } |
327 | |
328 | // Write the pid file, if specified. |
329 | if (! new_style_daemon) { |
330 | write_pidfile(c->pidfile); |
331 | } |
332 | else { |
333 | if (c->pidfile) { |
334 | cf_warning(AS_AS, "will not write PID file in new-style daemon mode" ); |
335 | } |
336 | } |
337 | |
338 | // Check that required directories are set up properly. |
339 | validate_directory(c->work_directory, "work" ); |
340 | validate_directory(c->mod_lua.user_path, "Lua user" ); |
341 | validate_smd_directory(); |
342 | |
343 | // Initialize subsystems. At this point we're allocating local resources, |
344 | // starting worker threads, etc. (But no communication with other server |
345 | // nodes or clients yet.) |
346 | |
347 | as_json_init(); // Jansson JSON API used by System Metadata |
348 | as_index_tree_gc_init(); // thread to purge dropped index trees |
349 | as_sindex_thr_init(); // defrag secondary index (ok during population) |
350 | as_nsup_init(); // load previous evict-void-time(s) |
351 | |
352 | // Initialize namespaces. Each namespace decides here whether it will do a |
353 | // warm or cold start. Index arenas, partition structures and index tree |
354 | // structures are initialized. Secondary index system metadata is restored. |
355 | as_namespaces_init(cold_start_cmd, instance); |
356 | |
357 | // Initialize the storage system. For warm and cool restarts, this includes |
358 | // fully resuming persisted indexes - this may take a few minutes. |
359 | as_storage_init(); |
360 | |
361 | // Migrate memory to correct NUMA node (includes resumed index arenas). |
362 | cf_topo_migrate_memory(); |
363 | |
364 | // Drop capabilities that we kept only for initialization. |
365 | cf_process_drop_startup_caps(); |
366 | |
367 | // Activate the storage system. For cold starts and cool restarts, this |
368 | // includes full drive scans - this may take several hours. The defrag |
369 | // subsystem starts operating at the end of this call. |
370 | as_storage_load(); |
371 | |
372 | // Populate all secondary indexes. This may block for a long time. |
373 | as_sindex_boot_populateall(); |
374 | |
375 | cf_info(AS_AS, "initializing services..." ); |
376 | |
377 | cf_dns_init(); // DNS resolver |
378 | as_netio_init(); // query responses |
379 | as_security_init(); // security features |
380 | as_service_init(); // server may process internal transactions |
381 | as_hb_init(); // inter-node heartbeat |
382 | as_skew_monitor_init(); // clock skew monitor |
383 | as_fabric_init(); // inter-node communications |
384 | as_exchange_init(); // initialize the cluster exchange subsystem |
385 | as_clustering_init(); // clustering-v5 start |
386 | as_info_init(); // info transaction handling |
387 | as_migrate_init(); // move data between nodes |
388 | as_proxy_init(); // do work on behalf of others |
389 | as_rw_init(); // read & write service |
390 | as_query_init(); // query transaction handling |
391 | as_udf_init(); // user-defined functions |
392 | as_scan_init(); // scan a namespace or set |
393 | as_batch_init(); // batch transaction handling |
394 | as_xdr_init(); // cross data-center replication |
395 | as_mon_init(); // monitor |
396 | |
397 | // Wait for enough available storage. We've been defragging all along, but |
398 | // here we wait until it's enough. This may block for a long time. |
399 | as_storage_wait_for_defrag(); |
400 | |
401 | // Start subsystems. At this point we may begin communicating with other |
402 | // cluster nodes, and ultimately with clients. |
403 | |
404 | as_smd_start(); // enables receiving cluster state change events |
405 | as_health_start(); // starts before fabric and hb to capture them |
406 | as_fabric_start(); // may send & receive fabric messages |
407 | as_xdr_start(); // XDR should start before it joins other nodes |
408 | as_hb_start(); // start inter-node heartbeat |
409 | as_exchange_start(); // start the cluster exchange subsystem |
410 | as_clustering_start(); // clustering-v5 start |
411 | as_nsup_start(); // may send evict-void-time(s) to other nodes |
412 | as_service_start(); // server will now receive client transactions |
413 | as_info_port_start(); // server will now receive info transactions |
414 | as_ticker_start(); // only after everything else is started |
415 | |
416 | // Relevant for enterprise edition only. |
417 | as_storage_start_tomb_raider(); |
418 | |
419 | // Log a service-ready message. |
420 | cf_info(AS_AS, "service ready: soon there will be cake!" ); |
421 | |
422 | //-------------------------------------------- |
423 | // Startup is done. This thread will now wait |
424 | // quietly for a shutdown signal. |
425 | // |
426 | |
427 | // Stop this thread from finishing. Intentionally deadlocking on a mutex is |
428 | // a remarkably efficient way to do this. |
429 | pthread_mutex_lock(&g_main_deadlock); |
430 | g_startup_complete = true; |
431 | pthread_mutex_lock(&g_main_deadlock); |
432 | |
433 | // When the service is running, you are here (deadlocked) - the signals that |
434 | // stop the service (yes, these signals always occur in this thread) will |
435 | // unlock the mutex, allowing us to continue. |
436 | |
437 | g_shutdown_started = true; |
438 | pthread_mutex_unlock(&g_main_deadlock); |
439 | pthread_mutex_destroy(&g_main_deadlock); |
440 | |
441 | //-------------------------------------------- |
442 | // Received a shutdown signal. |
443 | // |
444 | |
445 | cf_info(AS_AS, "initiating clean shutdown ..." ); |
446 | |
447 | as_storage_shutdown(instance); |
448 | as_xdr_shutdown(); |
449 | |
450 | cf_info(AS_AS, "finished clean shutdown - exiting" ); |
451 | |
452 | // If shutdown was totally clean (all threads joined) we could just return, |
453 | // but for now we exit to make sure all threads die. |
454 | #ifdef DOPROFILE |
455 | exit(0); // exit(0) so profile build actually dumps gmon.out |
456 | #else |
457 | _exit(0); |
458 | #endif |
459 | |
460 | return 0; |
461 | } |
462 | |
463 | |
464 | //========================================================== |
465 | // Local helpers. |
466 | // |
467 | |
468 | static void |
469 | write_pidfile(char *pidfile) |
470 | { |
471 | if (! pidfile) { |
472 | // If there's no pid file specified in the config file, just move on. |
473 | return; |
474 | } |
475 | |
476 | // Note - the directory the pid file is in must already exist. |
477 | |
478 | remove(pidfile); |
479 | |
480 | int pid_fd = open(pidfile, O_CREAT | O_RDWR, |
481 | S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH); |
482 | |
483 | if (pid_fd < 0) { |
484 | cf_crash_nostack(AS_AS, "failed to open pid file %s: %s" , pidfile, |
485 | cf_strerror(errno)); |
486 | } |
487 | |
488 | char pidstr[16]; |
489 | sprintf(pidstr, "%u\n" , (uint32_t)getpid()); |
490 | |
491 | // If we can't access this resource, just log a warning and continue - |
492 | // it is not critical to the process. |
493 | if (write(pid_fd, pidstr, strlen(pidstr)) == -1) { |
494 | cf_warning(AS_AS, "failed write to pid file %s: %s" , pidfile, |
495 | cf_strerror(errno)); |
496 | } |
497 | |
498 | close(pid_fd); |
499 | } |
500 | |
501 | static void |
502 | validate_directory(const char *path, const char *log_tag) |
503 | { |
504 | struct stat buf; |
505 | |
506 | if (stat(path, &buf) != 0) { |
507 | cf_crash_nostack(AS_AS, "%s directory '%s' is not set up properly: %s" , |
508 | log_tag, path, cf_strerror(errno)); |
509 | } |
510 | else if (! S_ISDIR(buf.st_mode)) { |
511 | cf_crash_nostack(AS_AS, "%s directory '%s' is not set up properly: Not a directory" , |
512 | log_tag, path); |
513 | } |
514 | } |
515 | |
516 | static void |
517 | validate_smd_directory() |
518 | { |
519 | size_t len = strlen(g_config.work_directory); |
520 | char smd_path[len + sizeof(SMD_DIR_NAME)]; |
521 | |
522 | strcpy(smd_path, g_config.work_directory); |
523 | strcpy(smd_path + len, SMD_DIR_NAME); |
524 | validate_directory(smd_path, "system metadata" ); |
525 | } |
526 | |