/*
 * hardware.c
 *
 * Copyright (C) 2016-2017 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more contributor
 * license agreements.
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/
 */

#include "hardware.h"

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <libgen.h>
#include <limits.h>
#include <mntent.h>
#include <regex.h>
#include <sched.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/vfs.h>

#include <linux/capability.h>
#include <linux/ethtool.h>
#include <linux/if.h>
#include <linux/limits.h>
#include <linux/mempolicy.h>
#include <linux/sockios.h>

#include "cf_mutex.h"
#include "daemon.h"
#include "fault.h"
#include "shash.h"
#include "socket.h"

#include "citrusleaf/alloc.h"
#include "citrusleaf/cf_clock.h"

#include "warnings.h"

// Only available in Linux kernel version 3.19 and later; but we'd like to
// allow compilation with older kernel headers.
#if !defined SO_INCOMING_CPU
#define SO_INCOMING_CPU 49
#endif

// Only available in Linux kernel version 4.12 and later; but we'd like to
// allow compilation with older kernel headers.
#if !defined SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 56
#endif

// The linux/nvme_ioctl.h kernel header came in Linux 4.4, but we'd like to
// allow compilation with older kernel headers.
//
// Also, we need to be prepared for this IOCTL to fail with EINVAL, when we
// run on older kernels that don't support it.

#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd)
#define NVME_SC_INVALID_LOG_PAGE 0x109

struct nvme_admin_cmd {
	uint8_t opcode;
	uint8_t flags;
	uint16_t rsvd1;
	uint32_t nsid;
	uint32_t cdw2;
	uint32_t cdw3;
	uint64_t metadata;
	uint64_t addr;
	uint32_t metadata_len;
	uint32_t data_len;
	uint32_t cdw10;
	uint32_t cdw11;
	uint32_t cdw12;
	uint32_t cdw13;
	uint32_t cdw14;
	uint32_t cdw15;
	uint32_t timeout_ms;
	uint32_t result;
};
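
// The struct above mirrors struct nvme_admin_cmd from the kernel's
// linux/nvme_ioctl.h (Linux 4.4+). Its exact layout matters: _IOWR() encodes
// sizeof(struct nvme_admin_cmd) into the ioctl number, so any size mismatch
// would make our NVME_IOCTL_ADMIN_CMD differ from the kernel's.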

#define INVALID_INDEX ((uint16_t)-1)
#define POLICY_SCRIPT "/etc/aerospike/irqbalance-ban.sh"

#define MEM_PAGE_SIZE (4096L)

typedef enum {
	FILE_RES_OK,
	FILE_RES_NOT_FOUND,
	FILE_RES_ERROR
} file_res;

typedef enum {
	CHECK_PROC_PRESENT,
	CHECK_PROC_PRESENT_NO_ARG,
	CHECK_PROC_ABSENT
} check_proc_res;

typedef uint16_t os_numa_node_index;
typedef uint16_t os_package_index;
typedef uint16_t os_core_index;

typedef uint16_t irq_number;

typedef struct {
	uint16_t n_irqs;
	irq_number irqs[CPU_SETSIZE];
	uint16_t per_cpu;
} irq_list;
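
// per_cpu above says how many consecutive entries of irqs[] belong on the
// same CPU: 1 for NICs with combined RX/TX queue IRQs, 2 for NICs with
// separate RX and TX queue IRQs, which interface_irqs() below interleaves
// pairwise.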

static cpu_set_t g_os_cpus_online;
static cpu_set_t g_numa_node_os_cpus_online[CPU_SETSIZE];

static uint16_t g_n_numa_nodes;
static uint16_t g_n_cores;
static uint16_t g_n_os_cpus;
static uint16_t g_n_cpus;
static uint16_t g_n_irq_cpus;

static os_numa_node_index g_numa_node_index_to_os_numa_node_index[CPU_SETSIZE];
static cf_topo_os_cpu_index g_core_index_to_os_cpu_index[CPU_SETSIZE];
static cf_topo_os_cpu_index g_cpu_index_to_os_cpu_index[CPU_SETSIZE];
static cf_topo_cpu_index g_os_cpu_index_to_cpu_index[CPU_SETSIZE];

static cf_topo_numa_node_index g_i_numa_node;

#define DEVICE_PATH_SIZE 1024
#define DEVICE_NAME_SIZE 256

#define MAX_DEVICE_CHILDREN 100
#define MAX_DEVICE_SCHEDULERS 100

typedef struct dev_key_s {
	uint32_t major;
	uint32_t minor;
} dev_key_t;

typedef struct dev_node_s {
	uint32_t n_children;
	struct dev_node_s *children[MAX_DEVICE_CHILDREN];

	char name[DEVICE_NAME_SIZE];
	char dev_path[DEVICE_PATH_SIZE];

	char sys_home[DEVICE_PATH_SIZE];
	char sys_sched[DEVICE_PATH_SIZE];
} dev_node_t;

typedef struct path_data_s {
	cf_storage_device_info info;

	uint32_t n_sys_scheds;
	const char *sys_scheds[MAX_DEVICE_SCHEDULERS];

	cf_clock mod_time;
} path_data_t;

static cf_shash *g_dev_graph;

static cf_mutex g_path_data_lock = CF_MUTEX_INIT;
static cf_shash *g_path_data;

static file_res
read_file(const char *path, void *buff, size_t *limit)
{
	cf_detail(CF_HARDWARE, "reading file %s with buffer size %zu", path, *limit);
	int32_t fd = open(path, O_RDONLY);

	if (fd < 0) {
		if (errno == ENOENT) {
			cf_detail(CF_HARDWARE, "file %s not found", path);
			return FILE_RES_NOT_FOUND;
		}

		cf_warning(CF_HARDWARE, "error while opening file %s for reading: %d (%s)",
				path, errno, cf_strerror(errno));
		return FILE_RES_ERROR;
	}

	size_t total = 0;

	while (total < *limit) {
		cf_detail(CF_HARDWARE, "reading %zu byte(s) at offset %zu", *limit - total, total);
		ssize_t len = read(fd, (uint8_t *)buff + total, *limit - total);
		CF_NEVER_FAILS(len);

		if (len == 0) {
			cf_detail(CF_HARDWARE, "EOF");
			break;
		}

		total += (size_t)len;
	}

	cf_detail(CF_HARDWARE, "read %zu byte(s) from file %s", total, path);
	file_res res;

	if (total == *limit) {
		cf_warning(CF_HARDWARE, "read buffer too small for file %s", path);
		res = FILE_RES_ERROR;
	}
	else {
		res = FILE_RES_OK;
		*limit = total;
	}

	CF_NEVER_FAILS(close(fd));
	return res;
}

static file_res
write_file(const char *path, const void *buff, size_t limit)
{
	cf_detail(CF_HARDWARE, "writing file %s with buffer size %zu", path, limit);
	int32_t fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600);

	if (fd < 0) {
		if (errno == ENOENT) {
			cf_detail(CF_HARDWARE, "file %s not found", path);
			return FILE_RES_NOT_FOUND;
		}

		cf_warning(CF_HARDWARE, "error while opening file %s for writing: %d (%s)",
				path, errno, cf_strerror(errno));
		return FILE_RES_ERROR;
	}

	size_t total = 0;

	while (total < limit) {
		cf_detail(CF_HARDWARE, "writing %zu byte(s) at offset %zu", limit - total, total);
		ssize_t len = write(fd, (uint8_t *)buff + total, limit - total);

		if (len < 0) {
			cf_warning(CF_HARDWARE, "error while writing to file %s: %d (%s)",
					path, errno, cf_strerror(errno));
			CF_NEVER_FAILS(close(fd));
			return FILE_RES_ERROR;
		}

		total += (size_t)len;
	}

	cf_detail(CF_HARDWARE, "done writing");
	CF_NEVER_FAILS(close(fd));
	return FILE_RES_OK;
}

static void
write_file_safe(const char *path, const void *buff, size_t limit)
{
	if (write_file(path, buff, limit) != FILE_RES_OK) {
		cf_crash(CF_HARDWARE, "write failed unexpectedly");
	}
}

static DIR *
opendir_safe(const char *path)
{
	DIR *dir = opendir(path);

	if (dir == NULL) {
		cf_crash(CF_HARDWARE, "error while opening directory %s: %d (%s)",
				path, errno, cf_strerror(errno));
	}

	return dir;
}

static int32_t
readdir_safe(DIR *dir, struct dirent *ent)
{
	while (true) {
		errno = 0;
		struct dirent *tmp = readdir(dir);

		if (tmp == NULL) {
			if (errno != 0) {
				cf_crash(CF_HARDWARE, "error while reading directory: %d (%s)",
						errno, cf_strerror(errno));
			}

			return -1;
		}

		if (strcmp(tmp->d_name, ".") == 0 || strcmp(tmp->d_name, "..") == 0) {
			continue;
		}

		memcpy(ent, tmp, sizeof(struct dirent));
		return 0;
	}
}

static void
closedir_safe(DIR *dir)
{
	if (closedir(dir) < 0) {
		cf_crash(CF_HARDWARE, "error while closing PCI device directory: %d (%s)",
				errno, cf_strerror(errno));
	}
}

static bool
path_exists(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0) {
		if (errno == ENOENT) {
			cf_detail(CF_HARDWARE, "path %s does not exist", path);
			return false;
		}

		cf_crash(CF_HARDWARE, "error while checking for path %s: %d (%s)",
				path, errno, cf_strerror(errno));
	}

	cf_detail(CF_HARDWARE, "path %s exists", path);
	return true;
}

static bool
path_is_dir(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0) {
		cf_crash(CF_HARDWARE, "error while checking path %s: %d (%s)",
				path, errno, cf_strerror(errno));
	}

	bool is_dir = S_ISDIR(st.st_mode);

	cf_detail(CF_HARDWARE, "path %s is %s directory", path, is_dir ?
			"a" : "not a");

	return is_dir;
}

static bool
path_works(const char *path)
{
	int32_t fd = open(path, O_RDONLY);

	if (fd < 0) {
		if (errno == ENOENT || errno == EINVAL) {
			cf_detail(CF_HARDWARE, "path %s does not work (open): %d (%s)",
					path, errno, cf_strerror(errno));
			return false;
		}

		cf_crash(CF_HARDWARE, "error while verifying path %s (open): %d (%s)",
				path, errno, cf_strerror(errno));
	}

	uint8_t buff[1000];

	if (read(fd, buff, sizeof(buff)) < 0) {
		if (errno == EINVAL) {
			cf_detail(CF_HARDWARE, "path %s does not work (read): %d (%s)",
					path, errno, cf_strerror(errno));
			CF_NEVER_FAILS(close(fd));
			return false;
		}

		cf_crash(CF_HARDWARE, "error while verifying path %s (read): %d (%s)",
				path, errno, cf_strerror(errno));
	}

	cf_detail(CF_HARDWARE, "path %s works", path);
	CF_NEVER_FAILS(close(fd));
	return true;
}

static void
set_mempolicy_safe(uint32_t mode, uint64_t *node_mask, size_t max_node)
{
	if (syscall(__NR_set_mempolicy, mode, node_mask, max_node) < 0) {
		cf_crash(CF_HARDWARE, "set_mempolicy() system call failed: %d (%s)",
				errno, cf_strerror(errno));
	}
}

static void
migrate_pages_safe(pid_t pid, size_t max_node, uint64_t *from_mask, uint64_t *to_mask)
{
	int64_t res = syscall(__NR_migrate_pages, pid, max_node, from_mask, to_mask);

	if (res < 0) {
		cf_crash(CF_HARDWARE, "migrate_pages() syscall failed: %d (%s)",
				errno, cf_strerror(errno));
	}

	if (res > 0) {
		cf_warning(CF_HARDWARE, "could not NUMA-migrate %" PRId64 " page(s)", res);
	}
}

static void
mask_to_string(cpu_set_t *mask, char *buff, size_t limit)
{
	cf_topo_os_cpu_index max;

	for (max = CPU_SETSIZE - 1; max > 0; --max) {
		if (CPU_ISSET(max, mask)) {
			break;
		}
	}

	int32_t words = max / 32 + 1;
	size_t size = (size_t)words * 9;

	if (size > limit) {
		cf_crash(CF_HARDWARE, "CPU mask buffer overflow: %zu vs. %zu", size, limit);
	}

	for (int32_t i = words - 1; i >= 0; --i) {
		uint32_t val = 0;

		for (int32_t k = 0; k < 32; ++k) {
			if (CPU_ISSET((size_t)(i * 32 + k), mask)) {
				val |= 1u << k;
			}
		}

		snprintf(buff, limit, "%08x", val);

		if (i > 0) {
			buff[8] = ',';
		}

		buff += 9;
		limit -= 9;
	}
}
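
// For example, a mask with CPUs 0-3 and 8 set becomes "0000010f", and a mask
// with CPUs 0-32 set becomes "00000001,ffffffff" - the comma-separated,
// 32-bit-word hex format (most significant word first) that
// /proc/irq/<irq>/smp_affinity and the sysfs {rps,xps}_cpus files expect.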

static file_res
read_value(const char *path, int64_t *val)
{
	cf_detail(CF_HARDWARE, "reading value from file %s", path);

	char buff[100];
	size_t limit = sizeof(buff);
	file_res res = read_file(path, buff, &limit);

	if (res != FILE_RES_OK) {
		return res;
	}

	buff[limit - 1] = '\0';

	cf_detail(CF_HARDWARE, "parsing value \"%s\"", buff);

	char *end;
	int64_t x = strtol(buff, &end, 10);

	if (*end != '\0' || x >= CPU_SETSIZE) {
		cf_warning(CF_HARDWARE, "invalid value \"%s\" in %s", buff, path);
		return FILE_RES_ERROR;
	}

	*val = x;
	return FILE_RES_OK;
}

static file_res
read_index(const char *path, uint16_t *val)
{
	int64_t x;
	file_res res = read_value(path, &x);

	if (res != FILE_RES_OK) {
		return res;
	}

	if (x < 0) {
		cf_warning(CF_HARDWARE, "invalid index in %s", path);
		return FILE_RES_ERROR;
	}

	*val = (uint16_t)x;
	return FILE_RES_OK;
}

static file_res
read_numa_node(const char *path, cf_topo_numa_node_index *i_numa_node)
{
	int64_t x;
	file_res res = read_value(path, &x);

	if (res != FILE_RES_OK) {
		return res;
	}

	if (x < 0) {
		cf_detail(CF_HARDWARE, "no NUMA node in %s", path);
		return FILE_RES_ERROR;
	}

	*i_numa_node = (cf_topo_numa_node_index)x;
	return FILE_RES_OK;
}

static file_res
read_device_numbers(const char *path, uint32_t *major, uint32_t *minor)
{
	cf_detail(CF_HARDWARE, "reading device numbers from file %s", path);

	char buff[100];
	size_t limit = sizeof(buff);
	file_res res = read_file(path, buff, &limit);

	if (res != FILE_RES_OK) {
		return res;
	}

	buff[limit - 1] = '\0';

	cf_detail(CF_HARDWARE, "parsing device numbers \"%s\"", buff);

	if (sscanf(buff, "%u:%u\n", major, minor) != 2) {
		cf_warning(CF_HARDWARE, "invalid device numbers \"%s\" in %s", buff,
				path);
		return FILE_RES_ERROR;
	}

	return FILE_RES_OK;
}

static file_res
read_list(const char *path, cpu_set_t *mask)
{
	cf_detail(CF_HARDWARE, "reading list from file %s", path);
	char buff[1000];
	size_t limit = sizeof(buff);
	file_res res = read_file(path, buff, &limit);

	if (res != FILE_RES_OK) {
		return res;
	}

	buff[limit - 1] = '\0';
	cf_detail(CF_HARDWARE, "parsing list \"%s\"", buff);

	CPU_ZERO(mask);
	char *walker = buff;

	while (true) {
		char *delim;
		uint64_t from = strtoul(walker, &delim, 10);
		uint64_t thru;

		if (*delim == ',' || *delim == '\0') {
			thru = from;
		}
		else if (*delim == '-') {
			walker = delim + 1;
			thru = strtoul(walker, &delim, 10);
		}
		else {
			cf_warning(CF_HARDWARE, "invalid list \"%s\" in %s", buff, path);
			return FILE_RES_ERROR;
		}

		if (from >= CPU_SETSIZE || thru >= CPU_SETSIZE || from > thru) {
			cf_warning(CF_HARDWARE, "invalid list \"%s\" in %s", buff, path);
			return FILE_RES_ERROR;
		}

		cf_detail(CF_HARDWARE, "marking %d through %d", (int32_t)from, (int32_t)thru);

		for (size_t i = from; i <= thru; ++i) {
			CPU_SET(i, mask);
		}

		if (*delim == '\0') {
			break;
		}

		walker = delim + 1;
	}

	char buff2[1000];
	mask_to_string(mask, buff2, sizeof(buff2));
	cf_detail(CF_HARDWARE, "list \"%s\" -> mask %s", buff, buff2);

	return FILE_RES_OK;
}
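
// For example, the list "0-2,5" (the format found in
// /sys/devices/system/cpu/online) yields a mask with CPUs 0, 1, 2, and 5 set.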

static void
detect(cf_topo_numa_node_index a_numa_node)
{
	if (a_numa_node == INVALID_INDEX) {
		cf_detail(CF_HARDWARE, "detecting online CPUs");
	}
	else {
		cf_detail(CF_HARDWARE, "detecting online CPUs on NUMA node %hu", a_numa_node);
	}

	if (read_list("/sys/devices/system/cpu/online", &g_os_cpus_online) != FILE_RES_OK) {
		cf_crash(CF_HARDWARE, "error while reading list of online CPUs");
	}

	cf_detail(CF_HARDWARE, "learning CPU topology");

	cf_topo_numa_node_index os_numa_node_index_to_numa_node_index[CPU_SETSIZE];

	for (int32_t i = 0; i < CPU_SETSIZE; ++i) {
		CPU_ZERO(&g_numa_node_os_cpus_online[i]);

		g_core_index_to_os_cpu_index[i] = INVALID_INDEX;
		g_cpu_index_to_os_cpu_index[i] = INVALID_INDEX;
		g_os_cpu_index_to_cpu_index[i] = INVALID_INDEX;

		os_numa_node_index_to_numa_node_index[i] = INVALID_INDEX;
		g_numa_node_index_to_os_numa_node_index[i] = INVALID_INDEX;
	}

	cpu_set_t covered_numa_nodes;
	cpu_set_t covered_cores[CPU_SETSIZE]; // One mask per package.

	CPU_ZERO(&covered_numa_nodes);

	for (int32_t i = 0; i < CPU_SETSIZE; ++i) {
		CPU_ZERO(&covered_cores[i]);
	}

	g_n_numa_nodes = 0;
	g_n_cores = 0;
	g_n_os_cpus = 0;
	g_n_cpus = 0;
	char path[1000];
	bool no_numa = false;

	// Loop through all CPUs in the system by looping through OS CPU indexes.

	for (g_n_os_cpus = 0; g_n_os_cpus < CPU_SETSIZE; ++g_n_os_cpus) {
		cf_detail(CF_HARDWARE, "querying OS CPU index %hu", g_n_os_cpus);

		// Let's look at the CPU's package.

		snprintf(path, sizeof(path),
				"/sys/devices/system/cpu/cpu%hu/topology/physical_package_id",
				g_n_os_cpus);
		os_package_index i_os_package;
		file_res res = read_index(path, &i_os_package);

		// The entry doesn't exist. We've processed all available CPUs. Stop
		// looping through the CPUs.

		if (res == FILE_RES_NOT_FOUND) {
			break;
		}

		if (res != FILE_RES_OK) {
			cf_crash(CF_HARDWARE, "error while reading OS package index from %s", path);
			break;
		}

		cf_detail(CF_HARDWARE, "OS package index is %hu", i_os_package);

		// Only consider CPUs that are actually in use.

		if (!CPU_ISSET(g_n_os_cpus, &g_os_cpus_online)) {
			cf_detail(CF_HARDWARE, "OS CPU index %hu is offline", g_n_os_cpus);
			continue;
		}

		// Let's look at the CPU's underlying core. In Hyper Threading systems,
		// two (logical) CPUs share one (physical) core.

		snprintf(path, sizeof(path),
				"/sys/devices/system/cpu/cpu%hu/topology/core_id",
				g_n_os_cpus);
		os_core_index i_os_core;
		res = read_index(path, &i_os_core);

		if (res != FILE_RES_OK) {
			cf_crash(CF_HARDWARE, "error while reading OS core index from %s", path);
			break;
		}

		cf_detail(CF_HARDWARE, "OS core index is %hu", i_os_core);

		// Consider a core when we see it for the first time. In other words, we
		// consider the first Hyper Threading peer of each core to be that core.

		bool new_core;

		if (CPU_ISSET(i_os_core, &covered_cores[i_os_package])) {
			cf_detail(CF_HARDWARE, "core (%hu, %hu) already covered", i_os_core, i_os_package);
			new_core = false;
		}
		else {
			cf_detail(CF_HARDWARE, "core (%hu, %hu) is new", i_os_core, i_os_package);
			new_core = true;
			CPU_SET(i_os_core, &covered_cores[i_os_package]);
		}

		// Identify the NUMA node of the current CPU. We simply look for the
		// current CPU's topology info subtree in each NUMA node's subtree.
		// Specifically, we look for the current CPU's "core_id" entry.

		os_numa_node_index i_os_numa_node;

		for (i_os_numa_node = 0; i_os_numa_node < CPU_SETSIZE; ++i_os_numa_node) {
			snprintf(path, sizeof(path),
					"/sys/devices/system/cpu/cpu%hu/node%hu/cpu%hu/topology/core_id",
					g_n_os_cpus, i_os_numa_node, g_n_os_cpus);
			uint16_t dummy;
			res = read_index(path, &dummy);

			// We found the NUMA node that has the current CPU in its subtree.

			if (res == FILE_RES_OK) {
				break;
			}

			if (res != FILE_RES_NOT_FOUND) {
				cf_crash(CF_HARDWARE, "error while reading core number from %s", path);
			}
		}

		// Some Docker installations seem to not have any NUMA information
		// in /sys. In this case, assume a system with a single NUMA node.

		if (i_os_numa_node == CPU_SETSIZE) {
			cf_detail(CF_HARDWARE, "OS CPU index %hu does not have a NUMA node", g_n_os_cpus);
			no_numa = true;
			i_os_numa_node = 0;
		}

		cf_detail(CF_HARDWARE, "OS NUMA node index is %hu", i_os_numa_node);

		// Again, just like with cores, we consider a NUMA node when we encounter
		// it for the first time.

		bool new_numa_node;

		if (CPU_ISSET(i_os_numa_node, &covered_numa_nodes)) {
			cf_detail(CF_HARDWARE, "OS NUMA node index %hu already covered", i_os_numa_node);
			new_numa_node = false;
		}
		else {
			cf_detail(CF_HARDWARE, "OS NUMA node index %hu is new", i_os_numa_node);
			new_numa_node = true;
			CPU_SET(i_os_numa_node, &covered_numa_nodes);

			// For now, we only support a 64-bit bitmask (= one uint64_t).

			if (i_os_numa_node >= 64) {
				cf_crash(CF_HARDWARE, "OS NUMA node index %hu too high", i_os_numa_node);
			}
		}

		// Now we know that the CPU is online, and we know whether it is in a
		// newly seen core (new_core) and/or a newly seen NUMA node (new_numa_node).

		cf_topo_numa_node_index i_numa_node;

		if (new_numa_node) {
			i_numa_node = g_n_numa_nodes;
			++g_n_numa_nodes;
			os_numa_node_index_to_numa_node_index[i_os_numa_node] = i_numa_node;
			g_numa_node_index_to_os_numa_node_index[i_numa_node] = i_os_numa_node;
			cf_detail(CF_HARDWARE, "OS NUMA node index %hu -> new NUMA node index %hu",
					i_os_numa_node, i_numa_node);
		}
		else {
			i_numa_node = os_numa_node_index_to_numa_node_index[i_os_numa_node];
			cf_detail(CF_HARDWARE, "OS NUMA node index %hu -> existing NUMA node index %hu",
					i_os_numa_node, i_numa_node);
		}

		cf_detail(CF_HARDWARE, "OS CPU index %hu on NUMA node index %hu", g_n_os_cpus, i_numa_node);
		CPU_SET(g_n_os_cpus, &g_numa_node_os_cpus_online[i_numa_node]);

		// If we're in NUMA mode and the CPU isn't on the NUMA node that we're
		// running on, then ignore the CPU.

		if (a_numa_node != INVALID_INDEX && a_numa_node != i_numa_node) {
			cf_detail(CF_HARDWARE, "skipping unwanted NUMA node index %hu", i_numa_node);
			continue;
		}

		// If the CPU is a new core, then map a new core index to the OS CPU index.

		if (new_core) {
			g_core_index_to_os_cpu_index[g_n_cores] = g_n_os_cpus;
			cf_detail(CF_HARDWARE, "core index %hu -> OS CPU index %hu", g_n_cores, g_n_os_cpus);
			++g_n_cores;
		}

		// Map the OS CPU index to a new CPU index and vice versa.

		g_os_cpu_index_to_cpu_index[g_n_os_cpus] = g_n_cpus;
		g_cpu_index_to_os_cpu_index[g_n_cpus] = g_n_os_cpus;

		cf_detail(CF_HARDWARE, "OS CPU index %hu <-> CPU index %hu", g_n_os_cpus, g_n_cpus);
		++g_n_cpus;
	}

	if (g_n_os_cpus == CPU_SETSIZE) {
		cf_crash(CF_HARDWARE, "too many CPUs");
	}

	if (a_numa_node != INVALID_INDEX && no_numa) {
		cf_warning(CF_HARDWARE, "no NUMA information found in /sys");
	}

	g_i_numa_node = a_numa_node;
}

static void
pin_to_numa_node(cf_topo_numa_node_index a_numa_node)
{
	cf_info(CF_HARDWARE, "pinning to NUMA node %hu", a_numa_node);

	// Move the current thread (and all of its future descendants) to the CPUs
	// on the selected NUMA node.

	cpu_set_t cpu_set;
	CPU_ZERO(&cpu_set);

	for (cf_topo_cpu_index i_cpu = 0; i_cpu < g_n_cpus; ++i_cpu) {
		cf_topo_os_cpu_index i_os_cpu = g_cpu_index_to_os_cpu_index[i_cpu];
		CPU_SET(i_os_cpu, &cpu_set);
	}

	char buff[1000];
	mask_to_string(&cpu_set, buff, sizeof(buff));
	cf_detail(CF_HARDWARE, "NUMA node %hu CPU mask: %s", a_numa_node, buff);

	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
		cf_crash(CF_HARDWARE, "error while pinning thread to NUMA node %hu: %d (%s)",
				a_numa_node, errno, cf_strerror(errno));
	}

	// Force future memory allocations to the selected NUMA node.

	os_numa_node_index i_os_numa_node = g_numa_node_index_to_os_numa_node_index[a_numa_node];
	uint64_t to_mask = 1UL << i_os_numa_node;
	cf_detail(CF_HARDWARE, "NUMA node mask (to): %016" PRIx64, to_mask);

	// Unlike select(), we have to pass "number of valid bits + 1".
	set_mempolicy_safe(MPOL_BIND, &to_mask, 65);

	// Make sure we can migrate shared memory that we later attach and map.
	cf_process_add_startup_cap(CAP_SYS_NICE);
}

static uint32_t
pick_random(uint32_t limit)
{
	static __thread uint64_t state = 0;

	if (state == 0) {
		state = (uint64_t)syscall(SYS_gettid);
	}

	state = state * 6364136223846793005 + 1;

	if (state == 0) {
		state = 1;
	}

	return (uint32_t)((state >> 32) % limit);
}
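
// pick_random() above is a simple 64-bit linear congruential generator,
// seeded per thread from the thread ID; the multiplier 6364136223846793005 is
// the one popularized by Knuth's MMIX. Statistical quality hardly matters
// here - it only spreads connections across CPUs.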

uint16_t
cf_topo_count_cores(void)
{
	return g_n_cores;
}

uint16_t
cf_topo_count_cpus(void)
{
	return g_n_cpus;
}

static cf_topo_cpu_index
os_cpu_index_to_cpu_index(cf_topo_os_cpu_index i_os_cpu)
{
	cf_detail(CF_HARDWARE, "translating OS CPU index %hu", i_os_cpu);

	if (i_os_cpu >= g_n_os_cpus) {
		cf_crash(CF_HARDWARE, "invalid OS CPU index %hu", i_os_cpu);
	}

	cf_topo_cpu_index i_cpu = g_os_cpu_index_to_cpu_index[i_os_cpu];

	if (i_cpu == INVALID_INDEX) {
		cf_detail(CF_HARDWARE, "foreign OS CPU index %hu", i_os_cpu);
	}
	else {
		cf_detail(CF_HARDWARE, "CPU index is %hu", i_cpu);
	}

	return i_cpu;
}

cf_topo_cpu_index
cf_topo_current_cpu(void)
{
	cf_detail(CF_HARDWARE, "getting current OS CPU index");
	int32_t os = sched_getcpu();

	if (os < 0) {
		cf_crash(CF_HARDWARE, "error while getting OS CPU index: %d (%s)",
				errno, cf_strerror(errno));
	}

	return os_cpu_index_to_cpu_index((cf_topo_os_cpu_index)os);
}

cf_topo_cpu_index
cf_topo_socket_cpu(const cf_socket *sock)
{
	cf_detail(CF_HARDWARE, "determining CPU index for socket FD %d", CSFD(sock));

	int32_t os;
	socklen_t len = sizeof(os);

	if (getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_CPU, &os, &len) < 0) {
		cf_crash(CF_HARDWARE, "error while determining incoming OS CPU index: %d (%s)",
				errno, cf_strerror(errno));
	}

	cf_detail(CF_HARDWARE, "OS CPU index is %d", os);
	cf_topo_cpu_index i_cpu = os_cpu_index_to_cpu_index((cf_topo_os_cpu_index)os);

	// 1. The incoming connection was handled on the wrong NUMA node. In this case,
	// pick a random CPU on the correct NUMA node.

	if (i_cpu == INVALID_INDEX) {
		i_cpu = (cf_topo_cpu_index)pick_random(g_n_cpus);
		cf_detail(CF_HARDWARE, "picking random CPU index %hu", i_cpu);
		return i_cpu;
	}

	// 2. The incoming connection was handled on a CPU that doesn't get any NIC
	// interrupts. This should not happen for connections from other machines, but
	// it does happen for connections from the local machine, because they don't
	// go through the NIC hardware. In this case, pick a random CPU.

	if (i_cpu >= g_n_irq_cpus) {
		i_cpu = (cf_topo_cpu_index)pick_random(g_n_cpus);
		cf_detail(CF_HARDWARE, "randomizing unexpected CPU index >%hu to %hu",
				g_n_irq_cpus - 1, i_cpu);
		return i_cpu;
	}

	// 3. Otherwise, redistribute. The first g_n_irq_cpus CPUs out of a total of
	// g_n_cpus CPUs get NIC interrupts. Suppose we have 2 NIC queues and 8 CPUs,
	// i.e., that g_n_irq_cpus == 2 and g_n_cpus == 8. We want to redistribute
	// evenly across the 8 CPUs, i.e., each CPU should be picked with a probability
	// of 0.125.

	// We're currently running on one of the 2 CPUs that get NIC interrupts,
	// i.e., on either of them with a probability of p1 = 0.5. We want to stay
	// on the current CPU with a probability of p2 = g_n_irq_cpus / g_n_cpus ==
	// 2 / 8 == 0.25, which yields the desired total probability of
	// p1 * p2 = 0.5 * 0.25 = 0.125.

	if (pick_random(100000) < g_n_irq_cpus * (uint32_t)100000 / g_n_cpus) {
		cf_detail(CF_HARDWARE, "staying on CPU index %hu", i_cpu);
		return i_cpu;
	}

	// 4. Otherwise, if we switch CPUs, then we jump to a CPU that doesn't
	// receive NIC interrupts, i.e., one of the remaining 6 CPUs (indexes 2
	// through 7) in our example. This reaches each of them with a probability
	// of (1 - p2) / 6 = 0.125.

	i_cpu = (cf_topo_cpu_index)(g_n_irq_cpus +
			pick_random((uint32_t)g_n_cpus - (uint32_t)g_n_irq_cpus));
	cf_detail(CF_HARDWARE, "redirecting to CPU index %hu", i_cpu);
	return i_cpu;
}

cf_topo_napi_id
cf_topo_socket_napi_id(const cf_socket *sock)
{
	cf_topo_napi_id id;
	socklen_t len = sizeof(id);

	if (getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &id, &len) < 0) {
		cf_crash(CF_HARDWARE, "SO_INCOMING_NAPI_ID failed: %d (%s)", errno,
				cf_strerror(errno));
	}

	cf_detail(CF_HARDWARE, "incoming connection with NAPI-id %d", id);
	return id;
}

static void
pin_to_os_cpu(cf_topo_os_cpu_index i_os_cpu)
{
	cf_detail(CF_HARDWARE, "pinning to OS CPU index %hu", i_os_cpu);

	cpu_set_t cpu_set;
	CPU_ZERO(&cpu_set);
	CPU_SET(i_os_cpu, &cpu_set);

	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
		cf_crash(CF_HARDWARE, "error while pinning thread to OS CPU %hu: %d (%s)",
				i_os_cpu, errno, cf_strerror(errno));
	}
}

void
cf_topo_pin_to_core(cf_topo_core_index i_core)
{
	cf_detail(CF_HARDWARE, "pinning to core index %hu", i_core);

	if (i_core >= g_n_cores) {
		cf_crash(CF_HARDWARE, "invalid core index %hu", i_core);
	}

	pin_to_os_cpu(g_core_index_to_os_cpu_index[i_core]);
}

void
cf_topo_pin_to_cpu(cf_topo_cpu_index i_cpu)
{
	cf_detail(CF_HARDWARE, "pinning to CPU index %hu", i_cpu);

	if (i_cpu >= g_n_cpus) {
		cf_crash(CF_HARDWARE, "invalid CPU index %hu", i_cpu);
	}

	pin_to_os_cpu(g_cpu_index_to_os_cpu_index[i_cpu]);
}

static check_proc_res
check_proc(const char *name, int32_t argc, const char *argv[])
{
	cf_detail(CF_HARDWARE, "looking for process %s", name);

	for (int32_t i = 0; i < argc; ++i) {
		cf_detail(CF_HARDWARE, "argv[%d]: %s", i, argv[i]);
	}

	DIR *dir = opendir_safe("/proc");
	struct dirent ent;
	char cmd[10000];
	size_t limit;
	bool found = false;

	while (readdir_safe(dir, &ent) >= 0) {
		bool numeric = true;

		for (int32_t i = 0; ent.d_name[i] != 0; ++i) {
			if (!isascii(ent.d_name[i]) || !isdigit(ent.d_name[i])) {
				numeric = false;
				break;
			}
		}

		if (!numeric) {
			continue;
		}

		char path[500];
		snprintf(path, sizeof(path), "/proc/%s/cmdline", ent.d_name);

		limit = sizeof(cmd) - 1;
		file_res rfr = read_file(path, cmd, &limit);

		// Can legitimately happen if the process has exited in the meantime.
		if (rfr == FILE_RES_NOT_FOUND) {
			continue;
		}

		if (rfr == FILE_RES_ERROR) {
			cf_crash(CF_HARDWARE, "error while reading file %s", path);
		}

		if (limit > 0 && cmd[limit - 1] != 0) {
			cmd[limit] = 0;
		}

		const char *name2 = strrchr(cmd, '/');

		if (name2 != NULL) {
			++name2;
		}
		else {
			name2 = cmd;
		}

		if (strcmp(name2, name) == 0) {
			found = true;
			break;
		}
	}

	closedir_safe(dir);

	if (!found) {
		cf_detail(CF_HARDWARE, "process %s absent", name);
		return CHECK_PROC_ABSENT;
	}

	cf_detail(CF_HARDWARE, "process %s is %s", name, cmd);

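	// /proc/<pid>/cmdline separates the process's arguments with NUL
	// characters, so the loop below steps through the buffer string by string,
	// looking for argv[0 .. argc - 1] as a consecutive run of those arguments.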
	if (argc > 0) {
		int32_t i_arg = 0;

		for (size_t off = strlen(cmd) + 1; off < limit; off += strlen(cmd + off) + 1) {
			cf_detail(CF_HARDWARE, "checking argument %s against %s", cmd + off, argv[i_arg]);

			if (strcmp(cmd + off, argv[i_arg]) == 0) {
				++i_arg;

				if (i_arg >= argc) {
					break;
				}
			}
			else {
				i_arg = 0;
			}
		}

		if (i_arg >= argc) {
			cf_detail(CF_HARDWARE, "process %s present with argument", name);
			return CHECK_PROC_PRESENT;
		}
	}

	cf_detail(CF_HARDWARE, "process %s present", name);
	return CHECK_PROC_PRESENT_NO_ARG;
}

static uint16_t
interface_queues(const char *if_name, const char *format)
{
	uint16_t n_queues = 0;

	while (true) {
		char path[1000];
		snprintf(path, sizeof(path), format, if_name, n_queues);
		cf_detail(CF_HARDWARE, "checking for working path %s", path);

		if (!path_works(path)) {
			cf_detail(CF_HARDWARE, "path does not work");
			break;
		}

		++n_queues;
	}

	cf_assert(n_queues != 0, CF_HARDWARE, "interface %s has no queues", if_name);

	return n_queues;
}

static uint16_t
interface_rx_queues(const char *if_name)
{
	cf_detail(CF_HARDWARE, "getting receive queues for interface %s", if_name);
	return interface_queues(if_name, "/sys/class/net/%s/queues/rx-%hu/rps_cpus");
}

static uint16_t
interface_tx_queues(const char *if_name)
{
	cf_detail(CF_HARDWARE, "getting transmit queues for interface %s", if_name);
	return interface_queues(if_name, "/sys/class/net/%s/queues/tx-%hu/xps_cpus");
}

static int
comp_irq_number(const void *lhs, const void *rhs)
{
	return *(irq_number *)lhs - *(irq_number *)rhs;
}

static void
interface_irqs(const char *if_name, irq_list *irqs)
{
	cf_detail(CF_HARDWARE, "getting IRQs for interface %s", if_name);

	DIR *dir = opendir_safe("/sys/bus/pci/devices");
	struct dirent ent;
	char path[PATH_MAX];
	bool found = false;

	while (readdir_safe(dir, &ent) >= 0) {
		snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/net/%s/ifindex",
				ent.d_name, if_name);
		bool exists = path_exists(path);

		if (!exists) {
			for (int32_t i = 0; i < 100; ++i) {
				snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/virtio%d/net/%s/ifindex",
						ent.d_name, i, if_name);
				exists = path_exists(path);

				if (exists) {
					break;
				}
			}
		}

		if (!exists) {
			continue;
		}

		snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/msi_irqs", ent.d_name);

		if (!path_exists(path)) {
			cf_crash(CF_HARDWARE, "interface %s does not support MSIs", if_name);
		}

		cf_detail(CF_HARDWARE, "interface %s is %s", if_name, ent.d_name);
		found = true;
		break;
	}

	closedir_safe(dir);

	if (!found) {
		cf_crash(CF_HARDWARE, "interface %s does not have a PCI device entry", if_name);
	}

	dir = opendir_safe(path);
	int32_t count = 0;
	irq_number irq_nums[CPU_SETSIZE];

	while (readdir_safe(dir, &ent) >= 0) {
		char *end;
		uint64_t tmp = strtoul(ent.d_name, &end, 10);

		if (*end != 0 || tmp > 65535) {
			cf_crash(CF_HARDWARE, "invalid IRQ number %s in %s", ent.d_name, path);
		}

		if (count >= CPU_SETSIZE) {
			cf_crash(CF_HARDWARE, "too many IRQs in %s", path);
		}

		cf_detail(CF_HARDWARE, "interface %s has IRQ %hu", if_name, (irq_number)tmp);
		irq_nums[count] = (irq_number)tmp;
		++count;
	}

	closedir_safe(dir);

	// Sort IRQ numbers, so that RX and TX interrupts pair up nicely when
	// populating irqs->irqs[].
	qsort(irq_nums, (size_t)count, sizeof(irq_number), comp_irq_number);

	char actions[count][100];
	memset(actions, 0, sizeof(actions));

	FILE *fh = fopen("/proc/interrupts", "r");

	if (fh == NULL) {
		cf_crash(CF_HARDWARE, "error while opening /proc/interrupts");
	}

	int32_t line_no = 0;
	char line[25000];

	while (fgets(line, sizeof(line), fh) != NULL) {
		++line_no;

		if (line_no == 1) {
			continue;
		}

		int32_t i = 0;

		while (line[i] == ' ') {
			++i;
		}

		irq_number irq_num = 0;

		while (line[i] >= '0' && line[i] <= '9') {
			irq_num = (irq_number)(irq_num * 10 + line[i] - '0');
			++i;
		}

		if (line[i] != ':') {
			continue;
		}

		while (line[i] != 0 && line[i] != '\n') {
			++i;
		}

		line[i] = 0;

		while (i >= 0 && line[i] != ' ') {
			--i;
		}

		char *action = line + i + 1;

		if (strlen(action) >= sizeof(actions[0])) {
			cf_crash(CF_HARDWARE, "oversize action in line %d in /proc/interrupts: %s",
					line_no, action);
		}

		cf_detail(CF_HARDWARE, "IRQ %hu has action %s", irq_num, action);

		for (i = 0; i < count; ++i) {
			if (irq_nums[i] == irq_num) {
				int32_t m = 0;

				// Remove any digits, so that the queue index goes away and all queues
				// look alike. Also, normalize to lower case. For example:
				//
				// "i40e-em1-TxRx-0" -> "ie-em-txrx-"
				// "i40e-em1-TxRx-1" -> "ie-em-txrx-"
				// ...

				for (int32_t k = 0; action[k] != 0; ++k) {
					if (action[k] < '0' || action[k] > '9') {
						actions[i][m] = (char)tolower((uint8_t)action[k]);
						++m;
					}
				}

				actions[i][m] = 0;
				cf_detail(CF_HARDWARE, "action pattern is %s", actions[i]);
				break;
			}
		}
	}

	fclose(fh);

	int32_t n_groups = 0;
	int32_t group_sizes[count];
	int32_t group_extra[count];
	int32_t action_groups[count];
	int32_t inactive_group = -1;

	for (int32_t i = 0; i < count; ++i) {
		group_sizes[i] = 0;
		group_extra[i] = 0;
		action_groups[i] = -1;
	}

	// Group by action pattern.

	for (int32_t i = 0; i < count; ++i) {
		if (action_groups[i] >= 0) {
			continue;
		}

		action_groups[i] = n_groups;
		++group_sizes[n_groups];

		if (actions[i][0] == 0) {
			inactive_group = n_groups;
			cf_detail(CF_HARDWARE, "inactive IRQs in new group %d", n_groups);
		}
		else {
			cf_detail(CF_HARDWARE, "new group %d: %s", n_groups, actions[i]);
		}

		for (int32_t k = i + 1; k < count; ++k) {
			if (strcmp(actions[i], actions[k]) == 0) {
				action_groups[k] = n_groups;
				++group_sizes[n_groups];
			}
		}

		cf_detail(CF_HARDWARE, "group %d has %d member(s)", n_groups, group_sizes[n_groups]);

		// Prefer groups whose action patterns have "rx", "tx", "input", or "output" in them.

		if (strstr(actions[i], "rx") != NULL || strstr(actions[i], "tx") != NULL ||
				strstr(actions[i], "input") != NULL || strstr(actions[i], "output") != NULL) {
			cf_detail(CF_HARDWARE, "preferring group %d", n_groups);
			group_extra[n_groups] = 1;
		}

		++n_groups;
	}

	// Find the two largest groups.

	int32_t a = -1;
	int32_t b = -1;

	for (int32_t i = 0; i < n_groups; ++i) {
		if (i != inactive_group &&
				(a < 0 || group_sizes[i] + group_extra[i] > group_sizes[a] + group_extra[a])) {
			a = i;
		}
	}

	if (a < 0) {
		cf_crash(CF_HARDWARE, "no active interrupts for interface %s", if_name);
	}

	for (int32_t i = 0; i < n_groups; ++i) {
		if (i != inactive_group && i != a &&
				(b < 0 || group_sizes[i] + group_extra[i] > group_sizes[b] + group_extra[b])) {
			b = i;
		}
	}

	cf_detail(CF_HARDWARE, "largest groups: %d, %d", a, b);

	// If the two largest groups have an equal number of members, then we assume
	// that it's a NIC with separate RX and TX queue IRQs.

	if (b >= 0 && group_sizes[a] == group_sizes[b]) {
		cf_detail(CF_HARDWARE, "assuming %d separate RX and TX queue IRQ(s)",
				group_sizes[a] + group_sizes[b]);
		int32_t ia = 0;
		int32_t ib = 0;

		// Make RX and TX queue IRQs take turns in the IRQ list.

		for (int32_t k = 0; k < count; ++k) {
			if (action_groups[k] == a) {
				irqs->irqs[ia * 2] = irq_nums[k];
				cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ia * 2, irq_nums[k]);
				++ia;
			}
			else if (action_groups[k] == b) {
				irqs->irqs[ib * 2 + 1] = irq_nums[k];
				cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ib * 2 + 1, irq_nums[k]);
				++ib;
			}
		}

		irqs->n_irqs = (uint16_t)(group_sizes[a] + group_sizes[b]);

		// Send pairs of two consecutive IRQs in the IRQ list (= the RX and the
		// TX queue IRQ of a given NIC queue pair) to the same CPU.

		irqs->per_cpu = 2;
		return;
	}

	// Otherwise, we assume that it's a NIC with combined RX and TX queue IRQs
	// and that the largest group contains these IRQs.

	cf_detail(CF_HARDWARE, "assuming %d combined RX and TX queue IRQ(s)", group_sizes[a]);
	int32_t ia = 0;

	for (int32_t k = 0; k < count; ++k) {
		if (action_groups[k] == a) {
			irqs->irqs[ia] = irq_nums[k];
			cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ia, irq_nums[k]);
			++ia;
		}
	}

	irqs->n_irqs = (uint16_t)group_sizes[a];

	// Send each IRQ in the IRQ list to a different CPU.

	irqs->per_cpu = 1;
}
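
// For example, a NIC with 8 combined-queue IRQs (one action pattern, e.g.
// "ie-em-txrx-") ends up with n_irqs == 8 and per_cpu == 1, while a NIC with
// 8 separate RX and 8 separate TX queue IRQs (two equally sized action
// pattern groups) ends up with n_irqs == 16 and per_cpu == 2, the RX and TX
// IRQs of each queue pair alternating in irqs[].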

static void
pin_irq(irq_number i_irq, cf_topo_os_cpu_index i_os_cpu)
{
	cf_detail(CF_HARDWARE, "pinning IRQ number %hu to OS CPU index %hu", i_irq, i_os_cpu);

	cpu_set_t mask;
	CPU_ZERO(&mask);
	CPU_SET(i_os_cpu, &mask);

	char mask_str[200];
	mask_to_string(&mask, mask_str, sizeof(mask_str));
	cf_detail(CF_HARDWARE, "CPU mask is %s", mask_str);

	char path[1000];
	snprintf(path, sizeof(path), "/proc/irq/%hu/smp_affinity", i_irq);

	if (write_file(path, mask_str, strlen(mask_str)) != FILE_RES_OK) {
		cf_crash(CF_HARDWARE, "error while pinning IRQ, path %s", path);
	}
}

static cf_topo_os_cpu_index
fix_os_cpu_index(cf_topo_os_cpu_index i_os_cpu, const cpu_set_t *online)
{
	while (true) {
		if (i_os_cpu >= g_n_os_cpus) {
			i_os_cpu = 0;
		}

		if (CPU_ISSET(i_os_cpu, online)) {
			return i_os_cpu;
		}

		++i_os_cpu;
	}
}
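
// fix_os_cpu_index() advances to the next online CPU, wrapping around. For
// example, with OS CPUs 0, 1, and 3 online (CPU 2 offline), an index of 2
// maps to 3, and an index past the last CPU wraps around to 0.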

static void
config_steering(const char *format, const char *if_name, uint16_t n_queues, bool enable)
{
	uint16_t i_queue;
	cpu_set_t masks[n_queues];

	for (i_queue = 0; i_queue < n_queues; ++i_queue) {
		CPU_ZERO(&masks[i_queue]);
	}

	if (enable) {
		i_queue = 0;

		for (cf_topo_os_cpu_index i_os_cpu = 0; i_os_cpu < g_n_os_cpus; ++i_os_cpu) {
			if (CPU_ISSET(i_os_cpu, &g_os_cpus_online)) {
				CPU_SET(i_os_cpu, &masks[i_queue % n_queues]);
				++i_queue;
			}
		}
	}

	for (i_queue = 0; i_queue < n_queues; ++i_queue) {
		char path[1000];
		snprintf(path, sizeof(path), format, if_name, i_queue);
		cf_detail(CF_HARDWARE, "path is %s", path);

		char mask_str[200];
		mask_to_string(&masks[i_queue], mask_str, sizeof(mask_str));
		cf_detail(CF_HARDWARE, "CPU mask is %s", mask_str);

		write_file_safe(path, mask_str, strlen(mask_str));
	}
}
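
// For example, enabling steering with 4 queues on 8 online CPUs assigns CPUs
// {0, 4} to queue 0, {1, 5} to queue 1, {2, 6} to queue 2, and {3, 7} to
// queue 3; disabling writes an all-zero mask to every queue.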

static void
enable_xps(const char *if_name)
{
	cf_detail(CF_HARDWARE, "enabling XPS for interface %s", if_name);
	uint16_t n_queues = interface_tx_queues(if_name);
	config_steering("/sys/class/net/%s/queues/tx-%hu/xps_cpus", if_name, n_queues, true);
}

static void
disable_rps(const char *if_name)
{
	cf_detail(CF_HARDWARE, "disabling RPS for interface %s", if_name);
	uint16_t n_queues = interface_rx_queues(if_name);
	config_steering("/sys/class/net/%s/queues/rx-%hu/rps_cpus", if_name, n_queues, false);
}

static void
config_rfs(const char *if_name, bool enable)
{
	cf_detail(CF_HARDWARE, "%s RFS for interface %s", enable ? "enabling" : "disabling", if_name);

	uint16_t n_queues = interface_rx_queues(if_name);
	uint32_t sz_glob = enable ? 1000000 : 0;
	uint32_t sz_queue = sz_glob / n_queues;

	cf_detail(CF_HARDWARE, "global size is %u, per-queue size is %u", sz_glob, sz_queue);

	char string[50];
	snprintf(string, sizeof(string), "%u", sz_glob);
	write_file_safe("/proc/sys/net/core/rps_sock_flow_entries", string, strlen(string));

	snprintf(string, sizeof(string), "%u", sz_queue);

	for (uint16_t i_queue = 0; i_queue < n_queues; ++i_queue) {
		char path[1000];
		snprintf(path, sizeof(path), "/sys/class/net/%s/queues/rx-%hu/rps_flow_cnt",
				if_name, i_queue);
		write_file_safe(path, string, strlen(string));
	}
}

static void
enable_coalescing(const char *if_name)
{
	cf_detail(CF_HARDWARE, "enabling interrupt coalescing for interface %s", if_name);
	int32_t sock = socket(AF_INET, SOCK_DGRAM, 0);

	if (sock < 0) {
		cf_crash(CF_HARDWARE, "error while creating ethtool socket: %d (%s)", errno, cf_strerror(errno));
	}

	struct ifreq req;
	memset(&req, 0, sizeof(req));

	if (strlen(if_name) > IFNAMSIZ - 1) {
		cf_crash(CF_HARDWARE, "invalid interface name %s", if_name);
	}

	strcpy(req.ifr_name, if_name);
	struct ethtool_coalesce coal = { .cmd = ETHTOOL_GCOALESCE };
	req.ifr_data = &coal;

	if (ioctl(sock, SIOCETHTOOL, &req) < 0) {
		if (errno == EOPNOTSUPP) {
			cf_detail(CF_HARDWARE, "interface %s does not support ETHTOOL_GCOALESCE", if_name);
			goto cleanup1;
		}

		cf_crash(CF_HARDWARE, "error while getting interface settings: %d (%s)",
				errno, cf_strerror(errno));
	}

	cf_detail(CF_HARDWARE, "current interface settings: adaptive = %u, usecs = %u",
			coal.use_adaptive_rx_coalesce, coal.rx_coalesce_usecs);

	if (coal.use_adaptive_rx_coalesce != 0 || coal.rx_coalesce_usecs >= 100) {
		cf_detail(CF_HARDWARE, "leaving interface settings untouched");
		goto cleanup1;
	}

	cf_detail(CF_HARDWARE, "adjusting interface settings");
	coal = (struct ethtool_coalesce){
		.cmd = ETHTOOL_SCOALESCE,
		.rx_coalesce_usecs = 100 // .1 ms for now, which adds .05 ms to a request on average.
	};

	if (ioctl(sock, SIOCETHTOOL, &req) < 0) {
		if (errno == EOPNOTSUPP) {
			cf_detail(CF_HARDWARE, "interface %s does not support ETHTOOL_SCOALESCE", if_name);
			goto cleanup1;
		}

		cf_crash(CF_HARDWARE, "error while adjusting interface settings: %d (%s)",
				errno, cf_strerror(errno));
	}

cleanup1:
	CF_NEVER_FAILS(close(sock));
}

static void
check_irqbalance(void)
{
	cf_detail(CF_HARDWARE, "checking irqbalance");

	check_proc_res res = check_proc("irqbalance", 1, (const char *[]){
		"--policyscript=" POLICY_SCRIPT
	});

	if (res == CHECK_PROC_PRESENT_NO_ARG) {
		res = check_proc("irqbalance", 2, (const char *[]){
			"--policyscript",
			POLICY_SCRIPT
		});
	}

	if (res == CHECK_PROC_PRESENT_NO_ARG) {
		res = check_proc("irqbalance", 1, (const char *[]){
			"-l" POLICY_SCRIPT
		});
	}

	if (res == CHECK_PROC_PRESENT_NO_ARG) {
		res = check_proc("irqbalance", 2, (const char *[]){
			"-l",
			POLICY_SCRIPT
		});
	}

	if (res == CHECK_PROC_PRESENT_NO_ARG) {
		cf_crash_nostack(CF_HARDWARE, "please disable irqbalance or run it with the Aerospike policy script, /etc/aerospike/irqbalance-ban.sh");
	}
}
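
// For example, check_irqbalance() accepts either of these invocations, with
// or without a space before the script path:
//
//   irqbalance --policyscript=/etc/aerospike/irqbalance-ban.sh
//   irqbalance -l /etc/aerospike/irqbalance-ban.sh
//
// A running irqbalance without the policy script aborts startup.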

static void
config_interface(const char *if_name, bool rfs, irq_list *irqs)
{
	uint16_t n_irq_cpus = 0;
	cf_topo_os_cpu_index i_os_cpu = fix_os_cpu_index(0, &g_os_cpus_online);

	for (uint16_t i = 0; i < irqs->n_irqs; ++i) {
		pin_irq(irqs->irqs[i], i_os_cpu);

		if (i % irqs->per_cpu == irqs->per_cpu - 1) {
			++n_irq_cpus;
			i_os_cpu = fix_os_cpu_index((cf_topo_os_cpu_index)(i_os_cpu + 1), &g_os_cpus_online);
		}
	}

	cf_detail(CF_HARDWARE, "interface %s with %hu RX interrupt(s)", if_name, n_irq_cpus);

	if (g_n_irq_cpus == 0) {
		g_n_irq_cpus = n_irq_cpus;
	}
	else if (n_irq_cpus != g_n_irq_cpus) {
		cf_crash(CF_HARDWARE, "interface %s with inconsistent number of RX interrupts: %hu vs. %hu",
				if_name, n_irq_cpus, g_n_irq_cpus);
	}

	disable_rps(if_name);
	config_rfs(if_name, rfs);
	enable_xps(if_name);

	// Redistributing packets with RFS causes inter-CPU interrupts, which
	// increases the interrupt load on the machine. For low-end systems, make
	// sure that interrupt coalescing is enabled.
	//
	// We consider a machine low-end if we handle interrupts on 25% or less of
	// the available CPUs (i.e., if the number of NIC queues is 25% or less of
	// the number of available CPUs) and it has fewer than 4 NIC queues.
	//
	// Better (i.e., NUMA) machines typically come with adaptive interrupt
	// coalescing enabled by default. That's why we only do this here and not
	// in the NUMA case.

	if (rfs && n_irq_cpus <= g_n_cpus / 4 && n_irq_cpus < 4) {
		enable_coalescing(if_name);
	}
}

static void
config_interface_numa(const char *if_name, irq_list *irqs)
{
	uint16_t n_irq_cpus = 0;
	cf_topo_os_cpu_index i_os_cpu[g_n_numa_nodes];
	uint16_t i_numa_node;

	for (i_numa_node = 0; i_numa_node < g_n_numa_nodes; ++i_numa_node) {
		i_os_cpu[i_numa_node] = fix_os_cpu_index(0, &g_numa_node_os_cpus_online[i_numa_node]);
	}

	i_numa_node = 0;

	// This configures the IRQs for all NUMA nodes. If multiple asd processes are
	// running, each process does this, but each does it identically. Hence there
	// isn't any conflict.

	for (uint16_t i = 0; i < irqs->n_irqs; ++i) {
		char mask_str[200];
		mask_to_string(&g_numa_node_os_cpus_online[i_numa_node], mask_str, sizeof(mask_str));
		cf_detail(CF_HARDWARE, "NUMA node index %hu CPU mask is %s", i_numa_node, mask_str);

		pin_irq(irqs->irqs[i], i_os_cpu[i_numa_node]);

		if (i % irqs->per_cpu == irqs->per_cpu - 1) {
			// Only count CPUs on our NUMA node.

			if (i_numa_node == g_i_numa_node) {
				++n_irq_cpus;
			}

			i_os_cpu[i_numa_node] =
					fix_os_cpu_index((cf_topo_os_cpu_index)(i_os_cpu[i_numa_node] + 1),
							&g_numa_node_os_cpus_online[i_numa_node]);
			i_numa_node = (uint16_t)((i_numa_node + 1) % g_n_numa_nodes);
		}
	}

	cf_detail(CF_HARDWARE, "interface %s with %hu RX interrupt(s) on NUMA node %hu",
			if_name, n_irq_cpus, g_i_numa_node);

	if (g_n_irq_cpus == 0) {
		g_n_irq_cpus = n_irq_cpus;
	}
	else if (n_irq_cpus != g_n_irq_cpus) {
		cf_crash(CF_HARDWARE, "interface %s with inconsistent number of RX interrupts: %hu vs. %hu",
				if_name, n_irq_cpus, g_n_irq_cpus);
	}

	disable_rps(if_name);
	config_rfs(if_name, true);
	enable_xps(if_name);
}

static void
optimize_interface(const char *if_name)
{
	cf_detail(CF_HARDWARE, "optimizing interface %s", if_name);
	uint16_t n_queues = interface_rx_queues(if_name);
	irq_list irqs;
	interface_irqs(if_name, &irqs);

	cf_info(CF_HARDWARE, "detected %hu NIC receive queue(s), %hu interrupt(s) for %s",
			n_queues, irqs.n_irqs, if_name);

	// We either expect one interrupt per RX queue (shared with TX) or two
	// interrupts per RX queue (one RX, one TX).

	uint16_t n_irq_cpus = irqs.n_irqs / irqs.per_cpu;

	if (n_irq_cpus != n_queues) {
		cf_crash(CF_HARDWARE, "suspicious NIC interrupt count %hu with %hu NIC receive queue(s)",
				irqs.n_irqs, n_queues);
	}

	if (n_irq_cpus == g_n_cpus) {
		if (g_i_numa_node != INVALID_INDEX) {
			cf_detail(CF_HARDWARE, "setting up for a fancy interface with NUMA");
			config_interface_numa(if_name, &irqs);
		}
		else {
			cf_detail(CF_HARDWARE, "setting up for a fancy interface, no NUMA");
			config_interface(if_name, false, &irqs);
		}
	}
	else {
		if (n_irq_cpus <= g_n_cpus / 4) {
			cf_warning(CF_HARDWARE, "%s has very few NIC queues; only %hu out of %hu CPUs handle(s) NIC interrupts",
					if_name, n_irq_cpus, g_n_cpus);
		}

		if (g_i_numa_node != INVALID_INDEX) {
			cf_detail(CF_HARDWARE, "setting up for a lame interface with NUMA");
			config_interface_numa(if_name, &irqs);
		}
		else {
			cf_detail(CF_HARDWARE, "setting up for a lame interface, no NUMA");
			config_interface(if_name, true, &irqs);
		}
	}
}

// Make sure that we are running on an appropriate kernel.
1847static void
1848check_socket_option(int optname, const char *tag)
1849{
1850 int32_t fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
1851
1852 if (fd < 0) {
1853 cf_crash(CF_HARDWARE, "error while creating UDP test socket: %d (%s)",
1854 errno, cf_strerror(errno));
1855 }
1856
1857 int32_t val;
1858 socklen_t val_len = sizeof(val);
1859
1860 if (getsockopt(fd, SOL_SOCKET, optname, &val, &val_len) < 0) {
1861 if (errno == ENOPROTOOPT) {
1862 cf_crash_nostack(CF_HARDWARE, "auto-pin requires %s or later", tag);
1863 }
1864
1865 cf_crash(CF_HARDWARE, "error while testing for socket option: %d (%s)",
1866 errno, cf_strerror(errno));
1867 }
1868
1869 CF_NEVER_FAILS(close(fd));
1870}
1871
1872// Reconfigure NIC queues and interrupts.
1873static void
1874optimize_interfaces(const cf_addr_list *addrs)
1875{
1876 if (addrs->n_addrs == 0) {
1877 cf_crash_nostack(CF_HARDWARE, "auto-pinning requires binding the service to one or more network interfaces");
1878 }
1879
1880 for (uint32_t i = 0; i < addrs->n_addrs; ++i) {
1881 const char *if_name = addrs->addrs[i];
1882
1883 if (!cf_inter_is_inter_name(if_name)) {
1884 cf_crash_nostack(CF_HARDWARE, "auto-pinning requires binding the service to network interfaces; \"%s\" isn't a network interface",
1885 if_name);
1886 }
1887
1888 char phys_name[50];
1889 CF_NEVER_FAILS(cf_inter_get_physical(if_name, phys_name, sizeof(phys_name)));
1890
1891 char *exp_names[100];
1892 uint32_t n_exp = sizeof(exp_names) / sizeof(exp_names[0]);
1893 cf_inter_expand_bond(phys_name, exp_names, &n_exp);
1894
1895 for (uint32_t k = 0; k < n_exp; ++k) {
1896 optimize_interface(exp_names[k]);
1897 cf_free(exp_names[k]);
1898 }
1899 }
1900}
1901
1902void
1903cf_topo_config(cf_topo_auto_pin auto_pin, cf_topo_numa_node_index a_numa_node,
1904 const cf_addr_list *addrs)
1905{
1906 // Detect the NUMA topology.
1907
1908 switch (auto_pin) {
1909 case CF_TOPO_AUTO_PIN_NONE:
1910 case CF_TOPO_AUTO_PIN_CPU:
1911 detect(INVALID_INDEX);
1912 break;
1913
1914 case CF_TOPO_AUTO_PIN_NUMA:
1915 case CF_TOPO_AUTO_PIN_ADQ:
1916 detect(a_numa_node);
1917
1918 // Clamp the given NUMA node index to the valid range. We can only do this
1919 // after we know what g_n_numa_nodes is, which is initialized by the above
1920 // call to detect().
1921
1922 if (a_numa_node >= g_n_numa_nodes) {
1923 cf_topo_numa_node_index orig = a_numa_node;
1924 a_numa_node = (cf_topo_numa_node_index)(a_numa_node % g_n_numa_nodes);
1925 cf_detail(CF_HARDWARE, "invalid NUMA node index: %hu, clamping to %hu", orig, a_numa_node);
1926 detect(a_numa_node);
1927 }
1928
1929 break;
1930
1931 default:
1932 cf_crash(CF_HARDWARE, "bad auto-pin value %d", auto_pin);
1933 break;
1934 }
1935
1936 // If we don't do any pinning, then we're done after NUMA topology detection.
1937 if (auto_pin == CF_TOPO_AUTO_PIN_NONE) {
1938 return;
1939 }
1940
1941 check_irqbalance(); // ensure irqbalance is disabled
1942
1943 switch (auto_pin) {
1944 case CF_TOPO_AUTO_PIN_CPU:
1945 check_socket_option(SO_INCOMING_CPU, "Linux kernel 3.19");
1946 optimize_interfaces(addrs);
1947 break;
1948 case CF_TOPO_AUTO_PIN_NUMA:
1949 check_socket_option(SO_INCOMING_CPU, "Linux kernel 3.19");
1950 optimize_interfaces(addrs);
1951 pin_to_numa_node(a_numa_node);
1952 break;
1953 case CF_TOPO_AUTO_PIN_ADQ:
1954 check_socket_option(SO_INCOMING_NAPI_ID, "Linux kernel 4.12");
1955 pin_to_numa_node(a_numa_node);
1956 break;
1957 default:
1958 cf_crash(CF_HARDWARE, "bad auto-pin value %d", auto_pin);
1959 break;
1960 }
1961}
1962
1963void
1964cf_topo_force_map_memory(const uint8_t *from, size_t size)
1965{
1966 if (g_i_numa_node == INVALID_INDEX || size == 0) {
1967 return;
1968 }
1969
1970 cf_assert(from, CF_HARDWARE, "invalid cf_topo_force_map_memory() call");
1971
1972 // Read one byte per memory page to force otherwise lazy mapping.
1973
1974 const uint8_t *start = (const uint8_t *)
1975 (((int64_t)from + (MEM_PAGE_SIZE - 1)) & -MEM_PAGE_SIZE);
1976 const uint8_t *end = from + size;
1977 const volatile uint8_t *p_byte;
1978
1979 // In case 'from' was not page-aligned, take care of the partial page.
1980 if (start > from) {
1981 p_byte = from;
1982 p_byte[0];
1983 }
1984
1985 for (p_byte = start; p_byte < end; p_byte += MEM_PAGE_SIZE) {
1986 p_byte[0];
1987 }
1988}

void
cf_topo_migrate_memory(void)
{
	if (g_i_numa_node == INVALID_INDEX) {
		return;
	}

	// Migrate existing memory allocations to the selected NUMA node.

	os_numa_node_index i_os_numa_node = g_numa_node_index_to_os_numa_node_index[g_i_numa_node];
	uint64_t to_mask = 1UL << i_os_numa_node;
	cf_detail(CF_HARDWARE, "NUMA node mask (to): %016" PRIx64, to_mask);

	uint64_t from_mask = 0;

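	// Build the "from" set as the union of all detected NUMA nodes, then
	// drop the target node; what remains is the set to migrate away from.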
	for (cf_topo_numa_node_index i_numa_node = 0; i_numa_node < g_n_numa_nodes; ++i_numa_node) {
		i_os_numa_node = g_numa_node_index_to_os_numa_node_index[i_numa_node];
		from_mask |= 1UL << i_os_numa_node; // 1UL, not 1u - OS node indexes may exceed 31
	}

	from_mask &= ~to_mask;
	cf_detail(CF_HARDWARE, "NUMA node mask (from): %016" PRIx64, from_mask);

	if (from_mask != 0) {
		cf_info(CF_HARDWARE, "migrating shared memory to local NUMA node - this may take a bit");
		// Unlike select(), we have to pass "number of valid bits + 1".
		migrate_pages_safe(0, 65, &from_mask, &to_mask);
	}
}

void
cf_topo_info(void)
{
	if (g_i_numa_node == INVALID_INDEX) {
		cf_info(CF_HARDWARE, "detected %hu CPU(s), %hu core(s), %hu NUMA node(s)",
				g_n_cpus, g_n_cores, g_n_numa_nodes);
	}
	else {
		cf_info(CF_HARDWARE, "detected %hu CPU(s), %hu core(s) on NUMA node %hu of %hu",
				g_n_cpus, g_n_cores, g_i_numa_node, g_n_numa_nodes);
	}
}

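// Hash a device's major:minor pair. The +1 offsets prevent a zero major or
// minor number from zeroing out the product.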
static uint32_t
dev_key_hash(const void *k)
{
	const dev_key_t *key = k;
	return (1 + key->major) * (1 + key->minor);
}

static void
add_child(const dev_key_t *key, dev_node_t *node, const dev_key_t *child_key,
		dev_node_t *child_node)
{
	cf_detail(CF_HARDWARE, "parent %u:%u -> child %u:%u",
			key->major, key->minor, child_key->major, child_key->minor);

	node->children[node->n_children] = child_node;
	++node->n_children;
}

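// Scan sys_dir for entries whose names start with 'prefix' and that carry
// device numbers, and link them to 'node'. With flip == false, each entry
// found becomes a parent of 'node' (e.g., a partition is the parent of its
// whole-disk device); with flip == true, the edge is reversed (e.g., a
// device's slaves become its children).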
static void
collect_edges(const char *sys_dir, const char *prefix, bool flip,
		const dev_key_t *key, dev_node_t *node)
{
	cf_detail(CF_HARDWARE, "collecting devices in %s", sys_dir);

	if (!path_exists(sys_dir)) {
		return;
	}

	size_t prefix_len = strlen(prefix);

	DIR *dir = opendir_safe(sys_dir);
	struct dirent ent;

	while (readdir_safe(dir, &ent) >= 0) {
		cf_detail(CF_HARDWARE, "considering %s", ent.d_name);

		if (prefix_len > 0 && strncmp(ent.d_name, prefix, prefix_len) != 0) {
			cf_detail(CF_HARDWARE, "prefix mismatch");
			continue;
		}

		char sys_path[DEVICE_PATH_SIZE];
		snprintf(sys_path, DEVICE_PATH_SIZE, "%s/%s", sys_dir, ent.d_name);

		if (!path_is_dir(sys_path)) {
			cf_detail(CF_HARDWARE, "not a directory");
			continue;
		}

		snprintf(sys_path, DEVICE_PATH_SIZE, "%s/%s/dev", sys_dir, ent.d_name);

		dev_key_t sub_key;

		if (read_device_numbers(sys_path, &sub_key.major, &sub_key.minor) !=
				FILE_RES_OK) {
			cf_detail(CF_HARDWARE, "no device numbers");
			continue;
		}

		dev_node_t *sub_node;

		if (cf_shash_get(g_dev_graph, &sub_key, &sub_node) != CF_SHASH_OK) {
			cf_warning(CF_HARDWARE, "no node for sub device %s/%s (%u:%u)",
					sys_dir, ent.d_name, sub_key.major, sub_key.minor);
			continue;
		}

		if (!flip) {
			add_child(&sub_key, sub_node, key, node);
		}
		else {
			add_child(key, node, &sub_key, sub_node);
		}
	}

	closedir_safe(dir);
}

static int32_t
create_device_edges(const void *k, void *v, void *udata)
{
	(void)udata;

	const dev_key_t *key = k;
	dev_node_t **node = v;

	cf_detail(CF_HARDWARE, "creating edges for %s", (*node)->sys_home);

	// Collect partitions on a device.
	collect_edges((*node)->sys_home, (*node)->name, false, key, *node);

	char sys_slaves[DEVICE_PATH_SIZE + 7]; // +7 for "/slaves", so snprintf() can't truncate
	snprintf(sys_slaves, DEVICE_PATH_SIZE + 7, "%s/slaves", (*node)->sys_home);

	// Collect inter-device dependencies.
	collect_edges(sys_slaves, "", true, key, *node);

	return CF_SHASH_OK;
}

static void
build_device_graph(void)
{
	// Step 1. Create a device map entry for each device. Don't yet link them
	// into a device dependency graph.

	static const char *sys_dirs[] = {
		"/sys/class/nvme",
		"/sys/class/block",
		NULL
	};

	g_dev_graph = cf_shash_create(dev_key_hash, sizeof(dev_key_t),
			sizeof(dev_node_t *), 256, 0);

	for (int32_t i_dir = 0; sys_dirs[i_dir] != NULL; ++i_dir) {
		const char *sys_dir = sys_dirs[i_dir];

		cf_detail(CF_HARDWARE, "collecting devices in %s", sys_dir);

		if (!path_exists(sys_dir)) {
			cf_detail(CF_HARDWARE, "directory does not exist");
			continue;
		}

		DIR *dir = opendir_safe(sys_dir);
		struct dirent ent;

		while (readdir_safe(dir, &ent) >= 0) {
			cf_detail(CF_HARDWARE, "considering %s", ent.d_name);

			char sys_path[DEVICE_PATH_SIZE];
			snprintf(sys_path, DEVICE_PATH_SIZE, "%s/%s/dev", sys_dir,
					ent.d_name);

			dev_key_t key;

			if (read_device_numbers(sys_path, &key.major, &key.minor) !=
					FILE_RES_OK) {
				cf_detail(CF_HARDWARE, "no device numbers");
				continue;
			}

			dev_node_t *node = cf_malloc(sizeof(dev_node_t));
			memset(node, 0, sizeof(dev_node_t));

			snprintf(node->name, DEVICE_NAME_SIZE, "%s", ent.d_name);
			snprintf(node->dev_path, DEVICE_PATH_SIZE, "/dev/%s", ent.d_name);

			snprintf(node->sys_home, DEVICE_PATH_SIZE, "%s/%s", sys_dir,
					ent.d_name);

			snprintf(sys_path, DEVICE_PATH_SIZE, "%s/%s/queue/scheduler",
					sys_dir, ent.d_name);

			if (path_exists(sys_path)) {
				strcpy(node->sys_sched, sys_path);
			}

			cf_detail(CF_HARDWARE, "new device %s (%u:%u), home %s, "
					"scheduler %s", node->dev_path, key.major, key.minor,
					node->sys_home, node->sys_sched[0] != 0 ?
							node->sys_sched : "-");

			if (cf_shash_put_unique(g_dev_graph, &key, &node) != CF_SHASH_OK) {
				cf_warning(CF_HARDWARE, "duplicate device %s (%u:%u)",
						node->dev_path, key.major, key.minor);
			}
		}

		closedir_safe(dir);
	}

	// Step 2. Link the devices in the device map to create the device
	// dependency graph. Here's an example graph path for logical volume
	// lv_foo on encrypted partition sda3:
	//
	//   lv_foo 253:1 -> sda3_crypt 253:0 -> sda3 8:3 -> sda 8:0
	//
	// In short: Going from parents to children takes you closer to physical
	// devices.
	//
	// Devices can have multiple parents, e.g., sda could have sda1, sda2,
	// and sda3.
	//
	// Devices can also have multiple children, e.g., lv_bar could have
	// children sda1 and sdb1.

	cf_detail(CF_HARDWARE, "creating device edges");
	cf_shash_reduce(g_dev_graph, create_device_edges, NULL);
}

static char *
get_mounted_device(const char *fs_path)
{
	cf_detail(CF_HARDWARE, "mapping mount point %s", fs_path);

	char *fs_real = realpath(fs_path, NULL);

	if (fs_real == NULL) {
		cf_warning(CF_HARDWARE, "failed to resolve path %s: %d (%s)",
				fs_path, errno, cf_strerror(errno));
		return NULL;
	}

	cf_detail(CF_HARDWARE, "resolved path %s", fs_real);

	FILE *fh = setmntent("/proc/mounts", "r");

	struct mntent mnt;
	char buff[1000];

	size_t best_len = 0;
	char best_path[DEVICE_PATH_SIZE];

	while (getmntent_r(fh, &mnt, buff, sizeof(buff)) != NULL) {
		cf_detail(CF_HARDWARE, "mount point %s", mnt.mnt_dir);

		char *mount_real = realpath(mnt.mnt_dir, NULL);

		if (mount_real == NULL) {
			// Don't warn; current user may simply not be allowed access to
			// all mount points.
			cf_detail(CF_HARDWARE,
					"failed to resolve mount point %s: %d (%s)",
					mnt.mnt_dir, errno, cf_strerror(errno));
			continue;
		}

		cf_detail(CF_HARDWARE, "resolved mount point %s", mount_real);

		size_t len = strlen(mount_real);

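		// Longest-prefix match: among all mount points whose resolved path
		// is a prefix of the resolved file path, the longest one is the
		// file system that actually holds the file.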
		if (len > best_len && strncmp(fs_real, mount_real, len) == 0) {
			strcpy(best_path, mnt.mnt_fsname);
			best_len = len;
			cf_detail(CF_HARDWARE, "new best %s with length %zu",
					best_path, best_len);
		}

		free(mount_real);
	}

	endmntent(fh);
	free(fs_real);

	if (best_len == 0) {
		cf_warning(CF_HARDWARE, "no mount point found for %s", fs_path);
		return NULL;
	}

	if (strncmp(best_path, "/dev", 4) != 0) {
		// Don't warn; could be tmpfs, etc.
		cf_detail(CF_HARDWARE, "invalid device %s found for %s", best_path,
				fs_path);
		return NULL;
	}

	char *best_real = realpath(best_path, NULL);

	if (best_real == NULL) {
		cf_warning(CF_HARDWARE,
				"failed to resolve mounted device %s: %d (%s)", best_path,
				errno, cf_strerror(errno));
		return NULL;
	}

	// Return a result allocated with the cf_*() allocation functions.

	char *res = cf_strdup(best_real);
	free(best_real);

	cf_detail(CF_HARDWARE, "mount point is %s", res);
	return res;
}

static bool
get_dev_key(const char *dev_path, dev_key_t *key)
{
	cf_detail(CF_HARDWARE, "getting device key for %s", dev_path);

	struct stat st;

	if (stat(dev_path, &st) < 0) {
		cf_warning(CF_HARDWARE, "failed to query meta data for %s: %d (%s)",
				dev_path, errno, cf_strerror(errno));
		return false;
	}

	if (!S_ISBLK(st.st_mode) && !S_ISCHR(st.st_mode)) {
		cf_warning(CF_HARDWARE, "%s is not a device", dev_path);
		return false;
	}

	key->major = major(st.st_rdev);
	key->minor = minor(st.st_rdev);

	cf_detail(CF_HARDWARE, "device key %u:%u", key->major, key->minor);
	return true;
}

static cf_topo_numa_node_index
get_numa_node(const char *sys_path)
{
	cf_detail(CF_HARDWARE, "finding NUMA node for %s", sys_path);

	char *sys_real = realpath(sys_path, NULL);

	if (sys_real == NULL) {
		cf_warning(CF_HARDWARE, "failed to resolve path %s: %d (%s)",
				sys_path, errno, cf_strerror(errno));
		return INVALID_INDEX;
	}

	cf_topo_numa_node_index res = INVALID_INDEX;

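	// Walk up the sysfs hierarchy until a numa_node attribute turns up,
	// truncating one path component per iteration, for at most 25 levels.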
	for (int32_t i = 0; i < 25; ++i) {
		cf_detail(CF_HARDWARE, "considering %s", sys_real);

		char sys_numa[DEVICE_PATH_SIZE];
		snprintf(sys_numa, DEVICE_PATH_SIZE, "%s/numa_node", sys_real);

		cf_topo_numa_node_index tmp;

		if (read_numa_node(sys_numa, &tmp) == FILE_RES_OK) {
			cf_detail(CF_HARDWARE, "NUMA node found");
			res = tmp;
			break;
		}

		int32_t i_slash = -1;

		for (int32_t k = 0; sys_real[k] != 0; ++k) {
			if (sys_real[k] == '/') {
				i_slash = k;
			}
		}

		if (i_slash < 1) {
			break;
		}

		sys_real[i_slash] = 0;
	}

	free(sys_real);
	return res;
}

static int32_t
get_nvme_age(const char *dev_path)
{
	static const uint32_t SZ_BUFF = 512;

	cf_detail(CF_HARDWARE, "getting age for %s", dev_path);

	if (!cf_process_has_cap(CAP_SYS_ADMIN)) {
		cf_detail(CF_HARDWARE, "insufficient privileges to query %s",
				dev_path);
		return -1;
	}

	int32_t fd = open(dev_path, O_RDONLY);

	if (fd < 0) {
		if (errno == EACCES) {
			cf_detail(CF_HARDWARE, "insufficient privileges to open %s",
					dev_path);
		}
		else {
			cf_warning(CF_HARDWARE, "failed to open %s: %d (%s)",
					dev_path, errno, cf_strerror(errno));
		}

		return -1;
	}

	uint8_t *buff = cf_valloc(SZ_BUFF);

	// Silence Valgrind, which doesn't know about this ioctl.

	memset(buff, 0, SZ_BUFF);

	// NVMe specification: https://bit.ly/2HPAS99
	//
	// - See 4.2 for overall command format.
	// - See 5.14 for specifics of the Get Log page command.
	//
	// "0's based value" in the spec means that a value x in a data structure
	// actually means x + 1.

	uint32_t numdl = (SZ_BUFF / 4) - 1; // number of dwords lower (0's based)
	uint32_t lid = 2; // log page identifier: 2 (SMART log)

	uint32_t cdw10 = (numdl << 16) | lid;

	struct nvme_admin_cmd cmd = {
		.opcode = 0x02, // Get Log Page
		.nsid = 0xffffffff, // no namespace
		.addr = (uint64_t)buff, // result buffer
		.data_len = SZ_BUFF, // size of result buffer
		.cdw10 = cdw10 // command arguments
	};

	cf_process_enable_cap(CAP_SYS_ADMIN);

	cf_detail(CF_HARDWARE, "querying %s", dev_path);
	int32_t res = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);

	cf_process_disable_cap(CAP_SYS_ADMIN);

	if (res < 0) {
		// Older kernels that don't support the IOCTL return EINVAL.
		// Submitting to non-NVMe devices causes ENOTTY.
		if (errno != EINVAL && errno != ENOTTY) {
			cf_warning(CF_HARDWARE, "failed to submit command to %s: %d (%s)",
					dev_path, errno, cf_strerror(errno));
		}

		cf_free(buff);
		close(fd);
		return -1;
	}

	if (res > 0) {
		// Some virtualized environments don't provide a SMART log page.
		if (res != NVME_SC_INVALID_LOG_PAGE) {
			cf_warning(CF_HARDWARE, "failed to submit command to %s: 0x%x",
					dev_path, res);
		}

		cf_free(buff);
		close(fd);
		return -1;
	}

	// 0 <= age <= 255 - the reported percentage used may exceed 100 when a
	// drive outlives its vendor's endurance prediction.

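	// Byte 5 of the SMART / Health Information log page holds "Percentage
	// Used".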
	int32_t age = buff[5];
	cf_detail(CF_HARDWARE, "percentage lived %d", age);

	cf_free(buff);
	close(fd);

	return age;
}

static void
update_path_data(path_data_t *data)
{
	cf_storage_device_info *info = &data->info;

	cf_detail(CF_HARDWARE, "updating path data for %s", info->dev_path);

	for (uint32_t i = 0; i < info->n_phys; ++i) {
		cf_detail(CF_HARDWARE, "updating %s", info->phys[i].dev_path);
		info->phys[i].nvme_age = get_nvme_age(info->phys[i].dev_path);
	}

	data->mod_time = cf_get_seconds();
}

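// Depth-first traversal of the device graph below 'node': record every I/O
// scheduler attribute seen along the way and every leaf, i.e., physical
// device, that the traversal reaches.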
static void
visit_children(path_data_t *data, dev_node_t *node)
{
	cf_storage_device_info *info = &data->info;

	cf_detail(CF_HARDWARE, "considering %s for %s", node->dev_path,
			info->dev_path);

	if (node->sys_sched[0] != 0) {
		cf_detail(CF_HARDWARE, "found scheduler %s", node->sys_sched);

		uint32_t n_sys_scheds = data->n_sys_scheds;

		if (n_sys_scheds >= CF_STORAGE_MAX_PHYS) {
			cf_warning(CF_HARDWARE, "too many schedulers for %s",
					info->dev_path);
			return;
		}

		data->sys_scheds[n_sys_scheds] = node->sys_sched;
		++data->n_sys_scheds;
	}

	if (node->n_children == 0) {
		cf_detail(CF_HARDWARE, "found physical device");

		uint32_t n_phys = info->n_phys;

		if (n_phys >= CF_STORAGE_MAX_PHYS) {
			cf_warning(CF_HARDWARE, "too many physical devices for %s",
					info->dev_path);
			return;
		}

		info->phys[n_phys].dev_path = node->dev_path;
		info->phys[n_phys].numa_node = get_numa_node(node->sys_home);
		info->phys[n_phys].nvme_age = -1;

		++info->n_phys;
		return;
	}

	cf_detail(CF_HARDWARE, "examining children");

	for (uint32_t i = 0; i < node->n_children; ++i) {
		visit_children(data, node->children[i]);
	}
}

static path_data_t *
new_path_data(const char *any_path)
{
	cf_detail(CF_HARDWARE, "creating path data for %s", any_path);

	path_data_t *data = cf_malloc(sizeof(path_data_t));
	struct stat st;

	if (stat(any_path, &st) < 0) {
		cf_warning(CF_HARDWARE, "failed to query meta data for %s: %d (%s)",
				any_path, errno, cf_strerror(errno));
		cf_free(data);
		return NULL;
	}

	cf_storage_device_info *info = &data->info;

	if (S_ISREG(st.st_mode) || S_ISDIR(st.st_mode)) {
		cf_detail(CF_HARDWARE, "%s is a file or directory", any_path);
		info->dev_path = get_mounted_device(any_path);

		if (info->dev_path == NULL) {
			cf_free(data);
			return NULL;
		}
	}
	else if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
		cf_detail(CF_HARDWARE, "%s is a device", any_path);
		info->dev_path = cf_strdup(any_path);
	}
	else {
		cf_warning(CF_HARDWARE, "%s with unknown type 0x%x", any_path,
				st.st_mode & S_IFMT);
		cf_free(data);
		return NULL;
	}

	cf_detail(CF_HARDWARE, "mapping device %s", info->dev_path);

	dev_key_t key;

	if (!get_dev_key(info->dev_path, &key)) {
		cf_free(info->dev_path);
		cf_free(data);
		return NULL;
	}

	dev_node_t *node;

	if (cf_shash_get(g_dev_graph, &key, &node) != CF_SHASH_OK) {
		cf_warning(CF_HARDWARE, "no node for device key %u:%u", key.major,
				key.minor);
		cf_free(info->dev_path);
		cf_free(data);
		return NULL;
	}

	cf_detail(CF_HARDWARE, "collecting dependency info");

	data->n_sys_scheds = 0;
	info->n_phys = 0;

	visit_children(data, node);

	cf_detail(CF_HARDWARE, "populating NVMe age");
	update_path_data(data);

	return data;
}

static path_data_t *
get_path_data(const char *any_path)
{
	cf_detail(CF_HARDWARE, "getting path data for %s", any_path);

	cf_mutex_lock(&g_path_data_lock);

	if (g_dev_graph == NULL) {
		build_device_graph();
	}

	if (g_path_data == NULL) {
		g_path_data = cf_shash_create(cf_shash_fn_zstr,
				DEVICE_PATH_SIZE, sizeof(path_data_t *), 256, 0);
	}

	size_t len = strlen(any_path);

	if (len >= DEVICE_PATH_SIZE) {
		cf_warning(CF_HARDWARE, "device path %s is too long", any_path);
		cf_mutex_unlock(&g_path_data_lock);
		return NULL;
	}

	char key[DEVICE_PATH_SIZE];

	memcpy(key, any_path, len);
	memset(key + len, 0, DEVICE_PATH_SIZE - len);

	path_data_t *data;

	if (cf_shash_get(g_path_data, key, &data) != CF_SHASH_OK) {
		cf_detail(CF_HARDWARE, "no path data");

		data = new_path_data(any_path);

		if (data == NULL) {
			cf_mutex_unlock(&g_path_data_lock);
			return NULL;
		}

		cf_shash_put_unique(g_path_data, key, &data);
	}
	else {
		cf_detail(CF_HARDWARE, "existing path data");
	}

	cf_clock now = cf_get_seconds();

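	// Refresh the cached NVMe age at most once per day (86,400 seconds).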
	if (now > data->mod_time + 86400) {
		update_path_data(data);
	}

	cf_mutex_unlock(&g_path_data_lock);
	return data;
}

cf_storage_device_info *
cf_storage_get_device_info(const char *path)
{
	cf_detail(CF_HARDWARE, "getting device info for %s", path);

	path_data_t *data = get_path_data(path);

	if (data == NULL) {
		return NULL;
	}

	return &data->info;
}

void
cf_storage_set_scheduler(const char *path, const char *sched)
{
	cf_detail(CF_HARDWARE, "setting scheduler for %s to %s", path, sched);

	path_data_t *data = get_path_data(path);

	if (data == NULL) {
		cf_warning(CF_HARDWARE, "couldn't find path data for %s", path);
		return;
	}

	bool failed = false;

	for (uint32_t i = 0; i < data->n_sys_scheds; ++i) {
		if (write_file(data->sys_scheds[i], sched, strlen(sched)) !=
				FILE_RES_OK) {
			failed = true;
		}
	}

	if (failed) {
		cf_warning(CF_HARDWARE, "couldn't set scheduler for %s to %s", path,
				sched);
	}
	else {
		cf_info(CF_HARDWARE, "set scheduler for %s to %s", path, sched);
	}
}

int64_t
cf_storage_file_system_size(const char *path)
{
	struct stat file;

	if (stat(path, &file) < 0) {
		switch (errno) {
		case ENOENT:
			cf_warning(CF_HARDWARE, "mount point %s does not exist", path);
			break;

		case EACCES:
			cf_warning(CF_HARDWARE, "access to mount point %s denied", path);
			break;

		default:
			cf_warning(CF_HARDWARE,
					"error while querying mount point %s: %d (%s)", path,
					errno, cf_strerror(errno));
			break;
		}

		return -1;
	}

	if (!S_ISDIR(file.st_mode)) {
		cf_warning(CF_HARDWARE, "mount point %s is not a directory", path);
		return -1;
	}

	struct statfs fs;

	if (statfs(path, &fs) < 0) {
		cf_warning(CF_HARDWARE,
				"error while querying mount point %s: %d (%s)", path,
				errno, cf_strerror(errno));
		return -1;
	}

	int64_t sz = (int64_t)fs.f_bsize * (int64_t)fs.f_blocks;

	cf_detail(CF_HARDWARE, "file system size of %s is %" PRId64, path, sz);
	return sz;
}

bool
cf_storage_is_root_fs(const char *path)
{
	struct statvfs vfs;

	if (statvfs("/", &vfs) < 0) {
		cf_crash(CF_HARDWARE, "cannot stat root directory");
	}

	uint64_t root_id = vfs.f_fsid;

	if (statvfs(path, &vfs) < 0) {
		cf_warning(CF_HARDWARE, "cannot stat %s: %d (%s)", path, errno,
				cf_strerror(errno));
		return false;
	}

	return vfs.f_fsid == root_id;
}

void
cf_page_cache_dirty_limits(void)
{
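	// Keep dirty data from piling up in the page cache: cap dirty memory at
	// 16 MiB, make background writeback kick in as soon as anything is dirty
	// (1 byte), expire dirty pages after 1 centisecond, and wake the
	// writeback threads every 10 centiseconds.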
	write_file_safe("/proc/sys/vm/dirty_bytes", "16777216", 8);
	write_file_safe("/proc/sys/vm/dirty_background_bytes", "1", 1);
	write_file_safe("/proc/sys/vm/dirty_expire_centisecs", "1", 1);
	write_file_safe("/proc/sys/vm/dirty_writeback_centisecs", "10", 2);
}

bool
cf_mount_is_local(const char *path)
{
	if (g_i_numa_node == INVALID_INDEX) {
		cf_detail(CF_HARDWARE, "not NUMA pinned");
		return true;
	}

	cf_storage_device_info *info = cf_storage_get_device_info(path);

	if (info == NULL) {
		cf_crash_nostack(CF_HARDWARE, "can't get device info for %s", path);
	}

	cf_topo_numa_node_index numa_node = info->phys[0].numa_node;

	for (uint32_t i = 1; i < info->n_phys; i++) {
		if (info->phys[i].numa_node != numa_node) {
			cf_crash_nostack(CF_HARDWARE, "can't NUMA pin %s (%s,%s)", path,
					info->phys[0].dev_path, info->phys[i].dev_path);
		}
	}

	return numa_node == g_i_numa_node;
}