1/*
2 * Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#include <string.h>
26#include <math.h>
27#include <errno.h>
28#include "utilities/globalDefinitions.hpp"
29#include "memory/allocation.hpp"
30#include "runtime/os.hpp"
31#include "logging/log.hpp"
32#include "osContainer_linux.hpp"
33
34/*
35 * PER_CPU_SHARES has been set to 1024 because CPU shares' quota
36 * is commonly used in cloud frameworks like Kubernetes[1],
37 * AWS[2] and Mesos[3] in a similar way. They spawn containers with
38 * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
39 * the inverse for determining the number of possible available
40 * CPUs to the JVM inside a container. See JDK-8216366.
41 *
42 * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
43 * In particular:
44 * When using Docker:
45 * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially
46 * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the
47 * --cpu-shares flag in the docker run command.
48 * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
49 * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648
50 * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30
51 */
52#define PER_CPU_SHARES 1024
53
/* Set to true as soon as OSContainer::init() starts doing its work. */
54bool OSContainer::_is_initialized = false;
/* Set to true only at the very end of OSContainer::init(), i.e. after
 * all required cgroup controllers were found and their paths processed. */
55bool OSContainer::_is_containerized = false;
/* Assigned in OSContainer::init(): LONG_MAX rounded down to a multiple of
 * the VM page size. The memory limit getters report a value read from a
 * cgroup file that is >= this threshold as "unlimited" (-1). */
56julong _unlimited_memory;
57
58class CgroupSubsystem: CHeapObj<mtInternal> {
59 friend class OSContainer;
60
61 private:
62 /* mountinfo contents */
63 char *_root;
64 char *_mount_point;
65
66 /* Constructed subsystem directory */
67 char *_path;
68
69 public:
70 CgroupSubsystem(char *root, char *mountpoint) {
71 _root = os::strdup(root);
72 _mount_point = os::strdup(mountpoint);
73 _path = NULL;
74 }
75
76 /*
77 * Set directory to subsystem specific files based
78 * on the contents of the mountinfo and cgroup files.
79 */
80 void set_subsystem_path(char *cgroup_path) {
81 char buf[MAXPATHLEN+1];
82 if (_root != NULL && cgroup_path != NULL) {
83 if (strcmp(_root, "/") == 0) {
84 int buflen;
85 strncpy(buf, _mount_point, MAXPATHLEN);
86 buf[MAXPATHLEN-1] = '\0';
87 if (strcmp(cgroup_path,"/") != 0) {
88 buflen = strlen(buf);
89 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
90 return;
91 }
92 strncat(buf, cgroup_path, MAXPATHLEN-buflen);
93 buf[MAXPATHLEN-1] = '\0';
94 }
95 _path = os::strdup(buf);
96 } else {
97 if (strcmp(_root, cgroup_path) == 0) {
98 strncpy(buf, _mount_point, MAXPATHLEN);
99 buf[MAXPATHLEN-1] = '\0';
100 _path = os::strdup(buf);
101 } else {
102 char *p = strstr(cgroup_path, _root);
103 if (p != NULL && p == _root) {
104 if (strlen(cgroup_path) > strlen(_root)) {
105 int buflen;
106 strncpy(buf, _mount_point, MAXPATHLEN);
107 buf[MAXPATHLEN-1] = '\0';
108 buflen = strlen(buf);
109 if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) {
110 return;
111 }
112 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
113 buf[MAXPATHLEN-1] = '\0';
114 _path = os::strdup(buf);
115 }
116 }
117 }
118 }
119 }
120 }
121
122 char *subsystem_path() { return _path; }
123};
124
125class CgroupMemorySubsystem: CgroupSubsystem {
126 friend class OSContainer;
127
128 private:
129 /* Some container runtimes set limits via cgroup
130 * hierarchy. If set to true consider also memory.stat
131 * file if everything else seems unlimited */
132 bool _uses_mem_hierarchy;
133
134 public:
135 CgroupMemorySubsystem(char *root, char *mountpoint) : CgroupSubsystem::CgroupSubsystem(root, mountpoint) {
136 _uses_mem_hierarchy = false;
137 }
138
139 bool is_hierarchical() { return _uses_mem_hierarchy; }
140 void set_hierarchical(bool value) { _uses_mem_hierarchy = value; }
141};
142
/* The cgroup controllers used by this file. Each pointer stays NULL until
 * OSContainer::init() finds the controller named in a mountinfo entry and
 * allocates the object. */
143CgroupMemorySubsystem* memory = NULL;
144CgroupSubsystem* cpuset = NULL;
145CgroupSubsystem* cpu = NULL;
146CgroupSubsystem* cpuacct = NULL;
147
/* Spelling of char* usable as a single macro argument; passed as the
 * return_type of GET_CONTAINER_INFO_CPTR. */
148typedef char * cptr;
149
150PRAGMA_DIAG_PUSH
151PRAGMA_FORMAT_NONLITERAL_IGNORED
152template <typename T> int subsystem_file_line_contents(CgroupSubsystem* c,
153 const char *filename,
154 const char *matchline,
155 const char *scan_fmt,
156 T returnval) {
157 FILE *fp = NULL;
158 char *p;
159 char file[MAXPATHLEN+1];
160 char buf[MAXPATHLEN+1];
161 char discard[MAXPATHLEN+1];
162 bool found_match = false;
163
164 if (c == NULL) {
165 log_debug(os, container)("subsystem_file_line_contents: CgroupSubsytem* is NULL");
166 return OSCONTAINER_ERROR;
167 }
168 if (c->subsystem_path() == NULL) {
169 log_debug(os, container)("subsystem_file_line_contents: subsystem path is NULL");
170 return OSCONTAINER_ERROR;
171 }
172
173 strncpy(file, c->subsystem_path(), MAXPATHLEN);
174 file[MAXPATHLEN-1] = '\0';
175 int filelen = strlen(file);
176 if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) {
177 log_debug(os, container)("File path too long %s, %s", file, filename);
178 return OSCONTAINER_ERROR;
179 }
180 strncat(file, filename, MAXPATHLEN-filelen);
181 log_trace(os, container)("Path to %s is %s", filename, file);
182 fp = fopen(file, "r");
183 if (fp != NULL) {
184 int err = 0;
185 while ((p = fgets(buf, MAXPATHLEN, fp)) != NULL) {
186 found_match = false;
187 if (matchline == NULL) {
188 // single-line file case
189 int matched = sscanf(p, scan_fmt, returnval);
190 found_match = (matched == 1);
191 } else {
192 // multi-line file case
193 if (strstr(p, matchline) != NULL) {
194 // discard matchline string prefix
195 int matched = sscanf(p, scan_fmt, discard, returnval);
196 found_match = (matched == 2);
197 } else {
198 continue; // substring not found
199 }
200 }
201 if (found_match) {
202 fclose(fp);
203 return 0;
204 } else {
205 err = 1;
206 log_debug(os, container)("Type %s not found in file %s", scan_fmt, file);
207 }
208 }
209 if (err == 0) {
210 log_debug(os, container)("Empty file %s", file);
211 }
212 } else {
213 log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno));
214 }
215 if (fp != NULL)
216 fclose(fp);
217 return OSCONTAINER_ERROR;
218}
219PRAGMA_DIAG_POP
220
/*
 * Declare a local `variable` of type return_type and fill it from
 * `filename` of `subsystem` via subsystem_file_line_contents() (no
 * matchline, value scanned with scan_fmt).
 * If the read fails, the function using this macro returns
 * OSCONTAINER_ERROR cast to return_type. Otherwise the value is
 * trace-logged with logstring.
 */
#define GET_CONTAINER_INFO(return_type, subsystem, filename,              \
                           logstring, scan_fmt, variable)                 \
  return_type variable;                                                   \
{                                                                         \
  const int read_status = subsystem_file_line_contents(subsystem,         \
                                                       filename,          \
                                                       NULL,              \
                                                       scan_fmt,          \
                                                       &variable);        \
  if (read_status != 0) {                                                 \
    return (return_type) OSCONTAINER_ERROR;                               \
  }                                                                       \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
236
/*
 * String flavour of GET_CONTAINER_INFO: declare a local char array
 * `variable` of bufsize bytes and scan the contents of `filename` of
 * `subsystem` into it with scan_fmt.
 * If the read fails, the function using this macro returns NULL cast to
 * return_type. Otherwise the string is trace-logged with logstring.
 */
#define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename,         \
                                logstring, scan_fmt, variable, bufsize)   \
  char variable[bufsize];                                                 \
{                                                                         \
  const int read_status = subsystem_file_line_contents(subsystem,         \
                                                       filename,          \
                                                       NULL,              \
                                                       scan_fmt,          \
                                                       variable);         \
  if (read_status != 0) {                                                 \
    return (return_type) NULL;                                            \
  }                                                                       \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
252
/*
 * Multi-line flavour of GET_CONTAINER_INFO: declare a local `variable`
 * of type return_type and fill it from the line of `filename` that
 * contains `matchline`. scan_fmt has to consume the leading key string
 * and then the value.
 * If the read fails, the function using this macro returns
 * OSCONTAINER_ERROR cast to return_type. Otherwise the value is
 * trace-logged with logstring.
 */
#define GET_CONTAINER_INFO_LINE(return_type, subsystem, filename,         \
                                matchline, logstring, scan_fmt, variable) \
  return_type variable;                                                   \
{                                                                         \
  const int read_status = subsystem_file_line_contents(subsystem,         \
                                                       filename,          \
                                                       matchline,         \
                                                       scan_fmt,          \
                                                       &variable);        \
  if (read_status != 0) {                                                 \
    return (return_type) OSCONTAINER_ERROR;                               \
  }                                                                       \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
268
269/* init
270 *
271 * Initialize the container support and determine if
272 * we are running under cgroup control.
273 */
274void OSContainer::init() {
275 FILE *mntinfo = NULL;
276 FILE *cgroup = NULL;
277 char buf[MAXPATHLEN+1];
278 char tmproot[MAXPATHLEN+1];
279 char tmpmount[MAXPATHLEN+1];
280 char *p;
281 jlong mem_limit;
282
283 assert(!_is_initialized, "Initializing OSContainer more than once");
284
285 _is_initialized = true;
286 _is_containerized = false;
287
288 _unlimited_memory = (LONG_MAX / os::vm_page_size()) * os::vm_page_size();
289
290 log_trace(os, container)("OSContainer::init: Initializing Container Support");
291 if (!UseContainerSupport) {
292 log_trace(os, container)("Container Support not enabled");
293 return;
294 }
295
296 /*
297 * Find the cgroup mount point for memory and cpuset
298 * by reading /proc/self/mountinfo
299 *
300 * Example for docker:
301 * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
302 *
303 * Example for host:
304 * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
305 */
306 mntinfo = fopen("/proc/self/mountinfo", "r");
307 if (mntinfo == NULL) {
308 log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
309 os::strerror(errno));
310 return;
311 }
312
313 while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
314 char tmpcgroups[MAXPATHLEN+1];
315 char *cptr = tmpcgroups;
316 char *token;
317
318 // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt
319 if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) {
320 continue;
321 }
322 while ((token = strsep(&cptr, ",")) != NULL) {
323 if (strcmp(token, "memory") == 0) {
324 memory = new CgroupMemorySubsystem(tmproot, tmpmount);
325 } else if (strcmp(token, "cpuset") == 0) {
326 cpuset = new CgroupSubsystem(tmproot, tmpmount);
327 } else if (strcmp(token, "cpu") == 0) {
328 cpu = new CgroupSubsystem(tmproot, tmpmount);
329 } else if (strcmp(token, "cpuacct") == 0) {
330 cpuacct= new CgroupSubsystem(tmproot, tmpmount);
331 }
332 }
333 }
334
335 fclose(mntinfo);
336
337 if (memory == NULL) {
338 log_debug(os, container)("Required cgroup memory subsystem not found");
339 return;
340 }
341 if (cpuset == NULL) {
342 log_debug(os, container)("Required cgroup cpuset subsystem not found");
343 return;
344 }
345 if (cpu == NULL) {
346 log_debug(os, container)("Required cgroup cpu subsystem not found");
347 return;
348 }
349 if (cpuacct == NULL) {
350 log_debug(os, container)("Required cgroup cpuacct subsystem not found");
351 return;
352 }
353
354 /*
355 * Read /proc/self/cgroup and map host mount point to
356 * local one via /proc/self/mountinfo content above
357 *
358 * Docker example:
359 * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
360 *
361 * Host example:
362 * 5:memory:/user.slice
363 *
364 * Construct a path to the process specific memory and cpuset
365 * cgroup directory.
366 *
367 * For a container running under Docker from memory example above
368 * the paths would be:
369 *
370 * /sys/fs/cgroup/memory
371 *
372 * For a Host from memory example above the path would be:
373 *
374 * /sys/fs/cgroup/memory/user.slice
375 *
376 */
377 cgroup = fopen("/proc/self/cgroup", "r");
378 if (cgroup == NULL) {
379 log_debug(os, container)("Can't open /proc/self/cgroup, %s",
380 os::strerror(errno));
381 return;
382 }
383
384 while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
385 char *controllers;
386 char *token;
387 char *base;
388
389 /* Skip cgroup number */
390 strsep(&p, ":");
391 /* Get controllers and base */
392 controllers = strsep(&p, ":");
393 base = strsep(&p, "\n");
394
395 if (controllers == NULL) {
396 continue;
397 }
398
399 while ((token = strsep(&controllers, ",")) != NULL) {
400 if (strcmp(token, "memory") == 0) {
401 memory->set_subsystem_path(base);
402 jlong hierarchy = uses_mem_hierarchy();
403 if (hierarchy > 0) {
404 memory->set_hierarchical(true);
405 }
406 } else if (strcmp(token, "cpuset") == 0) {
407 cpuset->set_subsystem_path(base);
408 } else if (strcmp(token, "cpu") == 0) {
409 cpu->set_subsystem_path(base);
410 } else if (strcmp(token, "cpuacct") == 0) {
411 cpuacct->set_subsystem_path(base);
412 }
413 }
414 }
415
416 fclose(cgroup);
417
418 // We need to update the amount of physical memory now that
419 // command line arguments have been processed.
420 if ((mem_limit = memory_limit_in_bytes()) > 0) {
421 os::Linux::set_physical_memory(mem_limit);
422 log_info(os, container)("Memory Limit is: " JLONG_FORMAT, mem_limit);
423 }
424
425 _is_containerized = true;
426
427}
428
429const char * OSContainer::container_type() {
430 if (is_containerized()) {
431 return "cgroupv1";
432 } else {
433 return NULL;
434 }
435}
436
437/* uses_mem_hierarchy
438 *
439 * Return whether or not hierarchical cgroup accounting is being
440 * done.
441 *
442 * return:
443 * A number > 0 if true, or
444 * OSCONTAINER_ERROR for not supported
445 */
446jlong OSContainer::uses_mem_hierarchy() {
447 GET_CONTAINER_INFO(jlong, memory, "/memory.use_hierarchy",
448 "Use Hierarchy is: " JLONG_FORMAT, JLONG_FORMAT, use_hierarchy);
449 return use_hierarchy;
450}
451
452
453/* memory_limit_in_bytes
454 *
455 * Return the limit of available memory for this process.
456 *
457 * return:
458 * memory limit in bytes or
459 * -1 for unlimited
460 * OSCONTAINER_ERROR for not supported
461 */
462jlong OSContainer::memory_limit_in_bytes() {
463 GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes",
464 "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit);
465
466 if (memlimit >= _unlimited_memory) {
467 log_trace(os, container)("Non-Hierarchical Memory Limit is: Unlimited");
468 if (memory->is_hierarchical()) {
469 const char* matchline = "hierarchical_memory_limit";
470 const char* format = "%s " JULONG_FORMAT;
471 GET_CONTAINER_INFO_LINE(julong, memory, "/memory.stat", matchline,
472 "Hierarchical Memory Limit is: " JULONG_FORMAT, format, hier_memlimit)
473 if (hier_memlimit >= _unlimited_memory) {
474 log_trace(os, container)("Hierarchical Memory Limit is: Unlimited");
475 } else {
476 return (jlong)hier_memlimit;
477 }
478 }
479 return (jlong)-1;
480 }
481 else {
482 return (jlong)memlimit;
483 }
484}
485
486jlong OSContainer::memory_and_swap_limit_in_bytes() {
487 GET_CONTAINER_INFO(julong, memory, "/memory.memsw.limit_in_bytes",
488 "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit);
489 if (memswlimit >= _unlimited_memory) {
490 log_trace(os, container)("Non-Hierarchical Memory and Swap Limit is: Unlimited");
491 if (memory->is_hierarchical()) {
492 const char* matchline = "hierarchical_memsw_limit";
493 const char* format = "%s " JULONG_FORMAT;
494 GET_CONTAINER_INFO_LINE(julong, memory, "/memory.stat", matchline,
495 "Hierarchical Memory and Swap Limit is : " JULONG_FORMAT, format, hier_memlimit)
496 if (hier_memlimit >= _unlimited_memory) {
497 log_trace(os, container)("Hierarchical Memory and Swap Limit is: Unlimited");
498 } else {
499 return (jlong)hier_memlimit;
500 }
501 }
502 return (jlong)-1;
503 } else {
504 return (jlong)memswlimit;
505 }
506}
507
508jlong OSContainer::memory_soft_limit_in_bytes() {
509 GET_CONTAINER_INFO(julong, memory, "/memory.soft_limit_in_bytes",
510 "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit);
511 if (memsoftlimit >= _unlimited_memory) {
512 log_trace(os, container)("Memory Soft Limit is: Unlimited");
513 return (jlong)-1;
514 } else {
515 return (jlong)memsoftlimit;
516 }
517}
518
519/* memory_usage_in_bytes
520 *
521 * Return the amount of used memory for this process.
522 *
523 * return:
524 * memory usage in bytes or
525 * -1 for unlimited
526 * OSCONTAINER_ERROR for not supported
527 */
528jlong OSContainer::memory_usage_in_bytes() {
529 GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes",
530 "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
531 return memusage;
532}
533
534/* memory_max_usage_in_bytes
535 *
536 * Return the maximum amount of used memory for this process.
537 *
538 * return:
539 * max memory usage in bytes or
540 * OSCONTAINER_ERROR for not supported
541 */
542jlong OSContainer::memory_max_usage_in_bytes() {
543 GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes",
544 "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage);
545 return memmaxusage;
546}
547
548/* active_processor_count
549 *
550 * Calculate an appropriate number of active processors for the
551 * VM to use based on these three inputs.
552 *
553 * cpu affinity
554 * cgroup cpu quota & cpu period
555 * cgroup cpu shares
556 *
557 * Algorithm:
558 *
559 * Determine the number of available CPUs from sched_getaffinity
560 *
561 * If user specified a quota (quota != -1), calculate the number of
562 * required CPUs by dividing quota by period.
563 *
564 * If shares are in effect (shares != -1), calculate the number
565 * of CPUs required for the shares by dividing the share value
566 * by PER_CPU_SHARES.
567 *
568 * All results of division are rounded up to the next whole number.
569 *
570 * If neither shares or quotas have been specified, return the
571 * number of active processors in the system.
572 *
573 * If both shares and quotas have been specified, the results are
574 * based on the flag PreferContainerQuotaForCPUCount. If true,
575 * return the quota value. If false return the smallest value
576 * between shares or quotas.
577 *
578 * If shares and/or quotas have been specified, the resulting number
579 * returned will never exceed the number of active processors.
580 *
581 * return:
582 * number of CPUs
583 */
584int OSContainer::active_processor_count() {
585 int quota_count = 0, share_count = 0;
586 int cpu_count, limit_count;
587 int result;
588
589 cpu_count = limit_count = os::Linux::active_processor_count();
590 int quota = cpu_quota();
591 int period = cpu_period();
592 int share = cpu_shares();
593
594 if (quota > -1 && period > 0) {
595 quota_count = ceilf((float)quota / (float)period);
596 log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
597 }
598 if (share > -1) {
599 share_count = ceilf((float)share / (float)PER_CPU_SHARES);
600 log_trace(os, container)("CPU Share count based on shares: %d", share_count);
601 }
602
603 // If both shares and quotas are setup results depend
604 // on flag PreferContainerQuotaForCPUCount.
605 // If true, limit CPU count to quota
606 // If false, use minimum of shares and quotas
607 if (quota_count !=0 && share_count != 0) {
608 if (PreferContainerQuotaForCPUCount) {
609 limit_count = quota_count;
610 } else {
611 limit_count = MIN2(quota_count, share_count);
612 }
613 } else if (quota_count != 0) {
614 limit_count = quota_count;
615 } else if (share_count != 0) {
616 limit_count = share_count;
617 }
618
619 result = MIN2(cpu_count, limit_count);
620 log_trace(os, container)("OSContainer::active_processor_count: %d", result);
621 return result;
622}
623
624char * OSContainer::cpu_cpuset_cpus() {
625 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
626 "cpuset.cpus is: %s", "%1023s", cpus, 1024);
627 return os::strdup(cpus);
628}
629
630char * OSContainer::cpu_cpuset_memory_nodes() {
631 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
632 "cpuset.mems is: %s", "%1023s", mems, 1024);
633 return os::strdup(mems);
634}
635
636/* cpu_quota
637 *
638 * Return the number of milliseconds per period
639 * process is guaranteed to run.
640 *
641 * return:
642 * quota time in milliseconds
643 * -1 for no quota
644 * OSCONTAINER_ERROR for not supported
645 */
646int OSContainer::cpu_quota() {
647 GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us",
648 "CPU Quota is: %d", "%d", quota);
649 return quota;
650}
651
652int OSContainer::cpu_period() {
653 GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us",
654 "CPU Period is: %d", "%d", period);
655 return period;
656}
657
658/* cpu_shares
659 *
660 * Return the amount of cpu shares available to the process
661 *
662 * return:
663 * Share number (typically a number relative to 1024)
664 * (2048 typically expresses 2 CPUs worth of processing)
665 * -1 for no share setup
666 * OSCONTAINER_ERROR for not supported
667 */
668int OSContainer::cpu_shares() {
669 GET_CONTAINER_INFO(int, cpu, "/cpu.shares",
670 "CPU Shares is: %d", "%d", shares);
671 // Convert 1024 to no shares setup
672 if (shares == 1024) return -1;
673
674 return shares;
675}
676
677