1 /*
2 * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "cgroupUtil_linux.hpp"
26 #include "cgroupV1Subsystem_linux.hpp"
27 #include "logging/log.hpp"
28 #include "memory/allocation.hpp"
29 #include "os_linux.hpp"
30 #include "runtime/globals.hpp"
31 #include "runtime/os.hpp"
32 #include "utilities/globalDefinitions.hpp"
33
34 #include <errno.h>
35 #include <math.h>
36 #include <string.h>
37
38 /*
39 * Set directory to subsystem specific files based
40 * on the contents of the mountinfo and cgroup files.
41 *
42 * The method determines whether it runs in
43 * - host mode
44 * - container mode
45 *
46 * In the host mode, _root is equal to "/" and
47 * the subsystem path is equal to the _mount_point path
48 * joined with cgroup_path.
49 *
50 * In the container mode, it can be two possibilities:
51 * - private namespace (cgroupns=private)
52 * - host namespace (cgroupns=host, default mode in cgroup V1 hosts)
53 *
54 * Private namespace is equivalent to the host mode, i.e.
55 * the subsystem path is set by concatenating
56 * _mount_point and cgroup_path.
57 *
58 * In the host namespace, _root is equal to host's cgroup path
59 * of the control group to which the containerized process
60 * belongs to at the moment of creation. The mountinfo and
61 * cgroup files are mirrored from the host, while the subsystem
62 * specific files are mapped directly at _mount_point, i.e.
63 * at /sys/fs/cgroup/<controller>/, the subsystem path is
64 * then set equal to _mount_point.
65 *
66 * A special case of the subsystem path is when a cgroup path
67 * includes a subgroup, when a containerized process was associated
68 * with an existing cgroup, that is different from cgroup
69 * in which the process has been created.
70 * Here, the _root is equal to the host's initial cgroup path,
71 * cgroup_path will be equal to host's new cgroup path.
72 * As host cgroup hierarchies are not accessible in the container,
73 * it needs to be determined which part of cgroup path
74 * is accessible inside container, i.e. mapped under
75 * /sys/fs/cgroup/<controller>/<subgroup>.
76 * In Docker default setup, host's cgroup path can be
77 * of the form: /docker/<CONTAINER_ID>/<subgroup>,
78 * from which only <subgroup> is mapped.
79 * The method trims cgroup path from left, until the subgroup
80 * component is found. The subsystem path will be set to
81 * the _mount_point joined with the subgroup path.
82 */
83 void CgroupV1Controller::set_subsystem_path(const char* cgroup_path) {
84 if (_cgroup_path != nullptr) {
85 os::free(_cgroup_path);
86 }
87 if (_path != nullptr) {
88 os::free(_path);
89 _path = nullptr;
90 }
91 _cgroup_path = os::strdup(cgroup_path);
92 stringStream ss;
93 if (_root != nullptr && cgroup_path != nullptr) {
94 ss.print_raw(_mount_point);
95 if (strcmp(_root, "/") == 0) {
96 // host processes and containers with cgroupns=private
97 if (strcmp(cgroup_path,"/") != 0) {
98 ss.print_raw(cgroup_path);
99 }
100 } else {
101 // containers with cgroupns=host, default setting is _root==cgroup_path
102 if (strcmp(_root, cgroup_path) != 0) {
103 if (*cgroup_path != '\0' && strcmp(cgroup_path, "/") != 0) {
104 // When moved to a subgroup, between subgroups, the path suffix will change.
105 const char *suffix = cgroup_path;
106 while (suffix != nullptr) {
107 stringStream pp;
108 pp.print_raw(_mount_point);
109 pp.print_raw(suffix);
110 if (os::file_exists(pp.base())) {
111 ss.print_raw(suffix);
112 if (suffix != cgroup_path) {
113 log_trace(os, container)("set_subsystem_path: cgroup v1 path reduced to: %s.", suffix);
114 }
115 break;
116 }
117 log_trace(os, container)("set_subsystem_path: skipped non-existent directory: %s.", suffix);
118 suffix = strchr(suffix + 1, '/');
119 }
120 }
121 }
122 }
123 _path = os::strdup(ss.base());
124 }
125 }
126
127 /*
128 * The common case, containers, we have _root == _cgroup_path, and thus set the
129 * controller path to the _mount_point. This is where the limits are exposed in
130 * the cgroup pseudo filesystem (at the leaf) and adjustment of the path won't
131 * be needed for that reason.
132 */
133 bool CgroupV1Controller::needs_hierarchy_adjustment() {
134 assert(_cgroup_path != nullptr, "sanity");
135 return strcmp(_root, _cgroup_path) != 0;
136 }
137
138 static inline
139 void verbose_log(julong read_mem_limit, julong upper_mem_bound) {
140 if (log_is_enabled(Debug, os, container)) {
141 jlong mem_limit = (jlong)read_mem_limit; // account for negative values
142 if (mem_limit < 0 || read_mem_limit >= upper_mem_bound) {
143 const char *reason;
144 if (mem_limit == OSCONTAINER_ERROR) {
145 reason = "failed";
146 } else if (mem_limit == -1) {
147 reason = "unlimited";
148 } else {
149 assert(read_mem_limit >= upper_mem_bound, "Expected read value exceeding upper memory bound");
150 // Exceeding physical memory is treated as unlimited. This implementation
151 // caps it at host_mem since Cg v1 has no value to represent 'max'.
152 reason = "ignored";
153 }
154 log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", upper bound is " JLONG_FORMAT,
155 reason, mem_limit, upper_mem_bound);
156 }
157 }
158 }
159
160 jlong CgroupV1MemoryController::read_memory_limit_in_bytes(julong upper_bound) {
161 julong memlimit;
162 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.limit_in_bytes", "Memory Limit", memlimit);
163 if (memlimit >= upper_bound) {
164 verbose_log(memlimit, upper_bound);
165 return (jlong)-1;
166 } else {
167 verbose_log(memlimit, upper_bound);
168 return (jlong)memlimit;
169 }
170 }
171
172 /* read_mem_swap
173 *
174 * Determine the memory and swap limit metric. Returns a positive limit value strictly
175 * lower than the physical memory and swap limit iff there is a limit. Otherwise a
176 * negative value is returned indicating the determined status.
177 *
178 * returns:
179 * * A number > 0 if the limit is available and lower than a physical upper bound.
180 * * OSCONTAINER_ERROR if the limit cannot be retrieved (i.e. not supported) or
181 * * -1 if there isn't any limit in place (note: includes values which exceed a physical
182 * upper bound)
183 */
184 jlong CgroupV1MemoryController::read_mem_swap(julong upper_memsw_bound) {
185 julong memswlimit;
186 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.memsw.limit_in_bytes", "Memory and Swap Limit", memswlimit);
187 if (memswlimit >= upper_memsw_bound) {
188 log_trace(os, container)("Memory and Swap Limit is: Unlimited");
189 return (jlong)-1;
190 } else {
191 return (jlong)memswlimit;
192 }
193 }
194
195 jlong CgroupV1MemoryController::memory_and_swap_limit_in_bytes(julong upper_mem_bound, julong upper_swap_bound) {
196 jlong memory_swap = read_mem_swap(upper_mem_bound + upper_swap_bound);
197 if (memory_swap == -1) {
198 return memory_swap;
199 }
200 // If there is a swap limit, but swappiness == 0, reset the limit
201 // to the memory limit. Do the same for cases where swap isn't
202 // supported.
203 jlong swappiness = read_mem_swappiness();
204 if (swappiness == 0 || memory_swap == OSCONTAINER_ERROR) {
205 jlong memlimit = read_memory_limit_in_bytes(upper_mem_bound);
206 if (memory_swap == OSCONTAINER_ERROR) {
207 log_trace(os, container)("Memory and Swap Limit has been reset to " JLONG_FORMAT " because swap is not supported", memlimit);
208 } else {
209 log_trace(os, container)("Memory and Swap Limit has been reset to " JLONG_FORMAT " because swappiness is 0", memlimit);
210 }
211 return memlimit;
212 }
213 return memory_swap;
214 }
215
216 static inline
217 jlong memory_swap_usage_impl(CgroupController* ctrl) {
218 julong memory_swap_usage;
219 CONTAINER_READ_NUMBER_CHECKED(ctrl, "/memory.memsw.usage_in_bytes", "mem swap usage", memory_swap_usage);
220 return (jlong)memory_swap_usage;
221 }
222
223 jlong CgroupV1MemoryController::memory_and_swap_usage_in_bytes(julong upper_mem_bound, julong upper_swap_bound) {
224 jlong memory_sw_limit = memory_and_swap_limit_in_bytes(upper_mem_bound, upper_swap_bound);
225 jlong memory_limit = read_memory_limit_in_bytes(upper_mem_bound);
226 if (memory_sw_limit > 0 && memory_limit > 0) {
227 jlong delta_swap = memory_sw_limit - memory_limit;
228 if (delta_swap > 0) {
229 return memory_swap_usage_impl(reader());
230 }
231 }
232 return memory_usage_in_bytes();
233 }
234
235 jlong CgroupV1MemoryController::read_mem_swappiness() {
236 julong swappiness;
237 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.swappiness", "Swappiness", swappiness);
238 return (jlong)swappiness;
239 }
240
241 jlong CgroupV1MemoryController::memory_soft_limit_in_bytes(julong upper_bound) {
242 julong memsoftlimit;
243 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.soft_limit_in_bytes", "Memory Soft Limit", memsoftlimit);
244 if (memsoftlimit >= upper_bound) {
245 log_trace(os, container)("Memory Soft Limit is: Unlimited");
246 return (jlong)-1;
247 } else {
248 return (jlong)memsoftlimit;
249 }
250 }
251
252 jlong CgroupV1MemoryController::memory_throttle_limit_in_bytes() {
253 // Log this string at trace level so as to make tests happy.
254 log_trace(os, container)("Memory Throttle Limit is not supported.");
255 return OSCONTAINER_ERROR; // not supported
256 }
257
258 // Constructor
259 CgroupV1Subsystem::CgroupV1Subsystem(CgroupV1Controller* cpuset,
260 CgroupV1CpuController* cpu,
261 CgroupV1CpuacctController* cpuacct,
262 CgroupV1Controller* pids,
263 CgroupV1MemoryController* memory) :
264 _cpuset(cpuset),
265 _cpuacct(cpuacct),
266 _pids(pids) {
267 CgroupUtil::adjust_controller(memory);
268 CgroupUtil::adjust_controller(cpu);
269 _memory = new CachingCgroupController<CgroupMemoryController>(memory);
270 _cpu = new CachingCgroupController<CgroupCpuController>(cpu);
271 }
272
273 bool CgroupV1Subsystem::is_containerized() {
274 // containerized iff all required controllers are mounted
275 // read-only. See OSContainer::is_containerized() for
276 // the full logic.
277 //
278 return _memory->controller()->is_read_only() &&
279 _cpu->controller()->is_read_only() &&
280 _cpuacct->is_read_only() &&
281 _cpuset->is_read_only();
282 }
283
284 /* memory_usage_in_bytes
285 *
286 * Return the amount of used memory for this process.
287 *
288 * return:
289 * memory usage in bytes or
290 * -1 for unlimited
291 * OSCONTAINER_ERROR for not supported
292 */
293 jlong CgroupV1MemoryController::memory_usage_in_bytes() {
294 julong memusage;
295 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.usage_in_bytes", "Memory Usage", memusage);
296 return (jlong)memusage;
297 }
298
299 /* memory_max_usage_in_bytes
300 *
301 * Return the maximum amount of used memory for this process.
302 *
303 * return:
304 * max memory usage in bytes or
305 * OSCONTAINER_ERROR for not supported
306 */
307 jlong CgroupV1MemoryController::memory_max_usage_in_bytes() {
308 julong memmaxusage;
309 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.max_usage_in_bytes", "Maximum Memory Usage", memmaxusage);
310 return (jlong)memmaxusage;
311 }
312
313 jlong CgroupV1MemoryController::rss_usage_in_bytes() {
314 julong rss;
315 bool is_ok = reader()->read_numerical_key_value("/memory.stat", "rss", &rss);
316 if (!is_ok) {
317 return OSCONTAINER_ERROR;
318 }
319 log_trace(os, container)("RSS usage is: " JULONG_FORMAT, rss);
320 return (jlong)rss;
321 }
322
323 jlong CgroupV1MemoryController::cache_usage_in_bytes() {
324 julong cache;
325 bool is_ok = reader()->read_numerical_key_value("/memory.stat", "cache", &cache);
326 if (!is_ok) {
327 return OSCONTAINER_ERROR;
328 }
329 log_trace(os, container)("Cache usage is: " JULONG_FORMAT, cache);
330 return cache;
331 }
332
333 jlong CgroupV1MemoryController::kernel_memory_usage_in_bytes() {
334 julong kmem_usage;
335 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.usage_in_bytes", "Kernel Memory Usage", kmem_usage);
336 return (jlong)kmem_usage;
337 }
338
339 jlong CgroupV1MemoryController::kernel_memory_limit_in_bytes(julong upper_bound) {
340 julong kmem_limit;
341 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.limit_in_bytes", "Kernel Memory Limit", kmem_limit);
342 if (kmem_limit >= upper_bound) {
343 return (jlong)-1;
344 }
345 return (jlong)kmem_limit;
346 }
347
348 jlong CgroupV1MemoryController::kernel_memory_max_usage_in_bytes() {
349 julong kmem_max_usage;
350 CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.max_usage_in_bytes", "Maximum Kernel Memory Usage", kmem_max_usage);
351 return (jlong)kmem_max_usage;
352 }
353
354 void CgroupV1MemoryController::print_version_specific_info(outputStream* st, julong mem_bound) {
355 jlong kmem_usage = kernel_memory_usage_in_bytes();
356 jlong kmem_limit = kernel_memory_limit_in_bytes(mem_bound);
357 jlong kmem_max_usage = kernel_memory_max_usage_in_bytes();
358
359 OSContainer::print_container_helper(st, kmem_limit, "kernel_memory_limit_in_bytes");
360 OSContainer::print_container_helper(st, kmem_usage, "kernel_memory_usage_in_bytes");
361 OSContainer::print_container_helper(st, kmem_max_usage, "kernel_memory_max_usage_in_bytes");
362 }
363
364 char* CgroupV1Subsystem::cpu_cpuset_cpus() {
365 char cpus[1024];
366 CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.cpus", "cpuset.cpus", cpus, 1024);
367 return os::strdup(cpus);
368 }
369
370 char* CgroupV1Subsystem::cpu_cpuset_memory_nodes() {
371 char mems[1024];
372 CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.mems", "cpuset.mems", mems, 1024);
373 return os::strdup(mems);
374 }
375
376 /* cpu_quota
377 *
378 * Return the number of microseconds per period
379 * process is guaranteed to run.
380 *
381 * return:
382 * quota time in microseconds
383 * -1 for no quota
384 * OSCONTAINER_ERROR for not supported
385 */
386 int CgroupV1CpuController::cpu_quota() {
387 julong quota;
388 bool is_ok = reader()->read_number("/cpu.cfs_quota_us", "a);
389 if (!is_ok) {
390 log_trace(os, container)("CPU Quota failed: %d", OSCONTAINER_ERROR);
391 return OSCONTAINER_ERROR;
392 }
393 // cast to int since the read value might be negative
394 // and we want to avoid logging -1 as a large unsigned value.
395 int quota_int = (int)quota;
396 log_trace(os, container)("CPU Quota is: %d", quota_int);
397 return quota_int;
398 }
399
400 int CgroupV1CpuController::cpu_period() {
401 julong period;
402 CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.cfs_period_us", "CPU Period", period);
403 return (int)period;
404 }
405
406 /* cpu_shares
407 *
408 * Return the amount of cpu shares available to the process
409 *
410 * return:
411 * Share number (typically a number relative to 1024)
412 * (2048 typically expresses 2 CPUs worth of processing)
413 * -1 for no share setup
414 * OSCONTAINER_ERROR for not supported
415 */
416 int CgroupV1CpuController::cpu_shares() {
417 julong shares;
418 CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.shares", "CPU Shares", shares);
419 int shares_int = (int)shares;
420 // Convert 1024 to no shares setup
421 if (shares_int == 1024) return -1;
422
423 return shares_int;
424 }
425
426 jlong CgroupV1CpuacctController::cpu_usage_in_micros() {
427 julong cpu_usage;
428 CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpuacct.usage", "CPU Usage", cpu_usage);
429 // Output is in nanoseconds, convert to microseconds.
430 return (jlong)cpu_usage / 1000;
431 }
432
433 /* pids_max
434 *
435 * Return the maximum number of tasks available to the process
436 *
437 * return:
438 * maximum number of tasks
439 * -1 for unlimited
440 * OSCONTAINER_ERROR for not supported
441 */
442 jlong CgroupV1Subsystem::pids_max() {
443 if (_pids == nullptr) return OSCONTAINER_ERROR;
444 jlong pids_max;
445 CONTAINER_READ_NUMBER_CHECKED_MAX(_pids, "/pids.max", "Maximum number of tasks", pids_max);
446 return pids_max;
447 }
448
449 /* pids_current
450 *
451 * The number of tasks currently in the cgroup (and its descendants) of the process
452 *
453 * return:
454 * current number of tasks
455 * OSCONTAINER_ERROR for not supported
456 */
457 jlong CgroupV1Subsystem::pids_current() {
458 if (_pids == nullptr) return OSCONTAINER_ERROR;
459 julong pids_current;
460 CONTAINER_READ_NUMBER_CHECKED(_pids, "/pids.current", "Current number of tasks", pids_current);
461 return (jlong)pids_current;
462 }