1 /*
  2  * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #include "cgroupUtil_linux.hpp"
 26 #include "cgroupV1Subsystem_linux.hpp"
 27 #include "logging/log.hpp"
 28 #include "memory/allocation.hpp"
 29 #include "os_linux.hpp"
 30 #include "runtime/globals.hpp"
 31 #include "runtime/os.hpp"
 32 #include "utilities/globalDefinitions.hpp"
 33 
 34 #include <errno.h>
 35 #include <math.h>
 36 #include <string.h>
 37 
 38 /*
 39  * Set directory to subsystem specific files based
 40  * on the contents of the mountinfo and cgroup files.
 41  *
 42  * The method determines whether it runs in
 43  * - host mode
 44  * - container mode
 45  *
 46  * In the host mode, _root is equal to "/" and
 47  * the subsystem path is equal to the _mount_point path
 48  * joined with cgroup_path.
 49  *
 50  * In the container mode, it can be two possibilities:
 51  * - private namespace (cgroupns=private)
 52  * - host namespace (cgroupns=host, default mode in cgroup V1 hosts)
 53  *
 54  * Private namespace is equivalent to the host mode, i.e.
 55  * the subsystem path is set by concatenating
 56  * _mount_point and cgroup_path.
 57  *
 58  * In the host namespace, _root is equal to host's cgroup path
 59  * of the control group to which the containerized process
 60  * belongs to at the moment of creation. The mountinfo and
 61  * cgroup files are mirrored from the host, while the subsystem
 62  * specific files are mapped directly at _mount_point, i.e.
 63  * at /sys/fs/cgroup/<controller>/, the subsystem path is
 64  * then set equal to _mount_point.
 65  *
 66  * A special case of the subsystem path is when a cgroup path
 67  * includes a subgroup, when a containerized process was associated
 68  * with an existing cgroup, that is different from cgroup
 69  * in which the process has been created.
 70  * Here, the _root is equal to the host's initial cgroup path,
 71  * cgroup_path will be equal to host's new cgroup path.
 72  * As host cgroup hierarchies are not accessible in the container,
 73  * it needs to be determined which part of cgroup path
 74  * is accessible inside container, i.e. mapped under
 75  * /sys/fs/cgroup/<controller>/<subgroup>.
 76  * In Docker default setup, host's cgroup path can be
 77  * of the form: /docker/<CONTAINER_ID>/<subgroup>,
 78  * from which only <subgroup> is mapped.
 79  * The method trims cgroup path from left, until the subgroup
 80  * component is found. The subsystem path will be set to
 81  * the _mount_point joined with the subgroup path.
 82  */
 83 void CgroupV1Controller::set_subsystem_path(const char* cgroup_path) {
 84   if (_cgroup_path != nullptr) {
 85     os::free(_cgroup_path);
 86   }
 87   if (_path != nullptr) {
 88     os::free(_path);
 89     _path = nullptr;
 90   }
 91   _cgroup_path = os::strdup(cgroup_path);
 92   stringStream ss;
 93   if (_root != nullptr && cgroup_path != nullptr) {
 94     ss.print_raw(_mount_point);
 95     if (strcmp(_root, "/") == 0) {
 96       // host processes and containers with cgroupns=private
 97       if (strcmp(cgroup_path,"/") != 0) {
 98         ss.print_raw(cgroup_path);
 99       }
100     } else {
101       // containers with cgroupns=host, default setting is _root==cgroup_path
102       if (strcmp(_root, cgroup_path) != 0) {
103         if (*cgroup_path != '\0' && strcmp(cgroup_path, "/") != 0) {
104           // When moved to a subgroup, between subgroups, the path suffix will change.
105           const char *suffix = cgroup_path;
106           while (suffix != nullptr) {
107             stringStream pp;
108             pp.print_raw(_mount_point);
109             pp.print_raw(suffix);
110             if (os::file_exists(pp.base())) {
111               ss.print_raw(suffix);
112               if (suffix != cgroup_path) {
113                 log_trace(os, container)("set_subsystem_path: cgroup v1 path reduced to: %s.", suffix);
114               }
115               break;
116             }
117             log_trace(os, container)("set_subsystem_path: skipped non-existent directory: %s.", suffix);
118             suffix = strchr(suffix + 1, '/');
119           }
120         }
121       }
122     }
123     _path = os::strdup(ss.base());
124   }
125 }
126 
127 jlong CgroupV1MemoryController::uses_mem_hierarchy() {
128   julong use_hierarchy;
129   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.use_hierarchy", "Use Hierarchy", use_hierarchy);
130   return (jlong)use_hierarchy;
131 }
132 
133 /*
134  * The common case, containers, we have _root == _cgroup_path, and thus set the
135  * controller path to the _mount_point. This is where the limits are exposed in
136  * the cgroup pseudo filesystem (at the leaf) and adjustment of the path won't
137  * be needed for that reason.
138  */
139 bool CgroupV1Controller::needs_hierarchy_adjustment() {
140   assert(_cgroup_path != nullptr, "sanity");
141   return strcmp(_root, _cgroup_path) != 0;
142 }
143 
144 static inline
145 void verbose_log(julong read_mem_limit, julong upper_mem_bound) {
146   if (log_is_enabled(Debug, os, container)) {
147     jlong mem_limit = (jlong)read_mem_limit; // account for negative values
148     if (mem_limit < 0 || read_mem_limit >= upper_mem_bound) {
149       const char *reason;
150       if (mem_limit == OSCONTAINER_ERROR) {
151         reason = "failed";
152       } else if (mem_limit == -1) {
153         reason = "unlimited";
154       } else {
155         assert(read_mem_limit >= upper_mem_bound, "Expected read value exceeding upper memory bound");
156         // Exceeding physical memory is treated as unlimited. This implementation
157         // caps it at host_mem since Cg v1 has no value to represent 'max'.
158         reason = "ignored";
159       }
160       log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", upper bound is " JLONG_FORMAT,
161                                reason, mem_limit, upper_mem_bound);
162     }
163   }
164 }
165 
166 jlong CgroupV1MemoryController::read_memory_limit_in_bytes(julong upper_bound) {
167   julong memlimit;
168   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.limit_in_bytes", "Memory Limit", memlimit);
169   if (memlimit >= upper_bound && uses_mem_hierarchy()) {
170     CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(reader(), "/memory.stat",
171                                                "hierarchical_memory_limit", "Hierarchical Memory Limit",
172                                                memlimit);
173   }
174   verbose_log(memlimit, upper_bound);
175   return (jlong)((memlimit < upper_bound) ? memlimit : -1);
176 }
177 
178 /* read_mem_swap
179  *
180  * Determine the memory and swap limit metric. Returns a positive limit value strictly
181  * lower than the physical memory and swap limit iff there is a limit. Otherwise a
182  * negative value is returned indicating the determined status.
183  *
184  * returns:
185  *    * A number > 0 if the limit is available and lower than a physical upper bound.
186  *    * OSCONTAINER_ERROR if the limit cannot be retrieved (i.e. not supported) or
187  *    * -1 if there isn't any limit in place (note: includes values which exceed a physical
188  *      upper bound)
189  */
190 jlong CgroupV1MemoryController::read_mem_swap(julong upper_memsw_bound) {
191   julong memswlimit;
192   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.memsw.limit_in_bytes", "Memory and Swap Limit", memswlimit);
193   if (memswlimit >= upper_memsw_bound && uses_mem_hierarchy()) {
194       CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(reader(), "/memory.stat",
195                                                  "hierarchical_memsw_limit", "Hierarchical Memory and Swap Limit",
196                                                  memswlimit);
197   }
198   verbose_log(memswlimit, upper_memsw_bound);
199   return (jlong)((memswlimit < upper_memsw_bound) ? memswlimit : -1);
200 }
201 
202 jlong CgroupV1MemoryController::memory_and_swap_limit_in_bytes(julong upper_mem_bound, julong upper_swap_bound) {
203   jlong memory_swap = read_mem_swap(upper_mem_bound + upper_swap_bound);
204   if (memory_swap == -1) {
205     return memory_swap;
206   }
207   // If there is a swap limit, but swappiness == 0, reset the limit
208   // to the memory limit. Do the same for cases where swap isn't
209   // supported.
210   jlong swappiness = read_mem_swappiness();
211   if (swappiness == 0 || memory_swap == OSCONTAINER_ERROR) {
212     jlong memlimit = read_memory_limit_in_bytes(upper_mem_bound);
213     if (memory_swap == OSCONTAINER_ERROR) {
214       log_trace(os, container)("Memory and Swap Limit has been reset to " JLONG_FORMAT " because swap is not supported", memlimit);
215     } else {
216       log_trace(os, container)("Memory and Swap Limit has been reset to " JLONG_FORMAT " because swappiness is 0", memlimit);
217     }
218     return memlimit;
219   }
220   return memory_swap;
221 }
222 
223 static inline
224 jlong memory_swap_usage_impl(CgroupController* ctrl) {
225   julong memory_swap_usage;
226   CONTAINER_READ_NUMBER_CHECKED(ctrl, "/memory.memsw.usage_in_bytes", "mem swap usage", memory_swap_usage);
227   return (jlong)memory_swap_usage;
228 }
229 
230 jlong CgroupV1MemoryController::memory_and_swap_usage_in_bytes(julong upper_mem_bound, julong upper_swap_bound) {
231   jlong memory_sw_limit = memory_and_swap_limit_in_bytes(upper_mem_bound, upper_swap_bound);
232   jlong memory_limit = read_memory_limit_in_bytes(upper_mem_bound);
233   if (memory_sw_limit > 0 && memory_limit > 0) {
234     jlong delta_swap = memory_sw_limit - memory_limit;
235     if (delta_swap > 0) {
236       return memory_swap_usage_impl(reader());
237     }
238   }
239   return memory_usage_in_bytes();
240 }
241 
242 jlong CgroupV1MemoryController::read_mem_swappiness() {
243   julong swappiness;
244   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.swappiness", "Swappiness", swappiness);
245   return (jlong)swappiness;
246 }
247 
248 jlong CgroupV1MemoryController::memory_soft_limit_in_bytes(julong upper_bound) {
249   julong memsoftlimit;
250   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.soft_limit_in_bytes", "Memory Soft Limit", memsoftlimit);
251   if (memsoftlimit >= upper_bound) {
252     log_trace(os, container)("Memory Soft Limit is: Unlimited");
253     return (jlong)-1;
254   } else {
255     return (jlong)memsoftlimit;
256   }
257 }
258 
259 jlong CgroupV1MemoryController::memory_throttle_limit_in_bytes() {
260   // Log this string at trace level so as to make tests happy.
261   log_trace(os, container)("Memory Throttle Limit is not supported.");
262   return OSCONTAINER_ERROR; // not supported
263 }
264 
265 // Constructor
266 CgroupV1Subsystem::CgroupV1Subsystem(CgroupV1Controller* cpuset,
267                       CgroupV1CpuController* cpu,
268                       CgroupV1CpuacctController* cpuacct,
269                       CgroupV1Controller* pids,
270                       CgroupV1MemoryController* memory) :
271     _cpuset(cpuset),
272     _cpuacct(cpuacct),
273     _pids(pids) {
274   CgroupUtil::adjust_controller(memory);
275   CgroupUtil::adjust_controller(cpu);
276   _memory = new CachingCgroupController<CgroupMemoryController>(memory);
277   _cpu = new CachingCgroupController<CgroupCpuController>(cpu);
278 }
279 
280 bool CgroupV1Subsystem::is_containerized() {
281   // containerized iff all required controllers are mounted
282   // read-only. See OSContainer::is_containerized() for
283   // the full logic.
284   //
285   return _memory->controller()->is_read_only() &&
286          _cpu->controller()->is_read_only() &&
287          _cpuacct->is_read_only() &&
288          _cpuset->is_read_only();
289 }
290 
291 /* memory_usage_in_bytes
292  *
293  * Return the amount of used memory for this process.
294  *
295  * return:
296  *    memory usage in bytes or
297  *    -1 for unlimited
298  *    OSCONTAINER_ERROR for not supported
299  */
300 jlong CgroupV1MemoryController::memory_usage_in_bytes() {
301   julong memusage;
302   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.usage_in_bytes", "Memory Usage", memusage);
303   return (jlong)memusage;
304 }
305 
306 /* memory_max_usage_in_bytes
307  *
308  * Return the maximum amount of used memory for this process.
309  *
310  * return:
311  *    max memory usage in bytes or
312  *    OSCONTAINER_ERROR for not supported
313  */
314 jlong CgroupV1MemoryController::memory_max_usage_in_bytes() {
315   julong memmaxusage;
316   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.max_usage_in_bytes", "Maximum Memory Usage", memmaxusage);
317   return (jlong)memmaxusage;
318 }
319 
320 jlong CgroupV1MemoryController::rss_usage_in_bytes() {
321   julong rss;
322   bool is_ok = reader()->read_numerical_key_value("/memory.stat", "rss", &rss);
323   if (!is_ok) {
324     return OSCONTAINER_ERROR;
325   }
326   log_trace(os, container)("RSS usage is: " JULONG_FORMAT, rss);
327   return (jlong)rss;
328 }
329 
330 jlong CgroupV1MemoryController::cache_usage_in_bytes() {
331   julong cache;
332   bool is_ok = reader()->read_numerical_key_value("/memory.stat", "cache", &cache);
333   if (!is_ok) {
334     return OSCONTAINER_ERROR;
335   }
336   log_trace(os, container)("Cache usage is: " JULONG_FORMAT, cache);
337   return cache;
338 }
339 
340 jlong CgroupV1MemoryController::kernel_memory_usage_in_bytes() {
341   julong kmem_usage;
342   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.usage_in_bytes", "Kernel Memory Usage", kmem_usage);
343   return (jlong)kmem_usage;
344 }
345 
346 jlong CgroupV1MemoryController::kernel_memory_limit_in_bytes(julong upper_bound) {
347   julong kmem_limit;
348   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.limit_in_bytes", "Kernel Memory Limit", kmem_limit);
349   if (kmem_limit >= upper_bound) {
350     return (jlong)-1;
351   }
352   return (jlong)kmem_limit;
353 }
354 
355 jlong CgroupV1MemoryController::kernel_memory_max_usage_in_bytes() {
356   julong kmem_max_usage;
357   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.max_usage_in_bytes", "Maximum Kernel Memory Usage", kmem_max_usage);
358   return (jlong)kmem_max_usage;
359 }
360 
361 void CgroupV1MemoryController::print_version_specific_info(outputStream* st, julong mem_bound) {
362   jlong kmem_usage = kernel_memory_usage_in_bytes();
363   jlong kmem_limit = kernel_memory_limit_in_bytes(mem_bound);
364   jlong kmem_max_usage = kernel_memory_max_usage_in_bytes();
365 
366   OSContainer::print_container_helper(st, kmem_limit, "kernel_memory_limit_in_bytes");
367   OSContainer::print_container_helper(st, kmem_usage, "kernel_memory_usage_in_bytes");
368   OSContainer::print_container_helper(st, kmem_max_usage, "kernel_memory_max_usage_in_bytes");
369 }
370 
371 char* CgroupV1Subsystem::cpu_cpuset_cpus() {
372   char cpus[1024];
373   CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.cpus", "cpuset.cpus", cpus, 1024);
374   return os::strdup(cpus);
375 }
376 
377 char* CgroupV1Subsystem::cpu_cpuset_memory_nodes() {
378   char mems[1024];
379   CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.mems", "cpuset.mems", mems, 1024);
380   return os::strdup(mems);
381 }
382 
383 /* cpu_quota
384  *
385  * Return the number of microseconds per period
386  * process is guaranteed to run.
387  *
388  * return:
389  *    quota time in microseconds
390  *    -1 for no quota
391  *    OSCONTAINER_ERROR for not supported
392  */
393 int CgroupV1CpuController::cpu_quota() {
394   julong quota;
395   bool is_ok = reader()->read_number("/cpu.cfs_quota_us", &quota);
396   if (!is_ok) {
397     log_trace(os, container)("CPU Quota failed: %d", OSCONTAINER_ERROR);
398     return OSCONTAINER_ERROR;
399   }
400   // cast to int since the read value might be negative
401   // and we want to avoid logging -1 as a large unsigned value.
402   int quota_int = (int)quota;
403   log_trace(os, container)("CPU Quota is: %d", quota_int);
404   return quota_int;
405 }
406 
407 int CgroupV1CpuController::cpu_period() {
408   julong period;
409   CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.cfs_period_us", "CPU Period", period);
410   return (int)period;
411 }
412 
413 /* cpu_shares
414  *
415  * Return the amount of cpu shares available to the process
416  *
417  * return:
418  *    Share number (typically a number relative to 1024)
419  *                 (2048 typically expresses 2 CPUs worth of processing)
420  *    -1 for no share setup
421  *    OSCONTAINER_ERROR for not supported
422  */
423 int CgroupV1CpuController::cpu_shares() {
424   julong shares;
425   CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.shares", "CPU Shares", shares);
426   int shares_int = (int)shares;
427   // Convert 1024 to no shares setup
428   if (shares_int == 1024) return -1;
429 
430   return shares_int;
431 }
432 
433 jlong CgroupV1CpuacctController::cpu_usage_in_micros() {
434   julong cpu_usage;
435   CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpuacct.usage", "CPU Usage", cpu_usage);
436   // Output is in nanoseconds, convert to microseconds.
437   return (jlong)cpu_usage / 1000;
438 }
439 
440 /* pids_max
441  *
442  * Return the maximum number of tasks available to the process
443  *
444  * return:
445  *    maximum number of tasks
446  *    -1 for unlimited
447  *    OSCONTAINER_ERROR for not supported
448  */
449 jlong CgroupV1Subsystem::pids_max() {
450   if (_pids == nullptr) return OSCONTAINER_ERROR;
451   jlong pids_max;
452   CONTAINER_READ_NUMBER_CHECKED_MAX(_pids, "/pids.max", "Maximum number of tasks", pids_max);
453   return pids_max;
454 }
455 
456 /* pids_current
457  *
458  * The number of tasks currently in the cgroup (and its descendants) of the process
459  *
460  * return:
461  *    current number of tasks
462  *    OSCONTAINER_ERROR for not supported
463  */
464 jlong CgroupV1Subsystem::pids_current() {
465   if (_pids == nullptr) return OSCONTAINER_ERROR;
466   julong pids_current;
467   CONTAINER_READ_NUMBER_CHECKED(_pids, "/pids.current", "Current number of tasks", pids_current);
468   return (jlong)pids_current;
469 }