1 /*
  2  * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #include <string.h>
 26 #include <math.h>
 27 #include <errno.h>
 28 #include "cgroupV1Subsystem_linux.hpp"
 29 #include "cgroupUtil_linux.hpp"
 30 #include "logging/log.hpp"
 31 #include "memory/allocation.hpp"
 32 #include "runtime/globals.hpp"
 33 #include "runtime/os.hpp"
 34 #include "utilities/globalDefinitions.hpp"
 35 #include "os_linux.hpp"
 36 
 37 /*
 38  * Set directory to subsystem specific files based
 39  * on the contents of the mountinfo and cgroup files.
 40  *
 41  * The method determines whether it runs in
 42  * - host mode
 43  * - container mode
 44  *
 45  * In the host mode, _root is equal to "/" and
 46  * the subsystem path is equal to the _mount_point path
 47  * joined with cgroup_path.
 48  *
 * In the container mode, there are two possibilities:
 50  * - private namespace (cgroupns=private)
 51  * - host namespace (cgroupns=host, default mode in cgroup V1 hosts)
 52  *
 53  * Private namespace is equivalent to the host mode, i.e.
 54  * the subsystem path is set by concatenating
 55  * _mount_point and cgroup_path.
 56  *
 57  * In the host namespace, _root is equal to host's cgroup path
 58  * of the control group to which the containerized process
 * belongs at the moment of creation. The mountinfo and
 60  * cgroup files are mirrored from the host, while the subsystem
 61  * specific files are mapped directly at _mount_point, i.e.
 62  * at /sys/fs/cgroup/<controller>/, the subsystem path is
 63  * then set equal to _mount_point.
 64  *
 65  * A special case of the subsystem path is when a cgroup path
 66  * includes a subgroup, when a containerized process was associated
 67  * with an existing cgroup, that is different from cgroup
 68  * in which the process has been created.
 69  * Here, the _root is equal to the host's initial cgroup path,
 70  * cgroup_path will be equal to host's new cgroup path.
 71  * As host cgroup hierarchies are not accessible in the container,
 72  * it needs to be determined which part of cgroup path
 73  * is accessible inside container, i.e. mapped under
 74  * /sys/fs/cgroup/<controller>/<subgroup>.
 75  * In Docker default setup, host's cgroup path can be
 76  * of the form: /docker/<CONTAINER_ID>/<subgroup>,
 77  * from which only <subgroup> is mapped.
 78  * The method trims cgroup path from left, until the subgroup
 79  * component is found. The subsystem path will be set to
 80  * the _mount_point joined with the subgroup path.
 81  */
 82 void CgroupV1Controller::set_subsystem_path(const char* cgroup_path) {
 83   if (_cgroup_path != nullptr) {
 84     os::free(_cgroup_path);
 85   }
 86   if (_path != nullptr) {
 87     os::free(_path);
 88     _path = nullptr;
 89   }
 90   _cgroup_path = os::strdup(cgroup_path);
 91   stringStream ss;
 92   if (_root != nullptr && cgroup_path != nullptr) {
 93     ss.print_raw(_mount_point);
 94     if (strcmp(_root, "/") == 0) {
 95       // host processes and containers with cgroupns=private
 96       if (strcmp(cgroup_path,"/") != 0) {
 97         ss.print_raw(cgroup_path);
 98       }
 99     } else {
100       // containers with cgroupns=host, default setting is _root==cgroup_path
101       if (strcmp(_root, cgroup_path) != 0) {
102         if (*cgroup_path != '\0' && strcmp(cgroup_path, "/") != 0) {
103           // When moved to a subgroup, between subgroups, the path suffix will change.
104           const char *suffix = cgroup_path;
105           while (suffix != nullptr) {
106             stringStream pp;
107             pp.print_raw(_mount_point);
108             pp.print_raw(suffix);
109             if (os::file_exists(pp.base())) {
110               ss.print_raw(suffix);
111               if (suffix != cgroup_path) {
112                 log_trace(os, container)("set_subsystem_path: cgroup v1 path reduced to: %s.", suffix);
113               }
114               break;
115             }
116             log_trace(os, container)("set_subsystem_path: skipped non-existent directory: %s.", suffix);
117             suffix = strchr(suffix + 1, '/');
118           }
119         }
120       }
121     }
122     _path = os::strdup(ss.base());
123   }
124 }
125 
126 /*
127  * The common case, containers, we have _root == _cgroup_path, and thus set the
128  * controller path to the _mount_point. This is where the limits are exposed in
129  * the cgroup pseudo filesystem (at the leaf) and adjustment of the path won't
130  * be needed for that reason.
131  */
132 bool CgroupV1Controller::needs_hierarchy_adjustment() {
133   assert(_cgroup_path != nullptr, "sanity");
134   return strcmp(_root, _cgroup_path) != 0;
135 }
136 
137 static inline
138 void verbose_log(julong read_mem_limit, julong host_mem) {
139   if (log_is_enabled(Debug, os, container)) {
140     jlong mem_limit = (jlong)read_mem_limit; // account for negative values
141     if (mem_limit < 0 || read_mem_limit >= host_mem) {
142       const char *reason;
143       if (mem_limit == OSCONTAINER_ERROR) {
144         reason = "failed";
145       } else if (mem_limit == -1) {
146         reason = "unlimited";
147       } else {
148         assert(read_mem_limit >= host_mem, "Expected read value exceeding host_mem");
149         // Exceeding physical memory is treated as unlimited. This implementation
150         // caps it at host_mem since Cg v1 has no value to represent 'max'.
151         reason = "ignored";
152       }
153       log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", using host value " JLONG_FORMAT,
154                                reason, mem_limit, host_mem);
155     }
156   }
157 }
158 
159 jlong CgroupV1MemoryController::read_memory_limit_in_bytes(julong phys_mem) {
160   julong memlimit;
161   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.limit_in_bytes", "Memory Limit", memlimit);
162   if (memlimit >= phys_mem) {
163     verbose_log(memlimit, phys_mem);
164     return (jlong)-1;
165   } else {
166     verbose_log(memlimit, phys_mem);
167     return (jlong)memlimit;
168   }
169 }
170 
171 /* read_mem_swap
172  *
173  * Determine the memory and swap limit metric. Returns a positive limit value strictly
174  * lower than the physical memory and swap limit iff there is a limit. Otherwise a
175  * negative value is returned indicating the determined status.
176  *
177  * returns:
178  *    * A number > 0 if the limit is available and lower than a physical upper bound.
179  *    * OSCONTAINER_ERROR if the limit cannot be retrieved (i.e. not supported) or
180  *    * -1 if there isn't any limit in place (note: includes values which exceed a physical
181  *      upper bound)
182  */
183 jlong CgroupV1MemoryController::read_mem_swap(julong host_total_memsw) {
184   julong memswlimit;
185   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.memsw.limit_in_bytes", "Memory and Swap Limit", memswlimit);
186   if (memswlimit >= host_total_memsw) {
187     log_trace(os, container)("Memory and Swap Limit is: Unlimited");
188     return (jlong)-1;
189   } else {
190     return (jlong)memswlimit;
191   }
192 }
193 
194 jlong CgroupV1MemoryController::memory_and_swap_limit_in_bytes(julong host_mem, julong host_swap) {
195   jlong memory_swap = read_mem_swap(host_mem + host_swap);
196   if (memory_swap == -1) {
197     return memory_swap;
198   }
199   // If there is a swap limit, but swappiness == 0, reset the limit
200   // to the memory limit. Do the same for cases where swap isn't
201   // supported.
202   jlong swappiness = read_mem_swappiness();
203   if (swappiness == 0 || memory_swap == OSCONTAINER_ERROR) {
204     jlong memlimit = read_memory_limit_in_bytes(host_mem);
205     if (memory_swap == OSCONTAINER_ERROR) {
206       log_trace(os, container)("Memory and Swap Limit has been reset to " JLONG_FORMAT " because swap is not supported", memlimit);
207     } else {
208       log_trace(os, container)("Memory and Swap Limit has been reset to " JLONG_FORMAT " because swappiness is 0", memlimit);
209     }
210     return memlimit;
211   }
212   return memory_swap;
213 }
214 
215 static inline
216 jlong memory_swap_usage_impl(CgroupController* ctrl) {
217   julong memory_swap_usage;
218   CONTAINER_READ_NUMBER_CHECKED(ctrl, "/memory.memsw.usage_in_bytes", "mem swap usage", memory_swap_usage);
219   return (jlong)memory_swap_usage;
220 }
221 
222 jlong CgroupV1MemoryController::memory_and_swap_usage_in_bytes(julong phys_mem, julong host_swap) {
223   jlong memory_sw_limit = memory_and_swap_limit_in_bytes(phys_mem, host_swap);
224   jlong memory_limit = read_memory_limit_in_bytes(phys_mem);
225   if (memory_sw_limit > 0 && memory_limit > 0) {
226     jlong delta_swap = memory_sw_limit - memory_limit;
227     if (delta_swap > 0) {
228       return memory_swap_usage_impl(reader());
229     }
230   }
231   return memory_usage_in_bytes();
232 }
233 
234 jlong CgroupV1MemoryController::read_mem_swappiness() {
235   julong swappiness;
236   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.swappiness", "Swappiness", swappiness);
237   return (jlong)swappiness;
238 }
239 
240 jlong CgroupV1MemoryController::memory_soft_limit_in_bytes(julong phys_mem) {
241   julong memsoftlimit;
242   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.soft_limit_in_bytes", "Memory Soft Limit", memsoftlimit);
243   if (memsoftlimit >= phys_mem) {
244     log_trace(os, container)("Memory Soft Limit is: Unlimited");
245     return (jlong)-1;
246   } else {
247     return (jlong)memsoftlimit;
248   }
249 }
250 
// Constructor
//
// Stores the cpuset, cpuacct and pids controllers as passed in (pids may be
// null; pids_max() and pids_current() check for that). The memory and cpu
// controllers are first handed to CgroupUtil::adjust_controller() and are
// then wrapped in caching controllers.
CgroupV1Subsystem::CgroupV1Subsystem(CgroupV1Controller* cpuset,
                      CgroupV1CpuController* cpu,
                      CgroupV1Controller* cpuacct,
                      CgroupV1Controller* pids,
                      CgroupV1MemoryController* memory) :
    _cpuset(cpuset),
    _cpuacct(cpuacct),
    _pids(pids) {
  // Both adjustments happen before either controller is wrapped.
  CgroupUtil::adjust_controller(memory);
  CgroupUtil::adjust_controller(cpu);
  _memory = new CachingCgroupController<CgroupMemoryController>(memory);
  _cpu = new CachingCgroupController<CgroupCpuController>(cpu);
}
265 
266 bool CgroupV1Subsystem::is_containerized() {
267   // containerized iff all required controllers are mounted
268   // read-only. See OSContainer::is_containerized() for
269   // the full logic.
270   //
271   return _memory->controller()->is_read_only() &&
272          _cpu->controller()->is_read_only() &&
273          _cpuacct->is_read_only() &&
274          _cpuset->is_read_only();
275 }
276 
277 /* memory_usage_in_bytes
278  *
279  * Return the amount of used memory for this process.
280  *
281  * return:
282  *    memory usage in bytes or
283  *    -1 for unlimited
284  *    OSCONTAINER_ERROR for not supported
285  */
286 jlong CgroupV1MemoryController::memory_usage_in_bytes() {
287   julong memusage;
288   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.usage_in_bytes", "Memory Usage", memusage);
289   return (jlong)memusage;
290 }
291 
292 /* memory_max_usage_in_bytes
293  *
294  * Return the maximum amount of used memory for this process.
295  *
296  * return:
297  *    max memory usage in bytes or
298  *    OSCONTAINER_ERROR for not supported
299  */
300 jlong CgroupV1MemoryController::memory_max_usage_in_bytes() {
301   julong memmaxusage;
302   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.max_usage_in_bytes", "Maximum Memory Usage", memmaxusage);
303   return (jlong)memmaxusage;
304 }
305 
306 jlong CgroupV1MemoryController::rss_usage_in_bytes() {
307   julong rss;
308   bool is_ok = reader()->read_numerical_key_value("/memory.stat", "rss", &rss);
309   if (!is_ok) {
310     return OSCONTAINER_ERROR;
311   }
312   log_trace(os, container)("RSS usage is: " JULONG_FORMAT, rss);
313   return (jlong)rss;
314 }
315 
316 jlong CgroupV1MemoryController::cache_usage_in_bytes() {
317   julong cache;
318   bool is_ok = reader()->read_numerical_key_value("/memory.stat", "cache", &cache);
319   if (!is_ok) {
320     return OSCONTAINER_ERROR;
321   }
322   log_trace(os, container)("Cache usage is: " JULONG_FORMAT, cache);
323   return cache;
324 }
325 
326 jlong CgroupV1MemoryController::kernel_memory_usage_in_bytes() {
327   julong kmem_usage;
328   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.usage_in_bytes", "Kernel Memory Usage", kmem_usage);
329   return (jlong)kmem_usage;
330 }
331 
332 jlong CgroupV1MemoryController::kernel_memory_limit_in_bytes(julong phys_mem) {
333   julong kmem_limit;
334   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.limit_in_bytes", "Kernel Memory Limit", kmem_limit);
335   if (kmem_limit >= phys_mem) {
336     return (jlong)-1;
337   }
338   return (jlong)kmem_limit;
339 }
340 
341 jlong CgroupV1MemoryController::kernel_memory_max_usage_in_bytes() {
342   julong kmem_max_usage;
343   CONTAINER_READ_NUMBER_CHECKED(reader(), "/memory.kmem.max_usage_in_bytes", "Maximum Kernel Memory Usage", kmem_max_usage);
344   return (jlong)kmem_max_usage;
345 }
346 
347 void CgroupV1MemoryController::print_version_specific_info(outputStream* st, julong phys_mem) {
348   jlong kmem_usage = kernel_memory_usage_in_bytes();
349   jlong kmem_limit = kernel_memory_limit_in_bytes(phys_mem);
350   jlong kmem_max_usage = kernel_memory_max_usage_in_bytes();
351 
352   OSContainer::print_container_helper(st, kmem_limit, "kernel_memory_limit_in_bytes");
353   OSContainer::print_container_helper(st, kmem_usage, "kernel_memory_usage_in_bytes");
354   OSContainer::print_container_helper(st, kmem_max_usage, "kernel_memory_max_usage_in_bytes");
355 }
356 
357 char* CgroupV1Subsystem::cpu_cpuset_cpus() {
358   char cpus[1024];
359   CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.cpus", "cpuset.cpus", cpus, 1024);
360   return os::strdup(cpus);
361 }
362 
363 char* CgroupV1Subsystem::cpu_cpuset_memory_nodes() {
364   char mems[1024];
365   CONTAINER_READ_STRING_CHECKED(_cpuset, "/cpuset.mems", "cpuset.mems", mems, 1024);
366   return os::strdup(mems);
367 }
368 
369 /* cpu_quota
370  *
371  * Return the number of microseconds per period
372  * process is guaranteed to run.
373  *
374  * return:
375  *    quota time in microseconds
376  *    -1 for no quota
377  *    OSCONTAINER_ERROR for not supported
378  */
379 int CgroupV1CpuController::cpu_quota() {
380   julong quota;
381   bool is_ok = reader()->read_number("/cpu.cfs_quota_us", &quota);
382   if (!is_ok) {
383     log_trace(os, container)("CPU Quota failed: %d", OSCONTAINER_ERROR);
384     return OSCONTAINER_ERROR;
385   }
386   // cast to int since the read value might be negative
387   // and we want to avoid logging -1 as a large unsigned value.
388   int quota_int = (int)quota;
389   log_trace(os, container)("CPU Quota is: %d", quota_int);
390   return quota_int;
391 }
392 
393 int CgroupV1CpuController::cpu_period() {
394   julong period;
395   CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.cfs_period_us", "CPU Period", period);
396   return (int)period;
397 }
398 
399 /* cpu_shares
400  *
401  * Return the amount of cpu shares available to the process
402  *
403  * return:
404  *    Share number (typically a number relative to 1024)
405  *                 (2048 typically expresses 2 CPUs worth of processing)
406  *    -1 for no share setup
407  *    OSCONTAINER_ERROR for not supported
408  */
409 int CgroupV1CpuController::cpu_shares() {
410   julong shares;
411   CONTAINER_READ_NUMBER_CHECKED(reader(), "/cpu.shares", "CPU Shares", shares);
412   int shares_int = (int)shares;
413   // Convert 1024 to no shares setup
414   if (shares_int == 1024) return -1;
415 
416   return shares_int;
417 }
418 
419 /* pids_max
420  *
421  * Return the maximum number of tasks available to the process
422  *
423  * return:
424  *    maximum number of tasks
425  *    -1 for unlimited
426  *    OSCONTAINER_ERROR for not supported
427  */
428 jlong CgroupV1Subsystem::pids_max() {
429   if (_pids == nullptr) return OSCONTAINER_ERROR;
430   jlong pids_max;
431   CONTAINER_READ_NUMBER_CHECKED_MAX(_pids, "/pids.max", "Maximum number of tasks", pids_max);
432   return pids_max;
433 }
434 
435 /* pids_current
436  *
437  * The number of tasks currently in the cgroup (and its descendants) of the process
438  *
439  * return:
440  *    current number of tasks
441  *    OSCONTAINER_ERROR for not supported
442  */
443 jlong CgroupV1Subsystem::pids_current() {
444   if (_pids == nullptr) return OSCONTAINER_ERROR;
445   julong pids_current;
446   CONTAINER_READ_NUMBER_CHECKED(_pids, "/pids.current", "Current number of tasks", pids_current);
447   return (jlong)pids_current;
448 }