1 /*
2 * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef CGROUP_SUBSYSTEM_LINUX_HPP
26 #define CGROUP_SUBSYSTEM_LINUX_HPP
27
28 #include "logging/log.hpp"
29 #include "memory/allocation.hpp"
30 #include "osContainer_linux.hpp"
31 #include "runtime/os.hpp"
32 #include "utilities/globalDefinitions.hpp"
33 #include "utilities/macros.hpp"
34
35 // Shared cgroups code (used by cgroup version 1 and version 2)
36
37 /*
38 * PER_CPU_SHARES has been set to 1024 because CPU shares' quota
39 * is commonly used in cloud frameworks like Kubernetes[1],
40 * AWS[2] and Mesos[3] in a similar way. They spawn containers with
41 * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
42 * the inverse for determining the number of possible available
43 * CPUs to the JVM inside a container. See JDK-8216366.
44 *
45 * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
46 * In particular:
47 * When using Docker:
48 * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially
49 * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the
50 * --cpu-shares flag in the docker run command.
51 * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
52 * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648
53 * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30
54 */
#define PER_CPU_SHARES            1024

// Values compared against the 'flags' byte by CgroupSubsystemFactory
// (see is_cgroup_v2() and friends). The INVALID_* values presumably
// describe the different ways detection can fail - confirm against
// CgroupSubsystemFactory::determine_type().
#define CGROUPS_V1                1
#define CGROUPS_V2                2
#define INVALID_CGROUPS_V2        3
#define INVALID_CGROUPS_V1        4
#define INVALID_CGROUPS_NO_MOUNT  5
#define INVALID_CGROUPS_GENERIC   6

// Five controllers, listed in index order: cpuset, cpu, cpuacct, memory, pids.
// CG_INFO_LENGTH is the number of *_IDX values below.
#define CG_INFO_LENGTH            5
#define CPUSET_IDX                0
#define CPU_IDX                   1
#define CPUACCT_IDX               2
#define MEMORY_IDX                3
#define PIDS_IDX                  4
71
/*
 * Reads a number from the interface file 'filename' by calling
 * read_number() on 'controller' and stores it in 'retval'.
 *
 * On failure a trace message is logged and the *enclosing function* returns
 * OSCONTAINER_ERROR. On success the value read is logged at trace level.
 *
 * 'log_string' must be a string literal since it is concatenated with the
 * message suffix. All other arguments are parenthesized in the expansion so
 * that arbitrary expressions (e.g. 'cond ? a : b') can be passed, and no
 * helper local is declared that could shadow a caller's variable.
 */
#define CONTAINER_READ_NUMBER_CHECKED(controller, filename, log_string, retval) \
{ \
  if (!(controller)->read_number((filename), &(retval))) { \
    log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR); \
    return OSCONTAINER_ERROR; \
  } \
  log_trace(os, container)(log_string " is: " JULONG_FORMAT, (retval)); \
}
82
/*
 * Same as CONTAINER_READ_NUMBER_CHECKED, but reads through
 * read_number_handle_max() so that 'retval' is a signed value and is
 * logged with JLONG_FORMAT.
 *
 * On failure a trace message is logged and the *enclosing function* returns
 * OSCONTAINER_ERROR. 'log_string' must be a string literal; the remaining
 * arguments are parenthesized in the expansion so arbitrary expressions can
 * be passed, and no helper local is declared that could shadow a caller's
 * variable.
 */
#define CONTAINER_READ_NUMBER_CHECKED_MAX(controller, filename, log_string, retval) \
{ \
  if (!(controller)->read_number_handle_max((filename), &(retval))) { \
    log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR); \
    return OSCONTAINER_ERROR; \
  } \
  log_trace(os, container)(log_string " is: " JLONG_FORMAT, (retval)); \
}
93
/*
 * Reads a string from the interface file 'filename' by calling
 * read_string() on 'controller' with the buffer 'retval' of size 'buf_size'.
 *
 * On failure a trace message is logged and the *enclosing function* returns
 * nullptr (note: the log message still prints OSCONTAINER_ERROR). On success
 * the string read is logged at trace level.
 *
 * 'log_string' must be a string literal; the remaining arguments are
 * parenthesized in the expansion so arbitrary expressions can be passed, and
 * no helper local is declared that could shadow a caller's variable.
 */
#define CONTAINER_READ_STRING_CHECKED(controller, filename, log_string, retval, buf_size) \
{ \
  if (!(controller)->read_string((filename), (retval), (buf_size))) { \
    log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR); \
    return nullptr; \
  } \
  log_trace(os, container)(log_string " is: %s", (retval)); \
}
104
/*
 * Reads the number associated with 'key' from the interface file 'filename'
 * by calling read_numerical_key_value() on 'controller' and stores it in
 * 'retval'.
 *
 * On failure a trace message is logged and the *enclosing function* returns
 * OSCONTAINER_ERROR. On success the value read is logged at trace level.
 *
 * 'log_string' must be a string literal; the remaining arguments are
 * parenthesized in the expansion so arbitrary expressions can be passed, and
 * no helper local is declared that could shadow a caller's variable.
 */
#define CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(controller, filename, key, log_string, retval) \
{ \
  if (!(controller)->read_numerical_key_value((filename), (key), &(retval))) { \
    log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR); \
    return OSCONTAINER_ERROR; \
  } \
  log_trace(os, container)(log_string " is: " JULONG_FORMAT, (retval)); \
}
115
116
117 class CgroupController: public CHeapObj<mtInternal> {
118 protected:
119 char* _cgroup_path;
120 char* _mount_point;
121 public:
122 virtual const char* subsystem_path() = 0;
123 virtual bool is_read_only() = 0;
124 const char* cgroup_path() { return _cgroup_path; }
125 const char* mount_point() { return _mount_point; }
126 virtual bool needs_hierarchy_adjustment() { return false; }
127
128 /* Read a numerical value as unsigned long
129 *
130 * returns: false if any error occurred. true otherwise and
131 * the parsed value is set in the provided julong pointer.
132 */
133 bool read_number(const char* filename, julong* result);
134
135 /* Convenience method to deal with numbers as well as the string 'max'
136 * in interface files. Otherwise same as read_number().
137 *
138 * returns: false if any error occurred. true otherwise and
139 * the parsed value (which might be negative) is being set in
140 * the provided jlong pointer.
141 */
142 bool read_number_handle_max(const char* filename, jlong* result);
143
144 /* Read a string of at most buf_size - 1 characters from the interface file.
145 * The provided buffer must be at least buf_size in size so as to account
146 * for the null terminating character. Callers must ensure that the buffer
147 * is appropriately in-scope and of sufficient size.
148 *
149 * returns: false if any error occured. true otherwise and the passed
150 * in buffer will contain the first buf_size - 1 characters of the string
151 * or up to the first new line character ('\n') whichever comes first.
152 */
153 bool read_string(const char* filename, char* buf, size_t buf_size);
154
155 /* Read a tuple value as a number. Tuple is: '<first> <second>'.
156 * Handles 'max' (for unlimited) for any tuple value. This is handy for
157 * parsing interface files like cpu.max which contain such tuples.
158 *
159 * returns: false if any error occurred. true otherwise and the parsed
160 * value of the appropriate tuple entry set in the provided jlong pointer.
161 */
162 bool read_numerical_tuple_value(const char* filename, bool use_first, jlong* result);
163
164 /* Read a numerical value from a multi-line interface file. The matched line is
165 * determined by the provided 'key'. The associated numerical value is being set
166 * via the passed in julong pointer. Example interface file 'memory.stat'
167 *
168 * returns: false if any error occurred. true otherwise and the parsed value is
169 * being set in the provided julong pointer.
170 */
171 bool read_numerical_key_value(const char* filename, const char* key, julong* result);
172
173 private:
174 static jlong limit_from_str(char* limit_str);
175 };
176
177 class CachedMetric : public CHeapObj<mtInternal>{
178 private:
179 volatile jlong _metric;
180 volatile jlong _next_check_counter;
181 public:
182 CachedMetric() {
183 _metric = -1;
184 _next_check_counter = min_jlong;
185 }
186 bool should_check_metric() {
187 return os::elapsed_counter() > _next_check_counter;
188 }
189 jlong value() { return _metric; }
190 void set_value(jlong value, jlong timeout) {
191 _metric = value;
192 // Metric is unlikely to change, but we want to remain
193 // responsive to configuration changes. A very short grace time
194 // between re-read avoids excessive overhead during startup without
195 // significantly reducing the VMs ability to promptly react to changed
196 // metric config
197 _next_check_counter = os::elapsed_counter() + timeout;
198 }
199 };
200
201 template <class T>
202 class CachingCgroupController : public CHeapObj<mtInternal> {
203 private:
204 T* _controller;
205 CachedMetric* _metrics_cache;
206
207 public:
208 CachingCgroupController(T* cont) {
209 _controller = cont;
210 _metrics_cache = new CachedMetric();
211 }
212
213 CachedMetric* metrics_cache() { return _metrics_cache; }
214 T* controller() { return _controller; }
215 };
216
217 // Pure virtual class representing version agnostic CPU controllers
218 class CgroupCpuController: public CHeapObj<mtInternal> {
219 public:
220 virtual int cpu_quota() = 0;
221 virtual int cpu_period() = 0;
222 virtual int cpu_shares() = 0;
223 virtual bool needs_hierarchy_adjustment() = 0;
224 virtual bool is_read_only() = 0;
225 virtual const char* subsystem_path() = 0;
226 virtual void set_subsystem_path(const char* cgroup_path) = 0;
227 virtual const char* mount_point() = 0;
228 virtual const char* cgroup_path() = 0;
229 };
230
231 // Pure virtual class representing version agnostic CPU accounting controllers
232 class CgroupCpuacctController: public CHeapObj<mtInternal> {
233 public:
234 virtual jlong cpu_usage_in_micros() = 0;
235 virtual bool needs_hierarchy_adjustment() = 0;
236 virtual bool is_read_only() = 0;
237 virtual const char* subsystem_path() = 0;
238 virtual void set_subsystem_path(const char* cgroup_path) = 0;
239 virtual const char* mount_point() = 0;
240 virtual const char* cgroup_path() = 0;
241 };
242
243 // Pure virtual class representing version agnostic memory controllers
244 class CgroupMemoryController: public CHeapObj<mtInternal> {
245 public:
246 virtual jlong read_memory_limit_in_bytes(julong upper_bound) = 0;
247 virtual jlong memory_usage_in_bytes() = 0;
248 virtual jlong memory_and_swap_limit_in_bytes(julong upper_mem_bound, julong upper_swap_bound) = 0;
249 virtual jlong memory_and_swap_usage_in_bytes(julong upper_mem_bound, julong upper_swap_bound) = 0;
250 virtual jlong memory_soft_limit_in_bytes(julong upper_bound) = 0;
251 virtual jlong memory_throttle_limit_in_bytes() = 0;
252 virtual jlong memory_max_usage_in_bytes() = 0;
253 virtual jlong rss_usage_in_bytes() = 0;
254 virtual jlong cache_usage_in_bytes() = 0;
255 virtual void print_version_specific_info(outputStream* st, julong upper_mem_bound) = 0;
256 virtual bool needs_hierarchy_adjustment() = 0;
257 virtual bool is_read_only() = 0;
258 virtual const char* subsystem_path() = 0;
259 virtual void set_subsystem_path(const char* cgroup_path) = 0;
260 virtual const char* mount_point() = 0;
261 virtual const char* cgroup_path() = 0;
262 };
263
264 class CgroupSubsystem: public CHeapObj<mtInternal> {
265 public:
266 jlong memory_limit_in_bytes(julong upper_bound);
267 int active_processor_count();
268
269 virtual jlong pids_max() = 0;
270 virtual jlong pids_current() = 0;
271 virtual bool is_containerized() = 0;
272
273 virtual char * cpu_cpuset_cpus() = 0;
274 virtual char * cpu_cpuset_memory_nodes() = 0;
275 virtual const char * container_type() = 0;
276 virtual CachingCgroupController<CgroupMemoryController>* memory_controller() = 0;
277 virtual CachingCgroupController<CgroupCpuController>* cpu_controller() = 0;
278 virtual CgroupCpuacctController* cpuacct_controller() = 0;
279
280 int cpu_quota();
281 int cpu_period();
282 int cpu_shares();
283
284 jlong cpu_usage_in_micros();
285
286 jlong memory_usage_in_bytes();
287 jlong memory_and_swap_limit_in_bytes(julong upper_mem_bound, julong upper_swap_bound);
288 jlong memory_and_swap_usage_in_bytes(julong upper_mem_bound, julong upper_swap_bound);
289 jlong memory_soft_limit_in_bytes(julong upper_bound);
290 jlong memory_throttle_limit_in_bytes();
291 jlong memory_max_usage_in_bytes();
292 jlong rss_usage_in_bytes();
293 jlong cache_usage_in_bytes();
294 void print_version_specific_info(outputStream* st, julong upper_mem_bound);
295 };
296
297 // Utility class for storing info retrieved from /proc/cgroups,
298 // /proc/self/cgroup and /proc/self/mountinfo
299 // For reference see man 7 cgroups and CgroupSubsystemFactory
300 class CgroupInfo : public StackObj {
301 friend class CgroupSubsystemFactory;
302 friend class WhiteBox;
303
304 private:
305 char* _name;
306 int _hierarchy_id;
307 bool _enabled;
308 bool _read_only; // whether or not the mount path is mounted read-only
309 bool _data_complete; // indicating cgroup v1 data is complete for this controller
310 char* _cgroup_path; // cgroup controller path from /proc/self/cgroup
311 char* _root_mount_path; // root mount path from /proc/self/mountinfo. Unused for cgroup v2
312 char* _mount_path; // mount path from /proc/self/mountinfo.
313
314 public:
315 CgroupInfo() {
316 _name = nullptr;
317 _hierarchy_id = -1;
318 _enabled = false;
319 _read_only = false;
320 _data_complete = false;
321 _cgroup_path = nullptr;
322 _root_mount_path = nullptr;
323 _mount_path = nullptr;
324 }
325
326 };
327
328 class CgroupSubsystemFactory: AllStatic {
329 friend class WhiteBox;
330
331 public:
332 static CgroupSubsystem* create();
333 private:
334 static inline bool is_cgroup_v2(u1* flags) {
335 return *flags == CGROUPS_V2;
336 }
337
338 #ifdef ASSERT
339 static inline bool is_valid_cgroup(u1* flags) {
340 return *flags == CGROUPS_V1 || *flags == CGROUPS_V2;
341 }
342 static inline bool is_cgroup_v1(u1* flags) {
343 return *flags == CGROUPS_V1;
344 }
345 #endif
346
347 static void set_controller_paths(CgroupInfo* cg_infos,
348 int controller,
349 const char* name,
350 char* mount_path,
351 char* root_path,
352 bool read_only);
353 // Determine the cgroup type (version 1 or version 2), given
354 // relevant paths to files. Sets 'flags' accordingly.
355 static bool determine_type(CgroupInfo* cg_infos,
356 bool cgroups_v2_enabled,
357 const char* controllers_file,
358 const char* proc_self_cgroup,
359 const char* proc_self_mountinfo,
360 u1* flags);
361 static void cleanup(CgroupInfo* cg_infos);
362 };
363
364 #endif // CGROUP_SUBSYSTEM_LINUX_HPP