1 /*
  2  * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef CGROUP_SUBSYSTEM_LINUX_HPP
 26 #define CGROUP_SUBSYSTEM_LINUX_HPP
 27 
 28 #include "logging/log.hpp"
 29 #include "memory/allocation.hpp"
 30 #include "osContainer_linux.hpp"
 31 #include "runtime/os.hpp"
 32 #include "utilities/globalDefinitions.hpp"
 33 #include "utilities/macros.hpp"
 34 
 35 // Shared cgroups code (used by cgroup version 1 and version 2)
 36 
 37 /*
 38  * PER_CPU_SHARES has been set to 1024 because CPU shares' quota
 39  * is commonly used in cloud frameworks like Kubernetes[1],
 40  * AWS[2] and Mesos[3] in a similar way. They spawn containers with
 41  * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
 42  * the inverse for determining the number of possible available
 43  * CPUs to the JVM inside a container. See JDK-8216366.
 44  *
 45  * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
 46  *     In particular:
 47  *        When using Docker:
 48  *          The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially
 49  *          fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the
 50  *          --cpu-shares flag in the docker run command.
 51  * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
 52  * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648
 53  *     https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30
 54  */
 55 #define PER_CPU_SHARES 1024
 56 
 57 #define CGROUPS_V1               1
 58 #define CGROUPS_V2               2
 59 #define INVALID_CGROUPS_V2       3
 60 #define INVALID_CGROUPS_V1       4
 61 #define INVALID_CGROUPS_NO_MOUNT 5
 62 #define INVALID_CGROUPS_GENERIC  6
 63 
 64 // Five controllers: cpu, cpuset, cpuacct, memory, pids
 65 #define CG_INFO_LENGTH 5
 66 #define CPUSET_IDX     0
 67 #define CPU_IDX        1
 68 #define CPUACCT_IDX    2
 69 #define MEMORY_IDX     3
 70 #define PIDS_IDX       4
 71 
 72 #define CONTAINER_READ_NUMBER_CHECKED(controller, filename, log_string, retval)       \
 73 {                                                                                     \
 74   bool is_ok;                                                                         \
 75   is_ok = controller->read_number(filename, &retval);                                 \
 76   if (!is_ok) {                                                                       \
 77     log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR);            \
 78     return OSCONTAINER_ERROR;                                                         \
 79   }                                                                                   \
 80   log_trace(os, container)(log_string " is: " JULONG_FORMAT, retval);                 \
 81 }
 82 
 83 #define CONTAINER_READ_NUMBER_CHECKED_MAX(controller, filename, log_string, retval)   \
 84 {                                                                                     \
 85   bool is_ok;                                                                         \
 86   is_ok = controller->read_number_handle_max(filename, &retval);                      \
 87   if (!is_ok) {                                                                       \
 88     log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR);            \
 89     return OSCONTAINER_ERROR;                                                         \
 90   }                                                                                   \
 91   log_trace(os, container)(log_string " is: " JLONG_FORMAT, retval);                  \
 92 }
 93 
 94 #define CONTAINER_READ_STRING_CHECKED(controller, filename, log_string, retval, buf_size) \
 95 {                                                                                         \
 96   bool is_ok;                                                                             \
 97   is_ok = controller->read_string(filename, retval, buf_size);                            \
 98   if (!is_ok) {                                                                           \
 99     log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR);                \
100     return nullptr;                                                                       \
101   }                                                                                       \
102   log_trace(os, container)(log_string " is: %s", retval);                                 \
103 }
104 
// Reads the number associated with 'key' in the multi-line file 'filename'
// into 'retval' via controller->read_numerical_key_value(). If the read fails,
// a trace message is logged and the ENCLOSING function returns
// OSCONTAINER_ERROR. Otherwise the value is trace-logged (JULONG_FORMAT) and
// execution continues after the macro.
#define CONTAINER_READ_NUMERICAL_KEY_VALUE_CHECKED(controller, filename, key, log_string, retval) \
{                                                                                 \
  if (!controller->read_numerical_key_value(filename, key, &retval)) {            \
    log_trace(os, container)(log_string " failed: %d", OSCONTAINER_ERROR);        \
    return OSCONTAINER_ERROR;                                                     \
  }                                                                               \
  log_trace(os, container)(log_string " is: " JULONG_FORMAT, retval);             \
}
115 
116 
117 class CgroupController: public CHeapObj<mtInternal> {
118   protected:
119     char* _cgroup_path;
120     char* _mount_point;
121   public:
122     virtual const char* subsystem_path() = 0;
123     virtual bool is_read_only() = 0;
124     const char* cgroup_path() { return _cgroup_path; }
125     const char* mount_point() { return _mount_point; }
126     virtual bool needs_hierarchy_adjustment() { return false; }
127 
128     /* Read a numerical value as unsigned long
129      *
130      * returns: false if any error occurred. true otherwise and
131      * the parsed value is set in the provided julong pointer.
132      */
133     bool read_number(const char* filename, julong* result);
134 
135     /* Convenience method to deal with numbers as well as the string 'max'
136      * in interface files. Otherwise same as read_number().
137      *
138      * returns: false if any error occurred. true otherwise and
139      * the parsed value (which might be negative) is being set in
140      * the provided jlong pointer.
141      */
142     bool read_number_handle_max(const char* filename, jlong* result);
143 
144     /* Read a string of at most buf_size - 1 characters from the interface file.
145      * The provided buffer must be at least buf_size in size so as to account
146      * for the null terminating character. Callers must ensure that the buffer
147      * is appropriately in-scope and of sufficient size.
148      *
149      * returns: false if any error occured. true otherwise and the passed
150      * in buffer will contain the first buf_size - 1 characters of the string
151      * or up to the first new line character ('\n') whichever comes first.
152      */
153     bool read_string(const char* filename, char* buf, size_t buf_size);
154 
155     /* Read a tuple value as a number. Tuple is: '<first> <second>'.
156      * Handles 'max' (for unlimited) for any tuple value. This is handy for
157      * parsing interface files like cpu.max which contain such tuples.
158      *
159      * returns: false if any error occurred. true otherwise and the parsed
160      * value of the appropriate tuple entry set in the provided jlong pointer.
161      */
162     bool read_numerical_tuple_value(const char* filename, bool use_first, jlong* result);
163 
164     /* Read a numerical value from a multi-line interface file. The matched line is
165      * determined by the provided 'key'. The associated numerical value is being set
166      * via the passed in julong pointer. Example interface file 'memory.stat'
167      *
168      * returns: false if any error occurred. true otherwise and the parsed value is
169      * being set in the provided julong pointer.
170      */
171     bool read_numerical_key_value(const char* filename, const char* key, julong* result);
172 
173   private:
174     static jlong limit_from_str(char* limit_str);
175 };
176 
177 class CachedMetric : public CHeapObj<mtInternal>{
178   private:
179     volatile jlong _metric;
180     volatile jlong _next_check_counter;
181   public:
182     CachedMetric() {
183       _metric = -1;
184       _next_check_counter = min_jlong;
185     }
186     bool should_check_metric() {
187       return os::elapsed_counter() > _next_check_counter;
188     }
189     jlong value() { return _metric; }
190     void set_value(jlong value, jlong timeout) {
191       _metric = value;
192       // Metric is unlikely to change, but we want to remain
193       // responsive to configuration changes. A very short grace time
194       // between re-read avoids excessive overhead during startup without
195       // significantly reducing the VMs ability to promptly react to changed
196       // metric config
197       _next_check_counter = os::elapsed_counter() + timeout;
198     }
199 };
200 
201 template <class T>
202 class CachingCgroupController : public CHeapObj<mtInternal> {
203   private:
204     T* _controller;
205     CachedMetric* _metrics_cache;
206 
207   public:
208     CachingCgroupController(T* cont) {
209       _controller = cont;
210       _metrics_cache = new CachedMetric();
211     }
212 
213     CachedMetric* metrics_cache() { return _metrics_cache; }
214     T* controller() { return _controller; }
215 };
216 
217 // Pure virtual class representing version agnostic CPU controllers
218 class CgroupCpuController: public CHeapObj<mtInternal> {
219   public:
220     virtual int cpu_quota() = 0;
221     virtual int cpu_period() = 0;
222     virtual int cpu_shares() = 0;
223     virtual bool needs_hierarchy_adjustment() = 0;
224     virtual bool is_read_only() = 0;
225     virtual const char* subsystem_path() = 0;
226     virtual void set_subsystem_path(const char* cgroup_path) = 0;
227     virtual const char* mount_point() = 0;
228     virtual const char* cgroup_path() = 0;
229 };
230 
231 // Pure virtual class representing version agnostic CPU accounting controllers
232 class CgroupCpuacctController: public CHeapObj<mtInternal> {
233   public:
234     virtual jlong cpu_usage_in_micros() = 0;
235     virtual bool needs_hierarchy_adjustment() = 0;
236     virtual bool is_read_only() = 0;
237     virtual const char* subsystem_path() = 0;
238     virtual void set_subsystem_path(const char* cgroup_path) = 0;
239     virtual const char* mount_point() = 0;
240     virtual const char* cgroup_path() = 0;
241 };
242 
243 // Pure virtual class representing version agnostic memory controllers
244 class CgroupMemoryController: public CHeapObj<mtInternal> {
245   public:
246     virtual jlong read_memory_limit_in_bytes(julong upper_bound) = 0;
247     virtual jlong memory_usage_in_bytes() = 0;
248     virtual jlong memory_and_swap_limit_in_bytes(julong upper_mem_bound, julong upper_swap_bound) = 0;
249     virtual jlong memory_and_swap_usage_in_bytes(julong upper_mem_bound, julong upper_swap_bound) = 0;
250     virtual jlong memory_soft_limit_in_bytes(julong upper_bound) = 0;
251     virtual jlong memory_throttle_limit_in_bytes() = 0;
252     virtual jlong memory_max_usage_in_bytes() = 0;
253     virtual jlong rss_usage_in_bytes() = 0;
254     virtual jlong cache_usage_in_bytes() = 0;
255     virtual void print_version_specific_info(outputStream* st, julong upper_mem_bound) = 0;
256     virtual bool needs_hierarchy_adjustment() = 0;
257     virtual bool is_read_only() = 0;
258     virtual const char* subsystem_path() = 0;
259     virtual void set_subsystem_path(const char* cgroup_path) = 0;
260     virtual const char* mount_point() = 0;
261     virtual const char* cgroup_path() = 0;
262 };
263 
264 class CgroupSubsystem: public CHeapObj<mtInternal> {
265   public:
266     jlong memory_limit_in_bytes(julong upper_bound);
267     int active_processor_count();
268 
269     virtual jlong pids_max() = 0;
270     virtual jlong pids_current() = 0;
271     virtual bool is_containerized() = 0;
272 
273     virtual char * cpu_cpuset_cpus() = 0;
274     virtual char * cpu_cpuset_memory_nodes() = 0;
275     virtual const char * container_type() = 0;
276     virtual CachingCgroupController<CgroupMemoryController>* memory_controller() = 0;
277     virtual CachingCgroupController<CgroupCpuController>* cpu_controller() = 0;
278     virtual CgroupCpuacctController* cpuacct_controller() = 0;
279 
280     int cpu_quota();
281     int cpu_period();
282     int cpu_shares();
283 
284     jlong cpu_usage_in_micros();
285 
286     jlong memory_usage_in_bytes();
287     jlong memory_and_swap_limit_in_bytes(julong upper_mem_bound, julong upper_swap_bound);
288     jlong memory_and_swap_usage_in_bytes(julong upper_mem_bound, julong upper_swap_bound);
289     jlong memory_soft_limit_in_bytes(julong upper_bound);
290     jlong memory_throttle_limit_in_bytes();
291     jlong memory_max_usage_in_bytes();
292     jlong rss_usage_in_bytes();
293     jlong cache_usage_in_bytes();
294     void print_version_specific_info(outputStream* st, julong upper_mem_bound);
295 };
296 
297 // Utility class for storing info retrieved from /proc/cgroups,
298 // /proc/self/cgroup and /proc/self/mountinfo
299 // For reference see man 7 cgroups and CgroupSubsystemFactory
300 class CgroupInfo : public StackObj {
301   friend class CgroupSubsystemFactory;
302   friend class WhiteBox;
303 
304   private:
305     char* _name;
306     int _hierarchy_id;
307     bool _enabled;
308     bool _read_only;            // whether or not the mount path is mounted read-only
309     bool _data_complete;    // indicating cgroup v1 data is complete for this controller
310     char* _cgroup_path;     // cgroup controller path from /proc/self/cgroup
311     char* _root_mount_path; // root mount path from /proc/self/mountinfo. Unused for cgroup v2
312     char* _mount_path;      // mount path from /proc/self/mountinfo.
313 
314   public:
315     CgroupInfo() {
316       _name = nullptr;
317       _hierarchy_id = -1;
318       _enabled = false;
319       _read_only = false;
320       _data_complete = false;
321       _cgroup_path = nullptr;
322       _root_mount_path = nullptr;
323       _mount_path = nullptr;
324     }
325 
326 };
327 
328 class CgroupSubsystemFactory: AllStatic {
329   friend class WhiteBox;
330 
331   public:
332     static CgroupSubsystem* create();
333   private:
334     static inline bool is_cgroup_v2(u1* flags) {
335        return *flags == CGROUPS_V2;
336     }
337 
338 #ifdef ASSERT
339     static inline bool is_valid_cgroup(u1* flags) {
340        return *flags == CGROUPS_V1 || *flags == CGROUPS_V2;
341     }
342     static inline bool is_cgroup_v1(u1* flags) {
343        return *flags == CGROUPS_V1;
344     }
345 #endif
346 
347     static void set_controller_paths(CgroupInfo* cg_infos,
348                                      int controller,
349                                      const char* name,
350                                      char* mount_path,
351                                      char* root_path,
352                                      bool read_only);
353     // Determine the cgroup type (version 1 or version 2), given
354     // relevant paths to files. Sets 'flags' accordingly.
355     static bool determine_type(CgroupInfo* cg_infos,
356                                bool cgroups_v2_enabled,
357                                const char* controllers_file,
358                                const char* proc_self_cgroup,
359                                const char* proc_self_mountinfo,
360                                u1* flags);
361     static void cleanup(CgroupInfo* cg_infos);
362 };
363 
364 #endif // CGROUP_SUBSYSTEM_LINUX_HPP