1 /*
   2  * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 // no precompiled headers
  26 #include "jvm.h"
  27 #include "classfile/classLoader.hpp"
  28 #include "classfile/systemDictionary.hpp"
  29 #include "classfile/vmSymbols.hpp"
  30 #include "code/icBuffer.hpp"
  31 #include "code/vtableStubs.hpp"
  32 #include "compiler/compileBroker.hpp"
  33 #include "compiler/disassembler.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "logging/log.hpp"
  36 #include "logging/logStream.hpp"
  37 #include "memory/allocation.inline.hpp"
  38 #include "memory/filemap.hpp"
  39 #include "oops/oop.inline.hpp"
  40 #include "os_linux.inline.hpp"
  41 #include "os_posix.inline.hpp"
  42 #include "os_share_linux.hpp"
  43 #include "osContainer_linux.hpp"
  44 #include "prims/jniFastGetField.hpp"
  45 #include "prims/jvm_misc.hpp"
  46 #include "runtime/arguments.hpp"
  47 #include "runtime/atomic.hpp"
  48 #include "runtime/extendedPC.hpp"
  49 #include "runtime/globals.hpp"
  50 #include "runtime/interfaceSupport.inline.hpp"
  51 #include "runtime/init.hpp"
  52 #include "runtime/java.hpp"
  53 #include "runtime/javaCalls.hpp"
  54 #include "runtime/mutexLocker.hpp"
  55 #include "runtime/objectMonitor.hpp"
  56 #include "runtime/orderAccess.hpp"
  57 #include "runtime/osThread.hpp"
  58 #include "runtime/perfMemory.hpp"
  59 #include "runtime/sharedRuntime.hpp"
  60 #include "runtime/statSampler.hpp"
  61 #include "runtime/stubRoutines.hpp"
  62 #include "runtime/thread.inline.hpp"
  63 #include "runtime/threadCritical.hpp"
  64 #include "runtime/threadSMR.hpp"
  65 #include "runtime/timer.hpp"
  66 #include "runtime/vm_version.hpp"
  67 #include "semaphore_posix.hpp"
  68 #include "services/attachListener.hpp"
  69 #include "services/memTracker.hpp"
  70 #include "services/runtimeService.hpp"
  71 #include "utilities/align.hpp"
  72 #include "utilities/decoder.hpp"
  73 #include "utilities/defaultStream.hpp"
  74 #include "utilities/events.hpp"
  75 #include "utilities/elfFile.hpp"
  76 #include "utilities/growableArray.hpp"
  77 #include "utilities/macros.hpp"
  78 #include "utilities/vmError.hpp"
  79 
  80 // put OS-includes here
  81 # include <sys/types.h>
  82 # include <sys/mman.h>
  83 # include <sys/stat.h>
  84 # include <sys/select.h>
  85 # include <pthread.h>
  86 # include <signal.h>
  87 # include <errno.h>
  88 # include <dlfcn.h>
  89 # include <stdio.h>
  90 # include <unistd.h>
  91 # include <sys/resource.h>
  92 # include <pthread.h>
  93 # include <sys/stat.h>
  94 # include <sys/time.h>
  95 # include <sys/times.h>
  96 # include <sys/utsname.h>
  97 # include <sys/socket.h>
  98 # include <sys/wait.h>
  99 # include <pwd.h>
 100 # include <poll.h>
 101 # include <fcntl.h>
 102 # include <string.h>
 103 # include <syscall.h>
 104 # include <sys/sysinfo.h>
 105 # include <sys/ipc.h>
 106 # include <sys/shm.h>
 107 # include <link.h>
 108 # include <stdint.h>
 109 # include <inttypes.h>
 110 # include <sys/ioctl.h>
 111 
 112 #ifndef _GNU_SOURCE
 113   #define _GNU_SOURCE
 114   #include <sched.h>
 115   #undef _GNU_SOURCE
 116 #else
 117   #include <sched.h>
 118 #endif
 119 
 120 // if RUSAGE_THREAD for getrusage() has not been defined, do it here. The code calling
 121 // getrusage() is prepared to handle the associated failure.
 122 #ifndef RUSAGE_THREAD
 123   #define RUSAGE_THREAD   (1)               /* only the calling thread */
 124 #endif
 125 
 126 #define MAX_PATH    (2 * K)
 127 
 128 #define MAX_SECS 100000000
 129 
 130 // for timer info max values which include all bits
 131 #define ALL_64_BITS CONST64(0xFFFFFFFFFFFFFFFF)
 132 
 133 enum CoredumpFilterBit {
 134   FILE_BACKED_PVT_BIT = 1 << 2,
 135   FILE_BACKED_SHARED_BIT = 1 << 3,
 136   LARGEPAGES_BIT = 1 << 6,
 137   DAX_SHARED_BIT = 1 << 8
 138 };
 139 
 140 ////////////////////////////////////////////////////////////////////////////////
 141 // global variables
 142 julong os::Linux::_physical_memory = 0;
 143 
 144 address   os::Linux::_initial_thread_stack_bottom = NULL;
 145 uintptr_t os::Linux::_initial_thread_stack_size   = 0;
 146 
 147 int (*os::Linux::_pthread_getcpuclockid)(pthread_t, clockid_t *) = NULL;
 148 int (*os::Linux::_pthread_setname_np)(pthread_t, const char*) = NULL;
 149 Mutex* os::Linux::_createThread_lock = NULL;
 150 pthread_t os::Linux::_main_thread;
 151 int os::Linux::_page_size = -1;
 152 bool os::Linux::_supports_fast_thread_cpu_time = false;
 153 uint32_t os::Linux::_os_version = 0;
 154 const char * os::Linux::_glibc_version = "unknown";
 155 const char * os::Linux::_libpthread_version = "unknown";
 156 
 157 static jlong initial_time_count=0;
 158 
 159 static int clock_tics_per_sec = 100;
 160 
 161 // If the VM might have been created on the primordial thread, we need to resolve the
 162 // primordial thread stack bounds and check if the current thread might be the
 163 // primordial thread in places. If we know that the primordial thread is never used,
 164 // such as when the VM was created by one of the standard java launchers, we can
 165 // avoid this
 166 static bool suppress_primordial_thread_resolution = false;
 167 
 168 // For diagnostics to print a message once. see run_periodic_checks
 169 static sigset_t check_signal_done;
 170 static bool check_signals = true;
 171 
 172 // Signal number used to suspend/resume a thread
 173 
 174 // do not use any signal number less than SIGSEGV, see 4355769
 175 static int SR_signum = SIGUSR2;
 176 sigset_t SR_sigset;
 177 
 178 // utility functions
 179 
 180 static int SR_initialize();
 181 
 182 julong os::available_memory() {
 183   return Linux::available_memory();
 184 }
 185 
 186 julong os::Linux::available_memory() {
 187   // values in struct sysinfo are "unsigned long"
 188   struct sysinfo si;
 189   julong avail_mem;
 190 
 191   if (OSContainer::is_containerized()) {
 192     jlong mem_limit, mem_usage;
 193     if ((mem_limit = OSContainer::memory_limit_in_bytes()) < 1) {
 194       log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", using host value",
 195                              mem_limit == OSCONTAINER_ERROR ? "failed" : "unlimited", mem_limit);
 196     }
 197     if (mem_limit > 0 && (mem_usage = OSContainer::memory_usage_in_bytes()) < 1) {
 198       log_debug(os, container)("container memory usage failed: " JLONG_FORMAT ", using host value", mem_usage);
 199     }
 200     if (mem_limit > 0 && mem_usage > 0 ) {
 201       avail_mem = mem_limit > mem_usage ? (julong)mem_limit - (julong)mem_usage : 0;
 202       log_trace(os)("available container memory: " JULONG_FORMAT, avail_mem);
 203       return avail_mem;
 204     }
 205   }
 206 
 207   sysinfo(&si);
 208   avail_mem = (julong)si.freeram * si.mem_unit;
 209   log_trace(os)("available memory: " JULONG_FORMAT, avail_mem);
 210   return avail_mem;
 211 }
 212 
 213 julong os::physical_memory() {
 214   jlong phys_mem = 0;
 215   if (OSContainer::is_containerized()) {
 216     jlong mem_limit;
 217     if ((mem_limit = OSContainer::memory_limit_in_bytes()) > 0) {
 218       log_trace(os)("total container memory: " JLONG_FORMAT, mem_limit);
 219       return mem_limit;
 220     }
 221     log_debug(os, container)("container memory limit %s: " JLONG_FORMAT ", using host value",
 222                             mem_limit == OSCONTAINER_ERROR ? "failed" : "unlimited", mem_limit);
 223   }
 224 
 225   phys_mem = Linux::physical_memory();
 226   log_trace(os)("total system memory: " JLONG_FORMAT, phys_mem);
 227   return phys_mem;
 228 }
 229 
 230 static uint64_t initial_total_ticks = 0;
 231 static uint64_t initial_steal_ticks = 0;
 232 static bool     has_initial_tick_info = false;
 233 
 234 static void next_line(FILE *f) {
 235   int c;
 236   do {
 237     c = fgetc(f);
 238   } while (c != '\n' && c != EOF);
 239 }
 240 
 241 bool os::Linux::get_tick_information(CPUPerfTicks* pticks, int which_logical_cpu) {
 242   FILE*         fh;
 243   uint64_t      userTicks, niceTicks, systemTicks, idleTicks;
 244   // since at least kernel 2.6 : iowait: time waiting for I/O to complete
 245   // irq: time  servicing interrupts; softirq: time servicing softirqs
 246   uint64_t      iowTicks = 0, irqTicks = 0, sirqTicks= 0;
 247   // steal (since kernel 2.6.11): time spent in other OS when running in a virtualized environment
 248   uint64_t      stealTicks = 0;
 249   // guest (since kernel 2.6.24): time spent running a virtual CPU for guest OS under the
 250   // control of the Linux kernel
 251   uint64_t      guestNiceTicks = 0;
 252   int           logical_cpu = -1;
 253   const int     required_tickinfo_count = (which_logical_cpu == -1) ? 4 : 5;
 254   int           n;
 255 
 256   memset(pticks, 0, sizeof(CPUPerfTicks));
 257 
 258   if ((fh = fopen("/proc/stat", "r")) == NULL) {
 259     return false;
 260   }
 261 
 262   if (which_logical_cpu == -1) {
 263     n = fscanf(fh, "cpu " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
 264             UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
 265             UINT64_FORMAT " " UINT64_FORMAT " ",
 266             &userTicks, &niceTicks, &systemTicks, &idleTicks,
 267             &iowTicks, &irqTicks, &sirqTicks,
 268             &stealTicks, &guestNiceTicks);
 269   } else {
 270     // Move to next line
 271     next_line(fh);
 272 
 273     // find the line for requested cpu faster to just iterate linefeeds?
 274     for (int i = 0; i < which_logical_cpu; i++) {
 275       next_line(fh);
 276     }
 277 
 278     n = fscanf(fh, "cpu%u " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
 279                UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " " UINT64_FORMAT " "
 280                UINT64_FORMAT " " UINT64_FORMAT " ",
 281                &logical_cpu, &userTicks, &niceTicks,
 282                &systemTicks, &idleTicks, &iowTicks, &irqTicks, &sirqTicks,
 283                &stealTicks, &guestNiceTicks);
 284   }
 285 
 286   fclose(fh);
 287   if (n < required_tickinfo_count || logical_cpu != which_logical_cpu) {
 288     return false;
 289   }
 290   pticks->used       = userTicks + niceTicks;
 291   pticks->usedKernel = systemTicks + irqTicks + sirqTicks;
 292   pticks->total      = userTicks + niceTicks + systemTicks + idleTicks +
 293                        iowTicks + irqTicks + sirqTicks + stealTicks + guestNiceTicks;
 294 
 295   if (n > required_tickinfo_count + 3) {
 296     pticks->steal = stealTicks;
 297     pticks->has_steal_ticks = true;
 298   } else {
 299     pticks->steal = 0;
 300     pticks->has_steal_ticks = false;
 301   }
 302 
 303   return true;
 304 }
 305 
 306 // Return true if user is running as root.
 307 
 308 bool os::have_special_privileges() {
 309   static bool init = false;
 310   static bool privileges = false;
 311   if (!init) {
 312     privileges = (getuid() != geteuid()) || (getgid() != getegid());
 313     init = true;
 314   }
 315   return privileges;
 316 }
 317 
 318 
 319 #ifndef SYS_gettid
 320 // i386: 224, ia64: 1105, amd64: 186, sparc 143
 321   #ifdef __ia64__
 322     #define SYS_gettid 1105
 323   #else
 324     #ifdef __i386__
 325       #define SYS_gettid 224
 326     #else
 327       #ifdef __amd64__
 328         #define SYS_gettid 186
 329       #else
 330         #ifdef __sparc__
 331           #define SYS_gettid 143
 332         #else
 333           #error define gettid for the arch
 334         #endif
 335       #endif
 336     #endif
 337   #endif
 338 #endif
 339 
 340 
 341 // pid_t gettid()
 342 //
 343 // Returns the kernel thread id of the currently running thread. Kernel
 344 // thread id is used to access /proc.
 345 pid_t os::Linux::gettid() {
 346   int rslt = syscall(SYS_gettid);
 347   assert(rslt != -1, "must be."); // old linuxthreads implementation?
 348   return (pid_t)rslt;
 349 }
 350 
 351 // Most versions of linux have a bug where the number of processors are
 352 // determined by looking at the /proc file system.  In a chroot environment,
 353 // the system call returns 1.
 354 static bool unsafe_chroot_detected = false;
 355 static const char *unstable_chroot_error = "/proc file system not found.\n"
 356                      "Java may be unstable running multithreaded in a chroot "
 357                      "environment on Linux when /proc filesystem is not mounted.";
 358 
 359 void os::Linux::initialize_system_info() {
 360   set_processor_count(sysconf(_SC_NPROCESSORS_CONF));
 361   if (processor_count() == 1) {
 362     pid_t pid = os::Linux::gettid();
 363     char fname[32];
 364     jio_snprintf(fname, sizeof(fname), "/proc/%d", pid);
 365     FILE *fp = fopen(fname, "r");
 366     if (fp == NULL) {
 367       unsafe_chroot_detected = true;
 368     } else {
 369       fclose(fp);
 370     }
 371   }
 372   _physical_memory = (julong)sysconf(_SC_PHYS_PAGES) * (julong)sysconf(_SC_PAGESIZE);
 373   assert(processor_count() > 0, "linux error");
 374 }
 375 
 376 void os::init_system_properties_values() {
 377   // The next steps are taken in the product version:
 378   //
 379   // Obtain the JAVA_HOME value from the location of libjvm.so.
 380   // This library should be located at:
 381   // <JAVA_HOME>/lib/{client|server}/libjvm.so.
 382   //
 383   // If "/jre/lib/" appears at the right place in the path, then we
 384   // assume libjvm.so is installed in a JDK and we use this path.
 385   //
 386   // Otherwise exit with message: "Could not create the Java virtual machine."
 387   //
 388   // The following extra steps are taken in the debugging version:
 389   //
 390   // If "/jre/lib/" does NOT appear at the right place in the path
 391   // instead of exit check for $JAVA_HOME environment variable.
 392   //
 393   // If it is defined and we are able to locate $JAVA_HOME/jre/lib/<arch>,
 394   // then we append a fake suffix "hotspot/libjvm.so" to this path so
 395   // it looks like libjvm.so is installed there
 396   // <JAVA_HOME>/jre/lib/<arch>/hotspot/libjvm.so.
 397   //
 398   // Otherwise exit.
 399   //
 400   // Important note: if the location of libjvm.so changes this
 401   // code needs to be changed accordingly.
 402 
 403   // See ld(1):
 404   //      The linker uses the following search paths to locate required
 405   //      shared libraries:
 406   //        1: ...
 407   //        ...
 408   //        7: The default directories, normally /lib and /usr/lib.
 409 #ifndef OVERRIDE_LIBPATH
 410   #if defined(AMD64) || (defined(_LP64) && defined(SPARC)) || defined(PPC64) || defined(S390)
 411     #define DEFAULT_LIBPATH "/usr/lib64:/lib64:/lib:/usr/lib"
 412   #else
 413     #define DEFAULT_LIBPATH "/lib:/usr/lib"
 414   #endif
 415 #else
 416   #define DEFAULT_LIBPATH OVERRIDE_LIBPATH
 417 #endif
 418 
 419 // Base path of extensions installed on the system.
 420 #define SYS_EXT_DIR     "/usr/java/packages"
 421 #define EXTENSIONS_DIR  "/lib/ext"
 422 
 423   // Buffer that fits several sprintfs.
 424   // Note that the space for the colon and the trailing null are provided
 425   // by the nulls included by the sizeof operator.
 426   const size_t bufsize =
 427     MAX2((size_t)MAXPATHLEN,  // For dll_dir & friends.
 428          (size_t)MAXPATHLEN + sizeof(EXTENSIONS_DIR) + sizeof(SYS_EXT_DIR) + sizeof(EXTENSIONS_DIR)); // extensions dir
 429   char *buf = (char *)NEW_C_HEAP_ARRAY(char, bufsize, mtInternal);
 430 
 431   // sysclasspath, java_home, dll_dir
 432   {
 433     char *pslash;
 434     os::jvm_path(buf, bufsize);
 435 
 436     // Found the full path to libjvm.so.
 437     // Now cut the path to <java_home>/jre if we can.
 438     pslash = strrchr(buf, '/');
 439     if (pslash != NULL) {
 440       *pslash = '\0';            // Get rid of /libjvm.so.
 441     }
 442     pslash = strrchr(buf, '/');
 443     if (pslash != NULL) {
 444       *pslash = '\0';            // Get rid of /{client|server|hotspot}.
 445     }
 446     Arguments::set_dll_dir(buf);
 447 
 448     if (pslash != NULL) {
 449       pslash = strrchr(buf, '/');
 450       if (pslash != NULL) {
 451         *pslash = '\0';        // Get rid of /lib.
 452       }
 453     }
 454     Arguments::set_java_home(buf);
 455     if (!set_boot_path('/', ':')) {
 456       vm_exit_during_initialization("Failed setting boot class path.", NULL);
 457     }
 458   }
 459 
 460   // Where to look for native libraries.
 461   //
 462   // Note: Due to a legacy implementation, most of the library path
 463   // is set in the launcher. This was to accomodate linking restrictions
 464   // on legacy Linux implementations (which are no longer supported).
 465   // Eventually, all the library path setting will be done here.
 466   //
 467   // However, to prevent the proliferation of improperly built native
 468   // libraries, the new path component /usr/java/packages is added here.
 469   // Eventually, all the library path setting will be done here.
 470   {
 471     // Get the user setting of LD_LIBRARY_PATH, and prepended it. It
 472     // should always exist (until the legacy problem cited above is
 473     // addressed).
 474     const char *v = ::getenv("LD_LIBRARY_PATH");
 475     const char *v_colon = ":";
 476     if (v == NULL) { v = ""; v_colon = ""; }
 477     // That's +1 for the colon and +1 for the trailing '\0'.
 478     char *ld_library_path = (char *)NEW_C_HEAP_ARRAY(char,
 479                                                      strlen(v) + 1 +
 480                                                      sizeof(SYS_EXT_DIR) + sizeof("/lib/") + sizeof(DEFAULT_LIBPATH) + 1,
 481                                                      mtInternal);
 482     sprintf(ld_library_path, "%s%s" SYS_EXT_DIR "/lib:" DEFAULT_LIBPATH, v, v_colon);
 483     Arguments::set_library_path(ld_library_path);
 484     FREE_C_HEAP_ARRAY(char, ld_library_path);
 485   }
 486 
 487   // Extensions directories.
 488   sprintf(buf, "%s" EXTENSIONS_DIR ":" SYS_EXT_DIR EXTENSIONS_DIR, Arguments::get_java_home());
 489   Arguments::set_ext_dirs(buf);
 490 
 491   FREE_C_HEAP_ARRAY(char, buf);
 492 
 493 #undef DEFAULT_LIBPATH
 494 #undef SYS_EXT_DIR
 495 #undef EXTENSIONS_DIR
 496 }
 497 
 498 ////////////////////////////////////////////////////////////////////////////////
 499 // breakpoint support
 500 
 501 void os::breakpoint() {
 502   BREAKPOINT;
 503 }
 504 
 505 extern "C" void breakpoint() {
 506   // use debugger to set breakpoint here
 507 }
 508 
 509 ////////////////////////////////////////////////////////////////////////////////
 510 // signal support
 511 
 512 debug_only(static bool signal_sets_initialized = false);
 513 static sigset_t unblocked_sigs, vm_sigs;
 514 
 515 void os::Linux::signal_sets_init() {
 516   // Should also have an assertion stating we are still single-threaded.
 517   assert(!signal_sets_initialized, "Already initialized");
 518   // Fill in signals that are necessarily unblocked for all threads in
 519   // the VM. Currently, we unblock the following signals:
 520   // SHUTDOWN{1,2,3}_SIGNAL: for shutdown hooks support (unless over-ridden
 521   //                         by -Xrs (=ReduceSignalUsage));
 522   // BREAK_SIGNAL which is unblocked only by the VM thread and blocked by all
 523   // other threads. The "ReduceSignalUsage" boolean tells us not to alter
 524   // the dispositions or masks wrt these signals.
 525   // Programs embedding the VM that want to use the above signals for their
 526   // own purposes must, at this time, use the "-Xrs" option to prevent
 527   // interference with shutdown hooks and BREAK_SIGNAL thread dumping.
 528   // (See bug 4345157, and other related bugs).
 529   // In reality, though, unblocking these signals is really a nop, since
 530   // these signals are not blocked by default.
 531   sigemptyset(&unblocked_sigs);
 532   sigaddset(&unblocked_sigs, SIGILL);
 533   sigaddset(&unblocked_sigs, SIGSEGV);
 534   sigaddset(&unblocked_sigs, SIGBUS);
 535   sigaddset(&unblocked_sigs, SIGFPE);
 536 #if defined(PPC64)
 537   sigaddset(&unblocked_sigs, SIGTRAP);
 538 #endif
 539   sigaddset(&unblocked_sigs, SR_signum);
 540 
 541   if (!ReduceSignalUsage) {
 542     if (!os::Posix::is_sig_ignored(SHUTDOWN1_SIGNAL)) {
 543       sigaddset(&unblocked_sigs, SHUTDOWN1_SIGNAL);
 544     }
 545     if (!os::Posix::is_sig_ignored(SHUTDOWN2_SIGNAL)) {
 546       sigaddset(&unblocked_sigs, SHUTDOWN2_SIGNAL);
 547     }
 548     if (!os::Posix::is_sig_ignored(SHUTDOWN3_SIGNAL)) {
 549       sigaddset(&unblocked_sigs, SHUTDOWN3_SIGNAL);
 550     }
 551   }
 552   // Fill in signals that are blocked by all but the VM thread.
 553   sigemptyset(&vm_sigs);
 554   if (!ReduceSignalUsage) {
 555     sigaddset(&vm_sigs, BREAK_SIGNAL);
 556   }
 557   debug_only(signal_sets_initialized = true);
 558 
 559 }
 560 
 561 // These are signals that are unblocked while a thread is running Java.
 562 // (For some reason, they get blocked by default.)
 563 sigset_t* os::Linux::unblocked_signals() {
 564   assert(signal_sets_initialized, "Not initialized");
 565   return &unblocked_sigs;
 566 }
 567 
 568 // These are the signals that are blocked while a (non-VM) thread is
 569 // running Java. Only the VM thread handles these signals.
 570 sigset_t* os::Linux::vm_signals() {
 571   assert(signal_sets_initialized, "Not initialized");
 572   return &vm_sigs;
 573 }
 574 
 575 void os::Linux::hotspot_sigmask(Thread* thread) {
 576 
 577   //Save caller's signal mask before setting VM signal mask
 578   sigset_t caller_sigmask;
 579   pthread_sigmask(SIG_BLOCK, NULL, &caller_sigmask);
 580 
 581   OSThread* osthread = thread->osthread();
 582   osthread->set_caller_sigmask(caller_sigmask);
 583 
 584   pthread_sigmask(SIG_UNBLOCK, os::Linux::unblocked_signals(), NULL);
 585 
 586   if (!ReduceSignalUsage) {
 587     if (thread->is_VM_thread()) {
 588       // Only the VM thread handles BREAK_SIGNAL ...
 589       pthread_sigmask(SIG_UNBLOCK, vm_signals(), NULL);
 590     } else {
 591       // ... all other threads block BREAK_SIGNAL
 592       pthread_sigmask(SIG_BLOCK, vm_signals(), NULL);
 593     }
 594   }
 595 }
 596 
 597 //////////////////////////////////////////////////////////////////////////////
 598 // detecting pthread library
 599 
 600 void os::Linux::libpthread_init() {
 601   // Save glibc and pthread version strings.
 602 #if !defined(_CS_GNU_LIBC_VERSION) || \
 603     !defined(_CS_GNU_LIBPTHREAD_VERSION)
 604   #error "glibc too old (< 2.3.2)"
 605 #endif
 606 
 607   size_t n;
 608 
 609   n = confstr(_CS_GNU_LIBC_VERSION, NULL, 0);
 610   if (n > 0) {
 611     char* str = (char *)malloc(n, mtInternal);
 612     confstr(_CS_GNU_LIBC_VERSION, str, n);
 613     os::Linux::set_glibc_version(str);
 614   }
 615 
 616   n = confstr(_CS_GNU_LIBPTHREAD_VERSION, NULL, 0);
 617   if (n > 0) {
 618     char* str = (char *)malloc(n, mtInternal);
 619     confstr(_CS_GNU_LIBPTHREAD_VERSION, str, n);
 620     os::Linux::set_libpthread_version(str);
 621   }
 622 }
 623 
 624 /////////////////////////////////////////////////////////////////////////////
 625 // thread stack expansion
 626 
 627 // os::Linux::manually_expand_stack() takes care of expanding the thread
 628 // stack. Note that this is normally not needed: pthread stacks allocate
 629 // thread stack using mmap() without MAP_NORESERVE, so the stack is already
 630 // committed. Therefore it is not necessary to expand the stack manually.
 631 //
 632 // Manually expanding the stack was historically needed on LinuxThreads
 633 // thread stacks, which were allocated with mmap(MAP_GROWSDOWN). Nowadays
 634 // it is kept to deal with very rare corner cases:
 635 //
 636 // For one, user may run the VM on an own implementation of threads
 637 // whose stacks are - like the old LinuxThreads - implemented using
 638 // mmap(MAP_GROWSDOWN).
 639 //
 640 // Also, this coding may be needed if the VM is running on the primordial
 641 // thread. Normally we avoid running on the primordial thread; however,
 642 // user may still invoke the VM on the primordial thread.
 643 //
 644 // The following historical comment describes the details about running
 645 // on a thread stack allocated with mmap(MAP_GROWSDOWN):
 646 
 647 
 648 // Force Linux kernel to expand current thread stack. If "bottom" is close
 649 // to the stack guard, caller should block all signals.
 650 //
 651 // MAP_GROWSDOWN:
 652 //   A special mmap() flag that is used to implement thread stacks. It tells
 653 //   kernel that the memory region should extend downwards when needed. This
 654 //   allows early versions of LinuxThreads to only mmap the first few pages
 655 //   when creating a new thread. Linux kernel will automatically expand thread
 656 //   stack as needed (on page faults).
 657 //
 658 //   However, because the memory region of a MAP_GROWSDOWN stack can grow on
 659 //   demand, if a page fault happens outside an already mapped MAP_GROWSDOWN
 660 //   region, it's hard to tell if the fault is due to a legitimate stack
 661 //   access or because of reading/writing non-exist memory (e.g. buffer
 662 //   overrun). As a rule, if the fault happens below current stack pointer,
 663 //   Linux kernel does not expand stack, instead a SIGSEGV is sent to the
 664 //   application (see Linux kernel fault.c).
 665 //
 666 //   This Linux feature can cause SIGSEGV when VM bangs thread stack for
 667 //   stack overflow detection.
 668 //
 669 //   Newer version of LinuxThreads (since glibc-2.2, or, RH-7.x) and NPTL do
 670 //   not use MAP_GROWSDOWN.
 671 //
 672 // To get around the problem and allow stack banging on Linux, we need to
 673 // manually expand thread stack after receiving the SIGSEGV.
 674 //
 675 // There are two ways to expand thread stack to address "bottom", we used
 676 // both of them in JVM before 1.5:
 677 //   1. adjust stack pointer first so that it is below "bottom", and then
 678 //      touch "bottom"
 679 //   2. mmap() the page in question
 680 //
 681 // Now alternate signal stack is gone, it's harder to use 2. For instance,
 682 // if current sp is already near the lower end of page 101, and we need to
 683 // call mmap() to map page 100, it is possible that part of the mmap() frame
 684 // will be placed in page 100. When page 100 is mapped, it is zero-filled.
 685 // That will destroy the mmap() frame and cause VM to crash.
 686 //
 687 // The following code works by adjusting sp first, then accessing the "bottom"
 688 // page to force a page fault. Linux kernel will then automatically expand the
 689 // stack mapping.
 690 //
 691 // _expand_stack_to() assumes its frame size is less than page size, which
 692 // should always be true if the function is not inlined.
 693 
 694 static void NOINLINE _expand_stack_to(address bottom) {
 695   address sp;
 696   size_t size;
 697   volatile char *p;
 698 
 699   // Adjust bottom to point to the largest address within the same page, it
 700   // gives us a one-page buffer if alloca() allocates slightly more memory.
 701   bottom = (address)align_down((uintptr_t)bottom, os::Linux::page_size());
 702   bottom += os::Linux::page_size() - 1;
 703 
 704   // sp might be slightly above current stack pointer; if that's the case, we
 705   // will alloca() a little more space than necessary, which is OK. Don't use
 706   // os::current_stack_pointer(), as its result can be slightly below current
 707   // stack pointer, causing us to not alloca enough to reach "bottom".
 708   sp = (address)&sp;
 709 
 710   if (sp > bottom) {
 711     size = sp - bottom;
 712     p = (volatile char *)alloca(size);
 713     assert(p != NULL && p <= (volatile char *)bottom, "alloca problem?");
 714     p[0] = '\0';
 715   }
 716 }
 717 
 718 void os::Linux::expand_stack_to(address bottom) {
 719   _expand_stack_to(bottom);
 720 }
 721 
 722 bool os::Linux::manually_expand_stack(JavaThread * t, address addr) {
 723   assert(t!=NULL, "just checking");
 724   assert(t->osthread()->expanding_stack(), "expand should be set");
 725   assert(t->stack_base() != NULL, "stack_base was not initialized");
 726 
 727   if (addr <  t->stack_base() && addr >= t->stack_reserved_zone_base()) {
 728     sigset_t mask_all, old_sigset;
 729     sigfillset(&mask_all);
 730     pthread_sigmask(SIG_SETMASK, &mask_all, &old_sigset);
 731     _expand_stack_to(addr);
 732     pthread_sigmask(SIG_SETMASK, &old_sigset, NULL);
 733     return true;
 734   }
 735   return false;
 736 }
 737 
 738 //////////////////////////////////////////////////////////////////////////////
 739 // create new thread
 740 
 741 // Thread start routine for all newly created threads
 742 static void *thread_native_entry(Thread *thread) {
 743 
 744   thread->record_stack_base_and_size();
 745 
 746   // Try to randomize the cache line index of hot stack frames.
 747   // This helps when threads of the same stack traces evict each other's
 748   // cache lines. The threads can be either from the same JVM instance, or
 749   // from different JVM instances. The benefit is especially true for
 750   // processors with hyperthreading technology.
 751   static int counter = 0;
 752   int pid = os::current_process_id();
 753   alloca(((pid ^ counter++) & 7) * 128);
 754 
 755   thread->initialize_thread_current();
 756 
 757   OSThread* osthread = thread->osthread();
 758   Monitor* sync = osthread->startThread_lock();
 759 
 760   osthread->set_thread_id(os::current_thread_id());
 761 
 762   log_info(os, thread)("Thread is alive (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
 763     os::current_thread_id(), (uintx) pthread_self());
 764 
 765   if (UseNUMA) {
 766     int lgrp_id = os::numa_get_group_id();
 767     if (lgrp_id != -1) {
 768       thread->set_lgrp_id(lgrp_id);
 769     }
 770   }
 771   // initialize signal mask for this thread
 772   os::Linux::hotspot_sigmask(thread);
 773 
 774   // initialize floating point control register
 775   os::Linux::init_thread_fpu_state();
 776 
 777   // handshaking with parent thread
 778   {
 779     MutexLocker ml(sync, Mutex::_no_safepoint_check_flag);
 780 
 781     // notify parent thread
 782     osthread->set_state(INITIALIZED);
 783     sync->notify_all();
 784 
 785     // wait until os::start_thread()
 786     while (osthread->get_state() == INITIALIZED) {
 787       sync->wait_without_safepoint_check();
 788     }
 789   }
 790 
 791   assert(osthread->pthread_id() != 0, "pthread_id was not set as expected");
 792 
 793   // call one more level start routine
 794   thread->call_run();
 795 
 796   // Note: at this point the thread object may already have deleted itself.
 797   // Prevent dereferencing it from here on out.
 798   thread = NULL;
 799 
 800   log_info(os, thread)("Thread finished (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
 801     os::current_thread_id(), (uintx) pthread_self());
 802 
 803   return 0;
 804 }
 805 
 806 bool os::create_thread(Thread* thread, ThreadType thr_type,
 807                        size_t req_stack_size) {
 808   assert(thread->osthread() == NULL, "caller responsible");
 809 
 810   // Allocate the OSThread object
 811   OSThread* osthread = new OSThread(NULL, NULL);
 812   if (osthread == NULL) {
 813     return false;
 814   }
 815 
 816   // set the correct thread state
 817   osthread->set_thread_type(thr_type);
 818 
 819   // Initial state is ALLOCATED but not INITIALIZED
 820   osthread->set_state(ALLOCATED);
 821 
 822   thread->set_osthread(osthread);
 823 
 824   // init thread attributes
 825   pthread_attr_t attr;
 826   pthread_attr_init(&attr);
 827   pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
 828 
 829   // Calculate stack size if it's not specified by caller.
 830   size_t stack_size = os::Posix::get_initial_stack_size(thr_type, req_stack_size);
 831   // In the Linux NPTL pthread implementation the guard size mechanism
 832   // is not implemented properly. The posix standard requires adding
 833   // the size of the guard pages to the stack size, instead Linux
 834   // takes the space out of 'stacksize'. Thus we adapt the requested
 835   // stack_size by the size of the guard pages to mimick proper
 836   // behaviour. However, be careful not to end up with a size
 837   // of zero due to overflow. Don't add the guard page in that case.
 838   size_t guard_size = os::Linux::default_guard_size(thr_type);
 839   if (stack_size <= SIZE_MAX - guard_size) {
 840     stack_size += guard_size;
 841   }
 842   assert(is_aligned(stack_size, os::vm_page_size()), "stack_size not aligned");
 843 
 844   int status = pthread_attr_setstacksize(&attr, stack_size);
 845   assert_status(status == 0, status, "pthread_attr_setstacksize");
 846 
 847   // Configure glibc guard page.
 848   pthread_attr_setguardsize(&attr, os::Linux::default_guard_size(thr_type));
 849 
 850   ThreadState state;
 851 
 852   {
 853     pthread_t tid;
 854     int ret = pthread_create(&tid, &attr, (void* (*)(void*)) thread_native_entry, thread);
 855 
 856     char buf[64];
 857     if (ret == 0) {
 858       log_info(os, thread)("Thread started (pthread id: " UINTX_FORMAT ", attributes: %s). ",
 859         (uintx) tid, os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
 860     } else {
 861       log_warning(os, thread)("Failed to start thread - pthread_create failed (%s) for attributes: %s.",
 862         os::errno_name(ret), os::Posix::describe_pthread_attr(buf, sizeof(buf), &attr));
 863       // Log some OS information which might explain why creating the thread failed.
 864       log_info(os, thread)("Number of threads approx. running in the VM: %d", Threads::number_of_threads());
 865       LogStream st(Log(os, thread)::info());
 866       os::Posix::print_rlimit_info(&st);
 867       os::print_memory_info(&st);
 868       os::Linux::print_proc_sys_info(&st);
 869       os::Linux::print_container_info(&st);
 870     }
 871 
 872     pthread_attr_destroy(&attr);
 873 
 874     if (ret != 0) {
 875       // Need to clean up stuff we've allocated so far
 876       thread->set_osthread(NULL);
 877       delete osthread;
 878       return false;
 879     }
 880 
 881     // Store pthread info into the OSThread
 882     osthread->set_pthread_id(tid);
 883 
 884     // Wait until child thread is either initialized or aborted
 885     {
 886       Monitor* sync_with_child = osthread->startThread_lock();
 887       MutexLocker ml(sync_with_child, Mutex::_no_safepoint_check_flag);
 888       while ((state = osthread->get_state()) == ALLOCATED) {
 889         sync_with_child->wait_without_safepoint_check();
 890       }
 891     }
 892   }
 893 
 894   // Aborted due to thread limit being reached
 895   if (state == ZOMBIE) {
 896     thread->set_osthread(NULL);
 897     delete osthread;
 898     return false;
 899   }
 900 
 901   // The thread is returned suspended (in state INITIALIZED),
 902   // and is started higher up in the call chain
 903   assert(state == INITIALIZED, "race condition");
 904   return true;
 905 }
 906 
 907 /////////////////////////////////////////////////////////////////////////////
 908 // attach existing thread
 909 
 910 // bootstrap the main thread
 911 bool os::create_main_thread(JavaThread* thread) {
 912   assert(os::Linux::_main_thread == pthread_self(), "should be called inside main thread");
 913   return create_attached_thread(thread);
 914 }
 915 
 916 bool os::create_attached_thread(JavaThread* thread) {
 917 #ifdef ASSERT
 918   thread->verify_not_published();
 919 #endif
 920 
 921   // Allocate the OSThread object
 922   OSThread* osthread = new OSThread(NULL, NULL);
 923 
 924   if (osthread == NULL) {
 925     return false;
 926   }
 927 
 928   // Store pthread info into the OSThread
 929   osthread->set_thread_id(os::Linux::gettid());
 930   osthread->set_pthread_id(::pthread_self());
 931 
 932   // initialize floating point control register
 933   os::Linux::init_thread_fpu_state();
 934 
 935   // Initial thread state is RUNNABLE
 936   osthread->set_state(RUNNABLE);
 937 
 938   thread->set_osthread(osthread);
 939 
 940   if (UseNUMA) {
 941     int lgrp_id = os::numa_get_group_id();
 942     if (lgrp_id != -1) {
 943       thread->set_lgrp_id(lgrp_id);
 944     }
 945   }
 946 
 947   if (os::is_primordial_thread()) {
 948     // If current thread is primordial thread, its stack is mapped on demand,
 949     // see notes about MAP_GROWSDOWN. Here we try to force kernel to map
 950     // the entire stack region to avoid SEGV in stack banging.
 951     // It is also useful to get around the heap-stack-gap problem on SuSE
 952     // kernel (see 4821821 for details). We first expand stack to the top
 953     // of yellow zone, then enable stack yellow zone (order is significant,
 954     // enabling yellow zone first will crash JVM on SuSE Linux), so there
 955     // is no gap between the last two virtual memory regions.
 956 
 957     JavaThread *jt = (JavaThread *)thread;
 958     address addr = jt->stack_reserved_zone_base();
 959     assert(addr != NULL, "initialization problem?");
 960     assert(jt->stack_available(addr) > 0, "stack guard should not be enabled");
 961 
 962     osthread->set_expanding_stack();
 963     os::Linux::manually_expand_stack(jt, addr);
 964     osthread->clear_expanding_stack();
 965   }
 966 
 967   // initialize signal mask for this thread
 968   // and save the caller's signal mask
 969   os::Linux::hotspot_sigmask(thread);
 970 
 971   log_info(os, thread)("Thread attached (tid: " UINTX_FORMAT ", pthread id: " UINTX_FORMAT ").",
 972     os::current_thread_id(), (uintx) pthread_self());
 973 
 974   return true;
 975 }
 976 
 977 void os::pd_start_thread(Thread* thread) {
 978   OSThread * osthread = thread->osthread();
 979   assert(osthread->get_state() != INITIALIZED, "just checking");
 980   Monitor* sync_with_child = osthread->startThread_lock();
 981   MutexLocker ml(sync_with_child, Mutex::_no_safepoint_check_flag);
 982   sync_with_child->notify();
 983 }
 984 
 985 // Free Linux resources related to the OSThread
 986 void os::free_thread(OSThread* osthread) {
 987   assert(osthread != NULL, "osthread not set");
 988 
 989   // We are told to free resources of the argument thread,
 990   // but we can only really operate on the current thread.
 991   assert(Thread::current()->osthread() == osthread,
 992          "os::free_thread but not current thread");
 993 
 994 #ifdef ASSERT
 995   sigset_t current;
 996   sigemptyset(&current);
 997   pthread_sigmask(SIG_SETMASK, NULL, &current);
 998   assert(!sigismember(&current, SR_signum), "SR signal should not be blocked!");
 999 #endif
1000 
1001   // Restore caller's signal mask
1002   sigset_t sigmask = osthread->caller_sigmask();
1003   pthread_sigmask(SIG_SETMASK, &sigmask, NULL);
1004 
1005   delete osthread;
1006 }
1007 
1008 //////////////////////////////////////////////////////////////////////////////
1009 // primordial thread
1010 
1011 // Check if current thread is the primordial thread, similar to Solaris thr_main.
1012 bool os::is_primordial_thread(void) {
1013   if (suppress_primordial_thread_resolution) {
1014     return false;
1015   }
1016   char dummy;
1017   // If called before init complete, thread stack bottom will be null.
1018   // Can be called if fatal error occurs before initialization.
1019   if (os::Linux::initial_thread_stack_bottom() == NULL) return false;
1020   assert(os::Linux::initial_thread_stack_bottom() != NULL &&
1021          os::Linux::initial_thread_stack_size()   != 0,
1022          "os::init did not locate primordial thread's stack region");
1023   if ((address)&dummy >= os::Linux::initial_thread_stack_bottom() &&
1024       (address)&dummy < os::Linux::initial_thread_stack_bottom() +
1025                         os::Linux::initial_thread_stack_size()) {
1026     return true;
1027   } else {
1028     return false;
1029   }
1030 }
1031 
1032 // Find the virtual memory area that contains addr
1033 static bool find_vma(address addr, address* vma_low, address* vma_high) {
1034   FILE *fp = fopen("/proc/self/maps", "r");
1035   if (fp) {
1036     address low, high;
1037     while (!feof(fp)) {
1038       if (fscanf(fp, "%p-%p", &low, &high) == 2) {
1039         if (low <= addr && addr < high) {
1040           if (vma_low)  *vma_low  = low;
1041           if (vma_high) *vma_high = high;
1042           fclose(fp);
1043           return true;
1044         }
1045       }
1046       for (;;) {
1047         int ch = fgetc(fp);
1048         if (ch == EOF || ch == (int)'\n') break;
1049       }
1050     }
1051     fclose(fp);
1052   }
1053   return false;
1054 }
1055 
1056 // Locate primordial thread stack. This special handling of primordial thread stack
1057 // is needed because pthread_getattr_np() on most (all?) Linux distros returns
1058 // bogus value for the primordial process thread. While the launcher has created
1059 // the VM in a new thread since JDK 6, we still have to allow for the use of the
1060 // JNI invocation API from a primordial thread.
1061 void os::Linux::capture_initial_stack(size_t max_size) {
1062 
1063   // max_size is either 0 (which means accept OS default for thread stacks) or
1064   // a user-specified value known to be at least the minimum needed. If we
1065   // are actually on the primordial thread we can make it appear that we have a
1066   // smaller max_size stack by inserting the guard pages at that location. But we
1067   // cannot do anything to emulate a larger stack than what has been provided by
1068   // the OS or threading library. In fact if we try to use a stack greater than
1069   // what is set by rlimit then we will crash the hosting process.
1070 
1071   // Maximum stack size is the easy part, get it from RLIMIT_STACK.
1072   // If this is "unlimited" then it will be a huge value.
1073   struct rlimit rlim;
1074   getrlimit(RLIMIT_STACK, &rlim);
1075   size_t stack_size = rlim.rlim_cur;
1076 
1077   // 6308388: a bug in ld.so will relocate its own .data section to the
1078   //   lower end of primordial stack; reduce ulimit -s value a little bit
1079   //   so we won't install guard page on ld.so's data section.
1080   //   But ensure we don't underflow the stack size - allow 1 page spare
1081   if (stack_size >= (size_t)(3 * page_size())) {
1082     stack_size -= 2 * page_size();
1083   }
1084 
1085   // Try to figure out where the stack base (top) is. This is harder.
1086   //
1087   // When an application is started, glibc saves the initial stack pointer in
1088   // a global variable "__libc_stack_end", which is then used by system
1089   // libraries. __libc_stack_end should be pretty close to stack top. The
1090   // variable is available since the very early days. However, because it is
1091   // a private interface, it could disappear in the future.
1092   //
1093   // Linux kernel saves start_stack information in /proc/<pid>/stat. Similar
1094   // to __libc_stack_end, it is very close to stack top, but isn't the real
1095   // stack top. Note that /proc may not exist if VM is running as a chroot
1096   // program, so reading /proc/<pid>/stat could fail. Also the contents of
1097   // /proc/<pid>/stat could change in the future (though unlikely).
1098   //
1099   // We try __libc_stack_end first. If that doesn't work, look for
1100   // /proc/<pid>/stat. If neither of them works, we use current stack pointer
1101   // as a hint, which should work well in most cases.
1102 
1103   uintptr_t stack_start;
1104 
1105   // try __libc_stack_end first
1106   uintptr_t *p = (uintptr_t *)dlsym(RTLD_DEFAULT, "__libc_stack_end");
1107   if (p && *p) {
1108     stack_start = *p;
1109   } else {
1110     // see if we can get the start_stack field from /proc/self/stat
1111     FILE *fp;
1112     int pid;
1113     char state;
1114     int ppid;
1115     int pgrp;
1116     int session;
1117     int nr;
1118     int tpgrp;
1119     unsigned long flags;
1120     unsigned long minflt;
1121     unsigned long cminflt;
1122     unsigned long majflt;
1123     unsigned long cmajflt;
1124     unsigned long utime;
1125     unsigned long stime;
1126     long cutime;
1127     long cstime;
1128     long prio;
1129     long nice;
1130     long junk;
1131     long it_real;
1132     uintptr_t start;
1133     uintptr_t vsize;
1134     intptr_t rss;
1135     uintptr_t rsslim;
1136     uintptr_t scodes;
1137     uintptr_t ecode;
1138     int i;
1139 
1140     // Figure what the primordial thread stack base is. Code is inspired
1141     // by email from Hans Boehm. /proc/self/stat begins with current pid,
1142     // followed by command name surrounded by parentheses, state, etc.
1143     char stat[2048];
1144     int statlen;
1145 
1146     fp = fopen("/proc/self/stat", "r");
1147     if (fp) {
1148       statlen = fread(stat, 1, 2047, fp);
1149       stat[statlen] = '\0';
1150       fclose(fp);
1151 
1152       // Skip pid and the command string. Note that we could be dealing with
1153       // weird command names, e.g. user could decide to rename java launcher
1154       // to "java 1.4.2 :)", then the stat file would look like
1155       //                1234 (java 1.4.2 :)) R ... ...
1156       // We don't really need to know the command string, just find the last
1157       // occurrence of ")" and then start parsing from there. See bug 4726580.
1158       char * s = strrchr(stat, ')');
1159 
1160       i = 0;
1161       if (s) {
1162         // Skip blank chars
1163         do { s++; } while (s && isspace(*s));
1164 
1165 #define _UFM UINTX_FORMAT
1166 #define _DFM INTX_FORMAT
1167 
1168         //                                     1   1   1   1   1   1   1   1   1   1   2   2    2    2    2    2    2    2    2
1169         //              3  4  5  6  7  8   9   0   1   2   3   4   5   6   7   8   9   0   1    2    3    4    5    6    7    8
1170         i = sscanf(s, "%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld " _UFM _UFM _DFM _UFM _UFM _UFM _UFM,
1171                    &state,          // 3  %c
1172                    &ppid,           // 4  %d
1173                    &pgrp,           // 5  %d
1174                    &session,        // 6  %d
1175                    &nr,             // 7  %d
1176                    &tpgrp,          // 8  %d
1177                    &flags,          // 9  %lu
1178                    &minflt,         // 10 %lu
1179                    &cminflt,        // 11 %lu
1180                    &majflt,         // 12 %lu
1181                    &cmajflt,        // 13 %lu
1182                    &utime,          // 14 %lu
1183                    &stime,          // 15 %lu
1184                    &cutime,         // 16 %ld
1185                    &cstime,         // 17 %ld
1186                    &prio,           // 18 %ld
1187                    &nice,           // 19 %ld
1188                    &junk,           // 20 %ld
1189                    &it_real,        // 21 %ld
1190                    &start,          // 22 UINTX_FORMAT
1191                    &vsize,          // 23 UINTX_FORMAT
1192                    &rss,            // 24 INTX_FORMAT
1193                    &rsslim,         // 25 UINTX_FORMAT
1194                    &scodes,         // 26 UINTX_FORMAT
1195                    &ecode,          // 27 UINTX_FORMAT
1196                    &stack_start);   // 28 UINTX_FORMAT
1197       }
1198 
1199 #undef _UFM
1200 #undef _DFM
1201 
1202       if (i != 28 - 2) {
1203         assert(false, "Bad conversion from /proc/self/stat");
1204         // product mode - assume we are the primordial thread, good luck in the
1205         // embedded case.
1206         warning("Can't detect primordial thread stack location - bad conversion");
1207         stack_start = (uintptr_t) &rlim;
1208       }
1209     } else {
1210       // For some reason we can't open /proc/self/stat (for example, running on
1211       // FreeBSD with a Linux emulator, or inside chroot), this should work for
1212       // most cases, so don't abort:
1213       warning("Can't detect primordial thread stack location - no /proc/self/stat");
1214       stack_start = (uintptr_t) &rlim;
1215     }
1216   }
1217 
1218   // Now we have a pointer (stack_start) very close to the stack top, the
1219   // next thing to do is to figure out the exact location of stack top. We
1220   // can find out the virtual memory area that contains stack_start by
1221   // reading /proc/self/maps, it should be the last vma in /proc/self/maps,
1222   // and its upper limit is the real stack top. (again, this would fail if
1223   // running inside chroot, because /proc may not exist.)
1224 
1225   uintptr_t stack_top;
1226   address low, high;
1227   if (find_vma((address)stack_start, &low, &high)) {
1228     // success, "high" is the true stack top. (ignore "low", because initial
1229     // thread stack grows on demand, its real bottom is high - RLIMIT_STACK.)
1230     stack_top = (uintptr_t)high;
1231   } else {
1232     // failed, likely because /proc/self/maps does not exist
1233     warning("Can't detect primordial thread stack location - find_vma failed");
1234     // best effort: stack_start is normally within a few pages below the real
1235     // stack top, use it as stack top, and reduce stack size so we won't put
1236     // guard page outside stack.
1237     stack_top = stack_start;
1238     stack_size -= 16 * page_size();
1239   }
1240 
1241   // stack_top could be partially down the page so align it
1242   stack_top = align_up(stack_top, page_size());
1243 
1244   // Allowed stack value is minimum of max_size and what we derived from rlimit
1245   if (max_size > 0) {
1246     _initial_thread_stack_size = MIN2(max_size, stack_size);
1247   } else {
1248     // Accept the rlimit max, but if stack is unlimited then it will be huge, so
1249     // clamp it at 8MB as we do on Solaris
1250     _initial_thread_stack_size = MIN2(stack_size, 8*M);
1251   }
1252   _initial_thread_stack_size = align_down(_initial_thread_stack_size, page_size());
1253   _initial_thread_stack_bottom = (address)stack_top - _initial_thread_stack_size;
1254 
1255   assert(_initial_thread_stack_bottom < (address)stack_top, "overflow!");
1256 
1257   if (log_is_enabled(Info, os, thread)) {
1258     // See if we seem to be on primordial process thread
1259     bool primordial = uintptr_t(&rlim) > uintptr_t(_initial_thread_stack_bottom) &&
1260                       uintptr_t(&rlim) < stack_top;
1261 
1262     log_info(os, thread)("Capturing initial stack in %s thread: req. size: " SIZE_FORMAT "K, actual size: "
1263                          SIZE_FORMAT "K, top=" INTPTR_FORMAT ", bottom=" INTPTR_FORMAT,
1264                          primordial ? "primordial" : "user", max_size / K,  _initial_thread_stack_size / K,
1265                          stack_top, intptr_t(_initial_thread_stack_bottom));
1266   }
1267 }
1268 
1269 ////////////////////////////////////////////////////////////////////////////////
1270 // time support
1271 
1272 #ifndef SUPPORTS_CLOCK_MONOTONIC
1273 #error "Build platform doesn't support clock_gettime and related functionality"
1274 #endif
1275 
1276 // Time since start-up in seconds to a fine granularity.
1277 // Used by VMSelfDestructTimer and the MemProfiler.
1278 double os::elapsedTime() {
1279 
1280   return ((double)os::elapsed_counter()) / os::elapsed_frequency(); // nanosecond resolution
1281 }
1282 
1283 jlong os::elapsed_counter() {
1284   return javaTimeNanos() - initial_time_count;
1285 }
1286 
1287 jlong os::elapsed_frequency() {
1288   return NANOSECS_PER_SEC; // nanosecond resolution
1289 }
1290 
1291 bool os::supports_vtime() { return true; }
1292 bool os::enable_vtime()   { return false; }
1293 bool os::vtime_enabled()  { return false; }
1294 
1295 double os::elapsedVTime() {
1296   struct rusage usage;
1297   int retval = getrusage(RUSAGE_THREAD, &usage);
1298   if (retval == 0) {
1299     return (double) (usage.ru_utime.tv_sec + usage.ru_stime.tv_sec) + (double) (usage.ru_utime.tv_usec + usage.ru_stime.tv_usec) / (1000 * 1000);
1300   } else {
1301     // better than nothing, but not much
1302     return elapsedTime();
1303   }
1304 }
1305 
1306 jlong os::javaTimeMillis() {
1307   timeval time;
1308   int status = gettimeofday(&time, NULL);
1309   assert(status != -1, "linux error");
1310   return jlong(time.tv_sec) * 1000  +  jlong(time.tv_usec / 1000);
1311 }
1312 
1313 void os::javaTimeSystemUTC(jlong &seconds, jlong &nanos) {
1314   timeval time;
1315   int status = gettimeofday(&time, NULL);
1316   assert(status != -1, "linux error");
1317   seconds = jlong(time.tv_sec);
1318   nanos = jlong(time.tv_usec) * 1000;
1319 }
1320 
1321 void os::Linux::fast_thread_clock_init() {
1322   if (!UseLinuxPosixThreadCPUClocks) {
1323     return;
1324   }
1325   clockid_t clockid;
1326   struct timespec tp;
1327   int (*pthread_getcpuclockid_func)(pthread_t, clockid_t *) =
1328       (int(*)(pthread_t, clockid_t *)) dlsym(RTLD_DEFAULT, "pthread_getcpuclockid");
1329 
1330   // Switch to using fast clocks for thread cpu time if
1331   // the clock_getres() returns 0 error code.
1332   // Note, that some kernels may support the current thread
1333   // clock (CLOCK_THREAD_CPUTIME_ID) but not the clocks
1334   // returned by the pthread_getcpuclockid().
1335   // If the fast Posix clocks are supported then the clock_getres()
1336   // must return at least tp.tv_sec == 0 which means a resolution
1337   // better than 1 sec. This is extra check for reliability.
1338 
1339   if (pthread_getcpuclockid_func &&
1340       pthread_getcpuclockid_func(_main_thread, &clockid) == 0 &&
1341       os::Posix::clock_getres(clockid, &tp) == 0 && tp.tv_sec == 0) {
1342     _supports_fast_thread_cpu_time = true;
1343     _pthread_getcpuclockid = pthread_getcpuclockid_func;
1344   }
1345 }
1346 
1347 jlong os::javaTimeNanos() {
1348   if (os::supports_monotonic_clock()) {
1349     struct timespec tp;
1350     int status = os::Posix::clock_gettime(CLOCK_MONOTONIC, &tp);
1351     assert(status == 0, "gettime error");
1352     jlong result = jlong(tp.tv_sec) * (1000 * 1000 * 1000) + jlong(tp.tv_nsec);
1353     return result;
1354   } else {
1355     timeval time;
1356     int status = gettimeofday(&time, NULL);
1357     assert(status != -1, "linux error");
1358     jlong usecs = jlong(time.tv_sec) * (1000 * 1000) + jlong(time.tv_usec);
1359     return 1000 * usecs;
1360   }
1361 }
1362 
1363 void os::javaTimeNanos_info(jvmtiTimerInfo *info_ptr) {
1364   if (os::supports_monotonic_clock()) {
1365     info_ptr->max_value = ALL_64_BITS;
1366 
1367     // CLOCK_MONOTONIC - amount of time since some arbitrary point in the past
1368     info_ptr->may_skip_backward = false;      // not subject to resetting or drifting
1369     info_ptr->may_skip_forward = false;       // not subject to resetting or drifting
1370   } else {
1371     // gettimeofday - based on time in seconds since the Epoch thus does not wrap
1372     info_ptr->max_value = ALL_64_BITS;
1373 
1374     // gettimeofday is a real time clock so it skips
1375     info_ptr->may_skip_backward = true;
1376     info_ptr->may_skip_forward = true;
1377   }
1378 
1379   info_ptr->kind = JVMTI_TIMER_ELAPSED;                // elapsed not CPU time
1380 }
1381 
1382 // Return the real, user, and system times in seconds from an
1383 // arbitrary fixed point in the past.
1384 bool os::getTimesSecs(double* process_real_time,
1385                       double* process_user_time,
1386                       double* process_system_time) {
1387   struct tms ticks;
1388   clock_t real_ticks = times(&ticks);
1389 
1390   if (real_ticks == (clock_t) (-1)) {
1391     return false;
1392   } else {
1393     double ticks_per_second = (double) clock_tics_per_sec;
1394     *process_user_time = ((double) ticks.tms_utime) / ticks_per_second;
1395     *process_system_time = ((double) ticks.tms_stime) / ticks_per_second;
1396     *process_real_time = ((double) real_ticks) / ticks_per_second;
1397 
1398     return true;
1399   }
1400 }
1401 
1402 
1403 char * os::local_time_string(char *buf, size_t buflen) {
1404   struct tm t;
1405   time_t long_time;
1406   time(&long_time);
1407   localtime_r(&long_time, &t);
1408   jio_snprintf(buf, buflen, "%d-%02d-%02d %02d:%02d:%02d",
1409                t.tm_year + 1900, t.tm_mon + 1, t.tm_mday,
1410                t.tm_hour, t.tm_min, t.tm_sec);
1411   return buf;
1412 }
1413 
1414 struct tm* os::localtime_pd(const time_t* clock, struct tm*  res) {
1415   return localtime_r(clock, res);
1416 }
1417 
1418 ////////////////////////////////////////////////////////////////////////////////
1419 // runtime exit support
1420 
1421 // Note: os::shutdown() might be called very early during initialization, or
1422 // called from signal handler. Before adding something to os::shutdown(), make
1423 // sure it is async-safe and can handle partially initialized VM.
1424 void os::shutdown() {
1425 
1426   // allow PerfMemory to attempt cleanup of any persistent resources
1427   perfMemory_exit();
1428 
1429   // needs to remove object in file system
1430   AttachListener::abort();
1431 
1432   // flush buffered output, finish log files
1433   ostream_abort();
1434 
1435   // Check for abort hook
1436   abort_hook_t abort_hook = Arguments::abort_hook();
1437   if (abort_hook != NULL) {
1438     abort_hook();
1439   }
1440 
1441 }
1442 
1443 // Note: os::abort() might be called very early during initialization, or
1444 // called from signal handler. Before adding something to os::abort(), make
1445 // sure it is async-safe and can handle partially initialized VM.
1446 void os::abort(bool dump_core, void* siginfo, const void* context) {
1447   os::shutdown();
1448   if (dump_core) {
1449     if (DumpPrivateMappingsInCore) {
1450       ClassLoader::close_jrt_image();
1451     }
1452 #ifndef PRODUCT
1453     fdStream out(defaultStream::output_fd());
1454     out.print_raw("Current thread is ");
1455     char buf[16];
1456     jio_snprintf(buf, sizeof(buf), UINTX_FORMAT, os::current_thread_id());
1457     out.print_raw_cr(buf);
1458     out.print_raw_cr("Dumping core ...");
1459 #endif
1460     ::abort(); // dump core
1461   }
1462 
1463   ::exit(1);
1464 }
1465 
1466 // Die immediately, no exit hook, no abort hook, no cleanup.
1467 void os::die() {
1468   ::abort();
1469 }
1470 
1471 // thread_id is kernel thread id (similar to Solaris LWP id)
1472 intx os::current_thread_id() { return os::Linux::gettid(); }
1473 int os::current_process_id() {
1474   return ::getpid();
1475 }
1476 
1477 // DLL functions
1478 
1479 const char* os::dll_file_extension() { return ".so"; }
1480 
1481 // This must be hard coded because it's the system's temporary
1482 // directory not the java application's temp directory, ala java.io.tmpdir.
1483 const char* os::get_temp_directory() { return "/tmp"; }
1484 
1485 static bool file_exists(const char* filename) {
1486   struct stat statbuf;
1487   if (filename == NULL || strlen(filename) == 0) {
1488     return false;
1489   }
1490   return os::stat(filename, &statbuf) == 0;
1491 }
1492 
1493 // check if addr is inside libjvm.so
1494 bool os::address_is_in_vm(address addr) {
1495   static address libjvm_base_addr;
1496   Dl_info dlinfo;
1497 
1498   if (libjvm_base_addr == NULL) {
1499     if (dladdr(CAST_FROM_FN_PTR(void *, os::address_is_in_vm), &dlinfo) != 0) {
1500       libjvm_base_addr = (address)dlinfo.dli_fbase;
1501     }
1502     assert(libjvm_base_addr !=NULL, "Cannot obtain base address for libjvm");
1503   }
1504 
1505   if (dladdr((void *)addr, &dlinfo) != 0) {
1506     if (libjvm_base_addr == (address)dlinfo.dli_fbase) return true;
1507   }
1508 
1509   return false;
1510 }
1511 
1512 bool os::dll_address_to_function_name(address addr, char *buf,
1513                                       int buflen, int *offset,
1514                                       bool demangle) {
1515   // buf is not optional, but offset is optional
1516   assert(buf != NULL, "sanity check");
1517 
1518   Dl_info dlinfo;
1519 
1520   if (dladdr((void*)addr, &dlinfo) != 0) {
1521     // see if we have a matching symbol
1522     if (dlinfo.dli_saddr != NULL && dlinfo.dli_sname != NULL) {
1523       if (!(demangle && Decoder::demangle(dlinfo.dli_sname, buf, buflen))) {
1524         jio_snprintf(buf, buflen, "%s", dlinfo.dli_sname);
1525       }
1526       if (offset != NULL) *offset = addr - (address)dlinfo.dli_saddr;
1527       return true;
1528     }
1529     // no matching symbol so try for just file info
1530     if (dlinfo.dli_fname != NULL && dlinfo.dli_fbase != NULL) {
1531       if (Decoder::decode((address)(addr - (address)dlinfo.dli_fbase),
1532                           buf, buflen, offset, dlinfo.dli_fname, demangle)) {
1533         return true;
1534       }
1535     }
1536   }
1537 
1538   buf[0] = '\0';
1539   if (offset != NULL) *offset = -1;
1540   return false;
1541 }
1542 
1543 struct _address_to_library_name {
1544   address addr;          // input : memory address
1545   size_t  buflen;        //         size of fname
1546   char*   fname;         // output: library name
1547   address base;          //         library base addr
1548 };
1549 
1550 static int address_to_library_name_callback(struct dl_phdr_info *info,
1551                                             size_t size, void *data) {
1552   int i;
1553   bool found = false;
1554   address libbase = NULL;
1555   struct _address_to_library_name * d = (struct _address_to_library_name *)data;
1556 
1557   // iterate through all loadable segments
1558   for (i = 0; i < info->dlpi_phnum; i++) {
1559     address segbase = (address)(info->dlpi_addr + info->dlpi_phdr[i].p_vaddr);
1560     if (info->dlpi_phdr[i].p_type == PT_LOAD) {
1561       // base address of a library is the lowest address of its loaded
1562       // segments.
1563       if (libbase == NULL || libbase > segbase) {
1564         libbase = segbase;
1565       }
1566       // see if 'addr' is within current segment
1567       if (segbase <= d->addr &&
1568           d->addr < segbase + info->dlpi_phdr[i].p_memsz) {
1569         found = true;
1570       }
1571     }
1572   }
1573 
1574   // dlpi_name is NULL or empty if the ELF file is executable, return 0
1575   // so dll_address_to_library_name() can fall through to use dladdr() which
1576   // can figure out executable name from argv[0].
1577   if (found && info->dlpi_name && info->dlpi_name[0]) {
1578     d->base = libbase;
1579     if (d->fname) {
1580       jio_snprintf(d->fname, d->buflen, "%s", info->dlpi_name);
1581     }
1582     return 1;
1583   }
1584   return 0;
1585 }
1586 
1587 bool os::dll_address_to_library_name(address addr, char* buf,
1588                                      int buflen, int* offset) {
1589   // buf is not optional, but offset is optional
1590   assert(buf != NULL, "sanity check");
1591 
1592   Dl_info dlinfo;
1593   struct _address_to_library_name data;
1594 
1595   // There is a bug in old glibc dladdr() implementation that it could resolve
1596   // to wrong library name if the .so file has a base address != NULL. Here
1597   // we iterate through the program headers of all loaded libraries to find
1598   // out which library 'addr' really belongs to. This workaround can be
1599   // removed once the minimum requirement for glibc is moved to 2.3.x.
1600   data.addr = addr;
1601   data.fname = buf;
1602   data.buflen = buflen;
1603   data.base = NULL;
1604   int rslt = dl_iterate_phdr(address_to_library_name_callback, (void *)&data);
1605 
1606   if (rslt) {
1607     // buf already contains library name
1608     if (offset) *offset = addr - data.base;
1609     return true;
1610   }
1611   if (dladdr((void*)addr, &dlinfo) != 0) {
1612     if (dlinfo.dli_fname != NULL) {
1613       jio_snprintf(buf, buflen, "%s", dlinfo.dli_fname);
1614     }
1615     if (dlinfo.dli_fbase != NULL && offset != NULL) {
1616       *offset = addr - (address)dlinfo.dli_fbase;
1617     }
1618     return true;
1619   }
1620 
1621   buf[0] = '\0';
1622   if (offset) *offset = -1;
1623   return false;
1624 }
1625 
1626 // Loads .dll/.so and
1627 // in case of error it checks if .dll/.so was built for the
1628 // same architecture as Hotspot is running on
1629 
1630 
1631 // Remember the stack's state. The Linux dynamic linker will change
1632 // the stack to 'executable' at most once, so we must safepoint only once.
1633 bool os::Linux::_stack_is_executable = false;
1634 
1635 // VM operation that loads a library.  This is necessary if stack protection
1636 // of the Java stacks can be lost during loading the library.  If we
1637 // do not stop the Java threads, they can stack overflow before the stacks
1638 // are protected again.
1639 class VM_LinuxDllLoad: public VM_Operation {
1640  private:
1641   const char *_filename;
1642   char *_ebuf;
1643   int _ebuflen;
1644   void *_lib;
1645  public:
1646   VM_LinuxDllLoad(const char *fn, char *ebuf, int ebuflen) :
1647     _filename(fn), _ebuf(ebuf), _ebuflen(ebuflen), _lib(NULL) {}
1648   VMOp_Type type() const { return VMOp_LinuxDllLoad; }
1649   void doit() {
1650     _lib = os::Linux::dll_load_in_vmthread(_filename, _ebuf, _ebuflen);
1651     os::Linux::_stack_is_executable = true;
1652   }
1653   void* loaded_library() { return _lib; }
1654 };
1655 
1656 void * os::dll_load(const char *filename, char *ebuf, int ebuflen) {
1657   void * result = NULL;
1658   bool load_attempted = false;
1659 
1660   // Check whether the library to load might change execution rights
1661   // of the stack. If they are changed, the protection of the stack
1662   // guard pages will be lost. We need a safepoint to fix this.
1663   //
1664   // See Linux man page execstack(8) for more info.
1665   if (os::uses_stack_guard_pages() && !os::Linux::_stack_is_executable) {
1666     if (!ElfFile::specifies_noexecstack(filename)) {
1667       if (!is_init_completed()) {
1668         os::Linux::_stack_is_executable = true;
1669         // This is OK - No Java threads have been created yet, and hence no
1670         // stack guard pages to fix.
1671         //
1672         // Dynamic loader will make all stacks executable after
1673         // this function returns, and will not do that again.
1674         assert(Threads::number_of_threads() == 0, "no Java threads should exist yet.");
1675       } else {
1676         warning("You have loaded library %s which might have disabled stack guard. "
1677                 "The VM will try to fix the stack guard now.\n"
1678                 "It's highly recommended that you fix the library with "
1679                 "'execstack -c <libfile>', or link it with '-z noexecstack'.",
1680                 filename);
1681 
1682         assert(Thread::current()->is_Java_thread(), "must be Java thread");
1683         JavaThread *jt = JavaThread::current();
1684         if (jt->thread_state() != _thread_in_native) {
1685           // This happens when a compiler thread tries to load a hsdis-<arch>.so file
1686           // that requires ExecStack. Cannot enter safe point. Let's give up.
1687           warning("Unable to fix stack guard. Giving up.");
1688         } else {
1689           if (!LoadExecStackDllInVMThread) {
1690             // This is for the case where the DLL has an static
1691             // constructor function that executes JNI code. We cannot
1692             // load such DLLs in the VMThread.
1693             result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1694           }
1695 
1696           ThreadInVMfromNative tiv(jt);
1697           debug_only(VMNativeEntryWrapper vew;)
1698 
1699           VM_LinuxDllLoad op(filename, ebuf, ebuflen);
1700           VMThread::execute(&op);
1701           if (LoadExecStackDllInVMThread) {
1702             result = op.loaded_library();
1703           }
1704           load_attempted = true;
1705         }
1706       }
1707     }
1708   }
1709 
1710   if (!load_attempted) {
1711     result = os::Linux::dlopen_helper(filename, ebuf, ebuflen);
1712   }
1713 
1714   if (result != NULL) {
1715     // Successful loading
1716     return result;
1717   }
1718 
1719   Elf32_Ehdr elf_head;
1720   int diag_msg_max_length=ebuflen-strlen(ebuf);
1721   char* diag_msg_buf=ebuf+strlen(ebuf);
1722 
1723   if (diag_msg_max_length==0) {
1724     // No more space in ebuf for additional diagnostics message
1725     return NULL;
1726   }
1727 
1728 
1729   int file_descriptor= ::open(filename, O_RDONLY | O_NONBLOCK);
1730 
1731   if (file_descriptor < 0) {
1732     // Can't open library, report dlerror() message
1733     return NULL;
1734   }
1735 
1736   bool failed_to_read_elf_head=
1737     (sizeof(elf_head)!=
1738      (::read(file_descriptor, &elf_head,sizeof(elf_head))));
1739 
1740   ::close(file_descriptor);
1741   if (failed_to_read_elf_head) {
1742     // file i/o error - report dlerror() msg
1743     return NULL;
1744   }
1745 
1746   typedef struct {
1747     Elf32_Half    code;         // Actual value as defined in elf.h
1748     Elf32_Half    compat_class; // Compatibility of archs at VM's sense
1749     unsigned char elf_class;    // 32 or 64 bit
1750     unsigned char endianess;    // MSB or LSB
1751     char*         name;         // String representation
1752   } arch_t;
1753 
1754 #ifndef EM_486
1755   #define EM_486          6               /* Intel 80486 */
1756 #endif
1757 #ifndef EM_AARCH64
1758   #define EM_AARCH64    183               /* ARM AARCH64 */
1759 #endif
1760 
1761   static const arch_t arch_array[]={
1762     {EM_386,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1763     {EM_486,         EM_386,     ELFCLASS32, ELFDATA2LSB, (char*)"IA 32"},
1764     {EM_IA_64,       EM_IA_64,   ELFCLASS64, ELFDATA2LSB, (char*)"IA 64"},
1765     {EM_X86_64,      EM_X86_64,  ELFCLASS64, ELFDATA2LSB, (char*)"AMD 64"},
1766     {EM_SPARC,       EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1767     {EM_SPARC32PLUS, EM_SPARC,   ELFCLASS32, ELFDATA2MSB, (char*)"Sparc 32"},
1768     {EM_SPARCV9,     EM_SPARCV9, ELFCLASS64, ELFDATA2MSB, (char*)"Sparc v9 64"},
1769     {EM_PPC,         EM_PPC,     ELFCLASS32, ELFDATA2MSB, (char*)"Power PC 32"},
1770 #if defined(VM_LITTLE_ENDIAN)
1771     {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2LSB, (char*)"Power PC 64 LE"},
1772     {EM_SH,          EM_SH,      ELFCLASS32, ELFDATA2LSB, (char*)"SuperH"},
1773 #else
1774     {EM_PPC64,       EM_PPC64,   ELFCLASS64, ELFDATA2MSB, (char*)"Power PC 64"},
1775     {EM_SH,          EM_SH,      ELFCLASS32, ELFDATA2MSB, (char*)"SuperH BE"},
1776 #endif
1777     {EM_ARM,         EM_ARM,     ELFCLASS32,   ELFDATA2LSB, (char*)"ARM"},
1778     {EM_S390,        EM_S390,    ELFCLASSNONE, ELFDATA2MSB, (char*)"IBM System/390"},
1779     {EM_ALPHA,       EM_ALPHA,   ELFCLASS64, ELFDATA2LSB, (char*)"Alpha"},
1780     {EM_MIPS_RS3_LE, EM_MIPS_RS3_LE, ELFCLASS32, ELFDATA2LSB, (char*)"MIPSel"},
1781     {EM_MIPS,        EM_MIPS,    ELFCLASS32, ELFDATA2MSB, (char*)"MIPS"},
1782     {EM_PARISC,      EM_PARISC,  ELFCLASS32, ELFDATA2MSB, (char*)"PARISC"},
1783     {EM_68K,         EM_68K,     ELFCLASS32, ELFDATA2MSB, (char*)"M68k"},
1784     {EM_AARCH64,     EM_AARCH64, ELFCLASS64, ELFDATA2LSB, (char*)"AARCH64"},
1785   };
1786 
1787 #if  (defined IA32)
1788   static  Elf32_Half running_arch_code=EM_386;
1789 #elif   (defined AMD64) || (defined X32)
1790   static  Elf32_Half running_arch_code=EM_X86_64;
1791 #elif  (defined IA64)
1792   static  Elf32_Half running_arch_code=EM_IA_64;
1793 #elif  (defined __sparc) && (defined _LP64)
1794   static  Elf32_Half running_arch_code=EM_SPARCV9;
1795 #elif  (defined __sparc) && (!defined _LP64)
1796   static  Elf32_Half running_arch_code=EM_SPARC;
1797 #elif  (defined __powerpc64__)
1798   static  Elf32_Half running_arch_code=EM_PPC64;
1799 #elif  (defined __powerpc__)
1800   static  Elf32_Half running_arch_code=EM_PPC;
1801 #elif  (defined AARCH64)
1802   static  Elf32_Half running_arch_code=EM_AARCH64;
1803 #elif  (defined ARM)
1804   static  Elf32_Half running_arch_code=EM_ARM;
1805 #elif  (defined S390)
1806   static  Elf32_Half running_arch_code=EM_S390;
1807 #elif  (defined ALPHA)
1808   static  Elf32_Half running_arch_code=EM_ALPHA;
1809 #elif  (defined MIPSEL)
1810   static  Elf32_Half running_arch_code=EM_MIPS_RS3_LE;
1811 #elif  (defined PARISC)
1812   static  Elf32_Half running_arch_code=EM_PARISC;
1813 #elif  (defined MIPS)
1814   static  Elf32_Half running_arch_code=EM_MIPS;
1815 #elif  (defined M68K)
1816   static  Elf32_Half running_arch_code=EM_68K;
1817 #elif  (defined SH)
1818   static  Elf32_Half running_arch_code=EM_SH;
1819 #else
1820     #error Method os::dll_load requires that one of following is defined:\
1821         AARCH64, ALPHA, ARM, AMD64, IA32, IA64, M68K, MIPS, MIPSEL, PARISC, __powerpc__, __powerpc64__, S390, SH, __sparc
1822 #endif
1823 
1824   // Identify compatability class for VM's architecture and library's architecture
1825   // Obtain string descriptions for architectures
1826 
1827   arch_t lib_arch={elf_head.e_machine,0,elf_head.e_ident[EI_CLASS], elf_head.e_ident[EI_DATA], NULL};
1828   int running_arch_index=-1;
1829 
1830   for (unsigned int i=0; i < ARRAY_SIZE(arch_array); i++) {
1831     if (running_arch_code == arch_array[i].code) {
1832       running_arch_index    = i;
1833     }
1834     if (lib_arch.code == arch_array[i].code) {
1835       lib_arch.compat_class = arch_array[i].compat_class;
1836       lib_arch.name         = arch_array[i].name;
1837     }
1838   }
1839 
1840   assert(running_arch_index != -1,
1841          "Didn't find running architecture code (running_arch_code) in arch_array");
1842   if (running_arch_index == -1) {
1843     // Even though running architecture detection failed
1844     // we may still continue with reporting dlerror() message
1845     return NULL;
1846   }
1847 
1848   if (lib_arch.endianess != arch_array[running_arch_index].endianess) {
1849     ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: endianness mismatch)");
1850     return NULL;
1851   }
1852 
1853 #ifndef S390
1854   if (lib_arch.elf_class != arch_array[running_arch_index].elf_class) {
1855     ::snprintf(diag_msg_buf, diag_msg_max_length-1," (Possible cause: architecture word width mismatch)");
1856     return NULL;
1857   }
1858 #endif // !S390
1859 
1860   if (lib_arch.compat_class != arch_array[running_arch_index].compat_class) {
1861     if (lib_arch.name!=NULL) {
1862       ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1863                  " (Possible cause: can't load %s-bit .so on a %s-bit platform)",
1864                  lib_arch.name, arch_array[running_arch_index].name);
1865     } else {
1866       ::snprintf(diag_msg_buf, diag_msg_max_length-1,
1867                  " (Possible cause: can't load this .so (machine code=0x%x) on a %s-bit platform)",
1868                  lib_arch.code,
1869                  arch_array[running_arch_index].name);
1870     }
1871   }
1872 
1873   return NULL;
1874 }
1875 
1876 void * os::Linux::dlopen_helper(const char *filename, char *ebuf,
1877                                 int ebuflen) {
1878   void * result = ::dlopen(filename, RTLD_LAZY);
1879   if (result == NULL) {
1880     ::strncpy(ebuf, ::dlerror(), ebuflen - 1);
1881     ebuf[ebuflen-1] = '\0';
1882   }
1883   return result;
1884 }
1885 
1886 void * os::Linux::dll_load_in_vmthread(const char *filename, char *ebuf,
1887                                        int ebuflen) {
1888   void * result = NULL;
1889   if (LoadExecStackDllInVMThread) {
1890     result = dlopen_helper(filename, ebuf, ebuflen);
1891   }
1892 
1893   // Since 7019808, libjvm.so is linked with -noexecstack. If the VM loads a
1894   // library that requires an executable stack, or which does not have this
1895   // stack attribute set, dlopen changes the stack attribute to executable. The
1896   // read protection of the guard pages gets lost.
1897   //
1898   // Need to check _stack_is_executable again as multiple VM_LinuxDllLoad
1899   // may have been queued at the same time.
1900 
1901   if (!_stack_is_executable) {
1902     for (JavaThreadIteratorWithHandle jtiwh; JavaThread *jt = jtiwh.next(); ) {
1903       if (!jt->stack_guard_zone_unused() &&     // Stack not yet fully initialized
1904           jt->stack_guards_enabled()) {         // No pending stack overflow exceptions
1905         if (!os::guard_memory((char *)jt->stack_end(), jt->stack_guard_zone_size())) {
1906           warning("Attempt to reguard stack yellow zone failed.");
1907         }
1908       }
1909     }
1910   }
1911 
1912   return result;
1913 }
1914 
1915 void* os::dll_lookup(void* handle, const char* name) {
1916   void* res = dlsym(handle, name);
1917   return res;
1918 }
1919 
1920 void* os::get_default_process_handle() {
1921   return (void*)::dlopen(NULL, RTLD_LAZY);
1922 }
1923 
1924 static bool _print_ascii_file(const char* filename, outputStream* st, const char* hdr = NULL) {
1925   int fd = ::open(filename, O_RDONLY);
1926   if (fd == -1) {
1927     return false;
1928   }
1929 
1930   if (hdr != NULL) {
1931     st->print_cr("%s", hdr);
1932   }
1933 
1934   char buf[33];
1935   int bytes;
1936   buf[32] = '\0';
1937   while ((bytes = ::read(fd, buf, sizeof(buf)-1)) > 0) {
1938     st->print_raw(buf, bytes);
1939   }
1940 
1941   ::close(fd);
1942 
1943   return true;
1944 }
1945 
1946 void os::print_dll_info(outputStream *st) {
1947   st->print_cr("Dynamic libraries:");
1948 
1949   char fname[32];
1950   pid_t pid = os::Linux::gettid();
1951 
1952   jio_snprintf(fname, sizeof(fname), "/proc/%d/maps", pid);
1953 
1954   if (!_print_ascii_file(fname, st)) {
1955     st->print("Can not get library information for pid = %d\n", pid);
1956   }
1957 }
1958 
1959 int os::get_loaded_modules_info(os::LoadedModulesCallbackFunc callback, void *param) {
1960   FILE *procmapsFile = NULL;
1961 
1962   // Open the procfs maps file for the current process
1963   if ((procmapsFile = fopen("/proc/self/maps", "r")) != NULL) {
1964     // Allocate PATH_MAX for file name plus a reasonable size for other fields.
1965     char line[PATH_MAX + 100];
1966 
1967     // Read line by line from 'file'
1968     while (fgets(line, sizeof(line), procmapsFile) != NULL) {
1969       u8 base, top, offset, inode;
1970       char permissions[5];
1971       char device[6];
1972       char name[PATH_MAX + 1];
1973 
1974       // Parse fields from line
1975       sscanf(line, UINT64_FORMAT_X "-" UINT64_FORMAT_X " %4s " UINT64_FORMAT_X " %7s " INT64_FORMAT " %s",
1976              &base, &top, permissions, &offset, device, &inode, name);
1977 
1978       // Filter by device id '00:00' so that we only get file system mapped files.
1979       if (strcmp(device, "00:00") != 0) {
1980 
1981         // Call callback with the fields of interest
1982         if(callback(name, (address)base, (address)top, param)) {
1983           // Oops abort, callback aborted
1984           fclose(procmapsFile);
1985           return 1;
1986         }
1987       }
1988     }
1989     fclose(procmapsFile);
1990   }
1991   return 0;
1992 }
1993 
1994 void os::print_os_info_brief(outputStream* st) {
1995   os::Linux::print_distro_info(st);
1996 
1997   os::Posix::print_uname_info(st);
1998 
1999   os::Linux::print_libversion_info(st);
2000 
2001 }
2002 
2003 void os::print_os_info(outputStream* st) {
2004   st->print("OS:");
2005 
2006   os::Linux::print_distro_info(st);
2007 
2008   os::Posix::print_uname_info(st);
2009 
2010   // Print warning if unsafe chroot environment detected
2011   if (unsafe_chroot_detected) {
2012     st->print("WARNING!! ");
2013     st->print_cr("%s", unstable_chroot_error);
2014   }
2015 
2016   os::Linux::print_libversion_info(st);
2017 
2018   os::Posix::print_rlimit_info(st);
2019 
2020   os::Posix::print_load_average(st);
2021 
2022   os::Linux::print_full_memory_info(st);
2023 
2024   os::Linux::print_proc_sys_info(st);
2025 
2026   os::Linux::print_ld_preload_file(st);
2027 
2028   os::Linux::print_container_info(st);
2029 
2030   VM_Version::print_platform_virtualization_info(st);
2031 
2032   os::Linux::print_steal_info(st);
2033 }
2034 
2035 // Try to identify popular distros.
2036 // Most Linux distributions have a /etc/XXX-release file, which contains
2037 // the OS version string. Newer Linux distributions have a /etc/lsb-release
2038 // file that also contains the OS version string. Some have more than one
2039 // /etc/XXX-release file (e.g. Mandrake has both /etc/mandrake-release and
2040 // /etc/redhat-release.), so the order is important.
2041 // Any Linux that is based on Redhat (i.e. Oracle, Mandrake, Sun JDS...) have
2042 // their own specific XXX-release file as well as a redhat-release file.
2043 // Because of this the XXX-release file needs to be searched for before the
2044 // redhat-release file.
2045 // Since Red Hat and SuSE have an lsb-release file that is not very descriptive the
2046 // search for redhat-release / SuSE-release needs to be before lsb-release.
2047 // Since the lsb-release file is the new standard it needs to be searched
2048 // before the older style release files.
2049 // Searching system-release (Red Hat) and os-release (other Linuxes) are a
2050 // next to last resort.  The os-release file is a new standard that contains
2051 // distribution information and the system-release file seems to be an old
2052 // standard that has been replaced by the lsb-release and os-release files.
2053 // Searching for the debian_version file is the last resort.  It contains
2054 // an informative string like "6.0.6" or "wheezy/sid". Because of this
2055 // "Debian " is printed before the contents of the debian_version file.
2056 
2057 const char* distro_files[] = {
2058   "/etc/oracle-release",
2059   "/etc/mandriva-release",
2060   "/etc/mandrake-release",
2061   "/etc/sun-release",
2062   "/etc/redhat-release",
2063   "/etc/SuSE-release",
2064   "/etc/lsb-release",
2065   "/etc/turbolinux-release",
2066   "/etc/gentoo-release",
2067   "/etc/ltib-release",
2068   "/etc/angstrom-version",
2069   "/etc/system-release",
2070   "/etc/os-release",
2071   NULL };
2072 
2073 void os::Linux::print_distro_info(outputStream* st) {
2074   for (int i = 0;; i++) {
2075     const char* file = distro_files[i];
2076     if (file == NULL) {
2077       break;  // done
2078     }
2079     // If file prints, we found it.
2080     if (_print_ascii_file(file, st)) {
2081       return;
2082     }
2083   }
2084 
2085   if (file_exists("/etc/debian_version")) {
2086     st->print("Debian ");
2087     _print_ascii_file("/etc/debian_version", st);
2088   } else {
2089     st->print("Linux");
2090   }
2091   st->cr();
2092 }
2093 
2094 static void parse_os_info_helper(FILE* fp, char* distro, size_t length, bool get_first_line) {
2095   char buf[256];
2096   while (fgets(buf, sizeof(buf), fp)) {
2097     // Edit out extra stuff in expected format
2098     if (strstr(buf, "DISTRIB_DESCRIPTION=") != NULL || strstr(buf, "PRETTY_NAME=") != NULL) {
2099       char* ptr = strstr(buf, "\"");  // the name is in quotes
2100       if (ptr != NULL) {
2101         ptr++; // go beyond first quote
2102         char* nl = strchr(ptr, '\"');
2103         if (nl != NULL) *nl = '\0';
2104         strncpy(distro, ptr, length);
2105       } else {
2106         ptr = strstr(buf, "=");
2107         ptr++; // go beyond equals then
2108         char* nl = strchr(ptr, '\n');
2109         if (nl != NULL) *nl = '\0';
2110         strncpy(distro, ptr, length);
2111       }
2112       return;
2113     } else if (get_first_line) {
2114       char* nl = strchr(buf, '\n');
2115       if (nl != NULL) *nl = '\0';
2116       strncpy(distro, buf, length);
2117       return;
2118     }
2119   }
2120   // print last line and close
2121   char* nl = strchr(buf, '\n');
2122   if (nl != NULL) *nl = '\0';
2123   strncpy(distro, buf, length);
2124 }
2125 
2126 static void parse_os_info(char* distro, size_t length, const char* file) {
2127   FILE* fp = fopen(file, "r");
2128   if (fp != NULL) {
2129     // if suse format, print out first line
2130     bool get_first_line = (strcmp(file, "/etc/SuSE-release") == 0);
2131     parse_os_info_helper(fp, distro, length, get_first_line);
2132     fclose(fp);
2133   }
2134 }
2135 
2136 void os::get_summary_os_info(char* buf, size_t buflen) {
2137   for (int i = 0;; i++) {
2138     const char* file = distro_files[i];
2139     if (file == NULL) {
2140       break; // ran out of distro_files
2141     }
2142     if (file_exists(file)) {
2143       parse_os_info(buf, buflen, file);
2144       return;
2145     }
2146   }
2147   // special case for debian
2148   if (file_exists("/etc/debian_version")) {
2149     strncpy(buf, "Debian ", buflen);
2150     if (buflen > 7) {
2151       parse_os_info(&buf[7], buflen-7, "/etc/debian_version");
2152     }
2153   } else {
2154     strncpy(buf, "Linux", buflen);
2155   }
2156 }
2157 
2158 void os::Linux::print_libversion_info(outputStream* st) {
2159   // libc, pthread
2160   st->print("libc:");
2161   st->print("%s ", os::Linux::glibc_version());
2162   st->print("%s ", os::Linux::libpthread_version());
2163   st->cr();
2164 }
2165 
2166 void os::Linux::print_proc_sys_info(outputStream* st) {
2167   st->cr();
2168   st->print_cr("/proc/sys/kernel/threads-max (system-wide limit on the number of threads):");
2169   _print_ascii_file("/proc/sys/kernel/threads-max", st);
2170   st->cr();
2171   st->cr();
2172 
2173   st->print_cr("/proc/sys/vm/max_map_count (maximum number of memory map areas a process may have):");
2174   _print_ascii_file("/proc/sys/vm/max_map_count", st);
2175   st->cr();
2176   st->cr();
2177 
2178   st->print_cr("/proc/sys/kernel/pid_max (system-wide limit on number of process identifiers):");
2179   _print_ascii_file("/proc/sys/kernel/pid_max", st);
2180   st->cr();
2181   st->cr();
2182 }
2183 
2184 void os::Linux::print_full_memory_info(outputStream* st) {
2185   st->print("\n/proc/meminfo:\n");
2186   _print_ascii_file("/proc/meminfo", st);
2187   st->cr();
2188 }
2189 
2190 void os::Linux::print_ld_preload_file(outputStream* st) {
2191   _print_ascii_file("/etc/ld.so.preload", st, "\n/etc/ld.so.preload:");
2192   st->cr();
2193 }
2194 
2195 void os::Linux::print_container_info(outputStream* st) {
2196   if (!OSContainer::is_containerized()) {
2197     return;
2198   }
2199 
2200   st->print("container (cgroup) information:\n");
2201 
2202   const char *p_ct = OSContainer::container_type();
2203   st->print("container_type: %s\n", p_ct != NULL ? p_ct : "not supported");
2204 
2205   char *p = OSContainer::cpu_cpuset_cpus();
2206   st->print("cpu_cpuset_cpus: %s\n", p != NULL ? p : "not supported");
2207   free(p);
2208 
2209   p = OSContainer::cpu_cpuset_memory_nodes();
2210   st->print("cpu_memory_nodes: %s\n", p != NULL ? p : "not supported");
2211   free(p);
2212 
2213   int i = OSContainer::active_processor_count();
2214   st->print("active_processor_count: ");
2215   if (i > 0) {
2216     st->print("%d\n", i);
2217   } else {
2218     st->print("not supported\n");
2219   }
2220 
2221   i = OSContainer::cpu_quota();
2222   st->print("cpu_quota: ");
2223   if (i > 0) {
2224     st->print("%d\n", i);
2225   } else {
2226     st->print("%s\n", i == OSCONTAINER_ERROR ? "not supported" : "no quota");
2227   }
2228 
2229   i = OSContainer::cpu_period();
2230   st->print("cpu_period: ");
2231   if (i > 0) {
2232     st->print("%d\n", i);
2233   } else {
2234     st->print("%s\n", i == OSCONTAINER_ERROR ? "not supported" : "no period");
2235   }
2236 
2237   i = OSContainer::cpu_shares();
2238   st->print("cpu_shares: ");
2239   if (i > 0) {
2240     st->print("%d\n", i);
2241   } else {
2242     st->print("%s\n", i == OSCONTAINER_ERROR ? "not supported" : "no shares");
2243   }
2244 
2245   jlong j = OSContainer::memory_limit_in_bytes();
2246   st->print("memory_limit_in_bytes: ");
2247   if (j > 0) {
2248     st->print(JLONG_FORMAT "\n", j);
2249   } else {
2250     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
2251   }
2252 
2253   j = OSContainer::memory_and_swap_limit_in_bytes();
2254   st->print("memory_and_swap_limit_in_bytes: ");
2255   if (j > 0) {
2256     st->print(JLONG_FORMAT "\n", j);
2257   } else {
2258     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
2259   }
2260 
2261   j = OSContainer::memory_soft_limit_in_bytes();
2262   st->print("memory_soft_limit_in_bytes: ");
2263   if (j > 0) {
2264     st->print(JLONG_FORMAT "\n", j);
2265   } else {
2266     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
2267   }
2268 
2269   j = OSContainer::OSContainer::memory_usage_in_bytes();
2270   st->print("memory_usage_in_bytes: ");
2271   if (j > 0) {
2272     st->print(JLONG_FORMAT "\n", j);
2273   } else {
2274     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
2275   }
2276 
2277   j = OSContainer::OSContainer::memory_max_usage_in_bytes();
2278   st->print("memory_max_usage_in_bytes: ");
2279   if (j > 0) {
2280     st->print(JLONG_FORMAT "\n", j);
2281   } else {
2282     st->print("%s\n", j == OSCONTAINER_ERROR ? "not supported" : "unlimited");
2283   }
2284   st->cr();
2285 }
2286 
2287 void os::Linux::print_steal_info(outputStream* st) {
2288   if (has_initial_tick_info) {
2289     CPUPerfTicks pticks;
2290     bool res = os::Linux::get_tick_information(&pticks, -1);
2291 
2292     if (res && pticks.has_steal_ticks) {
2293       uint64_t steal_ticks_difference = pticks.steal - initial_steal_ticks;
2294       uint64_t total_ticks_difference = pticks.total - initial_total_ticks;
2295       double steal_ticks_perc = 0.0;
2296       if (total_ticks_difference != 0) {
2297         steal_ticks_perc = (double) steal_ticks_difference / total_ticks_difference;
2298       }
2299       st->print_cr("Steal ticks since vm start: " UINT64_FORMAT, steal_ticks_difference);
2300       st->print_cr("Steal ticks percentage since vm start:%7.3f", steal_ticks_perc);
2301     }
2302   }
2303 }
2304 
2305 void os::print_memory_info(outputStream* st) {
2306 
2307   st->print("Memory:");
2308   st->print(" %dk page", os::vm_page_size()>>10);
2309 
2310   // values in struct sysinfo are "unsigned long"
2311   struct sysinfo si;
2312   sysinfo(&si);
2313 
2314   st->print(", physical " UINT64_FORMAT "k",
2315             os::physical_memory() >> 10);
2316   st->print("(" UINT64_FORMAT "k free)",
2317             os::available_memory() >> 10);
2318   st->print(", swap " UINT64_FORMAT "k",
2319             ((jlong)si.totalswap * si.mem_unit) >> 10);
2320   st->print("(" UINT64_FORMAT "k free)",
2321             ((jlong)si.freeswap * si.mem_unit) >> 10);
2322   st->cr();
2323 }
2324 
2325 // Print the first "model name" line and the first "flags" line
2326 // that we find and nothing more. We assume "model name" comes
2327 // before "flags" so if we find a second "model name", then the
2328 // "flags" field is considered missing.
2329 static bool print_model_name_and_flags(outputStream* st, char* buf, size_t buflen) {
2330 #if defined(IA32) || defined(AMD64)
2331   // Other platforms have less repetitive cpuinfo files
2332   FILE *fp = fopen("/proc/cpuinfo", "r");
2333   if (fp) {
2334     while (!feof(fp)) {
2335       if (fgets(buf, buflen, fp)) {
2336         // Assume model name comes before flags
2337         bool model_name_printed = false;
2338         if (strstr(buf, "model name") != NULL) {
2339           if (!model_name_printed) {
2340             st->print_raw("CPU Model and flags from /proc/cpuinfo:\n");
2341             st->print_raw(buf);
2342             model_name_printed = true;
2343           } else {
2344             // model name printed but not flags?  Odd, just return
2345             fclose(fp);
2346             return true;
2347           }
2348         }
2349         // print the flags line too
2350         if (strstr(buf, "flags") != NULL) {
2351           st->print_raw(buf);
2352           fclose(fp);
2353           return true;
2354         }
2355       }
2356     }
2357     fclose(fp);
2358   }
2359 #endif // x86 platforms
2360   return false;
2361 }
2362 
2363 void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
2364   // Only print the model name if the platform provides this as a summary
2365   if (!print_model_name_and_flags(st, buf, buflen)) {
2366     st->print("\n/proc/cpuinfo:\n");
2367     if (!_print_ascii_file("/proc/cpuinfo", st)) {
2368       st->print_cr("  <Not Available>");
2369     }
2370   }
2371 }
2372 
2373 #if defined(AMD64) || defined(IA32) || defined(X32)
2374 const char* search_string = "model name";
2375 #elif defined(M68K)
2376 const char* search_string = "CPU";
2377 #elif defined(PPC64)
2378 const char* search_string = "cpu";
2379 #elif defined(S390)
2380 const char* search_string = "machine =";
2381 #elif defined(SPARC)
2382 const char* search_string = "cpu";
2383 #else
2384 const char* search_string = "Processor";
2385 #endif
2386 
2387 // Parses the cpuinfo file for string representing the model name.
2388 void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
2389   FILE* fp = fopen("/proc/cpuinfo", "r");
2390   if (fp != NULL) {
2391     while (!feof(fp)) {
2392       char buf[256];
2393       if (fgets(buf, sizeof(buf), fp)) {
2394         char* start = strstr(buf, search_string);
2395         if (start != NULL) {
2396           char *ptr = start + strlen(search_string);
2397           char *end = buf + strlen(buf);
2398           while (ptr != end) {
2399              // skip whitespace and colon for the rest of the name.
2400              if (*ptr != ' ' && *ptr != '\t' && *ptr != ':') {
2401                break;
2402              }
2403              ptr++;
2404           }
2405           if (ptr != end) {
2406             // reasonable string, get rid of newline and keep the rest
2407             char* nl = strchr(buf, '\n');
2408             if (nl != NULL) *nl = '\0';
2409             strncpy(cpuinfo, ptr, length);
2410             fclose(fp);
2411             return;
2412           }
2413         }
2414       }
2415     }
2416     fclose(fp);
2417   }
2418   // cpuinfo not found or parsing failed, just print generic string.  The entire
2419   // /proc/cpuinfo file will be printed later in the file (or enough of it for x86)
2420 #if   defined(AARCH64)
2421   strncpy(cpuinfo, "AArch64", length);
2422 #elif defined(AMD64)
2423   strncpy(cpuinfo, "x86_64", length);
2424 #elif defined(ARM)  // Order wrt. AARCH64 is relevant!
2425   strncpy(cpuinfo, "ARM", length);
2426 #elif defined(IA32)
2427   strncpy(cpuinfo, "x86_32", length);
2428 #elif defined(IA64)
2429   strncpy(cpuinfo, "IA64", length);
2430 #elif defined(PPC)
2431   strncpy(cpuinfo, "PPC64", length);
2432 #elif defined(S390)
2433   strncpy(cpuinfo, "S390", length);
2434 #elif defined(SPARC)
2435   strncpy(cpuinfo, "sparcv9", length);
2436 #elif defined(ZERO_LIBARCH)
2437   strncpy(cpuinfo, ZERO_LIBARCH, length);
2438 #else
2439   strncpy(cpuinfo, "unknown", length);
2440 #endif
2441 }
2442 
2443 static void print_signal_handler(outputStream* st, int sig,
2444                                  char* buf, size_t buflen);
2445 
2446 void os::print_signal_handlers(outputStream* st, char* buf, size_t buflen) {
2447   st->print_cr("Signal Handlers:");
2448   print_signal_handler(st, SIGSEGV, buf, buflen);
2449   print_signal_handler(st, SIGBUS , buf, buflen);
2450   print_signal_handler(st, SIGFPE , buf, buflen);
2451   print_signal_handler(st, SIGPIPE, buf, buflen);
2452   print_signal_handler(st, SIGXFSZ, buf, buflen);
2453   print_signal_handler(st, SIGILL , buf, buflen);
2454   print_signal_handler(st, SR_signum, buf, buflen);
2455   print_signal_handler(st, SHUTDOWN1_SIGNAL, buf, buflen);
2456   print_signal_handler(st, SHUTDOWN2_SIGNAL , buf, buflen);
2457   print_signal_handler(st, SHUTDOWN3_SIGNAL , buf, buflen);
2458   print_signal_handler(st, BREAK_SIGNAL, buf, buflen);
2459 #if defined(PPC64)
2460   print_signal_handler(st, SIGTRAP, buf, buflen);
2461 #endif
2462 }
2463 
2464 static char saved_jvm_path[MAXPATHLEN] = {0};
2465 
2466 // Find the full path to the current module, libjvm.so
2467 void os::jvm_path(char *buf, jint buflen) {
2468   // Error checking.
2469   if (buflen < MAXPATHLEN) {
2470     assert(false, "must use a large-enough buffer");
2471     buf[0] = '\0';
2472     return;
2473   }
2474   // Lazy resolve the path to current module.
2475   if (saved_jvm_path[0] != 0) {
2476     strcpy(buf, saved_jvm_path);
2477     return;
2478   }
2479 
2480   char dli_fname[MAXPATHLEN];
2481   bool ret = dll_address_to_library_name(
2482                                          CAST_FROM_FN_PTR(address, os::jvm_path),
2483                                          dli_fname, sizeof(dli_fname), NULL);
2484   assert(ret, "cannot locate libjvm");
2485   char *rp = NULL;
2486   if (ret && dli_fname[0] != '\0') {
2487     rp = os::Posix::realpath(dli_fname, buf, buflen);
2488   }
2489   if (rp == NULL) {
2490     return;
2491   }
2492 
2493   if (Arguments::sun_java_launcher_is_altjvm()) {
2494     // Support for the java launcher's '-XXaltjvm=<path>' option. Typical
2495     // value for buf is "<JAVA_HOME>/jre/lib/<vmtype>/libjvm.so".
2496     // If "/jre/lib/" appears at the right place in the string, then
2497     // assume we are installed in a JDK and we're done. Otherwise, check
2498     // for a JAVA_HOME environment variable and fix up the path so it
2499     // looks like libjvm.so is installed there (append a fake suffix
2500     // hotspot/libjvm.so).
2501     const char *p = buf + strlen(buf) - 1;
2502     for (int count = 0; p > buf && count < 5; ++count) {
2503       for (--p; p > buf && *p != '/'; --p)
2504         /* empty */ ;
2505     }
2506 
2507     if (strncmp(p, "/jre/lib/", 9) != 0) {
2508       // Look for JAVA_HOME in the environment.
2509       char* java_home_var = ::getenv("JAVA_HOME");
2510       if (java_home_var != NULL && java_home_var[0] != 0) {
2511         char* jrelib_p;
2512         int len;
2513 
2514         // Check the current module name "libjvm.so".
2515         p = strrchr(buf, '/');
2516         if (p == NULL) {
2517           return;
2518         }
2519         assert(strstr(p, "/libjvm") == p, "invalid library name");
2520 
2521         rp = os::Posix::realpath(java_home_var, buf, buflen);
2522         if (rp == NULL) {
2523           return;
2524         }
2525 
2526         // determine if this is a legacy image or modules image
2527         // modules image doesn't have "jre" subdirectory
2528         len = strlen(buf);
2529         assert(len < buflen, "Ran out of buffer room");
2530         jrelib_p = buf + len;
2531         snprintf(jrelib_p, buflen-len, "/jre/lib");
2532         if (0 != access(buf, F_OK)) {
2533           snprintf(jrelib_p, buflen-len, "/lib");
2534         }
2535 
2536         if (0 == access(buf, F_OK)) {
2537           // Use current module name "libjvm.so"
2538           len = strlen(buf);
2539           snprintf(buf + len, buflen-len, "/hotspot/libjvm.so");
2540         } else {
2541           // Go back to path of .so
2542           rp = os::Posix::realpath(dli_fname, buf, buflen);
2543           if (rp == NULL) {
2544             return;
2545           }
2546         }
2547       }
2548     }
2549   }
2550 
2551   strncpy(saved_jvm_path, buf, MAXPATHLEN);
2552   saved_jvm_path[MAXPATHLEN - 1] = '\0';
2553 }
2554 
2555 void os::print_jni_name_prefix_on(outputStream* st, int args_size) {
2556   // no prefix required, not even "_"
2557 }
2558 
2559 void os::print_jni_name_suffix_on(outputStream* st, int args_size) {
2560   // no suffix required
2561 }
2562 
2563 ////////////////////////////////////////////////////////////////////////////////
2564 // sun.misc.Signal support
2565 
2566 static volatile jint sigint_count = 0;
2567 
2568 static void UserHandler(int sig, void *siginfo, void *context) {
2569   // 4511530 - sem_post is serialized and handled by the manager thread. When
2570   // the program is interrupted by Ctrl-C, SIGINT is sent to every thread. We
2571   // don't want to flood the manager thread with sem_post requests.
2572   if (sig == SIGINT && Atomic::add(1, &sigint_count) > 1) {
2573     return;
2574   }
2575 
2576   // Ctrl-C is pressed during error reporting, likely because the error
2577   // handler fails to abort. Let VM die immediately.
2578   if (sig == SIGINT && VMError::is_error_reported()) {
2579     os::die();
2580   }
2581 
2582   os::signal_notify(sig);
2583 }
2584 
2585 void* os::user_handler() {
2586   return CAST_FROM_FN_PTR(void*, UserHandler);
2587 }
2588 
2589 extern "C" {
2590   typedef void (*sa_handler_t)(int);
2591   typedef void (*sa_sigaction_t)(int, siginfo_t *, void *);
2592 }
2593 
2594 void* os::signal(int signal_number, void* handler) {
2595   struct sigaction sigAct, oldSigAct;
2596 
2597   sigfillset(&(sigAct.sa_mask));
2598   sigAct.sa_flags   = SA_RESTART|SA_SIGINFO;
2599   sigAct.sa_handler = CAST_TO_FN_PTR(sa_handler_t, handler);
2600 
2601   if (sigaction(signal_number, &sigAct, &oldSigAct)) {
2602     // -1 means registration failed
2603     return (void *)-1;
2604   }
2605 
2606   return CAST_FROM_FN_PTR(void*, oldSigAct.sa_handler);
2607 }
2608 
2609 void os::signal_raise(int signal_number) {
2610   ::raise(signal_number);
2611 }
2612 
2613 // The following code is moved from os.cpp for making this
2614 // code platform specific, which it is by its very nature.
2615 
2616 // Will be modified when max signal is changed to be dynamic
2617 int os::sigexitnum_pd() {
2618   return NSIG;
2619 }
2620 
2621 // a counter for each possible signal value
2622 static volatile jint pending_signals[NSIG+1] = { 0 };
2623 
2624 // Linux(POSIX) specific hand shaking semaphore.
2625 static Semaphore* sig_sem = NULL;
2626 static PosixSemaphore sr_semaphore;
2627 
2628 static void jdk_misc_signal_init() {
2629   // Initialize signal structures
2630   ::memset((void*)pending_signals, 0, sizeof(pending_signals));
2631 
2632   // Initialize signal semaphore
2633   sig_sem = new Semaphore();
2634 }
2635 
2636 void os::signal_notify(int sig) {
2637   if (sig_sem != NULL) {
2638     Atomic::inc(&pending_signals[sig]);
2639     sig_sem->signal();
2640   } else {
2641     // Signal thread is not created with ReduceSignalUsage and jdk_misc_signal_init
2642     // initialization isn't called.
2643     assert(ReduceSignalUsage, "signal semaphore should be created");
2644   }
2645 }
2646 
2647 static int check_pending_signals() {
2648   Atomic::store(0, &sigint_count);
2649   for (;;) {
2650     for (int i = 0; i < NSIG + 1; i++) {
2651       jint n = pending_signals[i];
2652       if (n > 0 && n == Atomic::cmpxchg(n - 1, &pending_signals[i], n)) {
2653         return i;
2654       }
2655     }
2656     JavaThread *thread = JavaThread::current();
2657     ThreadBlockInVM tbivm(thread);
2658 
2659     bool threadIsSuspended;
2660     do {
2661       thread->set_suspend_equivalent();
2662       // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
2663       sig_sem->wait();
2664 
2665       // were we externally suspended while we were waiting?
2666       threadIsSuspended = thread->handle_special_suspend_equivalent_condition();
2667       if (threadIsSuspended) {
2668         // The semaphore has been incremented, but while we were waiting
2669         // another thread suspended us. We don't want to continue running
2670         // while suspended because that would surprise the thread that
2671         // suspended us.
2672         sig_sem->signal();
2673 
2674         thread->java_suspend_self();
2675       }
2676     } while (threadIsSuspended);
2677   }
2678 }
2679 
2680 int os::signal_wait() {
2681   return check_pending_signals();
2682 }
2683 
2684 ////////////////////////////////////////////////////////////////////////////////
2685 // Virtual Memory
2686 
2687 int os::vm_page_size() {
2688   // Seems redundant as all get out
2689   assert(os::Linux::page_size() != -1, "must call os::init");
2690   return os::Linux::page_size();
2691 }
2692 
2693 // Solaris allocates memory by pages.
2694 int os::vm_allocation_granularity() {
2695   assert(os::Linux::page_size() != -1, "must call os::init");
2696   return os::Linux::page_size();
2697 }
2698 
2699 // Rationale behind this function:
2700 //  current (Mon Apr 25 20:12:18 MSD 2005) oprofile drops samples without executable
2701 //  mapping for address (see lookup_dcookie() in the kernel module), thus we cannot get
2702 //  samples for JITted code. Here we create private executable mapping over the code cache
2703 //  and then we can use standard (well, almost, as mapping can change) way to provide
2704 //  info for the reporting script by storing timestamp and location of symbol
2705 void linux_wrap_code(char* base, size_t size) {
2706   static volatile jint cnt = 0;
2707 
2708   if (!UseOprofile) {
2709     return;
2710   }
2711 
2712   char buf[PATH_MAX+1];
2713   int num = Atomic::add(1, &cnt);
2714 
2715   snprintf(buf, sizeof(buf), "%s/hs-vm-%d-%d",
2716            os::get_temp_directory(), os::current_process_id(), num);
2717   unlink(buf);
2718 
2719   int fd = ::open(buf, O_CREAT | O_RDWR, S_IRWXU);
2720 
2721   if (fd != -1) {
2722     off_t rv = ::lseek(fd, size-2, SEEK_SET);
2723     if (rv != (off_t)-1) {
2724       if (::write(fd, "", 1) == 1) {
2725         mmap(base, size,
2726              PROT_READ|PROT_WRITE|PROT_EXEC,
2727              MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE, fd, 0);
2728       }
2729     }
2730     ::close(fd);
2731     unlink(buf);
2732   }
2733 }
2734 
2735 static bool recoverable_mmap_error(int err) {
2736   // See if the error is one we can let the caller handle. This
2737   // list of errno values comes from JBS-6843484. I can't find a
2738   // Linux man page that documents this specific set of errno
2739   // values so while this list currently matches Solaris, it may
2740   // change as we gain experience with this failure mode.
2741   switch (err) {
2742   case EBADF:
2743   case EINVAL:
2744   case ENOTSUP:
2745     // let the caller deal with these errors
2746     return true;
2747 
2748   default:
2749     // Any remaining errors on this OS can cause our reserved mapping
2750     // to be lost. That can cause confusion where different data
2751     // structures think they have the same memory mapped. The worst
2752     // scenario is if both the VM and a library think they have the
2753     // same memory mapped.
2754     return false;
2755   }
2756 }
2757 
2758 static void warn_fail_commit_memory(char* addr, size_t size, bool exec,
2759                                     int err) {
2760   warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2761           ", %d) failed; error='%s' (errno=%d)", p2i(addr), size, exec,
2762           os::strerror(err), err);
2763 }
2764 
2765 static void warn_fail_commit_memory(char* addr, size_t size,
2766                                     size_t alignment_hint, bool exec,
2767                                     int err) {
2768   warning("INFO: os::commit_memory(" PTR_FORMAT ", " SIZE_FORMAT
2769           ", " SIZE_FORMAT ", %d) failed; error='%s' (errno=%d)", p2i(addr), size,
2770           alignment_hint, exec, os::strerror(err), err);
2771 }
2772 
2773 // NOTE: Linux kernel does not really reserve the pages for us.
2774 //       All it does is to check if there are enough free pages
2775 //       left at the time of mmap(). This could be a potential
2776 //       problem.
2777 int os::Linux::commit_memory_impl(char* addr, size_t size, bool exec) {
2778   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
2779   uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
2780                                      MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
2781   if (res != (uintptr_t) MAP_FAILED) {
2782     if (UseNUMAInterleaving) {
2783       numa_make_global(addr, size);
2784     }
2785     return 0;
2786   }
2787 
2788   int err = errno;  // save errno from mmap() call above
2789 
2790   if (!recoverable_mmap_error(err)) {
2791     warn_fail_commit_memory(addr, size, exec, err);
2792     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "committing reserved memory.");
2793   }
2794 
2795   return err;
2796 }
2797 
2798 bool os::pd_commit_memory(char* addr, size_t size, bool exec) {
2799   return os::Linux::commit_memory_impl(addr, size, exec) == 0;
2800 }
2801 
2802 void os::pd_commit_memory_or_exit(char* addr, size_t size, bool exec,
2803                                   const char* mesg) {
2804   assert(mesg != NULL, "mesg must be specified");
2805   int err = os::Linux::commit_memory_impl(addr, size, exec);
2806   if (err != 0) {
2807     // the caller wants all commit errors to exit with the specified mesg:
2808     warn_fail_commit_memory(addr, size, exec, err);
2809     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2810   }
2811 }
2812 
2813 // Define MAP_HUGETLB here so we can build HotSpot on old systems.
2814 #ifndef MAP_HUGETLB
2815   #define MAP_HUGETLB 0x40000
2816 #endif
2817 
2818 // Define MADV_HUGEPAGE here so we can build HotSpot on old systems.
2819 #ifndef MADV_HUGEPAGE
2820   #define MADV_HUGEPAGE 14
2821 #endif
2822 
2823 int os::Linux::commit_memory_impl(char* addr, size_t size,
2824                                   size_t alignment_hint, bool exec) {
2825   int err = os::Linux::commit_memory_impl(addr, size, exec);
2826   if (err == 0) {
2827     realign_memory(addr, size, alignment_hint);
2828   }
2829   return err;
2830 }
2831 
2832 bool os::pd_commit_memory(char* addr, size_t size, size_t alignment_hint,
2833                           bool exec) {
2834   return os::Linux::commit_memory_impl(addr, size, alignment_hint, exec) == 0;
2835 }
2836 
2837 void os::pd_commit_memory_or_exit(char* addr, size_t size,
2838                                   size_t alignment_hint, bool exec,
2839                                   const char* mesg) {
2840   assert(mesg != NULL, "mesg must be specified");
2841   int err = os::Linux::commit_memory_impl(addr, size, alignment_hint, exec);
2842   if (err != 0) {
2843     // the caller wants all commit errors to exit with the specified mesg:
2844     warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
2845     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "%s", mesg);
2846   }
2847 }
2848 
2849 void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
2850   if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) {
2851     // We don't check the return value: madvise(MADV_HUGEPAGE) may not
2852     // be supported or the memory may already be backed by huge pages.
2853     ::madvise(addr, bytes, MADV_HUGEPAGE);
2854   }
2855 }
2856 
2857 void os::pd_free_memory(char *addr, size_t bytes, size_t alignment_hint) {
2858   // This method works by doing an mmap over an existing mmaping and effectively discarding
2859   // the existing pages. However it won't work for SHM-based large pages that cannot be
2860   // uncommitted at all. We don't do anything in this case to avoid creating a segment with
2861   // small pages on top of the SHM segment. This method always works for small pages, so we
2862   // allow that in any case.
2863   if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
2864     commit_memory(addr, bytes, alignment_hint, !ExecMem);
2865   }
2866 }
2867 
2868 void os::numa_make_global(char *addr, size_t bytes) {
2869   Linux::numa_interleave_memory(addr, bytes);
2870 }
2871 
2872 // Define for numa_set_bind_policy(int). Setting the argument to 0 will set the
2873 // bind policy to MPOL_PREFERRED for the current thread.
2874 #define USE_MPOL_PREFERRED 0
2875 
2876 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
2877   // To make NUMA and large pages more robust when both enabled, we need to ease
2878   // the requirements on where the memory should be allocated. MPOL_BIND is the
2879   // default policy and it will force memory to be allocated on the specified
2880   // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on
2881   // the specified node, but will not force it. Using this policy will prevent
2882   // getting SIGBUS when trying to allocate large pages on NUMA nodes with no
2883   // free large pages.
2884   Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
2885   Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
2886 }
2887 
2888 bool os::numa_topology_changed() { return false; }
2889 
2890 size_t os::numa_get_groups_num() {
2891   // Return just the number of nodes in which it's possible to allocate memory
2892   // (in numa terminology, configured nodes).
2893   return Linux::numa_num_configured_nodes();
2894 }
2895 
2896 int os::numa_get_group_id() {
2897   int cpu_id = Linux::sched_getcpu();
2898   if (cpu_id != -1) {
2899     int lgrp_id = Linux::get_node_by_cpu(cpu_id);
2900     if (lgrp_id != -1) {
2901       return lgrp_id;
2902     }
2903   }
2904   return 0;
2905 }
2906 
2907 int os::Linux::get_existing_num_nodes() {
2908   int node;
2909   int highest_node_number = Linux::numa_max_node();
2910   int num_nodes = 0;
2911 
2912   // Get the total number of nodes in the system including nodes without memory.
2913   for (node = 0; node <= highest_node_number; node++) {
2914     if (is_node_in_existing_nodes(node)) {
2915       num_nodes++;
2916     }
2917   }
2918   return num_nodes;
2919 }
2920 
2921 size_t os::numa_get_leaf_groups(int *ids, size_t size) {
2922   int highest_node_number = Linux::numa_max_node();
2923   size_t i = 0;
2924 
2925   // Map all node ids in which it is possible to allocate memory. Also nodes are
2926   // not always consecutively available, i.e. available from 0 to the highest
2927   // node number. If the nodes have been bound explicitly using numactl membind,
2928   // then allocate memory from those nodes only.
2929   for (int node = 0; node <= highest_node_number; node++) {
2930     if (Linux::is_node_in_bound_nodes((unsigned int)node)) {
2931       ids[i++] = node;
2932     }
2933   }
2934   return i;
2935 }
2936 
2937 bool os::get_page_info(char *start, page_info* info) {
2938   return false;
2939 }
2940 
2941 char *os::scan_pages(char *start, char* end, page_info* page_expected,
2942                      page_info* page_found) {
2943   return end;
2944 }
2945 
2946 
2947 int os::Linux::sched_getcpu_syscall(void) {
2948   unsigned int cpu = 0;
2949   int retval = -1;
2950 
2951 #if defined(IA32)
2952   #ifndef SYS_getcpu
2953     #define SYS_getcpu 318
2954   #endif
2955   retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
2956 #elif defined(AMD64)
2957 // Unfortunately we have to bring all these macros here from vsyscall.h
2958 // to be able to compile on old linuxes.
2959   #define __NR_vgetcpu 2
2960   #define VSYSCALL_START (-10UL << 20)
2961   #define VSYSCALL_SIZE 1024
2962   #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
2963   typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
2964   vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
2965   retval = vgetcpu(&cpu, NULL, NULL);
2966 #endif
2967 
2968   return (retval == -1) ? retval : cpu;
2969 }
2970 
2971 void os::Linux::sched_getcpu_init() {
2972   // sched_getcpu() should be in libc.
2973   set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2974                                   dlsym(RTLD_DEFAULT, "sched_getcpu")));
2975 
2976   // If it's not, try a direct syscall.
2977   if (sched_getcpu() == -1) {
2978     set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
2979                                     (void*)&sched_getcpu_syscall));
2980   }
2981 
2982   if (sched_getcpu() == -1) {
2983     vm_exit_during_initialization("getcpu(2) system call not supported by kernel");
2984   }
2985 }
2986 
2987 // Something to do with the numa-aware allocator needs these symbols
2988 extern "C" JNIEXPORT void numa_warn(int number, char *where, ...) { }
2989 extern "C" JNIEXPORT void numa_error(char *where) { }
2990 
2991 static void* dlvsym_if_available(void* handle, const char* name, const char* version) {
2992   typedef void* (*dlvsym_func_type)(void* handle, const char* name, const char* version);
2993   static dlvsym_func_type dlvsym_func;
2994   static bool initialized = false;
2995 
2996   if (!initialized) {
2997     dlvsym_func = (dlvsym_func_type)dlsym(RTLD_NEXT, "dlvsym");
2998     initialized = true;
2999   }
3000 
3001   if (dlvsym_func != NULL) {
3002     void *f = dlvsym_func(handle, name, version);
3003     if (f != NULL) {
3004       return f;
3005     }
3006   }
3007 
3008   return dlsym(handle, name);
3009 }
3010 
3011 // Handle request to load libnuma symbol version 1.1 (API v1). If it fails
3012 // load symbol from base version instead.
3013 void* os::Linux::libnuma_dlsym(void* handle, const char *name) {
3014   return dlvsym_if_available(handle, name, "libnuma_1.1");
3015 }
3016 
3017 // Handle request to load libnuma symbol version 1.2 (API v2) only.
3018 // Return NULL if the symbol is not defined in this particular version.
3019 void* os::Linux::libnuma_v2_dlsym(void* handle, const char* name) {
3020   return dlvsym_if_available(handle, name, "libnuma_1.2");
3021 }
3022 
3023 bool os::Linux::libnuma_init() {
3024   if (sched_getcpu() != -1) { // Requires sched_getcpu() support
3025     void *handle = dlopen("libnuma.so.1", RTLD_LAZY);
3026     if (handle != NULL) {
3027       set_numa_node_to_cpus(CAST_TO_FN_PTR(numa_node_to_cpus_func_t,
3028                                            libnuma_dlsym(handle, "numa_node_to_cpus")));
3029       set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
3030                                        libnuma_dlsym(handle, "numa_max_node")));
3031       set_numa_num_configured_nodes(CAST_TO_FN_PTR(numa_num_configured_nodes_func_t,
3032                                                    libnuma_dlsym(handle, "numa_num_configured_nodes")));
3033       set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
3034                                         libnuma_dlsym(handle, "numa_available")));
3035       set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
3036                                             libnuma_dlsym(handle, "numa_tonode_memory")));
3037       set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
3038                                                 libnuma_dlsym(handle, "numa_interleave_memory")));
3039       set_numa_interleave_memory_v2(CAST_TO_FN_PTR(numa_interleave_memory_v2_func_t,
3040                                                 libnuma_v2_dlsym(handle, "numa_interleave_memory")));
3041       set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
3042                                               libnuma_dlsym(handle, "numa_set_bind_policy")));
3043       set_numa_bitmask_isbitset(CAST_TO_FN_PTR(numa_bitmask_isbitset_func_t,
3044                                                libnuma_dlsym(handle, "numa_bitmask_isbitset")));
3045       set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
3046                                        libnuma_dlsym(handle, "numa_distance")));
3047       set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t,
3048                                           libnuma_v2_dlsym(handle, "numa_get_membind")));
3049       set_numa_get_interleave_mask(CAST_TO_FN_PTR(numa_get_interleave_mask_func_t,
3050                                                   libnuma_v2_dlsym(handle, "numa_get_interleave_mask")));
3051 
3052       if (numa_available() != -1) {
3053         set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
3054         set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr"));
3055         set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr"));
3056         set_numa_interleave_bitmask(_numa_get_interleave_mask());
3057         set_numa_membind_bitmask(_numa_get_membind());
3058         // Create an index -> node mapping, since nodes are not always consecutive
3059         _nindex_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
3060         rebuild_nindex_to_node_map();
3061         // Create a cpu -> node mapping
3062         _cpu_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
3063         rebuild_cpu_to_node_map();
3064         return true;
3065       }
3066     }
3067   }
3068   return false;
3069 }
3070 
3071 size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
3072   // Creating guard page is very expensive. Java thread has HotSpot
3073   // guard pages, only enable glibc guard page for non-Java threads.
3074   // (Remember: compiler thread is a Java thread, too!)
3075   return ((thr_type == java_thread || thr_type == compiler_thread) ? 0 : page_size());
3076 }
3077 
3078 void os::Linux::rebuild_nindex_to_node_map() {
3079   int highest_node_number = Linux::numa_max_node();
3080 
3081   nindex_to_node()->clear();
3082   for (int node = 0; node <= highest_node_number; node++) {
3083     if (Linux::is_node_in_existing_nodes(node)) {
3084       nindex_to_node()->append(node);
3085     }
3086   }
3087 }
3088 
3089 // rebuild_cpu_to_node_map() constructs a table mapping cpud id to node id.
3090 // The table is later used in get_node_by_cpu().
3091 void os::Linux::rebuild_cpu_to_node_map() {
3092   const size_t NCPUS = 32768; // Since the buffer size computation is very obscure
3093                               // in libnuma (possible values are starting from 16,
3094                               // and continuing up with every other power of 2, but less
3095                               // than the maximum number of CPUs supported by kernel), and
3096                               // is a subject to change (in libnuma version 2 the requirements
3097                               // are more reasonable) we'll just hardcode the number they use
3098                               // in the library.
3099   const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;
3100 
3101   size_t cpu_num = processor_count();
3102   size_t cpu_map_size = NCPUS / BitsPerCLong;
3103   size_t cpu_map_valid_size =
3104     MIN2((cpu_num + BitsPerCLong - 1) / BitsPerCLong, cpu_map_size);
3105 
3106   cpu_to_node()->clear();
3107   cpu_to_node()->at_grow(cpu_num - 1);
3108 
3109   size_t node_num = get_existing_num_nodes();
3110 
3111   int distance = 0;
3112   int closest_distance = INT_MAX;
3113   int closest_node = 0;
3114   unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size, mtInternal);
3115   for (size_t i = 0; i < node_num; i++) {
3116     // Check if node is configured (not a memory-less node). If it is not, find
3117     // the closest configured node. Check also if node is bound, i.e. it's allowed
3118     // to allocate memory from the node. If it's not allowed, map cpus in that node
3119     // to the closest node from which memory allocation is allowed.
3120     if (!is_node_in_configured_nodes(nindex_to_node()->at(i)) ||
3121         !is_node_in_bound_nodes(nindex_to_node()->at(i))) {
3122       closest_distance = INT_MAX;
3123       // Check distance from all remaining nodes in the system. Ignore distance
3124       // from itself, from another non-configured node, and from another non-bound
3125       // node.
3126       for (size_t m = 0; m < node_num; m++) {
3127         if (m != i &&
3128             is_node_in_configured_nodes(nindex_to_node()->at(m)) &&
3129             is_node_in_bound_nodes(nindex_to_node()->at(m))) {
3130           distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m));
3131           // If a closest node is found, update. There is always at least one
3132           // configured and bound node in the system so there is always at least
3133           // one node close.
3134           if (distance != 0 && distance < closest_distance) {
3135             closest_distance = distance;
3136             closest_node = nindex_to_node()->at(m);
3137           }
3138         }
3139       }
3140      } else {
3141        // Current node is already a configured node.
3142        closest_node = nindex_to_node()->at(i);
3143      }
3144 
3145     // Get cpus from the original node and map them to the closest node. If node
3146     // is a configured node (not a memory-less node), then original node and
3147     // closest node are the same.
3148     if (numa_node_to_cpus(nindex_to_node()->at(i), cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
3149       for (size_t j = 0; j < cpu_map_valid_size; j++) {
3150         if (cpu_map[j] != 0) {
3151           for (size_t k = 0; k < BitsPerCLong; k++) {
3152             if (cpu_map[j] & (1UL << k)) {
3153               cpu_to_node()->at_put(j * BitsPerCLong + k, closest_node);
3154             }
3155           }
3156         }
3157       }
3158     }
3159   }
3160   FREE_C_HEAP_ARRAY(unsigned long, cpu_map);
3161 }
3162 
3163 int os::Linux::get_node_by_cpu(int cpu_id) {
3164   if (cpu_to_node() != NULL && cpu_id >= 0 && cpu_id < cpu_to_node()->length()) {
3165     return cpu_to_node()->at(cpu_id);
3166   }
3167   return -1;
3168 }
3169 
3170 GrowableArray<int>* os::Linux::_cpu_to_node;
3171 GrowableArray<int>* os::Linux::_nindex_to_node;
3172 os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
3173 os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
3174 os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
3175 os::Linux::numa_num_configured_nodes_func_t os::Linux::_numa_num_configured_nodes;
3176 os::Linux::numa_available_func_t os::Linux::_numa_available;
3177 os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
3178 os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
3179 os::Linux::numa_interleave_memory_v2_func_t os::Linux::_numa_interleave_memory_v2;
3180 os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
3181 os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
3182 os::Linux::numa_distance_func_t os::Linux::_numa_distance;
3183 os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind;
3184 os::Linux::numa_get_interleave_mask_func_t os::Linux::_numa_get_interleave_mask;
3185 os::Linux::NumaAllocationPolicy os::Linux::_current_numa_policy;
3186 unsigned long* os::Linux::_numa_all_nodes;
3187 struct bitmask* os::Linux::_numa_all_nodes_ptr;
3188 struct bitmask* os::Linux::_numa_nodes_ptr;
3189 struct bitmask* os::Linux::_numa_interleave_bitmask;
3190 struct bitmask* os::Linux::_numa_membind_bitmask;
3191 
3192 bool os::pd_uncommit_memory(char* addr, size_t size) {
3193   uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
3194                                      MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0);
3195   return res  != (uintptr_t) MAP_FAILED;
3196 }
3197 
3198 static address get_stack_commited_bottom(address bottom, size_t size) {
3199   address nbot = bottom;
3200   address ntop = bottom + size;
3201 
3202   size_t page_sz = os::vm_page_size();
3203   unsigned pages = size / page_sz;
3204 
3205   unsigned char vec[1];
3206   unsigned imin = 1, imax = pages + 1, imid;
3207   int mincore_return_value = 0;
3208 
3209   assert(imin <= imax, "Unexpected page size");
3210 
3211   while (imin < imax) {
3212     imid = (imax + imin) / 2;
3213     nbot = ntop - (imid * page_sz);
3214 
3215     // Use a trick with mincore to check whether the page is mapped or not.
3216     // mincore sets vec to 1 if page resides in memory and to 0 if page
3217     // is swapped output but if page we are asking for is unmapped
3218     // it returns -1,ENOMEM
3219     mincore_return_value = mincore(nbot, page_sz, vec);
3220 
3221     if (mincore_return_value == -1) {
3222       // Page is not mapped go up
3223       // to find first mapped page
3224       if (errno != EAGAIN) {
3225         assert(errno == ENOMEM, "Unexpected mincore errno");
3226         imax = imid;
3227       }
3228     } else {
3229       // Page is mapped go down
3230       // to find first not mapped page
3231       imin = imid + 1;
3232     }
3233   }
3234 
3235   nbot = nbot + page_sz;
3236 
3237   // Adjust stack bottom one page up if last checked page is not mapped
3238   if (mincore_return_value == -1) {
3239     nbot = nbot + page_sz;
3240   }
3241 
3242   return nbot;
3243 }
3244 
3245 bool os::committed_in_range(address start, size_t size, address& committed_start, size_t& committed_size) {
3246   int mincore_return_value;
3247   const size_t stripe = 1024;  // query this many pages each time
3248   unsigned char vec[stripe + 1];
3249   // set a guard
3250   vec[stripe] = 'X';
3251 
3252   const size_t page_sz = os::vm_page_size();
3253   size_t pages = size / page_sz;
3254 
3255   assert(is_aligned(start, page_sz), "Start address must be page aligned");
3256   assert(is_aligned(size, page_sz), "Size must be page aligned");
3257 
3258   committed_start = NULL;
3259 
3260   int loops = (pages + stripe - 1) / stripe;
3261   int committed_pages = 0;
3262   address loop_base = start;
3263   bool found_range = false;
3264 
3265   for (int index = 0; index < loops && !found_range; index ++) {
3266     assert(pages > 0, "Nothing to do");
3267     int pages_to_query = (pages >= stripe) ? stripe : pages;
3268     pages -= pages_to_query;
3269 
3270     // Get stable read
3271     while ((mincore_return_value = mincore(loop_base, pages_to_query * page_sz, vec)) == -1 && errno == EAGAIN);
3272 
3273     // During shutdown, some memory goes away without properly notifying NMT,
3274     // E.g. ConcurrentGCThread/WatcherThread can exit without deleting thread object.
3275     // Bailout and return as not committed for now.
3276     if (mincore_return_value == -1 && errno == ENOMEM) {
3277       return false;
3278     }
3279 
3280     assert(vec[stripe] == 'X', "overflow guard");
3281     assert(mincore_return_value == 0, "Range must be valid");
3282     // Process this stripe
3283     for (int vecIdx = 0; vecIdx < pages_to_query; vecIdx ++) {
3284       if ((vec[vecIdx] & 0x01) == 0) { // not committed
3285         // End of current contiguous region
3286         if (committed_start != NULL) {
3287           found_range = true;
3288           break;
3289         }
3290       } else { // committed
3291         // Start of region
3292         if (committed_start == NULL) {
3293           committed_start = loop_base + page_sz * vecIdx;
3294         }
3295         committed_pages ++;
3296       }
3297     }
3298 
3299     loop_base += pages_to_query * page_sz;
3300   }
3301 
3302   if (committed_start != NULL) {
3303     assert(committed_pages > 0, "Must have committed region");
3304     assert(committed_pages <= int(size / page_sz), "Can not commit more than it has");
3305     assert(committed_start >= start && committed_start < start + size, "Out of range");
3306     committed_size = page_sz * committed_pages;
3307     return true;
3308   } else {
3309     assert(committed_pages == 0, "Should not have committed region");
3310     return false;
3311   }
3312 }
3313 
3314 
3315 // Linux uses a growable mapping for the stack, and if the mapping for
3316 // the stack guard pages is not removed when we detach a thread the
3317 // stack cannot grow beyond the pages where the stack guard was
3318 // mapped.  If at some point later in the process the stack expands to
3319 // that point, the Linux kernel cannot expand the stack any further
3320 // because the guard pages are in the way, and a segfault occurs.
3321 //
3322 // However, it's essential not to split the stack region by unmapping
3323 // a region (leaving a hole) that's already part of the stack mapping,
3324 // so if the stack mapping has already grown beyond the guard pages at
3325 // the time we create them, we have to truncate the stack mapping.
3326 // So, we need to know the extent of the stack mapping when
3327 // create_stack_guard_pages() is called.
3328 
3329 // We only need this for stacks that are growable: at the time of
3330 // writing thread stacks don't use growable mappings (i.e. those
3331 // creeated with MAP_GROWSDOWN), and aren't marked "[stack]", so this
3332 // only applies to the main thread.
3333 
3334 // If the (growable) stack mapping already extends beyond the point
3335 // where we're going to put our guard pages, truncate the mapping at
3336 // that point by munmap()ping it.  This ensures that when we later
3337 // munmap() the guard pages we don't leave a hole in the stack
3338 // mapping. This only affects the main/primordial thread
3339 
3340 bool os::pd_create_stack_guard_pages(char* addr, size_t size) {
3341   if (os::is_primordial_thread()) {
3342     // As we manually grow stack up to bottom inside create_attached_thread(),
3343     // it's likely that os::Linux::initial_thread_stack_bottom is mapped and
3344     // we don't need to do anything special.
3345     // Check it first, before calling heavy function.
3346     uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom();
3347     unsigned char vec[1];
3348 
3349     if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) {
3350       // Fallback to slow path on all errors, including EAGAIN
3351       stack_extent = (uintptr_t) get_stack_commited_bottom(
3352                                                            os::Linux::initial_thread_stack_bottom(),
3353                                                            (size_t)addr - stack_extent);
3354     }
3355 
3356     if (stack_extent < (uintptr_t)addr) {
3357       ::munmap((void*)stack_extent, (uintptr_t)(addr - stack_extent));
3358     }
3359   }
3360 
3361   return os::commit_memory(addr, size, !ExecMem);
3362 }
3363 
3364 // If this is a growable mapping, remove the guard pages entirely by
3365 // munmap()ping them.  If not, just call uncommit_memory(). This only
3366 // affects the main/primordial thread, but guard against future OS changes.
3367 // It's safe to always unmap guard pages for primordial thread because we
3368 // always place it right after end of the mapped region.
3369 
3370 bool os::remove_stack_guard_pages(char* addr, size_t size) {
3371   uintptr_t stack_extent, stack_base;
3372 
3373   if (os::is_primordial_thread()) {
3374     return ::munmap(addr, size) == 0;
3375   }
3376 
3377   return os::uncommit_memory(addr, size);
3378 }
3379 
3380 // If 'fixed' is true, anon_mmap() will attempt to reserve anonymous memory
3381 // at 'requested_addr'. If there are existing memory mappings at the same
3382 // location, however, they will be overwritten. If 'fixed' is false,
3383 // 'requested_addr' is only treated as a hint, the return value may or
3384 // may not start from the requested address. Unlike Linux mmap(), this
3385 // function returns NULL to indicate failure.
3386 static char* anon_mmap(char* requested_addr, size_t bytes, bool fixed) {
3387   char * addr;
3388   int flags;
3389 
3390   flags = MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS;
3391   if (fixed) {
3392     assert((uintptr_t)requested_addr % os::Linux::page_size() == 0, "unaligned address");
3393     flags |= MAP_FIXED;
3394   }
3395 
3396   // Map reserved/uncommitted pages PROT_NONE so we fail early if we
3397   // touch an uncommitted page. Otherwise, the read/write might
3398   // succeed if we have enough swap space to back the physical page.
3399   addr = (char*)::mmap(requested_addr, bytes, PROT_NONE,
3400                        flags, -1, 0);
3401 
3402   return addr == MAP_FAILED ? NULL : addr;
3403 }
3404 
3405 // Allocate (using mmap, NO_RESERVE, with small pages) at either a given request address
3406 //   (req_addr != NULL) or with a given alignment.
3407 //  - bytes shall be a multiple of alignment.
3408 //  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3409 //  - alignment sets the alignment at which memory shall be allocated.
3410 //     It must be a multiple of allocation granularity.
3411 // Returns address of memory or NULL. If req_addr was not NULL, will only return
3412 //  req_addr or NULL.
3413 static char* anon_mmap_aligned(size_t bytes, size_t alignment, char* req_addr) {
3414 
3415   size_t extra_size = bytes;
3416   if (req_addr == NULL && alignment > 0) {
3417     extra_size += alignment;
3418   }
3419 
3420   char* start = (char*) ::mmap(req_addr, extra_size, PROT_NONE,
3421     MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
3422     -1, 0);
3423   if (start == MAP_FAILED) {
3424     start = NULL;
3425   } else {
3426     if (req_addr != NULL) {
3427       if (start != req_addr) {
3428         ::munmap(start, extra_size);
3429         start = NULL;
3430       }
3431     } else {
3432       char* const start_aligned = align_up(start, alignment);
3433       char* const end_aligned = start_aligned + bytes;
3434       char* const end = start + extra_size;
3435       if (start_aligned > start) {
3436         ::munmap(start, start_aligned - start);
3437       }
3438       if (end_aligned < end) {
3439         ::munmap(end_aligned, end - end_aligned);
3440       }
3441       start = start_aligned;
3442     }
3443   }
3444   return start;
3445 }
3446 
3447 static int anon_munmap(char * addr, size_t size) {
3448   return ::munmap(addr, size) == 0;
3449 }
3450 
3451 char* os::pd_reserve_memory(size_t bytes, char* requested_addr,
3452                             size_t alignment_hint) {
3453   return anon_mmap(requested_addr, bytes, (requested_addr != NULL));
3454 }
3455 
3456 bool os::pd_release_memory(char* addr, size_t size) {
3457   return anon_munmap(addr, size);
3458 }
3459 
3460 static bool linux_mprotect(char* addr, size_t size, int prot) {
3461   // Linux wants the mprotect address argument to be page aligned.
3462   char* bottom = (char*)align_down((intptr_t)addr, os::Linux::page_size());
3463 
3464   // According to SUSv3, mprotect() should only be used with mappings
3465   // established by mmap(), and mmap() always maps whole pages. Unaligned
3466   // 'addr' likely indicates problem in the VM (e.g. trying to change
3467   // protection of malloc'ed or statically allocated memory). Check the
3468   // caller if you hit this assert.
3469   assert(addr == bottom, "sanity check");
3470 
3471   size = align_up(pointer_delta(addr, bottom, 1) + size, os::Linux::page_size());
3472   return ::mprotect(bottom, size, prot) == 0;
3473 }
3474 
3475 // Set protections specified
3476 bool os::protect_memory(char* addr, size_t bytes, ProtType prot,
3477                         bool is_committed) {
3478   unsigned int p = 0;
3479   switch (prot) {
3480   case MEM_PROT_NONE: p = PROT_NONE; break;
3481   case MEM_PROT_READ: p = PROT_READ; break;
3482   case MEM_PROT_RW:   p = PROT_READ|PROT_WRITE; break;
3483   case MEM_PROT_RWX:  p = PROT_READ|PROT_WRITE|PROT_EXEC; break;
3484   default:
3485     ShouldNotReachHere();
3486   }
3487   // is_committed is unused.
3488   return linux_mprotect(addr, bytes, p);
3489 }
3490 
3491 bool os::guard_memory(char* addr, size_t size) {
3492   return linux_mprotect(addr, size, PROT_NONE);
3493 }
3494 
3495 bool os::unguard_memory(char* addr, size_t size) {
3496   return linux_mprotect(addr, size, PROT_READ|PROT_WRITE);
3497 }
3498 
3499 bool os::Linux::transparent_huge_pages_sanity_check(bool warn,
3500                                                     size_t page_size) {
3501   bool result = false;
3502   void *p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
3503                  MAP_ANONYMOUS|MAP_PRIVATE,
3504                  -1, 0);
3505   if (p != MAP_FAILED) {
3506     void *aligned_p = align_up(p, page_size);
3507 
3508     result = madvise(aligned_p, page_size, MADV_HUGEPAGE) == 0;
3509 
3510     munmap(p, page_size * 2);
3511   }
3512 
3513   if (warn && !result) {
3514     warning("TransparentHugePages is not supported by the operating system.");
3515   }
3516 
3517   return result;
3518 }
3519 
3520 bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
3521   bool result = false;
3522   void *p = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
3523                  MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
3524                  -1, 0);
3525 
3526   if (p != MAP_FAILED) {
3527     // We don't know if this really is a huge page or not.
3528     FILE *fp = fopen("/proc/self/maps", "r");
3529     if (fp) {
3530       while (!feof(fp)) {
3531         char chars[257];
3532         long x = 0;
3533         if (fgets(chars, sizeof(chars), fp)) {
3534           if (sscanf(chars, "%lx-%*x", &x) == 1
3535               && x == (long)p) {
3536             if (strstr (chars, "hugepage")) {
3537               result = true;
3538               break;
3539             }
3540           }
3541         }
3542       }
3543       fclose(fp);
3544     }
3545     munmap(p, page_size);
3546   }
3547 
3548   if (warn && !result) {
3549     warning("HugeTLBFS is not supported by the operating system.");
3550   }
3551 
3552   return result;
3553 }
3554 
3555 // From the coredump_filter documentation:
3556 //
3557 // - (bit 0) anonymous private memory
3558 // - (bit 1) anonymous shared memory
3559 // - (bit 2) file-backed private memory
3560 // - (bit 3) file-backed shared memory
3561 // - (bit 4) ELF header pages in file-backed private memory areas (it is
3562 //           effective only if the bit 2 is cleared)
3563 // - (bit 5) hugetlb private memory
3564 // - (bit 6) hugetlb shared memory
3565 // - (bit 7) dax private memory
3566 // - (bit 8) dax shared memory
3567 //
3568 static void set_coredump_filter(CoredumpFilterBit bit) {
3569   FILE *f;
3570   long cdm;
3571 
3572   if ((f = fopen("/proc/self/coredump_filter", "r+")) == NULL) {
3573     return;
3574   }
3575 
3576   if (fscanf(f, "%lx", &cdm) != 1) {
3577     fclose(f);
3578     return;
3579   }
3580 
3581   long saved_cdm = cdm;
3582   rewind(f);
3583   cdm |= bit;
3584 
3585   if (cdm != saved_cdm) {
3586     fprintf(f, "%#lx", cdm);
3587   }
3588 
3589   fclose(f);
3590 }
3591 
3592 // Large page support
3593 
3594 static size_t _large_page_size = 0;
3595 
3596 size_t os::Linux::find_large_page_size() {
3597   size_t large_page_size = 0;
3598 
3599   // large_page_size on Linux is used to round up heap size. x86 uses either
3600   // 2M or 4M page, depending on whether PAE (Physical Address Extensions)
3601   // mode is enabled. AMD64/EM64T uses 2M page in 64bit mode. IA64 can use
3602   // page as large as 256M.
3603   //
3604   // Here we try to figure out page size by parsing /proc/meminfo and looking
3605   // for a line with the following format:
3606   //    Hugepagesize:     2048 kB
3607   //
3608   // If we can't determine the value (e.g. /proc is not mounted, or the text
3609   // format has been changed), we'll use the largest page size supported by
3610   // the processor.
3611 
3612 #ifndef ZERO
3613   large_page_size =
3614     AARCH64_ONLY(2 * M)
3615     AMD64_ONLY(2 * M)
3616     ARM32_ONLY(2 * M)
3617     IA32_ONLY(4 * M)
3618     IA64_ONLY(256 * M)
3619     PPC_ONLY(4 * M)
3620     S390_ONLY(1 * M)
3621     SPARC_ONLY(4 * M);
3622 #endif // ZERO
3623 
3624   FILE *fp = fopen("/proc/meminfo", "r");
3625   if (fp) {
3626     while (!feof(fp)) {
3627       int x = 0;
3628       char buf[16];
3629       if (fscanf(fp, "Hugepagesize: %d", &x) == 1) {
3630         if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
3631           large_page_size = x * K;
3632           break;
3633         }
3634       } else {
3635         // skip to next line
3636         for (;;) {
3637           int ch = fgetc(fp);
3638           if (ch == EOF || ch == (int)'\n') break;
3639         }
3640       }
3641     }
3642     fclose(fp);
3643   }
3644 
3645   if (!FLAG_IS_DEFAULT(LargePageSizeInBytes) && LargePageSizeInBytes != large_page_size) {
3646     warning("Setting LargePageSizeInBytes has no effect on this OS. Large page size is "
3647             SIZE_FORMAT "%s.", byte_size_in_proper_unit(large_page_size),
3648             proper_unit_for_byte_size(large_page_size));
3649   }
3650 
3651   return large_page_size;
3652 }
3653 
3654 size_t os::Linux::setup_large_page_size() {
3655   _large_page_size = Linux::find_large_page_size();
3656   const size_t default_page_size = (size_t)Linux::page_size();
3657   if (_large_page_size > default_page_size) {
3658     _page_sizes[0] = _large_page_size;
3659     _page_sizes[1] = default_page_size;
3660     _page_sizes[2] = 0;
3661   }
3662 
3663   return _large_page_size;
3664 }
3665 
3666 bool os::Linux::setup_large_page_type(size_t page_size) {
3667   if (FLAG_IS_DEFAULT(UseHugeTLBFS) &&
3668       FLAG_IS_DEFAULT(UseSHM) &&
3669       FLAG_IS_DEFAULT(UseTransparentHugePages)) {
3670 
3671     // The type of large pages has not been specified by the user.
3672 
3673     // Try UseHugeTLBFS and then UseSHM.
3674     UseHugeTLBFS = UseSHM = true;
3675 
3676     // Don't try UseTransparentHugePages since there are known
3677     // performance issues with it turned on. This might change in the future.
3678     UseTransparentHugePages = false;
3679   }
3680 
3681   if (UseTransparentHugePages) {
3682     bool warn_on_failure = !FLAG_IS_DEFAULT(UseTransparentHugePages);
3683     if (transparent_huge_pages_sanity_check(warn_on_failure, page_size)) {
3684       UseHugeTLBFS = false;
3685       UseSHM = false;
3686       return true;
3687     }
3688     UseTransparentHugePages = false;
3689   }
3690 
3691   if (UseHugeTLBFS) {
3692     bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
3693     if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
3694       UseSHM = false;
3695       return true;
3696     }
3697     UseHugeTLBFS = false;
3698   }
3699 
3700   return UseSHM;
3701 }
3702 
3703 void os::large_page_init() {
3704   if (!UseLargePages &&
3705       !UseTransparentHugePages &&
3706       !UseHugeTLBFS &&
3707       !UseSHM) {
3708     // Not using large pages.
3709     return;
3710   }
3711 
3712   if (!FLAG_IS_DEFAULT(UseLargePages) && !UseLargePages) {
3713     // The user explicitly turned off large pages.
3714     // Ignore the rest of the large pages flags.
3715     UseTransparentHugePages = false;
3716     UseHugeTLBFS = false;
3717     UseSHM = false;
3718     return;
3719   }
3720 
3721   size_t large_page_size = Linux::setup_large_page_size();
3722   UseLargePages          = Linux::setup_large_page_type(large_page_size);
3723 
3724   set_coredump_filter(LARGEPAGES_BIT);
3725 }
3726 
3727 #ifndef SHM_HUGETLB
3728   #define SHM_HUGETLB 04000
3729 #endif
3730 
3731 #define shm_warning_format(format, ...)              \
3732   do {                                               \
3733     if (UseLargePages &&                             \
3734         (!FLAG_IS_DEFAULT(UseLargePages) ||          \
3735          !FLAG_IS_DEFAULT(UseSHM) ||                 \
3736          !FLAG_IS_DEFAULT(LargePageSizeInBytes))) {  \
3737       warning(format, __VA_ARGS__);                  \
3738     }                                                \
3739   } while (0)
3740 
3741 #define shm_warning(str) shm_warning_format("%s", str)
3742 
3743 #define shm_warning_with_errno(str)                \
3744   do {                                             \
3745     int err = errno;                               \
3746     shm_warning_format(str " (error = %d)", err);  \
3747   } while (0)
3748 
3749 static char* shmat_with_alignment(int shmid, size_t bytes, size_t alignment) {
3750   assert(is_aligned(bytes, alignment), "Must be divisible by the alignment");
3751 
3752   if (!is_aligned(alignment, SHMLBA)) {
3753     assert(false, "Code below assumes that alignment is at least SHMLBA aligned");
3754     return NULL;
3755   }
3756 
3757   // To ensure that we get 'alignment' aligned memory from shmat,
3758   // we pre-reserve aligned virtual memory and then attach to that.
3759 
3760   char* pre_reserved_addr = anon_mmap_aligned(bytes, alignment, NULL);
3761   if (pre_reserved_addr == NULL) {
3762     // Couldn't pre-reserve aligned memory.
3763     shm_warning("Failed to pre-reserve aligned memory for shmat.");
3764     return NULL;
3765   }
3766 
3767   // SHM_REMAP is needed to allow shmat to map over an existing mapping.
3768   char* addr = (char*)shmat(shmid, pre_reserved_addr, SHM_REMAP);
3769 
3770   if ((intptr_t)addr == -1) {
3771     int err = errno;
3772     shm_warning_with_errno("Failed to attach shared memory.");
3773 
3774     assert(err != EACCES, "Unexpected error");
3775     assert(err != EIDRM,  "Unexpected error");
3776     assert(err != EINVAL, "Unexpected error");
3777 
3778     // Since we don't know if the kernel unmapped the pre-reserved memory area
3779     // we can't unmap it, since that would potentially unmap memory that was
3780     // mapped from other threads.
3781     return NULL;
3782   }
3783 
3784   return addr;
3785 }
3786 
3787 static char* shmat_at_address(int shmid, char* req_addr) {
3788   if (!is_aligned(req_addr, SHMLBA)) {
3789     assert(false, "Requested address needs to be SHMLBA aligned");
3790     return NULL;
3791   }
3792 
3793   char* addr = (char*)shmat(shmid, req_addr, 0);
3794 
3795   if ((intptr_t)addr == -1) {
3796     shm_warning_with_errno("Failed to attach shared memory.");
3797     return NULL;
3798   }
3799 
3800   return addr;
3801 }
3802 
3803 static char* shmat_large_pages(int shmid, size_t bytes, size_t alignment, char* req_addr) {
3804   // If a req_addr has been provided, we assume that the caller has already aligned the address.
3805   if (req_addr != NULL) {
3806     assert(is_aligned(req_addr, os::large_page_size()), "Must be divisible by the large page size");
3807     assert(is_aligned(req_addr, alignment), "Must be divisible by given alignment");
3808     return shmat_at_address(shmid, req_addr);
3809   }
3810 
3811   // Since shmid has been setup with SHM_HUGETLB, shmat will automatically
3812   // return large page size aligned memory addresses when req_addr == NULL.
3813   // However, if the alignment is larger than the large page size, we have
3814   // to manually ensure that the memory returned is 'alignment' aligned.
3815   if (alignment > os::large_page_size()) {
3816     assert(is_aligned(alignment, os::large_page_size()), "Must be divisible by the large page size");
3817     return shmat_with_alignment(shmid, bytes, alignment);
3818   } else {
3819     return shmat_at_address(shmid, NULL);
3820   }
3821 }
3822 
3823 char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment,
3824                                             char* req_addr, bool exec) {
3825   // "exec" is passed in but not used.  Creating the shared image for
3826   // the code cache doesn't have an SHM_X executable permission to check.
3827   assert(UseLargePages && UseSHM, "only for SHM large pages");
3828   assert(is_aligned(req_addr, os::large_page_size()), "Unaligned address");
3829   assert(is_aligned(req_addr, alignment), "Unaligned address");
3830 
3831   if (!is_aligned(bytes, os::large_page_size())) {
3832     return NULL; // Fallback to small pages.
3833   }
3834 
3835   // Create a large shared memory region to attach to based on size.
3836   // Currently, size is the total size of the heap.
3837   int shmid = shmget(IPC_PRIVATE, bytes, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W);
3838   if (shmid == -1) {
3839     // Possible reasons for shmget failure:
3840     // 1. shmmax is too small for Java heap.
3841     //    > check shmmax value: cat /proc/sys/kernel/shmmax
3842     //    > increase shmmax value: echo "0xffffffff" > /proc/sys/kernel/shmmax
3843     // 2. not enough large page memory.
3844     //    > check available large pages: cat /proc/meminfo
3845     //    > increase amount of large pages:
3846     //          echo new_value > /proc/sys/vm/nr_hugepages
3847     //      Note 1: different Linux may use different name for this property,
3848     //            e.g. on Redhat AS-3 it is "hugetlb_pool".
3849     //      Note 2: it's possible there's enough physical memory available but
3850     //            they are so fragmented after a long run that they can't
3851     //            coalesce into large pages. Try to reserve large pages when
3852     //            the system is still "fresh".
3853     shm_warning_with_errno("Failed to reserve shared memory.");
3854     return NULL;
3855   }
3856 
3857   // Attach to the region.
3858   char* addr = shmat_large_pages(shmid, bytes, alignment, req_addr);
3859 
3860   // Remove shmid. If shmat() is successful, the actual shared memory segment
3861   // will be deleted when it's detached by shmdt() or when the process
3862   // terminates. If shmat() is not successful this will remove the shared
3863   // segment immediately.
3864   shmctl(shmid, IPC_RMID, NULL);
3865 
3866   return addr;
3867 }
3868 
3869 static void warn_on_large_pages_failure(char* req_addr, size_t bytes,
3870                                         int error) {
3871   assert(error == ENOMEM, "Only expect to fail if no memory is available");
3872 
3873   bool warn_on_failure = UseLargePages &&
3874       (!FLAG_IS_DEFAULT(UseLargePages) ||
3875        !FLAG_IS_DEFAULT(UseHugeTLBFS) ||
3876        !FLAG_IS_DEFAULT(LargePageSizeInBytes));
3877 
3878   if (warn_on_failure) {
3879     char msg[128];
3880     jio_snprintf(msg, sizeof(msg), "Failed to reserve large pages memory req_addr: "
3881                  PTR_FORMAT " bytes: " SIZE_FORMAT " (errno = %d).", req_addr, bytes, error);
3882     warning("%s", msg);
3883   }
3884 }
3885 
3886 char* os::Linux::reserve_memory_special_huge_tlbfs_only(size_t bytes,
3887                                                         char* req_addr,
3888                                                         bool exec) {
3889   assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3890   assert(is_aligned(bytes, os::large_page_size()), "Unaligned size");
3891   assert(is_aligned(req_addr, os::large_page_size()), "Unaligned address");
3892 
3893   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3894   char* addr = (char*)::mmap(req_addr, bytes, prot,
3895                              MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB,
3896                              -1, 0);
3897 
3898   if (addr == MAP_FAILED) {
3899     warn_on_large_pages_failure(req_addr, bytes, errno);
3900     return NULL;
3901   }
3902 
3903   assert(is_aligned(addr, os::large_page_size()), "Must be");
3904 
3905   return addr;
3906 }
3907 
3908 // Reserve memory using mmap(MAP_HUGETLB).
3909 //  - bytes shall be a multiple of alignment.
3910 //  - req_addr can be NULL. If not NULL, it must be a multiple of alignment.
3911 //  - alignment sets the alignment at which memory shall be allocated.
3912 //     It must be a multiple of allocation granularity.
3913 // Returns address of memory or NULL. If req_addr was not NULL, will only return
3914 //  req_addr or NULL.
3915 char* os::Linux::reserve_memory_special_huge_tlbfs_mixed(size_t bytes,
3916                                                          size_t alignment,
3917                                                          char* req_addr,
3918                                                          bool exec) {
3919   size_t large_page_size = os::large_page_size();
3920   assert(bytes >= large_page_size, "Shouldn't allocate large pages for small sizes");
3921 
3922   assert(is_aligned(req_addr, alignment), "Must be");
3923   assert(is_aligned(bytes, alignment), "Must be");
3924 
3925   // First reserve - but not commit - the address range in small pages.
3926   char* const start = anon_mmap_aligned(bytes, alignment, req_addr);
3927 
3928   if (start == NULL) {
3929     return NULL;
3930   }
3931 
3932   assert(is_aligned(start, alignment), "Must be");
3933 
3934   char* end = start + bytes;
3935 
3936   // Find the regions of the allocated chunk that can be promoted to large pages.
3937   char* lp_start = align_up(start, large_page_size);
3938   char* lp_end   = align_down(end, large_page_size);
3939 
3940   size_t lp_bytes = lp_end - lp_start;
3941 
3942   assert(is_aligned(lp_bytes, large_page_size), "Must be");
3943 
3944   if (lp_bytes == 0) {
3945     // The mapped region doesn't even span the start and the end of a large page.
3946     // Fall back to allocate a non-special area.
3947     ::munmap(start, end - start);
3948     return NULL;
3949   }
3950 
3951   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3952 
3953   void* result;
3954 
3955   // Commit small-paged leading area.
3956   if (start != lp_start) {
3957     result = ::mmap(start, lp_start - start, prot,
3958                     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3959                     -1, 0);
3960     if (result == MAP_FAILED) {
3961       ::munmap(lp_start, end - lp_start);
3962       return NULL;
3963     }
3964   }
3965 
3966   // Commit large-paged area.
3967   result = ::mmap(lp_start, lp_bytes, prot,
3968                   MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_HUGETLB,
3969                   -1, 0);
3970   if (result == MAP_FAILED) {
3971     warn_on_large_pages_failure(lp_start, lp_bytes, errno);
3972     // If the mmap above fails, the large pages region will be unmapped and we
3973     // have regions before and after with small pages. Release these regions.
3974     //
3975     // |  mapped  |  unmapped  |  mapped  |
3976     // ^          ^            ^          ^
3977     // start      lp_start     lp_end     end
3978     //
3979     ::munmap(start, lp_start - start);
3980     ::munmap(lp_end, end - lp_end);
3981     return NULL;
3982   }
3983 
3984   // Commit small-paged trailing area.
3985   if (lp_end != end) {
3986     result = ::mmap(lp_end, end - lp_end, prot,
3987                     MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3988                     -1, 0);
3989     if (result == MAP_FAILED) {
3990       ::munmap(start, lp_end - start);
3991       return NULL;
3992     }
3993   }
3994 
3995   return start;
3996 }
3997 
3998 char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes,
3999                                                    size_t alignment,
4000                                                    char* req_addr,
4001                                                    bool exec) {
4002   assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
4003   assert(is_aligned(req_addr, alignment), "Must be");
4004   assert(is_aligned(alignment, os::vm_allocation_granularity()), "Must be");
4005   assert(is_power_of_2(os::large_page_size()), "Must be");
4006   assert(bytes >= os::large_page_size(), "Shouldn't allocate large pages for small sizes");
4007 
4008   if (is_aligned(bytes, os::large_page_size()) && alignment <= os::large_page_size()) {
4009     return reserve_memory_special_huge_tlbfs_only(bytes, req_addr, exec);
4010   } else {
4011     return reserve_memory_special_huge_tlbfs_mixed(bytes, alignment, req_addr, exec);
4012   }
4013 }
4014 
4015 char* os::reserve_memory_special(size_t bytes, size_t alignment,
4016                                  char* req_addr, bool exec) {
4017   assert(UseLargePages, "only for large pages");
4018 
4019   char* addr;
4020   if (UseSHM) {
4021     addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
4022   } else {
4023     assert(UseHugeTLBFS, "must be");
4024     addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, req_addr, exec);
4025   }
4026 
4027   if (addr != NULL) {
4028     if (UseNUMAInterleaving) {
4029       numa_make_global(addr, bytes);
4030     }
4031 
4032     // The memory is committed
4033     MemTracker::record_virtual_memory_reserve_and_commit((address)addr, bytes, CALLER_PC);
4034   }
4035 
4036   return addr;
4037 }
4038 
4039 bool os::Linux::release_memory_special_shm(char* base, size_t bytes) {
4040   // detaching the SHM segment will also delete it, see reserve_memory_special_shm()
4041   return shmdt(base) == 0;
4042 }
4043 
4044 bool os::Linux::release_memory_special_huge_tlbfs(char* base, size_t bytes) {
4045   return pd_release_memory(base, bytes);
4046 }
4047 
4048 bool os::release_memory_special(char* base, size_t bytes) {
4049   bool res;
4050   if (MemTracker::tracking_level() > NMT_minimal) {
4051     Tracker tkr(Tracker::release);
4052     res = os::Linux::release_memory_special_impl(base, bytes);
4053     if (res) {
4054       tkr.record((address)base, bytes);
4055     }
4056 
4057   } else {
4058     res = os::Linux::release_memory_special_impl(base, bytes);
4059   }
4060   return res;
4061 }
4062 
4063 bool os::Linux::release_memory_special_impl(char* base, size_t bytes) {
4064   assert(UseLargePages, "only for large pages");
4065   bool res;
4066 
4067   if (UseSHM) {
4068     res = os::Linux::release_memory_special_shm(base, bytes);
4069   } else {
4070     assert(UseHugeTLBFS, "must be");
4071     res = os::Linux::release_memory_special_huge_tlbfs(base, bytes);
4072   }
4073   return res;
4074 }
4075 
4076 size_t os::large_page_size() {
4077   return _large_page_size;
4078 }
4079 
4080 // With SysV SHM the entire memory region must be allocated as shared
4081 // memory.
4082 // HugeTLBFS allows application to commit large page memory on demand.
4083 // However, when committing memory with HugeTLBFS fails, the region
4084 // that was supposed to be committed will lose the old reservation
4085 // and allow other threads to steal that memory region. Because of this
4086 // behavior we can't commit HugeTLBFS memory.
4087 bool os::can_commit_large_page_memory() {
4088   return UseTransparentHugePages;
4089 }
4090 
4091 bool os::can_execute_large_page_memory() {
4092   return UseTransparentHugePages || UseHugeTLBFS;
4093 }
4094 
4095 char* os::pd_attempt_reserve_memory_at(size_t bytes, char* requested_addr, int file_desc) {
4096   assert(file_desc >= 0, "file_desc is not valid");
4097   char* result = pd_attempt_reserve_memory_at(bytes, requested_addr);
4098   if (result != NULL) {
4099     if (replace_existing_mapping_with_file_mapping(result, bytes, file_desc) == NULL) {
4100       vm_exit_during_initialization(err_msg("Error in mapping Java heap at the given filesystem directory"));
4101     }
4102   }
4103   return result;
4104 }
4105 
4106 // Reserve memory at an arbitrary address, only if that area is
4107 // available (and not reserved for something else).
4108 
4109 char* os::pd_attempt_reserve_memory_at(size_t bytes, char* requested_addr) {
4110   const int max_tries = 10;
4111   char* base[max_tries];
4112   size_t size[max_tries];
4113   const size_t gap = 0x000000;
4114 
4115   // Assert only that the size is a multiple of the page size, since
4116   // that's all that mmap requires, and since that's all we really know
4117   // about at this low abstraction level.  If we need higher alignment,
4118   // we can either pass an alignment to this method or verify alignment
4119   // in one of the methods further up the call chain.  See bug 5044738.
4120   assert(bytes % os::vm_page_size() == 0, "reserving unexpected size block");
4121 
4122   // Repeatedly allocate blocks until the block is allocated at the
4123   // right spot.
4124 
4125   // Linux mmap allows caller to pass an address as hint; give it a try first,
4126   // if kernel honors the hint then we can return immediately.
4127   char * addr = anon_mmap(requested_addr, bytes, false);
4128   if (addr == requested_addr) {
4129     return requested_addr;
4130   }
4131 
4132   if (addr != NULL) {
4133     // mmap() is successful but it fails to reserve at the requested address
4134     anon_munmap(addr, bytes);
4135   }
4136 
4137   int i;
4138   for (i = 0; i < max_tries; ++i) {
4139     base[i] = reserve_memory(bytes);
4140 
4141     if (base[i] != NULL) {
4142       // Is this the block we wanted?
4143       if (base[i] == requested_addr) {
4144         size[i] = bytes;
4145         break;
4146       }
4147 
4148       // Does this overlap the block we wanted? Give back the overlapped
4149       // parts and try again.
4150 
4151       ptrdiff_t top_overlap = requested_addr + (bytes + gap) - base[i];
4152       if (top_overlap >= 0 && (size_t)top_overlap < bytes) {
4153         unmap_memory(base[i], top_overlap);
4154         base[i] += top_overlap;
4155         size[i] = bytes - top_overlap;
4156       } else {
4157         ptrdiff_t bottom_overlap = base[i] + bytes - requested_addr;
4158         if (bottom_overlap >= 0 && (size_t)bottom_overlap < bytes) {
4159           unmap_memory(requested_addr, bottom_overlap);
4160           size[i] = bytes - bottom_overlap;
4161         } else {
4162           size[i] = bytes;
4163         }
4164       }
4165     }
4166   }
4167 
4168   // Give back the unused reserved pieces.
4169 
4170   for (int j = 0; j < i; ++j) {
4171     if (base[j] != NULL) {
4172       unmap_memory(base[j], size[j]);
4173     }
4174   }
4175 
4176   if (i < max_tries) {
4177     return requested_addr;
4178   } else {
4179     return NULL;
4180   }
4181 }
4182 
4183 // Sleep forever; naked call to OS-specific sleep; use with CAUTION
4184 void os::infinite_sleep() {
4185   while (true) {    // sleep forever ...
4186     ::sleep(100);   // ... 100 seconds at a time
4187   }
4188 }
4189 
4190 // Used to convert frequent JVM_Yield() to nops
4191 bool os::dont_yield() {
4192   return DontYieldALot;
4193 }
4194 
4195 // Linux CFS scheduler (since 2.6.23) does not guarantee sched_yield(2) will
4196 // actually give up the CPU. Since skip buddy (v2.6.28):
4197 //
4198 // * Sets the yielding task as skip buddy for current CPU's run queue.
4199 // * Picks next from run queue, if empty, picks a skip buddy (can be the yielding task).
4200 // * Clears skip buddies for this run queue (yielding task no longer a skip buddy).
4201 //
4202 // An alternative is calling os::naked_short_nanosleep with a small number to avoid
4203 // getting re-scheduled immediately.
4204 //
4205 void os::naked_yield() {
4206   sched_yield();
4207 }
4208 
4209 ////////////////////////////////////////////////////////////////////////////////
4210 // thread priority support
4211 
4212 // Note: Normal Linux applications are run with SCHED_OTHER policy. SCHED_OTHER
4213 // only supports dynamic priority, static priority must be zero. For real-time
4214 // applications, Linux supports SCHED_RR which allows static priority (1-99).
4215 // However, for large multi-threaded applications, SCHED_RR is not only slower
4216 // than SCHED_OTHER, but also very unstable (my volano tests hang hard 4 out
4217 // of 5 runs - Sep 2005).
4218 //
4219 // The following code actually changes the niceness of kernel-thread/LWP. It
4220 // has an assumption that setpriority() only modifies one kernel-thread/LWP,
4221 // not the entire user process, and user level threads are 1:1 mapped to kernel
4222 // threads. It has always been the case, but could change in the future. For
4223 // this reason, the code should not be used as default (ThreadPriorityPolicy=0).
4224 // It is only used when ThreadPriorityPolicy=1 and may require system level permission
4225 // (e.g., root privilege or CAP_SYS_NICE capability).
4226 
4227 int os::java_to_os_priority[CriticalPriority + 1] = {
4228   19,              // 0 Entry should never be used
4229 
4230    4,              // 1 MinPriority
4231    3,              // 2
4232    2,              // 3
4233 
4234    1,              // 4
4235    0,              // 5 NormPriority
4236   -1,              // 6
4237 
4238   -2,              // 7
4239   -3,              // 8
4240   -4,              // 9 NearMaxPriority
4241 
4242   -5,              // 10 MaxPriority
4243 
4244   -5               // 11 CriticalPriority
4245 };
4246 
4247 static int prio_init() {
4248   if (ThreadPriorityPolicy == 1) {
4249     if (geteuid() != 0) {
4250       if (!FLAG_IS_DEFAULT(ThreadPriorityPolicy)) {
4251         warning("-XX:ThreadPriorityPolicy=1 may require system level permission, " \
4252                 "e.g., being the root user. If the necessary permission is not " \
4253                 "possessed, changes to priority will be silently ignored.");
4254       }
4255     }
4256   }
4257   if (UseCriticalJavaThreadPriority) {
4258     os::java_to_os_priority[MaxPriority] = os::java_to_os_priority[CriticalPriority];
4259   }
4260   return 0;
4261 }
4262 
4263 OSReturn os::set_native_priority(Thread* thread, int newpri) {
4264   if (!UseThreadPriorities || ThreadPriorityPolicy == 0) return OS_OK;
4265 
4266   int ret = setpriority(PRIO_PROCESS, thread->osthread()->thread_id(), newpri);
4267   return (ret == 0) ? OS_OK : OS_ERR;
4268 }
4269 
4270 OSReturn os::get_native_priority(const Thread* const thread,
4271                                  int *priority_ptr) {
4272   if (!UseThreadPriorities || ThreadPriorityPolicy == 0) {
4273     *priority_ptr = java_to_os_priority[NormPriority];
4274     return OS_OK;
4275   }
4276 
4277   errno = 0;
4278   *priority_ptr = getpriority(PRIO_PROCESS, thread->osthread()->thread_id());
4279   return (*priority_ptr != -1 || errno == 0 ? OS_OK : OS_ERR);
4280 }
4281 
4282 ////////////////////////////////////////////////////////////////////////////////
4283 // suspend/resume support
4284 
4285 //  The low-level signal-based suspend/resume support is a remnant from the
4286 //  old VM-suspension that used to be for java-suspension, safepoints etc,
4287 //  within hotspot. Currently used by JFR's OSThreadSampler
4288 //
4289 //  The remaining code is greatly simplified from the more general suspension
4290 //  code that used to be used.
4291 //
4292 //  The protocol is quite simple:
4293 //  - suspend:
4294 //      - sends a signal to the target thread
4295 //      - polls the suspend state of the osthread using a yield loop
4296 //      - target thread signal handler (SR_handler) sets suspend state
4297 //        and blocks in sigsuspend until continued
4298 //  - resume:
4299 //      - sets target osthread state to continue
4300 //      - sends signal to end the sigsuspend loop in the SR_handler
4301 //
4302 //  Note that the SR_lock plays no role in this suspend/resume protocol,
4303 //  but is checked for NULL in SR_handler as a thread termination indicator.
4304 //  The SR_lock is, however, used by JavaThread::java_suspend()/java_resume() APIs.
4305 //
4306 //  Note that resume_clear_context() and suspend_save_context() are needed
4307 //  by SR_handler(), so that fetch_frame_from_ucontext() works,
4308 //  which in part is used by:
4309 //    - Forte Analyzer: AsyncGetCallTrace()
4310 //    - StackBanging: get_frame_at_stack_banging_point()
4311 
4312 static void resume_clear_context(OSThread *osthread) {
4313   osthread->set_ucontext(NULL);
4314   osthread->set_siginfo(NULL);
4315 }
4316 
4317 static void suspend_save_context(OSThread *osthread, siginfo_t* siginfo,
4318                                  ucontext_t* context) {
4319   osthread->set_ucontext(context);
4320   osthread->set_siginfo(siginfo);
4321 }
4322 
4323 // Handler function invoked when a thread's execution is suspended or
4324 // resumed. We have to be careful that only async-safe functions are
4325 // called here (Note: most pthread functions are not async safe and
4326 // should be avoided.)
4327 //
4328 // Note: sigwait() is a more natural fit than sigsuspend() from an
4329 // interface point of view, but sigwait() prevents the signal hander
4330 // from being run. libpthread would get very confused by not having
4331 // its signal handlers run and prevents sigwait()'s use with the
4332 // mutex granting granting signal.
4333 //
4334 // Currently only ever called on the VMThread and JavaThreads (PC sampling)
4335 //
4336 static void SR_handler(int sig, siginfo_t* siginfo, ucontext_t* context) {
4337   // Save and restore errno to avoid confusing native code with EINTR
4338   // after sigsuspend.
4339   int old_errno = errno;
4340 
4341   Thread* thread = Thread::current_or_null_safe();
4342   assert(thread != NULL, "Missing current thread in SR_handler");
4343 
4344   // On some systems we have seen signal delivery get "stuck" until the signal
4345   // mask is changed as part of thread termination. Check that the current thread
4346   // has not already terminated (via SR_lock()) - else the following assertion
4347   // will fail because the thread is no longer a JavaThread as the ~JavaThread
4348   // destructor has completed.
4349 
4350   if (thread->SR_lock() == NULL) {
4351     return;
4352   }
4353 
4354   assert(thread->is_VM_thread() || thread->is_Java_thread(), "Must be VMThread or JavaThread");
4355 
4356   OSThread* osthread = thread->osthread();
4357 
4358   os::SuspendResume::State current = osthread->sr.state();
4359   if (current == os::SuspendResume::SR_SUSPEND_REQUEST) {
4360     suspend_save_context(osthread, siginfo, context);
4361 
4362     // attempt to switch the state, we assume we had a SUSPEND_REQUEST
4363     os::SuspendResume::State state = osthread->sr.suspended();
4364     if (state == os::SuspendResume::SR_SUSPENDED) {
4365       sigset_t suspend_set;  // signals for sigsuspend()
4366       sigemptyset(&suspend_set);
4367       // get current set of blocked signals and unblock resume signal
4368       pthread_sigmask(SIG_BLOCK, NULL, &suspend_set);
4369       sigdelset(&suspend_set, SR_signum);
4370 
4371       sr_semaphore.signal();
4372       // wait here until we are resumed
4373       while (1) {
4374         sigsuspend(&suspend_set);
4375 
4376         os::SuspendResume::State result = osthread->sr.running();
4377         if (result == os::SuspendResume::SR_RUNNING) {
4378           sr_semaphore.signal();
4379           break;
4380         }
4381       }
4382 
4383     } else if (state == os::SuspendResume::SR_RUNNING) {
4384       // request was cancelled, continue
4385     } else {
4386       ShouldNotReachHere();
4387     }
4388 
4389     resume_clear_context(osthread);
4390   } else if (current == os::SuspendResume::SR_RUNNING) {
4391     // request was cancelled, continue
4392   } else if (current == os::SuspendResume::SR_WAKEUP_REQUEST) {
4393     // ignore
4394   } else {
4395     // ignore
4396   }
4397 
4398   errno = old_errno;
4399 }
4400 
4401 static int SR_initialize() {
4402   struct sigaction act;
4403   char *s;
4404 
4405   // Get signal number to use for suspend/resume
4406   if ((s = ::getenv("_JAVA_SR_SIGNUM")) != 0) {
4407     int sig = ::strtol(s, 0, 10);
4408     if (sig > MAX2(SIGSEGV, SIGBUS) &&  // See 4355769.
4409         sig < NSIG) {                   // Must be legal signal and fit into sigflags[].
4410       SR_signum = sig;
4411     } else {
4412       warning("You set _JAVA_SR_SIGNUM=%d. It must be in range [%d, %d]. Using %d instead.",
4413               sig, MAX2(SIGSEGV, SIGBUS)+1, NSIG-1, SR_signum);
4414     }
4415   }
4416 
4417   assert(SR_signum > SIGSEGV && SR_signum > SIGBUS,
4418          "SR_signum must be greater than max(SIGSEGV, SIGBUS), see 4355769");
4419 
4420   sigemptyset(&SR_sigset);
4421   sigaddset(&SR_sigset, SR_signum);
4422 
4423   // Set up signal handler for suspend/resume
4424   act.sa_flags = SA_RESTART|SA_SIGINFO;
4425   act.sa_handler = (void (*)(int)) SR_handler;
4426 
4427   // SR_signum is blocked by default.
4428   // 4528190 - We also need to block pthread restart signal (32 on all
4429   // supported Linux platforms). Note that LinuxThreads need to block
4430   // this signal for all threads to work properly. So we don't have
4431   // to use hard-coded signal number when setting up the mask.
4432   pthread_sigmask(SIG_BLOCK, NULL, &act.sa_mask);
4433 
4434   if (sigaction(SR_signum, &act, 0) == -1) {
4435     return -1;
4436   }
4437 
4438   // Save signal flag
4439   os::Linux::set_our_sigflags(SR_signum, act.sa_flags);
4440   return 0;
4441 }
4442 
4443 static int sr_notify(OSThread* osthread) {
4444   int status = pthread_kill(osthread->pthread_id(), SR_signum);
4445   assert_status(status == 0, status, "pthread_kill");
4446   return status;
4447 }
4448 
4449 // "Randomly" selected value for how long we want to spin
4450 // before bailing out on suspending a thread, also how often
4451 // we send a signal to a thread we want to resume
4452 static const int RANDOMLY_LARGE_INTEGER = 1000000;
4453 static const int RANDOMLY_LARGE_INTEGER2 = 100;
4454 
4455 // returns true on success and false on error - really an error is fatal
4456 // but this seems the normal response to library errors
4457 static bool do_suspend(OSThread* osthread) {
4458   assert(osthread->sr.is_running(), "thread should be running");
4459   assert(!sr_semaphore.trywait(), "semaphore has invalid state");
4460 
4461   // mark as suspended and send signal
4462   if (osthread->sr.request_suspend() != os::SuspendResume::SR_SUSPEND_REQUEST) {
4463     // failed to switch, state wasn't running?
4464     ShouldNotReachHere();
4465     return false;
4466   }
4467 
4468   if (sr_notify(osthread) != 0) {
4469     ShouldNotReachHere();
4470   }
4471 
4472   // managed to send the signal and switch to SUSPEND_REQUEST, now wait for SUSPENDED
4473   while (true) {
4474     if (sr_semaphore.timedwait(2)) {
4475       break;
4476     } else {
4477       // timeout
4478       os::SuspendResume::State cancelled = osthread->sr.cancel_suspend();
4479       if (cancelled == os::SuspendResume::SR_RUNNING) {
4480         return false;
4481       } else if (cancelled == os::SuspendResume::SR_SUSPENDED) {
4482         // make sure that we consume the signal on the semaphore as well
4483         sr_semaphore.wait();
4484         break;
4485       } else {
4486         ShouldNotReachHere();
4487         return false;
4488       }
4489     }
4490   }
4491 
4492   guarantee(osthread->sr.is_suspended(), "Must be suspended");
4493   return true;
4494 }
4495 
4496 static void do_resume(OSThread* osthread) {
4497   assert(osthread->sr.is_suspended(), "thread should be suspended");
4498   assert(!sr_semaphore.trywait(), "invalid semaphore state");
4499 
4500   if (osthread->sr.request_wakeup() != os::SuspendResume::SR_WAKEUP_REQUEST) {
4501     // failed to switch to WAKEUP_REQUEST
4502     ShouldNotReachHere();
4503     return;
4504   }
4505 
4506   while (true) {
4507     if (sr_notify(osthread) == 0) {
4508       if (sr_semaphore.timedwait(2)) {
4509         if (osthread->sr.is_running()) {
4510           return;
4511         }
4512       }
4513     } else {
4514       ShouldNotReachHere();
4515     }
4516   }
4517 
4518   guarantee(osthread->sr.is_running(), "Must be running!");
4519 }
4520 
4521 ///////////////////////////////////////////////////////////////////////////////////
4522 // signal handling (except suspend/resume)
4523 
4524 // This routine may be used by user applications as a "hook" to catch signals.
4525 // The user-defined signal handler must pass unrecognized signals to this
4526 // routine, and if it returns true (non-zero), then the signal handler must
4527 // return immediately.  If the flag "abort_if_unrecognized" is true, then this
4528 // routine will never retun false (zero), but instead will execute a VM panic
4529 // routine kill the process.
4530 //
4531 // If this routine returns false, it is OK to call it again.  This allows
4532 // the user-defined signal handler to perform checks either before or after
4533 // the VM performs its own checks.  Naturally, the user code would be making
4534 // a serious error if it tried to handle an exception (such as a null check
4535 // or breakpoint) that the VM was generating for its own correct operation.
4536 //
4537 // This routine may recognize any of the following kinds of signals:
4538 //    SIGBUS, SIGSEGV, SIGILL, SIGFPE, SIGQUIT, SIGPIPE, SIGXFSZ, SIGUSR1.
4539 // It should be consulted by handlers for any of those signals.
4540 //
4541 // The caller of this routine must pass in the three arguments supplied
4542 // to the function referred to in the "sa_sigaction" (not the "sa_handler")
4543 // field of the structure passed to sigaction().  This routine assumes that
4544 // the sa_flags field passed to sigaction() includes SA_SIGINFO and SA_RESTART.
4545 //
4546 // Note that the VM will print warnings if it detects conflicting signal
4547 // handlers, unless invoked with the option "-XX:+AllowUserSignalHandlers".
4548 //
4549 extern "C" JNIEXPORT int JVM_handle_linux_signal(int signo,
4550                                                  siginfo_t* siginfo,
4551                                                  void* ucontext,
4552                                                  int abort_if_unrecognized);
4553 
4554 static void signalHandler(int sig, siginfo_t* info, void* uc) {
4555   assert(info != NULL && uc != NULL, "it must be old kernel");
4556   int orig_errno = errno;  // Preserve errno value over signal handler.
4557   JVM_handle_linux_signal(sig, info, uc, true);
4558   errno = orig_errno;
4559 }
4560 
4561 
4562 // This boolean allows users to forward their own non-matching signals
4563 // to JVM_handle_linux_signal, harmlessly.
4564 bool os::Linux::signal_handlers_are_installed = false;
4565 
4566 // For signal-chaining
4567 bool os::Linux::libjsig_is_loaded = false;
4568 typedef struct sigaction *(*get_signal_t)(int);
4569 get_signal_t os::Linux::get_signal_action = NULL;
4570 
4571 struct sigaction* os::Linux::get_chained_signal_action(int sig) {
4572   struct sigaction *actp = NULL;
4573 
4574   if (libjsig_is_loaded) {
4575     // Retrieve the old signal handler from libjsig
4576     actp = (*get_signal_action)(sig);
4577   }
4578   if (actp == NULL) {
4579     // Retrieve the preinstalled signal handler from jvm
4580     actp = os::Posix::get_preinstalled_handler(sig);
4581   }
4582 
4583   return actp;
4584 }
4585 
4586 static bool call_chained_handler(struct sigaction *actp, int sig,
4587                                  siginfo_t *siginfo, void *context) {
4588   // Call the old signal handler
4589   if (actp->sa_handler == SIG_DFL) {
4590     // It's more reasonable to let jvm treat it as an unexpected exception
4591     // instead of taking the default action.
4592     return false;
4593   } else if (actp->sa_handler != SIG_IGN) {
4594     if ((actp->sa_flags & SA_NODEFER) == 0) {
4595       // automaticlly block the signal
4596       sigaddset(&(actp->sa_mask), sig);
4597     }
4598 
4599     sa_handler_t hand = NULL;
4600     sa_sigaction_t sa = NULL;
4601     bool siginfo_flag_set = (actp->sa_flags & SA_SIGINFO) != 0;
4602     // retrieve the chained handler
4603     if (siginfo_flag_set) {
4604       sa = actp->sa_sigaction;
4605     } else {
4606       hand = actp->sa_handler;
4607     }
4608 
4609     if ((actp->sa_flags & SA_RESETHAND) != 0) {
4610       actp->sa_handler = SIG_DFL;
4611     }
4612 
4613     // try to honor the signal mask
4614     sigset_t oset;
4615     sigemptyset(&oset);
4616     pthread_sigmask(SIG_SETMASK, &(actp->sa_mask), &oset);
4617 
4618     // call into the chained handler
4619     if (siginfo_flag_set) {
4620       (*sa)(sig, siginfo, context);
4621     } else {
4622       (*hand)(sig);
4623     }
4624 
4625     // restore the signal mask
4626     pthread_sigmask(SIG_SETMASK, &oset, NULL);
4627   }
4628   // Tell jvm's signal handler the signal is taken care of.
4629   return true;
4630 }
4631 
4632 bool os::Linux::chained_handler(int sig, siginfo_t* siginfo, void* context) {
4633   bool chained = false;
4634   // signal-chaining
4635   if (UseSignalChaining) {
4636     struct sigaction *actp = get_chained_signal_action(sig);
4637     if (actp != NULL) {
4638       chained = call_chained_handler(actp, sig, siginfo, context);
4639     }
4640   }
4641   return chained;
4642 }
4643 
4644 // for diagnostic
4645 int sigflags[NSIG];
4646 
4647 int os::Linux::get_our_sigflags(int sig) {
4648   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4649   return sigflags[sig];
4650 }
4651 
4652 void os::Linux::set_our_sigflags(int sig, int flags) {
4653   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4654   if (sig > 0 && sig < NSIG) {
4655     sigflags[sig] = flags;
4656   }
4657 }
4658 
4659 void os::Linux::set_signal_handler(int sig, bool set_installed) {
4660   // Check for overwrite.
4661   struct sigaction oldAct;
4662   sigaction(sig, (struct sigaction*)NULL, &oldAct);
4663 
4664   void* oldhand = oldAct.sa_sigaction
4665                 ? CAST_FROM_FN_PTR(void*,  oldAct.sa_sigaction)
4666                 : CAST_FROM_FN_PTR(void*,  oldAct.sa_handler);
4667   if (oldhand != CAST_FROM_FN_PTR(void*, SIG_DFL) &&
4668       oldhand != CAST_FROM_FN_PTR(void*, SIG_IGN) &&
4669       oldhand != CAST_FROM_FN_PTR(void*, (sa_sigaction_t)signalHandler)) {
4670     if (AllowUserSignalHandlers || !set_installed) {
4671       // Do not overwrite; user takes responsibility to forward to us.
4672       return;
4673     } else if (UseSignalChaining) {
4674       // save the old handler in jvm
4675       os::Posix::save_preinstalled_handler(sig, oldAct);
4676       // libjsig also interposes the sigaction() call below and saves the
4677       // old sigaction on it own.
4678     } else {
4679       fatal("Encountered unexpected pre-existing sigaction handler "
4680             "%#lx for signal %d.", (long)oldhand, sig);
4681     }
4682   }
4683 
4684   struct sigaction sigAct;
4685   sigfillset(&(sigAct.sa_mask));
4686   sigAct.sa_handler = SIG_DFL;
4687   if (!set_installed) {
4688     sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4689   } else {
4690     sigAct.sa_sigaction = signalHandler;
4691     sigAct.sa_flags = SA_SIGINFO|SA_RESTART;
4692   }
4693   // Save flags, which are set by ours
4694   assert(sig > 0 && sig < NSIG, "vm signal out of expected range");
4695   sigflags[sig] = sigAct.sa_flags;
4696 
4697   int ret = sigaction(sig, &sigAct, &oldAct);
4698   assert(ret == 0, "check");
4699 
4700   void* oldhand2  = oldAct.sa_sigaction
4701                   ? CAST_FROM_FN_PTR(void*, oldAct.sa_sigaction)
4702                   : CAST_FROM_FN_PTR(void*, oldAct.sa_handler);
4703   assert(oldhand2 == oldhand, "no concurrent signal handler installation");
4704 }
4705 
4706 // install signal handlers for signals that HotSpot needs to
4707 // handle in order to support Java-level exception handling.
4708 
4709 void os::Linux::install_signal_handlers() {
4710   if (!signal_handlers_are_installed) {
4711     signal_handlers_are_installed = true;
4712 
4713     // signal-chaining
4714     typedef void (*signal_setting_t)();
4715     signal_setting_t begin_signal_setting = NULL;
4716     signal_setting_t end_signal_setting = NULL;
4717     begin_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4718                                           dlsym(RTLD_DEFAULT, "JVM_begin_signal_setting"));
4719     if (begin_signal_setting != NULL) {
4720       end_signal_setting = CAST_TO_FN_PTR(signal_setting_t,
4721                                           dlsym(RTLD_DEFAULT, "JVM_end_signal_setting"));
4722       get_signal_action = CAST_TO_FN_PTR(get_signal_t,
4723                                          dlsym(RTLD_DEFAULT, "JVM_get_signal_action"));
4724       libjsig_is_loaded = true;
4725       assert(UseSignalChaining, "should enable signal-chaining");
4726     }
4727     if (libjsig_is_loaded) {
4728       // Tell libjsig jvm is setting signal handlers
4729       (*begin_signal_setting)();
4730     }
4731 
4732     set_signal_handler(SIGSEGV, true);
4733     set_signal_handler(SIGPIPE, true);
4734     set_signal_handler(SIGBUS, true);
4735     set_signal_handler(SIGILL, true);
4736     set_signal_handler(SIGFPE, true);
4737 #if defined(PPC64)
4738     set_signal_handler(SIGTRAP, true);
4739 #endif
4740     set_signal_handler(SIGXFSZ, true);
4741 
4742     if (libjsig_is_loaded) {
4743       // Tell libjsig jvm finishes setting signal handlers
4744       (*end_signal_setting)();
4745     }
4746 
4747     // We don't activate signal checker if libjsig is in place, we trust ourselves
4748     // and if UserSignalHandler is installed all bets are off.
4749     // Log that signal checking is off only if -verbose:jni is specified.
4750     if (CheckJNICalls) {
4751       if (libjsig_is_loaded) {
4752         if (PrintJNIResolving) {
4753           tty->print_cr("Info: libjsig is activated, all active signal checking is disabled");
4754         }
4755         check_signals = false;
4756       }
4757       if (AllowUserSignalHandlers) {
4758         if (PrintJNIResolving) {
4759           tty->print_cr("Info: AllowUserSignalHandlers is activated, all active signal checking is disabled");
4760         }
4761         check_signals = false;
4762       }
4763     }
4764   }
4765 }
4766 
4767 // This is the fastest way to get thread cpu time on Linux.
4768 // Returns cpu time (user+sys) for any thread, not only for current.
4769 // POSIX compliant clocks are implemented in the kernels 2.6.16+.
4770 // It might work on 2.6.10+ with a special kernel/glibc patch.
4771 // For reference, please, see IEEE Std 1003.1-2004:
4772 //   http://www.unix.org/single_unix_specification
4773 
4774 jlong os::Linux::fast_thread_cpu_time(clockid_t clockid) {
4775   struct timespec tp;
4776   int rc = os::Posix::clock_gettime(clockid, &tp);
4777   assert(rc == 0, "clock_gettime is expected to return 0 code");
4778 
4779   return (tp.tv_sec * NANOSECS_PER_SEC) + tp.tv_nsec;
4780 }
4781 
4782 void os::Linux::initialize_os_info() {
4783   assert(_os_version == 0, "OS info already initialized");
4784 
4785   struct utsname _uname;
4786 
4787   uint32_t major;
4788   uint32_t minor;
4789   uint32_t fix;
4790 
4791   int rc;
4792 
4793   // Kernel version is unknown if
4794   // verification below fails.
4795   _os_version = 0x01000000;
4796 
4797   rc = uname(&_uname);
4798   if (rc != -1) {
4799 
4800     rc = sscanf(_uname.release,"%d.%d.%d", &major, &minor, &fix);
4801     if (rc == 3) {
4802 
4803       if (major < 256 && minor < 256 && fix < 256) {
4804         // Kernel version format is as expected,
4805         // set it overriding unknown state.
4806         _os_version = (major << 16) |
4807                       (minor << 8 ) |
4808                       (fix   << 0 ) ;
4809       }
4810     }
4811   }
4812 }
4813 
4814 uint32_t os::Linux::os_version() {
4815   assert(_os_version != 0, "not initialized");
4816   return _os_version & 0x00FFFFFF;
4817 }
4818 
4819 bool os::Linux::os_version_is_known() {
4820   assert(_os_version != 0, "not initialized");
4821   return _os_version & 0x01000000 ? false : true;
4822 }
4823 
4824 /////
4825 // glibc on Linux platform uses non-documented flag
4826 // to indicate, that some special sort of signal
4827 // trampoline is used.
4828 // We will never set this flag, and we should
4829 // ignore this flag in our diagnostic
4830 #ifdef SIGNIFICANT_SIGNAL_MASK
4831   #undef SIGNIFICANT_SIGNAL_MASK
4832 #endif
4833 #define SIGNIFICANT_SIGNAL_MASK (~0x04000000)
4834 
4835 static const char* get_signal_handler_name(address handler,
4836                                            char* buf, int buflen) {
4837   int offset = 0;
4838   bool found = os::dll_address_to_library_name(handler, buf, buflen, &offset);
4839   if (found) {
4840     // skip directory names
4841     const char *p1, *p2;
4842     p1 = buf;
4843     size_t len = strlen(os::file_separator());
4844     while ((p2 = strstr(p1, os::file_separator())) != NULL) p1 = p2 + len;
4845     jio_snprintf(buf, buflen, "%s+0x%x", p1, offset);
4846   } else {
4847     jio_snprintf(buf, buflen, PTR_FORMAT, handler);
4848   }
4849   return buf;
4850 }
4851 
4852 static void print_signal_handler(outputStream* st, int sig,
4853                                  char* buf, size_t buflen) {
4854   struct sigaction sa;
4855 
4856   sigaction(sig, NULL, &sa);
4857 
4858   // See comment for SIGNIFICANT_SIGNAL_MASK define
4859   sa.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4860 
4861   st->print("%s: ", os::exception_name(sig, buf, buflen));
4862 
4863   address handler = (sa.sa_flags & SA_SIGINFO)
4864     ? CAST_FROM_FN_PTR(address, sa.sa_sigaction)
4865     : CAST_FROM_FN_PTR(address, sa.sa_handler);
4866 
4867   if (handler == CAST_FROM_FN_PTR(address, SIG_DFL)) {
4868     st->print("SIG_DFL");
4869   } else if (handler == CAST_FROM_FN_PTR(address, SIG_IGN)) {
4870     st->print("SIG_IGN");
4871   } else {
4872     st->print("[%s]", get_signal_handler_name(handler, buf, buflen));
4873   }
4874 
4875   st->print(", sa_mask[0]=");
4876   os::Posix::print_signal_set_short(st, &sa.sa_mask);
4877 
4878   address rh = VMError::get_resetted_sighandler(sig);
4879   // May be, handler was resetted by VMError?
4880   if (rh != NULL) {
4881     handler = rh;
4882     sa.sa_flags = VMError::get_resetted_sigflags(sig) & SIGNIFICANT_SIGNAL_MASK;
4883   }
4884 
4885   st->print(", sa_flags=");
4886   os::Posix::print_sa_flags(st, sa.sa_flags);
4887 
4888   // Check: is it our handler?
4889   if (handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler) ||
4890       handler == CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler)) {
4891     // It is our signal handler
4892     // check for flags, reset system-used one!
4893     if ((int)sa.sa_flags != os::Linux::get_our_sigflags(sig)) {
4894       st->print(
4895                 ", flags was changed from " PTR32_FORMAT ", consider using jsig library",
4896                 os::Linux::get_our_sigflags(sig));
4897     }
4898   }
4899   st->cr();
4900 }
4901 
4902 
4903 #define DO_SIGNAL_CHECK(sig)                      \
4904   do {                                            \
4905     if (!sigismember(&check_signal_done, sig)) {  \
4906       os::Linux::check_signal_handler(sig);       \
4907     }                                             \
4908   } while (0)
4909 
4910 // This method is a periodic task to check for misbehaving JNI applications
4911 // under CheckJNI, we can add any periodic checks here
4912 
4913 void os::run_periodic_checks() {
4914   if (check_signals == false) return;
4915 
4916   // SEGV and BUS if overridden could potentially prevent
4917   // generation of hs*.log in the event of a crash, debugging
4918   // such a case can be very challenging, so we absolutely
4919   // check the following for a good measure:
4920   DO_SIGNAL_CHECK(SIGSEGV);
4921   DO_SIGNAL_CHECK(SIGILL);
4922   DO_SIGNAL_CHECK(SIGFPE);
4923   DO_SIGNAL_CHECK(SIGBUS);
4924   DO_SIGNAL_CHECK(SIGPIPE);
4925   DO_SIGNAL_CHECK(SIGXFSZ);
4926 #if defined(PPC64)
4927   DO_SIGNAL_CHECK(SIGTRAP);
4928 #endif
4929 
4930   // ReduceSignalUsage allows the user to override these handlers
4931   // see comments at the very top and jvm_md.h
4932   if (!ReduceSignalUsage) {
4933     DO_SIGNAL_CHECK(SHUTDOWN1_SIGNAL);
4934     DO_SIGNAL_CHECK(SHUTDOWN2_SIGNAL);
4935     DO_SIGNAL_CHECK(SHUTDOWN3_SIGNAL);
4936     DO_SIGNAL_CHECK(BREAK_SIGNAL);
4937   }
4938 
4939   DO_SIGNAL_CHECK(SR_signum);
4940 }
4941 
4942 typedef int (*os_sigaction_t)(int, const struct sigaction *, struct sigaction *);
4943 
4944 static os_sigaction_t os_sigaction = NULL;
4945 
4946 void os::Linux::check_signal_handler(int sig) {
4947   char buf[O_BUFLEN];
4948   address jvmHandler = NULL;
4949 
4950 
4951   struct sigaction act;
4952   if (os_sigaction == NULL) {
4953     // only trust the default sigaction, in case it has been interposed
4954     os_sigaction = (os_sigaction_t)dlsym(RTLD_DEFAULT, "sigaction");
4955     if (os_sigaction == NULL) return;
4956   }
4957 
4958   os_sigaction(sig, (struct sigaction*)NULL, &act);
4959 
4960 
4961   act.sa_flags &= SIGNIFICANT_SIGNAL_MASK;
4962 
4963   address thisHandler = (act.sa_flags & SA_SIGINFO)
4964     ? CAST_FROM_FN_PTR(address, act.sa_sigaction)
4965     : CAST_FROM_FN_PTR(address, act.sa_handler);
4966 
4967 
4968   switch (sig) {
4969   case SIGSEGV:
4970   case SIGBUS:
4971   case SIGFPE:
4972   case SIGPIPE:
4973   case SIGILL:
4974   case SIGXFSZ:
4975     jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)signalHandler);
4976     break;
4977 
4978   case SHUTDOWN1_SIGNAL:
4979   case SHUTDOWN2_SIGNAL:
4980   case SHUTDOWN3_SIGNAL:
4981   case BREAK_SIGNAL:
4982     jvmHandler = (address)user_handler();
4983     break;
4984 
4985   default:
4986     if (sig == SR_signum) {
4987       jvmHandler = CAST_FROM_FN_PTR(address, (sa_sigaction_t)SR_handler);
4988     } else {
4989       return;
4990     }
4991     break;
4992   }
4993 
4994   if (thisHandler != jvmHandler) {
4995     tty->print("Warning: %s handler ", exception_name(sig, buf, O_BUFLEN));
4996     tty->print("expected:%s", get_signal_handler_name(jvmHandler, buf, O_BUFLEN));
4997     tty->print_cr("  found:%s", get_signal_handler_name(thisHandler, buf, O_BUFLEN));
4998     // No need to check this sig any longer
4999     sigaddset(&check_signal_done, sig);
5000     // Running under non-interactive shell, SHUTDOWN2_SIGNAL will be reassigned SIG_IGN
5001     if (sig == SHUTDOWN2_SIGNAL && !isatty(fileno(stdin))) {
5002       tty->print_cr("Running in non-interactive shell, %s handler is replaced by shell",
5003                     exception_name(sig, buf, O_BUFLEN));
5004     }
5005   } else if(os::Linux::get_our_sigflags(sig) != 0 && (int)act.sa_flags != os::Linux::get_our_sigflags(sig)) {
5006     tty->print("Warning: %s handler flags ", exception_name(sig, buf, O_BUFLEN));
5007     tty->print("expected:");
5008     os::Posix::print_sa_flags(tty, os::Linux::get_our_sigflags(sig));
5009     tty->cr();
5010     tty->print("  found:");
5011     os::Posix::print_sa_flags(tty, act.sa_flags);
5012     tty->cr();
5013     // No need to check this sig any longer
5014     sigaddset(&check_signal_done, sig);
5015   }
5016 
5017   // Dump all the signal
5018   if (sigismember(&check_signal_done, sig)) {
5019     print_signal_handlers(tty, buf, O_BUFLEN);
5020   }
5021 }
5022 
5023 extern void report_error(char* file_name, int line_no, char* title,
5024                          char* format, ...);
5025 
5026 // Some linux distributions (notably: Alpine Linux) include the
5027 // grsecurity in the kernel by default. Of particular interest from a
5028 // JVM perspective is PaX (https://pax.grsecurity.net/), which adds
5029 // some security features related to page attributes. Specifically,
5030 // the MPROTECT PaX functionality
5031 // (https://pax.grsecurity.net/docs/mprotect.txt) prevents dynamic
5032 // code generation by disallowing a (previously) writable page to be
5033 // marked as executable. This is, of course, exactly what HotSpot does
5034 // for both JIT compiled method, as well as for stubs, adapters, etc.
5035 //
5036 // Instead of crashing "lazily" when trying to make a page executable,
5037 // this code probes for the presence of PaX and reports the failure
5038 // eagerly.
5039 static void check_pax(void) {
5040   // Zero doesn't generate code dynamically, so no need to perform the PaX check
5041 #ifndef ZERO
5042   size_t size = os::Linux::page_size();
5043 
5044   void* p = ::mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
5045   if (p == MAP_FAILED) {
5046     vm_exit_out_of_memory(size, OOM_MMAP_ERROR, "failed to allocate memory for PaX check.");
5047   }
5048 
5049   int res = ::mprotect(p, size, PROT_WRITE|PROT_EXEC);
5050   if (res == -1) {
5051     vm_exit_during_initialization("Failed to mark memory page as executable",
5052                                   "Please check if grsecurity/PaX is enabled in your kernel.\n"
5053                                   "\n"
5054                                   "For example, you can do this by running (note: you may need root privileges):\n"
5055                                   "\n"
5056                                   "    sysctl kernel.pax.softmode\n"
5057                                   "\n"
5058                                   "If PaX is included in the kernel you will see something like this:\n"
5059                                   "\n"
5060                                   "    kernel.pax.softmode = 0\n"
5061                                   "\n"
5062                                   "In particular, if the value is 0 (zero), then PaX is enabled.\n"
5063                                   "\n"
5064                                   "PaX includes security functionality which interferes with the dynamic code\n"
5065                                   "generation the JVM relies on. Specifically, the MPROTECT functionality as\n"
5066                                   "described on https://pax.grsecurity.net/docs/mprotect.txt is not compatible\n"
5067                                   "with the JVM. If you want to allow the JVM to run you will have to disable PaX.\n"
5068                                   "You can do this on a per-executable basis using the paxctl tool, for example:\n"
5069                                   "\n"
5070                                   "    paxctl -cm bin/java\n"
5071                                   "\n"
5072                                   "Please note that this modifies the executable binary in-place, so you may want\n"
5073                                   "to make a backup of it first. Also note that you have to repeat this for other\n"
5074                                   "executables like javac, jar, jcmd, etc.\n"
5075                                   );
5076 
5077   }
5078 
5079   ::munmap(p, size);
5080 #endif
5081 }
5082 
5083 // this is called _before_ most of the global arguments have been parsed
5084 void os::init(void) {
5085   char dummy;   // used to get a guess on initial stack address
5086 
5087   clock_tics_per_sec = sysconf(_SC_CLK_TCK);
5088 
5089   init_random(1234567);
5090 
5091   Linux::set_page_size(sysconf(_SC_PAGESIZE));
5092   if (Linux::page_size() == -1) {
5093     fatal("os_linux.cpp: os::init: sysconf failed (%s)",
5094           os::strerror(errno));
5095   }
5096   init_page_sizes((size_t) Linux::page_size());
5097 
5098   Linux::initialize_system_info();
5099 
5100   Linux::initialize_os_info();
5101 
5102   os::Linux::CPUPerfTicks pticks;
5103   bool res = os::Linux::get_tick_information(&pticks, -1);
5104 
5105   if (res && pticks.has_steal_ticks) {
5106     has_initial_tick_info = true;
5107     initial_total_ticks = pticks.total;
5108     initial_steal_ticks = pticks.steal;
5109   }
5110 
5111   // _main_thread points to the thread that created/loaded the JVM.
5112   Linux::_main_thread = pthread_self();
5113 
5114   // retrieve entry point for pthread_setname_np
5115   Linux::_pthread_setname_np =
5116     (int(*)(pthread_t, const char*))dlsym(RTLD_DEFAULT, "pthread_setname_np");
5117 
5118   check_pax();
5119 
5120   os::Posix::init();
5121 
5122   initial_time_count = javaTimeNanos();
5123 
5124   // Always warn if no monotonic clock available
5125   if (!os::Posix::supports_monotonic_clock()) {
5126     warning("No monotonic clock was available - timed services may "    \
5127             "be adversely affected if the time-of-day clock changes");
5128   }
5129 }
5130 
5131 // To install functions for atexit system call
5132 extern "C" {
5133   static void perfMemory_exit_helper() {
5134     perfMemory_exit();
5135   }
5136 }
5137 
5138 void os::pd_init_container_support() {
5139   OSContainer::init();
5140 }
5141 
5142 void os::Linux::numa_init() {
5143 
5144   // Java can be invoked as
5145   // 1. Without numactl and heap will be allocated/configured on all nodes as
5146   //    per the system policy.
5147   // 2. With numactl --interleave:
5148   //      Use numa_get_interleave_mask(v2) API to get nodes bitmask. The same
5149   //      API for membind case bitmask is reset.
5150   //      Interleave is only hint and Kernel can fallback to other nodes if
5151   //      no memory is available on the target nodes.
5152   // 3. With numactl --membind:
5153   //      Use numa_get_membind(v2) API to get nodes bitmask. The same API for
5154   //      interleave case returns bitmask of all nodes.
5155   // numa_all_nodes_ptr holds bitmask of all nodes.
5156   // numa_get_interleave_mask(v2) and numa_get_membind(v2) APIs returns correct
5157   // bitmask when externally configured to run on all or fewer nodes.
5158 
5159   if (!Linux::libnuma_init()) {
5160     UseNUMA = false;
5161   } else {
5162     if ((Linux::numa_max_node() < 1) || Linux::is_bound_to_single_node()) {
5163       // If there's only one node (they start from 0) or if the process
5164       // is bound explicitly to a single node using membind, disable NUMA.
5165       UseNUMA = false;
5166     } else {
5167 
5168       LogTarget(Info,os) log;
5169       LogStream ls(log);
5170 
5171       Linux::set_configured_numa_policy(Linux::identify_numa_policy());
5172 
5173       struct bitmask* bmp = Linux::_numa_membind_bitmask;
5174       const char* numa_mode = "membind";
5175 
5176       if (Linux::is_running_in_interleave_mode()) {
5177         bmp = Linux::_numa_interleave_bitmask;
5178         numa_mode = "interleave";
5179       }
5180 
5181       ls.print("UseNUMA is enabled and invoked in '%s' mode."
5182                " Heap will be configured using NUMA memory nodes:", numa_mode);
5183 
5184       for (int node = 0; node <= Linux::numa_max_node(); node++) {
5185         if (Linux::_numa_bitmask_isbitset(bmp, node)) {
5186           ls.print(" %d", node);
5187         }
5188       }
5189     }
5190   }
5191 
5192   if (UseParallelGC && UseNUMA && UseLargePages && !can_commit_large_page_memory()) {
5193     // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way
5194     // we can make the adaptive lgrp chunk resizing work. If the user specified both
5195     // UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn
5196     // and disable adaptive resizing.
5197     if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) {
5198       warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, "
5199               "disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)");
5200       UseAdaptiveSizePolicy = false;
5201       UseAdaptiveNUMAChunkSizing = false;
5202     }
5203   }
5204 
5205   if (!UseNUMA && ForceNUMA) {
5206     UseNUMA = true;
5207   }
5208 }
5209 
5210 // this is called _after_ the global arguments have been parsed
5211 jint os::init_2(void) {
5212 
5213   // This could be set after os::Posix::init() but all platforms
5214   // have to set it the same so we have to mirror Solaris.
5215   DEBUG_ONLY(os::set_mutex_init_done();)
5216 
5217   os::Posix::init_2();
5218 
5219   Linux::fast_thread_clock_init();
5220 
5221   // initialize suspend/resume support - must do this before signal_sets_init()
5222   if (SR_initialize() != 0) {
5223     perror("SR_initialize failed");
5224     return JNI_ERR;
5225   }
5226 
5227   Linux::signal_sets_init();
5228   Linux::install_signal_handlers();
5229   // Initialize data for jdk.internal.misc.Signal
5230   if (!ReduceSignalUsage) {
5231     jdk_misc_signal_init();
5232   }
5233 
5234   // Check and sets minimum stack sizes against command line options
5235   if (Posix::set_minimum_stack_sizes() == JNI_ERR) {
5236     return JNI_ERR;
5237   }
5238 
5239 #if defined(IA32)
5240   // Need to ensure we've determined the process's initial stack to
5241   // perform the workaround
5242   Linux::capture_initial_stack(JavaThread::stack_size_at_create());
5243   workaround_expand_exec_shield_cs_limit();
5244 #else
5245   suppress_primordial_thread_resolution = Arguments::created_by_java_launcher();
5246   if (!suppress_primordial_thread_resolution) {
5247     Linux::capture_initial_stack(JavaThread::stack_size_at_create());
5248   }
5249 #endif
5250 
5251   Linux::libpthread_init();
5252   Linux::sched_getcpu_init();
5253   log_info(os)("HotSpot is running with %s, %s",
5254                Linux::glibc_version(), Linux::libpthread_version());
5255 
5256   if (UseNUMA) {
5257     Linux::numa_init();
5258   }
5259 
5260   if (MaxFDLimit) {
5261     // set the number of file descriptors to max. print out error
5262     // if getrlimit/setrlimit fails but continue regardless.
5263     struct rlimit nbr_files;
5264     int status = getrlimit(RLIMIT_NOFILE, &nbr_files);
5265     if (status != 0) {
5266       log_info(os)("os::init_2 getrlimit failed: %s", os::strerror(errno));
5267     } else {
5268       nbr_files.rlim_cur = nbr_files.rlim_max;
5269       status = setrlimit(RLIMIT_NOFILE, &nbr_files);
5270       if (status != 0) {
5271         log_info(os)("os::init_2 setrlimit failed: %s", os::strerror(errno));
5272       }
5273     }
5274   }
5275 
5276   // Initialize lock used to serialize thread creation (see os::create_thread)
5277   Linux::set_createThread_lock(new Mutex(Mutex::leaf, "createThread_lock", false));
5278 
5279   // at-exit methods are called in the reverse order of their registration.
5280   // atexit functions are called on return from main or as a result of a
5281   // call to exit(3C). There can be only 32 of these functions registered
5282   // and atexit() does not set errno.
5283 
5284   if (PerfAllowAtExitRegistration) {
5285     // only register atexit functions if PerfAllowAtExitRegistration is set.
5286     // atexit functions can be delayed until process exit time, which
5287     // can be problematic for embedded VM situations. Embedded VMs should
5288     // call DestroyJavaVM() to assure that VM resources are released.
5289 
5290     // note: perfMemory_exit_helper atexit function may be removed in
5291     // the future if the appropriate cleanup code can be added to the
5292     // VM_Exit VMOperation's doit method.
5293     if (atexit(perfMemory_exit_helper) != 0) {
5294       warning("os::init_2 atexit(perfMemory_exit_helper) failed");
5295     }
5296   }
5297 
5298   // initialize thread priority policy
5299   prio_init();
5300 
5301   if (!FLAG_IS_DEFAULT(AllocateHeapAt) || !FLAG_IS_DEFAULT(AllocateOldGenAt)) {
5302     set_coredump_filter(DAX_SHARED_BIT);
5303   }
5304 
5305   if (DumpPrivateMappingsInCore) {
5306     set_coredump_filter(FILE_BACKED_PVT_BIT);
5307   }
5308 
5309   if (DumpSharedMappingsInCore) {
5310     set_coredump_filter(FILE_BACKED_SHARED_BIT);
5311   }
5312 
5313   return JNI_OK;
5314 }
5315 
5316 // Mark the polling page as unreadable
5317 void os::make_polling_page_unreadable(void) {
5318   if (!guard_memory((char*)_polling_page, Linux::page_size())) {
5319     fatal("Could not disable polling page");
5320   }
5321 }
5322 
5323 // Mark the polling page as readable
5324 void os::make_polling_page_readable(void) {
5325   if (!linux_mprotect((char *)_polling_page, Linux::page_size(), PROT_READ)) {
5326     fatal("Could not enable polling page");
5327   }
5328 }
5329 
5330 // older glibc versions don't have this macro (which expands to
5331 // an optimized bit-counting function) so we have to roll our own
5332 #ifndef CPU_COUNT
5333 
5334 static int _cpu_count(const cpu_set_t* cpus) {
5335   int count = 0;
5336   // only look up to the number of configured processors
5337   for (int i = 0; i < os::processor_count(); i++) {
5338     if (CPU_ISSET(i, cpus)) {
5339       count++;
5340     }
5341   }
5342   return count;
5343 }
5344 
5345 #define CPU_COUNT(cpus) _cpu_count(cpus)
5346 
5347 #endif // CPU_COUNT
5348 
5349 // Get the current number of available processors for this process.
5350 // This value can change at any time during a process's lifetime.
5351 // sched_getaffinity gives an accurate answer as it accounts for cpusets.
5352 // If it appears there may be more than 1024 processors then we do a
5353 // dynamic check - see 6515172 for details.
5354 // If anything goes wrong we fallback to returning the number of online
5355 // processors - which can be greater than the number available to the process.
5356 int os::Linux::active_processor_count() {
5357   cpu_set_t cpus;  // can represent at most 1024 (CPU_SETSIZE) processors
5358   cpu_set_t* cpus_p = &cpus;
5359   int cpus_size = sizeof(cpu_set_t);
5360 
5361   int configured_cpus = os::processor_count();  // upper bound on available cpus
5362   int cpu_count = 0;
5363 
5364 // old build platforms may not support dynamic cpu sets
5365 #ifdef CPU_ALLOC
5366 
5367   // To enable easy testing of the dynamic path on different platforms we
5368   // introduce a diagnostic flag: UseCpuAllocPath
5369   if (configured_cpus >= CPU_SETSIZE || UseCpuAllocPath) {
5370     // kernel may use a mask bigger than cpu_set_t
5371     log_trace(os)("active_processor_count: using dynamic path %s"
5372                   "- configured processors: %d",
5373                   UseCpuAllocPath ? "(forced) " : "",
5374                   configured_cpus);
5375     cpus_p = CPU_ALLOC(configured_cpus);
5376     if (cpus_p != NULL) {
5377       cpus_size = CPU_ALLOC_SIZE(configured_cpus);
5378       // zero it just to be safe
5379       CPU_ZERO_S(cpus_size, cpus_p);
5380     }
5381     else {
5382        // failed to allocate so fallback to online cpus
5383        int online_cpus = ::sysconf(_SC_NPROCESSORS_ONLN);
5384        log_trace(os)("active_processor_count: "
5385                      "CPU_ALLOC failed (%s) - using "
5386                      "online processor count: %d",
5387                      os::strerror(errno), online_cpus);
5388        return online_cpus;
5389     }
5390   }
5391   else {
5392     log_trace(os)("active_processor_count: using static path - configured processors: %d",
5393                   configured_cpus);
5394   }
5395 #else // CPU_ALLOC
5396 // these stubs won't be executed
5397 #define CPU_COUNT_S(size, cpus) -1
5398 #define CPU_FREE(cpus)
5399 
5400   log_trace(os)("active_processor_count: only static path available - configured processors: %d",
5401                 configured_cpus);
5402 #endif // CPU_ALLOC
5403 
5404   // pid 0 means the current thread - which we have to assume represents the process
5405   if (sched_getaffinity(0, cpus_size, cpus_p) == 0) {
5406     if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
5407       cpu_count = CPU_COUNT_S(cpus_size, cpus_p);
5408     }
5409     else {
5410       cpu_count = CPU_COUNT(cpus_p);
5411     }
5412     log_trace(os)("active_processor_count: sched_getaffinity processor count: %d", cpu_count);
5413   }
5414   else {
5415     cpu_count = ::sysconf(_SC_NPROCESSORS_ONLN);
5416     warning("sched_getaffinity failed (%s)- using online processor count (%d) "
5417             "which may exceed available processors", os::strerror(errno), cpu_count);
5418   }
5419 
5420   if (cpus_p != &cpus) { // can only be true when CPU_ALLOC used
5421     CPU_FREE(cpus_p);
5422   }
5423 
5424   assert(cpu_count > 0 && cpu_count <= os::processor_count(), "sanity check");
5425   return cpu_count;
5426 }
5427 
5428 // Determine the active processor count from one of
5429 // three different sources:
5430 //
5431 // 1. User option -XX:ActiveProcessorCount
5432 // 2. kernel os calls (sched_getaffinity or sysconf(_SC_NPROCESSORS_ONLN)
5433 // 3. extracted from cgroup cpu subsystem (shares and quotas)
5434 //
5435 // Option 1, if specified, will always override.
5436 // If the cgroup subsystem is active and configured, we
5437 // will return the min of the cgroup and option 2 results.
5438 // This is required since tools, such as numactl, that
5439 // alter cpu affinity do not update cgroup subsystem
5440 // cpuset configuration files.
5441 int os::active_processor_count() {
5442   // User has overridden the number of active processors
5443   if (ActiveProcessorCount > 0) {
5444     log_trace(os)("active_processor_count: "
5445                   "active processor count set by user : %d",
5446                   ActiveProcessorCount);
5447     return ActiveProcessorCount;
5448   }
5449 
5450   int active_cpus;
5451   if (OSContainer::is_containerized()) {
5452     active_cpus = OSContainer::active_processor_count();
5453     log_trace(os)("active_processor_count: determined by OSContainer: %d",
5454                    active_cpus);
5455   } else {
5456     active_cpus = os::Linux::active_processor_count();
5457   }
5458 
5459   return active_cpus;
5460 }
5461 
5462 uint os::processor_id() {
5463   const int id = Linux::sched_getcpu();
5464   assert(id >= 0 && id < _processor_count, "Invalid processor id");
5465   return (uint)id;
5466 }
5467 
5468 void os::set_native_thread_name(const char *name) {
5469   if (Linux::_pthread_setname_np) {
5470     char buf [16]; // according to glibc manpage, 16 chars incl. '/0'
5471     snprintf(buf, sizeof(buf), "%s", name);
5472     buf[sizeof(buf) - 1] = '\0';
5473     const int rc = Linux::_pthread_setname_np(pthread_self(), buf);
5474     // ERANGE should not happen; all other errors should just be ignored.
5475     assert(rc != ERANGE, "pthread_setname_np failed");
5476   }
5477 }
5478 
5479 bool os::distribute_processes(uint length, uint* distribution) {
5480   // Not yet implemented.
5481   return false;
5482 }
5483 
5484 bool os::bind_to_processor(uint processor_id) {
5485   // Not yet implemented.
5486   return false;
5487 }
5488 
5489 ///
5490 
5491 void os::SuspendedThreadTask::internal_do_task() {
5492   if (do_suspend(_thread->osthread())) {
5493     SuspendedThreadTaskContext context(_thread, _thread->osthread()->ucontext());
5494     do_task(context);
5495     do_resume(_thread->osthread());
5496   }
5497 }
5498 
5499 ////////////////////////////////////////////////////////////////////////////////
5500 // debug support
5501 
5502 bool os::find(address addr, outputStream* st) {
5503   Dl_info dlinfo;
5504   memset(&dlinfo, 0, sizeof(dlinfo));
5505   if (dladdr(addr, &dlinfo) != 0) {
5506     st->print(PTR_FORMAT ": ", p2i(addr));
5507     if (dlinfo.dli_sname != NULL && dlinfo.dli_saddr != NULL) {
5508       st->print("%s+" PTR_FORMAT, dlinfo.dli_sname,
5509                 p2i(addr) - p2i(dlinfo.dli_saddr));
5510     } else if (dlinfo.dli_fbase != NULL) {
5511       st->print("<offset " PTR_FORMAT ">", p2i(addr) - p2i(dlinfo.dli_fbase));
5512     } else {
5513       st->print("<absolute address>");
5514     }
5515     if (dlinfo.dli_fname != NULL) {
5516       st->print(" in %s", dlinfo.dli_fname);
5517     }
5518     if (dlinfo.dli_fbase != NULL) {
5519       st->print(" at " PTR_FORMAT, p2i(dlinfo.dli_fbase));
5520     }
5521     st->cr();
5522 
5523     if (Verbose) {
5524       // decode some bytes around the PC
5525       address begin = clamp_address_in_page(addr-40, addr, os::vm_page_size());
5526       address end   = clamp_address_in_page(addr+40, addr, os::vm_page_size());
5527       address       lowest = (address) dlinfo.dli_sname;
5528       if (!lowest)  lowest = (address) dlinfo.dli_fbase;
5529       if (begin < lowest)  begin = lowest;
5530       Dl_info dlinfo2;
5531       if (dladdr(end, &dlinfo2) != 0 && dlinfo2.dli_saddr != dlinfo.dli_saddr
5532           && end > dlinfo2.dli_saddr && dlinfo2.dli_saddr > begin) {
5533         end = (address) dlinfo2.dli_saddr;
5534       }
5535       Disassembler::decode(begin, end, st);
5536     }
5537     return true;
5538   }
5539   return false;
5540 }
5541 
5542 ////////////////////////////////////////////////////////////////////////////////
5543 // misc
5544 
5545 // This does not do anything on Linux. This is basically a hook for being
5546 // able to use structured exception handling (thread-local exception filters)
5547 // on, e.g., Win32.
5548 void
5549 os::os_exception_wrapper(java_call_t f, JavaValue* value, const methodHandle& method,
5550                          JavaCallArguments* args, Thread* thread) {
5551   f(value, method, args, thread);
5552 }
5553 
5554 void os::print_statistics() {
5555 }
5556 
5557 bool os::message_box(const char* title, const char* message) {
5558   int i;
5559   fdStream err(defaultStream::error_fd());
5560   for (i = 0; i < 78; i++) err.print_raw("=");
5561   err.cr();
5562   err.print_raw_cr(title);
5563   for (i = 0; i < 78; i++) err.print_raw("-");
5564   err.cr();
5565   err.print_raw_cr(message);
5566   for (i = 0; i < 78; i++) err.print_raw("=");
5567   err.cr();
5568 
5569   char buf[16];
5570   // Prevent process from exiting upon "read error" without consuming all CPU
5571   while (::read(0, buf, sizeof(buf)) <= 0) { ::sleep(100); }
5572 
5573   return buf[0] == 'y' || buf[0] == 'Y';
5574 }
5575 
5576 // Is a (classpath) directory empty?
5577 bool os::dir_is_empty(const char* path) {
5578   DIR *dir = NULL;
5579   struct dirent *ptr;
5580 
5581   dir = opendir(path);
5582   if (dir == NULL) return true;
5583 
5584   // Scan the directory
5585   bool result = true;
5586   while (result && (ptr = readdir(dir)) != NULL) {
5587     if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
5588       result = false;
5589     }
5590   }
5591   closedir(dir);
5592   return result;
5593 }
5594 
5595 // This code originates from JDK's sysOpen and open64_w
5596 // from src/solaris/hpi/src/system_md.c
5597 
5598 int os::open(const char *path, int oflag, int mode) {
5599   if (strlen(path) > MAX_PATH - 1) {
5600     errno = ENAMETOOLONG;
5601     return -1;
5602   }
5603 
5604   // All file descriptors that are opened in the Java process and not
5605   // specifically destined for a subprocess should have the close-on-exec
5606   // flag set.  If we don't set it, then careless 3rd party native code
5607   // might fork and exec without closing all appropriate file descriptors
5608   // (e.g. as we do in closeDescriptors in UNIXProcess.c), and this in
5609   // turn might:
5610   //
5611   // - cause end-of-file to fail to be detected on some file
5612   //   descriptors, resulting in mysterious hangs, or
5613   //
5614   // - might cause an fopen in the subprocess to fail on a system
5615   //   suffering from bug 1085341.
5616   //
5617   // (Yes, the default setting of the close-on-exec flag is a Unix
5618   // design flaw)
5619   //
5620   // See:
5621   // 1085341: 32-bit stdio routines should support file descriptors >255
5622   // 4843136: (process) pipe file descriptor from Runtime.exec not being closed
5623   // 6339493: (process) Runtime.exec does not close all file descriptors on Solaris 9
5624   //
5625   // Modern Linux kernels (after 2.6.23 2007) support O_CLOEXEC with open().
5626   // O_CLOEXEC is preferable to using FD_CLOEXEC on an open file descriptor
5627   // because it saves a system call and removes a small window where the flag
5628   // is unset.  On ancient Linux kernels the O_CLOEXEC flag will be ignored
5629   // and we fall back to using FD_CLOEXEC (see below).
5630 #ifdef O_CLOEXEC
5631   oflag |= O_CLOEXEC;
5632 #endif
5633 
5634   int fd = ::open64(path, oflag, mode);
5635   if (fd == -1) return -1;
5636 
5637   //If the open succeeded, the file might still be a directory
5638   {
5639     struct stat64 buf64;
5640     int ret = ::fstat64(fd, &buf64);
5641     int st_mode = buf64.st_mode;
5642 
5643     if (ret != -1) {
5644       if ((st_mode & S_IFMT) == S_IFDIR) {
5645         errno = EISDIR;
5646         ::close(fd);
5647         return -1;
5648       }
5649     } else {
5650       ::close(fd);
5651       return -1;
5652     }
5653   }
5654 
5655 #ifdef FD_CLOEXEC
5656   // Validate that the use of the O_CLOEXEC flag on open above worked.
5657   // With recent kernels, we will perform this check exactly once.
5658   static sig_atomic_t O_CLOEXEC_is_known_to_work = 0;
5659   if (!O_CLOEXEC_is_known_to_work) {
5660     int flags = ::fcntl(fd, F_GETFD);
5661     if (flags != -1) {
5662       if ((flags & FD_CLOEXEC) != 0)
5663         O_CLOEXEC_is_known_to_work = 1;
5664       else
5665         ::fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
5666     }
5667   }
5668 #endif
5669 
5670   return fd;
5671 }
5672 
5673 
5674 // create binary file, rewriting existing file if required
5675 int os::create_binary_file(const char* path, bool rewrite_existing) {
5676   int oflags = O_WRONLY | O_CREAT;
5677   if (!rewrite_existing) {
5678     oflags |= O_EXCL;
5679   }
5680   return ::open64(path, oflags, S_IREAD | S_IWRITE);
5681 }
5682 
5683 // return current position of file pointer
5684 jlong os::current_file_offset(int fd) {
5685   return (jlong)::lseek64(fd, (off64_t)0, SEEK_CUR);
5686 }
5687 
5688 // move file pointer to the specified offset
5689 jlong os::seek_to_file_offset(int fd, jlong offset) {
5690   return (jlong)::lseek64(fd, (off64_t)offset, SEEK_SET);
5691 }
5692 
5693 // This code originates from JDK's sysAvailable
5694 // from src/solaris/hpi/src/native_threads/src/sys_api_td.c
5695 
5696 int os::available(int fd, jlong *bytes) {
5697   jlong cur, end;
5698   int mode;
5699   struct stat64 buf64;
5700 
5701   if (::fstat64(fd, &buf64) >= 0) {
5702     mode = buf64.st_mode;
5703     if (S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) {
5704       int n;
5705       if (::ioctl(fd, FIONREAD, &n) >= 0) {
5706         *bytes = n;
5707         return 1;
5708       }
5709     }
5710   }
5711   if ((cur = ::lseek64(fd, 0L, SEEK_CUR)) == -1) {
5712     return 0;
5713   } else if ((end = ::lseek64(fd, 0L, SEEK_END)) == -1) {
5714     return 0;
5715   } else if (::lseek64(fd, cur, SEEK_SET) == -1) {
5716     return 0;
5717   }
5718   *bytes = end - cur;
5719   return 1;
5720 }
5721 
5722 // Map a block of memory.
5723 char* os::pd_map_memory(int fd, const char* file_name, size_t file_offset,
5724                         char *addr, size_t bytes, bool read_only,
5725                         bool allow_exec) {
5726   int prot;
5727   int flags = MAP_PRIVATE;
5728 
5729   if (read_only) {
5730     prot = PROT_READ;
5731   } else {
5732     prot = PROT_READ | PROT_WRITE;
5733   }
5734 
5735   if (allow_exec) {
5736     prot |= PROT_EXEC;
5737   }
5738 
5739   if (addr != NULL) {
5740     flags |= MAP_FIXED;
5741   }
5742 
5743   char* mapped_address = (char*)mmap(addr, (size_t)bytes, prot, flags,
5744                                      fd, file_offset);
5745   if (mapped_address == MAP_FAILED) {
5746     return NULL;
5747   }
5748   return mapped_address;
5749 }
5750 
5751 
5752 // Remap a block of memory.
5753 char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset,
5754                           char *addr, size_t bytes, bool read_only,
5755                           bool allow_exec) {
5756   // same as map_memory() on this OS
5757   return os::map_memory(fd, file_name, file_offset, addr, bytes, read_only,
5758                         allow_exec);
5759 }
5760 
5761 
5762 // Unmap a block of memory.
5763 bool os::pd_unmap_memory(char* addr, size_t bytes) {
5764   return munmap(addr, bytes) == 0;
5765 }
5766 
5767 static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time);
5768 
5769 static jlong fast_cpu_time(Thread *thread) {
5770     clockid_t clockid;
5771     int rc = os::Linux::pthread_getcpuclockid(thread->osthread()->pthread_id(),
5772                                               &clockid);
5773     if (rc == 0) {
5774       return os::Linux::fast_thread_cpu_time(clockid);
5775     } else {
5776       // It's possible to encounter a terminated native thread that failed
5777       // to detach itself from the VM - which should result in ESRCH.
5778       assert_status(rc == ESRCH, rc, "pthread_getcpuclockid failed");
5779       return -1;
5780     }
5781 }
5782 
5783 // current_thread_cpu_time(bool) and thread_cpu_time(Thread*, bool)
5784 // are used by JVM M&M and JVMTI to get user+sys or user CPU time
5785 // of a thread.
5786 //
5787 // current_thread_cpu_time() and thread_cpu_time(Thread*) returns
5788 // the fast estimate available on the platform.
5789 
5790 jlong os::current_thread_cpu_time() {
5791   if (os::Linux::supports_fast_thread_cpu_time()) {
5792     return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5793   } else {
5794     // return user + sys since the cost is the same
5795     return slow_thread_cpu_time(Thread::current(), true /* user + sys */);
5796   }
5797 }
5798 
5799 jlong os::thread_cpu_time(Thread* thread) {
5800   // consistent with what current_thread_cpu_time() returns
5801   if (os::Linux::supports_fast_thread_cpu_time()) {
5802     return fast_cpu_time(thread);
5803   } else {
5804     return slow_thread_cpu_time(thread, true /* user + sys */);
5805   }
5806 }
5807 
5808 jlong os::current_thread_cpu_time(bool user_sys_cpu_time) {
5809   if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5810     return os::Linux::fast_thread_cpu_time(CLOCK_THREAD_CPUTIME_ID);
5811   } else {
5812     return slow_thread_cpu_time(Thread::current(), user_sys_cpu_time);
5813   }
5814 }
5815 
5816 jlong os::thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5817   if (user_sys_cpu_time && os::Linux::supports_fast_thread_cpu_time()) {
5818     return fast_cpu_time(thread);
5819   } else {
5820     return slow_thread_cpu_time(thread, user_sys_cpu_time);
5821   }
5822 }
5823 
5824 //  -1 on error.
5825 static jlong slow_thread_cpu_time(Thread *thread, bool user_sys_cpu_time) {
5826   pid_t  tid = thread->osthread()->thread_id();
5827   char *s;
5828   char stat[2048];
5829   int statlen;
5830   char proc_name[64];
5831   int count;
5832   long sys_time, user_time;
5833   char cdummy;
5834   int idummy;
5835   long ldummy;
5836   FILE *fp;
5837 
5838   snprintf(proc_name, 64, "/proc/self/task/%d/stat", tid);
5839   fp = fopen(proc_name, "r");
5840   if (fp == NULL) return -1;
5841   statlen = fread(stat, 1, 2047, fp);
5842   stat[statlen] = '\0';
5843   fclose(fp);
5844 
5845   // Skip pid and the command string. Note that we could be dealing with
5846   // weird command names, e.g. user could decide to rename java launcher
5847   // to "java 1.4.2 :)", then the stat file would look like
5848   //                1234 (java 1.4.2 :)) R ... ...
5849   // We don't really need to know the command string, just find the last
5850   // occurrence of ")" and then start parsing from there. See bug 4726580.
5851   s = strrchr(stat, ')');
5852   if (s == NULL) return -1;
5853 
5854   // Skip blank chars
5855   do { s++; } while (s && isspace(*s));
5856 
5857   count = sscanf(s,"%c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
5858                  &cdummy, &idummy, &idummy, &idummy, &idummy, &idummy,
5859                  &ldummy, &ldummy, &ldummy, &ldummy, &ldummy,
5860                  &user_time, &sys_time);
5861   if (count != 13) return -1;
5862   if (user_sys_cpu_time) {
5863     return ((jlong)sys_time + (jlong)user_time) * (1000000000 / clock_tics_per_sec);
5864   } else {
5865     return (jlong)user_time * (1000000000 / clock_tics_per_sec);
5866   }
5867 }
5868 
5869 void os::current_thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5870   info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5871   info_ptr->may_skip_backward = false;     // elapsed time not wall time
5872   info_ptr->may_skip_forward = false;      // elapsed time not wall time
5873   info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5874 }
5875 
5876 void os::thread_cpu_time_info(jvmtiTimerInfo *info_ptr) {
5877   info_ptr->max_value = ALL_64_BITS;       // will not wrap in less than 64 bits
5878   info_ptr->may_skip_backward = false;     // elapsed time not wall time
5879   info_ptr->may_skip_forward = false;      // elapsed time not wall time
5880   info_ptr->kind = JVMTI_TIMER_TOTAL_CPU;  // user+system time is returned
5881 }
5882 
5883 bool os::is_thread_cpu_time_supported() {
5884   return true;
5885 }
5886 
5887 // System loadavg support.  Returns -1 if load average cannot be obtained.
5888 // Linux doesn't yet have a (official) notion of processor sets,
5889 // so just return the system wide load average.
5890 int os::loadavg(double loadavg[], int nelem) {
5891   return ::getloadavg(loadavg, nelem);
5892 }
5893 
5894 void os::pause() {
5895   char filename[MAX_PATH];
5896   if (PauseAtStartupFile && PauseAtStartupFile[0]) {
5897     jio_snprintf(filename, MAX_PATH, "%s", PauseAtStartupFile);
5898   } else {
5899     jio_snprintf(filename, MAX_PATH, "./vm.paused.%d", current_process_id());
5900   }
5901 
5902   int fd = ::open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666);
5903   if (fd != -1) {
5904     struct stat buf;
5905     ::close(fd);
5906     while (::stat(filename, &buf) == 0) {
5907       (void)::poll(NULL, 0, 100);
5908     }
5909   } else {
5910     jio_fprintf(stderr,
5911                 "Could not open pause file '%s', continuing immediately.\n", filename);
5912   }
5913 }
5914 
5915 extern char** environ;
5916 
5917 // Run the specified command in a separate process. Return its exit value,
5918 // or -1 on failure (e.g. can't fork a new process).
5919 // Unlike system(), this function can be called from signal handler. It
5920 // doesn't block SIGINT et al.
5921 int os::fork_and_exec(char* cmd, bool use_vfork_if_available) {
5922   const char * argv[4] = {"sh", "-c", cmd, NULL};
5923 
5924   pid_t pid ;
5925 
5926   if (use_vfork_if_available) {
5927     pid = vfork();
5928   } else {
5929     pid = fork();
5930   }
5931 
5932   if (pid < 0) {
5933     // fork failed
5934     return -1;
5935 
5936   } else if (pid == 0) {
5937     // child process
5938 
5939     execve("/bin/sh", (char* const*)argv, environ);
5940 
5941     // execve failed
5942     _exit(-1);
5943 
5944   } else  {
5945     // copied from J2SE ..._waitForProcessExit() in UNIXProcess_md.c; we don't
5946     // care about the actual exit code, for now.
5947 
5948     int status;
5949 
5950     // Wait for the child process to exit.  This returns immediately if
5951     // the child has already exited. */
5952     while (waitpid(pid, &status, 0) < 0) {
5953       switch (errno) {
5954       case ECHILD: return 0;
5955       case EINTR: break;
5956       default: return -1;
5957       }
5958     }
5959 
5960     if (WIFEXITED(status)) {
5961       // The child exited normally; get its exit code.
5962       return WEXITSTATUS(status);
5963     } else if (WIFSIGNALED(status)) {
5964       // The child exited because of a signal
5965       // The best value to return is 0x80 + signal number,
5966       // because that is what all Unix shells do, and because
5967       // it allows callers to distinguish between process exit and
5968       // process death by signal.
5969       return 0x80 + WTERMSIG(status);
5970     } else {
5971       // Unknown exit code; pass it through
5972       return status;
5973     }
5974   }
5975 }
5976 
5977 // Get the default path to the core file
5978 // Returns the length of the string
5979 int os::get_core_path(char* buffer, size_t bufferSize) {
5980   /*
5981    * Max length of /proc/sys/kernel/core_pattern is 128 characters.
5982    * See https://www.kernel.org/doc/Documentation/sysctl/kernel.txt
5983    */
5984   const int core_pattern_len = 129;
5985   char core_pattern[core_pattern_len] = {0};
5986 
5987   int core_pattern_file = ::open("/proc/sys/kernel/core_pattern", O_RDONLY);
5988   if (core_pattern_file == -1) {
5989     return -1;
5990   }
5991 
5992   ssize_t ret = ::read(core_pattern_file, core_pattern, core_pattern_len);
5993   ::close(core_pattern_file);
5994   if (ret <= 0 || ret >= core_pattern_len || core_pattern[0] == '\n') {
5995     return -1;
5996   }
5997   if (core_pattern[ret-1] == '\n') {
5998     core_pattern[ret-1] = '\0';
5999   } else {
6000     core_pattern[ret] = '\0';
6001   }
6002 
6003   // Replace the %p in the core pattern with the process id. NOTE: we do this
6004   // only if the pattern doesn't start with "|", and we support only one %p in
6005   // the pattern.
6006   char *pid_pos = strstr(core_pattern, "%p");
6007   const char* tail = (pid_pos != NULL) ? (pid_pos + 2) : "";  // skip over the "%p"
6008   int written;
6009 
6010   if (core_pattern[0] == '/') {
6011     if (pid_pos != NULL) {
6012       *pid_pos = '\0';
6013       written = jio_snprintf(buffer, bufferSize, "%s%d%s", core_pattern,
6014                              current_process_id(), tail);
6015     } else {
6016       written = jio_snprintf(buffer, bufferSize, "%s", core_pattern);
6017     }
6018   } else {
6019     char cwd[PATH_MAX];
6020 
6021     const char* p = get_current_directory(cwd, PATH_MAX);
6022     if (p == NULL) {
6023       return -1;
6024     }
6025 
6026     if (core_pattern[0] == '|') {
6027       written = jio_snprintf(buffer, bufferSize,
6028                              "\"%s\" (or dumping to %s/core.%d)",
6029                              &core_pattern[1], p, current_process_id());
6030     } else if (pid_pos != NULL) {
6031       *pid_pos = '\0';
6032       written = jio_snprintf(buffer, bufferSize, "%s/%s%d%s", p, core_pattern,
6033                              current_process_id(), tail);
6034     } else {
6035       written = jio_snprintf(buffer, bufferSize, "%s/%s", p, core_pattern);
6036     }
6037   }
6038 
6039   if (written < 0) {
6040     return -1;
6041   }
6042 
6043   if (((size_t)written < bufferSize) && (pid_pos == NULL) && (core_pattern[0] != '|')) {
6044     int core_uses_pid_file = ::open("/proc/sys/kernel/core_uses_pid", O_RDONLY);
6045 
6046     if (core_uses_pid_file != -1) {
6047       char core_uses_pid = 0;
6048       ssize_t ret = ::read(core_uses_pid_file, &core_uses_pid, 1);
6049       ::close(core_uses_pid_file);
6050 
6051       if (core_uses_pid == '1') {
6052         jio_snprintf(buffer + written, bufferSize - written,
6053                                           ".%d", current_process_id());
6054       }
6055     }
6056   }
6057 
6058   return strlen(buffer);
6059 }
6060 
6061 bool os::start_debugging(char *buf, int buflen) {
6062   int len = (int)strlen(buf);
6063   char *p = &buf[len];
6064 
6065   jio_snprintf(p, buflen-len,
6066                "\n\n"
6067                "Do you want to debug the problem?\n\n"
6068                "To debug, run 'gdb /proc/%d/exe %d'; then switch to thread " UINTX_FORMAT " (" INTPTR_FORMAT ")\n"
6069                "Enter 'yes' to launch gdb automatically (PATH must include gdb)\n"
6070                "Otherwise, press RETURN to abort...",
6071                os::current_process_id(), os::current_process_id(),
6072                os::current_thread_id(), os::current_thread_id());
6073 
6074   bool yes = os::message_box("Unexpected Error", buf);
6075 
6076   if (yes) {
6077     // yes, user asked VM to launch debugger
6078     jio_snprintf(buf, sizeof(char)*buflen, "gdb /proc/%d/exe %d",
6079                  os::current_process_id(), os::current_process_id());
6080 
6081     os::fork_and_exec(buf);
6082     yes = false;
6083   }
6084   return yes;
6085 }
6086 
6087 
6088 // Java/Compiler thread:
6089 //
6090 //   Low memory addresses
6091 // P0 +------------------------+
6092 //    |                        |\  Java thread created by VM does not have glibc
6093 //    |    glibc guard page    | - guard page, attached Java thread usually has
6094 //    |                        |/  1 glibc guard page.
6095 // P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
6096 //    |                        |\
6097 //    |  HotSpot Guard Pages   | - red, yellow and reserved pages
6098 //    |                        |/
6099 //    +------------------------+ JavaThread::stack_reserved_zone_base()
6100 //    |                        |\
6101 //    |      Normal Stack      | -
6102 //    |                        |/
6103 // P2 +------------------------+ Thread::stack_base()
6104 //
6105 // Non-Java thread:
6106 //
6107 //   Low memory addresses
6108 // P0 +------------------------+
6109 //    |                        |\
6110 //    |  glibc guard page      | - usually 1 page
6111 //    |                        |/
6112 // P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
6113 //    |                        |\
6114 //    |      Normal Stack      | -
6115 //    |                        |/
6116 // P2 +------------------------+ Thread::stack_base()
6117 //
6118 // ** P1 (aka bottom) and size (P2 = P1 - size) are the address and stack size
6119 //    returned from pthread_attr_getstack().
6120 // ** Due to NPTL implementation error, linux takes the glibc guard page out
6121 //    of the stack size given in pthread_attr. We work around this for
6122 //    threads created by the VM. (We adapt bottom to be P1 and size accordingly.)
6123 //
6124 #ifndef ZERO
6125 static void current_stack_region(address * bottom, size_t * size) {
6126   if (os::is_primordial_thread()) {
6127     // primordial thread needs special handling because pthread_getattr_np()
6128     // may return bogus value.
6129     *bottom = os::Linux::initial_thread_stack_bottom();
6130     *size   = os::Linux::initial_thread_stack_size();
6131   } else {
6132     pthread_attr_t attr;
6133 
6134     int rslt = pthread_getattr_np(pthread_self(), &attr);
6135 
6136     // JVM needs to know exact stack location, abort if it fails
6137     if (rslt != 0) {
6138       if (rslt == ENOMEM) {
6139         vm_exit_out_of_memory(0, OOM_MMAP_ERROR, "pthread_getattr_np");
6140       } else {
6141         fatal("pthread_getattr_np failed with error = %d", rslt);
6142       }
6143     }
6144 
6145     if (pthread_attr_getstack(&attr, (void **)bottom, size) != 0) {
6146       fatal("Cannot locate current stack attributes!");
6147     }
6148 
6149     // Work around NPTL stack guard error.
6150     size_t guard_size = 0;
6151     rslt = pthread_attr_getguardsize(&attr, &guard_size);
6152     if (rslt != 0) {
6153       fatal("pthread_attr_getguardsize failed with error = %d", rslt);
6154     }
6155     *bottom += guard_size;
6156     *size   -= guard_size;
6157 
6158     pthread_attr_destroy(&attr);
6159 
6160   }
6161   assert(os::current_stack_pointer() >= *bottom &&
6162          os::current_stack_pointer() < *bottom + *size, "just checking");
6163 }
6164 
6165 address os::current_stack_base() {
6166   address bottom;
6167   size_t size;
6168   current_stack_region(&bottom, &size);
6169   return (bottom + size);
6170 }
6171 
6172 size_t os::current_stack_size() {
6173   // This stack size includes the usable stack and HotSpot guard pages
6174   // (for the threads that have Hotspot guard pages).
6175   address bottom;
6176   size_t size;
6177   current_stack_region(&bottom, &size);
6178   return size;
6179 }
6180 #endif
6181 
6182 static inline struct timespec get_mtime(const char* filename) {
6183   struct stat st;
6184   int ret = os::stat(filename, &st);
6185   assert(ret == 0, "failed to stat() file '%s': %s", filename, os::strerror(errno));
6186   return st.st_mtim;
6187 }
6188 
6189 int os::compare_file_modified_times(const char* file1, const char* file2) {
6190   struct timespec filetime1 = get_mtime(file1);
6191   struct timespec filetime2 = get_mtime(file2);
6192   int diff = filetime1.tv_sec - filetime2.tv_sec;
6193   if (diff == 0) {
6194     return filetime1.tv_nsec - filetime2.tv_nsec;
6195   }
6196   return diff;
6197 }
6198 
6199 /////////////// Unit tests ///////////////
6200 
6201 #ifndef PRODUCT
6202 
6203 class TestReserveMemorySpecial : AllStatic {
6204  public:
6205   static void small_page_write(void* addr, size_t size) {
6206     size_t page_size = os::vm_page_size();
6207 
6208     char* end = (char*)addr + size;
6209     for (char* p = (char*)addr; p < end; p += page_size) {
6210       *p = 1;
6211     }
6212   }
6213 
6214   static void test_reserve_memory_special_huge_tlbfs_only(size_t size) {
6215     if (!UseHugeTLBFS) {
6216       return;
6217     }
6218 
6219     char* addr = os::Linux::reserve_memory_special_huge_tlbfs_only(size, NULL, false);
6220 
6221     if (addr != NULL) {
6222       small_page_write(addr, size);
6223 
6224       os::Linux::release_memory_special_huge_tlbfs(addr, size);
6225     }
6226   }
6227 
6228   static void test_reserve_memory_special_huge_tlbfs_only() {
6229     if (!UseHugeTLBFS) {
6230       return;
6231     }
6232 
6233     size_t lp = os::large_page_size();
6234 
6235     for (size_t size = lp; size <= lp * 10; size += lp) {
6236       test_reserve_memory_special_huge_tlbfs_only(size);
6237     }
6238   }
6239 
6240   static void test_reserve_memory_special_huge_tlbfs_mixed() {
6241     size_t lp = os::large_page_size();
6242     size_t ag = os::vm_allocation_granularity();
6243 
6244     // sizes to test
6245     const size_t sizes[] = {
6246       lp, lp + ag, lp + lp / 2, lp * 2,
6247       lp * 2 + ag, lp * 2 - ag, lp * 2 + lp / 2,
6248       lp * 10, lp * 10 + lp / 2
6249     };
6250     const int num_sizes = sizeof(sizes) / sizeof(size_t);
6251 
6252     // For each size/alignment combination, we test three scenarios:
6253     // 1) with req_addr == NULL
6254     // 2) with a non-null req_addr at which we expect to successfully allocate
6255     // 3) with a non-null req_addr which contains a pre-existing mapping, at which we
6256     //    expect the allocation to either fail or to ignore req_addr
6257 
6258     // Pre-allocate two areas; they shall be as large as the largest allocation
6259     //  and aligned to the largest alignment we will be testing.
6260     const size_t mapping_size = sizes[num_sizes - 1] * 2;
6261     char* const mapping1 = (char*) ::mmap(NULL, mapping_size,
6262       PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
6263       -1, 0);
6264     assert(mapping1 != MAP_FAILED, "should work");
6265 
6266     char* const mapping2 = (char*) ::mmap(NULL, mapping_size,
6267       PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE,
6268       -1, 0);
6269     assert(mapping2 != MAP_FAILED, "should work");
6270 
6271     // Unmap the first mapping, but leave the second mapping intact: the first
6272     // mapping will serve as a value for a "good" req_addr (case 2). The second
6273     // mapping, still intact, as "bad" req_addr (case 3).
6274     ::munmap(mapping1, mapping_size);
6275 
6276     // Case 1
6277     for (int i = 0; i < num_sizes; i++) {
6278       const size_t size = sizes[i];
6279       for (size_t alignment = ag; is_aligned(size, alignment); alignment *= 2) {
6280         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, NULL, false);
6281         if (p != NULL) {
6282           assert(is_aligned(p, alignment), "must be");
6283           small_page_write(p, size);
6284           os::Linux::release_memory_special_huge_tlbfs(p, size);
6285         }
6286       }
6287     }
6288 
6289     // Case 2
6290     for (int i = 0; i < num_sizes; i++) {
6291       const size_t size = sizes[i];
6292       for (size_t alignment = ag; is_aligned(size, alignment); alignment *= 2) {
6293         char* const req_addr = align_up(mapping1, alignment);
6294         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6295         if (p != NULL) {
6296           assert(p == req_addr, "must be");
6297           small_page_write(p, size);
6298           os::Linux::release_memory_special_huge_tlbfs(p, size);
6299         }
6300       }
6301     }
6302 
6303     // Case 3
6304     for (int i = 0; i < num_sizes; i++) {
6305       const size_t size = sizes[i];
6306       for (size_t alignment = ag; is_aligned(size, alignment); alignment *= 2) {
6307         char* const req_addr = align_up(mapping2, alignment);
6308         char* p = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, req_addr, false);
6309         // as the area around req_addr contains already existing mappings, the API should always
6310         // return NULL (as per contract, it cannot return another address)
6311         assert(p == NULL, "must be");
6312       }
6313     }
6314 
6315     ::munmap(mapping2, mapping_size);
6316 
6317   }
6318 
6319   static void test_reserve_memory_special_huge_tlbfs() {
6320     if (!UseHugeTLBFS) {
6321       return;
6322     }
6323 
6324     test_reserve_memory_special_huge_tlbfs_only();
6325     test_reserve_memory_special_huge_tlbfs_mixed();
6326   }
6327 
6328   static void test_reserve_memory_special_shm(size_t size, size_t alignment) {
6329     if (!UseSHM) {
6330       return;
6331     }
6332 
6333     char* addr = os::Linux::reserve_memory_special_shm(size, alignment, NULL, false);
6334 
6335     if (addr != NULL) {
6336       assert(is_aligned(addr, alignment), "Check");
6337       assert(is_aligned(addr, os::large_page_size()), "Check");
6338 
6339       small_page_write(addr, size);
6340 
6341       os::Linux::release_memory_special_shm(addr, size);
6342     }
6343   }
6344 
6345   static void test_reserve_memory_special_shm() {
6346     size_t lp = os::large_page_size();
6347     size_t ag = os::vm_allocation_granularity();
6348 
6349     for (size_t size = ag; size < lp * 3; size += ag) {
6350       for (size_t alignment = ag; is_aligned(size, alignment); alignment *= 2) {
6351         test_reserve_memory_special_shm(size, alignment);
6352       }
6353     }
6354   }
6355 
6356   static void test() {
6357     test_reserve_memory_special_huge_tlbfs();
6358     test_reserve_memory_special_shm();
6359   }
6360 };
6361 
6362 void TestReserveMemorySpecial_test() {
6363   TestReserveMemorySpecial::test();
6364 }
6365 
6366 #endif