1 /*
2  * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25
26 package jdk.nashorn.internal.runtime.linker;
27
/**
 * <p>
 * Implements the name mangling and demangling as specified by John Rose's
 * <a href="https://blogs.oracle.com/jrose/entry/symbolic_freedom_in_the_vm"
 * target="_blank">"Symbolic Freedom in the VM"</a> article. Normally, you would
 * mangle the names in the call sites as you're generating bytecode, and then
 * demangle them when you receive them in bootstrap methods.
 * </p>
 * <p>
 * This code is derived from sun.invoke.util.BytecodeName. Apart from subsetting that
 * class, we don't want to create a dependency from the nashorn module on a
 * non-exported package of java.base.
 * </p>
 *
 * <h3>Comment from BytecodeName class reproduced here:</h3>
 *
 * Includes universal mangling rules for the JVM.
 *
 * <h3>Avoiding Dangerous Characters </h3>
 *
 * <p>
 * The JVM defines a very small set of characters which are illegal
 * in name spellings.  We will slightly extend and regularize this set
 * into a group of <cite>dangerous characters</cite>.
 * These characters will then be replaced, in mangled names, by escape sequences.
 * In addition, accidental escape sequences must be further escaped.
 * Finally, a special prefix will be applied if and only if
 * the mangling would otherwise fail to begin with the escape character.
 * This happens to cover the corner case of the null string,
 * and also clearly marks symbols which need demangling.
 * </p>
 * <p>
 * Dangerous characters are the union of all characters forbidden
 * or otherwise restricted by the JVM specification,
 * plus their mates, if they are brackets
 * (<code><b>[</b></code> and <code><b>]</b></code>,
 * <code><b>&lt;</b></code> and <code><b>&gt;</b></code>),
 * plus, arbitrarily, the colon character <code><b>:</b></code>.
 * There is no distinction between type, method, and field names.
 * This makes it easier to convert between mangled names of different
 * types, since they do not need to be decoded (demangled).
 * </p>
 * <p>
 * The escape character is backslash <code><b>\</b></code>
 * (also known as reverse solidus).
 * This character is, until now, unheard of in bytecode names,
 * but traditional in the proposed role.
 *
 * </p>
 * <h3> Replacement Characters </h3>
 *
 *
 * <p>
 * Every escape sequence is two characters
 * (in fact, two UTF8 bytes) beginning with
 * the escape character and followed by a
 * <cite>replacement character</cite>.
 * (Since the replacement character is never a backslash,
 * iterated manglings do not double in size.)
 * </p>
 * <p>
 * Each dangerous character has some rough visual similarity
 * to its corresponding replacement character.
 * This makes mangled symbols easier to recognize by sight.
 * </p>
 * <p>
 * The dangerous characters are
 * <code><b>/</b></code> (forward slash, used to delimit package components),
 * <code><b>.</b></code> (dot, also a package delimiter),
 * <code><b>;</b></code> (semicolon, used in signatures),
 * <code><b>$</b></code> (dollar, used in inner classes and synthetic members),
 * <code><b>&lt;</b></code> (left angle),
 * <code><b>&gt;</b></code> (right angle),
 * <code><b>[</b></code> (left square bracket, used in array types),
 * <code><b>]</b></code> (right square bracket, reserved in this scheme for language use),
 * and <code><b>:</b></code> (colon, reserved in this scheme for language use).
 * Their replacements are, respectively,
 * <code><b>|</b></code> (vertical bar),
 * <code><b>,</b></code> (comma),
 * <code><b>?</b></code> (question mark),
 * <code><b>%</b></code> (percent),
 * <code><b>^</b></code> (caret),
 * <code><b>_</b></code> (underscore),
 * <code><b>{</b></code> (left curly bracket),
 * <code><b>}</b></code> (right curly bracket), and
 * <code><b>!</b></code> (exclamation mark).
 * In addition, the replacement character for the escape character itself is
 * <code><b>-</b></code> (hyphen),
 * and the replacement character for the null prefix is
 * <code><b>=</b></code> (equal sign).
 * </p>
 * <p>
 * An escape character <code><b>\</b></code>
 * followed by any of these replacement characters
 * is an escape sequence, and there are no other escape sequences.
 * An equal sign is only part of an escape sequence
 * if it is the second character in the whole string, following a backslash.
 * Two consecutive backslashes do <em>not</em> form an escape sequence.
 * </p>
 * <p>
 * Each escape sequence replaces a so-called <cite>original character</cite>
 * which is either one of the dangerous characters or the escape character.
 * A null prefix replaces an initial null string, not a character.
 * </p>
 * <p>
 * All this implies that escape sequences cannot overlap and may be
 * determined all at once for a whole string.  Note that a spelling
 * string can contain <cite>accidental escapes</cite>, apparent escape
 * sequences which must not be interpreted as manglings.
 * These are disabled by replacing their leading backslash with an
 * escape sequence (<code><b>\-</b></code>). To mangle a string, three logical steps
 * are required, though they may be carried out in one pass:
 * </p>
 * <ol>
 * <li>In each accidental escape, replace the backslash with an escape sequence
 * (<code><b>\-</b></code>).</li>
 * <li>Replace each dangerous character with an escape sequence
 * (<code><b>\|</b></code> for <code><b>/</b></code>, etc.).</li>
 * <li>If the first two steps introduced any change, <em>and</em>
 * if the string does not already begin with a backslash, prepend a null prefix (<code><b>\=</b></code>).</li>
 * </ol>
 *
 * To demangle a mangled string that begins with an escape,
 * remove any null prefix, and then replace (in parallel)
 * each escape sequence by its original character.
 * <p>Spelling strings which contain accidental
 * escapes <em>must</em> have them replaced, even if those
 * strings do not contain dangerous characters.
 * This restriction means that mangling a string always
 * requires a scan of the string for escapes.
 * But then, a scan would be required anyway,
 * to check for dangerous characters.
 *
 * </p>
 * <h3> Nice Properties </h3>
 *
 * <p>
 * If a bytecode name does not contain any escape sequence,
 * demangling is a no-op:  The string demangles to itself.
 * Such a string is called <cite>self-mangling</cite>.
 * Almost all strings are self-mangling.
 * In practice, to demangle almost any name &ldquo;found in nature&rdquo;,
 * simply verify that it does not begin with a backslash.
 * </p>
 * <p>
 * Mangling is a one-to-one function, while demangling
 * is a many-to-one function.
 * A mangled string is defined as <cite>validly mangled</cite> if
 * it is in fact the unique mangling of its spelling string.
 * Three examples of invalidly mangled strings are <code><b>\=foo</b></code>,
 * <code><b>\-bar</b></code>, and <code><b>baz\!</b></code>, which demangle to <code><b>foo</b></code>, <code><b>\bar</b></code>, and
 * <code><b>baz\!</b></code>, but then remangle to <code><b>foo</b></code>, <code><b>\bar</b></code>, and <code><b>\=baz\-!</b></code>.
 * If a language back-end or runtime is using mangled names,
 * it should never present an invalidly mangled bytecode
 * name to the JVM.  If the runtime encounters one,
 * it should also report an error, since such an occurrence
 * probably indicates a bug in name encoding which
 * will lead to errors in linkage.
 * However, this note does not propose that the JVM verifier
 * detect invalidly mangled names.
 * </p>
 * <p>
 * As a result of these rules, it is a simple matter to
 * compute validly mangled substrings and concatenations
 * of validly mangled strings, and (with a little care)
 * these correspond to corresponding operations on their
 * spelling strings.
 * </p>
 * <ul>
 * <li>Any prefix of a validly mangled string is also validly mangled,
 * although a null prefix may need to be removed.</li>
 * <li>Any suffix of a validly mangled string is also validly mangled,
 * although a null prefix may need to be added.</li>
 * <li>Two validly mangled strings, when concatenated,
 * are also validly mangled, although any null prefix
 * must be removed from the second string,
 * and a trailing backslash on the first string may need escaping,
 * if it would participate in an accidental escape when followed
 * by the first character of the second string.</li>
 * </ul>
 * <p>If languages that include non-Java symbol spellings use this
 * mangling convention, they will enjoy the following advantages:
 * </p>
 * <ul>
 * <li>They can interoperate via symbols they share in common.</li>
 * <li>Low-level tools, such as backtrace printers, will have readable displays.</li>
 * <li>Future JVM and language extensions can safely use the dangerous characters
 * for structuring symbols, but will never interfere with valid spellings.</li>
 * <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li>
 * <li>Occasional transliterations and name composition will be simple and regular,
 * for classes, methods, and fields.</li>
 * <li>Bytecode names will continue to be compact.
 * When mangled, spellings will at most double in length, either in
 * UTF8 or UTF16 format, and most will not change at all.</li>
 * </ul>
 *
 *
 * <h3> Suggestions for Human Readable Presentations </h3>
 *
 *
 * <p>
 * For human readable displays of symbols,
 * it will be better to present a string-like quoted
 * representation of the spelling, because JVM users
 * are generally familiar with such tokens.
 * We suggest using single or double quotes before and after
 * mangled symbols which are not valid Java identifiers,
 * with quotes, backslashes, and non-printing characters
 * escaped as if for literals in the Java language.
 * </p>
 * <p>
 * For example, an HTML-like spelling
 * <code><b>&lt;pre&gt;</b></code> mangles to
 * <code><b>\^pre\_</b></code> and could
 * display more cleanly as
 * <code><b>'&lt;pre&gt;'</b></code>,
 * with the quotes included.
 * Such string-like conventions are <em>not</em> suitable
 * for mangled bytecode names, in part because
 * dangerous characters must be eliminated, rather
 * than just quoted.  Otherwise internally structured
 * strings like package prefixes and method signatures
 * could not be reliably parsed.
 * </p>
 * <p>
 * In such human-readable displays, invalidly mangled
 * names should <em>not</em> be demangled and quoted,
 * for this would be misleading.  Likewise, JVM symbols
 * which contain dangerous characters (like dots in field
 * names or brackets in method names) should not be
 * simply quoted.  The bytecode names
 * <code><b>\=phase\,1</b></code> and
 * <code><b>phase.1</b></code> are distinct,
 * and in demangled displays they should be presented as
 * <code><b>'phase.1'</b></code> and something like
 * <code><b>'phase'.1</b></code>, respectively.
 * </p>
 */
public final class NameCodec {
    /** Static utility holder; never instantiated. */
    private NameCodec() {
    }

    /** The escape character that introduces every escape sequence. */
    private static final char ESCAPE_C = '\\';
    // empty escape sequence to avoid a null name or illegal prefix
    private static final char NULL_ESCAPE_C = '=';
    /** The two-character null prefix: escape character followed by the equal sign. */
    private static final String NULL_ESCAPE = ESCAPE_C + "" + NULL_ESCAPE_C;

    /**
     * Canonical encoding for the empty name.
     */
    // NOTE(review): built with "new String" and therefore not a compile-time constant
    // expression; kept that way so the field's linkage behaviour stays as it was.
    public static final String EMPTY_NAME = new String(new char[] { ESCAPE_C, NULL_ESCAPE_C });

    /**
     * Encodes ("mangles") an unencoded symbolic name.
     * <p>
     * When assertions are enabled the result is additionally checked to either be the
     * very same string that was passed in or to start with the escape character, and
     * to decode back to {@code name}.
     * </p>
     * @param name the symbolic name to mangle
     * @return the mangled form of the symbolic name.
     * @throws NullPointerException if {@code name} is null
     */
    public static String encode(final String name) {
        final String bn = mangle(name);
        // identity comparison is intended: mangle() hands back its argument when nothing changed
        assert ((Object)bn == name || looksMangled(bn)) : bn;
        assert name.equals(decode(bn)) : name;
        return bn;
    }

    /**
     * Decodes ("demangles") an encoded symbolic name.
     * <p>
     * Names that are empty or do not start with the escape character are returned
     * unchanged. When assertions are enabled, a name that does not re-mangle to itself
     * (an invalidly mangled name) triggers an {@link AssertionError}.
     * </p>
     * @param name the symbolic name to demangle
     * @return the demangled form of the symbolic name.
     * @throws NullPointerException if {@code name} is null
     */
    public static String decode(final String name) {
        if (!looksMangled(name)) {
            return name;
        }
        final String sn = demangle(name);
        assert name.equals(mangle(sn)) : name + " => " + sn + " => " + mangle(sn);
        return sn;
    }

    /**
     * Tells whether a string starts with the escape character.
     * The empty string is reported as not mangled instead of raising an index error.
     */
    private static boolean looksMangled(final String s) {
        return !s.isEmpty() && s.charAt(0) == ESCAPE_C;
    }

    /**
     * Tells whether the backslash at index {@code i} of {@code s} begins an accidental
     * escape: it is followed by a replacement character, or it is the very first
     * character and is followed by the null-prefix character.
     */
    private static boolean startsAccidentalEscape(final String s, final int i) {
        if (i + 1 >= s.length()) {
            // a trailing backslash cannot start an escape sequence
            return false;
        }
        final char next = s.charAt(i + 1);
        return (i == 0 && next == NULL_ESCAPE_C) || next != originalOfReplacement(next);
    }

    /**
     * Mangles a spelling string in a single pass.
     *
     * @param s the spelling string
     * @return {@code s} itself when nothing needed escaping, the null prefix for the
     *         empty string, otherwise a new string that starts with a backslash
     */
    private static String mangle(final String s) {
        if (s.isEmpty()) {
            return NULL_ESCAPE;
        }

        // build this lazily, when we first need an escape:
        StringBuilder sb = null;

        for (int i = 0, slen = s.length(); i < slen; i++) {
            final char c = s.charAt(i);

            final boolean needEscape = (c == ESCAPE_C)
                    ? startsAccidentalEscape(s, i)
                    : isDangerous(c);

            if (!needEscape) {
                if (sb != null) {
                    sb.append(c);
                }
                continue;
            }

            // build sb if this is the first escape
            if (sb == null) {
                sb = new StringBuilder(s.length() + 10);
                // mangled names must begin with a backslash; when the first escape sits
                // at index 0 the output already does, so no prefix is needed then:
                if (s.charAt(0) != ESCAPE_C && i > 0) {
                    sb.append(NULL_ESCAPE);
                }
                // append the string so far, which is unremarkable:
                sb.append(s, 0, i);
            }

            // rewrite \ to \-, / to \|, etc.
            sb.append(ESCAPE_C);
            sb.append(replacementOf(c));
        }

        return (sb != null) ? sb.toString() : s;
    }

    /**
     * Demangles a string: drops a leading null prefix, then replaces every escape
     * sequence by its original character.
     *
     * @param s the mangled string
     * @return the spelling string
     */
    private static String demangle(final String s) {
        // build this lazily, when we first meet an escape:
        StringBuilder sb = null;

        int stringStart = 0;
        if (s.startsWith(NULL_ESCAPE)) {
            stringStart = 2;
        }

        for (int i = stringStart, slen = s.length(); i < slen; i++) {
            char c = s.charAt(i);

            if (c == ESCAPE_C && i + 1 < slen) {
                // might be an escape sequence
                final char rc = s.charAt(i + 1);
                final char oc = originalOfReplacement(rc);
                if (oc != rc) {
                    // build sb if this is the first escape
                    if (sb == null) {
                        sb = new StringBuilder(s.length());
                        // append the string so far, which is unremarkable:
                        sb.append(s, stringStart, i);
                    }
                    ++i;  // skip both characters
                    c = oc;
                }
            }

            if (sb != null) {
                sb.append(c);
            }
        }

        return (sb != null) ? sb.toString() : s.substring(stringStart);
    }

    // The two strings below are parallel: the character at index i of DANGEROUS_CHARS
    // is replaced by the character at index i of REPLACEMENT_CHARS.
    private static final String DANGEROUS_CHARS   = "\\/.;:$[]<>"; // \\ must be first
    private static final String REPLACEMENT_CHARS =  "-|,?!%{}^_";
    private static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\

    // One bit per character code below 128; set for every dangerous or replacement character.
    private static final long[] SPECIAL_BITMAP = new long[2];  // 128 bits
    static {
        final String special = DANGEROUS_CHARS + REPLACEMENT_CHARS;
        for (final char c : special.toCharArray()) {
            // a long shift only uses the low six bits of its count, i.e. c % 64
            SPECIAL_BITMAP[c >>> 6] |= 1L << c;
        }
    }

    /** Quick bitmap test: is {@code c} one of the dangerous or replacement characters? */
    private static boolean isSpecial(final char c) {
        final int word = c >>> 6;
        return word < SPECIAL_BITMAP.length && ((SPECIAL_BITMAP[word] >> c) & 1) != 0;
    }

    /** Maps a dangerous character (or the backslash) to its replacement; other characters map to themselves. */
    private static char replacementOf(final char c) {
        if (!isSpecial(c)) {
            return c;
        }
        final int i = DANGEROUS_CHARS.indexOf(c);
        return (i < 0) ? c : REPLACEMENT_CHARS.charAt(i);
    }

    /** Maps a replacement character back to its original; other characters map to themselves. */
    private static char originalOfReplacement(final char c) {
        if (!isSpecial(c)) {
            return c;
        }
        final int i = REPLACEMENT_CHARS.indexOf(c);
        return (i < 0) ? c : DANGEROUS_CHARS.charAt(i);
    }

    /** Tells whether {@code c} is a dangerous character; the backslash itself is excluded. */
    private static boolean isDangerous(final char c) {
        return isSpecial(c) && DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX;
    }
}