1 /*
  2  *  Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
  3  *  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  *  This code is free software; you can redistribute it and/or modify it
  6  *  under the terms of the GNU General Public License version 2 only, as
  7  *  published by the Free Software Foundation.  Oracle designates this
  8  *  particular file as subject to the "Classpath" exception as provided
  9  *  by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  *  This code is distributed in the hope that it will be useful, but WITHOUT
 12  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  *  version 2 for more details (a copy is included in the LICENSE file that
 15  *  accompanied this code).
 16  *
 17  *  You should have received a copy of the GNU General Public License version
 18  *  2 along with this work; if not, write to the Free Software Foundation,
 19  *  Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  *   Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  *  or visit www.oracle.com if you need additional information or have any
 23  *  questions.
 24  */
 25 
 26 package jdk.internal.foreign;
 27 
 28 import jdk.internal.access.JavaLangAccess;
 29 import jdk.internal.access.SharedSecrets;
 30 import jdk.internal.foreign.abi.SharedUtils;
 31 import jdk.internal.util.ArraysSupport;
 32 
 33 import java.lang.foreign.MemorySegment;
 34 import java.nio.charset.Charset;
 35 
 36 import static java.lang.foreign.ValueLayout.*;
 37 
 38 /**
 39  * Miscellaneous functions to read and write strings, in various charsets.
 40  */
 41 public final class StringSupport {
 42 
 43     static final JavaLangAccess JAVA_LANG_ACCESS = SharedSecrets.getJavaLangAccess();
 44 
 45     private StringSupport() {}
 46 
 47     public static String read(MemorySegment segment, long offset, Charset charset) {
 48         return switch (CharsetKind.of(charset)) {
 49             case SINGLE_BYTE -> readByte(segment, offset, charset);
 50             case DOUBLE_BYTE -> readShort(segment, offset, charset);
 51             case QUAD_BYTE -> readInt(segment, offset, charset);
 52         };
 53     }
 54 
 55     public static void write(MemorySegment segment, long offset, Charset charset, String string) {
 56         switch (CharsetKind.of(charset)) {
 57             case SINGLE_BYTE -> writeByte(segment, offset, charset, string);
 58             case DOUBLE_BYTE -> writeShort(segment, offset, charset, string);
 59             case QUAD_BYTE -> writeInt(segment, offset, charset, string);
 60         }
 61     }
 62 
 63     private static String readByte(MemorySegment segment, long offset, Charset charset) {
 64         long len = chunkedStrlenByte(segment, offset);
 65         byte[] bytes = new byte[(int)len];
 66         MemorySegment.copy(segment, JAVA_BYTE, offset, bytes, 0, (int)len);
 67         return new String(bytes, charset);
 68     }
 69 
 70     private static void writeByte(MemorySegment segment, long offset, Charset charset, String string) {
 71         int bytes = copyBytes(string, segment, charset, offset);
 72         segment.set(JAVA_BYTE, offset + bytes, (byte)0);
 73     }
 74 
 75     private static String readShort(MemorySegment segment, long offset, Charset charset) {
 76         long len = chunkedStrlenShort(segment, offset);
 77         byte[] bytes = new byte[(int)len];
 78         MemorySegment.copy(segment, JAVA_BYTE, offset, bytes, 0, (int)len);
 79         return new String(bytes, charset);
 80     }
 81 
 82     private static void writeShort(MemorySegment segment, long offset, Charset charset, String string) {
 83         int bytes = copyBytes(string, segment, charset, offset);
 84         segment.set(JAVA_SHORT, offset + bytes, (short)0);
 85     }
 86 
 87     private static String readInt(MemorySegment segment, long offset, Charset charset) {
 88         long len = strlenInt(segment, offset);
 89         byte[] bytes = new byte[(int)len];
 90         MemorySegment.copy(segment, JAVA_BYTE, offset, bytes, 0, (int)len);
 91         return new String(bytes, charset);
 92     }
 93 
 94     private static void writeInt(MemorySegment segment, long offset, Charset charset, String string) {
 95         int bytes = copyBytes(string, segment, charset, offset);
 96         segment.set(JAVA_INT, offset + bytes, 0);
 97     }
 98 
 99     /**
100      * {@return the shortest distance beginning at the provided {@code start}
101      *  to the encountering of a zero byte in the provided {@code segment}}
102      * <p>
103      * The method divides the region of interest into three distinct regions:
104      * <ul>
105      *     <li>head (access made on a byte-by-byte basis) (if any)</li>
106      *     <li>body (access made with eight bytes at a time at physically 64-bit-aligned memory) (if any)</li>
107      *     <li>tail (access made on a byte-by-byte basis) (if any)</li>
108      * </ul>
109      * <p>
110      * The body is using a heuristic method to determine if a long word
111      * contains a zero byte. The method might have false positives but
112      * never false negatives.
113      * <p>
114      * This method is inspired by the `glibc/string/strlen.c` implementation
115      *
116      * @param segment to examine
117      * @param start   from where examination shall begin
118      * @throws IllegalArgumentException if the examined region contains no zero bytes
119      *                                  within a length that can be accepted by a String
120      */
121     public static int chunkedStrlenByte(MemorySegment segment, long start) {
122 
123         // Handle the first unaligned "head" bytes separately
124         int headCount = (int)SharedUtils.remainsToAlignment(segment.address() + start, Long.BYTES);
125 
126         int offset = 0;
127         for (; offset < headCount; offset++) {
128             byte curr = segment.get(JAVA_BYTE, start + offset);
129             if (curr == 0) {
130                 return offset;
131             }
132         }
133 
134         // We are now on a long-aligned boundary so this is the "body"
135         int bodyCount = bodyCount(segment.byteSize() - start - headCount);
136 
137         for (; offset < bodyCount; offset += Long.BYTES) {
138             // We know we are `long` aligned so, we can save on alignment checking here
139             long curr = segment.get(JAVA_LONG_UNALIGNED, start + offset);
140             // Is this a candidate?
141             if (mightContainZeroByte(curr)) {
142                 for (int j = 0; j < 8; j++) {
143                     if (segment.get(JAVA_BYTE, start + offset + j) == 0) {
144                         return offset + j;
145                     }
146                 }
147             }
148         }
149 
150         // Handle the "tail"
151         return requireWithinArraySize((long) offset + strlenByte(segment, start + offset));
152     }
153 
154     /* Bits 63 and N * 8 (N = 1..7) of this number are zero.  Call these bits
155        the "holes".  Note that there is a hole just to the left of
156        each byte, with an extra at the end:
157 
158        bits:  01111110 11111110 11111110 11111110 11111110 11111110 11111110 11111111
159        bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD EEEEEEEE FFFFFFFF GGGGGGGG HHHHHHHH
160 
161        The 1-bits make sure that carries propagate to the next 0-bit.
162        The 0-bits provide holes for carries to fall into.
163     */
164     private static final long HIMAGIC_FOR_BYTES = 0x8080_8080_8080_8080L;
165     private static final long LOMAGIC_FOR_BYTES = 0x0101_0101_0101_0101L;
166 
167     static boolean mightContainZeroByte(long l) {
168         return ((l - LOMAGIC_FOR_BYTES) & (~l) & HIMAGIC_FOR_BYTES) != 0;
169     }
170 
171     private static final long HIMAGIC_FOR_SHORTS = 0x8000_8000_8000_8000L;
172     private static final long LOMAGIC_FOR_SHORTS = 0x0001_0001_0001_0001L;
173 
174     static boolean mightContainZeroShort(long l) {
175         return ((l - LOMAGIC_FOR_SHORTS) & (~l) & HIMAGIC_FOR_SHORTS) != 0;
176     }
177 
178     static int requireWithinArraySize(long size) {
179         if (size > ArraysSupport.SOFT_MAX_ARRAY_LENGTH) {
180             throw newIaeStringTooLarge();
181         }
182         return (int) size;
183     }
184 
185     static int bodyCount(long remaining) {
186         return (int) Math.min(
187                 // Make sure we do not wrap around
188                 Integer.MAX_VALUE - Long.BYTES,
189                 // Remaining bytes to consider
190                 remaining)
191                 & -Long.BYTES; // Mask 0xFFFFFFF8
192     }
193 
194     private static int strlenByte(MemorySegment segment, long start) {
195         for (int offset = 0; offset < ArraysSupport.SOFT_MAX_ARRAY_LENGTH; offset += 1) {
196             byte curr = segment.get(JAVA_BYTE, start + offset);
197             if (curr == 0) {
198                 return offset;
199             }
200         }
201         throw newIaeStringTooLarge();
202     }
203 
204     /**
205      * {@return the shortest distance beginning at the provided {@code start}
206      *  to the encountering of a zero short in the provided {@code segment}}
207      * <p>
208      * Note: The inspected region must be short aligned.
209      *
210      * @see #chunkedStrlenByte(MemorySegment, long) for more information
211      *
212      * @param segment to examine
213      * @param start   from where examination shall begin
214      * @throws IllegalArgumentException if the examined region contains no zero shorts
215      *                                  within a length that can be accepted by a String
216      */
217     public static int chunkedStrlenShort(MemorySegment segment, long start) {
218 
219         // Handle the first unaligned "head" bytes separately
220         int headCount = (int)SharedUtils.remainsToAlignment(segment.address() + start, Long.BYTES);
221 
222         int offset = 0;
223         for (; offset < headCount; offset += Short.BYTES) {
224             short curr = segment.get(JAVA_SHORT, start + offset);
225             if (curr == 0) {
226                 return offset;
227             }
228         }
229 
230         // We are now on a long-aligned boundary so this is the "body"
231         int bodyCount = bodyCount(segment.byteSize() - start - headCount);
232 
233         for (; offset < bodyCount; offset += Long.BYTES) {
234             // We know we are `long` aligned so, we can save on alignment checking here
235             long curr = segment.get(JAVA_LONG_UNALIGNED, start + offset);
236             // Is this a candidate?
237             if (mightContainZeroShort(curr)) {
238                 for (int j = 0; j < Long.BYTES; j += Short.BYTES) {
239                     if (segment.get(JAVA_SHORT_UNALIGNED, start + offset + j) == 0) {
240                         return offset + j;
241                     }
242                 }
243             }
244         }
245 
246         // Handle the "tail"
247         return requireWithinArraySize((long) offset + strlenShort(segment, start + offset));
248     }
249 
250     private static int strlenShort(MemorySegment segment, long start) {
251         for (int offset = 0; offset < ArraysSupport.SOFT_MAX_ARRAY_LENGTH; offset += Short.BYTES) {
252             short curr = segment.get(JAVA_SHORT_UNALIGNED, start + offset);
253             if (curr == (short)0) {
254                 return offset;
255             }
256         }
257         throw newIaeStringTooLarge();
258     }
259 
260     // The gain of using `long` wide operations for `int` is lower than for the two other `byte` and `short` variants
261     // so, there is only one method for ints.
262     public static int strlenInt(MemorySegment segment, long start) {
263         for (int offset = 0; offset < ArraysSupport.SOFT_MAX_ARRAY_LENGTH; offset += Integer.BYTES) {
264             // We are guaranteed to be aligned here so, we can use unaligned access.
265             int curr = segment.get(JAVA_INT_UNALIGNED, start + offset);
266             if (curr == 0) {
267                 return offset;
268             }
269         }
270         throw newIaeStringTooLarge();
271     }
272 
273     public enum CharsetKind {
274         SINGLE_BYTE(1),
275         DOUBLE_BYTE(2),
276         QUAD_BYTE(4);
277 
278         final int terminatorCharSize;
279 
280         CharsetKind(int terminatorCharSize) {
281             this.terminatorCharSize = terminatorCharSize;
282         }
283 
284         public int terminatorCharSize() {
285             return terminatorCharSize;
286         }
287 
288         public static CharsetKind of(Charset charset) {
289             // Comparing the charset to specific internal implementations avoids loading the class `StandardCharsets`
290             if        (charset == sun.nio.cs.UTF_8.INSTANCE ||
291                        charset == sun.nio.cs.ISO_8859_1.INSTANCE ||
292                        charset == sun.nio.cs.US_ASCII.INSTANCE) {
293                 return SINGLE_BYTE;
294             } else if (charset instanceof sun.nio.cs.UTF_16LE ||
295                        charset instanceof sun.nio.cs.UTF_16BE ||
296                        charset instanceof sun.nio.cs.UTF_16) {
297                 return DOUBLE_BYTE;
298             } else if (charset instanceof sun.nio.cs.UTF_32LE ||
299                        charset instanceof sun.nio.cs.UTF_32BE ||
300                        charset instanceof sun.nio.cs.UTF_32) {
301                 return QUAD_BYTE;
302             } else {
303                 throw new UnsupportedOperationException("Unsupported charset: " + charset);
304             }
305         }
306     }
307 
308     public static boolean bytesCompatible(String string, Charset charset) {
309         return JAVA_LANG_ACCESS.bytesCompatible(string, charset);
310     }
311 
312     public static int copyBytes(String string, MemorySegment segment, Charset charset, long offset) {
313         if (bytesCompatible(string, charset)) {
314             copyToSegmentRaw(string, segment, offset);
315             return string.length();
316         } else {
317             byte[] bytes = string.getBytes(charset);
318             MemorySegment.copy(bytes, 0, segment, JAVA_BYTE, offset, bytes.length);
319             return bytes.length;
320         }
321     }
322 
323     public static void copyToSegmentRaw(String string, MemorySegment segment, long offset) {
324         JAVA_LANG_ACCESS.copyToSegmentRaw(string, segment, offset);
325     }
326 
327     private static IllegalArgumentException newIaeStringTooLarge() {
328         return new IllegalArgumentException("String too large");
329     }
330 
331 }