src/hotspot/cpu/arm/stubGenerator_arm.cpp



   1 /*
   2  * Copyright (c) 2008, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *


 911   // Generate the inner loop for forward aligned array copy
 912   //
 913   // Arguments
 914   //      from:      src address, 64-bit aligned
 915   //      to:        dst address, wordSize aligned
 916   //      count:     number of elements (32-bit int)
 917   //      bytes_per_count: number of bytes for each unit of 'count'
 918   //
 919   // Return the minimum initial value for count
 920   //
 921   // Notes:
 922   // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
 923   // - 'to' aligned on wordSize
 924   // - 'count' must be greater than or equal to the returned value
 925   //
 926   // Increases 'from' and 'to' by count*bytes_per_count.
 927   //
 928   // Scratches 'count', R3.
 929   // R4-R10 are preserved (saved/restored).
 930   //
 931   int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool unsafe_copy = false) {
 932     assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
 933 
 934     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
 935     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
 936     int pld_offset = config->pld_distance;
 937     const int count_per_loop = bytes_per_loop / bytes_per_count;
 938 
 939     bool split_read= config->split_ldm;
 940     bool split_write= config->split_stm;
 941 
 942     // XXX optim: use VLDM/VSTM when available (Neon) with PLD
 943     //  NEONCopyPLD
 944     //      PLD [r1, #0xC0]
 945     //      VLDM r1!,{d0-d7}
 946     //      VSTM r0!,{d0-d7}
 947     //      SUBS r2,r2,#0x40
 948     //      BGE NEONCopyPLD
 949 
 950     __ push(RegisterSet(R4,R10));
 951 
 952     const bool prefetch_before = pld_offset < 0;
 953     const bool prefetch_after = pld_offset > 0;
 954 
 955     Label L_skip_pld;
 956 
 957     {
 958       // UnsafeCopyMemory page error: continue after ucm
 959       UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true);
 960       // predecrease to exit when there is less than count_per_loop
 961       __ sub_32(count, count, count_per_loop);
 962 
 963       if (pld_offset != 0) {
 964         pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
 965 
 966         prefetch(from, to, 0);
 967 
 968         if (prefetch_before) {
 969           // If prefetch is done ahead, final PLDs that overflow the
 970           // copied area can be easily avoided. 'count' is predecreased
 971           // by the prefetch distance to optimize the inner loop and the
 972           // outer loop skips the PLD.
 973           __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count);
 974 
 975           // skip prefetch for small copies
 976           __ b(L_skip_pld, lt);
 977         }
 978 
 979         int offset = ArmCopyCacheLineSize;
 980         while (offset <= pld_offset) {
 981           prefetch(from, to, offset);
 982           offset += ArmCopyCacheLineSize;
 983         };
 984       }
 985 
 986       {
 987         // 32-bit ARM note: we have tried implementing loop unrolling to skip one
 988         // PLD with 64 bytes cache line but the gain was not significant.



 989 
 990         Label L_copy_loop;
 991         __ align(OptoLoopAlignment);
 992         __ BIND(L_copy_loop);
 993 
 994         if (prefetch_before) {
 995           prefetch(from, to, bytes_per_loop + pld_offset);
 996           __ BIND(L_skip_pld);
 997         }
 998 
 999         if (split_read) {
1000           // Split the register set into two sets so that there is less
1001           // latency between LDM and STM (R3-R6 available while R7-R10
1002           // still loading) and fewer register locking issues when iterating
1003           // on the first LDM.
1004           __ ldmia(from, RegisterSet(R3, R6), writeback);
1005           __ ldmia(from, RegisterSet(R7, R10), writeback);
1006         } else {
1007           __ ldmia(from, RegisterSet(R3, R10), writeback);
1008         }
1009 
1010         __ subs_32(count, count, count_per_loop);



1011 
1012         if (prefetch_after) {
1013           prefetch(from, to, pld_offset, bytes_per_loop);
1014         }
1015 
1016         if (split_write) {
1017           __ stmia(to, RegisterSet(R3, R6), writeback);
1018           __ stmia(to, RegisterSet(R7, R10), writeback);
1019         } else {
1020           __ stmia(to, RegisterSet(R3, R10), writeback);
1021         }
1022 
1023         __ b(L_copy_loop, ge);


1024 
1025         if (prefetch_before) {
1026           // the inner loop may end earlier, allowing the PLD to be skipped for the last iterations
1027           __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1028           __ b(L_skip_pld, ge);
1029         }
1030       }
1031       BLOCK_COMMENT("Remaining bytes:");
1032       // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
1033 
1034       // __ add(count, count, ...); // addition useless for the bit tests
1035       assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
1036 
1037       __ tst(count, 16 / bytes_per_count);
1038       __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
1039       __ stmia(to, RegisterSet(R3, R6), writeback, ne);





1040 
1041       __ tst(count, 8 / bytes_per_count);
1042       __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
1043       __ stmia(to, RegisterSet(R3, R4), writeback, ne);
1044 
1045       if (bytes_per_count <= 4) {
1046         __ tst(count, 4 / bytes_per_count);
1047         __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes
1048         __ str(R3, Address(to, 4, post_indexed), ne);
1049       }
1050 
1051       if (bytes_per_count <= 2) {
1052         __ tst(count, 2 / bytes_per_count);
1053         __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes
1054         __ strh(R3, Address(to, 2, post_indexed), ne);
1055       }
1056 
1057       if (bytes_per_count == 1) {
1058         __ tst(count, 1);
1059         __ ldrb(R3, Address(from, 1, post_indexed), ne);
1060         __ strb(R3, Address(to, 1, post_indexed), ne);
1061       }
1062     }
1063 
1064     __ pop(RegisterSet(R4,R10));
1065 
1066     return count_per_loop;
1067   }
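
For readers tracing the generated code: below is a minimal scalar C++ sketch (hypothetical, illustration only, not the stub itself) of the bookkeeping the forward loop performs. It assumes a 32-bit word (wordSize == 4) and omits PLD scheduling and the LDM/STM register sets; each 32-byte chunk is copied with memmove because the real stub loads the whole chunk into registers before storing it.

    #include <cstring>

    // Hypothetical scalar model of generate_forward_aligned_copy_loop.
    static void forward_aligned_copy_model(unsigned char* from, unsigned char* to,
                                           int count, int bytes_per_count) {
      const int bytes_per_loop = 8 * 4;                 // 8 words, wordSize == 4
      const int count_per_loop = bytes_per_loop / bytes_per_count;

      count -= count_per_loop;                          // predecrease, as in the stub
      while (count >= 0) {                              // main loop: 32 bytes per pass
        std::memmove(to, from, bytes_per_loop);
        from += bytes_per_loop;
        to   += bytes_per_loop;
        count -= count_per_loop;
      }
      count += count_per_loop;                          // remaining elements; the stub skips
                                                        // this add, the tested bits are unchanged
      for (int chunk = 16; chunk >= bytes_per_count; chunk >>= 1) {
        if (count & (chunk / bytes_per_count)) {        // same bit tests as the tst() sequence
          std::memmove(to, from, chunk);
          from += chunk;
          to   += chunk;
        }
      }
    }
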
1068 
1069 
1070   // Generate the inner loop for backward aligned array copy
1071   //
1072   // Arguments
1073   //      end_from:      src end address, 64-bit aligned
1074   //      end_to:        dst end address, wordSize aligned
1075   //      count:         number of elements (32-bit int)
1076   //      bytes_per_count: number of bytes for each unit of 'count'
1077   //
1078   // Return the minimum initial value for count
1079   //
1080   // Notes:
1081   // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
1082   // - 'end_to' aligned on wordSize
1083   // - 'count' must be greater than or equal to the returned value
1084   //
1085   // Decreases 'end_from' and 'end_to' by count*bytes_per_count.
1086   //
1087   // Scratches 'count', R3.
1088   // ARM R4-R10 are preserved (saved/restored).
1089   //
1090   int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, bool unsafe_copy = false) {
1091     assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
1092 
1093     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
1094     const int count_per_loop = bytes_per_loop / bytes_per_count;
1095 
1096     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned;
1097     int pld_offset = config->pld_distance;
1098 
1099     bool split_read= config->split_ldm;
1100     bool split_write= config->split_stm;
1101 
1102     // See the forward copy variant for additional comments.
1103 
1104     __ push(RegisterSet(R4,R10));
1105 
1106     {
1107       // UnsafeCopyMemory page error: continue after ucm
1108       UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true);
1109       __ sub_32(count, count, count_per_loop);
1110 
1111       const bool prefetch_before = pld_offset < 0;
1112       const bool prefetch_after = pld_offset > 0;
1113 
1114       Label L_skip_pld;
1115 
1116       if (pld_offset != 0) {
1117         pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1118 
1119         prefetch(end_from, end_to, -wordSize);

1120 
1121         if (prefetch_before) {
1122           __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count);
1123           __ b(L_skip_pld, lt);
1124         }
1125 
1126         int offset = ArmCopyCacheLineSize;
1127         while (offset <= pld_offset) {
1128           prefetch(end_from, end_to, -(wordSize + offset));
1129           offset += ArmCopyCacheLineSize;
1130         };
1131       }
1132 
1133       {
1134         // 32-bit ARM note: we have tried implementing loop unrolling to skip one
1135         // PLD with 64 bytes cache line but the gain was not significant.



1136 
1137         Label L_copy_loop;
1138         __ align(OptoLoopAlignment);
1139         __ BIND(L_copy_loop);
1140 
1141         if (prefetch_before) {
1142           prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
1143           __ BIND(L_skip_pld);
1144         }
1145 
1146         if (split_read) {
1147           __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
1148           __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
1149         } else {
1150           __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
1151         }
1152 
1153         __ subs_32(count, count, count_per_loop);



1154 
1155         if (prefetch_after) {
1156           prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
1157         }



1158 
1159         if (split_write) {
1160           __ stmdb(end_to, RegisterSet(R7, R10), writeback);
1161           __ stmdb(end_to, RegisterSet(R3, R6), writeback);
1162         } else {
1163           __ stmdb(end_to, RegisterSet(R3, R10), writeback);
1164         }
1165 
1166         __ b(L_copy_loop, ge);


1167 
1168         if (prefetch_before) {
1169           __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1170           __ b(L_skip_pld, ge);
1171         }

1172       }
1173       BLOCK_COMMENT("Remaining bytes:");
1174       // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
1175 
1176       // __ add(count, count, ...); // addition useless for the bit tests
1177       assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
1178 
1179       __ tst(count, 16 / bytes_per_count);
1180       __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
1181       __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);




1182 
1183       __ tst(count, 8 / bytes_per_count);
1184       __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
1185       __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne);
1186 
1187       if (bytes_per_count <= 4) {
1188         __ tst(count, 4 / bytes_per_count);
1189         __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
1190         __ str(R3, Address(end_to, -4, pre_indexed), ne);
1191       }
1192 
1193       if (bytes_per_count <= 2) {
1194         __ tst(count, 2 / bytes_per_count);
1195         __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
1196         __ strh(R3, Address(end_to, -2, pre_indexed), ne);
1197       }
1198 
1199       if (bytes_per_count == 1) {
1200         __ tst(count, 1);
1201         __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
1202         __ strb(R3, Address(end_to, -1, pre_indexed), ne);
1203       }
1204     }
1205     __ pop(RegisterSet(R4,R10));
1206 
1207     return count_per_loop;
1208   }
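
The backward variant above mirrors the same bookkeeping with descending addresses. A hypothetical scalar sketch (illustration only, same assumptions as the forward model): 'end_from'/'end_to' point just past the regions and are moved down before each chunk, matching the LDMDB/STMDB and pre-indexed accesses.

    #include <cstring>

    // Hypothetical scalar model of generate_backward_aligned_copy_loop.
    static void backward_aligned_copy_model(unsigned char* end_from, unsigned char* end_to,
                                            int count, int bytes_per_count) {
      const int bytes_per_loop = 8 * 4;                 // 8 words, wordSize == 4
      const int count_per_loop = bytes_per_loop / bytes_per_count;

      count -= count_per_loop;
      while (count >= 0) {
        end_from -= bytes_per_loop;                     // pre-decrement, as LDMDB/STMDB do
        end_to   -= bytes_per_loop;
        std::memmove(end_to, end_from, bytes_per_loop);
        count -= count_per_loop;
      }
      count += count_per_loop;                          // remaining elements
      for (int chunk = 16; chunk >= bytes_per_count; chunk >>= 1) {
        if (count & (chunk / bytes_per_count)) {
          end_from -= chunk;
          end_to   -= chunk;
          std::memmove(end_to, end_from, chunk);
        }
      }
    }
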
1209 
1210 
1211   // Generate the inner loop for shifted forward array copy (unaligned copy).
1212   // It can be used when bytes_per_count < wordSize, i.e. byte/short copy
1213   //
1214   // Arguments
1215   //      from:      start src address, 64-bit aligned
1216   //      to:        start dst address, (now) wordSize aligned
1217   //      count:     number of elements (32-bit int)
1218   //      bytes_per_count: number of bytes for each unit of 'count'
1219   //      lsr_shift: shift applied to 'old' value to skip already written bytes
1220   //      lsl_shift: shift applied to 'new' value to set the high bytes of the next write
1221   //
1222   // Return the minimum initial value for count
1223   //
1224   // Notes:


1739       store_one(tmp, to, bytes_per_count, forward, ne);
1740       if (bytes_per_count < 4) {
1741         __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
1742       }
1743     }
1744     return 7/bytes_per_count;
1745   }
1746 
1747   // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
1748   //
1749   // Arguments:
1750   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
1751   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
1752   //     count:             32-bit int, number of elements to be copied
1753   //     entry:             copy loop entry point
1754   //     bytes_per_count:   size of an element
1755   //     forward:           specifies copy direction
1756   //
1757   // Notes:
1758   //     shifts 'from' and 'to'
1759   void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry, bool unsafe_copy = false) {
1760     assert_different_registers(from, to, count, tmp);
1761 
1762     {
1763       // UnsafeCopyMemory page error: continue after ucm
1764       UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true);
1765       __ align(OptoLoopAlignment);
1766       Label L_small_loop;
1767       __ BIND(L_small_loop);
1768       store_one(tmp, to, bytes_per_count, forward, al, tmp2);
1769       __ BIND(entry); // entry point
1770       __ subs(count, count, 1);
1771       load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
1772       __ b(L_small_loop, ge);
1773     }
1774   }
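
The small-array loop above is software-pipelined: the branch to 'entry' skips the first store, so a store only ever follows a successful load. A hypothetical element-by-element C++ equivalent (illustration only):

    #include <cstring>

    // Hypothetical scalar model of copy_small_array: one element per iteration.
    static void small_copy_model(unsigned char* from, unsigned char* to,
                                 int count, int bytes_per_count, bool forward) {
      while (count-- > 0) {
        if (!forward) {                                 // backward: pre-decrement the bounds
          from -= bytes_per_count;
          to   -= bytes_per_count;
        }
        std::memmove(to, from, bytes_per_count);        // load_one + store_one
        if (forward) {                                  // forward: post-increment
          from += bytes_per_count;
          to   += bytes_per_count;
        }
      }
    }
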
1775 
1776   // Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
1777   //
1778   // Arguments:
1779   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
1780   //     count:             32-bit int, number of elements allowed to be copied
1781   //     to_remainder:      remainder of dividing 'to' by wordSize
1782   //     bytes_per_count:   size of an element
1783   //     forward:           specifies copy direction
1784   //     Rval:              contains an already read but not yet written word;
1785   //                        its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
1786   //
1787   // Notes:
1788   //     'count' must not be less than the returned value
1789   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
1790   //     shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
1791   //     decreases 'count' by the number of elements written
1792   //     Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
1793   int align_dst(Register to, Register count, Register Rval, Register tmp,


1870     return min_copy + required_to_align;
1871   }
1872 
1873   // Copies 'count' of elements using shifted copy loop
1874   //
1875   // Arguments:
1876   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
1877   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
1878   //     count:             32-bit int, number of elements to be copied
1879   //     bytes_per_count:   size of an element
1880   //     forward:           specifies copy direction
1881   //
1882   // Notes:
1883   //     'count' must not be less than the returned value
1884   //     'from' must be aligned by wordSize
1885   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
1886   //     shifts 'to' by the number of copied bytes
1887   //
1888   // Scratches 'from', 'count', R3 and R12.
1889   // R4-R10 saved for use.
1890   int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward, bool unsafe_copy = false) {
1891 
1892     const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
1893 
1894     int min_copy = 0;
1895 
1896     // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
1897   // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
1898 
1899     __ push(RegisterSet(R4,R10));

1900 
1901     {
1902       // UnsafeCopyMemory page error: continue after ucm
1903       UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true);
1904       load_one(Rval, from, wordSize, forward);
1905 
1906       switch (bytes_per_count) {
1907         case 2:
1908           min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
1909           break;
1910         case 1:
1911         {
1912           Label L1, L2, L3;
1913           int min_copy1, min_copy2, min_copy3;
1914 
1915           Label L_loop_finished;
1916 
1917           if (forward) {
1918               __ tbz(to, 0, L2);
1919               __ tbz(to, 1, L1);
1920 
1921               __ BIND(L3);
1922               min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
1923               __ b(L_loop_finished);
1924 
1925               __ BIND(L1);
1926               min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
1927               __ b(L_loop_finished);
1928 
1929               __ BIND(L2);
1930               min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
1931           } else {
1932               __ tbz(to, 0, L2);
1933               __ tbnz(to, 1, L3);
1934 
1935               __ BIND(L1);
1936               min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
1937               __ b(L_loop_finished);
1938 
1939                __ BIND(L3);
1940               min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
1941               __ b(L_loop_finished);
1942 
1943              __ BIND(L2);
1944               min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
1945           }




1946 
1947           min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3);




1948 
1949           __ BIND(L_loop_finished);
1950 
1951           break;

1952         }
1953         default:
1954           ShouldNotReachHere();
1955           break;



1956       }



1957     }

1958     __ pop(RegisterSet(R4,R10));
1959 
1960     return min_copy;
1961   }
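
The tbz/tbnz sequence in the bytes_per_count == 1 case selects the loop specialized for the remainder of 'to' modulo wordSize ('to' is known to be unaligned at this point). A hypothetical sketch of the mapping the forward-case tests compute (illustration only; the backward case reaches the same three targets with tbz/tbnz):

    #include <cstdint>

    // Hypothetical model of the forward-case dispatch in align_dst_and_generate_shifted_copy_loop.
    static int forward_dst_remainder_dispatch(std::uintptr_t to) {
      if ((to & 1) == 0) return 2;   // tbz(to, 0, L2): bit 0 clear -> remainder 2
      if ((to & 2) == 0) return 1;   // tbz(to, 1, L1): bit 1 clear -> remainder 1
      return 3;                      // falls through to L3: remainder 3
    }
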
1962 
1963 #ifndef PRODUCT
1964   int * get_arraycopy_counter(int bytes_per_count) {
1965     switch (bytes_per_count) {
1966       case 1:
1967         return &SharedRuntime::_jbyte_array_copy_ctr;
1968       case 2:
1969         return &SharedRuntime::_jshort_array_copy_ctr;
1970       case 4:
1971         return &SharedRuntime::_jint_array_copy_ctr;
1972       case 8:
1973         return &SharedRuntime::_jlong_array_copy_ctr;
1974       default:
1975         ShouldNotReachHere();
1976         return NULL;
1977     }
1978   }
1979 #endif // !PRODUCT
1980 
1981   address generate_unsafecopy_common_error_exit() {
1982     address start_pc = __ pc();
1983       __ mov(R0, 0);
1984       __ ret();
1985     return start_pc;
1986   }
1987 
1988   //
1989   //  Generate stub for primitive array copy.  If "aligned" is true, the
1990   //  "from" and "to" addresses are assumed to be heapword aligned.
1991   //
1992   //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
1993   //  "nooverlap_target" must be specified as the address to jump if they don't.
1994   //
1995   // Arguments for generated stub:
1996   //      from:  R0
1997   //      to:    R1
1998   //      count: R2 treated as signed 32-bit int
1999   //
2000   address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) {
2001     __ align(CodeEntryAlignment);
2002     StubCodeMark mark(this, "StubRoutines", name);
2003     address start = __ pc();
2004 
2005     const Register from  = R0;   // source array address
2006     const Register to    = R1;   // destination array address
2007     const Register count = R2;   // elements count


2038     //  *) The small and simple one applicable for any array (but not efficient for large arrays).
2039     // Currently "small" implementation is used if and only if the "large" one could not be used.
2040     // XXX optim: tune the limit higher ?
2041     // The lower applicability bound of the large implementation is actually determined by
2042     // src alignment, which requires <= 7 bytes, plus the 8 words needed by the aligned copy loop.
2043     const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
2044 
2045     Label L_small_array;
2046     __ cmp_32(count, small_copy_limit);
2047     __ b(L_small_array, le);
2048 
2049     // Otherwise proceed with large implementation.
2050 
2051     bool from_is_aligned = (bytes_per_count >= 8);
2052     if (aligned && forward && (HeapWordSize % 8 == 0)) {
2053         // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
2054         //  then from is aligned by 8
2055         from_is_aligned = true;
2056     }
2057 
2058     int count_required_to_align = 0;
2059     {
2060       // UnsafeCopyMemoryMark page error: continue at UnsafeCopyMemory common_error_exit
2061       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
2062       count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
2063       assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
2064     }
2065 
2066     // now 'from' is aligned
2067 
2068     bool to_is_aligned = false;
2069 
2070     if (bytes_per_count >= wordSize) {
2071       // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
2072       to_is_aligned = true;
2073     } else {
2074       if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
2075         // Originally 'from' and 'to' were heapword aligned;
2076         // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned,
2077         //  so 'to' is also heapword aligned and thus aligned by wordSize.
2078         to_is_aligned = true;
2079       }
2080     }
2081 
2082     Label L_unaligned_dst;
2083 
2084     if (!to_is_aligned) {
2085       BLOCK_COMMENT("Check dst alignment:");
2086       __ tst(to, wordSize - 1);
2087       __ b(L_unaligned_dst, ne); // 'to' is not aligned
2088     }
2089 
2090     // 'from' and 'to' are properly aligned
2091 
2092     int min_copy;
2093     if (forward) {
2094       min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count, !aligned /*add UnsafeCopyMemory entry*/);
2095     } else {
2096       min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count, !aligned /*add UnsafeCopyMemory entry*/);
2097     }
2098     assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
2099 
2100     if (status) {
2101       __ mov(R0, 0); // OK
2102     }
2103 
2104     __ ret();
2105 
2106     {
2107       copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */, !aligned /*add UnsafeCopyMemory entry*/);
2108 
2109       if (status) {
2110         __ mov(R0, 0); // OK
2111       }
2112 
2113       __ ret();
2114     }
2115 
2116     if (! to_is_aligned) {
2117       __ BIND(L_unaligned_dst);
2118       int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward, !aligned /*add UnsafeCopyMemory entry*/);
2119       assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
2120 
2121       if (status) {
2122         __ mov(R0, 0); // OK
2123       }
2124 
2125       __ ret();
2126     }
2127 
2128     return start;
2129   }
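
To summarize the control flow the stub above generates, here is a hypothetical C++ model of how a call is routed (illustration only; 'choose_copy_path' and the 8-byte source-alignment adjustment are illustrative and assume bytes_per_count < 8, so 'from' may need aligning):

    #include <cstdint>

    // Hypothetical model of the path selection in generate_primitive_copy.
    enum CopyPath { SMALL_LOOP, ALIGNED_LOOP, SHIFTED_LOOP };

    static CopyPath choose_copy_path(std::uintptr_t from, std::uintptr_t to,
                                     int count, int bytes_per_count) {
      const int word_size = 4;                                      // 32-bit ARM
      const int small_copy_limit = (8 * word_size + 7) / bytes_per_count;

      if (count <= small_copy_limit) {
        return SMALL_LOOP;                                          // copy_small_array path
      }
      // The large path first aligns 'from' to 8 bytes (consuming at most 7 bytes),
      // which shifts 'to' by the same number of bytes.
      std::uintptr_t adjust = (8 - (from & 7)) & 7;
      to += adjust;

      return ((to & (word_size - 1)) == 0)
          ? ALIGNED_LOOP                                            // LDM/STM aligned copy loop
          : SHIFTED_LOOP;                                           // shifted copy for unaligned dst
    }
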
2130 
2131 
2132   // Generates pattern of code to be placed after raw data copying in generate_oop_copy
2133   // Includes return from arraycopy stub.
2134   //
2135   // Arguments:
2136   //     to:       destination pointer after copying.
2137   //               if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
2138   //     count:    total number of copied elements, 32-bit int


2883     //        the conjoint stubs use them.
2884 
2885     bool status = false; // non-failing C2 stubs need not return a status in R0
2886 
2887 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */
2888     // With this flag, the C2 stubs are tested by generating calls to
2889     // generic_arraycopy instead of Runtime1::arraycopy
2890 
2891     // Runtime1::arraycopy returns a status in R0 (0 if OK, else ~copied)
2892     // and the result is tested to see whether the arraycopy stub should
2893     // be called.
2894 
2895     // When we test arraycopy this way, we must generate extra code in the
2896     // arraycopy methods callable from C2 generic_arraycopy to set the
2897     // status to 0 for those that always succeed (calling the slow path stub might
2898     // lead to errors since the copy has already been performed).
2899 
2900     status = true; // generate a status compatible with C1 calls
2901 #endif
2902 
2903     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
2904     UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
2905 
2906     // these always need a status in case they are called from generic_arraycopy
2907     StubRoutines::_jbyte_disjoint_arraycopy  = generate_primitive_copy(false, "jbyte_disjoint_arraycopy",  true, 1, true);
2908     StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true);
2909     StubRoutines::_jint_disjoint_arraycopy   = generate_primitive_copy(false, "jint_disjoint_arraycopy",   true, 4, true);
2910     StubRoutines::_jlong_disjoint_arraycopy  = generate_primitive_copy(false, "jlong_disjoint_arraycopy",  true, 8, true);
2911     StubRoutines::_oop_disjoint_arraycopy    = generate_oop_copy      (false, "oop_disjoint_arraycopy",    true,    true);
2912 
2913     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true);
2914     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true);
2915     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy",  status, 4, true);
2916     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true);
2917     StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_oop_copy      (true, "arrayof_oop_disjoint_arraycopy",   status,    true);
2918 
2919     // these always need a status in case they are called from generic_arraycopy
2920     StubRoutines::_jbyte_arraycopy  = generate_primitive_copy(false, "jbyte_arraycopy",  true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy);
2921     StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy);
2922     StubRoutines::_jint_arraycopy   = generate_primitive_copy(false, "jint_arraycopy",   true, 4, false, StubRoutines::_jint_disjoint_arraycopy);
2923     StubRoutines::_jlong_arraycopy  = generate_primitive_copy(false, "jlong_arraycopy",  true, 8, false, StubRoutines::_jlong_disjoint_arraycopy);
2924     StubRoutines::_oop_arraycopy    = generate_oop_copy      (false, "oop_arraycopy",    true,    false, StubRoutines::_oop_disjoint_arraycopy);
2925 


3068       aes_init();
3069       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3070       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3071       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3072       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
3073     }
3074 #endif // COMPILE_CRYPTO
3075   }
3076 
3077 
3078  public:
3079   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3080     if (all) {
3081       generate_all();
3082     } else {
3083       generate_initial();
3084     }
3085   }
3086 }; // end class declaration
3087 
3088 #define UCM_TABLE_MAX_ENTRIES 32
3089 void StubGenerator_generate(CodeBuffer* code, bool all) {
3090   if (UnsafeCopyMemory::_table == NULL) {
3091     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3092   }
3093   StubGenerator g(code, all);
3094 }
   1 /*
   2  * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *


 911   // Generate the inner loop for forward aligned array copy
 912   //
 913   // Arguments
 914   //      from:      src address, 64-bit aligned
 915   //      to:        dst address, wordSize aligned
 916   //      count:     number of elements (32-bit int)
 917   //      bytes_per_count: number of bytes for each unit of 'count'
 918   //
 919   // Return the minimum initial value for count
 920   //
 921   // Notes:
 922   // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
 923   // - 'to' aligned on wordSize
 924   // - 'count' must be greater than or equal to the returned value
 925   //
 926   // Increases 'from' and 'to' by count*bytes_per_count.
 927   //
 928   // Scratches 'count', R3.
 929   // R4-R10 are preserved (saved/restored).
 930   //
 931   int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) {
 932     assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
 933 
 934     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
 935     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
 936     int pld_offset = config->pld_distance;
 937     const int count_per_loop = bytes_per_loop / bytes_per_count;
 938 
 939     bool split_read= config->split_ldm;
 940     bool split_write= config->split_stm;
 941 
 942     // XXX optim: use VLDM/VSTM when available (Neon) with PLD
 943     //  NEONCopyPLD
 944     //      PLD [r1, #0xC0]
 945     //      VLDM r1!,{d0-d7}
 946     //      VSTM r0!,{d0-d7}
 947     //      SUBS r2,r2,#0x40
 948     //      BGE NEONCopyPLD
 949 
 950     __ push(RegisterSet(R4,R10));
 951 
 952     const bool prefetch_before = pld_offset < 0;
 953     const bool prefetch_after = pld_offset > 0;
 954 
 955     Label L_skip_pld;
 956 
 957     // predecrease to exit when there is less than count_per_loop
 958     __ sub_32(count, count, count_per_loop);
 959 
 960     if (pld_offset != 0) {
 961       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
 962 
 963       prefetch(from, to, 0);
 964 
 965       if (prefetch_before) {
 966         // If prefetch is done ahead, final PLDs that overflow the
 967         // copied area can be easily avoided. 'count' is predecreased
 968         // by the prefetch distance to optimize the inner loop and the
 969         // outer loop skips the PLD.
 970         __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count);
 971 
 972         // skip prefetch for small copies
 973         __ b(L_skip_pld, lt);



 974       }
 975 
 976       int offset = ArmCopyCacheLineSize;
 977       while (offset <= pld_offset) {
 978         prefetch(from, to, offset);
 979         offset += ArmCopyCacheLineSize;
 980       };
 981     }
 982 
 983     {
 984       // 32-bit ARM note: we have tried implementing loop unrolling to skip one
 985       // PLD with 64 bytes cache line but the gain was not significant.





 986 
 987       Label L_copy_loop;
 988       __ align(OptoLoopAlignment);
 989       __ BIND(L_copy_loop);
 990 
 991       if (prefetch_before) {
 992         prefetch(from, to, bytes_per_loop + pld_offset);
 993         __ BIND(L_skip_pld);
 994       }
 995 
 996       if (split_read) {
 997         // Split the register set into two sets so that there is less
 998         // latency between LDM and STM (R3-R6 available while R7-R10
 999         // still loading) and fewer register locking issues when iterating
1000         // on the first LDM.
1001         __ ldmia(from, RegisterSet(R3, R6), writeback);
1002         __ ldmia(from, RegisterSet(R7, R10), writeback);
1003       } else {
1004         __ ldmia(from, RegisterSet(R3, R10), writeback);
1005       }
1006 
1007       __ subs_32(count, count, count_per_loop);





1008 
1009       if (prefetch_after) {
1010         prefetch(from, to, pld_offset, bytes_per_loop);
1011       }
1012 
1013       if (split_write) {
1014         __ stmia(to, RegisterSet(R3, R6), writeback);
1015         __ stmia(to, RegisterSet(R7, R10), writeback);
1016       } else {
1017         __ stmia(to, RegisterSet(R3, R10), writeback);
1018       }


1019 
1020       __ b(L_copy_loop, ge);

1021 
1022       if (prefetch_before) {
1022         // the inner loop may end earlier, allowing the PLD to be skipped for the last iterations
1024         __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1025         __ b(L_skip_pld, ge);
1026       }
1027     }
1028     BLOCK_COMMENT("Remaining bytes:");
1029     // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
1030 
1031     // __ add(count, count, ...); // addition useless for the bit tests
1032     assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");

1033 
1034     __ tst(count, 16 / bytes_per_count);
1035     __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
1036     __ stmia(to, RegisterSet(R3, R6), writeback, ne);
1037 
1038     __ tst(count, 8 / bytes_per_count);
1039     __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
1040     __ stmia(to, RegisterSet(R3, R4), writeback, ne);
1041 
1042     if (bytes_per_count <= 4) {
1043       __ tst(count, 4 / bytes_per_count);
1044       __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes
1045       __ str(R3, Address(to, 4, post_indexed), ne);
1046     }
1047 
1048     if (bytes_per_count <= 2) {
1049       __ tst(count, 2 / bytes_per_count);
1050       __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes
1051       __ strh(R3, Address(to, 2, post_indexed), ne);
1052     }
1053 
1054     if (bytes_per_count == 1) {
1055       __ tst(count, 1);
1056       __ ldrb(R3, Address(from, 1, post_indexed), ne);
1057       __ strb(R3, Address(to, 1, post_indexed), ne);
1058     }
1059 
1060     __ pop(RegisterSet(R4,R10));
1061 
1062     return count_per_loop;
1063   }
1064 
1065 
1066   // Generate the inner loop for backward aligned array copy
1067   //
1068   // Arguments
1069   //      end_from:      src end address, 64-bit aligned
1070   //      end_to:        dst end address, wordSize aligned
1071   //      count:         number of elements (32-bit int)
1072   //      bytes_per_count: number of bytes for each unit of 'count'
1073   //
1074   // Return the minimum initial value for count
1075   //
1076   // Notes:
1077   // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
1078   // - 'end_to' aligned on wordSize
1079   // - 'count' must be greater than or equal to the returned value
1080   //
1081   // Decreases 'end_from' and 'end_to' by count*bytes_per_count.
1082   //
1083   // Scratches 'count', R3.
1084   // ARM R4-R10 are preserved (saved/restored).
1085   //
1086   int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) {
1087     assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
1088 
1089     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
1090     const int count_per_loop = bytes_per_loop / bytes_per_count;
1091 
1092     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned;
1093     int pld_offset = config->pld_distance;
1094 
1095     bool split_read= config->split_ldm;
1096     bool split_write= config->split_stm;
1097 
1098     // See the forward copy variant for additional comments.
1099 
1100     __ push(RegisterSet(R4,R10));
1101 
1102     __ sub_32(count, count, count_per_loop);



1103 
1104     const bool prefetch_before = pld_offset < 0;
1105     const bool prefetch_after = pld_offset > 0;


1106 
1107     Label L_skip_pld;

1108 
1109     if (pld_offset != 0) {
1110       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1111 
1112       prefetch(end_from, end_to, -wordSize);



1113 
1114       if (prefetch_before) {
1115         __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count);
1116         __ b(L_skip_pld, lt);


1117       }
1118 
1119       int offset = ArmCopyCacheLineSize;
1120       while (offset <= pld_offset) {
1121         prefetch(end_from, end_to, -(wordSize + offset));
1122         offset += ArmCopyCacheLineSize;
1123       };
1124     }
1125 
1126     {
1127       // 32-bit ARM note: we have tried implementing loop unrolling to skip one
1128       // PLD with 64 bytes cache line but the gain was not significant.





1129 
1130       Label L_copy_loop;
1131       __ align(OptoLoopAlignment);
1132       __ BIND(L_copy_loop);



1133 
1134       if (prefetch_before) {
1135         prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
1136         __ BIND(L_skip_pld);
1137       }
1138 
1139       if (split_read) {
1140         __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
1141         __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
1142       } else {
1143         __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
1144       }
1145 
1146       __ subs_32(count, count, count_per_loop);





1147 
1148       if (prefetch_after) {
1149         prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
1150       }
1151 
1152       if (split_write) {
1153         __ stmdb(end_to, RegisterSet(R7, R10), writeback);
1154         __ stmdb(end_to, RegisterSet(R3, R6), writeback);
1155       } else {
1156         __ stmdb(end_to, RegisterSet(R3, R10), writeback);
1157       }


1158 
1159       __ b(L_copy_loop, ge);

1160 
1161       if (prefetch_before) {
1162         __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1163         __ b(L_skip_pld, ge);
1164       }
1165     }
1166     BLOCK_COMMENT("Remaining bytes:");
1167     // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
1168 
1169     // __ add(count, count, ...); // addition useless for the bit tests
1170     assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");

1171 
1172     __ tst(count, 16 / bytes_per_count);
1173     __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
1174     __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);


1175 
1176     __ tst(count, 8 / bytes_per_count);
1177     __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
1178     __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne);


1179 
1180     if (bytes_per_count <= 4) {
1181       __ tst(count, 4 / bytes_per_count);
1182       __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
1183       __ str(R3, Address(end_to, -4, pre_indexed), ne);

1184     }
1185 
1186     if (bytes_per_count <= 2) {
1187       __ tst(count, 2 / bytes_per_count);
1188       __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
1189       __ strh(R3, Address(end_to, -2, pre_indexed), ne);
1190     }
1191 
1192     if (bytes_per_count == 1) {
1193       __ tst(count, 1);
1194       __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
1195       __ strb(R3, Address(end_to, -1, pre_indexed), ne);
1196     }
1197 
1198     __ pop(RegisterSet(R4,R10));
1199 
1200     return count_per_loop;
1201   }
1202 
1203 
1204   // Generate the inner loop for shifted forward array copy (unaligned copy).
1205   // It can be used when bytes_per_count < wordSize, i.e. byte/short copy
1206   //
1207   // Arguments
1208   //      from:      start src address, 64-bit aligned
1209   //      to:        start dst address, (now) wordSize aligned
1210   //      count:     number of elements (32-bit int)
1211   //      bytes_per_count: number of bytes for each unit of 'count'
1212   //      lsr_shift: shift applied to 'old' value to skip already written bytes
1213   //      lsl_shift: shift applied to 'new' value to set the high bytes of the next write
1214   //
1215   // Return the minimum initial value for count
1216   //
1217   // Notes:


1732       store_one(tmp, to, bytes_per_count, forward, ne);
1733       if (bytes_per_count < 4) {
1734         __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
1735       }
1736     }
1737     return 7/bytes_per_count;
1738   }
1739 
1740   // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
1741   //
1742   // Arguments:
1743   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
1744   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
1745   //     count:             32-bit int, number of elements to be copied
1746   //     entry:             copy loop entry point
1747   //     bytes_per_count:   size of an element
1748   //     forward:           specifies copy direction
1749   //
1750   // Notes:
1751   //     shifts 'from' and 'to'
1752   void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) {
1753     assert_different_registers(from, to, count, tmp);
1754 
1755     __ align(OptoLoopAlignment);
1756     Label L_small_loop;
1757     __ BIND(L_small_loop);
1758     store_one(tmp, to, bytes_per_count, forward, al, tmp2);
1759     __ BIND(entry); // entry point
1760     __ subs(count, count, 1);
1761     load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
1762     __ b(L_small_loop, ge);




1763   }
1764 
1765   // Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
1766   //
1767   // Arguments:
1768   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
1769   //     count:             32-bit int, number of elements allowed to be copied
1770   //     to_remainder:      remainder of dividing 'to' by wordSize
1771   //     bytes_per_count:   size of an element
1772   //     forward:           specifies copy direction
1773   //     Rval:              contains an already read but not yet written word;
1774   //                        its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
1775   //
1776   // Notes:
1777   //     'count' must not be less than the returned value
1778   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
1779   //     shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
1780   //     decreases 'count' by the number of elements written
1781   //     Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
1782   int align_dst(Register to, Register count, Register Rval, Register tmp,


1859     return min_copy + required_to_align;
1860   }
1861 
1862   // Copies 'count' of elements using shifted copy loop
1863   //
1864   // Arguments:
1865   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
1866   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
1867   //     count:             32-bit int, number of elements to be copied
1868   //     bytes_per_count:   size of an element
1869   //     forward:           specifies copy direction
1870   //
1871   // Notes:
1872   //     'count' must not be less than the returned value
1873   //     'from' must be aligned by wordSize
1874   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
1875   //     shifts 'to' by the number of copied bytes
1876   //
1877   // Scratches 'from', 'count', R3 and R12.
1878   // R4-R10 saved for use.
1879   int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
1880 
1881     const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
1882 
1883     int min_copy = 0;
1884 
1885     // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
1886   // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
1887 
1888     __ push(RegisterSet(R4,R10));
1889     load_one(Rval, from, wordSize, forward);
1890 
1891     switch (bytes_per_count) {
1892       case 2:
1893         min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
1894         break;
1895       case 1:
1896       {
1897         Label L1, L2, L3;
1898         int min_copy1, min_copy2, min_copy3;
1899 
1900         Label L_loop_finished;




1901 
1902         if (forward) {
1903             __ tbz(to, 0, L2);
1904             __ tbz(to, 1, L1);




1905 
1906             __ BIND(L3);
1907             min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
1908             __ b(L_loop_finished);
1909 
1910             __ BIND(L1);
1911             min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
1912             __ b(L_loop_finished);
1913 
1914             __ BIND(L2);
1915             min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
1916         } else {
1917             __ tbz(to, 0, L2);
1918             __ tbnz(to, 1, L3);
1919 
1920             __ BIND(L1);
1921             min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
1922             __ b(L_loop_finished);
1923 
1924              __ BIND(L3);
1925             min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
1926             __ b(L_loop_finished);
1927 
1928            __ BIND(L2);
1929             min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
1930         }
1931 
1932         min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3);
1933 
1934         __ BIND(L_loop_finished);
1935 
1936         break;
1937       }
1938       default:
1939         ShouldNotReachHere();
1940         break;
1941     }
1942 
1943     __ pop(RegisterSet(R4,R10));
1944 
1945     return min_copy;
1946   }
1947 
1948 #ifndef PRODUCT
1949   int * get_arraycopy_counter(int bytes_per_count) {
1950     switch (bytes_per_count) {
1951       case 1:
1952         return &SharedRuntime::_jbyte_array_copy_ctr;
1953       case 2:
1954         return &SharedRuntime::_jshort_array_copy_ctr;
1955       case 4:
1956         return &SharedRuntime::_jint_array_copy_ctr;
1957       case 8:
1958         return &SharedRuntime::_jlong_array_copy_ctr;
1959       default:
1960         ShouldNotReachHere();
1961         return NULL;
1962     }
1963   }
1964 #endif // !PRODUCT
1965 
1966   //
1967   //  Generate stub for primitive array copy.  If "aligned" is true, the
1968   //  "from" and "to" addresses are assumed to be heapword aligned.
1969   //
1970   //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
1971   //  "nooverlap_target" must be specified as the address to jump if they don't.
1972   //
1973   // Arguments for generated stub:
1974   //      from:  R0
1975   //      to:    R1
1976   //      count: R2 treated as signed 32-bit int
1977   //
1978   address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) {
1979     __ align(CodeEntryAlignment);
1980     StubCodeMark mark(this, "StubRoutines", name);
1981     address start = __ pc();
1982 
1983     const Register from  = R0;   // source array address
1984     const Register to    = R1;   // destination array address
1985     const Register count = R2;   // elements count


2016     //  *) The small and simple one applicable for any array (but not efficient for large arrays).
2017     // Currently "small" implementation is used if and only if the "large" one could not be used.
2018     // XXX optim: tune the limit higher ?
2019     // The lower applicability bound of the large implementation is actually determined by
2020     // src alignment, which requires <= 7 bytes, plus the 8 words needed by the aligned copy loop.
2021     const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
2022 
2023     Label L_small_array;
2024     __ cmp_32(count, small_copy_limit);
2025     __ b(L_small_array, le);
2026 
2027     // Otherwise proceed with large implementation.
2028 
2029     bool from_is_aligned = (bytes_per_count >= 8);
2030     if (aligned && forward && (HeapWordSize % 8 == 0)) {
2031         // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
2032         //  then from is aligned by 8
2033         from_is_aligned = true;
2034     }
2035 
2036     int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
2037     assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");





2038 
2039     // now 'from' is aligned
2040 
2041     bool to_is_aligned = false;
2042 
2043     if (bytes_per_count >= wordSize) {
2044       // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
2045       to_is_aligned = true;
2046     } else {
2047       if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
2048         // Originally 'from' and 'to' were heapword aligned;
2049         // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned,
2050         //  so 'to' is also heapword aligned and thus aligned by wordSize.
2051         to_is_aligned = true;
2052       }
2053     }
2054 
2055     Label L_unaligned_dst;
2056 
2057     if (!to_is_aligned) {
2058       BLOCK_COMMENT("Check dst alignment:");
2059       __ tst(to, wordSize - 1);
2060       __ b(L_unaligned_dst, ne); // 'to' is not aligned
2061     }
2062 
2063     // 'from' and 'to' are properly aligned
2064 
2065     int min_copy;
2066     if (forward) {
2067       min_copy = generate_forward_aligned_copy_loop (from, to, count, bytes_per_count);
2068     } else {
2069       min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
2070     }
2071     assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
2072 
2073     if (status) {
2074       __ mov(R0, 0); // OK
2075     }
2076 
2077     __ ret();
2078 
2079     {
2080       copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */);
2081 
2082       if (status) {
2083         __ mov(R0, 0); // OK
2084       }
2085 
2086       __ ret();
2087     }
2088 
2089     if (! to_is_aligned) {
2090       __ BIND(L_unaligned_dst);
2091       int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
2092       assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
2093 
2094       if (status) {
2095         __ mov(R0, 0); // OK
2096       }
2097 
2098       __ ret();
2099     }
2100 
2101     return start;
2102   }
2103 
2104 
2105   // Generates pattern of code to be placed after raw data copying in generate_oop_copy
2106   // Includes return from arraycopy stub.
2107   //
2108   // Arguments:
2109   //     to:       destination pointer after copying.
2110   //               if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
2111   //     count:    total number of copied elements, 32-bit int


2856     //        the conjoint stubs use them.
2857 
2858     bool status = false; // non-failing C2 stubs need not return a status in R0
2859 
2860 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */
2861     // With this flag, the C2 stubs are tested by generating calls to
2862     // generic_arraycopy instead of Runtime1::arraycopy
2863 
2864     // Runtime1::arraycopy returns a status in R0 (0 if OK, else ~copied)
2865     // and the result is tested to see whether the arraycopy stub should
2866     // be called.
2867 
2868     // When we test arraycopy this way, we must generate extra code in the
2869     // arraycopy methods callable from C2 generic_arraycopy to set the
2870     // status to 0 for those that always succeed (calling the slow path stub might
2871     // lead to errors since the copy has already been performed).
2872 
2873     status = true; // generate a status compatible with C1 calls
2874 #endif
2875 



2876     // these always need a status in case they are called from generic_arraycopy
2877     StubRoutines::_jbyte_disjoint_arraycopy  = generate_primitive_copy(false, "jbyte_disjoint_arraycopy",  true, 1, true);
2878     StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true);
2879     StubRoutines::_jint_disjoint_arraycopy   = generate_primitive_copy(false, "jint_disjoint_arraycopy",   true, 4, true);
2880     StubRoutines::_jlong_disjoint_arraycopy  = generate_primitive_copy(false, "jlong_disjoint_arraycopy",  true, 8, true);
2881     StubRoutines::_oop_disjoint_arraycopy    = generate_oop_copy      (false, "oop_disjoint_arraycopy",    true,    true);
2882 
2883     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true);
2884     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true);
2885     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy",  status, 4, true);
2886     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true);
2887     StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_oop_copy      (true, "arrayof_oop_disjoint_arraycopy",   status,    true);
2888 
2889     // these always need a status in case they are called from generic_arraycopy
2890     StubRoutines::_jbyte_arraycopy  = generate_primitive_copy(false, "jbyte_arraycopy",  true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy);
2891     StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy);
2892     StubRoutines::_jint_arraycopy   = generate_primitive_copy(false, "jint_arraycopy",   true, 4, false, StubRoutines::_jint_disjoint_arraycopy);
2893     StubRoutines::_jlong_arraycopy  = generate_primitive_copy(false, "jlong_arraycopy",  true, 8, false, StubRoutines::_jlong_disjoint_arraycopy);
2894     StubRoutines::_oop_arraycopy    = generate_oop_copy      (false, "oop_arraycopy",    true,    false, StubRoutines::_oop_disjoint_arraycopy);
2895 


3038       aes_init();
3039       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3040       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3041       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
3042       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
3043     }
3044 #endif // COMPILE_CRYPTO
3045   }
3046 
3047 
3048  public:
3049   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3050     if (all) {
3051       generate_all();
3052     } else {
3053       generate_initial();
3054     }
3055   }
3056 }; // end class declaration
3057 

3058 void StubGenerator_generate(CodeBuffer* code, bool all) {



3059   StubGenerator g(code, all);
3060 }