1 /*
2 * Copyright (c) 2000, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "compiler/compileLog.hpp"
26 #include "gc/shared/barrierSet.hpp"
27 #include "gc/shared/c2/barrierSetC2.hpp"
28 #include "memory/allocation.inline.hpp"
29 #include "opto/addnode.hpp"
30 #include "opto/callnode.hpp"
31 #include "opto/castnode.hpp"
32 #include "opto/connode.hpp"
33 #include "opto/convertnode.hpp"
34 #include "opto/divnode.hpp"
35 #include "opto/loopnode.hpp"
36 #include "opto/movenode.hpp"
37 #include "opto/mulnode.hpp"
38 #include "opto/opaquenode.hpp"
39 #include "opto/phase.hpp"
40 #include "opto/predicates.hpp"
41 #include "opto/rootnode.hpp"
42 #include "opto/runtime.hpp"
43 #include "opto/subnode.hpp"
44 #include "opto/superword.hpp"
45 #include "opto/vectornode.hpp"
46 #include "runtime/globals_extension.hpp"
47 #include "runtime/stubRoutines.hpp"
48
49 //------------------------------is_loop_exit-----------------------------------
50 // Given an IfNode, return the loop-exiting projection or null if both
51 // arms remain in the loop.
52 Node *IdealLoopTree::is_loop_exit(Node *iff) const {
53 if (iff->outcnt() != 2) return nullptr; // Ignore partially dead tests
54 PhaseIdealLoop *phase = _phase;
55 // Test is an IfNode, has 2 projections. If BOTH are in the loop
56 // we need loop unswitching instead of peeling.
57 if (!is_member(phase->get_loop(iff->raw_out(0))))
58 return iff->raw_out(0);
59 if (!is_member(phase->get_loop(iff->raw_out(1))))
60 return iff->raw_out(1);
61 return nullptr;
62 }
63
64
65 //=============================================================================
66
67
68 //------------------------------record_for_igvn----------------------------
69 // Put loop body on igvn work list
70 void IdealLoopTree::record_for_igvn() {
71 for (uint i = 0; i < _body.size(); i++) {
72 Node *n = _body.at(i);
73 _phase->_igvn._worklist.push(n);
74 }
75 // put body of outer strip mined loop on igvn work list as well
76 if (_head->is_CountedLoop() && _head->as_Loop()->is_strip_mined()) {
77 CountedLoopNode* l = _head->as_CountedLoop();
78 Node* outer_loop = l->outer_loop();
79 assert(outer_loop != nullptr, "missing piece of strip mined loop");
80 _phase->_igvn._worklist.push(outer_loop);
81 Node* outer_loop_tail = l->outer_loop_tail();
82 assert(outer_loop_tail != nullptr, "missing piece of strip mined loop");
83 _phase->_igvn._worklist.push(outer_loop_tail);
84 Node* outer_loop_end = l->outer_loop_end();
85 assert(outer_loop_end != nullptr, "missing piece of strip mined loop");
86 _phase->_igvn._worklist.push(outer_loop_end);
87 Node* outer_safepoint = l->outer_safepoint();
88 assert(outer_safepoint != nullptr, "missing piece of strip mined loop");
89 _phase->_igvn._worklist.push(outer_safepoint);
90 IfFalseNode* cle_out = _head->as_CountedLoop()->loopexit()->false_proj();
91 assert(cle_out != nullptr, "missing piece of strip mined loop");
92 _phase->_igvn._worklist.push(cle_out);
93 }
94 }
95
96 //------------------------------compute_exact_trip_count-----------------------
97 // Compute loop trip count if possible. Do not recalculate trip count for
98 // split loops (pre-main-post) which have their limits and inits behind Opaque node.
99 void IdealLoopTree::compute_trip_count(PhaseIdealLoop* phase, BasicType loop_bt) {
100 if (!_head->as_Loop()->is_valid_counted_loop(loop_bt)) {
101 return;
102 }
103 BaseCountedLoopNode* cl = _head->as_BaseCountedLoop();
104 // Trip count may become nonexact for iteration split loops since
105 // RCE modifies limits. Note, _trip_count value is not reset since
106 // it is used to limit unrolling of main loop.
107 cl->set_nonexact_trip_count();
108
109 // Loop's test should be part of loop.
110 if (!phase->ctrl_is_member(this, cl->loopexit()->in(CountedLoopEndNode::TestValue)))
111 return; // Infinite loop
112
113 #ifdef ASSERT
114 BoolTest::mask bt = cl->loopexit()->test_trip();
115 assert(bt == BoolTest::lt || bt == BoolTest::gt ||
116 bt == BoolTest::ne, "canonical test is expected");
117 #endif
118
119 Node* init_n = cl->init_trip();
120 Node* limit_n = cl->limit();
121 if (init_n != nullptr && limit_n != nullptr) {
122 jlong stride_con = cl->stride_con();
123 const TypeInteger* init_type = phase->_igvn.type(init_n)->is_integer(loop_bt);
124 const TypeInteger* limit_type = phase->_igvn.type(limit_n)->is_integer(loop_bt);
125
126 // compute trip count
127 // It used to be computed as:
128 // max(1, limit_con - init_con + stride_m) / stride_con
129 // with stride_m = stride_con - (stride_con > 0 ? 1 : -1)
130 // for int counted loops only and by promoting all values to long to avoid overflow
131 // This implements the computation for int and long counted loops in a way that promotion to the next larger integer
132 // type is not needed to protect against overflow.
133 //
134 // Use unsigned longs to avoid overflow: number of iteration is a positive number but can be really large for
135 // instance if init_con = min_jint, limit_con = max_jint
136 jlong init_con = (stride_con > 0) ? init_type->lo_as_long() : init_type->hi_as_long();
137 julong uinit_con = init_con;
138 jlong limit_con = (stride_con > 0) ? limit_type->hi_as_long() : limit_type->lo_as_long();
139 julong ulimit_con = limit_con;
140 // The loop body is always executed at least once even if init >= limit (for stride_con > 0) or
141 // init <= limit (for stride_con < 0).
142 julong udiff = 1;
143 if (stride_con > 0 && limit_con > init_con) {
144 udiff = ulimit_con - uinit_con;
145 } else if (stride_con < 0 && limit_con < init_con) {
146 udiff = uinit_con - ulimit_con;
147 }
148 // The loop runs for one more iteration if the limit is (stride > 0 in this example):
149 // init + k * stride + small_value, 0 < small_value < stride
150 julong utrip_count = udiff / ABS(stride_con);
151 if (utrip_count * ABS(stride_con) != udiff) {
152 // Guaranteed to not overflow because it can only happen for ABS(stride) > 1 in which case, utrip_count can't be
153 // max_juint/max_julong
154 utrip_count++;
155 }
156
157 #ifdef ASSERT
158 if (loop_bt == T_INT) {
159 // Use longs to avoid integer overflow.
160 jlong init_con = (stride_con > 0) ? init_type->is_int()->_lo : init_type->is_int()->_hi;
161 jlong limit_con = (stride_con > 0) ? limit_type->is_int()->_hi : limit_type->is_int()->_lo;
162 int stride_m = stride_con - (stride_con > 0 ? 1 : -1);
163 jlong trip_count = (limit_con - init_con + stride_m) / stride_con;
164 // The loop body is always executed at least once even if init >= limit (for stride_con > 0) or
165 // init <= limit (for stride_con < 0).
166 trip_count = MAX2(trip_count, (jlong)1);
167 assert(checked_cast<juint>(trip_count) == checked_cast<juint>(utrip_count), "incorrect trip count computation");
168 }
169 #endif
170
171 if (utrip_count < max_unsigned_integer(loop_bt)) {
172 if (init_n->is_Con() && limit_n->is_Con()) {
173 // Set exact trip count.
174 cl->set_exact_trip_count(utrip_count);
175 } else if (loop_bt == T_LONG || cl->as_CountedLoop()->unrolled_count() == 1) {
176 // Set maximum trip count before unrolling.
177 cl->set_trip_count(utrip_count);
178 }
179 }
180 }
181 }
182
183 //------------------------------compute_profile_trip_cnt----------------------------
184 // Compute loop trip count from profile data as
185 // (backedge_count + loop_exit_count) / loop_exit_count
186
187 float IdealLoopTree::compute_profile_trip_cnt_helper(Node* n) {
188 if (n->is_If()) {
189 IfNode *iff = n->as_If();
190 if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
191 Node *exit = is_loop_exit(iff);
192 if (exit) {
193 float exit_prob = iff->_prob;
194 if (exit->Opcode() == Op_IfFalse) {
195 exit_prob = 1.0 - exit_prob;
196 }
197 if (exit_prob > PROB_MIN) {
198 float exit_cnt = iff->_fcnt * exit_prob;
199 return exit_cnt;
200 }
201 }
202 }
203 }
204 if (n->is_Jump()) {
205 JumpNode *jmp = n->as_Jump();
206 if (jmp->_fcnt != COUNT_UNKNOWN) {
207 float* probs = jmp->_probs;
208 float exit_prob = 0;
209 PhaseIdealLoop *phase = _phase;
210 for (DUIterator_Fast imax, i = jmp->fast_outs(imax); i < imax; i++) {
211 JumpProjNode* u = jmp->fast_out(i)->as_JumpProj();
212 if (!is_member(_phase->get_loop(u))) {
213 exit_prob += probs[u->_con];
214 }
215 }
216 return exit_prob * jmp->_fcnt;
217 }
218 }
219 return 0;
220 }
221
222 void IdealLoopTree::compute_profile_trip_cnt(PhaseIdealLoop *phase) {
223 if (!_head->is_Loop()) {
224 return;
225 }
226 LoopNode* head = _head->as_Loop();
227 if (head->profile_trip_cnt() != COUNT_UNKNOWN) {
228 return; // Already computed
229 }
230 float trip_cnt = (float)max_jint; // default is big
231
232 Node* back = head->in(LoopNode::LoopBackControl);
233 while (back != head) {
234 if ((back->Opcode() == Op_IfTrue || back->Opcode() == Op_IfFalse) &&
235 back->in(0) &&
236 back->in(0)->is_If() &&
237 back->in(0)->as_If()->_fcnt != COUNT_UNKNOWN &&
238 back->in(0)->as_If()->_prob != PROB_UNKNOWN &&
239 (back->Opcode() == Op_IfTrue ? 1-back->in(0)->as_If()->_prob : back->in(0)->as_If()->_prob) > PROB_MIN) {
240 break;
241 }
242 back = phase->idom(back);
243 }
244 if (back != head) {
245 assert((back->Opcode() == Op_IfTrue || back->Opcode() == Op_IfFalse) &&
246 back->in(0), "if-projection exists");
247 IfNode* back_if = back->in(0)->as_If();
248 float loop_back_cnt = back_if->_fcnt * (back->Opcode() == Op_IfTrue ? back_if->_prob : (1 - back_if->_prob));
249
250 // Now compute a loop exit count
251 float loop_exit_cnt = 0.0f;
252 if (_child == nullptr) {
253 for (uint i = 0; i < _body.size(); i++) {
254 Node *n = _body[i];
255 loop_exit_cnt += compute_profile_trip_cnt_helper(n);
256 }
257 } else {
258 ResourceMark rm;
259 Unique_Node_List wq;
260 wq.push(back);
261 for (uint i = 0; i < wq.size(); i++) {
262 Node *n = wq.at(i);
263 assert(n->is_CFG(), "only control nodes");
264 if (n != head) {
265 if (n->is_Region()) {
266 for (uint j = 1; j < n->req(); j++) {
267 wq.push(n->in(j));
268 }
269 } else {
270 loop_exit_cnt += compute_profile_trip_cnt_helper(n);
271 wq.push(n->in(0));
272 }
273 }
274 }
275
276 }
277 if (loop_exit_cnt > 0.0f) {
278 trip_cnt = (loop_back_cnt + loop_exit_cnt) / loop_exit_cnt;
279 } else {
280 // No exit count so use
281 trip_cnt = loop_back_cnt;
282 }
283 } else {
284 head->mark_profile_trip_failed();
285 }
286 #ifndef PRODUCT
287 if (TraceProfileTripCount) {
288 tty->print_cr("compute_profile_trip_cnt lp: %d cnt: %f\n", head->_idx, trip_cnt);
289 }
290 #endif
291 head->set_profile_trip_cnt(trip_cnt);
292 }
293
294 // Return nonzero index of invariant operand for an associative
295 // binary operation of (nonconstant) invariant and variant values.
296 // Helper for reassociate_invariants.
297 int IdealLoopTree::find_invariant(Node* n, PhaseIdealLoop* phase) {
298 bool in1_invar = this->is_invariant(n->in(1));
299 bool in2_invar = this->is_invariant(n->in(2));
300 if (in1_invar && !in2_invar) return 1;
301 if (!in1_invar && in2_invar) return 2;
302 return 0;
303 }
304
305 // Return TRUE if "n" is an associative cmp node. A cmp node is
306 // associative if it is only used for equals or not-equals
307 // comparisons of integers or longs. We cannot reassociate
308 // non-equality comparisons due to possibility of overflow.
309 bool IdealLoopTree::is_associative_cmp(Node* n) {
310 if (n->Opcode() != Op_CmpI && n->Opcode() != Op_CmpL) {
311 return false;
312 }
313 for (DUIterator i = n->outs(); n->has_out(i); i++) {
314 BoolNode* bool_out = n->out(i)->isa_Bool();
315 if (bool_out == nullptr || !(bool_out->_test._test == BoolTest::eq ||
316 bool_out->_test._test == BoolTest::ne)) {
317 return false;
318 }
319 }
320 return true;
321 }
322
323 // Return TRUE if "n" is an associative binary node. If "base" is
324 // not null, "n" must be re-associative with it.
325 bool IdealLoopTree::is_associative(Node* n, Node* base) {
326 int op = n->Opcode();
327 if (base != nullptr) {
328 assert(is_associative(base), "Base node should be associative");
329 int base_op = base->Opcode();
330 if (base_op == Op_AddI || base_op == Op_SubI || base_op == Op_CmpI) {
331 return op == Op_AddI || op == Op_SubI;
332 }
333 if (base_op == Op_AddL || base_op == Op_SubL || base_op == Op_CmpL) {
334 return op == Op_AddL || op == Op_SubL;
335 }
336 return op == base_op;
337 } else {
338 // Integer "add/sub/mul/and/or/xor" operations are associative. Integer
339 // "cmp" operations are associative if it is an equality comparison.
340 return op == Op_AddI || op == Op_AddL
341 || op == Op_SubI || op == Op_SubL
342 || op == Op_MulI || op == Op_MulL
343 || op == Op_AndI || op == Op_AndL
344 || op == Op_OrI || op == Op_OrL
345 || op == Op_XorI || op == Op_XorL
346 || is_associative_cmp(n);
347 }
348 }
349
350 // Reassociate invariant add and subtract expressions:
351 //
352 // inv1 + (x + inv2) => ( inv1 + inv2) + x
353 // (x + inv2) + inv1 => ( inv1 + inv2) + x
354 // inv1 + (x - inv2) => ( inv1 - inv2) + x
355 // inv1 - (inv2 - x) => ( inv1 - inv2) + x
356 // (x + inv2) - inv1 => (-inv1 + inv2) + x
357 // (x - inv2) + inv1 => ( inv1 - inv2) + x
358 // (x - inv2) - inv1 => (-inv1 - inv2) + x
359 // inv1 + (inv2 - x) => ( inv1 + inv2) - x
360 // inv1 - (x - inv2) => ( inv1 + inv2) - x
361 // (inv2 - x) + inv1 => ( inv1 + inv2) - x
362 // (inv2 - x) - inv1 => (-inv1 + inv2) - x
363 // inv1 - (x + inv2) => ( inv1 - inv2) - x
364 //
365 // Apply the same transformations to == and !=
366 // inv1 == (x + inv2) => ( inv1 - inv2 ) == x
367 // inv1 == (x - inv2) => ( inv1 + inv2 ) == x
368 // inv1 == (inv2 - x) => (-inv1 + inv2 ) == x
369 Node* IdealLoopTree::reassociate_add_sub_cmp(Node* n1, int inv1_idx, int inv2_idx, PhaseIdealLoop* phase) {
370 Node* n2 = n1->in(3 - inv1_idx);
371 bool n1_is_sub = n1->is_Sub() && !n1->is_Cmp();
372 bool n1_is_cmp = n1->is_Cmp();
373 bool n2_is_sub = n2->is_Sub();
374 assert(n1->is_Add() || n1_is_sub || n1_is_cmp, "Target node should be add, subtract, or compare");
375 assert(n2->is_Add() || (n2_is_sub && !n2->is_Cmp()), "Child node should be add or subtract");
376 Node* inv1 = n1->in(inv1_idx);
377 Node* inv2 = n2->in(inv2_idx);
378 Node* x = n2->in(3 - inv2_idx);
379
380 // Determine whether x, inv1, or inv2 should be negative in the transformed
381 // expression
382 bool neg_x = n2_is_sub && inv2_idx == 1;
383 bool neg_inv2 = (n2_is_sub && !n1_is_cmp && inv2_idx == 2) || (n1_is_cmp && !n2_is_sub);
384 bool neg_inv1 = (n1_is_sub && inv1_idx == 2) || (n1_is_cmp && inv2_idx == 1 && n2_is_sub);
385 if (n1_is_sub && inv1_idx == 1) {
386 neg_x = !neg_x;
387 neg_inv2 = !neg_inv2;
388 }
389
390 bool is_int = n2->bottom_type()->isa_int() != nullptr;
391 Node* inv1_c = phase->get_ctrl(inv1);
392 Node* n_inv1;
393 if (neg_inv1) {
394 if (is_int) {
395 n_inv1 = new SubINode(phase->intcon(0), inv1);
396 } else {
397 n_inv1 = new SubLNode(phase->longcon(0L), inv1);
398 }
399 phase->register_new_node(n_inv1, inv1_c);
400 } else {
401 n_inv1 = inv1;
402 }
403
404 Node* inv;
405 if (is_int) {
406 if (neg_inv2) {
407 inv = new SubINode(n_inv1, inv2);
408 } else {
409 inv = new AddINode(n_inv1, inv2);
410 }
411 phase->register_new_node(inv, phase->get_early_ctrl(inv));
412 if (n1_is_cmp) {
413 return new CmpINode(x, inv);
414 }
415 if (neg_x) {
416 return new SubINode(inv, x);
417 } else {
418 return new AddINode(x, inv);
419 }
420 } else {
421 if (neg_inv2) {
422 inv = new SubLNode(n_inv1, inv2);
423 } else {
424 inv = new AddLNode(n_inv1, inv2);
425 }
426 phase->register_new_node(inv, phase->get_early_ctrl(inv));
427 if (n1_is_cmp) {
428 return new CmpLNode(x, inv);
429 }
430 if (neg_x) {
431 return new SubLNode(inv, x);
432 } else {
433 return new AddLNode(x, inv);
434 }
435 }
436 }
437
438 // Reassociate invariant binary expressions with add/sub/mul/
439 // and/or/xor/cmp operators.
440 // For add/sub/cmp expressions: see "reassociate_add_sub_cmp"
441 //
442 // For mul/and/or/xor expressions:
443 //
444 // inv1 op (x op inv2) => (inv1 op inv2) op x
445 //
446 Node* IdealLoopTree::reassociate(Node* n1, PhaseIdealLoop *phase) {
447 if (!is_associative(n1) || n1->outcnt() == 0) return nullptr;
448 if (is_invariant(n1)) return nullptr;
449 // Don't mess with add of constant (igvn moves them to expression tree root.)
450 if (n1->is_Add() && n1->in(2)->is_Con()) return nullptr;
451
452 int inv1_idx = find_invariant(n1, phase);
453 if (!inv1_idx) return nullptr;
454 Node* n2 = n1->in(3 - inv1_idx);
455 if (!is_associative(n2, n1)) return nullptr;
456 int inv2_idx = find_invariant(n2, phase);
457 if (!inv2_idx) return nullptr;
458
459 if (!phase->may_require_nodes(10, 10)) return nullptr;
460
461 Node* result = nullptr;
462 switch (n1->Opcode()) {
463 case Op_AddI:
464 case Op_AddL:
465 case Op_SubI:
466 case Op_SubL:
467 case Op_CmpI:
468 case Op_CmpL:
469 result = reassociate_add_sub_cmp(n1, inv1_idx, inv2_idx, phase);
470 break;
471 case Op_MulI:
472 case Op_MulL:
473 case Op_AndI:
474 case Op_AndL:
475 case Op_OrI:
476 case Op_OrL:
477 case Op_XorI:
478 case Op_XorL: {
479 Node* inv1 = n1->in(inv1_idx);
480 Node* inv2 = n2->in(inv2_idx);
481 Node* x = n2->in(3 - inv2_idx);
482 Node* inv = n2->clone_with_data_edge(inv1, inv2);
483 phase->register_new_node(inv, phase->get_early_ctrl(inv));
484 result = n1->clone_with_data_edge(x, inv);
485 break;
486 }
487 default:
488 ShouldNotReachHere();
489 }
490
491 assert(result != nullptr, "");
492 phase->register_new_node_with_ctrl_of(result, n1);
493 phase->_igvn.replace_node(n1, result);
494 assert(phase->get_loop(phase->get_ctrl(n1)) == this, "");
495 _body.yank(n1);
496 return result;
497 }
498
499 //---------------------reassociate_invariants-----------------------------
500 // Reassociate invariant expressions:
501 void IdealLoopTree::reassociate_invariants(PhaseIdealLoop *phase) {
502 for (int i = _body.size() - 1; i >= 0; i--) {
503 Node *n = _body.at(i);
504 for (int j = 0; j < 5; j++) {
505 Node* nn = reassociate(n, phase);
506 if (nn == nullptr) break;
507 n = nn; // again
508 }
509 }
510 }
511
512 //------------------------------policy_peeling---------------------------------
513 // Return TRUE if the loop should be peeled, otherwise return FALSE. Peeling
514 // is applicable if we can make a loop-invariant test (usually a null-check)
515 // execute before we enter the loop. When TRUE, the estimated node budget is
516 // also requested.
517 bool IdealLoopTree::policy_peeling(PhaseIdealLoop *phase) {
518 uint estimate = estimate_peeling(phase);
519
520 return estimate == 0 ? false : phase->may_require_nodes(estimate);
521 }
522
523 // Perform actual policy and size estimate for the loop peeling transform, and
524 // return the estimated loop size if peeling is applicable, otherwise return
525 // zero. No node budget is allocated.
526 uint IdealLoopTree::estimate_peeling(PhaseIdealLoop *phase) {
527 if (LoopPeeling != 1) {
528 return 0;
529 }
530
531 // If nodes are depleted, some transform has miscalculated its needs.
532 assert(!phase->exceeding_node_budget(), "sanity");
533
534 // Peeling does loop cloning which can result in O(N^2) node construction.
535 if (_body.size() > 255 && !StressLoopPeeling) {
536 return 0; // Suppress too large body size.
537 }
538 // Optimistic estimate that approximates loop body complexity via data and
539 // control flow fan-out (instead of using the more pessimistic: BodySize^2).
540 uint estimate = est_loop_clone_sz(2);
541
542 if (phase->exceeding_node_budget(estimate)) {
543 return 0; // Too large to safely clone.
544 }
545
546 // Check for vectorized loops, any peeling done was already applied.
547 if (_head->is_CountedLoop()) {
548 CountedLoopNode* cl = _head->as_CountedLoop();
549 if (cl->is_unroll_only() || cl->trip_count() == 1) {
550 // Peeling is not legal here (cf. assert in do_peeling), we don't even stress peel!
551 return 0;
552 }
553 }
554
555 #ifndef PRODUCT
556 // It is now safe to peel or not.
557 if (StressLoopPeeling) {
558 LoopNode* loop_head = _head->as_Loop();
559 static constexpr uint max_peeling_opportunities = 5;
560 if (loop_head->_stress_peeling_attempts < max_peeling_opportunities) {
561 loop_head->_stress_peeling_attempts++;
562 // In case of stress, let's just pick randomly...
563 return ((phase->C->random() % 2) == 0) ? estimate : 0;
564 }
565 return 0;
566 }
567 // ...otherwise, let's apply our heuristic.
568 #endif
569
570 Node* test = tail();
571
572 while (test != _head) { // Scan till run off top of loop
573 if (test->is_If()) { // Test?
574 Node *ctrl = phase->get_ctrl(test->in(1));
575 if (ctrl->is_top()) {
576 return 0; // Found dead test on live IF? No peeling!
577 }
578 // Standard IF only has one input value to check for loop invariance.
579 assert(test->Opcode() == Op_If ||
580 test->Opcode() == Op_CountedLoopEnd ||
581 test->Opcode() == Op_LongCountedLoopEnd ||
582 test->Opcode() == Op_RangeCheck ||
583 test->Opcode() == Op_ParsePredicate,
584 "Check this code when new subtype is added");
585 // Condition is not a member of this loop?
586 if (!is_member(phase->get_loop(ctrl)) && is_loop_exit(test)) {
587 return estimate; // Found reason to peel!
588 }
589 }
590 // Walk up dominators to loop _head looking for test which is executed on
591 // every path through the loop.
592 test = phase->idom(test);
593 }
594 return 0;
595 }
596
597 //------------------------------peeled_dom_test_elim---------------------------
598 // If we got the effect of peeling, either by actually peeling or by making
599 // a pre-loop which must execute at least once, we can remove all
600 // loop-invariant dominated tests in the main body.
601 void PhaseIdealLoop::peeled_dom_test_elim(IdealLoopTree* loop, Node_List& old_new) {
602 bool progress = true;
603 while (progress) {
604 progress = false; // Reset for next iteration
605 Node* prev = loop->_head->in(LoopNode::LoopBackControl); // loop->tail();
606 Node* test = prev->in(0);
607 while (test != loop->_head) { // Scan till run off top of loop
608 int p_op = prev->Opcode();
609 assert(test != nullptr, "test cannot be null");
610 Node* test_cond = nullptr;
611 if ((p_op == Op_IfFalse || p_op == Op_IfTrue) && test->is_If()) {
612 test_cond = test->in(1);
613 }
614 if (test_cond != nullptr && // Test?
615 !test_cond->is_Con() && // And not already obvious?
616 // And condition is not a member of this loop?
617 !ctrl_is_member(loop, test_cond)) {
618 // Walk loop body looking for instances of this test
619 for (uint i = 0; i < loop->_body.size(); i++) {
620 Node* n = loop->_body.at(i);
621 // Check against cached test condition because dominated_by()
622 // replaces the test condition with a constant.
623 if (n->is_If() && n->in(1) == test_cond) {
624 // IfNode was dominated by version in peeled loop body
625 progress = true;
626 dominated_by(old_new[prev->_idx]->as_IfProj(), n->as_If());
627 }
628 }
629 }
630 prev = test;
631 test = idom(test);
632 } // End of scan tests in loop
633 } // End of while (progress)
634 }
635
636 //------------------------------do_peeling-------------------------------------
637 // Peel the first iteration of the given loop.
638 // Step 1: Clone the loop body. The clone becomes the peeled iteration.
639 // The pre-loop illegally has 2 control users (old & new loops).
640 // Step 2: Make the old-loop fall-in edges point to the peeled iteration.
641 // Do this by making the old-loop fall-in edges act as if they came
642 // around the loopback from the prior iteration (follow the old-loop
643 // backedges) and then map to the new peeled iteration. This leaves
644 // the pre-loop with only 1 user (the new peeled iteration), but the
645 // peeled-loop backedge has 2 users.
646 // Step 3: Cut the backedge on the clone (so its not a loop) and remove the
647 // extra backedge user.
648 //
649 // orig
650 //
651 // stmt1
652 // |
653 // v
654 // predicates
655 // |
656 // v
657 // loop<----+
658 // | |
659 // stmt2 |
660 // | |
661 // v |
662 // if ^
663 // / \ |
664 // / \ |
665 // v v |
666 // false true |
667 // / \ |
668 // / ----+
669 // |
670 // v
671 // exit
672 //
673 //
674 // after clone loop
675 //
676 // stmt1
677 // |
678 // v
679 // predicates
680 // / \
681 // clone / \ orig
682 // / \
683 // / \
684 // v v
685 // +---->loop clone loop<----+
686 // | | | |
687 // | stmt2 clone stmt2 |
688 // | | | |
689 // | v v |
690 // ^ if clone If ^
691 // | / \ / \ |
692 // | / \ / \ |
693 // | v v v v |
694 // | true false false true |
695 // | / \ / \ |
696 // +---- \ / ----+
697 // \ /
698 // 1v v2
699 // region
700 // |
701 // v
702 // exit
703 //
704 //
705 // after peel and predicate move
706 //
707 // stmt1
708 // |
709 // v
710 // predicates
711 // /
712 // /
713 // clone / orig
714 // /
715 // / +----------+
716 // / | |
717 // / | |
718 // / | |
719 // v v |
720 // TOP-->loop clone loop<----+ |
721 // | | | |
722 // stmt2 clone stmt2 | |
723 // | | | ^
724 // v v | |
725 // if clone If ^ |
726 // / \ / \ | |
727 // / \ / \ | |
728 // v v v v | |
729 // true false false true | |
730 // | \ / \ | |
731 // | \ / ----+ ^
732 // | \ / |
733 // | 1v v2 |
734 // v region |
735 // | | |
736 // | v |
737 // | exit |
738 // | |
739 // +--------------->-----------------+
740 //
741 //
742 // final graph
743 //
744 // stmt1
745 // |
746 // v
747 // predicates
748 // |
749 // v
750 // stmt2 clone
751 // |
752 // v
753 // if clone
754 // / |
755 // / |
756 // v v
757 // false true
758 // | |
759 // | v
760 // | Initialized Assertion Predicates
761 // | |
762 // | v
763 // | loop<----+
764 // | | |
765 // | stmt2 |
766 // | | |
767 // | v |
768 // v if ^
769 // | / \ |
770 // | / \ |
771 // | v v |
772 // | false true |
773 // | | \ |
774 // v v --+
775 // region
776 // |
777 // v
778 // exit
779 //
780 void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) {
781 assert(LoopPeeling != 0, "do_peeling called with loop peeling always disabled");
782
783 C->set_major_progress();
784 // Peeling a 'main' loop in a pre/main/post situation obfuscates the
785 // 'pre' loop from the main and the 'pre' can no longer have its
786 // iterations adjusted. Therefore, we need to declare this loop as
787 // no longer a 'main' loop; it will need new pre and post loops before
788 // we can do further RCE.
789 #ifndef PRODUCT
790 if (TraceLoopOpts) {
791 tty->print("Peel ");
792 loop->dump_head();
793 }
794 #endif
795 LoopNode* head = loop->_head->as_Loop();
796
797 C->print_method(PHASE_BEFORE_LOOP_PEELING, 4, head);
798
799 bool counted_loop = head->is_CountedLoop();
800 if (counted_loop) {
801 CountedLoopNode *cl = head->as_CountedLoop();
802 assert(cl->trip_count() > 0, "peeling a fully unrolled loop");
803 cl->set_trip_count(cl->trip_count() - 1);
804 if (cl->is_main_loop()) {
805 cl->set_normal_loop();
806 if (cl->is_multiversion()) {
807 // Peeling also destroys the connection of the main loop
808 // to the multiversion_if.
809 cl->set_no_multiversion();
810 }
811 #ifndef PRODUCT
812 if (TraceLoopOpts) {
813 tty->print("Peeling a 'main' loop; resetting to 'normal' ");
814 }
815 #endif
816 }
817 }
818
819 // Step 1: Clone the loop body. The clone becomes the peeled iteration.
820 // The pre-loop illegally has 2 control users (old & new loops).
821 const uint first_node_index_in_post_loop_body = Compile::current()->unique();
822 LoopNode* outer_loop_head = head->skip_strip_mined();
823 clone_loop(loop, old_new, dom_depth(outer_loop_head), ControlAroundStripMined);
824
825 // Step 2: Make the old-loop fall-in edges point to the peeled iteration.
826 // Do this by making the old-loop fall-in edges act as if they came
827 // around the loopback from the prior iteration (follow the old-loop
828 // backedges) and then map to the new peeled iteration. This leaves
829 // the pre-loop with only 1 user (the new peeled iteration), but the
830 // peeled-loop backedge has 2 users.
831 Node* new_entry = old_new[head->in(LoopNode::LoopBackControl)->_idx];
832 _igvn.hash_delete(outer_loop_head);
833 outer_loop_head->set_req(LoopNode::EntryControl, new_entry);
834 for (DUIterator_Fast jmax, j = head->fast_outs(jmax); j < jmax; j++) {
835 Node* old = head->fast_out(j);
836 if (old->in(0) == loop->_head && old->req() == 3 && old->is_Phi()) {
837 Node* new_exit_value = old_new[old->in(LoopNode::LoopBackControl)->_idx];
838 if (!new_exit_value) // Backedge value is ALSO loop invariant?
839 // Then loop body backedge value remains the same.
840 new_exit_value = old->in(LoopNode::LoopBackControl);
841 _igvn.hash_delete(old);
842 old->set_req(LoopNode::EntryControl, new_exit_value);
843 }
844 }
845
846
847 // Step 3: Cut the backedge on the clone (so its not a loop) and remove the
848 // extra backedge user.
849 Node* new_head = old_new[head->_idx];
850 _igvn.hash_delete(new_head);
851 new_head->set_req(LoopNode::LoopBackControl, C->top());
852 for (DUIterator_Fast j2max, j2 = new_head->fast_outs(j2max); j2 < j2max; j2++) {
853 Node* use = new_head->fast_out(j2);
854 if (use->in(0) == new_head && use->req() == 3 && use->is_Phi()) {
855 _igvn.hash_delete(use);
856 use->set_req(LoopNode::LoopBackControl, C->top());
857 }
858 }
859
860 // Step 4: Correct dom-depth info. Set to loop-head depth.
861
862 int dd_outer_loop_head = dom_depth(outer_loop_head);
863 set_idom(outer_loop_head, outer_loop_head->in(LoopNode::EntryControl), dd_outer_loop_head);
864 for (uint j3 = 0; j3 < loop->_body.size(); j3++) {
865 Node *old = loop->_body.at(j3);
866 Node *nnn = old_new[old->_idx];
867 if (!has_ctrl(nnn)) {
868 set_idom(nnn, idom(nnn), dd_outer_loop_head-1);
869 }
870 }
871
872 // Step 5: Assertion Predicates initialization
873 if (counted_loop) {
874 CountedLoopNode* cl = head->as_CountedLoop();
875 Node* init = cl->init_trip();
876 Node* init_ctrl = cl->skip_strip_mined()->in(LoopNode::EntryControl);
877 initialize_assertion_predicates_for_peeled_loop(new_head->as_CountedLoop(), cl,
878 first_node_index_in_post_loop_body, old_new);
879 cast_incr_before_loop(init, init_ctrl, cl);
880 }
881
882 // Now force out all loop-invariant dominating tests. The optimizer
883 // finds some, but we _know_ they are all useless.
884 peeled_dom_test_elim(loop,old_new);
885
886 loop->record_for_igvn();
887
888 C->print_method(PHASE_AFTER_LOOP_PEELING, 4, new_head);
889 }
890
891 //------------------------------policy_maximally_unroll------------------------
892 // Calculate the exact loop trip-count and return TRUE if loop can be fully,
893 // i.e. maximally, unrolled, otherwise return FALSE. When TRUE, the estimated
894 // node budget is also requested.
895 bool IdealLoopTree::policy_maximally_unroll(PhaseIdealLoop* phase) const {
896 CountedLoopNode* cl = _head->as_CountedLoop();
897 assert(cl->is_normal_loop(), "");
898 if (!cl->is_valid_counted_loop(T_INT)) {
899 return false; // Malformed counted loop.
900 }
901 if (!cl->has_exact_trip_count()) {
902 return false; // Trip count is not exact.
903 }
904
905 uint trip_count = cl->trip_count();
906 // Note, max_juint is used to indicate unknown trip count.
907 assert(trip_count > 1, "one-iteration loop should be optimized out already");
908 assert(trip_count < max_juint, "exact trip_count should be less than max_juint.");
909
910 // If nodes are depleted, some transform has miscalculated its needs.
911 assert(!phase->exceeding_node_budget(), "sanity");
912
913 // Allow the unrolled body to get larger than the standard loop size limit.
914 uint unroll_limit = (uint)LoopUnrollLimit * 4;
915 assert((intx)unroll_limit == LoopUnrollLimit * 4, "LoopUnrollLimit must fit in 32bits");
916 if (trip_count > unroll_limit || _body.size() > unroll_limit) {
917 return false;
918 }
919
920 uint new_body_size = est_loop_unroll_sz(trip_count);
921
922 if (new_body_size == UINT_MAX) { // Check for bad estimate (overflow).
923 return false;
924 }
925
926 // Fully unroll a loop with few iterations, regardless of other conditions,
927 // since the following (general) loop optimizations will split such loop in
928 // any case (into pre-main-post).
929 if (trip_count <= 3) {
930 return phase->may_require_nodes(new_body_size);
931 }
932
933 // Reject if unrolling will result in too much node construction.
934 if (new_body_size > unroll_limit || phase->exceeding_node_budget(new_body_size)) {
935 return false;
936 }
937
938 // Do not unroll a loop with String intrinsics code.
939 // String intrinsics are large and have loops.
940 for (uint k = 0; k < _body.size(); k++) {
941 Node* n = _body.at(k);
942 switch (n->Opcode()) {
943 case Op_StrComp:
944 case Op_StrEquals:
945 case Op_VectorizedHashCode:
946 case Op_StrIndexOf:
947 case Op_StrIndexOfChar:
948 case Op_EncodeISOArray:
949 case Op_AryEq:
950 case Op_CountPositives: {
951 return false;
952 }
953 } // switch
954 }
955
956 return phase->may_require_nodes(new_body_size);
957 }
958
959
960 //------------------------------policy_unroll----------------------------------
961 // Return TRUE or FALSE if the loop should be unrolled or not. Apply unroll if
962 // the loop is a counted loop and the loop body is small enough. When TRUE,
963 // the estimated node budget is also requested.
964 bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
965
966 CountedLoopNode *cl = _head->as_CountedLoop();
967 assert(cl->is_normal_loop() || cl->is_main_loop(), "");
968
969 if (!cl->is_valid_counted_loop(T_INT)) {
970 return false; // Malformed counted loop
971 }
972
973 // If nodes are depleted, some transform has miscalculated its needs.
974 assert(!phase->exceeding_node_budget(), "sanity");
975
976 // Protect against over-unrolling.
977 // After split at least one iteration will be executed in pre-loop.
978 if (cl->trip_count() <= (cl->is_normal_loop() ? 2u : 1u)) {
979 return false;
980 }
981 _local_loop_unroll_limit = LoopUnrollLimit;
982 _local_loop_unroll_factor = 4;
983 int future_unroll_cnt = cl->unrolled_count() * 2;
984 if (!cl->is_vectorized_loop()) {
985 if (future_unroll_cnt > LoopMaxUnroll) return false;
986 } else {
987 // obey user constraints on vector mapped loops with additional unrolling applied
988 int unroll_constraint = (cl->slp_max_unroll()) ? cl->slp_max_unroll() : 1;
989 if ((future_unroll_cnt / unroll_constraint) > LoopMaxUnroll) return false;
990 }
991
992 const int stride_con = cl->stride_con();
993
994 // Check for initial stride being a small enough constant
995 const int initial_stride_sz = MAX2(1<<2, Matcher::max_vector_size(T_BYTE) / 2);
996 // Maximum stride size should protect against overflow, when doubling stride unroll_count times
997 const int max_stride_size = MIN2<int>(max_jint / 2 - 2, initial_stride_sz * future_unroll_cnt);
998 // No abs() use; abs(min_jint) = min_jint
999 if (stride_con < -max_stride_size || stride_con > max_stride_size) return false;
1000
1001 // Don't unroll if the next round of unrolling would push us
1002 // over the expected trip count of the loop. One is subtracted
1003 // from the expected trip count because the pre-loop normally
1004 // executes 1 iteration.
1005 if (UnrollLimitForProfileCheck > 0 &&
1006 cl->profile_trip_cnt() != COUNT_UNKNOWN &&
1007 future_unroll_cnt > UnrollLimitForProfileCheck &&
1008 (float)future_unroll_cnt > cl->profile_trip_cnt() - 1.0) {
1009 return false;
1010 }
1011
1012 bool should_unroll = true;
1013
1014 // When unroll count is greater than LoopUnrollMin, don't unroll if:
1015 // the residual iterations are more than 10% of the trip count
1016 // and rounds of "unroll,optimize" are not making significant progress
1017 // Progress defined as current size less than 20% larger than previous size.
1018 if (phase->C->do_superword() &&
1019 cl->node_count_before_unroll() > 0 &&
1020 future_unroll_cnt > LoopUnrollMin &&
1021 is_residual_iters_large(future_unroll_cnt, cl) &&
1022 1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
1023 if ((cl->slp_max_unroll() == 0) && !is_residual_iters_large(cl->unrolled_count(), cl)) {
1024 // cl->slp_max_unroll() = 0 means that the previous slp analysis never passed.
1025 // slp analysis may fail due to the loop IR is too complicated especially during the early stage
1026 // of loop unrolling analysis. But after several rounds of loop unrolling and other optimizations,
1027 // it's possible that the loop IR becomes simple enough to pass the slp analysis.
1028 // So we don't return immediately in hoping that the next slp analysis can succeed.
1029 should_unroll = false;
1030 future_unroll_cnt = cl->unrolled_count();
1031 } else {
1032 return false;
1033 }
1034 }
1035
1036 Node *init_n = cl->init_trip();
1037 Node *limit_n = cl->limit();
1038 if (limit_n == nullptr) return false; // We will dereference it below.
1039
1040 // Non-constant bounds.
1041 // Protect against over-unrolling when init or/and limit are not constant
1042 // (so that trip_count's init value is maxint) but iv range is known.
1043 if (init_n == nullptr || !init_n->is_Con() || !limit_n->is_Con()) {
1044 Node* phi = cl->phi();
1045 if (phi != nullptr) {
1046 assert(phi->is_Phi() && phi->in(0) == _head, "Counted loop should have iv phi.");
1047 const TypeInt* iv_type = phase->_igvn.type(phi)->is_int();
1048 int next_stride = stride_con * 2; // stride after this unroll
1049 if (next_stride > 0) {
1050 if (iv_type->_lo > max_jint - next_stride || // overflow
1051 iv_type->_lo + next_stride > iv_type->_hi) {
1052 return false; // over-unrolling
1053 }
1054 } else if (next_stride < 0) {
1055 if (iv_type->_hi < min_jint - next_stride || // overflow
1056 iv_type->_hi + next_stride < iv_type->_lo) {
1057 return false; // over-unrolling
1058 }
1059 }
1060 }
1061 }
1062
1063 // After unroll limit will be adjusted: new_limit = limit-stride.
1064 // Bailout if adjustment overflow.
1065 const TypeInt* limit_type = phase->_igvn.type(limit_n)->is_int();
1066 if ((stride_con > 0 && ((min_jint + stride_con) > limit_type->_hi)) ||
1067 (stride_con < 0 && ((max_jint + stride_con) < limit_type->_lo)))
1068 return false; // overflow
1069
1070 // Rudimentary cost model to estimate loop unrolling
1071 // factor.
1072 // Adjust body_size to determine if we unroll or not
1073 uint body_size = _body.size();
1074 // Key test to unroll loop in CRC32 java code
1075 int xors_in_loop = 0;
1076 // Also count ModL, DivL, MulL, and other nodes that expand mightly
1077 for (uint k = 0; k < _body.size(); k++) {
1078 Node* n = _body.at(k);
1079 if (MemNode::barrier_data(n) != 0) {
1080 body_size += BarrierSet::barrier_set()->barrier_set_c2()->estimated_barrier_size(n);
1081 }
1082 switch (n->Opcode()) {
1083 case Op_XorI: xors_in_loop++; break; // CRC32 java code
1084 case Op_ModL: body_size += 30; break;
1085 case Op_DivL: body_size += 30; break;
1086 case Op_MulL: body_size += 10; break;
1087 case Op_RoundF:
1088 case Op_RoundD: {
1089 body_size += Matcher::scalar_op_pre_select_sz_estimate(n->Opcode(), n->bottom_type()->basic_type());
1090 } break;
1091 case Op_CountTrailingZerosV:
1092 case Op_CountLeadingZerosV:
1093 case Op_LoadVectorGather:
1094 case Op_LoadVectorGatherMasked:
1095 case Op_ReverseV:
1096 case Op_RoundVF:
1097 case Op_RoundVD:
1098 case Op_VectorCastD2X:
1099 case Op_VectorCastF2X:
1100 case Op_PopCountVI:
1101 case Op_PopCountVL: {
1102 const TypeVect* vt = n->bottom_type()->is_vect();
1103 body_size += Matcher::vector_op_pre_select_sz_estimate(n->Opcode(), vt->element_basic_type(), vt->length());
1104 } break;
1105 case Op_StrComp:
1106 case Op_StrEquals:
1107 case Op_StrIndexOf:
1108 case Op_StrIndexOfChar:
1109 case Op_EncodeISOArray:
1110 case Op_AryEq:
1111 case Op_VectorizedHashCode:
1112 case Op_CountPositives: {
1113 // Do not unroll a loop with String intrinsics code.
1114 // String intrinsics are large and have loops.
1115 return false;
1116 }
1117 } // switch
1118 }
1119
1120 if (phase->C->do_superword()) {
1121 // Only attempt slp analysis when user controls do not prohibit it
1122 if (!range_checks_present() && (LoopMaxUnroll > _local_loop_unroll_factor)) {
1123 // Once policy_slp_analysis succeeds, mark the loop with the
1124 // maximal unroll factor so that we minimize analysis passes
1125 if (future_unroll_cnt >= _local_loop_unroll_factor) {
1126 policy_unroll_slp_analysis(cl, phase, future_unroll_cnt);
1127 }
1128 }
1129 }
1130
1131 int slp_max_unroll_factor = cl->slp_max_unroll();
1132 if ((LoopMaxUnroll < slp_max_unroll_factor) && FLAG_IS_DEFAULT(LoopMaxUnroll) && UseSubwordForMaxVector) {
1133 LoopMaxUnroll = slp_max_unroll_factor;
1134 }
1135
1136 uint estimate = est_loop_clone_sz(2);
1137
1138 if (cl->has_passed_slp()) {
1139 if (slp_max_unroll_factor >= future_unroll_cnt) {
1140 return should_unroll && phase->may_require_nodes(estimate);
1141 }
1142 return false; // Loop too big.
1143 }
1144
1145 // Check for being too big
1146 if (body_size > (uint)_local_loop_unroll_limit) {
1147 if ((cl->is_subword_loop() || xors_in_loop >= 4) && body_size < 4u * LoopUnrollLimit) {
1148 return should_unroll && phase->may_require_nodes(estimate);
1149 }
1150 return false; // Loop too big.
1151 }
1152
1153 if (cl->is_unroll_only()) {
1154 if (TraceSuperWordLoopUnrollAnalysis) {
1155 tty->print_cr("policy_unroll passed vector loop(vlen=%d, factor=%d)\n",
1156 slp_max_unroll_factor, future_unroll_cnt);
1157 }
1158 }
1159
1160 // Unroll once! (Each trip will soon do double iterations)
1161 return should_unroll && phase->may_require_nodes(estimate);
1162 }
1163
1164 void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_cnt) {
1165
1166 // If nodes are depleted, some transform has miscalculated its needs.
1167 assert(!phase->exceeding_node_budget(), "sanity");
1168
1169 // Enable this functionality target by target as needed
1170 if (SuperWordLoopUnrollAnalysis) {
1171 if (!cl->was_slp_analyzed()) {
1172 Compile::TracePhase tp(Phase::_t_autoVectorize);
1173
1174 VLoop vloop(this, true);
1175 if (vloop.check_preconditions()) {
1176 SuperWord::unrolling_analysis(vloop, _local_loop_unroll_factor);
1177 }
1178 }
1179
1180 if (cl->has_passed_slp()) {
1181 int slp_max_unroll_factor = cl->slp_max_unroll();
1182 if (slp_max_unroll_factor >= future_unroll_cnt) {
1183 int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
1184 if (new_limit > LoopUnrollLimit) {
1185 if (TraceSuperWordLoopUnrollAnalysis) {
1186 tty->print_cr("slp analysis unroll=%d, default limit=%d\n", new_limit, _local_loop_unroll_limit);
1187 }
1188 _local_loop_unroll_limit = new_limit;
1189 }
1190 }
1191 }
1192 }
1193 }
1194
1195
1196 //------------------------------policy_range_check-----------------------------
1197 // Return TRUE or FALSE if the loop should be range-check-eliminated or not.
1198 // When TRUE, the estimated node budget is also requested.
1199 //
1200 // We will actually perform iteration-splitting, a more powerful form of RCE.
1201 bool IdealLoopTree::policy_range_check(PhaseIdealLoop* phase, bool provisional, BasicType bt) const {
1202 if (!provisional && !RangeCheckElimination) return false;
1203
1204 // If nodes are depleted, some transform has miscalculated its needs.
1205 assert(provisional || !phase->exceeding_node_budget(), "sanity");
1206
1207 if (_head->is_CountedLoop()) {
1208 CountedLoopNode *cl = _head->as_CountedLoop();
1209 // If we unrolled with no intention of doing RCE and we later changed our
1210 // minds, we got no pre-loop. Either we need to make a new pre-loop, or we
1211 // have to disallow RCE.
1212 if (cl->is_main_no_pre_loop()) return false; // Disallowed for now.
1213
1214 // check for vectorized loops, some opts are no longer needed
1215 // RCE needs pre/main/post loops. Don't apply it on a single iteration loop.
1216 if (cl->is_unroll_only() || (cl->is_normal_loop() && cl->trip_count() == 1)) return false;
1217 } else {
1218 assert(provisional, "no long counted loop expected");
1219 }
1220
1221 BaseCountedLoopNode* cl = _head->as_BaseCountedLoop();
1222 Node *trip_counter = cl->phi();
1223 assert(!cl->is_LongCountedLoop() || bt == T_LONG, "only long range checks in long counted loops");
1224 assert(cl->is_valid_counted_loop(cl->bt()), "only for well formed loops");
1225
1226 // Check loop body for tests of trip-counter plus loop-invariant vs
1227 // loop-invariant.
1228 for (uint i = 0; i < _body.size(); i++) {
1229 Node *iff = _body[i];
1230 if (iff->Opcode() == Op_If ||
1231 iff->Opcode() == Op_RangeCheck) { // Test?
1232
1233 // Comparing trip+off vs limit
1234 Node* bol = iff->in(1);
1235 if (bol->req() != 2) {
1236 // Could be a dead constant test or another dead variant (e.g. a Phi with 2 inputs created with split_thru_phi).
1237 // Either way, skip this test.
1238 continue;
1239 }
1240 if (!bol->is_Bool()) {
1241 assert(bol->is_OpaqueConstantBool() ||
1242 bol->is_OpaqueTemplateAssertionPredicate() ||
1243 bol->is_OpaqueInitializedAssertionPredicate() ||
1244 bol->is_OpaqueMultiversioning(),
1245 "Opaque node of a non-null-check or an Assertion Predicate or Multiversioning");
1246 continue;
1247 }
1248 if (bol->as_Bool()->_test._test == BoolTest::ne) {
1249 continue; // not RC
1250 }
1251 Node *cmp = bol->in(1);
1252
1253 if (provisional) {
1254 // Try to pattern match with either cmp inputs, do not check
1255 // whether one of the inputs is loop independent as it may not
1256 // have had a chance to be hoisted yet.
1257 if (!phase->is_scaled_iv_plus_offset(cmp->in(1), trip_counter, bt, nullptr, nullptr) &&
1258 !phase->is_scaled_iv_plus_offset(cmp->in(2), trip_counter, bt, nullptr, nullptr)) {
1259 continue;
1260 }
1261 } else {
1262 Node *rc_exp = cmp->in(1);
1263 Node *limit = cmp->in(2);
1264 Node *limit_c = phase->get_ctrl(limit);
1265 if (limit_c == phase->C->top()) {
1266 return false; // Found dead test on live IF? No RCE!
1267 }
1268 if (is_member(phase->get_loop(limit_c))) {
1269 // Compare might have operands swapped; commute them
1270 rc_exp = cmp->in(2);
1271 limit = cmp->in(1);
1272 limit_c = phase->get_ctrl(limit);
1273 if (is_member(phase->get_loop(limit_c))) {
1274 continue; // Both inputs are loop varying; cannot RCE
1275 }
1276 }
1277
1278 if (!phase->is_scaled_iv_plus_offset(rc_exp, trip_counter, bt, nullptr, nullptr)) {
1279 continue;
1280 }
1281 }
1282 // Found a test like 'trip+off vs limit'. Test is an IfNode, has two (2)
1283 // projections. If BOTH are in the loop we need loop unswitching instead
1284 // of iteration splitting.
1285 if (is_loop_exit(iff)) {
1286 // Found valid reason to split iterations (if there is room).
1287 // NOTE: Usually a gross overestimate.
1288 // Long range checks cause the loop to be transformed in a loop nest which only causes a fixed number of nodes
1289 // to be added
1290 return provisional || bt == T_LONG || phase->may_require_nodes(est_loop_clone_sz(2));
1291 }
1292 } // End of is IF
1293 }
1294
1295 return false;
1296 }
1297
1298 //------------------------------policy_peel_only-------------------------------
1299 // Return TRUE or FALSE if the loop should NEVER be RCE'd or aligned. Useful
1300 // for unrolling loops with NO array accesses.
1301 bool IdealLoopTree::policy_peel_only(PhaseIdealLoop *phase) const {
1302
1303 // If nodes are depleted, some transform has miscalculated its needs.
1304 assert(!phase->exceeding_node_budget(), "sanity");
1305
1306 // check for vectorized loops, any peeling done was already applied
1307 if (_head->is_CountedLoop() && _head->as_CountedLoop()->is_unroll_only()) {
1308 return false;
1309 }
1310
1311 for (uint i = 0; i < _body.size(); i++) {
1312 if (_body[i]->is_Mem()) {
1313 return false;
1314 }
1315 }
1316 // No memory accesses at all!
1317 return true;
1318 }
1319
1320 //------------------------------clone_up_backedge_goo--------------------------
1321 // If Node n lives in the back_ctrl block and cannot float, we clone a private
1322 // version of n in preheader_ctrl block and return that, otherwise return n.
1323 Node *PhaseIdealLoop::clone_up_backedge_goo(Node *back_ctrl, Node *preheader_ctrl, Node *n, VectorSet &visited, Node_Stack &clones) {
1324 if (get_ctrl(n) != back_ctrl) return n;
1325
1326 // Only visit once
1327 if (visited.test_set(n->_idx)) {
1328 Node *x = clones.find(n->_idx);
1329 return (x != nullptr) ? x : n;
1330 }
1331
1332 Node *x = nullptr; // If required, a clone of 'n'
1333 // Check for 'n' being pinned in the backedge.
1334 if (n->in(0) && n->in(0) == back_ctrl) {
1335 assert(clones.find(n->_idx) == nullptr, "dead loop");
1336 x = n->clone(); // Clone a copy of 'n' to preheader
1337 clones.push(x, n->_idx);
1338 x->set_req(0, preheader_ctrl); // Fix x's control input to preheader
1339 }
1340
1341 // Recursive fixup any other input edges into x.
1342 // If there are no changes we can just return 'n', otherwise
1343 // we need to clone a private copy and change it.
1344 for (uint i = 1; i < n->req(); i++) {
1345 Node *g = clone_up_backedge_goo(back_ctrl, preheader_ctrl, n->in(i), visited, clones);
1346 if (g != n->in(i)) {
1347 if (!x) {
1348 assert(clones.find(n->_idx) == nullptr, "dead loop");
1349 x = n->clone();
1350 clones.push(x, n->_idx);
1351 }
1352 x->set_req(i, g);
1353 }
1354 }
1355 if (x) { // x can legally float to pre-header location
1356 register_new_node(x, preheader_ctrl);
1357 return x;
1358 } else { // raise n to cover LCA of uses
1359 set_ctrl(n, find_non_split_ctrl(back_ctrl->in(0)));
1360 }
1361 return n;
1362 }
1363
1364 // When a counted loop is created, the loop phi type may be narrowed down. As a consequence, the control input of some
1365 // nodes may be cleared: in particular in the case of a division by the loop iv, the Div node would lose its control
1366 // dependency if the loop phi is never zero. After pre/main/post loops are created (and possibly unrolling), the
1367 // loop phi type is only correct if the loop is indeed reachable: there's an implicit dependency between the loop phi
1368 // type and the zero trip guard for the main or post loop and as a consequence a dependency between the Div node and the
1369 // zero trip guard. This makes the dependency explicit by adding a CastII for the loop entry input of the loop phi. If
1370 // the backedge of the main or post loop is removed, a Div node won't be able to float above the zero trip guard of the
1371 // loop and can't execute even if the loop is not reached.
1372 void PhaseIdealLoop::cast_incr_before_loop(Node* incr, Node* ctrl, CountedLoopNode* loop) {
1373 Node* castii = new CastIINode(ctrl, incr, TypeInt::INT, ConstraintCastNode::DependencyType::NonFloatingNonNarrowing);
1374 register_new_node(castii, ctrl);
1375 Node* phi = loop->phi();
1376 assert(phi->in(LoopNode::EntryControl) == incr, "replacing wrong input?");
1377 _igvn.replace_input_of(phi, LoopNode::EntryControl, castii);
1378 }
1379
1380 #ifdef ASSERT
1381 void PhaseIdealLoop::ensure_zero_trip_guard_proj(Node* node, bool is_main_loop) {
1382 assert(node->is_IfProj(), "must be the zero trip guard If node");
1383 Node* zer_bol = node->in(0)->in(1);
1384 assert(zer_bol != nullptr && zer_bol->is_Bool(), "must be Bool");
1385 Node* zer_cmp = zer_bol->in(1);
1386 assert(zer_cmp != nullptr && zer_cmp->Opcode() == Op_CmpI, "must be CmpI");
1387 // For the main loop, the opaque node is the second input to zer_cmp, for the post loop it's the first input node
1388 Node* zer_opaq = zer_cmp->in(is_main_loop ? 2 : 1);
1389 assert(zer_opaq != nullptr && zer_opaq->Opcode() == Op_OpaqueZeroTripGuard, "must be OpaqueZeroTripGuard");
1390 }
1391 #endif
1392
1393 //------------------------------insert_pre_post_loops--------------------------
1394 // Insert pre and post loops. If peel_only is set, the pre-loop can not have
1395 // more iterations added. It acts as a 'peel' only, no lower-bound RCE, no
1396 // alignment. Useful to unroll loops that do no array accesses.
1397 void PhaseIdealLoop::insert_pre_post_loops(IdealLoopTree *loop, Node_List &old_new, bool peel_only) {
1398
1399 #ifndef PRODUCT
1400 if (TraceLoopOpts) {
1401 if (peel_only)
1402 tty->print("PeelMainPost ");
1403 else
1404 tty->print("PreMainPost ");
1405 loop->dump_head();
1406 }
1407 #endif
1408 C->set_major_progress();
1409
1410 // Find common pieces of the loop being guarded with pre & post loops
1411 CountedLoopNode *main_head = loop->_head->as_CountedLoop();
1412 assert(main_head->is_normal_loop(), "");
1413 CountedLoopEndNode *main_end = main_head->loopexit();
1414 assert(main_end->outcnt() == 2, "1 true, 1 false path only");
1415
1416 C->print_method(PHASE_BEFORE_PRE_MAIN_POST, 4, main_head);
1417
1418 Node *init = main_head->init_trip();
1419 Node *incr = main_end ->incr();
1420 Node *limit = main_end ->limit();
1421 Node *stride = main_end ->stride();
1422 Node *cmp = main_end ->cmp_node();
1423 BoolTest::mask b_test = main_end->test_trip();
1424
1425 // Need only 1 user of 'bol' because I will be hacking the loop bounds.
1426 Node *bol = main_end->in(CountedLoopEndNode::TestValue);
1427 if (bol->outcnt() != 1) {
1428 bol = bol->clone();
1429 register_new_node(bol,main_end->in(CountedLoopEndNode::TestControl));
1430 _igvn.replace_input_of(main_end, CountedLoopEndNode::TestValue, bol);
1431 }
1432 // Need only 1 user of 'cmp' because I will be hacking the loop bounds.
1433 if (cmp->outcnt() != 1) {
1434 cmp = cmp->clone();
1435 register_new_node(cmp,main_end->in(CountedLoopEndNode::TestControl));
1436 _igvn.replace_input_of(bol, 1, cmp);
1437 }
1438
1439 // Add the post loop
1440 CountedLoopNode *post_head = nullptr;
1441 Node* post_incr = incr;
1442 Node* main_exit = insert_post_loop(loop, old_new, main_head, main_end, post_incr, limit, post_head);
1443 C->print_method(PHASE_AFTER_POST_LOOP, 4, post_head);
1444
1445 //------------------------------
1446 // Step B: Create Pre-Loop.
1447
1448 // Step B1: Clone the loop body. The clone becomes the pre-loop. The main
1449 // loop pre-header illegally has 2 control users (old & new loops).
1450 LoopNode* outer_main_head = main_head;
1451 IdealLoopTree* outer_loop = loop;
1452 if (main_head->is_strip_mined()) {
1453 main_head->verify_strip_mined(1);
1454 outer_main_head = main_head->outer_loop();
1455 outer_loop = loop->_parent;
1456 assert(outer_loop->_head == outer_main_head, "broken loop tree");
1457 }
1458
1459 const uint first_node_index_in_pre_loop_body = Compile::current()->unique();
1460 uint dd_main_head = dom_depth(outer_main_head);
1461 clone_loop(loop, old_new, dd_main_head, ControlAroundStripMined);
1462 CountedLoopNode* pre_head = old_new[main_head->_idx]->as_CountedLoop();
1463 CountedLoopEndNode* pre_end = old_new[main_end ->_idx]->as_CountedLoopEnd();
1464 pre_head->set_pre_loop(main_head);
1465 Node *pre_incr = old_new[incr->_idx];
1466
1467 // Reduce the pre-loop trip count.
1468 pre_end->_prob = PROB_FAIR;
1469
1470 // Find the pre-loop normal exit.
1471 IfFalseNode* pre_exit = pre_end->false_proj();
1472 IfFalseNode* new_pre_exit = new IfFalseNode(pre_end);
1473 _igvn.register_new_node_with_optimizer(new_pre_exit);
1474 set_idom(new_pre_exit, pre_end, dd_main_head);
1475 set_loop(new_pre_exit, outer_loop->_parent);
1476
1477 // Step B2: Build a zero-trip guard for the main-loop. After leaving the
1478 // pre-loop, the main-loop may not execute at all. Later in life this
1479 // zero-trip guard will become the minimum-trip guard when we unroll
1480 // the main-loop.
1481 Node *min_opaq = new OpaqueZeroTripGuardNode(C, limit, b_test);
1482 Node *min_cmp = new CmpINode(pre_incr, min_opaq);
1483 Node *min_bol = new BoolNode(min_cmp, b_test);
1484 register_new_node(min_opaq, new_pre_exit);
1485 register_new_node(min_cmp , new_pre_exit);
1486 register_new_node(min_bol , new_pre_exit);
1487
1488 // Build the IfNode (assume the main-loop is executed always).
1489 IfNode *min_iff = new IfNode(new_pre_exit, min_bol, PROB_ALWAYS, COUNT_UNKNOWN);
1490 _igvn.register_new_node_with_optimizer(min_iff);
1491 set_idom(min_iff, new_pre_exit, dd_main_head);
1492 set_loop(min_iff, outer_loop->_parent);
1493
1494 // Plug in the false-path, taken if we need to skip main-loop
1495 _igvn.hash_delete(pre_exit);
1496 pre_exit->set_req(0, min_iff);
1497 set_idom(pre_exit, min_iff, dd_main_head);
1498 set_idom(pre_exit->unique_ctrl_out(), min_iff, dd_main_head);
1499 // Make the true-path, must enter the main loop
1500 Node *min_taken = new IfTrueNode(min_iff);
1501 _igvn.register_new_node_with_optimizer(min_taken);
1502 set_idom(min_taken, min_iff, dd_main_head);
1503 set_loop(min_taken, outer_loop->_parent);
1504 // Plug in the true path
1505 _igvn.hash_delete(outer_main_head);
1506 outer_main_head->set_req(LoopNode::EntryControl, min_taken);
1507 set_idom(outer_main_head, min_taken, dd_main_head);
1508 assert(post_head->in(1)->is_IfProj(), "must be zero-trip guard If node projection of the post loop");
1509
1510 VectorSet visited;
1511 Node_Stack clones(main_head->back_control()->outcnt());
1512 // Step B3: Make the fall-in values to the main-loop come from the
1513 // fall-out values of the pre-loop.
1514 const uint last_node_index_in_pre_loop_body = Compile::current()->unique() - 1;
1515 for (DUIterator i2 = main_head->outs(); main_head->has_out(i2); i2++) {
1516 Node* main_phi = main_head->out(i2);
1517 if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() > 0) {
1518 Node* pre_phi = old_new[main_phi->_idx];
1519 Node* fallpre = clone_up_backedge_goo(pre_head->back_control(),
1520 main_head->skip_strip_mined()->in(LoopNode::EntryControl),
1521 pre_phi->in(LoopNode::LoopBackControl),
1522 visited, clones);
1523 _igvn.hash_delete(main_phi);
1524 main_phi->set_req(LoopNode::EntryControl, fallpre);
1525 }
1526 }
1527 DEBUG_ONLY(const uint last_node_index_from_backedge_goo = Compile::current()->unique() - 1);
1528
1529 DEBUG_ONLY(ensure_zero_trip_guard_proj(outer_main_head->in(LoopNode::EntryControl), true);)
1530 initialize_assertion_predicates_for_main_loop(pre_head, main_head, first_node_index_in_pre_loop_body,
1531 last_node_index_in_pre_loop_body,
1532 DEBUG_ONLY(last_node_index_from_backedge_goo COMMA) old_new);
1533 // CastII for the main loop:
1534 cast_incr_before_loop(pre_incr, min_taken, main_head);
1535
1536 // Step B4: Shorten the pre-loop to run only 1 iteration (for now).
1537 // RCE and alignment may change this later.
1538 Node *cmp_end = pre_end->cmp_node();
1539 assert(cmp_end->in(2) == limit, "");
1540 Node *pre_limit = new AddINode(init, stride);
1541
1542 // Save the original loop limit in this Opaque1 node for
1543 // use by range check elimination.
1544 Node *pre_opaq = new Opaque1Node(C, pre_limit, limit);
1545
1546 register_new_node(pre_limit, pre_head->in(LoopNode::EntryControl));
1547 register_new_node(pre_opaq , pre_head->in(LoopNode::EntryControl));
1548
1549 // Since no other users of pre-loop compare, I can hack limit directly
1550 assert(cmp_end->outcnt() == 1, "no other users");
1551 _igvn.hash_delete(cmp_end);
1552 cmp_end->set_req(2, peel_only ? pre_limit : pre_opaq);
1553
1554 // Special case for not-equal loop bounds:
1555 // Change pre loop test, main loop test, and the
1556 // main loop guard test to use lt or gt depending on stride
1557 // direction:
1558 // positive stride use <
1559 // negative stride use >
1560 //
1561 // not-equal test is kept for post loop to handle case
1562 // when init > limit when stride > 0 (and reverse).
1563
1564 if (pre_end->in(CountedLoopEndNode::TestValue)->as_Bool()->_test._test == BoolTest::ne) {
1565
1566 BoolTest::mask new_test = (main_end->stride_con() > 0) ? BoolTest::lt : BoolTest::gt;
1567 // Modify pre loop end condition
1568 Node* pre_bol = pre_end->in(CountedLoopEndNode::TestValue)->as_Bool();
1569 BoolNode* new_bol0 = new BoolNode(pre_bol->in(1), new_test);
1570 register_new_node(new_bol0, pre_head->in(0));
1571 _igvn.replace_input_of(pre_end, CountedLoopEndNode::TestValue, new_bol0);
1572 // Modify main loop guard condition
1573 assert(min_iff->in(CountedLoopEndNode::TestValue) == min_bol, "guard okay");
1574 BoolNode* new_bol1 = new BoolNode(min_bol->in(1), new_test);
1575 register_new_node(new_bol1, new_pre_exit);
1576 _igvn.hash_delete(min_iff);
1577 min_iff->set_req(CountedLoopEndNode::TestValue, new_bol1);
1578 // Modify main loop end condition
1579 BoolNode* main_bol = main_end->in(CountedLoopEndNode::TestValue)->as_Bool();
1580 BoolNode* new_bol2 = new BoolNode(main_bol->in(1), new_test);
1581 register_new_node(new_bol2, main_end->in(CountedLoopEndNode::TestControl));
1582 _igvn.replace_input_of(main_end, CountedLoopEndNode::TestValue, new_bol2);
1583 }
1584
1585 // Flag main loop
1586 main_head->set_main_loop();
1587 if (peel_only) {
1588 main_head->set_main_no_pre_loop();
1589 }
1590
1591 // Subtract a trip count for the pre-loop.
1592 main_head->set_trip_count(main_head->trip_count() - 1);
1593
1594 // It's difficult to be precise about the trip-counts
1595 // for the pre/post loops. They are usually very short,
1596 // so guess that 4 trips is a reasonable value.
1597 post_head->set_profile_trip_cnt(4.0);
1598 pre_head->set_profile_trip_cnt(4.0);
1599
1600 // Now force out all loop-invariant dominating tests. The optimizer
1601 // finds some, but we _know_ they are all useless.
1602 peeled_dom_test_elim(loop,old_new);
1603 loop->record_for_igvn();
1604
1605 C->print_method(PHASE_AFTER_PRE_MAIN_POST, 4, main_head);
1606 }
1607
1608 //------------------------------insert_vector_post_loop------------------------
1609 // Insert a copy of the atomic unrolled vectorized main loop as a post loop,
1610 // unroll_policy has already informed us that more unrolling is about to
1611 // happen to the main loop. The resultant post loop will serve as a
1612 // vectorized drain loop.
1613 void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new) {
1614 if (!loop->_head->is_CountedLoop()) return;
1615
1616 CountedLoopNode *cl = loop->_head->as_CountedLoop();
1617
1618 // only process vectorized main loops
1619 if (!cl->is_vectorized_loop() || !cl->is_main_loop()) return;
1620
1621 int slp_max_unroll_factor = cl->slp_max_unroll();
1622 int cur_unroll = cl->unrolled_count();
1623
1624 if (slp_max_unroll_factor == 0) return;
1625
1626 // only process atomic unroll vector loops (not super unrolled after vectorization)
1627 if (cur_unroll != slp_max_unroll_factor) return;
1628
1629 // we only ever process this one time
1630 if (cl->has_atomic_post_loop()) return;
1631
1632 if (!may_require_nodes(loop->est_loop_clone_sz(2))) {
1633 return;
1634 }
1635
1636 #ifndef PRODUCT
1637 if (TraceLoopOpts) {
1638 tty->print("PostVector ");
1639 loop->dump_head();
1640 }
1641 #endif
1642 C->set_major_progress();
1643
1644 // Find common pieces of the loop being guarded with pre & post loops
1645 CountedLoopNode *main_head = loop->_head->as_CountedLoop();
1646 CountedLoopEndNode *main_end = main_head->loopexit();
1647 // diagnostic to show loop end is not properly formed
1648 assert(main_end->outcnt() == 2, "1 true, 1 false path only");
1649
1650 // mark this loop as processed
1651 main_head->mark_has_atomic_post_loop();
1652
1653 Node *incr = main_end->incr();
1654 Node *limit = main_end->limit();
1655
1656 // In this case we throw away the result as we are not using it to connect anything else.
1657 C->print_method(PHASE_BEFORE_POST_LOOP, 4, main_head);
1658 CountedLoopNode *post_head = nullptr;
1659 insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
1660 C->print_method(PHASE_AFTER_POST_LOOP, 4, post_head);
1661
1662 // It's difficult to be precise about the trip-counts
1663 // for post loops. They are usually very short,
1664 // so guess that unit vector trips is a reasonable value.
1665 post_head->set_profile_trip_cnt(cur_unroll);
1666
1667 // Now force out all loop-invariant dominating tests. The optimizer
1668 // finds some, but we _know_ they are all useless.
1669 peeled_dom_test_elim(loop, old_new);
1670 loop->record_for_igvn();
1671 }
1672
1673 Node* PhaseIdealLoop::find_last_store_in_outer_loop(Node* store, const IdealLoopTree* outer_loop) {
1674 assert(store != nullptr && store->is_Store(), "starting point should be a store node");
1675 // Follow the memory uses until we get out of the loop.
1676 // Store nodes in the outer loop body were moved by PhaseIdealLoop::try_move_store_after_loop.
1677 // Because of the conditions in try_move_store_after_loop (no other usage in the loop body
1678 // except for the phi node associated with the loop head), we have the guarantee of a
1679 // linear memory subgraph within the outer loop body.
1680 Node* last = store;
1681 Node* unique_next = store;
1682 do {
1683 last = unique_next;
1684 for (DUIterator_Fast imax, l = last->fast_outs(imax); l < imax; l++) {
1685 Node* use = last->fast_out(l);
1686 if (use->is_Store() && use->in(MemNode::Memory) == last) {
1687 if (ctrl_is_member(outer_loop, use)) {
1688 assert(unique_next == last, "memory node should only have one usage in the loop body");
1689 unique_next = use;
1690 }
1691 }
1692 }
1693 } while (last != unique_next);
1694 return last;
1695 }
1696
1697 //------------------------------insert_post_loop-------------------------------
1698 // Insert post loops. Add a post loop to the given loop passed.
1699 Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree* loop, Node_List& old_new,
1700 CountedLoopNode* main_head, CountedLoopEndNode* main_end,
1701 Node* incr, Node* limit, CountedLoopNode*& post_head) {
1702 IfNode* outer_main_end = main_end;
1703 IdealLoopTree* outer_loop = loop;
1704 if (main_head->is_strip_mined()) {
1705 main_head->verify_strip_mined(1);
1706 outer_main_end = main_head->outer_loop_end();
1707 outer_loop = loop->_parent;
1708 assert(outer_loop->_head == main_head->in(LoopNode::EntryControl), "broken loop tree");
1709 }
1710
1711 //------------------------------
1712 // Step A: Create a new post-Loop.
1713 IfFalseNode* main_exit = outer_main_end->false_proj();
1714 int dd_main_exit = dom_depth(main_exit);
1715
1716 // Step A1: Clone the loop body of main. The clone becomes the post-loop.
1717 // The main loop pre-header illegally has 2 control users (old & new loops).
1718 const uint first_node_index_in_cloned_loop_body = C->unique();
1719 clone_loop(loop, old_new, dd_main_exit, ControlAroundStripMined);
1720 assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
1721 post_head = old_new[main_head->_idx]->as_CountedLoop();
1722 post_head->set_normal_loop();
1723 post_head->set_post_loop(main_head);
1724
1725 // clone_loop() above changes the exit projection
1726 main_exit = outer_main_end->false_proj();
1727
1728 // Reduce the post-loop trip count.
1729 CountedLoopEndNode* post_end = old_new[main_end->_idx]->as_CountedLoopEnd();
1730 post_end->_prob = PROB_FAIR;
1731
1732 // Build the main-loop normal exit.
1733 IfFalseNode *new_main_exit = new IfFalseNode(outer_main_end);
1734 _igvn.register_new_node_with_optimizer(new_main_exit);
1735 set_idom(new_main_exit, outer_main_end, dd_main_exit);
1736 set_loop(new_main_exit, outer_loop->_parent);
1737
1738 // Step A2: Build a zero-trip guard for the post-loop. After leaving the
1739 // main-loop, the post-loop may not execute at all. We 'opaque' the incr
1740 // (the previous loop trip-counter exit value) because we will be changing
1741 // the exit value (via additional unrolling) so we cannot constant-fold away the zero
1742 // trip guard until all unrolling is done.
1743 Node *zer_opaq = new OpaqueZeroTripGuardNode(C, incr, main_end->test_trip());
1744 Node *zer_cmp = new CmpINode(zer_opaq, limit);
1745 Node *zer_bol = new BoolNode(zer_cmp, main_end->test_trip());
1746 register_new_node(zer_opaq, new_main_exit);
1747 register_new_node(zer_cmp, new_main_exit);
1748 register_new_node(zer_bol, new_main_exit);
1749
1750 // Build the IfNode
1751 IfNode *zer_iff = new IfNode(new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN);
1752 _igvn.register_new_node_with_optimizer(zer_iff);
1753 set_idom(zer_iff, new_main_exit, dd_main_exit);
1754 set_loop(zer_iff, outer_loop->_parent);
1755
1756 // Plug in the false-path, taken if we need to skip this post-loop
1757 _igvn.replace_input_of(main_exit, 0, zer_iff);
1758 set_idom(main_exit, zer_iff, dd_main_exit);
1759 set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
1760 // Make the true-path, must enter this post loop
1761 Node *zer_taken = new IfTrueNode(zer_iff);
1762 _igvn.register_new_node_with_optimizer(zer_taken);
1763 set_idom(zer_taken, zer_iff, dd_main_exit);
1764 set_loop(zer_taken, outer_loop->_parent);
1765 // Plug in the true path
1766 _igvn.hash_delete(post_head);
1767 post_head->set_req(LoopNode::EntryControl, zer_taken);
1768 set_idom(post_head, zer_taken, dd_main_exit);
1769
1770 VectorSet visited;
1771 Node_Stack clones(main_head->back_control()->outcnt());
1772 // Step A3: Make the fall-in values to the post-loop come from the
1773 // fall-out values of the main-loop.
1774 for (DUIterator i = main_head->outs(); main_head->has_out(i); i++) {
1775 Node* main_phi = main_head->out(i);
1776 if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() > 0) {
1777 Node* cur_phi = old_new[main_phi->_idx];
1778 Node* fallnew = clone_up_backedge_goo(main_head->back_control(),
1779 post_head->init_control(),
1780 main_phi->in(LoopNode::LoopBackControl),
1781 visited, clones);
1782 _igvn.hash_delete(cur_phi);
1783 cur_phi->set_req(LoopNode::EntryControl, fallnew);
1784 }
1785 }
1786 // Store nodes that were moved to the outer loop by PhaseIdealLoop::try_move_store_after_loop
1787 // do not have an associated Phi node. Such nodes are attached to the false projection of the CountedLoopEnd node,
1788 // right after the execution of the inner CountedLoop.
1789 // We have to make sure that such stores in the post loop have the right memory inputs from the main loop
1790 // The moved store node is always attached right after the inner loop exit, and just before the safepoint
1791 const IfFalseNode* if_false = main_end->false_proj();
1792 for (DUIterator j = if_false->outs(); if_false->has_out(j); j++) {
1793 Node* store = if_false->out(j);
1794 if (store->is_Store()) {
1795 // We only make changes if the memory input of the store is outside the outer loop body,
1796 // as this is when we would normally expect a Phi as input. If the memory input
1797 // is in the loop body as well, then we can safely assume it is still correct as the entire
1798 // body was cloned as a unit
1799 if (!ctrl_is_member(outer_loop, store->in(MemNode::Memory))) {
1800 Node* mem_out = find_last_store_in_outer_loop(store, outer_loop);
1801 Node* store_new = old_new[store->_idx];
1802 store_new->set_req(MemNode::Memory, mem_out);
1803 }
1804 }
1805 }
1806
1807 DEBUG_ONLY(ensure_zero_trip_guard_proj(post_head->in(LoopNode::EntryControl), false);)
1808 initialize_assertion_predicates_for_post_loop(main_head, post_head, first_node_index_in_cloned_loop_body);
1809 cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
1810 return new_main_exit;
1811 }
1812
1813 //------------------------------is_invariant-----------------------------
1814 // Return true if n is invariant
1815 bool IdealLoopTree::is_invariant(Node* n) const {
1816 Node *n_c = _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n;
1817 if (n_c->is_top()) return false;
1818 return !is_member(_phase->get_loop(n_c));
1819 }
1820
1821 // Search the Assertion Predicates added by loop predication and/or range check elimination and update them according
1822 // to the new stride.
1823 void PhaseIdealLoop::update_main_loop_assertion_predicates(CountedLoopNode* new_main_loop_head,
1824 const int stride_con_before_unroll) {
1825 // Compute the value of the loop induction variable at the end of the
1826 // first iteration of the unrolled loop: init + new_stride_con - init_inc
1827 int unrolled_stride_con = stride_con_before_unroll * 2;
1828 Node* unrolled_stride = intcon(unrolled_stride_con);
1829
1830 Node* loop_entry = new_main_loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
1831 PredicateIterator predicate_iterator(loop_entry);
1832 UpdateStrideForAssertionPredicates update_stride_for_assertion_predicates(unrolled_stride, new_main_loop_head, this);
1833 predicate_iterator.for_each(update_stride_for_assertion_predicates);
1834 }
1835
1836 // Source Loop: Cloned - peeled_loop_head
1837 // Target Loop: Original - remaining_loop_head
1838 void PhaseIdealLoop::initialize_assertion_predicates_for_peeled_loop(CountedLoopNode* peeled_loop_head,
1839 CountedLoopNode* remaining_loop_head,
1840 const uint first_node_index_in_cloned_loop_body,
1841 const Node_List& old_new) {
1842 const NodeInOriginalLoopBody node_in_original_loop_body(first_node_index_in_cloned_loop_body, old_new);
1843 create_assertion_predicates_at_loop(peeled_loop_head, remaining_loop_head, node_in_original_loop_body, true);
1844 }
1845
1846 // Source Loop: Cloned - pre_loop_head
1847 // Target Loop: Original - main_loop_head
1848 void PhaseIdealLoop::initialize_assertion_predicates_for_main_loop(CountedLoopNode* pre_loop_head,
1849 CountedLoopNode* main_loop_head,
1850 const uint first_node_index_in_pre_loop_body,
1851 const uint last_node_index_in_pre_loop_body,
1852 DEBUG_ONLY(const uint last_node_index_from_backedge_goo COMMA)
1853 const Node_List& old_new) {
1854 assert(first_node_index_in_pre_loop_body < last_node_index_in_pre_loop_body, "cloned some nodes");
1855 const NodeInMainLoopBody node_in_main_loop_body(first_node_index_in_pre_loop_body,
1856 last_node_index_in_pre_loop_body,
1857 DEBUG_ONLY(last_node_index_from_backedge_goo COMMA) old_new);
1858 create_assertion_predicates_at_main_or_post_loop(pre_loop_head, main_loop_head, node_in_main_loop_body, true);
1859 }
1860
1861 // Source Loop: Original - main_loop_head
1862 // Target Loop: Cloned - post_loop_head
1863 //
1864 // The post loop is cloned before the pre loop. Do not kill the old Template Assertion Predicates, yet. We need to clone
1865 // from them when creating the pre loop. Only then we can kill them.
1866 void PhaseIdealLoop::initialize_assertion_predicates_for_post_loop(CountedLoopNode* main_loop_head,
1867 CountedLoopNode* post_loop_head,
1868 const uint first_node_index_in_cloned_loop_body) {
1869 const NodeInClonedLoopBody node_in_cloned_loop_body(first_node_index_in_cloned_loop_body);
1870 create_assertion_predicates_at_main_or_post_loop(main_loop_head, post_loop_head, node_in_cloned_loop_body, false);
1871 }
1872
1873 void PhaseIdealLoop::create_assertion_predicates_at_loop(CountedLoopNode* source_loop_head,
1874 CountedLoopNode* target_loop_head,
1875 const NodeInLoopBody& _node_in_loop_body,
1876 const bool kill_old_template) {
1877 CreateAssertionPredicatesVisitor create_assertion_predicates_visitor(target_loop_head, this, _node_in_loop_body,
1878 kill_old_template);
1879 Node* source_loop_entry = source_loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
1880 PredicateIterator predicate_iterator(source_loop_entry);
1881 predicate_iterator.for_each(create_assertion_predicates_visitor);
1882 }
1883
1884 void PhaseIdealLoop::create_assertion_predicates_at_main_or_post_loop(CountedLoopNode* source_loop_head,
1885 CountedLoopNode* target_loop_head,
1886 const NodeInLoopBody& _node_in_loop_body,
1887 const bool kill_old_template) {
1888 Node* old_target_loop_head_entry = target_loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
1889 const uint node_index_before_new_assertion_predicate_nodes = C->unique();
1890 const bool need_to_rewire_old_target_loop_entry_dependencies = old_target_loop_head_entry->outcnt() > 1;
1891 create_assertion_predicates_at_loop(source_loop_head, target_loop_head, _node_in_loop_body, kill_old_template);
1892 if (need_to_rewire_old_target_loop_entry_dependencies) {
1893 rewire_old_target_loop_entry_dependency_to_new_entry(target_loop_head, old_target_loop_head_entry,
1894 node_index_before_new_assertion_predicate_nodes);
1895 }
1896 }
1897
1898 // Rewire any control dependent nodes on the old target loop entry before adding Assertion Predicate related nodes.
1899 // These have been added by PhaseIdealLoop::clone_up_backedge_goo() and assume to be ending up at the target loop entry
1900 // which is no longer the case when adding additional Assertion Predicates. Fix this by rewiring these nodes to the new
1901 // target loop entry which corresponds to the tail of the last Assertion Predicate before the target loop. This is safe
1902 // to do because these control dependent nodes on the old target loop entry created by clone_up_backedge_goo() were
1903 // pinned on the loop backedge before. The Assertion Predicates are not control dependent on these nodes in any way.
1904 void PhaseIdealLoop::rewire_old_target_loop_entry_dependency_to_new_entry(
1905 CountedLoopNode* target_loop_head, const Node* old_target_loop_entry,
1906 const uint node_index_before_new_assertion_predicate_nodes) {
1907 Node* new_main_loop_entry = target_loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
1908 if (new_main_loop_entry == old_target_loop_entry) {
1909 // No Assertion Predicates added.
1910 return;
1911 }
1912
1913 for (DUIterator_Fast imax, i = old_target_loop_entry->fast_outs(imax); i < imax; i++) {
1914 Node* out = old_target_loop_entry->fast_out(i);
1915 if (!out->is_CFG() && out->_idx < node_index_before_new_assertion_predicate_nodes) {
1916 assert(out != target_loop_head->init_trip(), "CastII on loop entry?");
1917 _igvn.replace_input_of(out, 0, new_main_loop_entry);
1918 set_ctrl(out, new_main_loop_entry);
1919 --i;
1920 --imax;
1921 }
1922 }
1923 }
1924
1925 //------------------------------do_unroll--------------------------------------
1926 // Unroll the loop body one step - make each trip do 2 iterations.
1927 void PhaseIdealLoop::do_unroll(IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip) {
1928 assert(LoopUnrollLimit, "");
1929 CountedLoopNode *loop_head = loop->_head->as_CountedLoop();
1930 CountedLoopEndNode *loop_end = loop_head->loopexit();
1931
1932 C->print_method(PHASE_BEFORE_LOOP_UNROLLING, 4, loop_head);
1933
1934 #ifndef PRODUCT
1935 if (TraceLoopOpts) {
1936 if (loop_head->trip_count() < (uint)LoopUnrollLimit) {
1937 tty->print("Unroll %d(" JULONG_FORMAT_W(2) ") ", loop_head->unrolled_count()*2, loop_head->trip_count());
1938 } else {
1939 tty->print("Unroll %d ", loop_head->unrolled_count()*2);
1940 }
1941 loop->dump_head();
1942 }
1943
1944 if (C->do_vector_loop() && (PrintOpto && (VerifyLoopOptimizations || TraceLoopOpts))) {
1945 Node_Stack stack(C->live_nodes() >> 2);
1946 Node_List rpo_list;
1947 VectorSet visited;
1948 visited.set(loop_head->_idx);
1949 rpo(loop_head, stack, visited, rpo_list);
1950 dump(loop, rpo_list.size(), rpo_list);
1951 }
1952 #endif
1953
1954 // Remember loop node count before unrolling to detect
1955 // if rounds of unroll,optimize are making progress
1956 loop_head->set_node_count_before_unroll(loop->_body.size());
1957
1958 Node *ctrl = loop_head->skip_strip_mined()->in(LoopNode::EntryControl);
1959 Node *limit = loop_head->limit();
1960 Node *init = loop_head->init_trip();
1961 Node *stride = loop_head->stride();
1962
1963 Node *opaq = nullptr;
1964 if (adjust_min_trip) { // If not maximally unrolling, need adjustment
1965 // Search for zero-trip guard.
1966
1967 // Check the shape of the graph at the loop entry. If an inappropriate
1968 // graph shape is encountered, the compiler bails out loop unrolling;
1969 // compilation of the method will still succeed.
1970 opaq = loop_head->is_canonical_loop_entry();
1971 if (opaq == nullptr) {
1972 return;
1973 }
1974 // Zero-trip test uses an 'opaque' node which is not shared, otherwise bail out.
1975 if (opaq->outcnt() != 1 || opaq->in(1) != limit) {
1976 #ifdef ASSERT
1977 // In rare cases, loop cloning (as for peeling, for instance) can break this by replacing
1978 // limit and the input of opaq by equivalent but distinct phis.
1979 // Next IGVN should clean it up. Let's try to detect we are in such a case.
1980 Unique_Node_List& worklist = loop->_phase->_igvn._worklist;
1981 assert(C->major_progress(), "The operation that replaced limit and opaq->in(1) (e.g. peeling) should have set major_progress");
1982 assert(opaq->in(1)->is_Phi() && limit->is_Phi(), "Nodes limit and opaq->in(1) should have been replaced by PhiNodes by fix_data_uses from clone_loop.");
1983 assert(worklist.member(opaq->in(1)) && worklist.member(limit), "Nodes limit and opaq->in(1) differ and should have been recorded for IGVN.");
1984 #endif
1985 return;
1986 }
1987 }
1988
1989 C->set_major_progress();
1990
1991 Node* new_limit = nullptr;
1992 const int stride_con = stride->get_int();
1993 int stride_p = (stride_con > 0) ? stride_con : -stride_con;
1994 uint old_trip_count = loop_head->trip_count();
1995 // Verify that unroll policy result is still valid.
1996 assert(old_trip_count > 1 && (!adjust_min_trip || stride_p <=
1997 MIN2<int>(max_jint / 2 - 2, MAX2(1<<3, Matcher::max_vector_size(T_BYTE)) * loop_head->unrolled_count())), "sanity");
1998
1999 // Adjust loop limit to keep valid iterations number after unroll.
2000 // Use (limit - stride) instead of (((limit - init)/stride) & (-2))*stride
2001 // which may overflow.
2002 if (!adjust_min_trip) {
2003 assert(old_trip_count > 1 && (old_trip_count & 1) == 0,
2004 "odd trip count for maximally unroll");
2005 // Don't need to adjust limit for maximally unroll since trip count is even.
2006 } else if (loop_head->has_exact_trip_count() && init->is_Con()) {
2007 // The trip count being exact means it has been set (using CountedLoopNode::set_exact_trip_count in compute_trip_count)
2008 assert(old_trip_count < max_juint, "sanity");
2009 // Loop's limit is constant. Loop's init could be constant when pre-loop
2010 // become peeled iteration.
2011 jlong init_con = init->get_int();
2012 // We can keep old loop limit if iterations count stays the same:
2013 // old_trip_count == new_trip_count * 2
2014 // Note: since old_trip_count >= 2 then new_trip_count >= 1
2015 // so we also don't need to adjust zero trip test.
2016 jlong limit_con = limit->get_int();
2017 // (stride_con*2) not overflow since stride_con <= 8.
2018 int new_stride_con = stride_con * 2;
2019 int stride_m = new_stride_con - (stride_con > 0 ? 1 : -1);
2020 jlong trip_count = (limit_con - init_con + stride_m)/new_stride_con;
2021 // New trip count should satisfy next conditions.
2022 assert(trip_count > 0 && (julong)trip_count <= (julong)max_juint/2, "sanity");
2023 uint new_trip_count = (uint)trip_count;
2024 // Since old_trip_count has been set to < max_juint (that is at most 2^32-2),
2025 // new_trip_count is lower than or equal to 2^31-1 and the multiplication cannot overflow.
2026 adjust_min_trip = (old_trip_count != new_trip_count*2);
2027 }
2028
2029 if (adjust_min_trip) {
2030 // Step 2: Adjust the trip limit if it is called for.
2031 // The adjustment amount is -stride. Need to make sure if the
2032 // adjustment underflows or overflows, then the main loop is skipped.
2033 Node* cmp = loop_end->cmp_node();
2034 assert(cmp->in(2) == limit, "sanity");
2035 assert(opaq != nullptr && opaq->in(1) == limit, "sanity");
2036
2037 // Verify that policy_unroll result is still valid.
2038 const TypeInt* limit_type = _igvn.type(limit)->is_int();
2039 assert((stride_con > 0 && ((min_jint + stride_con) <= limit_type->_hi)) ||
2040 (stride_con < 0 && ((max_jint + stride_con) >= limit_type->_lo)),
2041 "sanity");
2042
2043 if (limit->is_Con()) {
2044 // The check in policy_unroll and the assert above guarantee
2045 // no underflow if limit is constant.
2046 new_limit = intcon(limit->get_int() - stride_con);
2047 } else {
2048 // Limit is not constant. Int subtraction could lead to underflow.
2049 // (1) Convert to long.
2050 Node* limit_l = new ConvI2LNode(limit);
2051 register_new_node_with_ctrl_of(limit_l, limit);
2052 Node* stride_l = longcon(stride_con);
2053
2054 // (2) Subtract: compute in long, to prevent underflow.
2055 Node* new_limit_l = new SubLNode(limit_l, stride_l);
2056 register_new_node(new_limit_l, ctrl);
2057
2058 // (3) Clamp to int range, in case we had subtraction underflow.
2059 Node* underflow_clamp_l = longcon((stride_con > 0) ? min_jint : max_jint);
2060 Node* new_limit_no_underflow_l = nullptr;
2061 if (stride_con > 0) {
2062 // limit = MaxL(limit - stride, min_jint)
2063 new_limit_no_underflow_l = new MaxLNode(C, new_limit_l, underflow_clamp_l);
2064 } else {
2065 // limit = MinL(limit - stride, max_jint)
2066 new_limit_no_underflow_l = new MinLNode(C, new_limit_l, underflow_clamp_l);
2067 }
2068 register_new_node(new_limit_no_underflow_l, ctrl);
2069
2070 // (4) Convert back to int.
2071 new_limit = new ConvL2INode(new_limit_no_underflow_l);
2072 register_new_node(new_limit, ctrl);
2073 }
2074
2075 assert(new_limit != nullptr, "");
2076 // Replace in loop test.
2077 assert(loop_end->in(1)->in(1) == cmp, "sanity");
2078 if (cmp->outcnt() == 1 && loop_end->in(1)->outcnt() == 1) {
2079 // Don't need to create new test since only one user.
2080 _igvn.hash_delete(cmp);
2081 cmp->set_req(2, new_limit);
2082 } else {
2083 // Create new test since it is shared.
2084 Node* ctrl2 = loop_end->in(0);
2085 Node* cmp2 = cmp->clone();
2086 cmp2->set_req(2, new_limit);
2087 register_new_node(cmp2, ctrl2);
2088 Node* bol2 = loop_end->in(1)->clone();
2089 bol2->set_req(1, cmp2);
2090 register_new_node(bol2, ctrl2);
2091 _igvn.replace_input_of(loop_end, 1, bol2);
2092 }
2093 // Step 3: Find the min-trip test guaranteed before a 'main' loop.
2094 // Make it a 1-trip test (means at least 2 trips).
2095
2096 // Guard test uses an 'opaque' node which is not shared. Hence I
2097 // can edit it's inputs directly. Hammer in the new limit for the
2098 // minimum-trip guard.
2099 assert(opaq->outcnt() == 1, "");
2100 // Notify limit -> opaq -> CmpI, it may constant fold.
2101 _igvn.add_users_to_worklist(opaq->in(1));
2102 _igvn.replace_input_of(opaq, 1, new_limit);
2103 }
2104
2105 // Adjust max trip count. The trip count is intentionally rounded
2106 // down here (e.g. 15-> 7-> 3-> 1) because if we unwittingly over-unroll,
2107 // the main, unrolled, part of the loop will never execute as it is protected
2108 // by the min-trip test. See bug 4834191 for a case where we over-unrolled
2109 // and later determined that part of the unrolled loop was dead.
2110 loop_head->set_trip_count(old_trip_count / 2);
2111
2112 // Double the count of original iterations in the unrolled loop body.
2113 loop_head->double_unrolled_count();
2114
2115 // ---------
2116 // Step 4: Clone the loop body. Move it inside the loop. This loop body
2117 // represents the odd iterations; since the loop trips an even number of
2118 // times its backedge is never taken. Kill the backedge.
2119 uint dd = dom_depth(loop_head);
2120 clone_loop(loop, old_new, dd, IgnoreStripMined);
2121
2122 // Make backedges of the clone equal to backedges of the original.
2123 // Make the fall-in from the original come from the fall-out of the clone.
2124 for (DUIterator_Fast jmax, j = loop_head->fast_outs(jmax); j < jmax; j++) {
2125 Node* phi = loop_head->fast_out(j);
2126 if (phi->is_Phi() && phi->in(0) == loop_head && phi->outcnt() > 0) {
2127 Node *newphi = old_new[phi->_idx];
2128 _igvn.hash_delete(phi);
2129 _igvn.hash_delete(newphi);
2130
2131 phi ->set_req(LoopNode:: EntryControl, newphi->in(LoopNode::LoopBackControl));
2132 newphi->set_req(LoopNode::LoopBackControl, phi ->in(LoopNode::LoopBackControl));
2133 phi ->set_req(LoopNode::LoopBackControl, C->top());
2134 }
2135 }
2136 CountedLoopNode* clone_head = old_new[loop_head->_idx]->as_CountedLoop();
2137 _igvn.hash_delete(clone_head);
2138 loop_head ->set_req(LoopNode:: EntryControl, clone_head->in(LoopNode::LoopBackControl));
2139 clone_head->set_req(LoopNode::LoopBackControl, loop_head ->in(LoopNode::LoopBackControl));
2140 loop_head ->set_req(LoopNode::LoopBackControl, C->top());
2141 loop->_head = clone_head; // New loop header
2142
2143 set_idom(loop_head, loop_head ->in(LoopNode::EntryControl), dd);
2144 set_idom(clone_head, clone_head->in(LoopNode::EntryControl), dd);
2145
2146 // Kill the clone's backedge
2147 Node *newcle = old_new[loop_end->_idx];
2148 _igvn.hash_delete(newcle);
2149 Node* one = intcon(1);
2150 newcle->set_req(1, one);
2151 // Force clone into same loop body
2152 uint max = loop->_body.size();
2153 for (uint k = 0; k < max; k++) {
2154 Node *old = loop->_body.at(k);
2155 Node *nnn = old_new[old->_idx];
2156 loop->_body.push(nnn);
2157 if (!has_ctrl(old)) {
2158 set_loop(nnn, loop);
2159 }
2160 }
2161
2162 loop->record_for_igvn();
2163 loop_head->clear_strip_mined();
2164
2165 update_main_loop_assertion_predicates(clone_head, stride_con);
2166
2167 #ifndef PRODUCT
2168 if (C->do_vector_loop() && (PrintOpto && (VerifyLoopOptimizations || TraceLoopOpts))) {
2169 tty->print("\nnew loop after unroll\n"); loop->dump_head();
2170 for (uint i = 0; i < loop->_body.size(); i++) {
2171 loop->_body.at(i)->dump();
2172 }
2173 if (C->clone_map().is_debug()) {
2174 tty->print("\nCloneMap\n");
2175 Dict* dict = C->clone_map().dict();
2176 DictI i(dict);
2177 tty->print_cr("Dict@%p[%d] = ", dict, dict->Size());
2178 for (int ii = 0; i.test(); ++i, ++ii) {
2179 NodeCloneInfo cl((uint64_t)dict->operator[]((void*)i._key));
2180 tty->print("%d->%d:%d,", (int)(intptr_t)i._key, cl.idx(), cl.gen());
2181 if (ii % 10 == 9) {
2182 tty->print_cr(" ");
2183 }
2184 }
2185 tty->print_cr(" ");
2186 }
2187 }
2188 #endif
2189
2190 C->print_method(PHASE_AFTER_LOOP_UNROLLING, 4, clone_head);
2191 }
2192
2193 //------------------------------do_maximally_unroll----------------------------
2194
2195 void PhaseIdealLoop::do_maximally_unroll(IdealLoopTree *loop, Node_List &old_new) {
2196 CountedLoopNode *cl = loop->_head->as_CountedLoop();
2197 assert(cl->has_exact_trip_count(), "trip count is not exact");
2198 assert(cl->trip_count() > 0, "");
2199 #ifndef PRODUCT
2200 if (TraceLoopOpts) {
2201 tty->print("MaxUnroll " JULONG_FORMAT " ", cl->trip_count());
2202 loop->dump_head();
2203 }
2204 #endif
2205
2206 // If loop is tripping an odd number of times, peel odd iteration
2207 if ((cl->trip_count() & 1) == 1) {
2208 if (LoopPeeling == 0) {
2209 #ifndef PRODUCT
2210 if (TraceLoopOpts) {
2211 tty->print("MaxUnroll cancelled since LoopPeeling is always disabled");
2212 loop->dump_head();
2213 }
2214 #endif
2215 return;
2216 }
2217 do_peeling(loop, old_new);
2218 }
2219
2220 // Now its tripping an even number of times remaining. Double loop body.
2221 // Do not adjust pre-guards; they are not needed and do not exist.
2222 if (cl->trip_count() > 0) {
2223 assert((cl->trip_count() & 1) == 0, "missed peeling");
2224 do_unroll(loop, old_new, false);
2225 }
2226 }
2227
2228 //------------------------------adjust_limit-----------------------------------
2229 // Helper function that computes new loop limit as (rc_limit-offset)/scale
2230 Node* PhaseIdealLoop::adjust_limit(bool is_positive_stride, Node* scale, Node* offset, Node* rc_limit, Node* old_limit, Node* pre_ctrl, bool round) {
2231 Node* old_limit_long = new ConvI2LNode(old_limit);
2232 register_new_node(old_limit_long, pre_ctrl);
2233
2234 Node* sub = new SubLNode(rc_limit, offset);
2235 register_new_node(sub, pre_ctrl);
2236 Node* limit = new DivLNode(nullptr, sub, scale);
2237 register_new_node(limit, pre_ctrl);
2238
2239 // When the absolute value of scale is greater than one, the division
2240 // may round limit down/up, so add/sub one to/from the limit.
2241 if (round) {
2242 limit = new AddLNode(limit, _igvn.longcon(is_positive_stride ? -1 : 1));
2243 register_new_node(limit, pre_ctrl);
2244 }
2245
2246 // Clamp the limit to handle integer under-/overflows by using long values.
2247 // We only convert the limit back to int when we handled under-/overflows.
2248 // Note that all values are longs in the following computations.
2249 // When reducing the limit, clamp to [min_jint, old_limit]:
2250 // INT(MINL(old_limit, MAXL(limit, min_jint)))
2251 // - integer underflow of limit: MAXL chooses min_jint.
2252 // - integer overflow of limit: MINL chooses old_limit (<= MAX_INT < limit)
2253 // When increasing the limit, clamp to [old_limit, max_jint]:
2254 // INT(MAXL(old_limit, MINL(limit, max_jint)))
2255 // - integer overflow of limit: MINL chooses max_jint.
2256 // - integer underflow of limit: MAXL chooses old_limit (>= MIN_INT > limit)
2257 // INT() is finally converting the limit back to an integer value.
2258
2259 Node* inner_result_long = nullptr;
2260 Node* outer_result_long = nullptr;
2261 if (is_positive_stride) {
2262 inner_result_long = new MaxLNode(C, limit, _igvn.longcon(min_jint));
2263 outer_result_long = new MinLNode(C, inner_result_long, old_limit_long);
2264 } else {
2265 inner_result_long = new MinLNode(C, limit, _igvn.longcon(max_jint));
2266 outer_result_long = new MaxLNode(C, inner_result_long, old_limit_long);
2267 }
2268 register_new_node(inner_result_long, pre_ctrl);
2269 register_new_node(outer_result_long, pre_ctrl);
2270
2271 limit = new ConvL2INode(outer_result_long);
2272 register_new_node(limit, pre_ctrl);
2273 return limit;
2274 }
2275
2276 //------------------------------add_constraint---------------------------------
2277 // Constrain the main loop iterations so the conditions:
2278 // low_limit <= scale_con*I + offset < upper_limit
2279 // always hold true. That is, either increase the number of iterations in the
2280 // pre-loop or reduce the number of iterations in the main-loop until the condition
2281 // holds true in the main-loop. Stride, scale, offset and limit are all loop
2282 // invariant. Further, stride and scale are constants (offset and limit often are).
2283 void PhaseIdealLoop::add_constraint(jlong stride_con, jlong scale_con, Node* offset, Node* low_limit, Node* upper_limit, Node* pre_ctrl, Node** pre_limit, Node** main_limit) {
2284 assert(_igvn.type(offset)->isa_long() != nullptr && _igvn.type(low_limit)->isa_long() != nullptr &&
2285 _igvn.type(upper_limit)->isa_long() != nullptr, "arguments should be long values");
2286
2287 // For a positive stride, we need to reduce the main-loop limit and
2288 // increase the pre-loop limit. This is reversed for a negative stride.
2289 bool is_positive_stride = (stride_con > 0);
2290
2291 // If the absolute scale value is greater one, division in 'adjust_limit' may require
2292 // rounding. Make sure the ABS method correctly handles min_jint.
2293 // Only do this for the pre-loop, one less iteration of the main loop doesn't hurt.
2294 bool round = ABS(scale_con) > 1;
2295
2296 Node* scale = longcon(scale_con);
2297
2298 if ((stride_con^scale_con) >= 0) { // Use XOR to avoid overflow
2299 // Positive stride*scale: the affine function is increasing,
2300 // the pre-loop checks for underflow and the post-loop for overflow.
2301
2302 // The overflow limit: scale*I+offset < upper_limit
2303 // For the main-loop limit compute:
2304 // ( if (scale > 0) /* and stride > 0 */
2305 // I < (upper_limit-offset)/scale
2306 // else /* scale < 0 and stride < 0 */
2307 // I > (upper_limit-offset)/scale
2308 // )
2309 *main_limit = adjust_limit(is_positive_stride, scale, offset, upper_limit, *main_limit, pre_ctrl, false);
2310
2311 // The underflow limit: low_limit <= scale*I+offset
2312 // For the pre-loop limit compute:
2313 // NOT(scale*I+offset >= low_limit)
2314 // scale*I+offset < low_limit
2315 // ( if (scale > 0) /* and stride > 0 */
2316 // I < (low_limit-offset)/scale
2317 // else /* scale < 0 and stride < 0 */
2318 // I > (low_limit-offset)/scale
2319 // )
2320 *pre_limit = adjust_limit(!is_positive_stride, scale, offset, low_limit, *pre_limit, pre_ctrl, round);
2321 } else {
2322 // Negative stride*scale: the affine function is decreasing,
2323 // the pre-loop checks for overflow and the post-loop for underflow.
2324
2325 // The overflow limit: scale*I+offset < upper_limit
2326 // For the pre-loop limit compute:
2327 // NOT(scale*I+offset < upper_limit)
2328 // scale*I+offset >= upper_limit
2329 // scale*I+offset+1 > upper_limit
2330 // ( if (scale < 0) /* and stride > 0 */
2331 // I < (upper_limit-(offset+1))/scale
2332 // else /* scale > 0 and stride < 0 */
2333 // I > (upper_limit-(offset+1))/scale
2334 // )
2335 Node* one = longcon(1);
2336 Node* plus_one = new AddLNode(offset, one);
2337 register_new_node(plus_one, pre_ctrl);
2338 *pre_limit = adjust_limit(!is_positive_stride, scale, plus_one, upper_limit, *pre_limit, pre_ctrl, round);
2339
2340 // The underflow limit: low_limit <= scale*I+offset
2341 // For the main-loop limit compute:
2342 // scale*I+offset+1 > low_limit
2343 // ( if (scale < 0) /* and stride > 0 */
2344 // I < (low_limit-(offset+1))/scale
2345 // else /* scale > 0 and stride < 0 */
2346 // I > (low_limit-(offset+1))/scale
2347 // )
2348 *main_limit = adjust_limit(is_positive_stride, scale, plus_one, low_limit, *main_limit, pre_ctrl, false);
2349 }
2350 }
2351
2352 //----------------------------------is_iv------------------------------------
2353 // Return true if exp is the value (of type bt) of the given induction var.
2354 // This grammar of cases is recognized, where X is I|L according to bt:
2355 // VIV[iv] = iv | (CastXX VIV[iv]) | (ConvI2X VIV[iv])
2356 bool PhaseIdealLoop::is_iv(Node* exp, Node* iv, BasicType bt) {
2357 exp = exp->uncast();
2358 if (exp == iv && iv->bottom_type()->isa_integer(bt)) {
2359 return true;
2360 }
2361
2362 if (bt == T_LONG && iv->bottom_type()->isa_int() && exp->Opcode() == Op_ConvI2L && exp->in(1)->uncast() == iv) {
2363 return true;
2364 }
2365 return false;
2366 }
2367
2368 //------------------------------is_scaled_iv---------------------------------
2369 // Return true if exp is a constant times the given induction var (of type bt).
2370 // The multiplication is either done in full precision (exactly of type bt),
2371 // or else bt is T_LONG but iv is scaled using 32-bit arithmetic followed by a ConvI2L.
2372 // This grammar of cases is recognized, where X is I|L according to bt:
2373 // SIV[iv] = VIV[iv] | (CastXX SIV[iv])
2374 // | (MulX VIV[iv] ConX) | (MulX ConX VIV[iv])
2375 // | (LShiftX VIV[iv] ConI)
2376 // | (ConvI2L SIV[iv]) -- a "short-scale" can occur here; note recursion
2377 // | (SubX 0 SIV[iv]) -- same as MulX(iv, -scale); note recursion
2378 // | (AddX SIV[iv] SIV[iv]) -- sum of two scaled iv; note recursion
2379 // | (SubX SIV[iv] SIV[iv]) -- difference of two scaled iv; note recursion
2380 // VIV[iv] = [either iv or its value converted; see is_iv() above]
2381 // On success, the constant scale value is stored back to *p_scale.
2382 // The value (*p_short_scale) reports if such a ConvI2L conversion was present.
2383 bool PhaseIdealLoop::is_scaled_iv(Node* exp, Node* iv, BasicType bt, jlong* p_scale, bool* p_short_scale, int depth) {
2384 BasicType exp_bt = bt;
2385 exp = exp->uncast(); //strip casts
2386 assert(exp_bt == T_INT || exp_bt == T_LONG, "unexpected int type");
2387 if (is_iv(exp, iv, exp_bt)) {
2388 if (p_scale != nullptr) {
2389 *p_scale = 1;
2390 }
2391 if (p_short_scale != nullptr) {
2392 *p_short_scale = false;
2393 }
2394 return true;
2395 }
2396 if (exp_bt == T_LONG && iv->bottom_type()->isa_int() && exp->Opcode() == Op_ConvI2L) {
2397 exp = exp->in(1);
2398 exp_bt = T_INT;
2399 }
2400 int opc = exp->Opcode();
2401 int which = 0; // this is which subexpression we find the iv in
2402 // Can't use is_Mul() here as it's true for AndI and AndL
2403 if (opc == Op_Mul(exp_bt)) {
2404 if ((is_iv(exp->in(which = 1), iv, exp_bt) && exp->in(2)->is_Con()) ||
2405 (is_iv(exp->in(which = 2), iv, exp_bt) && exp->in(1)->is_Con())) {
2406 Node* factor = exp->in(which == 1 ? 2 : 1); // the other argument
2407 jlong scale = factor->find_integer_as_long(exp_bt, 0);
2408 if (scale == 0) {
2409 return false; // might be top
2410 }
2411 if (p_scale != nullptr) {
2412 *p_scale = scale;
2413 }
2414 if (p_short_scale != nullptr) {
2415 // (ConvI2L (MulI iv K)) can be 64-bit linear if iv is kept small enough...
2416 *p_short_scale = (exp_bt != bt && scale != 1);
2417 }
2418 return true;
2419 }
2420 } else if (opc == Op_LShift(exp_bt)) {
2421 if (is_iv(exp->in(1), iv, exp_bt) && exp->in(2)->is_Con()) {
2422 jint shift_amount = exp->in(2)->find_int_con(min_jint);
2423 if (shift_amount == min_jint) {
2424 return false; // might be top
2425 }
2426 jlong scale;
2427 if (exp_bt == T_INT) {
2428 scale = java_shift_left((jint)1, (juint)shift_amount);
2429 } else if (exp_bt == T_LONG) {
2430 scale = java_shift_left((jlong)1, (julong)shift_amount);
2431 }
2432 if (p_scale != nullptr) {
2433 *p_scale = scale;
2434 }
2435 if (p_short_scale != nullptr) {
2436 // (ConvI2L (MulI iv K)) can be 64-bit linear if iv is kept small enough...
2437 *p_short_scale = (exp_bt != bt && scale != 1);
2438 }
2439 return true;
2440 }
2441 } else if (opc == Op_Add(exp_bt)) {
2442 jlong scale_l = 0;
2443 jlong scale_r = 0;
2444 bool short_scale_l = false;
2445 bool short_scale_r = false;
2446 if (depth == 0 &&
2447 is_scaled_iv(exp->in(1), iv, exp_bt, &scale_l, &short_scale_l, depth + 1) &&
2448 is_scaled_iv(exp->in(2), iv, exp_bt, &scale_r, &short_scale_r, depth + 1)) {
2449 // AddX(iv*K1, iv*K2) => iv*(K1+K2)
2450 jlong scale_sum = java_add(scale_l, scale_r);
2451 if (scale_sum > max_signed_integer(exp_bt) || scale_sum <= min_signed_integer(exp_bt)) {
2452 // This logic is shared by int and long. For int, the result may overflow
2453 // as we use jlong to compute so do the check here. Long result may also
2454 // overflow but that's fine because result wraps.
2455 return false;
2456 }
2457 if (p_scale != nullptr) {
2458 *p_scale = scale_sum;
2459 }
2460 if (p_short_scale != nullptr) {
2461 *p_short_scale = short_scale_l && short_scale_r;
2462 }
2463 return true;
2464 }
2465 } else if (opc == Op_Sub(exp_bt)) {
2466 if (exp->in(1)->find_integer_as_long(exp_bt, -1) == 0) {
2467 jlong scale = 0;
2468 if (depth == 0 && is_scaled_iv(exp->in(2), iv, exp_bt, &scale, p_short_scale, depth + 1)) {
2469 // SubX(0, iv*K) => iv*(-K)
2470 if (scale == min_signed_integer(exp_bt)) {
2471 // This should work even if -K overflows, but let's not.
2472 return false;
2473 }
2474 scale = java_multiply(scale, (jlong)-1);
2475 if (p_scale != nullptr) {
2476 *p_scale = scale;
2477 }
2478 if (p_short_scale != nullptr) {
2479 // (ConvI2L (MulI iv K)) can be 64-bit linear if iv is kept small enough...
2480 *p_short_scale = *p_short_scale || (exp_bt != bt && scale != 1);
2481 }
2482 return true;
2483 }
2484 } else {
2485 jlong scale_l = 0;
2486 jlong scale_r = 0;
2487 bool short_scale_l = false;
2488 bool short_scale_r = false;
2489 if (depth == 0 &&
2490 is_scaled_iv(exp->in(1), iv, exp_bt, &scale_l, &short_scale_l, depth + 1) &&
2491 is_scaled_iv(exp->in(2), iv, exp_bt, &scale_r, &short_scale_r, depth + 1)) {
2492 // SubX(iv*K1, iv*K2) => iv*(K1-K2)
2493 jlong scale_diff = java_subtract(scale_l, scale_r);
2494 if (scale_diff > max_signed_integer(exp_bt) || scale_diff <= min_signed_integer(exp_bt)) {
2495 // This logic is shared by int and long. For int, the result may
2496 // overflow as we use jlong to compute so do the check here. Long
2497 // result may also overflow but that's fine because result wraps.
2498 return false;
2499 }
2500 if (p_scale != nullptr) {
2501 *p_scale = scale_diff;
2502 }
2503 if (p_short_scale != nullptr) {
2504 *p_short_scale = short_scale_l && short_scale_r;
2505 }
2506 return true;
2507 }
2508 }
2509 }
2510 // We could also recognize (iv*K1)*K2, even with overflow, but let's not.
2511 return false;
2512 }
2513
2514 //-------------------------is_scaled_iv_plus_offset--------------------------
2515 // Return true if exp is a simple linear transform of the given induction var.
2516 // The scale must be constant and the addition tree (if any) must be simple.
2517 // This grammar of cases is recognized, where X is I|L according to bt:
2518 //
2519 // OIV[iv] = SIV[iv] | (CastXX OIV[iv])
2520 // | (AddX SIV[iv] E) | (AddX E SIV[iv])
2521 // | (SubX SIV[iv] E) | (SubX E SIV[iv])
2522 // SSIV[iv] = (ConvI2X SIV[iv]) -- a "short scale" might occur here
2523 // SIV[iv] = [a possibly scaled value of iv; see is_scaled_iv() above]
2524 //
2525 // On success, the constant scale value is stored back to *p_scale unless null.
2526 // Likewise, the addend (perhaps a synthetic AddX node) is stored to *p_offset.
2527 // Also, (*p_short_scale) reports if a ConvI2L conversion was seen after a MulI,
2528 // meaning bt is T_LONG but iv was scaled using 32-bit arithmetic.
2529 // To avoid looping, the match is depth-limited, and so may fail to match the grammar to complex expressions.
2530 bool PhaseIdealLoop::is_scaled_iv_plus_offset(Node* exp, Node* iv, BasicType bt, jlong* p_scale, Node** p_offset, bool* p_short_scale, int depth) {
2531 assert(bt == T_INT || bt == T_LONG, "unexpected int type");
2532 jlong scale = 0; // to catch result from is_scaled_iv()
2533 BasicType exp_bt = bt;
2534 exp = exp->uncast();
2535 if (is_scaled_iv(exp, iv, exp_bt, &scale, p_short_scale)) {
2536 if (p_scale != nullptr) {
2537 *p_scale = scale;
2538 }
2539 if (p_offset != nullptr) {
2540 Node* zero = zerocon(bt);
2541 *p_offset = zero;
2542 }
2543 return true;
2544 }
2545 if (exp_bt != bt) {
2546 // We would now be matching inputs like (ConvI2L exp:(AddI (MulI iv S) E)).
2547 // It's hard to make 32-bit arithmetic linear if it overflows. Although we do
2548 // cope with overflowing multiplication by S, it would be even more work to
2549 // handle overflowing addition of E. So we bail out here on ConvI2L input.
2550 return false;
2551 }
2552 int opc = exp->Opcode();
2553 int which = 0; // this is which subexpression we find the iv in
2554 Node* offset = nullptr;
2555 if (opc == Op_Add(exp_bt)) {
2556 // Check for a scaled IV in (AddX (MulX iv S) E) or (AddX E (MulX iv S)).
2557 if (is_scaled_iv(exp->in(which = 1), iv, bt, &scale, p_short_scale) ||
2558 is_scaled_iv(exp->in(which = 2), iv, bt, &scale, p_short_scale)) {
2559 offset = exp->in(which == 1 ? 2 : 1); // the other argument
2560 if (p_scale != nullptr) {
2561 *p_scale = scale;
2562 }
2563 if (p_offset != nullptr) {
2564 *p_offset = offset;
2565 }
2566 return true;
2567 }
2568 // Check for more addends, like (AddX (AddX (MulX iv S) E1) E2), etc.
2569 if (is_scaled_iv_plus_extra_offset(exp->in(1), exp->in(2), iv, bt, p_scale, p_offset, p_short_scale, depth) ||
2570 is_scaled_iv_plus_extra_offset(exp->in(2), exp->in(1), iv, bt, p_scale, p_offset, p_short_scale, depth)) {
2571 return true;
2572 }
2573 } else if (opc == Op_Sub(exp_bt)) {
2574 if (is_scaled_iv(exp->in(which = 1), iv, bt, &scale, p_short_scale) ||
2575 is_scaled_iv(exp->in(which = 2), iv, bt, &scale, p_short_scale)) {
2576 // Match (SubX SIV[iv] E) as if (AddX SIV[iv] (SubX 0 E)), and
2577 // match (SubX E SIV[iv]) as if (AddX E (SubX 0 SIV[iv])).
2578 offset = exp->in(which == 1 ? 2 : 1); // the other argument
2579 if (which == 2) {
2580 // We can't handle a scale of min_jint (or min_jlong) here as -1 * min_jint = min_jint
2581 if (scale == min_signed_integer(bt)) {
2582 return false; // cannot negate the scale of the iv
2583 }
2584 scale = java_multiply(scale, (jlong)-1);
2585 }
2586 if (p_scale != nullptr) {
2587 *p_scale = scale;
2588 }
2589 if (p_offset != nullptr) {
2590 if (which == 1) { // must negate the extracted offset
2591 Node* zero = integercon(0, exp_bt);
2592 Node *ctrl_off = get_ctrl(offset);
2593 offset = SubNode::make(zero, offset, exp_bt);
2594 register_new_node(offset, ctrl_off);
2595 }
2596 *p_offset = offset;
2597 }
2598 return true;
2599 }
2600 }
2601 return false;
2602 }
2603
2604 // Helper for is_scaled_iv_plus_offset(), not called separately.
2605 // The caller encountered (AddX exp1 offset3) or (AddX offset3 exp1).
2606 // Here, exp1 is inspected to see if it is a simple linear transform of iv.
2607 // If so, the offset3 is combined with any other offset2 from inside exp1.
2608 bool PhaseIdealLoop::is_scaled_iv_plus_extra_offset(Node* exp1, Node* offset3, Node* iv,
2609 BasicType bt,
2610 jlong* p_scale, Node** p_offset,
2611 bool* p_short_scale, int depth) {
2612 // By the time we reach here, it is unlikely that exp1 is a simple iv*K.
2613 // If is a linear iv transform, it is probably an add or subtract.
2614 // Let's collect the internal offset2 from it.
2615 Node* offset2 = nullptr;
2616 if (offset3->is_Con() &&
2617 depth < 2 &&
2618 is_scaled_iv_plus_offset(exp1, iv, bt, p_scale,
2619 &offset2, p_short_scale, depth+1)) {
2620 if (p_offset != nullptr) {
2621 Node* ctrl_off2 = get_ctrl(offset2);
2622 Node* offset = AddNode::make(offset2, offset3, bt);
2623 register_new_node(offset, ctrl_off2);
2624 *p_offset = offset;
2625 }
2626 return true;
2627 }
2628 return false;
2629 }
2630
2631 //------------------------------do_range_check---------------------------------
2632 // Eliminate range-checks and other trip-counter vs loop-invariant tests.
2633 void PhaseIdealLoop::do_range_check(IdealLoopTree* loop) {
2634 #ifndef PRODUCT
2635 if (TraceLoopOpts) {
2636 tty->print("RangeCheck ");
2637 loop->dump_head();
2638 }
2639 #endif
2640
2641 assert(RangeCheckElimination, "");
2642 CountedLoopNode *cl = loop->_head->as_CountedLoop();
2643
2644 // protect against stride not being a constant
2645 if (!cl->stride_is_con()) {
2646 return;
2647 }
2648 // Find the trip counter; we are iteration splitting based on it
2649 Node *trip_counter = cl->phi();
2650 // Find the main loop limit; we will trim it's iterations
2651 // to not ever trip end tests
2652 Node *main_limit = cl->limit();
2653 Node* main_limit_ctrl = get_ctrl(main_limit);
2654
2655 // Check graph shape. Cannot optimize a loop if zero-trip
2656 // Opaque1 node is optimized away and then another round
2657 // of loop opts attempted.
2658 if (cl->is_canonical_loop_entry() == nullptr) {
2659 return;
2660 }
2661
2662 // Need to find the main-loop zero-trip guard
2663 Node *ctrl = cl->skip_assertion_predicates_with_halt();
2664 Node *iffm = ctrl->in(0);
2665 Node *opqzm = iffm->in(1)->in(1)->in(2);
2666 assert(opqzm->in(1) == main_limit, "do not understand situation");
2667
2668 // Find the pre-loop limit; we will expand its iterations to
2669 // not ever trip low tests.
2670 Node *p_f = iffm->in(0);
2671 // pre loop may have been optimized out
2672 if (p_f->Opcode() != Op_IfFalse) {
2673 return;
2674 }
2675 CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
2676 assert(pre_end->loopnode()->is_pre_loop(), "");
2677 Node *pre_opaq1 = pre_end->limit();
2678 // Occasionally it's possible for a pre-loop Opaque1 node to be
2679 // optimized away and then another round of loop opts attempted.
2680 // We can not optimize this particular loop in that case.
2681 if (pre_opaq1->Opcode() != Op_Opaque1) {
2682 return;
2683 }
2684 Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
2685 Node *pre_limit = pre_opaq->in(1);
2686 Node* pre_limit_ctrl = get_ctrl(pre_limit);
2687
2688 // Where do we put new limit calculations
2689 Node* pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
2690 // Range check elimination optimizes out conditions whose parameters are loop invariant in the main loop. They usually
2691 // have control above the pre loop, but there's no guarantee that they do. There's no guarantee either that the pre
2692 // loop limit has control that's out of loop (a previous round of range check elimination could have set a limit that's
2693 // not loop invariant). new_limit_ctrl is used for both the pre and main loops. Early control for the main limit may be
2694 // below the pre loop entry and the pre limit and must be taken into account when initializing new_limit_ctrl.
2695 Node* new_limit_ctrl = dominated_node(pre_ctrl, pre_limit_ctrl, compute_early_ctrl(main_limit, main_limit_ctrl));
2696
2697 // Ensure the original loop limit is available from the
2698 // pre-loop Opaque1 node.
2699 Node *orig_limit = pre_opaq->original_loop_limit();
2700 if (orig_limit == nullptr || _igvn.type(orig_limit) == Type::TOP) {
2701 return;
2702 }
2703 // Must know if its a count-up or count-down loop
2704
2705 int stride_con = cl->stride_con();
2706 bool abs_stride_is_one = stride_con == 1 || stride_con == -1;
2707 Node* zero = longcon(0);
2708 Node* one = longcon(1);
2709 // Use symmetrical int range [-max_jint,max_jint]
2710 Node* mini = longcon(-max_jint);
2711
2712 Node* loop_entry = cl->skip_strip_mined()->in(LoopNode::EntryControl);
2713 assert(loop_entry->is_Proj() && loop_entry->in(0)->is_If(), "if projection only");
2714
2715 // if abs(stride) == 1, an Assertion Predicate for the final iv value is added. We don't know the final iv value until
2716 // we're done with range check elimination so use a place holder.
2717 Node* final_iv_placeholder = nullptr;
2718 if (abs_stride_is_one) {
2719 final_iv_placeholder = new Node(1);
2720 _igvn.set_type(final_iv_placeholder, TypeInt::INT);
2721 final_iv_placeholder->init_req(0, loop_entry);
2722 }
2723
2724 // Check loop body for tests of trip-counter plus loop-invariant vs loop-variant.
2725 for (uint i = 0; i < loop->_body.size(); i++) {
2726 Node *iff = loop->_body[i];
2727 if (iff->Opcode() == Op_If ||
2728 iff->Opcode() == Op_RangeCheck) { // Test?
2729 // Test is an IfNode, has 2 projections. If BOTH are in the loop
2730 // we need loop unswitching instead of iteration splitting.
2731 Node *exit = loop->is_loop_exit(iff);
2732 if (!exit) continue;
2733 int flip = (exit->Opcode() == Op_IfTrue) ? 1 : 0;
2734
2735 // Get boolean condition to test
2736 Node *i1 = iff->in(1);
2737 if (!i1->is_Bool()) continue;
2738 BoolNode *bol = i1->as_Bool();
2739 BoolTest b_test = bol->_test;
2740 // Flip sense of test if exit condition is flipped
2741 if (flip) {
2742 b_test = b_test.negate();
2743 }
2744 // Get compare
2745 Node *cmp = bol->in(1);
2746
2747 // Look for trip_counter + offset vs limit
2748 Node *rc_exp = cmp->in(1);
2749 Node *limit = cmp->in(2);
2750 int scale_con= 1; // Assume trip counter not scaled
2751
2752 Node* limit_ctrl = get_ctrl(limit);
2753 if (loop->is_member(get_loop(limit_ctrl))) {
2754 // Compare might have operands swapped; commute them
2755 b_test = b_test.commute();
2756 rc_exp = cmp->in(2);
2757 limit = cmp->in(1);
2758 limit_ctrl = get_ctrl(limit);
2759 if (loop->is_member(get_loop(limit_ctrl))) {
2760 continue; // Both inputs are loop varying; cannot RCE
2761 }
2762 }
2763 // Here we know 'limit' is loop invariant
2764
2765 // 'limit' maybe pinned below the zero trip test (probably from a
2766 // previous round of rce), in which case, it can't be used in the
2767 // zero trip test expression which must occur before the zero test's if.
2768 if (is_dominator(ctrl, limit_ctrl)) {
2769 continue; // Don't rce this check but continue looking for other candidates.
2770 }
2771
2772 assert(is_dominator(compute_early_ctrl(limit, limit_ctrl), pre_end), "node pinned on loop exit test?");
2773
2774 // Check for scaled induction variable plus an offset
2775 Node *offset = nullptr;
2776
2777 if (!is_scaled_iv_plus_offset(rc_exp, trip_counter, &scale_con, &offset)) {
2778 continue;
2779 }
2780
2781 Node* offset_ctrl = get_ctrl(offset);
2782 if (loop->is_member(get_loop(offset_ctrl))) {
2783 continue; // Offset is not really loop invariant
2784 }
2785 // Here we know 'offset' is loop invariant.
2786
2787 // As above for the 'limit', the 'offset' maybe pinned below the
2788 // zero trip test.
2789 if (is_dominator(ctrl, offset_ctrl)) {
2790 continue; // Don't rce this check but continue looking for other candidates.
2791 }
2792
2793 // offset and limit can have control set below the pre loop when they are not loop invariant in the pre loop.
2794 // Update their control (and the control of inputs as needed) to be above pre_end
2795 offset_ctrl = ensure_node_and_inputs_are_above_pre_end(pre_end, offset);
2796 limit_ctrl = ensure_node_and_inputs_are_above_pre_end(pre_end, limit);
2797
2798 // offset and limit could have control below new_limit_ctrl if they are not loop invariant in the pre loop.
2799 Node* next_limit_ctrl = dominated_node(new_limit_ctrl, offset_ctrl, limit_ctrl);
2800
2801 #ifdef ASSERT
2802 if (TraceRangeLimitCheck) {
2803 tty->print_cr("RC bool node%s", flip ? " flipped:" : ":");
2804 bol->dump(2);
2805 }
2806 #endif
2807 // At this point we have the expression as:
2808 // scale_con * trip_counter + offset :: limit
2809 // where scale_con, offset and limit are loop invariant. Trip_counter
2810 // monotonically increases by stride_con, a constant. Both (or either)
2811 // stride_con and scale_con can be negative which will flip about the
2812 // sense of the test.
2813
2814 C->print_method(PHASE_BEFORE_RANGE_CHECK_ELIMINATION, 4, iff);
2815
2816 // Perform the limit computations in jlong to avoid overflow
2817 jlong lscale_con = scale_con;
2818 Node* int_offset = offset;
2819 offset = new ConvI2LNode(offset);
2820 register_new_node(offset, next_limit_ctrl);
2821 Node* int_limit = limit;
2822 limit = new ConvI2LNode(limit);
2823 register_new_node(limit, next_limit_ctrl);
2824
2825 // Adjust pre and main loop limits to guard the correct iteration set
2826 if (cmp->Opcode() == Op_CmpU) { // Unsigned compare is really 2 tests
2827 if (b_test._test == BoolTest::lt) { // Range checks always use lt
2828 // The underflow and overflow limits: 0 <= scale*I+offset < limit
2829 add_constraint(stride_con, lscale_con, offset, zero, limit, next_limit_ctrl, &pre_limit, &main_limit);
2830 Node* init = cl->uncasted_init_trip(true);
2831
2832 Node* opaque_init = new OpaqueLoopInitNode(C, init);
2833 register_new_node(opaque_init, loop_entry);
2834
2835 InitializedAssertionPredicateCreator initialized_assertion_predicate_creator(this);
2836 if (abs_stride_is_one) {
2837 // If the main loop becomes empty and the array access for this range check is sunk out of the loop, the index
2838 // for the array access will be set to the index value of the final iteration which could be out of loop.
2839 // Add an Initialized Assertion Predicate for that corner case. The final iv is computed from LoopLimit which
2840 // is the LoopNode::limit() only if abs(stride) == 1 otherwise the computation depends on LoopNode::init_trip()
2841 // as well. When LoopLimit only depends on LoopNode::limit(), there are cases where the zero trip guard for
2842 // the main loop doesn't constant fold after range check elimination but, the array access for the final
2843 // iteration of the main loop is out of bound and the index for that access is out of range for the range
2844 // check CastII.
2845 // Note that we do not need to emit a Template Assertion Predicate to update this predicate. When further
2846 // splitting this loop, the final IV will still be the same. When unrolling the loop, we will remove a
2847 // previously added Initialized Assertion Predicate here. But then abs(stride) is greater than 1, and we
2848 // cannot remove an empty loop with a constant limit when init is not a constant as well. We will use
2849 // a LoopLimitCheck node that can only be folded if the zero grip guard is also foldable.
2850 loop_entry = initialized_assertion_predicate_creator.create(final_iv_placeholder, loop_entry, stride_con,
2851 scale_con, int_offset, int_limit,
2852 AssertionPredicateType::FinalIv);
2853 }
2854
2855 // Add two Template Assertion Predicates to create new Initialized Assertion Predicates from when either
2856 // unrolling or splitting this main-loop further.
2857 TemplateAssertionPredicateCreator template_assertion_predicate_creator(cl, scale_con , int_offset, int_limit,
2858 this);
2859 loop_entry = template_assertion_predicate_creator.create(loop_entry);
2860
2861 // Initialized Assertion Predicate for the value of the initial main-loop.
2862 loop_entry = initialized_assertion_predicate_creator.create(init, loop_entry, stride_con, scale_con,
2863 int_offset, int_limit,
2864 AssertionPredicateType::InitValue);
2865
2866 } else {
2867 if (PrintOpto) {
2868 tty->print_cr("missed RCE opportunity");
2869 }
2870 continue; // In release mode, ignore it
2871 }
2872 } else { // Otherwise work on normal compares
2873 switch(b_test._test) {
2874 case BoolTest::gt:
2875 // Fall into GE case
2876 case BoolTest::ge:
2877 // Convert (I*scale+offset) >= Limit to (I*(-scale)+(-offset)) <= -Limit
2878 lscale_con = -lscale_con;
2879 offset = new SubLNode(zero, offset);
2880 register_new_node(offset, next_limit_ctrl);
2881 limit = new SubLNode(zero, limit);
2882 register_new_node(limit, next_limit_ctrl);
2883 // Fall into LE case
2884 case BoolTest::le:
2885 if (b_test._test != BoolTest::gt) {
2886 // Convert X <= Y to X < Y+1
2887 limit = new AddLNode(limit, one);
2888 register_new_node(limit, next_limit_ctrl);
2889 }
2890 // Fall into LT case
2891 case BoolTest::lt:
2892 // The underflow and overflow limits: MIN_INT <= scale*I+offset < limit
2893 // Note: (MIN_INT+1 == -MAX_INT) is used instead of MIN_INT here
2894 // to avoid problem with scale == -1: MIN_INT/(-1) == MIN_INT.
2895 add_constraint(stride_con, lscale_con, offset, mini, limit, next_limit_ctrl, &pre_limit, &main_limit);
2896 break;
2897 default:
2898 if (PrintOpto) {
2899 tty->print_cr("missed RCE opportunity");
2900 }
2901 continue; // Unhandled case
2902 }
2903 }
2904 // Only update variable tracking control for new nodes if it's indeed a range check that can be eliminated (and
2905 // limits are updated)
2906 new_limit_ctrl = next_limit_ctrl;
2907
2908 // Kill the eliminated test
2909 C->set_major_progress();
2910 Node* kill_con = intcon(1-flip);
2911 _igvn.replace_input_of(iff, 1, kill_con);
2912 // Find surviving projection
2913 assert(iff->is_If(), "");
2914 ProjNode* dp = ((IfNode*)iff)->proj_out(1-flip);
2915 // Find loads off the surviving projection; remove their control edge
2916 for (DUIterator_Fast imax, i = dp->fast_outs(imax); i < imax; i++) {
2917 Node* cd = dp->fast_out(i); // Control-dependent node
2918 if (cd->is_Load() && cd->depends_only_on_test()) { // Loads can now float around in the loop
2919 // Allow the load to float around in the loop, or before it
2920 // but NOT before the pre-loop.
2921 _igvn.replace_input_of(cd, 0, ctrl); // ctrl, not null
2922 --i;
2923 --imax;
2924 }
2925 }
2926 } // End of is IF
2927 }
2928 if (loop_entry != cl->skip_strip_mined()->in(LoopNode::EntryControl)) {
2929 _igvn.replace_input_of(cl->skip_strip_mined(), LoopNode::EntryControl, loop_entry);
2930 set_idom(cl->skip_strip_mined(), loop_entry, dom_depth(cl->skip_strip_mined()));
2931 }
2932
2933 // Update loop limits
2934 if (pre_limit != orig_limit) {
2935 // Computed pre-loop limit can be outside of loop iterations range.
2936 pre_limit = (stride_con > 0) ? (Node*)new MinINode(pre_limit, orig_limit)
2937 : (Node*)new MaxINode(pre_limit, orig_limit);
2938 register_new_node(pre_limit, new_limit_ctrl);
2939 }
2940 // new pre_limit can push Bool/Cmp/Opaque nodes down (when one of the eliminated condition has parameters that are not
2941 // loop invariant in the pre loop.
2942 set_ctrl(pre_opaq, new_limit_ctrl);
2943 // Can't use new_limit_ctrl for Bool/Cmp because it can be out of loop while they are loop variant. Conservatively set
2944 // control to latest possible one.
2945 set_ctrl(pre_end->cmp_node(), pre_end->in(0));
2946 set_ctrl(pre_end->in(1), pre_end->in(0));
2947
2948 _igvn.replace_input_of(pre_opaq, 1, pre_limit);
2949
2950 // Note:: we are making the main loop limit no longer precise;
2951 // need to round up based on stride.
2952 cl->set_nonexact_trip_count();
2953 Node *main_cle = cl->loopexit();
2954 Node *main_bol = main_cle->in(1);
2955 // Hacking loop bounds; need private copies of exit test
2956 if (main_bol->outcnt() > 1) { // BoolNode shared?
2957 main_bol = main_bol->clone(); // Clone a private BoolNode
2958 register_new_node(main_bol, main_cle->in(0));
2959 _igvn.replace_input_of(main_cle, 1, main_bol);
2960 }
2961 Node *main_cmp = main_bol->in(1);
2962 if (main_cmp->outcnt() > 1) { // CmpNode shared?
2963 main_cmp = main_cmp->clone(); // Clone a private CmpNode
2964 register_new_node(main_cmp, main_cle->in(0));
2965 _igvn.replace_input_of(main_bol, 1, main_cmp);
2966 }
2967 assert(main_limit == cl->limit() || get_ctrl(main_limit) == new_limit_ctrl, "wrong control for added limit");
2968 const TypeInt* orig_limit_t = _igvn.type(orig_limit)->is_int();
2969 bool upward = cl->stride_con() > 0;
2970 // The new loop limit is <= (for an upward loop) >= (for a downward loop) than the orig limit.
2971 // The expression that computes the new limit may be too complicated and the computed type of the new limit
2972 // may be too pessimistic. A CastII here guarantees it's not lost.
2973 main_limit = new CastIINode(pre_ctrl, main_limit, TypeInt::make(upward ? min_jint : orig_limit_t->_lo,
2974 upward ? orig_limit_t->_hi : max_jint, Type::WidenMax));
2975 register_new_node(main_limit, new_limit_ctrl);
2976 // Hack the now-private loop bounds
2977 _igvn.replace_input_of(main_cmp, 2, main_limit);
2978 if (abs_stride_is_one) {
2979 Node* final_iv = new SubINode(main_limit, cl->stride());
2980 register_new_node(final_iv, loop_entry);
2981 _igvn.replace_node(final_iv_placeholder, final_iv);
2982 }
2983 // The OpaqueNode is unshared by design
2984 assert(opqzm->outcnt() == 1, "cannot hack shared node");
2985 _igvn.replace_input_of(opqzm, 1, main_limit);
2986 // new main_limit can push opaque node for zero trip guard down (when one of the eliminated condition has parameters
2987 // that are not loop invariant in the pre loop).
2988 set_ctrl(opqzm, new_limit_ctrl);
2989 // Bool/Cmp nodes for zero trip guard should have been assigned control between the main and pre loop (because zero
2990 // trip guard depends on induction variable value out of pre loop) so shouldn't need to be adjusted
2991 assert(is_dominator(new_limit_ctrl, get_ctrl(iffm->in(1)->in(1))), "control of cmp should be below control of updated input");
2992
2993 C->print_method(PHASE_AFTER_RANGE_CHECK_ELIMINATION, 4, cl);
2994 }
2995
2996 // Adjust control for node and its inputs (and inputs of its inputs) to be above the pre end
2997 Node* PhaseIdealLoop::ensure_node_and_inputs_are_above_pre_end(CountedLoopEndNode* pre_end, Node* node) {
2998 Node* control = get_ctrl(node);
2999 assert(is_dominator(compute_early_ctrl(node, control), pre_end), "node pinned on loop exit test?");
3000
3001 if (is_dominator(control, pre_end)) {
3002 return control;
3003 }
3004 control = pre_end->in(0);
3005 ResourceMark rm;
3006 Unique_Node_List wq;
3007 wq.push(node);
3008 for (uint i = 0; i < wq.size(); i++) {
3009 Node* n = wq.at(i);
3010 assert(is_dominator(compute_early_ctrl(n, get_ctrl(n)), pre_end), "node pinned on loop exit test?");
3011 set_ctrl(n, control);
3012 for (uint j = 0; j < n->req(); j++) {
3013 Node* in = n->in(j);
3014 if (in != nullptr && has_ctrl(in) && !is_dominator(get_ctrl(in), pre_end)) {
3015 wq.push(in);
3016 }
3017 }
3018 }
3019 return control;
3020 }
3021
3022 bool IdealLoopTree::compute_has_range_checks() const {
3023 assert(_head->is_CountedLoop(), "");
3024 for (uint i = 0; i < _body.size(); i++) {
3025 Node *iff = _body[i];
3026 int iff_opc = iff->Opcode();
3027 if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
3028 return true;
3029 }
3030 }
3031 return false;
3032 }
3033
3034 //------------------------------DCE_loop_body----------------------------------
3035 // Remove simplistic dead code from loop body
3036 void IdealLoopTree::DCE_loop_body() {
3037 for (uint i = 0; i < _body.size(); i++) {
3038 if (_body.at(i)->outcnt() == 0) {
3039 _body.map(i, _body.pop());
3040 i--; // Ensure we revisit the updated index.
3041 }
3042 }
3043 }
3044
3045
3046 //------------------------------adjust_loop_exit_prob--------------------------
3047 // Look for loop-exit tests with the 50/50 (or worse) guesses from the parsing stage.
3048 // Replace with a 1-in-10 exit guess.
3049 void IdealLoopTree::adjust_loop_exit_prob(PhaseIdealLoop *phase) {
3050 Node *test = tail();
3051 while (test != _head) {
3052 uint top = test->Opcode();
3053 if (top == Op_IfTrue || top == Op_IfFalse) {
3054 int test_con = ((ProjNode*)test)->_con;
3055 assert(top == (uint)(test_con? Op_IfTrue: Op_IfFalse), "sanity");
3056 IfNode *iff = test->in(0)->as_If();
3057 if (iff->outcnt() == 2) { // Ignore dead tests
3058 Node *bol = iff->in(1);
3059 if (bol && bol->req() > 1 && bol->in(1) &&
3060 ((bol->in(1)->Opcode() == Op_CompareAndExchangeB) ||
3061 (bol->in(1)->Opcode() == Op_CompareAndExchangeS) ||
3062 (bol->in(1)->Opcode() == Op_CompareAndExchangeI) ||
3063 (bol->in(1)->Opcode() == Op_CompareAndExchangeL) ||
3064 (bol->in(1)->Opcode() == Op_CompareAndExchangeP) ||
3065 (bol->in(1)->Opcode() == Op_CompareAndExchangeN) ||
3066 (bol->in(1)->Opcode() == Op_WeakCompareAndSwapB) ||
3067 (bol->in(1)->Opcode() == Op_WeakCompareAndSwapS) ||
3068 (bol->in(1)->Opcode() == Op_WeakCompareAndSwapI) ||
3069 (bol->in(1)->Opcode() == Op_WeakCompareAndSwapL) ||
3070 (bol->in(1)->Opcode() == Op_WeakCompareAndSwapP) ||
3071 (bol->in(1)->Opcode() == Op_WeakCompareAndSwapN) ||
3072 (bol->in(1)->Opcode() == Op_CompareAndSwapB) ||
3073 (bol->in(1)->Opcode() == Op_CompareAndSwapS) ||
3074 (bol->in(1)->Opcode() == Op_CompareAndSwapI) ||
3075 (bol->in(1)->Opcode() == Op_CompareAndSwapL) ||
3076 (bol->in(1)->Opcode() == Op_CompareAndSwapP) ||
3077 (bol->in(1)->Opcode() == Op_CompareAndSwapN)))
3078 return; // Allocation loops RARELY take backedge
3079 // Find the OTHER exit path from the IF
3080 Node* ex = iff->proj_out(1-test_con);
3081 float p = iff->_prob;
3082 if (!phase->is_member(this, ex) && iff->_fcnt == COUNT_UNKNOWN) {
3083 if (top == Op_IfTrue) {
3084 if (p < (PROB_FAIR + PROB_UNLIKELY_MAG(3))) {
3085 iff->_prob = PROB_STATIC_FREQUENT;
3086 }
3087 } else {
3088 if (p > (PROB_FAIR - PROB_UNLIKELY_MAG(3))) {
3089 iff->_prob = PROB_STATIC_INFREQUENT;
3090 }
3091 }
3092 }
3093 }
3094 }
3095 test = phase->idom(test);
3096 }
3097 }
3098
3099 static CountedLoopNode* locate_pre_from_main(CountedLoopNode* main_loop) {
3100 assert(!main_loop->is_main_no_pre_loop(), "Does not have a pre loop");
3101 Node* ctrl = main_loop->skip_assertion_predicates_with_halt();
3102 assert(ctrl->Opcode() == Op_IfTrue || ctrl->Opcode() == Op_IfFalse, "");
3103 Node* iffm = ctrl->in(0);
3104 assert(iffm->Opcode() == Op_If, "");
3105 Node* p_f = iffm->in(0);
3106 assert(p_f->Opcode() == Op_IfFalse, "");
3107 CountedLoopNode* pre_loop = p_f->in(0)->as_CountedLoopEnd()->loopnode();
3108 assert(pre_loop->is_pre_loop(), "No pre loop found");
3109 return pre_loop;
3110 }
3111
3112 // Remove the main and post loops and make the pre loop execute all
3113 // iterations. Useful when the pre loop is found empty.
3114 void IdealLoopTree::remove_main_post_loops(CountedLoopNode *cl, PhaseIdealLoop *phase) {
3115 CountedLoopEndNode* pre_end = cl->loopexit();
3116 Node* pre_cmp = pre_end->cmp_node();
3117 if (pre_cmp->in(2)->Opcode() != Op_Opaque1) {
3118 // Only safe to remove the main loop if the compiler optimized it
3119 // out based on an unknown number of iterations
3120 return;
3121 }
3122
3123 // Can we find the main loop?
3124 if (_next == nullptr) {
3125 return;
3126 }
3127
3128 Node* next_head = _next->_head;
3129 if (!next_head->is_CountedLoop()) {
3130 return;
3131 }
3132
3133 CountedLoopNode* main_head = next_head->as_CountedLoop();
3134 if (!main_head->is_main_loop() || main_head->is_main_no_pre_loop()) {
3135 return;
3136 }
3137
3138 // We found a main-loop after this pre-loop, but they might not belong together.
3139 if (locate_pre_from_main(main_head) != cl) {
3140 return;
3141 }
3142
3143 Node* main_iff = main_head->skip_assertion_predicates_with_halt()->in(0);
3144
3145 // Remove the Opaque1Node of the pre loop and make it execute all iterations
3146 phase->_igvn.replace_input_of(pre_cmp, 2, pre_cmp->in(2)->in(2));
3147 // Remove the OpaqueZeroTripGuardNode of the main loop so it can be optimized out
3148 Node* main_cmp = main_iff->in(1)->in(1);
3149 assert(main_cmp->in(2)->Opcode() == Op_OpaqueZeroTripGuard, "main loop has no opaque node?");
3150 phase->_igvn.replace_input_of(main_cmp, 2, main_cmp->in(2)->in(1));
3151 }
3152
3153 //------------------------------do_remove_empty_loop---------------------------
3154 // We always attempt remove empty loops. The approach is to replace the trip
3155 // counter with the value it will have on the last iteration. This will break
3156 // the loop.
3157 bool IdealLoopTree::do_remove_empty_loop(PhaseIdealLoop *phase) {
3158 if (!_head->is_CountedLoop()) {
3159 return false; // Dead loop
3160 }
3161 if (!empty_loop_candidate(phase)) {
3162 return false;
3163 }
3164 CountedLoopNode *cl = _head->as_CountedLoop();
3165 #ifdef ASSERT
3166 // Call collect_loop_core_nodes to exercise the assert that checks that it finds the right number of nodes
3167 if (empty_loop_with_extra_nodes_candidate(phase)) {
3168 Unique_Node_List wq;
3169 collect_loop_core_nodes(phase, wq);
3170 }
3171 #endif
3172 // Minimum size must be empty loop
3173 if (_body.size() > EMPTY_LOOP_SIZE) {
3174 // This loop has more nodes than an empty loop but, maybe they are only kept alive by the outer strip mined loop's
3175 // safepoint. If they go away once the safepoint is removed, that loop is empty.
3176 if (!empty_loop_with_data_nodes(phase)) {
3177 return false;
3178 }
3179 }
3180 phase->C->print_method(PHASE_BEFORE_REMOVE_EMPTY_LOOP, 4, cl);
3181 if (cl->is_pre_loop()) {
3182 // If the loop we are removing is a pre-loop then the main and post loop
3183 // can be removed as well.
3184 remove_main_post_loops(cl, phase);
3185 }
3186
3187 #ifdef ASSERT
3188 // Ensure at most one used phi exists, which is the iv.
3189 Node* iv = nullptr;
3190 for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
3191 Node* n = cl->fast_out(i);
3192 if ((n->Opcode() == Op_Phi) && (n->outcnt() > 0)) {
3193 assert(iv == nullptr, "Too many phis");
3194 iv = n;
3195 }
3196 }
3197 assert(iv == cl->phi(), "Wrong phi");
3198 #endif
3199
3200 // main and post loops have explicitly created zero trip guard
3201 bool needs_guard = !cl->is_main_loop() && !cl->is_post_loop();
3202 if (needs_guard) {
3203 // Skip guard if values not overlap.
3204 const TypeInt* init_t = phase->_igvn.type(cl->init_trip())->is_int();
3205 const TypeInt* limit_t = phase->_igvn.type(cl->limit())->is_int();
3206 int stride_con = cl->stride_con();
3207 if (stride_con > 0) {
3208 needs_guard = (init_t->_hi >= limit_t->_lo);
3209 } else {
3210 needs_guard = (init_t->_lo <= limit_t->_hi);
3211 }
3212 }
3213 if (needs_guard) {
3214 // Check for an obvious zero trip guard.
3215 Predicates predicates(cl->skip_strip_mined()->in(LoopNode::EntryControl));
3216 Node* in_ctrl = predicates.entry();
3217 if (in_ctrl->Opcode() == Op_IfTrue || in_ctrl->Opcode() == Op_IfFalse) {
3218 bool maybe_swapped = (in_ctrl->Opcode() == Op_IfFalse);
3219 // The test should look like just the backedge of a CountedLoop
3220 Node* iff = in_ctrl->in(0);
3221 if (iff->is_If()) {
3222 Node* bol = iff->in(1);
3223 if (bol->is_Bool()) {
3224 BoolTest test = bol->as_Bool()->_test;
3225 if (maybe_swapped) {
3226 test._test = test.commute();
3227 test._test = test.negate();
3228 }
3229 if (test._test == cl->loopexit()->test_trip()) {
3230 Node* cmp = bol->in(1);
3231 int init_idx = maybe_swapped ? 2 : 1;
3232 int limit_idx = maybe_swapped ? 1 : 2;
3233 if (cmp->is_Cmp() && cmp->in(init_idx) == cl->init_trip() && cmp->in(limit_idx) == cl->limit()) {
3234 needs_guard = false;
3235 }
3236 }
3237 }
3238 }
3239 }
3240 }
3241
3242 #ifndef PRODUCT
3243 if (PrintOpto) {
3244 tty->print("Removing empty loop with%s zero trip guard", needs_guard ? "out" : "");
3245 this->dump_head();
3246 } else if (TraceLoopOpts) {
3247 tty->print("Empty with%s zero trip guard ", needs_guard ? "out" : "");
3248 this->dump_head();
3249 }
3250 #endif
3251
3252 if (needs_guard) {
3253 if (LoopPeeling == 0) {
3254 #ifndef PRODUCT
3255 if (TraceLoopOpts) {
3256 tty->print("Empty loop not removed since LoopPeeling is always disabled");
3257 this->dump_head();
3258 }
3259 #endif
3260 return false;
3261 }
3262 // Peel the loop to ensure there's a zero trip guard
3263 Node_List old_new;
3264 phase->do_peeling(this, old_new);
3265 }
3266
3267 // Replace the phi at loop head with the final value of the last
3268 // iteration (exact_limit - stride), to make sure the loop exit value
3269 // is correct, for any users after the loop.
3270 // Note: the final value after increment should not overflow since
3271 // counted loop has limit check predicate.
3272 Node* phi = cl->phi();
3273 Node* exact_limit = phase->exact_limit(this);
3274
3275 // We need to pin the exact limit to prevent it from floating above the zero trip guard.
3276 Node* cast_ii = ConstraintCastNode::make_cast_for_basic_type(
3277 cl->in(LoopNode::EntryControl), exact_limit,
3278 phase->_igvn.type(exact_limit),
3279 ConstraintCastNode::DependencyType::NonFloatingNonNarrowing, T_INT);
3280 phase->register_new_node(cast_ii, cl->in(LoopNode::EntryControl));
3281
3282 Node* final_iv = new SubINode(cast_ii, cl->stride());
3283 phase->register_new_node(final_iv, cl->in(LoopNode::EntryControl));
3284 phase->_igvn.replace_node(phi, final_iv);
3285
3286 // Set loop-exit condition to false. Then the CountedLoopEnd will collapse,
3287 // because the back edge is never taken.
3288 Node* zero = phase->_igvn.intcon(0);
3289 phase->_igvn.replace_input_of(cl->loopexit(), CountedLoopEndNode::TestValue, zero);
3290
3291 phase->C->set_major_progress();
3292 phase->C->print_method(PHASE_AFTER_REMOVE_EMPTY_LOOP, 4, final_iv);
3293 return true;
3294 }
3295
3296 bool IdealLoopTree::empty_loop_candidate(PhaseIdealLoop* phase) const {
3297 CountedLoopNode *cl = _head->as_CountedLoop();
3298 if (!cl->is_valid_counted_loop(T_INT)) {
3299 return false; // Malformed loop
3300 }
3301 if (!phase->ctrl_is_member(this, cl->loopexit()->in(CountedLoopEndNode::TestValue))) {
3302 return false; // Infinite loop
3303 }
3304 return true;
3305 }
3306
3307 bool IdealLoopTree::empty_loop_with_data_nodes(PhaseIdealLoop* phase) const {
3308 CountedLoopNode* cl = _head->as_CountedLoop();
3309 if (!cl->is_strip_mined() || !empty_loop_with_extra_nodes_candidate(phase)) {
3310 return false;
3311 }
3312 Unique_Node_List empty_loop_nodes;
3313 Unique_Node_List wq;
3314
3315 // Start from all data nodes in the loop body that are not one of the EMPTY_LOOP_SIZE nodes expected in an empty body
3316 enqueue_data_nodes(phase, empty_loop_nodes, wq);
3317 // and now follow uses
3318 for (uint i = 0; i < wq.size(); ++i) {
3319 Node* n = wq.at(i);
3320 for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) {
3321 Node* u = n->fast_out(j);
3322 if (u->Opcode() == Op_SafePoint) {
3323 // found a safepoint. Maybe this loop's safepoint or another loop safepoint.
3324 if (!process_safepoint(phase, empty_loop_nodes, wq, u)) {
3325 return false;
3326 }
3327 } else {
3328 const Type* u_t = phase->_igvn.type(u);
3329 if (u_t == Type::CONTROL || u_t == Type::MEMORY || u_t == Type::ABIO) {
3330 // found a side effect
3331 return false;
3332 }
3333 wq.push(u);
3334 }
3335 }
3336 }
3337 // Nodes (ignoring the EMPTY_LOOP_SIZE nodes of the "core" of the loop) are kept alive by otherwise empty loops'
3338 // safepoints: kill them.
3339 for (uint i = 0; i < wq.size(); ++i) {
3340 Node* n = wq.at(i);
3341 phase->_igvn.replace_node(n, phase->C->top());
3342 }
3343
3344 #ifdef ASSERT
3345 for (uint i = 0; i < _body.size(); ++i) {
3346 Node* n = _body.at(i);
3347 assert(wq.member(n) || empty_loop_nodes.member(n), "missed a node in the body?");
3348 }
3349 #endif
3350
3351 return true;
3352 }
3353
3354 bool IdealLoopTree::process_safepoint(PhaseIdealLoop* phase, Unique_Node_List& empty_loop_nodes, Unique_Node_List& wq,
3355 Node* sfpt) const {
3356 CountedLoopNode* cl = _head->as_CountedLoop();
3357 if (cl->outer_safepoint() == sfpt) {
3358 // the current loop's safepoint
3359 return true;
3360 }
3361
3362 // Some other loop's safepoint. Maybe that loop is empty too.
3363 IdealLoopTree* sfpt_loop = phase->get_loop(sfpt);
3364 if (!sfpt_loop->_head->is_OuterStripMinedLoop()) {
3365 return false;
3366 }
3367 IdealLoopTree* sfpt_inner_loop = sfpt_loop->_child;
3368 CountedLoopNode* sfpt_cl = sfpt_inner_loop->_head->as_CountedLoop();
3369 assert(sfpt_cl->is_strip_mined(), "inconsistent");
3370
3371 if (empty_loop_nodes.member(sfpt_cl)) {
3372 // already taken care of
3373 return true;
3374 }
3375
3376 if (!sfpt_inner_loop->empty_loop_candidate(phase) || !sfpt_inner_loop->empty_loop_with_extra_nodes_candidate(phase)) {
3377 return false;
3378 }
3379
3380 // Enqueue the nodes of that loop for processing too
3381 sfpt_inner_loop->enqueue_data_nodes(phase, empty_loop_nodes, wq);
3382 return true;
3383 }
3384
3385 bool IdealLoopTree::empty_loop_with_extra_nodes_candidate(PhaseIdealLoop* phase) const {
3386 CountedLoopNode *cl = _head->as_CountedLoop();
3387 // No other control flow node in the loop body
3388 if (cl->loopexit()->in(0) != cl) {
3389 return false;
3390 }
3391
3392 if (phase->ctrl_is_member(this, cl->limit())) {
3393 return false;
3394 }
3395 return true;
3396 }
3397
3398 void IdealLoopTree::enqueue_data_nodes(PhaseIdealLoop* phase, Unique_Node_List& empty_loop_nodes,
3399 Unique_Node_List& wq) const {
3400 collect_loop_core_nodes(phase, empty_loop_nodes);
3401 for (uint i = 0; i < _body.size(); ++i) {
3402 Node* n = _body.at(i);
3403 if (!empty_loop_nodes.member(n)) {
3404 wq.push(n);
3405 }
3406 }
3407 }
3408
3409 // This collects the node that would be left if this body was empty
3410 void IdealLoopTree::collect_loop_core_nodes(PhaseIdealLoop* phase, Unique_Node_List& wq) const {
3411 uint before = wq.size();
3412 wq.push(_head->in(LoopNode::LoopBackControl));
3413 for (uint i = before; i < wq.size(); ++i) {
3414 Node* n = wq.at(i);
3415 for (uint j = 0; j < n->req(); ++j) {
3416 Node* in = n->in(j);
3417 if (in != nullptr) {
3418 if (phase->get_loop(phase->ctrl_or_self(in)) == this) {
3419 wq.push(in);
3420 }
3421 }
3422 }
3423 }
3424 assert(wq.size() - before == EMPTY_LOOP_SIZE, "expect the EMPTY_LOOP_SIZE nodes of this body if empty");
3425 }
3426
3427 //------------------------------do_one_iteration_loop--------------------------
3428 // Convert one-iteration loop into normal code.
3429 bool IdealLoopTree::do_one_iteration_loop(PhaseIdealLoop *phase) {
3430 if (!_head->as_Loop()->is_valid_counted_loop(T_INT)) {
3431 return false; // Only for counted loop
3432 }
3433 CountedLoopNode *cl = _head->as_CountedLoop();
3434 if (!cl->has_exact_trip_count() || cl->trip_count() != 1) {
3435 return false;
3436 }
3437
3438 #ifndef PRODUCT
3439 if (TraceLoopOpts) {
3440 tty->print("OneIteration ");
3441 this->dump_head();
3442 }
3443 #endif
3444
3445 phase->C->print_method(PHASE_BEFORE_ONE_ITERATION_LOOP, 4, cl);
3446 Node *init_n = cl->init_trip();
3447 // Loop boundaries should be constant since trip count is exact.
3448 assert((cl->stride_con() > 0 && init_n->get_int() + cl->stride_con() >= cl->limit()->get_int()) ||
3449 (cl->stride_con() < 0 && init_n->get_int() + cl->stride_con() <= cl->limit()->get_int()), "should be one iteration");
3450 // Replace the phi at loop head with the value of the init_trip.
3451 // Then the CountedLoopEnd will collapse (backedge will not be taken)
3452 // and all loop-invariant uses of the exit values will be correct.
3453 phase->_igvn.replace_node(cl->phi(), cl->init_trip());
3454 phase->C->set_major_progress();
3455 phase->C->print_method(PHASE_AFTER_ONE_ITERATION_LOOP, 4, init_n);
3456 return true;
3457 }
3458
3459 //=============================================================================
3460 //------------------------------iteration_split_impl---------------------------
3461 bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_new) {
3462 if (!_head->is_Loop()) {
3463 // Head could be a region with a NeverBranch that was added in beautify loops but the region was not
3464 // yet transformed into a LoopNode. Bail out and wait until beautify loops turns it into a Loop node.
3465 return false;
3466 }
3467 // Compute loop trip count if possible.
3468 compute_trip_count(phase, T_INT);
3469
3470 // Convert one-iteration loop into normal code.
3471 if (do_one_iteration_loop(phase)) {
3472 return true;
3473 }
3474 // Check and remove empty loops (spam micro-benchmarks)
3475 if (do_remove_empty_loop(phase)) {
3476 return true; // Here we removed an empty loop
3477 }
3478
3479 AutoNodeBudget node_budget(phase);
3480
3481 // Non-counted loops may be peeled; exactly 1 iteration is peeled.
3482 // This removes loop-invariant tests (usually null checks).
3483 if (!_head->is_CountedLoop()) { // Non-counted loop
3484 if (PartialPeelLoop) {
3485 bool rc = phase->partial_peel(this, old_new);
3486 if (Compile::current()->failing()) { return false; }
3487 if (rc) {
3488 // Partial peel succeeded so terminate this round of loop opts
3489 return false;
3490 }
3491 }
3492 if (policy_peeling(phase)) { // Should we peel?
3493 if (PrintOpto) { tty->print_cr("should_peel"); }
3494 phase->do_peeling(this, old_new);
3495 } else if (policy_unswitching(phase)) {
3496 phase->do_unswitching(this, old_new);
3497 return false; // need to recalculate idom data
3498 } else if (phase->duplicate_loop_backedge(this, old_new)) {
3499 return false;
3500 } else if (_head->is_LongCountedLoop()) {
3501 phase->create_loop_nest(this, old_new);
3502 }
3503 return true;
3504 }
3505 CountedLoopNode *cl = _head->as_CountedLoop();
3506
3507 if (!cl->is_valid_counted_loop(T_INT)) return true; // Ignore various kinds of broken loops
3508
3509 // Do nothing special to pre- and post- loops
3510 if (cl->is_pre_loop() || cl->is_post_loop()) return true;
3511
3512 // With multiversioning, we create a fast_loop and a slow_loop, and a multiversion_if that
3513 // decides which loop is taken at runtime. At first, the multiversion_if always takes the
3514 // fast_loop, and we only optimize the fast_loop. Since we are not sure if we will ever use
3515 // the slow_loop, we delay optimizations for it, so we do not waste compile time and code
3516 // size. If we never change the condition of the multiversion_if, the slow_loop is eventually
3517 // folded away after loop-opts. While optimizing the fast_loop, we may want to perform some
3518 // speculative optimization, for which we need a runtime-check. We add this runtime-check
3519 // condition to the multiversion_if. Now, it becomes possible to execute the slow_loop at
3520 // runtime, and we resume optimizations for slow_loop ("un-delay" it).
3521 // TLDR: If the slow_loop is still in "delay" mode, check if the multiversion_if was changed
3522 // and we should now resume optimizations for it.
3523 if (cl->is_multiversion_delayed_slow_loop() &&
3524 !phase->try_resume_optimizations_for_delayed_slow_loop(this)) {
3525 // We are still delayed, so wait with further loop-opts.
3526 return true;
3527 }
3528
3529 // Compute loop trip count from profile data
3530 compute_profile_trip_cnt(phase);
3531
3532 // Before attempting fancy unrolling, RCE or alignment, see if we want
3533 // to completely unroll this loop or do loop unswitching.
3534 if (cl->is_normal_loop()) {
3535 if (policy_unswitching(phase)) {
3536 phase->do_unswitching(this, old_new);
3537 return false; // need to recalculate idom data
3538 }
3539 if (policy_maximally_unroll(phase)) {
3540 // Here we did some unrolling and peeling. Eventually we will
3541 // completely unroll this loop and it will no longer be a loop.
3542 phase->do_maximally_unroll(this, old_new);
3543 return true;
3544 }
3545 if (StressDuplicateBackedge && phase->duplicate_loop_backedge(this, old_new)) {
3546 return false;
3547 }
3548 }
3549
3550 uint est_peeling = estimate_peeling(phase);
3551 bool should_peel = 0 < est_peeling;
3552
3553 // Counted loops may be peeled, or may need some iterations run up
3554 // front for RCE. Thus we clone a full loop up front whose trip count is
3555 // at least 1 (if peeling), but may be several more.
3556
3557 // The main loop will start cache-line aligned with at least 1
3558 // iteration of the unrolled body (zero-trip test required) and
3559 // will have some range checks removed.
3560
3561 // A post-loop will finish any odd iterations (leftover after
3562 // unrolling), plus any needed for RCE purposes.
3563
3564 bool should_unroll = policy_unroll(phase);
3565 bool should_rce = policy_range_check(phase, false, T_INT);
3566 bool should_rce_long = policy_range_check(phase, false, T_LONG);
3567
3568 // If not RCE'ing (iteration splitting), then we do not need a pre-loop.
3569 // We may still need to peel an initial iteration but we will not
3570 // be needing an unknown number of pre-iterations.
3571 //
3572 // Basically, if peel_only reports TRUE first time through, we will not
3573 // be able to later do RCE on this loop.
3574 bool peel_only = policy_peel_only(phase) && !should_rce;
3575
3576 // If we have any of these conditions (RCE, unrolling) met, then
3577 // we switch to the pre-/main-/post-loop model. This model also covers
3578 // peeling.
3579 if (should_rce || should_unroll) {
3580 if (cl->is_normal_loop()) { // Convert to 'pre/main/post' loops
3581 if (should_rce_long && phase->create_loop_nest(this, old_new)) {
3582 return true;
3583 }
3584 uint estimate = est_loop_clone_sz(3);
3585 if (!phase->may_require_nodes(estimate)) {
3586 return false;
3587 }
3588
3589 if (!peel_only) {
3590 // We are going to add pre-loop and post-loop (PreMainPost).
3591 // But should we also multiversion for auto-vectorization speculative
3592 // checks, i.e. fast and slow-paths?
3593 // Note: Just PeelMainPost is not sufficient, as we could never find the
3594 // multiversion_if again from the main loop: we need a nicely structured
3595 // pre-loop, a peeled iteration cannot easily be parsed through.
3596 phase->maybe_multiversion_for_auto_vectorization_runtime_checks(this, old_new);
3597 }
3598
3599 phase->insert_pre_post_loops(this, old_new, peel_only);
3600 }
3601 // Adjust the pre- and main-loop limits to let the pre and post loops run
3602 // with full checks, but the main-loop with no checks. Remove said checks
3603 // from the main body.
3604 if (should_rce) {
3605 phase->do_range_check(this);
3606 }
3607
3608 // Double loop body for unrolling. Adjust the minimum-trip test (will do
3609 // twice as many iterations as before) and the main body limit (only do
3610 // an even number of trips). If we are peeling, we might enable some RCE
3611 // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
3612 // peeling.
3613 if (should_unroll && !should_peel) {
3614 if (SuperWordLoopUnrollAnalysis) {
3615 phase->insert_vector_post_loop(this, old_new);
3616 }
3617 phase->do_unroll(this, old_new, true);
3618 }
3619 } else { // Else we have an unchanged counted loop
3620 if (should_peel) { // Might want to peel but do nothing else
3621 if (phase->may_require_nodes(est_peeling)) {
3622 phase->do_peeling(this, old_new);
3623 }
3624 }
3625 if (should_rce_long) {
3626 phase->create_loop_nest(this, old_new);
3627 }
3628 }
3629 return true;
3630 }
3631
3632
3633 //=============================================================================
3634 //------------------------------iteration_split--------------------------------
3635 bool IdealLoopTree::iteration_split(PhaseIdealLoop* phase, Node_List &old_new) {
3636 // Recursively iteration split nested loops
3637 if (_child && !_child->iteration_split(phase, old_new)) {
3638 return false;
3639 }
3640
3641 // Clean out prior deadwood
3642 DCE_loop_body();
3643
3644 // Look for loop-exit tests with my 50/50 guesses from the Parsing stage.
3645 // Replace with a 1-in-10 exit guess.
3646 if (!is_root() && is_loop()) {
3647 adjust_loop_exit_prob(phase);
3648 }
3649
3650 // Unrolling, RCE and peeling efforts, iff innermost loop.
3651 if (_allow_optimizations && is_innermost()) {
3652 if (!_has_call) {
3653 if (!iteration_split_impl(phase, old_new)) {
3654 return false;
3655 }
3656 } else {
3657 AutoNodeBudget node_budget(phase);
3658 if (policy_unswitching(phase)) {
3659 phase->do_unswitching(this, old_new);
3660 return false; // need to recalculate idom data
3661 }
3662 }
3663 }
3664
3665 if (_next && !_next->iteration_split(phase, old_new)) {
3666 return false;
3667 }
3668 return true;
3669 }
3670
3671
3672 //=============================================================================
3673 // Process all the loops in the loop tree and replace any fill
3674 // patterns with an intrinsic version.
3675 bool PhaseIdealLoop::do_intrinsify_fill() {
3676 bool changed = false;
3677 for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
3678 IdealLoopTree* lpt = iter.current();
3679 changed |= intrinsify_fill(lpt);
3680 }
3681 return changed;
3682 }
3683
3684
3685 // Examine an inner loop looking for a single store of an invariant
3686 // value in a unit stride loop,
3687 bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
3688 Node*& shift, Node*& con) {
3689 const char* msg = nullptr;
3690 Node* msg_node = nullptr;
3691
3692 store_value = nullptr;
3693 con = nullptr;
3694 shift = nullptr;
3695
3696 // Process the loop looking for stores. If there are multiple
3697 // stores or extra control flow give at this point.
3698 CountedLoopNode* head = lpt->_head->as_CountedLoop();
3699 for (uint i = 0; msg == nullptr && i < lpt->_body.size(); i++) {
3700 Node* n = lpt->_body.at(i);
3701 if (n->outcnt() == 0) continue; // Ignore dead
3702 if (n->is_Store()) {
3703 if (store != nullptr) {
3704 msg = "multiple stores";
3705 break;
3706 }
3707 int opc = n->Opcode();
3708 if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreNKlass) {
3709 msg = "oop fills not handled";
3710 break;
3711 }
3712 Node* value = n->in(MemNode::ValueIn);
3713 if (!lpt->is_invariant(value)) {
3714 msg = "variant store value";
3715 } else if (!_igvn.type(n->in(MemNode::Address))->isa_aryptr()) {
3716 msg = "not array address";
3717 }
3718 store = n;
3719 store_value = value;
3720 } else if (n->is_If() && n != head->loopexit_or_null()) {
3721 msg = "extra control flow";
3722 msg_node = n;
3723 }
3724 }
3725
3726 if (store == nullptr) {
3727 // No store in loop
3728 return false;
3729 }
3730
3731 if (msg == nullptr && store->as_Mem()->is_mismatched_access()) {
3732 // This optimization does not currently support mismatched stores, where the
3733 // type of the value to be stored differs from the element type of the
3734 // destination array. Such patterns arise for example from memory segment
3735 // initialization. This limitation could be overcome by extending this
3736 // function's address matching logic and ensuring that the fill intrinsic
3737 // implementations support mismatched array filling.
3738 msg = "mismatched store";
3739 }
3740
3741 if (msg == nullptr && head->stride_con() != 1) {
3742 // could handle negative strides too
3743 if (head->stride_con() < 0) {
3744 msg = "negative stride";
3745 } else {
3746 msg = "non-unit stride";
3747 }
3748 }
3749
3750 if (msg == nullptr && !store->in(MemNode::Address)->is_AddP()) {
3751 msg = "can't handle store address";
3752 msg_node = store->in(MemNode::Address);
3753 }
3754
3755 if (msg == nullptr &&
3756 (!store->in(MemNode::Memory)->is_Phi() ||
3757 store->in(MemNode::Memory)->in(LoopNode::LoopBackControl) != store)) {
3758 msg = "store memory isn't proper phi";
3759 msg_node = store->in(MemNode::Memory);
3760 }
3761
3762 // Make sure there is an appropriate fill routine
3763 BasicType t = msg == nullptr ?
3764 store->adr_type()->isa_aryptr()->elem()->array_element_basic_type() : T_VOID;
3765 const char* fill_name;
3766 if (msg == nullptr &&
3767 StubRoutines::select_fill_function(t, false, fill_name) == nullptr) {
3768 msg = "unsupported store";
3769 msg_node = store;
3770 }
3771
3772 if (msg != nullptr) {
3773 #ifndef PRODUCT
3774 if (TraceOptimizeFill) {
3775 tty->print_cr("not fill intrinsic candidate: %s", msg);
3776 if (msg_node != nullptr) msg_node->dump();
3777 }
3778 #endif
3779 return false;
3780 }
3781
3782 // Make sure the address expression can be handled. It should be
3783 // head->phi * elsize + con. head->phi might have a ConvI2L(CastII()).
3784 Node* elements[4];
3785 Node* cast = nullptr;
3786 Node* conv = nullptr;
3787 bool found_index = false;
3788 int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
3789 for (int e = 0; e < count; e++) {
3790 Node* n = elements[e];
3791 if (n->is_Con() && con == nullptr) {
3792 con = n;
3793 } else if (n->Opcode() == Op_LShiftX && shift == nullptr) {
3794 Node* value = n->in(1);
3795 #ifdef _LP64
3796 if (value->Opcode() == Op_ConvI2L) {
3797 conv = value;
3798 value = value->in(1);
3799 }
3800 if (value->Opcode() == Op_CastII &&
3801 value->as_CastII()->has_range_check()) {
3802 // Skip range check dependent CastII nodes
3803 cast = value;
3804 value = value->in(1);
3805 }
3806 #endif
3807 if (value != head->phi()) {
3808 msg = "unhandled shift in address";
3809 } else {
3810 if (type2aelembytes(t, true) != (1 << n->in(2)->get_int())) {
3811 msg = "scale doesn't match";
3812 } else {
3813 found_index = true;
3814 shift = n;
3815 }
3816 }
3817 } else if (n->Opcode() == Op_ConvI2L && conv == nullptr) {
3818 conv = n;
3819 n = n->in(1);
3820 if (n->Opcode() == Op_CastII &&
3821 n->as_CastII()->has_range_check()) {
3822 // Skip range check dependent CastII nodes
3823 cast = n;
3824 n = n->in(1);
3825 }
3826 if (n == head->phi()) {
3827 found_index = true;
3828 } else {
3829 msg = "unhandled input to ConvI2L";
3830 }
3831 } else if (n == head->phi()) {
3832 // no shift, check below for allowed cases
3833 found_index = true;
3834 } else {
3835 msg = "unhandled node in address";
3836 msg_node = n;
3837 }
3838 }
3839
3840 if (count == -1) {
3841 msg = "malformed address expression";
3842 msg_node = store;
3843 }
3844
3845 if (!found_index) {
3846 msg = "missing use of index";
3847 }
3848
3849 // byte sized items won't have a shift
3850 if (msg == nullptr && shift == nullptr && t != T_BYTE && t != T_BOOLEAN) {
3851 msg = "can't find shift";
3852 msg_node = store;
3853 }
3854
3855 if (msg != nullptr) {
3856 #ifndef PRODUCT
3857 if (TraceOptimizeFill) {
3858 tty->print_cr("not fill intrinsic: %s", msg);
3859 if (msg_node != nullptr) msg_node->dump();
3860 }
3861 #endif
3862 return false;
3863 }
3864
3865 // No make sure all the other nodes in the loop can be handled
3866 VectorSet ok;
3867
3868 // store related values are ok
3869 ok.set(store->_idx);
3870 ok.set(store->in(MemNode::Memory)->_idx);
3871
3872 CountedLoopEndNode* loop_exit = head->loopexit();
3873
3874 // Loop structure is ok
3875 ok.set(head->_idx);
3876 ok.set(loop_exit->_idx);
3877 ok.set(head->phi()->_idx);
3878 ok.set(head->incr()->_idx);
3879 ok.set(loop_exit->cmp_node()->_idx);
3880 ok.set(loop_exit->in(1)->_idx);
3881
3882 // Address elements are ok
3883 if (con) ok.set(con->_idx);
3884 if (shift) ok.set(shift->_idx);
3885 if (cast) ok.set(cast->_idx);
3886 if (conv) ok.set(conv->_idx);
3887
3888 for (uint i = 0; msg == nullptr && i < lpt->_body.size(); i++) {
3889 Node* n = lpt->_body.at(i);
3890 if (n->outcnt() == 0) continue; // Ignore dead
3891 if (ok.test(n->_idx)) continue;
3892 // Backedge projection is ok
3893 if (n->is_IfTrue() && n->in(0) == loop_exit) continue;
3894 if (!n->is_AddP()) {
3895 msg = "unhandled node";
3896 msg_node = n;
3897 break;
3898 }
3899 }
3900
3901 // Make sure no unexpected values are used outside the loop
3902 for (uint i = 0; msg == nullptr && i < lpt->_body.size(); i++) {
3903 Node* n = lpt->_body.at(i);
3904 // These values can be replaced with other nodes if they are used
3905 // outside the loop.
3906 if (n == store || n == loop_exit || n == head->incr() || n == store->in(MemNode::Memory)) continue;
3907 for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) {
3908 Node* use = iter.get();
3909 if (!lpt->_body.contains(use)) {
3910 msg = "node is used outside loop";
3911 msg_node = n;
3912 break;
3913 }
3914 }
3915 }
3916
3917 #ifdef ASSERT
3918 if (TraceOptimizeFill) {
3919 if (msg != nullptr) {
3920 tty->print_cr("no fill intrinsic: %s", msg);
3921 if (msg_node != nullptr) msg_node->dump();
3922 } else {
3923 tty->print_cr("fill intrinsic for:");
3924 }
3925 store->dump();
3926 if (Verbose) {
3927 lpt->_body.dump();
3928 }
3929 }
3930 #endif
3931
3932 return msg == nullptr;
3933 }
3934
3935
3936
3937 bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) {
3938 // Only for counted inner loops
3939 if (!lpt->is_counted() || !lpt->is_innermost()) {
3940 return false;
3941 }
3942
3943 // Must have constant stride
3944 CountedLoopNode* head = lpt->_head->as_CountedLoop();
3945 if (!head->is_valid_counted_loop(T_INT) || !head->is_normal_loop()) {
3946 return false;
3947 }
3948
3949 head->verify_strip_mined(1);
3950
3951 // Check that the body only contains a store of a loop invariant
3952 // value that is indexed by the loop phi.
3953 Node* store = nullptr;
3954 Node* store_value = nullptr;
3955 Node* shift = nullptr;
3956 Node* offset = nullptr;
3957 if (!match_fill_loop(lpt, store, store_value, shift, offset)) {
3958 return false;
3959 }
3960
3961 IfFalseNode* exit = head->loopexit()->false_proj_or_null();
3962 if (exit == nullptr) {
3963 return false;
3964 }
3965
3966 #ifndef PRODUCT
3967 if (TraceLoopOpts) {
3968 tty->print("ArrayFill ");
3969 lpt->dump_head();
3970 }
3971 #endif
3972
3973 // Now replace the whole loop body by a call to a fill routine that
3974 // covers the same region as the loop.
3975 Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base);
3976
3977 // Build an expression for the beginning of the copy region
3978 Node* index = head->init_trip();
3979 #ifdef _LP64
3980 index = new ConvI2LNode(index);
3981 _igvn.register_new_node_with_optimizer(index);
3982 #endif
3983 if (shift != nullptr) {
3984 // byte arrays don't require a shift but others do.
3985 index = new LShiftXNode(index, shift->in(2));
3986 _igvn.register_new_node_with_optimizer(index);
3987 }
3988 Node* from = AddPNode::make_with_base(base, index);
3989 _igvn.register_new_node_with_optimizer(from);
3990 // For normal array fills, C2 uses two AddP nodes for array element
3991 // addressing. But for array fills with Unsafe call, there's only one
3992 // AddP node adding an absolute offset, so we do a null check here.
3993 assert(offset != nullptr || C->has_unsafe_access(),
3994 "Only array fills with unsafe have no extra offset");
3995 if (offset != nullptr) {
3996 from = AddPNode::make_with_base(base, from, offset);
3997 _igvn.register_new_node_with_optimizer(from);
3998 }
3999 // Compute the number of elements to copy
4000 Node* len = new SubINode(head->limit(), head->init_trip());
4001 _igvn.register_new_node_with_optimizer(len);
4002
4003 // If the store is on the backedge, it is not executed in the last
4004 // iteration, and we must subtract 1 from the len.
4005 IfTrueNode* backedge = head->loopexit()->true_proj();
4006 if (store->in(0) == backedge) {
4007 len = new SubINode(len, _igvn.intcon(1));
4008 _igvn.register_new_node_with_optimizer(len);
4009 #ifndef PRODUCT
4010 if (TraceOptimizeFill) {
4011 tty->print_cr("ArrayFill store on backedge, subtract 1 from len.");
4012 }
4013 #endif
4014 }
4015
4016 BasicType t = store->adr_type()->isa_aryptr()->elem()->array_element_basic_type();
4017 bool aligned = false;
4018 if (offset != nullptr && head->init_trip()->is_Con()) {
4019 int element_size = type2aelembytes(t);
4020 aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0;
4021 }
4022
4023 // Build a call to the fill routine
4024 const char* fill_name;
4025 address fill = StubRoutines::select_fill_function(t, aligned, fill_name);
4026 assert(fill != nullptr, "what?");
4027
4028 // Convert float/double to int/long for fill routines
4029 if (t == T_FLOAT) {
4030 store_value = new MoveF2INode(store_value);
4031 _igvn.register_new_node_with_optimizer(store_value);
4032 } else if (t == T_DOUBLE) {
4033 store_value = new MoveD2LNode(store_value);
4034 _igvn.register_new_node_with_optimizer(store_value);
4035 }
4036
4037 Node* mem_phi = store->in(MemNode::Memory);
4038 Node* result_ctrl;
4039 Node* result_mem;
4040 const TypeFunc* call_type = OptoRuntime::array_fill_Type();
4041 CallLeafNode *call = new CallLeafNoFPNode(call_type, fill,
4042 fill_name, TypeAryPtr::get_array_body_type(t));
4043 uint cnt = 0;
4044 call->init_req(TypeFunc::Parms + cnt++, from);
4045 call->init_req(TypeFunc::Parms + cnt++, store_value);
4046 #ifdef _LP64
4047 len = new ConvI2LNode(len);
4048 _igvn.register_new_node_with_optimizer(len);
4049 #endif
4050 call->init_req(TypeFunc::Parms + cnt++, len);
4051 #ifdef _LP64
4052 call->init_req(TypeFunc::Parms + cnt++, C->top());
4053 #endif
4054 call->init_req(TypeFunc::Control, head->init_control());
4055 call->init_req(TypeFunc::I_O, C->top()); // Does no I/O.
4056 call->init_req(TypeFunc::Memory, mem_phi->in(LoopNode::EntryControl));
4057 call->init_req(TypeFunc::ReturnAdr, C->start()->proj_out_or_null(TypeFunc::ReturnAdr));
4058 Node* frame = new ParmNode(C->start(), TypeFunc::FramePtr);
4059 _igvn.register_new_node_with_optimizer(frame);
4060 call->init_req(TypeFunc::FramePtr, frame);
4061 _igvn.register_new_node_with_optimizer(call);
4062 result_ctrl = new ProjNode(call,TypeFunc::Control);
4063 _igvn.register_new_node_with_optimizer(result_ctrl);
4064 result_mem = new ProjNode(call,TypeFunc::Memory);
4065 _igvn.register_new_node_with_optimizer(result_mem);
4066
4067 /* Disable following optimization until proper fix (add missing checks).
4068
4069 // If this fill is tightly coupled to an allocation and overwrites
4070 // the whole body, allow it to take over the zeroing.
4071 AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this);
4072 if (alloc != nullptr && alloc->is_AllocateArray()) {
4073 Node* length = alloc->as_AllocateArray()->Ideal_length();
4074 if (head->limit() == length &&
4075 head->init_trip() == _igvn.intcon(0)) {
4076 if (TraceOptimizeFill) {
4077 tty->print_cr("Eliminated zeroing in allocation");
4078 }
4079 alloc->maybe_set_complete(&_igvn);
4080 } else {
4081 #ifdef ASSERT
4082 if (TraceOptimizeFill) {
4083 tty->print_cr("filling array but bounds don't match");
4084 alloc->dump();
4085 head->init_trip()->dump();
4086 head->limit()->dump();
4087 length->dump();
4088 }
4089 #endif
4090 }
4091 }
4092 */
4093
4094 if (head->is_strip_mined()) {
4095 // Inner strip mined loop goes away so get rid of outer strip
4096 // mined loop
4097 Node* outer_sfpt = head->outer_safepoint();
4098 Node* in = outer_sfpt->in(0);
4099 Node* outer_out = head->outer_loop_exit();
4100 replace_node_and_forward_ctrl(outer_out, in);
4101 _igvn.replace_input_of(outer_sfpt, 0, C->top());
4102 }
4103
4104 // Redirect the old control and memory edges that are outside the loop.
4105 // Sometimes the memory phi of the head is used as the outgoing
4106 // state of the loop. It's safe in this case to replace it with the
4107 // result_mem.
4108 _igvn.replace_node(store->in(MemNode::Memory), result_mem);
4109 replace_node_and_forward_ctrl(exit, result_ctrl);
4110 _igvn.replace_node(store, result_mem);
4111 // Any uses the increment outside of the loop become the loop limit.
4112 _igvn.replace_node(head->incr(), head->limit());
4113
4114 // Disconnect the head from the loop.
4115 for (uint i = 0; i < lpt->_body.size(); i++) {
4116 Node* n = lpt->_body.at(i);
4117 _igvn.replace_node(n, C->top());
4118 }
4119
4120 #ifndef PRODUCT
4121 if (TraceOptimizeFill) {
4122 tty->print("ArrayFill call ");
4123 call->dump();
4124 }
4125 #endif
4126
4127 return true;
4128 }