1 | /* |
2 | * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. |
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | * |
5 | * This code is free software; you can redistribute it and/or modify it |
6 | * under the terms of the GNU General Public License version 2 only, as |
7 | * published by the Free Software Foundation. |
8 | * |
9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
12 | * version 2 for more details (a copy is included in the LICENSE file that |
13 | * accompanied this code). |
14 | * |
15 | * You should have received a copy of the GNU General Public License version |
16 | * 2 along with this work; if not, write to the Free Software Foundation, |
17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
18 | * |
19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
20 | * or visit www.oracle.com if you need additional information or have any |
21 | * questions. |
22 | * |
23 | */ |
24 | |
25 | #include "precompiled.hpp" |
26 | #include "libadt/vectset.hpp" |
27 | #include "memory/allocation.inline.hpp" |
28 | #include "memory/resourceArea.hpp" |
29 | #include "opto/block.hpp" |
30 | #include "opto/c2compiler.hpp" |
31 | #include "opto/callnode.hpp" |
32 | #include "opto/cfgnode.hpp" |
33 | #include "opto/machnode.hpp" |
34 | #include "opto/opcodes.hpp" |
35 | #include "opto/phaseX.hpp" |
36 | #include "opto/rootnode.hpp" |
37 | #include "opto/runtime.hpp" |
38 | #include "opto/chaitin.hpp" |
39 | #include "runtime/deoptimization.hpp" |
40 | |
41 | // Portions of code courtesy of Clifford Click |
42 | |
43 | // Optimization - Graph Style |
44 | |
45 | // To avoid float value underflow |
46 | #define MIN_BLOCK_FREQUENCY 1.e-35f |
47 | |
48 | //----------------------------schedule_node_into_block------------------------- |
49 | // Insert node n into block b. Look for projections of n and make sure they |
50 | // are in b also. |
51 | void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) { |
52 | // Set basic block of n, Add n to b, |
53 | map_node_to_block(n, b); |
54 | b->add_inst(n); |
55 | |
56 | // After Matching, nearly any old Node may have projections trailing it. |
57 | // These are usually machine-dependent flags. In any case, they might |
58 | // float to another block below this one. Move them up. |
59 | for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { |
60 | Node* use = n->fast_out(i); |
61 | if (use->is_Proj()) { |
62 | Block* buse = get_block_for_node(use); |
63 | if (buse != b) { // In wrong block? |
64 | if (buse != NULL) { |
65 | buse->find_remove(use); // Remove from wrong block |
66 | } |
67 | map_node_to_block(use, b); |
68 | b->add_inst(use); |
69 | } |
70 | } |
71 | } |
72 | } |
73 | |
74 | //----------------------------replace_block_proj_ctrl------------------------- |
75 | // Nodes that have is_block_proj() nodes as their control need to use |
76 | // the appropriate Region for their actual block as their control since |
77 | // the projection will be in a predecessor block. |
78 | void PhaseCFG::replace_block_proj_ctrl( Node *n ) { |
79 | const Node *in0 = n->in(0); |
80 | assert(in0 != NULL, "Only control-dependent" ); |
81 | const Node *p = in0->is_block_proj(); |
82 | if (p != NULL && p != n) { // Control from a block projection? |
83 | assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here" ); |
84 | // Find trailing Region |
85 | Block *pb = get_block_for_node(in0); // Block-projection already has basic block |
86 | uint j = 0; |
  if (pb->_num_succs != 1) {  // More than 1 successor?
88 | // Search for successor |
89 | uint max = pb->number_of_nodes(); |
90 | assert( max > 1, "" ); |
91 | uint start = max - pb->_num_succs; |
92 | // Find which output path belongs to projection |
93 | for (j = start; j < max; j++) { |
94 | if( pb->get_node(j) == in0 ) |
95 | break; |
96 | } |
97 | assert( j < max, "must find" ); |
98 | // Change control to match head of successor basic block |
99 | j -= start; |
100 | } |
101 | n->set_req(0, pb->_succs[j]->head()); |
102 | } |
103 | } |
104 | |
105 | bool PhaseCFG::is_dominator(Node* dom_node, Node* node) { |
106 | if (dom_node == node) { |
107 | return true; |
108 | } |
109 | Block* d = get_block_for_node(dom_node); |
110 | Block* n = get_block_for_node(node); |
111 | if (d == n) { |
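    // Both nodes are in the same block: decide from block-local structure.
    // A block-start node (e.g. a Region) precedes everything else in its
    // block, while a block-projection ends its block and therefore cannot
    // dominate anything else placed in it.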
112 | if (dom_node->is_block_start()) { |
113 | return true; |
114 | } |
115 | if (node->is_block_start()) { |
116 | return false; |
117 | } |
118 | if (dom_node->is_block_proj()) { |
119 | return false; |
120 | } |
121 | if (node->is_block_proj()) { |
122 | return true; |
123 | } |
124 | #ifdef ASSERT |
125 | node->dump(); |
126 | dom_node->dump(); |
127 | #endif |
128 | fatal("unhandled" ); |
129 | return false; |
130 | } |
131 | return d->dom_lca(n) == d; |
132 | } |
133 | |
134 | //------------------------------schedule_pinned_nodes-------------------------- |
135 | // Set the basic block for Nodes pinned into blocks |
136 | void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) { |
137 | // Allocate node stack of size C->live_nodes()+8 to avoid frequent realloc |
138 | GrowableArray <Node *> spstack(C->live_nodes() + 8); |
139 | spstack.push(_root); |
140 | while (spstack.is_nonempty()) { |
141 | Node* node = spstack.pop(); |
142 | if (!visited.test_set(node->_idx)) { // Test node and flag it as visited |
143 | if (node->pinned() && !has_block(node)) { // Pinned? Nail it down! |
144 | assert(node->in(0), "pinned Node must have Control" ); |
145 | // Before setting block replace block_proj control edge |
146 | replace_block_proj_ctrl(node); |
147 | Node* input = node->in(0); |
148 | while (!input->is_block_start()) { |
149 | input = input->in(0); |
150 | } |
151 | Block* block = get_block_for_node(input); // Basic block of controlling input |
152 | schedule_node_into_block(node, block); |
153 | } |
154 | |
155 | // If the node has precedence edges (added when CastPP nodes are |
156 | // removed in final_graph_reshaping), fix the control of the |
157 | // node to cover the precedence edges and remove the |
158 | // dependencies. |
159 | Node* n = NULL; |
160 | for (uint i = node->len()-1; i >= node->req(); i--) { |
161 | Node* m = node->in(i); |
162 | if (m == NULL) continue; |
163 | // Skip the precedence edge if the test that guarded a CastPP: |
164 | // - was optimized out during escape analysis |
165 | // (OptimizePtrCompare): the CastPP's control isn't an end of |
166 | // block. |
167 | // - is moved in the branch of a dominating If: the control of |
168 | // the CastPP is then a Region. |
169 | if (m->is_block_proj() || m->is_block_start()) { |
170 | node->rm_prec(i); |
171 | if (n == NULL) { |
172 | n = m; |
173 | } else { |
174 | assert(is_dominator(n, m) || is_dominator(m, n), "one must dominate the other" ); |
175 | n = is_dominator(n, m) ? m : n; |
176 | } |
177 | } |
178 | } |
179 | if (n != NULL) { |
180 | assert(node->in(0), "control should have been set" ); |
181 | assert(is_dominator(n, node->in(0)) || is_dominator(node->in(0), n), "one must dominate the other" ); |
182 | if (!is_dominator(n, node->in(0))) { |
183 | node->set_req(0, n); |
184 | } |
185 | } |
186 | |
187 | // process all inputs that are non NULL |
188 | for (int i = node->req() - 1; i >= 0; --i) { |
189 | if (node->in(i) != NULL) { |
190 | spstack.push(node->in(i)); |
191 | } |
192 | } |
193 | } |
194 | } |
195 | } |
196 | |
197 | #ifdef ASSERT |
198 | // Assert that new input b2 is dominated by all previous inputs. |
// Check this by seeing that b2 is dominated by b1, the deepest
// input observed before b2.
201 | static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) { |
202 | if (b1 == NULL) return; |
203 | assert(b1->_dom_depth < b2->_dom_depth, "sanity" ); |
204 | Block* tmp = b2; |
205 | while (tmp != b1 && tmp != NULL) { |
206 | tmp = tmp->_idom; |
207 | } |
208 | if (tmp != b1) { |
209 | // Detected an unschedulable graph. Print some nice stuff and die. |
210 | tty->print_cr("!!! Unschedulable graph !!!" ); |
211 | for (uint j=0; j<n->len(); j++) { // For all inputs |
212 | Node* inn = n->in(j); // Get input |
213 | if (inn == NULL) continue; // Ignore NULL, missing inputs |
214 | Block* inb = cfg->get_block_for_node(inn); |
215 | tty->print("B%d idom=B%d depth=%2d " ,inb->_pre_order, |
216 | inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth); |
217 | inn->dump(); |
218 | } |
219 | tty->print("Failing node: " ); |
220 | n->dump(); |
    assert(false, "unschedulable graph");
222 | } |
223 | } |
224 | #endif |
225 | |
226 | static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) { |
227 | // Find the last input dominated by all other inputs. |
228 | Block* deepb = NULL; // Deepest block so far |
229 | int deepb_dom_depth = 0; |
230 | for (uint k = 0; k < n->len(); k++) { // For all inputs |
231 | Node* inn = n->in(k); // Get input |
232 | if (inn == NULL) continue; // Ignore NULL, missing inputs |
233 | Block* inb = cfg->get_block_for_node(inn); |
234 | assert(inb != NULL, "must already have scheduled this input" ); |
235 | if (deepb_dom_depth < (int) inb->_dom_depth) { |
236 | // The new inb must be dominated by the previous deepb. |
237 | // The various inputs must be linearly ordered in the dom |
238 | // tree, or else there will not be a unique deepest block. |
239 | DEBUG_ONLY(assert_dom(deepb, inb, n, cfg)); |
240 | deepb = inb; // Save deepest block |
241 | deepb_dom_depth = deepb->_dom_depth; |
242 | } |
243 | } |
244 | assert(deepb != NULL, "must be at least one input to n" ); |
245 | return deepb; |
246 | } |
247 | |
248 | |
249 | //------------------------------schedule_early--------------------------------- |
250 | // Find the earliest Block any instruction can be placed in. Some instructions |
// are pinned into Blocks. Unpinned instructions can appear in the last block
// in which all their inputs occur.
253 | bool PhaseCFG::schedule_early(VectorSet &visited, Node_Stack &roots) { |
254 | // Allocate stack with enough space to avoid frequent realloc |
255 | Node_Stack nstack(roots.size() + 8); |
256 | // _root will be processed among C->top() inputs |
257 | roots.push(C->top(), 0); |
258 | visited.set(C->top()->_idx); |
259 | |
260 | while (roots.size() != 0) { |
261 | // Use local variables nstack_top_n & nstack_top_i to cache values |
262 | // on stack's top. |
263 | Node* parent_node = roots.node(); |
264 | uint input_index = 0; |
265 | roots.pop(); |
266 | |
267 | while (true) { |
268 | if (input_index == 0) { |
269 | // Fixup some control. Constants without control get attached |
        // to root, and nodes that use is_block_proj() nodes should be attached
271 | // to the region that starts their block. |
272 | const Node* control_input = parent_node->in(0); |
273 | if (control_input != NULL) { |
274 | replace_block_proj_ctrl(parent_node); |
275 | } else { |
276 | // Is a constant with NO inputs? |
277 | if (parent_node->req() == 1) { |
278 | parent_node->set_req(0, _root); |
279 | } |
280 | } |
281 | } |
282 | |
283 | // First, visit all inputs and force them to get a block. If an |
284 | // input is already in a block we quit following inputs (to avoid |
285 | // cycles). Instead we put that Node on a worklist to be handled |
      // later (since ITS inputs may not have a block yet).
287 | |
288 | // Assume all n's inputs will be processed |
289 | bool done = true; |
290 | |
291 | while (input_index < parent_node->len()) { |
292 | Node* in = parent_node->in(input_index++); |
293 | if (in == NULL) { |
294 | continue; |
295 | } |
296 | |
297 | int is_visited = visited.test_set(in->_idx); |
298 | if (!has_block(in)) { |
299 | if (is_visited) { |
300 | assert(false, "graph should be schedulable" ); |
301 | return false; |
302 | } |
303 | // Save parent node and next input's index. |
304 | nstack.push(parent_node, input_index); |
305 | // Process current input now. |
306 | parent_node = in; |
307 | input_index = 0; |
308 | // Not all n's inputs processed. |
309 | done = false; |
310 | break; |
311 | } else if (!is_visited) { |
312 | // Visit this guy later, using worklist |
313 | roots.push(in, 0); |
314 | } |
315 | } |
316 | |
317 | if (done) { |
318 | // All of n's inputs have been processed, complete post-processing. |
319 | |
320 | // Some instructions are pinned into a block. These include Region, |
321 | // Phi, Start, Return, and other control-dependent instructions and |
322 | // any projections which depend on them. |
323 | if (!parent_node->pinned()) { |
324 | // Set earliest legal block. |
325 | Block* earliest_block = find_deepest_input(parent_node, this); |
326 | map_node_to_block(parent_node, earliest_block); |
327 | } else { |
328 | assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge" ); |
329 | } |
330 | |
331 | if (nstack.is_empty()) { |
332 | // Finished all nodes on stack. |
333 | // Process next node on the worklist 'roots'. |
334 | break; |
335 | } |
336 | // Get saved parent node and next input's index. |
337 | parent_node = nstack.node(); |
338 | input_index = nstack.index(); |
339 | nstack.pop(); |
340 | } |
341 | } |
342 | } |
343 | return true; |
344 | } |
345 | |
346 | //------------------------------dom_lca---------------------------------------- |
347 | // Find least common ancestor in dominator tree |
348 | // LCA is a current notion of LCA, to be raised above 'this'. |
349 | // As a convenient boundary condition, return 'this' if LCA is NULL. |
350 | // Find the LCA of those two nodes. |
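// Example: if 'this' is at dom_depth 5 and LCA at dom_depth 3, 'this' is first
// walked up two levels; then both blocks are walked up in lock-step until they
// meet at the least common ancestor.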
351 | Block* Block::dom_lca(Block* LCA) { |
352 | if (LCA == NULL || LCA == this) return this; |
353 | |
354 | Block* anc = this; |
355 | while (anc->_dom_depth > LCA->_dom_depth) |
356 | anc = anc->_idom; // Walk up till anc is as high as LCA |
357 | |
358 | while (LCA->_dom_depth > anc->_dom_depth) |
359 | LCA = LCA->_idom; // Walk up till LCA is as high as anc |
360 | |
361 | while (LCA != anc) { // Walk both up till they are the same |
362 | LCA = LCA->_idom; |
363 | anc = anc->_idom; |
364 | } |
365 | |
366 | return LCA; |
367 | } |
368 | |
369 | //--------------------------raise_LCA_above_use-------------------------------- |
370 | // We are placing a definition, and have been given a def->use edge. |
371 | // The definition must dominate the use, so move the LCA upward in the |
372 | // dominator tree to dominate the use. If the use is a phi, adjust |
373 | // the LCA only with the phi input paths which actually use this def. |
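// For example, if 'def' flows into a Phi only through inputs 2 and 5, the LCA
// is raised to dominate the predecessor blocks of paths 2 and 5 rather than
// the Phi's own block.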
374 | static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) { |
375 | Block* buse = cfg->get_block_for_node(use); |
376 | if (buse == NULL) return LCA; // Unused killing Projs have no use block |
377 | if (!use->is_Phi()) return buse->dom_lca(LCA); |
378 | uint pmax = use->req(); // Number of Phi inputs |
  // Why doesn't this loop just break after finding the matching input to
380 | // the Phi? Well...it's like this. I do not have true def-use/use-def |
381 | // chains. Means I cannot distinguish, from the def-use direction, which |
382 | // of many use-defs lead from the same use to the same def. That is, this |
383 | // Phi might have several uses of the same def. Each use appears in a |
384 | // different predecessor block. But when I enter here, I cannot distinguish |
385 | // which use-def edge I should find the predecessor block for. So I find |
386 | // them all. Means I do a little extra work if a Phi uses the same value |
387 | // more than once. |
388 | for (uint j=1; j<pmax; j++) { // For all inputs |
389 | if (use->in(j) == def) { // Found matching input? |
390 | Block* pred = cfg->get_block_for_node(buse->pred(j)); |
391 | LCA = pred->dom_lca(LCA); |
392 | } |
393 | } |
394 | return LCA; |
395 | } |
396 | |
397 | //----------------------------raise_LCA_above_marks---------------------------- |
398 | // Return a new LCA that dominates LCA and any of its marked predecessors. |
399 | // Search all my parents up to 'early' (exclusive), looking for predecessors |
400 | // which are marked with the given index. Return the LCA (in the dom tree) |
401 | // of all marked blocks. If there are none marked, return the original |
402 | // LCA. |
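// The marks are placed beforehand (e.g. insert_anti_dependences calls
// set_raise_LCA_mark with the load's node index); this routine only consumes
// them while walking predecessors from LCA up toward 'early'.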
403 | static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) { |
404 | Block_List worklist; |
405 | worklist.push(LCA); |
406 | while (worklist.size() > 0) { |
407 | Block* mid = worklist.pop(); |
408 | if (mid == early) continue; // stop searching here |
409 | |
410 | // Test and set the visited bit. |
411 | if (mid->raise_LCA_visited() == mark) continue; // already visited |
412 | |
413 | // Don't process the current LCA, otherwise the search may terminate early |
414 | if (mid != LCA && mid->raise_LCA_mark() == mark) { |
415 | // Raise the LCA. |
416 | LCA = mid->dom_lca(LCA); |
417 | if (LCA == early) break; // stop searching everywhere |
418 | assert(early->dominates(LCA), "early is high enough" ); |
419 | // Resume searching at that point, skipping intermediate levels. |
420 | worklist.push(LCA); |
421 | if (LCA == mid) |
422 | continue; // Don't mark as visited to avoid early termination. |
423 | } else { |
424 | // Keep searching through this block's predecessors. |
425 | for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) { |
426 | Block* mid_parent = cfg->get_block_for_node(mid->pred(j)); |
427 | worklist.push(mid_parent); |
428 | } |
429 | } |
430 | mid->set_raise_LCA_visited(mark); |
431 | } |
432 | return LCA; |
433 | } |
434 | |
435 | //--------------------------memory_early_block-------------------------------- |
436 | // This is a variation of find_deepest_input, the heart of schedule_early. |
437 | // Find the "early" block for a load, if we considered only memory and |
438 | // address inputs, that is, if other data inputs were ignored. |
439 | // |
440 | // Because a subset of edges are considered, the resulting block will |
441 | // be earlier (at a shallower dom_depth) than the true schedule_early |
442 | // point of the node. We compute this earlier block as a more permissive |
443 | // site for anti-dependency insertion, but only if subsume_loads is enabled. |
444 | static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) { |
445 | Node* base; |
446 | Node* index; |
447 | Node* store = load->in(MemNode::Memory); |
448 | load->as_Mach()->memory_inputs(base, index); |
449 | |
450 | assert(base != NodeSentinel && index != NodeSentinel, |
451 | "unexpected base/index inputs" ); |
452 | |
453 | Node* mem_inputs[4]; |
454 | int mem_inputs_length = 0; |
455 | if (base != NULL) mem_inputs[mem_inputs_length++] = base; |
456 | if (index != NULL) mem_inputs[mem_inputs_length++] = index; |
457 | if (store != NULL) mem_inputs[mem_inputs_length++] = store; |
458 | |
  // In the comparison below, add one to account for the control input,
460 | // which may be null, but always takes up a spot in the in array. |
461 | if (mem_inputs_length + 1 < (int) load->req()) { |
462 | // This "load" has more inputs than just the memory, base and index inputs. |
463 | // For purposes of checking anti-dependences, we need to start |
464 | // from the early block of only the address portion of the instruction, |
465 | // and ignore other blocks that may have factored into the wider |
466 | // schedule_early calculation. |
467 | if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0); |
468 | |
469 | Block* deepb = NULL; // Deepest block so far |
470 | int deepb_dom_depth = 0; |
471 | for (int i = 0; i < mem_inputs_length; i++) { |
472 | Block* inb = cfg->get_block_for_node(mem_inputs[i]); |
473 | if (deepb_dom_depth < (int) inb->_dom_depth) { |
474 | // The new inb must be dominated by the previous deepb. |
475 | // The various inputs must be linearly ordered in the dom |
476 | // tree, or else there will not be a unique deepest block. |
477 | DEBUG_ONLY(assert_dom(deepb, inb, load, cfg)); |
478 | deepb = inb; // Save deepest block |
479 | deepb_dom_depth = deepb->_dom_depth; |
480 | } |
481 | } |
482 | early = deepb; |
483 | } |
484 | |
485 | return early; |
486 | } |
487 | |
488 | //--------------------------insert_anti_dependences--------------------------- |
489 | // A load may need to witness memory that nearby stores can overwrite. |
490 | // For each nearby store, either insert an "anti-dependence" edge |
491 | // from the load to the store, or else move LCA upward to force the |
492 | // load to (eventually) be scheduled in a block above the store. |
493 | // |
494 | // Do not add edges to stores on distinct control-flow paths; |
495 | // only add edges to stores which might interfere. |
496 | // |
497 | // Return the (updated) LCA. There will not be any possibly interfering |
498 | // store between the load's "early block" and the updated LCA. |
499 | // Any stores in the updated LCA will have new precedence edges |
500 | // back to the load. The caller is expected to schedule the load |
501 | // in the LCA, in which case the precedence edges will make LCM |
502 | // preserve anti-dependences. The caller may also hoist the load |
503 | // above the LCA, if it is not the early block. |
504 | Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) { |
505 | assert(load->needs_anti_dependence_check(), "must be a load of some sort" ); |
506 | assert(LCA != NULL, "" ); |
507 | DEBUG_ONLY(Block* LCA_orig = LCA); |
508 | |
509 | // Compute the alias index. Loads and stores with different alias indices |
510 | // do not need anti-dependence edges. |
511 | int load_alias_idx = C->get_alias_index(load->adr_type()); |
512 | #ifdef ASSERT |
513 | if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 && |
514 | (PrintOpto || VerifyAliases || |
515 | (PrintMiscellaneous && (WizardMode || Verbose)))) { |
516 | // Load nodes should not consume all of memory. |
517 | // Reporting a bottom type indicates a bug in adlc. |
518 | // If some particular type of node validly consumes all of memory, |
519 | // sharpen the preceding "if" to exclude it, so we can catch bugs here. |
520 | tty->print_cr("*** Possible Anti-Dependence Bug: Load consumes all of memory." ); |
521 | load->dump(2); |
522 | if (VerifyAliases) assert(load_alias_idx != Compile::AliasIdxBot, "" ); |
523 | } |
524 | #endif |
525 | assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp), |
526 | "String compare is only known 'load' that does not conflict with any stores" ); |
527 | assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals), |
528 | "String equals is a 'load' that does not conflict with any stores" ); |
529 | assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf), |
530 | "String indexOf is a 'load' that does not conflict with any stores" ); |
531 | assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOfChar), |
532 | "String indexOfChar is a 'load' that does not conflict with any stores" ); |
533 | assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq), |
534 | "Arrays equals is a 'load' that does not conflict with any stores" ); |
535 | assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_HasNegatives), |
536 | "HasNegatives is a 'load' that does not conflict with any stores" ); |
537 | |
538 | if (!C->alias_type(load_alias_idx)->is_rewritable()) { |
539 | // It is impossible to spoil this load by putting stores before it, |
540 | // because we know that the stores will never update the value |
541 | // which 'load' must witness. |
542 | return LCA; |
543 | } |
544 | |
545 | node_idx_t load_index = load->_idx; |
546 | |
547 | // Note the earliest legal placement of 'load', as determined by |
  // the unique point in the dom tree where all memory effects
549 | // and other inputs are first available. (Computed by schedule_early.) |
550 | // For normal loads, 'early' is the shallowest place (dom graph wise) |
551 | // to look for anti-deps between this load and any store. |
552 | Block* early = get_block_for_node(load); |
553 | |
554 | // If we are subsuming loads, compute an "early" block that only considers |
555 | // memory or address inputs. This block may be different than the |
556 | // schedule_early block in that it could be at an even shallower depth in the |
557 | // dominator tree, and allow for a broader discovery of anti-dependences. |
558 | if (C->subsume_loads()) { |
559 | early = memory_early_block(load, early, this); |
560 | } |
561 | |
562 | ResourceArea *area = Thread::current()->resource_area(); |
563 | Node_List worklist_mem(area); // prior memory state to store |
564 | Node_List worklist_store(area); // possible-def to explore |
565 | Node_List worklist_visited(area); // visited mergemem nodes |
566 | Node_List non_early_stores(area); // all relevant stores outside of early |
567 | bool must_raise_LCA = false; |
568 | |
569 | #ifdef TRACK_PHI_INPUTS |
570 | // %%% This extra checking fails because MergeMem nodes are not GVNed. |
571 | // Provide "phi_inputs" to check if every input to a PhiNode is from the |
  // original memory state. This indicates a PhiNode which should not
  // prevent the load from sinking. For such a block, set_raise_LCA_mark
574 | // may be overly conservative. |
575 | // Mechanism: count inputs seen for each Phi encountered in worklist_store. |
576 | DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0)); |
577 | #endif |
578 | |
579 | // 'load' uses some memory state; look for users of the same state. |
580 | // Recurse through MergeMem nodes to the stores that use them. |
581 | |
582 | // Each of these stores is a possible definition of memory |
583 | // that 'load' needs to use. We need to force 'load' |
584 | // to occur before each such store. When the store is in |
585 | // the same block as 'load', we insert an anti-dependence |
586 | // edge load->store. |
587 | |
588 | // The relevant stores "nearby" the load consist of a tree rooted |
589 | // at initial_mem, with internal nodes of type MergeMem. |
590 | // Therefore, the branches visited by the worklist are of this form: |
591 | // initial_mem -> (MergeMem ->)* store |
592 | // The anti-dependence constraints apply only to the fringe of this tree. |
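  //
  // Each real possible-def ('store') that survives the MergeMem recursion
  // below falls into one of three cases:
  //  - a memory Phi: mark the predecessor blocks on the matching input paths;
  //  - a store outside the early block: mark its block and resolve it later
  //    with raise_LCA_above_marks;
  //  - a store in the early block itself: add an anti-dependence edge now and
  //    pin LCA at 'early'.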
593 | |
594 | Node* initial_mem = load->in(MemNode::Memory); |
595 | worklist_store.push(initial_mem); |
596 | worklist_visited.push(initial_mem); |
597 | worklist_mem.push(NULL); |
598 | while (worklist_store.size() > 0) { |
599 | // Examine a nearby store to see if it might interfere with our load. |
600 | Node* mem = worklist_mem.pop(); |
601 | Node* store = worklist_store.pop(); |
602 | uint op = store->Opcode(); |
603 | |
604 | // MergeMems do not directly have anti-deps. |
605 | // Treat them as internal nodes in a forward tree of memory states, |
606 | // the leaves of which are each a 'possible-def'. |
607 | if (store == initial_mem // root (exclusive) of tree we are searching |
608 | || op == Op_MergeMem // internal node of tree we are searching |
609 | ) { |
610 | mem = store; // It's not a possibly interfering store. |
611 | if (store == initial_mem) |
612 | initial_mem = NULL; // only process initial memory once |
613 | |
614 | for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { |
615 | store = mem->fast_out(i); |
616 | if (store->is_MergeMem()) { |
617 | // Be sure we don't get into combinatorial problems. |
618 | // (Allow phis to be repeated; they can merge two relevant states.) |
619 | uint j = worklist_visited.size(); |
620 | for (; j > 0; j--) { |
621 | if (worklist_visited.at(j-1) == store) break; |
622 | } |
623 | if (j > 0) continue; // already on work list; do not repeat |
624 | worklist_visited.push(store); |
625 | } |
626 | worklist_mem.push(mem); |
627 | worklist_store.push(store); |
628 | } |
629 | continue; |
630 | } |
631 | |
632 | if (op == Op_MachProj || op == Op_Catch) continue; |
633 | if (store->needs_anti_dependence_check()) continue; // not really a store |
634 | |
635 | // Compute the alias index. Loads and stores with different alias |
636 | // indices do not need anti-dependence edges. Wide MemBar's are |
637 | // anti-dependent on everything (except immutable memories). |
638 | const TypePtr* adr_type = store->adr_type(); |
639 | if (!C->can_alias(adr_type, load_alias_idx)) continue; |
640 | |
641 | // Most slow-path runtime calls do NOT modify Java memory, but |
642 | // they can block and so write Raw memory. |
643 | if (store->is_Mach()) { |
644 | MachNode* mstore = store->as_Mach(); |
645 | if (load_alias_idx != Compile::AliasIdxRaw) { |
646 | // Check for call into the runtime using the Java calling |
647 | // convention (and from there into a wrapper); it has no |
648 | // _method. Can't do this optimization for Native calls because |
649 | // they CAN write to Java memory. |
650 | if (mstore->ideal_Opcode() == Op_CallStaticJava) { |
651 | assert(mstore->is_MachSafePoint(), "" ); |
652 | MachSafePointNode* ms = (MachSafePointNode*) mstore; |
653 | assert(ms->is_MachCallJava(), "" ); |
654 | MachCallJavaNode* mcj = (MachCallJavaNode*) ms; |
655 | if (mcj->_method == NULL) { |
656 | // These runtime calls do not write to Java visible memory |
657 | // (other than Raw) and so do not require anti-dependence edges. |
658 | continue; |
659 | } |
660 | } |
661 | // Same for SafePoints: they read/write Raw but only read otherwise. |
662 | // This is basically a workaround for SafePoints only defining control |
663 | // instead of control + memory. |
664 | if (mstore->ideal_Opcode() == Op_SafePoint) |
665 | continue; |
666 | } else { |
667 | // Some raw memory, such as the load of "top" at an allocation, |
668 | // can be control dependent on the previous safepoint. See |
669 | // comments in GraphKit::allocate_heap() about control input. |
670 | // Inserting an anti-dep between such a safepoint and a use |
671 | // creates a cycle, and will cause a subsequent failure in |
672 | // local scheduling. (BugId 4919904) |
673 | // (%%% How can a control input be a safepoint and not a projection??) |
674 | if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore) |
675 | continue; |
676 | } |
677 | } |
678 | |
679 | // Identify a block that the current load must be above, |
680 | // or else observe that 'store' is all the way up in the |
681 | // earliest legal block for 'load'. In the latter case, |
682 | // immediately insert an anti-dependence edge. |
683 | Block* store_block = get_block_for_node(store); |
684 | assert(store_block != NULL, "unused killing projections skipped above" ); |
685 | |
686 | if (store->is_Phi()) { |
687 | // Loop-phis need to raise load before input. (Other phis are treated |
688 | // as store below.) |
689 | // |
690 | // 'load' uses memory which is one (or more) of the Phi's inputs. |
691 | // It must be scheduled not before the Phi, but rather before |
692 | // each of the relevant Phi inputs. |
693 | // |
694 | // Instead of finding the LCA of all inputs to a Phi that match 'mem', |
695 | // we mark each corresponding predecessor block and do a combined |
696 | // hoisting operation later (raise_LCA_above_marks). |
697 | // |
698 | // Do not assert(store_block != early, "Phi merging memory after access") |
699 | // PhiNode may be at start of block 'early' with backedge to 'early' |
700 | DEBUG_ONLY(bool found_match = false); |
701 | for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) { |
702 | if (store->in(j) == mem) { // Found matching input? |
703 | DEBUG_ONLY(found_match = true); |
704 | Block* pred_block = get_block_for_node(store_block->pred(j)); |
705 | if (pred_block != early) { |
706 | // If any predecessor of the Phi matches the load's "early block", |
707 | // we do not need a precedence edge between the Phi and 'load' |
708 | // since the load will be forced into a block preceding the Phi. |
709 | pred_block->set_raise_LCA_mark(load_index); |
710 | assert(!LCA_orig->dominates(pred_block) || |
711 | early->dominates(pred_block), "early is high enough" ); |
712 | must_raise_LCA = true; |
713 | } else { |
714 | // anti-dependent upon PHI pinned below 'early', no edge needed |
715 | LCA = early; // but can not schedule below 'early' |
716 | } |
717 | } |
718 | } |
719 | assert(found_match, "no worklist bug" ); |
720 | #ifdef TRACK_PHI_INPUTS |
721 | #ifdef ASSERT |
722 | // This assert asks about correct handling of PhiNodes, which may not |
723 | // have all input edges directly from 'mem'. See BugId 4621264 |
724 | int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1; |
725 | // Increment by exactly one even if there are multiple copies of 'mem' |
726 | // coming into the phi, because we will run this block several times |
727 | // if there are several copies of 'mem'. (That's how DU iterators work.) |
728 | phi_inputs.at_put(store->_idx, num_mem_inputs); |
729 | assert(PhiNode::Input + num_mem_inputs < store->req(), |
730 | "Expect at least one phi input will not be from original memory state" ); |
731 | #endif //ASSERT |
732 | #endif //TRACK_PHI_INPUTS |
733 | } else if (store_block != early) { |
734 | // 'store' is between the current LCA and earliest possible block. |
735 | // Label its block, and decide later on how to raise the LCA |
736 | // to include the effect on LCA of this store. |
737 | // If this store's block gets chosen as the raised LCA, we |
738 | // will find him on the non_early_stores list and stick him |
739 | // with a precedence edge. |
740 | // (But, don't bother if LCA is already raised all the way.) |
741 | if (LCA != early) { |
742 | store_block->set_raise_LCA_mark(load_index); |
743 | must_raise_LCA = true; |
744 | non_early_stores.push(store); |
745 | } |
746 | } else { |
747 | // Found a possibly-interfering store in the load's 'early' block. |
748 | // This means 'load' cannot sink at all in the dominator tree. |
749 | // Add an anti-dep edge, and squeeze 'load' into the highest block. |
750 | assert(store != load->in(0), "dependence cycle found" ); |
751 | if (verify) { |
752 | assert(store->find_edge(load) != -1, "missing precedence edge" ); |
753 | } else { |
754 | store->add_prec(load); |
755 | } |
756 | LCA = early; |
757 | // This turns off the process of gathering non_early_stores. |
758 | } |
759 | } |
760 | // (Worklist is now empty; all nearby stores have been visited.) |
761 | |
762 | // Finished if 'load' must be scheduled in its 'early' block. |
763 | // If we found any stores there, they have already been given |
764 | // precedence edges. |
765 | if (LCA == early) return LCA; |
766 | |
767 | // We get here only if there are no possibly-interfering stores |
768 | // in the load's 'early' block. Move LCA up above all predecessors |
769 | // which contain stores we have noted. |
770 | // |
771 | // The raised LCA block can be a home to such interfering stores, |
772 | // but its predecessors must not contain any such stores. |
773 | // |
774 | // The raised LCA will be a lower bound for placing the load, |
775 | // preventing the load from sinking past any block containing |
776 | // a store that may invalidate the memory state required by 'load'. |
777 | if (must_raise_LCA) |
778 | LCA = raise_LCA_above_marks(LCA, load->_idx, early, this); |
779 | if (LCA == early) return LCA; |
780 | |
781 | // Insert anti-dependence edges from 'load' to each store |
782 | // in the non-early LCA block. |
783 | // Mine the non_early_stores list for such stores. |
784 | if (LCA->raise_LCA_mark() == load_index) { |
785 | while (non_early_stores.size() > 0) { |
786 | Node* store = non_early_stores.pop(); |
787 | Block* store_block = get_block_for_node(store); |
788 | if (store_block == LCA) { |
789 | // add anti_dependence from store to load in its own block |
790 | assert(store != load->in(0), "dependence cycle found" ); |
791 | if (verify) { |
792 | assert(store->find_edge(load) != -1, "missing precedence edge" ); |
793 | } else { |
794 | store->add_prec(load); |
795 | } |
796 | } else { |
797 | assert(store_block->raise_LCA_mark() == load_index, "block was marked" ); |
798 | // Any other stores we found must be either inside the new LCA |
799 | // or else outside the original LCA. In the latter case, they |
800 | // did not interfere with any use of 'load'. |
801 | assert(LCA->dominates(store_block) |
802 | || !LCA_orig->dominates(store_block), "no stray stores" ); |
803 | } |
804 | } |
805 | } |
806 | |
807 | // Return the highest block containing stores; any stores |
808 | // within that block have been given anti-dependence edges. |
809 | return LCA; |
810 | } |
811 | |
812 | // This class is used to iterate backwards over the nodes in the graph. |
813 | |
814 | class Node_Backward_Iterator { |
815 | |
816 | private: |
817 | Node_Backward_Iterator(); |
818 | |
819 | public: |
820 | // Constructor for the iterator |
821 | Node_Backward_Iterator(Node *root, VectorSet &visited, Node_Stack &stack, PhaseCFG &cfg); |
822 | |
  // Return the next node in the backward traversal, or NULL when done
824 | Node *next(); |
825 | |
826 | private: |
827 | VectorSet &_visited; |
828 | Node_Stack &_stack; |
829 | PhaseCFG &_cfg; |
830 | }; |
831 | |
832 | // Constructor for the Node_Backward_Iterator |
833 | Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_Stack &stack, PhaseCFG &cfg) |
834 | : _visited(visited), _stack(stack), _cfg(cfg) { |
835 | // The stack should contain exactly the root |
836 | stack.clear(); |
837 | stack.push(root, root->outcnt()); |
838 | |
839 | // Clear the visited bits |
840 | visited.Clear(); |
841 | } |
842 | |
843 | // Iterator for the Node_Backward_Iterator |
844 | Node *Node_Backward_Iterator::next() { |
845 | |
846 | // If the _stack is empty, then just return NULL: finished. |
847 | if ( !_stack.size() ) |
848 | return NULL; |
849 | |
850 | // I visit unvisited not-anti-dependence users first, then anti-dependent |
851 | // children next. I iterate backwards to support removal of nodes. |
852 | // The stack holds states consisting of 3 values: |
853 | // current Def node, flag which indicates 1st/2nd pass, index of current out edge |
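  // The 1st/2nd-pass flag is packed into the low bit of the stored Node
  // pointer (which is otherwise always clear for aligned Node allocations),
  // so no extra stack slot is needed; it is masked off again here.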
854 | Node *self = (Node*)(((uintptr_t)_stack.node()) & ~1); |
855 | bool iterate_anti_dep = (((uintptr_t)_stack.node()) & 1); |
856 | uint idx = MIN2(_stack.index(), self->outcnt()); // Support removal of nodes. |
857 | _stack.pop(); |
858 | |
859 | // I cycle here when I am entering a deeper level of recursion. |
860 | // The key variable 'self' was set prior to jumping here. |
861 | while( 1 ) { |
862 | |
863 | _visited.set(self->_idx); |
864 | |
865 | // Now schedule all uses as late as possible. |
866 | const Node* src = self->is_Proj() ? self->in(0) : self; |
867 | uint src_rpo = _cfg.get_block_for_node(src)->_rpo; |
868 | |
869 | // Schedule all nodes in a post-order visit |
870 | Node *unvisited = NULL; // Unvisited anti-dependent Node, if any |
871 | |
872 | // Scan for unvisited nodes |
873 | while (idx > 0) { |
874 | // For all uses, schedule late |
875 | Node* n = self->raw_out(--idx); // Use |
876 | |
877 | // Skip already visited children |
878 | if ( _visited.test(n->_idx) ) |
879 | continue; |
880 | |
881 | // do not traverse backward control edges |
882 | Node *use = n->is_Proj() ? n->in(0) : n; |
883 | uint use_rpo = _cfg.get_block_for_node(use)->_rpo; |
884 | |
885 | if ( use_rpo < src_rpo ) |
886 | continue; |
887 | |
888 | // Phi nodes always precede uses in a basic block |
889 | if ( use_rpo == src_rpo && use->is_Phi() ) |
890 | continue; |
891 | |
894 | // Check for possible-anti-dependent |
895 | // 1st pass: No such nodes, 2nd pass: Only such nodes. |
896 | if (n->needs_anti_dependence_check() == iterate_anti_dep) { |
897 | unvisited = n; // Found unvisited |
898 | break; |
899 | } |
900 | } |
901 | |
902 | // Did I find an unvisited not-anti-dependent Node? |
903 | if (!unvisited) { |
904 | if (!iterate_anti_dep) { |
905 | // 2nd pass: Iterate over nodes which needs_anti_dependence_check. |
906 | iterate_anti_dep = true; |
907 | idx = self->outcnt(); |
908 | continue; |
909 | } |
910 | break; // All done with children; post-visit 'self' |
911 | } |
912 | |
913 | // Visit the unvisited Node. Contains the obvious push to |
914 | // indicate I'm entering a deeper level of recursion. I push the |
915 | // old state onto the _stack and set a new state and loop (recurse). |
916 | _stack.push((Node*)((uintptr_t)self | (uintptr_t)iterate_anti_dep), idx); |
917 | self = unvisited; |
918 | iterate_anti_dep = false; |
919 | idx = self->outcnt(); |
920 | } // End recursion loop |
921 | |
922 | return self; |
923 | } |
924 | |
925 | //------------------------------ComputeLatenciesBackwards---------------------- |
926 | // Compute the latency of all the instructions. |
927 | void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_Stack &stack) { |
928 | #ifndef PRODUCT |
929 | if (trace_opto_pipelining()) |
930 | tty->print("\n#---- ComputeLatenciesBackwards ----\n" ); |
931 | #endif |
932 | |
933 | Node_Backward_Iterator iter((Node *)_root, visited, stack, *this); |
934 | Node *n; |
935 | |
936 | // Walk over all the nodes from last to first |
937 | while ((n = iter.next())) { |
938 | // Set the latency for the definitions of this instruction |
939 | partial_latency_of_defs(n); |
940 | } |
941 | } // end ComputeLatenciesBackwards |
942 | |
943 | //------------------------------partial_latency_of_defs------------------------ |
944 | // Compute the latency impact of this node on all defs. This computes |
945 | // a number that increases as we approach the beginning of the routine. |
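// For example, if a use currently has latency 5 and the latency of the edge to
// its input j is 2, the def on that edge is raised to a latency of at least 7.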
946 | void PhaseCFG::partial_latency_of_defs(Node *n) { |
947 | // Set the latency for this instruction |
948 | #ifndef PRODUCT |
949 | if (trace_opto_pipelining()) { |
950 | tty->print("# latency_to_inputs: node_latency[%d] = %d for node" , n->_idx, get_latency_for_node(n)); |
951 | dump(); |
952 | } |
953 | #endif |
954 | |
955 | if (n->is_Proj()) { |
956 | n = n->in(0); |
957 | } |
958 | |
959 | if (n->is_Root()) { |
960 | return; |
961 | } |
962 | |
963 | uint nlen = n->len(); |
964 | uint use_latency = get_latency_for_node(n); |
965 | uint use_pre_order = get_block_for_node(n)->_pre_order; |
966 | |
967 | for (uint j = 0; j < nlen; j++) { |
968 | Node *def = n->in(j); |
969 | |
970 | if (!def || def == n) { |
971 | continue; |
972 | } |
973 | |
974 | // Walk backwards thru projections |
975 | if (def->is_Proj()) { |
976 | def = def->in(0); |
977 | } |
978 | |
979 | #ifndef PRODUCT |
980 | if (trace_opto_pipelining()) { |
981 | tty->print("# in(%2d): " , j); |
982 | def->dump(); |
983 | } |
984 | #endif |
985 | |
986 | // If the defining block is not known, assume it is ok |
987 | Block *def_block = get_block_for_node(def); |
988 | uint def_pre_order = def_block ? def_block->_pre_order : 0; |
989 | |
990 | if ((use_pre_order < def_pre_order) || (use_pre_order == def_pre_order && n->is_Phi())) { |
991 | continue; |
992 | } |
993 | |
994 | uint delta_latency = n->latency(j); |
995 | uint current_latency = delta_latency + use_latency; |
996 | |
997 | if (get_latency_for_node(def) < current_latency) { |
998 | set_latency_for_node(def, current_latency); |
999 | } |
1000 | |
1001 | #ifndef PRODUCT |
1002 | if (trace_opto_pipelining()) { |
1003 | tty->print_cr("# %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d" , use_latency, j, delta_latency, current_latency, def->_idx, get_latency_for_node(def)); |
1004 | } |
1005 | #endif |
1006 | } |
1007 | } |
1008 | |
1009 | //------------------------------latency_from_use------------------------------- |
1010 | // Compute the latency of a specific use |
1011 | int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) { |
1012 | // If self-reference, return no latency |
1013 | if (use == n || use->is_Root()) { |
1014 | return 0; |
1015 | } |
1016 | |
1017 | uint def_pre_order = get_block_for_node(def)->_pre_order; |
1018 | uint latency = 0; |
1019 | |
1020 | // If the use is not a projection, then it is simple... |
1021 | if (!use->is_Proj()) { |
1022 | #ifndef PRODUCT |
1023 | if (trace_opto_pipelining()) { |
1024 | tty->print("# out(): " ); |
1025 | use->dump(); |
1026 | } |
1027 | #endif |
1028 | |
1029 | uint use_pre_order = get_block_for_node(use)->_pre_order; |
1030 | |
1031 | if (use_pre_order < def_pre_order) |
1032 | return 0; |
1033 | |
1034 | if (use_pre_order == def_pre_order && use->is_Phi()) |
1035 | return 0; |
1036 | |
1037 | uint nlen = use->len(); |
1038 | uint nl = get_latency_for_node(use); |
1039 | |
1040 | for ( uint j=0; j<nlen; j++ ) { |
1041 | if (use->in(j) == n) { |
1042 | // Change this if we want local latencies |
1043 | uint ul = use->latency(j); |
1044 | uint l = ul + nl; |
1045 | if (latency < l) latency = l; |
1046 | #ifndef PRODUCT |
1047 | if (trace_opto_pipelining()) { |
1048 | tty->print_cr("# %d + edge_latency(%d) == %d -> %d, latency = %d" , |
1049 | nl, j, ul, l, latency); |
1050 | } |
1051 | #endif |
1052 | } |
1053 | } |
1054 | } else { |
1055 | // This is a projection, just grab the latency of the use(s) |
1056 | for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) { |
1057 | uint l = latency_from_use(use, def, use->fast_out(j)); |
1058 | if (latency < l) latency = l; |
1059 | } |
1060 | } |
1061 | |
1062 | return latency; |
1063 | } |
1064 | |
1065 | //------------------------------latency_from_uses------------------------------ |
// Compute the latency of this instruction relative to all of its uses.
1067 | // This computes a number that increases as we approach the beginning of the |
1068 | // routine. |
1069 | void PhaseCFG::latency_from_uses(Node *n) { |
1070 | // Set the latency for this instruction |
1071 | #ifndef PRODUCT |
1072 | if (trace_opto_pipelining()) { |
1073 | tty->print("# latency_from_outputs: node_latency[%d] = %d for node" , n->_idx, get_latency_for_node(n)); |
1074 | dump(); |
1075 | } |
1076 | #endif |
1077 | uint latency=0; |
1078 | const Node *def = n->is_Proj() ? n->in(0): n; |
1079 | |
1080 | for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { |
1081 | uint l = latency_from_use(n, def, n->fast_out(i)); |
1082 | |
1083 | if (latency < l) latency = l; |
1084 | } |
1085 | |
1086 | set_latency_for_node(n, latency); |
1087 | } |
1088 | |
1089 | //------------------------------hoist_to_cheaper_block------------------------- |
1090 | // Pick a block for node self, between early and LCA, that is a cheaper |
1091 | // alternative to LCA. |
1092 | Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) { |
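  // 'delta' is a tiny tolerance factor: a candidate block still counts as
  // "no worse" in frequency if it is at most this factor more frequent than
  // the current best (see the "LCA_freq < least_freq * delta" test below).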
1093 | const double delta = 1+PROB_UNLIKELY_MAG(4); |
1094 | Block* least = LCA; |
1095 | double least_freq = least->_freq; |
1096 | uint target = get_latency_for_node(self); |
1097 | uint start_latency = get_latency_for_node(LCA->head()); |
1098 | uint end_latency = get_latency_for_node(LCA->get_node(LCA->end_idx())); |
1099 | bool in_latency = (target <= start_latency); |
1100 | const Block* root_block = get_block_for_node(_root); |
1101 | |
1102 | // Turn off latency scheduling if scheduling is just plain off |
1103 | if (!C->do_scheduling()) |
1104 | in_latency = true; |
1105 | |
1106 | // Do not hoist (to cover latency) instructions which target a |
1107 | // single register. Hoisting stretches the live range of the |
1108 | // single register and may force spilling. |
1109 | MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL; |
1110 | if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty()) |
1111 | in_latency = true; |
1112 | |
1113 | #ifndef PRODUCT |
1114 | if (trace_opto_pipelining()) { |
1115 | tty->print("# Find cheaper block for latency %d: " , get_latency_for_node(self)); |
1116 | self->dump(); |
1117 | tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g" , |
1118 | LCA->_pre_order, |
1119 | LCA->head()->_idx, |
1120 | start_latency, |
1121 | LCA->get_node(LCA->end_idx())->_idx, |
1122 | end_latency, |
1123 | least_freq); |
1124 | } |
1125 | #endif |
1126 | |
1127 | int cand_cnt = 0; // number of candidates tried |
1128 | |
1129 | // Walk up the dominator tree from LCA (Lowest common ancestor) to |
1130 | // the earliest legal location. Capture the least execution frequency. |
1131 | while (LCA != early) { |
1132 | LCA = LCA->_idom; // Follow up the dominator tree |
1133 | |
1134 | if (LCA == NULL) { |
1135 | // Bailout without retry |
1136 | assert(false, "graph should be schedulable" ); |
1137 | C->record_method_not_compilable("late schedule failed: LCA == NULL" ); |
1138 | return least; |
1139 | } |
1140 | |
1141 | // Don't hoist machine instructions to the root basic block |
1142 | if (mach && LCA == root_block) |
1143 | break; |
1144 | |
1145 | uint start_lat = get_latency_for_node(LCA->head()); |
1146 | uint end_idx = LCA->end_idx(); |
1147 | uint end_lat = get_latency_for_node(LCA->get_node(end_idx)); |
1148 | double LCA_freq = LCA->_freq; |
1149 | #ifndef PRODUCT |
1150 | if (trace_opto_pipelining()) { |
1151 | tty->print_cr("# B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g" , |
1152 | LCA->_pre_order, LCA->head()->_idx, start_lat, end_idx, end_lat, LCA_freq); |
1153 | } |
1154 | #endif |
1155 | cand_cnt++; |
1156 | if (LCA_freq < least_freq || // Better Frequency |
1157 | (StressGCM && Compile::randomized_select(cand_cnt)) || // Should be randomly accepted in stress mode |
1158 | (!StressGCM && // Otherwise, choose with latency |
1159 | !in_latency && // No block containing latency |
1160 | LCA_freq < least_freq * delta && // No worse frequency |
1161 | target >= end_lat && // within latency range |
1162 | !self->is_iteratively_computed() ) // But don't hoist IV increments |
1163 | // because they may end up above other uses of their phi forcing |
1164 | // their result register to be different from their input. |
1165 | ) { |
1166 | least = LCA; // Found cheaper block |
1167 | least_freq = LCA_freq; |
1168 | start_latency = start_lat; |
1169 | end_latency = end_lat; |
1170 | if (target <= start_lat) |
1171 | in_latency = true; |
1172 | } |
1173 | } |
1174 | |
1175 | #ifndef PRODUCT |
1176 | if (trace_opto_pipelining()) { |
1177 | tty->print_cr("# Choose block B%d with start latency=%d and freq=%g" , |
1178 | least->_pre_order, start_latency, least_freq); |
1179 | } |
1180 | #endif |
1181 | |
1182 | // See if the latency needs to be updated |
1183 | if (target < end_latency) { |
1184 | #ifndef PRODUCT |
1185 | if (trace_opto_pipelining()) { |
1186 | tty->print_cr("# Change latency for [%4d] from %d to %d" , self->_idx, target, end_latency); |
1187 | } |
1188 | #endif |
1189 | set_latency_for_node(self, end_latency); |
1190 | partial_latency_of_defs(self); |
1191 | } |
1192 | |
1193 | return least; |
1194 | } |
1195 | |
1196 | |
1197 | //------------------------------schedule_late----------------------------------- |
1198 | // Now schedule all codes as LATE as possible. This is the LCA in the |
1199 | // dominator tree of all USES of a value. Pick the block with the least |
1200 | // loop nesting depth that is lowest in the dominator tree. |
1201 | extern const char must_clone[]; |
1202 | void PhaseCFG::schedule_late(VectorSet &visited, Node_Stack &stack) { |
1203 | #ifndef PRODUCT |
1204 | if (trace_opto_pipelining()) |
1205 | tty->print("\n#---- schedule_late ----\n" ); |
1206 | #endif |
1207 | |
1208 | Node_Backward_Iterator iter((Node *)_root, visited, stack, *this); |
1209 | Node *self; |
1210 | |
1211 | // Walk over all the nodes from last to first |
1212 | while ((self = iter.next())) { |
1213 | Block* early = get_block_for_node(self); // Earliest legal placement |
1214 | |
1215 | if (self->is_top()) { |
1216 | // Top node goes in bb #2 with other constants. |
1217 | // It must be special-cased, because it has no out edges. |
1218 | early->add_inst(self); |
1219 | continue; |
1220 | } |
1221 | |
1222 | // No uses, just terminate |
1223 | if (self->outcnt() == 0) { |
1224 | assert(self->is_MachProj(), "sanity" ); |
1225 | continue; // Must be a dead machine projection |
1226 | } |
1227 | |
1228 | // If node is pinned in the block, then no scheduling can be done. |
1229 | if( self->pinned() ) // Pinned in block? |
1230 | continue; |
1231 | |
1232 | MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL; |
1233 | if (mach) { |
1234 | switch (mach->ideal_Opcode()) { |
1235 | case Op_CreateEx: |
1236 | // Don't move exception creation |
1237 | early->add_inst(self); |
1238 | continue; |
1239 | break; |
1240 | case Op_CheckCastPP: { |
1241 | // Don't move CheckCastPP nodes away from their input, if the input |
1242 | // is a rawptr (5071820). |
1243 | Node *def = self->in(1); |
1244 | if (def != NULL && def->bottom_type()->base() == Type::RawPtr) { |
1245 | early->add_inst(self); |
1246 | #ifdef ASSERT |
1247 | _raw_oops.push(def); |
1248 | #endif |
1249 | continue; |
1250 | } |
1251 | break; |
1252 | } |
1253 | default: |
1254 | break; |
1255 | } |
1256 | } |
1257 | |
1258 | // Gather LCA of all uses |
1259 | Block *LCA = NULL; |
1260 | { |
1261 | for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) { |
1262 | // For all uses, find LCA |
1263 | Node* use = self->fast_out(i); |
1264 | LCA = raise_LCA_above_use(LCA, use, self, this); |
1265 | } |
1266 | guarantee(LCA != NULL, "There must be a LCA" ); |
1267 | } // (Hide defs of imax, i from rest of block.) |
1268 | |
1269 | // Place temps in the block of their use. This isn't a |
1270 | // requirement for correctness but it reduces useless |
1271 | // interference between temps and other nodes. |
1272 | if (mach != NULL && mach->is_MachTemp()) { |
1273 | map_node_to_block(self, LCA); |
1274 | LCA->add_inst(self); |
1275 | continue; |
1276 | } |
1277 | |
1278 | // Check if 'self' could be anti-dependent on memory |
1279 | if (self->needs_anti_dependence_check()) { |
1280 | // Hoist LCA above possible-defs and insert anti-dependences to |
1281 | // defs in new LCA block. |
1282 | LCA = insert_anti_dependences(LCA, self); |
1283 | } |
1284 | |
1285 | if (early->_dom_depth > LCA->_dom_depth) { |
1286 | // Somehow the LCA has moved above the earliest legal point. |
1287 | // (One way this can happen is via memory_early_block.) |
1288 | if (C->subsume_loads() == true && !C->failing()) { |
1289 | // Retry with subsume_loads == false |
1290 | // If this is the first failure, the sentinel string will "stick" |
1291 | // to the Compile object, and the C2Compiler will see it and retry. |
1292 | C->record_failure(C2Compiler::retry_no_subsuming_loads()); |
1293 | } else { |
1294 | // Bailout without retry when (early->_dom_depth > LCA->_dom_depth) |
1295 | assert(false, "graph should be schedulable" ); |
1296 | C->record_method_not_compilable("late schedule failed: incorrect graph" ); |
1297 | } |
1298 | return; |
1299 | } |
1300 | |
1301 | // If there is no opportunity to hoist, then we're done. |
1302 | // In stress mode, try to hoist even the single operations. |
1303 | bool try_to_hoist = StressGCM || (LCA != early); |
1304 | |
1305 | // Must clone guys stay next to use; no hoisting allowed. |
1306 | // Also cannot hoist guys that alter memory or are otherwise not |
1307 | // allocatable (hoisting can make a value live longer, leading to |
1308 | // anti and output dependency problems which are normally resolved |
1309 | // by the register allocator giving everyone a different register). |
1310 | if (mach != NULL && must_clone[mach->ideal_Opcode()]) |
1311 | try_to_hoist = false; |
1312 | |
1313 | Block* late = NULL; |
1314 | if (try_to_hoist) { |
1315 | // Now find the block with the least execution frequency. |
1316 | // Start at the latest schedule and work up to the earliest schedule |
1317 | // in the dominator tree. Thus the Node will dominate all its uses. |
1318 | late = hoist_to_cheaper_block(LCA, early, self); |
1319 | } else { |
1320 | // Just use the LCA of the uses. |
1321 | late = LCA; |
1322 | } |
1323 | |
1324 | // Put the node into target block |
1325 | schedule_node_into_block(self, late); |
1326 | |
1327 | #ifdef ASSERT |
1328 | if (self->needs_anti_dependence_check()) { |
1329 | // since precedence edges are only inserted when we're sure they |
1330 | // are needed make sure that after placement in a block we don't |
1331 | // need any new precedence edges. |
1332 | verify_anti_dependences(late, self); |
1333 | } |
1334 | #endif |
1335 | } // Loop until all nodes have been visited |
1336 | |
1337 | } // end ScheduleLate |
1338 | |
1339 | //------------------------------GlobalCodeMotion------------------------------- |
1340 | void PhaseCFG::global_code_motion() { |
1341 | ResourceMark rm; |
1342 | |
1343 | #ifndef PRODUCT |
1344 | if (trace_opto_pipelining()) { |
1345 | tty->print("\n---- Start GlobalCodeMotion ----\n" ); |
1346 | } |
1347 | #endif |
1348 | |
1349 | // Initialize the node to block mapping for things on the proj_list |
1350 | for (uint i = 0; i < _matcher.number_of_projections(); i++) { |
1351 | unmap_node_from_block(_matcher.get_projection(i)); |
1352 | } |
1353 | |
1354 | // Set the basic block for Nodes pinned into blocks |
1355 | Arena* arena = Thread::current()->resource_area(); |
1356 | VectorSet visited(arena); |
1357 | schedule_pinned_nodes(visited); |
1358 | |
1359 | // Find the earliest Block any instruction can be placed in. Some |
1360 | // instructions are pinned into Blocks. Unpinned instructions can |
// appear in the last block in which all their inputs occur.
1362 | visited.Clear(); |
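// The traversal stack is pre-sized to roughly a quarter of the live node
// count (plus a small pad) so the DFS walks below rarely need to regrow it.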
1363 | Node_Stack stack(arena, (C->live_nodes() >> 2) + 16); // pre-grow |
1364 | if (!schedule_early(visited, stack)) { |
1365 | // Bailout without retry |
1366 | C->record_method_not_compilable("early schedule failed" ); |
1367 | return; |
1368 | } |
1369 | |
1370 | // Build Def-Use edges. |
1371 | // Compute the latency information (via backwards walk) for all the |
1372 | // instructions in the graph |
1373 | _node_latency = new GrowableArray<uint>(); // resource_area allocation |
1374 | |
1375 | if (C->do_scheduling()) { |
1376 | compute_latencies_backwards(visited, stack); |
1377 | } |
1378 | |
1379 | // Now schedule all codes as LATE as possible. This is the LCA in the |
1380 | // dominator tree of all USES of a value. Pick the block with the least |
1381 | // loop nesting depth that is lowest in the dominator tree. |
1382 | // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() ) |
1383 | schedule_late(visited, stack); |
1384 | if (C->failing()) { |
1385 | // schedule_late fails only when graph is incorrect. |
1386 | assert(!VerifyGraphEdges, "verification should have failed" ); |
1387 | return; |
1388 | } |
1389 | |
1390 | #ifndef PRODUCT |
1391 | if (trace_opto_pipelining()) { |
1392 | tty->print("\n---- Detect implicit null checks ----\n" ); |
1393 | } |
1394 | #endif |
1395 | |
1396 | // Detect implicit-null-check opportunities. Basically, find NULL checks |
1397 | // with suitable memory ops nearby. Use the memory op to do the NULL check. |
1398 | // I can generate a memory op if there is not one nearby. |
1399 | if (C->is_method_compilation()) { |
1400 | // By reversing the loop direction we get a very minor gain on mpegaudio. |
1401 | // Feel free to revert to a forward loop for clarity. |
1402 | // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) { |
1403 | for (int i = _matcher._null_check_tests.size() - 2; i >= 0; i -= 2) { |
1404 | Node* proj = _matcher._null_check_tests[i]; |
1405 | Node* val = _matcher._null_check_tests[i + 1]; |
1406 | Block* block = get_block_for_node(proj); |
1407 | implicit_null_check(block, proj, val, C->allowed_deopt_reasons()); |
1408 | // The implicit_null_check will only perform the transformation |
1409 | // if the null branch is truly uncommon, *and* it leads to an |
1410 | // uncommon trap. Combined with the too_many_traps guards |
1411 | // above, this prevents SEGV storms reported in 6366351, |
1412 | // by recompiling offending methods without this optimization. |
1413 | } |
1414 | } |
1415 | |
1416 | bool block_size_threshold_ok = false; |
1417 | intptr_t *recalc_pressure_nodes = NULL; |
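// Scan for at least one block with more than 10 nodes. The pressure-aware
// scheduling setup below is skipped entirely when every block is smaller than
// that, since the extra liveness computation is unlikely to pay off there.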
1418 | if (OptoRegScheduling) { |
1419 | for (uint i = 0; i < number_of_blocks(); i++) { |
1420 | Block* block = get_block(i); |
1421 | if (block->number_of_nodes() > 10) { |
1422 | block_size_threshold_ok = true; |
1423 | break; |
1424 | } |
1425 | } |
1426 | } |
1427 | |
// Both conditions must hold to enable this feature: OptoRegScheduling must be
// on, and a block large enough to be worth scheduling must have been found above.
1430 | PhaseChaitin regalloc(C->unique(), *this, _matcher, true); |
1431 | ResourceArea live_arena(mtCompiler); // Arena for liveness |
1432 | ResourceMark rm_live(&live_arena); |
1433 | PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, true); |
1434 | PhaseIFG ifg(&live_arena); |
1435 | if (OptoRegScheduling && block_size_threshold_ok) { |
1436 | regalloc.mark_ssa(); |
1437 | Compile::TracePhase tp("computeLive" , &timers[_t_computeLive]); |
1438 | rm_live.reset_to_mark(); // Reclaim working storage |
1439 | IndexSet::reset_memory(C, &live_arena); |
1440 | uint node_size = regalloc._lrg_map.max_lrg_id(); |
1441 | ifg.init(node_size); // Empty IFG |
1442 | regalloc.set_ifg(ifg); |
1443 | regalloc.set_live(live); |
1444 | regalloc.gather_lrg_masks(false); // Collect LRG masks |
1445 | live.compute(node_size); // Compute liveness |
1446 | |
1447 | recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size); |
1448 | for (uint i = 0; i < node_size; i++) { |
1449 | recalc_pressure_nodes[i] = 0; |
1450 | } |
1451 | } |
1452 | _regalloc = ®alloc; |
1453 | |
1454 | #ifndef PRODUCT |
1455 | if (trace_opto_pipelining()) { |
1456 | tty->print("\n---- Start Local Scheduling ----\n" ); |
1457 | } |
1458 | #endif |
1459 | |
1460 | // Schedule locally. Right now a simple topological sort. |
1461 | // Later, do a real latency aware scheduler. |
1462 | GrowableArray<int> ready_cnt(C->unique(), C->unique(), -1); |
1463 | visited.Clear(); |
1464 | for (uint i = 0; i < number_of_blocks(); i++) { |
1465 | Block* block = get_block(i); |
1466 | if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) { |
1467 | if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) { |
1468 | C->record_method_not_compilable("local schedule failed" ); |
1469 | } |
1470 | _regalloc = NULL; |
1471 | return; |
1472 | } |
1473 | } |
1474 | _regalloc = NULL; |
1475 | |
// If we inserted any instructions between a Call and its CatchNode,
1477 | // clone the instructions on all paths below the Catch. |
1478 | for (uint i = 0; i < number_of_blocks(); i++) { |
1479 | Block* block = get_block(i); |
1480 | call_catch_cleanup(block); |
1481 | } |
1482 | |
1483 | #ifndef PRODUCT |
1484 | if (trace_opto_pipelining()) { |
1485 | tty->print("\n---- After GlobalCodeMotion ----\n" ); |
1486 | for (uint i = 0; i < number_of_blocks(); i++) { |
1487 | Block* block = get_block(i); |
1488 | block->dump(); |
1489 | } |
1490 | } |
1491 | #endif |
// _node_latency is dead from here on; poison the pointer to catch stale uses.
1493 | _node_latency = (GrowableArray<uint> *)((intptr_t)0xdeadbeef); |
1494 | } |
1495 | |
1496 | bool PhaseCFG::do_global_code_motion() { |
1497 | |
1498 | build_dominator_tree(); |
1499 | if (C->failing()) { |
1500 | return false; |
1501 | } |
1502 | |
1503 | NOT_PRODUCT( C->verify_graph_edges(); ) |
1504 | |
1505 | estimate_block_frequency(); |
1506 | |
1507 | global_code_motion(); |
1508 | |
1509 | if (C->failing()) { |
1510 | return false; |
1511 | } |
1512 | |
1513 | return true; |
1514 | } |
1515 | |
1516 | //------------------------------Estimate_Block_Frequency----------------------- |
1517 | // Estimate block frequencies based on IfNode probabilities. |
1518 | void PhaseCFG::estimate_block_frequency() { |
1519 | |
1520 | // Force conditional branches leading to uncommon traps to be unlikely, |
1521 | // not because we get to the uncommon_trap with less relative frequency, |
1522 | // but because an uncommon_trap typically causes a deopt, so we only get |
1523 | // there once. |
1524 | if (C->do_freq_based_layout()) { |
1525 | Block_List worklist; |
1526 | Block* root_blk = get_block(0); |
1527 | for (uint i = 1; i < root_blk->num_preds(); i++) { |
1528 | Block *pb = get_block_for_node(root_blk->pred(i)); |
1529 | if (pb->has_uncommon_code()) { |
1530 | worklist.push(pb); |
1531 | } |
1532 | } |
1533 | while (worklist.size() > 0) { |
1534 | Block* uct = worklist.pop(); |
1535 | if (uct == get_root_block()) { |
1536 | continue; |
1537 | } |
1538 | for (uint i = 1; i < uct->num_preds(); i++) { |
1539 | Block *pb = get_block_for_node(uct->pred(i)); |
1540 | if (pb->_num_succs == 1) { |
1541 | worklist.push(pb); |
1542 | } else if (pb->num_fall_throughs() == 2) { |
1543 | pb->update_uncommon_branch(uct); |
1544 | } |
1545 | } |
1546 | } |
1547 | } |
1548 | |
1549 | // Create the loop tree and calculate loop depth. |
1550 | _root_loop = create_loop_tree(); |
1551 | _root_loop->compute_loop_depth(0); |
1552 | |
1553 | // Compute block frequency of each block, relative to a single loop entry. |
1554 | _root_loop->compute_freq(); |
1555 | |
1556 | // Adjust all frequencies to be relative to a single method entry |
1557 | _root_loop->_freq = 1.0; |
1558 | _root_loop->scale_freq(); |
1559 | |
// Save the outermost loop frequency for the LRG frequency threshold
1561 | _outer_loop_frequency = _root_loop->outer_loop_freq(); |
1562 | |
1563 | // force paths ending at uncommon traps to be infrequent |
1564 | if (!C->do_freq_based_layout()) { |
1565 | Block_List worklist; |
1566 | Block* root_blk = get_block(0); |
1567 | for (uint i = 1; i < root_blk->num_preds(); i++) { |
1568 | Block *pb = get_block_for_node(root_blk->pred(i)); |
1569 | if (pb->has_uncommon_code()) { |
1570 | worklist.push(pb); |
1571 | } |
1572 | } |
1573 | while (worklist.size() > 0) { |
1574 | Block* uct = worklist.pop(); |
1575 | uct->_freq = PROB_MIN; |
1576 | for (uint i = 1; i < uct->num_preds(); i++) { |
1577 | Block *pb = get_block_for_node(uct->pred(i)); |
1578 | if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) { |
1579 | worklist.push(pb); |
1580 | } |
1581 | } |
1582 | } |
1583 | } |
1584 | |
1585 | #ifdef ASSERT |
1586 | for (uint i = 0; i < number_of_blocks(); i++) { |
1587 | Block* b = get_block(i); |
1588 | assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency" ); |
1589 | } |
1590 | #endif |
1591 | |
1592 | #ifndef PRODUCT |
1593 | if (PrintCFGBlockFreq) { |
1594 | tty->print_cr("CFG Block Frequencies" ); |
1595 | _root_loop->dump_tree(); |
1596 | if (Verbose) { |
1597 | tty->print_cr("PhaseCFG dump" ); |
1598 | dump(); |
1599 | tty->print_cr("Node dump" ); |
1600 | _root->dump(99999); |
1601 | } |
1602 | } |
1603 | #endif |
1604 | } |
1605 | |
1606 | //----------------------------create_loop_tree-------------------------------- |
1607 | // Create a loop tree from the CFG |
1608 | CFGLoop* PhaseCFG::create_loop_tree() { |
1609 | |
1610 | #ifdef ASSERT |
1611 | assert(get_block(0) == get_root_block(), "first block should be root block" ); |
1612 | for (uint i = 0; i < number_of_blocks(); i++) { |
1613 | Block* block = get_block(i); |
// Check that the _loop fields are clear...we could clear them if not.
1615 | assert(block->_loop == NULL, "clear _loop expected" ); |
1616 | // Sanity check that the RPO numbering is reflected in the _blocks array. |
1617 | // It doesn't have to be for the loop tree to be built, but if it is not, |
// then the blocks have been reordered since dom graph building...which
// calls the RPO numbering into question.
1620 | assert(block->_rpo == i, "unexpected reverse post order number" ); |
1621 | } |
1622 | #endif |
1623 | |
1624 | int idct = 0; |
1625 | CFGLoop* root_loop = new CFGLoop(idct++); |
1626 | |
1627 | Block_List worklist; |
1628 | |
1629 | // Assign blocks to loops |
1630 | for(uint i = number_of_blocks() - 1; i > 0; i-- ) { // skip Root block |
1631 | Block* block = get_block(i); |
1632 | |
1633 | if (block->head()->is_Loop()) { |
1634 | Block* loop_head = block; |
1635 | assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors" ); |
1636 | Node* tail_n = loop_head->pred(LoopNode::LoopBackControl); |
1637 | Block* tail = get_block_for_node(tail_n); |
1638 | |
1639 | // Defensively filter out Loop nodes for non-single-entry loops. |
1640 | // For all reasonable loops, the head occurs before the tail in RPO. |
1641 | if (i <= tail->_rpo) { |
1642 | |
1643 | // The tail and (recursive) predecessors of the tail |
1644 | // are made members of a new loop. |
1645 | |
1646 | assert(worklist.size() == 0, "nonempty worklist" ); |
1647 | CFGLoop* nloop = new CFGLoop(idct++); |
1648 | assert(loop_head->_loop == NULL, "just checking" ); |
1649 | loop_head->_loop = nloop; |
1650 | // Add to nloop so push_pred() will skip over inner loops |
1651 | nloop->add_member(loop_head); |
1652 | nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this); |
1653 | |
1654 | while (worklist.size() > 0) { |
1655 | Block* member = worklist.pop(); |
1656 | if (member != loop_head) { |
1657 | for (uint j = 1; j < member->num_preds(); j++) { |
1658 | nloop->push_pred(member, j, worklist, this); |
1659 | } |
1660 | } |
1661 | } |
1662 | } |
1663 | } |
1664 | } |
1665 | |
1666 | // Create a member list for each loop consisting |
1667 | // of both blocks and (immediate child) loops. |
1668 | for (uint i = 0; i < number_of_blocks(); i++) { |
1669 | Block* block = get_block(i); |
1670 | CFGLoop* lp = block->_loop; |
1671 | if (lp == NULL) { |
1672 | // Not assigned to a loop. Add it to the method's pseudo loop. |
1673 | block->_loop = root_loop; |
1674 | lp = root_loop; |
1675 | } |
1676 | if (lp == root_loop || block != lp->head()) { // loop heads are already members |
1677 | lp->add_member(block); |
1678 | } |
1679 | if (lp != root_loop) { |
1680 | if (lp->parent() == NULL) { |
1681 | // Not a nested loop. Make it a child of the method's pseudo loop. |
1682 | root_loop->add_nested_loop(lp); |
1683 | } |
1684 | if (block == lp->head()) { |
1685 | // Add nested loop to member list of parent loop. |
1686 | lp->parent()->add_member(lp); |
1687 | } |
1688 | } |
1689 | } |
1690 | |
1691 | return root_loop; |
1692 | } |
1693 | |
1694 | //------------------------------push_pred-------------------------------------- |
1695 | void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) { |
1696 | Node* pred_n = blk->pred(i); |
1697 | Block* pred = cfg->get_block_for_node(pred_n); |
1698 | CFGLoop *pred_loop = pred->_loop; |
1699 | if (pred_loop == NULL) { |
1700 | // Filter out blocks for non-single-entry loops. |
1701 | // For all reasonable loops, the head occurs before the tail in RPO. |
1702 | if (pred->_rpo > head()->_rpo) { |
1703 | pred->_loop = this; |
1704 | worklist.push(pred); |
1705 | } |
1706 | } else if (pred_loop != this) { |
1707 | // Nested loop. |
1708 | while (pred_loop->_parent != NULL && pred_loop->_parent != this) { |
1709 | pred_loop = pred_loop->_parent; |
1710 | } |
1711 | // Make pred's loop be a child |
1712 | if (pred_loop->_parent == NULL) { |
1713 | add_nested_loop(pred_loop); |
1714 | // Continue with loop entry predecessor. |
1715 | Block* pred_head = pred_loop->head(); |
1716 | assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors" ); |
1717 | assert(pred_head != head(), "loop head in only one loop" ); |
1718 | push_pred(pred_head, LoopNode::EntryControl, worklist, cfg); |
1719 | } else { |
1720 | assert(pred_loop->_parent == this && _parent == NULL, "just checking" ); |
1721 | } |
1722 | } |
1723 | } |
1724 | |
1725 | //------------------------------add_nested_loop-------------------------------- |
1726 | // Make cl a child of the current loop in the loop tree. |
1727 | void CFGLoop::add_nested_loop(CFGLoop* cl) { |
1728 | assert(_parent == NULL, "no parent yet" ); |
1729 | assert(cl != this, "not my own parent" ); |
1730 | cl->_parent = this; |
1731 | CFGLoop* ch = _child; |
1732 | if (ch == NULL) { |
1733 | _child = cl; |
1734 | } else { |
1735 | while (ch->_sibling != NULL) { ch = ch->_sibling; } |
1736 | ch->_sibling = cl; |
1737 | } |
1738 | } |
1739 | |
1740 | //------------------------------compute_loop_depth----------------------------- |
1741 | // Store the loop depth in each CFGLoop object. |
1742 | // Recursively walk the children to do the same for them. |
1743 | void CFGLoop::compute_loop_depth(int depth) { |
1744 | _depth = depth; |
1745 | CFGLoop* ch = _child; |
1746 | while (ch != NULL) { |
1747 | ch->compute_loop_depth(depth + 1); |
1748 | ch = ch->_sibling; |
1749 | } |
1750 | } |
1751 | |
1752 | //------------------------------compute_freq----------------------------------- |
1753 | // Compute the frequency of each block and loop, relative to a single entry |
1754 | // into the dominating loop head. |
1755 | void CFGLoop::compute_freq() { |
1756 | // Bottom up traversal of loop tree (visit inner loops first.) |
1757 | // Set loop head frequency to 1.0, then transitively |
1758 | // compute frequency for all successors in the loop, |
1759 | // as well as for each exit edge. Inner loops are |
1760 | // treated as single blocks with loop exit targets |
1761 | // as the successor blocks. |
1762 | |
1763 | // Nested loops first |
1764 | CFGLoop* ch = _child; |
1765 | while (ch != NULL) { |
1766 | ch->compute_freq(); |
1767 | ch = ch->_sibling; |
1768 | } |
1769 | assert (_members.length() > 0, "no empty loops" ); |
1770 | Block* hd = head(); |
1771 | hd->_freq = 1.0; |
1772 | for (int i = 0; i < _members.length(); i++) { |
1773 | CFGElement* s = _members.at(i); |
1774 | double freq = s->_freq; |
1775 | if (s->is_block()) { |
1776 | Block* b = s->as_Block(); |
1777 | for (uint j = 0; j < b->_num_succs; j++) { |
1778 | Block* sb = b->_succs[j]; |
1779 | update_succ_freq(sb, freq * b->succ_prob(j)); |
1780 | } |
1781 | } else { |
1782 | CFGLoop* lp = s->as_CFGLoop(); |
1783 | assert(lp->_parent == this, "immediate child" ); |
1784 | for (int k = 0; k < lp->_exits.length(); k++) { |
1785 | Block* eb = lp->_exits.at(k).get_target(); |
1786 | double prob = lp->_exits.at(k).get_prob(); |
1787 | update_succ_freq(eb, freq * prob); |
1788 | } |
1789 | } |
1790 | } |
1791 | |
1792 | // For all loops other than the outer, "method" loop, |
1793 | // sum and normalize the exit probability. The "method" loop |
1794 | // should keep the initial exit probability of 1, so that |
1795 | // inner blocks do not get erroneously scaled. |
1796 | if (_depth != 0) { |
1797 | // Total the exit probabilities for this loop. |
1798 | double exits_sum = 0.0f; |
1799 | for (int i = 0; i < _exits.length(); i++) { |
1800 | exits_sum += _exits.at(i).get_prob(); |
1801 | } |
1802 | |
1803 | // Normalize the exit probabilities. Until now, the |
1804 | // probabilities estimate the possibility of exit per |
1805 | // a single loop iteration; afterward, they estimate |
1806 | // the probability of exit per loop entry. |
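// For example, if a loop has two exits whose per-iteration probabilities are
// 0.05 and 0.15, exits_sum is 0.2 and the normalized per-entry probabilities
// become 0.25 and 0.75 respectively.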
1807 | for (int i = 0; i < _exits.length(); i++) { |
1808 | Block* et = _exits.at(i).get_target(); |
1809 | float new_prob = 0.0f; |
1810 | if (_exits.at(i).get_prob() > 0.0f) { |
1811 | new_prob = _exits.at(i).get_prob() / exits_sum; |
1812 | } |
1813 | BlockProbPair bpp(et, new_prob); |
1814 | _exits.at_put(i, bpp); |
1815 | } |
1816 | |
1817 | // Save the total, but guard against unreasonable probability, |
1818 | // as the value is used to estimate the loop trip count. |
1819 | // An infinite trip count would blur relative block |
1820 | // frequencies. |
1821 | if (exits_sum > 1.0f) exits_sum = 1.0; |
1822 | if (exits_sum < PROB_MIN) exits_sum = PROB_MIN; |
1823 | _exit_prob = exits_sum; |
1824 | } |
1825 | } |
1826 | |
1827 | //------------------------------succ_prob------------------------------------- |
1828 | // Determine the probability of reaching successor 'i' from the receiver block. |
1829 | float Block::succ_prob(uint i) { |
1830 | int eidx = end_idx(); |
1831 | Node *n = get_node(eidx); // Get ending Node |
1832 | |
1833 | int op = n->Opcode(); |
1834 | if (n->is_Mach()) { |
1835 | if (n->is_MachNullCheck()) { |
1836 | // Can only reach here if called after lcm. The original Op_If is gone, |
1837 | // so we attempt to infer the probability from one or both of the |
1838 | // successor blocks. |
1839 | assert(_num_succs == 2, "expecting 2 successors of a null check" ); |
1840 | // If either successor has only one predecessor, then the |
1841 | // probability estimate can be derived using the |
1842 | // relative frequency of the successor and this block. |
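// (A successor with num_preds() == 2 has this block as its only predecessor,
// so all of its frequency flowed from here.)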
1843 | if (_succs[i]->num_preds() == 2) { |
1844 | return _succs[i]->_freq / _freq; |
1845 | } else if (_succs[1-i]->num_preds() == 2) { |
1846 | return 1 - (_succs[1-i]->_freq / _freq); |
1847 | } else { |
1848 | // Estimate using both successor frequencies |
1849 | float freq = _succs[i]->_freq; |
1850 | return freq / (freq + _succs[1-i]->_freq); |
1851 | } |
1852 | } |
1853 | op = n->as_Mach()->ideal_Opcode(); |
1854 | } |
1855 | |
1856 | |
1857 | // Switch on branch type |
1858 | switch( op ) { |
1859 | case Op_CountedLoopEnd: |
1860 | case Op_If: { |
1861 | assert (i < 2, "just checking" ); |
1862 | // Conditionals pass on only part of their frequency |
1863 | float prob = n->as_MachIf()->_prob; |
1864 | assert(prob >= 0.0 && prob <= 1.0, "out of range probability" ); |
1865 | // If succ[i] is the FALSE branch, invert path info |
1866 | if( get_node(i + eidx + 1)->Opcode() == Op_IfFalse ) { |
1867 | return 1.0f - prob; // not taken |
1868 | } else { |
1869 | return prob; // taken |
1870 | } |
1871 | } |
1872 | |
1873 | case Op_Jump: |
1874 | return n->as_MachJump()->_probs[get_node(i + eidx + 1)->as_JumpProj()->_con]; |
1875 | |
1876 | case Op_Catch: { |
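// Exception edges are presumed uniformly unlikely. With _num_succs == 2, for
// example, the fall-through path gets 1 - 2*PROB_UNLIKELY_MAG(5) and the
// single exceptional path gets PROB_UNLIKELY_MAG(5).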
1877 | const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj(); |
1878 | if (ci->_con == CatchProjNode::fall_through_index) { |
1879 | // Fall-thru path gets the lion's share. |
1880 | return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs; |
1881 | } else { |
1882 | // Presume exceptional paths are equally unlikely |
1883 | return PROB_UNLIKELY_MAG(5); |
1884 | } |
1885 | } |
1886 | |
1887 | case Op_Root: |
1888 | case Op_Goto: |
1889 | // Pass frequency straight thru to target |
1890 | return 1.0f; |
1891 | |
1892 | case Op_NeverBranch: |
1893 | return 0.0f; |
1894 | |
1895 | case Op_TailCall: |
1896 | case Op_TailJump: |
1897 | case Op_Return: |
1898 | case Op_Halt: |
1899 | case Op_Rethrow: |
1900 | // Do not push out freq to root block |
1901 | return 0.0f; |
1902 | |
1903 | default: |
1904 | ShouldNotReachHere(); |
1905 | } |
1906 | |
1907 | return 0.0f; |
1908 | } |
1909 | |
1910 | //------------------------------num_fall_throughs----------------------------- |
1911 | // Return the number of fall-through candidates for a block |
1912 | int Block::num_fall_throughs() { |
1913 | int eidx = end_idx(); |
1914 | Node *n = get_node(eidx); // Get ending Node |
1915 | |
1916 | int op = n->Opcode(); |
1917 | if (n->is_Mach()) { |
1918 | if (n->is_MachNullCheck()) { |
// In theory, either side can fall through; for simplicity's sake,
// let's say only the false branch can now.
1921 | return 1; |
1922 | } |
1923 | op = n->as_Mach()->ideal_Opcode(); |
1924 | } |
1925 | |
1926 | // Switch on branch type |
1927 | switch( op ) { |
1928 | case Op_CountedLoopEnd: |
1929 | case Op_If: |
1930 | return 2; |
1931 | |
1932 | case Op_Root: |
1933 | case Op_Goto: |
1934 | return 1; |
1935 | |
1936 | case Op_Catch: { |
1937 | for (uint i = 0; i < _num_succs; i++) { |
1938 | const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj(); |
1939 | if (ci->_con == CatchProjNode::fall_through_index) { |
1940 | return 1; |
1941 | } |
1942 | } |
1943 | return 0; |
1944 | } |
1945 | |
1946 | case Op_Jump: |
1947 | case Op_NeverBranch: |
1948 | case Op_TailCall: |
1949 | case Op_TailJump: |
1950 | case Op_Return: |
1951 | case Op_Halt: |
1952 | case Op_Rethrow: |
1953 | return 0; |
1954 | |
1955 | default: |
1956 | ShouldNotReachHere(); |
1957 | } |
1958 | |
1959 | return 0; |
1960 | } |
1961 | |
1962 | //------------------------------succ_fall_through----------------------------- |
1963 | // Return true if a specific successor could be fall-through target. |
1964 | bool Block::succ_fall_through(uint i) { |
1965 | int eidx = end_idx(); |
1966 | Node *n = get_node(eidx); // Get ending Node |
1967 | |
1968 | int op = n->Opcode(); |
1969 | if (n->is_Mach()) { |
1970 | if (n->is_MachNullCheck()) { |
// In theory, either side can fall through; for simplicity's sake,
// let's say only the false branch can now.
1973 | return get_node(i + eidx + 1)->Opcode() == Op_IfFalse; |
1974 | } |
1975 | op = n->as_Mach()->ideal_Opcode(); |
1976 | } |
1977 | |
1978 | // Switch on branch type |
1979 | switch( op ) { |
1980 | case Op_CountedLoopEnd: |
1981 | case Op_If: |
1982 | case Op_Root: |
1983 | case Op_Goto: |
1984 | return true; |
1985 | |
1986 | case Op_Catch: { |
1987 | const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj(); |
1988 | return ci->_con == CatchProjNode::fall_through_index; |
1989 | } |
1990 | |
1991 | case Op_Jump: |
1992 | case Op_NeverBranch: |
1993 | case Op_TailCall: |
1994 | case Op_TailJump: |
1995 | case Op_Return: |
1996 | case Op_Halt: |
1997 | case Op_Rethrow: |
1998 | return false; |
1999 | |
2000 | default: |
2001 | ShouldNotReachHere(); |
2002 | } |
2003 | |
2004 | return false; |
2005 | } |
2006 | |
2007 | //------------------------------update_uncommon_branch------------------------ |
2008 | // Update the probability of a two-branch to be uncommon |
2009 | void Block::update_uncommon_branch(Block* ub) { |
2010 | int eidx = end_idx(); |
2011 | Node *n = get_node(eidx); // Get ending Node |
2012 | |
2013 | int op = n->as_Mach()->ideal_Opcode(); |
2014 | |
assert(op == Op_CountedLoopEnd || op == Op_If, "must be an If" );
2016 | assert(num_fall_throughs() == 2, "must be a two way branch block" ); |
2017 | |
2018 | // Which successor is ub? |
2019 | uint s; |
for (s = 0; s < _num_succs; s++) {
2021 | if (_succs[s] == ub) break; |
2022 | } |
2023 | assert(s < 2, "uncommon successor must be found" ); |
2024 | |
// If ub is the true path, make the probability small; else
// ub is the false path, so make the probability large.
2027 | bool invert = (get_node(s + eidx + 1)->Opcode() == Op_IfFalse); |
2028 | |
2029 | // Get existing probability |
2030 | float p = n->as_MachIf()->_prob; |
2031 | |
2032 | if (invert) p = 1.0 - p; |
2033 | if (p > PROB_MIN) { |
2034 | p = PROB_MIN; |
2035 | } |
2036 | if (invert) p = 1.0 - p; |
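// Example: if ub is on the false path and the taken probability was 0.3, the
// false-path probability 0.7 is clamped to PROB_MIN above, leaving a stored
// taken probability of 1 - PROB_MIN.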
2037 | |
2038 | n->as_MachIf()->_prob = p; |
2039 | } |
2040 | |
2041 | //------------------------------update_succ_freq------------------------------- |
2042 | // Update the appropriate frequency associated with block 'b', a successor of |
2043 | // a block in this loop. |
2044 | void CFGLoop::update_succ_freq(Block* b, double freq) { |
2045 | if (b->_loop == this) { |
2046 | if (b == head()) { |
2047 | // back branch within the loop |
// Do nothing now; the loop-carried frequency will be
// adjusted later in scale_freq().
2050 | } else { |
2051 | // simple branch within the loop |
2052 | b->_freq += freq; |
2053 | } |
2054 | } else if (!in_loop_nest(b)) { |
2055 | // branch is exit from this loop |
2056 | BlockProbPair bpp(b, freq); |
2057 | _exits.append(bpp); |
2058 | } else { |
2059 | // branch into nested loop |
2060 | CFGLoop* ch = b->_loop; |
2061 | ch->_freq += freq; |
2062 | } |
2063 | } |
2064 | |
2065 | //------------------------------in_loop_nest----------------------------------- |
2066 | // Determine if block b is in the receiver's loop nest. |
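// More precisely: returns true if b's loop is at the same depth as the
// receiver, or if the receiver is an ancestor of b's loop in the loop tree.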
2067 | bool CFGLoop::in_loop_nest(Block* b) { |
2068 | int depth = _depth; |
2069 | CFGLoop* b_loop = b->_loop; |
2070 | int b_depth = b_loop->_depth; |
2071 | if (depth == b_depth) { |
2072 | return true; |
2073 | } |
2074 | while (b_depth > depth) { |
2075 | b_loop = b_loop->_parent; |
2076 | b_depth = b_loop->_depth; |
2077 | } |
2078 | return b_loop == this; |
2079 | } |
2080 | |
2081 | //------------------------------scale_freq------------------------------------- |
2082 | // Scale frequency of loops and blocks by trip counts from outer loops |
2083 | // Do a top down traversal of loop tree (visit outer loops first.) |
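// Multiplying a loop's entry frequency by trip_count() (estimated from the
// normalized exit probability computed in compute_freq()) gives the expected
// execution frequency of the loop head; member blocks and child loops, whose
// frequencies are relative to a single pass through the head, are then scaled
// by that product.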
2084 | void CFGLoop::scale_freq() { |
2085 | double loop_freq = _freq * trip_count(); |
2086 | _freq = loop_freq; |
2087 | for (int i = 0; i < _members.length(); i++) { |
2088 | CFGElement* s = _members.at(i); |
2089 | double block_freq = s->_freq * loop_freq; |
2090 | if (g_isnan(block_freq) || block_freq < MIN_BLOCK_FREQUENCY) |
2091 | block_freq = MIN_BLOCK_FREQUENCY; |
2092 | s->_freq = block_freq; |
2093 | } |
2094 | CFGLoop* ch = _child; |
2095 | while (ch != NULL) { |
2096 | ch->scale_freq(); |
2097 | ch = ch->_sibling; |
2098 | } |
2099 | } |
2100 | |
// Frequency of the outer loop: the first child loop's frequency, or this
// loop's own frequency if there are no child loops.
2102 | double CFGLoop::outer_loop_freq() const { |
2103 | if (_child != NULL) { |
2104 | return _child->_freq; |
2105 | } |
2106 | return _freq; |
2107 | } |
2108 | |
2109 | #ifndef PRODUCT |
2110 | //------------------------------dump_tree-------------------------------------- |
2111 | void CFGLoop::dump_tree() const { |
2112 | dump(); |
2113 | if (_child != NULL) _child->dump_tree(); |
2114 | if (_sibling != NULL) _sibling->dump_tree(); |
2115 | } |
2116 | |
2117 | //------------------------------dump------------------------------------------- |
2118 | void CFGLoop::dump() const { |
2119 | for (int i = 0; i < _depth; i++) tty->print(" " ); |
2120 | tty->print("%s: %d trip_count: %6.0f freq: %6.0f\n" , |
2121 | _depth == 0 ? "Method" : "Loop" , _id, trip_count(), _freq); |
2122 | for (int i = 0; i < _depth; i++) tty->print(" " ); |
2123 | tty->print(" members:" ); |
2124 | int k = 0; |
2125 | for (int i = 0; i < _members.length(); i++) { |
2126 | if (k++ >= 6) { |
2127 | tty->print("\n " ); |
2128 | for (int j = 0; j < _depth+1; j++) tty->print(" " ); |
2129 | k = 0; |
2130 | } |
2131 | CFGElement *s = _members.at(i); |
2132 | if (s->is_block()) { |
2133 | Block *b = s->as_Block(); |
2134 | tty->print(" B%d(%6.3f)" , b->_pre_order, b->_freq); |
2135 | } else { |
2136 | CFGLoop* lp = s->as_CFGLoop(); |
2137 | tty->print(" L%d(%6.3f)" , lp->_id, lp->_freq); |
2138 | } |
2139 | } |
2140 | tty->print("\n" ); |
2141 | for (int i = 0; i < _depth; i++) tty->print(" " ); |
2142 | tty->print(" exits: " ); |
2143 | k = 0; |
2144 | for (int i = 0; i < _exits.length(); i++) { |
2145 | if (k++ >= 7) { |
2146 | tty->print("\n " ); |
2147 | for (int j = 0; j < _depth+1; j++) tty->print(" " ); |
2148 | k = 0; |
2149 | } |
2150 | Block *blk = _exits.at(i).get_target(); |
2151 | double prob = _exits.at(i).get_prob(); |
2152 | tty->print(" ->%d@%d%%" , blk->_pre_order, (int)(prob*100)); |
2153 | } |
2154 | tty->print("\n" ); |
2155 | } |
2156 | #endif |
2157 | |