superword.cpp source code [OpenJDK/src/hotspot/share/opto/superword.cpp]

1	/*
2	* Copyright (c) 2007, 2018, Oracle and/or its affiliates. All rights reserved.
3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4	*
5	* This code is free software; you can redistribute it and/or modify it
6	* under the terms of the GNU General Public License version 2 only, as
7	* published by the Free Software Foundation.
8	*
9	* This code is distributed in the hope that it will be useful, but WITHOUT
10	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12	* version 2 for more details (a copy is included in the LICENSE file that
13	* accompanied this code).
14	*
15	* You should have received a copy of the GNU General Public License version
16	* 2 along with this work; if not, write to the Free Software Foundation,
17	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18	*
19	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20	* or visit www.oracle.com if you need additional information or have any
21	* questions.
22	*/
23
24	#include "precompiled.hpp"
25	#include "compiler/compileLog.hpp"
26	#include "libadt/vectset.hpp"
27	#include "memory/allocation.inline.hpp"
28	#include "memory/resourceArea.hpp"
29	#include "opto/addnode.hpp"
30	#include "opto/callnode.hpp"
31	#include "opto/castnode.hpp"
32	#include "opto/convertnode.hpp"
33	#include "opto/divnode.hpp"
34	#include "opto/matcher.hpp"
35	#include "opto/memnode.hpp"
36	#include "opto/mulnode.hpp"
37	#include "opto/opcodes.hpp"
38	#include "opto/opaquenode.hpp"
39	#include "opto/superword.hpp"
40	#include "opto/vectornode.hpp"
41	#include "opto/movenode.hpp"
42
43	//
44	// S U P E R W O R D T R A N S F O R M
45	//=============================================================================
46
47	//------------------------------SuperWord---------------------------
48	SuperWord::SuperWord(PhaseIdealLoop* phase) :
49	_phase(phase),
50	_arena(phase->C->comp_arena()),
51	_igvn(phase->_igvn),
52	_packset (arena(), `8`, `0`, NULL), // packs for the current block
53	_bb_idx (arena(), (int)(`1.10` * phase->C->unique()), `0`, `0`), // node idx to index in bb
54	_block (arena(), `8`, `0`, NULL), // nodes in current block
55	_post_block (arena(), `8`, `0`, NULL), // nodes common to current block which are marked as post loop vectorizable
56	_data_entry (arena(), `8`, `0`, NULL), // nodes with all inputs from outside
57	_mem_slice_head (arena(), `8`, `0`, NULL), // memory slice heads
58	_mem_slice_tail (arena(), `8`, `0`, NULL), // memory slice tails
59	_node_info (arena(), `8`, `0`, SWNodeInfo::initial), // info needed per node
60	_clone_map(phase->C->clone_map()), // map of nodes created in cloning
61	_cmovev_kit (_arena, this), // map to facilitate CMoveV creation
62	_align_to_ref(NULL), // memory reference to align vectors to
63	_disjoint_ptrs (arena(), `8`, `0`, OrderedPair::initial), // runtime disambiguated pointer pairs
64	_dg (_arena), // dependence graph
65	_visited (arena()), // visited node set
66	_post_visited (arena()), // post visited node set
67	_n_idx_list (arena(), `8`), // scratch list of (node,index) pairs
68	_nlist (arena(), `8`, `0`, NULL), // scratch list of nodes
69	_stk (arena(), `8`, `0`, NULL), // scratch stack of nodes
70	_lpt(NULL), // loop tree node
71	_lp(NULL), // LoopNode
72	_bb(NULL), // basic block
73	_iv(NULL), // induction var
74	_race_possible(false), // cases where SDMU is true
75	_early_return(true), // analysis evaluations routine
76	_do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
77	_do_reserve_copy(DoReserveCopyInSuperWord),
78	_num_work_vecs(`0`), // amount of vector work we have
79	_num_reductions(`0`), // amount of reduction work we have
80	_ii_first(-`1`), // first loop generation index - only if do_vector_loop()
81	_ii_last(-`1`), // last loop generation index - only if do_vector_loop()
82	_ii_order (arena(), `8`, `0`, `0`)
83	{
84	#ifndef PRODUCT
85	_vector_loop_debug = `0`;
86	if (_phase->C->method() != NULL) {
87	_vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
88	}
89
90	#endif
91	}
92
93	//------------------------------transform_loop---------------------------
94	void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
95	assert(UseSuperWord, "should be");
96	// Do vectors exist on this architecture?
97	if (Matcher::vector_width_in_bytes(T_BYTE) < `2`) return;
98
99	assert(lpt->_head->is_CountedLoop(), "must be");
100	CountedLoopNode *cl = lpt->_head->as_CountedLoop();
101
102	if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
103
104	bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
105	if (post_loop_allowed) {
106	if (cl->is_reduction_loop()) return; // no predication mapping
107	Node *limit = cl->limit();
108	if (limit->is_Con()) return; // non constant limits only
109	// Now check the limit for expressions we do not handle
110	if (limit->is_Add()) {
111	Node *in2 = limit->in(`2`);
112	if (in2->is_Con()) {
113	int val = in2->get_int();
114	// should not try to program these cases
115	if (val < `0`) return;
116	}
117	}
118	}
119
120	// skip any loop that has not been assigned max unroll by analysis
121	if (do_optimization) {
122	if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == `0`) return;
123	}
124
125	// Check for no control flow in body (other than exit)
126	Node *cl_exit = cl->loopexit();
127	if (cl->is_main_loop() && (cl_exit->in(`0`) != lpt->_head)) {
128	#ifndef PRODUCT
129	if (TraceSuperWord) {
130	tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
131	tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
132	tty->print("cl_exit->in(0) %d", cl_exit->in(`0`)->_idx); cl_exit->in(`0`)->dump();
133	tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
134	lpt->dump_head();
135	}
136	#endif
137	return;
138	}
139
140	// Make sure the are no extra control users of the loop backedge
141	if (cl->back_control()->outcnt() != `1`) {
142	return;
143	}
144
145	// Skip any loops already optimized by slp
146	if (cl->is_vectorized_loop()) return;
147
148	if (cl->is_unroll_only()) return;
149
150	if (cl->is_main_loop()) {
151	// Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
152	CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
153	if (pre_end == NULL) return;
154	Node *pre_opaq1 = pre_end->limit();
155	if (pre_opaq1->Opcode() != Op_Opaque1) return;
156	}
157
158	init(); // initialize data structures
159
160	set_lpt(lpt);
161	set_lp(cl);
162
163	// For now, define one block which is the entire loop body
164	set_bb(cl);
165
166	if (do_optimization) {
167	assert(_packset.length() == `0`, "packset must be empty");
168	SLP_extract();
169	if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
170	if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
171	IdealLoopTree *lpt_next = lpt->_next;
172	CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
173	_phase->has_range_checks(lpt_next);
174	if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
175	if (!cl_next->is_vectorized_loop()) {
176	int slp_max_unroll_factor = cl->slp_max_unroll();
177	cl_next->set_slp_max_unroll(slp_max_unroll_factor);
178	}
179	}
180	}
181	}
182	}
183	}
184
185	//------------------------------early unrolling analysis------------------------------
186	void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
187	bool is_slp = true;
188	ResourceMark rm;
189	size_t ignored_size = lpt()->_body.size();
190	int ignored_loop_nodes = NEW_RESOURCE_ARRAY(int*, ignored_size);
191	Node_Stack nstack((int)ignored_size);
192	CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
193	Node *cl_exit = cl->loopexit_or_null();
194	int rpo_idx = _post_block.length();
195
196	assert(rpo_idx == `0`, "post loop block is empty");
197
198	// First clear the entries
199	for (uint i = `0`; i < lpt()->_body.size(); i++) {
200	ignored_loop_nodes[i] = -`1`;
201	}
202
203	int max_vector = Matcher::max_vector_size(T_BYTE);
204	bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
205
206	// Process the loop, some/all of the stack entries will not be in order, ergo
207	// need to preprocess the ignored initial state before we process the loop
208	for (uint i = `0`; i < lpt()->_body.size(); i++) {
209	Node* n = lpt()->_body.at(i);
210	if (n == cl->incr() \|\|
211	n->is_reduction() \|\|
212	n->is_AddP() \|\|
213	n->is_Cmp() \|\|
214	n->is_IfTrue() \|\|
215	n->is_CountedLoop() \|\|
216	(n == cl_exit)) {
217	ignored_loop_nodes[i] = n->_idx;
218	continue;
219	}
220
221	if (n->is_If()) {
222	IfNode *iff = n->as_If();
223	if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
224	if (lpt()->is_loop_exit(iff)) {
225	ignored_loop_nodes[i] = n->_idx;
226	continue;
227	}
228	}
229	}
230
231	if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
232	Node* n_tail = n->in(LoopNode::LoopBackControl);
233	if (n_tail != n->in(LoopNode::EntryControl)) {
234	if (!n_tail->is_Mem()) {
235	is_slp = false;
236	break;
237	}
238	}
239	}
240
241	// This must happen after check of phi/if
242	if (n->is_Phi() \|\| n->is_If()) {
243	ignored_loop_nodes[i] = n->_idx;
244	continue;
245	}
246
247	if (n->is_LoadStore() \|\| n->is_MergeMem() \|\|
248	(n->is_Proj() && !n->as_Proj()->is_CFG())) {
249	is_slp = false;
250	break;
251	}
252
253	// Ignore nodes with non-primitive type.
254	BasicType bt;
255	if (n->is_Mem()) {
256	bt = n->as_Mem()->memory_type();
257	} else {
258	bt = n->bottom_type()->basic_type();
259	}
260	if (is_java_primitive(bt) == false) {
261	ignored_loop_nodes[i] = n->_idx;
262	continue;
263	}
264
265	if (n->is_Mem()) {
266	MemNode* current = n->as_Mem();
267	Node* adr = n->in(MemNode::Address);
268	Node* n_ctrl = _phase->get_ctrl(adr);
269
270	// save a queue of post process nodes
271	if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
272	// Process the memory expression
273	int stack_idx = `0`;
274	bool have_side_effects = true;
275	if (adr->is_AddP() == false) {
276	nstack.push(adr, stack_idx++);
277	} else {
278	// Mark the components of the memory operation in nstack
279	SWPointer p1(current, this, &nstack, true);
280	have_side_effects = p1.node_stack()->is_nonempty();
281	}
282
283	// Process the pointer stack
284	while (have_side_effects) {
285	Node* pointer_node = nstack.node();
286	for (uint j = `0`; j < lpt()->_body.size(); j++) {
287	Node* cur_node = lpt()->_body.at(j);
288	if (cur_node == pointer_node) {
289	ignored_loop_nodes[j] = cur_node->_idx;
290	break;
291	}
292	}
293	nstack.pop();
294	have_side_effects = nstack.is_nonempty();
295	}
296	}
297	}
298	}
299
300	if (is_slp) {
301	// Now we try to find the maximum supported consistent vector which the machine
302	// description can use
303	bool small_basic_type = false;
304	bool flag_small_bt = false;
305	for (uint i = `0`; i < lpt()->_body.size(); i++) {
306	if (ignored_loop_nodes[i] != -`1`) continue;
307
308	BasicType bt;
309	Node* n = lpt()->_body.at(i);
310	if (n->is_Mem()) {
311	bt = n->as_Mem()->memory_type();
312	} else {
313	bt = n->bottom_type()->basic_type();
314	}
315
316	if (post_loop_allowed) {
317	if (!small_basic_type) {
318	switch (bt) {
319	case T_CHAR:
320	case T_BYTE:
321	case T_SHORT:
322	small_basic_type = true;
323	break;
324
325	case T_LONG:
326	// TODO: Remove when support completed for mask context with LONG.
327	// Support needs to be augmented for logical qword operations, currently we map to dword
328	// buckets for vectors on logicals as these were legacy.
329	small_basic_type = true;
330	break;
331
332	default:
333	break;
334	}
335	}
336	}
337
338	if (is_java_primitive(bt) == false) continue;
339
340	int cur_max_vector = Matcher::max_vector_size(bt);
341
342	// If a max vector exists which is not larger than _local_loop_unroll_factor
343	// stop looking, we already have the max vector to map to.
344	if (cur_max_vector < local_loop_unroll_factor) {
345	is_slp = false;
346	if (TraceSuperWordLoopUnrollAnalysis) {
347	tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
348	}
349	break;
350	}
351
352	// Map the maximal common vector
353	if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
354	if (cur_max_vector < max_vector && !flag_small_bt) {
355	max_vector = cur_max_vector;
356	} else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
357	// Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
358	// Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
359	// We check for possibility of narrowing by looking through chain operations using subword types.
360	if (is_subword_type(bt)) {
361	uint start, end;
362	VectorNode::vector_operands(n, &start, &end);
363
364	for (uint j = start; j < end; j++) {
365	Node* in = n->in(j);
366	// Don't propagate through a memory
367	if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
368	bool same_type = true;
369	for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
370	Node *use = in->fast_out(k);
371	if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
372	same_type = false;
373	break;
374	}
375	}
376	if (same_type) {
377	max_vector = cur_max_vector;
378	flag_small_bt = true;
379	cl->mark_subword_loop();
380	}
381	}
382	}
383	}
384	}
385	// We only process post loops on predicated targets where we want to
386	// mask map the loop to a single iteration
387	if (post_loop_allowed) {
388	_post_block.at_put_grow(rpo_idx++, n);
389	}
390	}
391	}
392	if (is_slp) {
393	local_loop_unroll_factor = max_vector;
394	cl->mark_passed_slp();
395	}
396	cl->mark_was_slp();
397	if (cl->is_main_loop()) {
398	cl->set_slp_max_unroll(local_loop_unroll_factor);
399	} else if (post_loop_allowed) {
400	if (!small_basic_type) {
401	// avoid replication context for small basic types in programmable masked loops
402	cl->set_slp_max_unroll(local_loop_unroll_factor);
403	}
404	}
405	}
406	}
407
408	//------------------------------SLP_extract---------------------------
409	// Extract the superword level parallelism
410	//
411	// 1) A reverse post-order of nodes in the block is constructed. By scanning
412	// this list from first to last, all definitions are visited before their uses.
413	//
414	// 2) A point-to-point dependence graph is constructed between memory references.
415	// This simplifies the upcoming "independence" checker.
416	//
417	// 3) The maximum depth in the node graph from the beginning of the block
418	// to each node is computed. This is used to prune the graph search
419	// in the independence checker.
420	//
421	// 4) For integer types, the necessary bit width is propagated backwards
422	// from stores to allow packed operations on byte, char, and short
423	// integers. This reverses the promotion to type "int" that javac
424	// did for operations like: char c1,c2,c3; c1 = c2 + c3.
425	//
426	// 5) One of the memory references is picked to be an aligned vector reference.
427	// The pre-loop trip count is adjusted to align this reference in the
428	// unrolled body.
429	//
430	// 6) The initial set of pack pairs is seeded with memory references.
431	//
432	// 7) The set of pack pairs is extended by following use->def and def->use links.
433	//
434	// 8) The pairs are combined into vector sized packs.
435	//
436	// 9) Reorder the memory slices to co-locate members of the memory packs.
437	//
438	// 10) Generate ideal vector nodes for the final set of packs and where necessary,
439	// inserting scalar promotion, vector creation from multiple scalars, and
440	// extraction of scalar values from vectors.
441	//
442	void SuperWord::SLP_extract() {
443
444	#ifndef PRODUCT
445	if (_do_vector_loop && TraceSuperWord) {
446	tty->print("SuperWord::SLP_extract\n");
447	tty->print("input loop\n");
448	_lpt->dump_head();
449	_lpt->dump();
450	for (uint i = `0`; i < _lpt->_body.size(); i++) {
451	_lpt->_body.at(i)->dump();
452	}
453	}
454	#endif
455	// Ready the block
456	if (!construct_bb()) {
457	return; // Exit if no interesting nodes or complex graph.
458	}
459
460	// build _dg, _disjoint_ptrs
461	dependence_graph();
462
463	// compute function depth(Node)*
464	compute_max_depth();
465
466	CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
467	bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
468	if (cl->is_main_loop()) {
469	if (_do_vector_loop) {
470	if (mark_generations() != -`1`) {
471	hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
472
473	if (!construct_bb()) {
474	return; // Exit if no interesting nodes or complex graph.
475	}
476	dependence_graph();
477	compute_max_depth();
478	}
479
480	#ifndef PRODUCT
481	if (TraceSuperWord) {
482	tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
483	_lpt->dump_head();
484	for (int j = `0`; j < _block.length(); j++) {
485	Node* n = _block.at(j);
486	int d = depth(n);
487	for (int i = `0`; i < d; i++) tty->print("%s", " ");
488	tty->print("%d :", d);
489	n->dump();
490	}
491	}
492	#endif
493	}
494
495	compute_vector_element_type();
496
497	// Attempt vectorization
498
499	find_adjacent_refs();
500
501	extend_packlist();
502
503	if (_do_vector_loop) {
504	if (_packset.length() == `0`) {
505	if (TraceSuperWord) {
506	tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
507	}
508	pack_parallel();
509	}
510	}
511
512	combine_packs();
513
514	construct_my_pack_map();
515	if (UseVectorCmov) {
516	merge_packs_to_cmovd();
517	}
518
519	filter_packs();
520
521	schedule();
522	} else if (post_loop_allowed) {
523	int saved_mapped_unroll_factor = cl->slp_max_unroll();
524	if (saved_mapped_unroll_factor) {
525	int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
526
527	// now reset the slp_unroll_factor so that we can check the analysis mapped
528	// what the vector loop was mapped to
529	cl->set_slp_max_unroll(`0`);
530
531	// do the analysis on the post loop
532	unrolling_analysis(vector_mapped_unroll_factor);
533
534	// if our analyzed loop is a canonical fit, start processing it
535	if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
536	// now add the vector nodes to packsets
537	for (int i = `0`; i < _post_block.length(); i++) {
538	Node* n = _post_block.at(i);
539	Node_List* singleton = new Node_List ();
540	singleton->push(n);
541	_packset.append(singleton);
542	set_my_pack(n, singleton);
543	}
544
545	// map base types for vector usage
546	compute_vector_element_type();
547	} else {
548	return;
549	}
550	} else {
551	// for some reason we could not map the slp analysis state of the vectorized loop
552	return;
553	}
554	}
555
556	output();
557	}
558
559	//------------------------------find_adjacent_refs---------------------------
560	// Find the adjacent memory references and create pack pairs for them.
561	// This is the initial set of packs that will then be extended by
562	// following use->def and def->use links. The align positions are
563	// assigned relative to the reference "align_to_ref"
564	void SuperWord::find_adjacent_refs() {
565	// Get list of memory operations
566	Node_List memops;
567	for (int i = `0`; i < _block.length(); i++) {
568	Node* n = _block.at(i);
569	if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
570	is_java_primitive(n->as_Mem()->memory_type())) {
571	int align = memory_alignment(n->as_Mem(), `0`);
572	if (align != bottom_align) {
573	memops.push(n);
574	}
575	}
576	}
577
578	Node_List align_to_refs;
579	int best_iv_adjustment = `0`;
580	MemNode* best_align_to_mem_ref = NULL;
581
582	while (memops.size() != `0`) {
583	// Find a memory reference to align to.
584	MemNode* mem_ref = find_align_to_ref(memops);
585	if (mem_ref == NULL) break;
586	align_to_refs.push(mem_ref);
587	int iv_adjustment = get_iv_adjustment(mem_ref);
588
589	if (best_align_to_mem_ref == NULL) {
590	// Set memory reference which is the best from all memory operations
591	// to be used for alignment. The pre-loop trip count is modified to align
592	// this reference to a vector-aligned address.
593	best_align_to_mem_ref = mem_ref;
594	best_iv_adjustment = iv_adjustment;
595	NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
596	}
597
598	SWPointer align_to_ref_p(mem_ref, this, NULL, false);
599	// Set alignment relative to "align_to_ref" for all related memory operations.
600	for (int i = memops.size() - `1`; i >= `0`; i--) {
601	MemNode* s = memops.at(i)->as_Mem();
602	if (isomorphic(s, mem_ref) &&
603	(!_do_vector_loop \|\| same_origin_idx(s, mem_ref))) {
604	SWPointer p2(s, this, NULL, false);
605	if (p2.comparable(align_to_ref_p)) {
606	int align = memory_alignment(s, iv_adjustment);
607	set_alignment(s, align);
608	}
609	}
610	}
611
612	// Create initial pack pairs of memory operations for which
613	// alignment is set and vectors will be aligned.
614	bool create_pack = true;
615	if (memory_alignment(mem_ref, best_iv_adjustment) == `0` \|\| _do_vector_loop) {
616	if (!Matcher::misaligned_vectors_ok() \|\| AlignVector) {
617	int vw = vector_width(mem_ref);
618	int vw_best = vector_width(best_align_to_mem_ref);
619	if (vw > vw_best) {
620	// Do not vectorize a memory access with more elements per vector
621	// if unaligned memory access is not allowed because number of
622	// iterations in pre-loop will be not enough to align it.
623	create_pack = false;
624	} else {
625	SWPointer p2(best_align_to_mem_ref, this, NULL, false);
626	if (align_to_ref_p.invar() != p2.invar()) {
627	// Do not vectorize memory accesses with different invariants
628	// if unaligned memory accesses are not allowed.
629	create_pack = false;
630	}
631	}
632	}
633	} else {
634	if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
635	// Can't allow vectorization of unaligned memory accesses with the
636	// same type since it could be overlapped accesses to the same array.
637	create_pack = false;
638	} else {
639	// Allow independent (different type) unaligned memory operations
640	// if HW supports them.
641	if (!Matcher::misaligned_vectors_ok() \|\| AlignVector) {
642	create_pack = false;
643	} else {
644	// Check if packs of the same memory type but
645	// with a different alignment were created before.
646	for (uint i = `0`; i < align_to_refs.size(); i++) {
647	MemNode* mr = align_to_refs.at(i)->as_Mem();
648	if (mr == mem_ref) {
649	// Skip when we are looking at same memory operation.
650	continue;
651	}
652	if (same_velt_type(mr, mem_ref) &&
653	memory_alignment(mr, iv_adjustment) != `0`)
654	create_pack = false;
655	}
656	}
657	}
658	}
659	if (create_pack) {
660	for (uint i = `0`; i < memops.size(); i++) {
661	Node* s1 = memops.at(i);
662	int align = alignment(s1);
663	if (align == top_align) continue;
664	for (uint j = `0`; j < memops.size(); j++) {
665	Node* s2 = memops.at(j);
666	if (alignment(s2) == top_align) continue;
667	if (s1 != s2 && are_adjacent_refs(s1, s2)) {
668	if (stmts_can_pack(s1, s2, align)) {
669	Node_List* pair = new Node_List ();
670	pair->push(s1);
671	pair->push(s2);
672	if (!_do_vector_loop \|\| same_origin_idx(s1, s2)) {
673	_packset.append(pair);
674	}
675	}
676	}
677	}
678	}
679	} else { // Don't create unaligned pack
680	// First, remove remaining memory ops of the same type from the list.
681	for (int i = memops.size() - `1`; i >= `0`; i--) {
682	MemNode* s = memops.at(i)->as_Mem();
683	if (same_velt_type(s, mem_ref)) {
684	memops.remove(i);
685	}
686	}
687
688	// Second, remove already constructed packs of the same type.
689	for (int i = _packset.length() - `1`; i >= `0`; i--) {
690	Node_List* p = _packset.at(i);
691	MemNode* s = p->at(`0`)->as_Mem();
692	if (same_velt_type(s, mem_ref)) {
693	remove_pack_at(i);
694	}
695	}
696
697	// If needed find the best memory reference for loop alignment again.
698	if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
699	// Put memory ops from remaining packs back on memops list for
700	// the best alignment search.
701	uint orig_msize = memops.size();
702	for (int i = `0`; i < _packset.length(); i++) {
703	Node_List* p = _packset.at(i);
704	MemNode* s = p->at(`0`)->as_Mem();
705	assert(!same_velt_type(s, mem_ref), "sanity");
706	memops.push(s);
707	}
708	best_align_to_mem_ref = find_align_to_ref(memops);
709	if (best_align_to_mem_ref == NULL) {
710	if (TraceSuperWord) {
711	tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");
712	}
713	break;
714	}
715	best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
716	NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
717	// Restore list.
718	while (memops.size() > orig_msize)
719	(void)memops.pop();
720	}
721	} // unaligned memory accesses
722
723	// Remove used mem nodes.
724	for (int i = memops.size() - `1`; i >= `0`; i--) {
725	MemNode* m = memops.at(i)->as_Mem();
726	if (alignment(m) != top_align) {
727	memops.remove(i);
728	}
729	}
730
731	} // while (memops.size() != 0
732	set_align_to_ref(best_align_to_mem_ref);
733
734	if (TraceSuperWord) {
735	tty->print_cr("\nAfter find_adjacent_refs");
736	print_packset();
737	}
738	}
739
740	#ifndef PRODUCT
741	void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
742	if (is_trace_adjacent()) {
743	tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
744	best_align_to_mem_ref->_idx, best_iv_adjustment);
745	best_align_to_mem_ref->dump();
746	}
747	}
748	#endif
749
750	//------------------------------find_align_to_ref---------------------------
751	// Find a memory reference to align the loop induction variable to.
752	// Looks first at stores then at loads, looking for a memory reference
753	// with the largest number of references similar to it.
754	MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
755	GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), `0`);
756
757	// Count number of comparable memory ops
758	for (uint i = `0`; i < memops.size(); i++) {
759	MemNode* s1 = memops.at(i)->as_Mem();
760	SWPointer p1(s1, this, NULL, false);
761	// Discard if pre loop can't align this reference
762	if (!ref_is_alignable(p1)) {
763	*cmp_ct.adr_at(i) = `0`;
764	continue;
765	}
766	for (uint j = i+`1`; j < memops.size(); j++) {
767	MemNode* s2 = memops.at(j)->as_Mem();
768	if (isomorphic(s1, s2)) {
769	SWPointer p2(s2, this, NULL, false);
770	if (p1.comparable(p2)) {
771	(*cmp_ct.adr_at(i))++;
772	(*cmp_ct.adr_at(j))++;
773	}
774	}
775	}
776	}
777
778	// Find Store (or Load) with the greatest number of "comparable" references,
779	// biggest vector size, smallest data size and smallest iv offset.
780	int max_ct = `0`;
781	int max_vw = `0`;
782	int max_idx = -`1`;
783	int min_size = max_jint;
784	int min_iv_offset = max_jint;
785	for (uint j = `0`; j < memops.size(); j++) {
786	MemNode* s = memops.at(j)->as_Mem();
787	if (s->is_Store()) {
788	int vw = vector_width_in_bytes(s);
789	assert(vw > `1`, "sanity");
790	SWPointer p(s, this, NULL, false);
791	if ( cmp_ct.at(j) > max_ct \|\|
792	(cmp_ct.at(j) == max_ct &&
793	( vw > max_vw \|\|
794	(vw == max_vw &&
795	( data_size(s) < min_size \|\|
796	(data_size(s) == min_size &&
797	p.offset_in_bytes() < min_iv_offset)))))) {
798	max_ct = cmp_ct.at(j);
799	max_vw = vw;
800	max_idx = j;
801	min_size = data_size(s);
802	min_iv_offset = p.offset_in_bytes();
803	}
804	}
805	}
806	// If no stores, look at loads
807	if (max_ct == `0`) {
808	for (uint j = `0`; j < memops.size(); j++) {
809	MemNode* s = memops.at(j)->as_Mem();
810	if (s->is_Load()) {
811	int vw = vector_width_in_bytes(s);
812	assert(vw > `1`, "sanity");
813	SWPointer p(s, this, NULL, false);
814	if ( cmp_ct.at(j) > max_ct \|\|
815	(cmp_ct.at(j) == max_ct &&
816	( vw > max_vw \|\|
817	(vw == max_vw &&
818	( data_size(s) < min_size \|\|
819	(data_size(s) == min_size &&
820	p.offset_in_bytes() < min_iv_offset)))))) {
821	max_ct = cmp_ct.at(j);
822	max_vw = vw;
823	max_idx = j;
824	min_size = data_size(s);
825	min_iv_offset = p.offset_in_bytes();
826	}
827	}
828	}
829	}
830
831	#ifdef ASSERT
832	if (TraceSuperWord && Verbose) {
833	tty->print_cr("\nVector memops after find_align_to_ref");
834	for (uint i = `0`; i < memops.size(); i++) {
835	MemNode* s = memops.at(i)->as_Mem();
836	s->dump();
837	}
838	}
839	#endif
840
841	if (max_ct > `0`) {
842	#ifdef ASSERT
843	if (TraceSuperWord) {
844	tty->print("\nVector align to node: ");
845	memops.at(max_idx)->as_Mem()->dump();
846	}
847	#endif
848	return memops.at(max_idx)->as_Mem();
849	}
850	return NULL;
851	}
852
853	//------------------span_works_for_memory_size-----------------------------
854	static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
855	bool span_matches_memory = false;
856	if ((mem_size == type2aelembytes(T_BYTE) \|\| mem_size == type2aelembytes(T_SHORT))
857	&& ABS(span) == type2aelembytes(T_INT)) {
858	// There is a mismatch on span size compared to memory.
859	for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) {
860	Node* use = mem->fast_out(j);
861	if (!VectorNode::is_type_transition_to_int(use)) {
862	return false;
863	}
864	}
865	// If all uses transition to integer, it means that we can successfully align even on mismatch.
866	return true;
867	}
868	else {
869	span_matches_memory = ABS(span) == mem_size;
870	}
871	return span_matches_memory && (ABS(offset) % mem_size) == `0`;
872	}
873
874	//------------------------------ref_is_alignable---------------------------
875	// Can the preloop align the reference to position zero in the vector?
876	bool SuperWord::ref_is_alignable(SWPointer& p) {
877	if (!p.has_iv()) {
878	return true; // no induction variable
879	}
880	CountedLoopEndNode* pre_end = get_pre_loop_end(lp()->as_CountedLoop());
881	assert(pre_end != NULL, "we must have a correct pre-loop");
882	assert(pre_end->stride_is_con(), "pre loop stride is constant");
883	int preloop_stride = pre_end->stride_con();
884
885	int span = preloop_stride * p.scale_in_bytes();
886	int mem_size = p.memory_size();
887	int offset = p.offset_in_bytes();
888	// Stride one accesses are alignable if offset is aligned to memory operation size.
889	// Offset can be unaligned when UseUnalignedAccesses is used.
890	if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
891	return true;
892	}
893	// If the initial offset from start of the object is computable,
894	// check if the pre-loop can align the final offset accordingly.
895	//
896	// In other words: Can we find an i such that the offset
897	// after i pre-loop iterations is aligned to vw?
898	// (init_offset + pre_loop) % vw == 0 (1)
899	// where
900	// pre_loop = i span*
901	// is the number of bytes added to the offset by i pre-loop iterations.
902	//
903	// For this to hold we need pre_loop to increase init_offset by
904	// pre_loop = vw - (init_offset % vw)
905	//
906	// This is only possible if pre_loop is divisible by span because each
907	// pre-loop iteration increases the initial offset by 'span' bytes:
908	// (vw - (init_offset % vw)) % span == 0
909	//
910	int vw = vector_width_in_bytes(p.mem());
911	assert(vw > `1`, "sanity");
912	Node* init_nd = pre_end->init_trip();
913	if (init_nd->is_Con() && p.invar() == NULL) {
914	int init = init_nd->bottom_type()->is_int()->get_con();
915	int init_offset = init * p.scale_in_bytes() + offset;
916	if (init_offset < `0`) { // negative offset from object start?
917	return false; // may happen in dead loop
918	}
919	if (vw % span == `0`) {
920	// If vm is a multiple of span, we use formula (1).
921	if (span > `0`) {
922	return (vw - (init_offset % vw)) % span == `0`;
923	} else {
924	assert(span < `0`, "nonzero stride * scale");
925	return (init_offset % vw) % -span == `0`;
926	}
927	} else if (span % vw == `0`) {
928	// If span is a multiple of vw, we can simplify formula (1) to:
929	// (init_offset + i * span) % vw == 0
930	// =>
931	// (init_offset % vw) + ((i * span) % vw) == 0
932	// =>
933	// init_offset % vw == 0
934	//
935	// Because we add a multiple of vw to the initial offset, the final
936	// offset is a multiple of vw if and only if init_offset is a multiple.
937	//
938	return (init_offset % vw) == `0`;
939	}
940	}
941	return false;
942	}
943	//---------------------------get_vw_bytes_special------------------------
944	int SuperWord::get_vw_bytes_special(MemNode* s) {
945	// Get the vector width in bytes.
946	int vw = vector_width_in_bytes(s);
947
948	// Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
949	BasicType btype = velt_basic_type(s);
950	if (type2aelembytes(btype) == `2`) {
951	bool should_combine_adjacent = true;
952	for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
953	Node* user = s->fast_out(i);
954	if (!VectorNode::is_muladds2i(user)) {
955	should_combine_adjacent = false;
956	}
957	}
958	if (should_combine_adjacent) {
959	vw = MIN2(Matcher::max_vector_size(btype)type2aelembytes(btype), vw `2`);
960	}
961	}
962
963	return vw;
964	}
965
966	//---------------------------get_iv_adjustment---------------------------
967	// Calculate loop's iv adjustment for this memory ops.
968	int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
969	SWPointer align_to_ref_p(mem_ref, this, NULL, false);
970	int offset = align_to_ref_p.offset_in_bytes();
971	int scale = align_to_ref_p.scale_in_bytes();
972	int elt_size = align_to_ref_p.memory_size();
973	int vw = get_vw_bytes_special(mem_ref);
974	assert(vw > `1`, "sanity");
975	int iv_adjustment;
976	if (scale != `0`) {
977	int stride_sign = (scale * iv_stride()) > `0` ? `1` : -`1`;
978	// At least one iteration is executed in pre-loop by default. As result
979	// several iterations are needed to align memory operations in main-loop even
980	// if offset is 0.
981	int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
982	assert(((ABS(iv_adjustment_in_bytes) % elt_size) == `0`),
983	"(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size);
984	iv_adjustment = iv_adjustment_in_bytes/elt_size;
985	} else {
986	// This memory op is not dependent on iv (scale == 0)
987	iv_adjustment = `0`;
988	}
989
990	#ifndef PRODUCT
991	if (TraceSuperWord) {
992	tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
993	mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
994	mem_ref->dump();
995	}
996	#endif
997	return iv_adjustment;
998	}
999
1000	//---------------------------dependence_graph---------------------------
1001	// Construct dependency graph.
1002	// Add dependence edges to load/store nodes for memory dependence
1003	// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
1004	void SuperWord::dependence_graph() {
1005	CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
1006	// First, assign a dependence node to each memory node
1007	for (int i = `0`; i < _block.length(); i++ ) {
1008	Node *n = _block.at(i);
1009	if (n->is_Mem() \|\| (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
1010	_dg.make_node(n);
1011	}
1012	}
1013
1014	// For each memory slice, create the dependences
1015	for (int i = `0`; i < _mem_slice_head.length(); i++) {
1016	Node* n = _mem_slice_head.at(i);
1017	Node* n_tail = _mem_slice_tail.at(i);
1018
1019	// Get slice in predecessor order (last is first)
1020	if (cl->is_main_loop()) {
1021	mem_slice_preds(n_tail, n, _nlist);
1022	}
1023
1024	#ifndef PRODUCT
1025	if(TraceSuperWord && Verbose) {
1026	tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
1027	for (int j = _nlist.length() - `1`; j >= `0` ; j--) {
1028	_nlist.at(j)->dump();
1029	}
1030	}
1031	#endif
1032	// Make the slice dependent on the root
1033	DepMem* slice = _dg.dep(n);
1034	_dg.make_edge(_dg.root(), slice);
1035
1036	// Create a sink for the slice
1037	DepMem* slice_sink = _dg.make_node(NULL);
1038	_dg.make_edge(slice_sink, _dg.tail());
1039
1040	// Now visit each pair of memory ops, creating the edges
1041	for (int j = _nlist.length() - `1`; j >= `0` ; j--) {
1042	Node* s1 = _nlist.at(j);
1043
1044	// If no dependency yet, use slice
1045	if (_dg.dep(s1)->in_cnt() == `0`) {
1046	_dg.make_edge(slice, s1);
1047	}
1048	SWPointer p1(s1->as_Mem(), this, NULL, false);
1049	bool sink_dependent = true;
1050	for (int k = j - `1`; k >= `0`; k--) {
1051	Node* s2 = _nlist.at(k);
1052	if (s1->is_Load() && s2->is_Load())
1053	continue;
1054	SWPointer p2(s2->as_Mem(), this, NULL, false);
1055
1056	int cmp = p1.cmp(p2);
1057	if (SuperWordRTDepCheck &&
1058	p1.base() != p2.base() && p1.valid() && p2.valid()) {
1059	// Create a runtime check to disambiguate
1060	OrderedPair pp(p1.base(), p2.base());
1061	_disjoint_ptrs.append_if_missing(pp);
1062	} else if (!SWPointer::not_equal(cmp)) {
1063	// Possibly same address
1064	_dg.make_edge(s1, s2);
1065	sink_dependent = false;
1066	}
1067	}
1068	if (sink_dependent) {
1069	_dg.make_edge(s1, slice_sink);
1070	}
1071	}
1072
1073	if (TraceSuperWord) {
1074	tty->print_cr("\nDependence graph for slice: %d", n->_idx);
1075	for (int q = `0`; q < _nlist.length(); q++) {
1076	_dg.print(_nlist.at(q));
1077	}
1078	tty->cr();
1079	}
1080
1081	_nlist.clear();
1082	}
1083
1084	if (TraceSuperWord) {
1085	tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > `0` ? "" : "NONE");
1086	for (int r = `0`; r < _disjoint_ptrs.length(); r++) {
1087	_disjoint_ptrs.at(r).print();
1088	tty->cr();
1089	}
1090	tty->cr();
1091	}
1092
1093	}
1094
1095	//---------------------------mem_slice_preds---------------------------
1096	// Return a memory slice (node list) in predecessor order starting at "start"
1097	void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
1098	assert(preds.length() == `0`, "start empty");
1099	Node* n = start;
1100	Node* prev = NULL;
1101	while (true) {
1102	NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
1103	assert(in_bb(n), "must be in block");
1104	for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
1105	Node* out = n->fast_out(i);
1106	if (out->is_Load()) {
1107	if (in_bb(out)) {
1108	preds.push(out);
1109	if (TraceSuperWord && Verbose) {
1110	tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);
1111	}
1112	}
1113	} else {
1114	// FIXME
1115	if (out->is_MergeMem() && !in_bb(out)) {
1116	// Either unrolling is causing a memory edge not to disappear,
1117	// or need to run igvn.optimize() again before SLP
1118	} else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
1119	// Ditto. Not sure what else to check further.
1120	} else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
1121	// StoreCM has an input edge used as a precedence edge.
1122	// Maybe an issue when oop stores are vectorized.
1123	} else {
1124	assert(out == prev \|\| prev == NULL, "no branches off of store slice");
1125	}
1126	}//else
1127	}//for
1128	if (n == stop) break;
1129	preds.push(n);
1130	if (TraceSuperWord && Verbose) {
1131	tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);
1132	}
1133	prev = n;
1134	assert(n->is_Mem(), "unexpected node %s", n->Name());
1135	n = n->in(MemNode::Memory);
1136	}
1137	}
1138
1139	//------------------------------stmts_can_pack---------------------------
1140	// Can s1 and s2 be in a pack with s1 immediately preceding s2 and
1141	// s1 aligned at "align"
1142	bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
1143
1144	// Do not use superword for non-primitives
1145	BasicType bt1 = velt_basic_type(s1);
1146	BasicType bt2 = velt_basic_type(s2);
1147	if(!is_java_primitive(bt1) \|\| !is_java_primitive(bt2))
1148	return false;
1149	if (Matcher::max_vector_size(bt1) < `2`) {
1150	return false; // No vectors for this type
1151	}
1152
1153	if (isomorphic(s1, s2)) {
1154	if ((independent(s1, s2) && have_similar_inputs(s1, s2)) \|\| reduction(s1, s2)) {
1155	if (!exists_at(s1, `0`) && !exists_at(s2, `1`)) {
1156	if (!s1->is_Mem() \|\| are_adjacent_refs(s1, s2)) {
1157	int s1_align = alignment(s1);
1158	int s2_align = alignment(s2);
1159	if (s1_align == top_align \|\| s1_align == align) {
1160	if (s2_align == top_align \|\| s2_align == align + data_size(s1)) {
1161	return true;
1162	}
1163	}
1164	}
1165	}
1166	}
1167	}
1168	return false;
1169	}
1170
1171	//------------------------------exists_at---------------------------
1172	// Does s exist in a pack at position pos?
1173	bool SuperWord::exists_at(Node* s, uint pos) {
1174	for (int i = `0`; i < _packset.length(); i++) {
1175	Node_List* p = _packset.at(i);
1176	if (p->at(pos) == s) {
1177	return true;
1178	}
1179	}
1180	return false;
1181	}
1182
1183	//------------------------------are_adjacent_refs---------------------------
1184	// Is s1 immediately before s2 in memory?
1185	bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
1186	if (!s1->is_Mem() \|\| !s2->is_Mem()) return false;
1187	if (!in_bb(s1) \|\| !in_bb(s2)) return false;
1188
1189	// Do not use superword for non-primitives
1190	if (!is_java_primitive(s1->as_Mem()->memory_type()) \|\|
1191	!is_java_primitive(s2->as_Mem()->memory_type())) {
1192	return false;
1193	}
1194
1195	// FIXME - co_locate_pack fails on Stores in different mem-slices, so
1196	// only pack memops that are in the same alias set until that's fixed.
1197	if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
1198	_phase->C->get_alias_index(s2->as_Mem()->adr_type()))
1199	return false;
1200	SWPointer p1(s1->as_Mem(), this, NULL, false);
1201	SWPointer p2(s2->as_Mem(), this, NULL, false);
1202	if (p1.base() != p2.base() \|\| !p1.comparable(p2)) return false;
1203	int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
1204	return diff == data_size(s1);
1205	}
1206
1207	//------------------------------isomorphic---------------------------
1208	// Are s1 and s2 similar?
1209	bool SuperWord::isomorphic(Node* s1, Node* s2) {
1210	if (s1->Opcode() != s2->Opcode()) return false;
1211	if (s1->req() != s2->req()) return false;
1212	if (!same_velt_type(s1, s2)) return false;
1213	Node* s1_ctrl = s1->in(`0`);
1214	Node* s2_ctrl = s2->in(`0`);
1215	// If the control nodes are equivalent, no further checks are required to test for isomorphism.
1216	if (s1_ctrl == s2_ctrl) {
1217	return true;
1218	} else {
1219	bool s1_ctrl_inv = ((s1_ctrl == NULL) ? true : lpt()->is_invariant(s1_ctrl));
1220	bool s2_ctrl_inv = ((s2_ctrl == NULL) ? true : lpt()->is_invariant(s2_ctrl));
1221	// If the control nodes are not invariant for the loop, fail isomorphism test.
1222	if (!s1_ctrl_inv \|\| !s2_ctrl_inv) {
1223	return false;
1224	}
1225	if(s1_ctrl != NULL && s2_ctrl != NULL) {
1226	if (s1_ctrl->is_Proj()) {
1227	s1_ctrl = s1_ctrl->in(`0`);
1228	assert(lpt()->is_invariant(s1_ctrl), "must be invariant");
1229	}
1230	if (s2_ctrl->is_Proj()) {
1231	s2_ctrl = s2_ctrl->in(`0`);
1232	assert(lpt()->is_invariant(s2_ctrl), "must be invariant");
1233	}
1234	if (!s1_ctrl->is_RangeCheck() \|\| !s2_ctrl->is_RangeCheck()) {
1235	return false;
1236	}
1237	}
1238	// Control nodes are invariant. However, we have no way of checking whether they resolve
1239	// in an equivalent manner. But, we know that invariant range checks are guaranteed to
1240	// throw before the loop (if they would have thrown). Thus, the loop would not have been reached.
1241	// Therefore, if the control nodes for both are range checks, we accept them to be isomorphic.
1242	for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1243	Node* t1 = s1->fast_out(i);
1244	for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
1245	Node* t2 = s2->fast_out(j);
1246	if (VectorNode::is_muladds2i(t1) && VectorNode::is_muladds2i(t2)) {
1247	return true;
1248	}
1249	}
1250	}
1251	}
1252	return false;
1253	}
1254
1255	//------------------------------independent---------------------------
1256	// Is there no data path from s1 to s2 or s2 to s1?
1257	bool SuperWord::independent(Node* s1, Node* s2) {
1258	// assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
1259	int d1 = depth(s1);
1260	int d2 = depth(s2);
1261	if (d1 == d2) return s1 != s2;
1262	Node* deep = d1 > d2 ? s1 : s2;
1263	Node* shallow = d1 > d2 ? s2 : s1;
1264
1265	visited_clear();
1266
1267	return independent_path(shallow, deep);
1268	}
1269
1270	//--------------------------have_similar_inputs-----------------------
1271	// For a node pair (s1, s2) which is isomorphic and independent,
1272	// do s1 and s2 have similar input edges?
1273	bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
1274	// assert(isomorphic(s1, s2) == true, "check isomorphic");
1275	// assert(independent(s1, s2) == true, "check independent");
1276	if (s1->req() > `1` && !s1->is_Store() && !s1->is_Load()) {
1277	for (uint i = `1`; i < s1->req(); i++) {
1278	if (s1->in(i)->Opcode() != s2->in(i)->Opcode()) return false;
1279	}
1280	}
1281	return true;
1282	}
1283
1284	//------------------------------reduction---------------------------
1285	// Is there a data path between s1 and s2 and the nodes reductions?
1286	bool SuperWord::reduction(Node* s1, Node* s2) {
1287	bool retValue = false;
1288	int d1 = depth(s1);
1289	int d2 = depth(s2);
1290	if (d1 + `1` == d2) {
1291	if (s1->is_reduction() && s2->is_reduction()) {
1292	// This is an ordered set, so s1 should define s2
1293	for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1294	Node* t1 = s1->fast_out(i);
1295	if (t1 == s2) {
1296	// both nodes are reductions and connected
1297	retValue = true;
1298	}
1299	}
1300	}
1301	}
1302
1303	return retValue;
1304	}
1305
1306	//------------------------------independent_path------------------------------
1307	// Helper for independent
1308	bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
1309	if (dp >= `1000`) return false; // stop deep recursion
1310	visited_set(deep);
1311	int shal_depth = depth(shallow);
1312	assert(shal_depth <= depth(deep), "must be");
1313	for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
1314	Node* pred = preds.current();
1315	if (in_bb(pred) && !visited_test(pred)) {
1316	if (shallow == pred) {
1317	return false;
1318	}
1319	if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+`1`)) {
1320	return false;
1321	}
1322	}
1323	}
1324	return true;
1325	}
1326
1327	//------------------------------set_alignment---------------------------
1328	void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
1329	set_alignment(s1, align);
1330	if (align == top_align \|\| align == bottom_align) {
1331	set_alignment(s2, align);
1332	} else {
1333	set_alignment(s2, align + data_size(s1));
1334	}
1335	}
1336
1337	//------------------------------data_size---------------------------
1338	int SuperWord::data_size(Node* s) {
1339	Node* use = NULL; //test if the node is a candidate for CMoveV optimization, then return the size of CMov
1340	if (UseVectorCmov) {
1341	use = _cmovev_kit.is_Bool_candidate(s);
1342	if (use != NULL) {
1343	return data_size(use);
1344	}
1345	use = _cmovev_kit.is_CmpD_candidate(s);
1346	if (use != NULL) {
1347	return data_size(use);
1348	}
1349	}
1350
1351	int bsize = type2aelembytes(velt_basic_type(s));
1352	assert(bsize != `0`, "valid size");
1353	return bsize;
1354	}
1355
1356	//------------------------------extend_packlist---------------------------
1357	// Extend packset by following use->def and def->use links from pack members.
1358	void SuperWord::extend_packlist() {
1359	bool changed;
1360	do {
1361	packset_sort(_packset.length());
1362	changed = false;
1363	for (int i = `0`; i < _packset.length(); i++) {
1364	Node_List* p = _packset.at(i);
1365	changed \|= follow_use_defs(p);
1366	changed \|= follow_def_uses(p);
1367	}
1368	} while (changed);
1369
1370	if (_race_possible) {
1371	for (int i = `0`; i < _packset.length(); i++) {
1372	Node_List* p = _packset.at(i);
1373	order_def_uses(p);
1374	}
1375	}
1376
1377	if (TraceSuperWord) {
1378	tty->print_cr("\nAfter extend_packlist");
1379	print_packset();
1380	}
1381	}
1382
1383	//------------------------------follow_use_defs---------------------------
1384	// Extend the packset by visiting operand definitions of nodes in pack p
1385	bool SuperWord::follow_use_defs(Node_List* p) {
1386	assert(p->size() == `2`, "just checking");
1387	Node* s1 = p->at(`0`);
1388	Node* s2 = p->at(`1`);
1389	assert(s1->req() == s2->req(), "just checking");
1390	assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1391
1392	if (s1->is_Load()) return false;
1393
1394	int align = alignment(s1);
1395	NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);)
1396	bool changed = false;
1397	int start = s1->is_Store() ? MemNode::ValueIn : `1`;
1398	int end = s1->is_Store() ? MemNode::ValueIn+`1` : s1->req();
1399	for (int j = start; j < end; j++) {
1400	Node* t1 = s1->in(j);
1401	Node* t2 = s2->in(j);
1402	if (!in_bb(t1) \|\| !in_bb(t2))
1403	continue;
1404	if (stmts_can_pack(t1, t2, align)) {
1405	if (est_savings(t1, t2) >= `0`) {
1406	Node_List* pair = new Node_List ();
1407	pair->push(t1);
1408	pair->push(t2);
1409	_packset.append(pair);
1410	NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
1411	set_alignment(t1, t2, align);
1412	changed = true;
1413	}
1414	}
1415	}
1416	return changed;
1417	}
1418
1419	//------------------------------follow_def_uses---------------------------
1420	// Extend the packset by visiting uses of nodes in pack p
1421	bool SuperWord::follow_def_uses(Node_List* p) {
1422	bool changed = false;
1423	Node* s1 = p->at(`0`);
1424	Node* s2 = p->at(`1`);
1425	assert(p->size() == `2`, "just checking");
1426	assert(s1->req() == s2->req(), "just checking");
1427	assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
1428
1429	if (s1->is_Store()) return false;
1430
1431	int align = alignment(s1);
1432	NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
1433	int savings = -`1`;
1434	int num_s1_uses = `0`;
1435	Node* u1 = NULL;
1436	Node* u2 = NULL;
1437	for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1438	Node* t1 = s1->fast_out(i);
1439	num_s1_uses++;
1440	if (!in_bb(t1)) continue;
1441	for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
1442	Node* t2 = s2->fast_out(j);
1443	if (!in_bb(t2)) continue;
1444	if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
1445	if (!opnd_positions_match(s1, t1, s2, t2))
1446	continue;
1447	if (stmts_can_pack(t1, t2, align)) {
1448	int my_savings = est_savings(t1, t2);
1449	if (my_savings > savings) {
1450	savings = my_savings;
1451	u1 = t1;
1452	u2 = t2;
1453	}
1454	}
1455	}
1456	}
1457	if (num_s1_uses > `1`) {
1458	_race_possible = true;
1459	}
1460	if (savings >= `0`) {
1461	Node_List* pair = new Node_List ();
1462	pair->push(u1);
1463	pair->push(u2);
1464	_packset.append(pair);
1465	NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
1466	set_alignment(u1, u2, align);
1467	changed = true;
1468	}
1469	return changed;
1470	}
1471
1472	//------------------------------order_def_uses---------------------------
1473	// For extended packsets, ordinally arrange uses packset by major component
1474	void SuperWord::order_def_uses(Node_List* p) {
1475	Node* s1 = p->at(`0`);
1476
1477	if (s1->is_Store()) return;
1478
1479	// reductions are always managed beforehand
1480	if (s1->is_reduction()) return;
1481
1482	for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1483	Node* t1 = s1->fast_out(i);
1484
1485	// Only allow operand swap on commuting operations
1486	if (!t1->is_Add() && !t1->is_Mul() && !VectorNode::is_muladds2i(t1)) {
1487	break;
1488	}
1489
1490	// Now find t1's packset
1491	Node_List* p2 = NULL;
1492	for (int j = `0`; j < _packset.length(); j++) {
1493	p2 = _packset.at(j);
1494	Node* first = p2->at(`0`);
1495	if (t1 == first) {
1496	break;
1497	}
1498	p2 = NULL;
1499	}
1500	// Arrange all sub components by the major component
1501	if (p2 != NULL) {
1502	for (uint j = `1`; j < p->size(); j++) {
1503	Node* d1 = p->at(j);
1504	Node* u1 = p2->at(j);
1505	opnd_positions_match(s1, t1, d1, u1);
1506	}
1507	}
1508	}
1509	}
1510
1511	//---------------------------opnd_positions_match-------------------------
1512	// Is the use of d1 in u1 at the same operand position as d2 in u2?
1513	bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
1514	// check reductions to see if they are marshalled to represent the reduction
1515	// operator in a specified opnd
1516	if (u1->is_reduction() && u2->is_reduction()) {
1517	// ensure reductions have phis and reduction definitions feeding the 1st operand
1518	Node* first = u1->in(`2`);
1519	if (first->is_Phi() \|\| first->is_reduction()) {
1520	u1->swap_edges(`1`, `2`);
1521	}
1522	// ensure reductions have phis and reduction definitions feeding the 1st operand
1523	first = u2->in(`2`);
1524	if (first->is_Phi() \|\| first->is_reduction()) {
1525	u2->swap_edges(`1`, `2`);
1526	}
1527	return true;
1528	}
1529
1530	uint ct = u1->req();
1531	if (ct != u2->req()) return false;
1532	uint i1 = `0`;
1533	uint i2 = `0`;
1534	do {
1535	for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
1536	for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
1537	if (i1 != i2) {
1538	if ((i1 == (`3`-i2)) && (u2->is_Add() \|\| u2->is_Mul())) {
1539	// Further analysis relies on operands position matching.
1540	u2->swap_edges(i1, i2);
1541	} else if (VectorNode::is_muladds2i(u2) && u1 != u2) {
1542	if (i1 == `5` - i2) { // ((i1 == 3 && i2 == 2) \|\| (i1 == 2 && i2 == 3) \|\| (i1 == 1 && i2 == 4) \|\| (i1 == 4 && i2 == 1))
1543	u2->swap_edges(`1`, `2`);
1544	u2->swap_edges(`3`, `4`);
1545	}
1546	if (i1 == `3` - i2 \|\| i1 == `7` - i2) { // ((i1 == 1 && i2 == 2) \|\| (i1 == 2 && i2 == 1) \|\| (i1 == 3 && i2 == 4) \|\| (i1 == 4 && i2 == 3))
1547	u2->swap_edges(`2`, `3`);
1548	u2->swap_edges(`1`, `4`);
1549	}
1550	return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
1551	} else {
1552	return false;
1553	}
1554	} else if (i1 == i2 && VectorNode::is_muladds2i(u2) && u1 != u2) {
1555	u2->swap_edges(`1`, `3`);
1556	u2->swap_edges(`2`, `4`);
1557	return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
1558	}
1559	} while (i1 < ct);
1560	return true;
1561	}
1562
1563	//------------------------------est_savings---------------------------
1564	// Estimate the savings from executing s1 and s2 as a pack
1565	int SuperWord::est_savings(Node* s1, Node* s2) {
1566	int save_in = `2` - `1`; // 2 operations per instruction in packed form
1567
1568	// inputs
1569	for (uint i = `1`; i < s1->req(); i++) {
1570	Node* x1 = s1->in(i);
1571	Node* x2 = s2->in(i);
1572	if (x1 != x2) {
1573	if (are_adjacent_refs(x1, x2)) {
1574	save_in += adjacent_profit(x1, x2);
1575	} else if (!in_packset(x1, x2)) {
1576	save_in -= pack_cost(`2`);
1577	} else {
1578	save_in += unpack_cost(`2`);
1579	}
1580	}
1581	}
1582
1583	// uses of result
1584	uint ct = `0`;
1585	int save_use = `0`;
1586	for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
1587	Node* s1_use = s1->fast_out(i);
1588	for (int j = `0`; j < _packset.length(); j++) {
1589	Node_List* p = _packset.at(j);
1590	if (p->at(`0`) == s1_use) {
1591	for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
1592	Node* s2_use = s2->fast_out(k);
1593	if (p->at(p->size()-`1`) == s2_use) {
1594	ct++;
1595	if (are_adjacent_refs(s1_use, s2_use)) {
1596	save_use += adjacent_profit(s1_use, s2_use);
1597	}
1598	}
1599	}
1600	}
1601	}
1602	}
1603
1604	if (ct < s1->outcnt()) save_use += unpack_cost(`1`);
1605	if (ct < s2->outcnt()) save_use += unpack_cost(`1`);
1606
1607	return MAX2(save_in, save_use);
1608	}
1609
1610	//------------------------------costs---------------------------
1611	int SuperWord::adjacent_profit(Node* s1, Node* s2) { return `2`; }
1612	int SuperWord::pack_cost(int ct) { return ct; }
1613	int SuperWord::unpack_cost(int ct) { return ct; }
1614
1615	//------------------------------combine_packs---------------------------
1616	// Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
1617	void SuperWord::combine_packs() {
1618	bool changed = true;
1619	// Combine packs regardless max vector size.
1620	while (changed) {
1621	changed = false;
1622	for (int i = `0`; i < _packset.length(); i++) {
1623	Node_List* p1 = _packset.at(i);
1624	if (p1 == NULL) continue;
1625	// Because of sorting we can start at i + 1
1626	for (int j = i + `1`; j < _packset.length(); j++) {
1627	Node_List* p2 = _packset.at(j);
1628	if (p2 == NULL) continue;
1629	if (i == j) continue;
1630	if (p1->at(p1->size()-`1`) == p2->at(`0`)) {
1631	for (uint k = `1`; k < p2->size(); k++) {
1632	p1->push(p2->at(k));
1633	}
1634	_packset.at_put(j, NULL);
1635	changed = true;
1636	}
1637	}
1638	}
1639	}
1640
1641	// Split packs which have size greater then max vector size.
1642	for (int i = `0`; i < _packset.length(); i++) {
1643	Node_List* p1 = _packset.at(i);
1644	if (p1 != NULL) {
1645	BasicType bt = velt_basic_type(p1->at(`0`));
1646	uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
1647	assert(is_power_of_2(max_vlen), "sanity");
1648	uint psize = p1->size();
1649	if (!is_power_of_2(psize)) {
1650	// Skip pack which can't be vector.
1651	// case1: for(...) { a[i] = i; } elements values are different (i+x)
1652	// case2: for(...) { a[i] = b[i+1]; } can't align both, load and store
1653	_packset.at_put(i, NULL);
1654	continue;
1655	}
1656	if (psize > max_vlen) {
1657	Node_List* pack = new Node_List ();
1658	for (uint j = `0`; j < psize; j++) {
1659	pack->push(p1->at(j));
1660	if (pack->size() >= max_vlen) {
1661	assert(is_power_of_2(pack->size()), "sanity");
1662	_packset.append(pack);
1663	pack = new Node_List ();
1664	}
1665	}
1666	_packset.at_put(i, NULL);
1667	}
1668	}
1669	}
1670
1671	// Compress list.
1672	for (int i = _packset.length() - `1`; i >= `0`; i--) {
1673	Node_List* p1 = _packset.at(i);
1674	if (p1 == NULL) {
1675	_packset.remove_at(i);
1676	}
1677	}
1678
1679	if (TraceSuperWord) {
1680	tty->print_cr("\nAfter combine_packs");
1681	print_packset();
1682	}
1683	}
1684
1685	//-----------------------------construct_my_pack_map--------------------------
1686	// Construct the map from nodes to packs. Only valid after the
1687	// point where a node is only in one pack (after combine_packs).
1688	void SuperWord::construct_my_pack_map() {
1689	Node_List* rslt = NULL;
1690	for (int i = `0`; i < _packset.length(); i++) {
1691	Node_List* p = _packset.at(i);
1692	for (uint j = `0`; j < p->size(); j++) {
1693	Node* s = p->at(j);
1694	assert(my_pack(s) == NULL, "only in one pack");
1695	set_my_pack(s, p);
1696	}
1697	}
1698	}
1699
1700	//------------------------------filter_packs---------------------------
1701	// Remove packs that are not implemented or not profitable.
1702	void SuperWord::filter_packs() {
1703	// Remove packs that are not implemented
1704	for (int i = _packset.length() - `1`; i >= `0`; i--) {
1705	Node_List* pk = _packset.at(i);
1706	bool impl = implemented(pk);
1707	if (!impl) {
1708	#ifndef PRODUCT
1709	if (TraceSuperWord && Verbose) {
1710	tty->print_cr("Unimplemented");
1711	pk->at(`0`)->dump();
1712	}
1713	#endif
1714	remove_pack_at(i);
1715	}
1716	Node *n = pk->at(`0`);
1717	if (n->is_reduction()) {
1718	_num_reductions++;
1719	} else {
1720	_num_work_vecs++;
1721	}
1722	}
1723
1724	// Remove packs that are not profitable
1725	bool changed;
1726	do {
1727	changed = false;
1728	for (int i = _packset.length() - `1`; i >= `0`; i--) {
1729	Node_List* pk = _packset.at(i);
1730	bool prof = profitable(pk);
1731	if (!prof) {
1732	#ifndef PRODUCT
1733	if (TraceSuperWord && Verbose) {
1734	tty->print_cr("Unprofitable");
1735	pk->at(`0`)->dump();
1736	}
1737	#endif
1738	remove_pack_at(i);
1739	changed = true;
1740	}
1741	}
1742	} while (changed);
1743
1744	#ifndef PRODUCT
1745	if (TraceSuperWord) {
1746	tty->print_cr("\nAfter filter_packs");
1747	print_packset();
1748	tty->cr();
1749	}
1750	#endif
1751	}
1752
1753	//------------------------------merge_packs_to_cmovd---------------------------
1754	// Merge CMoveD into new vector-nodes
1755	// We want to catch this pattern and subsume CmpD and Bool into CMoveD
1756	//
1757	// SubD ConD
1758	// / \| /
1759	// / \| / /
1760	// / \| / /
1761	// / \| / /
1762	// / / /
1763	// / / \| /
1764	// v / \| /
1765	// CmpD \| /
1766	// \| \| /
1767	// v \| /
1768	// Bool \| /
1769	// \ \| /
1770	// \ \| /
1771	// \ \| /
1772	// \ \| /
1773	// \ v /
1774	// CMoveD
1775	//
1776
1777	void SuperWord::merge_packs_to_cmovd() {
1778	for (int i = _packset.length() - `1`; i >= `0`; i--) {
1779	_cmovev_kit.make_cmovevd_pack(_packset.at(i));
1780	}
1781	#ifndef PRODUCT
1782	if (TraceSuperWord) {
1783	tty->print_cr("\nSuperWord::merge_packs_to_cmovd(): After merge");
1784	print_packset();
1785	tty->cr();
1786	}
1787	#endif
1788	}
1789
1790	Node* CMoveKit::is_Bool_candidate(Node* def) const {
1791	Node* use = NULL;
1792	if (!def->is_Bool() \|\| def->in(`0`) != NULL \|\| def->outcnt() != `1`) {
1793	return NULL;
1794	}
1795	for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1796	use = def->fast_out(j);
1797	if (!_sw->same_generation(def, use) \|\| !use->is_CMove()) {
1798	return NULL;
1799	}
1800	}
1801	return use;
1802	}
1803
1804	Node* CMoveKit::is_CmpD_candidate(Node* def) const {
1805	Node* use = NULL;
1806	if (!def->is_Cmp() \|\| def->in(`0`) != NULL \|\| def->outcnt() != `1`) {
1807	return NULL;
1808	}
1809	for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
1810	use = def->fast_out(j);
1811	if (!_sw->same_generation(def, use) \|\| (use = is_Bool_candidate(use)) == NULL \|\| !_sw->same_generation(def, use)) {
1812	return NULL;
1813	}
1814	}
1815	return use;
1816	}
1817
1818	Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) {
1819	Node *cmovd = cmovd_pk->at(`0`);
1820	if (!cmovd->is_CMove()) {
1821	return NULL;
1822	}
1823	if (cmovd->Opcode() != Op_CMoveF && cmovd->Opcode() != Op_CMoveD) {
1824	return NULL;
1825	}
1826	if (pack(cmovd) != NULL) { // already in the cmov pack
1827	return NULL;
1828	}
1829	if (cmovd->in(`0`) != NULL) {
1830	NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CMoveD %d has control flow, escaping...", cmovd->_idx); cmovd->dump();})
1831	return NULL;
1832	}
1833
1834	Node* bol = cmovd->as_CMove()->in(CMoveNode::Condition);
1835	if (!bol->is_Bool()
1836	\|\| bol->outcnt() != `1`
1837	\|\| !_sw->same_generation(bol, cmovd)
1838	\|\| bol->in(`0`) != NULL // BoolNode has control flow!!
1839	\|\| _sw->my_pack(bol) == NULL) {
1840	NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: Bool %d does not fit CMoveD %d for building vector, escaping...", bol->_idx, cmovd->_idx); bol->dump();})
1841	return NULL;
1842	}
1843	Node_List* bool_pk = _sw->my_pack(bol);
1844	if (bool_pk->size() != cmovd_pk->size() ) {
1845	return NULL;
1846	}
1847
1848	Node* cmpd = bol->in(`1`);
1849	if (!cmpd->is_Cmp()
1850	\|\| cmpd->outcnt() != `1`
1851	\|\| !_sw->same_generation(cmpd, cmovd)
1852	\|\| cmpd->in(`0`) != NULL // CmpDNode has control flow!!
1853	\|\| _sw->my_pack(cmpd) == NULL) {
1854	NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CmpD %d does not fit CMoveD %d for building vector, escaping...", cmpd->_idx, cmovd->_idx); cmpd->dump();})
1855	return NULL;
1856	}
1857	Node_List* cmpd_pk = _sw->my_pack(cmpd);
1858	if (cmpd_pk->size() != cmovd_pk->size() ) {
1859	return NULL;
1860	}
1861
1862	if (!test_cmpd_pack(cmpd_pk, cmovd_pk)) {
1863	NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: cmpd pack for CmpD %d failed vectorization test", cmpd->_idx); cmpd->dump();})
1864	return NULL;
1865	}
1866
1867	Node_List* new_cmpd_pk = new Node_List ();
1868	uint sz = cmovd_pk->size() - `1`;
1869	for (uint i = `0`; i <= sz; ++i) {
1870	Node* cmov = cmovd_pk->at(i);
1871	Node* bol = bool_pk->at(i);
1872	Node* cmp = cmpd_pk->at(i);
1873
1874	new_cmpd_pk->insert(i, cmov);
1875
1876	map(cmov, new_cmpd_pk);
1877	map(bol, new_cmpd_pk);
1878	map(cmp, new_cmpd_pk);
1879
1880	_sw->set_my_pack(cmov, new_cmpd_pk); // and keep old packs for cmp and bool
1881	}
1882	_sw->_packset.remove(cmovd_pk);
1883	_sw->_packset.remove(bool_pk);
1884	_sw->_packset.remove(cmpd_pk);
1885	_sw->_packset.append(new_cmpd_pk);
1886	NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmovevd_pack: added syntactic CMoveD pack"); _sw->print_pack(new_cmpd_pk);})
1887	return new_cmpd_pk;
1888	}
1889
1890	bool CMoveKit::test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk) {
1891	Node* cmpd0 = cmpd_pk->at(`0`);
1892	assert(cmpd0->is_Cmp(), "CMoveKit::test_cmpd_pack: should be CmpDNode");
1893	assert(cmovd_pk->at(`0`)->is_CMove(), "CMoveKit::test_cmpd_pack: should be CMoveD");
1894	assert(cmpd_pk->size() == cmovd_pk->size(), "CMoveKit::test_cmpd_pack: should be same size");
1895	Node* in1 = cmpd0->in(`1`);
1896	Node* in2 = cmpd0->in(`2`);
1897	Node_List* in1_pk = _sw->my_pack(in1);
1898	Node_List* in2_pk = _sw->my_pack(in2);
1899
1900	if ( (in1_pk != NULL && in1_pk->size() != cmpd_pk->size())
1901	\|\| (in2_pk != NULL && in2_pk->size() != cmpd_pk->size()) ) {
1902	return false;
1903	}
1904
1905	// test if "all" in1 are in the same pack or the same node
1906	if (in1_pk == NULL) {
1907	for (uint j = `1`; j < cmpd_pk->size(); j++) {
1908	if (cmpd_pk->at(j)->in(`1`) != in1) {
1909	return false;
1910	}
1911	}//for: in1_pk is not pack but all CmpD nodes in the pack have the same in(1)
1912	}
1913	// test if "all" in2 are in the same pack or the same node
1914	if (in2_pk == NULL) {
1915	for (uint j = `1`; j < cmpd_pk->size(); j++) {
1916	if (cmpd_pk->at(j)->in(`2`) != in2) {
1917	return false;
1918	}
1919	}//for: in2_pk is not pack but all CmpD nodes in the pack have the same in(2)
1920	}
1921	//now check if cmpd_pk may be subsumed in vector built for cmovd_pk
1922	int cmovd_ind1, cmovd_ind2;
1923	if (cmpd_pk->at(`0`)->in(`1`) == cmovd_pk->at(`0`)->as_CMove()->in(CMoveNode::IfFalse)
1924	&& cmpd_pk->at(`0`)->in(`2`) == cmovd_pk->at(`0`)->as_CMove()->in(CMoveNode::IfTrue)) {
1925	cmovd_ind1 = CMoveNode::IfFalse;
1926	cmovd_ind2 = CMoveNode::IfTrue;
1927	} else if (cmpd_pk->at(`0`)->in(`2`) == cmovd_pk->at(`0`)->as_CMove()->in(CMoveNode::IfFalse)
1928	&& cmpd_pk->at(`0`)->in(`1`) == cmovd_pk->at(`0`)->as_CMove()->in(CMoveNode::IfTrue)) {
1929	cmovd_ind2 = CMoveNode::IfFalse;
1930	cmovd_ind1 = CMoveNode::IfTrue;
1931	}
1932	else {
1933	return false;
1934	}
1935
1936	for (uint j = `1`; j < cmpd_pk->size(); j++) {
1937	if (cmpd_pk->at(j)->in(`1`) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind1)
1938	\|\| cmpd_pk->at(j)->in(`2`) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind2)) {
1939	return false;
1940	}//if
1941	}
1942	NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmpd_pack: cmpd pack for 1st CmpD %d is OK for vectorization: ", cmpd0->_idx); cmpd0->dump(); })
1943	return true;
1944	}
1945
1946	//------------------------------implemented---------------------------
1947	// Can code be generated for pack p?
1948	bool SuperWord::implemented(Node_List* p) {
1949	bool retValue = false;
1950	Node* p0 = p->at(`0`);
1951	if (p0 != NULL) {
1952	int opc = p0->Opcode();
1953	uint size = p->size();
1954	if (p0->is_reduction()) {
1955	const Type *arith_type = p0->bottom_type();
1956	// Length 2 reductions of INT/LONG do not offer performance benefits
1957	if (((arith_type->basic_type() == T_INT) \|\| (arith_type->basic_type() == T_LONG)) && (size == `2`)) {
1958	retValue = false;
1959	} else {
1960	retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
1961	}
1962	} else {
1963	retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
1964	}
1965	if (!retValue) {
1966	if (is_cmov_pack(p)) {
1967	NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmpd pack"); print_pack(p);})
1968	return true;
1969	}
1970	}
1971	}
1972	return retValue;
1973	}
1974
1975	bool SuperWord::is_cmov_pack(Node_List* p) {
1976	return _cmovev_kit.pack(p->at(`0`)) != NULL;
1977	}
1978	//------------------------------same_inputs--------------------------
1979	// For pack p, are all idx operands the same?
1980	bool SuperWord::same_inputs(Node_List* p, int idx) {
1981	Node* p0 = p->at(`0`);
1982	uint vlen = p->size();
1983	Node* p0_def = p0->in(idx);
1984	for (uint i = `1`; i < vlen; i++) {
1985	Node* pi = p->at(i);
1986	Node* pi_def = pi->in(idx);
1987	if (p0_def != pi_def) {
1988	return false;
1989	}
1990	}
1991	return true;
1992	}
1993
1994	//------------------------------profitable---------------------------
1995	// For pack p, are all operands and all uses (with in the block) vector?
1996	bool SuperWord::profitable(Node_List* p) {
1997	Node* p0 = p->at(`0`);
1998	uint start, end;
1999	VectorNode::vector_operands(p0, &start, &end);
2000
2001	// Return false if some inputs are not vectors or vectors with different
2002	// size or alignment.
2003	// Also, for now, return false if not scalar promotion case when inputs are
2004	// the same. Later, implement PackNode and allow differing, non-vector inputs
2005	// (maybe just the ones from outside the block.)
2006	for (uint i = start; i < end; i++) {
2007	if (!is_vector_use(p0, i)) {
2008	return false;
2009	}
2010	}
2011	// Check if reductions are connected
2012	if (p0->is_reduction()) {
2013	Node* second_in = p0->in(`2`);
2014	Node_List* second_pk = my_pack(second_in);
2015	if ((second_pk == NULL) \|\| (_num_work_vecs == _num_reductions)) {
2016	// Remove reduction flag if no parent pack or if not enough work
2017	// to cover reduction expansion overhead
2018	p0->remove_flag(Node::Flag_is_reduction);
2019	return false;
2020	} else if (second_pk->size() != p->size()) {
2021	return false;
2022	}
2023	}
2024	if (VectorNode::is_shift(p0)) {
2025	// For now, return false if shift count is vector or not scalar promotion
2026	// case (different shift counts) because it is not supported yet.
2027	Node* cnt = p0->in(`2`);
2028	Node_List* cnt_pk = my_pack(cnt);
2029	if (cnt_pk != NULL)
2030	return false;
2031	if (!same_inputs(p, `2`))
2032	return false;
2033	}
2034	if (!p0->is_Store()) {
2035	// For now, return false if not all uses are vector.
2036	// Later, implement ExtractNode and allow non-vector uses (maybe
2037	// just the ones outside the block.)
2038	for (uint i = `0`; i < p->size(); i++) {
2039	Node* def = p->at(i);
2040	if (is_cmov_pack_internal_node(p, def)) {
2041	continue;
2042	}
2043	for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
2044	Node* use = def->fast_out(j);
2045	for (uint k = `0`; k < use->req(); k++) {
2046	Node* n = use->in(k);
2047	if (def == n) {
2048	// reductions should only have a Phi use at the the loop
2049	// head and out of loop uses
2050	if (def->is_reduction() &&
2051	((use->is_Phi() && use->in(`0`) == _lpt->_head) \|\|
2052	!_lpt->is_member(_phase->get_loop(_phase->ctrl_or_self(use))))) {
2053	assert(i == p->size()-`1`, "must be last element of the pack");
2054	continue;
2055	}
2056	if (!is_vector_use(use, k)) {
2057	return false;
2058	}
2059	}
2060	}
2061	}
2062	}
2063	}
2064	return true;
2065	}
2066
2067	//------------------------------schedule---------------------------
2068	// Adjust the memory graph for the packed operations
2069	void SuperWord::schedule() {
2070
2071	// Co-locate in the memory graph the members of each memory pack
2072	for (int i = `0`; i < _packset.length(); i++) {
2073	co_locate_pack(_packset.at(i));
2074	}
2075	}
2076
2077	//-------------------------------remove_and_insert-------------------
2078	// Remove "current" from its current position in the memory graph and insert
2079	// it after the appropriate insertion point (lip or uip).
2080	void SuperWord::remove_and_insert(MemNode current, MemNode prev, MemNode *lip,
2081	Node *uip, Unique_Node_List &sched_before) {
2082	Node* my_mem = current->in(MemNode::Memory);
2083	bool sched_up = sched_before.member(current);
2084
2085	// remove current_store from its current position in the memmory graph
2086	for (DUIterator i = current->outs(); current->has_out(i); i++) {
2087	Node* use = current->out(i);
2088	if (use->is_Mem()) {
2089	assert(use->in(MemNode::Memory) == current, "must be");
2090	if (use == prev) { // connect prev to my_mem
2091	_igvn.replace_input_of(use, MemNode::Memory, my_mem);
2092	--i; //deleted this edge; rescan position
2093	} else if (sched_before.member(use)) {
2094	if (!sched_up) { // Will be moved together with current
2095	_igvn.replace_input_of(use, MemNode::Memory, uip);
2096	--i; //deleted this edge; rescan position
2097	}
2098	} else {
2099	if (sched_up) { // Will be moved together with current
2100	_igvn.replace_input_of(use, MemNode::Memory, lip);
2101	--i; //deleted this edge; rescan position
2102	}
2103	}
2104	}
2105	}
2106
2107	Node *insert_pt = sched_up ? uip : lip;
2108
2109	// all uses of insert_pt's memory state should use current's instead
2110	for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
2111	Node* use = insert_pt->out(i);
2112	if (use->is_Mem()) {
2113	assert(use->in(MemNode::Memory) == insert_pt, "must be");
2114	_igvn.replace_input_of(use, MemNode::Memory, current);
2115	--i; //deleted this edge; rescan position
2116	} else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
2117	uint pos; //lip (lower insert point) must be the last one in the memory slice
2118	for (pos=`1`; pos < use->req(); pos++) {
2119	if (use->in(pos) == insert_pt) break;
2120	}
2121	_igvn.replace_input_of(use, pos, current);
2122	--i;
2123	}
2124	}
2125
2126	//connect current to insert_pt
2127	_igvn.replace_input_of(current, MemNode::Memory, insert_pt);
2128	}
2129
2130	//------------------------------co_locate_pack----------------------------------
2131	// To schedule a store pack, we need to move any sandwiched memory ops either before
2132	// or after the pack, based upon dependence information:
2133	// (1) If any store in the pack depends on the sandwiched memory op, the
2134	// sandwiched memory op must be scheduled BEFORE the pack;
2135	// (2) If a sandwiched memory op depends on any store in the pack, the
2136	// sandwiched memory op must be scheduled AFTER the pack;
2137	// (3) If a sandwiched memory op (say, memA) depends on another sandwiched
2138	// memory op (say memB), memB must be scheduled before memA. So, if memA is
2139	// scheduled before the pack, memB must also be scheduled before the pack;
2140	// (4) If there is no dependence restriction for a sandwiched memory op, we simply
2141	// schedule this store AFTER the pack
2142	// (5) We know there is no dependence cycle, so there in no other case;
2143	// (6) Finally, all memory ops in another single pack should be moved in the same direction.
2144	//
2145	// To schedule a load pack, we use the memory state of either the first or the last load in
2146	// the pack, based on the dependence constraint.
2147	void SuperWord::co_locate_pack(Node_List* pk) {
2148	if (pk->at(`0`)->is_Store()) {
2149	MemNode* first = executed_first(pk)->as_Mem();
2150	MemNode* last = executed_last(pk)->as_Mem();
2151	Unique_Node_List schedule_before_pack;
2152	Unique_Node_List memops;
2153
2154	MemNode* current = last->in(MemNode::Memory)->as_Mem();
2155	MemNode* previous = last;
2156	while (true) {
2157	assert(in_bb(current), "stay in block");
2158	memops.push(previous);
2159	for (DUIterator i = current->outs(); current->has_out(i); i++) {
2160	Node* use = current->out(i);
2161	if (use->is_Mem() && use != previous)
2162	memops.push(use);
2163	}
2164	if (current == first) break;
2165	previous = current;
2166	current = current->in(MemNode::Memory)->as_Mem();
2167	}
2168
2169	// determine which memory operations should be scheduled before the pack
2170	for (uint i = `1`; i < memops.size(); i++) {
2171	Node *s1 = memops.at(i);
2172	if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
2173	for (uint j = `0`; j< i; j++) {
2174	Node *s2 = memops.at(j);
2175	if (!independent(s1, s2)) {
2176	if (in_pack(s2, pk) \|\| schedule_before_pack.member(s2)) {
2177	schedule_before_pack.push(s1); // s1 must be scheduled before
2178	Node_List* mem_pk = my_pack(s1);
2179	if (mem_pk != NULL) {
2180	for (uint ii = `0`; ii < mem_pk->size(); ii++) {
2181	Node* s = mem_pk->at(ii); // follow partner
2182	if (memops.member(s) && !schedule_before_pack.member(s))
2183	schedule_before_pack.push(s);
2184	}
2185	}
2186	break;
2187	}
2188	}
2189	}
2190	}
2191	}
2192
2193	Node* upper_insert_pt = first->in(MemNode::Memory);
2194	// Following code moves loads connected to upper_insert_pt below aliased stores.
2195	// Collect such loads here and reconnect them back to upper_insert_pt later.
2196	memops.clear();
2197	for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
2198	Node* use = upper_insert_pt->out(i);
2199	if (use->is_Mem() && !use->is_Store()) {
2200	memops.push(use);
2201	}
2202	}
2203
2204	MemNode* lower_insert_pt = last;
2205	previous = last; //previous store in pk
2206	current = last->in(MemNode::Memory)->as_Mem();
2207
2208	// start scheduling from "last" to "first"
2209	while (true) {
2210	assert(in_bb(current), "stay in block");
2211	assert(in_pack(previous, pk), "previous stays in pack");
2212	Node* my_mem = current->in(MemNode::Memory);
2213
2214	if (in_pack(current, pk)) {
2215	// Forward users of my memory state (except "previous) to my input memory state
2216	for (DUIterator i = current->outs(); current->has_out(i); i++) {
2217	Node* use = current->out(i);
2218	if (use->is_Mem() && use != previous) {
2219	assert(use->in(MemNode::Memory) == current, "must be");
2220	if (schedule_before_pack.member(use)) {
2221	_igvn.replace_input_of(use, MemNode::Memory, upper_insert_pt);
2222	} else {
2223	_igvn.replace_input_of(use, MemNode::Memory, lower_insert_pt);
2224	}
2225	--i; // deleted this edge; rescan position
2226	}
2227	}
2228	previous = current;
2229	} else { // !in_pack(current, pk) ==> a sandwiched store
2230	remove_and_insert(current, previous, lower_insert_pt, upper_insert_pt, schedule_before_pack);
2231	}
2232
2233	if (current == first) break;
2234	current = my_mem->as_Mem();
2235	} // end while
2236
2237	// Reconnect loads back to upper_insert_pt.
2238	for (uint i = `0`; i < memops.size(); i++) {
2239	Node *ld = memops.at(i);
2240	if (ld->in(MemNode::Memory) != upper_insert_pt) {
2241	_igvn.replace_input_of(ld, MemNode::Memory, upper_insert_pt);
2242	}
2243	}
2244	} else if (pk->at(`0`)->is_Load()) { //load
2245	// all loads in the pack should have the same memory state. By default,
2246	// we use the memory state of the last load. However, if any load could
2247	// not be moved down due to the dependence constraint, we use the memory
2248	// state of the first load.
2249	Node* first_mem = pk->at(`0`)->in(MemNode::Memory);
2250	Node* last_mem = first_mem;
2251	for (uint i = `1`; i < pk->size(); i++) {
2252	Node* ld = pk->at(i);
2253	Node* mem = ld->in(MemNode::Memory);
2254	assert(in_bb(first_mem) \|\| in_bb(mem) \|\| mem == first_mem, "2 different memory state from outside the loop?");
2255	if (in_bb(mem)) {
2256	if (in_bb(first_mem) && bb_idx(mem) < bb_idx(first_mem)) {
2257	first_mem = mem;
2258	}
2259	if (!in_bb(last_mem) \|\| bb_idx(mem) > bb_idx(last_mem)) {
2260	last_mem = mem;
2261	}
2262	}
2263	}
2264	bool schedule_last = true;
2265	for (uint i = `0`; i < pk->size(); i++) {
2266	Node* ld = pk->at(i);
2267	for (Node* current = last_mem; current != ld->in(MemNode::Memory);
2268	current=current->in(MemNode::Memory)) {
2269	assert(current != first_mem, "corrupted memory graph");
2270	if(current->is_Mem() && !independent(current, ld)){
2271	schedule_last = false; // a later store depends on this load
2272	break;
2273	}
2274	}
2275	}
2276
2277	Node* mem_input = schedule_last ? last_mem : first_mem;
2278	_igvn.hash_delete(mem_input);
2279	// Give each load the same memory state
2280	for (uint i = `0`; i < pk->size(); i++) {
2281	LoadNode* ld = pk->at(i)->as_Load();
2282	_igvn.replace_input_of(ld, MemNode::Memory, mem_input);
2283	}
2284	}
2285	}
2286
2287	#ifndef PRODUCT
2288	void SuperWord::print_loop(bool whole) {
2289	Node_Stack stack(_arena, _phase->C->unique() >> `2`);
2290	Node_List rpo_list;
2291	VectorSet visited(_arena);
2292	visited.set(lpt()->_head->_idx);
2293	_phase->rpo(lpt()->_head, stack, visited, rpo_list);
2294	_phase->dump(lpt(), rpo_list.size(), rpo_list );
2295	if(whole) {
2296	tty->print_cr("\n Whole loop tree");
2297	_phase->dump();
2298	tty->print_cr(" End of whole loop tree\n");
2299	}
2300	}
2301	#endif
2302
2303	//------------------------------output---------------------------
2304	// Convert packs into vector node operations
2305	void SuperWord::output() {
2306	CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2307	Compile* C = _phase->C;
2308	if (_packset.length() == `0`) {
2309	if (cl->is_main_loop()) {
2310	// Instigate more unrolling for optimization when vectorization fails.
2311	C->set_major_progress();
2312	cl->set_notpassed_slp();
2313	cl->mark_do_unroll_only();
2314	}
2315	return;
2316	}
2317
2318	#ifndef PRODUCT
2319	if (TraceLoopOpts) {
2320	tty->print("SuperWord::output ");
2321	lpt()->dump_head();
2322	}
2323	#endif
2324
2325	if (cl->is_main_loop()) {
2326	// MUST ENSURE main loop's initial value is properly aligned:
2327	// (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0
2328
2329	align_initial_loop_index(align_to_ref());
2330
2331	// Insert extract (unpack) operations for scalar uses
2332	for (int i = `0`; i < _packset.length(); i++) {
2333	insert_extracts(_packset.at(i));
2334	}
2335	}
2336
2337	uint max_vlen_in_bytes = `0`;
2338	uint max_vlen = `0`;
2339	bool can_process_post_loop = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
2340
2341	NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
2342
2343	CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
2344
2345	NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
2346
2347	if (do_reserve_copy() && !make_reversable.has_reserved()) {
2348	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
2349	return;
2350	}
2351
2352	for (int i = `0`; i < _block.length(); i++) {
2353	Node* n = _block.at(i);
2354	Node_List* p = my_pack(n);
2355	if (p && n == executed_last(p)) {
2356	uint vlen = p->size();
2357	uint vlen_in_bytes = `0`;
2358	Node* vn = NULL;
2359	Node* low_adr = p->at(`0`);
2360	Node* first = executed_first(p);
2361	if (can_process_post_loop) {
2362	// override vlen with the main loops vector length
2363	vlen = cl->slp_max_unroll();
2364	}
2365	NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
2366	int opc = n->Opcode();
2367	if (n->is_Load()) {
2368	Node* ctl = n->in(MemNode::Control);
2369	Node* mem = first->in(MemNode::Memory);
2370	SWPointer p1(n->as_Mem(), this, NULL, false);
2371	// Identify the memory dependency for the new loadVector node by
2372	// walking up through memory chain.
2373	// This is done to give flexibility to the new loadVector node so that
2374	// it can move above independent storeVector nodes.
2375	while (mem->is_StoreVector()) {
2376	SWPointer p2(mem->as_Mem(), this, NULL, false);
2377	int cmp = p1.cmp(p2);
2378	if (SWPointer::not_equal(cmp) \|\| !SWPointer::comparable(cmp)) {
2379	mem = mem->in(MemNode::Memory);
2380	} else {
2381	break; // dependent memory
2382	}
2383	}
2384	Node* adr = low_adr->in(MemNode::Address);
2385	const TypePtr* atyp = n->adr_type();
2386	vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p));
2387	vlen_in_bytes = vn->as_LoadVector()->memory_size();
2388	} else if (n->is_Store()) {
2389	// Promote value to be stored to vector
2390	Node* val = vector_opd(p, MemNode::ValueIn);
2391	if (val == NULL) {
2392	if (do_reserve_copy()) {
2393	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: val should not be NULL, exiting SuperWord");})
2394	return; //and reverse to backup IG
2395	}
2396	ShouldNotReachHere();
2397	}
2398
2399	Node* ctl = n->in(MemNode::Control);
2400	Node* mem = first->in(MemNode::Memory);
2401	Node* adr = low_adr->in(MemNode::Address);
2402	const TypePtr* atyp = n->adr_type();
2403	vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
2404	vlen_in_bytes = vn->as_StoreVector()->memory_size();
2405	} else if (VectorNode::is_muladds2i(n)) {
2406	assert(n->req() == `5u`, "MulAddS2I should have 4 operands.");
2407	Node* in1 = vector_opd(p, `1`);
2408	Node* in2 = vector_opd(p, `2`);
2409	vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2410	vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2411	} else if (n->req() == `3` && !is_cmov_pack(p)) {
2412	// Promote operands to vector
2413	Node* in1 = NULL;
2414	bool node_isa_reduction = n->is_reduction();
2415	if (node_isa_reduction) {
2416	// the input to the first reduction operation is retained
2417	in1 = low_adr->in(`1`);
2418	} else {
2419	in1 = vector_opd(p, `1`);
2420	if (in1 == NULL) {
2421	if (do_reserve_copy()) {
2422	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: in1 should not be NULL, exiting SuperWord");})
2423	return; //and reverse to backup IG
2424	}
2425	ShouldNotReachHere();
2426	}
2427	}
2428	Node* in2 = vector_opd(p, `2`);
2429	if (in2 == NULL) {
2430	if (do_reserve_copy()) {
2431	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: in2 should not be NULL, exiting SuperWord");})
2432	return; //and reverse to backup IG
2433	}
2434	ShouldNotReachHere();
2435	}
2436	if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() \|\| n->is_Mul())) {
2437	// Move invariant vector input into second position to avoid register spilling.
2438	Node* tmp = in1;
2439	in1 = in2;
2440	in2 = tmp;
2441	}
2442	if (node_isa_reduction) {
2443	const Type *arith_type = n->bottom_type();
2444	vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
2445	if (in2->is_Load()) {
2446	vlen_in_bytes = in2->as_LoadVector()->memory_size();
2447	} else {
2448	vlen_in_bytes = in2->as_Vector()->length_in_bytes();
2449	}
2450	} else {
2451	vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
2452	vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2453	}
2454	} else if (opc == Op_SqrtF \|\| opc == Op_SqrtD \|\|
2455	opc == Op_AbsF \|\| opc == Op_AbsD \|\|
2456	opc == Op_AbsI \|\| opc == Op_AbsL \|\|
2457	opc == Op_NegF \|\| opc == Op_NegD \|\|
2458	opc == Op_PopCountI) {
2459	assert(n->req() == `2`, "only one input expected");
2460	Node* in = vector_opd(p, `1`);
2461	vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
2462	vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2463	} else if (is_cmov_pack(p)) {
2464	if (can_process_post_loop) {
2465	// do not refactor of flow in post loop context
2466	return;
2467	}
2468	if (!n->is_CMove()) {
2469	continue;
2470	}
2471	// place here CMoveVDNode
2472	NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
2473	Node* bol = n->in(CMoveNode::Condition);
2474	if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > `1` ) {
2475	NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(`1`)->_idx); bol->dump(); bol->in(`1`)->dump();})
2476	bol = bol->in(`1`); //may be ExtractNode
2477	}
2478
2479	assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
2480	if (!bol->is_Bool()) {
2481	if (do_reserve_copy()) {
2482	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
2483	return; //and reverse to backup IG
2484	}
2485	ShouldNotReachHere();
2486	}
2487
2488	int cond = (int)bol->as_Bool()->_test._test;
2489	Node* in_cc = _igvn.intcon(cond);
2490	NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created intcon in_cc node %d", in_cc->_idx); in_cc->dump();})
2491	Node* cc = bol->clone();
2492	cc->set_req(`1`, in_cc);
2493	NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created bool cc node %d", cc->_idx); cc->dump();})
2494
2495	Node* src1 = vector_opd(p, `2`); //2=CMoveNode::IfFalse
2496	if (src1 == NULL) {
2497	if (do_reserve_copy()) {
2498	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: src1 should not be NULL, exiting SuperWord");})
2499	return; //and reverse to backup IG
2500	}
2501	ShouldNotReachHere();
2502	}
2503	Node* src2 = vector_opd(p, `3`); //3=CMoveNode::IfTrue
2504	if (src2 == NULL) {
2505	if (do_reserve_copy()) {
2506	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: src2 should not be NULL, exiting SuperWord");})
2507	return; //and reverse to backup IG
2508	}
2509	ShouldNotReachHere();
2510	}
2511	BasicType bt = velt_basic_type(n);
2512	const TypeVect* vt = TypeVect::make(bt, vlen);
2513	assert(bt == T_FLOAT \|\| bt == T_DOUBLE, "Only vectorization for FP cmovs is supported");
2514	if (bt == T_FLOAT) {
2515	vn = new CMoveVFNode (cc, src1, src2, vt);
2516	} else {
2517	assert(bt == T_DOUBLE, "Expected double");
2518	vn = new CMoveVDNode (cc, src1, src2, vt);
2519	}
2520	NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
2521	} else if (opc == Op_FmaD \|\| opc == Op_FmaF) {
2522	// Promote operands to vector
2523	Node* in1 = vector_opd(p, `1`);
2524	Node* in2 = vector_opd(p, `2`);
2525	Node* in3 = vector_opd(p, `3`);
2526	vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
2527	vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2528	} else {
2529	if (do_reserve_copy()) {
2530	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
2531	return; //and reverse to backup IG
2532	}
2533	ShouldNotReachHere();
2534	}
2535
2536	assert(vn != NULL, "sanity");
2537	if (vn == NULL) {
2538	if (do_reserve_copy()){
2539	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
2540	return; //and reverse to backup IG
2541	}
2542	ShouldNotReachHere();
2543	}
2544
2545	_block.at_put(i, vn);
2546	_igvn.register_new_node_with_optimizer(vn);
2547	_phase->set_ctrl(vn, _phase->get_ctrl(p->at(`0`)));
2548	for (uint j = `0`; j < p->size(); j++) {
2549	Node* pm = p->at(j);
2550	_igvn.replace_node(pm, vn);
2551	}
2552	_igvn._worklist.push(vn);
2553
2554	if (can_process_post_loop) {
2555	// first check if the vector size if the maximum vector which we can use on the machine,
2556	// other vector size have reduced values for predicated data mapping.
2557	if (vlen_in_bytes != (uint)MaxVectorSize) {
2558	return;
2559	}
2560	}
2561
2562	if (vlen_in_bytes >= max_vlen_in_bytes && vlen > max_vlen) {
2563	max_vlen = vlen;
2564	max_vlen_in_bytes = vlen_in_bytes;
2565	}
2566	#ifdef ASSERT
2567	if (TraceNewVectors) {
2568	tty->print("new Vector node: ");
2569	vn->dump();
2570	}
2571	#endif
2572	}
2573	}//for (int i = 0; i < _block.length(); i++)
2574
2575	if (max_vlen_in_bytes > C->max_vector_size()) {
2576	C->set_max_vector_size(max_vlen_in_bytes);
2577	}
2578	if (max_vlen_in_bytes > `0`) {
2579	cl->mark_loop_vectorized();
2580	}
2581
2582	if (SuperWordLoopUnrollAnalysis) {
2583	if (cl->has_passed_slp()) {
2584	uint slp_max_unroll_factor = cl->slp_max_unroll();
2585	if (slp_max_unroll_factor == max_vlen) {
2586	if (TraceSuperWordLoopUnrollAnalysis) {
2587	tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte);
2588	}
2589
2590	// For atomic unrolled loops which are vector mapped, instigate more unrolling
2591	cl->set_notpassed_slp();
2592	if (cl->is_main_loop()) {
2593	// if vector resources are limited, do not allow additional unrolling, also
2594	// do not unroll more on pure vector loops which were not reduced so that we can
2595	// program the post loop to single iteration execution.
2596	if (FLOATPRESSURE > `8`) {
2597	C->set_major_progress();
2598	cl->mark_do_unroll_only();
2599	}
2600	}
2601
2602	if (do_reserve_copy()) {
2603	if (can_process_post_loop) {
2604	// Now create the difference of trip and limit and use it as our mask index.
2605	// Note: We limited the unroll of the vectorized loop so that
2606	// only vlen-1 size iterations can remain to be mask programmed.
2607	Node *incr = cl->incr();
2608	SubINode index = new* SubINode (cl->limit(), cl->init_trip());
2609	_igvn.register_new_node_with_optimizer(index);
2610	SetVectMaskINode mask = new* SetVectMaskINode (_phase->get_ctrl(cl->init_trip()), index);
2611	_igvn.register_new_node_with_optimizer(mask);
2612	// make this a single iteration loop
2613	AddINode new_incr = new* AddINode (incr->in(`1`), mask);
2614	_igvn.register_new_node_with_optimizer(new_incr);
2615	_phase->set_ctrl(new_incr, _phase->get_ctrl(incr));
2616	_igvn.replace_node(incr, new_incr);
2617	cl->mark_is_multiversioned();
2618	cl->loopexit()->add_flag(Node::Flag_has_vector_mask_set);
2619	}
2620	}
2621	}
2622	}
2623	}
2624
2625	if (do_reserve_copy()) {
2626	make_reversable.use_new();
2627	}
2628	NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
2629	return;
2630	}
2631
2632	//------------------------------vector_opd---------------------------
2633	// Create a vector operand for the nodes in pack p for operand: in(opd_idx)
2634	Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
2635	Node* p0 = p->at(`0`);
2636	uint vlen = p->size();
2637	Node* opd = p0->in(opd_idx);
2638	CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
2639
2640	if (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop()) {
2641	// override vlen with the main loops vector length
2642	vlen = cl->slp_max_unroll();
2643	}
2644
2645	if (same_inputs(p, opd_idx)) {
2646	if (opd->is_Vector() \|\| opd->is_LoadVector()) {
2647	assert(((opd_idx != `2`) \|\| !VectorNode::is_shift(p0)), "shift's count can't be vector");
2648	if (opd_idx == `2` && VectorNode::is_shift(p0)) {
2649	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
2650	return NULL;
2651	}
2652	return opd; // input is matching vector
2653	}
2654	if ((opd_idx == `2`) && VectorNode::is_shift(p0)) {
2655	Compile* C = _phase->C;
2656	Node* cnt = opd;
2657	// Vector instructions do not mask shift count, do it here.
2658	juint mask = (p0->bottom_type() == TypeInt::INT) ? (BitsPerInt - `1`) : (BitsPerLong - `1`);
2659	const TypeInt* t = opd->find_int_type();
2660	if (t != NULL && t->is_con()) {
2661	juint shift = t->get_con();
2662	if (shift > mask) { // Unsigned cmp
2663	cnt = ConNode::make(TypeInt::make(shift & mask));
2664	}
2665	} else {
2666	if (t == NULL \|\| t->_lo < `0` \|\| t->_hi > (int)mask) {
2667	cnt = ConNode::make(TypeInt::make(mask));
2668	_igvn.register_new_node_with_optimizer(cnt);
2669	cnt = new AndINode (opd, cnt);
2670	_igvn.register_new_node_with_optimizer(cnt);
2671	_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
2672	}
2673	assert(opd->bottom_type()->isa_int(), "int type only");
2674	if (!opd->bottom_type()->isa_int()) {
2675	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("Should be int type only");})
2676	return NULL;
2677	}
2678	// Move non constant shift count into vector register.
2679	cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
2680	}
2681	if (cnt != opd) {
2682	_igvn.register_new_node_with_optimizer(cnt);
2683	_phase->set_ctrl(cnt, _phase->get_ctrl(opd));
2684	}
2685	return cnt;
2686	}
2687	assert(!opd->is_StoreVector(), "such vector is not expected here");
2688	if (opd->is_StoreVector()) {
2689	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("StoreVector is not expected here");})
2690	return NULL;
2691	}
2692	// Convert scalar input to vector with the same number of elements as
2693	// p0's vector. Use p0's type because size of operand's container in
2694	// vector should match p0's size regardless operand's size.
2695	const Type* p0_t = velt_type(p0);
2696	VectorNode* vn = VectorNode::scalar2vector(opd, vlen, p0_t);
2697
2698	_igvn.register_new_node_with_optimizer(vn);
2699	_phase->set_ctrl(vn, _phase->get_ctrl(opd));
2700	#ifdef ASSERT
2701	if (TraceNewVectors) {
2702	tty->print("new Vector node: ");
2703	vn->dump();
2704	}
2705	#endif
2706	return vn;
2707	}
2708
2709	// Insert pack operation
2710	BasicType bt = velt_basic_type(p0);
2711	PackNode* pk = PackNode::make(opd, vlen, bt);
2712	DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
2713
2714	for (uint i = `1`; i < vlen; i++) {
2715	Node* pi = p->at(i);
2716	Node* in = pi->in(opd_idx);
2717	assert(my_pack(in) == NULL, "Should already have been unpacked");
2718	if (my_pack(in) != NULL) {
2719	NOT_PRODUCT(if(is_trace_loop_reverse() \|\| TraceLoopOpts) {tty->print_cr("Should already have been unpacked");})
2720	return NULL;
2721	}
2722	assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
2723	pk->add_opd(in);
2724	if (VectorNode::is_muladds2i(pi)) {
2725	Node* in2 = pi->in(opd_idx + `2`);
2726	assert(my_pack(in2) == NULL, "Should already have been unpacked");
2727	if (my_pack(in2) != NULL) {
2728	NOT_PRODUCT(if (is_trace_loop_reverse() \|\| TraceLoopOpts) { tty->print_cr("Should already have been unpacked"); })
2729	return NULL;
2730	}
2731	assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
2732	pk->add_opd(in2);
2733	}
2734	}
2735	_igvn.register_new_node_with_optimizer(pk);
2736	_phase->set_ctrl(pk, _phase->get_ctrl(opd));
2737	#ifdef ASSERT
2738	if (TraceNewVectors) {
2739	tty->print("new Vector node: ");
2740	pk->dump();
2741	}
2742	#endif
2743	return pk;
2744	}
2745
2746	//------------------------------insert_extracts---------------------------
2747	// If a use of pack p is not a vector use, then replace the
2748	// use with an extract operation.
2749	void SuperWord::insert_extracts(Node_List* p) {
2750	if (p->at(`0`)->is_Store()) return;
2751	assert(_n_idx_list.is_empty(), "empty (node,index) list");
2752
2753	// Inspect each use of each pack member. For each use that is
2754	// not a vector use, replace the use with an extract operation.
2755
2756	for (uint i = `0`; i < p->size(); i++) {
2757	Node* def = p->at(i);
2758	for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
2759	Node* use = def->fast_out(j);
2760	for (uint k = `0`; k < use->req(); k++) {
2761	Node* n = use->in(k);
2762	if (def == n) {
2763	Node_List* u_pk = my_pack(use);
2764	if ((u_pk == NULL \|\| !is_cmov_pack(u_pk) \|\| use->is_CMove()) && !is_vector_use(use, k)) {
2765	_n_idx_list.push(use, k);
2766	}
2767	}
2768	}
2769	}
2770	}
2771
2772	while (_n_idx_list.is_nonempty()) {
2773	Node* use = _n_idx_list.node();
2774	int idx = _n_idx_list.index();
2775	_n_idx_list.pop();
2776	Node* def = use->in(idx);
2777
2778	if (def->is_reduction()) continue;
2779
2780	// Insert extract operation
2781	_igvn.hash_delete(def);
2782	int def_pos = alignment(def) / data_size(def);
2783
2784	Node* ex = ExtractNode::make(def, def_pos, velt_basic_type(def));
2785	_igvn.register_new_node_with_optimizer(ex);
2786	_phase->set_ctrl(ex, _phase->get_ctrl(def));
2787	_igvn.replace_input_of(use, idx, ex);
2788	_igvn._worklist.push(def);
2789
2790	bb_insert_after(ex, bb_idx(def));
2791	set_velt_type(ex, velt_type(def));
2792	}
2793	}
2794
2795	//------------------------------is_vector_use---------------------------
2796	// Is use->in(u_idx) a vector use?
2797	bool SuperWord::is_vector_use(Node* use, int u_idx) {
2798	Node_List* u_pk = my_pack(use);
2799	if (u_pk == NULL) return false;
2800	if (use->is_reduction()) return true;
2801	Node* def = use->in(u_idx);
2802	Node_List* d_pk = my_pack(def);
2803	if (d_pk == NULL) {
2804	// check for scalar promotion
2805	Node* n = u_pk->at(`0`)->in(u_idx);
2806	for (uint i = `1`; i < u_pk->size(); i++) {
2807	if (u_pk->at(i)->in(u_idx) != n) return false;
2808	}
2809	return true;
2810	}
2811	if (VectorNode::is_muladds2i(use)) {
2812	// MulAddS2I takes shorts and produces ints - hence the special checks
2813	// on alignment and size.
2814	if (u_pk->size() * `2` != d_pk->size()) {
2815	return false;
2816	}
2817	for (uint i = `0`; i < MIN2(d_pk->size(), u_pk->size()); i++) {
2818	Node* ui = u_pk->at(i);
2819	Node* di = d_pk->at(i);
2820	if (alignment(ui) != alignment(di) * `2`) {
2821	return false;
2822	}
2823	}
2824	return true;
2825	}
2826	if (u_pk->size() != d_pk->size())
2827	return false;
2828	for (uint i = `0`; i < u_pk->size(); i++) {
2829	Node* ui = u_pk->at(i);
2830	Node* di = d_pk->at(i);
2831	if (ui->in(u_idx) != di \|\| alignment(ui) != alignment(di))
2832	return false;
2833	}
2834	return true;
2835	}
2836
2837	//------------------------------construct_bb---------------------------
2838	// Construct reverse postorder list of block members
2839	bool SuperWord::construct_bb() {
2840	Node* entry = bb();
2841
2842	assert(_stk.length() == `0`, "stk is empty");
2843	assert(_block.length() == `0`, "block is empty");
2844	assert(_data_entry.length() == `0`, "data_entry is empty");
2845	assert(_mem_slice_head.length() == `0`, "mem_slice_head is empty");
2846	assert(_mem_slice_tail.length() == `0`, "mem_slice_tail is empty");
2847
2848	// Find non-control nodes with no inputs from within block,
2849	// create a temporary map from node _idx to bb_idx for use
2850	// by the visited and post_visited sets,
2851	// and count number of nodes in block.
2852	int bb_ct = `0`;
2853	for (uint i = `0`; i < lpt()->_body.size(); i++) {
2854	Node *n = lpt()->_body.at(i);
2855	set_bb_idx(n, i); // Create a temporary map
2856	if (in_bb(n)) {
2857	if (n->is_LoadStore() \|\| n->is_MergeMem() \|\|
2858	(n->is_Proj() && !n->as_Proj()->is_CFG())) {
2859	// Bailout if the loop has LoadStore, MergeMem or data Proj
2860	// nodes. Superword optimization does not work with them.
2861	return false;
2862	}
2863	bb_ct++;
2864	if (!n->is_CFG()) {
2865	bool found = false;
2866	for (uint j = `0`; j < n->req(); j++) {
2867	Node* def = n->in(j);
2868	if (def && in_bb(def)) {
2869	found = true;
2870	break;
2871	}
2872	}
2873	if (!found) {
2874	assert(n != entry, "can't be entry");
2875	_data_entry.push(n);
2876	}
2877	}
2878	}
2879	}
2880
2881	// Find memory slices (head and tail)
2882	for (DUIterator_Fast imax, i = lp()->fast_outs(imax); i < imax; i++) {
2883	Node *n = lp()->fast_out(i);
2884	if (in_bb(n) && (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
2885	Node* n_tail = n->in(LoopNode::LoopBackControl);
2886	if (n_tail != n->in(LoopNode::EntryControl)) {
2887	if (!n_tail->is_Mem()) {
2888	assert(n_tail->is_Mem(), "unexpected node for memory slice: %s", n_tail->Name());
2889	return false; // Bailout
2890	}
2891	_mem_slice_head.push(n);
2892	_mem_slice_tail.push(n_tail);
2893	}
2894	}
2895	}
2896
2897	// Create an RPO list of nodes in block
2898
2899	visited_clear();
2900	post_visited_clear();
2901
2902	// Push all non-control nodes with no inputs from within block, then control entry
2903	for (int j = `0`; j < _data_entry.length(); j++) {
2904	Node* n = _data_entry.at(j);
2905	visited_set(n);
2906	_stk.push(n);
2907	}
2908	visited_set(entry);
2909	_stk.push(entry);
2910
2911	// Do a depth first walk over out edges
2912	int rpo_idx = bb_ct - `1`;
2913	int size;
2914	int reduction_uses = `0`;
2915	while ((size = _stk.length()) > `0`) {
2916	Node* n = _stk.top(); // Leave node on stack
2917	if (!visited_test_set(n)) {
2918	// forward arc in graph
2919	} else if (!post_visited_test(n)) {
2920	// cross or back arc
2921	for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
2922	Node *use = n->fast_out(i);
2923	if (in_bb(use) && !visited_test(use) &&
2924	// Don't go around backedge
2925	(!use->is_Phi() \|\| n == entry)) {
2926	if (use->is_reduction()) {
2927	// First see if we can map the reduction on the given system we are on, then
2928	// make a data entry operation for each reduction we see.
2929	BasicType bt = use->bottom_type()->basic_type();
2930	if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
2931	reduction_uses++;
2932	}
2933	}
2934	_stk.push(use);
2935	}
2936	}
2937	if (_stk.length() == size) {
2938	// There were no additional uses, post visit node now
2939	_stk.pop(); // Remove node from stack
2940	assert(rpo_idx >= `0`, "");
2941	_block.at_put_grow(rpo_idx, n);
2942	rpo_idx--;
2943	post_visited_set(n);
2944	assert(rpo_idx >= `0` \|\| _stk.is_empty(), "");
2945	}
2946	} else {
2947	_stk.pop(); // Remove post-visited node from stack
2948	}
2949	}//while
2950
2951	int ii_current = -`1`;
2952	unsigned int load_idx = (unsigned int)-`1`;
2953	_ii_order.clear();
2954	// Create real map of block indices for nodes
2955	for (int j = `0`; j < _block.length(); j++) {
2956	Node* n = _block.at(j);
2957	set_bb_idx(n, j);
2958	if (_do_vector_loop && n->is_Load()) {
2959	if (ii_current == -`1`) {
2960	ii_current = _clone_map.gen(n->_idx);
2961	_ii_order.push(ii_current);
2962	load_idx = _clone_map.idx(n->_idx);
2963	} else if (_clone_map.idx(n->_idx) == load_idx && _clone_map.gen(n->_idx) != ii_current) {
2964	ii_current = _clone_map.gen(n->_idx);
2965	_ii_order.push(ii_current);
2966	}
2967	}
2968	}//for
2969
2970	// Ensure extra info is allocated.
2971	initialize_bb();
2972
2973	#ifndef PRODUCT
2974	if (_vector_loop_debug && _ii_order.length() > `0`) {
2975	tty->print("SuperWord::construct_bb: List of generations: ");
2976	for (int jj = `0`; jj < _ii_order.length(); ++jj) {
2977	tty->print(" %d:%d", jj, _ii_order.at(jj));
2978	}
2979	tty->print_cr(" ");
2980	}
2981	if (TraceSuperWord) {
2982	print_bb();
2983	tty->print_cr("\ndata entry nodes: %s", _data_entry.length() > `0` ? "" : "NONE");
2984	for (int m = `0`; m < _data_entry.length(); m++) {
2985	tty->print("%3d ", m);
2986	_data_entry.at(m)->dump();
2987	}
2988	tty->print_cr("\nmemory slices: %s", _mem_slice_head.length() > `0` ? "" : "NONE");
2989	for (int m = `0`; m < _mem_slice_head.length(); m++) {
2990	tty->print("%3d ", m); _mem_slice_head.at(m)->dump();
2991	tty->print(" "); _mem_slice_tail.at(m)->dump();
2992	}
2993	}
2994	#endif
2995	assert(rpo_idx == -`1` && bb_ct == _block.length(), "all block members found");
2996	return (_mem_slice_head.length() > `0`) \|\| (reduction_uses > `0`) \|\| (_data_entry.length() > `0`);
2997	}
2998
2999	//------------------------------initialize_bb---------------------------
3000	// Initialize per node info
3001	void SuperWord::initialize_bb() {
3002	Node* last = _block.at(_block.length() - `1`);
3003	grow_node_info(bb_idx(last));
3004	}
3005
3006	//------------------------------bb_insert_after---------------------------
3007	// Insert n into block after pos
3008	void SuperWord::bb_insert_after(Node* n, int pos) {
3009	int n_pos = pos + `1`;
3010	// Make room
3011	for (int i = _block.length() - `1`; i >= n_pos; i--) {
3012	_block.at_put_grow(i+`1`, _block.at(i));
3013	}
3014	for (int j = _node_info.length() - `1`; j >= n_pos; j--) {
3015	_node_info.at_put_grow(j+`1`, _node_info.at(j));
3016	}
3017	// Set value
3018	_block.at_put_grow(n_pos, n);
3019	_node_info.at_put_grow(n_pos, SWNodeInfo::initial);
3020	// Adjust map from node->_idx to _block index
3021	for (int i = n_pos; i < _block.length(); i++) {
3022	set_bb_idx(_block.at(i), i);
3023	}
3024	}
3025
3026	//------------------------------compute_max_depth---------------------------
3027	// Compute max depth for expressions from beginning of block
3028	// Use to prune search paths during test for independence.
3029	void SuperWord::compute_max_depth() {
3030	int ct = `0`;
3031	bool again;
3032	do {
3033	again = false;
3034	for (int i = `0`; i < _block.length(); i++) {
3035	Node* n = _block.at(i);
3036	if (!n->is_Phi()) {
3037	int d_orig = depth(n);
3038	int d_in = `0`;
3039	for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
3040	Node* pred = preds.current();
3041	if (in_bb(pred)) {
3042	d_in = MAX2(d_in, depth(pred));
3043	}
3044	}
3045	if (d_in + `1` != d_orig) {
3046	set_depth(n, d_in + `1`);
3047	again = true;
3048	}
3049	}
3050	}
3051	ct++;
3052	} while (again);
3053
3054	if (TraceSuperWord && Verbose) {
3055	tty->print_cr("compute_max_depth iterated: %d times", ct);
3056	}
3057	}
3058
3059	//-------------------------compute_vector_element_type-----------------------
3060	// Compute necessary vector element type for expressions
3061	// This propagates backwards a narrower integer type when the
3062	// upper bits of the value are not needed.
3063	// Example: char a,b,c; a = b + c;
3064	// Normally the type of the add is integer, but for packed character
3065	// operations the type of the add needs to be char.
3066	void SuperWord::compute_vector_element_type() {
3067	if (TraceSuperWord && Verbose) {
3068	tty->print_cr("\ncompute_velt_type:");
3069	}
3070
3071	// Initial type
3072	for (int i = `0`; i < _block.length(); i++) {
3073	Node* n = _block.at(i);
3074	set_velt_type(n, container_type(n));
3075	}
3076
3077	// Propagate integer narrowed type backwards through operations
3078	// that don't depend on higher order bits
3079	for (int i = _block.length() - `1`; i >= `0`; i--) {
3080	Node* n = _block.at(i);
3081	// Only integer types need be examined
3082	const Type* vtn = velt_type(n);
3083	if (vtn->basic_type() == T_INT) {
3084	uint start, end;
3085	VectorNode::vector_operands(n, &start, &end);
3086
3087	for (uint j = start; j < end; j++) {
3088	Node* in = n->in(j);
3089	// Don't propagate through a memory
3090	if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
3091	data_size(n) < data_size(in)) {
3092	bool same_type = true;
3093	for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
3094	Node *use = in->fast_out(k);
3095	if (!in_bb(use) \|\| !same_velt_type(use, n)) {
3096	same_type = false;
3097	break;
3098	}
3099	}
3100	if (same_type) {
3101	// For right shifts of small integer types (bool, byte, char, short)
3102	// we need precise information about sign-ness. Only Load nodes have
3103	// this information because Store nodes are the same for signed and
3104	// unsigned values. And any arithmetic operation after a load may
3105	// expand a value to signed Int so such right shifts can't be used
3106	// because vector elements do not have upper bits of Int.
3107	const Type* vt = vtn;
3108	if (VectorNode::is_shift(in)) {
3109	Node* load = in->in(`1`);
3110	if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
3111	vt = velt_type(load);
3112	} else if (in->Opcode() != Op_LShiftI) {
3113	// Widen type to Int to avoid creation of right shift vector
3114	// (align + data_size(s1) check in stmts_can_pack() will fail).
3115	// Note, left shifts work regardless type.
3116	vt = TypeInt::INT;
3117	}
3118	}
3119	set_velt_type(in, vt);
3120	}
3121	}
3122	}
3123	}
3124	}
3125	#ifndef PRODUCT
3126	if (TraceSuperWord && Verbose) {
3127	for (int i = `0`; i < _block.length(); i++) {
3128	Node* n = _block.at(i);
3129	velt_type(n)->dump();
3130	tty->print("\t");
3131	n->dump();
3132	}
3133	}
3134	#endif
3135	}
3136
3137	//------------------------------memory_alignment---------------------------
3138	// Alignment within a vector memory reference
3139	int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
3140	#ifndef PRODUCT
3141	if(TraceSuperWord && Verbose) {
3142	tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump();
3143	}
3144	#endif
3145	NOT_PRODUCT(SWPointer::Tracer::Depth ddd(`0`);)
3146	SWPointer p(s, this, NULL, false);
3147	if (!p.valid()) {
3148	NOT_PRODUCT(if(is_trace_alignment()) tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
3149	return bottom_align;
3150	}
3151	int vw = get_vw_bytes_special(s);
3152	if (vw < `2`) {
3153	NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
3154	return bottom_align; // No vectors for this type
3155	}
3156	int offset = p.offset_in_bytes();
3157	offset += iv_adjust*p.memory_size();
3158	int off_rem = offset % vw;
3159	int off_mod = off_rem >= `0` ? off_rem : off_rem + vw;
3160	if (TraceSuperWord && Verbose) {
3161	tty->print_cr("SWPointer::memory_alignment: off_rem = %d, off_mod = %d", off_rem, off_mod);
3162	}
3163	return off_mod;
3164	}
3165
3166	//---------------------------container_type---------------------------
3167	// Smallest type containing range of values
3168	const Type* SuperWord::container_type(Node* n) {
3169	if (n->is_Mem()) {
3170	BasicType bt = n->as_Mem()->memory_type();
3171	if (n->is_Store() && (bt == T_CHAR)) {
3172	// Use T_SHORT type instead of T_CHAR for stored values because any
3173	// preceding arithmetic operation extends values to signed Int.
3174	bt = T_SHORT;
3175	}
3176	if (n->Opcode() == Op_LoadUB) {
3177	// Adjust type for unsigned byte loads, it is important for right shifts.
3178	// T_BOOLEAN is used because there is no basic type representing type
3179	// TypeInt::UBYTE. Use of T_BOOLEAN for vectors is fine because only
3180	// size (one byte) and sign is important.
3181	bt = T_BOOLEAN;
3182	}
3183	return Type::get_const_basic_type(bt);
3184	}
3185	const Type* t = _igvn.type(n);
3186	if (t->basic_type() == T_INT) {
3187	// A narrow type of arithmetic operations will be determined by
3188	// propagating the type of memory operations.
3189	return TypeInt::INT;
3190	}
3191	return t;
3192	}
3193
3194	bool SuperWord::same_velt_type(Node* n1, Node* n2) {
3195	const Type* vt1 = velt_type(n1);
3196	const Type* vt2 = velt_type(n2);
3197	if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
3198	// Compare vectors element sizes for integer types.
3199	return data_size(n1) == data_size(n2);
3200	}
3201	return vt1 == vt2;
3202	}
3203
3204	//------------------------------in_packset---------------------------
3205	// Are s1 and s2 in a pack pair and ordered as s1,s2?
3206	bool SuperWord::in_packset(Node* s1, Node* s2) {
3207	for (int i = `0`; i < _packset.length(); i++) {
3208	Node_List* p = _packset.at(i);
3209	assert(p->size() == `2`, "must be");
3210	if (p->at(`0`) == s1 && p->at(p->size()-`1`) == s2) {
3211	return true;
3212	}
3213	}
3214	return false;
3215	}
3216
3217	//------------------------------in_pack---------------------------
3218	// Is s in pack p?
3219	Node_List* SuperWord::in_pack(Node* s, Node_List* p) {
3220	for (uint i = `0`; i < p->size(); i++) {
3221	if (p->at(i) == s) {
3222	return p;
3223	}
3224	}
3225	return NULL;
3226	}
3227
3228	//------------------------------remove_pack_at---------------------------
3229	// Remove the pack at position pos in the packset
3230	void SuperWord::remove_pack_at(int pos) {
3231	Node_List* p = _packset.at(pos);
3232	for (uint i = `0`; i < p->size(); i++) {
3233	Node* s = p->at(i);
3234	set_my_pack(s, NULL);
3235	}
3236	_packset.remove_at(pos);
3237	}
3238
3239	void SuperWord::packset_sort(int n) {
3240	// simple bubble sort so that we capitalize with O(n) when its already sorted
3241	while (n != `0`) {
3242	bool swapped = false;
3243	for (int i = `1`; i < n; i++) {
3244	Node_List* q_low = _packset.at(i-`1`);
3245	Node_List* q_i = _packset.at(i);
3246
3247	// only swap when we find something to swap
3248	if (alignment(q_low->at(`0`)) > alignment(q_i->at(`0`))) {
3249	Node_List* t = q_i;
3250	*(_packset.adr_at(i)) = q_low;
3251	*(_packset.adr_at(i-`1`)) = q_i;
3252	swapped = true;
3253	}
3254	}
3255	if (swapped == false) break;
3256	n--;
3257	}
3258	}
3259
3260	//------------------------------executed_first---------------------------
3261	// Return the node executed first in pack p. Uses the RPO block list
3262	// to determine order.
3263	Node* SuperWord::executed_first(Node_List* p) {
3264	Node* n = p->at(`0`);
3265	int n_rpo = bb_idx(n);
3266	for (uint i = `1`; i < p->size(); i++) {
3267	Node* s = p->at(i);
3268	int s_rpo = bb_idx(s);
3269	if (s_rpo < n_rpo) {
3270	n = s;
3271	n_rpo = s_rpo;
3272	}
3273	}
3274	return n;
3275	}
3276
3277	//------------------------------executed_last---------------------------
3278	// Return the node executed last in pack p.
3279	Node* SuperWord::executed_last(Node_List* p) {
3280	Node* n = p->at(`0`);
3281	int n_rpo = bb_idx(n);
3282	for (uint i = `1`; i < p->size(); i++) {
3283	Node* s = p->at(i);
3284	int s_rpo = bb_idx(s);
3285	if (s_rpo > n_rpo) {
3286	n = s;
3287	n_rpo = s_rpo;
3288	}
3289	}
3290	return n;
3291	}
3292
3293	LoadNode::ControlDependency SuperWord::control_dependency(Node_List* p) {
3294	LoadNode::ControlDependency dep = LoadNode::DependsOnlyOnTest;
3295	for (uint i = `0`; i < p->size(); i++) {
3296	Node* n = p->at(i);
3297	assert(n->is_Load(), "only meaningful for loads");
3298	if (!n->depends_only_on_test()) {
3299	dep = LoadNode::Pinned;
3300	}
3301	}
3302	return dep;
3303	}
3304
3305
3306	//----------------------------align_initial_loop_index---------------------------
3307	// Adjust pre-loop limit so that in main loop, a load/store reference
3308	// to align_to_ref will be a position zero in the vector.
3309	// (iv + k) mod vector_align == 0
3310	void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
3311	CountedLoopNode *main_head = lp()->as_CountedLoop();
3312	assert(main_head->is_main_loop(), "");
3313	CountedLoopEndNode* pre_end = get_pre_loop_end(main_head);
3314	assert(pre_end != NULL, "we must have a correct pre-loop");
3315	Node *pre_opaq1 = pre_end->limit();
3316	assert(pre_opaq1->Opcode() == Op_Opaque1, "");
3317	Opaque1Node pre_opaq = (Opaque1Node)pre_opaq1;
3318	Node *lim0 = pre_opaq->in(`1`);
3319
3320	// Where we put new limit calculations
3321	Node *pre_ctrl = pre_end->loopnode()->in(LoopNode::EntryControl);
3322
3323	// Ensure the original loop limit is available from the
3324	// pre-loop Opaque1 node.
3325	Node *orig_limit = pre_opaq->original_loop_limit();
3326	assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
3327
3328	SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
3329	assert(align_to_ref_p.valid(), "sanity");
3330
3331	// Given:
3332	// lim0 == original pre loop limit
3333	// V == v_align (power of 2)
3334	// invar == extra invariant piece of the address expression
3335	// e == offset [ +/- invar ]
3336	//
3337	// When reassociating expressions involving '%' the basic rules are:
3338	// (a - b) % k == 0 => a % k == b % k
3339	// and:
3340	// (a + b) % k == 0 => a % k == (k - b) % k
3341	//
3342	// For stride > 0 && scale > 0,
3343	// Derive the new pre-loop limit "lim" such that the two constraints:
3344	// (1) lim = lim0 + N (where N is some positive integer < V)
3345	// (2) (e + lim) % V == 0
3346	// are true.
3347	//
3348	// Substituting (1) into (2),
3349	// (e + lim0 + N) % V == 0
3350	// solve for N:
3351	// N = (V - (e + lim0)) % V
3352	// substitute back into (1), so that new limit
3353	// lim = lim0 + (V - (e + lim0)) % V
3354	//
3355	// For stride > 0 && scale < 0
3356	// Constraints:
3357	// lim = lim0 + N
3358	// (e - lim) % V == 0
3359	// Solving for lim:
3360	// (e - lim0 - N) % V == 0
3361	// N = (e - lim0) % V
3362	// lim = lim0 + (e - lim0) % V
3363	//
3364	// For stride < 0 && scale > 0
3365	// Constraints:
3366	// lim = lim0 - N
3367	// (e + lim) % V == 0
3368	// Solving for lim:
3369	// (e + lim0 - N) % V == 0
3370	// N = (e + lim0) % V
3371	// lim = lim0 - (e + lim0) % V
3372	//
3373	// For stride < 0 && scale < 0
3374	// Constraints:
3375	// lim = lim0 - N
3376	// (e - lim) % V == 0
3377	// Solving for lim:
3378	// (e - lim0 + N) % V == 0
3379	// N = (V - (e - lim0)) % V
3380	// lim = lim0 - (V - (e - lim0)) % V
3381
3382	int vw = vector_width_in_bytes(align_to_ref);
3383	int stride = iv_stride();
3384	int scale = align_to_ref_p.scale_in_bytes();
3385	int elt_size = align_to_ref_p.memory_size();
3386	int v_align = vw / elt_size;
3387	assert(v_align > `1`, "sanity");
3388	int offset = align_to_ref_p.offset_in_bytes() / elt_size;
3389	Node *offsn = _igvn.intcon(offset);
3390
3391	Node *e = offsn;
3392	if (align_to_ref_p.invar() != NULL) {
3393	// incorporate any extra invariant piece producing (offset +/- invar) >>> log2(elt)
3394	Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3395	Node* invar = align_to_ref_p.invar();
3396	if (_igvn.type(invar)->isa_long()) {
3397	// Computations are done % (vector width/element size) so it's
3398	// safe to simply convert invar to an int and loose the upper 32
3399	// bit half.
3400	invar = new ConvL2INode (invar);
3401	_igvn.register_new_node_with_optimizer(invar);
3402	}
3403	Node* aref = new URShiftINode (invar, log2_elt);
3404	_igvn.register_new_node_with_optimizer(aref);
3405	_phase->set_ctrl(aref, pre_ctrl);
3406	if (align_to_ref_p.negate_invar()) {
3407	e = new SubINode (e, aref);
3408	} else {
3409	e = new AddINode (e, aref);
3410	}
3411	_igvn.register_new_node_with_optimizer(e);
3412	_phase->set_ctrl(e, pre_ctrl);
3413	}
3414	if (vw > ObjectAlignmentInBytes \|\| align_to_ref_p.base()->is_top()) {
3415	// incorporate base e +/- base && Mask >>> log2(elt)
3416	Node* xbase = new CastP2XNode (NULL, align_to_ref_p.adr());
3417	_igvn.register_new_node_with_optimizer(xbase);
3418	#ifdef _LP64
3419	xbase = new ConvL2INode (xbase);
3420	_igvn.register_new_node_with_optimizer(xbase);
3421	#endif
3422	Node* mask = _igvn.intcon(vw-`1`);
3423	Node* masked_xbase = new AndINode (xbase, mask);
3424	_igvn.register_new_node_with_optimizer(masked_xbase);
3425	Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
3426	Node* bref = new URShiftINode (masked_xbase, log2_elt);
3427	_igvn.register_new_node_with_optimizer(bref);
3428	_phase->set_ctrl(bref, pre_ctrl);
3429	e = new AddINode (e, bref);
3430	_igvn.register_new_node_with_optimizer(e);
3431	_phase->set_ctrl(e, pre_ctrl);
3432	}
3433
3434	// compute e +/- lim0
3435	if (scale < `0`) {
3436	e = new SubINode (e, lim0);
3437	} else {
3438	e = new AddINode (e, lim0);
3439	}
3440	_igvn.register_new_node_with_optimizer(e);
3441	_phase->set_ctrl(e, pre_ctrl);
3442
3443	if (stride * scale > `0`) {
3444	// compute V - (e +/- lim0)
3445	Node* va = _igvn.intcon(v_align);
3446	e = new SubINode (va, e);
3447	_igvn.register_new_node_with_optimizer(e);
3448	_phase->set_ctrl(e, pre_ctrl);
3449	}
3450	// compute N = (exp) % V
3451	Node* va_msk = _igvn.intcon(v_align - `1`);
3452	Node* N = new AndINode (e, va_msk);
3453	_igvn.register_new_node_with_optimizer(N);
3454	_phase->set_ctrl(N, pre_ctrl);
3455
3456	// substitute back into (1), so that new limit
3457	// lim = lim0 + N
3458	Node* lim;
3459	if (stride < `0`) {
3460	lim = new SubINode (lim0, N);
3461	} else {
3462	lim = new AddINode (lim0, N);
3463	}
3464	_igvn.register_new_node_with_optimizer(lim);
3465	_phase->set_ctrl(lim, pre_ctrl);
3466	Node* constrained =
3467	(stride > `0`) ? (Node) new* MinINode (lim, orig_limit)
3468	: (Node) new* MaxINode (lim, orig_limit);
3469	_igvn.register_new_node_with_optimizer(constrained);
3470	_phase->set_ctrl(constrained, pre_ctrl);
3471	_igvn.replace_input_of(pre_opaq, `1`, constrained);
3472	}
3473
3474	//----------------------------get_pre_loop_end---------------------------
3475	// Find pre loop end from main loop. Returns null if none.
3476	CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
3477	// The loop cannot be optimized if the graph shape at
3478	// the loop entry is inappropriate.
3479	if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
3480	return NULL;
3481	}
3482
3483	Node* p_f = cl->skip_predicates()->in(`0`)->in(`0`);
3484	if (!p_f->is_IfFalse()) return NULL;
3485	if (!p_f->in(`0`)->is_CountedLoopEnd()) return NULL;
3486	CountedLoopEndNode* pre_end = p_f->in(`0`)->as_CountedLoopEnd();
3487	CountedLoopNode* loop_node = pre_end->loopnode();
3488	if (loop_node == NULL \|\| !loop_node->is_pre_loop()) return NULL;
3489	return pre_end;
3490	}
3491
3492	//------------------------------init---------------------------
3493	void SuperWord::init() {
3494	_dg.init();
3495	_packset.clear();
3496	_disjoint_ptrs.clear();
3497	_block.clear();
3498	_post_block.clear();
3499	_data_entry.clear();
3500	_mem_slice_head.clear();
3501	_mem_slice_tail.clear();
3502	_iteration_first.clear();
3503	_iteration_last.clear();
3504	_node_info.clear();
3505	_align_to_ref = NULL;
3506	_lpt = NULL;
3507	_lp = NULL;
3508	_bb = NULL;
3509	_iv = NULL;
3510	_race_possible = `0`;
3511	_early_return = false;
3512	_num_work_vecs = `0`;
3513	_num_reductions = `0`;
3514	}
3515
3516	//------------------------------restart---------------------------
3517	void SuperWord::restart() {
3518	_dg.init();
3519	_packset.clear();
3520	_disjoint_ptrs.clear();
3521	_block.clear();
3522	_post_block.clear();
3523	_data_entry.clear();
3524	_mem_slice_head.clear();
3525	_mem_slice_tail.clear();
3526	_node_info.clear();
3527	}
3528
3529	//------------------------------print_packset---------------------------
3530	void SuperWord::print_packset() {
3531	#ifndef PRODUCT
3532	tty->print_cr("packset");
3533	for (int i = `0`; i < _packset.length(); i++) {
3534	tty->print_cr("Pack: %d", i);
3535	Node_List* p = _packset.at(i);
3536	print_pack(p);
3537	}
3538	#endif
3539	}
3540
3541	//------------------------------print_pack---------------------------
3542	void SuperWord::print_pack(Node_List* p) {
3543	for (uint i = `0`; i < p->size(); i++) {
3544	print_stmt(p->at(i));
3545	}
3546	}
3547
3548	//------------------------------print_bb---------------------------
3549	void SuperWord::print_bb() {
3550	#ifndef PRODUCT
3551	tty->print_cr("\nBlock");
3552	for (int i = `0`; i < _block.length(); i++) {
3553	Node* n = _block.at(i);
3554	tty->print("%d ", i);
3555	if (n) {
3556	n->dump();
3557	}
3558	}
3559	#endif
3560	}
3561
3562	//------------------------------print_stmt---------------------------
3563	void SuperWord::print_stmt(Node* s) {
3564	#ifndef PRODUCT
3565	tty->print(" align: %d \t", alignment(s));
3566	s->dump();
3567	#endif
3568	}
3569
3570	//------------------------------blank---------------------------
3571	char* SuperWord::blank(uint depth) {
3572	static char blanks[`101`];
3573	assert(depth < `101`, "too deep");
3574	for (uint i = `0`; i < depth; i++) blanks[i] = `' '`;
3575	blanks[depth] = `'\0'`;
3576	return blanks;
3577	}
3578
3579
3580	//==============================SWPointer===========================
3581	#ifndef PRODUCT
3582	int SWPointer::Tracer::_depth = `0`;
3583	#endif
3584	//----------------------------SWPointer------------------------
3585	SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack nstack, bool* analyze_only) :
3586	_mem(mem), _slp(slp), _base(NULL), _adr(NULL),
3587	_scale(`0`), _offset(`0`), _invar(NULL), _negate_invar(false),
3588	_nstack(nstack), _analyze_only(analyze_only),
3589	_stack_idx(`0`)
3590	#ifndef PRODUCT
3591	, _tracer(slp)
3592	#endif
3593	{
3594	NOT_PRODUCT(_tracer.ctor_1(mem);)
3595
3596	Node* adr = mem->in(MemNode::Address);
3597	if (!adr->is_AddP()) {
3598	assert(!valid(), "too complex");
3599	return;
3600	}
3601	// Match AddP(base, AddP(ptr, kiv [+ invariant]), constant)*
3602	Node* base = adr->in(AddPNode::Base);
3603	// The base address should be loop invariant
3604	if (!invariant(base)) {
3605	assert(!valid(), "base address is loop variant");
3606	return;
3607	}
3608	// unsafe references require misaligned vector access support
3609	if (base->is_top() && !Matcher::misaligned_vectors_ok()) {
3610	assert(!valid(), "unsafe access");
3611	return;
3612	}
3613
3614	NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.store_depth();)
3615	NOT_PRODUCT(_tracer.ctor_2(adr);)
3616
3617	int i;
3618	for (i = `0`; i < `3`; i++) {
3619	NOT_PRODUCT(_tracer.ctor_3(adr, i);)
3620
3621	if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) {
3622	assert(!valid(), "too complex");
3623	return;
3624	}
3625	adr = adr->in(AddPNode::Address);
3626	NOT_PRODUCT(_tracer.ctor_4(adr, i);)
3627
3628	if (base == adr \|\| !adr->is_AddP()) {
3629	NOT_PRODUCT(_tracer.ctor_5(adr, base, i);)
3630	break; // stop looking at addp's
3631	}
3632	}
3633	if (!invariant(adr)) {
3634	assert(!valid(), "adr is loop variant");
3635	return;
3636	}
3637
3638	if (!base->is_top() && adr != base) {
3639	assert(!valid(), "adr and base differ");
3640	return;
3641	}
3642
3643	NOT_PRODUCT(if(_slp->is_trace_alignment()) _tracer.restore_depth();)
3644	NOT_PRODUCT(_tracer.ctor_6(mem);)
3645
3646	_base = base;
3647	_adr = adr;
3648	assert(valid(), "Usable");
3649	}
3650
3651	// Following is used to create a temporary object during
3652	// the pattern match of an address expression.
3653	SWPointer::SWPointer(SWPointer* p) :
3654	_mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL),
3655	_scale(`0`), _offset(`0`), _invar(NULL), _negate_invar(false),
3656	_nstack(p->_nstack), _analyze_only(p->_analyze_only),
3657	_stack_idx(p->_stack_idx)
3658	#ifndef PRODUCT
3659	, _tracer(p->_slp)
3660	#endif
3661	{}
3662
3663
3664	bool SWPointer::invariant(Node* n) {
3665	NOT_PRODUCT(Tracer::Depth dd;)
3666	Node *n_c = phase()->get_ctrl(n);
3667	NOT_PRODUCT(_tracer.invariant_1(n, n_c);)
3668	return !lpt()->is_member(phase()->get_loop(n_c));
3669	}
3670	//------------------------scaled_iv_plus_offset--------------------
3671	// Match: kiv + offset*
3672	// where: k is a constant that maybe zero, and
3673	// offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional
3674	bool SWPointer::scaled_iv_plus_offset(Node* n) {
3675	NOT_PRODUCT(Tracer::Depth ddd;)
3676	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);)
3677
3678	if (scaled_iv(n)) {
3679	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);)
3680	return true;
3681	}
3682
3683	if (offset_plus_k(n)) {
3684	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);)
3685	return true;
3686	}
3687
3688	int opc = n->Opcode();
3689	if (opc == Op_AddI) {
3690	if (scaled_iv(n->in(`1`)) && offset_plus_k(n->in(`2`))) {
3691	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);)
3692	return true;
3693	}
3694	if (scaled_iv(n->in(`2`)) && offset_plus_k(n->in(`1`))) {
3695	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);)
3696	return true;
3697	}
3698	} else if (opc == Op_SubI) {
3699	if (scaled_iv(n->in(`1`)) && offset_plus_k(n->in(`2`), true)) {
3700	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);)
3701	return true;
3702	}
3703	if (scaled_iv(n->in(`2`)) && offset_plus_k(n->in(`1`))) {
3704	_scale *= -`1`;
3705	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);)
3706	return true;
3707	}
3708	}
3709
3710	NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);)
3711	return false;
3712	}
3713
3714	//----------------------------scaled_iv------------------------
3715	// Match: kiv where k is a constant that's not zero*
3716	bool SWPointer::scaled_iv(Node* n) {
3717	NOT_PRODUCT(Tracer::Depth ddd;)
3718	NOT_PRODUCT(_tracer.scaled_iv_1(n);)
3719
3720	if (_scale != `0`) { // already found a scale
3721	NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);)
3722	return false;
3723	}
3724
3725	if (n == iv()) {
3726	_scale = `1`;
3727	NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);)
3728	return true;
3729	}
3730	if (_analyze_only && (invariant(n) == false)) {
3731	_nstack->push(n, _stack_idx++);
3732	}
3733
3734	int opc = n->Opcode();
3735	if (opc == Op_MulI) {
3736	if (n->in(`1`) == iv() && n->in(`2`)->is_Con()) {
3737	_scale = n->in(`2`)->get_int();
3738	NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);)
3739	return true;
3740	} else if (n->in(`2`) == iv() && n->in(`1`)->is_Con()) {
3741	_scale = n->in(`1`)->get_int();
3742	NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);)
3743	return true;
3744	}
3745	} else if (opc == Op_LShiftI) {
3746	if (n->in(`1`) == iv() && n->in(`2`)->is_Con()) {
3747	_scale = `1` << n->in(`2`)->get_int();
3748	NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);)
3749	return true;
3750	}
3751	} else if (opc == Op_ConvI2L) {
3752	if (n->in(`1`)->Opcode() == Op_CastII &&
3753	n->in(`1`)->as_CastII()->has_range_check()) {
3754	// Skip range check dependent CastII nodes
3755	n = n->in(`1`);
3756	}
3757	if (scaled_iv_plus_offset(n->in(`1`))) {
3758	NOT_PRODUCT(_tracer.scaled_iv_7(n);)
3759	return true;
3760	}
3761	} else if (opc == Op_LShiftL) {
3762	if (!has_iv() && _invar == NULL) {
3763	// Need to preserve the current _offset value, so
3764	// create a temporary object for this expression subtree.
3765	// Hacky, so should re-engineer the address pattern match.
3766	NOT_PRODUCT(Tracer::Depth dddd;)
3767	SWPointer tmp(this);
3768	NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);)
3769
3770	if (tmp.scaled_iv_plus_offset(n->in(`1`))) {
3771	if (tmp._invar == NULL \|\| _slp->do_vector_loop()) {
3772	int mult = `1` << n->in(`2`)->get_int();
3773	_scale = tmp._scale * mult;
3774	_offset += tmp._offset * mult;
3775	NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, mult);)
3776	return true;
3777	}
3778	}
3779	}
3780	}
3781	NOT_PRODUCT(_tracer.scaled_iv_10(n);)
3782	return false;
3783	}
3784
3785	//----------------------------offset_plus_k------------------------
3786	// Match: offset is (k [+/- invariant])
3787	// where k maybe zero and invariant is optional, but not both.
3788	bool SWPointer::offset_plus_k(Node* n, bool negate) {
3789	NOT_PRODUCT(Tracer::Depth ddd;)
3790	NOT_PRODUCT(_tracer.offset_plus_k_1(n);)
3791
3792	int opc = n->Opcode();
3793	if (opc == Op_ConI) {
3794	_offset += negate ? -(n->get_int()) : n->get_int();
3795	NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);)
3796	return true;
3797	} else if (opc == Op_ConL) {
3798	// Okay if value fits into an int
3799	const TypeLong* t = n->find_long_type();
3800	if (t->higher_equal(TypeLong::INT)) {
3801	jlong loff = n->get_long();
3802	jint off = (jint)loff;
3803	_offset += negate ? -off : loff;
3804	NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);)
3805	return true;
3806	}
3807	NOT_PRODUCT(_tracer.offset_plus_k_4(n);)
3808	return false;
3809	}
3810	if (_invar != NULL) { // already has an invariant
3811	NOT_PRODUCT(_tracer.offset_plus_k_5(n, _invar);)
3812	return false;
3813	}
3814
3815	if (_analyze_only && (invariant(n) == false)) {
3816	_nstack->push(n, _stack_idx++);
3817	}
3818	if (opc == Op_AddI) {
3819	if (n->in(`2`)->is_Con() && invariant(n->in(`1`))) {
3820	_negate_invar = negate;
3821	_invar = n->in(`1`);
3822	_offset += negate ? -(n->in(`2`)->get_int()) : n->in(`2`)->get_int();
3823	NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, _negate_invar, _offset);)
3824	return true;
3825	} else if (n->in(`1`)->is_Con() && invariant(n->in(`2`))) {
3826	_offset += negate ? -(n->in(`1`)->get_int()) : n->in(`1`)->get_int();
3827	_negate_invar = negate;
3828	_invar = n->in(`2`);
3829	NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, _negate_invar, _offset);)
3830	return true;
3831	}
3832	}
3833	if (opc == Op_SubI) {
3834	if (n->in(`2`)->is_Con() && invariant(n->in(`1`))) {
3835	_negate_invar = negate;
3836	_invar = n->in(`1`);
3837	_offset += !negate ? -(n->in(`2`)->get_int()) : n->in(`2`)->get_int();
3838	NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, _negate_invar, _offset);)
3839	return true;
3840	} else if (n->in(`1`)->is_Con() && invariant(n->in(`2`))) {
3841	_offset += negate ? -(n->in(`1`)->get_int()) : n->in(`1`)->get_int();
3842	_negate_invar = !negate;
3843	_invar = n->in(`2`);
3844	NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, _negate_invar, _offset);)
3845	return true;
3846	}
3847	}
3848	if (invariant(n)) {
3849	if (opc == Op_ConvI2L) {
3850	n = n->in(`1`);
3851	if (n->Opcode() == Op_CastII &&
3852	n->as_CastII()->has_range_check()) {
3853	// Skip range check dependent CastII nodes
3854	assert(invariant(n), "sanity");
3855	n = n->in(`1`);
3856	}
3857	}
3858	_negate_invar = negate;
3859	_invar = n;
3860	NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, _negate_invar, _offset);)
3861	return true;
3862	}
3863
3864	NOT_PRODUCT(_tracer.offset_plus_k_11(n);)
3865	return false;
3866	}
3867
3868	//----------------------------print------------------------
3869	void SWPointer::print() {
3870	#ifndef PRODUCT
3871	tty->print("base: %d adr: %d scale: %d offset: %d invar: %c%d\n",
3872	_base != NULL ? _base->_idx : `0`,
3873	_adr != NULL ? _adr->_idx : `0`,
3874	_scale, _offset,
3875	_negate_invar?`'-'`:`'+'`,
3876	_invar != NULL ? _invar->_idx : `0`);
3877	#endif
3878	}
3879
3880	//----------------------------tracing------------------------
3881	#ifndef PRODUCT
3882	void SWPointer::Tracer::print_depth() {
3883	for (int ii = `0`; ii<_depth; ++ii) tty->print(" ");
3884	}
3885
3886	void SWPointer::Tracer::ctor_1 (Node* mem) {
3887	if(_slp->is_trace_alignment()) {
3888	print_depth(); tty->print(" %d SWPointer::SWPointer: start alignment analysis", mem->_idx); mem->dump();
3889	}
3890	}
3891
3892	void SWPointer::Tracer::ctor_2(Node* adr) {
3893	if(_slp->is_trace_alignment()) {
3894	//store_depth();
3895	inc_depth();
3896	print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: ", adr->_idx); adr->dump();
3897	inc_depth();
3898	print_depth(); tty->print(" %d (base) SWPointer::SWPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump();
3899	}
3900	}
3901
3902	void SWPointer::Tracer::ctor_3(Node* adr, int i) {
3903	if(_slp->is_trace_alignment()) {
3904	inc_depth();
3905	Node* offset = adr->in(AddPNode::Offset);
3906	print_depth(); tty->print(" %d (offset) SWPointer::SWPointer: i = %d: ", offset->_idx, i); offset->dump();
3907	}
3908	}
3909
3910	void SWPointer::Tracer::ctor_4(Node* adr, int i) {
3911	if(_slp->is_trace_alignment()) {
3912	inc_depth();
3913	print_depth(); tty->print(" %d (adr) SWPointer::SWPointer: i = %d: ", adr->_idx, i); adr->dump();
3914	}
3915	}
3916
3917	void SWPointer::Tracer::ctor_5(Node* adr, Node* base, int i) {
3918	if(_slp->is_trace_alignment()) {
3919	inc_depth();
3920	if (base == adr) {
3921	print_depth(); tty->print_cr(" \\ %d (adr) == %d (base) SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i);
3922	} else if (!adr->is_AddP()) {
3923	print_depth(); tty->print_cr(" \\ %d (adr) is NOT Addp SWPointer::SWPointer: breaking analysis at i = %d", adr->_idx, i);
3924	}
3925	}
3926	}
3927
3928	void SWPointer::Tracer::ctor_6(Node* mem) {
3929	if(_slp->is_trace_alignment()) {
3930	//restore_depth();
3931	print_depth(); tty->print_cr(" %d (adr) SWPointer::SWPointer: stop analysis", mem->_idx);
3932	}
3933	}
3934
3935	void SWPointer::Tracer::invariant_1(Node n, Node n_c) {
3936	if (_slp->do_vector_loop() && _slp->is_debug() && _slp->_lpt->is_member(_slp->_phase->get_loop(n_c)) != (int)_slp->in_bb(n)) {
3937	int is_member = _slp->_lpt->is_member(_slp->_phase->get_loop(n_c));
3938	int in_bb = _slp->in_bb(n);
3939	print_depth(); tty->print(" \\ "); tty->print_cr(" %d SWPointer::invariant conditions differ: n_c %d", n->_idx, n_c->_idx);
3940	print_depth(); tty->print(" \\ "); tty->print_cr("is_member %d, in_bb %d", is_member, in_bb);
3941	print_depth(); tty->print(" \\ "); n->dump();
3942	print_depth(); tty->print(" \\ "); n_c->dump();
3943	}
3944	}
3945
3946	void SWPointer::Tracer::scaled_iv_plus_offset_1(Node* n) {
3947	if(_slp->is_trace_alignment()) {
3948	print_depth(); tty->print(" %d SWPointer::scaled_iv_plus_offset testing node: ", n->_idx);
3949	n->dump();
3950	}
3951	}
3952
3953	void SWPointer::Tracer::scaled_iv_plus_offset_2(Node* n) {
3954	if(_slp->is_trace_alignment()) {
3955	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
3956	}
3957	}
3958
3959	void SWPointer::Tracer::scaled_iv_plus_offset_3(Node* n) {
3960	if(_slp->is_trace_alignment()) {
3961	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: PASSED", n->_idx);
3962	}
3963	}
3964
3965	void SWPointer::Tracer::scaled_iv_plus_offset_4(Node* n) {
3966	if(_slp->is_trace_alignment()) {
3967	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
3968	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(`1`)->_idx); n->in(`1`)->dump();
3969	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(`2`)->_idx); n->in(`2`)->dump();
3970	}
3971	}
3972
3973	void SWPointer::Tracer::scaled_iv_plus_offset_5(Node* n) {
3974	if(_slp->is_trace_alignment()) {
3975	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx);
3976	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(`2`)->_idx); n->in(`2`)->dump();
3977	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(`1`)->_idx); n->in(`1`)->dump();
3978	}
3979	}
3980
3981	void SWPointer::Tracer::scaled_iv_plus_offset_6(Node* n) {
3982	if(_slp->is_trace_alignment()) {
3983	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
3984	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(`1`)->_idx); n->in(`1`)->dump();
3985	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(`2`)->_idx); n->in(`2`)->dump();
3986	}
3987	}
3988
3989	void SWPointer::Tracer::scaled_iv_plus_offset_7(Node* n) {
3990	if(_slp->is_trace_alignment()) {
3991	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: Op_SubI PASSED", n->_idx);
3992	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(`2`)->_idx); n->in(`2`)->dump();
3993	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(`1`)->_idx); n->in(`1`)->dump();
3994	}
3995	}
3996
3997	void SWPointer::Tracer::scaled_iv_plus_offset_8(Node* n) {
3998	if(_slp->is_trace_alignment()) {
3999	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv_plus_offset: FAILED", n->_idx);
4000	}
4001	}
4002
4003	void SWPointer::Tracer::scaled_iv_1(Node* n) {
4004	if(_slp->is_trace_alignment()) {
4005	print_depth(); tty->print(" %d SWPointer::scaled_iv: testing node: ", n->_idx); n->dump();
4006	}
4007	}
4008
4009	void SWPointer::Tracer::scaled_iv_2(Node* n, int scale) {
4010	if(_slp->is_trace_alignment()) {
4011	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx);
4012	print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: _scale (%d) != 0", scale);
4013	}
4014	}
4015
4016	void SWPointer::Tracer::scaled_iv_3(Node* n, int scale) {
4017	if(_slp->is_trace_alignment()) {
4018	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale);
4019	}
4020	}
4021
4022	void SWPointer::Tracer::scaled_iv_4(Node* n, int scale) {
4023	if(_slp->is_trace_alignment()) {
4024	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
4025	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(`1`)->_idx); n->in(`1`)->dump();
4026	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(`2`)->_idx); n->in(`2`)->dump();
4027	}
4028	}
4029
4030	void SWPointer::Tracer::scaled_iv_5(Node* n, int scale) {
4031	if(_slp->is_trace_alignment()) {
4032	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale);
4033	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(2) is iv: ", n->in(`2`)->_idx); n->in(`2`)->dump();
4034	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(1) is Con: ", n->in(`1`)->_idx); n->in(`1`)->dump();
4035	}
4036	}
4037
4038	void SWPointer::Tracer::scaled_iv_6(Node* n, int scale) {
4039	if(_slp->is_trace_alignment()) {
4040	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale);
4041	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(1) is iv: ", n->in(`1`)->_idx); n->in(`1`)->dump();
4042	print_depth(); tty->print(" \\ %d SWPointer::scaled_iv: in(2) is Con: ", n->in(`2`)->_idx); n->in(`2`)->dump();
4043	}
4044	}
4045
4046	void SWPointer::Tracer::scaled_iv_7(Node* n) {
4047	if(_slp->is_trace_alignment()) {
4048	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx);
4049	print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(`1`)->_idx);
4050	inc_depth(); inc_depth();
4051	print_depth(); n->in(`1`)->dump();
4052	dec_depth(); dec_depth();
4053	}
4054	}
4055
4056	void SWPointer::Tracer::scaled_iv_8(Node* n, SWPointer* tmp) {
4057	if(_slp->is_trace_alignment()) {
4058	print_depth(); tty->print(" %d SWPointer::scaled_iv: Op_LShiftL, creating tmp SWPointer: ", n->_idx); tmp->print();
4059	}
4060	}
4061
4062	void SWPointer::Tracer::scaled_iv_9(Node* n, int scale, int _offset, int mult) {
4063	if(_slp->is_trace_alignment()) {
4064	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, _offset);
4065	print_depth(); tty->print_cr(" \\ SWPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset, in(2) %d used to get mult = %d: _scale = %d, _offset = %d",
4066	n->in(`1`)->_idx, n->in(`2`)->_idx, mult, scale, _offset);
4067	inc_depth(); inc_depth();
4068	print_depth(); n->in(`1`)->dump();
4069	print_depth(); n->in(`2`)->dump();
4070	dec_depth(); dec_depth();
4071	}
4072	}
4073
4074	void SWPointer::Tracer::scaled_iv_10(Node* n) {
4075	if(_slp->is_trace_alignment()) {
4076	print_depth(); tty->print_cr(" %d SWPointer::scaled_iv: FAILED", n->_idx);
4077	}
4078	}
4079
4080	void SWPointer::Tracer::offset_plus_k_1(Node* n) {
4081	if(_slp->is_trace_alignment()) {
4082	print_depth(); tty->print(" %d SWPointer::offset_plus_k: testing node: ", n->_idx); n->dump();
4083	}
4084	}
4085
4086	void SWPointer::Tracer::offset_plus_k_2(Node* n, int _offset) {
4087	if(_slp->is_trace_alignment()) {
4088	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset);
4089	}
4090	}
4091
4092	void SWPointer::Tracer::offset_plus_k_3(Node* n, int _offset) {
4093	if(_slp->is_trace_alignment()) {
4094	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset);
4095	}
4096	}
4097
4098	void SWPointer::Tracer::offset_plus_k_4(Node* n) {
4099	if(_slp->is_trace_alignment()) {
4100	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
4101	print_depth(); tty->print_cr(" \\ " JLONG_FORMAT " SWPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long());
4102	}
4103	}
4104
4105	void SWPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) {
4106	if(_slp->is_trace_alignment()) {
4107	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx);
4108	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: _invar != NULL: ", _invar->_idx); _invar->dump();
4109	}
4110	}
4111
4112	void SWPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) {
4113	if(_slp->is_trace_alignment()) {
4114	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
4115	n->_idx, _negate_invar, _invar->_idx, _offset);
4116	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(`2`)->_idx); n->in(`2`)->dump();
4117	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
4118	}
4119	}
4120
4121	void SWPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) {
4122	if(_slp->is_trace_alignment()) {
4123	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_AddI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
4124	n->_idx, _negate_invar, _invar->_idx, _offset);
4125	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(`1`)->_idx); n->in(`1`)->dump();
4126	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
4127	}
4128	}
4129
4130	void SWPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) {
4131	if(_slp->is_trace_alignment()) {
4132	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI is PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d",
4133	n->_idx, _negate_invar, _invar->_idx, _offset);
4134	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is Con: ", n->in(`2`)->_idx); n->in(`2`)->dump();
4135	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump();
4136	}
4137	}
4138
4139	void SWPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) {
4140	if(_slp->is_trace_alignment()) {
4141	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: Op_SubI PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
4142	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(1) is Con: ", n->in(`1`)->_idx); n->in(`1`)->dump();
4143	print_depth(); tty->print(" \\ %d SWPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump();
4144	}
4145	}
4146
4147	void SWPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) {
4148	if(_slp->is_trace_alignment()) {
4149	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: PASSED, setting _negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset);
4150	print_depth(); tty->print_cr(" \\ %d SWPointer::offset_plus_k: is invariant", n->_idx);
4151	}
4152	}
4153
4154	void SWPointer::Tracer::offset_plus_k_11(Node* n) {
4155	if(_slp->is_trace_alignment()) {
4156	print_depth(); tty->print_cr(" %d SWPointer::offset_plus_k: FAILED", n->_idx);
4157	}
4158	}
4159
4160	#endif
4161	// ========================= OrderedPair =====================
4162
4163	const OrderedPair OrderedPair::initial;
4164
4165	// ========================= SWNodeInfo =====================
4166
4167	const SWNodeInfo SWNodeInfo::initial;
4168
4169
4170	// ============================ DepGraph ===========================
4171
4172	//------------------------------make_node---------------------------
4173	// Make a new dependence graph node for an ideal node.
4174	DepMem* DepGraph::make_node(Node* node) {
4175	DepMem* m = new (_arena) DepMem (node);
4176	if (node != NULL) {
4177	assert(_map.at_grow(node->_idx) == NULL, "one init only");
4178	_map.at_put_grow(node->_idx, m);
4179	}
4180	return m;
4181	}
4182
4183	//------------------------------make_edge---------------------------
4184	// Make a new dependence graph edge from dpred -> dsucc
4185	DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) {
4186	DepEdge* e = new (_arena) DepEdge (dpred, dsucc, dsucc->in_head(), dpred->out_head());
4187	dpred->set_out_head(e);
4188	dsucc->set_in_head(e);
4189	return e;
4190	}
4191
4192	// ========================== DepMem ========================
4193
4194	//------------------------------in_cnt---------------------------
4195	int DepMem::in_cnt() {
4196	int ct = `0`;
4197	for (DepEdge* e = _in_head; e != NULL; e = e->next_in()) ct++;
4198	return ct;
4199	}
4200
4201	//------------------------------out_cnt---------------------------
4202	int DepMem::out_cnt() {
4203	int ct = `0`;
4204	for (DepEdge* e = _out_head; e != NULL; e = e->next_out()) ct++;
4205	return ct;
4206	}
4207
4208	//------------------------------print-----------------------------
4209	void DepMem::print() {
4210	#ifndef PRODUCT
4211	tty->print(" DepNode %d (", _node->_idx);
4212	for (DepEdge* p = _in_head; p != NULL; p = p->next_in()) {
4213	Node* pred = p->pred()->node();
4214	tty->print(" %d", pred != NULL ? pred->_idx : `0`);
4215	}
4216	tty->print(") [");
4217	for (DepEdge* s = _out_head; s != NULL; s = s->next_out()) {
4218	Node* succ = s->succ()->node();
4219	tty->print(" %d", succ != NULL ? succ->_idx : `0`);
4220	}
4221	tty->print_cr(" ]");
4222	#endif
4223	}
4224
4225	// =========================== DepEdge =========================
4226
4227	//------------------------------DepPreds---------------------------
4228	void DepEdge::print() {
4229	#ifndef PRODUCT
4230	tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx);
4231	#endif
4232	}
4233
4234	// =========================== DepPreds =========================
4235	// Iterator over predecessor edges in the dependence graph.
4236
4237	//------------------------------DepPreds---------------------------
4238	DepPreds::DepPreds(Node* n, DepGraph& dg) {
4239	_n = n;
4240	_done = false;
4241	if (_n->is_Store() \|\| _n->is_Load()) {
4242	_next_idx = MemNode::Address;
4243	_end_idx = n->req();
4244	_dep_next = dg.dep(_n)->in_head();
4245	} else if (_n->is_Mem()) {
4246	_next_idx = `0`;
4247	_end_idx = `0`;
4248	_dep_next = dg.dep(_n)->in_head();
4249	} else {
4250	_next_idx = `1`;
4251	_end_idx = _n->req();
4252	_dep_next = NULL;
4253	}
4254	next();
4255	}
4256
4257	//------------------------------next---------------------------
4258	void DepPreds::next() {
4259	if (_dep_next != NULL) {
4260	_current = _dep_next->pred()->node();
4261	_dep_next = _dep_next->next_in();
4262	} else if (_next_idx < _end_idx) {
4263	_current = _n->in(_next_idx++);
4264	} else {
4265	_done = true;
4266	}
4267	}
4268
4269	// =========================== DepSuccs =========================
4270	// Iterator over successor edges in the dependence graph.
4271
4272	//------------------------------DepSuccs---------------------------
4273	DepSuccs::DepSuccs(Node* n, DepGraph& dg) {
4274	_n = n;
4275	_done = false;
4276	if (_n->is_Load()) {
4277	_next_idx = `0`;
4278	_end_idx = _n->outcnt();
4279	_dep_next = dg.dep(_n)->out_head();
4280	} else if (_n->is_Mem() \|\| (_n->is_Phi() && _n->bottom_type() == Type::MEMORY)) {
4281	_next_idx = `0`;
4282	_end_idx = `0`;
4283	_dep_next = dg.dep(_n)->out_head();
4284	} else {
4285	_next_idx = `0`;
4286	_end_idx = _n->outcnt();
4287	_dep_next = NULL;
4288	}
4289	next();
4290	}
4291
4292	//-------------------------------next---------------------------
4293	void DepSuccs::next() {
4294	if (_dep_next != NULL) {
4295	_current = _dep_next->succ()->node();
4296	_dep_next = _dep_next->next_out();
4297	} else if (_next_idx < _end_idx) {
4298	_current = _n->raw_out(_next_idx++);
4299	} else {
4300	_done = true;
4301	}
4302	}
4303
4304	//
4305	// --------------------------------- vectorization/simd -----------------------------------
4306	//
4307	bool SuperWord::same_origin_idx(Node* a, Node* b) const {
4308	return a != NULL && b != NULL && _clone_map.same_idx(a->_idx, b->_idx);
4309	}
4310	bool SuperWord::same_generation(Node* a, Node* b) const {
4311	return a != NULL && b != NULL && _clone_map.same_gen(a->_idx, b->_idx);
4312	}
4313
4314	Node* SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
4315	assert(in_bb(ld), "must be in block");
4316	if (_clone_map.gen(ld->_idx) == _ii_first) {
4317	#ifndef PRODUCT
4318	if (_vector_loop_debug) {
4319	tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
4320	_clone_map.gen(ld->_idx));
4321	}
4322	#endif
4323	return NULL; //we think that any ld in the first gen being vectorizable
4324	}
4325
4326	Node* mem = ld->in(MemNode::Memory);
4327	if (mem->outcnt() <= `1`) {
4328	// we don't want to remove the only edge from mem node to load
4329	#ifndef PRODUCT
4330	if (_vector_loop_debug) {
4331	tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
4332	mem->_idx, ld->_idx);
4333	ld->dump();
4334	mem->dump();
4335	}
4336	#endif
4337	return NULL;
4338	}
4339	if (!in_bb(mem) \|\| same_generation(mem, ld)) {
4340	#ifndef PRODUCT
4341	if (_vector_loop_debug) {
4342	tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
4343	_clone_map.gen(mem->_idx));
4344	}
4345	#endif
4346	return NULL; // does not depend on loop volatile node or depends on the same generation
4347	}
4348
4349	//otherwise first node should depend on mem-phi
4350	Node* first = first_node(ld);
4351	assert(first->is_Load(), "must be Load");
4352	Node* phi = first->as_Load()->in(MemNode::Memory);
4353	if (!phi->is_Phi() \|\| phi->bottom_type() != Type::MEMORY) {
4354	#ifndef PRODUCT
4355	if (_vector_loop_debug) {
4356	tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
4357	ld->dump();
4358	first->dump();
4359	}
4360	#endif
4361	return NULL;
4362	}
4363
4364	Node* tail = `0`;
4365	for (int m = `0`; m < _mem_slice_head.length(); m++) {
4366	if (_mem_slice_head.at(m) == phi) {
4367	tail = _mem_slice_tail.at(m);
4368	}
4369	}
4370	if (tail == `0`) { //test that found phi is in the list _mem_slice_head
4371	#ifndef PRODUCT
4372	if (_vector_loop_debug) {
4373	tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
4374	ld->_idx, phi->_idx);
4375	ld->dump();
4376	phi->dump();
4377	}
4378	#endif
4379	return NULL;
4380	}
4381
4382	// now all conditions are met
4383	return phi;
4384	}
4385
4386	Node* SuperWord::first_node(Node* nd) {
4387	for (int ii = `0`; ii < _iteration_first.length(); ii++) {
4388	Node* nnn = _iteration_first.at(ii);
4389	if (same_origin_idx(nnn, nd)) {
4390	#ifndef PRODUCT
4391	if (_vector_loop_debug) {
4392	tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
4393	nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
4394	}
4395	#endif
4396	return nnn;
4397	}
4398	}
4399
4400	#ifndef PRODUCT
4401	if (_vector_loop_debug) {
4402	tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
4403	nd->_idx, _clone_map.idx(nd->_idx));
4404	}
4405	#endif
4406	return `0`;
4407	}
4408
4409	Node* SuperWord::last_node(Node* nd) {
4410	for (int ii = `0`; ii < _iteration_last.length(); ii++) {
4411	Node* nnn = _iteration_last.at(ii);
4412	if (same_origin_idx(nnn, nd)) {
4413	#ifndef PRODUCT
4414	if (_vector_loop_debug) {
4415	tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
4416	_clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
4417	}
4418	#endif
4419	return nnn;
4420	}
4421	}
4422	return `0`;
4423	}
4424
4425	int SuperWord::mark_generations() {
4426	Node ii_err = NULL, tail_err = NULL;
4427	for (int i = `0`; i < _mem_slice_head.length(); i++) {
4428	Node* phi = _mem_slice_head.at(i);
4429	assert(phi->is_Phi(), "must be phi");
4430
4431	Node* tail = _mem_slice_tail.at(i);
4432	if (_ii_last == -`1`) {
4433	tail_err = tail;
4434	_ii_last = _clone_map.gen(tail->_idx);
4435	}
4436	else if (_ii_last != _clone_map.gen(tail->_idx)) {
4437	#ifndef PRODUCT
4438	if (TraceSuperWord && Verbose) {
4439	tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
4440	tail->dump();
4441	tail_err->dump();
4442	}
4443	#endif
4444	return -`1`;
4445	}
4446
4447	// find first iteration in the loop
4448	for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
4449	Node* ii = phi->fast_out(i);
4450	if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
4451	if (_ii_first == -`1`) {
4452	ii_err = ii;
4453	_ii_first = _clone_map.gen(ii->_idx);
4454	} else if (_ii_first != _clone_map.gen(ii->_idx)) {
4455	#ifndef PRODUCT
4456	if (TraceSuperWord && Verbose) {
4457	tty->print_cr("SuperWord::mark_generations: _ii_first was found before and not equal to one in this node (%d)", _ii_first);
4458	ii->dump();
4459	if (ii_err!= `0`) {
4460	ii_err->dump();
4461	}
4462	}
4463	#endif
4464	return -`1`; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
4465	}
4466	}
4467	}//for (DUIterator_Fast imax,
4468	}//for (int i...
4469
4470	if (_ii_first == -`1` \|\| _ii_last == -`1`) {
4471	if (TraceSuperWord && Verbose) {
4472	tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
4473	}
4474	return -`1`; // something vent wrong
4475	}
4476	// collect nodes in the first and last generations
4477	assert(_iteration_first.length() == `0`, "_iteration_first must be empty");
4478	assert(_iteration_last.length() == `0`, "_iteration_last must be empty");
4479	for (int j = `0`; j < _block.length(); j++) {
4480	Node* n = _block.at(j);
4481	node_idx_t gen = _clone_map.gen(n->_idx);
4482	if ((signed)gen == _ii_first) {
4483	_iteration_first.push(n);
4484	} else if ((signed)gen == _ii_last) {
4485	_iteration_last.push(n);
4486	}
4487	}
4488
4489	// building order of iterations
4490	if (_ii_order.length() == `0` && ii_err != `0`) {
4491	assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
4492	Node* nd = ii_err;
4493	while(_clone_map.gen(nd->_idx) != _ii_last) {
4494	_ii_order.push(_clone_map.gen(nd->_idx));
4495	bool found = false;
4496	for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
4497	Node* use = nd->fast_out(i);
4498	if (same_origin_idx(use, nd) && use->as_Store()->in(MemNode::Memory) == nd) {
4499	found = true;
4500	nd = use;
4501	break;
4502	}
4503	}//for
4504
4505	if (found == false) {
4506	if (TraceSuperWord && Verbose) {
4507	tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
4508	}
4509	_ii_order.clear();
4510	return -`1`;
4511	}
4512	} //while
4513	_ii_order.push(_clone_map.gen(nd->_idx));
4514	}
4515
4516	#ifndef PRODUCT
4517	if (_vector_loop_debug) {
4518	tty->print_cr("SuperWord::mark_generations");
4519	tty->print_cr("First generation (%d) nodes:", _ii_first);
4520	for (int ii = `0`; ii < _iteration_first.length(); ii++) _iteration_first.at(ii)->dump();
4521	tty->print_cr("Last generation (%d) nodes:", _ii_last);
4522	for (int ii = `0`; ii < _iteration_last.length(); ii++) _iteration_last.at(ii)->dump();
4523	tty->print_cr(" ");
4524
4525	tty->print("SuperWord::List of generations: ");
4526	for (int jj = `0`; jj < _ii_order.length(); ++jj) {
4527	tty->print("%d:%d ", jj, _ii_order.at(jj));
4528	}
4529	tty->print_cr(" ");
4530	}
4531	#endif
4532
4533	return _ii_first;
4534	}
4535
4536	bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
4537	assert(gold->is_Add() && fix->is_Add() \|\| gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
4538	assert(same_origin_idx(gold, fix), "should be clones of the same node");
4539	Node* gin1 = gold->in(`1`);
4540	Node* gin2 = gold->in(`2`);
4541	Node* fin1 = fix->in(`1`);
4542	Node* fin2 = fix->in(`2`);
4543	bool swapped = false;
4544
4545	if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
4546	if (same_origin_idx(gin1, fin1) &&
4547	same_origin_idx(gin2, fin2)) {
4548	return true; // nothing to fix
4549	}
4550	if (same_origin_idx(gin1, fin2) &&
4551	same_origin_idx(gin2, fin1)) {
4552	fix->swap_edges(`1`, `2`);
4553	swapped = true;
4554	}
4555	}
4556	// at least one input comes from outside of bb
4557	if (gin1->_idx == fin1->_idx) {
4558	return true; // nothing to fix
4559	}
4560	if (!swapped && (gin1->_idx == fin2->_idx \|\| gin2->_idx == fin1->_idx)) { //swapping is expensive, check condition first
4561	fix->swap_edges(`1`, `2`);
4562	swapped = true;
4563	}
4564
4565	if (swapped) {
4566	#ifndef PRODUCT
4567	if (_vector_loop_debug) {
4568	tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
4569	}
4570	#endif
4571	return true;
4572	}
4573
4574	if (TraceSuperWord && Verbose) {
4575	tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
4576	}
4577
4578	return false;
4579	}
4580
4581	bool SuperWord::pack_parallel() {
4582	#ifndef PRODUCT
4583	if (_vector_loop_debug) {
4584	tty->print_cr("SuperWord::pack_parallel: START");
4585	}
4586	#endif
4587
4588	_packset.clear();
4589
4590	for (int ii = `0`; ii < _iteration_first.length(); ii++) {
4591	Node* nd = _iteration_first.at(ii);
4592	if (in_bb(nd) && (nd->is_Load() \|\| nd->is_Store() \|\| nd->is_Add() \|\| nd->is_Mul())) {
4593	Node_List* pk = new Node_List ();
4594	pk->push(nd);
4595	for (int gen = `1`; gen < _ii_order.length(); ++gen) {
4596	for (int kk = `0`; kk < _block.length(); kk++) {
4597	Node* clone = _block.at(kk);
4598	if (same_origin_idx(clone, nd) &&
4599	_clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
4600	if (nd->is_Add() \|\| nd->is_Mul()) {
4601	fix_commutative_inputs(nd, clone);
4602	}
4603	pk->push(clone);
4604	if (pk->size() == `4`) {
4605	_packset.append(pk);
4606	#ifndef PRODUCT
4607	if (_vector_loop_debug) {
4608	tty->print_cr("SuperWord::pack_parallel: added pack ");
4609	pk->dump();
4610	}
4611	#endif
4612	if (_clone_map.gen(clone->_idx) != _ii_last) {
4613	pk = new Node_List ();
4614	}
4615	}
4616	break;
4617	}
4618	}
4619	}//for
4620	}//if
4621	}//for
4622
4623	#ifndef PRODUCT
4624	if (_vector_loop_debug) {
4625	tty->print_cr("SuperWord::pack_parallel: END");
4626	}
4627	#endif
4628
4629	return true;
4630	}
4631
4632	bool SuperWord::hoist_loads_in_graph() {
4633	GrowableArray<Node*> loads;
4634
4635	#ifndef PRODUCT
4636	if (_vector_loop_debug) {
4637	tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
4638	}
4639	#endif
4640
4641	for (int i = `0`; i < _mem_slice_head.length(); i++) {
4642	Node* n = _mem_slice_head.at(i);
4643	if ( !in_bb(n) \|\| !n->is_Phi() \|\| n->bottom_type() != Type::MEMORY) {
4644	if (TraceSuperWord && Verbose) {
4645	tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
4646	}
4647	continue;
4648	}
4649
4650	#ifndef PRODUCT
4651	if (_vector_loop_debug) {
4652	tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d = _mem_slice_head.at(%d);", n->_idx, i);
4653	}
4654	#endif
4655
4656	for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
4657	Node* ld = n->fast_out(i);
4658	if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
4659	for (int i = `0`; i < _block.length(); i++) {
4660	Node* ld2 = _block.at(i);
4661	if (ld2->is_Load() && same_origin_idx(ld, ld2) &&
4662	!same_generation(ld, ld2)) { // <= do not collect the first generation ld
4663	#ifndef PRODUCT
4664	if (_vector_loop_debug) {
4665	tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
4666	ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
4667	}
4668	#endif
4669	// could not do on-the-fly, since iterator is immutable
4670	loads.push(ld2);
4671	}
4672	}// for
4673	}//if
4674	}//for (DUIterator_Fast imax,
4675	}//for (int i = 0; i
4676
4677	for (int i = `0`; i < loads.length(); i++) {
4678	LoadNode* ld = loads.at(i)->as_Load();
4679	Node* phi = find_phi_for_mem_dep(ld);
4680	if (phi != NULL) {
4681	#ifndef PRODUCT
4682	if (_vector_loop_debug) {
4683	tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
4684	MemNode::Memory, ld->_idx, phi->_idx);
4685	}
4686	#endif
4687	_igvn.replace_input_of(ld, MemNode::Memory, phi);
4688	}
4689	}//for
4690
4691	restart(); // invalidate all basic structures, since we rebuilt the graph
4692
4693	if (TraceSuperWord && Verbose) {
4694	tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
4695	}
4696
4697	return true;
4698	}
4699

Browse the source code of OpenJDK/src/hotspot/share/opto/superword.cpp