Quellcode-Bibliothek

^© Kompilation durch diese Firma

[Weder Korrektheit noch Funktionsfähigkeit der Software werden zugesichert.]

Datei: superword.cpp Sprache: C

/*
* Copyright (c) 2007, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

#include "precompiled.hpp"
#include "compiler/compileLog.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/resourceArea.hpp"
#include "opto/addnode.hpp"
#include "opto/callnode.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/divnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/mulnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/opaquenode.hpp"
#include "opto/superword.hpp"
#include "opto/vectornode.hpp"
#include "opto/movenode.hpp"
#include "utilities/powerOfTwo.hpp"

//
//                  S U P E R W O R D   T R A N S F O R M
//=============================================================================

//------------------------------SuperWord---------------------------
SuperWord::SuperWord(PhaseIdealLoop* phase) :
  _phase(phase),
  _arena(phase->C->comp_arena()),
  _igvn(phase->_igvn),
  _packset(arena(), 8,  0, NULL),         // packs for the current block
  _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb
  _block(arena(), 8,  0, NULL),           // nodes in current block
  _post_block(arena(), 8, 0, NULL),       // nodes common to current block which are marked as post loop vectorizable
  _data_entry(arena(), 8,  0, NULL),      // nodes with all inputs from outside
  _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
  _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
  _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
  _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
  _cmovev_kit(_arena, this),              // map to facilitate CMoveV creation
  _align_to_ref(NULL),                    // memory reference to align vectors to
  _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
  _dg(_arena),                            // dependence graph
  _visited(arena()),                      // visited node set
  _post_visited(arena()),                 // post visited node set
  _n_idx_list(arena(), 8),                // scratch list of (node,index) pairs
  _nlist(arena(), 8, 0, NULL),            // scratch list of nodes
  _stk(arena(), 8, 0, NULL),              // scratch stack of nodes
  _lpt(NULL),                             // loop tree node
  _lp(NULL),                              // CountedLoopNode
  _pre_loop_end(NULL),                    // Pre loop CountedLoopEndNode
  _bb(NULL),                              // basic block
  _iv(NULL),                              // induction var
  _race_possible(false),                  // cases where SDMU is true
  _early_return(true),                    // analysis evaluations routine
  _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
  _do_reserve_copy(DoReserveCopyInSuperWord),
  _num_work_vecs(0),                      // amount of vector work we have
  _num_reductions(0),                     // amount of reduction work we have
  _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
  _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
  _ii_order(arena(), 8, 0, 0)
{
#ifndef PRODUCT
  _vector_loop_debug = 0;
  if (_phase->C->method() != NULL) {
    _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
  }

#endif
}

static const bool _do_vector_loop_experimental = false; // Experimental vectorization which uses data from loop unrolling.

//------------------------------transform_loop---------------------------
bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  assert(UseSuperWord, "should be");
  // SuperWord only works with power of two vector sizes.
  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
  if (vector_width < 2 || !is_power_of_2(vector_width)) {
    return false;
  }

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop(T_INT)) {
    return false; // skip malformed counted loop
  }

  if (cl->is_rce_post_loop() && cl->is_reduction_loop()) {
    // Post loop vectorization doesn't support reductions
    return false;
  }

  // skip any loop that has not been assigned max unroll by analysis
  if (do_optimization) {
    if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) {
      return false;
    }
  }

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
    #ifndef PRODUCT
      if (TraceSuperWord) {
        tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
        tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
        tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
        tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
        lpt->dump_head();
      }
    #endif
    return false;
  }

  // Make sure the are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return false;
  }

  // Skip any loops already optimized by slp
  if (cl->is_vectorized_loop()) {
    return false;
  }

  if (cl->is_unroll_only()) {
    return false;
  }

  if (cl->is_main_loop()) {
    // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
    CountedLoopEndNode* pre_end = find_pre_loop_end(cl);
    if (pre_end == NULL) {
      return false;
    }
    Node* pre_opaq1 = pre_end->limit();
    if (pre_opaq1->Opcode() != Op_Opaque1) {
      return false;
    }
    set_pre_loop_end(pre_end);
  }

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  bool success = true;
  if (do_optimization) {
    assert(_packset.length() == 0, "packset must be empty");
    success = SLP_extract();
    if (PostLoopMultiversioning) {
      if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
        IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next;
        CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
        _phase->has_range_checks(lpt_next);
        // Main loop SLP works well for manually unrolled loops. But post loop
        // vectorization doesn't work for these. To bail out the optimization
        // earlier, we have range check and loop stride conditions below.
        if (cl_next->is_post_loop() && !cl_next->range_checks_present() &&
            cl_next->stride_is_con() && abs(cl_next->stride_con()) == 1) {
          if (!cl_next->is_vectorized_loop()) {
            // Propagate some main loop attributes to its corresponding scalar
            // rce'd post loop for vectorization with vector masks
            cl_next->set_slp_max_unroll(cl->slp_max_unroll());
            cl_next->set_slp_pack_count(cl->slp_pack_count());
          }
        }
      }
    }
  }
  return success;
}

//------------------------------max vector size------------------------------
int SuperWord::max_vector_size(BasicType bt) {
  int max_vector = Matcher::max_vector_size(bt);
  int sw_max_vector_limit = SuperWordMaxVectorSize / type2aelembytes(bt);
  if (max_vector > sw_max_vector_limit) {
    max_vector = sw_max_vector_limit;
  }
  return max_vector;
}

//------------------------------early unrolling analysis------------------------------
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
  bool is_slp = true;
  ResourceMark rm;
  size_t ignored_size = lpt()->_body.size();
  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
  Node_Stack nstack((int)ignored_size);
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  Node *cl_exit = cl->loopexit_or_null();
  int rpo_idx = _post_block.length();

  assert(rpo_idx == 0, "post loop block is empty");

  // First clear the entries
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    ignored_loop_nodes[i] = -1;
  }

  int max_vector = max_vector_size(T_BYTE);

  // Process the loop, some/all of the stack entries will not be in order, ergo
  // need to preprocess the ignored initial state before we process the loop
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node* n = lpt()->_body.at(i);
    if (n == cl->incr() ||
      n->is_reduction() ||
      n->is_AddP() ||
      n->is_Cmp() ||
      n->is_Bool() ||
      n->is_IfTrue() ||
      n->is_CountedLoop() ||
      (n == cl_exit)) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_If()) {
      IfNode *iff = n->as_If();
      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
        if (lpt()->is_loop_exit(iff)) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }
      }
    }

    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          is_slp = false;
          break;
        }
      }
    }

    // This must happen after check of phi/if
    if (n->is_Phi() || n->is_If()) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_LoadStore() || n->is_MergeMem() ||
      (n->is_Proj() && !n->as_Proj()->is_CFG())) {
      is_slp = false;
      break;
    }

    // Ignore nodes with non-primitive type.
    BasicType bt;
    if (n->is_Mem()) {
      bt = n->as_Mem()->memory_type();
    } else {
      bt = n->bottom_type()->basic_type();
    }
    if (is_java_primitive(bt) == false) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_Mem()) {
      MemNode* current = n->as_Mem();
      Node* adr = n->in(MemNode::Address);
      Node* n_ctrl = _phase->get_ctrl(adr);

      // save a queue of post process nodes
      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
        // Process the memory expression
        int stack_idx = 0;
        bool have_side_effects = true;
        if (adr->is_AddP() == false) {
          nstack.push(adr, stack_idx++);
        } else {
          // Mark the components of the memory operation in nstack
          SWPointer p1(current, this, &nstack, true);
          have_side_effects = p1.node_stack()->is_nonempty();
        }

        // Process the pointer stack
        while (have_side_effects) {
          Node* pointer_node = nstack.node();
          for (uint j = 0; j < lpt()->_body.size(); j++) {
            Node* cur_node = lpt()->_body.at(j);
            if (cur_node == pointer_node) {
              ignored_loop_nodes[j] = cur_node->_idx;
              break;
            }
          }
          nstack.pop();
          have_side_effects = nstack.is_nonempty();
        }
      }
    }
  }

  if (is_slp) {
    // In the main loop, SLP works well if parts of the operations in the loop body
    // are not vectorizable and those non-vectorizable parts will be unrolled only.
    // But in post loops with vector masks, we create singleton packs directly from
    // scalars so all operations should be vectorized together. This compares the
    // number of packs in the post loop with the main loop and bail out if the post
    // loop potentially has more packs.
    if (cl->is_rce_post_loop()) {
      for (uint i = 0; i < lpt()->_body.size(); i++) {
        if (ignored_loop_nodes[i] == -1) {
          _post_block.at_put_grow(rpo_idx++, lpt()->_body.at(i));
        }
      }
      if (_post_block.length() > cl->slp_pack_count()) {
        // Clear local_loop_unroll_factor and bail out directly from here
        local_loop_unroll_factor = 0;
        cl->mark_was_slp();
        cl->set_slp_max_unroll(0);
        return;
      }
    }

    // Now we try to find the maximum supported consistent vector which the machine
    // description can use
    bool flag_small_bt = false;
    for (uint i = 0; i < lpt()->_body.size(); i++) {
      if (ignored_loop_nodes[i] != -1) continue;

      BasicType bt;
      Node* n = lpt()->_body.at(i);
      if (n->is_Mem()) {
        bt = n->as_Mem()->memory_type();
      } else {
        bt = n->bottom_type()->basic_type();
      }

      if (is_java_primitive(bt) == false) continue;

      int cur_max_vector = max_vector_size(bt);

      // If a max vector exists which is not larger than _local_loop_unroll_factor
      // stop looking, we already have the max vector to map to.
      if (cur_max_vector < local_loop_unroll_factor) {
        is_slp = false;
        if (TraceSuperWordLoopUnrollAnalysis) {
          tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
        }
        break;
      }

      // Map the maximal common vector except conversion nodes, because we can't get
      // the precise basic type for conversion nodes in the stage of early analysis.
      if (!VectorNode::is_convert_opcode(n->Opcode()) &&
          VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
        if (cur_max_vector < max_vector && !flag_small_bt) {
          max_vector = cur_max_vector;
        } else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
          // Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
          // Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
          // We check for possibility of narrowing by looking through chain operations using subword types.
          if (is_subword_type(bt)) {
            uint start, end;
            VectorNode::vector_operands(n, &start, &end);

            for (uint j = start; j < end; j++) {
              Node* in = n->in(j);
              // Don't propagate through a memory
              if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
                bool same_type = true;
                for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
                  Node *use = in->fast_out(k);
                  if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
                    same_type = false;
                    break;
                  }
                }
                if (same_type) {
                  max_vector = cur_max_vector;
                  flag_small_bt = true;
                  cl->mark_subword_loop();
                }
              }
            }
          }
        }
      }
    }
    if (is_slp) {
      local_loop_unroll_factor = max_vector;
      cl->mark_passed_slp();
    }
    cl->mark_was_slp();
    if (cl->is_main_loop() || cl->is_rce_post_loop()) {
      cl->set_slp_max_unroll(local_loop_unroll_factor);
    }
  }
}

//------------------------------SLP_extract---------------------------
// Extract the superword level parallelism
//
// 1) A reverse post-order of nodes in the block is constructed.  By scanning
//    this list from first to last, all definitions are visited before their uses.
//
// 2) A point-to-point dependence graph is constructed between memory references.
//    This simplifies the upcoming "independence" checker.
//
// 3) The maximum depth in the node graph from the beginning of the block
//    to each node is computed.  This is used to prune the graph search
//    in the independence checker.
//
// 4) For integer types, the necessary bit width is propagated backwards
//    from stores to allow packed operations on byte, char, and short
//    integers.  This reverses the promotion to type "int" that javac
//    did for operations like: char c1,c2,c3;  c1 = c2 + c3.
//
// 5) One of the memory references is picked to be an aligned vector reference.
//    The pre-loop trip count is adjusted to align this reference in the
//    unrolled body.
//
// 6) The initial set of pack pairs is seeded with memory references.
//
// 7) The set of pack pairs is extended by following use->def and def->use links.
//
// 8) The pairs are combined into vector sized packs.
//
// 9) Reorder the memory slices to co-locate members of the memory packs.
//
// 10) Generate ideal vector nodes for the final set of packs and where necessary,
//    inserting scalar promotion, vector creation from multiple scalars, and
//    extraction of scalar values from vectors.
//
bool SuperWord::SLP_extract() {

#ifndef PRODUCT
  if (_do_vector_loop && TraceSuperWord) {
    tty->print("SuperWord::SLP_extract\n");
    tty->print("input loop\n");
    _lpt->dump_head();
    _lpt->dump();
    for (uint i = 0; i < _lpt->_body.size(); i++) {
      _lpt->_body.at(i)->dump();
    }
  }
#endif
  // Ready the block
  if (!construct_bb()) {
    return false; // Exit if no interesting nodes or complex graph.
  }

  // build    _dg, _disjoint_ptrs
  dependence_graph();

  // compute function depth(Node*)
  compute_max_depth();

  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  if (cl->is_main_loop()) {
    if (_do_vector_loop_experimental) {
      if (mark_generations() != -1) {
        hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly

        if (!construct_bb()) {
          return false; // Exit if no interesting nodes or complex graph.
        }
        dependence_graph();
        compute_max_depth();
      }

#ifndef PRODUCT
      if (TraceSuperWord) {
        tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
        _lpt->dump_head();
        for (int j = 0; j < _block.length(); j++) {
          Node* n = _block.at(j);
          int d = depth(n);
          for (int i = 0; i < d; i++) tty->print("%s", " ");
          tty->print("%d :", d);
          n->dump();
        }
      }
#endif
    }

    compute_vector_element_type();

    // Attempt vectorization

    find_adjacent_refs();

    if (align_to_ref() == NULL) {
      return false; // Did not find memory reference to align vectors
    }

    extend_packlist();

    if (_do_vector_loop_experimental) {
      if (_packset.length() == 0) {
#ifndef PRODUCT
        if (TraceSuperWord) {
          tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
        }
#endif
        pack_parallel();
      }
    }

    combine_packs();

    construct_my_pack_map();
    if (UseVectorCmov) {
      merge_packs_to_cmove();
    }

    filter_packs();

    schedule();

    // Record eventual count of vector packs for checks in post loop vectorization
    if (PostLoopMultiversioning) {
      cl->set_slp_pack_count(_packset.length());
    }
  } else {
    assert(cl->is_rce_post_loop(), "Must be an rce'd post loop");
    int saved_mapped_unroll_factor = cl->slp_max_unroll();
    if (saved_mapped_unroll_factor) {
      int vector_mapped_unroll_factor = saved_mapped_unroll_factor;

      // now reset the slp_unroll_factor so that we can check the analysis mapped
      // what the vector loop was mapped to
      cl->set_slp_max_unroll(0);

      // do the analysis on the post loop
      unrolling_analysis(vector_mapped_unroll_factor);

      // if our analyzed loop is a canonical fit, start processing it
      if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
        // now add the vector nodes to packsets
        for (int i = 0; i < _post_block.length(); i++) {
          Node* n = _post_block.at(i);
          Node_List* singleton = new Node_List();
          singleton->push(n);
          _packset.append(singleton);
          set_my_pack(n, singleton);
        }

        // map base types for vector usage
        compute_vector_element_type();
      } else {
        return false;
      }
    } else {
      // for some reason we could not map the slp analysis state of the vectorized loop
      return false;
    }
  }

  return output();
}

//------------------------------find_adjacent_refs---------------------------
// Find the adjacent memory references and create pack pairs for them.
// This is the initial set of packs that will then be extended by
// following use->def and def->use links.  The align positions are
// assigned relative to the reference "align_to_ref"
void SuperWord::find_adjacent_refs() {
  // Get list of memory operations
  Node_List memops;
  for (int i = 0; i < _block.length(); i++) {
    Node* n = _block.at(i);
    if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
        is_java_primitive(n->as_Mem()->memory_type())) {
      int align = memory_alignment(n->as_Mem(), 0);
      if (align != bottom_align) {
        memops.push(n);
      }
    }
  }
  if (TraceSuperWord) {
    tty->print_cr("\nfind_adjacent_refs found %d memops", memops.size());
  }

  Node_List align_to_refs;
  int max_idx;
  int best_iv_adjustment = 0;
  MemNode* best_align_to_mem_ref = NULL;

  while (memops.size() != 0) {
    // Find a memory reference to align to.
    MemNode* mem_ref = find_align_to_ref(memops, max_idx);
    if (mem_ref == NULL) break;
    align_to_refs.push(mem_ref);
    int iv_adjustment = get_iv_adjustment(mem_ref);

    if (best_align_to_mem_ref == NULL) {
      // Set memory reference which is the best from all memory operations
      // to be used for alignment. The pre-loop trip count is modified to align
      // this reference to a vector-aligned address.
      best_align_to_mem_ref = mem_ref;
      best_iv_adjustment = iv_adjustment;
      NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
    }

    SWPointer align_to_ref_p(mem_ref, this, NULL, false);
    // Set alignment relative to "align_to_ref" for all related memory operations.
    for (int i = memops.size() - 1; i >= 0; i--) {
      MemNode* s = memops.at(i)->as_Mem();
      if (isomorphic(s, mem_ref) &&
           (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
        SWPointer p2(s, this, NULL, false);
        if (p2.comparable(align_to_ref_p)) {
          int align = memory_alignment(s, iv_adjustment);
          set_alignment(s, align);
        }
      }
    }

    // Create initial pack pairs of memory operations for which
    // alignment is set and vectors will be aligned.
    bool create_pack = true;
    if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
      if (vectors_should_be_aligned()) {
        int vw = vector_width(mem_ref);
        int vw_best = vector_width(best_align_to_mem_ref);
        if (vw > vw_best) {
          // Do not vectorize a memory access with more elements per vector
          // if unaligned memory access is not allowed because number of
          // iterations in pre-loop will be not enough to align it.
          create_pack = false;
        } else {
          SWPointer p2(best_align_to_mem_ref, this, NULL, false);
          if (!align_to_ref_p.invar_equals(p2)) {
            // Do not vectorize memory accesses with different invariants
            // if unaligned memory accesses are not allowed.
            create_pack = false;
          }
        }
      }
    } else {
      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
        // Can't allow vectorization of unaligned memory accesses with the
        // same type since it could be overlapped accesses to the same array.
        create_pack = false;
      } else {
        // Allow independent (different type) unaligned memory operations
        // if HW supports them.
        if (vectors_should_be_aligned()) {
          create_pack = false;
        } else {
          // Check if packs of the same memory type but
          // with a different alignment were created before.
          for (uint i = 0; i < align_to_refs.size(); i++) {
            MemNode* mr = align_to_refs.at(i)->as_Mem();
            if (mr == mem_ref) {
              // Skip when we are looking at same memory operation.
              continue;
            }
            if (same_velt_type(mr, mem_ref) &&
                memory_alignment(mr, iv_adjustment) != 0)
              create_pack = false;
          }
        }
      }
    }
    if (create_pack) {
      for (uint i = 0; i < memops.size(); i++) {
        Node* s1 = memops.at(i);
        int align = alignment(s1);
        if (align == top_align) continue;
        for (uint j = 0; j < memops.size(); j++) {
          Node* s2 = memops.at(j);
          if (alignment(s2) == top_align) continue;
          if (s1 != s2 && are_adjacent_refs(s1, s2)) {
            if (stmts_can_pack(s1, s2, align)) {
              Node_List* pair = new Node_List();
              pair->push(s1);
              pair->push(s2);
              if (!_do_vector_loop || same_origin_idx(s1, s2)) {
                _packset.append(pair);
              }
            }
          }
        }
      }
    } else { // Don't create unaligned pack
      // First, remove remaining memory ops of the same type from the list.
      for (int i = memops.size() - 1; i >= 0; i--) {
        MemNode* s = memops.at(i)->as_Mem();
        if (same_velt_type(s, mem_ref)) {
          memops.remove(i);
        }
      }

      // Second, remove already constructed packs of the same type.
      for (int i = _packset.length() - 1; i >= 0; i--) {
        Node_List* p = _packset.at(i);
        MemNode* s = p->at(0)->as_Mem();
        if (same_velt_type(s, mem_ref)) {
          remove_pack_at(i);
        }
      }

      // If needed find the best memory reference for loop alignment again.
      if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
        // Put memory ops from remaining packs back on memops list for
        // the best alignment search.
        uint orig_msize = memops.size();
        for (int i = 0; i < _packset.length(); i++) {
          Node_List* p = _packset.at(i);
          MemNode* s = p->at(0)->as_Mem();
          assert(!same_velt_type(s, mem_ref), "sanity");
          memops.push(s);
        }
        best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
        if (best_align_to_mem_ref == NULL) {
          if (TraceSuperWord) {
            tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");
          }
          // best_align_to_mem_ref will be used for adjusting the pre-loop limit in
          // SuperWord::align_initial_loop_index. Find one with the biggest vector size,
          // smallest data size and smallest iv offset from memory ops from remaining packs.
          if (_packset.length() > 0) {
            if (orig_msize == 0) {
              best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
            } else {
              for (uint i = 0; i < orig_msize; i++) {
                memops.remove(0);
              }
              best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
              assert(best_align_to_mem_ref == NULL, "sanity");
              best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
            }
            assert(best_align_to_mem_ref != NULL, "sanity");
          }
          break;
        }
        best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
        NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
        // Restore list.
        while (memops.size() > orig_msize)
          (void)memops.pop();
      }
    } // unaligned memory accesses

    // Remove used mem nodes.
    for (int i = memops.size() - 1; i >= 0; i--) {
      MemNode* m = memops.at(i)->as_Mem();
      if (alignment(m) != top_align) {
        memops.remove(i);
      }
    }

  } // while (memops.size() != 0
  set_align_to_ref(best_align_to_mem_ref);

  if (TraceSuperWord) {
    tty->print_cr("\nAfter find_adjacent_refs");
    print_packset();
  }
}

#ifndef PRODUCT
void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
  if (is_trace_adjacent()) {
    tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
       best_align_to_mem_ref->_idx, best_iv_adjustment);
       best_align_to_mem_ref->dump();
  }
}
#endif

//------------------------------find_align_to_ref---------------------------
// Find a memory reference to align the loop induction variable to.
// Looks first at stores then at loads, looking for a memory reference
// with the largest number of references similar to it.
MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
  GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);

  // Count number of comparable memory ops
  for (uint i = 0; i < memops.size(); i++) {
    MemNode* s1 = memops.at(i)->as_Mem();
    SWPointer p1(s1, this, NULL, false);
    // Only discard unalignable memory references if vector memory references
    // should be aligned on this platform.
    if (vectors_should_be_aligned() && !ref_is_alignable(p1)) {
      *cmp_ct.adr_at(i) = 0;
      continue;
    }
    for (uint j = i+1; j < memops.size(); j++) {
      MemNode* s2 = memops.at(j)->as_Mem();
      if (isomorphic(s1, s2)) {
        SWPointer p2(s2, this, NULL, false);
        if (p1.comparable(p2)) {
          (*cmp_ct.adr_at(i))++;
          (*cmp_ct.adr_at(j))++;
        }
      }
    }
  }

  // Find Store (or Load) with the greatest number of "comparable" references,
  // biggest vector size, smallest data size and smallest iv offset.
  int max_ct        = 0;
  int max_vw        = 0;
  int max_idx       = -1;
  int min_size      = max_jint;
  int min_iv_offset = max_jint;
  for (uint j = 0; j < memops.size(); j++) {
    MemNode* s = memops.at(j)->as_Mem();
    if (s->is_Store()) {
      int vw = vector_width_in_bytes(s);
      assert(vw > 1, "sanity");
      SWPointer p(s, this, NULL, false);
      if ( cmp_ct.at(j) >  max_ct ||
          (cmp_ct.at(j) == max_ct &&
            ( vw >  max_vw ||
             (vw == max_vw &&
              ( data_size(s) <  min_size ||
               (data_size(s) == min_size &&
                p.offset_in_bytes() < min_iv_offset)))))) {
        max_ct = cmp_ct.at(j);
        max_vw = vw;
        max_idx = j;
        min_size = data_size(s);
        min_iv_offset = p.offset_in_bytes();
      }
    }
  }
  // If no stores, look at loads
  if (max_ct == 0) {
    for (uint j = 0; j < memops.size(); j++) {
      MemNode* s = memops.at(j)->as_Mem();
      if (s->is_Load()) {
        int vw = vector_width_in_bytes(s);
        assert(vw > 1, "sanity");
        SWPointer p(s, this, NULL, false);
        if ( cmp_ct.at(j) >  max_ct ||
            (cmp_ct.at(j) == max_ct &&
              ( vw >  max_vw ||
               (vw == max_vw &&
                ( data_size(s) <  min_size ||
                 (data_size(s) == min_size &&
                  p.offset_in_bytes() < min_iv_offset)))))) {
          max_ct = cmp_ct.at(j);
          max_vw = vw;
          max_idx = j;
          min_size = data_size(s);
          min_iv_offset = p.offset_in_bytes();
        }
      }
    }
  }

#ifdef ASSERT
  if (TraceSuperWord && Verbose) {
    tty->print_cr("\nVector memops after find_align_to_ref");
    for (uint i = 0; i < memops.size(); i++) {
      MemNode* s = memops.at(i)->as_Mem();
      s->dump();
    }
  }
#endif

  idx = max_idx;
  if (max_ct > 0) {
#ifdef ASSERT
    if (TraceSuperWord) {
      tty->print("\nVector align to node: ");
      memops.at(max_idx)->as_Mem()->dump();
    }
#endif
    return memops.at(max_idx)->as_Mem();
  }
  return NULL;
}

//------------------span_works_for_memory_size-----------------------------
static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
  bool span_matches_memory = false;
  if ((mem_size == type2aelembytes(T_BYTE) || mem_size == type2aelembytes(T_SHORT))
    && ABS(span) == type2aelembytes(T_INT)) {
    // There is a mismatch on span size compared to memory.
    for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) {
      Node* use = mem->fast_out(j);
      if (!VectorNode::is_type_transition_to_int(use)) {
        return false;
      }
    }
    // If all uses transition to integer, it means that we can successfully align even on mismatch.
    return true;
  }
  else {
    span_matches_memory = ABS(span) == mem_size;
  }
  return span_matches_memory && (ABS(offset) % mem_size) == 0;
}

//------------------------------ref_is_alignable---------------------------
// Can the preloop align the reference to position zero in the vector?
bool SuperWord::ref_is_alignable(SWPointer& p) {
  if (!p.has_iv()) {
    return true;   // no induction variable
  }
  CountedLoopEndNode* pre_end = pre_loop_end();
  assert(pre_end->stride_is_con(), "pre loop stride is constant");
  int preloop_stride = pre_end->stride_con();

  int span = preloop_stride * p.scale_in_bytes();
  int mem_size = p.memory_size();
  int offset   = p.offset_in_bytes();
  // Stride one accesses are alignable if offset is aligned to memory operation size.
  // Offset can be unaligned when UseUnalignedAccesses is used.
  if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
    return true;
  }
  // If the initial offset from start of the object is computable,
  // check if the pre-loop can align the final offset accordingly.
  //
  // In other words: Can we find an i such that the offset
  // after i pre-loop iterations is aligned to vw?
  //   (init_offset + pre_loop) % vw == 0              (1)
  // where
  //   pre_loop = i * span
  // is the number of bytes added to the offset by i pre-loop iterations.
  //
  // For this to hold we need pre_loop to increase init_offset by
  //   pre_loop = vw - (init_offset % vw)
  //
  // This is only possible if pre_loop is divisible by span because each
  // pre-loop iteration increases the initial offset by 'span' bytes:
  //   (vw - (init_offset % vw)) % span == 0
  //
  int vw = vector_width_in_bytes(p.mem());
  assert(vw > 1, "sanity");
  Node* init_nd = pre_end->init_trip();
  if (init_nd->is_Con() && p.invar() == NULL) {
    int init = init_nd->bottom_type()->is_int()->get_con();
    int init_offset = init * p.scale_in_bytes() + offset;
    if (init_offset < 0) { // negative offset from object start?
      return false;        // may happen in dead loop
    }
    if (vw % span == 0) {
      // If vm is a multiple of span, we use formula (1).
      if (span > 0) {
        return (vw - (init_offset % vw)) % span == 0;
      } else {
        assert(span < 0, "nonzero stride * scale");
        return (init_offset % vw) % -span == 0;
      }
    } else if (span % vw == 0) {
      // If span is a multiple of vw, we can simplify formula (1) to:
      //   (init_offset + i * span) % vw == 0
      //     =>
      //   (init_offset % vw) + ((i * span) % vw) == 0
      //     =>
      //   init_offset % vw == 0
      //
      // Because we add a multiple of vw to the initial offset, the final
      // offset is a multiple of vw if and only if init_offset is a multiple.
      //
      return (init_offset % vw) == 0;
    }
  }
  return false;
}
//---------------------------get_vw_bytes_special------------------------
int SuperWord::get_vw_bytes_special(MemNode* s) {
  // Get the vector width in bytes.
  int vw = vector_width_in_bytes(s);

  // Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
  BasicType btype = velt_basic_type(s);
  if (type2aelembytes(btype) == 2) {
    bool should_combine_adjacent = true;
    for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
      Node* user = s->fast_out(i);
      if (!VectorNode::is_muladds2i(user)) {
        should_combine_adjacent = false;
      }
    }
    if (should_combine_adjacent) {
      vw = MIN2(max_vector_size(btype)*type2aelembytes(btype), vw * 2);
    }
  }

  // Check for special case where there is a type conversion between different data size.
  int vectsize = max_vector_size_in_def_use_chain(s);
  if (vectsize < max_vector_size(btype)) {
    vw = MIN2(vectsize * type2aelembytes(btype), vw);
  }

  return vw;
}

//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
  SWPointer align_to_ref_p(mem_ref, this, NULL, false);
  int offset = align_to_ref_p.offset_in_bytes();
  int scale  = align_to_ref_p.scale_in_bytes();
  int elt_size = align_to_ref_p.memory_size();
  int vw       = get_vw_bytes_special(mem_ref);
  assert(vw > 1, "sanity");
  int iv_adjustment;
  if (scale != 0) {
    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
    // At least one iteration is executed in pre-loop by default. As result
    // several iterations are needed to align memory operations in main-loop even
    // if offset is 0.
    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
    // iv_adjustment_in_bytes must be a multiple of elt_size if vector memory
    // references should be aligned on this platform.
    assert((ABS(iv_adjustment_in_bytes) % elt_size) == 0 || !vectors_should_be_aligned(),
           "(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size);
    iv_adjustment = iv_adjustment_in_bytes/elt_size;
  } else {
    // This memory op is not dependent on iv (scale == 0)
    iv_adjustment = 0;
  }

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print("SuperWord::get_iv_adjustment: n = %d, noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d: ",
      mem_ref->_idx, offset, iv_adjustment, elt_size, scale, iv_stride(), vw);
    mem_ref->dump();
  }
#endif
  return iv_adjustment;
}

//---------------------------dependence_graph---------------------------
// Construct dependency graph.
// Add dependence edges to load/store nodes for memory dependence
//    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
void SuperWord::dependence_graph() {
  CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
  // First, assign a dependence node to each memory node
  for (int i = 0; i < _block.length(); i++ ) {
    Node *n = _block.at(i);
    if (n->is_Mem() || (n->is_Phi() && n->bottom_type() == Type::MEMORY)) {
      _dg.make_node(n);
    }
  }

  // For each memory slice, create the dependences
  for (int i = 0; i < _mem_slice_head.length(); i++) {
    Node* n      = _mem_slice_head.at(i);
    Node* n_tail = _mem_slice_tail.at(i);

    // Get slice in predecessor order (last is first)
    if (cl->is_main_loop()) {
      mem_slice_preds(n_tail, n, _nlist);
    }

#ifndef PRODUCT
    if(TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
      for (int j = _nlist.length() - 1; j >= 0 ; j--) {
        _nlist.at(j)->dump();
      }
    }
#endif
    // Make the slice dependent on the root
    DepMem* slice = _dg.dep(n);
    _dg.make_edge(_dg.root(), slice);

    // Create a sink for the slice
    DepMem* slice_sink = _dg.make_node(NULL);
    _dg.make_edge(slice_sink, _dg.tail());

    // Now visit each pair of memory ops, creating the edges
    for (int j = _nlist.length() - 1; j >= 0 ; j--) {
      Node* s1 = _nlist.at(j);

      // If no dependency yet, use slice
      if (_dg.dep(s1)->in_cnt() == 0) {
        _dg.make_edge(slice, s1);
      }
      SWPointer p1(s1->as_Mem(), this, NULL, false);
      bool sink_dependent = true;
      for (int k = j - 1; k >= 0; k--) {
        Node* s2 = _nlist.at(k);
        if (s1->is_Load() && s2->is_Load())
          continue;
        SWPointer p2(s2->as_Mem(), this, NULL, false);

        int cmp = p1.cmp(p2);
        if (SuperWordRTDepCheck &&
            p1.base() != p2.base() && p1.valid() && p2.valid()) {
          // Trace disjoint pointers
          OrderedPair pp(p1.base(), p2.base());
          _disjoint_ptrs.append_if_missing(pp);
        }
        if (!SWPointer::not_equal(cmp)) {
          // Possibly same address
          _dg.make_edge(s1, s2);
          sink_dependent = false;
        }
      }
      if (sink_dependent) {
        _dg.make_edge(s1, slice_sink);
      }
    }

    if (TraceSuperWord) {
      tty->print_cr("\nDependence graph for slice: %d", n->_idx);
      for (int q = 0; q < _nlist.length(); q++) {
        _dg.print(_nlist.at(q));
      }
      tty->cr();
    }

    _nlist.clear();
  }

  if (TraceSuperWord) {
    tty->print_cr("\ndisjoint_ptrs: %s", _disjoint_ptrs.length() > 0 ? "" : "NONE");
    for (int r = 0; r < _disjoint_ptrs.length(); r++) {
      _disjoint_ptrs.at(r).print();
      tty->cr();
    }
    tty->cr();
  }

}

//---------------------------mem_slice_preds---------------------------
// Return a memory slice (node list) in predecessor order starting at "start"
void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
  assert(preds.length() == 0, "start empty");
  Node* n = start;
  Node* prev = NULL;
  while (true) {
    NOT_PRODUCT( if(is_trace_mem_slice()) tty->print_cr("SuperWord::mem_slice_preds: n %d", n->_idx);)
    assert(in_bb(n), "must be in block");
    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
      Node* out = n->fast_out(i);
      if (out->is_Load()) {
        if (in_bb(out)) {
          preds.push(out);
          if (TraceSuperWord && Verbose) {
            tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", out->_idx);
          }
        }
      } else {
        // FIXME
        if (out->is_MergeMem() && !in_bb(out)) {
          // Either unrolling is causing a memory edge not to disappear,
          // or need to run igvn.optimize() again before SLP
        } else if (out->is_Phi() && out->bottom_type() == Type::MEMORY && !in_bb(out)) {
          // Ditto.  Not sure what else to check further.
        } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
          // StoreCM has an input edge used as a precedence edge.
          // Maybe an issue when oop stores are vectorized.
        } else {
          assert(out == prev || prev == NULL, "no branches off of store slice");
        }
      }//else
    }//for
    if (n == stop) break;
    preds.push(n);
    if (TraceSuperWord && Verbose) {
      tty->print_cr("SuperWord::mem_slice_preds: added pred(%d)", n->_idx);
    }
    prev = n;
    assert(n->is_Mem(), "unexpected node %s", n->Name());
    n = n->in(MemNode::Memory);
  }
}

//------------------------------stmts_can_pack---------------------------
// Can s1 and s2 be in a pack with s1 immediately preceding s2 and
// s1 aligned at "align"
bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {

  // Do not use superword for non-primitives
  BasicType bt1 = velt_basic_type(s1);
  BasicType bt2 = velt_basic_type(s2);
  if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
    return false;
  BasicType longer_bt = longer_type_for_conversion(s1);
  if (max_vector_size(bt1) < 2 ||
      (longer_bt != T_ILLEGAL && max_vector_size(longer_bt) < 2)) {
    return false; // No vectors for this type
  }

  if (isomorphic(s1, s2)) {
    if ((independent(s1, s2) && have_similar_inputs(s1, s2)) || reduction(s1, s2)) {
      if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
        if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
          int s1_align = alignment(s1);
          int s2_align = alignment(s2);
          if (s1_align == top_align || s1_align == align) {
            if (s2_align == top_align || s2_align == align + data_size(s1)) {
              return true;
            }
          }
        }
      }
    }
  }
  return false;
}

//------------------------------exists_at---------------------------
// Does s exist in a pack at position pos?
bool SuperWord::exists_at(Node* s, uint pos) {
  for (int i = 0; i < _packset.length(); i++) {
    Node_List* p = _packset.at(i);
    if (p->at(pos) == s) {
      return true;
    }
  }
  return false;
}

//------------------------------are_adjacent_refs---------------------------
// Is s1 immediately before s2 in memory?
bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
  if (!s1->is_Mem() || !s2->is_Mem()) return false;
  if (!in_bb(s1)    || !in_bb(s2))    return false;

  // Do not use superword for non-primitives
  if (!is_java_primitive(s1->as_Mem()->memory_type()) ||
      !is_java_primitive(s2->as_Mem()->memory_type())) {
    return false;
  }

  // FIXME - co_locate_pack fails on Stores in different mem-slices, so
  // only pack memops that are in the same alias set until that's fixed.
  if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
      _phase->C->get_alias_index(s2->as_Mem()->adr_type()))
    return false;
  SWPointer p1(s1->as_Mem(), this, NULL, false);
  SWPointer p2(s2->as_Mem(), this, NULL, false);
  if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
  int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
  return diff == data_size(s1);
}

//------------------------------isomorphic---------------------------
// Are s1 and s2 similar?
bool SuperWord::isomorphic(Node* s1, Node* s2) {
  if (s1->Opcode() != s2->Opcode()) return false;
  if (s1->req() != s2->req()) return false;
  if (!same_velt_type(s1, s2)) return false;
  Node* s1_ctrl = s1->in(0);
  Node* s2_ctrl = s2->in(0);
  // If the control nodes are equivalent, no further checks are required to test for isomorphism.
  if (s1_ctrl == s2_ctrl) {
    return true;
  } else {
    bool s1_ctrl_inv = ((s1_ctrl == NULL) ? true : lpt()->is_invariant(s1_ctrl));
    bool s2_ctrl_inv = ((s2_ctrl == NULL) ? true : lpt()->is_invariant(s2_ctrl));
    // If the control nodes are not invariant for the loop, fail isomorphism test.
    if (!s1_ctrl_inv || !s2_ctrl_inv) {
      return false;
    }
    if(s1_ctrl != NULL && s2_ctrl != NULL) {
      if (s1_ctrl->is_Proj()) {
        s1_ctrl = s1_ctrl->in(0);
        assert(lpt()->is_invariant(s1_ctrl), "must be invariant");
      }
      if (s2_ctrl->is_Proj()) {
        s2_ctrl = s2_ctrl->in(0);
        assert(lpt()->is_invariant(s2_ctrl), "must be invariant");
      }
      if (!s1_ctrl->is_RangeCheck() || !s2_ctrl->is_RangeCheck()) {
        return false;
      }
    }
    // Control nodes are invariant. However, we have no way of checking whether they resolve
    // in an equivalent manner. But, we know that invariant range checks are guaranteed to
    // throw before the loop (if they would have thrown). Thus, the loop would not have been reached.
    // Therefore, if the control nodes for both are range checks, we accept them to be isomorphic.
    for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
      Node* t1 = s1->fast_out(i);
      for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
        Node* t2 = s2->fast_out(j);
        if (VectorNode::is_muladds2i(t1) && VectorNode::is_muladds2i(t2)) {
          return true;
        }
      }
    }
  }
  return false;
}

//------------------------------independent---------------------------
// Is there no data path from s1 to s2 or s2 to s1?
bool SuperWord::independent(Node* s1, Node* s2) {
  //  assert(s1->Opcode() == s2->Opcode(), "check isomorphic first");
  int d1 = depth(s1);
  int d2 = depth(s2);
  if (d1 == d2) return s1 != s2;
  Node* deep    = d1 > d2 ? s1 : s2;
  Node* shallow = d1 > d2 ? s2 : s1;

  visited_clear();

  return independent_path(shallow, deep);
}

//--------------------------have_similar_inputs-----------------------
// For a node pair (s1, s2) which is isomorphic and independent,
// do s1 and s2 have similar input edges?
bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
  // assert(isomorphic(s1, s2) == true, "check isomorphic");
  // assert(independent(s1, s2) == true, "check independent");
  if (s1->req() > 1 && !s1->is_Store() && !s1->is_Load()) {
    for (uint i = 1; i < s1->req(); i++) {
      Node* s1_in = s1->in(i);
      Node* s2_in = s2->in(i);
      if (s1_in->is_Phi() && s2_in->is_Add() && s2_in->in(1) == s1_in) {
        // Special handling for expressions with loop iv, like "b[i] = a[i] * i".
        // In this case, one node has an input from the tripcount iv and another
        // node has an input from iv plus an offset.
        if (!s1_in->as_Phi()->is_tripcount(T_INT)) return false;
      } else {
        if (s1_in->Opcode() != s2_in->Opcode()) return false;
      }
    }
  }
  return true;
}

//------------------------------reduction---------------------------
// Is there a data path between s1 and s2 and the nodes reductions?
bool SuperWord::reduction(Node* s1, Node* s2) {
  bool retValue = false;
  int d1 = depth(s1);
  int d2 = depth(s2);
  if (d2 > d1) {
    if (s1->is_reduction() && s2->is_reduction()) {
      // This is an ordered set, so s1 should define s2
      for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
        Node* t1 = s1->fast_out(i);
        if (t1 == s2) {
          // both nodes are reductions and connected
          retValue = true;
        }
      }
    }
  }

  return retValue;
}

//------------------------------independent_path------------------------------
// Helper for independent
bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
  if (dp >= 1000) return false; // stop deep recursion
  visited_set(deep);
  int shal_depth = depth(shallow);
  assert(shal_depth <= depth(deep), "must be");
  for (DepPreds preds(deep, _dg); !preds.done(); preds.next()) {
    Node* pred = preds.current();
    if (in_bb(pred) && !visited_test(pred)) {
      if (shallow == pred) {
        return false;
      }
      if (shal_depth < depth(pred) && !independent_path(shallow, pred, dp+1)) {
        return false;
      }
    }
  }
  return true;
}

//------------------------------set_alignment---------------------------
void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
  set_alignment(s1, align);
  if (align == top_align || align == bottom_align) {
    set_alignment(s2, align);
  } else {
    set_alignment(s2, align + data_size(s1));
  }
}

//------------------------------data_size---------------------------
int SuperWord::data_size(Node* s) {
  Node* use = NULL; //test if the node is a candidate for CMoveV optimization, then return the size of CMov
  if (UseVectorCmov) {
    use = _cmovev_kit.is_Bool_candidate(s);
    if (use != NULL) {
      return data_size(use);
    }
    use = _cmovev_kit.is_Cmp_candidate(s);
    if (use != NULL) {
      return data_size(use);
    }
  }

  int bsize = type2aelembytes(velt_basic_type(s));
  assert(bsize != 0, "valid size");
  return bsize;
}

//------------------------------extend_packlist---------------------------
// Extend packset by following use->def and def->use links from pack members.
void SuperWord::extend_packlist() {
  bool changed;
  do {
    packset_sort(_packset.length());
    changed = false;
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p = _packset.at(i);
      changed |= follow_use_defs(p);
      changed |= follow_def_uses(p);
    }
  } while (changed);

  if (_race_possible) {
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p = _packset.at(i);
      order_def_uses(p);
    }
  }

  if (TraceSuperWord) {
    tty->print_cr("\nAfter extend_packlist");
    print_packset();
  }
}

//------------------------------adjust_alignment_for_type_conversion---------------------------------
// Adjust the target alignment if conversion between different data size exists in def-use nodes.
int SuperWord::adjust_alignment_for_type_conversion(Node* s, Node* t, int align) {
  // Do not use superword for non-primitives
  BasicType bt1 = velt_basic_type(s);
  BasicType bt2 = velt_basic_type(t);
  if (!is_java_primitive(bt1) || !is_java_primitive(bt2)) {
    return align;
  }
  if (longer_type_for_conversion(s) != T_ILLEGAL ||
      longer_type_for_conversion(t) != T_ILLEGAL) {
    align = align / data_size(s) * data_size(t);
  }
  return align;
}

//------------------------------follow_use_defs---------------------------
// Extend the packset by visiting operand definitions of nodes in pack p
bool SuperWord::follow_use_defs(Node_List* p) {
  assert(p->size() == 2, "just checking");
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  if (s1->is_Load()) return false;

  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, alignment(s1));)
  bool changed = false;
  int start = s1->is_Store() ? MemNode::ValueIn   : 1;
  int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
  for (int j = start; j < end; j++) {
    int align = alignment(s1);
    Node* t1 = s1->in(j);
    Node* t2 = s2->in(j);
    if (!in_bb(t1) || !in_bb(t2))
      continue;
    align = adjust_alignment_for_type_conversion(s1, t1, align);
    if (stmts_can_pack(t1, t2, align)) {
      if (est_savings(t1, t2) >= 0) {
        Node_List* pair = new Node_List();
        pair->push(t1);
        pair->push(t2);
        _packset.append(pair);
        NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
        set_alignment(t1, t2, align);
        changed = true;
      }
    }
  }
  return changed;
}

//------------------------------follow_def_uses---------------------------
// Extend the packset by visiting uses of nodes in pack p
bool SuperWord::follow_def_uses(Node_List* p) {
  bool changed = false;
  Node* s1 = p->at(0);
  Node* s2 = p->at(1);
  assert(p->size() == 2, "just checking");
  assert(s1->req() == s2->req(), "just checking");
  assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

  if (s1->is_Store()) return false;

  int align = alignment(s1);
  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
  int savings = -1;
  int num_s1_uses = 0;
  Node* u1 = NULL;
  Node* u2 = NULL;
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);
    num_s1_uses++;
    if (!in_bb(t1)) continue;
    for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
      Node* t2 = s2->fast_out(j);
      if (!in_bb(t2)) continue;
      if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
      if (!opnd_positions_match(s1, t1, s2, t2))
        continue;
      int adjusted_align = alignment(s1);
      adjusted_align = adjust_alignment_for_type_conversion(s1, t1, adjusted_align);
      if (stmts_can_pack(t1, t2, adjusted_align)) {
        int my_savings = est_savings(t1, t2);
        if (my_savings > savings) {
          savings = my_savings;
          u1 = t1;
          u2 = t2;
          align = adjusted_align;
        }
      }
    }
  }
  if (num_s1_uses > 1) {
    _race_possible = true;
  }
  if (savings >= 0) {
    Node_List* pair = new Node_List();
    pair->push(u1);
    pair->push(u2);
    _packset.append(pair);
    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
    set_alignment(u1, u2, align);
    changed = true;
  }
  return changed;
}

//------------------------------order_def_uses---------------------------
// For extended packsets, ordinally arrange uses packset by major component
void SuperWord::order_def_uses(Node_List* p) {
  Node* s1 = p->at(0);

  if (s1->is_Store()) return;

  // reductions are always managed beforehand
  if (s1->is_reduction()) return;

  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);

    // Only allow operand swap on commuting operations
    if (!t1->is_Add() && !t1->is_Mul() && !VectorNode::is_muladds2i(t1)) {
      break;
    }

    // Now find t1's packset
    Node_List* p2 = NULL;
    for (int j = 0; j < _packset.length(); j++) {
      p2 = _packset.at(j);
      Node* first = p2->at(0);
      if (t1 == first) {
        break;
      }
      p2 = NULL;
    }
    // Arrange all sub components by the major component
    if (p2 != NULL) {
      for (uint j = 1; j < p->size(); j++) {
        Node* d1 = p->at(j);
        Node* u1 = p2->at(j);
        opnd_positions_match(s1, t1, d1, u1);
      }
    }
  }
}

//---------------------------opnd_positions_match-------------------------
// Is the use of d1 in u1 at the same operand position as d2 in u2?
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
  // check reductions to see if they are marshalled to represent the reduction
  // operator in a specified opnd
  if (u1->is_reduction() && u2->is_reduction()) {
    // ensure reductions have phis and reduction definitions feeding the 1st operand
    Node* first = u1->in(2);
    if (first->is_Phi() || first->is_reduction()) {
      u1->swap_edges(1, 2);
    }
    // ensure reductions have phis and reduction definitions feeding the 1st operand
    first = u2->in(2);
    if (first->is_Phi() || first->is_reduction()) {
      u2->swap_edges(1, 2);
    }
    return true;
  }

  uint ct = u1->req();
  if (ct != u2->req()) return false;
  uint i1 = 0;
  uint i2 = 0;
  do {
    for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
    for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
    if (i1 != i2) {
      if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
        // Further analysis relies on operands position matching.
        u2->swap_edges(i1, i2);
      } else if (VectorNode::is_muladds2i(u2) && u1 != u2) {
        if (i1 == 5 - i2) { // ((i1 == 3 && i2 == 2) || (i1 == 2 && i2 == 3) || (i1 == 1 && i2 == 4) || (i1 == 4 && i2 == 1))
          u2->swap_edges(1, 2);
          u2->swap_edges(3, 4);
        }
        if (i1 == 3 - i2 || i1 == 7 - i2) { // ((i1 == 1 && i2 == 2) || (i1 == 2 && i2 == 1) || (i1 == 3 && i2 == 4) || (i1 == 4 && i2 == 3))
          u2->swap_edges(2, 3);
          u2->swap_edges(1, 4);
        }
        return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
      } else {
        return false;
      }
    } else if (i1 == i2 && VectorNode::is_muladds2i(u2) && u1 != u2) {
      u2->swap_edges(1, 3);
      u2->swap_edges(2, 4);
      return false; // Just swap the edges, the muladds2i nodes get packed in follow_use_defs
    }
  } while (i1 < ct);
  return true;
}

//------------------------------est_savings---------------------------
// Estimate the savings from executing s1 and s2 as a pack
int SuperWord::est_savings(Node* s1, Node* s2) {
  int save_in = 2 - 1; // 2 operations per instruction in packed form

  // inputs
  for (uint i = 1; i < s1->req(); i++) {
    Node* x1 = s1->in(i);
    Node* x2 = s2->in(i);
    if (x1 != x2) {
      if (are_adjacent_refs(x1, x2)) {
        save_in += adjacent_profit(x1, x2);
      } else if (!in_packset(x1, x2)) {
        save_in -= pack_cost(2);
      } else {
        save_in += unpack_cost(2);
      }
    }
  }

  // uses of result
  uint ct = 0;
  int save_use = 0;
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* s1_use = s1->fast_out(i);
    for (int j = 0; j < _packset.length(); j++) {
      Node_List* p = _packset.at(j);
      if (p->at(0) == s1_use) {
        for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
          Node* s2_use = s2->fast_out(k);
          if (p->at(p->size()-1) == s2_use) {
            ct++;
            if (are_adjacent_refs(s1_use, s2_use)) {
              save_use += adjacent_profit(s1_use, s2_use);
            }
          }
        }
      }
    }
  }

  if (ct < s1->outcnt()) save_use += unpack_cost(1);
  if (ct < s2->outcnt()) save_use += unpack_cost(1);

  return MAX2(save_in, save_use);
}

//------------------------------costs---------------------------
int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
int SuperWord::pack_cost(int ct)   { return ct; }
int SuperWord::unpack_cost(int ct) { return ct; }

//------------------------------combine_packs---------------------------
// Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
void SuperWord::combine_packs() {
  bool changed = true;
  // Combine packs regardless max vector size.
  while (changed) {
    changed = false;
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p1 = _packset.at(i);
      if (p1 == NULL) continue;
      // Because of sorting we can start at i + 1
      for (int j = i + 1; j < _packset.length(); j++) {
        Node_List* p2 = _packset.at(j);
        if (p2 == NULL) continue;
        if (i == j) continue;
        if (p1->at(p1->size()-1) == p2->at(0)) {
          for (uint k = 1; k < p2->size(); k++) {
            p1->push(p2->at(k));
          }
          _packset.at_put(j, NULL);
          changed = true;
        }
      }
    }
  }

  // Split packs which have size greater then max vector size.
  for (int i = 0; i < _packset.length(); i++) {
    Node_List* p1 = _packset.at(i);
    if (p1 != NULL) {
      uint max_vlen = max_vector_size_in_def_use_chain(p1->at(0)); // Max elements in vector
      assert(is_power_of_2(max_vlen), "sanity");
      uint psize = p1->size();
      if (!is_power_of_2(psize)) {
        // Skip pack which can't be vector.
        // case1: for(...) { a[i] = i; }    elements values are different (i+x)
        // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
        _packset.at_put(i, NULL);
        continue;
      }
      if (psize > max_vlen) {
        Node_List* pack = new Node_List();
        for (uint j = 0; j < psize; j++) {
          pack->push(p1->at(j));
          if (pack->size() >= max_vlen) {
            assert(is_power_of_2(pack->size()), "sanity");
            _packset.append(pack);
            pack = new Node_List();
          }
        }
        _packset.at_put(i, NULL);
      }
    }
  }

  // Compress list.
  for (int i = _packset.length() - 1; i >= 0; i--) {
    Node_List* p1 = _packset.at(i);
    if (p1 == NULL) {
      _packset.remove_at(i);
    }
  }

  if (TraceSuperWord) {
    tty->print_cr("\nAfter combine_packs");
    print_packset();
  }
}

//-----------------------------construct_my_pack_map--------------------------
// Construct the map from nodes to packs.  Only valid after the
// point where a node is only in one pack (after combine_packs).
void SuperWord::construct_my_pack_map() {
  Node_List* rslt = NULL;
  for (int i = 0; i < _packset.length(); i++) {
    Node_List* p = _packset.at(i);
    for (uint j = 0; j < p->size(); j++) {
      Node* s = p->at(j);
#ifdef ASSERT
      if (my_pack(s) != NULL) {
        s->dump(1);
        tty->print_cr("packs[%d]:", i);
        print_pack(p);
        assert(false, "only in one pack");
      }
#endif
      set_my_pack(s, p);
    }
  }
}

//------------------------------filter_packs---------------------------
// Remove packs that are not implemented or not profitable.
void SuperWord::filter_packs() {
  // Remove packs that are not implemented
  for (int i = _packset.length() - 1; i >= 0; i--) {
    Node_List* pk = _packset.at(i);
    bool impl = implemented(pk);
    if (!impl) {
#ifndef PRODUCT
      if ((TraceSuperWord && Verbose) || _vector_loop_debug) {
        tty->print_cr("Unimplemented");
        pk->at(0)->dump();
      }
#endif
      remove_pack_at(i);
    }
    Node *n = pk->at(0);
    if (n->is_reduction()) {
      _num_reductions++;
    } else {
      _num_work_vecs++;
    }
  }

  // Remove packs that are not profitable
  bool changed;
  do {
    changed = false;
    for (int i = _packset.length() - 1; i >= 0; i--) {
      Node_List* pk = _packset.at(i);
      bool prof = profitable(pk);
      if (!prof) {
#ifndef PRODUCT
        if ((TraceSuperWord && Verbose) || _vector_loop_debug) {
          tty->print_cr("Unprofitable");
          pk->at(0)->dump();
        }
#endif
        remove_pack_at(i);
        changed = true;
      }
    }
  } while (changed);

#ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\nAfter filter_packs");
    print_packset();
    tty->cr();
  }
#endif
}

//------------------------------merge_packs_to_cmove---------------------------
// Merge qualified CMove into new vector-nodes
// We want to catch this pattern and subsume Cmp and Bool into CMove
//
//                   Sub              Con
//                  /  |               /
//                 /   |           /   /
//                /    |       /      /
//               /     |   /         /
//              /      /            /
//             /    /  |           /
//            v /      |          /
//         Cmp         |         /
--> --------------------

--> maximum size reached

--> --------------------

¤ Dauer der Verarbeitung: 0.865 Sekunden (vorverarbeitet) ¤

Download des Quellennavigators
Download des sprechenden Kalenders
in der Quellcodebibliothek suchen

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung ist noch experimentell.