// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/arch/ia32/ia32_o3_jit/opt_bound_elimination.cpp,v 1.3 2001/09/07 10:14:01 zying1 Exp $
//


#include <stdlib.h>
#include "defines.h"
#include "ir.h"
#include "flow_graph.h"
#include "local_reg_alloc.h"
#include "expression.h"
#include "opt_bound_elimination.h"
#include "bit_vector.h"
#include "optimizer.h"
#include "stack.h"
#include "local_cse.h"
#include "build_ir_routines.h"
#include "../ia32_o1_jit/profiling.h"
#include "internal_jit_intf.h"

//#define TURN_OFF_BOUNDS // see also ir.cpp

//
// Mark in bv every element of this set that is killed by a call.
// An element is killed when its expression reports is_alias_across_call();
// for a branch expression the test is applied to its left child instead of
// the element itself.  Bits that are already set in bv are left untouched.
//
void Bound_Set::set_elem_killed_by_call(Bit_Vector *bv) {
    unsigned short idx;
    for (idx = 0; idx < _size; idx++) {
        if (!bv->is_set(idx)) {
            // look through a branch to the expression it tests
            Exp *target = _elems[idx];
            if (target->is_branch_exp())
                target = ((Inst_Exp*)target)->left_child();
            if (target->is_alias_across_call())
                bv->set(idx);
        }
    }
}

//
// Mark in bv every element of this set that is killed by exp.
//   exp is an Array         : elements for which contain_of_array(exp->type)
//                             holds are killed
//   exp is an Opnd or Field : elements for which is_in_kill_set(exp->kill_id())
//                             holds are killed
// Any other kind of exp trips an assert.  For a branch element the test is
// applied to its left child.  Bits already set in bv are left untouched.
//
void Bound_Set::set_elem_killed_by(Exp *exp, Bit_Vector *bv) {
    unsigned short idx;
    if (exp->op == Exp::Array) {
        for (idx = 0; idx < _size; idx++) {
            if (!bv->is_set(idx)) {
                Exp *target = _elems[idx];
                if (target->is_branch_exp())
                    target = ((Inst_Exp*)target)->left_child();
                if (target->contain_of_array(exp->type))
                    bv->set(idx);
            }
        }
    } else if (exp->op == Exp::Opnd || exp->op == Exp::Field) {
        unsigned kill_id = exp->kill_id();
        assert(IS_VALID_KILL_ID(kill_id));
        for (idx = 0; idx < _size; idx++) {
            if (!bv->is_set(idx)) {
                Exp *target = _elems[idx];
                if (target->is_branch_exp())
                    target = ((Inst_Exp*)target)->left_child();
                if (target->is_in_kill_set(kill_id))
                    bv->set(idx);
            }
        }
    } else {
        assert(0); // only Array, Opnd and Field can be assigned to
    }
}

//
// Set up the per-loop bookkeeping used by bound elimination.
//
//   mm           : memory manager that owns every array and bit vector
//                  allocated here
//   b_set        : set of candidate expressions; bit positions in the bit
//                  vectors below index into it
//   loop_headers : array of lps loop header nodes
//   elems        : width of the per-loop bit vectors (max_elem)
//   bbs          : size of the per-label arrays; labels index them directly
//   lps          : number of entries in loop_headers
//
// If two entries of loop_headers carry the same label, bad_loop_structure is
// set and construction stops early.  In that case the vectors that would be
// allocated after the header loop (iv_bb, iinc_and_bb, inner_loop, one_exit,
// target_within_loop) are left NULL rather than indeterminate.
//
Loop_Bound_Closure::Loop_Bound_Closure(Mem_Manager& mm, Bound_Set& b_set, 
    Cfg_Node **loop_headers, int elems, int bbs, int lps) : 
    bound_set(b_set), max_elem(elems), max_bbs(bbs), max_lps(lps), bad_loop_structure(false) {
    assert(bbs < 0xFFFF && lps < 0xFFFF); // doesn't exceed 16 bits

    iinc_and  = NULL; // allocate space later if needed
    //
    // The shared-header check below may leave the constructor early, so give
    // every pointer that is only allocated afterwards a defined value first.
    //
    iv_bb = iinc_and_bb = NULL;
    inner_loop = one_exit = target_within_loop = NULL;

    iv        = (Bit_Vector**)mm.alloc(max_bbs*sizeof(Bit_Vector*));
    elem_kill = (Bit_Vector**)mm.alloc(max_bbs*sizeof(Bit_Vector*));
    bnd       = (Bit_Vector**)mm.alloc(max_bbs*sizeof(Bit_Vector*));
    iv_reached= (Bit_Vector**)mm.alloc(max_bbs*sizeof(Bit_Vector*));
    exit_cond = (unsigned short*)mm.alloc(max_bbs*sizeof(unsigned short));
    n_inst    = (unsigned short*)mm.alloc(max_bbs*sizeof(unsigned short));
    int i;
    for (i = 0; i < max_bbs; i++) {
        iv[i] = elem_kill[i] = bnd[i] = iv_reached[i] = NULL;
        exit_cond[i] = INVALID_ENTRY;
        n_inst[i] = 0;
    }
    //
    // create bit vectors for induction assignments & bound checking
    // clear all bit vectors
    //
    for (i = 0; i < max_lps; i++) {
        int label = loop_headers[i]->label;
        // the per-label arrays have exactly max_bbs slots
        assert(label >= 0 && label < max_bbs);
        //
        // make sure that no two loops share the same header
        //
        if (iv[label] != NULL) {
            bad_loop_structure = true;
            return;
        }
        iv[label]          = new (mm) Bit_Vector(max_elem, mm, false); 
        elem_kill[label]   = new (mm) Bit_Vector(max_elem, mm, false);
        bnd[label]         = new (mm) Bit_Vector(max_elem, mm, false); 
        iv_reached[label]  = new (mm) Bit_Vector(max_elem, mm, false);
    }
    //
    // scratch vectors recording the induction assignments seen so far
    // (see Bound_Elim::init_bound_set)
    //
    iv_bb       = new (mm) Bit_Vector(max_elem,mm, false);
    iinc_and_bb = new (mm) Bit_Vector(max_elem,mm, false);
    //
    // create bit vector for recording which loops are innermost loops
    //
    inner_loop = new (mm) Bit_Vector(max_bbs, mm, false);
    //
    // create bit vector for detecting loop exit expressions
    //
    one_exit = new (mm) Bit_Vector(max_bbs, mm, false);
    //
    // create bit vector for checking if the exit condition branches to a 
    // node that is within the loop
    //
    target_within_loop = new (mm) Bit_Vector(max_bbs, mm, false);
}

//
// "i = (i + imm1) & imm2" is rare, so the iinc_and bit vectors are not
// allocated up front.  The first call allocates the per-label pointer array
// (max_bbs slots, all NULL) and then one bit vector of max_elem bits for the
// loop header of each of the max_lps entries of loops.  Later calls do
// nothing.
//
void Loop_Bound_Closure::create_space_for_iinc_and(Mem_Manager& mm,
                                                   Cfg_Node **loops) {
    if (iinc_and == NULL) {
        iinc_and = (Bit_Vector**)mm.alloc(max_bbs*sizeof(Bit_Vector*));

        int slot;
        for (slot = 0; slot < max_bbs; slot++)
            iinc_and[slot] = NULL;

        int lp;
        for (lp = 0; lp < max_lps; lp++) {
            int label = loops[lp]->loop_header->label;
            iinc_and[label] = new (mm) Bit_Vector(max_elem, mm, false);
        }
    }
}

void Loop_Bound_Closure::print_bvs_of_loop(Cfg_Node *loop_header) {
    assert(loop_header == loop_header->loop_header);
    cout << "---- Loop Header: " << loop_header->label << " ---- ";
    if (inner_loop->is_set(loop_header->label)) cout << "innermost";
    cout << endl;
    cout << "     IV      : ";
    Bit_Vector *bv = iv[loop_header->label];
    unsigned i;
    for (i = 0; i < bound_set.size(); i++)
        if (bv->is_set(i)) {
            bound_set.elem(i)->print(cout);
            cout << "; ";
        }
    cout << endl;
    cout << "     IV_KILL : ";
    bv = elem_kill[loop_header->label];
    for (i = 0; i < bound_set.size(); i++)
        if (bv->is_set(i)) {
            bound_set.elem(i)->print(cout);
            cout << "; ";
        }
    cout << endl;
    cout << "     BOUND   : ";
    bv = bnd[loop_header->label];
    for (i = 0; i < bound_set.size(); i++)
        if (bv->is_set(i)) {
            bound_set.elem(i)->print(cout);
            cout << "; ";
        }
    cout << endl;
    cout << "     EXIT    : ";
    if (exit_cond[loop_header->label] != INVALID_ENTRY) {
        bound_set.elem(exit_cond[loop_header->label])->print(cout);
    }
    cout << endl;
    if (iinc_and != NULL) {
        cout << "     IINC_AND: ";
        bv = iinc_and[loop_header->label];
        unsigned i;
        for (i = 0; i < bound_set.size(); i++)
            if (bv->is_set(i)) {
                bound_set.elem(i)->print(cout);
                cout << "; ";
            }
    }
    cout << endl << endl;
}

//
// Walk the idom chain starting at curr.  Return 1 if node is met on the way
// (curr itself included), 0 if head is reached first.  Every node visited
// before head is expected to have a non-NULL idom.
//
static int node_dominate_tail(Cfg_Node *node, Cfg_Node *curr, Cfg_Node *head) {
    for (;;) {
        if (curr == node) return 1;
        if (curr == head) return 0;
        assert(curr->idom != NULL);
        curr = curr->idom; // step up to the immediate dominator
    }
}

//
// decide if bound checking and induction assignments are killed within loop
//
// Visits node and then, recursively, its predecessors.  Nodes with a smaller
// loop depth than header are not visited; a node of the loop that already
// carries the current traversal number is not visited twice.
//
// For each node the instruction list is scanned from last to first:
//   - a call kills elements through set_elem_killed_by_call;
//   - an assignment kills elements through set_elem_killed_by on its left
//     side, except for induction assignments recorded in lbc.iv and
//     "i = (i + imm1) & imm2" assignments present in the bound set;
//   - once an induction assignment has been met on the way (iv_reached), a
//     branch-generating compare that depends on iv is recorded in
//     lbc.iv_reached.
// Killed elements are accumulated in lbc.elem_kill[header->label].
//
//   node       : node to process
//   header     : header of the loop being analyzed
//   tail       : only forwarded to the recursive calls
//   iv         : induction variable expression, may be NULL
//   iv_reached : true when an induction assignment was already met between
//                this node and the node the traversal started from
//
void Bound_Elim::compute_invariant_bounds(Cfg_Node *node, Cfg_Node *header, 
                                          Cfg_Node *tail, Exp *iv, bool iv_reached) {
    if (node->loop_depth() < header->loop_depth()) return; 
    //
    // if node within the same loop has been visited
    //
    if (node->loop_header == header && 
        node->latest_traversal >= fg->traversal_num())
        return;

    // mark node as visited for this traversal
    node->latest_traversal = fg->traversal_num();
    if (node->loop_header == header) { // within the loop
        unsigned h_label = header->label;

        Bit_Vector *bnd_bv    = lbc.bnd[h_label];
        Bit_Vector *elem_kill = lbc.elem_kill[h_label];
        Bit_Vector *iv_bv     = lbc.iv[h_label];
        Bit_Vector *reached_bv= lbc.iv_reached[h_label];
        assert(bnd_bv != NULL && elem_kill != NULL && iv_bv != NULL);

        unsigned short entry;
        //
        // remove bound checking expressions if they are killed within 
        // the loop.
        //
        Inst *head = node->IR_instruction_list();
        Inst *i;
        // backward scan: from the last instruction of the node to the first
        for (i = head->prev(); i != head; i = i->prev()) {
            if (i->is_call()) 
                lbc.bound_set.set_elem_killed_by_call(elem_kill);
            else if (i->exp->op == Exp::Assign) {
                if (i->is_iinc() || i->is_field_inc()) { // skip iv
                    entry = lbc.bound_set.lookup_elem(i->exp);
                    if (entry != INVALID_ENTRY && 
                        iv_bv->is_set(entry)) {
                        // instructions scanned after this point come before
                        // the induction assignment in program order
                        iv_reached = true;
                        continue;
                    }
                } else if (i->is_iinc_and()) { 
                    entry = lbc.bound_set.lookup_elem(i->exp);
                    if (entry != INVALID_ENTRY) continue;
                }
                // any other assignment (or an increment that is not a
                // recorded induction assignment) kills what depends on its
                // left side
                Exp *left = ((Inst_Exp*)i->exp)->left_child();
                lbc.bound_set.set_elem_killed_by(left, elem_kill);
            } else if (iv_reached && i->is_compare() && 
                       ((Compare_Inst*)i)->gen_branch) {
                //
                //    i++     For pre-increment array accesses, the current
                //    a[i]    transformation will consider that it is safe
                //            to eliminate bound checking.  As a matter of 
                //            fact, it isn't.
                //
                entry = lbc.bound_set.lookup_elem(i->exp);
                if (entry != INVALID_ENTRY && iv != NULL && 
                    i->exp->is_in_kill_set(iv->kill_id())) 
                    reached_bv->set(entry);
            }
        }
    } else { // not an inner loop
        assert(0); // we only deal with innermost loops
    }
    //
    // Process this node's predecessors.  
    //
    // iv_reached is passed by value, so each predecessor starts from the
    // state reached at the top of this node.
    //
    Cfg_Int edge;
    for (edge=0; edge < node->in_edge_size(); edge++) {
        Cfg_Node *pred = node->in_edges(edge);
        if (pred->loop_header != node) // skip back edges of inner loops
            compute_invariant_bounds(node->in_edges(edge), header, tail, iv ,iv_reached);
    }
}

//
// "i = i + imm" or "i = (i+imm1) & imm2" happen
// if any "i = ..." in tmp, then we reset bv
//
bool Bound_Elim::more_than_once(unsigned entry,
                                Bit_Vector *tmp,
                                Bit_Vector *bv) {
    if (bv == NULL) return false;
    bool res = false;
    Exp *elem = ((Inst_Exp*)lbc.bound_set.elem(entry))->left_child();
    unsigned i;
    for (i = 0; i < tmp->numbits(); i++) {
        if (!tmp->is_set(i) ||
            elem != ((Inst_Exp*)lbc.bound_set.elem(i))->left_child())
            continue;
        res = true;
        bv->reset(i);
    }
    return res;
}

//
// Collect the candidate expressions of the loop headed by `header`, starting
// at `node` and continuing recursively through its predecessors.
//
// For every instruction of a node of the loop:
//   - a compare that generates a branch is added to the bound set and flagged
//     in lbc.bnd[header->label];
//   - "i = i + imm" and field increments are added to the bound set; they are
//     flagged in lbc.iv[...] only if the same variable has not been assigned
//     by another recorded induction assignment and node dominates the tail;
//   - "i = (i + imm1) & imm2" is added and flagged in lbc.iinc_and[...];
//   - an expression whose op lies in [Exp::Add, Exp::Compare) and that has a
//     Field child is added and flagged in lbc.bnd[...].
// The function also accumulates a weighted instruction count in lbc.n_inst
// and records the loop exit condition (lbc.exit_cond, lbc.one_exit,
// lbc.target_within_loop).
//
// If the flow graph has remove_all_bounds_checks set, the function returns
// right after marking the node visited; predecessors are not processed.
//
//   node   : node to process
//   header : header of the (innermost) loop being analyzed
//   tail   : loop tail, used for the dominance test
//   loops  : forwarded to lbc.create_space_for_iinc_and
//
void Bound_Elim::init_bound_set(Cfg_Node *node, Cfg_Node *header, 
                                Cfg_Node *tail, Cfg_Node **loops) {
    if (node->loop_depth() < header->loop_depth()) return; 
    //
    // if node within the same loop has been visited
    //
    if (node->loop_header == header && 
        node->latest_traversal >= fg->traversal_num())
        return;

    if (node->loop_header == header) { // within the loop
        // -1: not computed yet, 0: node does not dominate tail, 1: it does
        int dominate_tail = -1;

        node->latest_traversal = fg->traversal_num();
        if (node->flowgraph->remove_all_bounds_checks) return;
        unsigned label = header->label;

        Bit_Vector *bnd_bv = lbc.bnd[label];
        Bit_Vector *iv_bv  = lbc.iv[label];
        unsigned inst_count = 0;
        assert(bnd_bv != NULL && iv_bv != NULL);
        //
        // go over inst list to find bound checking and induction assignments
        //
        Inst *head = node->IR_instruction_list();
        Inst *i;
        for (i = head->next(); i != head; i = i->next()) {
            // a call weighs as much as five ordinary instructions
            inst_count += (i->is_call())? 5 : 1;
            //
            // check if inst is a bound compare inst
            //
            int entry;
            if (i->is_compare() && ((Compare_Inst*)i)->gen_branch) {
                entry = lbc.bound_set.add_elem(i->exp);
                if (entry != INVALID_ENTRY) bnd_bv->set((unsigned)entry);
            } else if (i->is_iinc() || i->is_field_inc()) {
                if (dominate_tail == -1)
                    dominate_tail = node_dominate_tail(node,tail,header);
                //
                // We consider i is an induction assignment only if node 
                // dominates the tail.
                //
                entry = lbc.bound_set.add_elem(i->exp);
                //
                // if induction assignment does not dominate the loop tail, or
                // induction assignment appear more than once
                //
                if (entry != INVALID_ENTRY) {
                    // more_than_once also clears the earlier assignments of
                    // the same variable from the vector it is given
                    if (!more_than_once(entry,lbc.iv_bb,iv_bv)) {
                        Bit_Vector *iinc_and_bv =
                        (lbc.iinc_and == NULL) ? NULL : lbc.iinc_and[label];
                        if (!more_than_once(entry,lbc.iinc_and_bb,iinc_and_bv) && 
                            dominate_tail != 0 ) // not dominate the loop tail
                            iv_bv->set((unsigned)entry); 
                    }
                    // remember the assignment even when it was rejected, so
                    // that later assignments to the same variable are caught
                    lbc.iv_bb->set((unsigned)entry);
                }
            } else if (i->is_iinc_and()) { // i = (i + imm1) & imm2
                //
                // The maximum increment of i is less than 0xF (15). We want to
                // treat this case specially.  We don't need to care if 
                // iinc_and dominates the exit, iinc_and occurs more than once,
                // or there exists only one exit condition.
                //
                entry = lbc.bound_set.add_elem(i->exp);
                // allocates lbc.iinc_and on first use only
                lbc.create_space_for_iinc_and(mem, loops);
                Bit_Vector *iinc_and_bv = lbc.iinc_and[label];
                if ( entry != INVALID_ENTRY) { // overflow if entry == INVALID_ENTRY
                    if (!more_than_once(entry,lbc.iv_bb,iv_bv))
                        iinc_and_bv->set((unsigned)entry);
                    lbc.iinc_and_bb->set((unsigned)entry);
                }
            } else if (i->exp->is_inst_exp()) {
                // arithmetic on a field: candidate recorded in the bnd vector
                Inst_Exp *e = (Inst_Exp*)i->exp;
                if (e->op >= Exp::Add && e->op < Exp::Compare &&
                   (e->left_child()->op == Exp::Field ||
                   (e->rght_child() != NULL && e->rght_child()->op == Exp::Field))) {
                    entry = lbc.bound_set.add_elem(i->exp);
                    if (entry != INVALID_ENTRY) bnd_bv->set((unsigned)entry);
                }
            }
        }
        //
        // update inst count
        //
        // The count saturates at the largest value a Count_Int can hold.
        // NOTE(review): the shift assumes Count_Int is narrower than unsigned,
        // and n_inst is an array of unsigned short -- confirm that Count_Int
        // is a 16-bit type.
        //
        if (inst_count + lbc.n_inst[label] > (1u<<(sizeof(Count_Int)*8)) - 1)
            lbc.n_inst[label] = (1u<<(sizeof(Count_Int)*8)) - 1;
        else 
            lbc.n_inst[label] += inst_count;
        //
        // detect loop exit expressions
        //
        Cfg_Int edge;
        for (edge=0; edge < node->out_edge_size(); edge++) {
            Cfg_Node *succ = node->out_edges(edge);
            // only successors with a smaller loop depth leave the loop
            if (succ->loop_depth() >= node->loop_depth()) continue;
            if (dominate_tail == -1)
                dominate_tail = node_dominate_tail(node,tail,header);
            Inst *last_inst = head->prev();
            //
            // if the node doesn't dominate the tail or the last instruction
            // isn't a branch
            //
            if (dominate_tail == 0 || !last_inst->is_branch()) break;
            //
            // check if the exit branch is "x ble y"
            //
            Inst_Exp *cmp = (Inst_Exp*)((Inst_Exp*)last_inst->exp)->left_child();
            Exp *l = cmp->left_child();
            if (!l->is_opnd_exp() && l->op != Exp::Field) break;
            //
            // succ is an exit
            //
            if (lbc.one_exit->is_set(label)) {// more than one
                //
                // NOTE(review): exit_cond[label] can be INVALID_ENTRY here if
                // an earlier pass through this branch invalidated it (or if
                // add_elem returned INVALID_ENTRY) and yet another exit is
                // found; the assert below would then fire -- confirm whether
                // that situation can occur.
                //
                assert(lbc.exit_cond[label] != INVALID_ENTRY);
                Inst_Exp *bnd_exit = (Inst_Exp *)lbc.bound_set.elem(lbc.exit_cond[label]);
                //
                // set target_within_loop bit vector so that we know if the 
                // branch branches to a node that is within the loop
                //
                if (edge == 0) // exit is fall-through
                    lbc.target_within_loop->set(label);
                else
                    lbc.target_within_loop->reset(label);

                // two exits testing the same variable: give up on the exit
                // condition; otherwise the newest exit replaces the old one
                if (bnd_exit->is_in_kill_set(l->kill_id()))
                    lbc.exit_cond[label] = INVALID_ENTRY;
                else
                    lbc.exit_cond[label] = lbc.bound_set.add_elem(last_inst->exp);
            } else { // first exit
                assert(lbc.exit_cond[label] == INVALID_ENTRY);
                lbc.one_exit->set(label);
                lbc.exit_cond[label] = lbc.bound_set.add_elem(last_inst->exp);
                if (edge == 0) // exit is fall-through
                    lbc.target_within_loop->set(label);
            }
        }
    } else  // not an inner loop 
        assert(0); // we only traverse innermost loops

    //
    // Process this node's predecessors.  
    //
    Cfg_Int edge;
    for (edge=0; edge < node->in_edge_size(); edge++) {
        Cfg_Node *pred = node->in_edges(edge);
        if (pred->loop_header != node) // skip back edges of inner loops
            init_bound_set(pred, header, tail, loops);
    }
}

//
// Immediate operand: forwards to Expressions::lookup_imm with this operand's
// value and type; inst_head is the instruction list handed to the lookup.
//
Inst *Imm_Operand::lookup_exp(Expressions& exprs, Inst *inst_head) {
    Inst *imm_inst = exprs.lookup_imm(_imm, type, inst_head);
    return imm_inst;
}
//
// Register operand: looks up the operand expression (temp registers and
// other registers use different lookups), then generates the operand tuple
// instruction for it on inst_head.
//
Inst *Reg_Operand::lookup_exp(Expressions& exprs, Inst *inst_head) {
    Operand_Exp *reg_exp;
    if (!is_temp_reg())
        reg_exp = exprs.lookup_reg_exp(id,type,is_vreg()); // lookup virtual reg
    else
        reg_exp = exprs.lookup_temp_reg_exp(this);         // lookup temp reg
    // generate tuple inst and append the inst to the inst list
    return exprs.gen_opnd_tuple(inst_head,reg_exp);
}
//
// Argument operand: forwards to Expressions::lookup_arg with this operand's
// argument number and type.
//
Inst *Arg_Operand::lookup_exp(Expressions& exprs, Inst *inst_head) {
    Inst *arg_inst = exprs.lookup_arg(num, type, inst_head);
    return arg_inst;
}
//
// Return-value operand: forwards to Expressions::lookup_ret with this
// operand's type.
//
Inst *Ret_Operand::lookup_exp(Expressions& exprs, Inst *inst_head) {
    Inst *ret_inst = exprs.lookup_ret(type, inst_head);
    return ret_inst;
}
//
// Static operand: forwards to Expressions::lookup_static with this operand's
// address, type and _fh.
//
Inst *Static_Operand::lookup_exp(Expressions& exprs, Inst *inst_head) {
    Inst *static_inst = exprs.lookup_static(addr, type, inst_head, _fh);
    return static_inst;
}
//
// Constant operand: forwards to Expressions::lookup_const.  The lookup takes
// a pointer, so it is handed a local copy of this operand's value.
//
Inst *Const_Operand::lookup_exp(Expressions& exprs, Inst *inst_head) {
    Value value_copy = val;
    Inst *const_inst = exprs.lookup_const(&value_copy, type, inst_head);
    return const_inst;
}

//
// Look for an induction assignment "v = v + c" flagged in iv whose assigned
// variable v is in the kill set of e.  Returns the first such assignment in
// bound-set order, or NULL when there is none.
//
static Inst_Exp *iv_assignment_exist(Bit_Vector *iv, Bound_Set& bound_set, Exp *e) {
    const unsigned count = bound_set.size();
    unsigned idx;
    for (idx = 0; idx < count; idx++) {
        if (!iv->is_set(idx)) continue;
        Inst_Exp *asgn = (Inst_Exp*)bound_set.elem(idx);
        if (e->is_in_kill_set(asgn->left_child()->kill_id()))
            return asgn;
    }
    return NULL;
}

//
// Return true if e depends on a variable assigned by one of the
// "i = (i+imm1) & imm2" assignments flagged in iinc_and, i.e. if the kill id
// of such an assignment's left side is in the kill set of e.
// A NULL iinc_and means no such assignment exists: the result is false.
//
static bool contain_iinc_and(Exp *e, Bit_Vector *iinc_and,
                             Bound_Set &bound_set) {
    if (iinc_and == NULL) return false;

    const unsigned count = bound_set.size();
    unsigned idx;
    for (idx = 0; idx < count; idx++) {
        if (iinc_and->is_set(idx)) {
            // variable on the left of "i = (i+imm1) & imm2"
            Exp *assigned = ((Inst_Exp*)bound_set.elem(idx))->left_child();
            unsigned id = assigned->kill_id();
            if (e->is_in_kill_set(id))
                return true;
        }
    }
    return false;
}

//
// A "simple" expression is an Add or a Sub whose two children are both
// operand expressions and are not the same expression object.
//
static bool is_simple_expression(Exp *e) {
    if (e->op != Exp::Add && e->op != Exp::Sub) return false;

    Inst_Exp *bin = (Inst_Exp*)e;
    Exp *lhs = bin->left_child();
    if (!lhs->is_opnd_exp()) return false;

    Exp *rhs = bin->rght_child();
    if (!rhs->is_opnd_exp()) return false;

    return lhs != rhs;
}

//
// If e is a valid entry of the bound set, reset the corresponding bit of the
// bnd bit vector of the loop headed by header.
//
//   header : loop header whose bnd vector is updated
//   e      : expression to look up; may be NULL, in which case nothing is done
//   lbc    : closure holding the bound set and the per-loop bit vectors
//
static void reset_bound_bv(Cfg_Node *header,
                           Exp      *e, 
                           Loop_Bound_Closure& lbc) {
    //
    // gen_decrement_bnd hands over its limit expression unchanged, and that
    // expression is NULL when the loop tests against zero.  There is nothing
    // to look up in that case.
    //
    if (e == NULL) return;

    int entry = lbc.bound_set.lookup_elem(e);
    if (entry == INVALID_ENTRY) return;

    Bit_Vector *bnd = lbc.bnd[header->label];
    bnd->reset(entry);
}

//
// Scan the bnd bit vector of the loop headed by header, starting at position
// nbc, for the next expression that can be hoisted into the test node.
//
// A flagged expression is accepted when
//   - neither of its children depends on exit_var or on an iinc_and variable
//     (loop invariant; loop_invariant is set to true), unless
//     NO_LOOP_INVARIANT is defined; or
//   - it is a Compare whose right side is invariant and whose left side is
//     an operand, a Field or a simple expression that depends on exit_var or
//     on an iinc_and variable ("i cmp a.length").
// Flagged expressions that are not accepted are cleared from the bnd vector,
// which means their bound check cannot be eliminated.
//
// On success the accepted expression is returned and nbc is left one past its
// position; otherwise NULL is returned.
//
// NOTE(review): iinc_and_bnd is cleared on entry and never set to true in
// this function -- confirm that callers do not expect it to be reported here.
//
static Inst_Exp *find_next_bnd_cand(Cfg_Node *header,  Loop_Bound_Closure& lbc, 
                                    Exp *exit_var,     unsigned& nbc, 
                                    bool& loop_invariant, bool& iinc_and_bnd) {
    loop_invariant = false;
    iinc_and_bnd   = false;

    Bit_Vector *bnd_bv   = lbc.bnd[header->label];
    Bit_Vector *iinc_and = NULL;
    if (lbc.iinc_and != NULL)
        iinc_and = lbc.iinc_and[header->label];

    const unsigned total = lbc.bound_set.size();
    for (; nbc < total; nbc++) {
        if (!bnd_bv->is_set(nbc)) continue;

        Inst_Exp *cand = (Inst_Exp*)lbc.bound_set.elem(nbc);
        Exp *lhs = cand->left_child();
        Exp *rhs = cand->rght_child();
        //
        // does either side depend on the induction variable?
        //
        bool lhs_has_iv = lhs->is_in_kill_set(exit_var->kill_id());
        bool rhs_has_iv = (rhs != NULL) &&
                          rhs->is_in_kill_set(exit_var->kill_id());
        //
        // does either side depend on an iinc_and variable?
        //
        bool rhs_has_iinc_and = (rhs != NULL) &&
                                contain_iinc_and(rhs,iinc_and,lbc.bound_set);
        bool lhs_has_iinc_and = contain_iinc_and(lhs,iinc_and,lbc.bound_set);

#ifndef NO_LOOP_INVARIANT
        if (!(lhs_has_iv || rhs_has_iv ||
              lhs_has_iinc_and || rhs_has_iinc_and)) {
            loop_invariant = true;
            break;
        }
#endif
        //
        // look for things like "i cmp a.length"
        //
        bool rhs_invariant = !(rhs_has_iv || rhs_has_iinc_and);
        if (cand->op == Exp::Compare && rhs_invariant) {
            bool simple_lhs = lhs->is_opnd_exp() ||
                              lhs->op == Exp::Field ||
                              is_simple_expression(lhs);
            if (simple_lhs && (lhs_has_iinc_and || lhs_has_iv))
                break;
        }
        //
        // not usable: drop it from the bnd vector (the bound checking code
        // stays in the loop)
        //
        bnd_bv->reset(nbc);
    }

    if (nbc >= total) return NULL;

    // hand back the candidate and move the cursor past it
    Inst_Exp *picked = (Inst_Exp*)lbc.bound_set.elem(nbc);
    nbc++;
    return picked;
}

//
// End the current node and start a new one for further test instructions.
//
// Takes the LCSE snapshot that ends node, creates a new Cfg_Node (built with
// node's linearization_node()) together with its own Live_LCSE, propagates
// node's live LCSE into it, and makes that Live_LCSE the current one of
// exprs.  node then receives two out edges, in this order: the new node
// (fall through) and old_header (branch target).
//
// new_header is not used here.  Returns the new node.
//
Cfg_Node *add_one_new_node(Flow_Graph *fg, Expressions& exprs, Cfg_Node *node,
                           Cfg_Node *old_header, Cfg_Node *new_header) {
    // end of node
    exprs.live_lcse()->snap_shot_of_lcse();

    Mem_Manager& mem = fg->mem_manager;
    Cfg_Node *fresh = new(mem) Cfg_Node(mem, 0, fg, node->linearization_node());

    //
    // give the new node its own Live_LCSE, seeded from node (extended bb)
    //
    Live_LCSE *fresh_lcse = new (mem) Live_LCSE(exprs.lcse_pool);
    fresh->set_live_lcse(fresh_lcse);
    node->live_lcse()->propagate_to(*fresh_lcse, NULL);
    exprs.set_live_lcse(fresh_lcse);

    // edge order matters: the fall-through edge is the first edge
    node->add_edge(mem, fresh);
    node->add_edge(mem, old_header);
    return fresh;
}

//
// Among the "var = (var + c) & imm" assignments flagged in iinc_and, return
// the imm expression with the biggest immediate value, or NULL if no flagged
// assignment has var on its left side.  var must be an operand expression.
//
static Exp *find_iinc(Exp *var, 
                      Bit_Vector *iinc_and, 
                      Bound_Set& set) {
    assert(var->is_opnd_exp());

    Operand_Exp *best = NULL;
    const unsigned nbits = iinc_and->numbits();
    unsigned pos;
    for (pos = 0; pos < nbits; pos++) {
        if (iinc_and->is_set(pos)) {
            Inst_Exp *asgn = (Inst_Exp*)set.elem(pos);
            if (asgn->left_child() != var) continue; // assigns another variable
            //
            // the right side is "(var + c) & imm"; imm is its right child
            //
            Exp *mask = ((Inst_Exp*)asgn->rght_child())->rght_child();
            assert(mask->is_imm_exp());
            Operand_Exp *cand = (Operand_Exp*)mask;
            if (best == NULL ||
                ((Imm_Operand*)best->opnd)->imm() < 
                ((Imm_Operand*)cand->opnd)->imm())
                best = cand;
        }
    }
    return best;
}

//
// generate cmp A.length, imm
//
// imm is the biggest mask found by find_iinc for var_exp among the iinc_and
// assignments of the loop headed by old_header.  The compare is followed by
// a Ble branch.  When end_of_bb is set the instructions go into a node newly
// created by add_one_new_node, otherwise they are appended to node.
// Returns the node that received the instructions.
//
static Cfg_Node *gen_iinc_and_bnd(Flow_Graph *fg,  Expressions& exprs, 
                                  Cfg_Node *node,  Cfg_Node *old_header, 
                                  Cfg_Node *new_header, Loop_Bound_Closure& lbc, 
                                  Exp *var_exp,    Exp *len_exp,
                                  bool end_of_bb) {
    Cfg_Node *dest = node;
    if (end_of_bb) // the current node is closed: continue in a new one
        dest = add_one_new_node(fg, exprs, node, old_header, new_header);
    Inst *list = dest->IR_instruction_list();

    Exp *mask_exp = find_iinc(var_exp, lbc.iinc_and[old_header->label],
                              lbc.bound_set);
    assert(mask_exp != NULL);

    // operands are built in this order: the immediate first, then the length
    Inst *mask_inst = mask_exp->build_inst(exprs, list);
    Inst *len_inst  = len_exp->build_inst(exprs, list);
    Inst *cmp_inst  = gen_cmp_inst(exprs.mem,exprs,list, len_inst, mask_inst,
                                   JIT_TYPE_INT, false);
    gen_branch(exprs.mem,exprs,list,false, Exp::Ble, cmp_inst);
    return dest;
}

//
// for(i=0; i < 100; i++)  generate    cmp  a.length, 100
//
// for(i=0; i < 100; ) {
//     i++;                generate    cmp  a.length, 101
//     a[i]
// }
//
// Emits "cmp len, x" followed by a Ble/Blt branch for a loop whose index is
// incremented, where x is derived from the loop limit:
//   - limit_exp == NULL stands for a test against zero;
//   - when iv_reached is false, inc is added to the limit (folded into a
//     single immediate when the limit is an immediate);
//   - when idx_exp is "i + k" / "i - k", k is folded into the immediate if it
//     is an immediate and iv_reached is false, otherwise the same Add/Sub is
//     applied to x.
// When end_of_bb is set the instructions go into a node newly created by
// add_one_new_node, otherwise they are appended to node.
// exit_op must be one of Ble, Blt, Bge, Bgt; Bge/Bgt are mapped to Ble/Blt.
// Finally limit_exp and the expression of x are cleared from the bnd vector
// of old_header.  Returns the node that received the instructions.
//
static Cfg_Node *gen_increment_bnd(Flow_Graph *fg,  Expressions& exprs, 
                                   Cfg_Node *node,  Cfg_Node *old_header, 
                                   Cfg_Node *new_header, Loop_Bound_Closure& lbc, 
                                   Exp *idx_exp, Exp* len_exp, Exp *limit_exp,
                                   bool iv_reached, int inc,
                                   Exp::Kind exit_op, bool end_of_bb) {
    //
    // create a new node for the instructions
    //
    if (end_of_bb)
        node = add_one_new_node(fg, exprs, node, old_header, new_header);
    Inst *inst_head = node->IR_instruction_list();
    if (limit_exp == NULL) { // test against zero
        limit_exp = exprs.lookup_imm_exp(0, JIT_TYPE_INT);
    }

    Inst *x;
    Inst *len = len_exp->build_inst(exprs, inst_head);
    // adj_exp is k when the index has the form "i + k" or "i - k"
    Exp *adj_exp = NULL;
    if (idx_exp->op == Exp::Add || idx_exp->op == Exp::Sub)
        adj_exp = ((Inst_Exp*)idx_exp)->rght_child();
    if (!iv_reached) {
        if (limit_exp->is_imm_exp()) {
            // everything is constant: fold limit, inc and k into one immediate
            inc += (int)((Imm_Operand*)((Operand_Exp*)limit_exp)->opnd)->imm();
            if (adj_exp != NULL && adj_exp->is_imm_exp())
                inc += (int)((Imm_Operand*)((Operand_Exp*)adj_exp)->opnd)->imm();
            x = exprs.lookup_imm(inc,JIT_TYPE_INT,inst_head);
        } else {
            // x = limit + (inc [+ k])
            x = limit_exp->build_inst(exprs, inst_head);
            if (adj_exp != NULL && adj_exp->is_imm_exp())
                inc += (int)((Imm_Operand*)((Operand_Exp*)adj_exp)->opnd)->imm();
            Inst *imm = exprs.lookup_imm(inc,JIT_TYPE_INT,inst_head);
            x   = exprs.lookup_inst(Exp::Add,x,imm,x->type(),inst_head);
        }
    } else
        x = limit_exp->build_inst(exprs, inst_head);
    //
    // A[i-1]    ====>    A.length < limit + 1
    //
    // (k was not folded above: apply the index's own Add/Sub to x)
    //
    if (adj_exp != NULL && (iv_reached || !adj_exp->is_imm_exp())) {
        Inst *adj = adj_exp->build_inst(exprs, inst_head);
        x = exprs.lookup_inst((Exp::Kind)idx_exp->op, x, adj, x->type(), inst_head);
    }

    Inst *cmp_i = gen_cmp_inst(exprs.mem,exprs,inst_head, len, x, JIT_TYPE_INT, false);
    if (exit_op == Exp::Bge)  
        exit_op = Exp::Ble;  // target_within_loop must be set
    else if (exit_op == Exp::Bgt)
        exit_op = Exp::Blt;
    assert(exit_op == Exp::Ble || exit_op == Exp::Blt);
    gen_branch(exprs.mem, exprs, inst_head, false, exit_op, cmp_i);
    //
    // if limit_exp is a loop invariant exp, then we need to remove it 
    // from bnd bit vector to avoid duplicating the exp twice.
    //
    reset_bound_bv(old_header,limit_exp,lbc);
    if (x->exp != limit_exp)
        reset_bound_bv(old_header,x->exp,lbc);
    return node;
}

//
// for(i=100; i > 10; i--)
//     a[i]                   generate    cmp  a.length, 11
//
// for(i=100; i > 10;) {
//     i--;                   generate    cmp  a.length, 10
//     a[i]
// }
//
// Emits "cmp len, x" followed by a Ble/Blt branch for a loop whose index is
// decremented, where x is derived from the loop limit:
//   - limit_exp == NULL stands for a test against zero;
//   - for a NULL or immediate limit, x is the immediate limit + 1, plus inc
//     when iv_reached is false; if that value is 0 nothing is generated and
//     node is returned unchanged;
//   - otherwise x = limit + (inc + 1).
// When end_of_bb is set the instructions go into a node newly created by
// add_one_new_node, otherwise they are appended to node.
// exit_op must be one of Ble, Blt, Bge, Bgt; the generated branch is Ble for
// Ble/Bge and Blt for Blt/Bgt.
// Finally limit_exp and the expression of x are cleared from the bnd vector
// of old_header.  idx_exp is not used.  Returns the node that received the
// instructions.
//
static Cfg_Node *gen_decrement_bnd(Flow_Graph *fg,  Expressions& exprs, 
                                   Cfg_Node *node,  Cfg_Node *old_header, 
                                   Cfg_Node *new_header, Loop_Bound_Closure& lbc, 
                                   Exp *idx_exp, Exp* len_exp, Exp *limit_exp,
                                   bool iv_reached, int inc,
                                   Exp::Kind exit_op, bool end_of_bb) {
    Inst *x, *inst_head;
    // the increment only matters when the check is not preceded by it
    int adjust = (!iv_reached) ? inc : 0; 
    if (limit_exp == NULL || limit_exp->is_imm_exp()) {
        if (limit_exp == NULL)  // test against zero
            inc = adjust + 1;
        else
            inc = adjust + ((int)((Imm_Operand*)((Operand_Exp*)limit_exp)->opnd)->imm())+1;
        if (inc == 0)
            return node;
        else {
            //
            // create a new node for the instructions
            //
            if (end_of_bb)
                node = add_one_new_node(fg, exprs, node, old_header, new_header);
            inst_head = node->IR_instruction_list();
            x = exprs.lookup_imm(inc,JIT_TYPE_INT,inst_head);
        }
    } else {
        if (end_of_bb)
            node = add_one_new_node(fg, exprs, node, old_header, new_header);
        inst_head = node->IR_instruction_list();
        x = limit_exp->build_inst(exprs, inst_head);
        //
        // NOTE(review): this path adds inc+1 even when iv_reached is true,
        // whereas the immediate path above uses adjust (0 in that case) --
        // confirm whether adjust+1 was intended here.
        //
        Inst *imm = exprs.lookup_imm(inc+1,JIT_TYPE_INT,inst_head);
        x   = exprs.lookup_inst(Exp::Add,x,imm,x->type(),inst_head);
    }
    Inst *len = len_exp->build_inst(exprs, inst_head);
    Inst *cmp_i = gen_cmp_inst(exprs.mem,exprs,inst_head, len, x, JIT_TYPE_INT, false);
    if (exit_op == Exp::Ble) 
        exit_op = Exp::Bge;  // target_within_loop must be set
    else if (exit_op == Exp::Blt)
        exit_op = Exp::Bgt;

    assert(exit_op == Exp::Bge || exit_op == Exp::Bgt);
    if (exit_op == Exp::Bge)
        gen_branch(exprs.mem,exprs,inst_head,false, Exp::Ble, cmp_i);
    else
        gen_branch(exprs.mem,exprs,inst_head,false, Exp::Blt, cmp_i);
    //
    // if limit_exp is a loop invariant exp, then we need to remove it 
    // from bnd bit vector to avoid duplicating the exp twice.
    //
    // NOTE(review): limit_exp is still NULL here in the test-against-zero
    // case -- confirm that the lookup done by reset_bound_bv accepts NULL.
    //
    reset_bound_bv(old_header,limit_exp,lbc);
    if (x->exp != limit_exp)
        reset_bound_bv(old_header,x->exp,lbc);
    return  node;
}

//
// Emit, starting in `node`, the run-time test for the candidate `bnd` and
// then, recursively, for every further candidate that find_next_bnd_cand
// returns.  The node holding the last test gets two out edges: the fall
// through edge to new_header (added first) and the target edge to old_header.
//
//   node            block in which code generation starts
//   old_header      header of the original loop
//   new_header      header of the cloned loop
//   bnd             candidate expression to generate code for
//   exit            exit branch expression of the loop (must be a branch exp)
//   next_bnd_cand, loop_invariant, iinc_and_bnd
//                   search state handed to find_next_bnd_cand and forwarded
//                   to the recursive call
//   inc             per-iteration step of the induction variable; its sign
//                   selects gen_increment_bnd (inc > 0) or gen_decrement_bnd
//
// return the node that enters the new_header
//
Cfg_Node *Bound_Elim::build_IR_for_test_node(Cfg_Node *node,
                                             Cfg_Node *old_header,
                                             Cfg_Node *new_header,
                                             Inst_Exp *bnd,
                                             Inst_Exp *exit,
                                             unsigned next_bnd_cand,
                                             bool loop_invariant,
                                             bool iinc_and_bnd,
                                             int  inc) {
    Inst *inst_head = node->IR_instruction_list();

    assert(exit->is_branch_exp());

    Inst_Exp *cmp = (Inst_Exp*)exit->left_child(); // compare or test
    Exp *b_l = bnd->left_child();
    Exp *b_r = bnd->rght_child();
    bool end_of_bb = false; // true once a branch has terminated the current block
    Mem_Manager& mm = fg->mem_manager;
    if (loop_invariant) {
        if (bnd->op == Exp::Compare) {
            //
            // bnd is an loop invariant, e.g. for (i=0; i < 100; i++) a[i] += a[5]; 
            //
            Inst *l_i = b_l->build_inst(exprs, inst_head);
            Inst *r_i = b_r->build_inst(exprs, inst_head);
            //
            // the returned compare inst is not needed here; the call is made
            // only to emit the check into the instruction list
            //
            gen_array_cmp_inst(exprs.mem,exprs,inst_head,l_i, r_i,JIT_TYPE_INT,true);
        } else // loop invariant code motion
            bnd->build_inst(exprs, inst_head);
    } else  { 
        assert (bnd->op == Exp::Compare); // only deal with bound elimination
        unsigned short entry = lbc.bound_set.lookup_elem(bnd);
        assert(entry != INVALID_ENTRY);
        //
        // generate cmp i, A.length
        //
        Inst *i   = b_l->build_inst(exprs, inst_head);
        Inst *len = b_r->build_inst(exprs, inst_head);
        bool iv_reached = lbc.iv_reached[old_header->label]->is_set(entry) != 0;
        if (!iv_reached) {
            // the test is made against i + inc instead of i
            Inst *imm = exprs.lookup_imm(inc,JIT_TYPE_INT,inst_head);
            i = exprs.lookup_inst(Exp::Add,i,imm,i->type(),inst_head);
        }
        //
        // if bound checking code "cmp i, A.length" is still available at 
        // this point, we don't need to generate the compare and branch insts.
        //
        if (!exprs.is_local_cse(exprs.lookup_inst_exp(Exp::Compare,i->exp,len->exp,JIT_TYPE_INT)))
        {
            Inst *cmp_i = gen_cmp_inst(exprs.mem,exprs,inst_head, i, len, JIT_TYPE_INT, false);
            gen_branch(exprs.mem,exprs,inst_head,false, Exp::Bge, cmp_i);
            end_of_bb = true;
        }
        Cfg_Node *old_node = node;
        if (iinc_and_bnd) {
            //
            // generate cmp A.length, imm
            //
            node = gen_iinc_and_bnd(fg, exprs, node, old_header, 
                                    new_header, lbc, b_l, b_r, end_of_bb);
        } else { // !!! need to rewrite the following code later
            //
            // generate cmp A.length, x
            //
            Exp *e_r = cmp->rght_child();
            if (inc > 0) {  // for (i=0; i < 100; i++)
                node = gen_increment_bnd(fg, exprs, node, old_header, 
                                         new_header, lbc, b_l, b_r, e_r, iv_reached,
                                         inc, (Exp::Kind)exit->op, end_of_bb);
            } else {  // for (i=100; i > 2; i--)
                node = gen_decrement_bnd(fg, exprs, node, old_header, 
                                         new_header, lbc, b_l, b_r, e_r, iv_reached,
                                         inc, (Exp::Kind)exit->op, end_of_bb);
            }
        }
        //
        // a helper that moved on to a different node has emitted code there,
        // so that node is treated as terminated as well
        //
        if (old_node != node)
            end_of_bb = true;
    }

    Inst_Exp *next_bnd = find_next_bnd_cand(old_header,lbc, cmp->left_child(), 
                                            next_bnd_cand, loop_invariant,
                                            iinc_and_bnd);
    if (next_bnd == NULL) {
        // no more candidates: wire the last test node to both loops
        node->add_edge(mm, new_header); // fall through edge (first edge)
        node->add_edge(mm, old_header); // target
        return node;
    } 
    
    // the next candidate needs a fresh block if the current one already branches
    if (end_of_bb)
        node = add_one_new_node(fg, exprs, node, old_header, new_header);

    return build_IR_for_test_node(node, old_header, new_header, next_bnd, exit,
                               next_bnd_cand, loop_invariant, iinc_and_bnd,inc);
}

//
// Walk the "Green" nodes reachable from node through out edges, recolouring
// each one "Red" as it is visited.  In every visited node, a compare inst
// that generates a branch, is found in bound_set and has its bit set in bnd
// is marked dead.
//
void Bound_Elim::remove_bound_inst(Cfg_Node *node, Bit_Vector* bnd, Bound_Set& bound_set) {
    // only Green nodes are processed; anything else stops the walk
    if (node->mark() != 'G')
        return;
    node->set_mark('R'); // newly created blocks are marked as "Red"

    Inst *list_head = node->IR_instruction_list();
    Inst *cur = list_head->next();
    while (cur != list_head) {
        if (cur->is_compare() && ((Compare_Inst*)cur)->gen_branch) {
            int slot = bound_set.lookup_elem(cur->exp);
            if (slot != INVALID_ENTRY && bnd->is_set(slot))
                cur->mark_dead();
        }
        cur = cur->next();
    }

    // continue with the successors
    Cfg_Int succ = 0;
    while (succ < node->out_edge_size()) {
        remove_bound_inst(node->out_edges(succ), bnd, bound_set);
        succ++;
    }
}

//
// Starting at node, follow out edges through every node whose mark is still
// ' '.  Each such node is marked "Green" and its latest_traversal is reset
// to 0.  Nodes carrying any other mark stop the walk.
//
void set_traversal_num(Cfg_Node *node) {
    if (node->mark() == ' ') {
        node->set_mark('G'); // newly created blocks are marked as "Green"
        node->latest_traversal = 0;

        Cfg_Int succ = 0;
        while (succ < node->out_edge_size()) {
            set_traversal_num(node->out_edges(succ));
            succ++;
        }
    }
}

//
// Generate the test code in front of the cloned loop, build the IR of the
// cloned loop itself, remove the bound checks recorded in
// lbc.bnd[new_header->label] from it, and finally add compensation code
// around the original loop.
//
void Bound_Elim::build_IR_for_cloned_loop(Cfg_Node *node, 
                                          Cfg_Node *old_header,
                                          Cfg_Node *new_header,
                                          Inst_Exp *bnd,
                                          Inst_Exp *exit,
                                          Exp      *exit_var,
                                          unsigned next_bnd_cand,
                                          bool loop_invariant,
                                          bool iinc_and_bnd,
                                          int  inc) {
    //
    // node starts with its own, fresh Live_LCSE which also becomes the
    // current one of exprs
    //
    Live_LCSE *fresh_lcse = new (fg->mem_manager) Live_LCSE(exprs.lcse_pool);
    node->set_live_lcse(fresh_lcse);
    exprs.set_live_lcse(fresh_lcse);

    //
    // generate a.length >= x; the returned node is the one that enters
    // the cloned loop
    //
    Cfg_Node *loop_entry =
        build_IR_for_test_node(node, old_header, new_header, bnd, exit,
                               next_bnd_cand, loop_invariant, iinc_and_bnd, inc);

    //
    // prepare the traversal of the newly cloned loop; the current traversal
    // number is remembered so that it can be advanced afterwards
    //
    set_traversal_num(new_header);
    unsigned saved_traversal_num = fg->traversal_num();
    fg->set_traversal_num(1);

    //
    // java mimic stack and its initial signature buffer
    //
    Stack mimic_stack(mem, comp_env.max_stack);
    char *stack_sig = (char *)mem.alloc(comp_env.max_stack);

    //
    // kill exps that contain IV so that they are not propagated into the
    // loop, then take a snap shot of live cse at end of bb
    //
    exprs.live_lcse()->kill_lcse_contain(exit_var->kill_id());
    exprs.live_lcse()->snap_shot_of_lcse();

    //
    // build IR for the newly cloned loop
    //
    fg->Build_IR_node(new_header, comp_env.comp_handle, exprs, mimic_stack,
                      stack_sig, 0, mem, loop_entry, 1,
                      gc_requires_write_barriers);

    //
    // traverse the newly cloned loop to eliminate bound checking
    //
    remove_bound_inst(new_header, lbc.bnd[new_header->label], lbc.bound_set);

    //
    // locate the in edge of old_header whose source belongs to the old loop
    //
    Cfg_Int edge = 0;
    while (edge < old_header->in_edge_size() &&
           old_header->in_edges(edge)->loop_header != old_header)
        edge++;
    assert(edge != old_header->in_edge_size());

    //
    // add compensation code if necessary
    //
    fg->set_traversal_num(saved_traversal_num + 1);
    add_compensation_code(old_header, old_header->in_edges(edge));
}

//
// Split the edge <pred,succ> with a new node and fill that node with
// compensation code.  compensation holds next_entry pointers organised as
// pairs: for each pair (compensation[k], compensation[k+1]) an assignment
// "compensation[k]->dst() = compensation[k+1]->dst()" is emitted.
//
static void insert_node_and_code(Flow_Graph *fg, Cfg_Node *succ, 
                                 Cfg_Node *pred, Inst **compensation, 
                                 int next_entry) {
    assert((next_entry & 1) == 0); // entries come in pairs

    //
    // create the node, redirect <pred,succ> to it and connect it to succ;
    // it shares pred's live lcse
    //
    Mem_Manager& mm = fg->mem_manager;
    Cfg_Node *bridge = new(mm) Cfg_Node(mm, 0, fg, pred->linearization_node());
    pred->replace_edge(mm, succ, bridge);
    bridge->add_edge(mm, succ);
    bridge->set_live_lcse(pred->live_lcse());

    Inst *code_head = bridge->IR_instruction_list();
    int pair = 0;
    while (pair < next_entry) {
        Inst *dst_def = compensation[pair];
        Inst *src_def = compensation[pair+1];
        Operand *src = src_def->dst();
        //
        // The destination must be flagged as having multiple defs before
        // the assignment is created, to avoid mis-calling fold_operand
        // (::sxh 2001.6.27)
        //
        Operand *dst = dst_def->dst();
        if (dst->is_temp_reg())
            ((Temp_Reg*)dst)->set_temp_reg_has_multiple_defs();

        new (mm) Assign_Inst(dst, src, dst_def->exp, code_head);

        if (src->is_temp_reg())
            ((Temp_Reg*)src)->set_global_reg_cand();
        pair += 2;
    }
}

//
// In the following case, we need to insert compensation code to guarantee 
// correctness.  "getfield #2" is a cse in B2.  Building IR for the cloned
// loop generates a new set of temp registers.  We have to insert compensation
// code "t1 = t2" along the edge <B1,B2>.
//
//          |  t1 = getfield #2 |           |  t2 = getfield #2 | B1
//          |   ... t1 ...      |           |   ... t2 ...      | cloned loop
//          +-------------------+           +-------------------+
//                   |          \            /          |
//                   |           \  exit    /           |
//                   |            \        /            |
//                   |      +-------------------+       |
//                   |      |    ... t1 ...     | B2    |
//
// The routine starts at node and walks backwards over in edges until it
// reaches header; nodes already stamped with the current traversal number
// are skipped.  For every successor of a visited node that lies outside the
// loop of header and has exactly two predecessors, the other predecessor is
// examined: if it is a "Red" block, each live cse of the visited node whose
// destination is a temp reg that is a global register allocation candidate,
// and whose expression is also live in the Red block, yields one
// compensation assignment on the edge from the Red block.
//
void Bound_Elim::add_compensation_code(Cfg_Node *header, Cfg_Node *node) {
    if (node->latest_traversal >= fg->traversal_num()) return;
    
    node->latest_traversal = fg->traversal_num();
    //
    // check successors
    //
    Cfg_Int edge;
    for (edge=0; edge < node->out_edge_size(); edge++) {
        Cfg_Node *succ = node->out_edges(edge);
        if (succ->loop_header == header || succ->in_edge_size() != 2) continue;
        //
        // find the pred of succ, check if pred is a block of the cloned loop
        //
        Cfg_Node *pred = (succ->in_edges(0) == node)? succ->in_edges(1) : succ->in_edges(0);
        if (pred->mark() != 'R') continue;
        //
        // create an array for holding insts that need compensation code;
        // each live cse contributes at most one pair of entries
        //
        unsigned capacity = node->live_lcse()->list_size()*2; // in entries
        unsigned sz = capacity*sizeof(Inst*);                 // in bytes
        Mem_Manager tmp_mm(sz);
        Inst **compensation = (Inst**)tmp_mm.alloc(sz);
        unsigned next_entry = 0;
        //
        // check if there is any cse live across block 
        //
        LCSE *lcse_list = node->live_lcse()->lcse_list();
        LCSE *pred_list = pred->live_lcse()->lcse_list();
        LCSE *lcse;
        for (lcse = lcse_list->next(); lcse != lcse_list; lcse = lcse->next()) {
            Inst *i = lcse->live();
            assert(i != NULL);
            //
            // the temp reg holds a cse value
            //
            if (!i->dst()->is_temp_reg() || 
                !((Temp_Reg*)i->dst())->global_reg_alloc_cand()) continue;
            //
            // look for the same expression among pred's live cse
            //
            LCSE *p;
            for (p = pred_list->next(); p != pred_list; p = p->next())
                if (p->exp() == lcse->exp()) break;
            if (p != pred_list) {
                //
                // the pair must fit into the array: the bound is the number
                // of entries, not the number of bytes allocated
                //
                assert(next_entry + 2 <= capacity);
                compensation[next_entry] = i;
                compensation[next_entry+1] = p->live();
                next_entry += 2;
            }
        }

        if (next_entry != 0) // create compensation code if needed
            insert_node_and_code(fg, succ, pred, compensation, next_entry);
    }
    if (node == header) return; // reach the loop header
    //
    // process predecessors of node
    //
    for (edge=0; edge < node->in_edge_size(); edge++) 
        add_compensation_code(header, node->in_edges(edge));
}

//
// return true, if the loop is a good candidate for eliminating bound checking
//
// header must be a loop header.  inc receives the signed per-iteration step
// of the exit variable (positive when incrementing, negative when
// decrementing) whenever an induction assignment for the exit variable is
// found; it is left untouched otherwise, including the case where the
// routine returns true because no such assignment exists.
//
bool Bound_Elim::is_bound_elimination_cand(Cfg_Node *header,
                                           int&  inc) {
    assert(header->loop_header == header); // make sure it is a loop header
    Bit_Vector *bounds = lbc.bnd[header->label];
    //
    // For the time being, we only deal with innermost loops that have at
    // least one bound and a known exit condition
    //
    if (!lbc.inner_loop->is_set(header->label)) return false;
    if (bounds->is_empty()) return false; // no bound
    if (lbc.exit_cond[header->label] == INVALID_ENTRY) return false;
    //
    // the exit expression is a branch on a compare/test whose left operand
    // is the exit variable
    //
    Exp *exit_br = lbc.bound_set.elem(lbc.exit_cond[header->label]);
    assert(exit_br->is_branch_exp());
    Inst_Exp *exit_cmp = (Inst_Exp*)((Inst_Exp*)exit_br)->left_child();
    Exp *exit_var = exit_cmp->left_child();
    //
    // if the ratio of n_inst/n_bnd is above the threshold (30), then we
    // don't want to clone loops because the bound checking code does not
    // dominate the execution of the loop.
    //
    unsigned bound_count = 0;
    unsigned idx;
    for (idx = 0; idx < lbc.bound_set.size(); idx++) {
        if (bounds->is_set(idx))
            bound_count++;
    }
    if (bound_count == 0) return false;
    if (lbc.n_inst[header->label]/bound_count > 30) return false;
    //
    // find out whether "exit_var += c" is an induction assignment in the loop
    //
    Bit_Vector *iv_set = lbc.iv[header->label];
    Inst_Exp *iv_asgn = iv_assignment_exist(iv_set, lbc.bound_set, exit_var);
    if (iv_asgn == NULL) return true;

    //
    // There are several cases in which the cloning transformation is unsafe.
    // For instance, for (i = 2; i < 10; i--)
    //
    Inst_Exp *update = (Inst_Exp*)iv_asgn->rght_child();
    Exp *step = update->rght_child();  // v = v + 2;
    assert(step->is_imm_exp());
    inc = ((int)((Imm_Operand*)((Operand_Exp*)step)->opnd)->imm());

    bool is_inc;
    if (update->op == Exp::Add)
        is_inc = inc > 0;
    else if (update->op == Exp::Sub)
        is_inc = inc < 0;
    else
        is_inc = false;
    //
    // use inc > 0 or inc < 0 to tell incrementing or decrementing
    //
    if (is_inc ? (inc < 0) : (inc > 0))
        inc = -inc;

    bool target_in_loop = lbc.target_within_loop->is_set(header->label) != 0;
    switch ((Exp::Kind)exit_br->op) {
    case Exp::Blt:
    case Exp::Ble:
        // safe when branch target and direction agree
        return target_in_loop == is_inc;
    case Exp::Bgt:
    case Exp::Bge:
        // safe when branch target and direction disagree
        return target_in_loop != is_inc;
    default:
        return false;
    }
}

//
// return true; if we have cloned loops to eliminate bound checking
//
// tail is the tail of a back edge; the loop that is examined is the one of
// tail->loop_header.  Nothing is changed when the loop is not a candidate
// or when no bound candidate can be found for it.
//
bool Bound_Elim::clone_loop(Cfg_Node *tail, Back_Edge& be) {
    Cfg_Node *header = tail->loop_header;
    //
    // every iteration, the iv is incremented by how much.
    // is_bound_elimination_cand() does not write inc when it accepts a loop
    // without an induction assignment, so inc needs a defined value here
    // because it is read further down the call chain.
    //
    int  inc = 0;
    if (!is_bound_elimination_cand(header,inc)) 
        return false;

    //
    // exit expression (i < x)
    //
    assert(lbc.exit_cond[header->label] != INVALID_ENTRY);
    Inst_Exp *exit = (Inst_Exp*)lbc.bound_set.elem(lbc.exit_cond[header->label]);
    Exp *exit_var = ((Inst_Exp*)exit->left_child())->left_child();

    //
    // bound checking expression ( i < a.length)
    //
    unsigned next_bnd_cand = 0;
    bool loop_invariant, iinc_and_bnd;
    Inst_Exp *bnd  = find_next_bnd_cand(header, lbc, exit_var, next_bnd_cand, 
                                        loop_invariant, iinc_and_bnd);
    if (bnd == NULL) return false;
    /*
          +------+            +------+
          |  B1  |            |  B1  |  new_header
          +------+            +------+
          /   |   \               |   \
         /    V   /   ==>         V    \
        / +------+            +------+  \
        | |  B2  |            |  B2  |   \
        | +------+            +------+   |
        |     |                   |      |
         \    V                   |      |
          +------+                |      |
          |  B3  |                |      |
          +------+                V      |
                              +------+   |
                      header  |  B1' |   |
                              +------+   |
                              /   |   \  |
                             /    V   /  |
                            / +------+   |
                            | |  B2' |   |
                            | +------+   /
                            |     |     /
                             \    V    /
                              +------+
                              |  B3  |
                              +------+
    */

    Mem_Manager& mm = fg->mem_manager;
    //
    // new header is on the top
    //
    Cfg_Node *new_header = fg->peel_loop(header, 1, mm, be);
    //
    // create a new cfg node
    //
    Cfg_Node *node = new(mm) Cfg_Node(mm, 0, fg, header->linearization_node()->prev());
    //
    // redirect all edges <X,new_header>  ==> <X,node>
    // each edge is removed from new_header by the replacement, so the loop
    // ends when new_header has no in edge left
    //
    while (new_header->in_edge_size() > 0)
        new_header->in_edges(0)->replace_edge(mm, new_header, node);
    //
    // make sure that header has only two edges (one from the loop tail and
    // one from the peeled loop).
    //
    assert(header->in_edge_size() == 2);
    Cfg_Node *new_tail = header->in_edges(1);
    assert(new_tail->loop_header != header);
    //
    // Let the edge <B2, B1'> become <B2, B1>
    //
    new_tail->replace_edge(mm, header, new_header);
    //
    // indicate the loop won't be executed frequently
    //
    header->set_cold_code();
    //
    // build IR for the newly cloned loop
    //
    build_IR_for_cloned_loop(node, header, new_header, bnd, exit, exit_var,
                             next_bnd_cand, loop_invariant, iinc_and_bnd, inc);
    //
    // create exception node to catch any exceptions within node
    // By doing this, we guarantee precise exception
    //
    maintain_precise_exception(node, header, new_header);
    //
    // mark old loop header cold to indicate that the loop is infrequently
    // executed.  When we do linearization, we will move the loop out of line to
    // improve cache performance.
    //
    header->set_cold_code();
    return true;
}

//
// Reset in lbc.iv[header->label] every induction assignment "v = v + c"
// whose v is not the variable tested by the exit condition of the loop.
// Nothing is done when the loop has no recorded exit condition.
//
static void remove_iv_not_in_exit(Cfg_Node *header, Loop_Bound_Closure& lbc) {
    unsigned short exit_entry = lbc.exit_cond[header->label];
    if (exit_entry == INVALID_ENTRY) return;

    Inst_Exp *exit_br = (Inst_Exp*)lbc.bound_set.elem(exit_entry);
    assert(exit_br->is_branch_exp());
    // left operand of the compare under the exit branch
    Exp *exit_var = ((Inst_Exp*)exit_br->left_child())->left_child();

    Bit_Vector *iv_set = lbc.iv[header->label];
    unsigned count = lbc.bound_set.size();
    unsigned k;
    for (k = 0; k < count; k++) {
        if (!iv_set->is_set(k)) continue;
        // v = v + c;
        Exp *assigned = ((Inst_Exp*)lbc.bound_set.elem(k))->left_child();
        if (assigned != exit_var)
            iv_set->reset(k);
    }
}
//
// create an eh_node for the nodes that check bounds to branch to either
// the old loop or the newly cloned loop.  If any exception occurs within
// those nodes, we immediately go back to the old loop.
//
// The test nodes are visited by following the fall-through edge (out edge 0)
// from node until new_header is reached; every one of them is expected to
// have exactly two out edges.
//
void Bound_Elim::maintain_precise_exception(Cfg_Node *node, 
                                            Cfg_Node *old_header, 
                                            Cfg_Node *new_header) {
    Eh_Node *handler = fg->create_eh_node();
    Mem_Manager& mm = fg->mem_manager;

    Cfg_Node *cur;
    for (cur = node; cur != new_header; ) {
        // add eh edge <cur,handler>
        cur->add_eh_edge(mm, handler);
        // advance along the fall-through edge
        assert(cur->out_edge_size() == 2);
        cur = cur->out_edges(0);
        assert(cur != NULL);
    }

    // the exception handler is the old loop
    handler->add_edge(mm,old_header,NULL,0); // catch throwable
    // insert the new eh node into fg->handlers()
    handler->insert_before(fg->handlers());
}

//
// Walk backwards over in edges from node and return true if no inner loop
// is met on the way.
//   - a node with a smaller loop depth than header ends the walk with true
//   - a node that does not belong to the loop of header (and is not
//     shallower) means an inner loop was found: false
//   - a node of the loop that already carries the current traversal number
//     has been visited: true
// Visited nodes of the loop are stamped with fg->traversal_num().
//
bool is_inner_loop(Flow_Graph *fg, Cfg_Node *header, Cfg_Node *node) {
    if (node->loop_depth() < header->loop_depth())
        return true;
    if (node->loop_header != header) // contain an inner loop
        return false;
    if (node->latest_traversal >= fg->traversal_num()) // already visited
        return true;

    node->latest_traversal = fg->traversal_num();
    //
    // Process this node's predecessors.
    //
    Cfg_Int e;
    for (e = 0; e < node->in_edge_size(); e++) {
        Cfg_Node *pred = node->in_edges(e);
        if (pred->loop_header == node) // skip back edges of inner loops
            continue;
        if (!is_inner_loop(fg, header, pred))
            return false;
    }
    return true;
}

// Flow_Graph::apply callback: colour the node "Black"; the closure is unused.
static void mark_node(Cfg_Node *node, Closure *c) {
    node->set_mark('B');
}
//
// start eliminating bounds checking
//
// tails holds the be.back_edge_count back edge tails; the asserts below
// require them to be ordered by non-decreasing loop depth.  Every pass
// walks the array from the last element to the first.
//
void Bound_Elim::start(Back_Edge& be, Cfg_Node **tails) {
    int idx;
    //
    // mark all cfg nodes "Black" so that we can easily identify what nodes are 
    // newly created
    //
    fg->apply(mark_node,NULL); 

    //
    // pass 1: identify which loops are innermost loops
    //
    fg->set_traversal_num(fg->traversal_num() + 1);
    for (idx = be.back_edge_count -1; idx >= 0; idx--) {
        Cfg_Node *tail = tails[idx];
        assert(tail != NULL && exprs.curr_exp_id() > 0);
        assert(idx == be.back_edge_count -1 || 
               tail->loop_depth() <= tails[idx+1]->loop_depth()); // proper sorted
        if (is_inner_loop(fg, tail->loop_header, tail))
            lbc.inner_loop->set(tail->loop_header->label);
    }

    //
    // pass 2: find out all bound checking compare expressions of the
    // innermost loops
    //
    fg->set_traversal_num(fg->traversal_num() + 1);
    for (idx = be.back_edge_count -1; idx >= 0; idx--) {
        Cfg_Node *tail = tails[idx];
        // loops that are not innermost are left alone
        if (lbc.inner_loop->is_set(tail->loop_header->label)) {
            lbc.iv_bb->reset_all(); // record all i++ 
            lbc.iinc_and_bb->reset_all(); // record all iinc_and 
            init_bound_set(tail,tail->loop_header,tail,tails);
            //
            // remove iv that is not in the exit condition
            //
            remove_iv_not_in_exit(tail->loop_header, lbc);
        }
    }

    //
    // pass 3: compute the invariant bounds of the innermost loops
    //
    fg->set_traversal_num(fg->traversal_num() + 1);
    for (idx = be.back_edge_count -1; idx >= 0; idx--) {
        Cfg_Node *tail = tails[idx];
        Cfg_Node *head = tail->loop_header;
        // loops that are not innermost are left alone
        if (lbc.inner_loop->is_set(head->label)) {
            //
            // find exit cond and the variable it tests (NULL if unknown)
            //
            unsigned short entry = lbc.exit_cond[head->label];
            const bool has_exit = entry != INVALID_ENTRY;
            Exp *exit_var = NULL;
            if (has_exit) {
                Inst_Exp *br = (Inst_Exp*)lbc.bound_set.elem(entry);
                exit_var = ((Inst_Exp*)br->left_child())->left_child();
            }
            //
            // if lbc.iv[head->label]->is_empty(), then we don't need to
            // track if iv increment has been reached or not. Hence, we set
            // it true
            //
            bool iv_reached = lbc.iv[head->label]->is_empty();
            compute_invariant_bounds(tail, head, tail, exit_var, iv_reached);
            //
            // remove bounds that are killed by the loop
            //
            lbc.bnd[head->label]->subtract(lbc.elem_kill[head->label]);
            //
            // drop the exit condition if the exit expression is itself in
            // the kill set of the loop
            //
            if (has_exit && lbc.elem_kill[head->label]->is_set(entry))
                lbc.exit_cond[head->label] = INVALID_ENTRY;
        }
    }

#ifdef TRACE_O3
    //
    // print out bit vector information for debugging
    //
    for (idx = be.back_edge_count-1; idx >= 0; idx--)
        lbc.print_bvs_of_loop(tails[idx]->loop_header);
#endif

    //
    // pass 4: clone loops to eliminate bound checking
    // 
    bool cloned = false;
    for (idx = be.back_edge_count-1; idx >= 0; idx--) {
        if (!clone_loop(tails[idx], be))
            continue;
        fg->set_need_linearization();
        cloned = true;
    }

#ifdef TRACE_O3
    if (cloned) {
        fg->print_cfg(".bound1");
        cout << "@@@ Eliminate bound checking" << endl;
    }
#endif
}


//
// compare two cfg nodes' loop depths
//
static int compare_depth(const void *e1, const void *e2) {
    return (*(Cfg_Node**)e1)->loop_depth() - (*(Cfg_Node**)e2)->loop_depth();
}

//
// Entry point of the bound checking elimination pass.
//
// The pass does nothing when
//   - TURN_OFF_BOUNDS is defined,
//   - the handler list of the flow graph is not empty,
//   - the method has no back edge,
//   - instrumenting is set and no back edge of the profile has an average
//     trip count above 10, or
//   - the Loop_Bound_Closure reports a bad loop structure.
// Otherwise the back edge tails are sorted by loop depth and handed to
// Bound_Elim::start.
//
void bound_checking_elimination(Compilation_Env& comp_env,
                                Flow_Graph *fg, 
                                Expressions& exprs,
                                bool gc_requires_write_barriers) {
#ifdef TURN_OFF_BOUNDS
    return;
#endif // TURN_OFF_BOUNDS
    // give up if the handler list is not empty
    if (fg->handlers()->next() != fg->handlers()) return;

    fg->build_dom_tree();
    fg->find_loop_depths_and_headers();

    Mem_Manager mm(fg->num_fg_nodes * sizeof(void *));
    Back_Edge be(mm);
    //
    // traverse graph looking for back edges using apply.
    //
    fg->apply(find_back_edges, &be);
    // no loop in this method
    if (be.back_edge_count == 0) return; 

    //
    // retrieve profiling information collected by O1
    //
    if (instrumenting) {
        Profile_Rec *prof = o1_method_get_profile_info(comp_env.m_handle);
        uint64 entry_count = (prof->m_policy == prof->m_entry) ? __UINT64_C(1) : 
                                prof->m_policy -  prof->m_entry;
        //
        // look for a back edge whose average trip count exceeds 10; stop at
        // the first one found
        //
        bool hot_loop_found = false;
        unsigned e;
        for (e = 0; e < prof->n_back_edge && !hot_loop_found; e++) {
            uint64 trip_cnt = (recompilation_policy_loop - prof->back_edge[e])/entry_count;
            hot_loop_found = trip_cnt > __UINT64_C(10);
        }
        if (!hot_loop_found) return; 
    }  

    //
    // copy the back edge tails into an array of our own ...
    //
    Cfg_Node **tails = (Cfg_Node**)mm.alloc(be.back_edge_count*sizeof(Cfg_Node*));
    int t;
    for (t = 0; t < be.back_edge_count; t++) 
        tails[t] = be.back_edge_tails[t];
    //
    // ... and sort them in terms of loop depth
    //
    qsort(tails,be.back_edge_count,sizeof(Cfg_Node*),compare_depth);
    //
    // reassign label to blocks so that every BB has a unique label
    //
    int max_label = fg->reassign_label();

#ifdef TRACE_O3
#if 0 // verify linearization
    Cfg_Node_List *last_node = &fg->linear_node_ordering;
    cout << "OLD linear order\n\t";
    Cfg_Node_List *cur_node;
    for (cur_node=last_node->next(); cur_node!=last_node; cur_node=cur_node->next())
        cout << cur_node->node()->label << " ";
    cout << endl;
    fg->linearize();
    cout << "NEW linear order\n\t";
    for (cur_node=last_node->next(); cur_node!=last_node; cur_node=cur_node->next())
        cout << cur_node->node()->label << " ";
    cout << endl;
#endif
    fg->print_cfg(".bound");
#endif
    //
    // decide the maximum number of bound checking and induction assignment
    // that we would like to eliminate.  Right now, we set max_set_size = 64; 
    // Later, we should set these numbers based on the size of loops.
    //
    int max_set_size = 64;
    //
    // create bound set for flow analysis
    //
    Bound_Set bound_set(mm, max_set_size);
    //
    // create bit vectors for data flow analysis
    //
    Loop_Bound_Closure lbc(mm, bound_set, be.back_edge_heads, max_set_size, 
                           max_label, be.back_edge_count);
    //
    // if two loops share the same header, we need to normalize the loops;
    // for now such methods are simply not optimized
    //
    if (lbc.bad_loop_structure) return;

    Bound_Elim bound(mm, exprs, fg, comp_env, lbc, gc_requires_write_barriers);
    bound.start(be, tails);
}
