/*********************************************************************
 *
 *      Copyright (C) 1999-2001 Nathan Fiedler
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * PROJECT:      Utils
 * MODULE:       String Matching
 * FILE:         BoyerMooreMatcher.java
 *
 * AUTHOR:      Nathan Fiedler
 *
 * REVISION HISTORY:
 *      Name    Date            Description
 *      ----    ----            -----------
 *      nf      02/15/99        Initial version
 *      nf      11/11/01        Changed to use CharacterIterator
 *
 * DESCRIPTION:
 *      This file defines the class for performing the Boyer-Moore
 *      string matching algorithm.
 *
 * $Id: BoyerMooreMatcher.java,v 1.2 2001/11/12 02:32:52 nfiedler Exp $
 *
 ********************************************************************/

package com.bluemarsh.util;

import java.text.CharacterIterator;
import java.text.Collator;
import java.text.CollationElementIterator;
import java.text.RuleBasedCollator;
import java.text.StringCharacterIterator;

/**
 * Implements the StringMatcher interface to perform a
 * Boyer-Moore string matching algorithm over collation elements
 * rather than raw characters.
 *
 * <p>Expected call sequence: optionally call ignoreCase(), then
 * init() with the pattern, then find() as often as needed, and
 * finally deinit(). Both init() and find() derive their comparison
 * mask from the current ignoreCase setting, so the setting must not
 * be changed between those two calls.</p>
 *
 * <p>Known limitation: shift distances are counted in collation
 * elements but applied to character offsets. When a single text
 * character yields several collation elements a shift may move
 * further than intended; find() compensates only by always trying
 * the alignment at the very end of the text.</p>
 *
 * @author  Nathan Fiedler
 */
public class BoyerMooreMatcher implements StringMatcher {
    /**
     * Size of the collator element map. We use 256 because it is
     * a reasonable size and collision will not be a problem so
     * long as the shifts are conservative.
     */
    protected static final int MAP_SIZE = 256;
    /**
     * Set to true to ignore case when comparing. Default is false.
     */
    protected boolean ignoreCase;
    /**
     * Number of elements in the pattern string.
     *
     * @see #computeLastOccurrence
     * @see #find
     */
    protected int patternLength;
    /**
     * Array of the (masked) pattern collation elements.
     * Null until init() has been called, and again after deinit().
     *
     * @see #computeLastOccurrence
     * @see #find
     */
    protected int[] patternElements;
    /**
     * Array of the pattern element shifts, indexed by hashOrder().
     *
     * @see #computeLastOccurrence
     * @see #find
     */
    protected int[] elementShifts;
    /**
     * Table of the Boyer-Moore good-suffix values, indexed by the
     * zero-based pattern position at which a mismatch occurred.
     * Each entry is one less than the actual shift; find() adds
     * the missing one.
     *
     * @see #computeGoodSuffix
     */
    protected int[] suffixTable;
    /**
     * Collator used in performing string matches.
     */
    protected RuleBasedCollator coll;

    /**
     * Creates a new BoyerMooreMatcher object to perform the
     * Boyer-Moore string matching algorithm. In order to
     * search a string, the caller must first invoke the
     * init() method with the pattern to look for.
     *
     * <p>The default collator is cast to RuleBasedCollator; if the
     * runtime supplies some other Collator implementation the cast
     * fails with a ClassCastException.</p>
     */
    public BoyerMooreMatcher() {
        coll = (RuleBasedCollator) Collator.getInstance();
    } // BoyerMooreMatcher

    /**
     * Computes the good suffix array values according to the
     * processed pattern elements. Comes from algorithm shown
     * in "Introduction to Algorithms" by Cormen et al., translated
     * to zero-based indices. The stored values are one less than
     * the textbook ones; find() adds one when applying them.
     *
     * <p>An empty pattern yields an empty table.</p>
     */
    protected void computeGoodSuffix() {
        if (patternLength == 0) {
            // Nothing to shift by; avoid indexing element -1 below.
            suffixTable = new int[0];
            return;
        }
        int last = patternLength - 1;

        // Prefix function of the pattern as given.
        int[] pi = computePrefix();

        // Prefix function of the reversed pattern. computePrefix()
        // reads the patternElements field, so swap the reversed copy
        // in temporarily and always put the original back.
        int[] reversed = new int[patternLength];
        for (int i = 0; i < patternLength; i++) {
            reversed[i] = patternElements[last - i];
        }
        int[] saved = patternElements;
        int[] piR;
        patternElements = reversed;
        try {
            piR = computePrefix();
        } finally {
            patternElements = saved;
        }

        // Initialize every entry with the default shift.
        suffixTable = new int[patternLength];
        int val = last - pi[last];
        for (int j = 0; j < patternLength; j++) {
            suffixTable[j] = val;
        }

        // Lower individual entries where a shorter shift is possible.
        for (int l = 0; l < patternLength; l++) {
            int j = last - piR[l];
            int lp = l - piR[l];
            if (suffixTable[j] > lp) {
                suffixTable[j] = lp;
            }
        }
    } // computeGoodSuffix

    /**
     * Initialize the Boyer-Moore shift tables in preparation for
     * searching a string for the given pattern.
     * This is also known as the "last-occurrence-function".
     */
    protected void computeLastOccurrence() {
        // Elements not found in the pattern get the maximum shift
        // distance, which is the full pattern length.
        elementShifts = new int[MAP_SIZE];
        for (int i = 0; i < MAP_SIZE; i++) {
            elementShifts[i] = patternLength;
        }

        // Now compute the shift distances for the elements in the pattern.
        // The "-1" is in the calculation because Java indices are 0-based.
        // Elements that hash to the same slot keep the smaller shift,
        // which keeps the table conservative.
        for (int i = 0; i < patternLength; i++) {
            int index = hashOrder(patternElements[i]);
            elementShifts[index] = Math.min(elementShifts[index],
                                             patternLength - i - 1);
        }
    } // computeLastOccurrence

    /**
     * Computes the prefix array values according to the processed
     * pattern elements. This comes from the Compute-Prefix-Function
     * algorithm in "Introduction to Algorithms" by Cormen et al.
     * Entry q holds the length of the longest proper prefix of the
     * pattern that is also a suffix of elements 0..q.
     *
     * @return  array of prefix values; empty for an empty pattern
     */
    protected int[] computePrefix() {
        int[] pTable = new int[patternLength];
        if (patternLength == 0) {
            return pTable;
        }
        pTable[0] = 0;
        int k = 0;
        for (int q = 1; q < patternLength; q++) {
            while ((k > 0) && (patternElements[k] != patternElements[q])) {
                // Fall back to the next shorter border. With zero-based
                // arrays the border of a prefix of length k is stored at
                // index k - 1; using index k can leave k unchanged and
                // spin forever (e.g. for the elements of "aab").
                k = pTable[k - 1];
            }
            if (patternElements[k] == patternElements[q]) {
                k++;
            }
            pTable[q] = k;
        }
        return pTable;
    } // computePrefix

    /**
     * Uninitialize now that searching is completely finished.
     * Useful if the string matcher needs to close open files
     * or other resources. Here it simply drops the tables built
     * by init(); find() cannot be used again until init() is called.
     */
    public void deinit() {
        elementShifts = null;
        suffixTable = null;
        patternElements = null;
        patternLength = 0;
    } // deinit

    /**
     * Search for the pattern in the given character sequence.
     * The pattern was given in the init() method call.
     * This comes from the February 1999 issue of Java Report, in the
     * article about international text searching. It contains a
     * couple of corrections to allow the implementation to work
     * properly in all situations. See the comments in the code.
     *
     * <p>The search ends with -1 once the alignment at the very end
     * of the text has been tried without success.</p>
     *
     * @param  iter  iterator that provides the character sequence
     *               in which to look for the pattern.
     * @param  patt  pattern string to look for. Only its length is
     *               used here, to choose the first alignment; the
     *               elements compared are those prepared by init().
     * @return  offset into text where pattern was found, or -1
     *          if the pattern could not be found.
     * @throws  IllegalStateException
     *          if init() has not been called, or deinit() has been
     *          called since.
     */
    public int find(CharacterIterator iter, String patt) {
        if (patternElements == null) {
            throw new IllegalStateException(
                "init() must be called before find()");
        }
        int beginIndex = iter.getBeginIndex();
        int endIndex = iter.getEndIndex();
        CollationElementIterator textIter =
            coll.getCollationElementIterator(iter);
        // Anything less than tertiary and the algorithm will be
        // less than optimal if the text contains characters such
        // as spaces. The downside is accented characters will be
        // considered different than their base character.
        // (see also init)
        int mask = getMask(ignoreCase ? Collator.PRIMARY :
                           Collator.TERTIARY);

        // Start at the text position corresponding to the
        // end of the pattern. Note we use the pattern string
        // length, not the number of pattern elements.
        int textIndex;
        if (patt.length() > endIndex - beginIndex) {
            // Can't go past the end, so start at the end.
            textIndex = endIndex;
        } else {
            textIndex = beginIndex + patt.length();
        }

        while (true) {
            // A shift may overshoot the end of the text. Clamp it so the
            // final alignment is still tried, and remember that nothing
            // remains to be tried after it.
            boolean lastAlignment = (textIndex >= endIndex);
            if (lastAlignment) {
                textIndex = endIndex;
            }
            textIter.setOffset(textIndex);

            int matchStart = textIter.getOffset();
            int nextIndex = textIndex + 1;
            int patIndex = patternLength;

            // Iterate backward until we hit the beginning of the pattern.
            while (patIndex > 0) {
                int patElem = patternElements[patIndex - 1];
                if (patElem == 0) {
                    // Pattern element was ignorable; skip it without
                    // consuming any text.
                    patIndex--;
                    continue;
                }

                // Ignorable text elements are skipped by the helper.
                int textElem = previousSignificant(textIter, mask);
                if (textElem == CollationElementIterator.NULLORDER) {
                    // Reached the beginning of the text with pattern
                    // elements still unmatched. This alignment cannot
                    // match, but a later one still might, so move on
                    // by a single position.
                    nextIndex = textIndex + 1;
                    break;
                }
                if (textElem != patElem) {
                    // There is a mismatch at this position. Decide how far
                    // over to shift the pattern, then try again.
                    // Calculate the Bad-Character-Heuristic value.
                    int bch = textIter.getOffset() +
                        elementShifts[hashOrder(textElem)];
                    // Calculate the Good-Suffix-Heuristic value.
                    // It never moves backward, which guarantees that
                    // the search always advances.
                    int gsh = textIndex + suffixTable[patIndex - 1];
                    // Take the larger of the two. Need to add one to
                    // account for zero-based offsets.
                    nextIndex = Math.max(bch, gsh) + 1;
                    break;
                }

                // Elements agree; remember where the match begins so far.
                matchStart = textIter.getOffset();
                patIndex--;
            }

            if (patIndex == 0) {
                // We made it back to the beginning of the pattern,
                // which means we matched it all. Return the location.
                return matchStart;
            }
            if (lastAlignment) {
                // The final possible alignment failed: no match.
                return -1;
            }
            // Still didn't find a match, so keep going.
            textIndex = nextIndex;
        }
    } // find

    /**
     * Steps the iterator backward to the previous collation element
     * that is non-zero after masking.
     *
     * @param  textIter  iterator over the text being searched
     * @param  mask      mask selecting the significant part of an order
     * @return  the masked element, or NULLORDER if the iterator
     *          reported NULLORDER first
     */
    private static int previousSignificant(CollationElementIterator textIter,
                                           int mask) {
        while (true) {
            int order = textIter.previous();
            if (order == CollationElementIterator.NULLORDER) {
                return order;
            }
            int masked = order & mask;
            if (masked != 0) {
                return masked;
            }
            // Text element was ignorable; keep looking.
        }
    } // previousSignificant

    /**
     * Return a mask for the part of the order we're interested in.
     *
     * @param  weight  part of the order we're interested in
     * @return  mask to select only that part of the order
     */
    protected static int getMask(int weight) {
        switch (weight) {
        case Collator.PRIMARY:
            return 0xFFFF0000;
        case Collator.SECONDARY:
            return 0xFFFFFF00;
        default:
            return 0xFFFFFFFF;
        } // switch
    } // getMask

    /**
     * Map a collation element to an array index within the
     * Boyer-Moore shift tables. For now, simply mods the primary
     * order of the element with MAP_SIZE.
     *
     * @param  order  the order of a collation element
     * @return  index into our shift tables
     */
    protected static int hashOrder(int order) {
        return CollationElementIterator.primaryOrder(order) % MAP_SIZE;
    } // hashOrder

    /**
     * Sets this matcher to be case-sensitive or case-insensitive,
     * depending on the argument. Call this before init(), since
     * init() masks the pattern according to the current setting.
     *
     * @param  ignore  true for case-insensitive.
     */
    public void ignoreCase(boolean ignore) {
        ignoreCase = ignore;
    } // ignoreCase

    /**
     * Initialize any tables that are needed for finding a pattern
     * within some unknown text. For example, the Boyer-Moore
     * algorithm will require setting up the last-occurrence and
     * good-suffix tables before searching can be performed.
     * An empty pattern is accepted and produces empty tables.
     *
     * @param  patt  pattern to look for.
     */
    public void init(String patt) {
        // build the B-M shift table
        patternElements = processPattern(patt);
        patternLength = patternElements.length;
        maskPattern(getMask(ignoreCase ? Collator.PRIMARY :
                            Collator.TERTIARY));
        computeLastOccurrence();
        computeGoodSuffix();
    } // init

    /**
     * AND all the pattern elements with the given mask.
     *
     * @param  mask  mask to AND pattern elements with
     */
    protected void maskPattern(int mask) {
        for (int i = 0; i < patternLength; i++) {
            patternElements[i] &= mask;
        }
    } // maskPattern

    /**
     * Analyzes the pattern and builds a pattern elements table
     * and returns it.
     *
     * @param  P  pattern string
     * @return  array of collator elements from the pattern P
     */
    protected int[] processPattern(String P) {
        // First find out how many elements we're dealing with
        int pLen = 0;
        CollationElementIterator iter = coll.getCollationElementIterator(P);
        while (iter.next() != CollationElementIterator.NULLORDER) {
            pLen++;
        }

        // Allocate space to store the pattern elements.
        int[] pElems = new int[pLen];

        // Rewind and save the elements for quick access.
        iter.reset();
        for (int i = 0; i < pLen; i++) {
            pElems[i] = iter.next();
        }
        return pElems;
    } // processPattern

    /**
     * Test wrapper for this class. Tests the find() method
     * with a simple test case to ensure it is basically working.
     *
     * @param  args  array of command-line arguments
     */
    public static void main(String args[]) {
        BoyerMooreMatcher s = new BoyerMooreMatcher();
        String p = "strssng";
        // ae = \u00E6
        // ss = \u00DF
        String tstr = "silly spring str\u00DFng";

        s.init(p);
        int o = s.find(new StringCharacterIterator(tstr), p);
        // Offset zero is a valid match position; only -1 means failure.
        if (o >= 0) {
            System.out.println("found at " + o);
        } else {
            System.out.println("not found");
        }
        s.deinit();
    } // main
} // BoyerMooreMatcher
