/*
 * (C) Copyright IBM Corp. 1998  All rights reserved.
 *
 * US Government Users Restricted Rights Use, duplication or
 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
 *
 * The program is provided "as is" without any warranty express or
 * implied, including the warranty of non-infringement and the implied
 * warranties of merchantibility and fitness for a particular purpose.
 * IBM will not be liable for any damages suffered by you as a result
 * of using the Program. In no event will IBM be liable for any
 * special, indirect or consequential damages or lost profits even if
 * IBM has been advised of the possibility of their occurrence. IBM
 * will not be liable for any third party claims against you.
 */

package com.ibm.xml.omake;

import java.util.Enumeration;
import java.util.Vector;

/**
 * Regular expression matching using a non-deterministic finite automaton (NFA).
 *
 * <P>Special characters are `<KBD>. * + ? [ ( ) | \</KBD>'.</P>
 * <DL>
 *   <DT><STRONG><KBD>.</KBD></STRONG>
 *   <DD>Matches any one character.
 *   <DT><STRONG><KBD>[<VAR>s</VAR>-<VAR>e</VAR>]</KBD></STRONG>
 *   <DD>...
 *   <DT><STRONG><KBD>[^<VAR>s</VAR>-<VAR>e</VAR>]</KBD></STRONG>
 *   <DD>...
 *   <DT><STRONG><KBD><VAR>X</VAR>|<VAR>Y</VAR></KBD></STRONG>
 *   <DD>...
 *   <DT><STRONG><KBD><VAR>X</VAR>*</KBD></STRONG>
 *   <DD>...
 *   <DT><STRONG><KBD><VAR>X</VAR>+</KBD></STRONG>
 *   <DD>...
 *   <DT><STRONG><KBD><VAR>X</VAR>?</KBD></STRONG>
 *   <DD>...
 *   <DT><STRONG><KBD>(<VAR>X</VAR>)</KBD></STRONG>
 *   <DD>Grouping. "<KBD>fo(o|b)ar</KBD>" matches "<KBD>fooar</KBD>" or "<KBD>fobar</KBD>".
 *     Without parentheses, "<KBD>foo|bar</KBD>" matches "<KBD>foo</KBD>" or "<KBD>bar</KBD>",
 *     because <KBD>|</KBD> binds more loosely than concatenation.
 *   <DT>a non-special character
 *   <DD>Matches the character.
 *   <DT><STRONG><KBD>\</KBD></STRONG> + a special character
 *   <DD>...
 * </DL>
 *
 * <P>Parentheses make groups in a regular expression, and applications can know
 * where in target text each group matched with <CODE>getMatchedBeginning() getMatchedEnd()
 * getMatchedString()</CODE> after <CODE>match() exactMatch()</CODE>.
 * The 0th group means whole of this regular expression.
 * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.</P>
 * <DL>
 *   <DT>For instance, a regular expression is
 *   "<FONT color=blue><KBD> *([^&lt;:]*) *&lt;([^&gt;]*)&gt; *</KBD></FONT>"
 *   and target text is
 *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
 *   <DD>getMatchedString(0):
 *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
 *   <DD>getMatchedString(1): "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
 *   <DD>getMatchedString(2): "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
 * </DL>
 *
 * @version Revision: 49 1.2 src/com/ibm/xml/omake/Regexp.java, xml4jsrc, xml4j-jtcsv, xml4j_1_1_16 
 * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
 */
public class Regexp implements java.io.Serializable {

    // ================================================================

    /**
     * Node of the syntax tree built by the regular expression parser.
     *
     * <P>The <CODE>type</CODE> field selects which of the other fields are
     * meaningful; see the comment next to each field.</P>
     */
    static class Token implements java.io.Serializable {
        static final int CHAR = 0;              // a literal character
        static final int CONCAT = 1;            // XY
        static final int UNION = 2;             // X|Y|Z
        static final int CLOSURE = 3;           // X*  (X+ and X? are expressed with CONCAT/UNION)
        static final int RANGE = 4;             // . [a-zA-Z] etc.
        static final int NRANGE = 5;            // [^a-zA-Z] etc.
        static final int PAREN = 6;             // (X)
        static final int EMPTY = 7;             // matches the empty string
        int type;

        int chardata;                           // CHAR

        static final int PERIOD_START = 1;      // '.' is stored as the single range
        static final int PERIOD_END = 0xfffe;   //   PERIOD_START..PERIOD_END
        int[] ranges;                           // RANGE/NRANGE: flattened (start, end) pairs
        int parennumber;                        // PAREN: group number
        Token child;                            // CONCAT/CLOSURE/PAREN
        Token child2;                           // CONCAT
        Vector children;                        // UNION

        Token(int type) {                       // EMPTY or RANGE/NRANGE or UNION
            this.type = type;
        }
        Token(int type, Token tok) {            // CLOSURE
            this.type = type;
            this.child = tok;
        }
        Token(int type, Token tok, int pnumber) { // PAREN
            this.type = type;
            this.child = tok;
            this.parennumber = pnumber;
        }
        Token(Token tok1, Token tok2) {         // CONCAT
            this.type = CONCAT;
            this.child = tok1;
            this.child2 = tok2;
        }
        Token(int type, int ch) {               // CHAR
            this.type = type;
            this.chardata = ch;
        }

        /**
         * Appends an alternative to a UNION token, creating the child list on first use.
         */
        void addElement(Token tok) {
            if (this.children == null)  this.children = new Vector();
            this.children.addElement(tok);
        }

        /**
         * Appends one (start, end) pair to <CODE>ranges</CODE>.
         * The two bounds are stored in ascending order whichever way they are passed.
         */
        void addRange(int start, int end) {
            int pos = 0;
            if (this.ranges == null) {
                this.ranges = new int[2];
            } else {
                pos = this.ranges.length;
                int[] grown = new int[pos+2];
                System.arraycopy(this.ranges, 0, grown, 0, pos);
                this.ranges = grown;
            }
            this.ranges[pos] = Math.min(start, end);
            this.ranges[pos+1] = Math.max(start, end);
        }

        /**
         * Tells whether this token prints as a single atom, i.e. whether a
         * postfix operator written right after its text applies to all of it.
         */
        private boolean isAtom() {
            return this.type == CHAR || this.type == RANGE
                || this.type == NRANGE || this.type == PAREN;
        }

        /**
         * Formats <CODE>ranges</CODE> as a bracket expression.
         *
         * @param open The opening text, "[" or "[^".
         */
        private String rangesToString(String open) {
            StringBuffer sb = new StringBuffer(open);
            for (int i = 0;  i < this.ranges.length;  i += 2) {
                sb.append((char)this.ranges[i]);
                if (this.ranges[i] != this.ranges[i+1]) {
                    sb.append('-');
                    sb.append((char)this.ranges[i+1]);
                }
            }
            sb.append(']');
            return sb.toString();
        }

        /**
         * Returns regular expression text for this subtree.
         *
         * <P>The "X+" and "X?" shorthands are recognised from the tree shapes
         * CONCAT(X, CLOSURE(X)) and UNION(EMPTY, X). The "?" shorthand is used
         * only when X is a single atom; otherwise the union is written out with
         * "|" so that the text keeps the meaning of the tree
         * (UNION(EMPTY, "ab") prints as "|ab", not as "ab?").</P>
         */
        public String toString() {
            String ret = "";
            switch (this.type) {
              case CHAR:
                if (0 <= "|*+?().[\\".indexOf(this.chardata)) {
                    ret = "\\"+(char)this.chardata; // escape special characters
                } else {
                    ret = ""+(char)this.chardata;
                }
                break;

              case CONCAT:
                // Identity comparison: X+ shares one X between both halves.
                if (this.child2.type == CLOSURE && this.child2.child == this.child) {
                    ret = this.child.toString()+"+";
                } else {
                    ret = this.child.toString()+this.child2.toString();
                }
                break;

              case UNION:
                if (this.children.size() == 2
                    && ((Token)this.children.elementAt(0)).type == EMPTY
                    && ((Token)this.children.elementAt(1)).isAtom()) {
                    ret = this.children.elementAt(1).toString()+"?";
                } else {
                    Enumeration en = this.children.elements();
                    StringBuffer sb = new StringBuffer();
                    sb.append(en.nextElement().toString());
                    while (en.hasMoreElements()) {
                        sb.append('|');
                        sb.append(en.nextElement().toString());
                    }
                    ret = sb.toString();
                }
                break;

              case CLOSURE:
                ret = this.child.toString()+"*";
                break;

              case RANGE:
                if (this.ranges.length == 2
                    && this.ranges[0] == PERIOD_START && this.ranges[1] == PERIOD_END) {
                    ret = ".";
                } else {
                    ret = rangesToString("[");
                }
                break;

              case NRANGE:
                ret = rangesToString("[^");
                break;

              case PAREN:
                ret = "("+this.child.toString()+")";
                break;

              case EMPTY:
                break;
            }
            return ret;
        }
    }

    /**
     * Recursive-descent parser that turns the text of a regular expression
     * into a tree of <CODE>Token</CODE>s.
     *
     * <P>The parser keeps a one-token lookahead: <CODE>next()</CODE> scans the
     * following token, <CODE>read()</CODE> returns its kind, and
     * <CODE>chardata</CODE> holds its character. <CODE>parennumber</CODE> is
     * the number that the next '(' will receive; it starts at 1, so after
     * parsing it equals the number of '(' seen plus one.</P>
     */
    static class RegexpParser {
        private static final String className = "com.ibm.xml.omake.Regexp.RegexpParser";
        int offset;                             // index of the first character not scanned yet
        String regexp;

        RegexpParser(String regexp) {
            this.regexp = regexp;
            offset = 0;
        }

        /** Returns the part of the expression that has not been scanned yet. */
        private String left() {
            return this.regexp.substring(this.offset);
        }

        /**
         * Parses the whole expression.
         *
         * @return Root of the syntax tree.
         * @exception RegexpParseException The expression is malformed; this
         *   includes a ')' that has no matching '('.
         */
        Token parse() throws RegexpParseException {
            next();
            Token ret = parseRegexp();
            // parseRegexp() stops at EOF or at a ')' it does not own.  Testing
            // the lookahead (instead of comparing offset with the length) also
            // rejects a stray ')' that is the very last character.
            if (read() != T_EOF)
                throw new RegexpParseException(className+"#parse(): Wrong character: "
                                               +this.regexp.substring(this.offset-1));
            return ret;
        }

        static final int T_CHAR = 0;
        static final int T_EOF = 1;
        static final int T_OR = 2;
        static final int T_STAR = 3;
        static final int T_PLUS = 4;
        static final int T_QUESTION = 5;
        static final int T_LPAREN = 6;
        static final int T_RPAREN = 7;
        static final int T_DOT = 8;
        static final int T_LBRACKET = 9;

        int chardata;                           // character of the lookahead token, -1 at EOF
        int nexttoken;                          // kind of the lookahead token (T_*)
        static final int S_NORMAL = 0;          // special characters are recognised
        static final int S_NONE = 1;            // every character is T_CHAR (inside [...])
        int syntax = S_NORMAL;
        int parennumber = 1;

        private void setSyntax(int syn) {
            this.syntax = syn;
        }

        /** Returns the kind of the lookahead token without consuming it. */
        int read() {
            return this.nexttoken;
        }

        /**
         * Scans the next token into <CODE>nexttoken</CODE>/<CODE>chardata</CODE>.
         * In S_NORMAL mode a backslash makes the following character a plain
         * T_CHAR; a backslash at the very end raises RegexpParseException.
         */
        void next() {
            int ret;
            if (this.offset < this.regexp.length()) {
                int ch = this.regexp.charAt(this.offset++);
                this.chardata = ch;
                if (this.syntax == S_NORMAL) {
                    switch (ch) {
                      case '|':  ret = T_OR;        break;
                      case '*':  ret = T_STAR;      break;
                      case '+':  ret = T_PLUS;      break;
                      case '?':  ret = T_QUESTION;  break;
                      case '(':  ret = T_LPAREN;    break;
                      case ')':  ret = T_RPAREN;    break;
                      case '.':  ret = T_DOT;       break;
                      case '[':  ret = T_LBRACKET;  break;
                      case '\\':
                        if (this.offset < this.regexp.length()) {
                            ret = T_CHAR;
                            this.chardata = this.regexp.charAt(this.offset++);
                        } else {
                            throw new RegexpParseException(className+"#next(): 1 character is required after \\.");
                        }
                        break;
                      default:
                        ret = T_CHAR;
                    }
                } else {
                    ret = T_CHAR;
                }
            } else {
                ret = T_EOF;
                this.chardata = -1;
            }
            this.nexttoken = ret;
        }

        /**
         * regexp ::= term (`|` term)*
         * term ::= factor +
         * factor ::= atom ('*' | '+' | '?')?
         * atom ::= ( char | '.' | range | nrange | '(' regexp ')' )
         */
        Token parseRegexp() throws RegexpParseException {
            Token tok = parseTerm();
            Token parent = null;
            while (read() == T_OR) {
                next();                          // '|'
                if (parent == null) {
                    // First '|': wrap what was parsed so far in a UNION.
                    parent = new Token(Token.UNION);
                    parent.addElement(tok);
                    tok = parent;
                }
                tok.addElement(parseTerm());
            }
            return tok;
        }

        /**
         * term ::= factor+
         * An empty term (directly followed by '|', ')' or EOF) yields an EMPTY token.
         */
        Token parseTerm() throws RegexpParseException {
            int ch = read();
            if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
                return new Token(Token.EMPTY);
            }
            Token tok = parseFactor();
            while ((ch = read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
                tok = new Token(tok, parseFactor());
            }
            return tok;
        }

        /**
         * factor ::= atom ('*' | '+' | '?')?
         */
        Token parseFactor() throws RegexpParseException {
            Token tok = parseAtom();
            switch (read()) {
              case T_STAR:
                next();
                tok = new Token(Token.CLOSURE, tok);
                break;
              case T_PLUS:                      // X+ -> XX*
                next();
                tok = new Token(tok, new Token(Token.CLOSURE, tok));
                break;
              case T_QUESTION:                  // X? -> |X
                next();
                Token par = new Token(Token.UNION);
                par.addElement(new Token(Token.EMPTY));
                par.addElement(tok);
                tok = par;
                break;
            }
            return tok;
        }

        /**
         * atom ::= ( char | '.' | range | nrange | '(' regexp ')' )
         */
        Token parseAtom() throws RegexpParseException {
            int ch = read();
            Token tok = null;
            switch (ch) {
              case T_LPAREN: {
                next();                         // '('
                // Number the group before parsing its body so that groups are
                // numbered in the order of their left parentheses.
                int p = this.parennumber++;
                tok = new Token(Token.PAREN, parseRegexp(), p);
                if (read() != T_RPAREN) throw new RegexpParseException(className+"#parseAtom(): ')' is expected: "+left());
                next();                         // ')'
                break;
              }

              case T_DOT:
                next();                         // '.'
                tok = new Token(Token.RANGE);
                tok.addRange(Token.PERIOD_START, Token.PERIOD_END); // Unicode
                break;

              case T_LBRACKET: {
                setSyntax(S_NONE);              // no special characters inside [...]
                next();                         // '['
                boolean nrange = false;
                if (this.chardata == '^') {
                    nrange = true;
                    next();                     // '^'
                }
                tok = new Token(nrange ? Token.NRANGE : Token.RANGE);
                boolean first = true;           // a leading ']' is an ordinary member
                boolean minusp = false;         // a '-' was seen after `start'
                int start = -1;                 // pending character, -1 if none
                int c;
                while (read() != T_EOF && ((c = this.chardata) != ']' || first)) {
                    next();
                    first = false;
                    switch (c) {
                      case '-':
                        if (start < 0) {
                            start = '-';
                        } else
                            minusp = true;
                        break;

                      default:
                        if (start < 0) {
                            start = c;
                            minusp = false;
                        } else if (!minusp) {   // Single char
                            tok.addRange(start, start);
                            start = c;
                        } else {                // range
                            tok.addRange(start, c);
                            start = -1;
                            minusp = false;
                        }
                    }
                }
                if (start >= 0)
                    tok.addRange(start, start);
                if (minusp)                     // trailing '-' is an ordinary member
                    tok.addRange('-', '-');
                if (read() == T_EOF)
                    throw new RegexpParseException(className+"#parseAtom(): Unexpected end of expression in a range([...]).");

                setSyntax(S_NORMAL);
                next();                         // ']'
                break;
              }

              case T_CHAR:
                tok = new Token(Token.CHAR, this.chardata);
                next();
                break;

              default:
                throw new RegexpParseException(className
                                               +"#parseAtom(): Unexpected special character: code="
                                               +ch+", "+(char)this.chardata);
            }
            return tok;
        }
    }

    // ================================================================

    /**
     * One transition of the NFA, leading to the node <CODE>to</CODE>.
     *
     * <P>A CHAR, RANGE or NRANGE arrow is tested against a character with
     * <CODE>match()</CODE>. An EPSILON arrow carries a group marker in
     * <CODE>regnumber</CODE>: +n for the start of group n, -n for its end,
     * 0 for no marker.</P>
     */
    static class NFAArrow {
        static final int CHAR = 0;              // Single character
        static final int EPSILON = 1;
        static final int RANGE = 2;             // [a-zA-Z]
        static final int NRANGE = 3;            // [^a-zA-Z]
        int type;
        int chardata;                           // when type == CHAR
        int regnumber;                          // when type == EPSILON
                                                // +n : start
                                                // -n : end
        int[] ranges;                           // when type == RANGE/NRANGE
        NFANode to;

        /**
         * Creates a CHAR arrow (<CODE>data</CODE> is the character) or, for any
         * other type, an arrow whose <CODE>regnumber</CODE> is <CODE>data</CODE>.
         */
        NFAArrow(int type, int data, NFANode to) {
            this.type = type;
            this.to = to;
            switch (type) {
              case CHAR:
                this.chardata = data;
                break;
              default:
                this.regnumber = data;
                break;
            }
        }

        /**
         * Creates a RANGE/NRANGE arrow over flattened (start, end) pairs.
         */
        NFAArrow(int type, int[] ranges, NFANode to) {
            this.type = type;
            this.ranges = ranges;
            this.to = to;
        }

        /** Tells whether <CODE>ch</CODE> lies inside one of the (start, end) pairs. */
        private boolean inRanges(int ch) {
            for (int k = 0;  k < this.ranges.length;  k += 2) {
                if (ch >= this.ranges[k] && ch <= this.ranges[k+1])
                    return true;
            }
            return false;
        }

        /**
         * Tells whether this arrow accepts the character <CODE>ch</CODE>.
         * Calling it on an arrow of any other type reports an internal error
         * on System.err and returns false.
         */
        boolean match(int ch) {
            switch (this.type) {
              case CHAR:
                return ch == this.chardata;
              case RANGE:
                return inRanges(ch);
              case NRANGE:
                return !inRanges(ch);
              default:
                System.err.println("NFAArrow#match(): Internal error!");
                return false;
            }
        }
    }

    /**
     * A state of the NFA. The node is itself the Vector holding its outgoing
     * NFAArrow objects, in the order in which they were added.
     */
    static class NFANode extends Vector {
    }

    /**
     * Builds the NFA for the syntax tree <CODE>tok</CODE> and stores its start
     * and terminal nodes. Does nothing when a start node already exists.
     */
    void toNFA(Token tok) {
        if (this.nfastart != null) {
            return;                             // already built
        }
        this.nfastart = new NFANode();
        this.nfaterm = new NFANode();
        toNFA(tok, this.nfastart, this.nfaterm);
    }
    /**
     * Adds to the NFA the nodes and arrows that lead from <CODE>from</CODE> to
     * <CODE>to</CODE> exactly when the input matches the subtree <CODE>tok</CODE>.
     * Arrows are only ever added to <CODE>from</CODE> and to nodes created here.
     */
    void toNFA(Token tok, NFANode from, NFANode to) {
        switch (tok.type) {
          case Token.CHAR: {
            from.addElement(new NFAArrow(NFAArrow.CHAR, tok.chardata, to));
            break;
          }
          case Token.RANGE: {
            from.addElement(new NFAArrow(NFAArrow.RANGE, tok.ranges, to));
            break;
          }
          case Token.NRANGE: {
            from.addElement(new NFAArrow(NFAArrow.NRANGE, tok.ranges, to));
            break;
          }
          case Token.EMPTY: {
            from.addElement(new NFAArrow(NFAArrow.EPSILON, 0, to));
            break;
          }
          case Token.CONCAT: {
            // from --child--> middle --child2--> to
            NFANode middle = new NFANode();
            toNFA(tok.child, from, middle);
            toNFA(tok.child2, middle, to);
            break;
          }
          case Token.UNION: {
            // One epsilon arrow per alternative, in source order.
            for (Enumeration alts = tok.children.elements();  alts.hasMoreElements();  ) {
                NFANode branch = new NFANode();
                from.addElement(new NFAArrow(NFAArrow.EPSILON, 0, branch));
                toNFA((Token)alts.nextElement(), branch, to);
            }
            break;
          }
          case Token.CLOSURE: {
            // from -> loopHead; loopHead --child--> loopTail -> loopHead; loopHead -> to.
            // The exit arrow is added after the body, so the body is tried first.
            NFANode loopHead = new NFANode();
            NFANode loopTail = new NFANode();
            from.addElement(new NFAArrow(NFAArrow.EPSILON, 0, loopHead));
            toNFA(tok.child, loopHead, loopTail);
            loopTail.addElement(new NFAArrow(NFAArrow.EPSILON, 0, loopHead));
            loopHead.addElement(new NFAArrow(NFAArrow.EPSILON, 0, to));
            break;
          }
          case Token.PAREN: {
            // Epsilon arrows tagged +n / -n mark where group n starts and ends.
            NFANode inner = new NFANode();
            NFANode innerEnd = new NFANode();
            from.addElement(new NFAArrow(NFAArrow.EPSILON, tok.parennumber, inner));
            toNFA(tok.child, inner, innerEnd);
            innerEnd.addElement(new NFAArrow(NFAArrow.EPSILON, -tok.parennumber, to));
            break;
          }
        }
    }

    /**
     * Runs the NFA against <CODE>target</CODE> starting at <CODE>offset</CODE>,
     * building the NFA first if that has not happened yet.
     *
     * <P>All recorded group positions are reset to -1 before the attempt, so
     * that a group which does not take part in this match cannot keep offsets
     * left over from an earlier, successful match.</P>
     *
     * @param target Text to match against.
     * @param offset Index in <CODE>target</CODE> where matching starts.
     * @param limit Index in <CODE>target</CODE> at which matching must stop (exclusive).
     * @return End offset of the longest match found, or -1 if there is none.
     */
    int NFAmatch(String target, int offset, int limit) {
        if (this.nfastart == null)  prepare();
        for (int i = 0;  i < this.positions.length;  i ++)
            this.positions[i] = -1;
        //System.err.println("NFAmatch(): start: "+this.nfastart+", term: "+this.nfaterm);
        return NFAmatch0(this.nfastart, target, offset, limit, this.positions);
    }
    /**
     * Recursive backtracking search over the NFA.
     *
     * <P>Every arrow leaving <CODE>node</CODE> is tried, and the result of the
     * arrow that reaches the terminal node at the largest offset wins; on a
     * tie the arrow tried first is kept.</P>
     *
     * NOTE(review): nothing here guards against a cycle of epsilon arrows, so a
     * pattern whose closure body can match the empty string (e.g. "(a*)*")
     * looks like it recurses without bound -- confirm before relying on such
     * patterns.
     *
     * @param node Current NFA node.
     * @param target Text being matched.
     * @param offset Index of the next character of <CODE>target</CODE> to consume.
     * @param limit Index at which matching must stop; no character at or
     *   beyond it is consumed.
     * @param pos Group positions recorded so far. Overwritten with the
     *   positions of the winning path when a match is found, left untouched
     *   otherwise.
     * @return Offset at which the terminal node was reached, or -1.
     */
    int NFAmatch0(NFANode node, String target, int offset, int limit, int[] pos) {
        //System.err.println("NFAmatch0(): node: "+node+", \""+target+"\" offset:"+offset);
        int ret = -1;
        if (node == nfaterm) {
            // Reached the terminal node: the match ends here.
            ret = offset;
        } else if (offset > limit) {
            ret = -1;
        } else {
            // resultpos: positions of the best path found so far.
            // childpos: scratch copy handed to each arrow, so that a failing
            // path cannot disturb `pos'.
            int[] resultpos = (int[])pos.clone();
            int[] childpos = new int[pos.length];
            Enumeration en = node.elements();
            while (en.hasMoreElements()) {
                NFAArrow arr = (NFAArrow)en.nextElement();
                int current = -1;
                System.arraycopy(pos, 0, childpos, 0, pos.length);
                switch (arr.type) {
                  case NFAArrow.CHAR:
                  case NFAArrow.RANGE:
                  case NFAArrow.NRANGE:
                    // Consuming arrows need one more character before the limit.
                    if (offset >= limit) {
                        current = -1;
                    } else if (arr.match(target.charAt(offset)))
                        current = NFAmatch0(arr.to, target, offset+1, limit, childpos);
                    break;

                  case NFAArrow.EPSILON:
                    // +n records the start of group n, -n its end.
                    if (arr.regnumber > 0) {
                        childpos[arr.regnumber*2] = offset;
                    } else if (arr.regnumber < 0) {
                        childpos[(-arr.regnumber)*2+1] = offset;
                    }
                    current = NFAmatch0(arr.to, target, offset, limit, childpos);
                    break;
                }
                // Strictly greater: keep the longest match, first arrow wins ties.
                if (current > ret) {
                    ret = current;
                    System.arraycopy(childpos, 0, resultpos, 0, childpos.length);
                }
            }
            // Publish group positions to the caller only on success.
            if (ret >= 0)
                System.arraycopy(resultpos, 0, pos, 0, pos.length);
        }
        //System.err.println("RET: "+ret);
        return ret;
    }
    
    // ================================================================

    // Text of the regular expression as passed to the constructor.
    String regexp;
    // Parser's group counter after parsing (it starts at 1, so this is the
    // number of '(' plus one for group 0); `positions' holds two slots per group.
    int nofparen;
    // Syntax tree produced by the parser.
    Token tokentree;
    // The fields below are transient; prepare() rebuilds the NFA and the
    // position array the first time NFAmatch() finds nfastart == null.
    transient NFANode nfastart = null;
    transient NFANode nfaterm = null;
    // positions[2*n] / positions[2*n+1]: begin / end offset recorded for group n.
    transient int[] positions;
    // Text passed to the most recent match()/exactMatch() call.
    transient String target;

    /**
     * Builds the NFA from the syntax tree and allocates the group position
     * array (two slots per group), with every slot set to -1.
     */
    void prepare() {
        toNFA(this.tokentree);
        int slots = this.nofparen*2;
        int[] fresh = new int[slots];
        for (int k = slots-1;  k >= 0;  k --) {
            fresh[k] = -1;
        }
        this.positions = fresh;
    }

    /**
     * Constructor. Parses <VAR>regexp</VAR> into a syntax tree and records the
     * number of groups; the NFA itself is not built here.
     *
     * @param regexp A regular expression
     * @exception RegexpParseException <VAR>regexp</VAR> is wrong.
     */
    public Regexp(String regexp) throws RegexpParseException {
        RegexpParser parser = new RegexpParser(regexp);
        Token tree = parser.parse();
        this.regexp = regexp;
        this.tokentree = tree;
        this.nofparen = parser.parennumber;
    }

    /**
     * Represents this instance as a String, produced from the parsed syntax tree.
     */
    public String toString() {
        Token root = this.tokentree;
        return root.toString();
    }

    /**
     * Searches <VAR>target</VAR> for the first position at which this regular
     * expression matches, and records that match as group 0.
     *
     * <P>Start positions are tried from left to right, up to and including
     * the end of <VAR>target</VAR>, so an expression that can match the empty
     * string also matches an empty <VAR>target</VAR> (at offset 0).</P>
     *
     * @param target Text to search.
     * @return Offset of the start position in <VAR>target</VAR>,
     *   or -1 if there is no match.
     */
    public int match(String target) {
        this.target = target;
        int len = target.length();
        // `<=': the position just past the last character is a valid place
        // for an empty match, and it is the only position of an empty target.
        for (int i = 0;  i <= len;  i ++) {
            int end = NFAmatch(target, i, len);
            if (0 <= end) {
                this.positions[0] = i;
                this.positions[1] = end;
                return i;
            }
        }
        return -1;
    }

    /**
     * Tells whether this regular expression matches the whole of
     * <VAR>target</VAR>. On success the match is recorded as group 0.
     *
     * @param target Text to match.
     * @return true if the match starting at offset 0 ends at the end of <VAR>target</VAR>.
     */
    public boolean exactMatch(String target) {
        this.target = target;
        int len = target.length();
        int end = NFAmatch(target, 0, len);
        if (end != len) {
            return false;
        }
        this.positions[0] = 0;
        this.positions[1] = len;
        return true;
    }

    /**
     * Return the start position in the target text recorded for the specified
     * regular expression group.
     * <P>This method doesn't return valid value before calling <CODE>match()/exactMatch()</CODE>.
     *
     * @param index Less than getNumberOfGroups().
     * @return The recorded start offset; a negative value means no position is recorded.
     * @see #getMatchedEnd
     * @see #getMatchedString
     * @see #getNumberOfGroups
     */
    public int getMatchedBeginning(int index) {
        int slot = index*2;
        return this.positions[slot];
    }

    /**
     * Return the end position in the target text recorded for the specified
     * regular expression group.
     * <P>This method doesn't return valid value before calling <CODE>match()/exactMatch()</CODE>.
     *
     * @param index Less than getNumberOfGroups().
     * @return The recorded end offset; a negative value means no position is recorded.
     * @see #getMatchedBeginning
     * @see #getMatchedString
     * @see #getNumberOfGroups
     */
    public int getMatchedEnd(int index) {
        int slot = index*2+1;
        return this.positions[slot];
    }

    /**
     * Return the substring of the target text matched to the specified regular
     * expression group.
     * <P>This method doesn't return valid value before calling <CODE>match()/exactMatch()</CODE>.
     *
     * @param index Less than getNumberOfGroups().
     * @return The matched text, or <CODE>null</CODE> when no position is
     *   recorded for the group (its begin or end offset is negative).
     * @see #getMatchedBeginning
     * @see #getMatchedEnd
     * @see #getNumberOfGroups
     */
    public String getMatchedString(int index) {
        int begin = this.positions[index*2];
        int end = this.positions[index*2+1];
        // A negative offset means the group did not take part in the match;
        // substring() would throw on it.
        if (begin < 0 || end < 0) {
            return null;
        }
        return this.target.substring(begin, end);
    }

    /**
     * Return the number of regular expression groups, counting group 0
     * (the whole expression).
     * This method returns 1 when the regular expression has no parenthesis.
     *
     * @see #getMatchedBeginning
     * @see #getMatchedString
     * @see #getMatchedEnd
     */
    public int getNumberOfGroups() {
        int groups = this.nofparen;
        return groups;
    }

    /**
     * Sample entry.
     * Usage: <KBD>com.ibm.xml.omake.Regexp &lt;regexp&gt; &lt;string&gt;</KBD>
     *
     * <P>Prints the parsed expression, the match position in the string and,
     * for every group, its begin offset, end offset and matched text (or -1
     * when no position is recorded for the group). When fewer than two
     * arguments are given, a usage line is printed to System.err instead.</P>
     */
    public static void main(String[] argv) {
        if (argv.length < 2) {
            System.err.println("Usage: java com.ibm.xml.omake.Regexp <regexp> <string>");
            return;
        }
        try {
            Regexp reg = new Regexp(argv[0]);
            System.out.println("Regexp: "+reg);
            System.out.println("Match position: "+reg.match(argv[1]));
            for (int i = 0;  i < reg.getNumberOfGroups();  i ++) {
                if (i == 0)  System.out.print("To whole pattern: ");
                else System.out.print("["+i+"]: ");
                if (reg.getMatchedBeginning(i) < 0) {
                    System.out.println("-1");
                } else {
                    System.out.print(reg.getMatchedBeginning(i)+", "+reg.getMatchedEnd(i)+", ");
                    System.out.println(reg.getMatchedString(i));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
