| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.util; |
| |
| import java.util.Arrays; |
| import java.util.LinkedList; |
| import java.util.ListIterator; |
| |
| /** |
| * TrieStringMatcher is a base class for simple tree-based string matching. |
| * |
| * This class is thread-safe during string matching but not when adding strings |
| * to the trie. |
| */ |
| public abstract class TrieStringMatcher { |
| protected TrieNode root; |
| |
| protected TrieStringMatcher() { |
| this.root = new TrieNode('\000', false); |
| } |
| |
| /** |
| * Node class for the character tree. |
| */ |
| protected class TrieNode implements Comparable<TrieNode> { |
| protected TrieNode[] children; |
| protected LinkedList<TrieNode> childrenList; |
| protected char nodeChar; |
| protected boolean terminal; |
| |
| /** |
| * Creates a new TrieNode, which contains the given <code>nodeChar</code>. |
| * If <code>isTerminal</code> is <code>true</code>, the new node is a |
| * <em>terminal</em> node in the trie. |
| */ |
| TrieNode(char nodeChar, boolean isTerminal) { |
| this.nodeChar = nodeChar; |
| this.terminal = isTerminal; |
| this.childrenList = new LinkedList<>(); |
| } |
| |
| /** |
| * Returns <code>true</code> if this node is a <em>terminal</em> node in the |
| * trie. |
| */ |
| boolean isTerminal() { |
| return terminal; |
| } |
| |
| /** |
| * Returns the child node of this node whose node-character is |
| * <code>nextChar</code>. If no such node exists, one will be is added. If |
| * <em>isTerminal</em> is <code>true</code>, the node will be a terminal |
| * node in the trie. |
| */ |
| TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) { |
| if (childrenList == null) { |
| childrenList = new LinkedList<>(); |
| childrenList.addAll(Arrays.asList(children)); |
| children = null; |
| } |
| |
| if (childrenList.size() == 0) { |
| TrieNode newNode = new TrieNode(nextChar, isTerminal); |
| childrenList.add(newNode); |
| return newNode; |
| } |
| |
| ListIterator<TrieNode> iter = childrenList.listIterator(); |
| TrieNode node = iter.next(); |
| while ((node.nodeChar < nextChar) && iter.hasNext()) |
| node = iter.next(); |
| |
| if (node.nodeChar == nextChar) { |
| node.terminal = node.terminal | isTerminal; |
| return node; |
| } |
| |
| if (node.nodeChar > nextChar) |
| iter.previous(); |
| |
| TrieNode newNode = new TrieNode(nextChar, isTerminal); |
| iter.add(newNode); |
| return newNode; |
| } |
| |
| /** |
| * Returns the child node of this node whose node-character is |
| * <code>nextChar</code>. If no such node exists, <code>null</code> is |
| * returned. |
| */ |
| TrieNode getChild(char nextChar) { |
| if (children == null) { |
| compile(); |
| } |
| |
| int min = 0; |
| int max = children.length - 1; |
| int mid = 0; |
| while (min < max) { |
| mid = (min + max) / 2; |
| if (children[mid].nodeChar == nextChar) |
| return children[mid]; |
| if (children[mid].nodeChar < nextChar) |
| min = mid + 1; |
| else |
| // if (children[mid].nodeChar > nextChar) |
| max = mid - 1; |
| } |
| |
| if (min == max) |
| if (children[min].nodeChar == nextChar) |
| return children[min]; |
| |
| return null; |
| } |
| |
| public int compareTo(TrieNode other) { |
| if (this.nodeChar < other.nodeChar) |
| return -1; |
| if (this.nodeChar == other.nodeChar) |
| return 0; |
| // if (this.nodeChar > other.nodeChar) |
| return 1; |
| } |
| |
| /** |
| * Prepare node for matching. Note: this method is synchronized because it |
| * may be called concurrently when the trie is used for matching. |
| */ |
| synchronized void compile() { |
| if (childrenList != null) { |
| children = childrenList.toArray(new TrieNode[childrenList.size()]); |
| childrenList = null; |
| Arrays.sort(children); |
| } |
| } |
| } |
| |
| /** |
| * Returns the next {@link TrieNode} visited, given that you are at |
| * <code>node</code>, and the the next character in the input is the |
| * <code>idx</code>'th character of <code>s</code>. |
| */ |
| protected final TrieNode matchChar(TrieNode node, String s, int idx) { |
| return node.getChild(s.charAt(idx)); |
| } |
| |
| /** |
| * Adds any necessary nodes to the trie so that the given <code>String</code> |
| * can be decoded and the last character is represented by a terminal node. |
| * Zero-length <code>Strings</code> are ignored. |
| */ |
| protected final void addPatternForward(String s) { |
| TrieNode node = root; |
| int stop = s.length() - 1; |
| int i; |
| if (s.length() > 0) { |
| for (i = 0; i < stop; i++) |
| node = node.getChildAddIfNotPresent(s.charAt(i), false); |
| node = node.getChildAddIfNotPresent(s.charAt(i), true); |
| } |
| } |
| |
| /** |
| * Adds any necessary nodes to the trie so that the given <code>String</code> |
| * can be decoded <em>in reverse</em> and the first character is represented |
| * by a terminal node. Zero-length <code>Strings</code> are ignored. |
| */ |
| protected final void addPatternBackward(String s) { |
| TrieNode node = root; |
| if (s.length() > 0) { |
| for (int i = s.length() - 1; i > 0; i--) |
| node = node.getChildAddIfNotPresent(s.charAt(i), false); |
| node = node.getChildAddIfNotPresent(s.charAt(0), true); |
| } |
| } |
| |
| /** |
| * Returns true if the given <code>String</code> is matched by a pattern in |
| * the trie |
| */ |
| public abstract boolean matches(String input); |
| |
| /** |
| * Returns the shortest substring of <code>input</code> that is |
| * matched by a pattern in the trie, or <code>null</code> if no match |
| * exists. |
| */ |
| public abstract String shortestMatch(String input); |
| |
| /** |
| * Returns the longest substring of <code>input</code> that is |
| * matched by a pattern in the trie, or <code>null</code> if no match |
| * exists. |
| */ |
| public abstract String longestMatch(String input); |
| |
| } |