| /* |
| * Copyright (C) 2013 The Async HBase Authors. All rights reserved. |
| * This file is part of Async HBase. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * - Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * - Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * - Neither the name of the StumbleUpon nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| package org.hbase.async; |
| |
| import org.jboss.netty.buffer.ChannelBuffer; |
| import org.jboss.netty.util.CharsetUtil; |
| |
| import java.nio.charset.Charset; |
| |
| import org.hbase.async.generated.ComparatorPB; |
| import org.hbase.async.generated.FilterPB; |
| import org.hbase.async.generated.HBasePB; |
| |
| /** |
| * Filters rows based on a regular expression applied to the row key. |
| * <p> |
| * The regular expression will be applied on the server-side, on the row |
| * key. Rows for which the key doesn't match will not be returned to the |
| * scanner, which can be useful to carefully select which rows are matched |
| * when you can't just do a prefix match, and cut down the amount of data |
| * transferred on the network. |
| * <p> |
| * Don't use an expensive regular expression, because Java's implementation |
| * uses backtracking and matching will happen on the server side, potentially |
| * on many many row keys. |
| * See <a href="http://swtch.com/~rsc/regexp/regexp1.html">Regular Expression |
| * Matching Can Be Simple And Fast</a> for more details on regular expression |
| * performance (or lack thereof) and what "backtracking" means. |
| * <p> |
| * This means you need to <strong>be careful</strong> about using regular |
| * expressions supplied by users as that would allow them to easily DDoS |
| * HBase by sending prohibitively expensive regexps that would consume all |
| * CPU cycles and cause the entire HBase node to time out. |
| * @since 1.5 |
| */ |
| public final class KeyRegexpFilter extends ScanFilter { |
| |
| private static final byte[] ROWFILTER = Bytes.ISO88591("org.apache.hadoop" |
| + ".hbase.filter.RowFilter"); |
| private static final byte[] REGEXSTRINGCOMPARATOR = Bytes.ISO88591("org.apache.hadoop" |
| + ".hbase.filter.RegexStringComparator"); |
| private static final byte[] EQUAL = new byte[] { 'E', 'Q', 'U', 'A', 'L' }; |
| |
| private final byte[] regexp; |
| private final byte[] charset; |
| |
| /** |
| * Sets a regular expression to filter results based on the row key. |
| * <p> |
| * This is equivalent to calling {@link #KeyRegexpFilter(String, Charset)} |
| * with the ISO-8859-1 charset in argument. |
| * @param regexp The regular expression with which to filter the row keys. |
| */ |
| public KeyRegexpFilter(final String regexp) { |
| this(regexp, CharsetUtil.ISO_8859_1); |
| } |
| |
| /** |
| * Sets a regular expression to filter results based on the row key. |
| * @param regexp The regular expression with which to filter the row keys. |
| * @param charset The charset used to decode the bytes of the row key into a |
| * string. The RegionServer must support this charset, otherwise it will |
| * unexpectedly close the connection the first time you attempt to use this |
| * scanner. |
| * @see #KeyRegexpFilter(byte[], Charset) |
| */ |
| public KeyRegexpFilter(final String regexp, final Charset charset) { |
| this(Bytes.UTF8(regexp), charset); |
| } |
| |
| /** |
| * Sets a regular expression to filter results based on the row key. |
| * <p> |
| * This is equivalent to calling {@link #KeyRegexpFilter(byte[], Charset)} |
| * with the ISO-8859-1 charset in argument. |
| * @param regexp The binary regular expression with which to filter |
| * the row keys. |
| */ |
| public KeyRegexpFilter(final byte[] regexp) { |
| this(regexp, CharsetUtil.ISO_8859_1); |
| } |
| |
| /** |
| * Sets a regular expression to filter results based on the row key. |
| * @param regexp The regular expression with which to filter the row keys. |
| * @param charset The charset used to decode the bytes of the row key into a |
| * string. The RegionServer must support this charset, otherwise it will |
| * unexpectedly close the connection the first time you attempt to use this |
| * scanner. |
| */ |
| public KeyRegexpFilter(final byte[] regexp, final Charset charset) { |
| this.regexp = regexp; |
| this.charset = Bytes.UTF8(charset.name()); |
| } |
| |
| @Override |
| byte[] serialize() { |
| final ComparatorPB.Comparator.Builder comparator = |
| ComparatorPB.Comparator.newBuilder(); |
| comparator.setNameBytes(Bytes.wrap(REGEXSTRINGCOMPARATOR)); |
| final byte[] regex_cmp = ComparatorPB.RegexStringComparator.newBuilder() |
| .setPatternBytes(Bytes.wrap(regexp)) |
| .setPatternFlags(0) |
| .setCharsetBytes(Bytes.wrap(charset)) |
| .build() |
| .toByteArray(); |
| comparator.setSerializedComparator(Bytes.wrap(regex_cmp)); |
| final FilterPB.CompareFilter cmp = FilterPB.CompareFilter.newBuilder() |
| .setCompareOp(HBasePB.CompareType.EQUAL) |
| .setComparator(comparator.build()) |
| .build(); |
| return FilterPB.RowFilter.newBuilder() |
| .setCompareFilter(cmp) |
| .build() |
| .toByteArray(); |
| } |
| |
| @Override |
| byte[] name() { |
| return ROWFILTER; |
| } |
| |
| @Override |
| void serializeOld(final ChannelBuffer buf) { |
| buf.writeByte((byte) ROWFILTER.length); // 1 |
| buf.writeBytes(ROWFILTER); // 40 |
| // writeUTF of the comparison operator |
| buf.writeShort(5); // 2 |
| buf.writeBytes(EQUAL); // 5 |
| // The comparator: a RegexStringComparator |
| buf.writeByte(54); // Code for WritableByteArrayComparable // 1 |
| buf.writeByte(0); // Code for "this has no code". // 1 |
| buf.writeByte((byte) REGEXSTRINGCOMPARATOR.length); // 1 |
| buf.writeBytes(REGEXSTRINGCOMPARATOR); // 52 |
| // writeUTF the regexp |
| buf.writeShort(regexp.length); // 2 |
| buf.writeBytes(regexp); // regexp.length |
| // writeUTF the charset |
| buf.writeShort(charset.length); // 2 |
| buf.writeBytes(charset); // charset.length |
| } |
| |
| @Override |
| int predictSerializedSize() { |
| return 1 + 40 + 2 + 5 + 1 + 1 + 1 + 52 |
| + 2 + regexp.length + 2 + charset.length; |
| } |
| |
| public String toString() { |
| return "KeyRegexpFilter(\"" + new String(regexp, CharsetUtil.UTF_8) |
| + "\", " + new String(charset, CharsetUtil.UTF_8) + ')'; |
| } |
| |
| } |