| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.schema; |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.util.Attribute; |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.AttributeSource.State; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.solr.schema.PreAnalyzedField.ParseResult; |
| import org.apache.solr.schema.PreAnalyzedField.PreAnalyzedParser; |
| |
| /** |
| * Simple plain text format parser for {@link PreAnalyzedField}. |
| * <h2>Serialization format</h2> |
| * <p>The format of the serialization is as follows: |
| * <pre> |
| * content ::= version (stored)? tokens |
| * version ::= digit+ " " |
| * ; stored field value - any "=" inside must be escaped! |
| * stored ::= "=" text "=" |
| * tokens ::= (token ((" ") + token)*)* |
| * token ::= text ("," attrib)* |
| * attrib ::= name '=' value |
| * name ::= text |
| * value ::= text |
| * </pre> |
| * <p>Special characters in "text" values can be escaped |
| * using the escape character \ . The following escape sequences are recognized: |
| * <pre> |
| * "\ " - literal space character |
| * "\," - literal , character |
| * "\=" - literal = character |
| * "\\" - literal \ character |
| * "\n" - newline |
| * "\r" - carriage return |
| * "\t" - horizontal tab |
| * </pre> |
| * Please note that Unicode sequences (e.g. \u0001) are not supported. |
| * <h2>Supported attribute names</h2> |
| * The following token attributes are supported, and identified with short |
| * symbolic names: |
| * <pre> |
| * i - position increment (integer) |
| * s - token offset, start position (integer) |
| * e - token offset, end position (integer) |
 * y - token type (string)
| * f - token flags (hexadecimal integer) |
| * p - payload (bytes in hexadecimal format; whitespace is ignored) |
| * </pre> |
| * Token offsets are tracked and implicitly added to the token stream - |
| * the start and end offsets consider only the term text and whitespace, |
| * and exclude the space taken by token attributes. |
| * <h2>Example token streams</h2> |
| * <pre> |
| * 1 one two three |
| - version 1 |
| - stored: 'null' |
| - tok: '(term=one,startOffset=0,endOffset=3)' |
| - tok: '(term=two,startOffset=4,endOffset=7)' |
| - tok: '(term=three,startOffset=8,endOffset=13)' |
| 1 one two three |
| - version 1 |
| - stored: 'null' |
| - tok: '(term=one,startOffset=0,endOffset=3)' |
| - tok: '(term=two,startOffset=5,endOffset=8)' |
| - tok: '(term=three,startOffset=11,endOffset=16)' |
| 1 one,s=123,e=128,i=22 two three,s=20,e=22 |
| - version 1 |
| - stored: 'null' |
| - tok: '(term=one,positionIncrement=22,startOffset=123,endOffset=128)' |
| - tok: '(term=two,positionIncrement=1,startOffset=5,endOffset=8)' |
| - tok: '(term=three,positionIncrement=1,startOffset=20,endOffset=22)' |
| 1 \ one\ \,,i=22,a=\, two\= |
| |
| \n,\ =\ \ |
| - version 1 |
| - stored: 'null' |
| - tok: '(term= one ,,positionIncrement=22,startOffset=0,endOffset=6)' |
| - tok: '(term=two= |
| |
| |
| ,positionIncrement=1,startOffset=7,endOffset=15)' |
| - tok: '(term=\,positionIncrement=1,startOffset=17,endOffset=18)' |
| 1 ,i=22 ,i=33,s=2,e=20 , |
| - version 1 |
| - stored: 'null' |
| - tok: '(term=,positionIncrement=22,startOffset=0,endOffset=0)' |
| - tok: '(term=,positionIncrement=33,startOffset=2,endOffset=20)' |
| - tok: '(term=,positionIncrement=1,startOffset=2,endOffset=2)' |
| 1 =This is the stored part with \= |
| \n \t escapes.=one two three |
| - version 1 |
| - stored: 'This is the stored part with = |
| \n \t escapes.' |
| - tok: '(term=one,startOffset=0,endOffset=3)' |
| - tok: '(term=two,startOffset=4,endOffset=7)' |
| - tok: '(term=three,startOffset=8,endOffset=13)' |
| 1 == |
| - version 1 |
| - stored: '' |
| - (no tokens) |
| 1 =this is a test.= |
| - version 1 |
| - stored: 'this is a test.' |
| - (no tokens) |
| * </pre> |
| */ |
| public final class SimplePreAnalyzedParser implements PreAnalyzedParser { |
| static final String VERSION = "1"; |
| |
| private static class Tok { |
| StringBuilder token = new StringBuilder(); |
| Map<String, String> attr = new HashMap<>(); |
| |
| public boolean isEmpty() { |
| return token.length() == 0 && attr.size() == 0; |
| } |
| |
| public void reset() { |
| token.setLength(0); |
| attr.clear(); |
| } |
| |
| @Override |
| public String toString() { |
| return "tok='" + token + "',attr=" + attr; |
| } |
| } |
| |
| // parser state |
| private static enum S {TOKEN, NAME, VALUE, UNDEF}; |
| |
| private static final byte[] EMPTY_BYTES = new byte[0]; |
| |
| /** Utility method to convert a hex string to a byte array. */ |
| static byte[] hexToBytes(String hex) { |
| if (hex == null) { |
| return EMPTY_BYTES; |
| } |
| hex = hex.replaceAll("\\s+", ""); |
| if (hex.length() == 0) { |
| return EMPTY_BYTES; |
| } |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(hex.length() / 2); |
| byte b; |
| for (int i = 0; i < hex.length(); i++) { |
| int high = charToNibble(hex.charAt(i)); |
| int low = 0; |
| if (i < hex.length() - 1) { |
| i++; |
| low = charToNibble(hex.charAt(i)); |
| } |
| b = (byte)(high << 4 | low); |
| baos.write(b); |
| } |
| return baos.toByteArray(); |
| } |
| |
| static final int charToNibble(char c) { |
| if (c >= '0' && c <= '9') { |
| return c - '0'; |
| } else if (c >= 'a' && c <= 'f') { |
| return 0xa + (c - 'a'); |
| } else if (c >= 'A' && c <= 'F') { |
| return 0xA + (c - 'A'); |
| } else { |
| throw new RuntimeException("Not a hex character: '" + c + "'"); |
| } |
| } |
| |
| static String bytesToHex(byte bytes[], int offset, int length) { |
| StringBuilder sb = new StringBuilder(); |
| for (int i = offset; i < offset + length; ++i) { |
| sb.append(Integer.toHexString(0x0100 + (bytes[i] & 0x00FF)) |
| .substring(1)); |
| } |
| return sb.toString(); |
| } |
| |
| public SimplePreAnalyzedParser() { |
| |
| } |
| |
| @Override |
| public ParseResult parse(Reader reader, AttributeSource parent) throws IOException { |
| ParseResult res = new ParseResult(); |
| StringBuilder sb = new StringBuilder(); |
| char[] buf = new char[128]; |
| int cnt; |
| while ((cnt = reader.read(buf)) > 0) { |
| sb.append(buf, 0, cnt); |
| } |
| String val = sb.toString(); |
| // empty string - accept even without version number |
| if (val.length() == 0) { |
| return res; |
| } |
| // first consume the version |
| int idx = val.indexOf(' '); |
| if (idx == -1) { |
| throw new IOException("Missing VERSION token"); |
| } |
| String version = val.substring(0, idx); |
| if (!VERSION.equals(version)) { |
| throw new IOException("Unknown VERSION " + version); |
| } |
| val = val.substring(idx + 1); |
| // then consume the optional stored part |
| int tsStart = 0; |
| boolean hasStored = false; |
| StringBuilder storedBuf = new StringBuilder(); |
| if (val.charAt(0) == '=') { |
| hasStored = true; |
| if (val.length() > 1) { |
| for (int i = 1; i < val.length(); i++) { |
| char c = val.charAt(i); |
| if (c == '\\') { |
| if (i < val.length() - 1) { |
| c = val.charAt(++i); |
| if (c == '=') { // we recognize only \= escape in the stored part |
| storedBuf.append('='); |
| } else { |
| storedBuf.append('\\'); |
| storedBuf.append(c); |
| continue; |
| } |
| } else { |
| storedBuf.append(c); |
| continue; |
| } |
| } else if (c == '=') { |
| // end of stored text |
| tsStart = i + 1; |
| break; |
| } else { |
| storedBuf.append(c); |
| } |
| } |
| if (tsStart == 0) { // missing end-of-stored marker |
| throw new IOException("Missing end marker of stored part"); |
| } |
| } else { |
| throw new IOException("Unexpected end of stored field"); |
| } |
| } |
| if (hasStored) { |
| res.str = storedBuf.toString(); |
| } |
| Tok tok = new Tok(); |
| StringBuilder attName = new StringBuilder(); |
| StringBuilder attVal = new StringBuilder(); |
| // parser state |
| S s = S.UNDEF; |
| int lastPos = 0; |
| for (int i = tsStart; i < val.length(); i++) { |
| char c = val.charAt(i); |
| if (c == ' ') { |
| // collect leftovers |
| switch (s) { |
| case VALUE : |
| if (attVal.length() == 0) { |
| throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute."); |
| } |
| if (attName.length() > 0) { |
| tok.attr.put(attName.toString(), attVal.toString()); |
| } |
| break; |
| case NAME: // attr name without a value ? |
| if (attName.length() > 0) { |
| throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value."); |
| } else { |
| // accept missing att name and value |
| } |
| break; |
| case TOKEN: |
| case UNDEF: |
| // do nothing, advance to next token |
| } |
| attName.setLength(0); |
| attVal.setLength(0); |
| if (!tok.isEmpty() || s == S.NAME) { |
| AttributeSource.State state = createState(parent, tok, lastPos); |
| if (state != null) res.states.add(state.clone()); |
| } |
| // reset tok |
| s = S.UNDEF; |
| tok.reset(); |
| // skip |
| lastPos++; |
| continue; |
| } |
| StringBuilder tgt = null; |
| switch (s) { |
| case TOKEN: |
| tgt = tok.token; |
| break; |
| case NAME: |
| tgt = attName; |
| break; |
| case VALUE: |
| tgt = attVal; |
| break; |
| case UNDEF: |
| tgt = tok.token; |
| s = S.TOKEN; |
| } |
| if (c == '\\') { |
| if (s == S.TOKEN) lastPos++; |
| if (i >= val.length() - 1) { // end |
| |
| tgt.append(c); |
| continue; |
| } else { |
| c = val.charAt(++i); |
| switch (c) { |
| case '\\' : |
| case '=' : |
| case ',' : |
| case ' ' : |
| tgt.append(c); |
| break; |
| case 'n': |
| tgt.append('\n'); |
| break; |
| case 'r': |
| tgt.append('\r'); |
| break; |
| case 't': |
| tgt.append('\t'); |
| break; |
| default: |
| tgt.append('\\'); |
| tgt.append(c); |
| lastPos++; |
| } |
| } |
| } else { |
| // state switch |
| if (c == ',') { |
| if (s == S.TOKEN) { |
| s = S.NAME; |
| } else if (s == S.VALUE) { // end of value, start of next attr |
| if (attVal.length() == 0) { |
| throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute."); |
| } |
| if (attName.length() > 0 && attVal.length() > 0) { |
| tok.attr.put(attName.toString(), attVal.toString()); |
| } |
| // reset |
| attName.setLength(0); |
| attVal.setLength(0); |
| s = S.NAME; |
| } else { |
| throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value."); |
| } |
| } else if (c == '=') { |
| if (s == S.NAME) { |
| s = S.VALUE; |
| } else { |
| throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute."); |
| } |
| } else { |
| tgt.append(c); |
| if (s == S.TOKEN) lastPos++; |
| } |
| } |
| } |
| // collect leftovers |
| if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) { |
| // remaining attrib? |
| if (s == S.VALUE) { |
| if (attName.length() > 0 && attVal.length() > 0) { |
| tok.attr.put(attName.toString(), attVal.toString()); |
| } |
| } |
| AttributeSource.State state = createState(parent, tok, lastPos); |
| if (state != null) res.states.add(state.clone()); |
| } |
| return res; |
| } |
| |
| private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) { |
| a.clearAttributes(); |
| CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class); |
| char[] tokChars = state.token.toString().toCharArray(); |
| termAtt.copyBuffer(tokChars, 0, tokChars.length); |
| int tokenStart = tokenEnd - state.token.length(); |
| for (Entry<String, String> e : state.attr.entrySet()) { |
| String k = e.getKey(); |
| if (k.equals("i")) { |
| // position increment |
| int incr = Integer.parseInt(e.getValue()); |
| PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class); |
| posIncr.setPositionIncrement(incr); |
| } else if (k.equals("s")) { |
| tokenStart = Integer.parseInt(e.getValue()); |
| } else if (k.equals("e")) { |
| tokenEnd = Integer.parseInt(e.getValue()); |
| } else if (k.equals("y")) { |
| TypeAttribute type = a.addAttribute(TypeAttribute.class); |
| type.setType(e.getValue()); |
| } else if (k.equals("f")) { |
| FlagsAttribute flags = a.addAttribute(FlagsAttribute.class); |
| int f = Integer.parseInt(e.getValue(), 16); |
| flags.setFlags(f); |
| } else if (k.equals("p")) { |
| PayloadAttribute p = a.addAttribute(PayloadAttribute.class); |
| byte[] data = hexToBytes(e.getValue()); |
| if (data != null && data.length > 0) { |
| p.setPayload(new BytesRef(data)); |
| } |
| } else { |
| // unknown attribute |
| } |
| } |
| // handle offset attr |
| OffsetAttribute offset = a.addAttribute(OffsetAttribute.class); |
| offset.setOffset(tokenStart, tokenEnd); |
| State resState = a.captureState(); |
| a.clearAttributes(); |
| return resState; |
| } |
| |
| @Override |
| public String toFormattedString(Field f) throws IOException { |
| StringBuilder sb = new StringBuilder(); |
| sb.append(VERSION + " "); |
| if (f.fieldType().stored()) { |
| String s = f.stringValue(); |
| if (s != null) { |
| // encode the equals sign |
| s = s.replaceAll("=", "\\="); |
| sb.append('='); |
| sb.append(s); |
| sb.append('='); |
| } |
| } |
| TokenStream ts = f.tokenStreamValue(); |
| if (ts != null) { |
| StringBuilder tok = new StringBuilder(); |
| boolean next = false; |
| while (ts.incrementToken()) { |
| if (next) { |
| sb.append(' '); |
| } else { |
| next = true; |
| } |
| tok.setLength(0); |
| Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator(); |
| String cTerm = null; |
| String tTerm = null; |
| while (it.hasNext()) { |
| Class<? extends Attribute> cl = it.next(); |
| Attribute att = ts.getAttribute(cl); |
| if (att == null) { |
| continue; |
| } |
| if (cl.isAssignableFrom(CharTermAttribute.class)) { |
| CharTermAttribute catt = (CharTermAttribute)att; |
| cTerm = escape(catt.buffer(), catt.length()); |
| } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) { |
| TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att; |
| char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray(); |
| tTerm = escape(tTermChars, tTermChars.length); |
| } else { |
| if (tok.length() > 0) tok.append(','); |
| if (cl.isAssignableFrom(FlagsAttribute.class)) { |
| tok.append("f=").append(Integer.toHexString(((FlagsAttribute) att).getFlags())); |
| } else if (cl.isAssignableFrom(OffsetAttribute.class)) { |
| tok.append("s=").append(((OffsetAttribute) att).startOffset()).append(",e=").append(((OffsetAttribute) att).endOffset()); |
| } else if (cl.isAssignableFrom(PayloadAttribute.class)) { |
| BytesRef p = ((PayloadAttribute)att).getPayload(); |
| if (p != null && p.length > 0) { |
| tok.append("p=").append(bytesToHex(p.bytes, p.offset, p.length)); |
| } else if (tok.length() > 0) { |
| tok.setLength(tok.length() - 1); // remove the last comma |
| } |
| } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) { |
| tok.append("i=").append(((PositionIncrementAttribute) att).getPositionIncrement()); |
| } else if (cl.isAssignableFrom(TypeAttribute.class)) { |
| tok.append("y=").append(escape(((TypeAttribute) att).type())); |
| } else { |
| |
| tok.append(cl.getName()).append('=').append(escape(att.toString())); |
| } |
| } |
| } |
| String term = null; |
| if (cTerm != null) { |
| term = cTerm; |
| } else { |
| term = tTerm; |
| } |
| if (term != null && term.length() > 0) { |
| if (tok.length() > 0) { |
| tok.insert(0, term + ","); |
| } else { |
| tok.insert(0, term); |
| } |
| } |
| sb.append(tok); |
| } |
| } |
| return sb.toString(); |
| } |
| |
| String escape(String val) { |
| return escape(val.toCharArray(), val.length()); |
| } |
| |
| String escape(char[] val, int len) { |
| if (val == null || len == 0) { |
| return ""; |
| } |
| StringBuilder sb = new StringBuilder(); |
| for (int i = 0; i < len; i++) { |
| switch (val[i]) { |
| case '\\' : |
| case '=' : |
| case ',' : |
| case ' ' : |
| sb.append('\\'); |
| sb.append(val[i]); |
| break; |
| case '\n' : |
| sb.append('\\'); |
| sb.append('n'); |
| break; |
| case '\r' : |
| sb.append('\\'); |
| sb.append('r'); |
| break; |
| case '\t' : |
| sb.append('\\'); |
| sb.append('t'); |
| break; |
| default: |
| sb.append(val[i]); |
| } |
| } |
| return sb.toString(); |
| } |
| |
| } |