/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.schema;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.schema.PreAnalyzedField.ParseResult;
import org.apache.solr.schema.PreAnalyzedField.PreAnalyzedParser;
/**
* Simple plain text format parser for {@link PreAnalyzedField}.
* <h2>Serialization format</h2>
* <p>The format of the serialization is as follows:
* <pre>
* content ::= version (stored)? tokens
* version ::= digit+ " "
* ; stored field value - any "=" inside must be escaped!
* stored ::= "=" text "="
* tokens ::= (token ((" ")+ token)*)*
* token ::= text ("," attrib)*
* attrib ::= name '=' value
* name ::= text
* value ::= text
* </pre>
* <p>Special characters in "text" values can be escaped
* using the escape character \ . The following escape sequences are recognized:
* <pre>
* "\ " - literal space character
* "\," - literal , character
* "\=" - literal = character
* "\\" - literal \ character
* "\n" - newline
* "\r" - carriage return
* "\t" - horizontal tab
* </pre>
* Please note that Unicode sequences (e.g. &#92;u0001) are not supported.
* <h2>Supported attribute names</h2>
* The following token attributes are supported, and identified with short
* symbolic names:
* <pre>
* i - position increment (integer)
* s - token offset, start position (integer)
* e - token offset, end position (integer)
* y - token type (string)
* f - token flags (hexadecimal integer)
* p - payload (bytes in hexadecimal format; whitespace is ignored)
* </pre>
* Token offsets are tracked and implicitly added to the token stream -
* the start and end offsets consider only the term text and whitespace,
* and exclude the space taken by token attributes.
* <h2>Example token streams</h2>
* <pre>
* 1 one two three
* - version 1
* - stored: 'null'
* - tok: '(term=one,startOffset=0,endOffset=3)'
* - tok: '(term=two,startOffset=4,endOffset=7)'
* - tok: '(term=three,startOffset=8,endOffset=13)'
* 1 one  two   three
* - version 1
* - stored: 'null'
* - tok: '(term=one,startOffset=0,endOffset=3)'
* - tok: '(term=two,startOffset=5,endOffset=8)'
* - tok: '(term=three,startOffset=11,endOffset=16)'
* 1 one,s=123,e=128,i=22  two three,s=20,e=22
* - version 1
* - stored: 'null'
* - tok: '(term=one,positionIncrement=22,startOffset=123,endOffset=128)'
* - tok: '(term=two,positionIncrement=1,startOffset=5,endOffset=8)'
* - tok: '(term=three,positionIncrement=1,startOffset=20,endOffset=22)'
* 1 \ one\ \,,i=22,a=\, two\=
* \n,\ =\ \
* - version 1
* - stored: 'null'
* - tok: '(term= one ,,positionIncrement=22,startOffset=0,endOffset=6)'
* - tok: '(term=two=
* ,positionIncrement=1,startOffset=7,endOffset=15)'
* - tok: '(term=\,positionIncrement=1,startOffset=17,endOffset=18)'
* 1 ,i=22 ,i=33,s=2,e=20 ,
* - version 1
* - stored: 'null'
* - tok: '(term=,positionIncrement=22,startOffset=0,endOffset=0)'
* - tok: '(term=,positionIncrement=33,startOffset=2,endOffset=20)'
* - tok: '(term=,positionIncrement=1,startOffset=2,endOffset=2)'
* 1 =This is the stored part with \=
* \n \t escapes.=one two three
* - version 1
* - stored: 'This is the stored part with =
* \n \t escapes.'
* - tok: '(term=one,startOffset=0,endOffset=3)'
* - tok: '(term=two,startOffset=4,endOffset=7)'
* - tok: '(term=three,startOffset=8,endOffset=13)'
* 1 ==
* - version 1
* - stored: ''
* - (no tokens)
* 1 =this is a test.=
* - version 1
* - stored: 'this is a test.'
* - (no tokens)
* </pre>
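* <h2>Usage example</h2>
* <p>For illustration, a minimal sketch of driving this parser programmatically
* (the surrounding setup is assumed, not prescribed by this class):
* <pre>
* PreAnalyzedParser parser = new SimplePreAnalyzedParser();
* AttributeSource source = new AttributeSource();
* ParseResult res = parser.parse(new java.io.StringReader("1 =stored= one two,i=3"), source);
* // res.str is "stored"; res.states holds one captured state per token ("one", "two")
* </pre>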
*/
public final class SimplePreAnalyzedParser implements PreAnalyzedParser {
static final String VERSION = "1";
private static class Tok {
StringBuilder token = new StringBuilder();
Map<String, String> attr = new HashMap<>();
public boolean isEmpty() {
return token.length() == 0 && attr.size() == 0;
}
public void reset() {
token.setLength(0);
attr.clear();
}
@Override
public String toString() {
return "tok='" + token + "',attr=" + attr;
}
}
// parser state
private enum S {TOKEN, NAME, VALUE, UNDEF}
private static final byte[] EMPTY_BYTES = new byte[0];
/** Utility method to convert a hex string to a byte array. */
static byte[] hexToBytes(String hex) {
if (hex == null) {
return EMPTY_BYTES;
}
hex = hex.replaceAll("\\s+", "");
if (hex.length() == 0) {
return EMPTY_BYTES;
}
ByteArrayOutputStream baos = new ByteArrayOutputStream(hex.length() / 2);
byte b;
for (int i = 0; i < hex.length(); i++) {
int high = charToNibble(hex.charAt(i));
int low = 0;
if (i < hex.length() - 1) {
i++;
low = charToNibble(hex.charAt(i));
}
b = (byte)(high << 4 | low);
baos.write(b);
}
return baos.toByteArray();
}
static final int charToNibble(char c) {
if (c >= '0' && c <= '9') {
return c - '0';
} else if (c >= 'a' && c <= 'f') {
return 0xa + (c - 'a');
} else if (c >= 'A' && c <= 'F') {
return 0xA + (c - 'A');
} else {
throw new RuntimeException("Not a hex character: '" + c + "'");
}
}
static String bytesToHex(byte[] bytes, int offset, int length) {
StringBuilder sb = new StringBuilder();
for (int i = offset; i < offset + length; ++i) {
// add 0x0100 to the unsigned byte value and drop the leading "1"
// to get a zero-padded, two-digit lowercase hex representation
sb.append(Integer.toHexString(0x0100 + (bytes[i] & 0x00FF)).substring(1));
}
return sb.toString();
}
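// Illustrative behavior of the hex helpers (a sketch, not part of the original code):
//   hexToBytes("0a ff") -> {0x0a, (byte) 0xff}  (embedded whitespace is ignored)
//   hexToBytes("f")     -> {(byte) 0xf0}        (an odd trailing digit becomes the high nibble)
//   bytesToHex(new byte[]{0x0a, (byte) 0xff}, 0, 2) -> "0aff"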
public SimplePreAnalyzedParser() {
}
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
ParseResult res = new ParseResult();
StringBuilder sb = new StringBuilder();
char[] buf = new char[128];
int cnt;
while ((cnt = reader.read(buf)) > 0) {
sb.append(buf, 0, cnt);
}
String val = sb.toString();
// empty string - accept even without version number
if (val.length() == 0) {
return res;
}
// first consume the version
int idx = val.indexOf(' ');
if (idx == -1) {
throw new IOException("Missing VERSION token");
}
String version = val.substring(0, idx);
if (!VERSION.equals(version)) {
throw new IOException("Unknown VERSION " + version);
}
val = val.substring(idx + 1);
// then consume the optional stored part
int tsStart = 0;
boolean hasStored = false;
StringBuilder storedBuf = new StringBuilder();
if (val.length() > 0 && val.charAt(0) == '=') { // guard against "1 " with no content after the version
hasStored = true;
if (val.length() > 1) {
for (int i = 1; i < val.length(); i++) {
char c = val.charAt(i);
if (c == '\\') {
if (i < val.length() - 1) {
c = val.charAt(++i);
if (c == '=') { // we recognize only \= escape in the stored part
storedBuf.append('=');
} else {
storedBuf.append('\\');
storedBuf.append(c);
continue;
}
} else {
storedBuf.append(c);
continue;
}
} else if (c == '=') {
// end of stored text
tsStart = i + 1;
break;
} else {
storedBuf.append(c);
}
}
if (tsStart == 0) { // missing end-of-stored marker
throw new IOException("Missing end marker of stored part");
}
} else {
throw new IOException("Unexpected end of stored field");
}
}
if (hasStored) {
res.str = storedBuf.toString();
}
Tok tok = new Tok();
StringBuilder attName = new StringBuilder();
StringBuilder attVal = new StringBuilder();
// parser state
S s = S.UNDEF;
int lastPos = 0;
for (int i = tsStart; i < val.length(); i++) {
char c = val.charAt(i);
if (c == ' ') {
// collect leftovers
switch (s) {
case VALUE :
if (attVal.length() == 0) {
throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
}
if (attName.length() > 0) {
tok.attr.put(attName.toString(), attVal.toString());
}
break;
case NAME: // attr name without a value ?
if (attName.length() > 0) {
throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
} else {
// accept missing att name and value
}
break;
case TOKEN:
case UNDEF:
// do nothing, advance to next token
}
attName.setLength(0);
attVal.setLength(0);
if (!tok.isEmpty() || s == S.NAME) {
AttributeSource.State state = createState(parent, tok, lastPos);
if (state != null) res.states.add(state.clone());
}
// reset tok
s = S.UNDEF;
tok.reset();
// skip
lastPos++;
continue;
}
StringBuilder tgt = null;
switch (s) {
case TOKEN:
tgt = tok.token;
break;
case NAME:
tgt = attName;
break;
case VALUE:
tgt = attVal;
break;
case UNDEF:
tgt = tok.token;
s = S.TOKEN;
}
if (c == '\\') {
if (s == S.TOKEN) lastPos++;
if (i >= val.length() - 1) { // end
tgt.append(c);
continue;
} else {
c = val.charAt(++i);
switch (c) {
case '\\' :
case '=' :
case ',' :
case ' ' :
tgt.append(c);
break;
case 'n':
tgt.append('\n');
break;
case 'r':
tgt.append('\r');
break;
case 't':
tgt.append('\t');
break;
default:
tgt.append('\\');
tgt.append(c);
lastPos++;
}
}
} else {
// state switch
if (c == ',') {
if (s == S.TOKEN) {
s = S.NAME;
} else if (s == S.VALUE) { // end of value, start of next attr
if (attVal.length() == 0) {
throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
}
if (attName.length() > 0 && attVal.length() > 0) {
tok.attr.put(attName.toString(), attVal.toString());
}
// reset
attName.setLength(0);
attVal.setLength(0);
s = S.NAME;
} else {
throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
}
} else if (c == '=') {
if (s == S.NAME) {
s = S.VALUE;
} else {
throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
}
} else {
tgt.append(c);
if (s == S.TOKEN) lastPos++;
}
}
}
// collect leftovers
if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
// remaining attrib?
if (s == S.VALUE) {
if (attName.length() > 0 && attVal.length() > 0) {
tok.attr.put(attName.toString(), attVal.toString());
}
}
AttributeSource.State state = createState(parent, tok, lastPos);
if (state != null) res.states.add(state.clone());
}
return res;
}
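// For illustration: given the token text "one,i=2,s=10,e=13", the state machine
// above moves UNDEF -> TOKEN while reading "one", then ',' switches to NAME,
// '=' switches to VALUE, and each subsequent ',' commits the (name, value) pair
// and returns to NAME; the leftover block after the loop commits the final pair.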
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
a.clearAttributes();
CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
char[] tokChars = state.token.toString().toCharArray();
termAtt.copyBuffer(tokChars, 0, tokChars.length);
int tokenStart = tokenEnd - state.token.length();
for (Entry<String, String> e : state.attr.entrySet()) {
String k = e.getKey();
if (k.equals("i")) {
// position increment
int incr = Integer.parseInt(e.getValue());
PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
posIncr.setPositionIncrement(incr);
} else if (k.equals("s")) {
tokenStart = Integer.parseInt(e.getValue());
} else if (k.equals("e")) {
tokenEnd = Integer.parseInt(e.getValue());
} else if (k.equals("y")) {
TypeAttribute type = a.addAttribute(TypeAttribute.class);
type.setType(e.getValue());
} else if (k.equals("f")) {
FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
int f = Integer.parseInt(e.getValue(), 16);
flags.setFlags(f);
} else if (k.equals("p")) {
PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
byte[] data = hexToBytes(e.getValue());
if (data != null && data.length > 0) {
p.setPayload(new BytesRef(data));
}
} else {
// unknown attribute
}
}
// handle offset attr
OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
offset.setOffset(tokenStart, tokenEnd);
State resState = a.captureState();
a.clearAttributes();
return resState;
}
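// E.g. a Tok with token "one" and attrs {i=22, s=123, e=128} yields a captured
// state carrying CharTermAttribute "one", PositionIncrementAttribute 22 and
// OffsetAttribute [123, 128]; when "s"/"e" are absent the implicit offsets
// [tokenEnd - token.length(), tokenEnd] apply.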
@Override
public String toFormattedString(Field f) throws IOException {
StringBuilder sb = new StringBuilder();
sb.append(VERSION + " ");
if (f.fieldType().stored()) {
String s = f.stringValue();
if (s != null) {
// escape the equals sign; String.replace performs a literal (non-regex)
// replacement, turning every "=" into "\="
s = s.replace("=", "\\=");
sb.append('=');
sb.append(s);
sb.append('=');
}
}
TokenStream ts = f.tokenStreamValue();
if (ts != null) {
StringBuilder tok = new StringBuilder();
boolean next = false;
while (ts.incrementToken()) {
if (next) {
sb.append(' ');
} else {
next = true;
}
tok.setLength(0);
Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
String cTerm = null;
String tTerm = null;
while (it.hasNext()) {
Class<? extends Attribute> cl = it.next();
Attribute att = ts.getAttribute(cl);
if (att == null) {
continue;
}
if (cl.isAssignableFrom(CharTermAttribute.class)) {
CharTermAttribute catt = (CharTermAttribute)att;
cTerm = escape(catt.buffer(), catt.length());
} else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att;
char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
tTerm = escape(tTermChars, tTermChars.length);
} else {
if (tok.length() > 0) tok.append(',');
if (cl.isAssignableFrom(FlagsAttribute.class)) {
tok.append("f=").append(Integer.toHexString(((FlagsAttribute) att).getFlags()));
} else if (cl.isAssignableFrom(OffsetAttribute.class)) {
tok.append("s=").append(((OffsetAttribute) att).startOffset()).append(",e=").append(((OffsetAttribute) att).endOffset());
} else if (cl.isAssignableFrom(PayloadAttribute.class)) {
BytesRef p = ((PayloadAttribute)att).getPayload();
if (p != null && p.length > 0) {
tok.append("p=").append(bytesToHex(p.bytes, p.offset, p.length));
} else if (tok.length() > 0) {
tok.setLength(tok.length() - 1); // remove the last comma
}
} else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
tok.append("i=").append(((PositionIncrementAttribute) att).getPositionIncrement());
} else if (cl.isAssignableFrom(TypeAttribute.class)) {
tok.append("y=").append(escape(((TypeAttribute) att).type()));
} else {
tok.append(cl.getName()).append('=').append(escape(att.toString()));
}
}
}
String term = null;
if (cTerm != null) {
term = cTerm;
} else {
term = tTerm;
}
if (term != null && term.length() > 0) {
if (tok.length() > 0) {
tok.insert(0, term + ",");
} else {
tok.insert(0, term);
}
}
sb.append(tok);
}
}
return sb.toString();
}
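// Sketch of the expected output shape (hypothetical input): for a stored value
// "a=b" and one token "one" whose stream exposes only CharTermAttribute and an
// OffsetAttribute of [0, 3], this yields
//   1 =a\=b=one,s=0,e=3
// (attribute order follows the stream's attribute iterator).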
String escape(String val) {
return escape(val.toCharArray(), val.length());
}
String escape(char[] val, int len) {
if (val == null || len == 0) {
return "";
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < len; i++) {
switch (val[i]) {
case '\\' :
case '=' :
case ',' :
case ' ' :
sb.append('\\');
sb.append(val[i]);
break;
case '\n' :
sb.append('\\');
sb.append('n');
break;
case '\r' :
sb.append('\\');
sb.append('r');
break;
case '\t' :
sb.append('\\');
sb.append('t');
break;
default:
sb.append(val[i]);
}
}
return sb.toString();
}
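// E.g. escape("a b,c=d") returns "a\ b\,c\=d", and a newline in the input
// becomes the two-character sequence \n, mirroring the escape rules documented
// in the class javadoc.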
}