mapreduce/src/contrib/sqoop/src/java/org/apache/hadoop/sqoop/lib/RecordParser.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.sqoop.lib;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;

 import org.apache.hadoop.io.Text;

 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.util.ArrayList;
 import java.util.List;

 /**
  * Parses a record containing one or more fields. Fields are separated
  * by some FIELD_DELIMITER character, e.g. a comma or a ^A character.
  * Records are terminated by a RECORD_DELIMITER character, e.g., a newline.
  *
  * Fields may be (optionally or mandatorily) enclosed by a quoting char
  * e.g., '\"'
  *
  * Fields may contain escaped characters. An escape character may be, e.g.,
  * the '\\' character. Any character following an escape character
  * is treated literally. e.g., '\n' is recorded as an 'n' character, not a
  * newline.
  *
  * Unexpected results may occur if the enclosing character escapes itself.
  * e.g., this cannot parse SQL SELECT statements where the single character
  * ['] escapes to [''].
  *
  * This class is not synchronized. Multiple threads must use separate
  * instances of RecordParser.
  *
  * The fields parsed by RecordParser are backed by an internal buffer
  * which is cleared when the next call to parseRecord() is made. If
  * the buffer is required to be preserved, you must copy it yourself.
  */
 public final class RecordParser {

   public static final Log LOG = LogFactory.getLog(RecordParser.class.getName());

   private enum ParseState {
     FIELD_START,
     ENCLOSED_FIELD,
     UNENCLOSED_FIELD,
     ENCLOSED_ESCAPE,
     ENCLOSED_EXPECT_DELIMITER,
     UNENCLOSED_ESCAPE
   }

   public static class ParseError extends Exception {
     public ParseError() {
       super("ParseError");
     }

     public ParseError(final String msg) {
       super(msg);
     }

     public ParseError(final String msg, final Throwable cause) {
       super(msg, cause);
     }

     public ParseError(final Throwable cause) {
       super(cause);
     }
   }

   private char fieldDelim;
   private char recordDelim;
   private char enclosingChar;
   private char escapeChar;
   private boolean enclosingRequired;
   private ArrayList<String> outputs;

   public RecordParser(final char field, final char record, final char enclose,
       final char escape, final boolean mustEnclose) {
     this.fieldDelim = field;
     this.recordDelim = record;
     this.enclosingChar = enclose;
     this.escapeChar = escape;
     this.enclosingRequired = mustEnclose;

     this.outputs = new ArrayList<String>();
   }

   /**
    * Return a list of strings representing the fields of the input line.
    * This list is backed by an internal buffer which is cleared by the
    * next call to parseRecord().
    */
   public List<String> parseRecord(CharSequence input) throws ParseError {
     if (null == input) {
       throw new ParseError("null input string");
     }

     return parseRecord(CharBuffer.wrap(input));
   }

   /**
    * Return a list of strings representing the fields of the input line.
    * This list is backed by an internal buffer which is cleared by the
    * next call to parseRecord().
    */
   public List<String> parseRecord(Text input) throws ParseError {
     if (null == input) {
       throw new ParseError("null input string");
     }

     // TODO(aaron): The parser should be able to handle UTF-8 strings
     // as well, to avoid this transcode operation.
     return parseRecord(input.toString());
   }

   /**
    * Return a list of strings representing the fields of the input line.
    * This list is backed by an internal buffer which is cleared by the
    * next call to parseRecord().
    */
   public List<String> parseRecord(byte [] input) throws ParseError {
     if (null == input) {
       throw new ParseError("null input string");
     }

     return parseRecord(ByteBuffer.wrap(input).asCharBuffer());
   }

   /**
    * Return a list of strings representing the fields of the input line.
    * This list is backed by an internal buffer which is cleared by the
    * next call to parseRecord().
    */
   public List<String> parseRecord(char [] input) throws ParseError {
     if (null == input) {
       throw new ParseError("null input string");
     }

     return parseRecord(CharBuffer.wrap(input));
   }

   public List<String> parseRecord(ByteBuffer input) throws ParseError {
     if (null == input) {
       throw new ParseError("null input string");
     }

     return parseRecord(input.asCharBuffer());
   }

   /**
    * Return a list of strings representing the fields of the input line.
    * This list is backed by an internal buffer which is cleared by the
    * next call to parseRecord().
    */
   public List<String> parseRecord(CharBuffer input) throws ParseError {
     if (null == input) {
       throw new ParseError("null input string");
     }

     /*
       This method implements the following state machine to perform
       parsing.

       Note that there are no restrictions on whether particular characters
       (e.g., field-sep, record-sep, etc) are distinct or the same. The
       state transitions are processed in the order seen in this comment.

       Starting state is FIELD_START
         encloser -> ENCLOSED_FIELD
         escape char -> UNENCLOSED_ESCAPE
         field delim -> FIELD_START (for a new field)
         record delim -> stops processing
         all other letters get added to current field, -> UNENCLOSED FIELD

       ENCLOSED_FIELD state:
         escape char goes to ENCLOSED_ESCAPE
         encloser goes to ENCLOSED_EXPECT_DELIMITER
         field sep or record sep gets added to the current string
         normal letters get added to the current string

       ENCLOSED_ESCAPE state:
         any character seen here is added literally, back to ENCLOSED_FIELD

       ENCLOSED_EXPECT_DELIMITER state:
         field sep goes to FIELD_START
         record sep halts processing.
         all other characters are errors.

       UNENCLOSED_FIELD state:
         ESCAPE char goes to UNENCLOSED_ESCAPE
         FIELD_SEP char goes to FIELD_START
         RECORD_SEP char halts processing
         normal chars or the enclosing char get added to the current string

       UNENCLOSED_ESCAPE:
         add charater literal to current string, return to UNENCLOSED_FIELD
     */

     char curChar = '\000';
     ParseState state = ParseState.FIELD_START;
     int len = input.length();
     StringBuilder sb = null;

     outputs.clear();

     for (int pos = 0; pos < len; pos++) {
       curChar = input.get();
       switch (state) {
       case FIELD_START:
         // ready to start processing a new field.
         if (null != sb) {
           // We finished processing a previous field. Add to the list.
           outputs.add(sb.toString());
         }

         sb = new StringBuilder();
         if (this.enclosingChar == curChar) {
           // got an opening encloser.
           state = ParseState.ENCLOSED_FIELD;
         } else if (this.escapeChar == curChar) {
           state = ParseState.UNENCLOSED_ESCAPE;
         } else if (this.fieldDelim == curChar) {
           // we have a zero-length field. This is a no-op.
         } else if (this.recordDelim == curChar) {
           // we have a zero-length field, that ends processing.
           pos = len;
         } else {
           // current char is part of the field.
           state = ParseState.UNENCLOSED_FIELD;
           sb.append(curChar);

           if (this.enclosingRequired) {
             throw new ParseError("Opening field-encloser expected at position " + pos);
           }
         }

         break;

       case ENCLOSED_FIELD:
         if (this.escapeChar == curChar) {
           // the next character is escaped. Treat it literally.
           state = ParseState.ENCLOSED_ESCAPE;
         } else if (this.enclosingChar == curChar) {
           // we're at the end of the enclosing field. Expect an EOF or EOR char.
           state = ParseState.ENCLOSED_EXPECT_DELIMITER;
         } else {
           // this is a regular char, or an EOF / EOR inside an encloser. Add to
           // the current field string, and remain in this state.
           sb.append(curChar);
         }

         break;

       case UNENCLOSED_FIELD:
         if (this.escapeChar == curChar) {
           // the next character is escaped. Treat it literally.
           state = ParseState.UNENCLOSED_ESCAPE;
         } else if (this.fieldDelim == curChar) {
           // we're at the end of this field; may be the start of another one.
           state = ParseState.FIELD_START;
         } else if (this.recordDelim == curChar) {
           pos = len; // terminate processing immediately.
         } else {
           // this is a regular char. Add to the current field string,
           // and remain in this state.
           sb.append(curChar);
         }

         break;

       case ENCLOSED_ESCAPE:
         // Treat this character literally, whatever it is, and return to enclosed
         // field processing.
         sb.append(curChar);
         state = ParseState.ENCLOSED_FIELD;
         break;

       case ENCLOSED_EXPECT_DELIMITER:
         // We were in an enclosed field, but got the final encloser. Now we expect
         // either an end-of-field or an end-of-record.
         if (this.fieldDelim == curChar) {
           // end of one field is the beginning of the next.
           state = ParseState.FIELD_START;
         } else if (this.recordDelim == curChar) {
           // stop processing.
           pos = len;
         } else {
           // Don't know what to do with this character.
           throw new ParseError("Expected delimiter at position " + pos);
         }

         break;

       case UNENCLOSED_ESCAPE:
         // Treat this character literally, whatever it is, and return to non-enclosed
         // field processing.
         sb.append(curChar);
         state = ParseState.UNENCLOSED_FIELD;
         break;
       }
     }

     if (state == ParseState.FIELD_START && curChar == this.fieldDelim) {
       // we hit an EOF/EOR as the last legal character and we need to mark
       // that string as recorded. This if block is outside the for-loop since
       // we don't have a physical 'epsilon' token in our string.
       if (null != sb) {
         outputs.add(sb.toString());
         sb = new StringBuilder();
       }
     }

     if (null != sb) {
       // There was a field that terminated by running out of chars or an EOR
       // character. Add to the list.
       outputs.add(sb.toString());
     }

     return outputs;
   }


   public boolean isEnclosingRequired() {
     return enclosingRequired;
   }

   @Override
   public String toString() {
     return "RecordParser[" + fieldDelim + ',' + recordDelim + ',' + enclosingChar + ','
         + escapeChar + ',' + enclosingRequired + "]";
   }

   @Override
   public int hashCode() {
     return this.toString().hashCode();
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.sqoop.lib;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;

	import org.apache.hadoop.io.Text;

	import java.nio.ByteBuffer;
	import java.nio.CharBuffer;
	import java.util.ArrayList;
	import java.util.List;

	/**
	* Parses a record containing one or more fields. Fields are separated
	* by some FIELD_DELIMITER character, e.g. a comma or a ^A character.
	* Records are terminated by a RECORD_DELIMITER character, e.g., a newline.
	*
	* Fields may be (optionally or mandatorily) enclosed by a quoting char
	* e.g., '\"'
	*
	* Fields may contain escaped characters. An escape character may be, e.g.,
	* the '\\' character. Any character following an escape character
	* is treated literally. e.g., '\n' is recorded as an 'n' character, not a
	* newline.
	*
	* Unexpected results may occur if the enclosing character escapes itself.
	* e.g., this cannot parse SQL SELECT statements where the single character
	* ['] escapes to [''].
	*
	* This class is not synchronized. Multiple threads must use separate
	* instances of RecordParser.
	*
	* The fields parsed by RecordParser are backed by an internal buffer
	* which is cleared when the next call to parseRecord() is made. If
	* the buffer is required to be preserved, you must copy it yourself.
	*/
	public final class RecordParser {

	public static final Log LOG = LogFactory.getLog(RecordParser.class.getName());

	private enum ParseState {
	FIELD_START,
	ENCLOSED_FIELD,
	UNENCLOSED_FIELD,
	ENCLOSED_ESCAPE,
	ENCLOSED_EXPECT_DELIMITER,
	UNENCLOSED_ESCAPE
	}

	public static class ParseError extends Exception {
	public ParseError() {
	super("ParseError");
	}

	public ParseError(final String msg) {
	super(msg);
	}

	public ParseError(final String msg, final Throwable cause) {
	super(msg, cause);
	}

	public ParseError(final Throwable cause) {
	super(cause);
	}
	}

	private char fieldDelim;
	private char recordDelim;
	private char enclosingChar;
	private char escapeChar;
	private boolean enclosingRequired;
	private ArrayList<String> outputs;

	public RecordParser(final char field, final char record, final char enclose,
	final char escape, final boolean mustEnclose) {
	this.fieldDelim = field;
	this.recordDelim = record;
	this.enclosingChar = enclose;
	this.escapeChar = escape;
	this.enclosingRequired = mustEnclose;

	this.outputs = new ArrayList<String>();
	}

	/**
	* Return a list of strings representing the fields of the input line.
	* This list is backed by an internal buffer which is cleared by the
	* next call to parseRecord().
	*/
	public List<String> parseRecord(CharSequence input) throws ParseError {
	if (null == input) {
	throw new ParseError("null input string");
	}

	return parseRecord(CharBuffer.wrap(input));
	}

	/**
	* Return a list of strings representing the fields of the input line.
	* This list is backed by an internal buffer which is cleared by the
	* next call to parseRecord().
	*/
	public List<String> parseRecord(Text input) throws ParseError {
	if (null == input) {
	throw new ParseError("null input string");
	}

	// TODO(aaron): The parser should be able to handle UTF-8 strings
	// as well, to avoid this transcode operation.
	return parseRecord(input.toString());
	}

	/**
	* Return a list of strings representing the fields of the input line.
	* This list is backed by an internal buffer which is cleared by the
	* next call to parseRecord().
	*/
	public List<String> parseRecord(byte [] input) throws ParseError {
	if (null == input) {
	throw new ParseError("null input string");
	}

	return parseRecord(ByteBuffer.wrap(input).asCharBuffer());
	}

	/**
	* Return a list of strings representing the fields of the input line.
	* This list is backed by an internal buffer which is cleared by the
	* next call to parseRecord().
	*/
	public List<String> parseRecord(char [] input) throws ParseError {
	if (null == input) {
	throw new ParseError("null input string");
	}

	return parseRecord(CharBuffer.wrap(input));
	}

	public List<String> parseRecord(ByteBuffer input) throws ParseError {
	if (null == input) {
	throw new ParseError("null input string");
	}

	return parseRecord(input.asCharBuffer());
	}

	/**
	* Return a list of strings representing the fields of the input line.
	* This list is backed by an internal buffer which is cleared by the
	* next call to parseRecord().
	*/
	public List<String> parseRecord(CharBuffer input) throws ParseError {
	if (null == input) {
	throw new ParseError("null input string");
	}

	/*
	This method implements the following state machine to perform
	parsing.

	Note that there are no restrictions on whether particular characters
	(e.g., field-sep, record-sep, etc) are distinct or the same. The
	state transitions are processed in the order seen in this comment.

	Starting state is FIELD_START
	encloser -> ENCLOSED_FIELD
	escape char -> UNENCLOSED_ESCAPE
	field delim -> FIELD_START (for a new field)
	record delim -> stops processing
	all other letters get added to current field, -> UNENCLOSED FIELD

	ENCLOSED_FIELD state:
	escape char goes to ENCLOSED_ESCAPE
	encloser goes to ENCLOSED_EXPECT_DELIMITER
	field sep or record sep gets added to the current string
	normal letters get added to the current string

	ENCLOSED_ESCAPE state:
	any character seen here is added literally, back to ENCLOSED_FIELD

	ENCLOSED_EXPECT_DELIMITER state:
	field sep goes to FIELD_START
	record sep halts processing.
	all other characters are errors.

	UNENCLOSED_FIELD state:
	ESCAPE char goes to UNENCLOSED_ESCAPE
	FIELD_SEP char goes to FIELD_START
	RECORD_SEP char halts processing
	normal chars or the enclosing char get added to the current string

	UNENCLOSED_ESCAPE:
	add charater literal to current string, return to UNENCLOSED_FIELD
	*/

	char curChar = '\000';
	ParseState state = ParseState.FIELD_START;
	int len = input.length();
	StringBuilder sb = null;

	outputs.clear();

	for (int pos = 0; pos < len; pos++) {
	curChar = input.get();
	switch (state) {
	case FIELD_START:
	// ready to start processing a new field.
	if (null != sb) {
	// We finished processing a previous field. Add to the list.
	outputs.add(sb.toString());
	}

	sb = new StringBuilder();
	if (this.enclosingChar == curChar) {
	// got an opening encloser.
	state = ParseState.ENCLOSED_FIELD;
	} else if (this.escapeChar == curChar) {
	state = ParseState.UNENCLOSED_ESCAPE;
	} else if (this.fieldDelim == curChar) {
	// we have a zero-length field. This is a no-op.
	} else if (this.recordDelim == curChar) {
	// we have a zero-length field, that ends processing.
	pos = len;
	} else {
	// current char is part of the field.
	state = ParseState.UNENCLOSED_FIELD;
	sb.append(curChar);

	if (this.enclosingRequired) {
	throw new ParseError("Opening field-encloser expected at position " + pos);
	}
	}

	break;

	case ENCLOSED_FIELD:
	if (this.escapeChar == curChar) {
	// the next character is escaped. Treat it literally.
	state = ParseState.ENCLOSED_ESCAPE;
	} else if (this.enclosingChar == curChar) {
	// we're at the end of the enclosing field. Expect an EOF or EOR char.
	state = ParseState.ENCLOSED_EXPECT_DELIMITER;
	} else {
	// this is a regular char, or an EOF / EOR inside an encloser. Add to
	// the current field string, and remain in this state.
	sb.append(curChar);
	}

	break;

	case UNENCLOSED_FIELD:
	if (this.escapeChar == curChar) {
	// the next character is escaped. Treat it literally.
	state = ParseState.UNENCLOSED_ESCAPE;
	} else if (this.fieldDelim == curChar) {
	// we're at the end of this field; may be the start of another one.
	state = ParseState.FIELD_START;
	} else if (this.recordDelim == curChar) {
	pos = len; // terminate processing immediately.
	} else {
	// this is a regular char. Add to the current field string,
	// and remain in this state.
	sb.append(curChar);
	}

	break;

	case ENCLOSED_ESCAPE:
	// Treat this character literally, whatever it is, and return to enclosed
	// field processing.
	sb.append(curChar);
	state = ParseState.ENCLOSED_FIELD;
	break;

	case ENCLOSED_EXPECT_DELIMITER:
	// We were in an enclosed field, but got the final encloser. Now we expect
	// either an end-of-field or an end-of-record.
	if (this.fieldDelim == curChar) {
	// end of one field is the beginning of the next.
	state = ParseState.FIELD_START;
	} else if (this.recordDelim == curChar) {
	// stop processing.
	pos = len;
	} else {
	// Don't know what to do with this character.
	throw new ParseError("Expected delimiter at position " + pos);
	}

	break;

	case UNENCLOSED_ESCAPE:
	// Treat this character literally, whatever it is, and return to non-enclosed
	// field processing.
	sb.append(curChar);
	state = ParseState.UNENCLOSED_FIELD;
	break;
	}
	}

	if (state == ParseState.FIELD_START && curChar == this.fieldDelim) {
	// we hit an EOF/EOR as the last legal character and we need to mark
	// that string as recorded. This if block is outside the for-loop since
	// we don't have a physical 'epsilon' token in our string.
	if (null != sb) {
	outputs.add(sb.toString());
	sb = new StringBuilder();
	}
	}

	if (null != sb) {
	// There was a field that terminated by running out of chars or an EOR
	// character. Add to the list.
	outputs.add(sb.toString());
	}

	return outputs;
	}


	public boolean isEnclosingRequired() {
	return enclosingRequired;
	}

	@Override
	public String toString() {
	return "RecordParser[" + fieldDelim + ',' + recordDelim + ',' + enclosingChar + ','
	+ escapeChar + ',' + enclosingRequired + "]";
	}

	@Override
	public int hashCode() {
	return this.toString().hashCode();
	}
	}