// blob: feb01cbda15913238c29bad086e79774d014b05b [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.sqoop.lib;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.List;
/**
* Parses a record containing one or more fields. Fields are separated
* by some FIELD_DELIMITER character, e.g. a comma or a ^A character.
* Records are terminated by a RECORD_DELIMITER character, e.g., a newline.
*
* Fields may be (optionally or mandatorily) enclosed by a quoting char
* e.g., '\"'
*
* Fields may contain escaped characters. An escape character may be, e.g.,
* the '\\' character. Any character following an escape character
* is treated literally. e.g., '\n' is recorded as an 'n' character, not a
* newline.
*
* Unexpected results may occur if the enclosing character escapes itself.
* e.g., this cannot parse SQL SELECT statements where the single character
* ['] escapes to [''].
*
* This class is not synchronized. Multiple threads must use separate
* instances of RecordParser.
*
* The fields parsed by RecordParser are backed by an internal buffer
* which is cleared when the next call to parseRecord() is made. If
* the buffer is required to be preserved, you must copy it yourself.
*/
public final class RecordParser {

  /** Logger for this class. */
  public static final Log LOG = LogFactory.getLog(RecordParser.class.getName());

  /** States of the state machine driven by {@link #parseRecord(CharBuffer)}. */
  private enum ParseState {
    /** Positioned on the first character of a (possibly empty) field. */
    FIELD_START,
    /** Inside a field that was opened by the enclosing character. */
    ENCLOSED_FIELD,
    /** Inside a field that was not opened by the enclosing character. */
    UNENCLOSED_FIELD,
    /** The previous character of an enclosed field was the escape char. */
    ENCLOSED_ESCAPE,
    /** The closing encloser was seen; only a delimiter may follow. */
    ENCLOSED_EXPECT_DELIMITER,
    /** The previous character of an unenclosed field was the escape char. */
    UNENCLOSED_ESCAPE
  }

  /**
   * Checked exception thrown when the input cannot be parsed, or when a
   * null input is handed to one of the parseRecord() methods.
   */
  public static class ParseError extends Exception {
    /** Creates an error with the generic message "ParseError". */
    public ParseError() {
      super("ParseError");
    }

    /**
     * @param msg description of what went wrong.
     */
    public ParseError(final String msg) {
      super(msg);
    }

    /**
     * @param msg description of what went wrong.
     * @param cause the underlying exception.
     */
    public ParseError(final String msg, final Throwable cause) {
      super(msg, cause);
    }

    /**
     * @param cause the underlying exception.
     */
    public ParseError(final Throwable cause) {
      super(cause);
    }
  }

  // The parser configuration is fixed at construction time.
  private final char fieldDelim;
  private final char recordDelim;
  private final char enclosingChar;
  private final char escapeChar;
  private final boolean enclosingRequired;

  // Reused across calls: cleared at the start of every parse and handed
  // back to the caller directly (not copied).
  private final List<String> outputs;

  /**
   * Creates a parser for the given delimiter configuration.
   *
   * @param field the character that separates fields.
   * @param record the character that terminates a record.
   * @param enclose the character that may (or must) surround a field.
   * @param escape the character that causes the following character to be
   *     taken literally.
   * @param mustEnclose if true, a non-empty field that does not begin with
   *     the enclosing character is rejected with a ParseError.
   */
  public RecordParser(final char field, final char record, final char enclose,
      final char escape, final boolean mustEnclose) {
    this.fieldDelim = field;
    this.recordDelim = record;
    this.enclosingChar = enclose;
    this.escapeChar = escape;
    this.enclosingRequired = mustEnclose;
    this.outputs = new ArrayList<String>();
  }

  /**
   * Return a list of strings representing the fields of the input line.
   * This list is backed by an internal buffer which is cleared by the
   * next call to parseRecord().
   *
   * @param input the characters of the record to parse.
   * @return the fields of the record, in order.
   * @throws ParseError if input is null or is not a well-formed record.
   */
  public List<String> parseRecord(CharSequence input) throws ParseError {
    if (null == input) {
      throw new ParseError("null input string");
    }

    return parseRecord(CharBuffer.wrap(input));
  }

  /**
   * Return a list of strings representing the fields of the input line.
   * This list is backed by an internal buffer which is cleared by the
   * next call to parseRecord().
   *
   * @param input the record to parse; it is converted with toString()
   *     before parsing.
   * @return the fields of the record, in order.
   * @throws ParseError if input is null or is not a well-formed record.
   */
  public List<String> parseRecord(Text input) throws ParseError {
    if (null == input) {
      throw new ParseError("null input string");
    }

    // TODO(aaron): The parser should be able to handle UTF-8 strings
    // as well, to avoid this transcode operation.
    return parseRecord(input.toString());
  }

  /**
   * Return a list of strings representing the fields of the input line.
   * This list is backed by an internal buffer which is cleared by the
   * next call to parseRecord().
   *
   * <p>The array is viewed through ByteBuffer.wrap(input).asCharBuffer():
   * no charset decoding is performed, each pair of bytes is read as one
   * 16-bit char, and a trailing odd byte is not part of the view.</p>
   *
   * <p>NOTE(review): bytes holding single-byte or UTF-8 encoded text will
   * therefore not yield the expected characters; such callers should decode
   * to a String first. Behaviour is kept as-is for compatibility.</p>
   *
   * @param input the raw 16-bit character data of the record.
   * @return the fields of the record, in order.
   * @throws ParseError if input is null or is not a well-formed record.
   */
  public List<String> parseRecord(byte [] input) throws ParseError {
    if (null == input) {
      throw new ParseError("null input string");
    }

    return parseRecord(ByteBuffer.wrap(input).asCharBuffer());
  }

  /**
   * Return a list of strings representing the fields of the input line.
   * This list is backed by an internal buffer which is cleared by the
   * next call to parseRecord().
   *
   * @param input the characters of the record to parse.
   * @return the fields of the record, in order.
   * @throws ParseError if input is null or is not a well-formed record.
   */
  public List<String> parseRecord(char [] input) throws ParseError {
    if (null == input) {
      throw new ParseError("null input string");
    }

    return parseRecord(CharBuffer.wrap(input));
  }

  /**
   * Return a list of strings representing the fields of the input line.
   * This list is backed by an internal buffer which is cleared by the
   * next call to parseRecord().
   *
   * <p>The buffer is viewed through asCharBuffer(): no charset decoding is
   * performed and each pair of remaining bytes is read as one 16-bit char.
   * NOTE(review): as with the byte[] overload, UTF-8 or single-byte text
   * should be decoded by the caller before parsing.</p>
   *
   * @param input the raw 16-bit character data of the record.
   * @return the fields of the record, in order.
   * @throws ParseError if input is null or is not a well-formed record.
   */
  public List<String> parseRecord(ByteBuffer input) throws ParseError {
    if (null == input) {
      throw new ParseError("null input string");
    }

    return parseRecord(input.asCharBuffer());
  }

  /**
   * Return a list of strings representing the fields of the input line.
   * This list is backed by an internal buffer which is cleared by the
   * next call to parseRecord().
   *
   * <p>Characters are consumed from the buffer with relative reads, so the
   * buffer's position is advanced past every character examined. Parsing
   * stops after the first unescaped, unenclosed record delimiter or when
   * the buffer is exhausted, whichever comes first. Input that ends inside
   * an enclosed field or directly after an escape character is accepted;
   * the field collected so far is returned.</p>
   *
   * @param input the characters of the record to parse.
   * @return the fields of the record, in order; empty if the buffer has no
   *     remaining characters.
   * @throws ParseError if input is null, if enclosing is required and a
   *     non-empty field does not start with the enclosing character, or if
   *     a closing encloser is followed by something other than a field or
   *     record delimiter.
   */
  public List<String> parseRecord(CharBuffer input) throws ParseError {
    if (null == input) {
      throw new ParseError("null input string");
    }

    /*
      This method implements the following state machine to perform
      parsing.

      Note that there are no restrictions on whether particular characters
      (e.g., field-sep, record-sep, etc) are distinct or the same. The
      state transitions are processed in the order seen in this comment.

      Starting state is FIELD_START
        encloser -> ENCLOSED_FIELD
        escape char -> UNENCLOSED_ESCAPE
        field delim -> FIELD_START (for a new field)
        record delim -> stops processing
        all other letters get added to current field, -> UNENCLOSED_FIELD

      ENCLOSED_FIELD state:
        escape char goes to ENCLOSED_ESCAPE
        encloser goes to ENCLOSED_EXPECT_DELIMITER
        field sep or record sep gets added to the current string
        normal letters get added to the current string

      ENCLOSED_ESCAPE state:
        any character seen here is added literally, back to ENCLOSED_FIELD

      ENCLOSED_EXPECT_DELIMITER state:
        field sep goes to FIELD_START
        record sep halts processing.
        all other characters are errors.

      UNENCLOSED_FIELD state:
        ESCAPE char goes to UNENCLOSED_ESCAPE
        FIELD_SEP char goes to FIELD_START
        RECORD_SEP char halts processing
        normal chars or the enclosing char get added to the current string

      UNENCLOSED_ESCAPE:
        add character literal to current string, return to UNENCLOSED_FIELD
    */

    char curChar = '\000';
    ParseState state = ParseState.FIELD_START;
    final int len = input.length();

    // Text of the field currently being assembled; null until the first
    // character has been read.
    StringBuilder sb = null;

    // Set once a record delimiter has been consumed, to stop the scan.
    boolean recordEnded = false;

    outputs.clear();

    for (int pos = 0; pos < len && !recordEnded; pos++) {
      curChar = input.get();
      switch (state) {
      case FIELD_START:
        // ready to start processing a new field.
        if (null != sb) {
          // We finished processing a previous field. Add to the list.
          outputs.add(sb.toString());
        }

        sb = new StringBuilder();
        if (this.enclosingChar == curChar) {
          // got an opening encloser.
          state = ParseState.ENCLOSED_FIELD;
        } else if (this.escapeChar == curChar) {
          state = ParseState.UNENCLOSED_ESCAPE;
        } else if (this.fieldDelim == curChar) {
          // we have a zero-length field. Stay in FIELD_START; the empty
          // builder is recorded when the next field starts (or after the
          // loop if this was the last character).
        } else if (this.recordDelim == curChar) {
          // we have a zero-length field, that ends processing.
          recordEnded = true;
        } else {
          // current char is part of an unenclosed field, which is only
          // legal when enclosing is optional.
          if (this.enclosingRequired) {
            throw new ParseError("Opening field-encloser expected at position " + pos);
          }

          state = ParseState.UNENCLOSED_FIELD;
          sb.append(curChar);
        }

        break;

      case ENCLOSED_FIELD:
        if (this.escapeChar == curChar) {
          // the next character is escaped. Treat it literally.
          state = ParseState.ENCLOSED_ESCAPE;
        } else if (this.enclosingChar == curChar) {
          // we're at the end of the enclosed field. Expect an end-of-field
          // or end-of-record delimiter next.
          state = ParseState.ENCLOSED_EXPECT_DELIMITER;
        } else {
          // this is a regular char, or a field / record delimiter inside an
          // encloser. Add to the current field string, and remain in this
          // state.
          sb.append(curChar);
        }

        break;

      case UNENCLOSED_FIELD:
        if (this.escapeChar == curChar) {
          // the next character is escaped. Treat it literally.
          state = ParseState.UNENCLOSED_ESCAPE;
        } else if (this.fieldDelim == curChar) {
          // we're at the end of this field; may be the start of another one.
          state = ParseState.FIELD_START;
        } else if (this.recordDelim == curChar) {
          // terminate processing immediately.
          recordEnded = true;
        } else {
          // this is a regular char. Add to the current field string,
          // and remain in this state.
          sb.append(curChar);
        }

        break;

      case ENCLOSED_ESCAPE:
        // Treat this character literally, whatever it is, and return to
        // enclosed field processing.
        sb.append(curChar);
        state = ParseState.ENCLOSED_FIELD;
        break;

      case ENCLOSED_EXPECT_DELIMITER:
        // We were in an enclosed field, but got the final encloser. Now we
        // expect either an end-of-field or an end-of-record.
        if (this.fieldDelim == curChar) {
          // end of one field is the beginning of the next.
          state = ParseState.FIELD_START;
        } else if (this.recordDelim == curChar) {
          // stop processing.
          recordEnded = true;
        } else {
          // Don't know what to do with this character.
          throw new ParseError("Expected delimiter at position " + pos);
        }

        break;

      case UNENCLOSED_ESCAPE:
        // Treat this character literally, whatever it is, and return to
        // non-enclosed field processing.
        sb.append(curChar);
        state = ParseState.UNENCLOSED_FIELD;
        break;
      }
    }

    if (state == ParseState.FIELD_START && curChar == this.fieldDelim) {
      // The last character consumed was a field delimiter, so the record
      // ends with an empty trailing field. Record the field that preceded
      // the delimiter here and start a fresh (empty) one, which is added
      // below. This block is outside the for-loop since we don't have a
      // physical 'epsilon' token in our string.
      if (null != sb) {
        outputs.add(sb.toString());
        sb = new StringBuilder();
      }
    }

    if (null != sb) {
      // There was a field that terminated by running out of chars or a
      // record delimiter. Add to the list.
      outputs.add(sb.toString());
    }

    return outputs;
  }

  /**
   * @return true if this parser rejects non-empty fields that are not
   *     surrounded by the enclosing character.
   */
  public boolean isEnclosingRequired() {
    return enclosingRequired;
  }

  /**
   * @return a string listing the field delimiter, record delimiter,
   *     enclosing character, escape character and enclosing-required flag.
   */
  @Override
  public String toString() {
    return "RecordParser[" + fieldDelim + ',' + recordDelim + ',' + enclosingChar + ','
        + escapeChar + ',' + enclosingRequired + "]";
  }

  /**
   * Two parsers are equal when all five configuration values match. This
   * agrees with {@link #hashCode()}, which is computed from the same
   * values via {@link #toString()}.
   *
   * @param other the object to compare against.
   * @return true if other is a RecordParser with the same configuration.
   */
  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }

    if (!(other instanceof RecordParser)) {
      return false;
    }

    final RecordParser that = (RecordParser) other;
    return this.fieldDelim == that.fieldDelim
        && this.recordDelim == that.recordDelim
        && this.enclosingChar == that.enclosingChar
        && this.escapeChar == that.escapeChar
        && this.enclosingRequired == that.enclosingRequired;
  }

  /**
   * @return the hash code of this parser's {@link #toString()} form.
   */
  @Override
  public int hashCode() {
    return this.toString().hashCode();
  }
}