blob: 6b1682b34c9faa787b94165751462dd5ec784e90 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.kudu.mapreduce.tools;
import java.util.ArrayList;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.yetus.audience.InterfaceStability;
import org.apache.kudu.client.Bytes;
/**
* Column-separated values parser that gives access to the different columns inside each line of
* data.
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public class CsvParser {
private final byte separatorByte;
private final int maxColumnCount;
private final List<String> columnNames;
/**
* @param columnsSpecification the list of columns to parse out, comma separated.
* @param separatorStr The 1 byte separator.
*/
public CsvParser(String columnsSpecification, String separatorStr) {
// Configure separator
byte[] separator = Bytes.fromString(separatorStr);
Preconditions.checkArgument(separator.length == 1, "CsvParser only supports single-byte " +
"separators");
separatorByte = separator[0];
// Configure columns
columnNames = Lists.newArrayList(Splitter.on(',').trimResults().split(columnsSpecification));
maxColumnCount = columnNames.size();
}
/**
* Creates a ParsedLine of a line of data.
* @param lineBytes Whole line as a byte array.
* @param length How long the line really is in the byte array
* @return A parsed line of CSV.
* @throws BadCsvLineException
*/
public ParsedLine parse(byte[] lineBytes, int length) throws BadCsvLineException {
// Enumerate separator offsets
List<Integer> tabOffsets = new ArrayList<Integer>(maxColumnCount);
for (int i = 0; i < length; i++) {
if (lineBytes[i] == separatorByte) {
tabOffsets.add(i);
}
}
if (tabOffsets.isEmpty()) {
throw new BadCsvLineException("No delimiter");
}
// trailing separator shouldn't count as a column
if (lineBytes[length - 1] != separatorByte) {
tabOffsets.add(length);
}
if (tabOffsets.size() > maxColumnCount) {
throw new BadCsvLineException("Excessive columns");
}
if (tabOffsets.size() < maxColumnCount) {
throw new BadCsvLineException("Not enough columns");
}
return new ParsedLine(tabOffsets, lineBytes);
}
/**
* Helper class that knows where the columns are situated in the line.
*/
class ParsedLine {
private final List<Integer> tabOffsets;
private final byte[] lineBytes;
ParsedLine(List<Integer> tabOffsets, byte[] lineBytes) {
this.tabOffsets = tabOffsets;
this.lineBytes = lineBytes;
}
/**
* Get the position for the given column.
* @param idx Column to lookup.
* @return Offset in the line.
*/
public int getColumnOffset(int idx) {
if (idx > 0) {
return tabOffsets.get(idx - 1) + 1;
} else {
return 0;
}
}
/**
* Get how many bytes the given column occupies.
* @param idx Column to lookup.
* @return Column's length.
*/
public int getColumnLength(int idx) {
return tabOffsets.get(idx) - getColumnOffset(idx);
}
/**
* Get the number of columns in this file.
* @return Number of columns.
*/
public int getColumnCount() {
return tabOffsets.size();
}
/**
* Get the bytes originally given for this line.
* @return Original byte array.
*/
public byte[] getLineBytes() {
return lineBytes;
}
/**
* Get the given column's name.
* @param idx Column to lookup.
* @return Column's name.
*/
public String getColumnName(int idx) {
return columnNames.get(idx);
}
}
/**
* Exception used when the CsvParser is unable to parse a line.
*/
@SuppressWarnings("serial")
public static class BadCsvLineException extends Exception {
public BadCsvLineException(String err) {
super(err);
}
}
}