blob: e80d6aaa15357b95d4ffcfaa8dc815d34e530a55 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.text.compliant;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.store.easy.text.reader.HeaderBuilder;
import org.apache.drill.test.DrillTest;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
import java.nio.charset.StandardCharsets;
/**
* Test the mechanism that builds column names from a set of CSV
* headers. The mechanism provides reasonable defaults for missing
* or invalid headers.
*/
public class TestHeaderBuilder extends DrillTest {
@Test
public void testEmptyHeader() {
Path dummyPath = new Path("file:/dummy.csv");
HeaderBuilder hb = new HeaderBuilder(dummyPath);
try {
hb.finishRecord();
} catch (UserException e) {
assertTrue(e.getMessage().contains("must define at least one header"));
}
hb = new HeaderBuilder(dummyPath);
parse(hb,"");
try {
hb.finishRecord();
} catch (UserException e) {
assertTrue(e.getMessage().contains("must define at least one header"));
}
hb = new HeaderBuilder(dummyPath);
parse(hb," ");
validateHeader(hb, new String[] {"column_1"});
hb = new HeaderBuilder(dummyPath);
parse(hb,",");
validateHeader(hb, new String[] {"column_1", "column_2"});
hb = new HeaderBuilder(dummyPath);
parse(hb," , ");
validateHeader(hb, new String[] {"column_1", "column_2"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"a, ");
validateHeader(hb, new String[] {"a", "column_2"});
}
@Test
public void testWhiteSpace() {
Path dummyPath = new Path("file:/dummy.csv");
HeaderBuilder hb = new HeaderBuilder(dummyPath);
parse(hb,"a");
validateHeader(hb, new String[] {"a"});
hb = new HeaderBuilder(dummyPath);
parse(hb," a ");
validateHeader(hb, new String[] {"a"});
hb = new HeaderBuilder(dummyPath);
parse(hb," a ");
validateHeader(hb, new String[] {"a"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"a,b,c");
validateHeader(hb, new String[] {"a","b","c"});
hb = new HeaderBuilder(dummyPath);
parse(hb," a , b , c ");
validateHeader(hb, new String[] {"a","b","c"});
}
@Test
public void testSyntax() {
Path dummyPath = new Path("file:/dummy.csv");
HeaderBuilder hb = new HeaderBuilder(dummyPath);
parse(hb,"a_123");
validateHeader(hb, new String[] {"a_123"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"a_123_");
validateHeader(hb, new String[] {"a_123_"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"az09_");
validateHeader(hb, new String[] {"az09_"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"+");
validateHeader(hb, new String[] {"column_1"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"+,-");
validateHeader(hb, new String[] {"column_1", "column_2"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"+9a");
validateHeader(hb, new String[] {"col_9a"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"9a");
validateHeader(hb, new String[] {"col_9a"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"a+b");
validateHeader(hb, new String[] {"a_b"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"a_b");
validateHeader(hb, new String[] {"a_b"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"EXPR$0");
validateHeader(hb, new String[] {"EXPR_0"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"(_-^-_)");
validateHeader(hb, new String[] {"col_______"});
}
@Test
public void testUnicode() {
Path dummyPath = new Path("file:/dummy.csv");
HeaderBuilder hb = new HeaderBuilder(dummyPath);
parse(hb,"Αθήνα");
validateHeader(hb, new String[] {"Αθήνα"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"Москва");
validateHeader(hb, new String[] {"Москва"});
hb = new HeaderBuilder(dummyPath);
parse(hb,"Paris,Αθήνα,Москва");
validateHeader(hb, new String[] {"Paris","Αθήνα","Москва"});
}
@Test
public void testDuplicateNames() {
testParser("a,a", new String[] {"a","a_2"});
testParser("a,A", new String[] {"a","A_2"});
// It ain't pretty, but it is unique...
testParser("a,A,A_2", new String[] {"a","A_2", "A_2_2"});
// Verify with non-ASCII characters
testParser("Αθήνα,ΑθήνΑ", new String[] {"Αθήνα","ΑθήνΑ_2"});
}
private void testParser(String input, String[] expected) {
Path dummyPath = new Path("file:/dummy.csv");
HeaderBuilder hb = new HeaderBuilder(dummyPath);
parse(hb,input);
hb.finishRecord();
validateHeader(hb, expected);
}
private void parse(HeaderBuilder hb, String input) {
if (input == null) {
return;
}
byte bytes[] = input.getBytes(StandardCharsets.UTF_8);
if (bytes.length == 0) {
return;
}
int fieldIndex = -1;
hb.startField(++fieldIndex);
for (int i = 0; i < bytes.length; i++) {
byte b = bytes[i];
if (b == ',') {
hb.endField();
hb.startField(++fieldIndex);
} else {
hb.append(b);
}
}
hb.endField();
}
private void validateHeader(HeaderBuilder hb, String[] expected) {
String actual[] = hb.getHeaders();
assertEquals(expected.length, actual.length);
for (int i = 0; i < expected.length; i++) {
assertEquals(expected[i], actual[i]);
}
}
}