blob: 92f9986668cf2d50ad2eb5df553e2d13f05d487e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hop.pipeline.transforms.csvinput;
import java.nio.charset.StandardCharsets;
import org.apache.hop.core.exception.HopException;
import org.apache.hop.core.exception.HopTransformException;
import org.apache.hop.core.file.TextFileInputField;
import org.apache.hop.core.logging.ILoggingObject;
import org.apache.hop.core.row.IRowMeta;
import org.apache.hop.junit.rules.RestoreHopEngineEnvironment;
import org.apache.hop.pipeline.transform.RowAdapter;
import org.apache.hop.pipeline.transforms.mock.TransformMockHelper;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.mockito.Matchers;
import org.mockito.Mockito;
/**
* Tests for unicode support in CsvInput transform
*
* @see CsvInput
*/
public class CsvInputUnicodeTest extends CsvInputUnitTestBase {
@ClassRule public static RestoreHopEngineEnvironment env = new RestoreHopEngineEnvironment();
private static final String UTF8 = "UTF-8";
private static final String UTF16LE = "UTF-16LE";
private static final String UTF16LEBOM = "x-UTF-16LE-BOM";
private static final String UTF16BE = "UTF-16BE";
private static final String ONE_CHAR_DELIM = "\t";
private static final String MULTI_CHAR_DELIM = "|||";
private static final String TEXT = "Header1%1$sHeader2\nValue%1$sValue\nValue%1$sValue\n";
private static final String TEXTHEADER = "Header1%1$sHeader2\n";
private static final String TEXTBODY = "Value%1$sValue\nValue%1$sValue\n";
private static final String TEXT_WITH_ENCLOSURES =
"Header1%1$sHeader2\n\"Value\"%1$s\"Value\"\n\"Value\"%1$s\"Value\"\n";
private static final String TEST_DATA = String.format(TEXT, ONE_CHAR_DELIM);
private static final String TEST_DATA1 = String.format(TEXT, MULTI_CHAR_DELIM);
private static final String TEST_DATA2 = String.format(TEXT_WITH_ENCLOSURES, ONE_CHAR_DELIM);
private static final String TEST_DATA3 = String.format(TEXT_WITH_ENCLOSURES, MULTI_CHAR_DELIM);
private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
private static final String TEST_DATA_UTF8_BOM =
String.format(new String(UTF8_BOM, StandardCharsets.UTF_8) + TEXT, ONE_CHAR_DELIM);
private static final String TEST_DATA_NOHEADER_UTF8_BOM =
String.format(new String(UTF8_BOM, StandardCharsets.UTF_8) + TEXTBODY, ONE_CHAR_DELIM);
private static final byte[] UTF16LE_BOM = {(byte) 0xFF, (byte) 0xFE};
private static final String TEST_DATA_UTF16LE_BOM =
String.format(
new String(UTF16LE_BOM, StandardCharsets.UTF_16LE) + TEST_DATA2, ONE_CHAR_DELIM);
private static final byte[] UTF16BE_BOM = {(byte) 0xFE, (byte) 0xFF};
private static final String TEST_DATA_UTF16BE_BOM =
String.format(
new String(UTF16BE_BOM, StandardCharsets.UTF_16BE) + TEST_DATA2, ONE_CHAR_DELIM);
private static TransformMockHelper<CsvInputMeta, CsvInputData> transformMockHelper;
@BeforeClass
public static void setUp() throws HopException {
transformMockHelper =
TransformMockUtil.getTransformMockHelper(
CsvInputMeta.class, CsvInputData.class, "CsvInputUnicodeTest");
Mockito.when(
transformMockHelper.logChannelFactory.create(
Matchers.any(), Matchers.any(ILoggingObject.class)))
.thenReturn(transformMockHelper.iLogChannel);
Mockito.when(transformMockHelper.pipeline.isRunning()).thenReturn(true);
}
@AfterClass
public static void cleanUp() {
transformMockHelper.cleanUp();
}
@Test
public void testUTF16LE() throws Exception {
doTest(UTF16LE, UTF16LE, TEST_DATA, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16BE() throws Exception {
doTest(UTF16BE, UTF16BE, TEST_DATA, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16BE_multiDelim() throws Exception {
doTest(UTF16BE, UTF16BE, TEST_DATA1, MULTI_CHAR_DELIM, true);
}
@Test
public void testUTF16LEBOM() throws Exception {
doTest(UTF16LEBOM, UTF16LE, TEST_DATA, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF8() throws Exception {
doTest(UTF8, UTF8, TEST_DATA, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF8_multiDelim() throws Exception {
doTest(UTF8, UTF8, TEST_DATA1, MULTI_CHAR_DELIM, true);
}
@Test
public void testUTF8_headerWithBOM() throws Exception {
doTest(UTF8, UTF8, TEST_DATA_UTF8_BOM, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF8_withoutHeaderWithBOM() throws Exception {
doTest(UTF8, UTF8, TEST_DATA_NOHEADER_UTF8_BOM, ONE_CHAR_DELIM, false);
}
@Test
public void testUTF16LEDataWithEnclosures() throws Exception {
doTest(UTF16LE, UTF16LE, TEST_DATA2, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16LE_headerWithBOM() throws Exception {
doTest(UTF16LE, UTF16LE, TEST_DATA_UTF16LE_BOM, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16BEDataWithEnclosures() throws Exception {
doTest(UTF16BE, UTF16BE, TEST_DATA2, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16BE_headerWithBOM() throws Exception {
doTest(UTF16BE, UTF16BE, TEST_DATA_UTF16BE_BOM, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16LEBOMDataWithEnclosures() throws Exception {
doTest(UTF16LEBOM, UTF16LE, TEST_DATA2, ONE_CHAR_DELIM, true);
}
@Test
public void testUTF16BE_multiDelim_DataWithEnclosures() throws Exception {
doTest(UTF16BE, UTF16BE, TEST_DATA3, MULTI_CHAR_DELIM, true);
}
@Test
public void testUTF16LE_multiDelim_DataWithEnclosures() throws Exception {
doTest(UTF16LE, UTF16LE, TEST_DATA3, MULTI_CHAR_DELIM, true);
}
@Test
public void testUTF8_multiDelim_DataWithEnclosures() throws Exception {
doTest(UTF8, UTF8, TEST_DATA3, MULTI_CHAR_DELIM, true);
}
private void doTest(
final String fileEncoding,
final String transformEncoding,
final String testData,
final String delimiter,
final boolean useHeader)
throws Exception {
String testFilePath = createTestFile(fileEncoding, testData).getAbsolutePath();
CsvInputMeta meta = createTransformMeta(testFilePath, transformEncoding, delimiter, useHeader);
CsvInputData data = new CsvInputData();
CsvInput csvInput =
new CsvInput(
transformMockHelper.transformMeta,
meta,
data,
0,
transformMockHelper.pipelineMeta,
transformMockHelper.pipeline);
csvInput.init();
csvInput.addRowListener(
new RowAdapter() {
@Override
public void rowWrittenEvent(IRowMeta rowMeta, Object[] row) throws HopTransformException {
for (int i = 0; i < rowMeta.size(); i++) {
Assert.assertEquals("Value", row[i]);
}
}
});
boolean haveRowsToRead;
do {
haveRowsToRead = !csvInput.processRow();
} while (!haveRowsToRead);
csvInput.dispose();
Assert.assertEquals(2, csvInput.getLinesWritten());
}
private CsvInputMeta createTransformMeta(
final String testFilePath,
final String encoding,
final String delimiter,
final boolean useHeader) {
final CsvInputMeta meta = new CsvInputMeta();
meta.setFilename(testFilePath);
meta.setDelimiter(delimiter);
meta.setEncoding(encoding);
meta.setEnclosure("\"");
meta.setBufferSize("50000");
meta.setInputFields(getInputFileFields());
meta.setHeaderPresent(useHeader);
return meta;
}
private TextFileInputField[] getInputFileFields() {
return createInputFileFields("Header1", "Header2");
}
}