// blob: 9110cd98b907caa06f98c3d78660354b392d847e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexwriter.csv;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.HashMap;
import java.util.TimeZone;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.indexer.IndexWriterParams;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Test CSVIndexWriter. Focus is on CSV-specific potential issues, mainly quoting and escaping.
*/
public class TestCSVIndexWriter {
protected static final Logger LOG = LoggerFactory
.getLogger(TestCSVIndexWriter.class);
/**
* Dummy IndexWriter which stores the indexed documents as CSV string in a
* {@link ByteArrayOutputStream} which can be easily accessed in test cases.
*/
public class CSVByteArrayIndexWriter extends CSVIndexWriter {
ByteArrayOutputStream byteBuffer;
FileSystem.Statistics fsStats;
@Override
public void open(IndexWriterParams parameters) throws IOException {
super.open(parameters);
byteBuffer = new ByteArrayOutputStream();
fsStats = new FileSystem.Statistics("testCSVIndexWriter");
csvout = new FSDataOutputStream(byteBuffer, fsStats);
}
@Override
public void close() throws IOException {
}
/** get the indexed documents as CSV */
public String getData() {
try {
return byteBuffer.toString(encoding.name());
} catch (UnsupportedEncodingException e) {
return "";
}
}
}
/**
* write one NutchDocument as CSV record
*
* @param configParams configuration parameters: array (property => value, prop2 => value)
* @param docs NutchDocument
* @return CSV string representing the document
*/
private String getCSV(final String[] configParams, NutchDocument[] docs)
throws IOException {
Configuration conf = NutchConfiguration.create();
IndexWriterParams params = new IndexWriterParams(new HashMap<>());
for (int i = 0; i < configParams.length; i += 2) {
params.put(configParams[i], configParams[i + 1]);
}
CSVByteArrayIndexWriter out = new CSVByteArrayIndexWriter();
out.setConf(conf);
out.open(params);
for (NutchDocument doc : docs) {
out.write(doc);
}
out.close();
String csv = out.getData();
LOG.info(csv);
return csv;
}
/**
* write one document as CSV record
*
* @param configParams configuration parameters: array (property => value, prop2 => value)
* @param fieldContent array of {field => value} maps
* @return CSV string representing the document
*/
private String getCSV(final String[] configParams, final String[] fieldContent)
throws IOException {
NutchDocument[] docs = new NutchDocument[1];
docs[0] = new NutchDocument();
for (int i = 0; i < fieldContent.length; i += 2) {
docs[0].add(fieldContent[i], fieldContent[i + 1]);
}
return getCSV(configParams, docs);
}
/** defaults, no quoting necessary */
@Test
public void testCSVdefault() throws IOException {
String[] fields = { "id", "http://nutch.apache.org/", "title",
"Welcome to Apache Nutch", "content",
"Apache Nutch is an open source web-search software project. ..." };
String csv = getCSV(new String[0], fields);
for (int i = 0; i < fields.length; i += 2) {
assertTrue("Testing field " + i + " (" + fields[i] + ")",
csv.contains(fields[i + 1]));
}
}
@Test
public void testCSVquoteFieldSeparators() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test,test2" };
String[] fields = { "test", "a,b", "test2", "c,d" };
String csv = getCSV(params, fields);
assertEquals("If field contains a fields separator, it must be quoted",
"\"a,b\",\"c,d\"", csv.trim());
}
@Test
public void testCSVquoteRecordSeparators() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test" };
String[] fields = { "test", "a\nb" };
String csv = getCSV(params, fields);
assertEquals("If field contains a fields separator, it must be quoted",
"\"a\nb\"", csv.trim());
}
@Test
public void testCSVescapeQuotes() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test" };
String[] fields = { "test", "a,b:\"quote\",c" };
String csv = getCSV(params, fields);
assertEquals("Quotes inside a quoted field must be escaped",
"\"a,b:\"\"quote\"\",c\"", csv.trim());
}
@Test
public void testCSVclipMaxLength() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test",
CSVConstants.CSV_MAXFIELDLENGTH, "8" };
String[] fields = { "test", "0123456789" };
String csv = getCSV(params, fields);
assertEquals("Field clipped to max. length = 8", "01234567", csv.trim());
}
@Test
public void testCSVclipMaxLengthQuote() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test",
CSVConstants.CSV_MAXFIELDLENGTH, "7" };
String[] fields = { "test", "1,\"2\",3,\"4\"" };
String csv = getCSV(params, fields);
assertEquals("Field clipped to max. length = 7", "\"1,\"\"2\"\",3\"",
csv.trim());
}
@Test
public void testCSVmultiValueFields() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test",
CSVConstants.CSV_VALUESEPARATOR, "|",
CSVConstants.CSV_QUOTECHARACTER, "" };
String[] fields = { "test", "abc", "test", "def" };
String csv = getCSV(params, fields);
assertEquals("Values of multi-value fields are concatenated by |",
"abc|def", csv.trim());
}
@Test
public void testCSVEncoding() throws IOException {
String[] charsets = { "iso-8859-1",
"\u00e4\u00f6\u00fc\u00df\u00e9\u00f4\u00ee", // äöüßéôî
"iso-8859-2", "\u0161\u010d\u0159\u016f", // ščřů
"iso-8859-5", "\u0430\u0441\u0434\u0444", // асдф
};
for (int i = 0; i < charsets.length; i += 2) {
String charset = charsets[i];
String test = charsets[i + 1];
String[] params = { CSVConstants.CSV_FIELDS, "test",
CSVConstants.CSV_CHARSET, charset };
String[] fields = { "test", test };
String csv = getCSV(params, fields);
assertEquals("wrong charset conversion", test, csv.trim());
}
}
/** test non-ASCII separator */
@Test
public void testCSVEncodingSeparator() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "test",
CSVConstants.CSV_CHARSET, "iso-8859-1",
CSVConstants.CSV_VALUESEPARATOR, "\u00a6", // ¦ (broken bar)
CSVConstants.CSV_QUOTECHARACTER, ""
};
String[] fields = { "test", "abc", "test", "def" };
String csv = getCSV(params, fields);
assertEquals("Values of multi-value fields are concatenated by ¦",
"abc\u00a6def", csv.trim());
}
@Test
public void testCSVtabSeparated() throws IOException {
String[] params = { CSVConstants.CSV_FIELDS, "1,2,3",
CSVConstants.CSV_FIELD_SEPARATOR, "\t",
CSVConstants.CSV_QUOTECHARACTER, ""
};
NutchDocument[] docs = new NutchDocument[2];
docs[0] = new NutchDocument();
docs[0].add("1", "a");
docs[0].add("1", "b");
docs[0].add("2", "a\"2\"b");
docs[0].add("3", "c,d");
docs[1] = new NutchDocument();
docs[1].add("1", "A");
docs[1].add("2", "B");
docs[1].add("3", "C");
String csv = getCSV(params, docs);
String[] records = csv.trim().split("\\r\\n");
assertEquals("tab-separated output", "a|b\ta\"2\"b\tc,d", records[0]);
assertEquals("tab-separated output", "A\tB\tC", records[1]);
}
@Test
public void testCSVdateField() throws IOException {
TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
String[] params = { CSVConstants.CSV_FIELDS, "date" };
NutchDocument[] docs = new NutchDocument[1];
docs[0] = new NutchDocument();
docs[0].add("date", new Date(0)); // 1970-01-01
String csv = getCSV(params, docs);
assertTrue("date conversion", csv.contains("1970"));
}
}