src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.indexwriter.csv;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.TimeZone;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.indexer.IndexWriterParams;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Test CSVIndexWriter. Focus is on CSV-specific potential issues, mainly quoting and escaping.
  */
 public class TestCSVIndexWriter {

   protected static final Logger LOG = LoggerFactory
       .getLogger(TestCSVIndexWriter.class);

   /**
    * Dummy IndexWriter which stores the indexed documents as CSV string in a
    * {@link ByteArrayOutputStream} which can be easily accessed in test cases.
    */
   public class CSVByteArrayIndexWriter extends CSVIndexWriter {

     /** In-memory sink receiving everything written to {@code csvout}. */
     ByteArrayOutputStream byteBuffer;
     /** Statistics object required to construct the {@link FSDataOutputStream}. */
     FileSystem.Statistics fsStats;

     /**
      * Runs the parent's {@code open} and afterwards replaces {@code csvout}
      * by a stream which writes into a fresh in-memory buffer.
      *
      * @param parameters index writer parameters passed on to the parent
      * @throws IOException if the parent's {@code open} fails
      */
     @Override
     public void open(IndexWriterParams parameters) throws IOException {
       super.open(parameters);
       byteBuffer = new ByteArrayOutputStream();
       fsStats = new FileSystem.Statistics("testCSVIndexWriter");
       csvout = new FSDataOutputStream(byteBuffer, fsStats);
     }

     /**
      * Does nothing, the buffered bytes stay available for {@link #getData()}.
      * NOTE(review): presumably overridden so that the parent does not touch
      * any real file system resources -- confirm against CSVIndexWriter.
      */
     @Override
     public void close() throws IOException {
     }

     /**
      * Get the indexed documents as CSV.
      *
      * @return content of the in-memory buffer decoded with the writer's
      *         configured encoding
      * @throws IllegalStateException if the configured encoding is not
      *         supported for decoding; failing loudly avoids a misleading
      *         "expected ... but was empty" assertion failure later on
      */
     public String getData() {
       try {
         return byteBuffer.toString(encoding.name());
       } catch (UnsupportedEncodingException e) {
         throw new IllegalStateException(
             "Cannot decode CSV output using charset " + encoding.name(), e);
       }
     }
   }

   /**
    * Write all given NutchDocuments as CSV records.
    *
    * @param configParams configuration parameters: flat array of pairs
    *                     (property, value, property2, value2, ...)
    * @param docs         documents to be written, in this order
    * @return CSV string representing the documents
    * @throws IOException if opening the writer or writing a document fails
    */
   private String getCSV(final String[] configParams, NutchDocument[] docs)
       throws IOException {
     Configuration conf = NutchConfiguration.create();
     IndexWriterParams params = new IndexWriterParams(new HashMap<>());
     // configParams is a flat list of (property, value) pairs
     for (int i = 0; i < configParams.length; i += 2) {
       params.put(configParams[i], configParams[i + 1]);
     }
     CSVByteArrayIndexWriter out = new CSVByteArrayIndexWriter();
     out.setConf(conf);
     out.open(params);
     for (NutchDocument doc : docs) {
       out.write(doc);
     }
     out.close();
     String csv = out.getData();
     LOG.info(csv);
     return csv;
   }

   /**
    * Write one document as CSV record.
    *
    * @param configParams configuration parameters: flat array of pairs
    *                     (property, value, property2, value2, ...)
    * @param fieldContent flat array of pairs (field, value, field2, value2,
    *                     ...); a field name may be repeated to add multiple
    *                     values to the same field
    * @return CSV string representing the document
    * @throws IOException if opening the writer or writing the document fails
    */
   private String getCSV(final String[] configParams, final String[] fieldContent)
       throws IOException {
     NutchDocument[] docs = new NutchDocument[1];
     docs[0] = new NutchDocument();
     for (int i = 0; i < fieldContent.length; i += 2) {
       docs[0].add(fieldContent[i], fieldContent[i + 1]);
     }
     return getCSV(configParams, docs);
   }

   /** defaults, no quoting necessary */
   @Test
   public void testCSVdefault() throws IOException {
     String[] fields = { "id", "http://nutch.apache.org/", "title",
         "Welcome to Apache Nutch", "content",
         "Apache Nutch is an open source web-search software project. ..." };
     String csv = getCSV(new String[0], fields);
     for (int i = 0; i < fields.length; i += 2) {
       assertTrue("Testing field " + i + " (" + fields[i] + ")",
           csv.contains(fields[i + 1]));
     }
   }

   /** values containing the field separator must be quoted */
   @Test
   public void testCSVquoteFieldSeparators() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test,test2" };
     String[] fields = { "test", "a,b", "test2", "c,d" };
     String csv = getCSV(params, fields);
     assertEquals("If field contains a fields separator, it must be quoted",
         "\"a,b\",\"c,d\"", csv.trim());
   }

   /** values containing a line break must be quoted */
   @Test
   public void testCSVquoteRecordSeparators() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test" };
     String[] fields = { "test", "a\nb" };
     String csv = getCSV(params, fields);
     assertEquals("If field contains a record separator, it must be quoted",
         "\"a\nb\"", csv.trim());
   }

   /** quote characters inside a quoted value are doubled */
   @Test
   public void testCSVescapeQuotes() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test" };
     String[] fields = { "test", "a,b:\"quote\",c" };
     String csv = getCSV(params, fields);
     assertEquals("Quotes inside a quoted field must be escaped",
         "\"a,b:\"\"quote\"\",c\"", csv.trim());
   }

   /** values longer than the max. field length are clipped */
   @Test
   public void testCSVclipMaxLength() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test",
         CSVConstants.CSV_MAXFIELDLENGTH, "8" };
     String[] fields = { "test", "0123456789" };
     String csv = getCSV(params, fields);
     assertEquals("Field clipped to max. length = 8", "01234567", csv.trim());
   }

   /** the expected output shows clipping of the raw value, quoting afterwards */
   @Test
   public void testCSVclipMaxLengthQuote() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test",
         CSVConstants.CSV_MAXFIELDLENGTH, "7" };
     String[] fields = { "test", "1,\"2\",3,\"4\"" };
     String csv = getCSV(params, fields);
     assertEquals("Field clipped to max. length = 7", "\"1,\"\"2\"\",3\"",
         csv.trim());
   }

   /** multiple values of one field are joined by the value separator */
   @Test
   public void testCSVmultiValueFields() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test",
         CSVConstants.CSV_VALUESEPARATOR, "|",
         CSVConstants.CSV_QUOTECHARACTER, "" };
     String[] fields = { "test", "abc", "test", "def" };
     String csv = getCSV(params, fields);
     assertEquals("Values of multi-value fields are concatenated by |",
         "abc|def", csv.trim());
   }

   /** non-ASCII content must survive a round trip through several charsets */
   @Test
   public void testCSVEncoding() throws IOException {
     // flat list of (charset name, sample text representable in that charset)
     String[] charsets = { "iso-8859-1",
         "\u00e4\u00f6\u00fc\u00df\u00e9\u00f4\u00ee", // äöüßéôî
         "iso-8859-2", "\u0161\u010d\u0159\u016f", // ščřů
         "iso-8859-5", "\u0430\u0441\u0434\u0444", // асдф
     };
     for (int i = 0; i < charsets.length; i += 2) {
       String charset = charsets[i];
       String test = charsets[i + 1];
       String[] params = { CSVConstants.CSV_FIELDS, "test",
           CSVConstants.CSV_CHARSET, charset };
       String[] fields = { "test", test };
       String csv = getCSV(params, fields);
       assertEquals("wrong charset conversion", test, csv.trim());
     }
   }

   /** test non-ASCII separator */
   @Test
   public void testCSVEncodingSeparator() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "test",
         CSVConstants.CSV_CHARSET, "iso-8859-1",
         CSVConstants.CSV_VALUESEPARATOR, "\u00a6", // ¦ (broken bar)
         CSVConstants.CSV_QUOTECHARACTER, ""
     };
     String[] fields = { "test", "abc", "test", "def" };
     String csv = getCSV(params, fields);
     assertEquals("Values of multi-value fields are concatenated by ¦",
         "abc\u00a6def", csv.trim());
   }

   /** tab as field separator and quoting disabled, two records */
   @Test
   public void testCSVtabSeparated() throws IOException {
     String[] params = { CSVConstants.CSV_FIELDS, "1,2,3",
         CSVConstants.CSV_FIELD_SEPARATOR, "\t",
         CSVConstants.CSV_QUOTECHARACTER, ""
     };
     NutchDocument[] docs = new NutchDocument[2];
     docs[0] = new NutchDocument();
     docs[0].add("1", "a");
     docs[0].add("1", "b");
     docs[0].add("2", "a\"2\"b");
     docs[0].add("3", "c,d");
     docs[1] = new NutchDocument();
     docs[1].add("1", "A");
     docs[1].add("2", "B");
     docs[1].add("3", "C");
     String csv = getCSV(params, docs);
     String[] records = csv.trim().split("\\r\\n");
     assertEquals("tab-separated output", "a|b\ta\"2\"b\tc,d", records[0]);
     assertEquals("tab-separated output", "A\tB\tC", records[1]);
   }

   /** a date value must be written in a form containing the year */
   @Test
   public void testCSVdateField() throws IOException {
     // The default time zone is JVM-global state: pin it to UTC for this test
     // only and restore it afterwards so that other tests are not affected.
     TimeZone originalZone = TimeZone.getDefault();
     TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
     try {
       String[] params = { CSVConstants.CSV_FIELDS, "date" };
       NutchDocument[] docs = new NutchDocument[1];
       docs[0] = new NutchDocument();
       docs[0].add("date", new Date(0)); // 1970-01-01
       String csv = getCSV(params, docs);
       assertTrue("date conversion", csv.contains("1970"));
     } finally {
       TimeZone.setDefault(originalZone);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.indexwriter.csv;

	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertTrue;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.UnsupportedEncodingException;
	import java.util.Date;
	import java.util.HashMap;
	import java.util.TimeZone;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FSDataOutputStream;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.nutch.indexer.IndexWriterParams;
	import org.apache.nutch.indexer.NutchDocument;
	import org.apache.nutch.util.NutchConfiguration;
	import org.junit.Test;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* Test CSVIndexWriter. Focus is on CSV-specific potential issues, mainly quoting and escaping.
	*/
	public class TestCSVIndexWriter {

	protected static final Logger LOG = LoggerFactory
	.getLogger(TestCSVIndexWriter.class);

	/**
	* Dummy IndexWriter which stores the indexed documents as CSV string in a
	* {@link ByteArrayOutputStream} which can be easily accessed in test cases.
	*/
	public class CSVByteArrayIndexWriter extends CSVIndexWriter {

	ByteArrayOutputStream byteBuffer;
	FileSystem.Statistics fsStats;

	@Override
	public void open(IndexWriterParams parameters) throws IOException {
	super.open(parameters);
	byteBuffer = new ByteArrayOutputStream();
	fsStats = new FileSystem.Statistics("testCSVIndexWriter");
	csvout = new FSDataOutputStream(byteBuffer, fsStats);
	}

	@Override
	public void close() throws IOException {
	}

	/** get the indexed documents as CSV */
	public String getData() {
	try {
	return byteBuffer.toString(encoding.name());
	} catch (UnsupportedEncodingException e) {
	return "";
	}
	}
	}

	/**
	* write one NutchDocument as CSV record
	*
	* @param configParams configuration parameters: array (property => value, prop2 => value)
	* @param docs NutchDocument
	* @return CSV string representing the document
	*/
	private String getCSV(final String[] configParams, NutchDocument[] docs)
	throws IOException {
	Configuration conf = NutchConfiguration.create();
	IndexWriterParams params = new IndexWriterParams(new HashMap<>());
	for (int i = 0; i < configParams.length; i += 2) {
	params.put(configParams[i], configParams[i + 1]);
	}
	CSVByteArrayIndexWriter out = new CSVByteArrayIndexWriter();
	out.setConf(conf);
	out.open(params);
	for (NutchDocument doc : docs) {
	out.write(doc);
	}
	out.close();
	String csv = out.getData();
	LOG.info(csv);
	return csv;
	}

	/**
	* write one document as CSV record
	*
	* @param configParams configuration parameters: array (property => value, prop2 => value)
	* @param fieldContent array of {field => value} maps
	* @return CSV string representing the document
	*/
	private String getCSV(final String[] configParams, final String[] fieldContent)
	throws IOException {
	NutchDocument[] docs = new NutchDocument[1];
	docs[0] = new NutchDocument();
	for (int i = 0; i < fieldContent.length; i += 2) {
	docs[0].add(fieldContent[i], fieldContent[i + 1]);
	}
	return getCSV(configParams, docs);
	}

	/** defaults, no quoting necessary */
	@Test
	public void testCSVdefault() throws IOException {
	String[] fields = { "id", "http://nutch.apache.org/", "title",
	"Welcome to Apache Nutch", "content",
	"Apache Nutch is an open source web-search software project. ..." };
	String csv = getCSV(new String[0], fields);
	for (int i = 0; i < fields.length; i += 2) {
	assertTrue("Testing field " + i + " (" + fields[i] + ")",
	csv.contains(fields[i + 1]));
	}
	}

	@Test
	public void testCSVquoteFieldSeparators() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test,test2" };
	String[] fields = { "test", "a,b", "test2", "c,d" };
	String csv = getCSV(params, fields);
	assertEquals("If field contains a fields separator, it must be quoted",
	"\"a,b\",\"c,d\"", csv.trim());
	}

	@Test
	public void testCSVquoteRecordSeparators() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test" };
	String[] fields = { "test", "a\nb" };
	String csv = getCSV(params, fields);
	assertEquals("If field contains a fields separator, it must be quoted",
	"\"a\nb\"", csv.trim());
	}

	@Test
	public void testCSVescapeQuotes() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test" };
	String[] fields = { "test", "a,b:\"quote\",c" };
	String csv = getCSV(params, fields);
	assertEquals("Quotes inside a quoted field must be escaped",
	"\"a,b:\"\"quote\"\",c\"", csv.trim());
	}

	@Test
	public void testCSVclipMaxLength() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test",
	CSVConstants.CSV_MAXFIELDLENGTH, "8" };
	String[] fields = { "test", "0123456789" };
	String csv = getCSV(params, fields);
	assertEquals("Field clipped to max. length = 8", "01234567", csv.trim());
	}

	@Test
	public void testCSVclipMaxLengthQuote() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test",
	CSVConstants.CSV_MAXFIELDLENGTH, "7" };
	String[] fields = { "test", "1,\"2\",3,\"4\"" };
	String csv = getCSV(params, fields);
	assertEquals("Field clipped to max. length = 7", "\"1,\"\"2\"\",3\"",
	csv.trim());
	}

	@Test
	public void testCSVmultiValueFields() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test",
	CSVConstants.CSV_VALUESEPARATOR, "\|",
	CSVConstants.CSV_QUOTECHARACTER, "" };
	String[] fields = { "test", "abc", "test", "def" };
	String csv = getCSV(params, fields);
	assertEquals("Values of multi-value fields are concatenated by \|",
	"abc\|def", csv.trim());
	}

	@Test
	public void testCSVEncoding() throws IOException {
	String[] charsets = { "iso-8859-1",
	"\u00e4\u00f6\u00fc\u00df\u00e9\u00f4\u00ee", // äöüßéôî
	"iso-8859-2", "\u0161\u010d\u0159\u016f", // ščřů
	"iso-8859-5", "\u0430\u0441\u0434\u0444", // асдф
	};
	for (int i = 0; i < charsets.length; i += 2) {
	String charset = charsets[i];
	String test = charsets[i + 1];
	String[] params = { CSVConstants.CSV_FIELDS, "test",
	CSVConstants.CSV_CHARSET, charset };
	String[] fields = { "test", test };
	String csv = getCSV(params, fields);
	assertEquals("wrong charset conversion", test, csv.trim());
	}
	}

	/** test non-ASCII separator */
	@Test
	public void testCSVEncodingSeparator() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "test",
	CSVConstants.CSV_CHARSET, "iso-8859-1",
	CSVConstants.CSV_VALUESEPARATOR, "\u00a6", // ¦ (broken bar)
	CSVConstants.CSV_QUOTECHARACTER, ""
	};
	String[] fields = { "test", "abc", "test", "def" };
	String csv = getCSV(params, fields);
	assertEquals("Values of multi-value fields are concatenated by ¦",
	"abc\u00a6def", csv.trim());
	}

	@Test
	public void testCSVtabSeparated() throws IOException {
	String[] params = { CSVConstants.CSV_FIELDS, "1,2,3",
	CSVConstants.CSV_FIELD_SEPARATOR, "\t",
	CSVConstants.CSV_QUOTECHARACTER, ""
	};
	NutchDocument[] docs = new NutchDocument[2];
	docs[0] = new NutchDocument();
	docs[0].add("1", "a");
	docs[0].add("1", "b");
	docs[0].add("2", "a\"2\"b");
	docs[0].add("3", "c,d");
	docs[1] = new NutchDocument();
	docs[1].add("1", "A");
	docs[1].add("2", "B");
	docs[1].add("3", "C");
	String csv = getCSV(params, docs);
	String[] records = csv.trim().split("\\r\\n");
	assertEquals("tab-separated output", "a\|b\ta\"2\"b\tc,d", records[0]);
	assertEquals("tab-separated output", "A\tB\tC", records[1]);
	}

	@Test
	public void testCSVdateField() throws IOException {
	TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
	String[] params = { CSVConstants.CSV_FIELDS, "date" };
	NutchDocument[] docs = new NutchDocument[1];
	docs[0] = new NutchDocument();
	docs[0].add("date", new Date(0)); // 1970-01-01
	String csv = getCSV(params, docs);
	assertTrue("date conversion", csv.contains("1970"));
	}
	}