src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.any23;

 import java.io.File;
 import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParserNotFound;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;

 /**
  * Tests for {@link Any23ParseFilter}.
  *
  * <p>Each test fetches a sample HTML file through the Nutch protocol and
  * parse pipeline and checks how many values the filter stored in the parse
  * metadata under {@link Any23ParseFilter#ANY23_TRIPLES}.
  */
 public class TestAny23ParseFilter {

   /** Nutch configuration, rebuilt before every test in {@link #setUp()}. */
   private Configuration conf;

   private String fileSeparator = System.getProperty("file.separator");

   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");

   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/any23/build.xml during plugin compilation.
   private String file1 = "BBC_News_Scotland.html";

   private String file2 = "microdata_basic.html";

   /** Number of triples expected for {@link #file1}. */
   private static final int EXPECTED_TRIPLES_1 = 68;

   /** Number of triples expected for {@link #file2}. */
   private static final int EXPECTED_TRIPLES_2 = 38;

   /**
    * Creates a fresh configuration with the file content limit and parser
    * timeout set to -1, an explicit list of Any23 extractors, and
    * {@code text/html} as the only content type configured for the filter.
    */
   @Before
   public void setUp() {
     this.conf = NutchConfiguration.create();
     conf.set("file.content.limit", "-1");
     conf.set("parser.timeout", "-1");
     conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,"
             + "html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,"
             + "html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,"
             + "html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
     conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
   }

   /** Checks the number of triples extracted from {@link #file1}. */
   @Test
   public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException {
     String[] triplesArray = getTriples(file1);

     // The message is derived from the constant so it cannot drift from the
     // value that is actually asserted.
     Assert.assertEquals("We expect " + EXPECTED_TRIPLES_1
         + " tab-separated triples extracted by the filter",
         EXPECTED_TRIPLES_1, triplesArray.length);
   }

   /** Checks the number of triples extracted from {@link #file2}. */
   @Test
   public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException {
     String[] triplesArray = getTriples(file2);

     Assert.assertEquals("We expect " + EXPECTED_TRIPLES_2
         + " tab-separated triples extracted by the filter",
         EXPECTED_TRIPLES_2, triplesArray.length);
   }

   /**
    * Checks that no triples are stored when the content is labelled with a
    * content type ({@code application/pdf}) other than the one configured in
    * {@link #setUp()}.
    */
   @Test
   public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException {
     String[] triplesArray = getTriples(file1, "application/pdf");

     Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored",
             0, triplesArray.length);
   }

   /**
    * Fetches {@code urlString} with the protocol plugin resolved for it,
    * overrides the content type of the fetched content, parses it and returns
    * the values stored under {@link Any23ParseFilter#ANY23_TRIPLES} in the
    * parse metadata.
    *
    * @param urlString URL of the document to fetch and parse
    * @param file not used by this method; kept so the signature stays unchanged
    * @param contentType content type to set on the fetched content before parsing
    * @return the triples recorded in the parse metadata
    * @throws AssertionError if fetching or parsing throws any exception; the
    *         original exception is attached as the cause
    */
   public String[] extract(String urlString, File file, String contentType) {
     try {
       System.out.println(urlString);
       Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
       Content content = protocol.getProtocolOutput(new Text(urlString),
           new CrawlDatum()).getContent();
       content.setContentType(contentType);
       Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
       return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
     } catch (Exception e) {
       // Fail the test while keeping the original exception as the cause, so
       // its stack trace appears in the test report instead of only on stderr.
       throw new AssertionError(e.toString(), e);
     }
   }

   /** Returns the triples for a sample file parsed as {@code text/html}. */
   private String[] getTriples(String fileName) {
     return getTriples(fileName, "text/html");
   }

   /**
    * Builds a {@code file:} URL for a sample file inside {@link #sampleDir}
    * and returns the triples extracted from it under the given content type.
    */
   private String[] getTriples(String fileName, String contentType) {
     String samplePath = sampleDir + fileSeparator + fileName;
     String urlString = "file:" + samplePath;

     File file = new File(samplePath);

     return extract(urlString, file, contentType);
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.any23;

	import java.io.File;
	import java.io.IOException;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.io.Text;
	import org.apache.nutch.crawl.CrawlDatum;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseException;
	import org.apache.nutch.parse.ParseUtil;
	import org.apache.nutch.parse.ParserNotFound;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.protocol.Protocol;
	import org.apache.nutch.protocol.ProtocolFactory;
	import org.apache.nutch.util.NutchConfiguration;
	import org.junit.Assert;
	import org.junit.Before;
	import org.junit.Test;

	/**
	 * Tests for {@link Any23ParseFilter}.
	 *
	 * <p>Each test fetches a sample HTML file through the Nutch protocol and
	 * parse pipeline and checks how many values the filter stored in the parse
	 * metadata under {@link Any23ParseFilter#ANY23_TRIPLES}.
	 */
	public class TestAny23ParseFilter {

	  /** Nutch configuration, rebuilt before every test in {@link #setUp()}. */
	  private Configuration conf;

	  private String fileSeparator = System.getProperty("file.separator");

	  // This system property is defined in ./src/plugin/build-plugin.xml
	  private String sampleDir = System.getProperty("test.data", ".");

	  // Make sure sample files are copied to "test.data" as specified in
	  // ./src/plugin/any23/build.xml during plugin compilation.
	  private String file1 = "BBC_News_Scotland.html";

	  private String file2 = "microdata_basic.html";

	  /** Number of triples expected for {@link #file1}. */
	  private static final int EXPECTED_TRIPLES_1 = 68;

	  /** Number of triples expected for {@link #file2}. */
	  private static final int EXPECTED_TRIPLES_2 = 38;

	  /**
	   * Creates a fresh configuration with the file content limit and parser
	   * timeout set to -1, an explicit list of Any23 extractors, and
	   * {@code text/html} as the only content type configured for the filter.
	   */
	  @Before
	  public void setUp() {
	    this.conf = NutchConfiguration.create();
	    conf.set("file.content.limit", "-1");
	    conf.set("parser.timeout", "-1");
	    conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,"
	        + "html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,"
	        + "html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,"
	        + "html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
	    conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
	  }

	  /** Checks the number of triples extracted from {@link #file1}. */
	  @Test
	  public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException {
	    String[] triplesArray = getTriples(file1);

	    // The message is derived from the constant so it cannot drift from the
	    // value that is actually asserted.
	    Assert.assertEquals("We expect " + EXPECTED_TRIPLES_1
	        + " tab-separated triples extracted by the filter",
	        EXPECTED_TRIPLES_1, triplesArray.length);
	  }

	  /** Checks the number of triples extracted from {@link #file2}. */
	  @Test
	  public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException {
	    String[] triplesArray = getTriples(file2);

	    Assert.assertEquals("We expect " + EXPECTED_TRIPLES_2
	        + " tab-separated triples extracted by the filter",
	        EXPECTED_TRIPLES_2, triplesArray.length);
	  }

	  /**
	   * Checks that no triples are stored when the content is labelled with a
	   * content type ({@code application/pdf}) other than the one configured in
	   * {@link #setUp()}.
	   */
	  @Test
	  public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException {
	    String[] triplesArray = getTriples(file1, "application/pdf");

	    Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored",
	        0, triplesArray.length);
	  }

	  /**
	   * Fetches {@code urlString} with the protocol plugin resolved for it,
	   * overrides the content type of the fetched content, parses it and returns
	   * the values stored under {@link Any23ParseFilter#ANY23_TRIPLES} in the
	   * parse metadata.
	   *
	   * @param urlString URL of the document to fetch and parse
	   * @param file not used by this method; kept so the signature stays unchanged
	   * @param contentType content type to set on the fetched content before parsing
	   * @return the triples recorded in the parse metadata
	   * @throws AssertionError if fetching or parsing throws any exception; the
	   *         original exception is attached as the cause
	   */
	  public String[] extract(String urlString, File file, String contentType) {
	    try {
	      System.out.println(urlString);
	      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
	      Content content = protocol.getProtocolOutput(new Text(urlString),
	          new CrawlDatum()).getContent();
	      content.setContentType(contentType);
	      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
	      return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
	    } catch (Exception e) {
	      // Fail the test while keeping the original exception as the cause, so
	      // its stack trace appears in the test report instead of only on stderr.
	      throw new AssertionError(e.toString(), e);
	    }
	  }

	  /** Returns the triples for a sample file parsed as {@code text/html}. */
	  private String[] getTriples(String fileName) {
	    return getTriples(fileName, "text/html");
	  }

	  /**
	   * Builds a {@code file:} URL for a sample file inside {@link #sampleDir}
	   * and returns the triples extracted from it under the given content type.
	   */
	  private String[] getTriples(String fileName, String contentType) {
	    String samplePath = sampleDir + fileSeparator + fileName;
	    String urlString = "file:" + samplePath;

	    File file = new File(samplePath);

	    return extract(urlString, file, contentType);
	  }
	}