src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.analysis.lang;

 import java.io.BufferedReader;
 import java.io.InputStreamReader;

 // Nutch imports
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.tika.language.LanguageIdentifier;
 import org.junit.Assert;
 import org.junit.Test;

 /**
  * Tests for language detection: the language recorded in the parse metadata
  * of HTML documents, the normalization of raw language declarations by
  * <code>HTMLLanguageParser.LanguageParser.parseLanguage(String)</code>, and
  * Tika's <code>LanguageIdentifier</code> run against reference text files.
  */
 public class TestHTMLLanguageParser {

   /** Encoding used to read the classpath test resources. */
   private static final String RESOURCE_ENCODING = "UTF-8";

   private static final String URL = "http://foo.bar/";

   private static final String BASE = "http://foo.bar/";

   /**
    * HTML fixtures declaring their language through, in order: the
    * <code>lang</code> attribute of the root element, a
    * <code>content-language</code> http-equiv meta tag, and a
    * <code>dc.language</code> meta tag.
    */
   String[] docs = {
       "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>",
       "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
       "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" };

   /**
    * Language expected in the parse metadata for the document at the same
    * index in {@link #docs}.
    */
   // Tika does not return "fi" but null
   // NOTE(review): the comment above conflicts with the "fi" expected for the
   // first document; confirm which behavior is current.
   String[] metalanguages = { "fi", "en", "en" };

   /**
    * Parses every document in {@link #docs} and checks that the value stored
    * under <code>Metadata.LANGUAGE</code> in the parse metadata equals the
    * matching entry of {@link #metalanguages}.
    */
   @Test
   public void testMetaHTMLParsing() {
     try {
       ParseUtil parser = new ParseUtil(NutchConfiguration.create());
       /* loop through the test documents and validate result */
       for (int t = 0; t < docs.length; t++) {
         Content content = getContent(docs[t]);
         Parse parse = parser.parse(content).get(content.getUrl());
         // The message pinpoints which fixture produced the mismatch.
         Assert.assertEquals("language of document " + (t + 1),
             metalanguages[t],
             (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
       }
     } catch (Exception e) {
       e.printStackTrace(System.out);
       Assert.fail(e.toString());
     }
   }

   /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
   @Test
   public void testParseLanguage() {
     // Each row is { raw language declaration, expected result }.
     String[][] tests = { { "(SCHEME=ISO.639-1) sv", "sv" },
         { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" },
         { "EN_US, SV, EN, EN_UK", "en" }, { "English Swedish", "en" },
         { "English, swedish", "en" }, { "English,Swedish", "en" },
         { "Other (Svenska)", "sv" }, { "SE", "se" }, { "SV", "sv" },
         { "SV charset=iso-8859-1", "sv" }, { "SV-FI", "sv" },
         { "SV; charset=iso-8859-1", "sv" }, { "SVE", "sv" }, { "SW", "sw" },
         { "SWE", "sv" }, { "SWEDISH", "sv" }, { "Sv", "sv" }, { "Sve", "sv" },
         { "Svenska", "sv" }, { "Swedish", "sv" }, { "Swedish, svenska", "sv" },
         { "en, sv", "en" }, { "sv", "sv" },
         { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, { "sv,en", "sv" },
         { "sv-FI", "sv" }, { "sv-SE", "sv" }, { "sv-en", "sv" },
         { "sv-fi", "sv" }, { "sv-se", "sv" },
         { "sv; Content-Language: sv", "sv" }, { "sv_SE", "sv" },
         { "sve", "sv" }, { "svenska, swedish, engelska, english", "sv" },
         { "sw", "sw" }, { "swe", "sv" }, { "swe.SPR.", "sv" },
         { "sweden", "sv" }, { "swedish", "sv" }, { "swedish,", "sv" },
         { "text/html; charset=sv-SE", "sv" }, { "text/html; sv", "sv" },
         { "torp, stuga, uthyres, bed & breakfast", null } };

     // Iterate over the table itself rather than a hard-coded row count, so
     // rows can be added or removed without touching the loop.
     for (String[] test : tests) {
       String input = test[0];
       Assert.assertEquals("parseLanguage(\"" + input + "\")", test[1],
           HTMLLanguageParser.LanguageParser.parseLanguage(input));
     }
   }

   /**
    * Wraps the given HTML text in a <code>Content</code> of type
    * <code>text/html</code> located at the fixed test URL.
    *
    * @param text the HTML source of the document
    * @return the content object handed to the parser
    */
   private Content getContent(String text) {
     Metadata meta = new Metadata();
     meta.add("Content-Type", "text/html");
     // NOTE(review): getBytes() uses the platform default charset; the
     // fixtures in docs are plain ASCII, so the bytes are the same either way.
     return new Content(URL, BASE, text.getBytes(), "text/html", meta,
         NutchConfiguration.create());
   }

   /**
    * Runs Tika's <code>LanguageIdentifier</code> over the reference files
    * listed in <code>test-referencial.txt</code>. Each non-empty line of that
    * index has the form <code>resource;language</code>.
    */
   @Test
   public void testLanguageIndentifier() {
     try {
       long total = 0;
       BufferedReader in = openResource("test-referencial.txt");
       try {
         String line;
         while ((line = in.readLine()) != null) {
           String[] tokens = line.split(";");
           if (!tokens[0].equals("")) {
             total += checkReferenceFile(tokens[0], tokens[1]);
           }
         }
       } finally {
         // Close even when an assertion fails part-way through the index.
         in.close();
       }
       System.out.println("Total Time=" + total);
     } catch (Exception e) {
       e.printStackTrace();
       Assert.fail(e.toString());
     }
   }

   /**
    * Checks one reference file: every trimmed line longer than 256 characters,
    * and then the file as a whole, must be identified as
    * <code>expectedLang</code>.
    *
    * @param resource classpath resource name, resolved relative to this class
    * @param expectedLang language code the identifier is expected to return
    * @return milliseconds spent identifying (and printing) the whole file
    * @throws java.io.IOException if the resource cannot be read
    */
   private long checkReferenceFile(String resource, String expectedLang)
       throws java.io.IOException {
     StringBuilder content = new StringBuilder();
     BufferedReader testFile = openResource(resource);
     try {
       // Test each line of the file...
       String testLine;
       while ((testLine = testFile.readLine()) != null) {
         content.append(testLine).append('\n');
         testLine = testLine.trim();
         // Only lines above this length are checked on their own; presumably
         // shorter ones are too small for reliable detection.
         if (testLine.length() > 256) {
           Assert.assertEquals(expectedLang,
               new LanguageIdentifier(testLine).getLanguage());
         }
       }
     } finally {
       // Close even when a per-line assertion fails.
       testFile.close();
     }

     // Test the whole file
     long start = System.currentTimeMillis();
     System.out.println(content.toString());
     String lang = new LanguageIdentifier(content.toString()).getLanguage();
     System.out.println(lang);
     long elapsed = System.currentTimeMillis() - start;
     Assert.assertEquals(expectedLang, lang);
     return elapsed;
   }

   /**
    * Opens a classpath resource relative to this class as UTF-8 text. A
    * missing resource fails the test with a message naming it, instead of a
    * bare <code>NullPointerException</code>.
    *
    * @param name resource name passed to <code>Class.getResourceAsStream</code>
    * @return a buffered reader over the resource; the caller must close it
    * @throws java.io.IOException if the encoding is not supported
    */
   private BufferedReader openResource(String name) throws java.io.IOException {
     // JDK types are fully qualified so the file's import list stays as is.
     java.io.InputStream stream = getClass().getResourceAsStream(name);
     Assert.assertNotNull("Missing test resource: " + name, stream);
     return new BufferedReader(new InputStreamReader(stream,
         RESOURCE_ENCODING));
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.analysis.lang;

	import java.io.BufferedReader;
	import java.io.InputStreamReader;

	// Nutch imports
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseUtil;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.NutchConfiguration;
	import org.apache.tika.language.LanguageIdentifier;
	import org.junit.Assert;
	import org.junit.Test;

	/**
	 * Tests for language detection: the language recorded in the parse metadata
	 * of HTML documents, the normalization of raw language declarations by
	 * <code>HTMLLanguageParser.LanguageParser.parseLanguage(String)</code>, and
	 * Tika's <code>LanguageIdentifier</code> run against reference text files.
	 */
	public class TestHTMLLanguageParser {

	  /** Encoding used to read the classpath test resources. */
	  private static final String RESOURCE_ENCODING = "UTF-8";

	  private static final String URL = "http://foo.bar/";

	  private static final String BASE = "http://foo.bar/";

	  /**
	   * HTML fixtures declaring their language through, in order: the
	   * <code>lang</code> attribute of the root element, a
	   * <code>content-language</code> http-equiv meta tag, and a
	   * <code>dc.language</code> meta tag.
	   */
	  String[] docs = {
	      "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>",
	      "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
	      "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" };

	  /**
	   * Language expected in the parse metadata for the document at the same
	   * index in {@link #docs}.
	   */
	  // Tika does not return "fi" but null
	  // NOTE(review): the comment above conflicts with the "fi" expected for the
	  // first document; confirm which behavior is current.
	  String[] metalanguages = { "fi", "en", "en" };

	  /**
	   * Parses every document in {@link #docs} and checks that the value stored
	   * under <code>Metadata.LANGUAGE</code> in the parse metadata equals the
	   * matching entry of {@link #metalanguages}.
	   */
	  @Test
	  public void testMetaHTMLParsing() {
	    try {
	      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
	      /* loop through the test documents and validate result */
	      for (int t = 0; t < docs.length; t++) {
	        Content content = getContent(docs[t]);
	        Parse parse = parser.parse(content).get(content.getUrl());
	        // The message pinpoints which fixture produced the mismatch.
	        Assert.assertEquals("language of document " + (t + 1),
	            metalanguages[t],
	            (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
	      }
	    } catch (Exception e) {
	      e.printStackTrace(System.out);
	      Assert.fail(e.toString());
	    }
	  }

	  /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
	  @Test
	  public void testParseLanguage() {
	    // Each row is { raw language declaration, expected result }.
	    String[][] tests = { { "(SCHEME=ISO.639-1) sv", "sv" },
	        { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" },
	        { "EN_US, SV, EN, EN_UK", "en" }, { "English Swedish", "en" },
	        { "English, swedish", "en" }, { "English,Swedish", "en" },
	        { "Other (Svenska)", "sv" }, { "SE", "se" }, { "SV", "sv" },
	        { "SV charset=iso-8859-1", "sv" }, { "SV-FI", "sv" },
	        { "SV; charset=iso-8859-1", "sv" }, { "SVE", "sv" }, { "SW", "sw" },
	        { "SWE", "sv" }, { "SWEDISH", "sv" }, { "Sv", "sv" }, { "Sve", "sv" },
	        { "Svenska", "sv" }, { "Swedish", "sv" }, { "Swedish, svenska", "sv" },
	        { "en, sv", "en" }, { "sv", "sv" },
	        { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, { "sv,en", "sv" },
	        { "sv-FI", "sv" }, { "sv-SE", "sv" }, { "sv-en", "sv" },
	        { "sv-fi", "sv" }, { "sv-se", "sv" },
	        { "sv; Content-Language: sv", "sv" }, { "sv_SE", "sv" },
	        { "sve", "sv" }, { "svenska, swedish, engelska, english", "sv" },
	        { "sw", "sw" }, { "swe", "sv" }, { "swe.SPR.", "sv" },
	        { "sweden", "sv" }, { "swedish", "sv" }, { "swedish,", "sv" },
	        { "text/html; charset=sv-SE", "sv" }, { "text/html; sv", "sv" },
	        { "torp, stuga, uthyres, bed & breakfast", null } };

	    // Iterate over the table itself rather than a hard-coded row count, so
	    // rows can be added or removed without touching the loop.
	    for (String[] test : tests) {
	      String input = test[0];
	      Assert.assertEquals("parseLanguage(\"" + input + "\")", test[1],
	          HTMLLanguageParser.LanguageParser.parseLanguage(input));
	    }
	  }

	  /**
	   * Wraps the given HTML text in a <code>Content</code> of type
	   * <code>text/html</code> located at the fixed test URL.
	   *
	   * @param text the HTML source of the document
	   * @return the content object handed to the parser
	   */
	  private Content getContent(String text) {
	    Metadata meta = new Metadata();
	    meta.add("Content-Type", "text/html");
	    // NOTE(review): getBytes() uses the platform default charset; the
	    // fixtures in docs are plain ASCII, so the bytes are the same either way.
	    return new Content(URL, BASE, text.getBytes(), "text/html", meta,
	        NutchConfiguration.create());
	  }

	  /**
	   * Runs Tika's <code>LanguageIdentifier</code> over the reference files
	   * listed in <code>test-referencial.txt</code>. Each non-empty line of that
	   * index has the form <code>resource;language</code>.
	   */
	  @Test
	  public void testLanguageIndentifier() {
	    try {
	      long total = 0;
	      BufferedReader in = openResource("test-referencial.txt");
	      try {
	        String line;
	        while ((line = in.readLine()) != null) {
	          String[] tokens = line.split(";");
	          if (!tokens[0].equals("")) {
	            total += checkReferenceFile(tokens[0], tokens[1]);
	          }
	        }
	      } finally {
	        // Close even when an assertion fails part-way through the index.
	        in.close();
	      }
	      System.out.println("Total Time=" + total);
	    } catch (Exception e) {
	      e.printStackTrace();
	      Assert.fail(e.toString());
	    }
	  }

	  /**
	   * Checks one reference file: every trimmed line longer than 256 characters,
	   * and then the file as a whole, must be identified as
	   * <code>expectedLang</code>.
	   *
	   * @param resource classpath resource name, resolved relative to this class
	   * @param expectedLang language code the identifier is expected to return
	   * @return milliseconds spent identifying (and printing) the whole file
	   * @throws java.io.IOException if the resource cannot be read
	   */
	  private long checkReferenceFile(String resource, String expectedLang)
	      throws java.io.IOException {
	    StringBuilder content = new StringBuilder();
	    BufferedReader testFile = openResource(resource);
	    try {
	      // Test each line of the file...
	      String testLine;
	      while ((testLine = testFile.readLine()) != null) {
	        content.append(testLine).append('\n');
	        testLine = testLine.trim();
	        // Only lines above this length are checked on their own; presumably
	        // shorter ones are too small for reliable detection.
	        if (testLine.length() > 256) {
	          Assert.assertEquals(expectedLang,
	              new LanguageIdentifier(testLine).getLanguage());
	        }
	      }
	    } finally {
	      // Close even when a per-line assertion fails.
	      testFile.close();
	    }

	    // Test the whole file
	    long start = System.currentTimeMillis();
	    System.out.println(content.toString());
	    String lang = new LanguageIdentifier(content.toString()).getLanguage();
	    System.out.println(lang);
	    long elapsed = System.currentTimeMillis() - start;
	    Assert.assertEquals(expectedLang, lang);
	    return elapsed;
	  }

	  /**
	   * Opens a classpath resource relative to this class as UTF-8 text. A
	   * missing resource fails the test with a message naming it, instead of a
	   * bare <code>NullPointerException</code>.
	   *
	   * @param name resource name passed to <code>Class.getResourceAsStream</code>
	   * @return a buffered reader over the resource; the caller must close it
	   * @throws java.io.IOException if the encoding is not supported
	   */
	  private BufferedReader openResource(String name) throws java.io.IOException {
	    // JDK types are fully qualified so the file's import list stays as is.
	    java.io.InputStream stream = getClass().getResourceAsStream(name);
	    Assert.assertNotNull("Missing test resource: " + name, stream);
	    return new BufferedReader(new InputStreamReader(stream,
	        RESOURCE_ENCODING));
	  }

	}