// blob: 4a381fa68372603cb4b9c301d12a47d86819b131 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.tika.language.LanguageIdentifier;
import org.junit.Assert;
import org.junit.Test;
/**
 * Tests language detection for HTML content: the language recorded in the
 * parse metadata of small HTML documents, the normalisation performed by
 * {@code HTMLLanguageParser.LanguageParser.parseLanguage(String)}, and the
 * language reported by Tika's {@link LanguageIdentifier} for reference texts
 * loaded from the test classpath.
 */
public class TestHTMLLanguageParser {

  /** URL assigned to every synthetic {@link Content} built by this test. */
  private static final String URL = "http://foo.bar/";

  /** Base URL assigned to every synthetic {@link Content} built by this test. */
  private static final String BASE = "http://foo.bar/";

  /**
   * HTML fixtures that declare their language through the {@code lang}
   * attribute, a {@code content-language} meta tag and a {@code dc.language}
   * meta tag respectively.
   */
  String[] docs = {
      "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>",
      "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
      "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" };

  /**
   * Expected {@link Metadata#LANGUAGE} value for each entry of {@link #docs},
   * in the same order.
   */
  // Original note, kept as written: Tika does not return "fi" but null.
  // NOTE(review): the first expectation is nevertheless "fi" - confirm whether
  // this note is still accurate.
  String[] metalanguages = { "fi", "en", "en" };

  /**
   * Parses every fixture in {@link #docs} and checks that the language stored
   * in its parse metadata equals the matching entry of {@link #metalanguages}.
   *
   * @throws Exception
   *           if parsing fails; the exception is left to the test runner so
   *           that its stack trace appears in the test report
   */
  @Test
  public void testMetaHTMLParsing() throws Exception {
    Assert.assertEquals("docs and metalanguages must have the same length",
        docs.length, metalanguages.length);
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());
    for (int t = 0; t < docs.length; t++) {
      Content content = getContent(docs[t]);
      Parse parse = parser.parse(content).get(content.getUrl());
      Assert.assertEquals("language of document " + (t + 1), metalanguages[t],
          parse.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  }

  /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
  @Test
  public void testParseLanguage() {
    // Each row is { raw language string, expected parseLanguage() result }.
    String[][] tests = { { "(SCHEME=ISO.639-1) sv", "sv" },
        { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" },
        { "EN_US, SV, EN, EN_UK", "en" }, { "English Swedish", "en" },
        { "English, swedish", "en" }, { "English,Swedish", "en" },
        { "Other (Svenska)", "sv" }, { "SE", "se" }, { "SV", "sv" },
        { "SV charset=iso-8859-1", "sv" }, { "SV-FI", "sv" },
        { "SV; charset=iso-8859-1", "sv" }, { "SVE", "sv" }, { "SW", "sw" },
        { "SWE", "sv" }, { "SWEDISH", "sv" }, { "Sv", "sv" }, { "Sve", "sv" },
        { "Svenska", "sv" }, { "Swedish", "sv" }, { "Swedish, svenska", "sv" },
        { "en, sv", "en" }, { "sv", "sv" },
        { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, { "sv,en", "sv" },
        { "sv-FI", "sv" }, { "sv-SE", "sv" }, { "sv-en", "sv" },
        { "sv-fi", "sv" }, { "sv-se", "sv" },
        { "sv; Content-Language: sv", "sv" }, { "sv_SE", "sv" },
        { "sve", "sv" }, { "svenska, swedish, engelska, english", "sv" },
        { "sw", "sw" }, { "swe", "sv" }, { "swe.SPR.", "sv" },
        { "sweden", "sv" }, { "swedish", "sv" }, { "swedish,", "sv" },
        { "text/html; charset=sv-SE", "sv" }, { "text/html; sv", "sv" },
        { "torp, stuga, uthyres, bed & breakfast", null } };
    // Iterate over the table itself rather than a hard-coded row count, so
    // that added or removed rows are always exercised.
    for (String[] test : tests) {
      Assert.assertEquals("parseLanguage(\"" + test[0] + "\")", test[1],
          HTMLLanguageParser.LanguageParser.parseLanguage(test[0]));
    }
  }

  /**
   * Wraps an HTML string in a {@link Content} object typed as
   * {@code text/html}.
   *
   * @param text
   *          the HTML markup to wrap
   * @return a content object for {@link #URL} holding the UTF-8 bytes of
   *         {@code text}
   */
  private Content getContent(String text) {
    Metadata meta = new Metadata();
    meta.add("Content-Type", "text/html");
    // Encode explicitly so the bytes do not depend on the platform charset.
    return new Content(URL, BASE, text.getBytes(StandardCharsets.UTF_8),
        "text/html", meta, NutchConfiguration.create());
  }

  /**
   * Runs {@link LanguageIdentifier} over the reference texts listed in
   * {@code test-referencial.txt}. Each non-empty line of that index has the
   * form {@code resource;language}. For every listed resource, each trimmed
   * line longer than 256 characters and then the whole file must be
   * identified as the listed language.
   *
   * @throws Exception
   *           if a resource cannot be read; the exception is left to the test
   *           runner so that its stack trace appears in the test report
   */
  @Test
  public void testLanguageIndentifier() throws Exception {
    long total = 0;
    try (BufferedReader in = openResource("test-referencial.txt")) {
      String line;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (tokens[0].equals("")) {
          continue;
        }
        Assert.assertTrue("expected 'resource;language' but got: " + line,
            tokens.length >= 2);
        String resource = tokens[0];
        String expected = tokens[1];

        StringBuilder content = new StringBuilder();
        // Test each line of the file...
        try (BufferedReader testFile = openResource(resource)) {
          String testLine;
          while ((testLine = testFile.readLine()) != null) {
            content.append(testLine).append('\n');
            String trimmed = testLine.trim();
            if (trimmed.length() > 256) {
              Assert.assertEquals("language of a line in " + resource,
                  expected, new LanguageIdentifier(trimmed).getLanguage());
            }
          }
        }

        // Test the whole file. Console output stays outside the timed
        // section so that "Total Time" measures identification only.
        String whole = content.toString();
        System.out.println(whole);
        long start = System.currentTimeMillis();
        String lang = new LanguageIdentifier(whole).getLanguage();
        total += System.currentTimeMillis() - start;
        System.out.println(lang);
        Assert.assertEquals("language of " + resource, expected, lang);
      }
    }
    System.out.println("Total Time=" + total);
  }

  /**
   * Opens a classpath resource, resolved relative to this test class, as a
   * UTF-8 reader. Fails the test with a descriptive message when the
   * resource does not exist.
   *
   * @param name
   *          resource name as accepted by {@link Class#getResourceAsStream}
   * @return a buffered reader the caller must close
   */
  private BufferedReader openResource(String name) {
    InputStream stream = getClass().getResourceAsStream(name);
    Assert.assertNotNull("test resource not found: " + name, stream);
    return new BufferedReader(
        new InputStreamReader(stream, StandardCharsets.UTF_8));
  }
}