blob: 8697a62f08cc27955a91c4961be065d7f6559b28 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.junit.Assert;
import org.junit.Test;
/**
 * Unit test for {@link EncodingDetector}.
 *
 * <p>The same UTF-8 encoded byte sequence is wrapped in a {@link Content} several times while
 * the available clues are varied (HTTP {@code Content-Type} header, explicitly added clues and
 * the minimum-confidence setting), and the charset name returned by
 * {@code guessEncoding} is checked for each combination.
 */
public class TestEncodingDetector {

  private static final String URL = "http://www.example.com";
  private static final String BASE = "http://www.example.com/";
  private static final String CONTENT_TYPE = "text/plain";
  private static final String DEFAULT_ENCODING = "windows-1252";

  private static final Configuration conf = NutchConfiguration.create();

  // StandardCharsets.UTF_8 cannot fail, unlike getBytes("utf-8"), so no checked exception has
  // to be swallowed and the array can never be left null.
  private static final byte[] contentInOctets =
      "çñôöøДЛжҶ".getBytes(StandardCharsets.UTF_8);

  @Test
  public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

    Metadata metadata = new Metadata();

    // no information is available, so it should return default encoding
    Assert.assertEquals("windows-1252", guess(metadata, null));

    // charset declared in the Content-Type header
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    Assert.assertEquals("utf-16", guess(metadata, null));

    // no header, but an explicitly added clue
    metadata.clear();
    Assert.assertEquals("windows-1254", guess(metadata, "windows-1254"));

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    // The expected result is utf-8 even though the header says UTF-16 and a utf-32 clue is
    // added. NOTE(review): presumably the auto-detected charset of the UTF-8 bytes takes
    // precedence here -- confirm against EncodingDetector.
    Assert.assertEquals("utf-8", guess(metadata, "utf-32"));
  }

  /**
   * Builds a {@link Content} around the shared test bytes, runs a fresh {@link EncodingDetector}
   * over it and returns the guessed encoding in lower case.
   *
   * <p>The detector is created on every call so that it is constructed after any change made
   * to {@link #conf} by the caller.
   *
   * @param metadata response metadata handed to the {@link Content}
   * @param sniffedClue charset name to add as a clue with source {@code "sniffed"} after
   *     {@code autoDetectClues} has run, or {@code null} to add no clue
   * @return the guessed encoding, lower-cased with {@link Locale#ROOT} so the comparison does
   *     not depend on the JVM's default locale
   */
  private static String guess(Metadata metadata, String sniffedClue) {
    Content content = new Content(URL, BASE, contentInOctets, CONTENT_TYPE, metadata, conf);
    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    if (sniffedClue != null) {
      detector.addClue(sniffedClue, "sniffed");
    }
    String encoding = detector.guessEncoding(content, DEFAULT_ENCODING);
    return encoding.toLowerCase(Locale.ROOT);
  }
}