| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.util; |
| |
| import java.io.UnsupportedEncodingException; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.nutch.metadata.Metadata; |
| import org.apache.nutch.net.protocols.Response; |
| import org.apache.nutch.protocol.Content; |
| import org.junit.Assert; |
| import org.junit.Test; |
| |
| public class TestEncodingDetector { |
| private static Configuration conf = NutchConfiguration.create(); |
| |
| private static byte[] contentInOctets; |
| |
| static { |
| try { |
| contentInOctets = "çñôöøДЛжҶ".getBytes("utf-8"); |
| } catch (UnsupportedEncodingException e) { |
| // not possible |
| } |
| } |
| |
| @Test |
| public void testGuessing() { |
| // first disable auto detection |
| conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1); |
| |
| Metadata metadata = new Metadata(); |
| EncodingDetector detector; |
| Content content; |
| String encoding; |
| |
| content = new Content("http://www.example.com", "http://www.example.com/", |
| contentInOctets, "text/plain", metadata, conf); |
| detector = new EncodingDetector(conf); |
| detector.autoDetectClues(content, true); |
| encoding = detector.guessEncoding(content, "windows-1252"); |
| // no information is available, so it should return default encoding |
| Assert.assertEquals("windows-1252", encoding.toLowerCase()); |
| |
| metadata.clear(); |
| metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16"); |
| content = new Content("http://www.example.com", "http://www.example.com/", |
| contentInOctets, "text/plain", metadata, conf); |
| detector = new EncodingDetector(conf); |
| detector.autoDetectClues(content, true); |
| encoding = detector.guessEncoding(content, "windows-1252"); |
| Assert.assertEquals("utf-16", encoding.toLowerCase()); |
| |
| metadata.clear(); |
| content = new Content("http://www.example.com", "http://www.example.com/", |
| contentInOctets, "text/plain", metadata, conf); |
| detector = new EncodingDetector(conf); |
| detector.autoDetectClues(content, true); |
| detector.addClue("windows-1254", "sniffed"); |
| encoding = detector.guessEncoding(content, "windows-1252"); |
| Assert.assertEquals("windows-1254", encoding.toLowerCase()); |
| |
| // enable autodetection |
| conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50); |
| metadata.clear(); |
| metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16"); |
| content = new Content("http://www.example.com", "http://www.example.com/", |
| contentInOctets, "text/plain", metadata, conf); |
| detector = new EncodingDetector(conf); |
| detector.autoDetectClues(content, true); |
| detector.addClue("utf-32", "sniffed"); |
| encoding = detector.guessEncoding(content, "windows-1252"); |
| Assert.assertEquals("utf-8", encoding.toLowerCase()); |
| } |
| |
| } |