/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package org.apache.nutch.net; |
| |
| import org.apache.nutch.plugin.Extension; |
| import org.apache.nutch.plugin.ExtensionPoint; |
| import org.apache.nutch.plugin.PluginRepository; |
| |
| import org.apache.hadoop.conf.Configuration; |
| |
| import org.apache.nutch.util.NutchConfiguration; |
| |
| import java.io.BufferedReader; |
| import java.io.InputStreamReader; |
| |
| /** |
| * Checks one given normalizer or all normalizers. |
| */ |
| public class URLNormalizerChecker { |
| |
| private Configuration conf; |
| |
| public URLNormalizerChecker(Configuration conf) { |
| this.conf = conf; |
| } |
| |
| private void checkOne(String normalizerName, String scope) throws Exception { |
| URLNormalizer normalizer = null; |
| |
| ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( |
| URLNormalizer.X_POINT_ID); |
| |
| if (point == null) |
| throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found."); |
| |
| Extension[] extensions = point.getExtensions(); |
| |
| for (int i = 0; i < extensions.length; i++) { |
| Extension extension = extensions[i]; |
| normalizer = (URLNormalizer) extension.getExtensionInstance(); |
| if (normalizer.getClass().getName().equals(normalizerName)) { |
| break; |
| } else { |
| normalizer = null; |
| } |
| } |
| |
| if (normalizer == null) |
| throw new RuntimeException("URLNormalizer " + normalizerName |
| + " not found."); |
| |
| System.out.println("Checking URLNormalizer " + normalizerName); |
| |
| BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); |
| String line; |
| while ((line = in.readLine()) != null) { |
| String out = normalizer.normalize(line, scope); |
| System.out.println(out); |
| } |
| } |
| |
| private void checkAll(String scope) throws Exception { |
| System.out.println("Checking combination of all URLNormalizers available"); |
| |
| BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); |
| String line; |
| URLNormalizers normalizers = new URLNormalizers(conf, scope); |
| while ((line = in.readLine()) != null) { |
| String out = normalizers.normalize(line, scope); |
| System.out.println(out); |
| } |
| } |
| |
| public static void main(String[] args) throws Exception { |
| |
| String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]" |
| + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"; |
| |
| String normalizerName = null; |
| String scope = URLNormalizers.SCOPE_DEFAULT; |
| for (int i = 0; i < args.length; i++) { |
| if (args[i].equals("-normalizer")) { |
| normalizerName = args[++i]; |
| } else if (args[i].equals("-scope")) { |
| scope = args[++i]; |
| } else { |
| throw new IllegalArgumentException(usage); |
| } |
| } |
| |
| URLNormalizerChecker checker = new URLNormalizerChecker( |
| NutchConfiguration.create()); |
| if (normalizerName != null) { |
| checker.checkOne(normalizerName, scope); |
| } else { |
| checker.checkAll(scope); |
| } |
| } |
| } |