blob: 695a0fac0cdd2e218a16585a5999fb7b88b1712b [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.net;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import java.io.BufferedReader;
import java.io.InputStreamReader;
/**
 * Command-line tool that checks one given normalizer or all normalizers.
 * <p>
 * URLs are read line by line from standard input; each line is passed through
 * the selected normalizer (or the combination of all normalizers) for the
 * requested scope and the result is printed to standard output.
 */
public class URLNormalizerChecker {

  /** Configuration used to look up plugins and to build the normalizer chain. */
  private final Configuration conf;

  /**
   * Creates a checker bound to the given configuration.
   *
   * @param conf
   *          configuration handed to the plugin repository and to
   *          {@link URLNormalizers}
   */
  public URLNormalizerChecker(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Normalizes every line read from standard input with the single normalizer
   * whose implementation class name equals {@code normalizerName}.
   *
   * @param normalizerName
   *          fully qualified class name of the normalizer to check
   * @param scope
   *          scope passed to the normalizer for every input line
   * @throws RuntimeException
   *           if the URLNormalizer extension point or the named normalizer
   *           cannot be found
   * @throws Exception
   *           on errors while instantiating extensions, reading input or
   *           normalizing
   */
  private void checkOne(String normalizerName, String scope) throws Exception {
    URLNormalizer normalizer = findNormalizer(normalizerName);
    if (normalizer == null) {
      throw new RuntimeException("URLNormalizer " + normalizerName
          + " not found.");
    }

    System.out.println("Checking URLNormalizer " + normalizerName);

    // NOTE(review): the reader decodes stdin with the platform default
    // charset; non-ASCII URLs therefore depend on the JVM's environment.
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null) {
      System.out.println(normalizer.normalize(line, scope));
    }
  }

  /**
   * Looks up the registered URLNormalizer extension whose instance class name
   * equals {@code normalizerName}.
   * <p>
   * Extensions are instantiated in the order returned by the extension point
   * and the search stops at the first match.
   *
   * @param normalizerName
   *          fully qualified class name to look for
   * @return the matching normalizer instance, or {@code null} if none matches
   * @throws RuntimeException
   *           if the URLNormalizer extension point is not registered
   * @throws Exception
   *           if an extension instance cannot be obtained
   */
  private URLNormalizer findNormalizer(String normalizerName) throws Exception {
    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
        URLNormalizer.X_POINT_ID);
    if (point == null) {
      throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found.");
    }

    for (Extension extension : point.getExtensions()) {
      URLNormalizer candidate = (URLNormalizer) extension
          .getExtensionInstance();
      if (candidate.getClass().getName().equals(normalizerName)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Normalizes every line read from standard input with the combination of
   * all normalizers built by {@link URLNormalizers} for the given scope.
   *
   * @param scope
   *          scope used both to build the normalizer chain and for every
   *          normalize call
   * @throws Exception
   *           on errors while reading input or normalizing
   */
  private void checkAll(String scope) throws Exception {
    System.out.println("Checking combination of all URLNormalizers available");

    // NOTE(review): platform default charset, see checkOne.
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    URLNormalizers normalizers = new URLNormalizers(conf, scope);
    String line;
    while ((line = in.readLine()) != null) {
      System.out.println(normalizers.normalize(line, scope));
    }
  }

  /**
   * Returns the value that follows the option at {@code optionIndex}.
   *
   * @param args
   *          command-line arguments
   * @param optionIndex
   *          index of the option flag whose value is requested
   * @param usage
   *          usage text used as the exception message
   * @return the argument directly after the option flag
   * @throws IllegalArgumentException
   *           if the option flag is the last argument and thus has no value
   */
  private static String optionValue(String[] args, int optionIndex,
      String usage) {
    int valueIndex = optionIndex + 1;
    if (valueIndex >= args.length) {
      // A trailing flag without a value is a usage error, not an
      // ArrayIndexOutOfBoundsException.
      throw new IllegalArgumentException(usage);
    }
    return args[valueIndex];
  }

  /**
   * Entry point. Parses {@code -normalizer <normalizerName>} and
   * {@code -scope <scope>}, then checks either the named normalizer or, when
   * no name is given, the combination of all normalizers.
   *
   * @param args
   *          command-line arguments
   * @throws IllegalArgumentException
   *           if an unknown argument is given or an option lacks its value
   * @throws Exception
   *           on any error raised while checking
   */
  public static void main(String[] args) throws Exception {
    String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
        + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";

    String normalizerName = null;
    String scope = URLNormalizers.SCOPE_DEFAULT;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-normalizer")) {
        normalizerName = optionValue(args, i, usage);
        i++; // skip the consumed value
      } else if (args[i].equals("-scope")) {
        scope = optionValue(args, i, usage);
        i++; // skip the consumed value
      } else {
        throw new IllegalArgumentException(usage);
      }
    }

    URLNormalizerChecker checker = new URLNormalizerChecker(
        NutchConfiguration.create());
    if (normalizerName != null) {
      checker.checkOne(normalizerName, scope);
    } else {
      checker.checkAll(scope);
    }
  }
}