blob: 0cfb26369b345775f6b019242309e77c189d4718 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.metadata;
import org.apache.hadoop.io.Text;
/**
* A collection of Nutch internal metadata constants.
*
* @author Chris Mattmann
* @author Jérôme Charron
*/
public interface Nutch {
public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
public static final String SIGNATURE_KEY = "nutch.content.digest";
public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
public static final String SCORE_KEY = "nutch.crawl.score";
public static final String GENERATE_TIME_KEY = "_ngt_";
public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
GENERATE_TIME_KEY);
public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
public static final String PROTO_STATUS_KEY = "_pst_";
public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
PROTO_STATUS_KEY);
public static final String FETCH_TIME_KEY = "_ftk_";
public static final String FETCH_STATUS_KEY = "_fst_";
/**
* Name to store the <a href="https://www.robotstxt.org/meta.html">robots
* metatag</a> in {@link org.apache.nutch.parse.ParseData}'s metadata.
*/
public static final String ROBOTS_METATAG = "robots";
/**
* Sites may request that search engines don't provide access to cached
* documents.
*/
public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
/** Show both original forbidden content and summaries (default). */
public static final String CACHING_FORBIDDEN_NONE = "none";
/** Don't show either original forbidden content or summaries. */
public static final String CACHING_FORBIDDEN_ALL = "all";
/** Don't show original forbidden content, but show summaries. */
public static final String CACHING_FORBIDDEN_CONTENT = "content";
public static final String REPR_URL_KEY = "_repr_";
public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
public static final String FIXED_INTERVAL_KEY = "fixedInterval";
public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
FIXED_INTERVAL_KEY);
/** For progress of job. Used by the Nutch REST service */
public static final String STAT_PROGRESS = "progress";
/**Used by Nutch REST service */
public static final String CRAWL_ID_KEY = "storage.crawl.id";
/** Argument key to specify location of the seed url dir for the REST endpoints **/
public static final String ARG_SEEDDIR = "url_dir";
/** Argument key to specify name of a seed list for the REST endpoints **/
public static final String ARG_SEEDNAME = "seedName";
/** Argument key to specify the location of crawldb for the REST endpoints **/
public static final String ARG_CRAWLDB = "crawldb";
/** Argument key to specify the location of linkdb for the REST endpoints **/
public static final String ARG_LINKDB = "linkdb";
/** Name of the key used in the Result Map sent back by the REST endpoint **/
public static final String VAL_RESULT = "result";
/** Argument key to specify the location of a directory of segments for the REST endpoints.
* Similar to the -dir command in the bin/nutch script **/
public static final String ARG_SEGMENTDIR = "segment_dir";
/** Argument key to specify the location of individual segment or list of segments for the REST endpoints. The behavior differs for diffirent endpoints: CrawlDb, LinkDb and Indexing Jobs take list of segments, Fetcher and Parse segment take one segment **/
public static final String ARG_SEGMENTS = "segment";
/** Argument key to specify the location of hostdb for the REST endpoints **/
public static final String ARG_HOSTDB = "hostdb";
/** Title key in the Pub/Sub event metadata for the title of the parsed page*/
public static final String FETCH_EVENT_TITLE = "title";
/** Content-type key in the Pub/Sub event metadata for the content-type of the parsed page*/
public static final String FETCH_EVENT_CONTENTTYPE = "content-type";
/** Score key in the Pub/Sub event metadata for the score of the parsed page*/
public static final String FETCH_EVENT_SCORE = "score";
/** Fetch time key in the Pub/Sub event metadata for the fetch time of the parsed page*/
public static final String FETCH_EVENT_FETCHTIME = "fetchTime";
/** Content-lanueage key in the Pub/Sub event metadata for the content-language of the parsed page*/
public static final String FETCH_EVENT_CONTENTLANG = "content-language";
}