| <?xml version="1.0" encoding="UTF-8" ?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or |
| more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information regarding |
| copyright ownership. The ASF licenses this file to You under the |
| Apache License, Version 2.0 (the "License"); you may not use |
| this file except in compliance with the License. You may obtain |
| a copy of the License at |
| http://www.apache.org/licenses/LICENSE-2.0 Unless required by |
| applicable law or agreed to in writing, software distributed |
| under the License is distributed on an "AS IS" BASIS, WITHOUT |
| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions |
| and limitations under the License. |
| --> |
| <!-- |
| Description: This document contains the Solr 3.1 schema definition to |
| be used with the Solr integration currently built into Nutch. See |
| https://issues.apache.org/jira/browse/NUTCH-442 |
| https://issues.apache.org/jira/browse/NUTCH-699 |
| https://issues.apache.org/jira/browse/NUTCH-994 |
| https://issues.apache.org/jira/browse/NUTCH-997 |
| https://issues.apache.org/jira/browse/NUTCH-1058 |
| https://issues.apache.org/jira/browse/NUTCH-1232 |
| and |
| http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/ |
| example/solr/conf/schema.xml?view=markup |
| for more info. |
| --> |
| <schema name="nutch" version="1.5"> |
| <types> |
| <!-- Primitive types. "int" and "tdouble" are declared because fields in |
| this schema reference them: the integer index-geoip fields use "int" and |
| the *_coordinate dynamic field uses "tdouble". --> |
| <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> |
| <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> |
| <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> |
| <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> |
| <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> |
| <fieldType name="date" class="solr.TrieDateField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> |
| <!-- Latitude/longitude point type. Its sub-fields are named with the |
| "_coordinate" suffix, so a matching *_coordinate dynamicField is required. --> |
| <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> |
| |
| <!-- General full-text type: whitespace tokenization, stop-word removal, |
| word-delimiter splitting, lower-casing, English stemming and removal of |
| duplicate tokens. --> |
| <fieldType name="text" class="solr.TextField" positionIncrementGap="100"> |
| <analyzer> |
| <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
| <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> |
| <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <!-- SnowballPorterFilterFactory with language="English" replaces |
| EnglishPorterFilterFactory, which was deprecated in Solr 3.1 and is no |
| longer available in Solr 4.x. --> |
| <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> |
| <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| <!-- URL type: standard tokenization, lower-casing, then splitting into |
| word and number parts. --> |
| <fieldType name="url" class="solr.TextField" positionIncrementGap="100"> |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/> |
| </analyzer> |
| </fieldType> |
| <!-- boolean type: "true" or "false" --> |
| <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> |
| |
| <!-- The optional sortMissingLast and sortMissingFirst attributes are |
| currently supported on types that are sorted internally as strings |
| and on numeric types. |
| This includes "string", "boolean", and, as of 3.5 (and 4.x), |
| int, float, long, date, double, including the "Trie" variants. |
| - If sortMissingLast="true", then a sort on this field will cause documents |
| without the field to come after documents with the field, |
| regardless of the requested sort order (asc or desc). |
| - If sortMissingFirst="true", then a sort on this field will cause documents |
| without the field to come before documents with the field, |
| regardless of the requested sort order. |
| - If sortMissingLast="false" and sortMissingFirst="false" (the default), |
| then default lucene sorting will be used which places docs without the |
| field first in an ascending sort and last in a descending sort. |
| --> |
| </types> |
| <fields> |
| <!-- Unique document key (see uniqueKey below); mandatory on every document. --> |
| <field name="id" type="string" stored="true" indexed="true" required="true"/> |
| |
| <!-- Catch-all search field, populated through the copyField directives |
| at the end of this schema; indexed only, never stored. --> |
| <field name="text" type="text" stored="false" indexed="true" multiValued="true"/> |
| |
| <!-- core fields --> |
| <field name="segment" type="string" stored="true" indexed="false"/> |
| <field name="digest" type="string" stored="true" indexed="false"/> |
| <field name="boost" type="float" stored="true" indexed="false"/> |
| |
| <!-- fields for index-basic plugin --> |
| <field name="host" type="string" stored="false" indexed="true"/> |
| <field name="url" type="url" stored="true" indexed="true"/> |
| <field name="content" type="text" stored="false" indexed="true"/> |
| <field name="title" type="text" stored="true" indexed="true"/> |
| <field name="cache" type="string" stored="true" indexed="false"/> |
| <field name="tstamp" type="date" stored="true" indexed="false"/> |
| |
| <!-- fields for index-geoip plugin --> |
| <field name="ip" type="string" stored="true" indexed="true"/> |
| <field name="cityName" type="string" stored="true" indexed="true"/> |
| <field name="cityConfidence" type="int" stored="true" indexed="true"/> |
| <field name="cityGeoNameId" type="int" stored="true" indexed="true"/> |
| <field name="continentCode" type="string" stored="true" indexed="true"/> |
| <field name="continentGeoNameId" type="int" stored="true" indexed="true"/> |
| <!-- NOTE(review): "contentName" sits between the continent fields and may |
| have been meant as "continentName". It is left unchanged because the name |
| must match what the indexer sends; confirm against the index-geoip plugin. --> |
| <field name="contentName" type="string" stored="true" indexed="true"/> |
| <field name="countryIsoCode" type="string" stored="true" indexed="true"/> |
| <field name="countryName" type="string" stored="true" indexed="true"/> |
| <field name="countryConfidence" type="int" stored="true" indexed="true"/> |
| <field name="countryGeoNameId" type="int" stored="true" indexed="true"/> |
| <field name="latLon" type="string" stored="true" indexed="true"/> |
| <!-- Destination of the latLon copyField at the end of this schema. A |
| copyField destination has to be a declared field; "location" was previously |
| only a fieldType name. The raw value is already stored in latLon, so this |
| field is indexed only. |
| NOTE(review): solr.LatLonType expects "lat,lon" values; confirm that the |
| index-geoip plugin emits latLon in that form. --> |
| <field name="location" type="location" stored="false" indexed="true"/> |
| <field name="accRadius" type="int" stored="true" indexed="true"/> |
| <field name="timeZone" type="string" stored="true" indexed="true"/> |
| <field name="metroCode" type="int" stored="true" indexed="true"/> |
| <field name="postalCode" type="string" stored="true" indexed="true"/> |
| <field name="postalConfidence" type="int" stored="true" indexed="true"/> |
| <field name="countryType" type="string" stored="true" indexed="true"/> |
| <field name="subDivName" type="string" stored="true" indexed="true"/> |
| <field name="subDivIsoCode" type="string" stored="true" indexed="true"/> |
| <field name="subDivConfidence" type="int" stored="true" indexed="true"/> |
| <field name="subDivGeoNameId" type="int" stored="true" indexed="true"/> |
| <field name="autonSystemNum" type="int" stored="true" indexed="true"/> |
| <field name="autonSystemOrg" type="string" stored="true" indexed="true"/> |
| <field name="domain" type="string" stored="true" indexed="true"/> |
| <field name="isp" type="string" stored="true" indexed="true"/> |
| <field name="org" type="string" stored="true" indexed="true"/> |
| <field name="userType" type="string" stored="true" indexed="true"/> |
| <field name="isAnonProxy" type="boolean" stored="true" indexed="true"/> |
| <!-- NOTE(review): spelled "Satelitte"; kept as is because the name must |
| match what the indexer sends. Confirm against the index-geoip plugin. --> |
| <field name="isSatelitteProv" type="boolean" stored="true" indexed="true"/> |
| <field name="connType" type="string" stored="true" indexed="true"/> |
| |
| <!-- Sub-fields created by the "location" field type |
| (subFieldSuffix="_coordinate"). --> |
| <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/> |
| |
| <!-- fields for index-anchor plugin --> |
| <field name="anchor" type="string" stored="true" indexed="true" multiValued="true"/> |
| |
| <!-- fields for index-more plugin --> |
| <field name="type" type="string" stored="true" indexed="true" multiValued="true"/> |
| <field name="contentLength" type="long" stored="true" indexed="false"/> |
| <field name="lastModified" type="date" stored="true" indexed="false"/> |
| <field name="date" type="date" stored="true" indexed="true"/> |
| |
| <!-- fields for languageidentifier plugin --> |
| <field name="lang" type="string" stored="true" indexed="true"/> |
| |
| <!-- fields for subcollection plugin --> |
| <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/> |
| |
| <!-- fields for feed plugin (tag is also used by microformats-reltag)--> |
| <field name="author" type="string" stored="true" indexed="true"/> |
| <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/> |
| <field name="feed" type="string" stored="true" indexed="true"/> |
| <field name="publishedDate" type="date" stored="true" indexed="true"/> |
| <field name="updatedDate" type="date" stored="true" indexed="true"/> |
| |
| <!-- fields for creativecommons plugin --> |
| <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/> |
| |
| <!-- fields for tld plugin (declared neither stored nor indexed) --> |
| <field name="tld" type="string" stored="false" indexed="false"/> |
| |
| <!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler --> |
| <field name="_version_" type="long" indexed="true" stored="true"/> |
| |
| </fields> |
| <!-- Field whose value uniquely identifies a document. --> |
| <uniqueKey>id</uniqueKey> |
| |
| <!-- Query defaults: the field searched when a query names none, and the |
| operator applied between terms when a query specifies none. --> |
| <defaultSearchField>content</defaultSearchField> |
| <solrQueryParser defaultOperator="OR"/> |
| |
| <!-- copyField directives duplicate a field's value into another field when |
| a document is added. They are used to index one value in several ways, or |
| to gather several fields into a single field for easier/faster searching. --> |
| |
| <!-- Gather the searchable fields into the catch-all "text" field. --> |
| <copyField source="content" dest="text"/> |
| <copyField source="url" dest="text"/> |
| <copyField source="title" dest="text"/> |
| <copyField source="anchor" dest="text"/> |
| <copyField source="author" dest="text"/> |
| |
| <!-- Index the latLon value a second time under "location". |
| NOTE(review): a copyField destination must be a declared field or match a |
| dynamicField; verify that "location" is declared inside the fields section. --> |
| <copyField source="latLon" dest="location"/> |
| |
| </schema> |