conf/schema.xml - nutch - Git at Google

 <?xml version="1.0" encoding="UTF-8" ?>
 <!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements.  See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
 <!--
     Description: This document contains the Solr 4.x schema definition to
     be used with the Solr integration currently built into Nutch.
     This schema is not minimal, there are some useful field type definitions left,
     and the set of fields and their flags (indexed/stored/term vectors) can be
     further optimized depending on needs.  See
     http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
     for more info.
 -->

 <schema name="nutch" version="1.5">

   <types>

     <!-- string: solr.StrField values are not analyzed; they are indexed/stored verbatim. -->
     <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

     <!-- binary: solr.BinaryField type. Declared with the camelCase <fieldType>
          tag, the spelling used by the majority of type declarations in this file. -->
     <fieldType name="binary" class="solr.BinaryField"/>

     <!--
       Plain numeric types, declared with precisionStep="0". When range-query
       speed matters, consider the tint/tfloat/tlong/tdouble types instead.
     -->
     <fieldType name="int" class="solr.TrieIntField"
                precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="float" class="solr.TrieFloatField"
                precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="long" class="solr.TrieLongField"
                precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="double" class="solr.TrieDoubleField"
                precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

     <!--
       Numeric types (precisionStep="8") that index every value at several
       levels of precision. This accelerates range queries when many values lie
       between the range endpoints. See the javadoc for NumericRangeQuery for
       internal implementation details.

       precisionStep is specified in bits: smaller values produce more indexed
       tokens per value, a slightly larger index, and faster range queries.
       A precisionStep of 0 disables indexing at different precision levels.
     -->
     <fieldType name="tint" class="solr.TrieIntField"
                precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="tfloat" class="solr.TrieFloatField"
                precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="tlong" class="solr.TrieLongField"
                precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="tdouble" class="solr.TrieDoubleField"
                precisionStep="8" omitNorms="true" positionIncrementGap="0"/>

     <!-- Date values use the form 1995-12-31T23:59:59Z, which is a more
          restricted form of the canonical dateTime representation described at
          http://www.w3.org/TR/xmlschema-2/#dateTime
          The trailing "Z" designates UTC time and is mandatory.
          Fractional seconds are optional, e.g. 1995-12-31T23:59:59.999Z
          Every other component is mandatory.

          A value may also be written as an expression that is evaluated
          relative to "NOW", e.g.

                NOW/HOUR
                   ... Round to the start of the current hour
                NOW-1DAY
                   ... Exactly 1 day prior to now
                NOW/DAY+6MONTHS+3DAYS
                   ... 6 months and 3 days in the future from the start of
                       the current day

          Consult the DateField javadocs for more information.

          Note: For faster range queries, consider the tdate type
       -->
     <fieldType name="date" class="solr.TrieDateField"
                precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

     <!-- tdate: Trie based date type (precisionStep="6") for faster date range
          queries and date faceting. -->
     <fieldType name="tdate" class="solr.TrieDateField"
                precisionStep="6" omitNorms="true" positionIncrementGap="0"/>


     <!-- solr.TextField allows the specification of custom text analyzers
          specified as a tokenizer and a list of token filters. Different
          analyzers may be specified for indexing and querying.

          The optional positionIncrementGap puts space between multiple fields of
          this type on the same document, with the purpose of preventing false phrase
          matching across fields.

          For more info on customizing your analyzer chain, please see
          http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
      -->

     <!-- text_general: generic, cross-language text analysis. Both analyzers
          tokenize with StandardTokenizer, drop the stop words listed in
          "stopwords.txt" (matched case-insensitively; the list is empty by
          default) and lower-case the tokens. Synonyms from "synonyms.txt" are
          applied by the query analyzer only. -->
     <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
       <analyzer type="index">
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <!-- Synonyms are only applied at query time in this example. Index-time alternative:
         <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
         -->
         <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <filter class="solr.SynonymFilterFactory"
                 synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
         <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
     </fieldType>

     <!-- text_en: text analysis with defaults appropriate for English.
          Tokenizes with StandardTokenizer, removes stop words (stopwords.txt),
          lower-cases, strips English possessives, protects the words listed in
          protwords.txt, and finally applies Porter stemming. The query
          analyzer additionally applies synonyms from synonyms.txt. -->
     <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
       <analyzer type="index">
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <!-- Synonyms are only applied at query time in this example. Index-time alternative:
         <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
         -->
         <!-- Case-insensitive stop word removal. enablePositionIncrements=true
              is set in both the index and query analyzers to leave a 'gap'
              for more accurate phrase queries. -->
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.EnglishPossessiveFilterFactory"/>
         <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <!-- A less aggressive stemmer may optionally replace PorterStemFilterFactory:
         <filter class="solr.EnglishMinimalStemFilterFactory"/>
         -->
         <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory"
                 synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.EnglishPossessiveFilterFactory"/>
         <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <!-- A less aggressive stemmer may optionally replace PorterStemFilterFactory:
         <filter class="solr.EnglishMinimalStemFilterFactory"/>
         -->
         <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldType>

     <!-- text_en_splitting: English text with aggressive word-splitting and
          autophrase features enabled (autoGeneratePhraseQueries="true").
          It tokenizes on whitespace and runs WordDelimiterFilter so that
          words are split and matched on case-change, alpha numeric
          boundaries, and non-alphanumeric chars; stop words, lower-casing,
          protected words and Porter stemming are applied as well.
          Certain compound word cases therefore work: the query "wi fi"
          matches the documents "WiFi" and "wi-fi". Other cases still do not
          match, for example the query "wifi" against the document "wi fi",
          or the query "wi-fi" against the document "wifi".
        -->
     <fieldType name="text_en_splitting" class="solr.TextField"
                positionIncrementGap="100" autoGeneratePhraseQueries="true">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <!-- Synonyms are only applied at query time in this example. Index-time alternative:
         <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
         -->
         <!-- Case-insensitive stop word removal. enablePositionIncrements=true
              is set in both the index and query analyzers to leave a 'gap'
              for more accurate phrase queries. -->
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <!-- index side: word and number parts are generated AND catenated -->
         <filter class="solr.WordDelimiterFilterFactory"
                 generateWordParts="1" generateNumberParts="1"
                 catenateWords="1" catenateNumbers="1" catenateAll="0"
                 splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory"
                 synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <!-- query side: parts are generated but nothing is catenated -->
         <filter class="solr.WordDelimiterFilterFactory"
                 generateWordParts="1" generateNumberParts="1"
                 catenateWords="0" catenateNumbers="0" catenateAll="0"
                 splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <filter class="solr.PorterStemFilterFactory"/>
       </analyzer>
     </fieldType>

     <!-- text_en_splitting_tight: less flexible matching, but fewer false
          matches. Probably not ideal for product names, but may be good for
          SKUs. Dashes can be inserted in the wrong place and still match.
          A single analyzer is used for both indexing and querying. -->
     <fieldType name="text_en_splitting_tight" class="solr.TextField"
                positionIncrementGap="100" autoGeneratePhraseQueries="true">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory"
                 synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
         <filter class="solr.WordDelimiterFilterFactory"
                 generateWordParts="0" generateNumberParts="0"
                 catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <filter class="solr.EnglishMinimalStemFilterFactory"/>
         <!-- Removes duplicate tokens that appear at the same position, which is
              sometimes possible with WordDelimiterFilter in conjunction with stemming. -->
         <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
       </analyzer>
     </fieldType>

     <!-- text_general_rev: like text_general, except that the index analyzer
          also reverses the characters of each token (ReversedWildcardFilter,
          withOriginal="true") to enable more efficient leading wildcard queries. -->
     <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
       <analyzer type="index">
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.ReversedWildcardFilterFactory"
                 withOriginal="true"
                 maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory"
                 synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
         <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
     </fieldType>

     <!-- phonetic: StandardTokenizer followed by DoubleMetaphoneFilter
          (inject="false"). Fields of this type are indexed but not stored by
          default. Declared with the camelCase <fieldType> tag, the spelling
          used by the majority of type declarations in this file. -->
     <fieldType name="phonetic" class="solr.TextField" indexed="true" stored="false">
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
       </analyzer>
     </fieldType>

     <!-- payloads: whitespace-tokenized text whose tokens may carry a payload.
          Fields of this type are indexed but not stored by default. Declared
          with the camelCase <fieldType> tag, the spelling used by the majority
          of type declarations in this file. -->
     <fieldType name="payloads" class="solr.TextField" indexed="true" stored="false">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <!--
         The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
         a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
         Attributes of the DelimitedPayloadTokenFilterFactory :
           "delimiter" - a one character delimiter. Default is | (pipe)
           "encoder"   - how to encode the following value into a payload
               float    -> org.apache.lucene.analysis.payloads.FloatEncoder,
               integer  -> o.a.l.a.p.IntegerEncoder
               identity -> o.a.l.a.p.IdentityEncoder
               Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
         -->
         <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
       </analyzer>
     </fieldType>

     <!-- lowercase: KeywordTokenizer keeps the entire field value as a single
          token, which is then lower-cased. -->
     <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
       <analyzer>
         <tokenizer class="solr.KeywordTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
     </fieldType>

     <!-- url: StandardTokenizer, lower-casing, then WordDelimiterFilter
          generating word and number parts. Used by the "host" and "url" fields. -->
     <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
       <analyzer>
         <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.WordDelimiterFilterFactory"
                 generateWordParts="1" generateNumberParts="1"/>
       </analyzer>
     </fieldType>


     <!-- text_path: tokenized solely by PathHierarchyTokenizer; no token filters. -->
     <fieldType name="text_path"
                class="solr.TextField"
                positionIncrementGap="100">
       <analyzer>
         <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
       </analyzer>
     </fieldType>

     <!-- ignored: fields of this type are by default neither stored nor
          indexed, so any data added to them is ignored outright. Declared with
          the camelCase <fieldType> tag, the spelling used by the majority of
          type declarations in this file. -->
     <fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/>

  </types>

  <fields>
     <!-- _version_ is used internally by Solr, for example by the partial
          update functionality and the update log. It is NOT required when
          updateLog is turned off in the updateHandler; keeping it is
          nevertheless advised, because the performance gained by removing it
          is minimal. -->
     <field name="_version_" type="long" indexed="true" stored="true"/>

     <!-- document key (named by <uniqueKey>) -->
     <field name="id" type="string" indexed="true" stored="true" required="true"/>

     <!-- core fields: stored only, not searchable -->
     <field name="batchId" type="string" indexed="false" stored="true"/>
     <field name="digest" type="string" indexed="false" stored="true"/>
     <field name="boost" type="float" indexed="false" stored="true"/>

     <!-- fields for index-basic plugin -->
     <field name="host" type="url" indexed="true" stored="false"/>
     <field name="url" type="url" indexed="true" stored="true"/>
     <!-- stored=true for highlighting; use term vectors and positions for fast highlighting -->
     <field name="content" type="text_general" indexed="true" stored="true"/>
     <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/>
     <field name="cache" type="string" indexed="false" stored="true"/>
     <field name="tstamp" type="date" indexed="false" stored="true"/>

     <!-- catch-all field: indexed only, destination of the copyField directives -->
     <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

     <!-- fields for index-anchor plugin -->
     <field name="anchor" type="text_general" indexed="true" stored="true" multiValued="true"/>

     <!-- fields for index-more plugin -->
     <field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
     <field name="contentLength" type="string" indexed="false" stored="true"/>
     <field name="lastModified" type="date" indexed="false" stored="true"/>
     <field name="date" type="tdate" indexed="true" stored="true"/>

     <!-- fields for index-metadata plugin: any field whose name matches meta_* -->
     <dynamicField name="meta_*" type="string" indexed="true" stored="true"/>

     <!-- fields for languageidentifier plugin -->
     <field name="lang" type="string" indexed="true" stored="true"/>

     <!-- fields for subcollection plugin -->
     <field name="subcollection" type="string" indexed="true" stored="true" multiValued="true"/>

     <!-- fields for feed plugin (tag is also used by microformats-reltag) -->
     <field name="author" type="string" indexed="true" stored="true"/>
     <field name="tag" type="string" indexed="true" stored="true" multiValued="true"/>
     <field name="feed" type="string" indexed="true" stored="true"/>
     <field name="publishedDate" type="date" indexed="true" stored="true"/>
     <field name="updatedDate" type="date" indexed="true" stored="true"/>

     <!-- fields for creativecommons plugin -->
     <field name="cc" type="string" indexed="true" stored="true" multiValued="true"/>

     <!-- fields for tld plugin (declared as neither indexed nor stored) -->
     <field name="tld" type="string" indexed="false" stored="false"/>

     <!-- fields for index-html plugin
          Note: the raw document content may be binary, but index-html
                adds a String to the index field, hence type="string" -->
     <field name="rawcontent" type="string" indexed="false" stored="true"/>

  </fields>
  <!-- "id" is the unique key of a document. -->
  <uniqueKey>id</uniqueKey>
  <!-- Default search field is the catch-all "text" field; the default query
       operator is OR.
       NOTE(review): defaultSearchField and solrQueryParser/defaultOperator are
       reported as deprecated in later Solr releases (the "df" and "q.op"
       request parameters replace them); confirm against the target Solr version. -->
  <defaultSearchField>text</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>

   <!-- copyField directives copy one field into another when a document is
        added to the index. They are used either to index the same field in
        different ways, or to gather several fields into one field for
        easier/faster searching. Here all sources feed the catch-all "text" field. -->

  <copyField source="content" dest="text"/>
  <copyField source="url"     dest="text"/>
  <copyField source="title"   dest="text"/>
  <copyField source="anchor"  dest="text"/>
  <copyField source="author"  dest="text"/>

 </schema>
	<?xml version="1.0" encoding="UTF-8" ?>
	<!--
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	-->
	<!--
	Description: This document contains the Solr 4.x schema definition to
	be used with the Solr integration currently built into Nutch.
	This schema is not minimal, there are some useful field type definitions left,
	and the set of fields and their flags (indexed/stored/term vectors) can be
	further optimized depending on needs. See
	http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
	for more info.
	-->

	<schema name="nutch" version="1.5">

	<types>

	<!-- string: solr.StrField values are not analyzed; they are indexed/stored verbatim. -->
	<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

	<!-- binary: solr.BinaryField type. Declared with the camelCase <fieldType>
	tag, the spelling used by the majority of type declarations in this file. -->
	<fieldType name="binary" class="solr.BinaryField"/>

	<!--
	Plain numeric types, declared with precisionStep="0". When range-query
	speed matters, consider the tint/tfloat/tlong/tdouble types instead.
	-->
	<fieldType name="int" class="solr.TrieIntField"
		precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
	<fieldType name="float" class="solr.TrieFloatField"
		precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
	<fieldType name="long" class="solr.TrieLongField"
		precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
	<fieldType name="double" class="solr.TrieDoubleField"
		precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

	<!--
	Numeric types (precisionStep="8") that index every value at several
	levels of precision. This accelerates range queries when many values lie
	between the range endpoints. See the javadoc for NumericRangeQuery for
	internal implementation details.

	precisionStep is specified in bits: smaller values produce more indexed
	tokens per value, a slightly larger index, and faster range queries.
	A precisionStep of 0 disables indexing at different precision levels.
	-->
	<fieldType name="tint" class="solr.TrieIntField"
		precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
	<fieldType name="tfloat" class="solr.TrieFloatField"
		precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
	<fieldType name="tlong" class="solr.TrieLongField"
		precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
	<fieldType name="tdouble" class="solr.TrieDoubleField"
		precisionStep="8" omitNorms="true" positionIncrementGap="0"/>

	<!-- Date values use the form 1995-12-31T23:59:59Z, which is a more
	restricted form of the canonical dateTime representation described at
	http://www.w3.org/TR/xmlschema-2/#dateTime
	The trailing "Z" designates UTC time and is mandatory.
	Fractional seconds are optional, e.g. 1995-12-31T23:59:59.999Z
	Every other component is mandatory.

	A value may also be written as an expression that is evaluated
	relative to "NOW", e.g.

		NOW/HOUR
			... Round to the start of the current hour
		NOW-1DAY
			... Exactly 1 day prior to now
		NOW/DAY+6MONTHS+3DAYS
			... 6 months and 3 days in the future from the start of
			    the current day

	Consult the DateField javadocs for more information.

	Note: For faster range queries, consider the tdate type
	-->
	<fieldType name="date" class="solr.TrieDateField"
		precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

	<!-- tdate: Trie based date type (precisionStep="6") for faster date range
	queries and date faceting. -->
	<fieldType name="tdate" class="solr.TrieDateField"
		precisionStep="6" omitNorms="true" positionIncrementGap="0"/>


	<!-- solr.TextField allows the specification of custom text analyzers
	specified as a tokenizer and a list of token filters. Different
	analyzers may be specified for indexing and querying.

	The optional positionIncrementGap puts space between multiple fields of
	this type on the same document, with the purpose of preventing false phrase
	matching across fields.

	For more info on customizing your analyzer chain, please see
	http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
	-->

	<!-- text_general: generic, cross-language text analysis. Both analyzers
	tokenize with StandardTokenizer, drop the stop words listed in
	"stopwords.txt" (matched case-insensitively; the list is empty by
	default) and lower-case the tokens. Synonyms from "synonyms.txt" are
	applied by the query analyzer only. -->
	<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
		<analyzer type="index">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<!-- Synonyms are only applied at query time in this example. Index-time alternative:
			<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
			-->
			<filter class="solr.LowerCaseFilterFactory"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<filter class="solr.SynonymFilterFactory"
				synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
			<filter class="solr.LowerCaseFilterFactory"/>
		</analyzer>
	</fieldType>

	<!-- text_en: text analysis with defaults appropriate for English.
	Tokenizes with StandardTokenizer, removes stop words (stopwords.txt),
	lower-cases, strips English possessives, protects the words listed in
	protwords.txt, and finally applies Porter stemming. The query
	analyzer additionally applies synonyms from synonyms.txt. -->
	<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
		<analyzer type="index">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<!-- Synonyms are only applied at query time in this example. Index-time alternative:
			<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
			-->
			<!-- Case-insensitive stop word removal. enablePositionIncrements=true
			is set in both the index and query analyzers to leave a 'gap'
			for more accurate phrase queries. -->
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.EnglishPossessiveFilterFactory"/>
			<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
			<!-- A less aggressive stemmer may optionally replace PorterStemFilterFactory:
			<filter class="solr.EnglishMinimalStemFilterFactory"/>
			-->
			<filter class="solr.PorterStemFilterFactory"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.SynonymFilterFactory"
				synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.EnglishPossessiveFilterFactory"/>
			<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
			<!-- A less aggressive stemmer may optionally replace PorterStemFilterFactory:
			<filter class="solr.EnglishMinimalStemFilterFactory"/>
			-->
			<filter class="solr.PorterStemFilterFactory"/>
		</analyzer>
	</fieldType>

	<!-- text_en_splitting: English text with aggressive word-splitting and
	autophrase features enabled (autoGeneratePhraseQueries="true").
	It tokenizes on whitespace and runs WordDelimiterFilter so that
	words are split and matched on case-change, alpha numeric
	boundaries, and non-alphanumeric chars; stop words, lower-casing,
	protected words and Porter stemming are applied as well.
	Certain compound word cases therefore work: the query "wi fi"
	matches the documents "WiFi" and "wi-fi". Other cases still do not
	match, for example the query "wifi" against the document "wi fi",
	or the query "wi-fi" against the document "wifi".
	-->
	<fieldType name="text_en_splitting" class="solr.TextField"
		positionIncrementGap="100" autoGeneratePhraseQueries="true">
		<analyzer type="index">
			<tokenizer class="solr.WhitespaceTokenizerFactory"/>
			<!-- Synonyms are only applied at query time in this example. Index-time alternative:
			<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
			-->
			<!-- Case-insensitive stop word removal. enablePositionIncrements=true
			is set in both the index and query analyzers to leave a 'gap'
			for more accurate phrase queries. -->
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<!-- index side: word and number parts are generated AND catenated -->
			<filter class="solr.WordDelimiterFilterFactory"
				generateWordParts="1" generateNumberParts="1"
				catenateWords="1" catenateNumbers="1" catenateAll="0"
				splitOnCaseChange="1"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
			<filter class="solr.PorterStemFilterFactory"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="solr.WhitespaceTokenizerFactory"/>
			<filter class="solr.SynonymFilterFactory"
				synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<!-- query side: parts are generated but nothing is catenated -->
			<filter class="solr.WordDelimiterFilterFactory"
				generateWordParts="1" generateNumberParts="1"
				catenateWords="0" catenateNumbers="0" catenateAll="0"
				splitOnCaseChange="1"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
			<filter class="solr.PorterStemFilterFactory"/>
		</analyzer>
	</fieldType>

	<!-- text_en_splitting_tight: less flexible matching, but fewer false
	matches. Probably not ideal for product names, but may be good for
	SKUs. Dashes can be inserted in the wrong place and still match.
	A single analyzer is used for both indexing and querying. -->
	<fieldType name="text_en_splitting_tight" class="solr.TextField"
		positionIncrementGap="100" autoGeneratePhraseQueries="true">
		<analyzer>
			<tokenizer class="solr.WhitespaceTokenizerFactory"/>
			<filter class="solr.SynonymFilterFactory"
				synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
			<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
			<filter class="solr.WordDelimiterFilterFactory"
				generateWordParts="0" generateNumberParts="0"
				catenateWords="1" catenateNumbers="1" catenateAll="0"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
			<filter class="solr.EnglishMinimalStemFilterFactory"/>
			<!-- Removes duplicate tokens that appear at the same position, which is
			sometimes possible with WordDelimiterFilter in conjunction with stemming. -->
			<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
		</analyzer>
	</fieldType>

	<!-- text_general_rev: like text_general, except that the index analyzer
	also reverses the characters of each token (ReversedWildcardFilter,
	withOriginal="true") to enable more efficient leading wildcard queries. -->
	<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
		<analyzer type="index">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.ReversedWildcardFilterFactory"
				withOriginal="true"
				maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
		</analyzer>
		<analyzer type="query">
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.SynonymFilterFactory"
				synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
			<filter class="solr.StopFilterFactory"
				ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
			<filter class="solr.LowerCaseFilterFactory"/>
		</analyzer>
	</fieldType>

	<!-- Phonetic matching: tokens from the standard tokenizer are passed through
	DoubleMetaphoneFilterFactory. Indexed but not stored by default.
	NOTE(review): per the Solr filter documentation, inject="false" should make the
	phonetic code replace the original token instead of being added beside it; confirm
	against the Solr version in use. -->
	<fieldType name="phonetic" class="solr.TextField" indexed="true" stored="false">
		<analyzer>
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
		</analyzer>
	</fieldType>

	<!-- Whitespace-tokenized text whose tokens may carry a float payload.
	Indexed but not stored by default. -->
	<fieldType name="payloads" class="solr.TextField" indexed="true" stored="false">
		<analyzer>
			<tokenizer class="solr.WhitespaceTokenizerFactory"/>
			<!--
			The DelimitedPayloadTokenFilter can put payloads on tokens; for example,
			a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f.
			Attributes of the DelimitedPayloadTokenFilterFactory:
			"delimiter" - a one character delimiter. Default is | (pipe).
			"encoder" - how to encode the following value into a payload:
				float -> org.apache.lucene.analysis.payloads.FloatEncoder
				integer -> o.a.l.a.p.IntegerEncoder
				identity -> o.a.l.a.p.IdentityEncoder
				or the fully qualified class name of a PayloadEncoder implementation;
				the encoder must have a no-arg constructor.
			-->
			<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
		</analyzer>
	</fieldType>

	<!-- Keeps the whole field value as one token (KeywordTokenizer) and lowercases it. -->
	<fieldType name="lowercase" positionIncrementGap="100" class="solr.TextField">
		<analyzer>
			<tokenizer class="solr.KeywordTokenizerFactory"/>
			<filter class="solr.LowerCaseFilterFactory"/>
		</analyzer>
	</fieldType>

	<!-- Analysis for URL-like values: standard tokenization, lowercasing, then
	WordDelimiterFilter with word and number part generation enabled. -->
	<fieldType name="url" positionIncrementGap="100" class="solr.TextField">
		<analyzer>
			<tokenizer class="solr.StandardTokenizerFactory"/>
			<filter class="solr.LowerCaseFilterFactory"/>
			<filter class="solr.WordDelimiterFilterFactory"
				generateWordParts="1"
				generateNumberParts="1"/>
		</analyzer>
	</fieldType>


	<!-- Path-like values, tokenized by PathHierarchyTokenizerFactory with its default
	settings; no further filters are applied. -->
	<fieldType name="text_path" positionIncrementGap="100" class="solr.TextField">
		<analyzer>
			<tokenizer class="solr.PathHierarchyTokenizerFactory"/>
		</analyzer>
	</fieldType>

	<!-- Fields of this type are, by default, neither stored nor indexed,
	so any data added to them is ignored outright. -->
	<fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/>

	</types>

	<fields>
		<!-- Used internally by Solr, for example by partial updates and the update log.
		It is NOT required when updateLog is turned off in the updateHandler; keeping it
		is still advised, because the performance gain from dropping it is minimal. -->
		<field name="_version_" type="long" indexed="true" stored="true"/>

		<!-- Document identifier; every document must supply it. -->
		<field name="id" type="string" indexed="true" stored="true" required="true"/>

		<!-- core fields: stored for retrieval only, not searchable -->
		<field name="batchId" type="string" indexed="false" stored="true"/>
		<field name="digest" type="string" indexed="false" stored="true"/>
		<field name="boost" type="float" indexed="false" stored="true"/>

		<!-- fields for index-basic plugin -->
		<field name="host" type="url" indexed="true" stored="false"/>
		<field name="url" type="url" indexed="true" stored="true"/>
		<!-- stored="true" so the content can be highlighted; for faster highlighting,
		also enable term vectors and positions. -->
		<field name="content" type="text_general" indexed="true" stored="true"/>
		<field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/>
		<field name="cache" type="string" indexed="false" stored="true"/>
		<field name="tstamp" type="date" indexed="false" stored="true"/>

		<!-- catch-all field: searchable only, never returned -->
		<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

		<!-- fields for index-anchor plugin -->
		<field name="anchor" type="text_general" indexed="true" stored="true" multiValued="true"/>

		<!-- fields for index-more plugin -->
		<field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
		<field name="contentLength" type="string" indexed="false" stored="true"/>
		<field name="lastModified" type="date" indexed="false" stored="true"/>
		<field name="date" type="tdate" indexed="true" stored="true"/>

		<!-- fields for index-metadata plugin: any field whose name starts with "meta_" -->
		<dynamicField name="meta_*" type="string" indexed="true" stored="true"/>

		<!-- fields for languageidentifier plugin -->
		<field name="lang" type="string" indexed="true" stored="true"/>

		<!-- fields for subcollection plugin -->
		<field name="subcollection" type="string" indexed="true" stored="true" multiValued="true"/>

		<!-- fields for feed plugin (tag is also used by microformats-reltag) -->
		<field name="author" type="string" indexed="true" stored="true"/>
		<field name="tag" type="string" indexed="true" stored="true" multiValued="true"/>
		<field name="feed" type="string" indexed="true" stored="true"/>
		<field name="publishedDate" type="date" indexed="true" stored="true"/>
		<field name="updatedDate" type="date" indexed="true" stored="true"/>

		<!-- fields for creativecommons plugin -->
		<field name="cc" type="string" indexed="true" stored="true" multiValued="true"/>

		<!-- fields for tld plugin: declared so the field is accepted, but with both
		flags off its values are neither searchable nor retrievable. -->
		<field name="tld" type="string" indexed="false" stored="false"/>

		<!-- fields for index-html plugin
		Note: although the raw document content may be binary,
		index-html adds a String to the index field. -->
		<field name="rawcontent" type="string" indexed="false" stored="true"/>

	</fields>
	<!-- Names the field used as the unique key of a document; "id" is declared
	as a required field in the fields section. -->
	<uniqueKey>id</uniqueKey>
	<!-- Field searched when a query does not name one; "text" is the catch-all field. -->
	<defaultSearchField>text</defaultSearchField>
	<!-- Operator applied between query clauses when none is given explicitly.
	NOTE(review): defaultSearchField and defaultOperator appear to be deprecated in
	later Solr releases in favour of the df and q.op request parameters; confirm
	before upgrading Solr. -->
	<solrQueryParser defaultOperator="OR"/>

	<!-- copyField commands copy one field into another at the time a document
	is added to the index. They are used either to index the same field differently,
	or to gather multiple fields into one field for easier/faster searching.
	Here content, url, title, anchor and author are all copied into "text". -->

	<copyField source="content" dest="text"/>
	<copyField source="url" dest="text"/>
	<copyField source="title" dest="text"/>
	<copyField source="anchor" dest="text"/>
	<copyField source="author" dest="text"/>

	</schema>