| <?xml version="1.0" encoding="UTF-8"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| <properties> |
| |
| <!-- |
| Suppress Tika warnings on stderr: both the handler for problems reported |
| by initializable components and the handler for service loading errors |
| are set to "ignore". |
| --> |
| <service-loader initializableProblemHandler="ignore" loadErrorHandler="ignore" /> |
| |
| <!-- |
| Size of the SAX parser pool. Raise poolSize if the fetcher is parsing |
| with many threads and Tika complains about "Consider increasing the |
| XMLReaderUtils.POOL_SIZE". Tika's default pool size is 10. |
| Cf. NUTCH-2578, TIKA-2645, NUTCH-2582. |
| |
| maxEntityExpansions="20" (the default value) is stated explicitly as a |
| work-around for TIKA-3551. |
| --> |
| <xml-reader-utils maxEntityExpansions="20" poolSize="20" /> |
| |
| <!-- |
| Disable TesseractOCRParser (it is enabled by default in Tika) by excluding |
| it from the parsers that DefaultParser loads. |
| --> |
| <parsers> |
| <parser class="org.apache.tika.parser.DefaultParser"> |
| <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/> |
| </parser> |
| </parsers> |
| </properties> |