Added docker-compose example for TesseractOCR
diff --git a/docker-compose-tika-customocr.yml b/docker-compose-tika-customocr.yml
new file mode 100644
index 0000000..2084a0d
--- /dev/null
+++ b/docker-compose-tika-customocr.yml
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: "3.8"
+services:
+
+  ## Apache Tika Server
+  tika:
+    image: apache/tika:1.25-full
+    # Override the default entrypoint so /customocr (populated under volumes) is on the classpath
+    entrypoint: ["/bin/sh", "-c", 'exec java -cp /customocr:/tika-server-1.25.jar org.apache.tika.server.TikaServerCli -h 0.0.0.0 "$$0" "$$@"']
+    # The command words reach the shell as $0 and $@; they could be added to the entrypoint instead
+    command: -c /tika-config.xml
+    restart: on-failure
+    ports:
+      - "9998:9998"
+    volumes:
+      - ./sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties:/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+      # Choose the configuration you want, or add your own custom one
+      # - ./sample-configs/customocr/tika-config-inline.xml:/tika-config.xml
+      - ./sample-configs/customocr/tika-config-rendered.xml:/tika-config.xml
+
\ No newline at end of file
diff --git a/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
new file mode 100644
index 0000000..b4b787f
--- /dev/null
+++ b/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Customise or add the settings you want here
+language=eng+spa+fra+deu+ita
+timeout=240
+minFileSizeToOcr=1
+enableImageProcessing=0
+density=200
+depth=8
+filter=box
+resize=300
+applyRotation=true
\ No newline at end of file
diff --git a/sample-configs/customocr/tika-config-inline.xml b/sample-configs/customocr/tika-config-inline.xml
new file mode 100644
index 0000000..1c9b613
--- /dev/null
+++ b/sample-configs/customocr/tika-config-inline.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<properties>
+ <parsers>
+ <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) -->
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+
+ <!-- Extract and OCR Inline Images in PDF -->
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractInlineImages" type="bool">true</param>
+ </params>
+ </parser>
+
+ </parsers>
+</properties>
diff --git a/sample-configs/customocr/tika-config-rendered.xml b/sample-configs/customocr/tika-config-rendered.xml
new file mode 100644
index 0000000..bcd8666
--- /dev/null
+++ b/sample-configs/customocr/tika-config-rendered.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<properties>
+ <parsers>
+ <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) -->
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+
+ <!-- OCR on Rendered Pages -->
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <!-- no_ocr - extract text only
+ ocr_only - don't extract text and just attempt OCR
+ ocr_and_text - extract text and also attempt OCR (available since Tika 1.24)
+ auto - extract text, but attempt OCR if fewer than 10 characters are extracted
+ -->
+ <param name="ocrStrategy" type="string">ocr_only</param>
+ <param name="ocrImageType" type="string">rgb</param>
+ <param name="ocrDPI" type="int">100</param>
+ </params>
+ </parser>
+
+ </parsers>
+</properties>